geekai/api/utils/file.go
2024-07-12 18:10:32 +08:00

107 lines
2.3 KiB
Go

package utils
import (
"context"
"fmt"
"github.com/microcosm-cc/bluemonday"
"io"
"net/http"
"os"
"path/filepath"
"strings"
"github.com/google/go-tika/tika"
)
// ReadFileContent returns the text of the file at filePath as extracted by the
// Tika server at tikaHost.
//
// filePath is either a local path or an http(s) URL. A URL is downloaded to a
// temporary file first, and that file is deleted again before returning.
//
// The extracted content is post-processed according to the file extension
// (matched case-insensitively):
//   - .doc, .docx, .pdf, .ppt, .pptx: all HTML tags are stripped;
//   - .xls, .xlsx: HTML is stripped except for table markup;
//   - anything else: the content is kept as returned by Tika.
//
// In every case blank lines and image-name lines are removed afterwards.
func ReadFileContent(filePath string, tikaHost string) (string, error) {
	// For a remote file, download it first. Match the full scheme so that a
	// local path which merely starts with "http" is not treated as a URL.
	if strings.HasPrefix(filePath, "http://") || strings.HasPrefix(filePath, "https://") {
		tmp, err := downloadFile(filePath)
		if err != nil {
			return "", err
		}
		// The download is only needed while parsing; registered before the
		// Close below so the file is closed first, then removed.
		defer os.Remove(tmp)
		filePath = tmp
	}

	// Create the Tika client.
	client := tika.NewClient(nil, tikaHost)

	// Open the document.
	file, err := os.Open(filePath)
	if err != nil {
		return "", fmt.Errorf("error with open file: %w", err)
	}
	defer file.Close()

	// Let Tika extract the document's content.
	content, err := client.Parse(context.TODO(), file)
	if err != nil {
		return "", fmt.Errorf("error with parse file: %w", err)
	}

	// Lower-case the extension so "REPORT.PDF" is handled like "report.pdf".
	switch strings.ToLower(filepath.Ext(filePath)) {
	case ".doc", ".docx", ".pdf", ".ppt", ".pptx":
		return cleanBlankLine(cleanHtml(content, false)), nil
	case ".xls", ".xlsx":
		return cleanBlankLine(cleanHtml(content, true)), nil
	default:
		return cleanBlankLine(content), nil
	}
}
// cleanHtml sanitizes html with a bluemonday policy and returns the result.
// When keepTable is false the strict policy is used; when it is true, a fresh
// policy that allows only the table-related elements is used instead.
func cleanHtml(html string, keepTable bool) string {
	if !keepTable {
		return bluemonday.StrictPolicy().Sanitize(html)
	}

	// Start from an empty policy and allow only table markup.
	tablePolicy := bluemonday.NewPolicy()
	tablePolicy.AllowElements("table", "thead", "tbody", "tfoot", "tr", "td", "th")
	return tablePolicy.Sanitize(html)
}
// cleanBlankLine trims surrounding whitespace from every line of content and
// drops lines that are shorter than two bytes after trimming or that end in
// ".png", ".jpg" or ".jpeg". The remaining lines are joined with "\n".
func cleanBlankLine(content string) string {
	var kept []string
	for _, raw := range strings.Split(content, "\n") {
		trimmed := strings.TrimSpace(raw)
		switch {
		case len(trimmed) < 2:
			// Blank or single-byte line: skip.
		case strings.HasSuffix(trimmed, ".png"),
			strings.HasSuffix(trimmed, ".jpg"),
			strings.HasSuffix(trimmed, ".jpeg"):
			// Line ends with an image file name: skip.
		default:
			kept = append(kept, trimmed)
		}
	}
	return strings.Join(kept, "\n")
}
// downloadFile fetches url with an HTTP GET and stores the response body in
// the OS temp directory, returning the path of the file it wrote.
//
// The local file name is the last path element of url with any "?query" or
// "#fragment" removed, so a signed URL such as ".../report.pdf?sig=abc" is
// saved as "report.pdf" and keeps a usable extension.
//
// An error is returned if the request fails, the server answers with a status
// other than 200 OK, or the body cannot be written. On error the returned path
// is empty and no partial file is left behind.
func downloadFile(url string) (string, error) {
	// Drop the query string / fragment before deriving the file name.
	name := url
	if i := strings.IndexAny(name, "?#"); i >= 0 {
		name = name[:i]
	}
	filename := filepath.Join(os.TempDir(), filepath.Base(name))

	// Issue the request before creating the file so that a failed request
	// does not leave an empty file in the temp directory.
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// Without this check an error page (404, 500, ...) would be saved and
	// later parsed as if it were the requested document.
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("download %s: unexpected status %s", url, resp.Status)
	}

	out, err := os.Create(filename)
	if err != nil {
		return "", err
	}

	// Stream the body to disk; clean up the partial file on failure.
	if _, err = io.Copy(out, resp.Body); err != nil {
		out.Close()
		os.Remove(filename)
		return "", err
	}
	// Close explicitly: a failed close can mean the data was not fully written.
	if err = out.Close(); err != nil {
		os.Remove(filename)
		return "", err
	}
	return filename, nil
}