mirror of
https://github.com/yangjian102621/geekai.git
synced 2025-09-18 01:06:39 +08:00
107 lines
2.3 KiB
Go
107 lines
2.3 KiB
Go
package utils
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"github.com/microcosm-cc/bluemonday"
|
|
"io"
|
|
"net/http"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/google/go-tika/tika"
|
|
)
|
|
|
|
func ReadFileContent(filePath string) (string, error) {
|
|
// for remote file, download it first
|
|
if strings.HasPrefix(filePath, "http") {
|
|
file, err := downloadFile(filePath)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
filePath = file
|
|
}
|
|
// 创建 Tika 客户端
|
|
client := tika.NewClient(nil, "http://172.22.11.69:9998")
|
|
// 打开 PDF 文件
|
|
file, err := os.Open(filePath)
|
|
if err != nil {
|
|
return "", fmt.Errorf("error with open file: %v", err)
|
|
}
|
|
defer file.Close()
|
|
|
|
// 使用 Tika 提取 PDF 文件的文本内容
|
|
content, err := client.Parse(context.TODO(), file)
|
|
if err != nil {
|
|
return "", fmt.Errorf("error with parse file: %v", err)
|
|
}
|
|
|
|
ext := filepath.Ext(filePath)
|
|
switch ext {
|
|
case ".doc", ".docx", ".pdf", ".pptx", "ppt":
|
|
return cleanBlankLine(cleanHtml(content, false)), nil
|
|
case ".xls", ".xlsx":
|
|
return cleanBlankLine(cleanHtml(content, true)), nil
|
|
default:
|
|
return cleanBlankLine(content), nil
|
|
}
|
|
|
|
}
|
|
|
|
// 清理文本内容
|
|
func cleanHtml(html string, keepTable bool) string {
|
|
// 清理 HTML 标签
|
|
var policy *bluemonday.Policy
|
|
if keepTable {
|
|
policy = bluemonday.NewPolicy()
|
|
policy.AllowElements("table", "thead", "tbody", "tfoot", "tr", "td", "th")
|
|
} else {
|
|
policy = bluemonday.StrictPolicy()
|
|
}
|
|
return policy.Sanitize(html)
|
|
}
|
|
|
|
// cleanBlankLine normalizes content line by line and returns the surviving
// lines joined with "\n".
//
// Each line has its surrounding whitespace trimmed. A trimmed line is dropped
// when it is shorter than two bytes, or when it ends in ".png", ".jpg" or
// ".jpeg" (treated as an image reference).
func cleanBlankLine(content string) string {
	var kept []string
	for _, raw := range strings.Split(content, "\n") {
		trimmed := strings.TrimSpace(raw)
		switch {
		case len(trimmed) < 2:
			// Blank or single-byte line: skip.
		case strings.HasSuffix(trimmed, ".png"),
			strings.HasSuffix(trimmed, ".jpg"),
			strings.HasSuffix(trimmed, ".jpeg"):
			// Image reference: skip.
		default:
			kept = append(kept, trimmed)
		}
	}
	return strings.Join(kept, "\n")
}
|
|
|
|
// downloadFile fetches url with an HTTP GET and stores the response body in
// the system temp directory, returning the path of the saved file.
//
// The local file name is the last path segment of the URL with any "?query"
// or "#fragment" removed, so the file extension is preserved for URLs that
// carry query parameters.
//
// An error is returned when the request fails, when the server answers with a
// non-2xx status, or when the file cannot be created or written. On a write
// failure the partially written file is removed and no path is returned.
func downloadFile(url string) (string, error) {
	// Drop the query string / fragment so they do not end up in the file name.
	name := url
	if i := strings.IndexAny(name, "?#"); i >= 0 {
		name = name[:i]
	}
	filename := filepath.Join(os.TempDir(), filepath.Base(name))

	// Issue the request before touching the disk: nothing is created when the
	// server cannot be reached.
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// http.Get does not treat 4xx/5xx as errors; reject them explicitly so an
	// error page is never saved as the document.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return "", fmt.Errorf("download %q: unexpected status %s", url, resp.Status)
	}

	out, err := os.Create(filename)
	if err != nil {
		return "", err
	}

	// Stream the body to the file.
	if _, err = io.Copy(out, resp.Body); err != nil {
		// Best-effort cleanup; the copy error is the one worth reporting.
		out.Close()
		os.Remove(filename)
		return "", err
	}
	// Close explicitly: a failed close can mean the data never reached disk.
	if err = out.Close(); err != nil {
		os.Remove(filename) // best-effort cleanup
		return "", err
	}
	return filename, nil
}
|