feat: chat-with-file feature is ready

This commit is contained in:
RockYang
2024-06-27 18:01:49 +08:00
parent 3fdcc895ed
commit a27ce36a32
14 changed files with 329 additions and 75 deletions

View File

@@ -13,7 +13,8 @@ import (
"github.com/google/go-tika/tika"
)
func ReadPdf(filePath string) (string, error) {
func ReadFileContent(filePath string) (string, error) {
// for remote file, download it first
if strings.HasPrefix(filePath, "http") {
file, err := downloadFile(filePath)
if err != nil {
@@ -31,22 +32,34 @@ func ReadPdf(filePath string) (string, error) {
defer file.Close()
// Use Tika to extract the text content of the file
html, err := client.Parse(context.TODO(), file)
content, err := client.Parse(context.TODO(), file)
if err != nil {
return "", fmt.Errorf("error with parse file: %v", err)
}
fmt.Println(html)
return cleanBlankLine(html), nil
// filepath.Ext returns the extension including its leading dot (".ppt"), so
// every case label must carry the dot. Lower-casing the extension lets
// "REPORT.PDF" take the same branch as "report.pdf".
ext := strings.ToLower(filepath.Ext(filePath))
switch ext {
case ".doc", ".docx", ".pdf", ".pptx", ".ppt":
	// Document formats: sanitize with keepTable=false.
	return cleanBlankLine(cleanHtml(content, false)), nil
case ".xls", ".xlsx":
	// Spreadsheets: sanitize with keepTable=true so table elements are kept.
	return cleanBlankLine(cleanHtml(content, true)), nil
default:
	// Any other extension: only drop blank lines, no HTML sanitizing.
	return cleanBlankLine(content), nil
}
}
// cleanHtml sanitizes html with a bluemonday policy and returns the result.
// When keepTable is true the policy is built with NewPolicy and the
// table-related elements listed below are allowed; otherwise StrictPolicy
// is used.
// NOTE(review): this span appears to interleave the earlier one-argument
// version with the current two-argument version — confirm against the
// actual file.
func cleanHtml(html string) string {
func cleanHtml(html string, keepTable bool) string {
// Strip HTML tags.
p := bluemonday.StrictPolicy()
return p.Sanitize(html)
var policy *bluemonday.Policy
if keepTable {
policy = bluemonday.NewPolicy()
policy.AllowElements("table", "thead", "tbody", "tfoot", "tr", "td", "th")
} else {
policy = bluemonday.StrictPolicy()
}
return policy.Sanitize(html)
}
func cleanBlankLine(content string) string {
@@ -57,6 +70,12 @@ func cleanBlankLine(content string) string {
if len(line) < 2 {
continue
}
// discard image
if strings.HasSuffix(line, ".png") ||
strings.HasSuffix(line, ".jpg") ||
strings.HasSuffix(line, ".jpeg") {
continue
}
texts = append(texts, line)
}

View File

@@ -88,7 +88,7 @@ func GetImgExt(filename string) string {
return ext
}
func ExtractImgURL(text string) []string {
func ExtractImgURLs(text string) []string {
re := regexp.MustCompile(`(http[s]?:\/\/.*?\.(?:png|jpg|jpeg|gif))`)
matches := re.FindAllStringSubmatch(text, 10)
urls := make([]string, 0)
@@ -99,3 +99,15 @@ func ExtractImgURL(text string) []string {
}
return urls
}
// fileURLPattern matches text that starts with "http://" or "https://" and
// runs (lazily, within one line) up to the first occurrence of a supported
// document extension: doc, docx, pdf, ppt, pptx, xls, xlsx or txt.
// It is compiled once at package scope rather than on every call.
var fileURLPattern = regexp.MustCompile(`(http[s]?:\/\/.*?\.(?:docx?|pdf|pptx?|xlsx?|txt))`)

// ExtractFileURLs returns the document URLs found in text, in order of
// appearance. At most 10 URLs are returned. When nothing matches, the result
// is an empty, non-nil slice.
func ExtractFileURLs(text string) []string {
	matches := fileURLPattern.FindAllStringSubmatch(text, 10)
	urls := make([]string, 0, len(matches))
	for _, m := range matches {
		// m[1] is the pattern's only capture group: the whole URL.
		urls = append(urls, m[1])
	}
	return urls
}