From 91cb5ca36ca3207081817d38e3667238f4b3b4ba Mon Sep 17 00:00:00 2001
From: WangCham <651122857@qq.com>
Date: Sat, 19 Jul 2025 19:57:57 +0800
Subject: [PATCH] feat: accept html/epub uploads and disable xlsx/csv parsers

---
 pkg/rag/knowledge/services/parser.py          | 66 +++++++++----------
 .../components/kb-docs/FileUploadZone.tsx     |  2 +-
 2 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/pkg/rag/knowledge/services/parser.py b/pkg/rag/knowledge/services/parser.py
index 3f15642d..4c3e8b5c 100644
--- a/pkg/rag/knowledge/services/parser.py
+++ b/pkg/rag/knowledge/services/parser.py
@@ -6,7 +6,6 @@ from docx import Document
 import pandas as pd
 import chardet
 from typing import Union, Callable, Any
-import logging
 import markdown
 from bs4 import BeautifulSoup
 import ebooklib
@@ -15,8 +14,7 @@ import re
 import asyncio  # Import asyncio for async operations
 from pkg.core import app
 
-# Configure logging
-logger = logging.getLogger(__name__)
+
 
 
 class FileParser:
@@ -146,43 +144,43 @@ class FileParser:
         self.ap.logger.warning(f'Direct .doc parsing is not supported for {file_name}. Please convert to .docx first.')
         raise NotImplementedError('Direct .doc parsing not supported. Please convert to .docx first.')
 
-    async def _parse_xlsx(self, file_name: str) -> str:
-        """Parses an XLSX file, returning text from all sheets."""
-        self.ap.logger.info(f'Parsing XLSX file: {file_name}')
+    # async def _parse_xlsx(self, file_name: str) -> str:
+    #     """Parses an XLSX file, returning text from all sheets."""
+    #     self.ap.logger.info(f'Parsing XLSX file: {file_name}')
 
-        xlsx_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
+    #     xlsx_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
 
-        def _parse_xlsx_sync():
-            excel_file = pd.ExcelFile(io.BytesIO(xlsx_bytes))
-            all_sheet_content = []
-            for sheet_name in excel_file.sheet_names:
-                df = pd.read_excel(io.BytesIO(xlsx_bytes), sheet_name=sheet_name)
-                sheet_text = f'--- Sheet: {sheet_name} ---\n{df.to_string(index=False)}\n'
-                all_sheet_content.append(sheet_text)
-            return '\n'.join(all_sheet_content)
+    #     def _parse_xlsx_sync():
+    #         excel_file = pd.ExcelFile(io.BytesIO(xlsx_bytes))
+    #         all_sheet_content = []
+    #         for sheet_name in excel_file.sheet_names:
+    #             df = pd.read_excel(io.BytesIO(xlsx_bytes), sheet_name=sheet_name)
+    #             sheet_text = f'--- Sheet: {sheet_name} ---\n{df.to_string(index=False)}\n'
+    #             all_sheet_content.append(sheet_text)
+    #         return '\n'.join(all_sheet_content)
 
-        return await self._run_sync(_parse_xlsx_sync)
+    #     return await self._run_sync(_parse_xlsx_sync)
 
-    async def _parse_csv(self, file_name: str) -> str:
-        """Parses a CSV file and returns its content as a string."""
-        self.ap.logger.info(f'Parsing CSV file: {file_name}')
+    # async def _parse_csv(self, file_name: str) -> str:
+    #     """Parses a CSV file and returns its content as a string."""
+    #     self.ap.logger.info(f'Parsing CSV file: {file_name}')
 
-        csv_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
+    #     csv_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
 
-        def _parse_csv_sync():
-            # pd.read_csv can often detect encoding, but explicit detection is safer
-            # raw_data = self._read_file_content(
-            #     file_name, mode='rb'
-            # )  # Note: this will need to be await outside this sync function
-            # _ = raw_data
-            # For simplicity, we'll let pandas handle encoding internally after a raw read.
-            # A more robust solution might pass encoding directly to pd.read_csv after detection.
-            detected = chardet.detect(io.BytesIO(csv_bytes))
-            encoding = detected['encoding'] or 'utf-8'
-            df = pd.read_csv(io.BytesIO(csv_bytes), encoding=encoding)
-            return df.to_string(index=False)
+    #     def _parse_csv_sync():
+    #         # pd.read_csv can often detect encoding, but explicit detection is safer
+    #         # raw_data = self._read_file_content(
+    #         #     file_name, mode='rb'
+    #         # )  # Note: this will need to be await outside this sync function
+    #         # _ = raw_data
+    #         # For simplicity, we'll let pandas handle encoding internally after a raw read.
+    #         # A more robust solution might pass encoding directly to pd.read_csv after detection.
+    #         detected = chardet.detect(csv_bytes)  # chardet.detect() needs bytes, not a BytesIO
+    #         encoding = detected['encoding'] or 'utf-8'
+    #         df = pd.read_csv(io.BytesIO(csv_bytes), encoding=encoding)
+    #         return df.to_string(index=False)
 
-        return await self._run_sync(_parse_csv_sync)
+    #     return await self._run_sync(_parse_csv_sync)
 
     async def _parse_md(self, file_name: str) -> str:
         """Parses a Markdown file, converting it to structured plain text."""
@@ -269,6 +267,7 @@ class FileParser:
 
         epub_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
 
+
         def _parse_epub_sync():
             book = epub.read_epub(io.BytesIO(epub_bytes))
             text_content = []
@@ -296,6 +295,7 @@ class FileParser:
                     text = re.sub(r'\n\s*\n', '\n\n', text)
                     if text:
                         text_content.append(text)
+            
             return re.sub(r'\n\s*\n', '\n\n', '\n'.join(text_content)).strip()
 
         return await self._run_sync(_parse_epub_sync)
diff --git a/web/src/app/home/knowledge/components/kb-docs/FileUploadZone.tsx b/web/src/app/home/knowledge/components/kb-docs/FileUploadZone.tsx
index aa8adede..25c397d6 100644
--- a/web/src/app/home/knowledge/components/kb-docs/FileUploadZone.tsx
+++ b/web/src/app/home/knowledge/components/kb-docs/FileUploadZone.tsx
@@ -104,7 +104,7 @@ export default function FileUploadZone({
             id="file-upload"
             className="hidden"
             onChange={handleFileSelect}
-            accept=".pdf,.doc,.docx,.txt,.md"
+            accept=".pdf,.doc,.docx,.txt,.md,.html,.epub,.epub3"
             disabled={isUploading}
           />