chore: stash

2026-06-09 07:16:04 +00:00 · 2025-07-15 22:09:10 +08:00
parent 199164fc4b
commit 67bc065ccd
15 changed files with 508 additions and 338 deletions
--- a/pkg/rag/knowledge/services/parser.py
+++ b/pkg/rag/knowledge/services/parser.py
@@ -1,4 +1,5 @@
 import PyPDF2
+import io
 from docx import Document
 import pandas as pd
 import chardet
@@ -10,7 +11,7 @@ import ebooklib
 from ebooklib import epub
 import re
 import asyncio  # Import asyncio for async operations
-import os
+from pkg.core import app

 # Configure logging
 logger = logging.getLogger(__name__)
@@ -24,8 +25,8 @@ class FileParser:
    to avoid blocking the asyncio event loop.
    """

-    def __init__(self):
-        self.logger = logging.getLogger(self.__class__.__name__)
+    def __init__(self, ap: app.Application):
+        self.ap = ap

    async def _run_sync(self, sync_func: Callable, *args: Any, **kwargs: Any) -> Any:
        """
@@ -35,138 +36,160 @@ class FileParser:
        try:
            return await asyncio.to_thread(sync_func, *args, **kwargs)
        except Exception as e:
-            self.logger.error(f'Error running synchronous function {sync_func.__name__}: {e}')
+            self.ap.logger.error(f'Error running synchronous function {sync_func.__name__}: {e}')
            raise

-    async def parse(self, file_path: str) -> Union[str, None]:
+    async def parse(self, file_name: str, extension: str) -> Union[str, None]:
        """
        Parses the file based on its extension and returns the extracted text content.
        This is the main asynchronous entry point for parsing.

        Args:
-            file_path (str): The path to the file to be parsed.
+            file_name (str): The name of the file to be parsed, as loaded through ap.storage_mgr.

        Returns:
            Union[str, None]: The extracted text content as a single string, or None if parsing fails.
        """
-        if not file_path or not os.path.exists(file_path):
-            self.logger.error(f'Invalid file path provided: {file_path}')
-            return None

-        file_extension = file_path.split('.')[-1].lower()
+        file_extension = extension.lower()
        parser_method = getattr(self, f'_parse_{file_extension}', None)

        if parser_method is None:
-            self.logger.error(f'Unsupported file format: {file_extension} for file {file_path}')
+            self.ap.logger.error(f'Unsupported file format: {file_extension} for file {file_name}')
            return None

        try:
            # Pass file_path to the specific parser methods
-            return await parser_method(file_path)
+            return await parser_method(file_name)
        except Exception as e:
-            self.logger.error(f'Failed to parse {file_extension} file {file_path}: {e}')
+            self.ap.logger.error(f'Failed to parse {file_extension} file {file_name}: {e}')
            return None

    # --- Helper for reading files with encoding detection ---
-    async def _read_file_content(self, file_path: str, mode: str = 'r') -> Union[str, bytes]:
+    async def _read_file_content(self, file_name: str) -> Union[str, bytes]:
        """
        Reads a file with automatic encoding detection, ensuring the synchronous
        file read operation runs in a separate thread.
        """

-        def _read_sync():
-            with open(file_path, 'rb') as file:
-                raw_data = file.read()
-                detected = chardet.detect(raw_data)
-                encoding = detected['encoding'] or 'utf-8'
+        # def _read_sync():
+        #     with open(file_path, 'rb') as file:
+        #         raw_data = file.read()
+        #         detected = chardet.detect(raw_data)
+        #         encoding = detected['encoding'] or 'utf-8'

-            if mode == 'r':
-                return raw_data.decode(encoding, errors='ignore')
-            return raw_data  # For binary mode
+        #     if mode == 'r':
+        #         return raw_data.decode(encoding, errors='ignore')
+        #     return raw_data  # For binary mode

-        return await self._run_sync(_read_sync)
+        # return await self._run_sync(_read_sync)
+        file_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
+
+        detected = chardet.detect(file_bytes)
+        encoding = detected['encoding'] or 'utf-8'
+
+        return file_bytes.decode(encoding, errors='ignore')

    # --- Specific Parser Methods ---

-    async def _parse_txt(self, file_path: str) -> str:
+    async def _parse_txt(self, file_name: str) -> str:
        """Parses a TXT file and returns its content."""
-        self.logger.info(f'Parsing TXT file: {file_path}')
-        return await self._read_file_content(file_path, mode='r')
+        self.ap.logger.info(f'Parsing TXT file: {file_name}')
+        return await self._read_file_content(file_name)

-    async def _parse_pdf(self, file_path: str) -> str:
+    async def _parse_pdf(self, file_name: str) -> str:
        """Parses a PDF file and returns its text content."""
-        self.logger.info(f'Parsing PDF file: {file_path}')
+        self.ap.logger.info(f'Parsing PDF file: {file_name}')
+
+        # def _parse_pdf_sync():
+        #     text_content = []
+        #     with open(file_name, 'rb') as file:
+        #         pdf_reader = PyPDF2.PdfReader(file)
+        #         for page in pdf_reader.pages:
+        #             text = page.extract_text()
+        #             if text:
+        #                 text_content.append(text)
+        #     return '\n'.join(text_content)
+
+        # return await self._run_sync(_parse_pdf_sync)
+
+        pdf_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)

        def _parse_pdf_sync():
+            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
            text_content = []
-            with open(file_path, 'rb') as file:
-                pdf_reader = PyPDF2.PdfReader(file)
-                for page in pdf_reader.pages:
-                    text = page.extract_text()
-                    if text:
-                        text_content.append(text)
+            for page in pdf_reader.pages:
+                text = page.extract_text()
+                if text:
+                    text_content.append(text)
            return '\n'.join(text_content)

        return await self._run_sync(_parse_pdf_sync)

-    async def _parse_docx(self, file_path: str) -> str:
+    async def _parse_docx(self, file_name: str) -> str:
        """Parses a DOCX file and returns its text content."""
-        self.logger.info(f'Parsing DOCX file: {file_path}')
+        self.ap.logger.info(f'Parsing DOCX file: {file_name}')
+
+        docx_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)

        def _parse_docx_sync():
-            doc = Document(file_path)
+            doc = Document(io.BytesIO(docx_bytes))
            text_content = [paragraph.text for paragraph in doc.paragraphs if paragraph.text.strip()]
            return '\n'.join(text_content)

        return await self._run_sync(_parse_docx_sync)

-    async def _parse_doc(self, file_path: str) -> str:
+    async def _parse_doc(self, file_name: str) -> str:
        """Handles .doc files, explicitly stating lack of direct support."""
-        self.logger.warning(f'Direct .doc parsing is not supported for {file_path}. Please convert to .docx first.')
+        self.ap.logger.warning(f'Direct .doc parsing is not supported for {file_name}. Please convert to .docx first.')
        raise NotImplementedError('Direct .doc parsing not supported. Please convert to .docx first.')

-    async def _parse_xlsx(self, file_path: str) -> str:
+    async def _parse_xlsx(self, file_name: str) -> str:
        """Parses an XLSX file, returning text from all sheets."""
-        self.logger.info(f'Parsing XLSX file: {file_path}')
+        self.ap.logger.info(f'Parsing XLSX file: {file_name}')
+
+        xlsx_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)

        def _parse_xlsx_sync():
-            excel_file = pd.ExcelFile(file_path)
+            excel_file = pd.ExcelFile(io.BytesIO(xlsx_bytes))
            all_sheet_content = []
            for sheet_name in excel_file.sheet_names:
-                df = pd.read_excel(file_path, sheet_name=sheet_name)
+                df = pd.read_excel(io.BytesIO(xlsx_bytes), sheet_name=sheet_name)
                sheet_text = f'--- Sheet: {sheet_name} ---\n{df.to_string(index=False)}\n'
                all_sheet_content.append(sheet_text)
            return '\n'.join(all_sheet_content)

        return await self._run_sync(_parse_xlsx_sync)

-    async def _parse_csv(self, file_path: str) -> str:
+    async def _parse_csv(self, file_name: str) -> str:
        """Parses a CSV file and returns its content as a string."""
-        self.logger.info(f'Parsing CSV file: {file_path}')
+        self.ap.logger.info(f'Parsing CSV file: {file_name}')
+
+        csv_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)

        def _parse_csv_sync():
            # pd.read_csv can often detect encoding, but explicit detection is safer
-            raw_data = self._read_file_content(
-                file_path, mode='rb'
-            )  # Note: this will need to be await outside this sync function
-            _ = raw_data
+            # raw_data = self._read_file_content(
+            #     file_name, mode='rb'
+            # )  # Note: this will need to be await outside this sync function
+            # _ = raw_data
            # For simplicity, we'll let pandas handle encoding internally after a raw read.
            # A more robust solution might pass encoding directly to pd.read_csv after detection.
-            detected = chardet.detect(open(file_path, 'rb').read())
+            detected = chardet.detect(io.BytesIO(csv_bytes))
            encoding = detected['encoding'] or 'utf-8'
-            df = pd.read_csv(file_path, encoding=encoding)
+            df = pd.read_csv(io.BytesIO(csv_bytes), encoding=encoding)
            return df.to_string(index=False)

        return await self._run_sync(_parse_csv_sync)

-    async def _parse_markdown(self, file_path: str) -> str:
+    async def _parse_markdown(self, file_name: str) -> str:
        """Parses a Markdown file, converting it to structured plain text."""
-        self.logger.info(f'Parsing Markdown file: {file_path}')
+        self.ap.logger.info(f'Parsing Markdown file: {file_name}')
+
+        md_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)

        def _parse_markdown_sync():
-            md_content = self._read_file_content(
-                file_path, mode='r'
-            )  # This is a synchronous call within a sync function
+            md_content = io.BytesIO(md_bytes).read().decode('utf-8', errors='ignore')
            html_content = markdown.markdown(
                md_content, extensions=['extra', 'codehilite', 'tables', 'toc', 'fenced_code']
            )
@@ -200,12 +223,14 @@ class FileParser:

        return await self._run_sync(_parse_markdown_sync)

-    async def _parse_html(self, file_path: str) -> str:
+    async def _parse_html(self, file_name: str) -> str:
        """Parses an HTML file, extracting structured plain text."""
-        self.logger.info(f'Parsing HTML file: {file_path}')
+        self.ap.logger.info(f'Parsing HTML file: {file_name}')
+
+        html_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)

        def _parse_html_sync():
-            html_content = self._read_file_content(file_path, mode='r')  # Sync call within sync function
+            html_content = io.BytesIO(html_bytes).read().decode('utf-8', errors='ignore')
            soup = BeautifulSoup(html_content, 'html.parser')
            for script_or_style in soup(['script', 'style']):
                script_or_style.decompose()
@@ -236,12 +261,14 @@ class FileParser:

        return await self._run_sync(_parse_html_sync)

-    async def _parse_epub(self, file_path: str) -> str:
+    async def _parse_epub(self, file_name: str) -> str:
        """Parses an EPUB file, extracting metadata and content."""
-        self.logger.info(f'Parsing EPUB file: {file_path}')
+        self.ap.logger.info(f'Parsing EPUB file: {file_name}')
+
+        epub_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)

        def _parse_epub_sync():
-            book = epub.read_epub(file_path)
+            book = epub.read_epub(io.BytesIO(epub_bytes))
            text_content = []
            title_meta = book.get_metadata('DC', 'title')
            if title_meta: