chore: stash

This commit is contained in:
Junyan Qin
2025-07-15 22:09:10 +08:00
parent 199164fc4b
commit 67bc065ccd
15 changed files with 508 additions and 338 deletions

View File

@@ -1,4 +1,5 @@
import PyPDF2
import io
from docx import Document
import pandas as pd
import chardet
@@ -10,7 +11,7 @@ import ebooklib
from ebooklib import epub
import re
import asyncio # Import asyncio for async operations
import os
from pkg.core import app
# Configure logging
logger = logging.getLogger(__name__)
@@ -24,8 +25,8 @@ class FileParser:
to avoid blocking the asyncio event loop.
"""
def __init__(self, ap: app.Application):
    """
    Bind the parser to the running application.

    Args:
        ap (app.Application): Application object. The parser methods use its
            ``logger`` for logging and its ``storage_mgr`` for loading file bytes.
    """
    self.ap = ap
async def _run_sync(self, sync_func: Callable, *args: Any, **kwargs: Any) -> Any:
"""
@@ -35,138 +36,160 @@ class FileParser:
try:
return await asyncio.to_thread(sync_func, *args, **kwargs)
except Exception as e:
self.logger.error(f'Error running synchronous function {sync_func.__name__}: {e}')
self.ap.logger.error(f'Error running synchronous function {sync_func.__name__}: {e}')
raise
async def parse(self, file_name: str, extension: str) -> Union[str, None]:
    """
    Parse a stored file and return its extracted text content.

    This is the main asynchronous entry point. The parser is chosen by name:
    an extension ``xyz`` dispatches to the method ``_parse_xyz`` on this object.

    Args:
        file_name (str): Name of the file to parse; it is passed unchanged to
            the selected parser method.
        extension (str): File extension selecting the parser. Letter case,
            surrounding whitespace and a leading dot are ignored, so ``'PDF'``
            and ``'.pdf'`` both select ``_parse_pdf``.

    Returns:
        Union[str, None]: The extracted text, or None when no parser exists
        for the extension or the parser raised an exception.
    """
    # Normalise so callers may pass 'pdf', 'PDF' or '.pdf' interchangeably.
    file_extension = (extension or '').strip().lstrip('.').lower()

    parser_method = getattr(self, f'_parse_{file_extension}', None)
    if parser_method is None:
        self.ap.logger.error(f'Unsupported file format: {file_extension} for file {file_name}')
        return None

    try:
        return await parser_method(file_name)
    except Exception as e:
        # Deliberate best-effort boundary: failures are logged and reported
        # to the caller as None instead of propagating.
        self.ap.logger.error(f'Failed to parse {file_extension} file {file_name}: {e}')
        return None
# --- Helper for reading files with encoding detection ---
async def _read_file_content(self, file_name: str) -> str:
    """
    Load a file through the storage provider and decode it to text.

    The encoding is guessed with ``chardet``; when no encoding is detected
    UTF-8 is used. Undecodable bytes are dropped (``errors='ignore'``).

    Args:
        file_name (str): Name of the file, passed to
            ``ap.storage_mgr.storage_provider.load``.

    Returns:
        str: The decoded file content.
    """
    file_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)

    def _decode_sync() -> str:
        detected = chardet.detect(file_bytes)
        encoding = detected['encoding'] or 'utf-8'
        try:
            return file_bytes.decode(encoding, errors='ignore')
        except LookupError:
            # The detector reported a codec name Python does not know;
            # fall back to UTF-8 rather than failing the whole parse.
            return file_bytes.decode('utf-8', errors='ignore')

    # Detection scans the whole payload, so run it off the event loop like
    # the other synchronous parsing steps in this class.
    return await self._run_sync(_decode_sync)
# --- Specific Parser Methods ---
async def _parse_txt(self, file_path: str) -> str:
async def _parse_txt(self, file_name: str) -> str:
"""Parses a TXT file and returns its content."""
self.logger.info(f'Parsing TXT file: {file_path}')
return await self._read_file_content(file_path, mode='r')
self.ap.logger.info(f'Parsing TXT file: {file_name}')
return await self._read_file_content(file_name)
async def _parse_pdf(self, file_name: str) -> str:
    """
    Parse a PDF file and return its text content.

    The bytes are loaded through the storage provider, then text extraction
    runs in a worker thread via ``_run_sync``.

    Args:
        file_name (str): Name of the file, passed to
            ``ap.storage_mgr.storage_provider.load``.

    Returns:
        str: The text of every page that yielded any text, joined with
        newlines. Pages with no extractable text are skipped.
    """
    self.ap.logger.info(f'Parsing PDF file: {file_name}')
    pdf_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)

    def _parse_pdf_sync() -> str:
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
        page_texts = (page.extract_text() for page in pdf_reader.pages)
        # extract_text() may yield an empty/None result for a page; skip those.
        return '\n'.join(text for text in page_texts if text)

    return await self._run_sync(_parse_pdf_sync)
async def _parse_docx(self, file_name: str) -> str:
    """
    Parse a DOCX file and return its text content.

    Args:
        file_name (str): Name of the file, passed to
            ``ap.storage_mgr.storage_provider.load``.

    Returns:
        str: The text of every paragraph that is not blank, joined with
        newlines.
    """
    self.ap.logger.info(f'Parsing DOCX file: {file_name}')
    docx_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)

    def _parse_docx_sync() -> str:
        doc = Document(io.BytesIO(docx_bytes))
        text_content = [paragraph.text for paragraph in doc.paragraphs if paragraph.text.strip()]
        return '\n'.join(text_content)

    return await self._run_sync(_parse_docx_sync)
async def _parse_doc(self, file_path: str) -> str:
async def _parse_doc(self, file_name: str) -> str:
"""Handles .doc files, explicitly stating lack of direct support."""
self.logger.warning(f'Direct .doc parsing is not supported for {file_path}. Please convert to .docx first.')
self.ap.logger.warning(f'Direct .doc parsing is not supported for {file_name}. Please convert to .docx first.')
raise NotImplementedError('Direct .doc parsing not supported. Please convert to .docx first.')
async def _parse_xlsx(self, file_name: str) -> str:
    """
    Parse an XLSX file, returning text from all sheets.

    Args:
        file_name (str): Name of the file, passed to
            ``ap.storage_mgr.storage_provider.load``.

    Returns:
        str: One section per sheet, each a ``--- Sheet: <name> ---`` header
        followed by the sheet rendered with ``DataFrame.to_string(index=False)``;
        sections are joined with newlines.
    """
    self.ap.logger.info(f'Parsing XLSX file: {file_name}')
    xlsx_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)

    def _parse_xlsx_sync() -> str:
        all_sheet_content = []
        # Open the workbook once and read every sheet from it, instead of
        # re-reading the raw bytes per sheet; the context manager closes it.
        with pd.ExcelFile(io.BytesIO(xlsx_bytes)) as excel_file:
            for sheet_name in excel_file.sheet_names:
                df = excel_file.parse(sheet_name=sheet_name)
                sheet_text = f'--- Sheet: {sheet_name} ---\n{df.to_string(index=False)}\n'
                all_sheet_content.append(sheet_text)
        return '\n'.join(all_sheet_content)

    return await self._run_sync(_parse_xlsx_sync)
async def _parse_csv(self, file_name: str) -> str:
    """
    Parse a CSV file and return its content as a string.

    The encoding is guessed with ``chardet`` (UTF-8 when nothing is detected)
    and passed explicitly to ``pandas.read_csv``.

    Args:
        file_name (str): Name of the file, passed to
            ``ap.storage_mgr.storage_provider.load``.

    Returns:
        str: The table rendered with ``DataFrame.to_string(index=False)``.
    """
    self.ap.logger.info(f'Parsing CSV file: {file_name}')
    csv_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)

    def _parse_csv_sync() -> str:
        # chardet.detect() requires bytes/bytearray; handing it a BytesIO
        # wrapper raises TypeError, so detect on the raw payload.
        detected = chardet.detect(csv_bytes)
        encoding = detected['encoding'] or 'utf-8'
        df = pd.read_csv(io.BytesIO(csv_bytes), encoding=encoding)
        return df.to_string(index=False)

    return await self._run_sync(_parse_csv_sync)
async def _parse_markdown(self, file_path: str) -> str:
async def _parse_markdown(self, file_name: str) -> str:
"""Parses a Markdown file, converting it to structured plain text."""
self.logger.info(f'Parsing Markdown file: {file_path}')
self.ap.logger.info(f'Parsing Markdown file: {file_name}')
md_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
def _parse_markdown_sync():
md_content = self._read_file_content(
file_path, mode='r'
) # This is a synchronous call within a sync function
md_content = io.BytesIO(md_bytes).read().decode('utf-8', errors='ignore')
html_content = markdown.markdown(
md_content, extensions=['extra', 'codehilite', 'tables', 'toc', 'fenced_code']
)
@@ -200,12 +223,14 @@ class FileParser:
return await self._run_sync(_parse_markdown_sync)
async def _parse_html(self, file_path: str) -> str:
async def _parse_html(self, file_name: str) -> str:
"""Parses an HTML file, extracting structured plain text."""
self.logger.info(f'Parsing HTML file: {file_path}')
self.ap.logger.info(f'Parsing HTML file: {file_name}')
html_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
def _parse_html_sync():
html_content = self._read_file_content(file_path, mode='r') # Sync call within sync function
html_content = io.BytesIO(html_bytes).read().decode('utf-8', errors='ignore')
soup = BeautifulSoup(html_content, 'html.parser')
for script_or_style in soup(['script', 'style']):
script_or_style.decompose()
@@ -236,12 +261,14 @@ class FileParser:
return await self._run_sync(_parse_html_sync)
async def _parse_epub(self, file_path: str) -> str:
async def _parse_epub(self, file_name: str) -> str:
"""Parses an EPUB file, extracting metadata and content."""
self.logger.info(f'Parsing EPUB file: {file_path}')
self.ap.logger.info(f'Parsing EPUB file: {file_name}')
epub_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
def _parse_epub_sync():
book = epub.read_epub(file_path)
book = epub.read_epub(io.BytesIO(epub_bytes))
text_content = []
title_meta = book.get_metadata('DC', 'title')
if title_meta: