mirror of
https://github.com/langbot-app/LangBot.git
synced 2026-06-09 07:16:04 +00:00
chore: stash
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import PyPDF2
|
||||
import io
|
||||
from docx import Document
|
||||
import pandas as pd
|
||||
import chardet
|
||||
@@ -10,7 +11,7 @@ import ebooklib
|
||||
from ebooklib import epub
|
||||
import re
|
||||
import asyncio # Import asyncio for async operations
|
||||
import os
|
||||
from pkg.core import app
|
||||
|
||||
# Configure logging
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -24,8 +25,8 @@ class FileParser:
|
||||
to avoid blocking the asyncio event loop.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.logger = logging.getLogger(self.__class__.__name__)
|
||||
def __init__(self, ap: app.Application):
|
||||
self.ap = ap
|
||||
|
||||
async def _run_sync(self, sync_func: Callable, *args: Any, **kwargs: Any) -> Any:
|
||||
"""
|
||||
@@ -35,138 +36,160 @@ class FileParser:
|
||||
try:
|
||||
return await asyncio.to_thread(sync_func, *args, **kwargs)
|
||||
except Exception as e:
|
||||
self.logger.error(f'Error running synchronous function {sync_func.__name__}: {e}')
|
||||
self.ap.logger.error(f'Error running synchronous function {sync_func.__name__}: {e}')
|
||||
raise
|
||||
|
||||
async def parse(self, file_path: str) -> Union[str, None]:
|
||||
async def parse(self, file_name: str, extension: str) -> Union[str, None]:
|
||||
"""
|
||||
Parses the file based on its extension and returns the extracted text content.
|
||||
This is the main asynchronous entry point for parsing.
|
||||
|
||||
Args:
|
||||
file_path (str): The path to the file to be parsed.
|
||||
file_name (str): The name of the file to be parsed, get from ap.storage_mgr
|
||||
|
||||
Returns:
|
||||
Union[str, None]: The extracted text content as a single string, or None if parsing fails.
|
||||
"""
|
||||
if not file_path or not os.path.exists(file_path):
|
||||
self.logger.error(f'Invalid file path provided: {file_path}')
|
||||
return None
|
||||
|
||||
file_extension = file_path.split('.')[-1].lower()
|
||||
file_extension = extension.lower()
|
||||
parser_method = getattr(self, f'_parse_{file_extension}', None)
|
||||
|
||||
if parser_method is None:
|
||||
self.logger.error(f'Unsupported file format: {file_extension} for file {file_path}')
|
||||
self.ap.logger.error(f'Unsupported file format: {file_extension} for file {file_name}')
|
||||
return None
|
||||
|
||||
try:
|
||||
# Pass file_path to the specific parser methods
|
||||
return await parser_method(file_path)
|
||||
return await parser_method(file_name)
|
||||
except Exception as e:
|
||||
self.logger.error(f'Failed to parse {file_extension} file {file_path}: {e}')
|
||||
self.ap.logger.error(f'Failed to parse {file_extension} file {file_name}: {e}')
|
||||
return None
|
||||
|
||||
# --- Helper for reading files with encoding detection ---
|
||||
async def _read_file_content(self, file_path: str, mode: str = 'r') -> Union[str, bytes]:
|
||||
async def _read_file_content(self, file_name: str) -> Union[str, bytes]:
|
||||
"""
|
||||
Reads a file with automatic encoding detection, ensuring the synchronous
|
||||
file read operation runs in a separate thread.
|
||||
"""
|
||||
|
||||
def _read_sync():
|
||||
with open(file_path, 'rb') as file:
|
||||
raw_data = file.read()
|
||||
detected = chardet.detect(raw_data)
|
||||
encoding = detected['encoding'] or 'utf-8'
|
||||
# def _read_sync():
|
||||
# with open(file_path, 'rb') as file:
|
||||
# raw_data = file.read()
|
||||
# detected = chardet.detect(raw_data)
|
||||
# encoding = detected['encoding'] or 'utf-8'
|
||||
|
||||
if mode == 'r':
|
||||
return raw_data.decode(encoding, errors='ignore')
|
||||
return raw_data # For binary mode
|
||||
# if mode == 'r':
|
||||
# return raw_data.decode(encoding, errors='ignore')
|
||||
# return raw_data # For binary mode
|
||||
|
||||
return await self._run_sync(_read_sync)
|
||||
# return await self._run_sync(_read_sync)
|
||||
file_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
|
||||
|
||||
detected = chardet.detect(file_bytes)
|
||||
encoding = detected['encoding'] or 'utf-8'
|
||||
|
||||
return file_bytes.decode(encoding, errors='ignore')
|
||||
|
||||
# --- Specific Parser Methods ---
|
||||
|
||||
async def _parse_txt(self, file_path: str) -> str:
|
||||
async def _parse_txt(self, file_name: str) -> str:
|
||||
"""Parses a TXT file and returns its content."""
|
||||
self.logger.info(f'Parsing TXT file: {file_path}')
|
||||
return await self._read_file_content(file_path, mode='r')
|
||||
self.ap.logger.info(f'Parsing TXT file: {file_name}')
|
||||
return await self._read_file_content(file_name)
|
||||
|
||||
async def _parse_pdf(self, file_path: str) -> str:
|
||||
async def _parse_pdf(self, file_name: str) -> str:
|
||||
"""Parses a PDF file and returns its text content."""
|
||||
self.logger.info(f'Parsing PDF file: {file_path}')
|
||||
self.ap.logger.info(f'Parsing PDF file: {file_name}')
|
||||
|
||||
# def _parse_pdf_sync():
|
||||
# text_content = []
|
||||
# with open(file_name, 'rb') as file:
|
||||
# pdf_reader = PyPDF2.PdfReader(file)
|
||||
# for page in pdf_reader.pages:
|
||||
# text = page.extract_text()
|
||||
# if text:
|
||||
# text_content.append(text)
|
||||
# return '\n'.join(text_content)
|
||||
|
||||
# return await self._run_sync(_parse_pdf_sync)
|
||||
|
||||
pdf_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
|
||||
|
||||
def _parse_pdf_sync():
|
||||
pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
|
||||
text_content = []
|
||||
with open(file_path, 'rb') as file:
|
||||
pdf_reader = PyPDF2.PdfReader(file)
|
||||
for page in pdf_reader.pages:
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
text_content.append(text)
|
||||
for page in pdf_reader.pages:
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
text_content.append(text)
|
||||
return '\n'.join(text_content)
|
||||
|
||||
return await self._run_sync(_parse_pdf_sync)
|
||||
|
||||
async def _parse_docx(self, file_path: str) -> str:
|
||||
async def _parse_docx(self, file_name: str) -> str:
|
||||
"""Parses a DOCX file and returns its text content."""
|
||||
self.logger.info(f'Parsing DOCX file: {file_path}')
|
||||
self.ap.logger.info(f'Parsing DOCX file: {file_name}')
|
||||
|
||||
docx_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
|
||||
|
||||
def _parse_docx_sync():
|
||||
doc = Document(file_path)
|
||||
doc = Document(io.BytesIO(docx_bytes))
|
||||
text_content = [paragraph.text for paragraph in doc.paragraphs if paragraph.text.strip()]
|
||||
return '\n'.join(text_content)
|
||||
|
||||
return await self._run_sync(_parse_docx_sync)
|
||||
|
||||
async def _parse_doc(self, file_path: str) -> str:
|
||||
async def _parse_doc(self, file_name: str) -> str:
|
||||
"""Handles .doc files, explicitly stating lack of direct support."""
|
||||
self.logger.warning(f'Direct .doc parsing is not supported for {file_path}. Please convert to .docx first.')
|
||||
self.ap.logger.warning(f'Direct .doc parsing is not supported for {file_name}. Please convert to .docx first.')
|
||||
raise NotImplementedError('Direct .doc parsing not supported. Please convert to .docx first.')
|
||||
|
||||
async def _parse_xlsx(self, file_path: str) -> str:
|
||||
async def _parse_xlsx(self, file_name: str) -> str:
|
||||
"""Parses an XLSX file, returning text from all sheets."""
|
||||
self.logger.info(f'Parsing XLSX file: {file_path}')
|
||||
self.ap.logger.info(f'Parsing XLSX file: {file_name}')
|
||||
|
||||
xlsx_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
|
||||
|
||||
def _parse_xlsx_sync():
|
||||
excel_file = pd.ExcelFile(file_path)
|
||||
excel_file = pd.ExcelFile(io.BytesIO(xlsx_bytes))
|
||||
all_sheet_content = []
|
||||
for sheet_name in excel_file.sheet_names:
|
||||
df = pd.read_excel(file_path, sheet_name=sheet_name)
|
||||
df = pd.read_excel(io.BytesIO(xlsx_bytes), sheet_name=sheet_name)
|
||||
sheet_text = f'--- Sheet: {sheet_name} ---\n{df.to_string(index=False)}\n'
|
||||
all_sheet_content.append(sheet_text)
|
||||
return '\n'.join(all_sheet_content)
|
||||
|
||||
return await self._run_sync(_parse_xlsx_sync)
|
||||
|
||||
async def _parse_csv(self, file_path: str) -> str:
|
||||
async def _parse_csv(self, file_name: str) -> str:
|
||||
"""Parses a CSV file and returns its content as a string."""
|
||||
self.logger.info(f'Parsing CSV file: {file_path}')
|
||||
self.ap.logger.info(f'Parsing CSV file: {file_name}')
|
||||
|
||||
csv_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
|
||||
|
||||
def _parse_csv_sync():
|
||||
# pd.read_csv can often detect encoding, but explicit detection is safer
|
||||
raw_data = self._read_file_content(
|
||||
file_path, mode='rb'
|
||||
) # Note: this will need to be await outside this sync function
|
||||
_ = raw_data
|
||||
# raw_data = self._read_file_content(
|
||||
# file_name, mode='rb'
|
||||
# ) # Note: this will need to be await outside this sync function
|
||||
# _ = raw_data
|
||||
# For simplicity, we'll let pandas handle encoding internally after a raw read.
|
||||
# A more robust solution might pass encoding directly to pd.read_csv after detection.
|
||||
detected = chardet.detect(open(file_path, 'rb').read())
|
||||
detected = chardet.detect(io.BytesIO(csv_bytes))
|
||||
encoding = detected['encoding'] or 'utf-8'
|
||||
df = pd.read_csv(file_path, encoding=encoding)
|
||||
df = pd.read_csv(io.BytesIO(csv_bytes), encoding=encoding)
|
||||
return df.to_string(index=False)
|
||||
|
||||
return await self._run_sync(_parse_csv_sync)
|
||||
|
||||
async def _parse_markdown(self, file_path: str) -> str:
|
||||
async def _parse_markdown(self, file_name: str) -> str:
|
||||
"""Parses a Markdown file, converting it to structured plain text."""
|
||||
self.logger.info(f'Parsing Markdown file: {file_path}')
|
||||
self.ap.logger.info(f'Parsing Markdown file: {file_name}')
|
||||
|
||||
md_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
|
||||
|
||||
def _parse_markdown_sync():
|
||||
md_content = self._read_file_content(
|
||||
file_path, mode='r'
|
||||
) # This is a synchronous call within a sync function
|
||||
md_content = io.BytesIO(md_bytes).read().decode('utf-8', errors='ignore')
|
||||
html_content = markdown.markdown(
|
||||
md_content, extensions=['extra', 'codehilite', 'tables', 'toc', 'fenced_code']
|
||||
)
|
||||
@@ -200,12 +223,14 @@ class FileParser:
|
||||
|
||||
return await self._run_sync(_parse_markdown_sync)
|
||||
|
||||
async def _parse_html(self, file_path: str) -> str:
|
||||
async def _parse_html(self, file_name: str) -> str:
|
||||
"""Parses an HTML file, extracting structured plain text."""
|
||||
self.logger.info(f'Parsing HTML file: {file_path}')
|
||||
self.ap.logger.info(f'Parsing HTML file: {file_name}')
|
||||
|
||||
html_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
|
||||
|
||||
def _parse_html_sync():
|
||||
html_content = self._read_file_content(file_path, mode='r') # Sync call within sync function
|
||||
html_content = io.BytesIO(html_bytes).read().decode('utf-8', errors='ignore')
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
for script_or_style in soup(['script', 'style']):
|
||||
script_or_style.decompose()
|
||||
@@ -236,12 +261,14 @@ class FileParser:
|
||||
|
||||
return await self._run_sync(_parse_html_sync)
|
||||
|
||||
async def _parse_epub(self, file_path: str) -> str:
|
||||
async def _parse_epub(self, file_name: str) -> str:
|
||||
"""Parses an EPUB file, extracting metadata and content."""
|
||||
self.logger.info(f'Parsing EPUB file: {file_path}')
|
||||
self.ap.logger.info(f'Parsing EPUB file: {file_name}')
|
||||
|
||||
epub_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
|
||||
|
||||
def _parse_epub_sync():
|
||||
book = epub.read_epub(file_path)
|
||||
book = epub.read_epub(io.BytesIO(epub_bytes))
|
||||
text_content = []
|
||||
title_meta = book.get_metadata('DC', 'title')
|
||||
if title_meta:
|
||||
|
||||
Reference in New Issue
Block a user