perf: ruff check --fix

This commit is contained in:
Junyan Qin
2025-07-05 21:56:54 +08:00
parent 39c062f73e
commit 8d28ace252
23 changed files with 647 additions and 737 deletions

View File

@@ -1,22 +1,21 @@
import PyPDF2
from docx import Document
import pandas as pd
import csv
import chardet
from typing import Union, List, Callable, Any
from typing import Union, Callable, Any
import logging
import markdown
from bs4 import BeautifulSoup
import ebooklib
from ebooklib import epub
import re
import asyncio # Import asyncio for async operations
import asyncio # Import asyncio for async operations
import os
# Configure logging
logger = logging.getLogger(__name__)
class FileParser:
"""
A robust file parser class to extract text content from various document formats.
@@ -24,8 +23,8 @@ class FileParser:
All core file reading operations are designed to be run synchronously in a thread pool
to avoid blocking the asyncio event loop.
"""
def __init__(self):
self.logger = logging.getLogger(self.__class__.__name__)
async def _run_sync(self, sync_func: Callable, *args: Any, **kwargs: Any) -> Any:
@@ -36,14 +35,14 @@ class FileParser:
try:
return await asyncio.to_thread(sync_func, *args, **kwargs)
except Exception as e:
self.logger.error(f"Error running synchronous function {sync_func.__name__}: {e}")
self.logger.error(f'Error running synchronous function {sync_func.__name__}: {e}')
raise
async def parse(self, file_path: str) -> Union[str, None]:
"""
Parses the file based on its extension and returns the extracted text content.
This is the main asynchronous entry point for parsing.
Args:
file_path (str): The path to the file to be parsed.
@@ -51,21 +50,21 @@ class FileParser:
Union[str, None]: The extracted text content as a single string, or None if parsing fails.
"""
if not file_path or not os.path.exists(file_path):
self.logger.error(f"Invalid file path provided: {file_path}")
self.logger.error(f'Invalid file path provided: {file_path}')
return None
file_extension = file_path.split('.')[-1].lower()
parser_method = getattr(self, f'_parse_{file_extension}', None)
if parser_method is None:
self.logger.error(f"Unsupported file format: {file_extension} for file {file_path}")
self.logger.error(f'Unsupported file format: {file_extension} for file {file_path}')
return None
try:
# Pass file_path to the specific parser methods
return await parser_method(file_path)
except Exception as e:
self.logger.error(f"Failed to parse {file_extension} file {file_path}: {e}")
self.logger.error(f'Failed to parse {file_extension} file {file_path}: {e}')
return None
# --- Helper for reading files with encoding detection ---
@@ -74,15 +73,16 @@ class FileParser:
Reads a file with automatic encoding detection, ensuring the synchronous
file read operation runs in a separate thread.
"""
def _read_sync():
with open(file_path, 'rb') as file:
raw_data = file.read()
detected = chardet.detect(raw_data)
encoding = detected['encoding'] or 'utf-8'
if mode == 'r':
return raw_data.decode(encoding, errors='ignore')
return raw_data # For binary mode
return raw_data # For binary mode
return await self._run_sync(_read_sync)
@@ -90,12 +90,13 @@ class FileParser:
async def _parse_txt(self, file_path: str) -> str:
"""Parses a TXT file and returns its content."""
self.logger.info(f"Parsing TXT file: {file_path}")
self.logger.info(f'Parsing TXT file: {file_path}')
return await self._read_file_content(file_path, mode='r')
async def _parse_pdf(self, file_path: str) -> str:
"""Parses a PDF file and returns its text content."""
self.logger.info(f"Parsing PDF file: {file_path}")
self.logger.info(f'Parsing PDF file: {file_path}')
def _parse_pdf_sync():
text_content = []
with open(file_path, 'rb') as file:
@@ -105,57 +106,69 @@ class FileParser:
if text:
text_content.append(text)
return '\n'.join(text_content)
return await self._run_sync(_parse_pdf_sync)
async def _parse_docx(self, file_path: str) -> str:
    """Parse a DOCX file and return the text of its paragraphs.

    Args:
        file_path (str): Path to the .docx file.

    Returns:
        str: The text of every paragraph that is not blank, joined with newlines.
    """
    # Log exactly once per call.
    self.logger.info(f'Parsing DOCX file: {file_path}')

    def _parse_docx_sync():
        doc = Document(file_path)
        # Only ``doc.paragraphs`` is read; paragraphs that are empty or
        # whitespace-only are dropped.
        text_content = [paragraph.text for paragraph in doc.paragraphs if paragraph.text.strip()]
        return '\n'.join(text_content)

    # The blocking document load is handed to the thread helper.
    return await self._run_sync(_parse_docx_sync)
async def _parse_doc(self, file_path: str) -> str:
"""Handles .doc files, explicitly stating lack of direct support."""
self.logger.warning(f"Direct .doc parsing is not supported for {file_path}. Please convert to .docx first.")
raise NotImplementedError("Direct .doc parsing not supported. Please convert to .docx first.")
self.logger.warning(f'Direct .doc parsing is not supported for {file_path}. Please convert to .docx first.')
raise NotImplementedError('Direct .doc parsing not supported. Please convert to .docx first.')
async def _parse_xlsx(self, file_path: str) -> str:
    """Parse an XLSX file and return the text of all its sheets.

    Args:
        file_path (str): Path to the workbook.

    Returns:
        str: One section per sheet, each made of a ``--- Sheet: <name> ---``
        header followed by the sheet rendered with ``DataFrame.to_string``
        (index omitted). Sections are joined with newlines.
    """
    self.logger.info(f'Parsing XLSX file: {file_path}')

    def _parse_xlsx_sync():
        all_sheet_content = []
        # Open the workbook once and close it deterministically; each sheet is
        # read from the already-open workbook instead of re-opening the path.
        with pd.ExcelFile(file_path) as excel_file:
            for sheet_name in excel_file.sheet_names:
                df = pd.read_excel(excel_file, sheet_name=sheet_name)
                sheet_text = f'--- Sheet: {sheet_name} ---\n{df.to_string(index=False)}\n'
                all_sheet_content.append(sheet_text)
        return '\n'.join(all_sheet_content)

    # The blocking workbook read is handed to the thread helper.
    return await self._run_sync(_parse_xlsx_sync)
async def _parse_csv(self, file_path: str) -> str:
    """Parse a CSV file and return its content as a string.

    Args:
        file_path (str): Path to the CSV file.

    Returns:
        str: The table rendered with ``DataFrame.to_string`` (index omitted).
    """
    self.logger.info(f'Parsing CSV file: {file_path}')

    def _parse_csv_sync():
        # This function runs synchronously in a worker thread, so it must not
        # call the async ``_read_file_content`` helper: without ``await`` that
        # only creates a coroutine object that is never run. Read the raw bytes
        # directly, and close the handle via the context manager.
        with open(file_path, 'rb') as file:
            raw_data = file.read()
        detected = chardet.detect(raw_data)
        # chardet may report no encoding; fall back to UTF-8 in that case.
        encoding = detected['encoding'] or 'utf-8'
        df = pd.read_csv(file_path, encoding=encoding)
        return df.to_string(index=False)

    return await self._run_sync(_parse_csv_sync)
async def _parse_markdown(self, file_path: str) -> str:
"""Parses a Markdown file, converting it to structured plain text."""
self.logger.info(f"Parsing Markdown file: {file_path}")
self.logger.info(f'Parsing Markdown file: {file_path}')
def _parse_markdown_sync():
md_content = self._read_file_content(file_path, mode='r') # This is a synchronous call within a sync function
md_content = self._read_file_content(
file_path, mode='r'
) # This is a synchronous call within a sync function
html_content = markdown.markdown(
md_content,
extensions=['extra', 'codehilite', 'tables', 'toc', 'fenced_code']
md_content, extensions=['extra', 'codehilite', 'tables', 'toc', 'fenced_code']
)
soup = BeautifulSoup(html_content, 'html.parser')
text_parts = []
@@ -169,13 +182,13 @@ class FileParser:
text_parts.append(text)
elif element.name in ['ul', 'ol']:
for li in element.find_all('li'):
text_parts.append(f"* {li.get_text().strip()}")
text_parts.append(f'* {li.get_text().strip()}')
elif element.name == 'pre':
code_block = element.get_text().strip()
if code_block:
text_parts.append(f"```\n{code_block}\n```")
text_parts.append(f'```\n{code_block}\n```')
elif element.name == 'table':
table_str = self._extract_table_to_markdown_sync(element) # Call sync helper
table_str = self._extract_table_to_markdown_sync(element) # Call sync helper
if table_str:
text_parts.append(table_str)
elif element.name:
@@ -184,15 +197,17 @@ class FileParser:
text_parts.append(text)
cleaned_text = re.sub(r'\n\s*\n', '\n\n', '\n'.join(text_parts))
return cleaned_text.strip()
return await self._run_sync(_parse_markdown_sync)
async def _parse_html(self, file_path: str) -> str:
"""Parses an HTML file, extracting structured plain text."""
self.logger.info(f"Parsing HTML file: {file_path}")
self.logger.info(f'Parsing HTML file: {file_path}')
def _parse_html_sync():
html_content = self._read_file_content(file_path, mode='r') # Sync call within sync function
html_content = self._read_file_content(file_path, mode='r') # Sync call within sync function
soup = BeautifulSoup(html_content, 'html.parser')
for script_or_style in soup(["script", "style"]):
for script_or_style in soup(['script', 'style']):
script_or_style.decompose()
text_parts = []
for element in soup.body.children if soup.body else soup.children:
@@ -207,9 +222,9 @@ class FileParser:
for li in element.find_all('li'):
text = li.get_text().strip()
if text:
text_parts.append(f"* {text}")
text_parts.append(f'* {text}')
elif element.name == 'table':
table_str = self._extract_table_to_markdown_sync(element) # Call sync helper
table_str = self._extract_table_to_markdown_sync(element) # Call sync helper
if table_str:
text_parts.append(table_str)
elif element.name:
@@ -218,39 +233,42 @@ class FileParser:
text_parts.append(text)
cleaned_text = re.sub(r'\n\s*\n', '\n\n', '\n'.join(text_parts))
return cleaned_text.strip()
return await self._run_sync(_parse_html_sync)
async def _parse_epub(self, file_path: str) -> str:
    """Parse an EPUB file, extracting metadata, table of contents and content.

    Args:
        file_path (str): Path to the .epub file.

    Returns:
        str: Metadata lines (title, author, publish date when present), an
        optional table-of-contents section, then the text of every document
        item, joined with newlines and with runs of blank lines collapsed.
    """
    self.logger.info(f'Parsing EPUB file: {file_path}')

    def _parse_epub_sync():
        book = epub.read_epub(file_path)
        text_content = []

        # Dublin Core metadata: emit one line per field that is present, using
        # the first element of the first returned entry.
        for label, field in (('Title', 'title'), ('Author', 'creator'), ('Publish Date', 'date')):
            meta = book.get_metadata('DC', field)
            if meta:
                text_content.append(f'{label}: {meta[0][0]}')

        # NOTE(review): confirm the installed ebooklib exposes ``get_toc()``;
        # the call is kept exactly as the original wrote it.
        toc = book.get_toc()
        if toc:
            # The table of contents is emitted exactly once.
            text_content.append('\n--- Table of Contents ---')
            self._add_toc_items_sync(toc, text_content, level=0)  # Call sync helper
            text_content.append('--- End of Table of Contents ---\n')

        for item in book.get_items():
            if item.get_type() != ebooklib.ITEM_DOCUMENT:
                continue
            html_content = item.get_content().decode('utf-8', errors='ignore')
            soup = BeautifulSoup(html_content, 'html.parser')
            # Drop non-content elements before extracting text.
            for junk in soup(['script', 'style', 'nav', 'header', 'footer']):
                junk.decompose()
            text = soup.get_text(separator='\n', strip=True)
            text = re.sub(r'\n\s*\n', '\n\n', text)
            if text:
                text_content.append(text)

        # Collapse blank-line runs again after joining all sections.
        return re.sub(r'\n\s*\n', '\n\n', '\n'.join(text_content)).strip()

    return await self._run_sync(_parse_epub_sync)
def _add_toc_items_sync(self, toc_list: list, text_content: list, level: int):
@@ -259,10 +277,10 @@ class FileParser:
for item in toc_list:
if isinstance(item, tuple):
chapter, subchapters = item
text_content.append(f"{indent}- {chapter.title}")
text_content.append(f'{indent}- {chapter.title}')
self._add_toc_items_sync(subchapters, text_content, level + 1)
else:
text_content.append(f"{indent}- {item.title}")
text_content.append(f'{indent}- {item.title}')
def _extract_table_to_markdown_sync(self, table_element: BeautifulSoup) -> str:
"""Helper to convert a BeautifulSoup table element into a Markdown table string (synchronous)."""
@@ -272,17 +290,17 @@ class FileParser:
cells = [td.get_text().strip() for td in tr.find_all('td')]
if cells:
rows.append(cells)
if not headers and not rows:
return ""
return ''
table_lines = []
if headers:
table_lines.append(' | '.join(headers))
table_lines.append(' | '.join(['---'] * len(headers)))
for row_cells in rows:
padded_cells = row_cells + [''] * (len(headers) - len(row_cells)) if headers else row_cells
table_lines.append(' | '.join(padded_cells))
return '\n'.join(table_lines)
return '\n'.join(table_lines)