chore: stash

This commit is contained in:
Junyan Qin
2025-07-15 22:09:10 +08:00
parent 199164fc4b
commit 67bc065ccd
15 changed files with 508 additions and 338 deletions

View File

@@ -1,43 +1,43 @@
import numpy as np
import logging
from chromadb import PersistentClient
import os
from pkg.core import app
logger = logging.getLogger(__name__)
class ChromaIndexManager:
def __init__(self, collection_name: str = "default_collection"):
self.logger = logging.getLogger(self.__class__.__name__)
chroma_data_path = os.path.abspath(os.path.join(__file__, "../../../../../../data/chroma"))
os.makedirs(chroma_data_path, exist_ok=True)
def __init__(self, ap: app.Application, collection_name: str = 'default_collection'):
self.ap = ap
chroma_data_path = './data/chroma'
self.client = PersistentClient(path=chroma_data_path)
self._collection_name = collection_name
self._collection = None
self.logger.info(f"ChromaIndexManager initialized. Collection name: {self._collection_name}")
self.ap.logger.info(f'ChromaIndexManager initialized. Collection name: {self._collection_name}')
@property
def collection(self):
if self._collection is None:
self._collection = self.client.get_or_create_collection(name=self._collection_name)
self.logger.info(f"Chroma collection '{self._collection_name}' accessed/created.")
self.ap.logger.info(f"Chroma collection '{self._collection_name}' accessed/created.")
return self._collection
def add_embeddings_sync(self, file_ids: list[int], chunk_ids: list[int], embeddings: np.ndarray, documents: list[str]):
if embeddings.shape[0] != len(chunk_ids) or embeddings.shape[0] != len(file_ids) or embeddings.shape[0] != len(documents):
raise ValueError("Embedding, file_id, chunk_id, and document count mismatch.")
def add_embeddings_sync(
self, file_ids: list[int], chunk_ids: list[int], embeddings: np.ndarray, documents: list[str]
):
if (
embeddings.shape[0] != len(chunk_ids)
or embeddings.shape[0] != len(file_ids)
or embeddings.shape[0] != len(documents)
):
raise ValueError('Embedding, file_id, chunk_id, and document count mismatch.')
chroma_ids = [f"{file_id}_{chunk_id}" for file_id, chunk_id in zip(file_ids, chunk_ids)]
metadatas = [{"file_id": fid, "chunk_id": cid} for fid, cid in zip(file_ids, chunk_ids)]
chroma_ids = [f'{file_id}_{chunk_id}' for file_id, chunk_id in zip(file_ids, chunk_ids)]
metadatas = [{'file_id': fid, 'chunk_id': cid} for fid, cid in zip(file_ids, chunk_ids)]
self.logger.debug(f"Adding {len(embeddings)} embeddings to Chroma collection '{self._collection_name}'.")
self.collection.add(
embeddings=embeddings.tolist(),
ids=chroma_ids,
metadatas=metadatas,
documents=documents
)
self.collection.add(embeddings=embeddings.tolist(), ids=chroma_ids, metadatas=metadatas, documents=documents)
self.logger.info(f"Added {len(embeddings)} embeddings to Chroma collection '{self._collection_name}'.")
def search_sync(self, query_embedding: np.ndarray, k: int = 5):
@@ -54,12 +54,14 @@ class ChromaIndexManager:
query_embeddings=query_embedding.tolist(),
n_results=k,
# REMOVE 'ids' from the include list. It's returned by default.
include=["metadatas", "distances", "documents"]
include=['metadatas', 'distances', 'documents'],
)
self.logger.debug(f"Chroma search returned {len(results.get('ids', [[]])[0])} results.")
self.logger.debug(f'Chroma search returned {len(results.get("ids", [[]])[0])} results.')
return results
def delete_by_file_id_sync(self, file_id: int):
self.logger.info(f"Deleting embeddings for file_id: {file_id} from Chroma collection '{self._collection_name}'.")
self.collection.delete(where={"file_id": file_id})
self.logger.info(f"Deleted embeddings for file_id: {file_id} from Chroma.")
self.logger.info(
f"Deleting embeddings for file_id: {file_id} from Chroma collection '{self._collection_name}'."
)
self.collection.delete(where={'file_id': file_id})
self.logger.info(f'Deleted embeddings for file_id: {file_id} from Chroma.')

View File

@@ -1,21 +1,26 @@
# services/chunker.py
import logging
from typing import List
from pkg.rag.knowledge.services.base_service import BaseService # Assuming BaseService provides _run_sync
from pkg.rag.knowledge.services.base_service import BaseService # Assuming BaseService provides _run_sync
from pkg.core import app
logger = logging.getLogger(__name__)
class Chunker(BaseService):
"""
A class for splitting long texts into smaller, overlapping chunks.
"""
def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
super().__init__() # Initialize BaseService
self.logger = logging.getLogger(self.__class__.__name__)
def __init__(self, ap: app.Application, chunk_size: int = 500, chunk_overlap: int = 50):
super().__init__(ap) # Initialize BaseService
self.ap = ap
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap
if self.chunk_overlap >= self.chunk_size:
self.logger.warning("Chunk overlap is greater than or equal to chunk size. This may lead to empty or malformed chunks.")
self.logger.warning(
'Chunk overlap is greater than or equal to chunk size. This may lead to empty or malformed chunks.'
)
def _split_text_sync(self, text: str) -> List[str]:
"""
@@ -27,18 +32,19 @@ class Chunker(BaseService):
# words = text.split()
# chunks = []
# current_chunk = []
# for word in words:
# current_chunk.append(word)
# if len(current_chunk) > self.chunk_size:
# chunks.append(" ".join(current_chunk[:self.chunk_size]))
# current_chunk = current_chunk[self.chunk_size - self.chunk_overlap:]
# if current_chunk:
# chunks.append(" ".join(current_chunk))
# A more robust chunking strategy (e.g., using recursive character text splitter)
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
@@ -51,8 +57,8 @@ class Chunker(BaseService):
"""
Asynchronously chunks a given text into smaller pieces.
"""
self.logger.info(f"Chunking text (length: {len(text)})...")
self.ap.logger.info(f'Chunking text (length: {len(text)})...')
# Run the synchronous splitting logic in a separate thread
chunks = await self._run_sync(self._split_text_sync, text)
self.logger.info(f"Text chunked into {len(chunks)} pieces.")
return chunks
self.ap.logger.info(f'Text chunked into {len(chunks)} pieces.')
return chunks

View File

@@ -7,16 +7,10 @@ from sqlalchemy.orm import Session
from pkg.rag.knowledge.services.base_service import BaseService
from pkg.rag.knowledge.services.database import Chunk, SessionLocal
from pkg.rag.knowledge.services.chroma_manager import ChromaIndexManager
from sqlalchemy.orm import declarative_base, sessionmaker
from ....core import app
from ....entity.persistence import model as persistence_model
import sqlalchemy
from ....provider.modelmgr.requester import RuntimeEmbeddingModel
base = declarative_base()
logger = logging.getLogger(__name__)
class Embedder(BaseService):
def __init__(self, ap: app.Application, chroma_manager: ChromaIndexManager = None) -> None:
super().__init__()
@@ -30,61 +24,66 @@ class Embedder(BaseService):
This function assumes it's called within a context where the session
will be committed/rolled back and closed by the caller.
"""
self.logger.debug(f"Saving {len(chunks_texts)} chunks for file_id {file_id} to DB (sync).")
self.logger.debug(f'Saving {len(chunks_texts)} chunks for file_id {file_id} to DB (sync).')
chunk_objects = []
for text in chunks_texts:
chunk = Chunk(file_id=file_id, text=text)
session.add(chunk)
chunk_objects.append(chunk)
session.flush() # This populates the .id attribute for each new chunk object
self.logger.debug(f"Successfully added {len(chunk_objects)} chunk entries to DB.")
session.flush() # This populates the .id attribute for each new chunk object
self.logger.debug(f'Successfully added {len(chunk_objects)} chunk entries to DB.')
return chunk_objects
async def embed_and_store(self, file_id: int, chunks: List[str], embedding_model: RuntimeEmbeddingModel) -> List[Chunk]:
async def embed_and_store(
self, file_id: int, chunks: List[str], embedding_model: RuntimeEmbeddingModel
) -> List[Chunk]:
if not embedding_model:
raise RuntimeError("Embedding model not loaded. Please check Embedder initialization.")
raise RuntimeError('Embedding model not loaded. Please check Embedder initialization.')
session = SessionLocal() # Start a session that will live for the whole operation
session = SessionLocal() # Start a session that will live for the whole operation
chunk_objects = []
try:
# 1. Save chunks to the relational database first to get their IDs
# We call _db_save_chunks_sync directly without _run_sync's session management
# because we manage the session here across multiple async calls.
chunk_objects = await asyncio.to_thread(self._db_save_chunks_sync, session, file_id, chunks)
session.commit() # Commit chunks to make their IDs permanent and accessible
session.commit() # Commit chunks to make their IDs permanent and accessible
if not chunk_objects:
self.logger.warning(f"No chunk objects created for file_id {file_id}. Skipping embedding and Chroma storage.")
self.logger.warning(
f'No chunk objects created for file_id {file_id}. Skipping embedding and Chroma storage.'
)
return []
# get the embeddings for the chunks
embeddings = []
i = 0
while i <len(chunks):
chunk = chunks[i]
embeddings: list[list[float]] = []
for chunk in chunks:
result = await embedding_model.requester.invoke_embedding(
model=embedding_model,
input_text=chunk,
)
embeddings.append(result)
i += 1
embeddings_np = np.array(embeddings, dtype=np.float32)
self.logger.info("Saving embeddings to Chroma...")
chunk_ids = [c.id for c in chunk_objects]
self.logger.info('Saving embeddings to Chroma...')
chunk_ids = [c.id for c in chunk_objects]
file_ids_for_chroma = [file_id] * len(chunk_ids)
await self._run_sync( # Use _run_sync for the Chroma operation, as it's a sync call
await self._run_sync( # Use _run_sync for the Chroma operation, as it's a sync call
self.chroma_manager.add_embeddings_sync,
file_ids_for_chroma, chunk_ids, embeddings_np, chunks # Pass original chunks texts for documents
file_ids_for_chroma,
chunk_ids,
embeddings_np,
chunks, # Pass original chunks texts for documents
)
self.logger.info(f"Successfully saved {len(chunk_objects)} embeddings to Chroma.")
self.logger.info(f'Successfully saved {len(chunk_objects)} embeddings to Chroma.')
return chunk_objects
except Exception as e:
session.rollback() # Rollback on any error
self.logger.error(f"Failed to process and store data for file_id {file_id}: {e}", exc_info=True)
raise # Re-raise the exception to propagate it
session.rollback() # Rollback on any error
self.logger.error(f'Failed to process and store data for file_id {file_id}: {e}', exc_info=True)
raise # Re-raise the exception to propagate it
finally:
session.close() # Ensure the session is always closed
session.close() # Ensure the session is always closed

View File

@@ -1,4 +1,5 @@
import PyPDF2
import io
from docx import Document
import pandas as pd
import chardet
@@ -10,7 +11,7 @@ import ebooklib
from ebooklib import epub
import re
import asyncio # Import asyncio for async operations
import os
from pkg.core import app
# Configure logging
logger = logging.getLogger(__name__)
@@ -24,8 +25,8 @@ class FileParser:
to avoid blocking the asyncio event loop.
"""
def __init__(self):
self.logger = logging.getLogger(self.__class__.__name__)
def __init__(self, ap: app.Application):
self.ap = ap
async def _run_sync(self, sync_func: Callable, *args: Any, **kwargs: Any) -> Any:
"""
@@ -35,138 +36,160 @@ class FileParser:
try:
return await asyncio.to_thread(sync_func, *args, **kwargs)
except Exception as e:
self.logger.error(f'Error running synchronous function {sync_func.__name__}: {e}')
self.ap.logger.error(f'Error running synchronous function {sync_func.__name__}: {e}')
raise
async def parse(self, file_path: str) -> Union[str, None]:
async def parse(self, file_name: str, extension: str) -> Union[str, None]:
"""
Parses the file based on its extension and returns the extracted text content.
This is the main asynchronous entry point for parsing.
Args:
file_path (str): The path to the file to be parsed.
file_name (str): The name of the file to be parsed, get from ap.storage_mgr
Returns:
Union[str, None]: The extracted text content as a single string, or None if parsing fails.
"""
if not file_path or not os.path.exists(file_path):
self.logger.error(f'Invalid file path provided: {file_path}')
return None
file_extension = file_path.split('.')[-1].lower()
file_extension = extension.lower()
parser_method = getattr(self, f'_parse_{file_extension}', None)
if parser_method is None:
self.logger.error(f'Unsupported file format: {file_extension} for file {file_path}')
self.ap.logger.error(f'Unsupported file format: {file_extension} for file {file_name}')
return None
try:
# Pass file_path to the specific parser methods
return await parser_method(file_path)
return await parser_method(file_name)
except Exception as e:
self.logger.error(f'Failed to parse {file_extension} file {file_path}: {e}')
self.ap.logger.error(f'Failed to parse {file_extension} file {file_name}: {e}')
return None
# --- Helper for reading files with encoding detection ---
async def _read_file_content(self, file_path: str, mode: str = 'r') -> Union[str, bytes]:
async def _read_file_content(self, file_name: str) -> Union[str, bytes]:
"""
Reads a file with automatic encoding detection, ensuring the synchronous
file read operation runs in a separate thread.
"""
def _read_sync():
with open(file_path, 'rb') as file:
raw_data = file.read()
detected = chardet.detect(raw_data)
encoding = detected['encoding'] or 'utf-8'
# def _read_sync():
# with open(file_path, 'rb') as file:
# raw_data = file.read()
# detected = chardet.detect(raw_data)
# encoding = detected['encoding'] or 'utf-8'
if mode == 'r':
return raw_data.decode(encoding, errors='ignore')
return raw_data # For binary mode
# if mode == 'r':
# return raw_data.decode(encoding, errors='ignore')
# return raw_data # For binary mode
return await self._run_sync(_read_sync)
# return await self._run_sync(_read_sync)
file_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
detected = chardet.detect(file_bytes)
encoding = detected['encoding'] or 'utf-8'
return file_bytes.decode(encoding, errors='ignore')
# --- Specific Parser Methods ---
async def _parse_txt(self, file_path: str) -> str:
async def _parse_txt(self, file_name: str) -> str:
"""Parses a TXT file and returns its content."""
self.logger.info(f'Parsing TXT file: {file_path}')
return await self._read_file_content(file_path, mode='r')
self.ap.logger.info(f'Parsing TXT file: {file_name}')
return await self._read_file_content(file_name)
async def _parse_pdf(self, file_path: str) -> str:
async def _parse_pdf(self, file_name: str) -> str:
"""Parses a PDF file and returns its text content."""
self.logger.info(f'Parsing PDF file: {file_path}')
self.ap.logger.info(f'Parsing PDF file: {file_name}')
# def _parse_pdf_sync():
# text_content = []
# with open(file_name, 'rb') as file:
# pdf_reader = PyPDF2.PdfReader(file)
# for page in pdf_reader.pages:
# text = page.extract_text()
# if text:
# text_content.append(text)
# return '\n'.join(text_content)
# return await self._run_sync(_parse_pdf_sync)
pdf_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
def _parse_pdf_sync():
pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
text_content = []
with open(file_path, 'rb') as file:
pdf_reader = PyPDF2.PdfReader(file)
for page in pdf_reader.pages:
text = page.extract_text()
if text:
text_content.append(text)
for page in pdf_reader.pages:
text = page.extract_text()
if text:
text_content.append(text)
return '\n'.join(text_content)
return await self._run_sync(_parse_pdf_sync)
async def _parse_docx(self, file_path: str) -> str:
async def _parse_docx(self, file_name: str) -> str:
"""Parses a DOCX file and returns its text content."""
self.logger.info(f'Parsing DOCX file: {file_path}')
self.ap.logger.info(f'Parsing DOCX file: {file_name}')
docx_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
def _parse_docx_sync():
doc = Document(file_path)
doc = Document(io.BytesIO(docx_bytes))
text_content = [paragraph.text for paragraph in doc.paragraphs if paragraph.text.strip()]
return '\n'.join(text_content)
return await self._run_sync(_parse_docx_sync)
async def _parse_doc(self, file_path: str) -> str:
async def _parse_doc(self, file_name: str) -> str:
"""Handles .doc files, explicitly stating lack of direct support."""
self.logger.warning(f'Direct .doc parsing is not supported for {file_path}. Please convert to .docx first.')
self.ap.logger.warning(f'Direct .doc parsing is not supported for {file_name}. Please convert to .docx first.')
raise NotImplementedError('Direct .doc parsing not supported. Please convert to .docx first.')
async def _parse_xlsx(self, file_path: str) -> str:
async def _parse_xlsx(self, file_name: str) -> str:
"""Parses an XLSX file, returning text from all sheets."""
self.logger.info(f'Parsing XLSX file: {file_path}')
self.ap.logger.info(f'Parsing XLSX file: {file_name}')
xlsx_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
def _parse_xlsx_sync():
excel_file = pd.ExcelFile(file_path)
excel_file = pd.ExcelFile(io.BytesIO(xlsx_bytes))
all_sheet_content = []
for sheet_name in excel_file.sheet_names:
df = pd.read_excel(file_path, sheet_name=sheet_name)
df = pd.read_excel(io.BytesIO(xlsx_bytes), sheet_name=sheet_name)
sheet_text = f'--- Sheet: {sheet_name} ---\n{df.to_string(index=False)}\n'
all_sheet_content.append(sheet_text)
return '\n'.join(all_sheet_content)
return await self._run_sync(_parse_xlsx_sync)
async def _parse_csv(self, file_path: str) -> str:
async def _parse_csv(self, file_name: str) -> str:
"""Parses a CSV file and returns its content as a string."""
self.logger.info(f'Parsing CSV file: {file_path}')
self.ap.logger.info(f'Parsing CSV file: {file_name}')
csv_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
def _parse_csv_sync():
# pd.read_csv can often detect encoding, but explicit detection is safer
raw_data = self._read_file_content(
file_path, mode='rb'
) # Note: this will need to be await outside this sync function
_ = raw_data
# raw_data = self._read_file_content(
# file_name, mode='rb'
# ) # Note: this will need to be await outside this sync function
# _ = raw_data
# For simplicity, we'll let pandas handle encoding internally after a raw read.
# A more robust solution might pass encoding directly to pd.read_csv after detection.
detected = chardet.detect(open(file_path, 'rb').read())
detected = chardet.detect(io.BytesIO(csv_bytes))
encoding = detected['encoding'] or 'utf-8'
df = pd.read_csv(file_path, encoding=encoding)
df = pd.read_csv(io.BytesIO(csv_bytes), encoding=encoding)
return df.to_string(index=False)
return await self._run_sync(_parse_csv_sync)
async def _parse_markdown(self, file_path: str) -> str:
async def _parse_markdown(self, file_name: str) -> str:
"""Parses a Markdown file, converting it to structured plain text."""
self.logger.info(f'Parsing Markdown file: {file_path}')
self.ap.logger.info(f'Parsing Markdown file: {file_name}')
md_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
def _parse_markdown_sync():
md_content = self._read_file_content(
file_path, mode='r'
) # This is a synchronous call within a sync function
md_content = io.BytesIO(md_bytes).read().decode('utf-8', errors='ignore')
html_content = markdown.markdown(
md_content, extensions=['extra', 'codehilite', 'tables', 'toc', 'fenced_code']
)
@@ -200,12 +223,14 @@ class FileParser:
return await self._run_sync(_parse_markdown_sync)
async def _parse_html(self, file_path: str) -> str:
async def _parse_html(self, file_name: str) -> str:
"""Parses an HTML file, extracting structured plain text."""
self.logger.info(f'Parsing HTML file: {file_path}')
self.ap.logger.info(f'Parsing HTML file: {file_name}')
html_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
def _parse_html_sync():
html_content = self._read_file_content(file_path, mode='r') # Sync call within sync function
html_content = io.BytesIO(html_bytes).read().decode('utf-8', errors='ignore')
soup = BeautifulSoup(html_content, 'html.parser')
for script_or_style in soup(['script', 'style']):
script_or_style.decompose()
@@ -236,12 +261,14 @@ class FileParser:
return await self._run_sync(_parse_html_sync)
async def _parse_epub(self, file_path: str) -> str:
async def _parse_epub(self, file_name: str) -> str:
"""Parses an EPUB file, extracting metadata and content."""
self.logger.info(f'Parsing EPUB file: {file_path}')
self.ap.logger.info(f'Parsing EPUB file: {file_name}')
epub_bytes = await self.ap.storage_mgr.storage_provider.load(file_name)
def _parse_epub_sync():
book = epub.read_epub(file_path)
book = epub.read_epub(io.BytesIO(epub_bytes))
text_content = []
title_meta = book.get_metadata('DC', 'title')
if title_meta: