fix: embedding and chunking

This commit is contained in:
WangCham
2025-07-12 01:07:49 +08:00
parent fe122281fd
commit f395cac893
5 changed files with 64 additions and 71 deletions

View File

@@ -24,33 +24,28 @@ class Chunker(BaseService):
"""
if not text:
return []
# Simple whitespace-based splitting for demonstration
# For more advanced chunking, consider libraries like LangChain's text splitters
words = text.split()
chunks = []
current_chunk = []
# words = text.split()
# chunks = []
# current_chunk = []
for word in words:
current_chunk.append(word)
if len(current_chunk) > self.chunk_size:
chunks.append(" ".join(current_chunk[:self.chunk_size]))
current_chunk = current_chunk[self.chunk_size - self.chunk_overlap:]
# for word in words:
# current_chunk.append(word)
# if len(current_chunk) > self.chunk_size:
# chunks.append(" ".join(current_chunk[:self.chunk_size]))
# current_chunk = current_chunk[self.chunk_size - self.chunk_overlap:]
if current_chunk:
chunks.append(" ".join(current_chunk))
# if current_chunk:
# chunks.append(" ".join(current_chunk))
# A more robust chunking strategy (e.g., using recursive character text splitter)
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# text_splitter = RecursiveCharacterTextSplitter(
# chunk_size=self.chunk_size,
# chunk_overlap=self.chunk_overlap,
# length_function=len,
# is_separator_regex=False,
# )
# return text_splitter.split_text(text)
return [chunk for chunk in chunks if chunk.strip()] # Filter out empty chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
length_function=len,
is_separator_regex=False,
)
return text_splitter.split_text(text)
async def chunk(self, text: str) -> List[str]:
"""

View File

@@ -12,7 +12,7 @@ from pkg.rag.knowledge.services.chroma_manager import ChromaIndexManager # Impor
logger = logging.getLogger(__name__)
class Embedder(BaseService):
def __init__(self, model_type: str, model_name_key: str, chroma_manager: ChromaIndexManager):
def __init__(self, model_type: str, model_name_key: str, chroma_manager: ChromaIndexManager = None):
super().__init__()
self.logger = logging.getLogger(self.__class__.__name__)
self.model_type = model_type