mirror of
https://github.com/langbot-app/LangBot.git
synced 2026-06-07 22:36:02 +00:00
* Initial plan * Add package structure and resource path utilities - Created langbot/ package with __init__.py and __main__.py entry point - Added paths utility to find frontend and resource files from package installation - Updated config loading to use resource paths - Updated frontend serving to use resource paths - Added MANIFEST.in for package data inclusion - Updated pyproject.toml with build system and entry points Co-authored-by: RockChinQ <45992437+RockChinQ@users.noreply.github.com> * Add PyPI publishing workflow and update license - Created GitHub Actions workflow to build frontend and publish to PyPI - Added license field to pyproject.toml to fix deprecation warning - Updated .gitignore to exclude build artifacts - Tested package building successfully Co-authored-by: RockChinQ <45992437+RockChinQ@users.noreply.github.com> * Add PyPI installation documentation - Created PYPI_INSTALLATION.md with detailed installation and usage instructions - Updated README.md to feature uvx/pip installation as recommended method - Updated README_EN.md with same changes for English documentation Co-authored-by: RockChinQ <45992437+RockChinQ@users.noreply.github.com> * Address code review feedback - Made package-data configuration more specific to langbot package only - Improved path detection with caching to avoid repeated file I/O - Removed sys.path searching which was incorrect for package data - Removed interactive input() call for non-interactive environment compatibility - Simplified error messages for version check Co-authored-by: RockChinQ <45992437+RockChinQ@users.noreply.github.com> * Fix code review issues - Use specific exception types instead of bare except - Fix misleading comments about directory levels - Remove redundant existence check before makedirs with exist_ok=True - Use context manager for file opening to ensure proper cleanup Co-authored-by: RockChinQ <45992437+RockChinQ@users.noreply.github.com> * Simplify package configuration and document 
behavioral differences - Removed redundant package-data configuration, relying on MANIFEST.in - Added documentation about behavioral differences between package and source installation - Clarified that include-package-data=true uses MANIFEST.in for data files Co-authored-by: RockChinQ <45992437+RockChinQ@users.noreply.github.com> * chore: update pyproject.toml * chore: try pack templates in langbot/ * chore: update * chore: update * chore: update * chore: update * chore: update * chore: adjust dir structure * chore: fix imports * fix: read default-pipeline-config.json * fix: read default-pipeline-config.json * fix: tests * ci: publish pypi * chore: bump version 4.6.0-beta.1 for testing * chore: add templates/** * fix: send adapters and requesters icons * chore: bump version 4.6.0b2 for testing * chore: add platform field for docker-compose.yaml --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: RockChinQ <45992437+RockChinQ@users.noreply.github.com> Co-authored-by: Junyan Qin <rockchinq@gmail.com>
50 lines
1.8 KiB
Python
50 lines
1.8 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from typing import List
|
|
from langbot.pkg.rag.knowledge.services import base_service
|
|
from langbot.pkg.core import app
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
|
|
|
|
class Chunker(base_service.BaseService):
    """Split long texts into smaller, overlapping chunks.

    The actual splitting is delegated to ``RecursiveCharacterTextSplitter``.
    ``chunk`` is the awaitable entry point; it hands the blocking work to
    ``self._run_sync`` (not defined in this class — presumably inherited from
    ``BaseService``; verify there).
    """

    def __init__(self, ap: app.Application, chunk_size: int = 500, chunk_overlap: int = 50):
        """Store the splitter configuration.

        Args:
            ap: Application object; only its ``logger`` is used by this class.
            chunk_size: Maximum chunk length, measured with ``len``.
            chunk_overlap: Overlap between consecutive chunks, measured with
                ``len``.
        """
        self.ap = ap
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        if self.chunk_overlap >= self.chunk_size:
            # Only a warning is emitted here; the configuration is kept as given.
            # NOTE(review): langchain's text splitter is believed to raise
            # ValueError when chunk_overlap > chunk_size, in which case every
            # later chunk() call would fail — confirm against the installed
            # langchain_text_splitters version.
            self.ap.logger.warning(
                'Chunk overlap is greater than or equal to chunk size. This may lead to empty or malformed chunks.'
            )

    def _split_text_sync(self, text: str) -> List[str]:
        """Synchronously split ``text`` into chunks with the configured overlap.

        This is a CPU-bound operation, intended to be run in a separate thread.

        Args:
            text: The text to split. Any falsy value (``''`` or ``None``)
                short-circuits to an empty result.

        Returns:
            The list of chunks, or ``[]`` for falsy input.
        """
        if not text:
            return []

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        return text_splitter.split_text(text)

    async def chunk(self, text: str) -> List[str]:
        """Asynchronously chunk ``text`` into smaller pieces.

        Args:
            text: The text to split. Falsy input (``''`` or ``None``) yields
                an empty list instead of raising.

        Returns:
            The list of chunks produced by ``_split_text_sync``.
        """
        # Local import keeps this block self-contained; ``logging`` is only
        # needed for the DEBUG level constant below.
        import logging

        # ``len(None)`` would raise before the falsy-input guard in
        # ``_split_text_sync`` is ever reached, so compute the length defensively.
        text_length = len(text) if text else 0
        self.ap.logger.info(f'Chunking text (length: {text_length})...')

        # Run the synchronous splitting logic in a separate thread
        chunks = await self._run_sync(self._split_text_sync, text)
        self.ap.logger.info(f'Text chunked into {len(chunks)} pieces.')

        # Pretty-printing every chunk is expensive for large documents; skip the
        # serialization entirely unless DEBUG output will actually be emitted.
        # NOTE(review): assumes ``ap.logger`` is a standard ``logging.Logger`` — confirm.
        if self.ap.logger.isEnabledFor(logging.DEBUG):
            self.ap.logger.debug(f'Chunks: {json.dumps(chunks, indent=4, ensure_ascii=False)}')
        return chunks
|