mirror of
https://github.com/langbot-app/LangBot.git
synced 2026-06-09 15:26:03 +00:00
feat: external knowledge bases (#1783)
* Initial plan * Add backend support for external knowledge bases Co-authored-by: RockChinQ <45992437+RockChinQ@users.noreply.github.com> * Add frontend support for external knowledge bases with tabs UI Co-authored-by: RockChinQ <45992437+RockChinQ@users.noreply.github.com> * Add i18n translations for all languages (Traditional Chinese and Japanese) Co-authored-by: RockChinQ <45992437+RockChinQ@users.noreply.github.com> * Update knowledge base tab list styling to match plugins page Co-authored-by: RockChinQ <45992437+RockChinQ@users.noreply.github.com> * perf: margin-top for kb page * refactor: switch RetrievalResultEntry to langbot_plugin pkg ones * feat: knowledge retriever listing and creating * stash * refactor: unify sync mechanism for polymorphic components * feat: use unified retrieval result struct in retrieval test page * chore: remove unused methods * feat: retriever icon displaying * feat: localagent retrieval with external kbs * chore: bump version of langbot-plugin to 0.2.0b1 * fix: i18n --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: RockChinQ <45992437+RockChinQ@users.noreply.github.com> Co-authored-by: Junyan Qin <rockchinq@gmail.com>
This commit is contained in:
55
src/langbot/pkg/rag/knowledge/base.py
Normal file
55
src/langbot/pkg/rag/knowledge/base.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""Base classes and interfaces for knowledge bases"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import abc
|
||||
|
||||
from langbot.pkg.core import app
|
||||
from langbot_plugin.api.entities.builtin.rag import context as rag_context
|
||||
|
||||
|
||||
class KnowledgeBaseInterface(metaclass=abc.ABCMeta):
    """Common contract that every knowledge base type implements.

    All methods below except ``__init__`` are abstract, so a subclass must
    override each of them before it can be instantiated.
    """

    # Application object handed to the constructor.
    ap: app.Application

    def __init__(self, ap: app.Application):
        self.ap = ap

    @abc.abstractmethod
    async def initialize(self):
        """Prepare the knowledge base for use."""

    @abc.abstractmethod
    async def retrieve(self, query: str, top_k: int) -> list[rag_context.RetrievalResultEntry]:
        """Look up entries in the knowledge base that are relevant to a query.

        Args:
            query: The query string
            top_k: Number of top results to return

        Returns:
            List of retrieval result entries
        """

    @abc.abstractmethod
    def get_uuid(self) -> str:
        """Return the UUID identifying this knowledge base."""

    @abc.abstractmethod
    def get_name(self) -> str:
        """Return the name of this knowledge base."""

    @abc.abstractmethod
    def get_type(self) -> str:
        """Return the knowledge base type (internal/external)."""

    @abc.abstractmethod
    async def dispose(self):
        """Release any resources held by the knowledge base."""
|
||||
85
src/langbot/pkg/rag/knowledge/external.py
Normal file
85
src/langbot/pkg/rag/knowledge/external.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""External knowledge base implementation"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from langbot.pkg.core import app
|
||||
from langbot.pkg.entity.persistence import rag as persistence_rag
|
||||
from langbot_plugin.api.entities.builtin.rag import context as rag_context
|
||||
from .base import KnowledgeBaseInterface
|
||||
|
||||
|
||||
class ExternalKnowledgeBase(KnowledgeBaseInterface):
    """External knowledge base whose retrieval is delegated to a plugin retriever.

    Queries are forwarded through ``ap.plugin_connector.retrieve_knowledge``
    using the plugin author, plugin name and retriever name stored on the
    persisted entity.
    """

    # Persisted entity describing this external knowledge base.
    external_kb_entity: persistence_rag.ExternalKnowledgeBase

    # Plugin retriever instance ID; stays None until initialize() has run.
    retriever_instance_id: str | None

    def __init__(self, ap: app.Application, external_kb_entity: persistence_rag.ExternalKnowledgeBase):
        super().__init__(ap)
        self.external_kb_entity = external_kb_entity
        self.retriever_instance_id = None

    async def initialize(self):
        """Initialize the external knowledge base.

        Only records the instance ID; no plugin call is made here.
        """
        # Use KB UUID as instance ID.
        # Instance creation is handled by the unified sync mechanism
        # when LangBot connects to runtime.
        self.retriever_instance_id = self.external_kb_entity.uuid

        self.ap.logger.info(
            f'Initialized external KB {self.external_kb_entity.uuid}, instance will be created by sync mechanism'
        )

    async def retrieve(self, query: str, top_k: int = 5) -> list[rag_context.RetrievalResultEntry]:
        """Retrieve documents from the external knowledge base via the plugin retriever.

        Args:
            query: The query string
            top_k: Number of top results requested by the caller

        Returns:
            The plugin results converted to ``RetrievalResultEntry`` objects,
            or an empty list when no retriever instance is set or the plugin
            call / conversion raises.
        """
        # Function-scope import, as the module does not import traceback at top level.
        import traceback

        if not self.retriever_instance_id:
            self.ap.logger.error(f'No retriever instance for KB {self.external_kb_entity.uuid}')
            return []

        try:
            # NOTE(review): top_k is not forwarded; only the query is sent to the
            # plugin. Confirm whether the retriever context accepts a result limit.
            results = await self.ap.plugin_connector.retrieve_knowledge(
                self.external_kb_entity.plugin_author,
                self.external_kb_entity.plugin_name,
                self.external_kb_entity.retriever_name,
                self.retriever_instance_id,
                {'query': query},
            )

            # Convert plugin results to RetrievalResultEntry
            return [rag_context.RetrievalResultEntry(**result) for result in results]
        except Exception as e:
            # Best-effort retrieval: report the failure (with traceback) through
            # the application logger instead of stderr, then return no results.
            self.ap.logger.error(f'Plugin retriever error: {e}\n{traceback.format_exc()}')
            return []

    def get_uuid(self) -> str:
        """Get the UUID of the external knowledge base"""
        return self.external_kb_entity.uuid

    def get_name(self) -> str:
        """Get the name of the external knowledge base"""
        return self.external_kb_entity.name

    def get_type(self) -> str:
        """Get the type of knowledge base (always ``'external'``)"""
        return 'external'

    async def dispose(self):
        """Clean up resources by triggering a plugin component sync.

        A failing sync is logged and not re-raised.
        """
        # Trigger sync to immediately delete the instance from plugin process.
        # This ensures instance is cleaned up without waiting for next LangBot restart.
        try:
            await self.ap.plugin_connector.sync_polymorphic_component_instances()
            self.ap.logger.info(
                f'Disposed external KB {self.external_kb_entity.uuid}, triggered sync to delete instance'
            )
        except Exception as e:
            self.ap.logger.error(f'Failed to sync after disposing KB: {e}')
|
||||
@@ -10,10 +10,12 @@ from langbot.pkg.rag.knowledge.services.retriever import Retriever
|
||||
import sqlalchemy
|
||||
from langbot.pkg.entity.persistence import rag as persistence_rag
|
||||
from langbot.pkg.core import taskmgr
|
||||
from langbot.pkg.entity.rag import retriever as retriever_entities
|
||||
from langbot_plugin.api.entities.builtin.rag import context as rag_context
|
||||
from .base import KnowledgeBaseInterface
|
||||
from .external import ExternalKnowledgeBase
|
||||
|
||||
|
||||
class RuntimeKnowledgeBase:
|
||||
class RuntimeKnowledgeBase(KnowledgeBaseInterface):
|
||||
ap: app.Application
|
||||
|
||||
knowledge_base_entity: persistence_rag.KnowledgeBase
|
||||
@@ -27,7 +29,7 @@ class RuntimeKnowledgeBase:
|
||||
retriever: Retriever
|
||||
|
||||
def __init__(self, ap: app.Application, knowledge_base_entity: persistence_rag.KnowledgeBase):
|
||||
self.ap = ap
|
||||
super().__init__(ap)
|
||||
self.knowledge_base_entity = knowledge_base_entity
|
||||
self.parser = parser.FileParser(ap=self.ap)
|
||||
self.chunker = chunker.Chunker(ap=self.ap)
|
||||
@@ -187,7 +189,7 @@ class RuntimeKnowledgeBase:
|
||||
|
||||
return stored_file_tasks[0] if stored_file_tasks else ''
|
||||
|
||||
async def retrieve(self, query: str, top_k: int) -> list[retriever_entities.RetrieveResultEntry]:
|
||||
async def retrieve(self, query: str, top_k: int) -> list[rag_context.RetrievalResultEntry]:
|
||||
embedding_model = await self.ap.model_mgr.get_embedding_model_by_uuid(
|
||||
self.knowledge_base_entity.embedding_model_uuid
|
||||
)
|
||||
@@ -206,6 +208,18 @@ class RuntimeKnowledgeBase:
|
||||
sqlalchemy.delete(persistence_rag.File).where(persistence_rag.File.uuid == file_id)
|
||||
)
|
||||
|
||||
def get_uuid(self) -> str:
    """Return the UUID stored on the backing knowledge base entity."""
    entity = self.knowledge_base_entity
    return entity.uuid
|
||||
|
||||
def get_name(self) -> str:
    """Return the name stored on the backing knowledge base entity."""
    entity = self.knowledge_base_entity
    return entity.name
|
||||
|
||||
def get_type(self) -> str:
    """Return the type tag of this knowledge base, which is ``'internal'``."""
    kb_type = 'internal'
    return kb_type
|
||||
|
||||
async def dispose(self):
    """Delete this knowledge base's collection from the vector database.

    The collection is addressed by the knowledge base entity's UUID.
    """
    vector_db = self.ap.vector_db_mgr.vector_db
    collection_id = self.knowledge_base_entity.uuid
    await vector_db.delete_collection(collection_id)
|
||||
|
||||
@@ -213,7 +227,7 @@ class RuntimeKnowledgeBase:
|
||||
class RAGManager:
|
||||
ap: app.Application
|
||||
|
||||
knowledge_bases: list[RuntimeKnowledgeBase]
|
||||
knowledge_bases: list[KnowledgeBaseInterface]
|
||||
|
||||
def __init__(self, ap: app.Application):
|
||||
self.ap = ap
|
||||
@@ -227,8 +241,8 @@ class RAGManager:
|
||||
|
||||
self.knowledge_bases = []
|
||||
|
||||
# Load internal knowledge bases
|
||||
result = await self.ap.persistence_mgr.execute_async(sqlalchemy.select(persistence_rag.KnowledgeBase))
|
||||
|
||||
knowledge_bases = result.all()
|
||||
|
||||
for knowledge_base in knowledge_bases:
|
||||
@@ -239,6 +253,21 @@ class RAGManager:
|
||||
f'Error loading knowledge base {knowledge_base.uuid}: {e}\n{traceback.format_exc()}'
|
||||
)
|
||||
|
||||
# Load external knowledge bases
|
||||
external_result = await self.ap.persistence_mgr.execute_async(
|
||||
sqlalchemy.select(persistence_rag.ExternalKnowledgeBase)
|
||||
)
|
||||
external_kbs = external_result.all()
|
||||
|
||||
for external_kb in external_kbs:
|
||||
try:
|
||||
# Don't trigger sync during batch loading - will sync once after LangBot connects to runtime
|
||||
await self.load_external_knowledge_base(external_kb, trigger_sync=False)
|
||||
except Exception as e:
|
||||
self.ap.logger.error(
|
||||
f'Error loading external knowledge base {external_kb.uuid}: {e}\n{traceback.format_exc()}'
|
||||
)
|
||||
|
||||
async def load_knowledge_base(
|
||||
self,
|
||||
knowledge_base_entity: persistence_rag.KnowledgeBase | sqlalchemy.Row | dict,
|
||||
@@ -256,21 +285,54 @@ class RAGManager:
|
||||
|
||||
return runtime_knowledge_base
|
||||
|
||||
async def get_knowledge_base_by_uuid(self, kb_uuid: str) -> RuntimeKnowledgeBase | None:
|
||||
async def load_external_knowledge_base(
    self,
    external_kb_entity: persistence_rag.ExternalKnowledgeBase | sqlalchemy.Row | dict,
    trigger_sync: bool = True,
) -> ExternalKnowledgeBase:
    """Build a runtime external knowledge base and register it with this manager.

    Args:
        external_kb_entity: External KB to load, given as a persistence entity,
            a SQLAlchemy row, or a dict of entity fields
        trigger_sync: Whether to trigger a plugin component sync after loading
            (default True for manual creation, False for batch loading)

    Returns:
        The initialized runtime external knowledge base, already appended to
        ``self.knowledge_bases``.
    """
    # Normalise rows and plain dicts into the persistence entity.
    if isinstance(external_kb_entity, sqlalchemy.Row):
        entity = persistence_rag.ExternalKnowledgeBase(**external_kb_entity._mapping)
    elif isinstance(external_kb_entity, dict):
        entity = persistence_rag.ExternalKnowledgeBase(**external_kb_entity)
    else:
        entity = external_kb_entity

    runtime_kb = ExternalKnowledgeBase(ap=self.ap, external_kb_entity=entity)
    await runtime_kb.initialize()
    self.knowledge_bases.append(runtime_kb)

    # Batch loading from DB skips the sync to avoid multiple sync calls.
    if not trigger_sync:
        return runtime_kb

    # Manual creation: sync now so the instance is created immediately.
    # A failing sync is logged and does not prevent the KB from being returned.
    try:
        await self.ap.plugin_connector.sync_polymorphic_component_instances()
        self.ap.logger.info(f'Triggered sync after loading external KB {entity.uuid}')
    except Exception as sync_error:
        self.ap.logger.error(f'Failed to sync after loading external KB: {sync_error}')

    return runtime_kb
|
||||
|
||||
async def get_knowledge_base_by_uuid(self, kb_uuid: str) -> KnowledgeBaseInterface | None:
    """Return the loaded knowledge base whose UUID equals ``kb_uuid``.

    Args:
        kb_uuid: UUID of the knowledge base to look up

    Returns:
        The first matching entry of ``self.knowledge_bases``, or None when
        no loaded knowledge base has that UUID.
    """
    # Match through get_uuid() only: it is the accessor declared on
    # KnowledgeBaseInterface, whereas a ``knowledge_base_entity`` attribute
    # is not present on external knowledge bases.
    return next((kb for kb in self.knowledge_bases if kb.get_uuid() == kb_uuid), None)
|
||||
|
||||
async def remove_knowledge_base_from_runtime(self, kb_uuid: str):
    """Drop the first loaded knowledge base matching ``kb_uuid`` from the runtime list.

    The knowledge base's ``dispose()`` is not called here. Nothing happens
    when no loaded knowledge base has that UUID.

    Args:
        kb_uuid: UUID of the knowledge base to remove
    """
    for kb in self.knowledge_bases:
        # Match through get_uuid() only: a ``knowledge_base_entity`` attribute
        # is not present on external knowledge bases.
        if kb.get_uuid() == kb_uuid:
            # Returning right after remove() keeps mutating the list
            # during iteration safe.
            self.knowledge_bases.remove(kb)
            return
|
||||
|
||||
async def delete_knowledge_base(self, kb_uuid: str):
    """Dispose the loaded knowledge base matching ``kb_uuid`` and drop it from the runtime list.

    Only the first match is handled. Nothing happens when no loaded
    knowledge base has that UUID.

    Args:
        kb_uuid: UUID of the knowledge base to delete
    """
    for kb in self.knowledge_bases:
        # Match through get_uuid() only: a ``knowledge_base_entity`` attribute
        # is not present on external knowledge bases.
        if kb.get_uuid() == kb_uuid:
            # NOTE(review): dispose() runs while the KB is still listed in
            # self.knowledge_bases; for external KBs dispose() triggers a plugin
            # sync. Confirm the sync does not consult this list to decide which
            # instances to keep.
            await kb.dispose()
            self.knowledge_bases.remove(kb)
            return
|
||||
|
||||
@@ -3,7 +3,8 @@ from __future__ import annotations
|
||||
from . import base_service
|
||||
from ....core import app
|
||||
from ....provider.modelmgr.requester import RuntimeEmbeddingModel
|
||||
from ....entity.rag import retriever as retriever_entities
|
||||
from langbot_plugin.api.entities.builtin.rag import context as rag_context
|
||||
from langbot_plugin.api.entities.builtin.provider.message import ContentElement
|
||||
|
||||
|
||||
class Retriever(base_service.BaseService):
|
||||
@@ -13,7 +14,7 @@ class Retriever(base_service.BaseService):
|
||||
|
||||
async def retrieve(
|
||||
self, kb_id: str, query: str, embedding_model: RuntimeEmbeddingModel, k: int = 5
|
||||
) -> list[retriever_entities.RetrieveResultEntry]:
|
||||
) -> list[rag_context.RetrievalResultEntry]:
|
||||
self.ap.logger.info(
|
||||
f"Retrieving for query: '{query[:10]}' with k={k} using {embedding_model.model_entity.uuid}"
|
||||
)
|
||||
@@ -35,11 +36,12 @@ class Retriever(base_service.BaseService):
|
||||
self.ap.logger.info('No relevant chunks found in vector database.')
|
||||
return []
|
||||
|
||||
result: list[retriever_entities.RetrieveResultEntry] = []
|
||||
result: list[rag_context.RetrievalResultEntry] = []
|
||||
|
||||
for i, id in enumerate(matched_vector_ids):
|
||||
entry = retriever_entities.RetrieveResultEntry(
|
||||
entry = rag_context.RetrievalResultEntry(
|
||||
id=id,
|
||||
content=[ContentElement.from_text(vector_metadatas[i].get('text', ''))],
|
||||
metadata=vector_metadatas[i],
|
||||
distance=distances[i],
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user