feat: external knowledge bases (#1783)

* Initial plan

* Add backend support for external knowledge bases

Co-authored-by: RockChinQ <45992437+RockChinQ@users.noreply.github.com>

* Add frontend support for external knowledge bases with tabs UI

Co-authored-by: RockChinQ <45992437+RockChinQ@users.noreply.github.com>

* Add i18n translations for all languages (Traditional Chinese and Japanese)

Co-authored-by: RockChinQ <45992437+RockChinQ@users.noreply.github.com>

* Update knowledge base tab list styling to match plugins page

Co-authored-by: RockChinQ <45992437+RockChinQ@users.noreply.github.com>

* perf: margin-top for kb page

* refactor: switch RetrievalResultEntry to langbot_plugin pkg ones

* feat: knowledge retriever listing and creating

* stash

* refactor: unify sync mechanism for polymorphic components

* feat: use unified retrieval result struct in retrieval test page

* chore: remove unused methods

* feat: retriever icon displaying

* feat: localagent retrieval with external kbs

* chore: bump version of langbot-plugin to 0.2.0b1

* fix: i18n

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: RockChinQ <45992437+RockChinQ@users.noreply.github.com>
Co-authored-by: Junyan Qin <rockchinq@gmail.com>
This commit is contained in:
Copilot
2025-11-27 23:19:43 +08:00
committed by GitHub
parent 3c04eeaff9
commit a8481e43f0
33 changed files with 1924 additions and 161 deletions

View File

@@ -0,0 +1,55 @@
"""Base classes and interfaces for knowledge bases"""
from __future__ import annotations
import abc
from langbot.pkg.core import app
from langbot_plugin.api.entities.builtin.rag import context as rag_context
class KnowledgeBaseInterface(metaclass=abc.ABCMeta):
    """Contract shared by every knowledge base implementation.

    Concrete subclasses must implement all abstract methods below before
    they can be instantiated.
    """

    # Application instance handed to the constructor
    ap: app.Application

    def __init__(self, ap: app.Application):
        self.ap = ap

    @abc.abstractmethod
    async def initialize(self):
        """Initialize the knowledge base."""

    @abc.abstractmethod
    async def retrieve(self, query: str, top_k: int) -> list[rag_context.RetrievalResultEntry]:
        """Retrieve relevant documents from the knowledge base.

        Args:
            query: The query string.
            top_k: Number of top results to return.

        Returns:
            List of retrieval result entries.
        """

    @abc.abstractmethod
    def get_uuid(self) -> str:
        """Return the UUID of the knowledge base."""

    @abc.abstractmethod
    def get_name(self) -> str:
        """Return the name of the knowledge base."""

    @abc.abstractmethod
    def get_type(self) -> str:
        """Return the type of the knowledge base (internal/external)."""

    @abc.abstractmethod
    async def dispose(self):
        """Clean up resources held by the knowledge base."""

View File

@@ -0,0 +1,85 @@
"""External knowledge base implementation"""
from __future__ import annotations
from langbot.pkg.core import app
from langbot.pkg.entity.persistence import rag as persistence_rag
from langbot_plugin.api.entities.builtin.rag import context as rag_context
from .base import KnowledgeBaseInterface
class ExternalKnowledgeBase(KnowledgeBaseInterface):
    """External knowledge base that is queried through a plugin retriever.

    Retrieval is delegated to ``ap.plugin_connector.retrieve_knowledge``; the
    retriever instance is addressed by the UUID of the persisted entity.
    """

    external_kb_entity: persistence_rag.ExternalKnowledgeBase

    # Plugin retriever instance ID; ``None`` until ``initialize`` has run
    retriever_instance_id: str | None

    def __init__(self, ap: app.Application, external_kb_entity: persistence_rag.ExternalKnowledgeBase):
        super().__init__(ap)
        self.external_kb_entity = external_kb_entity
        self.retriever_instance_id = None

    async def initialize(self):
        """Initialize the external knowledge base.

        Only records the instance ID. The instance itself is created by the
        unified sync mechanism when LangBot connects to the runtime.
        """
        # Use KB UUID as instance ID
        self.retriever_instance_id = self.external_kb_entity.uuid

        self.ap.logger.info(
            f'Initialized external KB {self.external_kb_entity.uuid}, instance will be created by sync mechanism'
        )

    async def retrieve(self, query: str, top_k: int = 5) -> list[rag_context.RetrievalResultEntry]:
        """Retrieve documents from the external knowledge base via the plugin retriever.

        Args:
            query: The query string forwarded to the plugin retriever.
            top_k: Maximum number of entries to return. A non-positive value
                disables the cap and returns everything the plugin produced.

        Returns:
            The retrieved entries, or an empty list when the knowledge base is
            not initialized or the plugin call / result conversion fails.
        """
        if not self.retriever_instance_id:
            self.ap.logger.error(f'No retriever instance for KB {self.external_kb_entity.uuid}')
            return []

        try:
            results = await self.ap.plugin_connector.retrieve_knowledge(
                self.external_kb_entity.plugin_author,
                self.external_kb_entity.plugin_name,
                self.external_kb_entity.retriever_name,
                self.retriever_instance_id,
                {'query': query},
            )

            # Convert plugin results to RetrievalResultEntry
            retrieval_entries = [rag_context.RetrievalResultEntry(**result) for result in results]
        except Exception as e:
            import traceback

            # Send the traceback through the application logger instead of
            # printing it to stderr, so failures land in the regular log output.
            self.ap.logger.error(f'Plugin retriever error: {e}\n{traceback.format_exc()}')
            return []

        # The query payload carries no limit, so honour the caller's top_k here.
        if top_k > 0:
            return retrieval_entries[:top_k]
        return retrieval_entries

    def get_uuid(self) -> str:
        """Get the UUID of the external knowledge base"""
        return self.external_kb_entity.uuid

    def get_name(self) -> str:
        """Get the name of the external knowledge base"""
        return self.external_kb_entity.name

    def get_type(self) -> str:
        """Get the type of knowledge base"""
        return 'external'

    async def dispose(self):
        """Clean up resources by triggering a component-instance sync.

        Sync failures are logged and not re-raised (best-effort cleanup).
        """
        # Trigger sync to immediately delete the instance from plugin process
        # This ensures instance is cleaned up without waiting for next LangBot restart
        try:
            await self.ap.plugin_connector.sync_polymorphic_component_instances()
            self.ap.logger.info(
                f'Disposed external KB {self.external_kb_entity.uuid}, triggered sync to delete instance'
            )
        except Exception as e:
            self.ap.logger.error(f'Failed to sync after disposing KB: {e}')

View File

@@ -10,10 +10,12 @@ from langbot.pkg.rag.knowledge.services.retriever import Retriever
import sqlalchemy
from langbot.pkg.entity.persistence import rag as persistence_rag
from langbot.pkg.core import taskmgr
from langbot.pkg.entity.rag import retriever as retriever_entities
from langbot_plugin.api.entities.builtin.rag import context as rag_context
from .base import KnowledgeBaseInterface
from .external import ExternalKnowledgeBase
class RuntimeKnowledgeBase:
class RuntimeKnowledgeBase(KnowledgeBaseInterface):
ap: app.Application
knowledge_base_entity: persistence_rag.KnowledgeBase
@@ -27,7 +29,7 @@ class RuntimeKnowledgeBase:
retriever: Retriever
def __init__(self, ap: app.Application, knowledge_base_entity: persistence_rag.KnowledgeBase):
self.ap = ap
super().__init__(ap)
self.knowledge_base_entity = knowledge_base_entity
self.parser = parser.FileParser(ap=self.ap)
self.chunker = chunker.Chunker(ap=self.ap)
@@ -187,7 +189,7 @@ class RuntimeKnowledgeBase:
return stored_file_tasks[0] if stored_file_tasks else ''
async def retrieve(self, query: str, top_k: int) -> list[retriever_entities.RetrieveResultEntry]:
async def retrieve(self, query: str, top_k: int) -> list[rag_context.RetrievalResultEntry]:
embedding_model = await self.ap.model_mgr.get_embedding_model_by_uuid(
self.knowledge_base_entity.embedding_model_uuid
)
@@ -206,6 +208,18 @@ class RuntimeKnowledgeBase:
sqlalchemy.delete(persistence_rag.File).where(persistence_rag.File.uuid == file_id)
)
def get_uuid(self) -> str:
"""Get the UUID of the knowledge base"""
return self.knowledge_base_entity.uuid
def get_name(self) -> str:
"""Get the name of the knowledge base"""
return self.knowledge_base_entity.name
def get_type(self) -> str:
"""Get the type of knowledge base"""
return 'internal'
async def dispose(self):
await self.ap.vector_db_mgr.vector_db.delete_collection(self.knowledge_base_entity.uuid)
@@ -213,7 +227,7 @@ class RuntimeKnowledgeBase:
class RAGManager:
ap: app.Application
knowledge_bases: list[RuntimeKnowledgeBase]
knowledge_bases: list[KnowledgeBaseInterface]
def __init__(self, ap: app.Application):
self.ap = ap
@@ -227,8 +241,8 @@ class RAGManager:
self.knowledge_bases = []
# Load internal knowledge bases
result = await self.ap.persistence_mgr.execute_async(sqlalchemy.select(persistence_rag.KnowledgeBase))
knowledge_bases = result.all()
for knowledge_base in knowledge_bases:
@@ -239,6 +253,21 @@ class RAGManager:
f'Error loading knowledge base {knowledge_base.uuid}: {e}\n{traceback.format_exc()}'
)
# Load external knowledge bases
external_result = await self.ap.persistence_mgr.execute_async(
sqlalchemy.select(persistence_rag.ExternalKnowledgeBase)
)
external_kbs = external_result.all()
for external_kb in external_kbs:
try:
# Don't trigger sync during batch loading - will sync once after LangBot connects to runtime
await self.load_external_knowledge_base(external_kb, trigger_sync=False)
except Exception as e:
self.ap.logger.error(
f'Error loading external knowledge base {external_kb.uuid}: {e}\n{traceback.format_exc()}'
)
async def load_knowledge_base(
self,
knowledge_base_entity: persistence_rag.KnowledgeBase | sqlalchemy.Row | dict,
@@ -256,21 +285,54 @@ class RAGManager:
return runtime_knowledge_base
async def get_knowledge_base_by_uuid(self, kb_uuid: str) -> RuntimeKnowledgeBase | None:
async def load_external_knowledge_base(
self,
external_kb_entity: persistence_rag.ExternalKnowledgeBase | sqlalchemy.Row | dict,
trigger_sync: bool = True,
) -> ExternalKnowledgeBase:
"""Load external knowledge base into runtime
Args:
external_kb_entity: External KB entity to load
trigger_sync: Whether to trigger sync after loading (default True for manual creation, False for batch loading)
"""
if isinstance(external_kb_entity, sqlalchemy.Row):
external_kb_entity = persistence_rag.ExternalKnowledgeBase(**external_kb_entity._mapping)
elif isinstance(external_kb_entity, dict):
external_kb_entity = persistence_rag.ExternalKnowledgeBase(**external_kb_entity)
external_kb = ExternalKnowledgeBase(ap=self.ap, external_kb_entity=external_kb_entity)
await external_kb.initialize()
self.knowledge_bases.append(external_kb)
# Trigger sync to create the instance immediately (for manual creation)
# Skip sync during batch loading from DB to avoid multiple sync calls
if trigger_sync:
try:
await self.ap.plugin_connector.sync_polymorphic_component_instances()
self.ap.logger.info(f'Triggered sync after loading external KB {external_kb_entity.uuid}')
except Exception as e:
self.ap.logger.error(f'Failed to sync after loading external KB: {e}')
return external_kb
async def get_knowledge_base_by_uuid(self, kb_uuid: str) -> KnowledgeBaseInterface | None:
for kb in self.knowledge_bases:
if kb.knowledge_base_entity.uuid == kb_uuid:
if kb.get_uuid() == kb_uuid:
return kb
return None
async def remove_knowledge_base_from_runtime(self, kb_uuid: str):
for kb in self.knowledge_bases:
if kb.knowledge_base_entity.uuid == kb_uuid:
if kb.get_uuid() == kb_uuid:
self.knowledge_bases.remove(kb)
return
async def delete_knowledge_base(self, kb_uuid: str):
for kb in self.knowledge_bases:
if kb.knowledge_base_entity.uuid == kb_uuid:
if kb.get_uuid() == kb_uuid:
await kb.dispose()
self.knowledge_bases.remove(kb)
return

View File

@@ -3,7 +3,8 @@ from __future__ import annotations
from . import base_service
from ....core import app
from ....provider.modelmgr.requester import RuntimeEmbeddingModel
from ....entity.rag import retriever as retriever_entities
from langbot_plugin.api.entities.builtin.rag import context as rag_context
from langbot_plugin.api.entities.builtin.provider.message import ContentElement
class Retriever(base_service.BaseService):
@@ -13,7 +14,7 @@ class Retriever(base_service.BaseService):
async def retrieve(
self, kb_id: str, query: str, embedding_model: RuntimeEmbeddingModel, k: int = 5
) -> list[retriever_entities.RetrieveResultEntry]:
) -> list[rag_context.RetrievalResultEntry]:
self.ap.logger.info(
f"Retrieving for query: '{query[:10]}' with k={k} using {embedding_model.model_entity.uuid}"
)
@@ -35,11 +36,12 @@ class Retriever(base_service.BaseService):
self.ap.logger.info('No relevant chunks found in vector database.')
return []
result: list[retriever_entities.RetrieveResultEntry] = []
result: list[rag_context.RetrievalResultEntry] = []
for i, id in enumerate(matched_vector_ids):
entry = retriever_entities.RetrieveResultEntry(
entry = rag_context.RetrievalResultEntry(
id=id,
content=[ContentElement.from_text(vector_metadatas[i].get('text', ''))],
metadata=vector_metadatas[i],
distance=distances[i],
)