mirror of
https://github.com/langbot-app/LangBot.git
synced 2026-06-02 03:55:55 +00:00
feat: 支持可配置的混合检索融合权重 (#2071)
* feat: 支持可配置的混合检索融合权重 * style: 修复 ruff format 检查
This commit is contained in:
@@ -531,6 +531,7 @@ class RuntimeConnectionHandler(handler.Handler):
|
||||
filters = data.get('filters')
|
||||
search_type = data.get('search_type', 'vector')
|
||||
query_text = data.get('query_text', '')
|
||||
vector_weight = data.get('vector_weight')
|
||||
try:
|
||||
results = await self.ap.rag_runtime_service.vector_search(
|
||||
collection_id,
|
||||
@@ -539,6 +540,7 @@ class RuntimeConnectionHandler(handler.Handler):
|
||||
filters,
|
||||
search_type,
|
||||
query_text,
|
||||
vector_weight=vector_weight,
|
||||
)
|
||||
return handler.ActionResponse.success(data={'results': results})
|
||||
except Exception as e:
|
||||
|
||||
@@ -41,6 +41,7 @@ class RAGRuntimeService:
|
||||
filters: dict[str, Any] | None = None,
|
||||
search_type: str = 'vector',
|
||||
query_text: str = '',
|
||||
vector_weight: float | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Handle VECTOR_SEARCH action."""
|
||||
return await self.ap.vector_db_mgr.search(
|
||||
@@ -50,6 +51,7 @@ class RAGRuntimeService:
|
||||
filter=filters,
|
||||
search_type=search_type,
|
||||
query_text=query_text,
|
||||
vector_weight=vector_weight,
|
||||
)
|
||||
|
||||
async def vector_delete(
|
||||
|
||||
@@ -97,6 +97,7 @@ class VectorDBManager:
|
||||
filter: dict | None = None,
|
||||
search_type: str = 'vector',
|
||||
query_text: str = '',
|
||||
vector_weight: float | None = None,
|
||||
) -> list[dict]:
|
||||
"""Proxy: Search vectors.
|
||||
|
||||
@@ -111,6 +112,7 @@ class VectorDBManager:
|
||||
search_type=search_type,
|
||||
query_text=query_text,
|
||||
filter=filter,
|
||||
vector_weight=vector_weight,
|
||||
)
|
||||
|
||||
if not results or 'ids' not in results or not results['ids']:
|
||||
|
||||
@@ -53,6 +53,7 @@ class VectorDatabase(abc.ABC):
|
||||
search_type: str = 'vector',
|
||||
query_text: str = '',
|
||||
filter: dict[str, Any] | None = None,
|
||||
vector_weight: float | None = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Search for the most similar vectors in the specified collection.
|
||||
|
||||
@@ -70,6 +71,8 @@ class VectorDatabase(abc.ABC):
|
||||
{"file_id": "abc"}
|
||||
{"created_at": {"$gte": 1700000000}}
|
||||
{"file_type": {"$in": ["pdf", "docx"]}}
|
||||
vector_weight: Weight for vector search in hybrid mode (0.0–1.0).
|
||||
``None`` means use equal weights (backward compatible).
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
@@ -52,13 +52,16 @@ class ChromaVectorDatabase(VectorDatabase):
|
||||
search_type: str = 'vector',
|
||||
query_text: str = '',
|
||||
filter: dict[str, Any] | None = None,
|
||||
vector_weight: float | None = None,
|
||||
) -> dict[str, Any]:
|
||||
col = await self.get_or_create_collection(collection)
|
||||
|
||||
if search_type == SearchType.FULL_TEXT:
|
||||
return await self._full_text_search(col, collection, k, query_text, filter)
|
||||
elif search_type == SearchType.HYBRID:
|
||||
return await self._hybrid_search(col, collection, query_embedding, k, query_text, filter)
|
||||
return await self._hybrid_search(
|
||||
col, collection, query_embedding, k, query_text, filter, vector_weight=vector_weight
|
||||
)
|
||||
|
||||
# Default: vector search
|
||||
return await self._vector_search(col, collection, query_embedding, k, filter)
|
||||
@@ -127,6 +130,7 @@ class ChromaVectorDatabase(VectorDatabase):
|
||||
k: int,
|
||||
query_text: str,
|
||||
filter: dict[str, Any] | None,
|
||||
vector_weight: float | None = None,
|
||||
) -> dict[str, Any]:
|
||||
# Fall back to pure vector search when no text is provided
|
||||
if not query_text:
|
||||
@@ -144,7 +148,15 @@ class ChromaVectorDatabase(VectorDatabase):
|
||||
return {'ids': [[]], 'metadatas': [[]], 'distances': [[]], 'documents': [[]]}
|
||||
|
||||
# RRF fusion
|
||||
fused = self._rrf_fuse([vector_ids, text_ids], k)
|
||||
weights = None
|
||||
if vector_weight is not None:
|
||||
weights = [vector_weight, 1.0 - vector_weight]
|
||||
self.ap.logger.info(
|
||||
f"Chroma hybrid fusion config in '{collection}': "
|
||||
f'vector_weight={vector_weight}, weights={weights or [1.0, 1.0]}, '
|
||||
f'vector_hits={len(vector_ids)}, text_hits={len(text_ids)}'
|
||||
)
|
||||
fused = self._rrf_fuse([vector_ids, text_ids], k, weights=weights)
|
||||
if not fused:
|
||||
return {'ids': [[]], 'metadatas': [[]], 'distances': [[]], 'documents': [[]]}
|
||||
|
||||
@@ -197,16 +209,24 @@ class ChromaVectorDatabase(VectorDatabase):
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _rrf_fuse(result_lists: list[list[str]], k: int) -> list[tuple[str, float]]:
|
||||
def _rrf_fuse(result_lists: list[list[str]], k: int, weights: list[float] | None = None) -> list[tuple[str, float]]:
|
||||
"""Reciprocal Rank Fusion over multiple ranked ID lists.
|
||||
|
||||
Returns a list of (doc_id, rrf_score) sorted by descending score,
|
||||
truncated to *k* entries.
|
||||
|
||||
Args:
|
||||
result_lists: Ranked ID lists from different search methods.
|
||||
k: Number of results to return.
|
||||
weights: Per-list weights. ``None`` means equal weight (1.0 each).
|
||||
"""
|
||||
if weights is None:
|
||||
weights = [1.0] * len(result_lists)
|
||||
scores: dict[str, float] = {}
|
||||
for ranked_ids in result_lists:
|
||||
for list_idx, ranked_ids in enumerate(result_lists):
|
||||
w = weights[list_idx]
|
||||
for rank, doc_id in enumerate(ranked_ids):
|
||||
scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (_RRF_K + rank + 1)
|
||||
scores[doc_id] = scores.get(doc_id, 0.0) + w / (_RRF_K + rank + 1)
|
||||
sorted_results = sorted(scores.items(), key=lambda x: x[1], reverse=True)
|
||||
return sorted_results[:k]
|
||||
|
||||
|
||||
@@ -255,6 +255,7 @@ class MilvusVectorDatabase(VectorDatabase):
|
||||
search_type: str = 'vector',
|
||||
query_text: str = '',
|
||||
filter: dict[str, Any] | None = None,
|
||||
vector_weight: float | None = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Search for similar vectors in Milvus collection
|
||||
|
||||
|
||||
@@ -192,6 +192,7 @@ class PgVectorDatabase(VectorDatabase):
|
||||
search_type: str = 'vector',
|
||||
query_text: str = '',
|
||||
filter: dict[str, Any] | None = None,
|
||||
vector_weight: float | None = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Search for similar vectors using cosine distance
|
||||
|
||||
|
||||
@@ -100,6 +100,7 @@ class QdrantVectorDatabase(VectorDatabase):
|
||||
search_type: str = 'vector',
|
||||
query_text: str = '',
|
||||
filter: dict[str, Any] | None = None,
|
||||
vector_weight: float | None = None,
|
||||
) -> dict[str, Any]:
|
||||
exists = await self.client.collection_exists(collection)
|
||||
if not exists:
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from decimal import Decimal
|
||||
import re
|
||||
from typing import Any, Dict, List
|
||||
|
||||
|
||||
@@ -101,8 +103,28 @@ class SeekDBVectorDatabase(VectorDatabase):
|
||||
}
|
||||
)
|
||||
|
||||
def _normalize_collection_name(self, collection: str) -> str:
    """Return *collection* with every character outside ``[A-Za-z0-9_]`` replaced by ``_``.

    SeekDB only accepts [a-zA-Z0-9_], while LangBot uses UUID-like KB IDs.

    Args:
        collection: Collection name as supplied by the caller.

    Returns:
        The sanitized name; identical to the input when it was already valid.
        Distinct inputs can map to the same output (e.g. ``a-b`` and ``a_b``).
    """
    normalized = re.sub(r'[^A-Za-z0-9_]', '_', collection)
    if normalized != collection:
        # Logged at DEBUG rather than INFO: this helper runs on every
        # collection operation, so INFO would add one line per call for
        # any ID that needs rewriting.
        self.ap.logger.debug(f"Normalized SeekDB collection name: '{collection}' -> '{normalized}'")
    return normalized
|
||||
|
||||
def _json_safe(self, value: Any) -> Any:
    """Recursively replace ``Decimal`` values in a SeekDB result with ``float``.

    Dicts are rebuilt with converted values (keys are left as they are);
    lists and tuples are both rebuilt as lists of converted items. Any
    other value is returned unchanged.
    """
    if isinstance(value, dict):
        return {key: self._json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        # Tuples intentionally come back as lists.
        return [self._json_safe(item) for item in value]
    return float(value) if isinstance(value, Decimal) else value
|
||||
|
||||
async def _get_or_create_collection_internal(self, collection: str, vector_size: int = None) -> Any:
|
||||
"""Internal method to get or create a collection with proper configuration."""
|
||||
collection = self._normalize_collection_name(collection)
|
||||
if collection in self._collections:
|
||||
return self._collections[collection]
|
||||
|
||||
@@ -173,6 +195,7 @@ class SeekDBVectorDatabase(VectorDatabase):
|
||||
if not embeddings_list:
|
||||
return
|
||||
|
||||
collection = self._normalize_collection_name(collection)
|
||||
# Ensure collection exists with correct dimension
|
||||
vector_size = len(embeddings_list[0])
|
||||
coll = await self._get_or_create_collection_internal(collection, vector_size)
|
||||
@@ -194,6 +217,7 @@ class SeekDBVectorDatabase(VectorDatabase):
|
||||
search_type: str = 'vector',
|
||||
query_text: str = '',
|
||||
filter: Dict[str, Any] | None = None,
|
||||
vector_weight: float | None = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""Search for the most similar vectors in the specified collection.
|
||||
|
||||
@@ -210,6 +234,7 @@ class SeekDBVectorDatabase(VectorDatabase):
|
||||
Returns:
|
||||
Dictionary with 'ids', 'metadatas', 'distances' keys
|
||||
"""
|
||||
collection = self._normalize_collection_name(collection)
|
||||
# Check if collection exists
|
||||
exists = await asyncio.to_thread(self.client.has_collection, collection)
|
||||
if not exists:
|
||||
@@ -271,6 +296,17 @@ class SeekDBVectorDatabase(VectorDatabase):
|
||||
query_cfg['where'] = filter
|
||||
knn_cfg['where'] = filter
|
||||
|
||||
# Apply vector_weight via pyseekdb's native boost parameter
|
||||
if vector_weight is not None:
|
||||
knn_cfg['boost'] = vector_weight
|
||||
query_cfg['boost'] = 1.0 - vector_weight
|
||||
self.ap.logger.info(
|
||||
f"SeekDB hybrid fusion config in '{collection}': "
|
||||
f'vector_weight={vector_weight}, '
|
||||
f'knn_boost={knn_cfg.get("boost", 1.0)}, '
|
||||
f'query_boost={query_cfg.get("boost", 1.0)}'
|
||||
)
|
||||
|
||||
results = await asyncio.to_thread(
|
||||
coll.hybrid_search,
|
||||
query=query_cfg,
|
||||
@@ -279,6 +315,9 @@ class SeekDBVectorDatabase(VectorDatabase):
|
||||
n_results=k,
|
||||
include=['documents', 'metadatas'],
|
||||
)
|
||||
self.ap.logger.info(
|
||||
f"SeekDB hybrid search in '{collection}' returned {len(results.get('ids', [[]])[0])} results."
|
||||
)
|
||||
else:
|
||||
# Default: vector search via query()
|
||||
query_kwargs = {'n_results': k, 'query_embeddings': query_embedding}
|
||||
@@ -286,6 +325,7 @@ class SeekDBVectorDatabase(VectorDatabase):
|
||||
query_kwargs['where'] = filter
|
||||
results = await asyncio.to_thread(coll.query, **query_kwargs)
|
||||
|
||||
results = self._json_safe(results)
|
||||
self.ap.logger.info(
|
||||
f"SeekDB {search_type} search in '{collection}' returned {len(results.get('ids', [[]])[0])} results"
|
||||
)
|
||||
@@ -299,6 +339,7 @@ class SeekDBVectorDatabase(VectorDatabase):
|
||||
collection: Collection name
|
||||
file_id: File ID to delete
|
||||
"""
|
||||
collection = self._normalize_collection_name(collection)
|
||||
# Check if collection exists
|
||||
exists = await asyncio.to_thread(self.client.has_collection, collection)
|
||||
if not exists:
|
||||
@@ -325,6 +366,7 @@ class SeekDBVectorDatabase(VectorDatabase):
|
||||
collection: Collection name
|
||||
filter: Chroma-style ``where`` filter dict
|
||||
"""
|
||||
collection = self._normalize_collection_name(collection)
|
||||
exists = await asyncio.to_thread(self.client.has_collection, collection)
|
||||
if not exists:
|
||||
self.ap.logger.warning(f"SeekDB collection '{collection}' not found for deletion")
|
||||
@@ -347,6 +389,7 @@ class SeekDBVectorDatabase(VectorDatabase):
|
||||
limit: int = 20,
|
||||
offset: int = 0,
|
||||
) -> tuple[list[Dict[str, Any]], int]:
|
||||
collection = self._normalize_collection_name(collection)
|
||||
exists = await asyncio.to_thread(self.client.has_collection, collection)
|
||||
if not exists:
|
||||
return [], 0
|
||||
@@ -367,6 +410,7 @@ class SeekDBVectorDatabase(VectorDatabase):
|
||||
|
||||
results = await asyncio.to_thread(coll.get, **get_kwargs)
|
||||
|
||||
results = self._json_safe(results)
|
||||
ids = results.get('ids', [])
|
||||
metadatas = results.get('metadatas', []) or [None] * len(ids)
|
||||
documents = results.get('documents', []) or [None] * len(ids)
|
||||
@@ -390,6 +434,7 @@ class SeekDBVectorDatabase(VectorDatabase):
|
||||
Args:
|
||||
collection: Collection name
|
||||
"""
|
||||
collection = self._normalize_collection_name(collection)
|
||||
# Remove from cache
|
||||
if collection in self._collections:
|
||||
del self._collections[collection]
|
||||
|
||||
Reference in New Issue
Block a user