feat: 支持可配置的混合检索融合权重 (#2071)

* feat: 支持可配置的混合检索融合权重

* style: 修复 ruff format 检查
This commit is contained in:
zpf2000
2026-03-24 09:50:08 +08:00
committed by GitHub
parent c13971d7d6
commit 6fa653f232
9 changed files with 82 additions and 5 deletions

View File

@@ -531,6 +531,7 @@ class RuntimeConnectionHandler(handler.Handler):
filters = data.get('filters')
search_type = data.get('search_type', 'vector')
query_text = data.get('query_text', '')
vector_weight = data.get('vector_weight')
try:
results = await self.ap.rag_runtime_service.vector_search(
collection_id,
@@ -539,6 +540,7 @@ class RuntimeConnectionHandler(handler.Handler):
filters,
search_type,
query_text,
vector_weight=vector_weight,
)
return handler.ActionResponse.success(data={'results': results})
except Exception as e:

View File

@@ -41,6 +41,7 @@ class RAGRuntimeService:
filters: dict[str, Any] | None = None,
search_type: str = 'vector',
query_text: str = '',
vector_weight: float | None = None,
) -> list[dict[str, Any]]:
"""Handle VECTOR_SEARCH action."""
return await self.ap.vector_db_mgr.search(
@@ -50,6 +51,7 @@ class RAGRuntimeService:
filter=filters,
search_type=search_type,
query_text=query_text,
vector_weight=vector_weight,
)
async def vector_delete(

View File

@@ -97,6 +97,7 @@ class VectorDBManager:
filter: dict | None = None,
search_type: str = 'vector',
query_text: str = '',
vector_weight: float | None = None,
) -> list[dict]:
"""Proxy: Search vectors.
@@ -111,6 +112,7 @@ class VectorDBManager:
search_type=search_type,
query_text=query_text,
filter=filter,
vector_weight=vector_weight,
)
if not results or 'ids' not in results or not results['ids']:

View File

@@ -53,6 +53,7 @@ class VectorDatabase(abc.ABC):
search_type: str = 'vector',
query_text: str = '',
filter: dict[str, Any] | None = None,
vector_weight: float | None = None,
) -> Dict[str, Any]:
"""Search for the most similar vectors in the specified collection.
@@ -70,6 +71,8 @@ class VectorDatabase(abc.ABC):
{"file_id": "abc"}
{"created_at": {"$gte": 1700000000}}
{"file_type": {"$in": ["pdf", "docx"]}}
vector_weight: Weight for vector search in hybrid mode (0.0-1.0).
``None`` means use equal weights (backward compatible).
"""
pass

View File

@@ -52,13 +52,16 @@ class ChromaVectorDatabase(VectorDatabase):
search_type: str = 'vector',
query_text: str = '',
filter: dict[str, Any] | None = None,
vector_weight: float | None = None,
) -> dict[str, Any]:
col = await self.get_or_create_collection(collection)
if search_type == SearchType.FULL_TEXT:
return await self._full_text_search(col, collection, k, query_text, filter)
elif search_type == SearchType.HYBRID:
return await self._hybrid_search(col, collection, query_embedding, k, query_text, filter)
return await self._hybrid_search(
col, collection, query_embedding, k, query_text, filter, vector_weight=vector_weight
)
# Default: vector search
return await self._vector_search(col, collection, query_embedding, k, filter)
@@ -127,6 +130,7 @@ class ChromaVectorDatabase(VectorDatabase):
k: int,
query_text: str,
filter: dict[str, Any] | None,
vector_weight: float | None = None,
) -> dict[str, Any]:
# Fall back to pure vector search when no text is provided
if not query_text:
@@ -144,7 +148,15 @@ class ChromaVectorDatabase(VectorDatabase):
return {'ids': [[]], 'metadatas': [[]], 'distances': [[]], 'documents': [[]]}
# RRF fusion
fused = self._rrf_fuse([vector_ids, text_ids], k)
weights = None
if vector_weight is not None:
weights = [vector_weight, 1.0 - vector_weight]
self.ap.logger.info(
f"Chroma hybrid fusion config in '{collection}': "
f'vector_weight={vector_weight}, weights={weights or [1.0, 1.0]}, '
f'vector_hits={len(vector_ids)}, text_hits={len(text_ids)}'
)
fused = self._rrf_fuse([vector_ids, text_ids], k, weights=weights)
if not fused:
return {'ids': [[]], 'metadatas': [[]], 'distances': [[]], 'documents': [[]]}
@@ -197,16 +209,24 @@ class ChromaVectorDatabase(VectorDatabase):
}
@staticmethod
def _rrf_fuse(
    result_lists: list[list[str]],
    k: int,
    weights: list[float] | None = None,
) -> list[tuple[str, float]]:
    """Fuse several ranked ID lists with weighted Reciprocal Rank Fusion.

    Every document found at 0-based position ``rank`` in a list contributes
    ``weight / (_RRF_K + rank + 1)`` to its total score, where ``weight`` is
    the weight assigned to that list. Contributions for the same ID are
    summed across lists.

    Args:
        result_lists: Ranked ID lists from different search methods,
            best match first.
        k: Maximum number of results to return.
        weights: Per-list weights, one entry per list in ``result_lists``.
            ``None`` means equal weight (1.0 each).

    Returns:
        A list of ``(doc_id, rrf_score)`` tuples sorted by descending score,
        truncated to at most *k* entries.

    Raises:
        ValueError: If ``weights`` is given but its length differs from the
            number of result lists.
    """
    if weights is None:
        weights = [1.0] * len(result_lists)
    elif len(weights) != len(result_lists):
        # Fail loudly instead of hitting an IndexError mid-loop (too few
        # weights) or silently ignoring the surplus (too many weights).
        raise ValueError(f'Expected {len(result_lists)} weights for RRF fusion, got {len(weights)}')

    scores: dict[str, float] = {}
    for weight, ranked_ids in zip(weights, result_lists):
        for rank, doc_id in enumerate(ranked_ids):
            scores[doc_id] = scores.get(doc_id, 0.0) + weight / (_RRF_K + rank + 1)

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    # Clamp so a negative k yields no results rather than slicing off the tail.
    return ranked[: max(k, 0)]

View File

@@ -255,6 +255,7 @@ class MilvusVectorDatabase(VectorDatabase):
search_type: str = 'vector',
query_text: str = '',
filter: dict[str, Any] | None = None,
vector_weight: float | None = None,
) -> Dict[str, Any]:
"""Search for similar vectors in Milvus collection

View File

@@ -192,6 +192,7 @@ class PgVectorDatabase(VectorDatabase):
search_type: str = 'vector',
query_text: str = '',
filter: dict[str, Any] | None = None,
vector_weight: float | None = None,
) -> Dict[str, Any]:
"""Search for similar vectors using cosine distance

View File

@@ -100,6 +100,7 @@ class QdrantVectorDatabase(VectorDatabase):
search_type: str = 'vector',
query_text: str = '',
filter: dict[str, Any] | None = None,
vector_weight: float | None = None,
) -> dict[str, Any]:
exists = await self.client.collection_exists(collection)
if not exists:

View File

@@ -1,6 +1,8 @@
from __future__ import annotations
import asyncio
from decimal import Decimal
import re
from typing import Any, Dict, List
@@ -101,8 +103,28 @@ class SeekDBVectorDatabase(VectorDatabase):
}
)
def _normalize_collection_name(self, collection: str) -> str:
    """Return *collection* with every character outside ``[A-Za-z0-9_]`` replaced by ``_``.

    Per the original note on this helper, SeekDB only accepts
    ``[a-zA-Z0-9_]`` while LangBot uses UUID-like knowledge-base IDs, so the
    name is rewritten before it is handed to SeekDB.

    Args:
        collection: Collection name as supplied by the caller.

    Returns:
        The sanitized name; identical to the input when nothing had to change.
    """
    safe_name = re.sub(r'[^A-Za-z0-9_]', '_', collection)
    if safe_name == collection:
        # Already valid: nothing to report.
        return collection
    # Only log when the name was actually rewritten.
    self.ap.logger.info(f"Normalized SeekDB collection name: '{collection}' -> '{safe_name}'")
    return safe_name
def _json_safe(self, value: Any) -> Any:
    """Recursively convert a SeekDB result value into JSON-friendly primitives.

    Conversions applied:
        * ``Decimal`` becomes ``float``.
        * ``dict`` values are converted recursively (keys are left untouched).
        * ``list`` and ``tuple`` both become a ``list`` of converted items.

    Any other value is returned unchanged.

    Args:
        value: Arbitrary value taken from a SeekDB result.

    Returns:
        The converted value.
    """
    if isinstance(value, Decimal):
        return float(value)
    if isinstance(value, dict):
        return {key: self._json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        # Tuples are flattened to lists, same as plain lists.
        return [self._json_safe(item) for item in value]
    return value
async def _get_or_create_collection_internal(self, collection: str, vector_size: int = None) -> Any:
"""Internal method to get or create a collection with proper configuration."""
collection = self._normalize_collection_name(collection)
if collection in self._collections:
return self._collections[collection]
@@ -173,6 +195,7 @@ class SeekDBVectorDatabase(VectorDatabase):
if not embeddings_list:
return
collection = self._normalize_collection_name(collection)
# Ensure collection exists with correct dimension
vector_size = len(embeddings_list[0])
coll = await self._get_or_create_collection_internal(collection, vector_size)
@@ -194,6 +217,7 @@ class SeekDBVectorDatabase(VectorDatabase):
search_type: str = 'vector',
query_text: str = '',
filter: Dict[str, Any] | None = None,
vector_weight: float | None = None,
) -> Dict[str, Any]:
"""Search for the most similar vectors in the specified collection.
@@ -210,6 +234,7 @@ class SeekDBVectorDatabase(VectorDatabase):
Returns:
Dictionary with 'ids', 'metadatas', 'distances' keys
"""
collection = self._normalize_collection_name(collection)
# Check if collection exists
exists = await asyncio.to_thread(self.client.has_collection, collection)
if not exists:
@@ -271,6 +296,17 @@ class SeekDBVectorDatabase(VectorDatabase):
query_cfg['where'] = filter
knn_cfg['where'] = filter
# Apply vector_weight via pyseekdb's native boost parameter
if vector_weight is not None:
knn_cfg['boost'] = vector_weight
query_cfg['boost'] = 1.0 - vector_weight
self.ap.logger.info(
f"SeekDB hybrid fusion config in '{collection}': "
f'vector_weight={vector_weight}, '
f'knn_boost={knn_cfg.get("boost", 1.0)}, '
f'query_boost={query_cfg.get("boost", 1.0)}'
)
results = await asyncio.to_thread(
coll.hybrid_search,
query=query_cfg,
@@ -279,6 +315,9 @@ class SeekDBVectorDatabase(VectorDatabase):
n_results=k,
include=['documents', 'metadatas'],
)
self.ap.logger.info(
f"SeekDB hybrid search in '{collection}' returned {len(results.get('ids', [[]])[0])} results."
)
else:
# Default: vector search via query()
query_kwargs = {'n_results': k, 'query_embeddings': query_embedding}
@@ -286,6 +325,7 @@ class SeekDBVectorDatabase(VectorDatabase):
query_kwargs['where'] = filter
results = await asyncio.to_thread(coll.query, **query_kwargs)
results = self._json_safe(results)
self.ap.logger.info(
f"SeekDB {search_type} search in '{collection}' returned {len(results.get('ids', [[]])[0])} results"
)
@@ -299,6 +339,7 @@ class SeekDBVectorDatabase(VectorDatabase):
collection: Collection name
file_id: File ID to delete
"""
collection = self._normalize_collection_name(collection)
# Check if collection exists
exists = await asyncio.to_thread(self.client.has_collection, collection)
if not exists:
@@ -325,6 +366,7 @@ class SeekDBVectorDatabase(VectorDatabase):
collection: Collection name
filter: Chroma-style ``where`` filter dict
"""
collection = self._normalize_collection_name(collection)
exists = await asyncio.to_thread(self.client.has_collection, collection)
if not exists:
self.ap.logger.warning(f"SeekDB collection '{collection}' not found for deletion")
@@ -347,6 +389,7 @@ class SeekDBVectorDatabase(VectorDatabase):
limit: int = 20,
offset: int = 0,
) -> tuple[list[Dict[str, Any]], int]:
collection = self._normalize_collection_name(collection)
exists = await asyncio.to_thread(self.client.has_collection, collection)
if not exists:
return [], 0
@@ -367,6 +410,7 @@ class SeekDBVectorDatabase(VectorDatabase):
results = await asyncio.to_thread(coll.get, **get_kwargs)
results = self._json_safe(results)
ids = results.get('ids', [])
metadatas = results.get('metadatas', []) or [None] * len(ids)
documents = results.get('documents', []) or [None] * len(ids)
@@ -390,6 +434,7 @@ class SeekDBVectorDatabase(VectorDatabase):
Args:
collection: Collection name
"""
collection = self._normalize_collection_name(collection)
# Remove from cache
if collection in self._collections:
del self._collections[collection]