feat: 支持可配置的混合检索融合权重 (#2071)

* feat: 支持可配置的混合检索融合权重
* style: 修复 ruff format 检查
2026-07-19 19:06:07 +00:00 · 2026-03-24 09:50:08 +08:00
parent c13971d7d6
commit 6fa653f232
9 changed files with 82 additions and 5 deletions
@@ -531,6 +531,7 @@ class RuntimeConnectionHandler(handler.Handler):
            filters = data.get('filters')
            search_type = data.get('search_type', 'vector')
            query_text = data.get('query_text', '')
+            vector_weight = data.get('vector_weight')
            try:
                results = await self.ap.rag_runtime_service.vector_search(
                    collection_id,
@@ -539,6 +540,7 @@ class RuntimeConnectionHandler(handler.Handler):
                    filters,
                    search_type,
                    query_text,
+                    vector_weight=vector_weight,
                )
                return handler.ActionResponse.success(data={'results': results})
            except Exception as e:
@@ -41,6 +41,7 @@ class RAGRuntimeService:
        filters: dict[str, Any] | None = None,
        search_type: str = 'vector',
        query_text: str = '',
+        vector_weight: float | None = None,
    ) -> list[dict[str, Any]]:
        """Handle VECTOR_SEARCH action."""
        return await self.ap.vector_db_mgr.search(
@@ -50,6 +51,7 @@ class RAGRuntimeService:
            filter=filters,
            search_type=search_type,
            query_text=query_text,
+            vector_weight=vector_weight,
        )

    async def vector_delete(
@@ -97,6 +97,7 @@ class VectorDBManager:
        filter: dict | None = None,
        search_type: str = 'vector',
        query_text: str = '',
+        vector_weight: float | None = None,
    ) -> list[dict]:
        """Proxy: Search vectors.

@@ -111,6 +112,7 @@ class VectorDBManager:
            search_type=search_type,
            query_text=query_text,
            filter=filter,
+            vector_weight=vector_weight,
        )

        if not results or 'ids' not in results or not results['ids']:
@@ -53,6 +53,7 @@ class VectorDatabase(abc.ABC):
        search_type: str = 'vector',
        query_text: str = '',
        filter: dict[str, Any] | None = None,
+        vector_weight: float | None = None,
    ) -> Dict[str, Any]:
        """Search for the most similar vectors in the specified collection.

@@ -70,6 +71,8 @@ class VectorDatabase(abc.ABC):
                    {"file_id": "abc"}
                    {"created_at": {"$gte": 1700000000}}
                    {"file_type": {"$in": ["pdf", "docx"]}}
+            vector_weight: Weight for vector search in hybrid mode (0.0–1.0).
+                ``None`` means use equal weights (backward compatible).
        """
        pass

@@ -52,13 +52,16 @@ class ChromaVectorDatabase(VectorDatabase):
        search_type: str = 'vector',
        query_text: str = '',
        filter: dict[str, Any] | None = None,
+        vector_weight: float | None = None,
    ) -> dict[str, Any]:
        col = await self.get_or_create_collection(collection)

        if search_type == SearchType.FULL_TEXT:
            return await self._full_text_search(col, collection, k, query_text, filter)
        elif search_type == SearchType.HYBRID:
-            return await self._hybrid_search(col, collection, query_embedding, k, query_text, filter)
+            return await self._hybrid_search(
+                col, collection, query_embedding, k, query_text, filter, vector_weight=vector_weight
+            )

        # Default: vector search
        return await self._vector_search(col, collection, query_embedding, k, filter)
@@ -127,6 +130,7 @@ class ChromaVectorDatabase(VectorDatabase):
        k: int,
        query_text: str,
        filter: dict[str, Any] | None,
+        vector_weight: float | None = None,
    ) -> dict[str, Any]:
        # Fall back to pure vector search when no text is provided
        if not query_text:
@@ -144,7 +148,15 @@ class ChromaVectorDatabase(VectorDatabase):
            return {'ids': [[]], 'metadatas': [[]], 'distances': [[]], 'documents': [[]]}

        # RRF fusion
-        fused = self._rrf_fuse([vector_ids, text_ids], k)
+        weights = None
+        if vector_weight is not None:
+            weights = [vector_weight, 1.0 - vector_weight]
+        self.ap.logger.info(
+            f"Chroma hybrid fusion config in '{collection}': "
+            f'vector_weight={vector_weight}, weights={weights or [1.0, 1.0]}, '
+            f'vector_hits={len(vector_ids)}, text_hits={len(text_ids)}'
+        )
+        fused = self._rrf_fuse([vector_ids, text_ids], k, weights=weights)
        if not fused:
            return {'ids': [[]], 'metadatas': [[]], 'distances': [[]], 'documents': [[]]}

@@ -197,16 +209,24 @@ class ChromaVectorDatabase(VectorDatabase):
        }

    @staticmethod
-    def _rrf_fuse(result_lists: list[list[str]], k: int) -> list[tuple[str, float]]:
+    def _rrf_fuse(result_lists: list[list[str]], k: int, weights: list[float] | None = None) -> list[tuple[str, float]]:
        """Reciprocal Rank Fusion over multiple ranked ID lists.

        Returns a list of (doc_id, rrf_score) sorted by descending score,
        truncated to *k* entries.
+
+        Args:
+            result_lists: Ranked ID lists from different search methods.
+            k: Number of results to return.
+            weights: Per-list weights.  ``None`` means equal weight (1.0 each).
        """
+        if weights is None:
+            weights = [1.0] * len(result_lists)
        scores: dict[str, float] = {}
-        for ranked_ids in result_lists:
+        for list_idx, ranked_ids in enumerate(result_lists):
+            w = weights[list_idx]
            for rank, doc_id in enumerate(ranked_ids):
-                scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (_RRF_K + rank + 1)
+                scores[doc_id] = scores.get(doc_id, 0.0) + w / (_RRF_K + rank + 1)
        sorted_results = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        return sorted_results[:k]

@@ -255,6 +255,7 @@ class MilvusVectorDatabase(VectorDatabase):
        search_type: str = 'vector',
        query_text: str = '',
        filter: dict[str, Any] | None = None,
+        vector_weight: float | None = None,
    ) -> Dict[str, Any]:
        """Search for similar vectors in Milvus collection

@@ -192,6 +192,7 @@ class PgVectorDatabase(VectorDatabase):
        search_type: str = 'vector',
        query_text: str = '',
        filter: dict[str, Any] | None = None,
+        vector_weight: float | None = None,
    ) -> Dict[str, Any]:
        """Search for similar vectors using cosine distance

@@ -100,6 +100,7 @@ class QdrantVectorDatabase(VectorDatabase):
        search_type: str = 'vector',
        query_text: str = '',
        filter: dict[str, Any] | None = None,
+        vector_weight: float | None = None,
    ) -> dict[str, Any]:
        exists = await self.client.collection_exists(collection)
        if not exists:
@@ -1,6 +1,8 @@
 from __future__ import annotations

 import asyncio
+from decimal import Decimal
+import re
 from typing import Any, Dict, List


@@ -101,8 +103,28 @@ class SeekDBVectorDatabase(VectorDatabase):
            }
        )

+    def _normalize_collection_name(self, collection: str) -> str:
+        """SeekDB only accepts [a-zA-Z0-9_], while LangBot uses UUID-like KB IDs."""
+        normalized = re.sub(r'[^A-Za-z0-9_]', '_', collection)
+        if normalized != collection:
+            self.ap.logger.info(f"Normalized SeekDB collection name: '{collection}' -> '{normalized}'")
+        return normalized
+
+    def _json_safe(self, value: Any) -> Any:
+        """Convert SeekDB result values into JSON-serializable Python primitives."""
+        if isinstance(value, Decimal):
+            return float(value)
+        if isinstance(value, dict):
+            return {k: self._json_safe(v) for k, v in value.items()}
+        if isinstance(value, list):
+            return [self._json_safe(v) for v in value]
+        if isinstance(value, tuple):
+            return [self._json_safe(v) for v in value]
+        return value
+
    async def _get_or_create_collection_internal(self, collection: str, vector_size: int = None) -> Any:
        """Internal method to get or create a collection with proper configuration."""
+        collection = self._normalize_collection_name(collection)
        if collection in self._collections:
            return self._collections[collection]

@@ -173,6 +195,7 @@ class SeekDBVectorDatabase(VectorDatabase):
        if not embeddings_list:
            return

+        collection = self._normalize_collection_name(collection)
        # Ensure collection exists with correct dimension
        vector_size = len(embeddings_list[0])
        coll = await self._get_or_create_collection_internal(collection, vector_size)
@@ -194,6 +217,7 @@ class SeekDBVectorDatabase(VectorDatabase):
        search_type: str = 'vector',
        query_text: str = '',
        filter: Dict[str, Any] | None = None,
+        vector_weight: float | None = None,
    ) -> Dict[str, Any]:
        """Search for the most similar vectors in the specified collection.

@@ -210,6 +234,7 @@ class SeekDBVectorDatabase(VectorDatabase):
        Returns:
            Dictionary with 'ids', 'metadatas', 'distances' keys
        """
+        collection = self._normalize_collection_name(collection)
        # Check if collection exists
        exists = await asyncio.to_thread(self.client.has_collection, collection)
        if not exists:
@@ -271,6 +296,17 @@ class SeekDBVectorDatabase(VectorDatabase):
                    query_cfg['where'] = filter
                    knn_cfg['where'] = filter

+                # Apply vector_weight via pyseekdb's native boost parameter
+                if vector_weight is not None:
+                    knn_cfg['boost'] = vector_weight
+                    query_cfg['boost'] = 1.0 - vector_weight
+                self.ap.logger.info(
+                    f"SeekDB hybrid fusion config in '{collection}': "
+                    f'vector_weight={vector_weight}, '
+                    f'knn_boost={knn_cfg.get("boost", 1.0)}, '
+                    f'query_boost={query_cfg.get("boost", 1.0)}'
+                )
+
                results = await asyncio.to_thread(
                    coll.hybrid_search,
                    query=query_cfg,
@@ -279,6 +315,9 @@ class SeekDBVectorDatabase(VectorDatabase):
                    n_results=k,
                    include=['documents', 'metadatas'],
                )
+                self.ap.logger.info(
+                    f"SeekDB hybrid search in '{collection}' returned {len(results.get('ids', [[]])[0])} results."
+                )
        else:
            # Default: vector search via query()
            query_kwargs = {'n_results': k, 'query_embeddings': query_embedding}
@@ -286,6 +325,7 @@ class SeekDBVectorDatabase(VectorDatabase):
                query_kwargs['where'] = filter
            results = await asyncio.to_thread(coll.query, **query_kwargs)

+        results = self._json_safe(results)
        self.ap.logger.info(
            f"SeekDB {search_type} search in '{collection}' returned {len(results.get('ids', [[]])[0])} results"
        )
@@ -299,6 +339,7 @@ class SeekDBVectorDatabase(VectorDatabase):
            collection: Collection name
            file_id: File ID to delete
        """
+        collection = self._normalize_collection_name(collection)
        # Check if collection exists
        exists = await asyncio.to_thread(self.client.has_collection, collection)
        if not exists:
@@ -325,6 +366,7 @@ class SeekDBVectorDatabase(VectorDatabase):
            collection: Collection name
            filter: Chroma-style ``where`` filter dict
        """
+        collection = self._normalize_collection_name(collection)
        exists = await asyncio.to_thread(self.client.has_collection, collection)
        if not exists:
            self.ap.logger.warning(f"SeekDB collection '{collection}' not found for deletion")
@@ -347,6 +389,7 @@ class SeekDBVectorDatabase(VectorDatabase):
        limit: int = 20,
        offset: int = 0,
    ) -> tuple[list[Dict[str, Any]], int]:
+        collection = self._normalize_collection_name(collection)
        exists = await asyncio.to_thread(self.client.has_collection, collection)
        if not exists:
            return [], 0
@@ -367,6 +410,7 @@ class SeekDBVectorDatabase(VectorDatabase):

        results = await asyncio.to_thread(coll.get, **get_kwargs)

+        results = self._json_safe(results)
        ids = results.get('ids', [])
        metadatas = results.get('metadatas', []) or [None] * len(ids)
        documents = results.get('documents', []) or [None] * len(ids)
@@ -390,6 +434,7 @@ class SeekDBVectorDatabase(VectorDatabase):
        Args:
            collection: Collection name
        """
+        collection = self._normalize_collection_name(collection)
        # Remove from cache
        if collection in self._collections:
            del self._collections[collection]