fix(modelmgr): keep id-less streamed tool calls (Ollama) (#2262)

Ollama's OpenAI-compatible streaming endpoint emits a tool-call delta carrying an `index` and a `function` payload but never an OpenAI-style `id`. `_normalize_stream_tool_calls` dropped any tool call without an `id`, so a tool-only turn yielded neither content nor a tool call: the stream "completed" with 0 chars, the tool never ran, and the chat appeared stuck. Models on standard OpenAI APIs (e.g. SiliconFlow) were unaffected because they always send a `call_...` id. Synthesize a stable per-index id (`call_<index>`) when the provider omits one but a function name is present. Providers that do send ids keep theirs, and parallel id-less calls keep distinct ids. Adds regression tests for the single and multi id-less tool-call cases. Fixes #2261
2026-06-19 12:04:21 +00:00 · 2026-06-19 18:07:25 +08:00
parent 83623f6afe
commit 3d5b70cc5d
2 changed files with 122 additions and 0 deletions
@@ -352,6 +352,117 @@ class TestInvokeLLMStreamUsage:
        assert tool_chunks[1].tool_calls[0].function.arguments == '{"text":'
        assert tool_chunks[2].tool_calls[0].function.arguments == '"plugin-tool-ok"}'

+    @pytest.mark.asyncio
+    async def test_stream_tool_call_without_id_is_not_dropped(self):
+        """Regression for #2261.
+
+        Ollama's OpenAI-compatible streaming endpoint emits a tool-call delta
+        carrying an ``index`` and a ``function`` payload but never an
+        OpenAI-style ``id``. The requester used to drop any id-less tool call,
+        so a tool-only turn yielded nothing, the stream "completed" with 0
+        chars, and the chat got stuck. A stable per-index id must be
+        synthesized so the tool call survives.
+        """
+        import langbot_plugin.api.entities.builtin.pipeline.query as pipeline_query
+        import langbot_plugin.api.entities.builtin.provider.message as provider_message
+
+        mock_ap = Mock()
+        mock_ap.tool_mgr = Mock()
+        mock_ap.tool_mgr.generate_tools_for_openai = AsyncMock(
+            return_value=[{'type': 'function', 'function': {'name': 'zotero_search_items'}}]
+        )
+        requester = litellmchat.LiteLLMRequester(ap=mock_ap, config={'custom_llm_provider': 'openai'})
+        model = MockRuntimeModel('gpt-oss:20b', 'ollama')
+
+        # Ollama delivers the whole tool call in a single delta, with no id.
+        chunks = [
+            self._make_chunk(
+                tool_calls=[
+                    {
+                        'index': 0,
+                        'function': {'name': 'zotero_search_items', 'arguments': '{"query":"hello"}'},
+                    }
+                ]
+            ),
+            self._make_chunk(finish_reason='tool_calls'),
+        ]
+
+        async def _aiter(*args, **kwargs):
+            for c in chunks:
+                yield c
+
+        query = Mock(spec=pipeline_query.Query)
+        query.variables = {}
+        messages = [provider_message.Message(role='user', content='hello?')]
+        funcs = [Mock()]
+
+        with patch.object(litellmchat, 'acompletion', new=AsyncMock(side_effect=lambda **kw: _aiter())):
+            collected = [
+                chunk
+                async for chunk in requester.invoke_llm_stream(
+                    query=query,
+                    model=model,
+                    messages=messages,
+                    funcs=funcs,
+                )
+            ]
+
+        tool_chunks = [chunk for chunk in collected if chunk.tool_calls]
+        assert len(tool_chunks) == 1, 'id-less Ollama tool call must not be dropped'
+        tc = tool_chunks[0].tool_calls[0]
+        assert tc.id == 'call_0'
+        assert tc.function.name == 'zotero_search_items'
+        assert tc.function.arguments == '{"query":"hello"}'
+
+    @pytest.mark.asyncio
+    async def test_stream_multiple_tool_calls_without_id_get_distinct_ids(self):
+        """Two parallel id-less tool calls must keep distinct synthesized ids."""
+        import langbot_plugin.api.entities.builtin.pipeline.query as pipeline_query
+        import langbot_plugin.api.entities.builtin.provider.message as provider_message
+
+        mock_ap = Mock()
+        mock_ap.tool_mgr = Mock()
+        mock_ap.tool_mgr.generate_tools_for_openai = AsyncMock(
+            return_value=[{'type': 'function', 'function': {'name': 'zotero_search_items'}}]
+        )
+        requester = litellmchat.LiteLLMRequester(ap=mock_ap, config={'custom_llm_provider': 'openai'})
+        model = MockRuntimeModel('gpt-oss:20b', 'ollama')
+
+        chunks = [
+            self._make_chunk(
+                tool_calls=[
+                    {'index': 0, 'function': {'name': 'zotero_search_items', 'arguments': '{"q":"a"}'}},
+                    {'index': 1, 'function': {'name': 'zotero_get_notes', 'arguments': '{"q":"b"}'}},
+                ]
+            ),
+            self._make_chunk(finish_reason='tool_calls'),
+        ]
+
+        async def _aiter(*args, **kwargs):
+            for c in chunks:
+                yield c
+
+        query = Mock(spec=pipeline_query.Query)
+        query.variables = {}
+        messages = [provider_message.Message(role='user', content='hello?')]
+        funcs = [Mock()]
+
+        with patch.object(litellmchat, 'acompletion', new=AsyncMock(side_effect=lambda **kw: _aiter())):
+            collected = [
+                chunk
+                async for chunk in requester.invoke_llm_stream(
+                    query=query,
+                    model=model,
+                    messages=messages,
+                    funcs=funcs,
+                )
+            ]
+
+        tool_chunks = [chunk for chunk in collected if chunk.tool_calls]
+        assert len(tool_chunks) == 1
+        ids = {tc.id for tc in tool_chunks[0].tool_calls}
+        assert ids == {'call_0', 'call_1'}
+

 class TestProcessThinkingContent:
    """Test _process_thinking_content method"""