From cafef1a306266f0a0c4c9a7aafb4993496407480 Mon Sep 17 00:00:00 2001 From: Junyan Qin Date: Tue, 2 Jun 2026 15:07:58 +0800 Subject: [PATCH] fix(box): cap tool-call loop and run workspace-quota walk off the event loop Two robustness fixes for problems that bite under normal sandbox usage (not just under attack), hardening the self-hosted community edition before release: - localagent: cap the tool-call loop at MAX_TOOL_CALL_ROUNDS (128). A looping or adversarial model could otherwise emit tool calls indefinitely (each potentially a sandbox exec), producing a non-terminating request and runaway cost. The cap is generous enough not to interrupt legitimate multi-step agentic workflows. - box.service: make _enforce_workspace_quota async and run the recursive workspace scan via asyncio.to_thread. The scan previously ran synchronously on every quota-enforced exec, so a large workspace would block the whole asyncio runtime (all bots/pipelines). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/langbot/pkg/box/service.py | 11 +++++++---- src/langbot/pkg/provider/runners/localagent.py | 15 +++++++++++++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/langbot/pkg/box/service.py b/src/langbot/pkg/box/service.py index 13469634..6c8e8299 100644 --- a/src/langbot/pkg/box/service.py +++ b/src/langbot/pkg/box/service.py @@ -168,7 +168,7 @@ class BoxService: f'spec={json.dumps(self._summarize_spec(spec), ensure_ascii=False)}' ) try: - self._enforce_workspace_quota(spec, phase='before execution') + await self._enforce_workspace_quota(spec, phase='before execution') except BoxError as exc: self._record_error(exc, query) raise @@ -178,7 +178,7 @@ class BoxService: self._record_error(exc, query) raise try: - self._enforce_workspace_quota(spec, phase='after execution') + await self._enforce_workspace_quota(spec, phase='after execution') except BoxError as exc: await self._cleanup_exceeded_session(spec) self._record_error(exc, query) @@ -683,7 +683,7 @@ class BoxService: _walk(root) return total - def
_enforce_workspace_quota(self, spec: BoxSpec, *, phase: str) -> None: + async def _enforce_workspace_quota(self, spec: BoxSpec, *, phase: str) -> None: if spec.host_path is None or spec.workspace_quota_mb <= 0: return @@ -691,7 +691,10 @@ class BoxService: if not os.path.isdir(host_path): return - used_bytes = self._get_workspace_size_bytes(host_path) + # Walk the workspace off the event loop — this runs on every + # quota-enforced exec, and a large tree would otherwise block the whole + # asyncio runtime (all bots/pipelines) for the duration of the scan. + used_bytes = await asyncio.to_thread(self._get_workspace_size_bytes, host_path) limit_bytes = spec.workspace_quota_mb * _MIB if used_bytes <= limit_bytes: return diff --git a/src/langbot/pkg/provider/runners/localagent.py b/src/langbot/pkg/provider/runners/localagent.py index 6cac3e83..28d014d0 100644 --- a/src/langbot/pkg/provider/runners/localagent.py +++ b/src/langbot/pkg/provider/runners/localagent.py @@ -34,6 +34,13 @@ SANDBOX_EXEC_SYSTEM_GUIDANCE = ( ) +# Hard cap on tool-call rounds within a single agent turn. A looping or +# adversarial model can otherwise emit tool calls indefinitely (each potentially +# a sandbox exec), yielding a non-terminating request and runaway cost. Set +# generously so it never interrupts legitimate multi-step agentic workflows. +MAX_TOOL_CALL_ROUNDS = 128 + + @runner.runner_class('local-agent') class LocalAgentRunner(runner.RequestRunner): """Local agent request runner""" @@ -363,7 +370,15 @@ class LocalAgentRunner(runner.RequestRunner): # Once a model succeeds, commit to it for the tool call loop # (no fallback mid-conversation — different models may interpret tool results differently) + tool_call_round = 0 while pending_tool_calls: + tool_call_round += 1 + if tool_call_round > MAX_TOOL_CALL_ROUNDS: + self.ap.logger.warning( + f'Tool-call loop reached the {MAX_TOOL_CALL_ROUNDS}-round cap ' + f'(query_id={query.query_id}); stopping to avoid a non-terminating request.' 
+ ) + break for tool_call in pending_tool_calls: try: func = tool_call.function