From 93104a947a3023c8279afabde783c2fd4fb849b2 Mon Sep 17 00:00:00 2001 From: youhuanghe <1051233107@qq.com> Date: Tue, 24 Mar 2026 07:57:05 +0000 Subject: [PATCH] feat(box): unify native agent tools around exec/read/write/edit --- src/langbot/pkg/box/policy.py | 98 ++++++++ src/langbot/pkg/box/service.py | 31 ++- .../pkg/provider/runners/localagent.py | 4 +- .../pkg/provider/tools/loaders/native.py | 231 ++++++++++++++---- src/langbot/pkg/provider/tools/toolmgr.py | 12 +- .../box/test_box_integration.py | 4 +- tests/unit_tests/box/test_box_service.py | 55 ++--- .../pipeline/test_chat_handler_logging.py | 4 +- .../provider/test_localagent_sandbox_exec.py | 24 +- .../provider/test_tool_manager_native.py | 170 ++++++++++++- 10 files changed, 519 insertions(+), 114 deletions(-) create mode 100644 src/langbot/pkg/box/policy.py diff --git a/src/langbot/pkg/box/policy.py b/src/langbot/pkg/box/policy.py new file mode 100644 index 00000000..15f4c45c --- /dev/null +++ b/src/langbot/pkg/box/policy.py @@ -0,0 +1,98 @@ +"""Three-layer security policy for LangBot Box. + +The design separates concerns into three independent layers, aligned with +OpenCode / OpenClaw patterns: + +1. **SandboxPolicy** – *where* tools run (host vs sandbox). +2. **ToolPolicy** – *which* tools are allowed (allow/deny lists). +3. **ElevatedPolicy** – *whether* a single exec call may temporarily + escape the default sandbox boundary. + +These three layers are orthogonal: +- ToolPolicy is a hard boundary; ``elevated`` cannot bypass a denied tool. +- SandboxPolicy decides the default execution location. +- ElevatedPolicy only affects ``exec`` and only when the framework allows it. +""" + +from __future__ import annotations + +import enum +from typing import Sequence + + +# ── Layer 1: Sandbox Policy ────────────────────────────────────────── + + +class SandboxMode(str, enum.Enum): + """Determines when agent execution is routed through the sandbox.""" + + OFF = 'off' + """Sandbox disabled; all exec runs on the host.""" + + NON_DEFAULT = 'non_default' + """Only non-default sessions are sandboxed (e.g. sub-agents, MCP).""" + + ALL = 'all' + """Every agent exec call is routed through the sandbox.""" + + +class SandboxPolicy: + """Decides whether a given execution context should use the sandbox.""" + + def __init__(self, mode: SandboxMode = SandboxMode.ALL): + self.mode = mode + + def should_sandbox(self, *, is_default_session: bool = True) -> bool: + if self.mode == SandboxMode.OFF: + return False + if self.mode == SandboxMode.ALL: + return True + # NON_DEFAULT: sandbox everything except the default session + return not is_default_session + + +# ── Layer 2: Tool Policy ───────────────────────────────────────────── + + +class ToolPolicy: + """Controls which tools are available to the current agent/session. + + Rules: + - ``deny`` always takes precedence over ``allow``. + - An empty ``allow`` list means "all tools allowed" (no allowlist filter). + - ``elevated`` cannot bypass a denied tool. + """ + + def __init__( + self, + allow: Sequence[str] = (), + deny: Sequence[str] = (), + ): + self._allow: frozenset[str] = frozenset(allow) + self._deny: frozenset[str] = frozenset(deny) + + def is_tool_allowed(self, tool_name: str) -> bool: + if tool_name in self._deny: + return False + if self._allow and tool_name not in self._allow: + return False + return True + + +# ── Layer 3: Elevated Policy ───────────────────────────────────────── + + +class ElevatedPolicy: + """Controls whether ``exec`` may request temporary privilege escalation. + + ``elevated`` only applies to the ``exec`` tool. It means "run this + command outside the default sandbox boundary" (e.g. with network, or + on the host). The framework decides whether to honor the request. + """ + + def __init__(self, *, allow_elevated: bool = False, require_approval: bool = True): + self.allow_elevated = allow_elevated + self.require_approval = require_approval + + def is_elevation_permitted(self) -> bool: + return self.allow_elevated diff --git a/src/langbot/pkg/box/service.py b/src/langbot/pkg/box/service.py index 87ce94f8..82eb60b4 100644 --- a/src/langbot/pkg/box/service.py +++ b/src/langbot/pkg/box/service.py @@ -105,9 +105,22 @@ class BoxService: ) return self._serialize_result(result) - async def execute_sandbox_tool(self, parameters: dict, query: pipeline_query.Query) -> dict: - spec_payload = dict(parameters) + async def execute_tool(self, parameters: dict, query: pipeline_query.Query) -> dict: + """Execute an agent-facing ``exec`` tool call. + + Translates the agent-facing ``command`` field to the internal + ``BoxSpec.cmd`` field and injects the session id from the query. + """ + spec_payload: dict = {'cmd': parameters['command']} + + # Pass through allowed agent-facing fields + for key in ('workdir', 'timeout_sec', 'env'): + if key in parameters: + spec_payload[key] = parameters[key] + + # Inject context the agent must not control spec_payload.setdefault('session_id', str(query.query_id)) + return await self.execute_spec_payload(spec_payload, query) async def shutdown(self): @@ -379,23 +392,23 @@ class BoxService: return list(self._recent_errors) def get_system_guidance(self) -> str: - """Return LLM system-prompt guidance for sandbox_exec. + """Return LLM system-prompt guidance for the exec tool. - All sandbox-specific prompt text is kept here so that callers + All execution-specific prompt text is kept here so that callers (e.g. LocalAgentRunner) stay free of box domain knowledge. """ guidance = ( - 'When sandbox_exec is available, use it for exact calculations, statistics, structured data parsing, ' + 'When the exec tool is available, use it for exact calculations, statistics, structured data parsing, ' 'and code execution instead of estimating mentally. If the user provides numbers, tables, CSV-like text, ' - 'JSON, or other data and asks for a computed answer, prefer running a short Python script in sandbox_exec ' + 'JSON, or other data and asks for a computed answer, prefer running a short Python script via exec ' 'and then answer from the tool result. Unless the user explicitly asks for the script, code, or implementation ' 'details, do not include the generated script in the final answer; return the result and a brief explanation only.' ) if self.default_host_workspace: guidance += ( - ' A default host workspace is mounted at /workspace for file tasks. When the user asks to read, create, or ' - 'modify local files in the working directory, use sandbox_exec with /workspace paths directly; do not ask the ' - 'user for sandbox parameters such as host_path unless they explicitly need a different directory.' + ' A default workspace is mounted at /workspace for file tasks. When the user asks to read, create, or ' + 'modify local files in the working directory, use exec with /workspace paths directly; do not ask the ' + 'user for directory parameters unless they explicitly need a different directory.' ) return guidance diff --git a/src/langbot/pkg/provider/runners/localagent.py b/src/langbot/pkg/provider/runners/localagent.py index a033efd7..5a1189b4 100644 --- a/src/langbot/pkg/provider/runners/localagent.py +++ b/src/langbot/pkg/provider/runners/localagent.py @@ -5,7 +5,7 @@ import copy import typing from .. import runner from ..modelmgr import requester as modelmgr_requester -from ..tools.loaders.native import SANDBOX_EXEC_TOOL_NAME +from ..tools.loaders.native import EXEC_TOOL_NAME import langbot_plugin.api.entities.builtin.pipeline.query as pipeline_query import langbot_plugin.api.entities.builtin.provider.message as provider_message import langbot_plugin.api.entities.builtin.rag.context as rag_context @@ -37,7 +37,7 @@ class LocalAgentRunner(runner.RequestRunner): ) -> list[provider_message.Message]: req_messages = query.prompt.messages.copy() + query.messages.copy() - if any(getattr(tool, 'name', None) == SANDBOX_EXEC_TOOL_NAME for tool in query.use_funcs or []): + if any(getattr(tool, 'name', None) == EXEC_TOOL_NAME for tool in query.use_funcs or []): req_messages.append( provider_message.Message( role='system', diff --git a/src/langbot/pkg/provider/tools/loaders/native.py b/src/langbot/pkg/provider/tools/loaders/native.py index f8b8774e..3433345a 100644 --- a/src/langbot/pkg/provider/tools/loaders/native.py +++ b/src/langbot/pkg/provider/tools/loaders/native.py @@ -1,77 +1,154 @@ from __future__ import annotations import json +import os import langbot_plugin.api.entities.builtin.resource.tool as resource_tool from langbot_plugin.api.entities.events import pipeline_query -from langbot_plugin.box.models import BoxNetworkMode from .. import loader -SANDBOX_EXEC_TOOL_NAME = 'sandbox_exec' +EXEC_TOOL_NAME = 'exec' +READ_TOOL_NAME = 'read' +WRITE_TOOL_NAME = 'write' +EDIT_TOOL_NAME = 'edit' + +_ALL_TOOL_NAMES = {EXEC_TOOL_NAME, READ_TOOL_NAME, WRITE_TOOL_NAME, EDIT_TOOL_NAME} class NativeToolLoader(loader.ToolLoader): def __init__(self, ap): super().__init__(ap) - self._sandbox_exec_tool: resource_tool.LLMTool | None = None + self._tools: list[resource_tool.LLMTool] | None = None async def get_tools(self, bound_plugins: list[str] | None = None) -> list[resource_tool.LLMTool]: if not self._is_sandbox_available(): return [] - if self._sandbox_exec_tool is None: - self._sandbox_exec_tool = self._build_sandbox_exec_tool() - return [self._sandbox_exec_tool] + if self._tools is None: + self._tools = [ + self._build_exec_tool(), + self._build_read_tool(), + self._build_write_tool(), + self._build_edit_tool(), + ] + return list(self._tools) async def has_tool(self, name: str) -> bool: - return name == SANDBOX_EXEC_TOOL_NAME and self._is_sandbox_available() + return name in _ALL_TOOL_NAMES and self._is_sandbox_available() async def invoke_tool(self, name: str, parameters: dict, query: pipeline_query.Query): - if name != SANDBOX_EXEC_TOOL_NAME: + if name == EXEC_TOOL_NAME: + self.ap.logger.info( + 'exec tool invoked: ' + f'query_id={query.query_id} ' + f'parameters={json.dumps(self._summarize_parameters(parameters), ensure_ascii=False)}' + ) + return await self.ap.box_service.execute_tool(parameters, query) + elif name == READ_TOOL_NAME: + return await self._invoke_read(parameters, query) + elif name == WRITE_TOOL_NAME: + return await self._invoke_write(parameters, query) + elif name == EDIT_TOOL_NAME: + return await self._invoke_edit(parameters, query) + else: raise ValueError(f'未找到工具: {name}') - self.ap.logger.info( - 'sandbox_exec tool invoked: ' - f'query_id={query.query_id} ' - f'parameters={json.dumps(self._summarize_parameters(parameters), ensure_ascii=False)}' - ) - return await self.ap.box_service.execute_sandbox_tool(parameters, query) async def shutdown(self): pass + # ── File tool implementations ──────────────────────────────────── + + def _resolve_host_path(self, sandbox_path: str) -> str: + """Map a sandbox /workspace path to the host filesystem path.""" + box_service = self.ap.box_service + host_root = box_service.default_host_workspace + if host_root is None: + raise ValueError('No default host workspace configured for file operations.') + + mount_path = '/workspace' + if not sandbox_path.startswith(mount_path): + raise ValueError(f'Path must be under {mount_path}.') + + relative = sandbox_path[len(mount_path):].lstrip('/') + host_path = os.path.realpath(os.path.join(host_root, relative)) + + if not (host_path == host_root or host_path.startswith(host_root + os.sep)): + raise ValueError('Path escapes the workspace boundary.') + + return host_path + + async def _invoke_read(self, parameters: dict, query: pipeline_query.Query) -> dict: + path = parameters['path'] + self.ap.logger.info(f'read tool invoked: query_id={query.query_id} path={path}') + host_path = self._resolve_host_path(path) + if not os.path.exists(host_path): + return {'ok': False, 'error': f'File not found: {path}'} + if os.path.isdir(host_path): + entries = os.listdir(host_path) + return {'ok': True, 'content': '\n'.join(sorted(entries)), 'is_directory': True} + with open(host_path, 'r', errors='replace') as f: + content = f.read() + return {'ok': True, 'content': content} + + async def _invoke_write(self, parameters: dict, query: pipeline_query.Query) -> dict: + path = parameters['path'] + content = parameters['content'] + self.ap.logger.info(f'write tool invoked: query_id={query.query_id} path={path} length={len(content)}') + host_path = self._resolve_host_path(path) + os.makedirs(os.path.dirname(host_path), exist_ok=True) + with open(host_path, 'w') as f: + f.write(content) + return {'ok': True, 'path': path} + + async def _invoke_edit(self, parameters: dict, query: pipeline_query.Query) -> dict: + path = parameters['path'] + old_string = parameters['old_string'] + new_string = parameters['new_string'] + self.ap.logger.info( + f'edit tool invoked: query_id={query.query_id} path={path} ' + f'old_len={len(old_string)} new_len={len(new_string)}' + ) + host_path = self._resolve_host_path(path) + if not os.path.isfile(host_path): + return {'ok': False, 'error': f'File not found: {path}'} + with open(host_path, 'r', errors='replace') as f: + content = f.read() + count = content.count(old_string) + if count == 0: + return {'ok': False, 'error': 'old_string not found in file.'} + if count > 1: + return {'ok': False, 'error': f'old_string matches {count} locations; provide a more unique string.'} + new_content = content.replace(old_string, new_string, 1) + with open(host_path, 'w') as f: + f.write(new_content) + return {'ok': True, 'path': path} + + # ── Internals ──────────────────────────────────────────────────── + def _is_sandbox_available(self) -> bool: box_service = getattr(self.ap, 'box_service', None) return bool(getattr(box_service, 'available', False)) - def _build_sandbox_exec_tool(self) -> resource_tool.LLMTool: + def _build_exec_tool(self) -> resource_tool.LLMTool: return resource_tool.LLMTool( - name=SANDBOX_EXEC_TOOL_NAME, - human_desc='Execute a command inside the LangBot Box sandbox', + name=EXEC_TOOL_NAME, + human_desc='Execute a command in an isolated environment', description=( - 'Run shell commands only inside the isolated LangBot Box sandbox. ' - 'Use this tool for local file edits, bash commands, Python execution, and exact calculations over ' - 'user-provided data that must not touch the host.' + 'Run shell commands in an isolated execution environment. ' + 'Use this tool for bash commands, Python execution, and exact calculations ' + 'over user-provided data.' ), parameters={ 'type': 'object', 'properties': { - 'cmd': { + 'command': { 'type': 'string', - 'description': 'Shell command to execute inside the sandbox.', + 'description': 'Shell command to execute.', }, 'workdir': { 'type': 'string', 'description': ( - 'Absolute working directory path inside the sandbox. ' - 'Defaults to mount_path, or /workspace when mount_path is omitted.' - ), - 'default': '/workspace', - }, - 'mount_path': { - 'type': 'string', - 'description': ( - 'Absolute sandbox path where host_path is mounted. ' - 'Defaults to /workspace. When omitted, workdir defaults to the same path.' + 'Working directory for the command. Defaults to /workspace.' ), 'default': '/workspace', }, @@ -81,20 +158,90 @@ class NativeToolLoader(loader.ToolLoader): 'default': 30, 'minimum': 1, }, - 'network': { - 'type': 'string', - 'description': 'Network policy for the sandbox session. Prefer off unless network is required.', - 'enum': [e.value for e in BoxNetworkMode], - 'default': 'off', - }, 'env': { 'type': 'object', - 'description': 'Optional environment variables to expose inside the sandbox.', + 'description': 'Optional environment variables for the execution.', 'additionalProperties': {'type': 'string'}, 'default': {}, }, + 'description': { + 'type': 'string', + 'description': 'Brief description of what this command does, for logging and audit.', + }, }, - 'required': ['cmd'], + 'required': ['command'], + 'additionalProperties': False, + }, + func=lambda parameters: parameters, + ) + + def _build_read_tool(self) -> resource_tool.LLMTool: + return resource_tool.LLMTool( + name=READ_TOOL_NAME, + human_desc='Read a file from the workspace', + description='Read the contents of a file at the given path under /workspace.', + parameters={ + 'type': 'object', + 'properties': { + 'path': { + 'type': 'string', + 'description': 'Absolute path to the file (must be under /workspace).', + }, + }, + 'required': ['path'], + 'additionalProperties': False, + }, + func=lambda parameters: parameters, + ) + + def _build_write_tool(self) -> resource_tool.LLMTool: + return resource_tool.LLMTool( + name=WRITE_TOOL_NAME, + human_desc='Write a file to the workspace', + description='Create or overwrite a file at the given path under /workspace with the provided content.', + parameters={ + 'type': 'object', + 'properties': { + 'path': { + 'type': 'string', + 'description': 'Absolute path to the file (must be under /workspace).', + }, + 'content': { + 'type': 'string', + 'description': 'Content to write to the file.', + }, + }, + 'required': ['path', 'content'], + 'additionalProperties': False, + }, + func=lambda parameters: parameters, + ) + + def _build_edit_tool(self) -> resource_tool.LLMTool: + return resource_tool.LLMTool( + name=EDIT_TOOL_NAME, + human_desc='Edit a file in the workspace', + description=( + 'Perform an exact string replacement in a file under /workspace. ' + 'The old_string must appear exactly once in the file.' + ), + parameters={ + 'type': 'object', + 'properties': { + 'path': { + 'type': 'string', + 'description': 'Absolute path to the file (must be under /workspace).', + }, + 'old_string': { + 'type': 'string', + 'description': 'The exact string to find and replace.', + }, + 'new_string': { + 'type': 'string', + 'description': 'The replacement string.', + }, + }, + 'required': ['path', 'old_string', 'new_string'], 'additionalProperties': False, }, func=lambda parameters: parameters, @@ -102,10 +249,10 @@ class NativeToolLoader(loader.ToolLoader): def _summarize_parameters(self, parameters: dict) -> dict: summary = dict(parameters) - cmd = str(summary.get('cmd', '')).strip() + cmd = str(summary.get('command', '')).strip() if len(cmd) > 400: cmd = f'{cmd[:397]}...' - summary['cmd'] = cmd + summary['command'] = cmd env = summary.get('env') if isinstance(env, dict): diff --git a/src/langbot/pkg/provider/tools/toolmgr.py b/src/langbot/pkg/provider/tools/toolmgr.py index b819180a..e652b388 100644 --- a/src/langbot/pkg/provider/tools/toolmgr.py +++ b/src/langbot/pkg/provider/tools/toolmgr.py @@ -3,16 +3,12 @@ from __future__ import annotations import typing from typing import TYPE_CHECKING -from langbot.pkg.utils import importutil -from langbot.pkg.provider.tools import loaders -from langbot.pkg.provider.tools.loaders import mcp as mcp_loader, native as native_loader, plugin as plugin_loader import langbot_plugin.api.entities.builtin.resource.tool as resource_tool from langbot_plugin.api.entities.events import pipeline_query if TYPE_CHECKING: from ...core import app - -importutil.import_modules_in_pkg(loaders) + from langbot.pkg.provider.tools.loaders import mcp as mcp_loader, native as native_loader, plugin as plugin_loader class ToolManager: @@ -28,6 +24,12 @@ class ToolManager: self.ap = ap async def initialize(self): + from langbot.pkg.utils import importutil + from langbot.pkg.provider.tools import loaders + from langbot.pkg.provider.tools.loaders import mcp as mcp_loader, native as native_loader, plugin as plugin_loader + + importutil.import_modules_in_pkg(loaders) + self.native_tool_loader = native_loader.NativeToolLoader(self.ap) await self.native_tool_loader.initialize() self.plugin_tool_loader = plugin_loader.PluginToolLoader(self.ap) diff --git a/tests/integration_tests/box/test_box_integration.py b/tests/integration_tests/box/test_box_integration.py index ca0189c9..2754293d 100644 --- a/tests/integration_tests/box/test_box_integration.py +++ b/tests/integration_tests/box/test_box_integration.py @@ -309,8 +309,8 @@ async def test_full_service_to_remote_runtime(tmp_path): await service.initialize() query = pipeline_query.Query.model_construct(query_id=42) - result = await service.execute_sandbox_tool( - {'cmd': 'echo service-path', 'image': _TEST_IMAGE}, + result = await service.execute_tool( + {'command': 'echo service-path'}, query, ) diff --git a/tests/unit_tests/box/test_box_service.py b/tests/unit_tests/box/test_box_service.py index 4f741fba..71f61dea 100644 --- a/tests/unit_tests/box/test_box_service.py +++ b/tests/unit_tests/box/test_box_service.py @@ -236,7 +236,7 @@ async def test_box_service_defaults_session_id_from_query(): service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime)) await service.initialize() - result = await service.execute_sandbox_tool({'cmd': 'pwd', 'network': BoxNetworkMode.OFF.value}, make_query(7)) + result = await service.execute_tool({'command': 'pwd'}, make_query(7)) assert result['session_id'] == '7' assert result['ok'] is True @@ -252,7 +252,7 @@ async def test_box_service_fails_closed_when_backend_unavailable(): await service.initialize() with pytest.raises(BoxBackendUnavailableError): - await service.execute_sandbox_tool({'cmd': 'echo hello'}, make_query(9)) + await service.execute_tool({'command': 'echo hello'}, make_query(9)) @pytest.mark.asyncio @@ -265,11 +265,12 @@ async def test_box_service_allows_host_mount_under_configured_root(tmp_path): service = BoxService(make_app(logger, [str(tmp_path)]), client=_InProcessBoxRuntimeClient(logger, runtime)) await service.initialize() - result = await service.execute_sandbox_tool( + result = await service.execute_spec_payload( { 'cmd': 'pwd', 'host_path': str(host_dir), 'host_path_mode': BoxHostMountMode.READ_WRITE.value, + 'session_id': '11', }, make_query(11), ) @@ -290,7 +291,7 @@ async def test_box_service_uses_default_host_workspace_when_host_path_omitted(tm service = BoxService(app, client=_InProcessBoxRuntimeClient(logger, runtime)) await service.initialize() - result = await service.execute_sandbox_tool({'cmd': 'pwd'}, make_query(15)) + result = await service.execute_tool({'command': 'pwd'}, make_query(15)) assert result['ok'] is True assert backend.start_calls == ['15'] @@ -345,10 +346,11 @@ async def test_box_service_rejects_host_mount_outside_allowed_roots(tmp_path): await service.initialize() with pytest.raises(BoxValidationError): - await service.execute_sandbox_tool( + await service.execute_spec_payload( { 'cmd': 'pwd', 'host_path': str(disallowed_root), + 'session_id': '12', }, make_query(12), ) @@ -435,7 +437,7 @@ async def test_truncate_short_output_unchanged(): service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime), output_limit_chars=100) await service.initialize() - result = await service.execute_sandbox_tool({'cmd': 'echo hello'}, make_query(20)) + result = await service.execute_tool({'command': 'echo hello'}, make_query(20)) assert result['stdout'] == 'hello world' assert result['stdout_truncated'] is False @@ -456,7 +458,7 @@ async def test_truncate_preserves_head_and_tail(): service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime), output_limit_chars=limit) await service.initialize() - result = await service.execute_sandbox_tool({'cmd': 'cat big'}, make_query(21)) + result = await service.execute_tool({'command': 'cat big'}, make_query(21)) assert result['stdout_truncated'] is True stdout = result['stdout'] @@ -478,7 +480,7 @@ async def test_truncate_at_exact_limit_not_truncated(): service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime), output_limit_chars=200) await service.initialize() - result = await service.execute_sandbox_tool({'cmd': 'echo a'}, make_query(22)) + result = await service.execute_tool({'command': 'echo a'}, make_query(22)) assert result['stdout'] == exact_output assert result['stdout_truncated'] is False @@ -492,7 +494,7 @@ async def test_truncate_stderr_independently(): service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime), output_limit_chars=100) await service.initialize() - result = await service.execute_sandbox_tool({'cmd': 'fail'}, make_query(23)) + result = await service.execute_tool({'command': 'fail'}, make_query(23)) assert result['stdout_truncated'] is False assert result['stderr_truncated'] is True @@ -512,7 +514,7 @@ async def test_profile_default_provides_defaults(): service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime)) await service.initialize() - result = await service.execute_sandbox_tool({'cmd': 'echo hi'}, make_query(30)) + result = await service.execute_tool({'command': 'echo hi'}, make_query(30)) assert result['ok'] is True spec = backend.start_specs[0] @@ -523,15 +525,15 @@ async def test_profile_default_provides_defaults(): @pytest.mark.asyncio async def test_profile_unlocked_field_can_be_overridden(): - """Tool call can override unlocked profile fields.""" + """Spec payload can override unlocked profile fields.""" logger = Mock() backend = FakeBackend(logger) runtime = BoxRuntime(logger=logger, backends=[backend], session_ttl_sec=300) service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime)) await service.initialize() - result = await service.execute_sandbox_tool( - {'cmd': 'echo hi', 'timeout_sec': 60, 'network': 'on'}, + result = await service.execute_spec_payload( + {'cmd': 'echo hi', 'timeout_sec': 60, 'network': 'on', 'session_id': '31'}, make_query(31), ) @@ -552,8 +554,8 @@ async def test_profile_locked_field_cannot_be_overridden(): ) await service.initialize() - result = await service.execute_sandbox_tool( - {'cmd': 'echo hi', 'network': 'on', 'host_path_mode': 'rw'}, + result = await service.execute_spec_payload( + {'cmd': 'echo hi', 'network': 'on', 'host_path_mode': 'rw', 'session_id': '32'}, make_query(32), ) @@ -572,10 +574,7 @@ async def test_profile_timeout_clamped_to_max(): service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime)) await service.initialize() - result = await service.execute_sandbox_tool( - {'cmd': 'echo hi', 'timeout_sec': 999}, - make_query(33), - ) + result = await service.execute_tool({'command': 'echo hi', 'timeout_sec': 999}, make_query(33)) assert result['ok'] is True spec = backend.start_specs[0] @@ -592,10 +591,7 @@ async def test_profile_timeout_clamped_for_coercible_inputs(timeout_value): service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime)) await service.initialize() - await service.execute_sandbox_tool( - {'cmd': 'echo hi', 'timeout_sec': timeout_value}, - make_query(34), - ) + await service.execute_tool({'command': 'echo hi', 'timeout_sec': timeout_value}, make_query(34)) spec = backend.start_specs[0] assert spec.timeout_sec == 120 @@ -644,7 +640,7 @@ async def test_profile_default_applies_resource_limits(): service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime)) await service.initialize() - await service.execute_sandbox_tool({'cmd': 'echo hi'}, make_query(40)) + await service.execute_tool({'command': 'echo hi'}, make_query(40)) spec = backend.start_specs[0] profile = BUILTIN_PROFILES['default'] @@ -665,10 +661,7 @@ async def test_profile_offline_readonly_locks_read_only_rootfs(): ) await service.initialize() - await service.execute_sandbox_tool( - {'cmd': 'echo hi', 'read_only_rootfs': False}, - make_query(41), - ) + await service.execute_spec_payload({'cmd': 'echo hi', 'read_only_rootfs': False, 'session_id': '41'}, make_query(41)) spec = backend.start_specs[0] assert spec.read_only_rootfs is True @@ -685,7 +678,7 @@ async def test_profile_network_extended_has_relaxed_limits(): ) await service.initialize() - await service.execute_sandbox_tool({'cmd': 'echo hi'}, make_query(42)) + await service.execute_tool({'command': 'echo hi'}, make_query(42)) spec = backend.start_specs[0] assert spec.network == BoxNetworkMode.ON @@ -761,7 +754,7 @@ async def test_service_records_errors_on_failure(): await service.initialize() with pytest.raises(Exception): - await service.execute_sandbox_tool({'cmd': 'echo hello'}, make_query(50)) + await service.execute_tool({'command': 'echo hello'}, make_query(50)) errors = service.get_recent_errors() assert len(errors) == 1 @@ -780,7 +773,7 @@ async def test_service_error_ring_buffer_capped(): for i in range(60): with pytest.raises(Exception): - await service.execute_sandbox_tool({'cmd': 'fail'}, make_query(100 + i)) + await service.execute_tool({'command': 'fail'}, make_query(100 + i)) errors = service.get_recent_errors() assert len(errors) == 50 diff --git a/tests/unit_tests/pipeline/test_chat_handler_logging.py b/tests/unit_tests/pipeline/test_chat_handler_logging.py index 681386be..6ae85558 100644 --- a/tests/unit_tests/pipeline/test_chat_handler_logging.py +++ b/tests/unit_tests/pipeline/test_chat_handler_logging.py @@ -30,14 +30,14 @@ def test_chat_handler_formats_tool_call_request_log(): provider_message.ToolCall( id='call-1', type='function', - function=provider_message.FunctionCall(name='sandbox_exec', arguments='{}'), + function=provider_message.FunctionCall(name='exec', arguments='{}'), ) ], ) summary = handler.format_result_log(result) - assert summary == 'assistant: requested tools: sandbox_exec' + assert summary == 'assistant: requested tools: exec' def test_chat_handler_formats_tool_result_log(): diff --git a/tests/unit_tests/provider/test_localagent_sandbox_exec.py b/tests/unit_tests/provider/test_localagent_sandbox_exec.py index df1e8747..f508d0d5 100644 --- a/tests/unit_tests/provider/test_localagent_sandbox_exec.py +++ b/tests/unit_tests/provider/test_localagent_sandbox_exec.py @@ -35,9 +35,9 @@ class RecordingProvider: id='call-1', type='function', function=provider_message.FunctionCall( - name='sandbox_exec', + name='exec', arguments=json.dumps( - {'cmd': ("python - <<'PY'\nnums = [1, 2, 3, 4]\nprint(sum(nums) / len(nums))\nPY")} + {'command': ("python - <<'PY'\nnums = [1, 2, 3, 4]\nprint(sum(nums) / len(nums))\nPY")} ), ), ) @@ -73,8 +73,8 @@ class RecordingStreamProvider: id='call-1', type='function', function=provider_message.FunctionCall( - name='sandbox_exec', - arguments=json.dumps({'cmd': "python -c 'print(1)'"}), + name='exec', + arguments=json.dumps({'command': "python -c 'print(1)'"}), ), ) ], @@ -118,14 +118,14 @@ def make_query() -> pipeline_query.Query: role='user', content='Please calculate the average of 1, 2, 3, and 4.', ), - use_funcs=[SimpleNamespace(name='sandbox_exec')], + use_funcs=[SimpleNamespace(name='exec')], use_llm_model_uuid='test-model-uuid', variables={}, ) @pytest.mark.asyncio -async def test_localagent_uses_sandbox_exec_for_exact_calculation(): +async def test_localagent_uses_exec_for_exact_calculation(): provider = RecordingProvider() model = SimpleNamespace( provider=provider, @@ -160,11 +160,11 @@ async def test_localagent_uses_sandbox_exec_for_exact_calculation(): box_service=SimpleNamespace( get_system_guidance=Mock( return_value=( - 'When sandbox_exec is available, use it for exact calculations, statistics, ' + 'When the exec tool is available, use it for exact calculations, statistics, ' 'structured data parsing, and code execution instead of estimating mentally. ' 'Unless the user explicitly asks for the script, code, or implementation details, ' 'do not include the generated script in the final answer. ' - 'A default host workspace is mounted at /workspace for file tasks.' + 'A default workspace is mounted at /workspace for file tasks.' ) ), ), @@ -180,19 +180,19 @@ async def test_localagent_uses_sandbox_exec_for_exact_calculation(): tool_manager.execute_func_call.assert_awaited_once() tool_name, tool_parameters = tool_manager.execute_func_call.await_args.args[:2] - assert tool_name == 'sandbox_exec' - assert 'print(sum(nums) / len(nums))' in tool_parameters['cmd'] + assert tool_name == 'exec' + assert 'print(sum(nums) / len(nums))' in tool_parameters['command'] first_request = provider.requests[0] assert any( message.role == 'system' - and 'sandbox_exec' in str(message.content) + and 'exec' in str(message.content) and 'exact calculations' in str(message.content) and 'Unless the user explicitly asks for the script' in str(message.content) and '/workspace' in str(message.content) for message in first_request['messages'] ) - assert [tool.name for tool in first_request['funcs']] == ['sandbox_exec'] + assert [tool.name for tool in first_request['funcs']] == ['exec'] @pytest.mark.asyncio diff --git a/tests/unit_tests/provider/test_tool_manager_native.py b/tests/unit_tests/provider/test_tool_manager_native.py index f43ee27c..d08dad8b 100644 --- a/tests/unit_tests/provider/test_tool_manager_native.py +++ b/tests/unit_tests/provider/test_tool_manager_native.py @@ -1,5 +1,7 @@ from __future__ import annotations +import os +import tempfile from types import SimpleNamespace from unittest.mock import Mock @@ -42,41 +44,191 @@ def make_tool(name: str) -> resource_tool.LLMTool: @pytest.mark.asyncio async def test_tool_manager_lists_native_tools_first(): manager = ToolManager(SimpleNamespace()) - manager.native_tool_loader = StubLoader([make_tool('sandbox_exec')]) + manager.native_tool_loader = StubLoader([make_tool('exec')]) manager.plugin_tool_loader = StubLoader([make_tool('plugin_tool')]) manager.mcp_tool_loader = StubLoader([make_tool('mcp_tool')]) tools = await manager.get_all_tools() - assert [tool.name for tool in tools] == ['sandbox_exec', 'plugin_tool', 'mcp_tool'] + assert [tool.name for tool in tools] == ['exec', 'plugin_tool', 'mcp_tool'] @pytest.mark.asyncio async def test_tool_manager_routes_native_tool_calls(): app = SimpleNamespace() manager = ToolManager(app) - manager.native_tool_loader = StubLoader([make_tool('sandbox_exec')], invoke_result={'backend': 'fake'}) + manager.native_tool_loader = StubLoader([make_tool('exec')], invoke_result={'backend': 'fake'}) manager.plugin_tool_loader = StubLoader([make_tool('plugin_tool')]) manager.mcp_tool_loader = StubLoader([make_tool('mcp_tool')]) - result = await manager.execute_func_call('sandbox_exec', {'cmd': 'pwd'}, query=Mock()) + result = await manager.execute_func_call('exec', {'command': 'pwd'}, query=Mock()) assert result == {'backend': 'fake'} @pytest.mark.asyncio -async def test_native_tool_loader_hides_sandbox_exec_when_box_unavailable(): +async def test_native_tool_loader_hides_tools_when_box_unavailable(): loader = NativeToolLoader(SimpleNamespace(box_service=SimpleNamespace(available=False))) assert await loader.get_tools() == [] - assert await loader.has_tool('sandbox_exec') is False + assert await loader.has_tool('exec') is False + assert await loader.has_tool('read') is False + assert await loader.has_tool('write') is False + assert await loader.has_tool('edit') is False @pytest.mark.asyncio -async def test_native_tool_loader_exposes_sandbox_exec_when_box_available(): +async def test_native_tool_loader_exposes_all_tools_when_box_available(): loader = NativeToolLoader(SimpleNamespace(box_service=SimpleNamespace(available=True))) tools = await loader.get_tools() - assert [tool.name for tool in tools] == ['sandbox_exec'] - assert await loader.has_tool('sandbox_exec') is True + assert [tool.name for tool in tools] == ['exec', 'read', 'write', 'edit'] + assert await loader.has_tool('exec') is True + assert await loader.has_tool('read') is True + assert await loader.has_tool('write') is True + assert await loader.has_tool('edit') is True + + +# ── read/write/edit file tool tests ───────────────────────────── + + +def _make_loader_with_workspace(tmpdir: str) -> tuple[NativeToolLoader, Mock]: + logger = Mock() + box_service = SimpleNamespace(available=True, default_host_workspace=tmpdir) + ap = SimpleNamespace(box_service=box_service, logger=logger) + return NativeToolLoader(ap), logger + + +def _make_query() -> Mock: + q = Mock() + q.query_id = 'test-query-1' + return q + + +@pytest.mark.asyncio +async def test_read_file(): + with tempfile.TemporaryDirectory() as tmpdir: + loader, _ = _make_loader_with_workspace(tmpdir) + with open(os.path.join(tmpdir, 'hello.txt'), 'w') as f: + f.write('hello world') + + result = await loader.invoke_tool('read', {'path': '/workspace/hello.txt'}, _make_query()) + + assert result['ok'] is True + assert result['content'] == 'hello world' + + +@pytest.mark.asyncio +async def test_read_nonexistent_file(): + with tempfile.TemporaryDirectory() as tmpdir: + loader, _ = _make_loader_with_workspace(tmpdir) + + result = await loader.invoke_tool('read', {'path': '/workspace/no_such.txt'}, _make_query()) + + assert result['ok'] is False + assert 'not found' in result['error'].lower() + + +@pytest.mark.asyncio +async def test_read_directory(): + with tempfile.TemporaryDirectory() as tmpdir: + loader, _ = _make_loader_with_workspace(tmpdir) + os.makedirs(os.path.join(tmpdir, 'subdir')) + with open(os.path.join(tmpdir, 'a.txt'), 'w') as f: + f.write('a') + + result = await loader.invoke_tool('read', {'path': '/workspace'}, _make_query()) + + assert result['ok'] is True + assert result['is_directory'] is True + assert 'a.txt' in result['content'] + + +@pytest.mark.asyncio +async def test_write_creates_file(): + with tempfile.TemporaryDirectory() as tmpdir: + loader, _ = _make_loader_with_workspace(tmpdir) + + result = await loader.invoke_tool( + 'write', {'path': '/workspace/new.txt', 'content': 'new content'}, _make_query() + ) + + assert result['ok'] is True + with open(os.path.join(tmpdir, 'new.txt')) as f: + assert f.read() == 'new content' + + +@pytest.mark.asyncio +async def test_write_creates_subdirectories(): + with tempfile.TemporaryDirectory() as tmpdir: + loader, _ = _make_loader_with_workspace(tmpdir) + + result = await loader.invoke_tool( + 'write', {'path': '/workspace/sub/deep/file.txt', 'content': 'nested'}, _make_query() + ) + + assert result['ok'] is True + with open(os.path.join(tmpdir, 'sub', 'deep', 'file.txt')) as f: + assert f.read() == 'nested' + + +@pytest.mark.asyncio +async def test_edit_replaces_unique_string(): + with tempfile.TemporaryDirectory() as tmpdir: + loader, _ = _make_loader_with_workspace(tmpdir) + with open(os.path.join(tmpdir, 'code.py'), 'w') as f: + f.write('def foo():\n return 1\n') + + result = await loader.invoke_tool( + 'edit', + {'path': '/workspace/code.py', 'old_string': 'return 1', 'new_string': 'return 42'}, + _make_query(), + ) + + assert result['ok'] is True + with open(os.path.join(tmpdir, 'code.py')) as f: + assert f.read() == 'def foo():\n return 42\n' + + +@pytest.mark.asyncio +async def test_edit_rejects_ambiguous_match(): + with tempfile.TemporaryDirectory() as tmpdir: + loader, _ = _make_loader_with_workspace(tmpdir) + with open(os.path.join(tmpdir, 'dup.txt'), 'w') as f: + f.write('aaa\naaa\n') + + result = await loader.invoke_tool( + 'edit', + {'path': '/workspace/dup.txt', 'old_string': 'aaa', 'new_string': 'bbb'}, + _make_query(), + ) + + assert result['ok'] is False + assert '2' in result['error'] + + +@pytest.mark.asyncio +async def test_edit_rejects_missing_string(): + with tempfile.TemporaryDirectory() as tmpdir: + loader, _ = _make_loader_with_workspace(tmpdir) + with open(os.path.join(tmpdir, 'x.txt'), 'w') as f: + f.write('hello') + + result = await loader.invoke_tool( + 'edit', + {'path': '/workspace/x.txt', 'old_string': 'nope', 'new_string': 'yes'}, + _make_query(), + ) + + assert result['ok'] is False + assert 'not found' in result['error'].lower() + + +@pytest.mark.asyncio +async def test_path_escape_blocked(): + with tempfile.TemporaryDirectory() as tmpdir: + loader, _ = _make_loader_with_workspace(tmpdir) + + with pytest.raises(ValueError, match='escapes'): + await loader.invoke_tool('read', {'path': '/workspace/../../etc/passwd'}, _make_query())