feat(box): unify native agent tools around exec/read/write/edit

2026-06-19 03:54:19 +00:00 · 2026-03-24 07:57:05 +00:00
parent 3f368c5764
commit 93104a947a
10 changed files with 519 additions and 114 deletions
@@ -309,8 +309,8 @@ async def test_full_service_to_remote_runtime(tmp_path):
        await service.initialize()

        query = pipeline_query.Query.model_construct(query_id=42)
-        result = await service.execute_sandbox_tool(
-            {'cmd': 'echo service-path', 'image': _TEST_IMAGE},
+        result = await service.execute_tool(
+            {'command': 'echo service-path'},
            query,
        )

@@ -236,7 +236,7 @@ async def test_box_service_defaults_session_id_from_query():
    service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime))
    await service.initialize()

-    result = await service.execute_sandbox_tool({'cmd': 'pwd', 'network': BoxNetworkMode.OFF.value}, make_query(7))
+    result = await service.execute_tool({'command': 'pwd'}, make_query(7))

    assert result['session_id'] == '7'
    assert result['ok'] is True
@@ -252,7 +252,7 @@ async def test_box_service_fails_closed_when_backend_unavailable():
    await service.initialize()

    with pytest.raises(BoxBackendUnavailableError):
-        await service.execute_sandbox_tool({'cmd': 'echo hello'}, make_query(9))
+        await service.execute_tool({'command': 'echo hello'}, make_query(9))


@pytest.mark.asyncio
@@ -265,11 +265,12 @@ async def test_box_service_allows_host_mount_under_configured_root(tmp_path):
    service = BoxService(make_app(logger, [str(tmp_path)]), client=_InProcessBoxRuntimeClient(logger, runtime))
    await service.initialize()

-    result = await service.execute_sandbox_tool(
+    result = await service.execute_spec_payload(
        {
            'cmd': 'pwd',
            'host_path': str(host_dir),
            'host_path_mode': BoxHostMountMode.READ_WRITE.value,
+            'session_id': '11',
        },
        make_query(11),
    )
@@ -290,7 +291,7 @@ async def test_box_service_uses_default_host_workspace_when_host_path_omitted(tm
    service = BoxService(app, client=_InProcessBoxRuntimeClient(logger, runtime))
    await service.initialize()

-    result = await service.execute_sandbox_tool({'cmd': 'pwd'}, make_query(15))
+    result = await service.execute_tool({'command': 'pwd'}, make_query(15))

    assert result['ok'] is True
    assert backend.start_calls == ['15']
@@ -345,10 +346,11 @@ async def test_box_service_rejects_host_mount_outside_allowed_roots(tmp_path):
    await service.initialize()

    with pytest.raises(BoxValidationError):
-        await service.execute_sandbox_tool(
+        await service.execute_spec_payload(
            {
                'cmd': 'pwd',
                'host_path': str(disallowed_root),
+                'session_id': '12',
            },
            make_query(12),
        )
@@ -435,7 +437,7 @@ async def test_truncate_short_output_unchanged():
    service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime), output_limit_chars=100)
    await service.initialize()

-    result = await service.execute_sandbox_tool({'cmd': 'echo hello'}, make_query(20))
+    result = await service.execute_tool({'command': 'echo hello'}, make_query(20))

    assert result['stdout'] == 'hello world'
    assert result['stdout_truncated'] is False
@@ -456,7 +458,7 @@ async def test_truncate_preserves_head_and_tail():
    service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime), output_limit_chars=limit)
    await service.initialize()

-    result = await service.execute_sandbox_tool({'cmd': 'cat big'}, make_query(21))
+    result = await service.execute_tool({'command': 'cat big'}, make_query(21))

    assert result['stdout_truncated'] is True
    stdout = result['stdout']
@@ -478,7 +480,7 @@ async def test_truncate_at_exact_limit_not_truncated():
    service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime), output_limit_chars=200)
    await service.initialize()

-    result = await service.execute_sandbox_tool({'cmd': 'echo a'}, make_query(22))
+    result = await service.execute_tool({'command': 'echo a'}, make_query(22))

    assert result['stdout'] == exact_output
    assert result['stdout_truncated'] is False
@@ -492,7 +494,7 @@ async def test_truncate_stderr_independently():
    service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime), output_limit_chars=100)
    await service.initialize()

-    result = await service.execute_sandbox_tool({'cmd': 'fail'}, make_query(23))
+    result = await service.execute_tool({'command': 'fail'}, make_query(23))

    assert result['stdout_truncated'] is False
    assert result['stderr_truncated'] is True
@@ -512,7 +514,7 @@ async def test_profile_default_provides_defaults():
    service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime))
    await service.initialize()

-    result = await service.execute_sandbox_tool({'cmd': 'echo hi'}, make_query(30))
+    result = await service.execute_tool({'command': 'echo hi'}, make_query(30))

    assert result['ok'] is True
    spec = backend.start_specs[0]
@@ -523,15 +525,15 @@ async def test_profile_default_provides_defaults():

@pytest.mark.asyncio
 async def test_profile_unlocked_field_can_be_overridden():
-    """Tool call can override unlocked profile fields."""
+    """Spec payload can override unlocked profile fields."""
    logger = Mock()
    backend = FakeBackend(logger)
    runtime = BoxRuntime(logger=logger, backends=[backend], session_ttl_sec=300)
    service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime))
    await service.initialize()

-    result = await service.execute_sandbox_tool(
-        {'cmd': 'echo hi', 'timeout_sec': 60, 'network': 'on'},
+    result = await service.execute_spec_payload(
+        {'cmd': 'echo hi', 'timeout_sec': 60, 'network': 'on', 'session_id': '31'},
        make_query(31),
    )

@@ -552,8 +554,8 @@ async def test_profile_locked_field_cannot_be_overridden():
    )
    await service.initialize()

-    result = await service.execute_sandbox_tool(
-        {'cmd': 'echo hi', 'network': 'on', 'host_path_mode': 'rw'},
+    result = await service.execute_spec_payload(
+        {'cmd': 'echo hi', 'network': 'on', 'host_path_mode': 'rw', 'session_id': '32'},
        make_query(32),
    )

@@ -572,10 +574,7 @@ async def test_profile_timeout_clamped_to_max():
    service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime))
    await service.initialize()

-    result = await service.execute_sandbox_tool(
-        {'cmd': 'echo hi', 'timeout_sec': 999},
-        make_query(33),
-    )
+    result = await service.execute_tool({'command': 'echo hi', 'timeout_sec': 999}, make_query(33))

    assert result['ok'] is True
    spec = backend.start_specs[0]
@@ -592,10 +591,7 @@ async def test_profile_timeout_clamped_for_coercible_inputs(timeout_value):
    service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime))
    await service.initialize()

-    await service.execute_sandbox_tool(
-        {'cmd': 'echo hi', 'timeout_sec': timeout_value},
-        make_query(34),
-    )
+    await service.execute_tool({'command': 'echo hi', 'timeout_sec': timeout_value}, make_query(34))

    spec = backend.start_specs[0]
    assert spec.timeout_sec == 120
@@ -644,7 +640,7 @@ async def test_profile_default_applies_resource_limits():
    service = BoxService(make_app(logger), client=_InProcessBoxRuntimeClient(logger, runtime))
    await service.initialize()

-    await service.execute_sandbox_tool({'cmd': 'echo hi'}, make_query(40))
+    await service.execute_tool({'command': 'echo hi'}, make_query(40))

    spec = backend.start_specs[0]
    profile = BUILTIN_PROFILES['default']
@@ -665,10 +661,7 @@ async def test_profile_offline_readonly_locks_read_only_rootfs():
    )
    await service.initialize()

-    await service.execute_sandbox_tool(
-        {'cmd': 'echo hi', 'read_only_rootfs': False},
-        make_query(41),
-    )
+    await service.execute_spec_payload({'cmd': 'echo hi', 'read_only_rootfs': False, 'session_id': '41'}, make_query(41))

    spec = backend.start_specs[0]
    assert spec.read_only_rootfs is True
@@ -685,7 +678,7 @@ async def test_profile_network_extended_has_relaxed_limits():
    )
    await service.initialize()

-    await service.execute_sandbox_tool({'cmd': 'echo hi'}, make_query(42))
+    await service.execute_tool({'command': 'echo hi'}, make_query(42))

    spec = backend.start_specs[0]
    assert spec.network == BoxNetworkMode.ON
@@ -761,7 +754,7 @@ async def test_service_records_errors_on_failure():
    await service.initialize()

    with pytest.raises(Exception):
-        await service.execute_sandbox_tool({'cmd': 'echo hello'}, make_query(50))
+        await service.execute_tool({'command': 'echo hello'}, make_query(50))

    errors = service.get_recent_errors()
    assert len(errors) == 1
@@ -780,7 +773,7 @@ async def test_service_error_ring_buffer_capped():

    for i in range(60):
        with pytest.raises(Exception):
-            await service.execute_sandbox_tool({'cmd': 'fail'}, make_query(100 + i))
+            await service.execute_tool({'command': 'fail'}, make_query(100 + i))

    errors = service.get_recent_errors()
    assert len(errors) == 50
@@ -30,14 +30,14 @@ def test_chat_handler_formats_tool_call_request_log():
            provider_message.ToolCall(
                id='call-1',
                type='function',
-                function=provider_message.FunctionCall(name='sandbox_exec', arguments='{}'),
+                function=provider_message.FunctionCall(name='exec', arguments='{}'),
            )
        ],
    )

    summary = handler.format_result_log(result)

-    assert summary == 'assistant: requested tools: sandbox_exec'
+    assert summary == 'assistant: requested tools: exec'


 def test_chat_handler_formats_tool_result_log():
@@ -35,9 +35,9 @@ class RecordingProvider:
                        id='call-1',
                        type='function',
                        function=provider_message.FunctionCall(
-                            name='sandbox_exec',
+                            name='exec',
                            arguments=json.dumps(
-                                {'cmd': ("python - <<'PY'\nnums = [1, 2, 3, 4]\nprint(sum(nums) / len(nums))\nPY")}
+                                {'command': ("python - <<'PY'\nnums = [1, 2, 3, 4]\nprint(sum(nums) / len(nums))\nPY")}
                            ),
                        ),
                    )
@@ -73,8 +73,8 @@ class RecordingStreamProvider:
                            id='call-1',
                            type='function',
                            function=provider_message.FunctionCall(
-                                name='sandbox_exec',
-                                arguments=json.dumps({'cmd': "python -c 'print(1)'"}),
+                                name='exec',
+                                arguments=json.dumps({'command': "python -c 'print(1)'"}),
                            ),
                        )
                    ],
@@ -118,14 +118,14 @@ def make_query() -> pipeline_query.Query:
            role='user',
            content='Please calculate the average of 1, 2, 3, and 4.',
        ),
-        use_funcs=[SimpleNamespace(name='sandbox_exec')],
+        use_funcs=[SimpleNamespace(name='exec')],
        use_llm_model_uuid='test-model-uuid',
        variables={},
    )


@pytest.mark.asyncio
-async def test_localagent_uses_sandbox_exec_for_exact_calculation():
+async def test_localagent_uses_exec_for_exact_calculation():
    provider = RecordingProvider()
    model = SimpleNamespace(
        provider=provider,
@@ -160,11 +160,11 @@ async def test_localagent_uses_sandbox_exec_for_exact_calculation():
        box_service=SimpleNamespace(
            get_system_guidance=Mock(
                return_value=(
-                    'When sandbox_exec is available, use it for exact calculations, statistics, '
+                    'When the exec tool is available, use it for exact calculations, statistics, '
                    'structured data parsing, and code execution instead of estimating mentally. '
                    'Unless the user explicitly asks for the script, code, or implementation details, '
                    'do not include the generated script in the final answer. '
-                    'A default host workspace is mounted at /workspace for file tasks.'
+                    'A default workspace is mounted at /workspace for file tasks.'
                )
            ),
        ),
@@ -180,19 +180,19 @@ async def test_localagent_uses_sandbox_exec_for_exact_calculation():

    tool_manager.execute_func_call.assert_awaited_once()
    tool_name, tool_parameters = tool_manager.execute_func_call.await_args.args[:2]
-    assert tool_name == 'sandbox_exec'
-    assert 'print(sum(nums) / len(nums))' in tool_parameters['cmd']
+    assert tool_name == 'exec'
+    assert 'print(sum(nums) / len(nums))' in tool_parameters['command']

    first_request = provider.requests[0]
    assert any(
        message.role == 'system'
-        and 'sandbox_exec' in str(message.content)
+        and 'exec' in str(message.content)
        and 'exact calculations' in str(message.content)
        and 'Unless the user explicitly asks for the script' in str(message.content)
        and '/workspace' in str(message.content)
        for message in first_request['messages']
    )
-    assert [tool.name for tool in first_request['funcs']] == ['sandbox_exec']
+    assert [tool.name for tool in first_request['funcs']] == ['exec']


@pytest.mark.asyncio
@@ -1,5 +1,7 @@
 from __future__ import annotations

+import os
+import tempfile
 from types import SimpleNamespace
 from unittest.mock import Mock

@@ -42,41 +44,191 @@ def make_tool(name: str) -> resource_tool.LLMTool:
@pytest.mark.asyncio
 async def test_tool_manager_lists_native_tools_first():
+    """The combined tool listing is expected to put the native tool before plugin and MCP tools."""
    manager = ToolManager(SimpleNamespace())
-    manager.native_tool_loader = StubLoader([make_tool('sandbox_exec')])
+    manager.native_tool_loader = StubLoader([make_tool('exec')])
    manager.plugin_tool_loader = StubLoader([make_tool('plugin_tool')])
    manager.mcp_tool_loader = StubLoader([make_tool('mcp_tool')])

    tools = await manager.get_all_tools()

-    assert [tool.name for tool in tools] == ['sandbox_exec', 'plugin_tool', 'mcp_tool']
+    assert [tool.name for tool in tools] == ['exec', 'plugin_tool', 'mcp_tool']


@pytest.mark.asyncio
 async def test_tool_manager_routes_native_tool_calls():
+    """Calling `exec` through the manager is expected to yield the native loader's configured result."""
    app = SimpleNamespace()
    manager = ToolManager(app)
-    manager.native_tool_loader = StubLoader([make_tool('sandbox_exec')], invoke_result={'backend': 'fake'})
+    manager.native_tool_loader = StubLoader([make_tool('exec')], invoke_result={'backend': 'fake'})
    manager.plugin_tool_loader = StubLoader([make_tool('plugin_tool')])
    manager.mcp_tool_loader = StubLoader([make_tool('mcp_tool')])

-    result = await manager.execute_func_call('sandbox_exec', {'cmd': 'pwd'}, query=Mock())
+    result = await manager.execute_func_call('exec', {'command': 'pwd'}, query=Mock())

+    # only the native stub was given an invoke_result, so this value identifies the route taken
    assert result == {'backend': 'fake'}


@pytest.mark.asyncio
-async def test_native_tool_loader_hides_sandbox_exec_when_box_unavailable():
+async def test_native_tool_loader_hides_tools_when_box_unavailable():
+    """With `box_service.available` False, the loader lists no tools and reports none of exec/read/write/edit."""
    loader = NativeToolLoader(SimpleNamespace(box_service=SimpleNamespace(available=False)))

    assert await loader.get_tools() == []
-    assert await loader.has_tool('sandbox_exec') is False
+    assert await loader.has_tool('exec') is False
+    assert await loader.has_tool('read') is False
+    assert await loader.has_tool('write') is False
+    assert await loader.has_tool('edit') is False


@pytest.mark.asyncio
-async def test_native_tool_loader_exposes_sandbox_exec_when_box_available():
+async def test_native_tool_loader_exposes_all_tools_when_box_available():
+    """With `box_service.available` True, the loader lists exec, read, write, edit in that order."""
    loader = NativeToolLoader(SimpleNamespace(box_service=SimpleNamespace(available=True)))

    tools = await loader.get_tools()

-    assert [tool.name for tool in tools] == ['sandbox_exec']
-    assert await loader.has_tool('sandbox_exec') is True
+    assert [tool.name for tool in tools] == ['exec', 'read', 'write', 'edit']
+    assert await loader.has_tool('exec') is True
+    assert await loader.has_tool('read') is True
+    assert await loader.has_tool('write') is True
+    assert await loader.has_tool('edit') is True
+
+
+# ── read/write/edit file tool tests ─────────────────────────────
+
+
+def _make_loader_with_workspace(tmpdir: str) -> tuple[NativeToolLoader, Mock]:
+    """Build a NativeToolLoader around a stub app whose box service is available.
+
+    The stub box service exposes ``tmpdir`` as ``default_host_workspace``.
+    Returns the loader together with the Mock logger attached to the stub app.
+    """
+    logger = Mock()
+    box_service = SimpleNamespace(available=True, default_host_workspace=tmpdir)
+    ap = SimpleNamespace(box_service=box_service, logger=logger)
+    return NativeToolLoader(ap), logger
+
+
+def _make_query() -> Mock:
+    """Return a Mock query whose ``query_id`` is fixed to ``'test-query-1'``."""
+    q = Mock()
+    q.query_id = 'test-query-1'
+    return q
+
+
+@pytest.mark.asyncio
+async def test_read_file():
+    """The `read` tool is expected to return ok=True and the content of an existing workspace file."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        loader, _ = _make_loader_with_workspace(tmpdir)
+        # the file is created directly in tmpdir but requested below via its /workspace path
+        with open(os.path.join(tmpdir, 'hello.txt'), 'w') as f:
+            f.write('hello world')
+
+        result = await loader.invoke_tool('read', {'path': '/workspace/hello.txt'}, _make_query())
+
+        assert result['ok'] is True
+        assert result['content'] == 'hello world'
+
+
+@pytest.mark.asyncio
+async def test_read_nonexistent_file():
+    """Reading a missing file is expected to return ok=False with a 'not found' error, not raise."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        loader, _ = _make_loader_with_workspace(tmpdir)
+
+        result = await loader.invoke_tool('read', {'path': '/workspace/no_such.txt'}, _make_query())
+
+        assert result['ok'] is False
+        # case-insensitive match so the exact capitalisation of the message is not pinned
+        assert 'not found' in result['error'].lower()
+
+
+@pytest.mark.asyncio
+async def test_read_directory():
+    """Reading `/workspace` itself is expected to succeed, flag a directory, and mention its entries."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        loader, _ = _make_loader_with_workspace(tmpdir)
+        # populate the workspace with one sub-directory and one file
+        os.makedirs(os.path.join(tmpdir, 'subdir'))
+        with open(os.path.join(tmpdir, 'a.txt'), 'w') as f:
+            f.write('a')
+
+        result = await loader.invoke_tool('read', {'path': '/workspace'}, _make_query())
+
+        assert result['ok'] is True
+        assert result['is_directory'] is True
+        assert 'a.txt' in result['content']
+
+
+@pytest.mark.asyncio
+async def test_write_creates_file():
+    """The `write` tool is expected to create a new file in the workspace with the given content."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        loader, _ = _make_loader_with_workspace(tmpdir)
+
+        result = await loader.invoke_tool(
+            'write', {'path': '/workspace/new.txt', 'content': 'new content'}, _make_query()
+        )
+
+        assert result['ok'] is True
+        # verify on the host side: the file must exist under tmpdir with the exact content
+        with open(os.path.join(tmpdir, 'new.txt')) as f:
+            assert f.read() == 'new content'
+
+
+@pytest.mark.asyncio
+async def test_write_creates_subdirectories():
+    """Writing to a nested path is expected to create the missing parent directories."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        loader, _ = _make_loader_with_workspace(tmpdir)
+
+        # neither 'sub' nor 'sub/deep' exists in the fresh temporary directory
+        result = await loader.invoke_tool(
+            'write', {'path': '/workspace/sub/deep/file.txt', 'content': 'nested'}, _make_query()
+        )
+
+        assert result['ok'] is True
+        with open(os.path.join(tmpdir, 'sub', 'deep', 'file.txt')) as f:
+            assert f.read() == 'nested'
+
+
+@pytest.mark.asyncio
+async def test_edit_replaces_unique_string():
+    """The `edit` tool is expected to replace an `old_string` that occurs exactly once, leaving the rest intact."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        loader, _ = _make_loader_with_workspace(tmpdir)
+        with open(os.path.join(tmpdir, 'code.py'), 'w') as f:
+            f.write('def foo():\n    return 1\n')
+
+        result = await loader.invoke_tool(
+            'edit',
+            {'path': '/workspace/code.py', 'old_string': 'return 1', 'new_string': 'return 42'},
+            _make_query(),
+        )
+
+        assert result['ok'] is True
+        # the whole file is compared, so surrounding text must be unchanged
+        with open(os.path.join(tmpdir, 'code.py')) as f:
+            assert f.read() == 'def foo():\n    return 42\n'
+
+
+@pytest.mark.asyncio
+async def test_edit_rejects_ambiguous_match():
+    """An `old_string` that occurs twice in the file is expected to make `edit` return ok=False."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        loader, _ = _make_loader_with_workspace(tmpdir)
+        with open(os.path.join(tmpdir, 'dup.txt'), 'w') as f:
+            f.write('aaa\naaa\n')
+
+        result = await loader.invoke_tool(
+            'edit',
+            {'path': '/workspace/dup.txt', 'old_string': 'aaa', 'new_string': 'bbb'},
+            _make_query(),
+        )
+
+        assert result['ok'] is False
+        # presumably the error reports the number of matches (2) -- TODO confirm against the tool's message
+        # NOTE(review): a bare '2' substring check also passes if '2' appears elsewhere in the message
+        assert '2' in result['error']
+
+
+@pytest.mark.asyncio
+async def test_edit_rejects_missing_string():
+    """An `old_string` absent from the file is expected to make `edit` return ok=False with a 'not found' error."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        loader, _ = _make_loader_with_workspace(tmpdir)
+        with open(os.path.join(tmpdir, 'x.txt'), 'w') as f:
+            f.write('hello')
+
+        result = await loader.invoke_tool(
+            'edit',
+            {'path': '/workspace/x.txt', 'old_string': 'nope', 'new_string': 'yes'},
+            _make_query(),
+        )
+
+        assert result['ok'] is False
+        # case-insensitive match so the exact capitalisation of the message is not pinned
+        assert 'not found' in result['error'].lower()
+
+
+@pytest.mark.asyncio
+async def test_path_escape_blocked():
+    """A path that climbs out of `/workspace` via `..` is expected to raise ValueError matching 'escapes'."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+        loader, _ = _make_loader_with_workspace(tmpdir)
+
+        # this case must raise; a returned dict with ok=False would fail the test
+        with pytest.raises(ValueError, match='escapes'):
+            await loader.invoke_tool('read', {'path': '/workspace/../../etc/passwd'}, _make_query())