feat(box/mcp): instance-based orphan cleanup, error classification, session API, and integration tests

## Changes

  ### Precise orphan container cleanup
  - Runtime generates a unique instance_id on startup
  - Every container gets a `langbot.box.instance_id` label
  - `cleanup_orphaned_containers()` only removes containers from
    previous instances, preserving containers owned by the current one
  - Containers from older versions (no label) are also cleaned up
  - `cleanup_orphaned_containers` added to `BaseSandboxBackend` as
    a no-op default method, removing hasattr duck-typing

  ### Fine-grained MCP error classification
  - New `MCPSessionErrorPhase` enum with 7 phases: session_create,
    dep_install, process_start, relay_connect, mcp_init, runtime,
    tool_call
  - Each phase in `_init_box_stdio_server()` sets the error phase
    before re-raising, enabling precise failure diagnosis
  - `retry_count` tracked across retry attempts
  - `get_runtime_info_dict()` exposes `error_phase` and `retry_count`

  ### GET /v1/sessions/{id} API
  - `BoxRuntime.get_session()` returns session details including
    managed process info when present
  - `handle_get_session` HTTP handler + route in server.py
  - `BoxRuntimeClient.get_session()` abstract method + remote impl

  ### stdio defaults to Box when runtime is available
  - `_uses_box_stdio()` checks `box_service.available` instead of
    requiring explicit `box` key in server_config
  - `BoxService.initialize()` catches runtime errors gracefully,
    sets `available=False` instead of crashing LangBot startup
  - When no container runtime exists, stdio MCP falls back to
    host-direct execution

  ### Code quality (from /simplify review)
  - Extracted `_VENV_DIRS` / `_VENV_BIN_DIRS` module-level constants
  - Removed dead `_box_network_mode()` method and unused `bc` variable
  - Fixed broken import `from ....box.models` → `from ...box.models`
  - Cached `_resolve_host_path()` result — computed once, passed through
  - Config hash now includes `host_path` field
  - Batched orphan cleanup into single `rm -f` command

  ### Session leak fix
  - `_cleanup_box_stdio_session()` now runs in `_lifecycle_loop`'s
    finally block, covering all exit paths (normal shutdown, error,
    retry, final failure)

  ### Integration tests
  - 6 end-to-end tests covering managed process lifecycle, WebSocket
    stdio bidirectional IO, session cleanup verification, single
    session query, process exit detection, and orphan cleanup safety
This commit is contained in:
youhuanghe
2026-03-21 05:19:48 +00:00
committed by WangCham
parent e8aa7b2e6d
commit 791d052687
7 changed files with 442 additions and 29 deletions

View File

@@ -34,6 +34,7 @@ class _CommandResult:
class BaseSandboxBackend(abc.ABC):
name: str
instance_id: str = ''
def __init__(self, logger: logging.Logger):
self.logger = logger
@@ -60,7 +61,7 @@ class BaseSandboxBackend(abc.ABC):
async def start_managed_process(self, session: BoxSessionInfo, spec):
    """Base implementation: refuse, since this backend has no managed-process support.

    Raises:
        BoxError: always; the message names the backend via ``self.name``.
    """
    message = f'{self.name} backend does not support managed processes'
    raise BoxError(message)
async def cleanup_orphaned_containers(self, current_instance_id: str = ''):
    """Remove lingering containers from previous runs. No-op by default.

    Args:
        current_instance_id: Identifier of the running instance. Ignored
            here; it is accepted (with a default, so zero-argument calls
            keep working) so that overriding backends share one signature.
    """
    pass
@@ -97,6 +98,8 @@ class CLISandboxBackend(BaseSandboxBackend):
'langbot.box=true',
'--label',
f'langbot.session_id={spec.session_id}',
'--label',
f'langbot.box.instance_id={self.instance_id}',
]
# Config hash label for identifying configuration drift
@@ -218,22 +221,37 @@ class CLISandboxBackend(BaseSandboxBackend):
check=False,
)
async def cleanup_orphaned_containers(self, current_instance_id: str = ''):
    """Remove langbot.box containers left behind by previous instances.

    Lists every container labelled ``langbot.box=true`` together with its
    ``langbot.box.instance_id`` label, then force-removes, in a single
    ``rm -f`` invocation, those whose label does NOT match
    *current_instance_id*. Containers whose label is empty or missing
    (created by older versions) are always removed.

    Best effort: both commands run with ``check=False``; a failed or empty
    listing makes the method return without doing anything.

    Args:
        current_instance_id: Instance id of the running runtime; containers
            carrying exactly this label value are preserved.
    """
    listing = await self._run_command(
        [
            self.command, 'ps', '-a',
            '--filter', 'label=langbot.box=true',
            '--format', '{{.ID}}\t{{.Label "langbot.box.instance_id"}}',
        ],
        timeout_sec=10,
        check=False,
    )
    if listing.return_code != 0 or not listing.stdout.strip():
        return

    orphan_ids = []
    for raw_line in listing.stdout.splitlines():
        # Each line is "<id>\t<instance label>"; the label part may be absent.
        cid, _, label_instance = raw_line.strip().partition('\t')
        cid = cid.strip()
        label_instance = label_instance.strip()
        if not cid:
            continue
        # An empty label marks a legacy container. Check it explicitly so it
        # is still treated as an orphan when current_instance_id is itself
        # empty (the default), instead of comparing equal and being kept.
        if not label_instance or label_instance != current_instance_id:
            orphan_ids.append(cid)
    if not orphan_ids:
        return

    for cid in orphan_ids:
        self.logger.info(f'Cleaning up orphaned Box container: {cid}')
    await self._run_command(
        [self.command, 'rm', '-f', *orphan_ids],
        timeout_sec=30,
        check=False,
    )

View File

@@ -86,6 +86,9 @@ class BoxRuntimeClient(abc.ABC):
# Abstract hook: concrete clients must return a BoxManagedProcessInfo for
# the given *session_id*.
@abc.abstractmethod
async def get_managed_process(self, session_id: str) -> BoxManagedProcessInfo: ...
# Abstract hook: concrete clients must return the details of the single
# session identified by *session_id* as a plain dict.
@abc.abstractmethod
async def get_session(self, session_id: str) -> dict: ...
class RemoteBoxRuntimeClient(BoxRuntimeClient):
"""HTTP client that talks to a standalone Box Runtime service."""
@@ -168,6 +171,15 @@ class RemoteBoxRuntimeClient(BoxRuntimeClient):
except aiohttp.ClientError as exc:
raise BoxRuntimeUnavailableError(f'box runtime unavailable: {exc}') from exc
async def get_session(self, session_id: str) -> dict:
    """Fetch one session's details from the Box Runtime service.

    Issues ``GET {base_url}/v1/sessions/{session_id}``, passes the response
    through ``self._check_response`` and returns the decoded JSON body.

    Raises:
        BoxRuntimeUnavailableError: when aiohttp reports a client error
            while talking to the service.
    """
    http = self._get_session()
    try:
        url = f'{self._base_url}/v1/sessions/{session_id}'
        async with http.get(url) as response:
            await self._check_response(response)
            payload = await response.json()
            return payload
    except aiohttp.ClientError as exc:
        raise BoxRuntimeUnavailableError(f'box runtime unavailable: {exc}') from exc
async def get_backend_info(self) -> dict:
session = self._get_session()
try:

View File

@@ -5,6 +5,7 @@ import collections
import dataclasses
import datetime as dt
import logging
import uuid
from .backend import BaseSandboxBackend, DockerBackend, PodmanBackend
from .errors import (
@@ -64,12 +65,14 @@ class BoxRuntime:
self._backend: BaseSandboxBackend | None = None
self._sessions: dict[str, _RuntimeSession] = {}
self._lock = asyncio.Lock()
self.instance_id = uuid.uuid4().hex[:12]
async def initialize(self):
self._backend = await self._select_backend()
if self._backend is not None:
self._backend.instance_id = self.instance_id
try:
await self._backend.cleanup_orphaned_containers()
await self._backend.cleanup_orphaned_containers(self.instance_id)
except Exception as exc:
self.logger.warning(f'LangBot Box orphan container cleanup failed: {exc}')
@@ -164,6 +167,17 @@ class BoxRuntime:
def get_sessions(self) -> list[dict]:
    """Return one dict (built by ``_session_to_dict``) per tracked session."""
    summaries = []
    for runtime_session in self._sessions.values():
        summaries.append(self._session_to_dict(runtime_session.info))
    return summaries
def get_session(self, session_id: str) -> dict:
    """Return the details of a single tracked session.

    The result is the ``_session_to_dict`` form of the session; when the
    session has a managed process, its ``_managed_process_to_dict`` form is
    added under the ``'managed_process'`` key.

    Raises:
        BoxSessionNotFoundError: if *session_id* is not a tracked session.
    """
    entry = self._sessions.get(session_id)
    if entry is None:
        raise BoxSessionNotFoundError(f'session {session_id} not found')

    details = self._session_to_dict(entry.info)
    process = entry.managed_process
    if process is None:
        return details
    details['managed_process'] = self._managed_process_to_dict(session_id, process)
    return details
async def get_status(self) -> dict:
backend_info = await self.get_backend_info()
return {

View File

@@ -117,6 +117,15 @@ async def handle_delete_session(request: web.Request) -> web.Response:
return _error_response(exc)
async def handle_get_session(request: web.Request) -> web.Response:
    """HTTP handler returning one session's details as JSON.

    Reads the runtime from ``request.app['runtime']`` and the session id
    from the route's ``session_id`` match. Any ``BoxError`` raised while
    looking up the session is converted with ``_error_response``.
    """
    box_runtime: BoxRuntime = request.app['runtime']
    requested_id = request.match_info['session_id']
    try:
        details = box_runtime.get_session(requested_id)
        return web.json_response(details)
    except BoxError as exc:
        return _error_response(exc)
async def handle_status(request: web.Request) -> web.Response:
runtime: BoxRuntime = request.app['runtime']
try:
@@ -234,6 +243,7 @@ def create_app(runtime: BoxRuntime | None = None) -> web.Application:
app.router.add_post('/v1/sessions/{session_id}/exec', handle_exec)
app.router.add_post('/v1/sessions/{session_id}', handle_create_session)
app.router.add_get('/v1/sessions/{session_id}', handle_get_session)
app.router.add_get('/v1/sessions', handle_get_sessions)
app.router.add_delete('/v1/sessions/{session_id}', handle_delete_session)
app.router.add_post('/v1/sessions/{session_id}/managed-process', handle_start_managed_process)