Files
LangBot/src/langbot/pkg/box/runtime.py
youhuanghe 791d052687 feat(box/mcp): instance-based orphan cleanup, error classification, session API, and integration tests
## Changes

  ### Precise orphan container cleanup
  - Runtime generates a unique instance_id on startup
  - Every container gets a `langbot.box.instance_id` label
  - `cleanup_orphaned_containers()` only removes containers from
    previous instances, preserving containers owned by the current one
  - Containers from older versions (no label) are also cleaned up
  - `cleanup_orphaned_containers` added to `BaseSandboxBackend` as
    a no-op default method, removing hasattr duck-typing

  ### Fine-grained MCP error classification
  - New `MCPSessionErrorPhase` enum with 7 phases: session_create,
    dep_install, process_start, relay_connect, mcp_init, runtime,
    tool_call
  - Each phase in `_init_box_stdio_server()` sets the error phase
    before re-raising, enabling precise failure diagnosis
  - `retry_count` tracked across retry attempts
  - `get_runtime_info_dict()` exposes `error_phase` and `retry_count`

  ### GET /v1/sessions/{id} API
  - `BoxRuntime.get_session()` returns session details including
    managed process info when present
  - `handle_get_session` HTTP handler + route in server.py
  - `BoxRuntimeClient.get_session()` abstract method + remote impl

  ### stdio defaults to Box when runtime is available
  - `_uses_box_stdio()` checks `box_service.available` instead of
    requiring explicit `box` key in server_config
  - `BoxService.initialize()` catches runtime errors gracefully,
    sets `available=False` instead of crashing LangBot startup
  - When no container runtime exists, stdio MCP falls back to
    host-direct execution

  ### Code quality (from /simplify review)
  - Extracted `_VENV_DIRS` / `_VENV_BIN_DIRS` module-level constants
  - Removed dead `_box_network_mode()` method and unused `bc` variable
  - Fixed broken import `from ....box.models` → `from ...box.models`
  - Cached `_resolve_host_path()` result — computed once, passed through
  - Config hash now includes `host_path` field
  - Batched orphan cleanup into single `rm -f` command

  ### Session leak fix
  - `_cleanup_box_stdio_session()` now runs in `_lifecycle_loop`'s
    finally block, covering all exit paths (normal shutdown, error,
    retry, final failure)

  ### Integration tests
  - 6 end-to-end tests covering managed process lifecycle, WebSocket
    stdio bidirectional IO, session cleanup verification, single
    session query, process exit detection, and orphan cleanup safety
2026-05-04 21:23:23 +08:00

396 lines
16 KiB
Python

from __future__ import annotations
import asyncio
import collections
import dataclasses
import datetime as dt
import logging
import uuid
from .backend import BaseSandboxBackend, DockerBackend, PodmanBackend
from .errors import (
BoxBackendUnavailableError,
BoxManagedProcessConflictError,
BoxManagedProcessNotFoundError,
BoxSessionConflictError,
BoxSessionNotFoundError,
BoxValidationError,
)
from .models import (
BoxExecutionResult,
BoxExecutionStatus,
BoxManagedProcessInfo,
BoxManagedProcessSpec,
BoxManagedProcessStatus,
BoxSessionInfo,
BoxSpec,
)
# Timezone attached to every timestamp this module creates.
_UTC = dt.timezone.utc
# Maximum length, in characters, of the newline-joined stderr preview kept per managed process.
_MANAGED_PROCESS_STDERR_PREVIEW_LIMIT = 4000
@dataclasses.dataclass(slots=True)
class _ManagedProcess:
spec: BoxManagedProcessSpec
process: asyncio.subprocess.Process
started_at: dt.datetime
attach_lock: asyncio.Lock
stderr_chunks: collections.deque[str]
exit_code: int | None = None
exited_at: dt.datetime | None = None
@property
def is_running(self) -> bool:
return self.exit_code is None and self.process.returncode is None
@dataclasses.dataclass(slots=True)
class _RuntimeSession:
info: BoxSessionInfo
lock: asyncio.Lock
managed_process: _ManagedProcess | None = None
class BoxRuntime:
    """Tracks sandbox sessions and delegates their execution to a container backend."""

    def __init__(
        self,
        logger: logging.Logger,
        backends: list[BaseSandboxBackend] | None = None,
        session_ttl_sec: int = 300,
    ):
        """Set up an idle runtime; no backend is probed until ``initialize()`` or first use.

        Args:
            logger: Logger used for all runtime messages.
            backends: Candidate backends in probe order. A falsy value selects
                the default pair: Podman first, then Docker.
            session_ttl_sec: Idle time after which a session becomes eligible
                for reaping; values ``<= 0`` disable reaping.
        """
        self.logger = logger
        self.session_ttl_sec = session_ttl_sec
        # Short random id identifying this runtime instance.
        self.instance_id = uuid.uuid4().hex[:12]
        if backends:
            self.backends = backends
        else:
            self.backends = [PodmanBackend(logger), DockerBackend(logger)]
        self._backend: BaseSandboxBackend | None = None
        self._sessions: dict[str, _RuntimeSession] = {}
        self._lock = asyncio.Lock()
async def initialize(self):
    """Pick a backend, tag it with this runtime's instance id, and clean up orphans.

    When no backend is available the runtime is left without one. A failure
    during orphan cleanup is logged as a warning and does not propagate.
    """
    backend = await self._select_backend()
    self._backend = backend
    if backend is None:
        return

    backend.instance_id = self.instance_id
    try:
        await backend.cleanup_orphaned_containers(self.instance_id)
    except Exception as exc:
        self.logger.warning(f'LangBot Box orphan container cleanup failed: {exc}')
async def execute(self, spec: BoxSpec) -> BoxExecutionResult:
    """Run ``spec.cmd`` inside the session identified by ``spec.session_id``.

    The session is created on demand. The command runs under the session's
    own lock; afterwards, under the runtime lock, the session's
    ``last_used_at`` is refreshed and the session is dropped if the
    execution timed out.

    Raises:
        BoxValidationError: if ``spec.cmd`` is empty.
    """
    if not spec.cmd:
        raise BoxValidationError('cmd must not be empty')

    session = await self._get_or_create_session(spec)
    async with session.lock:
        info = session.info
        self.logger.info(
            'LangBot Box execute: '
            f'session_id={spec.session_id} '
            f'backend_session_id={info.backend_session_id} '
            f'backend={info.backend_name} '
            f'workdir={spec.workdir} '
            f'timeout_sec={spec.timeout_sec}'
        )
        backend = await self._get_backend()
        result = await backend.exec(info, spec)

    async with self._lock:
        now = dt.datetime.now(_UTC)
        tracked = self._sessions.get(spec.session_id)
        if tracked is not None:
            tracked.info.last_used_at = now
        # A timed-out execution invalidates the whole session.
        if result.status == BoxExecutionStatus.TIMED_OUT:
            await self._drop_session_locked(spec.session_id)
    return result
async def shutdown(self):
    """Drop every tracked session while holding the runtime lock."""
    async with self._lock:
        # Snapshot the ids first: dropping mutates the session table.
        for session_id in tuple(self._sessions):
            await self._drop_session_locked(session_id)
async def create_session(self, spec: BoxSpec) -> dict:
    """Get or create the session for ``spec`` and return it as a plain dict."""
    runtime_session = await self._get_or_create_session(spec)
    info = runtime_session.info
    return self._session_to_dict(info)
async def delete_session(self, session_id: str) -> None:
    """Drop the session ``session_id`` under the runtime lock.

    Raises:
        BoxSessionNotFoundError: if no such session is tracked.
    """
    async with self._lock:
        is_tracked = session_id in self._sessions
        if not is_tracked:
            raise BoxSessionNotFoundError(f'session {session_id} not found')
        await self._drop_session_locked(session_id)
async def start_managed_process(self, session_id: str, spec: BoxManagedProcessSpec) -> dict:
    """Start a managed process in an existing session and return its description.

    The session is looked up under the runtime lock; the process is then
    started under the session's own lock. Two background tasks are spawned:
    one drains the process's stderr, the other records its exit.

    Raises:
        BoxSessionNotFoundError: if ``session_id`` is not tracked.
        BoxManagedProcessConflictError: if the session already has a
            managed process that is still running.
    """
    async with self._lock:
        runtime_session = self._sessions.get(session_id)
        if runtime_session is None:
            raise BoxSessionNotFoundError(f'session {session_id} not found')

    async with runtime_session.lock:
        existing = runtime_session.managed_process
        if existing is not None and existing.is_running:
            raise BoxManagedProcessConflictError(f'session {session_id} already has a managed process')

        backend = await self._get_backend()
        process = await backend.start_managed_process(runtime_session.info, spec)
        managed_process = _ManagedProcess(
            spec=spec,
            process=process,
            started_at=dt.datetime.now(_UTC),
            attach_lock=asyncio.Lock(),
            stderr_chunks=collections.deque(),
        )
        runtime_session.managed_process = managed_process
        runtime_session.info.last_used_at = dt.datetime.now(_UTC)

        # The event loop only keeps weak references to tasks, so a task whose
        # result is discarded may be garbage-collected before it finishes.
        # Keep strong references until each task completes. The set is created
        # lazily so this method does not depend on the constructor.
        background_tasks = getattr(self, '_background_tasks', None)
        if background_tasks is None:
            background_tasks = set()
            self._background_tasks = background_tasks
        tracked_session_id = runtime_session.info.session_id
        for coro in (
            self._drain_managed_process_stderr(tracked_session_id, managed_process),
            self._watch_managed_process(tracked_session_id, managed_process),
        ):
            task = asyncio.create_task(coro)
            background_tasks.add(task)
            task.add_done_callback(background_tasks.discard)

        return self._managed_process_to_dict(tracked_session_id, managed_process)
def get_managed_process(self, session_id: str) -> dict:
    """Return the managed-process description for ``session_id``.

    Raises:
        BoxSessionNotFoundError: if the session is not tracked.
        BoxManagedProcessNotFoundError: if the session has no managed process.
    """
    entry = self._sessions.get(session_id)
    if entry is None:
        raise BoxSessionNotFoundError(f'session {session_id} not found')

    proc = entry.managed_process
    if proc is None:
        raise BoxManagedProcessNotFoundError(f'session {session_id} has no managed process')
    return self._managed_process_to_dict(session_id, proc)
# ── Observability ─────────────────────────────────────────────────
async def get_backend_info(self) -> dict:
    """Report the selected backend's name and whether it is currently available.

    Returns ``{'name': None, 'available': False}`` when no backend has been
    selected. Any exception from the availability probe counts as unavailable.
    """
    if self._backend is None:
        return {'name': None, 'available': False}

    backend = self._backend
    try:
        is_up = await backend.is_available()
    except Exception:
        is_up = False
    return {'name': backend.name, 'available': is_up}
def get_sessions(self) -> list[dict]:
    """Return every tracked session as a plain dict, in table order."""
    summaries = []
    for entry in self._sessions.values():
        summaries.append(self._session_to_dict(entry.info))
    return summaries
def get_session(self, session_id: str) -> dict:
    """Return one session as a dict, with a ``managed_process`` entry when one exists.

    Raises:
        BoxSessionNotFoundError: if ``session_id`` is not tracked.
    """
    entry = self._sessions.get(session_id)
    if entry is None:
        raise BoxSessionNotFoundError(f'session {session_id} not found')

    details = self._session_to_dict(entry.info)
    proc = entry.managed_process
    if proc is not None:
        details['managed_process'] = self._managed_process_to_dict(session_id, proc)
    return details
async def get_status(self) -> dict:
    """Summarise the runtime: backend info, session count, running processes, TTL."""
    backend_info = await self.get_backend_info()

    running_processes = 0
    for entry in self._sessions.values():
        proc = entry.managed_process
        if proc is not None and proc.is_running:
            running_processes += 1

    return {
        'backend': backend_info,
        'active_sessions': len(self._sessions),
        'managed_processes': running_processes,
        'session_ttl_sec': self.session_ttl_sec,
    }
async def _get_or_create_session(self, spec: BoxSpec) -> _RuntimeSession:
    """Return the tracked session for ``spec.session_id``, starting one if needed.

    Runs entirely under the runtime lock. Expired sessions are reaped first.
    An existing session is checked for compatibility with ``spec`` and has
    its ``last_used_at`` refreshed; otherwise the backend starts a new
    session, which is registered under ``spec.session_id``.

    Raises:
        BoxSessionConflictError: from the compatibility check on reuse.
        BoxBackendUnavailableError: from backend lookup when creating.
    """
    async with self._lock:
        await self._reap_expired_sessions_locked()

        cached = self._sessions.get(spec.session_id)
        if cached is not None:
            self._assert_session_compatible(cached.info, spec)
            cached.info.last_used_at = dt.datetime.now(_UTC)
            reuse_message = (
                'LangBot Box session reused: '
                f'session_id={spec.session_id} '
                f'backend_session_id={cached.info.backend_session_id} '
                f'backend={cached.info.backend_name}'
            )
            self.logger.info(reuse_message)
            return cached

        backend = await self._get_backend()
        info = await backend.start_session(spec)
        created = _RuntimeSession(info=info, lock=asyncio.Lock())
        self._sessions[spec.session_id] = created
        create_message = (
            'LangBot Box session created: '
            f'session_id={spec.session_id} '
            f'backend_session_id={info.backend_session_id} '
            f'backend={info.backend_name} '
            f'image={info.image} '
            f'network={info.network.value} '
            f'host_path={info.host_path} '
            f'host_path_mode={info.host_path_mode.value}'
        )
        self.logger.info(create_message)
        return created
async def _get_backend(self) -> BaseSandboxBackend:
if self._backend is None:
self._backend = await self._select_backend()
if self._backend is None:
raise BoxBackendUnavailableError(
'LangBot Box backend unavailable. Install and start Podman or Docker before using sandbox_exec.'
)
return self._backend
async def _select_backend(self) -> BaseSandboxBackend | None:
for backend in self.backends:
try:
await backend.initialize()
if await backend.is_available():
self.logger.info(f'LangBot Box using backend: {backend.name}')
return backend
except Exception as exc:
self.logger.warning(f'LangBot Box backend {backend.name} probe failed: {exc}')
self.logger.warning('LangBot Box backend unavailable: neither Podman nor Docker is ready')
return None
async def _reap_expired_sessions_locked(self):
if self.session_ttl_sec <= 0:
return
deadline = dt.datetime.now(_UTC) - dt.timedelta(seconds=self.session_ttl_sec)
expired_session_ids = [
session_id
for session_id, session in self._sessions.items()
if session.info.last_used_at < deadline
and not (session.managed_process is not None and session.managed_process.is_running)
]
for session_id in expired_session_ids:
await self._drop_session_locked(session_id)
async def _drop_session_locked(self, session_id: str):
runtime_session = self._sessions.pop(session_id, None)
if runtime_session is None or self._backend is None:
return
await self._terminate_managed_process(runtime_session)
try:
self.logger.info(
'LangBot Box session cleanup: '
f'session_id={session_id} '
f'backend_session_id={runtime_session.info.backend_session_id} '
f'backend={runtime_session.info.backend_name}'
)
await self._backend.stop_session(runtime_session.info)
except Exception as exc:
self.logger.warning(f'Failed to clean up box session {session_id}: {exc}')
def _assert_session_compatible(self, session: BoxSessionInfo, spec: BoxSpec):
_COMPAT_FIELDS = (
'network', 'image', 'host_path', 'host_path_mode',
'cpus', 'memory_mb', 'pids_limit', 'read_only_rootfs',
)
for field in _COMPAT_FIELDS:
session_val = getattr(session, field)
spec_val = getattr(spec, field)
if session_val != spec_val:
display = session_val.value if hasattr(session_val, 'value') else session_val
raise BoxSessionConflictError(
f'sandbox_exec session {spec.session_id} already exists with {field}={display}'
)
async def _drain_managed_process_stderr(self, session_id: str, managed_process: _ManagedProcess) -> None:
    """Read the process's stderr line by line until EOF, logging and retaining a preview.

    Each non-blank line is logged in full and appended to
    ``managed_process.stderr_chunks``; the oldest lines are evicted so the
    newline-joined preview stays within
    ``_MANAGED_PROCESS_STDERR_PREVIEW_LIMIT`` characters. A single line longer
    than the limit is stored as its trailing ``limit`` characters instead of
    wiping the preview. Returns immediately when the process has no stderr
    stream. Unexpected errors are logged as a warning and end the drain.
    """
    stream = managed_process.process.stderr
    if stream is None:
        return

    limit = _MANAGED_PROCESS_STDERR_PREVIEW_LIMIT
    chunks = managed_process.stderr_chunks
    # Length of '\n'.join(chunks), maintained incrementally so the preview
    # is not rebuilt for every eviction.
    preview_len = len('\n'.join(chunks))
    try:
        while True:
            try:
                chunk = await stream.readline()
            except ValueError as exc:
                # readline() raises ValueError when a line exceeds the
                # reader's buffer limit, after discarding the buffered data.
                # Keep draining; otherwise the child could block on a full
                # stderr pipe.
                self.logger.warning(
                    f'Skipped oversized managed process stderr line for {session_id}: {exc}'
                )
                continue
            if not chunk:
                break
            text = chunk.decode('utf-8', errors='replace').rstrip()
            if not text:
                continue

            # Keep only the tail of an over-long line so it cannot evict itself.
            stored = text[len(text) - limit:] if len(text) > limit else text
            preview_len += len(stored) + (1 if chunks else 0)
            chunks.append(stored)
            while preview_len > limit and len(chunks) > 1:
                evicted = chunks.popleft()
                preview_len -= len(evicted) + 1

            self.logger.info(f'LangBot Box managed process stderr: session_id={session_id} {text}')
    except Exception as exc:
        self.logger.warning(f'Failed to drain managed process stderr for {session_id}: {exc}')
async def _watch_managed_process(self, session_id: str, managed_process: _ManagedProcess) -> None:
    """Wait for the process to exit, then record its exit code and exit time.

    If the session is still tracked, its ``last_used_at`` is set to the
    exit time.
    """
    return_code = await managed_process.process.wait()
    exited_at = dt.datetime.now(_UTC)
    managed_process.exit_code = return_code
    managed_process.exited_at = exited_at

    entry = self._sessions.get(session_id)
    if entry is not None:
        entry.info.last_used_at = exited_at

    self.logger.info(
        'LangBot Box managed process exited: '
        f'session_id={session_id} return_code={return_code}'
    )
async def _terminate_managed_process(self, runtime_session: _RuntimeSession) -> None:
    """Stop the session's managed process, escalating if it does not exit.

    Sequence: close stdin and wait up to 5 s; call ``terminate()`` and wait
    up to 5 s; call ``kill()`` and wait without a timeout. Returns
    immediately when the session has no managed process or it is no longer
    running. On the way out the record's ``exit_code`` and ``exited_at``
    are always refreshed.
    """
    managed_process = runtime_session.managed_process
    if managed_process is None or not managed_process.is_running:
        return
    process = managed_process.process
    # Step 1: close stdin; errors while closing are deliberately ignored.
    try:
        if process.stdin is not None:
            process.stdin.close()
    except Exception:
        pass
    try:
        # shield() keeps the wait_for timeout from cancelling the underlying wait().
        await asyncio.wait_for(asyncio.shield(process.wait()), timeout=5)
    except asyncio.TimeoutError:
        # Step 2: still alive after the grace period -> terminate().
        if process.returncode is None:
            try:
                process.terminate()
            except ProcessLookupError:
                # The process vanished between the check and the call.
                pass
        try:
            await asyncio.wait_for(asyncio.shield(process.wait()), timeout=5)
        except asyncio.TimeoutError:
            # Step 3: terminate() was not enough -> kill().
            if process.returncode is None:
                try:
                    process.kill()
                except ProcessLookupError:
                    pass
            # NOTE(review): nesting of this final wait was reconstructed; confirm against upstream.
            await process.wait()
    finally:
        # Record the outcome whichever path was taken.
        managed_process.exit_code = process.returncode
        managed_process.exited_at = dt.datetime.now(_UTC)
def _managed_process_to_dict(self, session_id: str, managed_process: _ManagedProcess) -> dict:
    """Describe a managed process as a JSON-mode dict built via ``BoxManagedProcessInfo``.

    Only the sorted names of the environment variables are included, not
    their values. ``attached`` reflects whether the attach lock is held.
    """
    if managed_process.is_running:
        status = BoxManagedProcessStatus.RUNNING
    else:
        status = BoxManagedProcessStatus.EXITED

    spec = managed_process.spec
    info = BoxManagedProcessInfo(
        session_id=session_id,
        status=status,
        command=spec.command,
        args=spec.args,
        cwd=spec.cwd,
        env_keys=sorted(spec.env.keys()),
        attached=managed_process.attach_lock.locked(),
        started_at=managed_process.started_at,
        exited_at=managed_process.exited_at,
        exit_code=managed_process.exit_code,
        stderr_preview='\n'.join(managed_process.stderr_chunks),
    )
    return info.model_dump(mode='json')
@staticmethod
def _session_to_dict(info: BoxSessionInfo) -> dict:
return {
'session_id': info.session_id,
'backend_name': info.backend_name,
'backend_session_id': info.backend_session_id,
'image': info.image,
'network': info.network.value,
'host_path': info.host_path,
'host_path_mode': info.host_path_mode.value,
'cpus': info.cpus,
'memory_mb': info.memory_mb,
'pids_limit': info.pids_limit,
'read_only_rootfs': info.read_only_rootfs,
'created_at': info.created_at.isoformat(),
'last_used_at': info.last_used_at.isoformat(),
}