feat(box): add obs

This commit is contained in:
youhuanghe
2026-03-20 05:14:16 +00:00
committed by WangCham
parent 86b2d517f2
commit 15c03fe96b
4 changed files with 198 additions and 3 deletions

View File

@@ -0,0 +1,22 @@
from __future__ import annotations
from .. import group
@group.group_class('box', '/api/v1/box')
class BoxRouterGroup(group.RouterGroup):
async def initialize(self) -> None:
@self.route('/status', methods=['GET'], auth_type=group.AuthType.USER_TOKEN)
async def _() -> str:
status = await self.ap.box_service.get_status()
return self.success(data=status)
@self.route('/sessions', methods=['GET'], auth_type=group.AuthType.USER_TOKEN)
async def _() -> str:
sessions = self.ap.box_service.runtime.get_sessions()
return self.success(data=sessions)
@self.route('/errors', methods=['GET'], auth_type=group.AuthType.USER_TOKEN)
async def _() -> str:
errors = self.ap.box_service.get_recent_errors()
return self.success(data=errors)

View File

@@ -65,6 +65,46 @@ class BoxRuntime:
for session_id in session_ids:
await self._drop_session_locked(session_id)
# ── Observability ─────────────────────────────────────────────────
async def get_backend_info(self) -> dict:
backend = self._backend
if backend is None:
return {'name': None, 'available': False}
try:
available = await backend.is_available()
except Exception:
available = False
return {'name': backend.name, 'available': available}
def get_sessions(self) -> list[dict]:
return [
{
'session_id': s.info.session_id,
'backend_name': s.info.backend_name,
'backend_session_id': s.info.backend_session_id,
'image': s.info.image,
'network': s.info.network.value,
'host_path': s.info.host_path,
'host_path_mode': s.info.host_path_mode.value,
'cpus': s.info.cpus,
'memory_mb': s.info.memory_mb,
'pids_limit': s.info.pids_limit,
'read_only_rootfs': s.info.read_only_rootfs,
'created_at': s.info.created_at.isoformat(),
'last_used_at': s.info.last_used_at.isoformat(),
}
for s in self._sessions.values()
]
async def get_status(self) -> dict:
backend_info = await self.get_backend_info()
return {
'backend': backend_info,
'active_sessions': len(self._sessions),
'session_ttl_sec': self.session_ttl_sec,
}
async def _get_or_create_session(self, spec: BoxSpec) -> _RuntimeSession:
async with self._lock:
await self._reap_expired_sessions_locked()

View File

@@ -1,5 +1,7 @@
from __future__ import annotations
import collections
import datetime as _dt
import enum
import json
import os
@@ -7,11 +9,13 @@ from typing import TYPE_CHECKING
import pydantic
from .errors import BoxValidationError
from .errors import BoxError, BoxValidationError
from .models import BUILTIN_PROFILES, BoxExecutionResult, BoxProfile, BoxSpec
from .runtime import BoxRuntime
_INT_ADAPTER = pydantic.TypeAdapter(int)
_UTC = _dt.timezone.utc
_MAX_RECENT_ERRORS = 50
if TYPE_CHECKING:
from ..core import app as core_app
@@ -31,6 +35,7 @@ class BoxService:
self.allowed_host_mount_roots = self._load_allowed_host_mount_roots()
self.default_host_workspace = self._load_default_host_workspace()
self.profile = self._load_profile()
self._recent_errors: collections.deque[dict] = collections.deque(maxlen=_MAX_RECENT_ERRORS)
async def initialize(self):
await self.runtime.initialize()
@@ -48,7 +53,9 @@ class BoxService:
spec = BoxSpec.model_validate(spec_payload)
except pydantic.ValidationError as exc:
first_error = exc.errors()[0]
raise BoxValidationError(first_error.get('msg', 'invalid sandbox_exec arguments')) from exc
err = BoxValidationError(first_error.get('msg', 'invalid sandbox_exec arguments'))
self._record_error(err, query)
raise err from exc
self._validate_host_mount(spec)
self.ap.logger.info(
@@ -56,7 +63,11 @@ class BoxService:
f'query_id={query.query_id} '
f'spec={json.dumps(self._summarize_spec(spec), ensure_ascii=False)}'
)
result = await self.runtime.execute(spec)
try:
result = await self.runtime.execute(spec)
except BoxError as exc:
self._record_error(exc, query)
raise
self.ap.logger.info(
'LangBot Box result: '
f'query_id={query.query_id} '
@@ -229,3 +240,24 @@ class BoxService:
if normalized_timeout > profile.max_timeout_sec:
params['timeout_sec'] = profile.max_timeout_sec
# ── Observability ─────────────────────────────────────────────────
def _record_error(self, exc: Exception, query: 'pipeline_query.Query'):
self._recent_errors.append({
'timestamp': _dt.datetime.now(_UTC).isoformat(),
'type': type(exc).__name__,
'message': str(exc),
'query_id': str(query.query_id),
})
def get_recent_errors(self) -> list[dict]:
return list(self._recent_errors)
async def get_status(self) -> dict:
runtime_status = await self.runtime.get_status()
return {
**runtime_status,
'profile': self.profile.name,
'recent_error_count': len(self._recent_errors),
}

View File

@@ -545,3 +545,104 @@ def test_box_spec_validates_resource_limits():
BoxSpec.model_validate({'cmd': 'echo', 'session_id': 's1', 'memory_mb': 10})
with pytest.raises(Exception):
BoxSpec.model_validate({'cmd': 'echo', 'session_id': 's1', 'pids_limit': 0})
# ── Observability tests ───────────────────────────────────────────────
@pytest.mark.asyncio
async def test_runtime_get_status_reports_backend_and_sessions():
logger = Mock()
backend = FakeBackend(logger)
runtime = BoxRuntime(logger=logger, backends=[backend], session_ttl_sec=300)
await runtime.initialize()
status = await runtime.get_status()
assert status['backend']['name'] == 'fake'
assert status['backend']['available'] is True
assert status['active_sessions'] == 0
await runtime.execute(BoxSpec.model_validate({'cmd': 'echo', 'session_id': 'obs-1'}))
status = await runtime.get_status()
assert status['active_sessions'] == 1
@pytest.mark.asyncio
async def test_runtime_get_sessions_returns_session_info():
logger = Mock()
backend = FakeBackend(logger)
runtime = BoxRuntime(logger=logger, backends=[backend], session_ttl_sec=300)
await runtime.initialize()
await runtime.execute(BoxSpec.model_validate({'cmd': 'echo', 'session_id': 'obs-2'}))
sessions = runtime.get_sessions()
assert len(sessions) == 1
assert sessions[0]['session_id'] == 'obs-2'
assert sessions[0]['backend_name'] == 'fake'
assert 'created_at' in sessions[0]
assert 'last_used_at' in sessions[0]
@pytest.mark.asyncio
async def test_runtime_get_backend_info_when_no_backend():
logger = Mock()
backend = FakeBackend(logger, available=False)
runtime = BoxRuntime(logger=logger, backends=[backend], session_ttl_sec=300)
await runtime.initialize()
info = await runtime.get_backend_info()
assert info['name'] is None
assert info['available'] is False
@pytest.mark.asyncio
async def test_service_records_errors_on_failure():
logger = Mock()
backend = FakeBackend(logger, available=False)
runtime = BoxRuntime(logger=logger, backends=[backend], session_ttl_sec=300)
service = BoxService(make_app(logger), runtime=runtime)
await service.initialize()
with pytest.raises(Exception):
await service.execute_sandbox_tool({'cmd': 'echo hello'}, make_query(50))
errors = service.get_recent_errors()
assert len(errors) == 1
assert errors[0]['type'] == 'BoxBackendUnavailableError'
assert errors[0]['query_id'] == '50'
assert 'timestamp' in errors[0]
@pytest.mark.asyncio
async def test_service_error_ring_buffer_capped():
logger = Mock()
backend = FakeBackend(logger, available=False)
runtime = BoxRuntime(logger=logger, backends=[backend], session_ttl_sec=300)
service = BoxService(make_app(logger), runtime=runtime)
await service.initialize()
for i in range(60):
with pytest.raises(Exception):
await service.execute_sandbox_tool({'cmd': 'fail'}, make_query(100 + i))
errors = service.get_recent_errors()
assert len(errors) == 50
# Oldest should have been evicted, newest kept
assert errors[0]['query_id'] == '110'
assert errors[-1]['query_id'] == '159'
@pytest.mark.asyncio
async def test_service_get_status_aggregates_runtime_and_profile():
logger = Mock()
backend = FakeBackend(logger)
runtime = BoxRuntime(logger=logger, backends=[backend], session_ttl_sec=300)
service = BoxService(make_app(logger), runtime=runtime)
await service.initialize()
status = await service.get_status()
assert status['profile'] == 'default'
assert status['backend']['name'] == 'fake'
assert status['backend']['available'] is True
assert status['active_sessions'] == 0
assert status['recent_error_count'] == 0