diff --git a/src/langbot/pkg/api/http/controller/groups/box.py b/src/langbot/pkg/api/http/controller/groups/box.py new file mode 100644 index 00000000..13b9a139 --- /dev/null +++ b/src/langbot/pkg/api/http/controller/groups/box.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from .. import group + + +@group.group_class('box', '/api/v1/box') +class BoxRouterGroup(group.RouterGroup): + async def initialize(self) -> None: + @self.route('/status', methods=['GET'], auth_type=group.AuthType.USER_TOKEN) + async def _() -> str: + status = await self.ap.box_service.get_status() + return self.success(data=status) + + @self.route('/sessions', methods=['GET'], auth_type=group.AuthType.USER_TOKEN) + async def _() -> str: + sessions = self.ap.box_service.runtime.get_sessions() + return self.success(data=sessions) + + @self.route('/errors', methods=['GET'], auth_type=group.AuthType.USER_TOKEN) + async def _() -> str: + errors = self.ap.box_service.get_recent_errors() + return self.success(data=errors) diff --git a/src/langbot/pkg/box/runtime.py b/src/langbot/pkg/box/runtime.py index d4a93ed5..10996727 100644 --- a/src/langbot/pkg/box/runtime.py +++ b/src/langbot/pkg/box/runtime.py @@ -65,6 +65,46 @@ class BoxRuntime: for session_id in session_ids: await self._drop_session_locked(session_id) + # ── Observability ───────────────────────────────────────────────── + + async def get_backend_info(self) -> dict: + backend = self._backend + if backend is None: + return {'name': None, 'available': False} + try: + available = await backend.is_available() + except Exception: + available = False + return {'name': backend.name, 'available': available} + + def get_sessions(self) -> list[dict]: + return [ + { + 'session_id': s.info.session_id, + 'backend_name': s.info.backend_name, + 'backend_session_id': s.info.backend_session_id, + 'image': s.info.image, + 'network': s.info.network.value, + 'host_path': s.info.host_path, + 'host_path_mode': s.info.host_path_mode.value, + 'cpus': s.info.cpus, + 'memory_mb': s.info.memory_mb, + 'pids_limit': s.info.pids_limit, + 'read_only_rootfs': s.info.read_only_rootfs, + 'created_at': s.info.created_at.isoformat(), + 'last_used_at': s.info.last_used_at.isoformat(), + } + for s in self._sessions.values() + ] + + async def get_status(self) -> dict: + backend_info = await self.get_backend_info() + return { + 'backend': backend_info, + 'active_sessions': len(self._sessions), + 'session_ttl_sec': self.session_ttl_sec, + } + async def _get_or_create_session(self, spec: BoxSpec) -> _RuntimeSession: async with self._lock: await self._reap_expired_sessions_locked() diff --git a/src/langbot/pkg/box/service.py b/src/langbot/pkg/box/service.py index 8736706f..3a486165 100644 --- a/src/langbot/pkg/box/service.py +++ b/src/langbot/pkg/box/service.py @@ -1,5 +1,7 @@ from __future__ import annotations +import collections +import datetime as _dt import enum import json import os @@ -7,11 +9,13 @@ from typing import TYPE_CHECKING import pydantic -from .errors import BoxValidationError +from .errors import BoxError, BoxValidationError from .models import BUILTIN_PROFILES, BoxExecutionResult, BoxProfile, BoxSpec from .runtime import BoxRuntime _INT_ADAPTER = pydantic.TypeAdapter(int) +_UTC = _dt.timezone.utc +_MAX_RECENT_ERRORS = 50 if TYPE_CHECKING: from ..core import app as core_app @@ -31,6 +35,7 @@ class BoxService: self.allowed_host_mount_roots = self._load_allowed_host_mount_roots() self.default_host_workspace = self._load_default_host_workspace() self.profile = self._load_profile() + self._recent_errors: collections.deque[dict] = collections.deque(maxlen=_MAX_RECENT_ERRORS) async def initialize(self): await self.runtime.initialize() @@ -48,7 +53,9 @@ class BoxService: spec = BoxSpec.model_validate(spec_payload) except pydantic.ValidationError as exc: first_error = exc.errors()[0] - raise BoxValidationError(first_error.get('msg', 'invalid sandbox_exec arguments')) from exc + err = BoxValidationError(first_error.get('msg', 'invalid sandbox_exec arguments')) + self._record_error(err, query) + raise err from exc self._validate_host_mount(spec) self.ap.logger.info( @@ -56,7 +63,11 @@ class BoxService: f'query_id={query.query_id} ' f'spec={json.dumps(self._summarize_spec(spec), ensure_ascii=False)}' ) - result = await self.runtime.execute(spec) + try: + result = await self.runtime.execute(spec) + except BoxError as exc: + self._record_error(exc, query) + raise self.ap.logger.info( 'LangBot Box result: ' f'query_id={query.query_id} ' @@ -229,3 +240,24 @@ class BoxService: if normalized_timeout > profile.max_timeout_sec: params['timeout_sec'] = profile.max_timeout_sec + + # ── Observability ───────────────────────────────────────────────── + + def _record_error(self, exc: Exception, query: 'pipeline_query.Query'): + self._recent_errors.append({ + 'timestamp': _dt.datetime.now(_UTC).isoformat(), + 'type': type(exc).__name__, + 'message': str(exc), + 'query_id': str(query.query_id), + }) + + def get_recent_errors(self) -> list[dict]: + return list(self._recent_errors) + + async def get_status(self) -> dict: + runtime_status = await self.runtime.get_status() + return { + **runtime_status, + 'profile': self.profile.name, + 'recent_error_count': len(self._recent_errors), + } diff --git a/tests/unit_tests/box/test_box_service.py b/tests/unit_tests/box/test_box_service.py index 104f34ec..d8ccf815 100644 --- a/tests/unit_tests/box/test_box_service.py +++ b/tests/unit_tests/box/test_box_service.py @@ -545,3 +545,104 @@ def test_box_spec_validates_resource_limits(): BoxSpec.model_validate({'cmd': 'echo', 'session_id': 's1', 'memory_mb': 10}) with pytest.raises(Exception): BoxSpec.model_validate({'cmd': 'echo', 'session_id': 's1', 'pids_limit': 0}) + + +# ── Observability tests ─────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_runtime_get_status_reports_backend_and_sessions(): + logger = Mock() + backend = FakeBackend(logger) + runtime = BoxRuntime(logger=logger, backends=[backend], session_ttl_sec=300) + await runtime.initialize() + + status = await runtime.get_status() + assert status['backend']['name'] == 'fake' + assert status['backend']['available'] is True + assert status['active_sessions'] == 0 + + await runtime.execute(BoxSpec.model_validate({'cmd': 'echo', 'session_id': 'obs-1'})) + status = await runtime.get_status() + assert status['active_sessions'] == 1 + + +@pytest.mark.asyncio +async def test_runtime_get_sessions_returns_session_info(): + logger = Mock() + backend = FakeBackend(logger) + runtime = BoxRuntime(logger=logger, backends=[backend], session_ttl_sec=300) + await runtime.initialize() + + await runtime.execute(BoxSpec.model_validate({'cmd': 'echo', 'session_id': 'obs-2'})) + sessions = runtime.get_sessions() + assert len(sessions) == 1 + assert sessions[0]['session_id'] == 'obs-2' + assert sessions[0]['backend_name'] == 'fake' + assert 'created_at' in sessions[0] + assert 'last_used_at' in sessions[0] + + +@pytest.mark.asyncio +async def test_runtime_get_backend_info_when_no_backend(): + logger = Mock() + backend = FakeBackend(logger, available=False) + runtime = BoxRuntime(logger=logger, backends=[backend], session_ttl_sec=300) + await runtime.initialize() + + info = await runtime.get_backend_info() + assert info['name'] is None + assert info['available'] is False + + +@pytest.mark.asyncio +async def test_service_records_errors_on_failure(): + logger = Mock() + backend = FakeBackend(logger, available=False) + runtime = BoxRuntime(logger=logger, backends=[backend], session_ttl_sec=300) + service = BoxService(make_app(logger), runtime=runtime) + await service.initialize() + + with pytest.raises(Exception): + await service.execute_sandbox_tool({'cmd': 'echo hello'}, make_query(50)) + + errors = service.get_recent_errors() + assert len(errors) == 1 + assert errors[0]['type'] == 'BoxBackendUnavailableError' + assert errors[0]['query_id'] == '50' + assert 'timestamp' in errors[0] + + +@pytest.mark.asyncio +async def test_service_error_ring_buffer_capped(): + logger = Mock() + backend = FakeBackend(logger, available=False) + runtime = BoxRuntime(logger=logger, backends=[backend], session_ttl_sec=300) + service = BoxService(make_app(logger), runtime=runtime) + await service.initialize() + + for i in range(60): + with pytest.raises(Exception): + await service.execute_sandbox_tool({'cmd': 'fail'}, make_query(100 + i)) + + errors = service.get_recent_errors() + assert len(errors) == 50 + # Oldest should have been evicted, newest kept + assert errors[0]['query_id'] == '110' + assert errors[-1]['query_id'] == '159' + + +@pytest.mark.asyncio +async def test_service_get_status_aggregates_runtime_and_profile(): + logger = Mock() + backend = FakeBackend(logger) + runtime = BoxRuntime(logger=logger, backends=[backend], session_ttl_sec=300) + service = BoxService(make_app(logger), runtime=runtime) + await service.initialize() + + status = await service.get_status() + assert status['profile'] == 'default' + assert status['backend']['name'] == 'fake' + assert status['backend']['available'] is True + assert status['active_sessions'] == 0 + assert status['recent_error_count'] == 0