From 99328cf4c019739a294f23de7f1ba46fc2f64e98 Mon Sep 17 00:00:00 2001 From: Junyan Qin Date: Wed, 20 May 2026 16:50:46 +0800 Subject: [PATCH] fix(skill): harden mount/reload paths and HTTP errors against stale skill cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Box backends behave inconsistently when extra_mounts reference a missing host directory (nsjail aborts the entire sandbox start, Docker silently creates a root-owned empty dir on the host, E2B silently skips the upload). The cache in skill_mgr.skills is only refreshed on in-process mutations, so out-of-band changes — container rebuilds, manual rm in the box volume, anything the LangBot API didn't drive — leave a stale skill that later produces one of those bad mount paths. - box/service.py: build_skill_extra_mounts now filters skills whose package_root is not isdir on the LangBot-visible filesystem and logs a warning, instead of passing the bad mount through to the backend - skill/manager.py: reload_skills (Box path) drops skills whose package_root is missing on the LangBot-side filesystem before they reach the in-memory cache, with a summary warning - api/http/controller/groups/skills.py: file/CRUD handlers now also catch BoxError (RuntimeError subclass, previously slipping past ``except ValueError`` and surfacing as 500); list/get handlers gain a try/except so a transient Box RPC failure becomes a clean 400 instead of a stack trace Tests added for build_skill_extra_mounts (skip missing, skip empty, no skill manager) and SkillManager.reload_skills (drop missing on Box path). Full unit suite: 279 passed. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../pkg/api/http/controller/groups/skills.py | 34 +++++++---- src/langbot/pkg/box/service.py | 16 +++++ src/langbot/pkg/skill/manager.py | 27 +++++++- tests/unit_tests/box/test_box_service.py | 61 +++++++++++++++++++ tests/unit_tests/provider/test_skill_tools.py | 30 +++++++++ 5 files changed, 152 insertions(+), 16 deletions(-) diff --git a/src/langbot/pkg/api/http/controller/groups/skills.py b/src/langbot/pkg/api/http/controller/groups/skills.py index 21add5a6..946741d7 100644 --- a/src/langbot/pkg/api/http/controller/groups/skills.py +++ b/src/langbot/pkg/api/http/controller/groups/skills.py @@ -2,6 +2,8 @@ from __future__ import annotations import quart +from langbot_plugin.box.errors import BoxError + from .. import group @@ -13,7 +15,10 @@ class SkillsRouterGroup(group.RouterGroup): @self.route('', methods=['GET', 'POST'], auth_type=group.AuthType.USER_TOKEN_OR_API_KEY) async def list_or_create_skills() -> quart.Response: if quart.request.method == 'GET': - skills = await self.ap.skill_service.list_skills() + try: + skills = await self.ap.skill_service.list_skills() + except (ValueError, BoxError) as exc: + return self.http_status(400, -1, str(exc)) return self.success(data={'skills': skills}) data = await quart.request.json @@ -23,13 +28,16 @@ class SkillsRouterGroup(group.RouterGroup): try: skill = await self.ap.skill_service.create_skill(data) return self.success(data={'skill': skill}) - except ValueError as exc: + except (ValueError, BoxError) as exc: return self.http_status(400, -1, str(exc)) @self.route('/', methods=['GET', 'PUT', 'DELETE'], auth_type=group.AuthType.USER_TOKEN_OR_API_KEY) async def get_update_delete_skill(skill_name: str) -> quart.Response: if quart.request.method == 'GET': - skill = await self.ap.skill_service.get_skill(skill_name) + try: + skill = await self.ap.skill_service.get_skill(skill_name) + except (ValueError, BoxError) as exc: + return self.http_status(400, -1, str(exc)) if not skill: return self.http_status(404, -1, 'Skill not found') return self.success(data={'skill': skill}) @@ -39,13 +47,13 @@ class SkillsRouterGroup(group.RouterGroup): try: skill = await self.ap.skill_service.update_skill(skill_name, data) return self.success(data={'skill': skill}) - except ValueError as exc: + except (ValueError, BoxError) as exc: return self.http_status(400, -1, str(exc)) try: await self.ap.skill_service.delete_skill(skill_name) return self.success() - except ValueError as exc: + except (ValueError, BoxError) as exc: return self.http_status(400, -1, str(exc)) @self.route('//files', methods=['GET'], auth_type=group.AuthType.USER_TOKEN_OR_API_KEY) @@ -61,7 +69,7 @@ class SkillsRouterGroup(group.RouterGroup): include_hidden=include_hidden, ) return self.success(data=result) - except ValueError as exc: + except (ValueError, BoxError) as exc: return self.http_status(400, -1, str(exc)) @self.route( @@ -73,7 +81,7 @@ class SkillsRouterGroup(group.RouterGroup): try: result = await self.ap.skill_service.read_skill_file(skill_name, path) return self.success(data=result) - except ValueError as exc: + except (ValueError, BoxError) as exc: return self.http_status(400, -1, str(exc)) # PUT - write file @@ -85,7 +93,7 @@ class SkillsRouterGroup(group.RouterGroup): try: result = await self.ap.skill_service.write_skill_file(skill_name, path, content) return self.success(data=result) - except ValueError as exc: + except (ValueError, BoxError) as exc: return self.http_status(400, -1, str(exc)) @self.route('//preview', methods=['GET'], auth_type=group.AuthType.USER_TOKEN_OR_API_KEY) @@ -109,7 +117,7 @@ class SkillsRouterGroup(group.RouterGroup): try: skill = await self.ap.skill_service.install_from_github(data) return self.success(data={'skills': skill}) - except ValueError as exc: + except (ValueError, BoxError) as exc: return self.http_status(400, -1, str(exc)) except Exception as exc: return self.http_status(500, -1, f'Failed to install skill: {exc}') @@ -128,7 +136,7 @@ class SkillsRouterGroup(group.RouterGroup): try: preview = await self.ap.skill_service.preview_install_from_github(data) return self.success(data={'skills': preview}) - except ValueError as exc: + except (ValueError, BoxError) as exc: return self.http_status(400, -1, str(exc)) except Exception as exc: return self.http_status(500, -1, f'Failed to preview skill: {exc}') @@ -147,7 +155,7 @@ class SkillsRouterGroup(group.RouterGroup): source_paths=form.getlist('source_paths'), ) return self.success(data={'skills': skill}) - except ValueError as exc: + except (ValueError, BoxError) as exc: return self.http_status(400, -1, str(exc)) except Exception as exc: return self.http_status(500, -1, f'Failed to install skill: {exc}') @@ -164,7 +172,7 @@ class SkillsRouterGroup(group.RouterGroup): filename=file.filename or '', ) return self.success(data={'skills': preview}) - except ValueError as exc: + except (ValueError, BoxError) as exc: return self.http_status(400, -1, str(exc)) except Exception as exc: return self.http_status(500, -1, f'Failed to preview skill: {exc}') @@ -178,5 +186,5 @@ class SkillsRouterGroup(group.RouterGroup): try: result = await self.ap.skill_service.scan_directory_async(path) return self.success(data=result) - except ValueError as exc: + except (ValueError, BoxError) as exc: return self.http_status(400, -1, str(exc)) diff --git a/src/langbot/pkg/box/service.py b/src/langbot/pkg/box/service.py index 4b3d6a1a..038d8119 100644 --- a/src/langbot/pkg/box/service.py +++ b/src/langbot/pkg/box/service.py @@ -192,6 +192,15 @@ class BoxService: This ensures that when a container is first created it already has all skill packages mounted, regardless of which skill is currently activated. + + Skills whose ``package_root`` is missing or no longer a directory on + the LangBot-visible filesystem are skipped with a warning instead of + being passed through to the backend. Without this guard the three + backends behave inconsistently on a stale mount: nsjail refuses to + start the sandbox (failing every exec in the session), Docker + silently auto-creates a root-owned empty directory on the host, and + E2B silently skips the upload — none of which surfaces an + actionable error to the agent or operator. """ skill_mgr = getattr(self.ap, 'skill_mgr', None) if skill_mgr is None: @@ -205,6 +214,13 @@ class BoxService: package_root = str(skill_data.get('package_root', '') or '').strip() if not package_root: continue + if not os.path.isdir(package_root): + self.ap.logger.warning( + f'Skill "{skill_name}" package_root missing on filesystem ' + f'({package_root}); skipping mount to prevent sandbox failures. ' + f'The skill cache may be stale — consider reloading skills.' + ) + continue mounts.append( { 'host_path': package_root, diff --git a/src/langbot/pkg/skill/manager.py b/src/langbot/pkg/skill/manager.py index d5d94351..2bf1b4d8 100644 --- a/src/langbot/pkg/skill/manager.py +++ b/src/langbot/pkg/skill/manager.py @@ -50,11 +50,32 @@ class SkillManager: box_service = getattr(self.ap, 'box_service', None) if box_service is not None and getattr(box_service, 'available', False): try: + dropped = 0 for skill_data in await box_service.list_skills(): skill_name = skill_data.get('name') - if skill_name: - self.skills[skill_name] = skill_data - self.ap.logger.info(f'Loaded {len(self.skills)} skills from Box runtime') + if not skill_name: + continue + # Drop skills whose package_root is no longer visible on the + # LangBot-side filesystem (e.g. Box volume was rebuilt or the + # directory was deleted out-of-band). Keeping them in the cache + # would cause stale extra_mounts and confusing UI states. + package_root = str(skill_data.get('package_root', '') or '').strip() + if package_root and not os.path.isdir(package_root): + self.ap.logger.warning( + f'Skill "{skill_name}" reported by Box runtime but ' + f'package_root missing on LangBot filesystem ' + f'({package_root}); dropping from in-memory cache.' + ) + dropped += 1 + continue + self.skills[skill_name] = skill_data + if dropped: + self.ap.logger.warning( + f'Loaded {len(self.skills)} skills from Box runtime ' + f'({dropped} dropped due to missing package_root).' + ) + else: + self.ap.logger.info(f'Loaded {len(self.skills)} skills from Box runtime') return except Exception as exc: self.ap.logger.warning(f'Failed to load skills from Box runtime, falling back to local data: {exc}') diff --git a/tests/unit_tests/box/test_box_service.py b/tests/unit_tests/box/test_box_service.py index 36bda025..1f87b073 100644 --- a/tests/unit_tests/box/test_box_service.py +++ b/tests/unit_tests/box/test_box_service.py @@ -3,6 +3,7 @@ from __future__ import annotations import asyncio import datetime as dt import os +import tempfile from types import SimpleNamespace from unittest.mock import AsyncMock, Mock @@ -1222,3 +1223,63 @@ class TestBoxHostMountModeNone: mount_path='/project', workdir='/workspace', ) + + +class TestBuildSkillExtraMounts: + """Robustness of skill mount construction against a stale skill cache. + + The three sandbox backends behave inconsistently when a skill's + package_root no longer exists on disk (nsjail aborts the whole sandbox + start, Docker silently auto-creates a root-owned empty directory, E2B + silently skips). Mount construction must filter these out up front so + the backend never sees a bad mount. + """ + + def _make_service(self, logger, skills): + app = make_app(logger) + app.skill_mgr = SimpleNamespace(skills=skills) + client = Mock(spec=BoxRuntimeClient) + return BoxService(app, client=client) + + def test_skips_skill_with_missing_package_root(self): + logger = Mock() + with tempfile.TemporaryDirectory() as live_dir: + skills = { + 'alive': {'name': 'alive', 'package_root': live_dir}, + 'ghost': {'name': 'ghost', 'package_root': '/nonexistent/path/should/never/exist'}, + } + service = self._make_service(logger, skills) + query = make_query() + + mounts = service.build_skill_extra_mounts(query) + + assert mounts == [ + { + 'host_path': live_dir, + 'mount_path': '/workspace/.skills/alive', + 'mode': 'rw', + } + ] + # Warning logged so operators can see what was dropped + assert any( + 'ghost' in str(call.args[0]) and 'package_root missing' in str(call.args[0]) + for call in logger.warning.call_args_list + ) + + def test_skips_skill_with_empty_package_root(self): + logger = Mock() + skills = { + 'no_root': {'name': 'no_root', 'package_root': ''}, + 'whitespace': {'name': 'whitespace', 'package_root': ' '}, + } + service = self._make_service(logger, skills) + + assert service.build_skill_extra_mounts(make_query()) == [] + + def test_returns_empty_when_no_skill_manager(self): + logger = Mock() + app = make_app(logger) + # no skill_mgr attribute + service = BoxService(app, client=Mock(spec=BoxRuntimeClient)) + + assert service.build_skill_extra_mounts(make_query()) == [] diff --git a/tests/unit_tests/provider/test_skill_tools.py b/tests/unit_tests/provider/test_skill_tools.py index 5c8f901f..26b4bf84 100644 --- a/tests/unit_tests/provider/test_skill_tools.py +++ b/tests/unit_tests/provider/test_skill_tools.py @@ -78,6 +78,36 @@ class TestSkillManagerPackageLoading: assert skill_data['instructions'] == 'Updated instructions' assert skill_data['description'] == 'Second' + @pytest.mark.asyncio + async def test_reload_skills_drops_box_skills_with_missing_package_root(self): + """When Box reports a skill whose package_root is gone from the + LangBot-visible filesystem, the cache must drop it instead of + keeping a stale entry that would later produce a bad mount.""" + from langbot.pkg.skill.manager import SkillManager + + with tempfile.TemporaryDirectory() as live_dir: + ghost_dir = os.path.join(live_dir, '_does_not_exist') + box_service = SimpleNamespace( + available=True, + list_skills=AsyncMock( + return_value=[ + _make_skill_data(name='alive', package_root=live_dir), + _make_skill_data(name='ghost', package_root=ghost_dir), + ] + ), + ) + + ap = _make_ap() + ap.box_service = box_service + mgr = SkillManager(ap) + + await mgr.reload_skills() + + assert list(mgr.skills) == ['alive'] + # Warning fired with the dropped skill name so operators can see it. + warning_messages = [str(call.args[0]) for call in ap.logger.warning.call_args_list] + assert any('ghost' in msg and 'package_root missing' in msg for msg in warning_messages) + class TestSkillActivationHelper: """Skill activation is now Tool-Call based.