fix(skill): harden mount/reload paths and HTTP errors against stale skill cache

The Box backends behave inconsistently when extra_mounts reference a
missing host directory (nsjail aborts the entire sandbox start, Docker
silently creates a root-owned empty dir on the host, E2B silently skips
the upload). The cache in skill_mgr.skills is only refreshed on
in-process mutations, so out-of-band changes — container rebuilds,
manual rm in the box volume, anything the LangBot API didn't drive —
leave a stale skill that later produces one of those bad mount paths.

- box/service.py: build_skill_extra_mounts now filters skills whose
  package_root is not isdir on the LangBot-visible filesystem and logs
  a warning, instead of passing the bad mount through to the backend
- skill/manager.py: reload_skills (Box path) drops skills whose
  package_root is missing on the LangBot-side filesystem before they
  reach the in-memory cache, with a summary warning
- api/http/controller/groups/skills.py: file/CRUD handlers now also
  catch BoxError (RuntimeError subclass, previously slipping past
  ``except ValueError`` and surfacing as 500); list/get handlers gain
  a try/except so a transient Box RPC failure becomes a clean 400
  instead of a stack trace

Tests added for build_skill_extra_mounts (skip missing, skip empty,
no skill manager) and SkillManager.reload_skills (drop missing on Box
path). Full unit suite: 279 passed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Junyan Qin
2026-05-20 16:50:46 +08:00
parent 28c00cb8d1
commit 99328cf4c0
5 changed files with 152 additions and 16 deletions

View File

@@ -2,6 +2,8 @@ from __future__ import annotations
import quart import quart
from langbot_plugin.box.errors import BoxError
from .. import group from .. import group
@@ -13,7 +15,10 @@ class SkillsRouterGroup(group.RouterGroup):
@self.route('', methods=['GET', 'POST'], auth_type=group.AuthType.USER_TOKEN_OR_API_KEY) @self.route('', methods=['GET', 'POST'], auth_type=group.AuthType.USER_TOKEN_OR_API_KEY)
async def list_or_create_skills() -> quart.Response: async def list_or_create_skills() -> quart.Response:
if quart.request.method == 'GET': if quart.request.method == 'GET':
skills = await self.ap.skill_service.list_skills() try:
skills = await self.ap.skill_service.list_skills()
except (ValueError, BoxError) as exc:
return self.http_status(400, -1, str(exc))
return self.success(data={'skills': skills}) return self.success(data={'skills': skills})
data = await quart.request.json data = await quart.request.json
@@ -23,13 +28,16 @@ class SkillsRouterGroup(group.RouterGroup):
try: try:
skill = await self.ap.skill_service.create_skill(data) skill = await self.ap.skill_service.create_skill(data)
return self.success(data={'skill': skill}) return self.success(data={'skill': skill})
except ValueError as exc: except (ValueError, BoxError) as exc:
return self.http_status(400, -1, str(exc)) return self.http_status(400, -1, str(exc))
@self.route('/<skill_name>', methods=['GET', 'PUT', 'DELETE'], auth_type=group.AuthType.USER_TOKEN_OR_API_KEY) @self.route('/<skill_name>', methods=['GET', 'PUT', 'DELETE'], auth_type=group.AuthType.USER_TOKEN_OR_API_KEY)
async def get_update_delete_skill(skill_name: str) -> quart.Response: async def get_update_delete_skill(skill_name: str) -> quart.Response:
if quart.request.method == 'GET': if quart.request.method == 'GET':
skill = await self.ap.skill_service.get_skill(skill_name) try:
skill = await self.ap.skill_service.get_skill(skill_name)
except (ValueError, BoxError) as exc:
return self.http_status(400, -1, str(exc))
if not skill: if not skill:
return self.http_status(404, -1, 'Skill not found') return self.http_status(404, -1, 'Skill not found')
return self.success(data={'skill': skill}) return self.success(data={'skill': skill})
@@ -39,13 +47,13 @@ class SkillsRouterGroup(group.RouterGroup):
try: try:
skill = await self.ap.skill_service.update_skill(skill_name, data) skill = await self.ap.skill_service.update_skill(skill_name, data)
return self.success(data={'skill': skill}) return self.success(data={'skill': skill})
except ValueError as exc: except (ValueError, BoxError) as exc:
return self.http_status(400, -1, str(exc)) return self.http_status(400, -1, str(exc))
try: try:
await self.ap.skill_service.delete_skill(skill_name) await self.ap.skill_service.delete_skill(skill_name)
return self.success() return self.success()
except ValueError as exc: except (ValueError, BoxError) as exc:
return self.http_status(400, -1, str(exc)) return self.http_status(400, -1, str(exc))
@self.route('/<skill_name>/files', methods=['GET'], auth_type=group.AuthType.USER_TOKEN_OR_API_KEY) @self.route('/<skill_name>/files', methods=['GET'], auth_type=group.AuthType.USER_TOKEN_OR_API_KEY)
@@ -61,7 +69,7 @@ class SkillsRouterGroup(group.RouterGroup):
include_hidden=include_hidden, include_hidden=include_hidden,
) )
return self.success(data=result) return self.success(data=result)
except ValueError as exc: except (ValueError, BoxError) as exc:
return self.http_status(400, -1, str(exc)) return self.http_status(400, -1, str(exc))
@self.route( @self.route(
@@ -73,7 +81,7 @@ class SkillsRouterGroup(group.RouterGroup):
try: try:
result = await self.ap.skill_service.read_skill_file(skill_name, path) result = await self.ap.skill_service.read_skill_file(skill_name, path)
return self.success(data=result) return self.success(data=result)
except ValueError as exc: except (ValueError, BoxError) as exc:
return self.http_status(400, -1, str(exc)) return self.http_status(400, -1, str(exc))
# PUT - write file # PUT - write file
@@ -85,7 +93,7 @@ class SkillsRouterGroup(group.RouterGroup):
try: try:
result = await self.ap.skill_service.write_skill_file(skill_name, path, content) result = await self.ap.skill_service.write_skill_file(skill_name, path, content)
return self.success(data=result) return self.success(data=result)
except ValueError as exc: except (ValueError, BoxError) as exc:
return self.http_status(400, -1, str(exc)) return self.http_status(400, -1, str(exc))
@self.route('/<skill_name>/preview', methods=['GET'], auth_type=group.AuthType.USER_TOKEN_OR_API_KEY) @self.route('/<skill_name>/preview', methods=['GET'], auth_type=group.AuthType.USER_TOKEN_OR_API_KEY)
@@ -109,7 +117,7 @@ class SkillsRouterGroup(group.RouterGroup):
try: try:
skill = await self.ap.skill_service.install_from_github(data) skill = await self.ap.skill_service.install_from_github(data)
return self.success(data={'skills': skill}) return self.success(data={'skills': skill})
except ValueError as exc: except (ValueError, BoxError) as exc:
return self.http_status(400, -1, str(exc)) return self.http_status(400, -1, str(exc))
except Exception as exc: except Exception as exc:
return self.http_status(500, -1, f'Failed to install skill: {exc}') return self.http_status(500, -1, f'Failed to install skill: {exc}')
@@ -128,7 +136,7 @@ class SkillsRouterGroup(group.RouterGroup):
try: try:
preview = await self.ap.skill_service.preview_install_from_github(data) preview = await self.ap.skill_service.preview_install_from_github(data)
return self.success(data={'skills': preview}) return self.success(data={'skills': preview})
except ValueError as exc: except (ValueError, BoxError) as exc:
return self.http_status(400, -1, str(exc)) return self.http_status(400, -1, str(exc))
except Exception as exc: except Exception as exc:
return self.http_status(500, -1, f'Failed to preview skill: {exc}') return self.http_status(500, -1, f'Failed to preview skill: {exc}')
@@ -147,7 +155,7 @@ class SkillsRouterGroup(group.RouterGroup):
source_paths=form.getlist('source_paths'), source_paths=form.getlist('source_paths'),
) )
return self.success(data={'skills': skill}) return self.success(data={'skills': skill})
except ValueError as exc: except (ValueError, BoxError) as exc:
return self.http_status(400, -1, str(exc)) return self.http_status(400, -1, str(exc))
except Exception as exc: except Exception as exc:
return self.http_status(500, -1, f'Failed to install skill: {exc}') return self.http_status(500, -1, f'Failed to install skill: {exc}')
@@ -164,7 +172,7 @@ class SkillsRouterGroup(group.RouterGroup):
filename=file.filename or '', filename=file.filename or '',
) )
return self.success(data={'skills': preview}) return self.success(data={'skills': preview})
except ValueError as exc: except (ValueError, BoxError) as exc:
return self.http_status(400, -1, str(exc)) return self.http_status(400, -1, str(exc))
except Exception as exc: except Exception as exc:
return self.http_status(500, -1, f'Failed to preview skill: {exc}') return self.http_status(500, -1, f'Failed to preview skill: {exc}')
@@ -178,5 +186,5 @@ class SkillsRouterGroup(group.RouterGroup):
try: try:
result = await self.ap.skill_service.scan_directory_async(path) result = await self.ap.skill_service.scan_directory_async(path)
return self.success(data=result) return self.success(data=result)
except ValueError as exc: except (ValueError, BoxError) as exc:
return self.http_status(400, -1, str(exc)) return self.http_status(400, -1, str(exc))

View File

@@ -192,6 +192,15 @@ class BoxService:
This ensures that when a container is first created it already has This ensures that when a container is first created it already has
all skill packages mounted, regardless of which skill is currently all skill packages mounted, regardless of which skill is currently
activated. activated.
Skills whose ``package_root`` is missing or no longer a directory on
the LangBot-visible filesystem are skipped with a warning instead of
being passed through to the backend. Without this guard the three
backends behave inconsistently on a stale mount: nsjail refuses to
start the sandbox (failing every exec in the session), Docker
silently auto-creates a root-owned empty directory on the host, and
E2B silently skips the upload — none of which surfaces an
actionable error to the agent or operator.
""" """
skill_mgr = getattr(self.ap, 'skill_mgr', None) skill_mgr = getattr(self.ap, 'skill_mgr', None)
if skill_mgr is None: if skill_mgr is None:
@@ -205,6 +214,13 @@ class BoxService:
package_root = str(skill_data.get('package_root', '') or '').strip() package_root = str(skill_data.get('package_root', '') or '').strip()
if not package_root: if not package_root:
continue continue
if not os.path.isdir(package_root):
self.ap.logger.warning(
f'Skill "{skill_name}" package_root missing on filesystem '
f'({package_root}); skipping mount to prevent sandbox failures. '
f'The skill cache may be stale — consider reloading skills.'
)
continue
mounts.append( mounts.append(
{ {
'host_path': package_root, 'host_path': package_root,

View File

@@ -50,11 +50,32 @@ class SkillManager:
box_service = getattr(self.ap, 'box_service', None) box_service = getattr(self.ap, 'box_service', None)
if box_service is not None and getattr(box_service, 'available', False): if box_service is not None and getattr(box_service, 'available', False):
try: try:
dropped = 0
for skill_data in await box_service.list_skills(): for skill_data in await box_service.list_skills():
skill_name = skill_data.get('name') skill_name = skill_data.get('name')
if skill_name: if not skill_name:
self.skills[skill_name] = skill_data continue
self.ap.logger.info(f'Loaded {len(self.skills)} skills from Box runtime') # Drop skills whose package_root is no longer visible on the
# LangBot-side filesystem (e.g. Box volume was rebuilt or the
# directory was deleted out-of-band). Keeping them in the cache
# would cause stale extra_mounts and confusing UI states.
package_root = str(skill_data.get('package_root', '') or '').strip()
if package_root and not os.path.isdir(package_root):
self.ap.logger.warning(
f'Skill "{skill_name}" reported by Box runtime but '
f'package_root missing on LangBot filesystem '
f'({package_root}); dropping from in-memory cache.'
)
dropped += 1
continue
self.skills[skill_name] = skill_data
if dropped:
self.ap.logger.warning(
f'Loaded {len(self.skills)} skills from Box runtime '
f'({dropped} dropped due to missing package_root).'
)
else:
self.ap.logger.info(f'Loaded {len(self.skills)} skills from Box runtime')
return return
except Exception as exc: except Exception as exc:
self.ap.logger.warning(f'Failed to load skills from Box runtime, falling back to local data: {exc}') self.ap.logger.warning(f'Failed to load skills from Box runtime, falling back to local data: {exc}')

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
import asyncio import asyncio
import datetime as dt import datetime as dt
import os import os
import tempfile
from types import SimpleNamespace from types import SimpleNamespace
from unittest.mock import AsyncMock, Mock from unittest.mock import AsyncMock, Mock
@@ -1222,3 +1223,63 @@ class TestBoxHostMountModeNone:
mount_path='/project', mount_path='/project',
workdir='/workspace', workdir='/workspace',
) )
class TestBuildSkillExtraMounts:
"""Robustness of skill mount construction against a stale skill cache.
The three sandbox backends behave inconsistently when a skill's
package_root no longer exists on disk (nsjail aborts the whole sandbox
start, Docker silently auto-creates a root-owned empty directory, E2B
silently skips). Mount construction must filter these out up front so
the backend never sees a bad mount.
"""
def _make_service(self, logger, skills):
app = make_app(logger)
app.skill_mgr = SimpleNamespace(skills=skills)
client = Mock(spec=BoxRuntimeClient)
return BoxService(app, client=client)
def test_skips_skill_with_missing_package_root(self):
logger = Mock()
with tempfile.TemporaryDirectory() as live_dir:
skills = {
'alive': {'name': 'alive', 'package_root': live_dir},
'ghost': {'name': 'ghost', 'package_root': '/nonexistent/path/should/never/exist'},
}
service = self._make_service(logger, skills)
query = make_query()
mounts = service.build_skill_extra_mounts(query)
assert mounts == [
{
'host_path': live_dir,
'mount_path': '/workspace/.skills/alive',
'mode': 'rw',
}
]
# Warning logged so operators can see what was dropped
assert any(
'ghost' in str(call.args[0]) and 'package_root missing' in str(call.args[0])
for call in logger.warning.call_args_list
)
def test_skips_skill_with_empty_package_root(self):
logger = Mock()
skills = {
'no_root': {'name': 'no_root', 'package_root': ''},
'whitespace': {'name': 'whitespace', 'package_root': ' '},
}
service = self._make_service(logger, skills)
assert service.build_skill_extra_mounts(make_query()) == []
def test_returns_empty_when_no_skill_manager(self):
logger = Mock()
app = make_app(logger)
# no skill_mgr attribute
service = BoxService(app, client=Mock(spec=BoxRuntimeClient))
assert service.build_skill_extra_mounts(make_query()) == []

View File

@@ -78,6 +78,36 @@ class TestSkillManagerPackageLoading:
assert skill_data['instructions'] == 'Updated instructions' assert skill_data['instructions'] == 'Updated instructions'
assert skill_data['description'] == 'Second' assert skill_data['description'] == 'Second'
@pytest.mark.asyncio
async def test_reload_skills_drops_box_skills_with_missing_package_root(self):
"""When Box reports a skill whose package_root is gone from the
LangBot-visible filesystem, the cache must drop it instead of
keeping a stale entry that would later produce a bad mount."""
from langbot.pkg.skill.manager import SkillManager
with tempfile.TemporaryDirectory() as live_dir:
ghost_dir = os.path.join(live_dir, '_does_not_exist')
box_service = SimpleNamespace(
available=True,
list_skills=AsyncMock(
return_value=[
_make_skill_data(name='alive', package_root=live_dir),
_make_skill_data(name='ghost', package_root=ghost_dir),
]
),
)
ap = _make_ap()
ap.box_service = box_service
mgr = SkillManager(ap)
await mgr.reload_skills()
assert list(mgr.skills) == ['alive']
# Warning fired with the dropped skill name so operators can see it.
warning_messages = [str(call.args[0]) for call in ap.logger.warning.call_args_list]
assert any('ghost' in msg and 'package_root missing' in msg for msg in warning_messages)
class TestSkillActivationHelper: class TestSkillActivationHelper:
"""Skill activation is now Tool-Call based. """Skill activation is now Tool-Call based.