From 3e2190a15354510fa1d2ec001cd9c58340c337d8 Mon Sep 17 00:00:00 2001 From: Junyan Qin Date: Sun, 19 Apr 2026 14:24:36 +0800 Subject: [PATCH] fix(box): add persistent reconnection loop with exponential backoff The previous disconnect handler only retried once and then gave up. Now spawns a background task that retries with exponential backoff (3s, 6s, 12s, ... up to 60s) until the Box runtime is reachable again. Uses a _reconnecting guard to prevent duplicate loops. Calls connector.dispose() before each retry to clean up stale tasks. --- src/langbot/pkg/box/service.py | 41 +++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/src/langbot/pkg/box/service.py b/src/langbot/pkg/box/service.py index 7f89459f..a5383225 100644 --- a/src/langbot/pkg/box/service.py +++ b/src/langbot/pkg/box/service.py @@ -67,6 +67,7 @@ class BoxService: self._shutdown_task = None self._available = False self._connector_error: str = '' + self._reconnecting = False async def initialize(self): self._ensure_default_host_workspace() @@ -87,19 +88,39 @@ class BoxService: self._connector_error = str(exc) async def _on_runtime_disconnect(self, connector: BoxRuntimeConnector) -> None: - """Called by the connector when the Box runtime connection drops.""" + """Called by the connector when the Box runtime connection drops. + + Spawns a background reconnection loop so the caller is not blocked. + """ + if self._reconnecting: + return # Another reconnect loop is already running + self._reconnecting = True self._available = False self._connector_error = 'Disconnected from Box runtime' - self.ap.logger.warning('Box runtime disconnected, sandbox features temporarily disabled. Reconnecting in 3s...') - await asyncio.sleep(3) + self.ap.logger.warning('Box runtime disconnected, sandbox features temporarily disabled.') + asyncio.create_task(self._reconnect_loop(connector)) + + async def _reconnect_loop(self, connector: BoxRuntimeConnector) -> None: + """Retry reconnection with exponential backoff (3s → 60s max).""" + delay = 3 + max_delay = 60 try: - await connector.initialize() - self._available = True - self._connector_error = '' - self.ap.logger.info('Box runtime reconnected, sandbox features restored.') - except Exception as exc: - self._connector_error = str(exc) - self.ap.logger.warning(f'Box runtime reconnection failed: {exc}. Will retry on next heartbeat disconnect.') + while True: + self.ap.logger.info(f'Attempting to reconnect to Box runtime in {delay}s...') + await asyncio.sleep(delay) + try: + connector.dispose() + await connector.initialize() + self._available = True + self._connector_error = '' + self.ap.logger.info('Box runtime reconnected, sandbox features restored.') + return + except Exception as exc: + self._connector_error = str(exc) + self.ap.logger.warning(f'Box runtime reconnection failed: {exc}') + delay = min(delay * 2, max_delay) + finally: + self._reconnecting = False @property def available(self) -> bool: