mirror of
https://github.com/langbot-app/LangBot.git
synced 2026-06-02 12:05:54 +00:00
feat(box): add heartbeat and reconnection for Box runtime connector
Add 20-second heartbeat ping loop to detect silent Box runtime disconnections. On disconnect, set available=false and attempt reconnection after 3 seconds via the disconnect callback chain. - BoxRuntimeConnector: heartbeat loop, disconnect callback parameter, disconnect detection in connection callback and WS failure handler - BoxService: wire disconnect callback to toggle available state and re-initialize the connector on reconnection
This commit is contained in:
@@ -3,6 +3,7 @@ from __future__ import annotations
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import typing
|
||||
from typing import TYPE_CHECKING
|
||||
from urllib.parse import urlparse
|
||||
|
||||
@@ -24,6 +25,8 @@ if TYPE_CHECKING:
|
||||
_DOCKER_BOX_HOST = 'langbot_box'
|
||||
_DEFAULT_PORT = 5410
|
||||
|
||||
_HEARTBEAT_INTERVAL_SEC = 20
|
||||
|
||||
|
||||
def _get_box_config(ap) -> dict:
|
||||
"""Return the 'box' section from instance config, with safe fallbacks."""
|
||||
@@ -61,8 +64,16 @@ class BoxRuntimeConnector(ManagedRuntimeConnector):
|
||||
3. Unix / macOS -> subprocess + stdio pipe
|
||||
"""
|
||||
|
||||
def __init__(self, ap: core_app.Application):
|
||||
def __init__(
|
||||
self,
|
||||
ap: core_app.Application,
|
||||
runtime_disconnect_callback: typing.Callable[
|
||||
['BoxRuntimeConnector'], typing.Coroutine[typing.Any, typing.Any, None]
|
||||
]
|
||||
| None = None,
|
||||
):
|
||||
super().__init__(ap)
|
||||
self.runtime_disconnect_callback = runtime_disconnect_callback
|
||||
self.configured_runtime_url = self._load_configured_runtime_url()
|
||||
self.ws_relay_base_url = resolve_box_ws_relay_url(ap)
|
||||
self.client = ActionRPCBoxClient(logger=ap.logger)
|
||||
@@ -70,6 +81,7 @@ class BoxRuntimeConnector(ManagedRuntimeConnector):
|
||||
self._handler: Handler | None = None
|
||||
self._handler_task: asyncio.Task | None = None
|
||||
self._ctrl_task: asyncio.Task | None = None
|
||||
self._heartbeat_task: asyncio.Task | None = None
|
||||
|
||||
# Parse the relay URL once for reuse.
|
||||
parsed = urlparse(self.ws_relay_base_url)
|
||||
@@ -93,16 +105,33 @@ class BoxRuntimeConnector(ManagedRuntimeConnector):
|
||||
async def initialize(self) -> None:
|
||||
if self._uses_websocket():
|
||||
if platform.get_platform() == 'win32' and not self.configured_runtime_url:
|
||||
# Windows without an explicit URL: launch the box server as a
|
||||
# detached subprocess (no stdio pipe) then talk to it via WS,
|
||||
# because Windows ProactorEventLoop does not support async
|
||||
# subprocess stdin/stdout pipes.
|
||||
await self._start_subprocess_then_ws()
|
||||
else:
|
||||
await self._connect_remote_ws()
|
||||
else:
|
||||
await self._start_local_stdio()
|
||||
|
||||
# Start heartbeat after successful connection
|
||||
if self._heartbeat_task is None:
|
||||
self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
|
||||
|
||||
# -- heartbeat -----------------------------------------------------------
|
||||
|
||||
async def _heartbeat_loop(self) -> None:
|
||||
"""Periodically ping the Box runtime to detect silent disconnections."""
|
||||
while True:
|
||||
await asyncio.sleep(_HEARTBEAT_INTERVAL_SEC)
|
||||
try:
|
||||
await self.ping()
|
||||
self.ap.logger.debug('Heartbeat to Box runtime success.')
|
||||
except Exception as e:
|
||||
self.ap.logger.debug(f'Failed to heartbeat to Box runtime: {e}')
|
||||
|
||||
async def ping(self) -> None:
|
||||
if self._handler is None:
|
||||
raise BoxRuntimeUnavailableError('Box runtime is not connected')
|
||||
await self._handler.call_action(CommonAction.PING, {})
|
||||
|
||||
# -- transport paths -----------------------------------------------------
|
||||
|
||||
async def _start_local_stdio(self) -> None:
|
||||
@@ -125,7 +154,6 @@ class BoxRuntimeConnector(ManagedRuntimeConnector):
|
||||
ctrl.run(self._make_connection_callback('stdio', connected, connect_error))
|
||||
)
|
||||
|
||||
# Wait for connection or failure
|
||||
try:
|
||||
await asyncio.wait_for(connected.wait(), timeout=30.0)
|
||||
except asyncio.TimeoutError:
|
||||
@@ -134,14 +162,12 @@ class BoxRuntimeConnector(ManagedRuntimeConnector):
|
||||
if connect_error:
|
||||
raise BoxRuntimeUnavailableError(f'box runtime connection failed: {connect_error[0]}')
|
||||
|
||||
# Store subprocess reference for dispose (StdioClientController manages it)
|
||||
self._subprocess = ctrl.process
|
||||
|
||||
async def _start_subprocess_then_ws(self) -> None:
|
||||
"""Launch box server as detached subprocess, then connect via WS (Windows)."""
|
||||
self.ap.logger.info('(windows) Use cmd to launch box runtime and communicate via ws')
|
||||
|
||||
# Launch the box server subprocess in ws mode (no stdio pipe).
|
||||
await self._start_runtime_subprocess(
|
||||
'-m',
|
||||
'langbot_plugin.box.server',
|
||||
@@ -151,7 +177,6 @@ class BoxRuntimeConnector(ManagedRuntimeConnector):
|
||||
str(self._relay_port),
|
||||
)
|
||||
|
||||
# Wait for the WS endpoint to become reachable, then connect.
|
||||
ws_url = f'ws://localhost:{self._relay_port}/rpc/ws'
|
||||
await self._connect_ws(ws_url, '(windows) WebSocket')
|
||||
|
||||
@@ -167,11 +192,6 @@ class BoxRuntimeConnector(ManagedRuntimeConnector):
|
||||
"""Determine the action-RPC WebSocket URL.
|
||||
|
||||
All endpoints share a single port; action RPC is at ``/rpc/ws``.
|
||||
|
||||
Priority:
|
||||
1. Explicit ``box.runtime_url`` from config (user-supplied, used as-is)
|
||||
2. Docker environment -> ``ws://langbot_box:5410/rpc/ws``
|
||||
3. --standalone-box / Windows fallback -> ``ws://localhost:5410/rpc/ws``
|
||||
"""
|
||||
if self.configured_runtime_url:
|
||||
return self.configured_runtime_url
|
||||
@@ -189,8 +209,14 @@ class BoxRuntimeConnector(ManagedRuntimeConnector):
|
||||
connect_error: list[Exception] = []
|
||||
|
||||
async def on_connect_failed(ctrl, exc):
|
||||
if exc is not None:
|
||||
self.ap.logger.error(f'Failed to connect to Box runtime ({ws_url}): {exc}')
|
||||
else:
|
||||
self.ap.logger.error(f'Failed to connect to Box runtime ({ws_url}), trying to reconnect...')
|
||||
connect_error.append(exc or BoxRuntimeUnavailableError('ws connection failed'))
|
||||
connected.set()
|
||||
if self.runtime_disconnect_callback is not None:
|
||||
await self.runtime_disconnect_callback(self)
|
||||
|
||||
ctrl = WebSocketClientController(ws_url=ws_url, make_connection_failed_callback=on_connect_failed)
|
||||
self._ctrl_task = asyncio.create_task(
|
||||
@@ -225,12 +251,28 @@ class BoxRuntimeConnector(ManagedRuntimeConnector):
|
||||
if not connected.is_set():
|
||||
connect_error.append(exc)
|
||||
connected.set()
|
||||
else:
|
||||
# Connection was established but then dropped — this is a
|
||||
# disconnect after successful handshake.
|
||||
if self._uses_websocket():
|
||||
self.ap.logger.error('Disconnected from Box runtime, trying to reconnect...')
|
||||
if self.runtime_disconnect_callback is not None:
|
||||
await self.runtime_disconnect_callback(self)
|
||||
else:
|
||||
self.ap.logger.error(
|
||||
'Disconnected from Box runtime via stdio. '
|
||||
'Cannot automatically reconnect — please restart LangBot.'
|
||||
)
|
||||
|
||||
return new_connection_callback
|
||||
|
||||
# -- lifecycle -----------------------------------------------------------
|
||||
|
||||
def dispose(self) -> None:
|
||||
if self._heartbeat_task is not None:
|
||||
self._heartbeat_task.cancel()
|
||||
self._heartbeat_task = None
|
||||
|
||||
if self._handler_task is not None:
|
||||
self._handler_task.cancel()
|
||||
self._handler_task = None
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import collections
|
||||
import datetime as _dt
|
||||
import enum
|
||||
@@ -52,7 +53,7 @@ class BoxService:
|
||||
self.ap = ap
|
||||
self._runtime_connector: BoxRuntimeConnector | None = None
|
||||
if client is None:
|
||||
self._runtime_connector = BoxRuntimeConnector(ap)
|
||||
self._runtime_connector = BoxRuntimeConnector(ap, runtime_disconnect_callback=self._on_runtime_disconnect)
|
||||
client = self._runtime_connector.client
|
||||
self.client = client
|
||||
self.output_limit_chars = output_limit_chars
|
||||
@@ -82,6 +83,18 @@ class BoxService:
|
||||
self.ap.logger.warning(f'LangBot Box runtime unavailable, sandbox features disabled: {exc}')
|
||||
self._available = False
|
||||
|
||||
async def _on_runtime_disconnect(self, connector: BoxRuntimeConnector) -> None:
|
||||
"""Called by the connector when the Box runtime connection drops."""
|
||||
self._available = False
|
||||
self.ap.logger.warning('Box runtime disconnected, sandbox features temporarily disabled. Reconnecting in 3s...')
|
||||
await asyncio.sleep(3)
|
||||
try:
|
||||
await connector.initialize()
|
||||
self._available = True
|
||||
self.ap.logger.info('Box runtime reconnected, sandbox features restored.')
|
||||
except Exception as exc:
|
||||
self.ap.logger.warning(f'Box runtime reconnection failed: {exc}. Will retry on next heartbeat disconnect.')
|
||||
|
||||
@property
|
||||
def available(self) -> bool:
|
||||
return self._available
|
||||
|
||||
Reference in New Issue
Block a user