LangBot/src/langbot/pkg/agent/runner/run_ledger_store.py

"""Run ledger store for Host-owned AgentRun and AgentRunEvent records."""

from __future__ import annotations

import datetime
import json
import typing
import uuid

import sqlalchemy
from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession
from sqlalchemy.orm import sessionmaker

from ...entity.persistence.agent_run import AgentRun, AgentRunEvent, AgentRuntime


# Timezone object used for every timestamp this module creates or normalizes.
UTC = datetime.timezone.utc

# Every status value accepted by _validate_run_status.
RUN_STATUSES = {
    'created',
    'queued',
    'claimed',
    'running',
    'completed',
    'failed',
    'cancelled',
    'timeout',
}

# Statuses treated as final: reaching one stamps finished_at, and finalize_run
# refuses to move a run from one of these to a different status.
TERMINAL_STATUSES = {
    'completed',
    'failed',
    'cancelled',
    'timeout',
}


def _utc_now() -> datetime.datetime:
    """Return the current moment as a timezone-aware datetime in UTC."""
    current = datetime.datetime.now(tz=UTC)
    return current


def _as_utc(value: datetime.datetime | None) -> datetime.datetime | None:
    if value is None:
        return None
    if value.tzinfo is None:
        return value.replace(tzinfo=UTC)
    return value.astimezone(UTC)


def _datetime_to_epoch(value: datetime.datetime | None) -> int | None:
    if value is None:
        return None
    if value.tzinfo is None:
        value = value.replace(tzinfo=UTC)
    else:
        value = value.astimezone(UTC)
    return int(value.timestamp())


def _epoch_to_datetime(value: typing.Any) -> datetime.datetime | None:
    if value is None:
        return None
    try:
        return datetime.datetime.fromtimestamp(float(value), UTC)
    except (TypeError, ValueError, OSError):
        return None


def _json_dumps(value: typing.Any) -> str | None:
    if value is None:
        return None
    return json.dumps(value)


def _json_loads(value: str | None, default: typing.Any) -> typing.Any:
    if not value:
        return default
    try:
        return json.loads(value)
    except (TypeError, ValueError):
        return default


def _validate_run_status(status: str) -> str:
    """Return ``status`` as a string after checking it is a known run status.

    Raises:
        ValueError: If the stringified status is not in ``RUN_STATUSES``.
    """
    candidate = str(status)
    if candidate in RUN_STATUSES:
        return candidate
    raise ValueError(f'Unknown run status: {candidate}')


def _claim_is_active(
    run: AgentRun,
    *,
    runtime_id: str | None,
    claim_token: str,
    now: datetime.datetime,
) -> bool:
    if run.status != 'claimed' or run.claim_token != claim_token:
        return False
    if runtime_id is not None and run.claimed_by_runtime_id != runtime_id:
        return False
    lease_expires_at = _as_utc(run.claim_lease_expires_at)
    return lease_expires_at is not None and lease_expires_at > now


class RunLedgerStore:
    """Store for Host-owned run lifecycle and result event facts."""

    engine: AsyncEngine

    def __init__(self, engine: AsyncEngine):
        self.engine = engine
        self._session_factory = sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)

    async def create_run(
        self,
        *,
        run_id: str,
        event_id: str | None,
        binding_id: str | None,
        runner_id: str,
        conversation_id: str | None = None,
        thread_id: str | None = None,
        workspace_id: str | None = None,
        bot_id: str | None = None,
        agent_id: str | None = None,
        deadline_at: int | float | None = None,
        authorization: dict[str, typing.Any] | None = None,
        metadata: dict[str, typing.Any] | None = None,
        status: str = 'running',
        queue_name: str | None = None,
        priority: int = 0,
        requested_runtime_id: str | None = None,
    ) -> dict[str, typing.Any]:
        """Create a run if it does not already exist."""
        status = _validate_run_status(status)
        now = _utc_now()
        async with self._session_factory() as session:
            existing = await self._get_run_row(session, run_id)
            if existing is not None:
                return self._run_to_dict(existing)

            run = AgentRun(
                run_id=run_id,
                event_id=event_id,
                agent_id=agent_id,
                binding_id=binding_id,
                runner_id=runner_id,
                conversation_id=conversation_id,
                thread_id=thread_id,
                workspace_id=workspace_id,
                bot_id=bot_id,
                status=status,
                queue_name=queue_name,
                priority=priority,
                requested_runtime_id=requested_runtime_id,
                created_at=now,
                started_at=now if status == 'running' else None,
                updated_at=now,
                deadline_at=_epoch_to_datetime(deadline_at),
                authorization_json=_json_dumps(authorization),
                metadata_json=_json_dumps(metadata),
            )
            session.add(run)
            await session.commit()
            return self._run_to_dict(run)

    async def claim_next_run(
        self,
        *,
        runtime_id: str,
        queue_name: str | None = None,
        lease_seconds: int = 60,
        runner_ids: list[str] | None = None,
        conversation_id: str | None = None,
        bot_id: str | None = None,
        workspace_id: str | None = None,
        thread_id: str | None = None,
        strict_thread: bool = False,
    ) -> dict[str, typing.Any] | None:
        """Claim the next queued or expired-leased run for a runtime."""
        now = _utc_now()
        lease_expires_at = now + datetime.timedelta(seconds=max(int(lease_seconds), 1))
        async with self._session_factory() as session:
            query = sqlalchemy.select(AgentRun).where(
                sqlalchemy.or_(
                    AgentRun.status == 'queued',
                    sqlalchemy.and_(
                        AgentRun.status == 'claimed',
                        AgentRun.claim_lease_expires_at.is_not(None),
                        AgentRun.claim_lease_expires_at <= now,
                    ),
                ),
                sqlalchemy.or_(
                    AgentRun.requested_runtime_id.is_(None),
                    AgentRun.requested_runtime_id == runtime_id,
                ),
            )
            if queue_name is not None:
                query = query.where(AgentRun.queue_name == queue_name)
            if runner_ids:
                query = query.where(AgentRun.runner_id.in_(runner_ids))
            if conversation_id is not None:
                query = query.where(AgentRun.conversation_id == conversation_id)
            query = self._apply_scope_filters(query, bot_id, workspace_id, thread_id, strict_thread)

            query = query.order_by(AgentRun.priority.desc(), AgentRun.id.asc()).limit(1).with_for_update(
                skip_locked=True
            )
            result = await session.execute(query)
            run = result.scalars().first()
            if run is None:
                return None

            run.status = 'claimed'
            run.claimed_by_runtime_id = runtime_id
            run.claim_token = uuid.uuid4().hex
            run.claim_lease_expires_at = lease_expires_at
            run.dispatch_attempts = (run.dispatch_attempts or 0) + 1
            run.last_claimed_at = now
            run.updated_at = now
            await session.commit()
            return self._run_to_dict(run, include_claim_token=True)

    async def renew_claim(
        self,
        *,
        run_id: str,
        claim_token: str,
        runtime_id: str | None = None,
        lease_seconds: int = 60,
    ) -> dict[str, typing.Any] | None:
        """Extend a current claim lease if the token still matches."""
        now = _utc_now()
        async with self._session_factory() as session:
            run = await self._get_run_row(session, run_id)
            if run is None or not _claim_is_active(run, runtime_id=runtime_id, claim_token=claim_token, now=now):
                return None

            run.claim_lease_expires_at = now + datetime.timedelta(seconds=max(int(lease_seconds), 1))
            run.updated_at = now
            await session.commit()
            return self._run_to_dict(run)

    async def release_claim(
        self,
        *,
        run_id: str,
        claim_token: str,
        runtime_id: str | None = None,
        status: str = 'queued',
        status_reason: str | None = None,
    ) -> dict[str, typing.Any] | None:
        """Release a current claim lease if the token still matches."""
        status = _validate_run_status(status)
        now = _utc_now()
        async with self._session_factory() as session:
            run = await self._get_run_row(session, run_id)
            if run is None or not _claim_is_active(run, runtime_id=runtime_id, claim_token=claim_token, now=now):
                return None

            run.status = status
            run.status_reason = status_reason
            run.claimed_by_runtime_id = None
            run.claim_token = None
            run.claim_lease_expires_at = None
            run.updated_at = now
            if status in TERMINAL_STATUSES:
                run.finished_at = run.finished_at or now
            await session.commit()
            return self._run_to_dict(run)

    async def release_expired_claims(
        self,
        *,
        now: datetime.datetime | None = None,
        status: str = 'queued',
        status_reason: str = 'claim lease expired',
        limit: int = 100,
    ) -> list[dict[str, typing.Any]]:
        """Release claimed runs whose claim lease has expired."""
        status = _validate_run_status(status)
        current_time = now or _utc_now()
        if current_time.tzinfo is None:
            current_time = current_time.replace(tzinfo=UTC)
        limit = min(max(int(limit), 1), 500)

        async with self._session_factory() as session:
            result = await session.execute(
                sqlalchemy.select(AgentRun)
                .where(
                    AgentRun.status == 'claimed',
                    AgentRun.claim_lease_expires_at.is_not(None),
                    AgentRun.claim_lease_expires_at <= current_time,
                )
                .order_by(AgentRun.claim_lease_expires_at.asc(), AgentRun.id.asc())
                .limit(limit)
            )
            runs = result.scalars().all()
            for run in runs:
                run.status = status
                run.status_reason = status_reason
                run.claimed_by_runtime_id = None
                run.claim_token = None
                run.claim_lease_expires_at = None
                run.updated_at = current_time
                if status in TERMINAL_STATUSES:
                    run.finished_at = run.finished_at or current_time
            await session.commit()
            return [self._run_to_dict(run) for run in runs]

    async def append_event(
        self,
        *,
        run_id: str,
        sequence: int,
        event_type: str,
        data: dict[str, typing.Any] | None = None,
        usage: dict[str, typing.Any] | None = None,
        source: str = 'runner',
        metadata: dict[str, typing.Any] | None = None,
    ) -> dict[str, typing.Any]:
        """Append one run result event.

        If the same run_id + sequence already exists, the existing row is
        returned. This supports retrying append calls idempotently.
        """
        async with self._session_factory() as session:
            result = await session.execute(
                sqlalchemy.select(AgentRunEvent).where(
                    AgentRunEvent.run_id == run_id,
                    AgentRunEvent.sequence == sequence,
                )
            )
            existing = result.scalars().first()
            if existing is not None:
                return self._event_to_dict(existing)

            row = AgentRunEvent(
                run_id=run_id,
                sequence=sequence,
                type=event_type,
                data_json=_json_dumps(data or {}),
                usage_json=_json_dumps(usage),
                created_at=_utc_now(),
                source=source,
                metadata_json=_json_dumps(metadata),
            )
            session.add(row)
            await session.commit()
            return self._event_to_dict(row)

    async def append_audit_event(
        self,
        *,
        run_id: str,
        event_type: str,
        data: dict[str, typing.Any] | None = None,
        metadata: dict[str, typing.Any] | None = None,
    ) -> dict[str, typing.Any] | None:
        """Append a Host-authored audit event after the current max sequence."""
        async with self._session_factory() as session:
            run = await self._get_run_row(session, run_id)
            if run is None:
                return None

            result = await session.execute(
                sqlalchemy.select(sqlalchemy.func.max(AgentRunEvent.sequence)).where(
                    AgentRunEvent.run_id == run_id,
                )
            )
            next_sequence = int(result.scalar_one_or_none() or 0) + 1
            row = AgentRunEvent(
                run_id=run_id,
                sequence=next_sequence,
                type=event_type,
                data_json=_json_dumps(data or {}),
                usage_json=None,
                created_at=_utc_now(),
                source='host',
                metadata_json=_json_dumps(metadata or {}),
            )
            session.add(row)
            await session.commit()
            return self._event_to_dict(row)

    async def finalize_run(
        self,
        *,
        run_id: str,
        status: str,
        status_reason: str | None = None,
        usage: dict[str, typing.Any] | None = None,
        cost: dict[str, typing.Any] | None = None,
        metadata: dict[str, typing.Any] | None = None,
    ) -> dict[str, typing.Any] | None:
        """Update a run to a terminal or current status."""
        status = _validate_run_status(status)
        now = _utc_now()
        async with self._session_factory() as session:
            run = await self._get_run_row(session, run_id)
            if run is None:
                return None

            if run.status in TERMINAL_STATUSES and run.status != status:
                raise ValueError(f'Cannot transition terminal run {run_id} from {run.status} to {status}')

            run.status = status
            if status_reason is not None:
                run.status_reason = status_reason
            run.updated_at = now
            if status in TERMINAL_STATUSES:
                run.finished_at = run.finished_at or now
                run.claimed_by_runtime_id = None
                run.claim_token = None
                run.claim_lease_expires_at = None
            if usage is not None:
                run.usage_json = _json_dumps(usage)
            if cost is not None:
                run.cost_json = _json_dumps(cost)
            if metadata is not None:
                existing_metadata = _json_loads(run.metadata_json, {})
                if isinstance(existing_metadata, dict):
                    existing_metadata.update(metadata)
                    run.metadata_json = _json_dumps(existing_metadata)
                else:
                    run.metadata_json = _json_dumps(metadata)
            await session.commit()
            return self._run_to_dict(run)

    async def validate_active_claim(
        self,
        *,
        run_id: str,
        runtime_id: str,
        claim_token: str,
    ) -> bool:
        """Return whether a runtime currently owns an unexpired claim lease."""
        now = _utc_now()
        async with self._session_factory() as session:
            run = await self._get_run_row(session, run_id)
            if run is None:
                return False
            return _claim_is_active(run, runtime_id=runtime_id, claim_token=claim_token, now=now)

    async def request_cancel(
        self,
        *,
        run_id: str,
        status_reason: str | None = None,
    ) -> dict[str, typing.Any] | None:
        """Record a cancellation request."""
        now = _utc_now()
        async with self._session_factory() as session:
            run = await self._get_run_row(session, run_id)
            if run is None:
                return None
            run.cancel_requested_at = now
            run.updated_at = now
            run.status_reason = status_reason or run.status_reason
            await session.commit()
            return self._run_to_dict(run)

    async def get_run(self, run_id: str) -> dict[str, typing.Any] | None:
        """Get one run by run_id."""
        async with self._session_factory() as session:
            row = await self._get_run_row(session, run_id)
            return self._run_to_dict(row) if row is not None else None

    async def register_runtime(
        self,
        *,
        runtime_id: str,
        status: str = 'online',
        display_name: str | None = None,
        endpoint: str | None = None,
        version: str | None = None,
        capabilities: dict[str, typing.Any] | None = None,
        labels: dict[str, typing.Any] | None = None,
        metadata: dict[str, typing.Any] | None = None,
        heartbeat_deadline_seconds: int = 60,
    ) -> dict[str, typing.Any]:
        """Create or update a runtime registry row and record a heartbeat."""
        now = _utc_now()
        async with self._session_factory() as session:
            runtime = await self._get_runtime_row(session, runtime_id)
            if runtime is None:
                runtime = AgentRuntime(runtime_id=runtime_id, created_at=now)
                session.add(runtime)

            runtime.status = status
            runtime.display_name = display_name
            runtime.endpoint = endpoint
            runtime.version = version
            runtime.capabilities_json = _json_dumps(capabilities or {})
            runtime.labels_json = _json_dumps(labels or {})
            runtime.metadata_json = _json_dumps(metadata or {})
            runtime.last_heartbeat_at = now
            runtime.heartbeat_deadline_at = now + datetime.timedelta(seconds=max(int(heartbeat_deadline_seconds), 1))
            runtime.updated_at = now
            await session.commit()
            return self._runtime_to_dict(runtime)

    async def heartbeat_runtime(
        self,
        *,
        runtime_id: str,
        status: str = 'online',
        heartbeat_deadline_seconds: int = 60,
        capabilities: dict[str, typing.Any] | None = None,
        labels: dict[str, typing.Any] | None = None,
        metadata: dict[str, typing.Any] | None = None,
    ) -> dict[str, typing.Any] | None:
        """Refresh a runtime heartbeat."""
        now = _utc_now()
        async with self._session_factory() as session:
            runtime = await self._get_runtime_row(session, runtime_id)
            if runtime is None:
                return None

            runtime.status = status
            runtime.last_heartbeat_at = now
            runtime.heartbeat_deadline_at = now + datetime.timedelta(seconds=max(int(heartbeat_deadline_seconds), 1))
            runtime.updated_at = now
            if capabilities is not None:
                runtime.capabilities_json = _json_dumps(capabilities)
            if labels is not None:
                runtime.labels_json = _json_dumps(labels)
            if metadata is not None:
                existing_metadata = _json_loads(runtime.metadata_json, {})
                if isinstance(existing_metadata, dict):
                    existing_metadata.update(metadata)
                    runtime.metadata_json = _json_dumps(existing_metadata)
                else:
                    runtime.metadata_json = _json_dumps(metadata)
            await session.commit()
            return self._runtime_to_dict(runtime)

    async def get_runtime(self, runtime_id: str) -> dict[str, typing.Any] | None:
        """Get one runtime by runtime_id."""
        async with self._session_factory() as session:
            row = await self._get_runtime_row(session, runtime_id)
            return self._runtime_to_dict(row) if row is not None else None

    async def list_runtimes(
        self,
        *,
        statuses: list[str] | None = None,
        labels: dict[str, str] | None = None,
        limit: int = 100,
    ) -> tuple[list[dict[str, typing.Any]], int]:
        """List runtime registry rows.

        Args:
            statuses: Filter by status list
            labels: Filter by labels (key-value pairs)
            limit: Maximum number of rows to return

        Returns:
            Tuple of (runtimes, total_count).
        """
        limit = min(max(int(limit), 1), 500)
        async with self._session_factory() as session:
            # Build base query with status filter
            base_query = sqlalchemy.select(AgentRuntime)
            if statuses:
                base_query = base_query.where(AgentRuntime.status.in_(statuses))

            # Get total count (before label filtering)
            if not labels:
                # Simple case - can count directly in DB
                count_query = sqlalchemy.select(sqlalchemy.func.count(AgentRuntime.id))
                if statuses:
                    count_query = count_query.where(AgentRuntime.status.in_(statuses))
                count_result = await session.execute(count_query)
                total_count = count_result.scalar() or 0

                # Get items
                query = base_query.order_by(AgentRuntime.id.asc()).limit(limit)
                result = await session.execute(query)
                runtimes = [self._runtime_to_dict(row) for row in result.scalars().all()]
            else:
                # Need to fetch all and filter by labels in Python
                query = base_query.order_by(AgentRuntime.id.asc())
                result = await session.execute(query)
                all_runtimes = [self._runtime_to_dict(row) for row in result.scalars().all()]

                # Filter by labels
                runtimes = [
                    rt for rt in all_runtimes
                    if all(rt.get('labels', {}).get(k) == v for k, v in labels.items())
                ]
                total_count = len(runtimes)

                # Apply limit after filtering
                runtimes = runtimes[:limit]

            return runtimes, total_count

    async def mark_stale_runtimes(
        self,
        *,
        now: datetime.datetime | None = None,
        stale_status: str = 'stale',
        stale_after_seconds: int | float | None = None,
    ) -> list[dict[str, typing.Any]]:
        """Mark runtimes stale when their heartbeat deadline has passed."""
        current_time = now or _utc_now()
        if current_time.tzinfo is None:
            current_time = current_time.replace(tzinfo=UTC)
        stale_conditions: list[typing.Any] = [
            sqlalchemy.and_(
                AgentRuntime.heartbeat_deadline_at.is_not(None),
                AgentRuntime.heartbeat_deadline_at < current_time,
            )
        ]
        if stale_after_seconds is not None:
            try:
                stale_after_delta = datetime.timedelta(seconds=max(float(stale_after_seconds), 0))
            except (TypeError, ValueError):
                stale_after_delta = None
            if stale_after_delta is not None:
                stale_conditions.append(
                    sqlalchemy.and_(
                        AgentRuntime.last_heartbeat_at.is_not(None),
                        AgentRuntime.last_heartbeat_at < current_time - stale_after_delta,
                    )
                )
        async with self._session_factory() as session:
            result = await session.execute(
                sqlalchemy.select(AgentRuntime).where(
                    sqlalchemy.or_(*stale_conditions),
                    AgentRuntime.status != stale_status,
                )
            )
            runtimes = result.scalars().all()
            for runtime in runtimes:
                runtime.status = stale_status
                runtime.updated_at = current_time
            await session.commit()
            return [self._runtime_to_dict(runtime) for runtime in runtimes]

    async def list_runs(
        self,
        *,
        conversation_id: str | None = None,
        statuses: list[str] | None = None,
        before_id: int | None = None,
        limit: int = 50,
        bot_id: str | None = None,
        workspace_id: str | None = None,
        thread_id: str | None = None,
        strict_thread: bool = False,
        runner_id: str | None = None,
    ) -> tuple[list[dict[str, typing.Any]], int | None, bool, int]:
        """Page runs by scope.

        Returns:
            Tuple of (items, next_cursor, has_more, total_count).
        """
        limit = min(max(int(limit), 1), 100)
        async with self._session_factory() as session:
            # First get total count
            count_query = sqlalchemy.select(sqlalchemy.func.count(AgentRun.id))
            if conversation_id is not None:
                count_query = count_query.where(AgentRun.conversation_id == conversation_id)
            if statuses:
                count_query = count_query.where(AgentRun.status.in_(statuses))
            if runner_id is not None:
                count_query = count_query.where(AgentRun.runner_id == runner_id)
            count_query = self._apply_scope_filters(count_query, bot_id, workspace_id, thread_id, strict_thread)
            count_result = await session.execute(count_query)
            total_count = count_result.scalar() or 0

            # Then get items
            query = sqlalchemy.select(AgentRun)
            if conversation_id is not None:
                query = query.where(AgentRun.conversation_id == conversation_id)
            if statuses:
                query = query.where(AgentRun.status.in_(statuses))
            if runner_id is not None:
                query = query.where(AgentRun.runner_id == runner_id)
            if before_id is not None:
                query = query.where(AgentRun.id < before_id)
            query = self._apply_scope_filters(query, bot_id, workspace_id, thread_id, strict_thread)
            query = query.order_by(AgentRun.id.desc()).limit(limit + 1)

            result = await session.execute(query)
            rows = result.scalars().all()
            items = [self._run_to_dict(row) for row in rows[:limit]]
            has_more = len(rows) > limit
            next_cursor = items[-1]['id'] if items and has_more else None

            return items, next_cursor, has_more, total_count

    async def page_run_events(
        self,
        *,
        run_id: str,
        before_sequence: int | None = None,
        after_sequence: int | None = None,
        limit: int = 50,
        direction: str = 'forward',
    ) -> tuple[list[dict[str, typing.Any]], int | None, int | None, bool]:
        """Page result events for one run."""
        limit = min(max(int(limit), 1), 100)
        direction = direction if direction in {'forward', 'backward'} else 'forward'
        async with self._session_factory() as session:
            query = sqlalchemy.select(AgentRunEvent).where(AgentRunEvent.run_id == run_id)
            if before_sequence is not None:
                query = query.where(AgentRunEvent.sequence < before_sequence)
            if after_sequence is not None:
                query = query.where(AgentRunEvent.sequence > after_sequence)

            if direction == 'backward':
                query = query.order_by(AgentRunEvent.sequence.desc())
            else:
                query = query.order_by(AgentRunEvent.sequence.asc())
            query = query.limit(limit + 1)

            result = await session.execute(query)
            rows = result.scalars().all()
            items = [self._event_to_dict(row) for row in rows[:limit]]
            has_more = len(rows) > limit

            if direction == 'backward':
                next_cursor = items[-1]['sequence'] if items and has_more else None
                prev_cursor = items[0]['sequence'] if items else None
            else:
                next_cursor = items[-1]['sequence'] if items and has_more else None
                prev_cursor = items[0]['sequence'] if items else None
            return items, next_cursor, prev_cursor, has_more

    async def _get_run_row(
        self,
        session: AsyncSession,
        run_id: str,
    ) -> AgentRun | None:
        result = await session.execute(sqlalchemy.select(AgentRun).where(AgentRun.run_id == run_id))
        return result.scalars().first()

    async def _get_runtime_row(
        self,
        session: AsyncSession,
        runtime_id: str,
    ) -> AgentRuntime | None:
        result = await session.execute(sqlalchemy.select(AgentRuntime).where(AgentRuntime.runtime_id == runtime_id))
        return result.scalars().first()

    def _apply_scope_filters(
        self,
        query: typing.Any,
        bot_id: str | None,
        workspace_id: str | None,
        thread_id: str | None,
        strict_thread: bool,
    ) -> typing.Any:
        if bot_id is not None:
            query = query.where(AgentRun.bot_id == bot_id)
        if workspace_id is not None:
            query = query.where(AgentRun.workspace_id == workspace_id)
        if strict_thread:
            if thread_id is None:
                query = query.where(AgentRun.thread_id.is_(None))
            else:
                query = query.where(AgentRun.thread_id == thread_id)
        return query

    def _run_to_dict(self, row: AgentRun, *, include_claim_token: bool = False) -> dict[str, typing.Any]:
        data = {
            'id': row.id,
            'run_id': row.run_id,
            'event_id': row.event_id,
            'agent_id': row.agent_id,
            'binding_id': row.binding_id,
            'runner_id': row.runner_id,
            'conversation_id': row.conversation_id,
            'thread_id': row.thread_id,
            'workspace_id': row.workspace_id,
            'bot_id': row.bot_id,
            'status': row.status,
            'status_reason': row.status_reason,
            'queue_name': row.queue_name,
            'priority': row.priority,
            'requested_runtime_id': row.requested_runtime_id,
            'claimed_by_runtime_id': row.claimed_by_runtime_id,
            'claim_lease_expires_at': _datetime_to_epoch(row.claim_lease_expires_at),
            'dispatch_attempts': row.dispatch_attempts,
            'last_claimed_at': _datetime_to_epoch(row.last_claimed_at),
            'created_at': _datetime_to_epoch(row.created_at),
            'started_at': _datetime_to_epoch(row.started_at),
            'finished_at': _datetime_to_epoch(row.finished_at),
            'updated_at': _datetime_to_epoch(row.updated_at),
            'deadline_at': _datetime_to_epoch(row.deadline_at),
            'cancel_requested_at': _datetime_to_epoch(row.cancel_requested_at),
            'usage': _json_loads(row.usage_json, None),
            'cost': _json_loads(row.cost_json, None),
            'metadata': _json_loads(row.metadata_json, {}),
        }
        if include_claim_token:
            data['claim_token'] = row.claim_token
        return data

    def _runtime_to_dict(self, row: AgentRuntime) -> dict[str, typing.Any]:
        return {
            'id': row.id,
            'runtime_id': row.runtime_id,
            'status': row.status,
            'display_name': row.display_name,
            'endpoint': row.endpoint,
            'version': row.version,
            'capabilities': _json_loads(row.capabilities_json, {}),
            'labels': _json_loads(row.labels_json, {}),
            'metadata': _json_loads(row.metadata_json, {}),
            'last_heartbeat_at': _datetime_to_epoch(row.last_heartbeat_at),
            'heartbeat_deadline_at': _datetime_to_epoch(row.heartbeat_deadline_at),
            'created_at': _datetime_to_epoch(row.created_at),
            'updated_at': _datetime_to_epoch(row.updated_at),
        }

    def _event_to_dict(self, row: AgentRunEvent) -> dict[str, typing.Any]:
        """Serialize an ``AgentRunEvent`` row into a plain dict.

        Args:
            row: The event record to serialize.

        Returns:
            Dict with the event's scalar fields, its decoded JSON payloads
            and ``created_at`` as integer epoch seconds.
        """
        event: dict[str, typing.Any] = {}

        # Plain columns copied through unchanged.
        for column in ('id', 'run_id', 'sequence', 'type'):
            event[column] = getattr(row, column)

        # ``data`` and ``metadata`` default to {}, while ``usage`` stays None
        # when nothing was recorded.
        event['data'] = _json_loads(row.data_json, {})
        event['usage'] = _json_loads(row.usage_json, None)
        event['created_at'] = _datetime_to_epoch(row.created_at)
        event['source'] = row.source
        event['metadata'] = _json_loads(row.metadata_json, {})
        return event

    async def get_run_stats(
        self,
        *,
        start_time: int,
        end_time: int,
        runner_id: str | None = None,
    ) -> dict[str, typing.Any]:
        """Get run statistics within a time window.

        Runs are selected by ``created_at`` within the inclusive window
        ``[start_time, end_time]``, optionally restricted to one runner.

        Args:
            start_time: Unix timestamp for start of window
            end_time: Unix timestamp for end of window
            runner_id: Optional filter by runner

        Returns:
            Dict with status counts, rates, and duration stats. Rates are
            ``None`` when the window contains no runs; averages are ``None``
            when no run carries the required timestamps. The percentile
            fields are placeholders and are always ``None``.
        """

        def elapsed_seconds(
            earlier: datetime.datetime | None,
            later: datetime.datetime | None,
        ) -> float | None:
            """Return seconds from *earlier* to *later*, or None if either is missing."""
            if earlier is None or later is None:
                return None
            # Normalize both sides to aware UTC (naive values are treated as
            # UTC) so a naive/aware mix cannot raise TypeError on subtraction.
            return (_as_utc(later) - _as_utc(earlier)).total_seconds()

        start_dt = _epoch_to_datetime(start_time)
        end_dt = _epoch_to_datetime(end_time)

        async with self._session_factory() as session:
            # Base filter for time window
            base_filter = [
                AgentRun.created_at >= start_dt,
                AgentRun.created_at <= end_dt,
            ]
            if runner_id:
                base_filter.append(AgentRun.runner_id == runner_id)

            # Count by status
            status_query = (
                sqlalchemy.select(
                    AgentRun.status,
                    sqlalchemy.func.count(AgentRun.id).label('count'),
                )
                .where(*base_filter)
                .group_by(AgentRun.status)
            )
            status_result = await session.execute(status_query)
            status_counts = {status: count for status, count in status_result}

            total_count = sum(status_counts.values())
            completed_count = status_counts.get('completed', 0)
            # Timeouts count as failures for the failure *rate*; the returned
            # 'failed_count' / 'timeout_count' stay separate.
            failed_count = status_counts.get('failed', 0) + status_counts.get('timeout', 0)

            # Calculate rates; the window is floored to avoid division by zero.
            window_hours = max((end_time - start_time) / 3600, 0.001)
            throughput = total_count / window_hours if total_count > 0 else 0
            success_rate = completed_count / total_count if total_count > 0 else None
            failure_rate = failed_count / total_count if total_count > 0 else None

            # Timing stats are computed in Python for DB compatibility. One
            # query over every started run feeds both averages: completed
            # runs are a subset of started runs, so a second query would
            # only re-read the same rows.
            timing_query = (
                sqlalchemy.select(
                    AgentRun.status,
                    AgentRun.created_at,
                    AgentRun.started_at,
                    AgentRun.finished_at,
                )
                .where(
                    AgentRun.started_at.is_not(None),
                    *base_filter,
                )
            )
            timing_result = await session.execute(timing_query)

            durations: list[float] = []
            waits: list[float] = []
            for row in timing_result.all():
                # Run duration: completed runs with both endpoints recorded.
                if row.status == 'completed':
                    duration = elapsed_seconds(row.started_at, row.finished_at)
                    if duration is not None:
                        durations.append(duration)

                # Queue wait: creation -> start, for every started run.
                wait_seconds = elapsed_seconds(row.created_at, row.started_at)
                if wait_seconds is not None and wait_seconds >= 0:  # Skip negative waits
                    waits.append(wait_seconds)

            avg_duration_seconds = (
                round(sum(durations) / len(durations), 2) if durations else None
            )
            avg_queue_wait_seconds = (
                round(sum(waits) / len(waits), 2) if waits else None
            )

            return {
                'start_time': start_time,
                'end_time': end_time,
                'total_count': total_count,
                'created_count': status_counts.get('created', 0),
                'queued_count': status_counts.get('queued', 0),
                'claimed_count': status_counts.get('claimed', 0),
                'running_count': status_counts.get('running', 0),
                'completed_count': completed_count,
                'failed_count': status_counts.get('failed', 0),
                'cancelled_count': status_counts.get('cancelled', 0),
                'timeout_count': status_counts.get('timeout', 0),
                'throughput_per_hour': round(throughput, 2),
                'success_rate': round(success_rate, 4) if success_rate is not None else None,
                'failure_rate': round(failure_rate, 4) if failure_rate is not None else None,
                'avg_duration_seconds': avg_duration_seconds,
                'p50_duration_seconds': None,  # Requires more complex calculation
                'p95_duration_seconds': None,
                'p99_duration_seconds': None,
                'avg_queue_wait_seconds': avg_queue_wait_seconds,
            }

    async def get_runtime_stats(self) -> dict[str, typing.Any]:
        """Summarize the runtime registry and the currently active runs.

        Returns:
            Dict with the total/online/stale runtime counts, the average and
            maximum heartbeat age in seconds (``None`` when no usable
            heartbeat exists), and the number of active and claimed runs.
        """
        count_of = sqlalchemy.func.count
        reference_time = _utc_now()

        async with self._session_factory() as session:
            # Runtime counts grouped by status.
            per_status = await session.execute(
                sqlalchemy.select(
                    AgentRuntime.status,
                    count_of(AgentRuntime.id).label('count'),
                ).group_by(AgentRuntime.status)
            )
            by_status = {status: count for status, count in per_status}

            # Heartbeat ages are derived in Python for DB compatibility.
            heartbeat_result = await session.execute(
                sqlalchemy.select(AgentRuntime.last_heartbeat_at).where(
                    AgentRuntime.last_heartbeat_at.is_not(None)
                )
            )
            normalized = (
                _as_utc(record.last_heartbeat_at)
                for record in heartbeat_result.all()
            )
            elapsed = (
                (reference_time - beat).total_seconds()
                for beat in normalized
                if beat is not None
            )
            # Heartbeats stamped after the reference time are ignored.
            heartbeat_ages = [age for age in elapsed if age >= 0]

            if heartbeat_ages:
                mean_age = round(sum(heartbeat_ages) / len(heartbeat_ages), 2)
                oldest_age = round(max(heartbeat_ages), 2)
            else:
                mean_age = None
                oldest_age = None

            # Runs currently held by a runtime: claimed or running.
            active_result = await session.execute(
                sqlalchemy.select(count_of(AgentRun.id)).where(
                    AgentRun.status.in_(['running', 'claimed'])
                )
            )
            active_total = active_result.scalar() or 0

            # Runs that are claimed but not yet running.
            claimed_result = await session.execute(
                sqlalchemy.select(count_of(AgentRun.id)).where(
                    AgentRun.status == 'claimed'
                )
            )
            claimed_total = claimed_result.scalar() or 0

            return {
                'total_count': sum(by_status.values()),
                'online_count': by_status.get('online', 0),
                'stale_count': by_status.get('stale', 0),
                'avg_heartbeat_age_seconds': mean_age,
                'max_heartbeat_age_seconds': oldest_age,
                'active_runs': active_total,
                'claimed_runs': claimed_total,
            }

    async def get_runner_stats(
        self,
        *,
        start_time: int,
        end_time: int,
        limit: int = 50,
    ) -> list[dict[str, typing.Any]]:
        """Aggregate run counts per runner for a time window.

        Only runs with a non-null ``runner_id`` whose ``created_at`` lies in
        the inclusive window are considered. Runners are ordered by total
        run count, highest first.

        Args:
            start_time: Unix timestamp for start of window
            end_time: Unix timestamp for end of window
            limit: Maximum number of runners to return (clamped to 1..100)

        Returns:
            List of dicts with per-runner statistics.
        """
        window_start = _epoch_to_datetime(start_time)
        window_end = _epoch_to_datetime(end_time)

        # Clamp the requested size into the supported range.
        if limit < 1:
            limit = 1
        elif limit > 100:
            limit = 100

        def count_where(condition: typing.Any, label: str) -> typing.Any:
            """Build a labeled SUM(CASE ...) column counting rows matching *condition*."""
            flag = sqlalchemy.case((condition, 1), else_=0)
            return sqlalchemy.func.sum(flag).label(label)

        run_total = sqlalchemy.func.count(AgentRun.id)
        columns = (
            AgentRun.runner_id,
            run_total.label('total'),
            count_where(AgentRun.status.in_(['queued', 'claimed', 'running']), 'active'),
            count_where(AgentRun.status == 'completed', 'completed'),
            count_where(AgentRun.status.in_(['failed', 'timeout']), 'failed'),
        )

        async with self._session_factory() as session:
            # One aggregate row per runner_id.
            aggregate_query = (
                sqlalchemy.select(*columns)
                .where(
                    AgentRun.created_at >= window_start,
                    AgentRun.created_at <= window_end,
                    AgentRun.runner_id.is_not(None),
                )
                .group_by(AgentRun.runner_id)
                .order_by(sqlalchemy.func.count(AgentRun.id).desc())
                .limit(limit)
            )
            records = (await session.execute(aggregate_query)).all()

            summaries: list[dict[str, typing.Any]] = []
            for record in records:
                total_runs = record.total or 0
                completed_runs = record.completed or 0
                failed_runs = record.failed or 0

                if total_runs > 0:
                    completion_ratio = round(completed_runs / total_runs, 4)
                else:
                    completion_ratio = None

                summaries.append({
                    'runner_id': record.runner_id or 'unknown',
                    'runner_label': None,  # Would need to join with runner descriptors
                    'plugin_identity': None,
                    'total_runs': total_runs,
                    'active_runs': record.active or 0,
                    'completed_runs': completed_runs,
                    'failed_runs': failed_runs,
                    'success_rate': completion_ratio,
                    'avg_duration_seconds': None,  # Would need more complex query
                })

            return summaries