fix(provider): capture streaming token usage; add token observability

The LiteLLM streaming requester only captured usage when a chunk had an empty `choices` list. Many OpenAI-compatible gateways (e.g. new-api) and providers send the final usage payload in a chunk that still carries an empty-delta choice, so streamed calls always recorded 0 tokens in the monitoring logs/dashboard (non-streaming calls worked).

- Capture stream usage whenever a chunk carries it, regardless of choices
- Add robust _normalize_usage (dict/obj shapes, derive missing total_tokens)
- Register litellm in bootutils/deps.py (was in pyproject only)
- Add MonitoringService.get_token_statistics + /monitoring/token-statistics endpoint: summary, per-model breakdown, token timeseries, and a zero-token-success data-quality signal
- Add TokenMonitoring dashboard tab (summary tiles, stacked token chart, per-model table) + i18n (en/zh)
- Regression tests for stream usage capture and usage normalization

Verified end-to-end against a real OpenAI-compatible endpoint with gpt-5.5 and claude-opus-4-8: tokens are now recorded as non-zero for both streaming and non-streaming paths.
2026-06-18 19:44:21 +00:00 · 2026-06-05 09:13:57 -04:00
parent d450226701
commit 39673444d2
10 changed files with 986 additions and 15 deletions
@@ -46,6 +46,30 @@ class MonitoringRouterGroup(group.RouterGroup):
            return self.success(data=metrics)
        @self.route('/token-statistics', methods=['GET'], auth_type=group.AuthType.USER_TOKEN)
        async def get_token_statistics() -> str:
            """Return token usage statistics (summary, per-model breakdown, timeseries).

            Query parameters:
            - ``botId`` / ``pipelineId``: repeatable filters; omitted means no filter.
            - ``startTime`` / ``endTime``: passed through ``parse_iso_datetime``.
            - ``bucket``: ``hour`` (default) or ``day``; any other value falls back to ``hour``.
            """
            args = quart.request.args
            bot_ids = args.getlist('botId')
            pipeline_ids = args.getlist('pipelineId')
            raw_start = args.get('startTime')
            raw_end = args.get('endTime')
            requested_bucket = args.get('bucket', 'hour')
            # Only the two supported granularities are forwarded to the service.
            bucket = requested_bucket if requested_bucket in ('hour', 'day') else 'hour'
            start_time = parse_iso_datetime(raw_start)
            end_time = parse_iso_datetime(raw_end)
            stats = await self.ap.monitoring_service.get_token_statistics(
                # Empty filter lists are sent as None ("no filter").
                bot_ids=bot_ids or None,
                pipeline_ids=pipeline_ids or None,
                start_time=start_time,
                end_time=end_time,
                bucket=bucket,
            )
            return self.success(data=stats)
        @self.route('/messages', methods=['GET'], auth_type=group.AuthType.USER_TOKEN)
        async def get_messages() -> str:
            """Get message logs"""
@@ -472,6 +472,185 @@ class MonitoringService:
            'active_sessions': active_sessions,
        }
    async def get_token_statistics(
        self,
        bot_ids: list[str] | None = None,
        pipeline_ids: list[str] | None = None,
        start_time: datetime.datetime | None = None,
        end_time: datetime.datetime | None = None,
        bucket: str = 'hour',
    ) -> dict:
        """Get detailed token usage statistics for production observability.

        Every LLM call row matching the filters contributes to the call count,
        token sums, duration and cost, whatever its status. Successful and
        failed calls are additionally counted separately so that a spike in
        failures stays visible next to the token totals.

        Args:
            bot_ids: Restrict to these bot ids; ``None``/empty means no bot filter.
            pipeline_ids: Restrict to these pipeline ids; ``None``/empty means no
                pipeline filter.
            start_time: Inclusive lower bound on the call timestamp, if given.
            end_time: Inclusive upper bound on the call timestamp, if given.
            bucket: Timeseries granularity, ``'hour'`` or ``'day'``. Any other
                value falls back to ``'hour'``.

        Returns:
            A dict with:
            - ``summary``: aggregate token counters and call/latency stats,
              including ``zero_token_success_calls`` (successful calls that
              recorded zero total tokens)
            - ``by_model``: per-model token + call breakdown, sorted by total
              tokens descending
            - ``timeseries``: token usage per time bucket, in ascending bucket order
            - ``bucket``: the granularity actually used
        """
        LLMCall = persistence_monitoring.MonitoringLLMCall

        # Report the granularity that is really applied instead of echoing back
        # an unsupported value.
        if bucket not in ('hour', 'day'):
            bucket = 'hour'

        conditions = []
        if bot_ids:
            conditions.append(LLMCall.bot_id.in_(bot_ids))
        if pipeline_ids:
            conditions.append(LLMCall.pipeline_id.in_(pipeline_ids))
        if start_time:
            conditions.append(LLMCall.timestamp >= start_time)
        if end_time:
            conditions.append(LLMCall.timestamp <= end_time)

        def _apply(query):
            # Attach the shared filter set to any of the three queries below.
            if conditions:
                query = query.where(sqlalchemy.and_(*conditions))
            return query

        def _sum_of(column, empty=0):
            # SUM() yields NULL over zero rows; fall back to `empty` instead.
            return sqlalchemy.func.coalesce(sqlalchemy.func.sum(column), empty)

        def _count_if(predicate):
            # Conditional row count: SUM(CASE WHEN predicate THEN 1 ELSE 0 END).
            return sqlalchemy.func.sum(sqlalchemy.case((predicate, 1), else_=0))

        # Aggregates may come back as None, int, float or Decimal depending on
        # the backend and column type. Normalise them once so the arithmetic
        # below never mixes Decimal with float and the result stays made of
        # plain ints/floats.
        def _as_int(value) -> int:
            return int(value or 0)

        def _as_float(value) -> float:
            return float(value or 0)

        # ---- Summary aggregates ----
        summary_query = _apply(
            sqlalchemy.select(
                sqlalchemy.func.count(LLMCall.id),
                _sum_of(LLMCall.input_tokens),
                _sum_of(LLMCall.output_tokens),
                _sum_of(LLMCall.total_tokens),
                _sum_of(LLMCall.duration),
                _sum_of(LLMCall.cost, 0.0),
                _count_if(LLMCall.status == 'success'),
                _count_if(LLMCall.status == 'error'),
                # Count of successful calls that nonetheless recorded zero tokens —
                # a data-quality signal that usage reporting may be broken upstream.
                _count_if(sqlalchemy.and_(LLMCall.status == 'success', LLMCall.total_tokens == 0)),
            )
        )
        summary_result = await self.ap.persistence_mgr.execute_async(summary_query)
        row = summary_result.first()
        (
            total_calls,
            total_input_tokens,
            total_output_tokens,
            total_tokens,
            total_duration,
            total_cost,
            success_calls,
            error_calls,
            zero_token_success_calls,
        ) = row if row is not None else (0, 0, 0, 0, 0, 0.0, 0, 0, 0)

        total_calls = _as_int(total_calls)
        input_tokens = _as_int(total_input_tokens)
        output_tokens = _as_int(total_output_tokens)
        tokens = _as_int(total_tokens)
        # The duration sum is treated as milliseconds by the averages below.
        duration_ms = _as_float(total_duration)

        summary = {
            'total_calls': total_calls,
            'success_calls': _as_int(success_calls),
            'error_calls': _as_int(error_calls),
            'total_input_tokens': input_tokens,
            'total_output_tokens': output_tokens,
            'total_tokens': tokens,
            'total_cost': round(_as_float(total_cost), 6),
            # Averages are taken over all calls in the window, failed ones included.
            'avg_tokens_per_call': tokens // total_calls if total_calls > 0 else 0,
            'avg_duration_ms': int(duration_ms / total_calls) if total_calls > 0 else 0,
            'avg_tokens_per_second': round(output_tokens / (duration_ms / 1000), 2) if duration_ms > 0 else 0,
            'zero_token_success_calls': _as_int(zero_token_success_calls),
        }

        # ---- Per-model breakdown ----
        by_model_query = _apply(
            sqlalchemy.select(
                LLMCall.model_name,
                sqlalchemy.func.count(LLMCall.id),
                _sum_of(LLMCall.input_tokens),
                _sum_of(LLMCall.output_tokens),
                _sum_of(LLMCall.total_tokens),
                _sum_of(LLMCall.duration),
                _sum_of(LLMCall.cost, 0.0),
                _count_if(LLMCall.status == 'error'),
            ).group_by(LLMCall.model_name)
        )
        by_model_result = await self.ap.persistence_mgr.execute_async(by_model_query)
        by_model = []
        for model_name, m_calls, m_in, m_out, m_total, m_duration, m_cost, m_errors in by_model_result.all():
            m_calls = _as_int(m_calls)
            m_total = _as_int(m_total)
            by_model.append(
                {
                    'model_name': model_name,
                    'calls': m_calls,
                    'error_calls': _as_int(m_errors),
                    'input_tokens': _as_int(m_in),
                    'output_tokens': _as_int(m_out),
                    'total_tokens': m_total,
                    'cost': round(_as_float(m_cost), 6),
                    'avg_tokens_per_call': m_total // m_calls if m_calls > 0 else 0,
                    'avg_duration_ms': int(_as_float(m_duration) / m_calls) if m_calls > 0 else 0,
                }
            )
        by_model.sort(key=lambda entry: entry['total_tokens'], reverse=True)

        # ---- Time-bucketed series ----
        # Use a DB-agnostic bucketing approach: fetch (timestamp, tokens) rows and
        # aggregate in Python.
        # NOTE(review): the row count is only bounded when the caller passes
        # start_time/end_time; without them every matching row is loaded.
        series_query = _apply(
            sqlalchemy.select(
                LLMCall.timestamp,
                LLMCall.input_tokens,
                LLMCall.output_tokens,
                LLMCall.total_tokens,
            ).order_by(LLMCall.timestamp.asc())
        )
        series_result = await self.ap.persistence_mgr.execute_async(series_query)

        # The formatted key sorts lexicographically in chronological order.
        bucket_fmt = '%Y-%m-%d %H:00' if bucket == 'hour' else '%Y-%m-%d'
        buckets: dict[str, dict] = {}
        for ts, s_in, s_out, s_total in series_result.all():
            if ts is None:
                continue
            key = ts.strftime(bucket_fmt)
            point = buckets.setdefault(
                key,
                {'bucket': key, 'input_tokens': 0, 'output_tokens': 0, 'total_tokens': 0, 'calls': 0},
            )
            point['input_tokens'] += _as_int(s_in)
            point['output_tokens'] += _as_int(s_out)
            point['total_tokens'] += _as_int(s_total)
            point['calls'] += 1
        timeseries = [buckets[key] for key in sorted(buckets)]

        return {
            'summary': summary,
            'by_model': by_model,
            'timeseries': timeseries,
            'bucket': bucket,
        }
    async def get_messages(
        self,
        bot_ids: list[str] | None = None,
@@ -42,6 +42,7 @@ required_deps = {
    'telegramify_markdown': 'telegramify-markdown',
    'slack_sdk': 'slack_sdk',
    'asyncpg': 'asyncpg',
    'litellm': 'litellm',
 }
@@ -85,15 +85,42 @@ class LiteLLMRequester(requester.ProviderAPIRequester):
        # because it's typically internal model reasoning, not user-visible thinking
        return content or ''
-    def _extract_usage(self, response) -> dict:
+    @staticmethod
-        """Extract usage info from LiteLLM response."""
+    def _normalize_usage(usage: typing.Any) -> dict:
-        usage = response.usage
+        """Normalize a LiteLLM/OpenAI usage object into a plain token dict.
        Handles several real-world shapes returned by different upstreams:
        - object with ``prompt_tokens`` / ``completion_tokens`` / ``total_tokens`` attrs
        - dict with the same keys
        - missing ``total_tokens`` (derived from prompt + completion)
        - ``None`` / partially-populated usage (defaults to 0)
        """
        if usage is None:
            return {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}
        def _get(key: str) -> typing.Any:
            if isinstance(usage, dict):
                return usage.get(key)
            return getattr(usage, key, None)
        prompt_tokens = _get('prompt_tokens') or 0
        completion_tokens = _get('completion_tokens') or 0
        total_tokens = _get('total_tokens') or 0
        # Some providers omit total_tokens in streaming usage; derive it.
        if not total_tokens:
            total_tokens = prompt_tokens + completion_tokens
        return {
-            'prompt_tokens': usage.prompt_tokens or 0,
+            'prompt_tokens': int(prompt_tokens),
-            'completion_tokens': usage.completion_tokens or 0,
+            'completion_tokens': int(completion_tokens),
-            'total_tokens': usage.total_tokens or 0,
+            'total_tokens': int(total_tokens),
        }
    def _extract_usage(self, response) -> dict:
        """Return the normalized token usage of a non-streaming LiteLLM response.

        A response without a ``usage`` attribute is handled like ``usage=None``.
        """
        raw_usage = getattr(response, 'usage', None)
        return self._normalize_usage(raw_usage)
    def _build_common_args(self, args: dict, include_retry_params: bool = True) -> dict:
        """Apply common requester config to args dict."""
        if self.requester_cfg.get('base_url'):
@@ -217,18 +244,21 @@ class LiteLLMRequester(requester.ProviderAPIRequester):
        try:
            response = await acompletion(**args)
            async for chunk in response:
-                # Check for usage chunk (final chunk with stream_options include_usage)
+                # Capture usage whenever a chunk carries it.
-                if hasattr(chunk, 'usage') and chunk.usage and (not hasattr(chunk, 'choices') or not chunk.choices):
+                #
-                    usage_info = {
+                # Important: many OpenAI-compatible gateways (e.g. new-api) and
-                        'prompt_tokens': chunk.usage.prompt_tokens or 0,
+                # providers send the final usage payload in a chunk that STILL
-                        'completion_tokens': chunk.usage.completion_tokens or 0,
+                # contains a (empty-delta) choice, not an empty `choices` list.
-                        'total_tokens': chunk.usage.total_tokens or 0,
+                # The previous implementation only captured usage when `choices`
-                    }
+                # was empty, so streamed calls always recorded 0 tokens.
-                    if query:
+                # We therefore capture usage independently of `choices`, and then
                # fall through to also process any content this chunk may carry.
                if getattr(chunk, 'usage', None):
                    usage_info = self._normalize_usage(chunk.usage)
                    if query is not None:
                        if query.variables is None:
                            query.variables = {}
                        query.variables['_stream_usage'] = usage_info
                    # Deliberately no `continue` here: a chunk that carries usage
                    # may ALSO carry content / finish_reason (some gateways attach
                    # usage to every chunk), so it must still reach the normal
                    # choice handling. Usage-only chunks are skipped by the empty
                    # `choices` guard that follows.
                if not hasattr(chunk, 'choices') or not chunk.choices:
                    continue
@@ -110,6 +110,147 @@ class TestExtractUsage:
        assert result['completion_tokens'] == 0
 class TestNormalizeUsage:
    """Exercise ``_normalize_usage`` against the usage shapes seen in practice."""

    def test_none_usage(self):
        """A missing usage payload normalizes to all-zero counters."""
        normalize = litellmchat.LiteLLMRequester._normalize_usage
        expected = {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}
        assert normalize(None) == expected

    def test_dict_usage(self):
        """A plain dict payload is passed through unchanged."""
        normalize = litellmchat.LiteLLMRequester._normalize_usage
        payload = {'prompt_tokens': 12, 'completion_tokens': 8, 'total_tokens': 20}
        assert normalize(payload) == {'prompt_tokens': 12, 'completion_tokens': 8, 'total_tokens': 20}

    def test_missing_total_is_derived(self):
        """A zero/absent total is rebuilt as prompt + completion."""
        normalize = litellmchat.LiteLLMRequester._normalize_usage
        payload = Mock(prompt_tokens=42, completion_tokens=10, total_tokens=0)
        normalized = normalize(payload)
        assert normalized['total_tokens'] == 52

    def test_partial_attrs_default_to_zero(self):
        """Attributes the payload does not expose count as 0 rather than raising."""
        normalize = litellmchat.LiteLLMRequester._normalize_usage
        # `spec` restricts the mock so that every other attribute is missing.
        payload = Mock(spec=['prompt_tokens'], prompt_tokens=5)
        assert normalize(payload) == {'prompt_tokens': 5, 'completion_tokens': 0, 'total_tokens': 5}
 class TestInvokeLLMStreamUsage:
    """Regression tests for streaming token usage capture.

    Real OpenAI-compatible gateways (e.g. new-api) send the final usage payload
    in a chunk that still carries a (empty-delta) choice rather than an empty
    `choices` list. The usage must be captured regardless, otherwise streamed
    calls record 0 tokens.
    """

    @staticmethod
    def _make_usage(prompt_tokens, completion_tokens, total_tokens):
        """Build a usage object exposing the three OpenAI-style token attributes."""
        usage = Mock()
        usage.prompt_tokens = prompt_tokens
        usage.completion_tokens = completion_tokens
        usage.total_tokens = total_tokens
        return usage

    def _make_chunk(self, *, content=None, finish_reason=None, usage=None, has_choice=True):
        """Build a fake stream chunk, optionally with one choice and/or usage."""
        chunk = Mock()
        # Always assign `usage` explicitly: an unset Mock attribute would be an
        # auto-created (truthy) child Mock and look like a usage payload.
        chunk.usage = usage
        if not has_choice:
            chunk.choices = []
            return chunk
        delta = Mock()
        delta.model_dump = Mock(
            return_value={'role': 'assistant', 'content': content, 'tool_calls': None}
        )
        choice = Mock()
        choice.delta = delta
        choice.finish_reason = finish_reason
        chunk.choices = [choice]
        return chunk

    async def _run_stream(self, chunks):
        """Drain ``invoke_llm_stream`` over ``chunks`` and return the query used.

        ``acompletion`` is patched to return an async iterator over ``chunks``.
        """
        import langbot_plugin.api.entities.builtin.pipeline.query as pipeline_query
        import langbot_plugin.api.entities.builtin.provider.message as provider_message

        mock_ap = Mock()
        mock_ap.tool_mgr = Mock()
        mock_ap.tool_mgr.generate_tools_for_openai = AsyncMock(return_value=None)
        requester = litellmchat.LiteLLMRequester(ap=mock_ap, config={})
        model = MockRuntimeModel('gpt-4o', 'test-api-key')

        async def _aiter(*args, **kwargs):
            for c in chunks:
                yield c

        query = Mock(spec=pipeline_query.Query)
        query.variables = {}
        messages = [provider_message.Message(role='user', content='Hi')]
        with patch.object(litellmchat, 'acompletion', new=AsyncMock(side_effect=lambda **kw: _aiter())):
            async for _ in requester.invoke_llm_stream(query=query, model=model, messages=messages):
                pass
        return query

    @pytest.mark.asyncio
    async def test_stream_usage_with_nonempty_choices(self):
        """Usage chunk that still has a choice must populate _stream_usage."""
        usage = self._make_usage(24, 48, 72)
        chunks = [
            self._make_chunk(content='Hello'),
            self._make_chunk(content=None, finish_reason='stop'),
            # Final usage chunk WITH a non-empty (empty-delta) choice — the bug case.
            self._make_chunk(content=None, usage=usage, has_choice=True),
        ]
        query = await self._run_stream(chunks)
        assert '_stream_usage' in query.variables
        assert query.variables['_stream_usage']['prompt_tokens'] == 24
        assert query.variables['_stream_usage']['completion_tokens'] == 48
        assert query.variables['_stream_usage']['total_tokens'] == 72

    @pytest.mark.asyncio
    async def test_stream_usage_with_empty_choices(self):
        """Usage chunk with empty choices list must also populate _stream_usage."""
        usage = self._make_usage(5, 7, 12)
        chunks = [
            self._make_chunk(content='Hi there'),
            self._make_chunk(content=None, finish_reason='stop'),
            self._make_chunk(usage=usage, has_choice=False),
        ]
        query = await self._run_stream(chunks)
        assert query.variables['_stream_usage']['total_tokens'] == 12
 class TestProcessThinkingContent:
    """Test _process_thinking_content method"""
@@ -0,0 +1,462 @@
 import React, { useEffect, useMemo, useState, useCallback } from 'react';
 import { useTranslation } from 'react-i18next';
 import {
  ComposedChart,
  Area,
  Bar,
  XAxis,
  YAxis,
  CartesianGrid,
  Tooltip,
  ResponsiveContainer,
  Legend,
 } from 'recharts';
 import {
  Coins,
  ArrowDownToLine,
  ArrowUpFromLine,
  Gauge,
  AlertTriangle,
  TrendingUp,
 } from 'lucide-react';
 import { httpClient } from '@/app/infra/http/HttpClient';
 /**
  * Aggregate counters of a token-statistics response (its `summary` field).
  * Field names match the keys produced by the backend's token statistics query.
  */
 interface TokenSummary {
  // Number of LLM calls in the selected window, and how many succeeded / failed.
  total_calls: number;
  success_calls: number;
  error_calls: number;
  // Token sums over the window.
  total_input_tokens: number;
  total_output_tokens: number;
  total_tokens: number;
  total_cost: number;
  // Derived averages computed by the backend.
  avg_tokens_per_call: number;
  avg_duration_ms: number;
  avg_tokens_per_second: number;
  // Successful calls that recorded zero tokens; drives the data-quality warning banner.
  zero_token_success_calls: number;
 }
 /** One row of the per-model breakdown (an element of `by_model`). */
 interface TokenByModel {
  // Also used as the React key of the table row.
  model_name: string;
  calls: number;
  error_calls: number;
  input_tokens: number;
  output_tokens: number;
  // Used to compute this model's share of all tokens for the bar under its name.
  total_tokens: number;
  cost: number;
  avg_tokens_per_call: number;
  avg_duration_ms: number;
 }
 /** Token usage of a single time bucket (an element of `timeseries`). */
 interface TokenTimeseriesPoint {
  // Bucket label; used as-is for the chart's X axis.
  bucket: string;
  input_tokens: number;
  output_tokens: number;
  total_tokens: number;
  calls: number;
 }
 /** Full payload stored from `httpClient.getTokenStatistics`. */
 interface TokenStatistics {
  summary: TokenSummary;
  by_model: TokenByModel[];
  timeseries: TokenTimeseriesPoint[];
  // Granularity reported back by the backend.
  bucket: string;
 }
 /** Props of the TokenMonitoring tab; all filters are optional. */
 interface TokenMonitoringProps {
  // Forwarded to the request as the `botId` / `pipelineId` filters.
  botIds?: string[];
  pipelineIds?: string[];
  // Time window forwarded unchanged to the request.
  startTime?: string;
  endTime?: string;
  /** Bumped by the parent to trigger a refetch on manual refresh. */
  refreshKey?: number;
 }
 /**
  * Format a count compactly: "1.23M" from one million up, "1.2K" from one
  * thousand up, otherwise the locale-formatted number.
  *
  * @param n The value to format.
  * @returns The compact display string.
  */
 function formatNumber(n: number): string {
  // Switch to "M" at 999,950 rather than 1,000,000: with one decimal, values
  // in [999,950, 1,000,000) would otherwise round up to "1000.0K".
  if (n >= 999_950) return `${(n / 1_000_000).toFixed(2)}M`;
  if (n >= 1_000) return `${(n / 1_000).toFixed(1)}K`;
  return n.toLocaleString();
 }
 // Inline style passed as `contentStyle` to the chart Tooltip. Colors come from
 // CSS custom properties (--card, --border, --foreground).
 const TOOLTIP_STYLE: React.CSSProperties = {
  backgroundColor: 'var(--card)',
  border: '1px solid var(--border)',
  borderRadius: '12px',
  boxShadow:
    '0 10px 15px -3px rgb(0 0 0 / 0.1), 0 4px 6px -4px rgb(0 0 0 / 0.1)',
  fontSize: '13px',
  padding: '12px',
  color: 'var(--foreground)',
 };
 /**
  * Summary tile: an icon badge and label on top, a large value below, and an
  * optional sub-line.
  *
  * `accent` colors the icon and, with the suffix `1a` appended, its background.
  * NOTE(review): the suffix presumably acts as an alpha channel, which only
  * works for 6-digit hex colors such as `#8b5cf6` — confirm before passing
  * other color formats.
  */
 function MetricTile(props: {
  icon: React.ReactNode;
  label: string;
  value: string;
  sub?: string;
  accent?: string;
 }) {
  const { icon, label, value, sub, accent } = props;
  // Without an accent the badge falls back to the muted theme colors.
  const badgeStyle: React.CSSProperties = {
    backgroundColor: accent ? `${accent}1a` : 'var(--muted)',
    color: accent || 'var(--foreground)',
  };
  const subLine = sub && (
    <div className="text-xs text-muted-foreground">{sub}</div>
  );
  return (
    <div className="bg-card rounded-xl border p-4 flex flex-col gap-2">
      <div className="flex items-center gap-2 text-muted-foreground text-sm">
        <span
          className="flex items-center justify-center h-7 w-7 rounded-lg"
          style={badgeStyle}
        >
          {icon}
        </span>
        {label}
      </div>
      <div className="text-2xl font-semibold text-foreground tabular-nums">
        {value}
      </div>
      {subLine}
    </div>
  );
 }
 /**
  * Token monitoring tab: summary tiles, a token-usage-over-time chart with an
  * hour/day toggle, and a per-model breakdown table.
  *
  * Statistics are refetched whenever a filter prop, the bucket, or
  * `refreshKey` changes. Responses from superseded requests (and from requests
  * still in flight when the component unmounts) are ignored, so a slow earlier
  * request can never overwrite the data of a newer one.
  */
 export default function TokenMonitoring({
  botIds,
  pipelineIds,
  startTime,
  endTime,
  refreshKey,
 }: TokenMonitoringProps) {
  const { t } = useTranslation();
  const [bucket, setBucket] = useState<'hour' | 'day'>('hour');
  const [stats, setStats] = useState<TokenStatistics | null>(null);
  const [loading, setLoading] = useState(true);
  const [error, setError] = useState<string | null>(null);
  // Array props get a new identity on every parent render; depend on their
  // serialized content instead so the fetch only reruns on real changes.
  const botIdsKey = JSON.stringify(botIds);
  const pipelineIdsKey = JSON.stringify(pipelineIds);
  // `isStale` reports whether this request has been superseded; once it has,
  // none of its results may touch component state.
  const fetchStats = useCallback(async (isStale: () => boolean) => {
    setLoading(true);
    setError(null);
    try {
      const result = await httpClient.getTokenStatistics({
        botId: botIds,
        pipelineId: pipelineIds,
        startTime,
        endTime,
        bucket,
      });
      if (!isStale()) setStats(result);
    } catch (e) {
      if (!isStale()) setError(e instanceof Error ? e.message : String(e));
    } finally {
      // A superseded request must not clear the spinner of the newer one.
      if (!isStale()) setLoading(false);
    }
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [botIdsKey, pipelineIdsKey, startTime, endTime, bucket, refreshKey]);
  useEffect(() => {
    let stale = false;
    fetchStats(() => stale);
    // Runs before the next fetch starts and on unmount.
    return () => {
      stale = true;
    };
  }, [fetchStats]);
  // Rename the timeseries fields to the data keys used by the chart series.
  const chartData = useMemo(() => {
    if (!stats) return [];
    return stats.timeseries.map((p) => ({
      bucket: p.bucket,
      input: p.input_tokens,
      output: p.output_tokens,
      total: p.total_tokens,
    }));
  }, [stats]);
  if (loading) {
    return (
      <div className="space-y-4">
        <div className="grid grid-cols-2 md:grid-cols-3 lg:grid-cols-6 gap-4">
          {Array.from({ length: 6 }).map((_, i) => (
            <div
              key={i}
              className="bg-card rounded-xl border p-4 h-24 animate-pulse"
            />
          ))}
        </div>
        <div className="bg-card rounded-xl border p-6 h-[320px] animate-pulse" />
      </div>
    );
  }
  if (error) {
    return (
      <div className="bg-card rounded-xl border p-6 text-sm text-destructive flex items-center gap-2">
        <AlertTriangle className="h-4 w-4" />
        {t('monitoring.tokens.loadError', { error })}
      </div>
    );
  }
  if (!stats || stats.summary.total_calls === 0) {
    return (
      <div className="bg-card rounded-xl border p-6">
        <div className="h-[260px] flex flex-col items-center justify-center text-muted-foreground gap-2">
          <Coins className="h-[3rem] w-[3rem]" />
          <div className="text-sm">{t('monitoring.tokens.noData')}</div>
        </div>
      </div>
    );
  }
  const { summary, by_model } = stats;
  return (
    <div className="space-y-6">
      {/* Data-quality warning: successful calls that recorded 0 tokens */}
      {summary.zero_token_success_calls > 0 && (
        <div className="bg-amber-500/10 border border-amber-500/30 text-amber-700 dark:text-amber-400 rounded-xl p-4 text-sm flex items-start gap-2">
          <AlertTriangle className="h-4 w-4 mt-0.5 shrink-0" />
          <span>
            {t('monitoring.tokens.zeroTokenWarning', {
              count: summary.zero_token_success_calls,
            })}
          </span>
        </div>
      )}
      {/* Summary tiles */}
      <div className="grid grid-cols-2 md:grid-cols-3 lg:grid-cols-6 gap-4">
        <MetricTile
          icon={<Coins className="h-4 w-4" />}
          label={t('monitoring.tokens.totalTokens')}
          value={formatNumber(summary.total_tokens)}
          sub={t('monitoring.tokens.acrossCalls', {
            count: summary.total_calls,
          })}
          accent="#8b5cf6"
        />
        <MetricTile
          icon={<ArrowDownToLine className="h-4 w-4" />}
          label={t('monitoring.tokens.inputTokens')}
          value={formatNumber(summary.total_input_tokens)}
          accent="#3b82f6"
        />
        <MetricTile
          icon={<ArrowUpFromLine className="h-4 w-4" />}
          label={t('monitoring.tokens.outputTokens')}
          value={formatNumber(summary.total_output_tokens)}
          accent="#10b981"
        />
        <MetricTile
          icon={<TrendingUp className="h-4 w-4" />}
          label={t('monitoring.tokens.avgPerCall')}
          value={formatNumber(summary.avg_tokens_per_call)}
          accent="#f59e0b"
        />
        <MetricTile
          icon={<Gauge className="h-4 w-4" />}
          label={t('monitoring.tokens.throughput')}
          value={`${summary.avg_tokens_per_second}`}
          sub={t('monitoring.tokens.tokensPerSec')}
          accent="#06b6d4"
        />
        <MetricTile
          icon={<AlertTriangle className="h-4 w-4" />}
          label={t('monitoring.tokens.errorCalls')}
          value={`${summary.error_calls}`}
          sub={t('monitoring.tokens.ofTotal', { count: summary.total_calls })}
          accent="#ef4444"
        />
      </div>
      {/* Token usage over time */}
      <div className="bg-card rounded-xl border p-6">
        <div className="flex items-center justify-between mb-6">
          <h3 className="text-base font-semibold text-foreground">
            {t('monitoring.tokens.usageOverTime')}
          </h3>
          <div className="inline-flex rounded-lg border p-0.5 text-sm">
            {(['hour', 'day'] as const).map((b) => (
              <button
                key={b}
                onClick={() => setBucket(b)}
                className={`px-3 py-1 rounded-md transition-colors ${
                  bucket === b
                    ? 'bg-primary text-primary-foreground'
                    : 'text-muted-foreground hover:text-foreground'
                }`}
              >
                {t(`monitoring.tokens.bucket.${b}`)}
              </button>
            ))}
          </div>
        </div>
        <div className="h-[320px]">
          <ResponsiveContainer width="100%" height="100%">
            <ComposedChart
              data={chartData}
              margin={{ top: 10, right: 20, left: 0, bottom: 0 }}
            >
              <defs>
                <linearGradient id="tokTotal" x1="0" y1="0" x2="0" y2="1">
                  <stop offset="5%" stopColor="#8b5cf6" stopOpacity={0.35} />
                  <stop offset="95%" stopColor="#8b5cf6" stopOpacity={0.03} />
                </linearGradient>
              </defs>
              <CartesianGrid
                strokeDasharray="3 3"
                stroke="var(--border)"
                vertical={false}
              />
              <XAxis
                dataKey="bucket"
                tick={{ fontSize: 12, fill: 'var(--muted-foreground)' }}
                tickLine={false}
                axisLine={{ stroke: 'var(--border)' }}
                dy={10}
              />
              <YAxis
                tick={{ fontSize: 12, fill: 'var(--muted-foreground)' }}
                tickLine={false}
                axisLine={{ stroke: 'var(--border)' }}
                width={48}
                tickFormatter={(v) => formatNumber(Number(v))}
              />
              <Tooltip
                contentStyle={TOOLTIP_STYLE}
                labelStyle={{
                  fontWeight: 600,
                  marginBottom: '8px',
                  color: 'var(--foreground)',
                }}
                formatter={(value: number) => formatNumber(Number(value))}
              />
              <Legend
                wrapperStyle={{
                  fontSize: '13px',
                  paddingTop: '16px',
                  fontWeight: 500,
                }}
                iconType="circle"
                iconSize={10}
              />
              {/* Input and output share a stackId, so they render as one stacked bar */}
              <Bar
                dataKey="input"
                name={t('monitoring.tokens.inputTokens')}
                stackId="io"
                fill="#3b82f6"
                radius={[0, 0, 0, 0]}
                barSize={18}
              />
              <Bar
                dataKey="output"
                name={t('monitoring.tokens.outputTokens')}
                stackId="io"
                fill="#10b981"
                radius={[4, 4, 0, 0]}
                barSize={18}
              />
              <Area
                type="monotone"
                dataKey="total"
                name={t('monitoring.tokens.totalTokens')}
                stroke="#8b5cf6"
                strokeWidth={2.5}
                fill="url(#tokTotal)"
                dot={false}
                activeDot={{ r: 5, strokeWidth: 2 }}
              />
            </ComposedChart>
          </ResponsiveContainer>
        </div>
      </div>
      {/* Per-model breakdown */}
      <div className="bg-card rounded-xl border p-6">
        <h3 className="text-base font-semibold text-foreground mb-4">
          {t('monitoring.tokens.byModel')}
        </h3>
        <div className="overflow-x-auto">
          <table className="w-full text-sm">
            <thead>
              <tr className="text-left text-muted-foreground border-b">
                <th className="py-2 pr-4 font-medium">
                  {t('monitoring.tokens.model')}
                </th>
                <th className="py-2 px-4 font-medium text-right">
                  {t('monitoring.tokens.calls')}
                </th>
                <th className="py-2 px-4 font-medium text-right">
                  {t('monitoring.tokens.inputTokens')}
                </th>
                <th className="py-2 px-4 font-medium text-right">
                  {t('monitoring.tokens.outputTokens')}
                </th>
                <th className="py-2 px-4 font-medium text-right">
                  {t('monitoring.tokens.totalTokens')}
                </th>
                <th className="py-2 px-4 font-medium text-right">
                  {t('monitoring.tokens.avgPerCall')}
                </th>
                <th className="py-2 pl-4 font-medium text-right">
                  {t('monitoring.tokens.avgLatency')}
                </th>
              </tr>
            </thead>
            <tbody>
              {by_model.map((m) => {
                // This model's share of all tokens, as a percentage for the bar width.
                const share =
                  summary.total_tokens > 0
                    ? (m.total_tokens / summary.total_tokens) * 100
                    : 0;
                return (
                  <tr
                    key={m.model_name}
                    className="border-b last:border-0 hover:bg-muted/40 transition-colors"
                  >
                    <td className="py-2.5 pr-4">
                      <div className="font-medium text-foreground">
                        {m.model_name}
                      </div>
                      <div className="mt-1 h-1.5 w-32 rounded-full bg-muted overflow-hidden">
                        <div
                          className="h-full rounded-full bg-violet-500"
                          style={{ width: `${share}%` }}
                        />
                      </div>
                    </td>
                    <td className="py-2.5 px-4 text-right tabular-nums">
                      {m.calls}
                      {m.error_calls > 0 && (
                        <span className="text-destructive">
                          {' '}
                          ({m.error_calls}✕)
                        </span>
                      )}
                    </td>
                    <td className="py-2.5 px-4 text-right tabular-nums">
                      {formatNumber(m.input_tokens)}
                    </td>
                    <td className="py-2.5 px-4 text-right tabular-nums">
                      {formatNumber(m.output_tokens)}
                    </td>
                    <td className="py-2.5 px-4 text-right tabular-nums font-medium">
                      {formatNumber(m.total_tokens)}
                    </td>
                    <td className="py-2.5 px-4 text-right tabular-nums">
                      {formatNumber(m.avg_tokens_per_call)}
                    </td>
                    <td className="py-2.5 pl-4 text-right tabular-nums">
                      {m.avg_duration_ms}ms
                    </td>
                  </tr>
                );
              })}
            </tbody>
          </table>
        </div>
      </div>
    </div>
  );
 }
@@ -13,6 +13,7 @@ import {
 } from 'lucide-react';
 import OverviewCards from './components/overview-cards/OverviewCards';
 import MonitoringFilters from './components/filters/MonitoringFilters';
 import TokenMonitoring from './components/TokenMonitoring';
 import { ExportDropdown } from './components/ExportDropdown';
 import { useMonitoringFilters } from './hooks/useMonitoringFilters';
 import { useMonitoringData } from './hooks/useMonitoringData';
@@ -319,6 +320,9 @@ function MonitoringPageContent() {
                <TabsTrigger value="modelCalls" className="px-6 py-2">
                  {t('monitoring.tabs.modelCalls')}
                </TabsTrigger>
                <TabsTrigger value="tokens" className="px-6 py-2">
                  {t('monitoring.tabs.tokens')}
                </TabsTrigger>
                <TabsTrigger value="feedback" className="px-6 py-2">
                  {t('monitoring.tabs.feedback')}
                </TabsTrigger>
@@ -668,6 +672,24 @@ function MonitoringPageContent() {
              </div>
            </TabsContent>
            <TabsContent value="tokens" className="p-6 m-0">
              <TokenMonitoring
                botIds={
                  filterState.selectedBots.length > 0
                    ? filterState.selectedBots
                    : undefined
                }
                pipelineIds={
                  filterState.selectedPipelines.length > 0
                    ? filterState.selectedPipelines
                    : undefined
                }
                startTime={feedbackTimeRange.startTime}
                endTime={feedbackTimeRange.endTime}
                refreshKey={feedbackRefreshKey}
              />
            </TabsContent>
            <TabsContent value="feedback" className="p-6 m-0">
              <div>
                {loading && (
@@ -1224,6 +1224,68 @@ export class BackendClient extends BaseHttpClient {
    return this.get(`/api/v1/monitoring/overview?${queryParams.toString()}`);
  }
  /**
   * Request token usage statistics from the monitoring API.
   *
   * Builds a query string from the supplied filters and passes
   * `/api/v1/monitoring/token-statistics?<query>` to `this.get`.
   *
   * @param params Optional filters. `botId` and `pipelineId` are emitted as
   *   repeated query entries (one per id); `startTime`, `endTime` and
   *   `bucket` are emitted once each, and only when they are truthy.
   * @returns A promise for the statistics payload: an aggregate `summary`,
   *   a `by_model` breakdown, a `timeseries` of per-bucket totals, and the
   *   `bucket` value reported in the response.
   */
  public getTokenStatistics(params: {
    botId?: string[];
    pipelineId?: string[];
    startTime?: string;
    endTime?: string;
    bucket?: 'hour' | 'day';
  }): Promise<{
    summary: {
      total_calls: number;
      success_calls: number;
      error_calls: number;
      total_input_tokens: number;
      total_output_tokens: number;
      total_tokens: number;
      total_cost: number;
      avg_tokens_per_call: number;
      avg_duration_ms: number;
      avg_tokens_per_second: number;
      zero_token_success_calls: number;
    };
    by_model: Array<{
      model_name: string;
      calls: number;
      error_calls: number;
      input_tokens: number;
      output_tokens: number;
      total_tokens: number;
      cost: number;
      avg_tokens_per_call: number;
      avg_duration_ms: number;
    }>;
    timeseries: Array<{
      bucket: string;
      input_tokens: number;
      output_tokens: number;
      total_tokens: number;
      calls: number;
    }>;
    bucket: string;
  }> {
    const { botId, pipelineId, startTime, endTime, bucket } = params;
    const search = new URLSearchParams();

    // Multi-valued filters: one query entry per id, in the order given.
    if (botId) {
      for (const id of botId) {
        search.append('botId', id);
      }
    }
    if (pipelineId) {
      for (const id of pipelineId) {
        search.append('pipelineId', id);
      }
    }

    // Single-valued filters, appended in a fixed order; falsy values
    // (undefined or an empty string) are left out of the query entirely.
    const scalarFilters: Array<[string, string | undefined]> = [
      ['startTime', startTime],
      ['endTime', endTime],
      ['bucket', bucket],
    ];
    for (const [key, value] of scalarFilters) {
      if (value) {
        search.append(key, value);
      }
    }

    const query = search.toString();
    return this.get(`/api/v1/monitoring/token-statistics?${query}`);
  }
  // ============ Survey API ============
  public getSurveyPending(): Promise<{
    survey: {
@@ -1196,6 +1196,7 @@ const enUS = {
      llmCalls: 'LLM Calls',
      embeddingCalls: 'Embedding Calls',
      modelCalls: 'Model Calls',
      tokens: 'Token Monitoring',
      feedback: 'User Feedback',
      sessions: 'Session Analysis',
      errors: 'Error Logs',
@@ -1234,6 +1235,30 @@ const enUS = {
      avgDuration: 'Avg Duration',
      calls: 'Calls',
    },
    tokens: {
      totalTokens: 'Total Tokens',
      inputTokens: 'Input Tokens',
      outputTokens: 'Output Tokens',
      avgPerCall: 'Avg / Call',
      throughput: 'Throughput',
      tokensPerSec: 'tokens/sec',
      errorCalls: 'Failed Calls',
      acrossCalls: 'across {{count}} calls',
      ofTotal: 'of {{count}} total',
      usageOverTime: 'Token Usage Over Time',
      byModel: 'By Model',
      model: 'Model',
      calls: 'Calls',
      avgLatency: 'Avg Latency',
      noData: 'No token usage in the selected time range',
      loadError: 'Failed to load token statistics: {{error}}',
      zeroTokenWarning:
        '{{count}} successful call(s) reported zero token usage. This usually means the upstream provider did not return usage info — check the model provider configuration.',
      bucket: {
        hour: 'Hourly',
        day: 'Daily',
      },
    },
    embeddingCalls: {
      title: 'Embedding Calls',
      model: 'Model',
@@ -1140,6 +1140,7 @@ const zhHans = {
      llmCalls: 'LLM调用',
      embeddingCalls: 'Embedding调用',
      modelCalls: '模型调用',
      tokens: 'Token 监控',
      feedback: '用户反馈',
      sessions: '会话分析',
      errors: '错误日志',
@@ -1178,6 +1179,30 @@ const zhHans = {
      avgDuration: '平均耗时',
      calls: '调用次数',
    },
    tokens: {
      totalTokens: '总 Token 数',
      inputTokens: '输入 Token',
      outputTokens: '输出 Token',
      avgPerCall: '平均每次调用',
      throughput: '吞吐量',
      tokensPerSec: 'Token/秒',
      errorCalls: '失败调用',
      acrossCalls: '共 {{count}} 次调用',
      ofTotal: '共 {{count}} 次',
      usageOverTime: 'Token 用量趋势',
      byModel: '按模型统计',
      model: '模型',
      calls: '调用次数',
      avgLatency: '平均延迟',
      noData: '所选时间范围内暂无 Token 用量数据',
      loadError: '加载 Token 统计失败：{{error}}',
      zeroTokenWarning:
        '检测到 {{count}} 次成功调用未上报 Token 用量（记为 0）。这通常表示上游未返回 usage 信息，请检查模型供应商配置。',
      bucket: {
        hour: '按小时',
        day: '按天',
      },
    },
    embeddingCalls: {
      title: 'Embedding调用',
      model: '模型',