mirror of
https://github.com/langbot-app/LangBot.git
synced 2026-06-17 19:24:19 +00:00
feat(monitoring): add host RAG trace observability
This commit is contained in:
@@ -313,18 +313,30 @@ class MonitoringRouterGroup(group.RouterGroup):
|
||||
offset=0,
|
||||
)
|
||||
|
||||
# Get traces
|
||||
traces, traces_total = await self.ap.monitoring_service.get_traces(
|
||||
bot_ids=bot_ids if bot_ids else None,
|
||||
pipeline_ids=pipeline_ids if pipeline_ids else None,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
limit=limit,
|
||||
offset=0,
|
||||
)
|
||||
|
||||
return self.success(
|
||||
data={
|
||||
'overview': overview,
|
||||
'messages': messages,
|
||||
'llmCalls': llm_calls,
|
||||
'embeddingCalls': embedding_calls,
|
||||
'traces': traces,
|
||||
'sessions': sessions,
|
||||
'errors': errors,
|
||||
'totalCount': {
|
||||
'messages': messages_total,
|
||||
'llmCalls': llm_calls_total,
|
||||
'embeddingCalls': embedding_calls_total,
|
||||
'traces': traces_total,
|
||||
'sessions': sessions_total,
|
||||
'errors': errors_total,
|
||||
},
|
||||
@@ -350,6 +362,49 @@ class MonitoringRouterGroup(group.RouterGroup):
|
||||
|
||||
return self.success(data=details)
|
||||
|
||||
@self.route('/traces', methods=['GET'], auth_type=group.AuthType.USER_TOKEN)
|
||||
async def get_traces() -> str:
|
||||
"""Get end-to-end trace records."""
|
||||
bot_ids = quart.request.args.getlist('botId')
|
||||
pipeline_ids = quart.request.args.getlist('pipelineId')
|
||||
session_ids = quart.request.args.getlist('sessionId')
|
||||
statuses = quart.request.args.getlist('status')
|
||||
start_time_str = quart.request.args.get('startTime')
|
||||
end_time_str = quart.request.args.get('endTime')
|
||||
limit = int(quart.request.args.get('limit', 100))
|
||||
offset = int(quart.request.args.get('offset', 0))
|
||||
|
||||
start_time = parse_iso_datetime(start_time_str)
|
||||
end_time = parse_iso_datetime(end_time_str)
|
||||
|
||||
traces, total = await self.ap.monitoring_service.get_traces(
|
||||
bot_ids=bot_ids if bot_ids else None,
|
||||
pipeline_ids=pipeline_ids if pipeline_ids else None,
|
||||
session_ids=session_ids if session_ids else None,
|
||||
statuses=statuses if statuses else None,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
limit=limit,
|
||||
offset=offset,
|
||||
)
|
||||
|
||||
return self.success(
|
||||
data={
|
||||
'traces': traces,
|
||||
'total': total,
|
||||
'limit': limit,
|
||||
'offset': offset,
|
||||
}
|
||||
)
|
||||
|
||||
@self.route('/traces/<trace_id>', methods=['GET'], auth_type=group.AuthType.USER_TOKEN)
|
||||
async def get_trace_details(trace_id: str) -> str:
|
||||
"""Get one trace with all spans."""
|
||||
details = await self.ap.monitoring_service.get_trace_details(trace_id)
|
||||
if not details.get('found'):
|
||||
return self.http_status(404, -1, f'Trace {trace_id} not found')
|
||||
return self.success(data=details)
|
||||
|
||||
@self.route('/export', methods=['GET'], auth_type=group.AuthType.USER_TOKEN)
|
||||
async def export_data() -> tuple[str, int]:
|
||||
"""Export monitoring data as CSV"""
|
||||
|
||||
@@ -350,8 +350,24 @@ class PluginsRouterGroup(group.RouterGroup):
|
||||
if not endpoint.startswith('/') or '..' in endpoint:
|
||||
return self.http_status(400, -1, 'invalid endpoint')
|
||||
|
||||
caller = {
|
||||
'plugin_author': author,
|
||||
'plugin_name': plugin_name,
|
||||
'page_id': page_id,
|
||||
'origin': _get_request_origin(),
|
||||
}
|
||||
headers = {
|
||||
key: value
|
||||
for key, value in {
|
||||
'user-agent': quart.request.headers.get('User-Agent'),
|
||||
'x-request-id': quart.request.headers.get('X-Request-ID'),
|
||||
'x-forwarded-for': quart.request.headers.get('X-Forwarded-For'),
|
||||
}.items()
|
||||
if value
|
||||
}
|
||||
|
||||
result = await self.ap.plugin_connector.handle_page_api(
|
||||
author, plugin_name, page_id, endpoint, method.upper(), body
|
||||
author, plugin_name, page_id, endpoint, method.upper(), body, caller, headers
|
||||
)
|
||||
if result.get('error'):
|
||||
return self.http_status(400, -1, result['error'])
|
||||
|
||||
@@ -3,11 +3,53 @@ from __future__ import annotations
|
||||
import uuid
|
||||
import datetime
|
||||
import sqlalchemy
|
||||
import json
|
||||
|
||||
from ....core import app
|
||||
from ....entity.persistence import monitoring as persistence_monitoring
|
||||
|
||||
|
||||
def _utc_now() -> datetime.datetime:
|
||||
return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
|
||||
|
||||
|
||||
def _json_dumps(value: dict | list | None) -> str | None:
|
||||
if value is None:
|
||||
return None
|
||||
try:
|
||||
return json.dumps(value, ensure_ascii=False, default=str)
|
||||
except Exception:
|
||||
return json.dumps({'serialization_error': str(value)}, ensure_ascii=False)
|
||||
|
||||
|
||||
def _json_loads(value: str | None) -> dict | list | None:
|
||||
if not value:
|
||||
return None
|
||||
try:
|
||||
return json.loads(value)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def new_trace_id() -> str:
|
||||
return f'trace-{uuid.uuid4().hex[:16]}'
|
||||
|
||||
|
||||
def new_span_id() -> str:
|
||||
return f'span-{uuid.uuid4().hex[:16]}'
|
||||
|
||||
|
||||
def normalize_trace_status(status: str | None) -> str:
|
||||
"""Normalize operation status to the monitoring UI vocabulary."""
|
||||
if status in ('completed', 'ok'):
|
||||
return 'success'
|
||||
if status in ('failed', 'failure', 'exception'):
|
||||
return 'error'
|
||||
if status in ('running', 'success', 'error'):
|
||||
return status
|
||||
return 'success'
|
||||
|
||||
|
||||
class MonitoringService:
|
||||
"""Monitoring service"""
|
||||
|
||||
@@ -74,6 +116,18 @@ class MonitoringService:
|
||||
persistence_monitoring.MonitoringFeedback.timestamp,
|
||||
persistence_monitoring.MonitoringFeedback.id,
|
||||
),
|
||||
(
|
||||
'monitoring_traces',
|
||||
persistence_monitoring.MonitoringTrace,
|
||||
persistence_monitoring.MonitoringTrace.started_at,
|
||||
persistence_monitoring.MonitoringTrace.trace_id,
|
||||
),
|
||||
(
|
||||
'monitoring_spans',
|
||||
persistence_monitoring.MonitoringSpan,
|
||||
persistence_monitoring.MonitoringSpan.started_at,
|
||||
persistence_monitoring.MonitoringSpan.span_id,
|
||||
),
|
||||
]
|
||||
|
||||
deleted_counts: dict[str, int] = {}
|
||||
@@ -133,6 +187,116 @@ class MonitoringService:
|
||||
|
||||
# ========== Recording Methods ==========
|
||||
|
||||
async def start_trace(
|
||||
self,
|
||||
trace_id: str | None = None,
|
||||
name: str = 'LangBot query',
|
||||
bot_id: str | None = None,
|
||||
bot_name: str | None = None,
|
||||
pipeline_id: str | None = None,
|
||||
pipeline_name: str | None = None,
|
||||
session_id: str | None = None,
|
||||
message_id: str | None = None,
|
||||
query_id: str | int | None = None,
|
||||
attributes: dict | None = None,
|
||||
) -> str:
|
||||
"""Create or update a trace header row."""
|
||||
trace_id = trace_id or new_trace_id()
|
||||
trace_data = {
|
||||
'trace_id': trace_id,
|
||||
'started_at': _utc_now(),
|
||||
'ended_at': None,
|
||||
'duration': None,
|
||||
'status': 'running',
|
||||
'name': name,
|
||||
'bot_id': bot_id,
|
||||
'bot_name': bot_name,
|
||||
'pipeline_id': pipeline_id,
|
||||
'pipeline_name': pipeline_name,
|
||||
'session_id': session_id,
|
||||
'message_id': message_id,
|
||||
'query_id': str(query_id) if query_id is not None else None,
|
||||
'attributes': _json_dumps(attributes),
|
||||
}
|
||||
|
||||
await self.ap.persistence_mgr.execute_async(
|
||||
sqlalchemy.insert(persistence_monitoring.MonitoringTrace).values(trace_data)
|
||||
)
|
||||
return trace_id
|
||||
|
||||
async def finish_trace(
|
||||
self,
|
||||
trace_id: str,
|
||||
status: str = 'success',
|
||||
duration: int | None = None,
|
||||
message_id: str | None = None,
|
||||
attributes: dict | None = None,
|
||||
) -> None:
|
||||
"""Mark a trace complete."""
|
||||
update_values: dict = {
|
||||
'ended_at': _utc_now(),
|
||||
'status': normalize_trace_status(status),
|
||||
}
|
||||
if duration is not None:
|
||||
update_values['duration'] = duration
|
||||
if message_id is not None:
|
||||
update_values['message_id'] = message_id
|
||||
if attributes is not None:
|
||||
update_values['attributes'] = _json_dumps(attributes)
|
||||
|
||||
await self.ap.persistence_mgr.execute_async(
|
||||
sqlalchemy.update(persistence_monitoring.MonitoringTrace)
|
||||
.where(persistence_monitoring.MonitoringTrace.trace_id == trace_id)
|
||||
.values(update_values)
|
||||
)
|
||||
|
||||
async def record_span(
|
||||
self,
|
||||
trace_id: str,
|
||||
name: str,
|
||||
kind: str,
|
||||
status: str = 'success',
|
||||
span_id: str | None = None,
|
||||
parent_span_id: str | None = None,
|
||||
started_at: datetime.datetime | None = None,
|
||||
ended_at: datetime.datetime | None = None,
|
||||
duration: int | None = None,
|
||||
message_id: str | None = None,
|
||||
session_id: str | None = None,
|
||||
bot_id: str | None = None,
|
||||
pipeline_id: str | None = None,
|
||||
attributes: dict | None = None,
|
||||
error_message: str | None = None,
|
||||
) -> str:
|
||||
"""Record a single completed span."""
|
||||
started_at = started_at or _utc_now()
|
||||
if duration is None and ended_at is not None:
|
||||
duration = int((ended_at - started_at).total_seconds() * 1000)
|
||||
elif duration is not None:
|
||||
duration = int(round(float(duration)))
|
||||
span_data = {
|
||||
'span_id': span_id or new_span_id(),
|
||||
'trace_id': trace_id,
|
||||
'parent_span_id': parent_span_id,
|
||||
'name': name,
|
||||
'kind': kind,
|
||||
'status': normalize_trace_status(status),
|
||||
'started_at': started_at,
|
||||
'ended_at': ended_at or _utc_now(),
|
||||
'duration': duration,
|
||||
'message_id': message_id,
|
||||
'session_id': session_id,
|
||||
'bot_id': bot_id,
|
||||
'pipeline_id': pipeline_id,
|
||||
'attributes': _json_dumps(attributes),
|
||||
'error_message': error_message,
|
||||
}
|
||||
|
||||
await self.ap.persistence_mgr.execute_async(
|
||||
sqlalchemy.insert(persistence_monitoring.MonitoringSpan).values(span_data)
|
||||
)
|
||||
return span_data['span_id']
|
||||
|
||||
async def record_message(
|
||||
self,
|
||||
bot_id: str,
|
||||
@@ -1076,6 +1240,19 @@ class MonitoringService:
|
||||
for row in error_rows
|
||||
]
|
||||
|
||||
trace_query = (
|
||||
sqlalchemy.select(persistence_monitoring.MonitoringTrace)
|
||||
.where(persistence_monitoring.MonitoringTrace.message_id == message_id)
|
||||
.order_by(persistence_monitoring.MonitoringTrace.started_at.desc())
|
||||
.limit(1)
|
||||
)
|
||||
trace_result = await self.ap.persistence_mgr.execute_async(trace_query)
|
||||
trace_row = trace_result.first()
|
||||
trace = None
|
||||
if trace_row:
|
||||
trace_model = trace_row[0] if isinstance(trace_row, tuple) else trace_row
|
||||
trace = self._serialize_trace(trace_model)
|
||||
|
||||
return {
|
||||
'message_id': message_id,
|
||||
'found': True,
|
||||
@@ -1090,6 +1267,90 @@ class MonitoringService:
|
||||
'average_duration_ms': int(total_duration / len(llm_rows)) if len(llm_rows) > 0 else 0,
|
||||
},
|
||||
'errors': errors,
|
||||
'trace': trace,
|
||||
}
|
||||
|
||||
def _serialize_trace(self, trace: persistence_monitoring.MonitoringTrace) -> dict:
|
||||
data = self.ap.persistence_mgr.serialize_model(persistence_monitoring.MonitoringTrace, trace)
|
||||
data['attributes'] = _json_loads(data.get('attributes')) or {}
|
||||
return data
|
||||
|
||||
def _serialize_span(self, span: persistence_monitoring.MonitoringSpan) -> dict:
|
||||
data = self.ap.persistence_mgr.serialize_model(persistence_monitoring.MonitoringSpan, span)
|
||||
data['attributes'] = _json_loads(data.get('attributes')) or {}
|
||||
return data
|
||||
|
||||
async def get_traces(
|
||||
self,
|
||||
bot_ids: list[str] | None = None,
|
||||
pipeline_ids: list[str] | None = None,
|
||||
session_ids: list[str] | None = None,
|
||||
statuses: list[str] | None = None,
|
||||
start_time: datetime.datetime | None = None,
|
||||
end_time: datetime.datetime | None = None,
|
||||
limit: int = 100,
|
||||
offset: int = 0,
|
||||
) -> tuple[list[dict], int]:
|
||||
"""Get trace headers with filters."""
|
||||
conditions = []
|
||||
if bot_ids:
|
||||
conditions.append(persistence_monitoring.MonitoringTrace.bot_id.in_(bot_ids))
|
||||
if pipeline_ids:
|
||||
conditions.append(persistence_monitoring.MonitoringTrace.pipeline_id.in_(pipeline_ids))
|
||||
if session_ids:
|
||||
conditions.append(persistence_monitoring.MonitoringTrace.session_id.in_(session_ids))
|
||||
if statuses:
|
||||
conditions.append(persistence_monitoring.MonitoringTrace.status.in_(statuses))
|
||||
if start_time:
|
||||
conditions.append(persistence_monitoring.MonitoringTrace.started_at >= start_time)
|
||||
if end_time:
|
||||
conditions.append(persistence_monitoring.MonitoringTrace.started_at <= end_time)
|
||||
|
||||
count_query = sqlalchemy.select(sqlalchemy.func.count(persistence_monitoring.MonitoringTrace.trace_id))
|
||||
query = sqlalchemy.select(persistence_monitoring.MonitoringTrace)
|
||||
if conditions:
|
||||
clause = sqlalchemy.and_(*conditions)
|
||||
count_query = count_query.where(clause)
|
||||
query = query.where(clause)
|
||||
|
||||
total_result = await self.ap.persistence_mgr.execute_async(count_query)
|
||||
total = total_result.scalar() or 0
|
||||
|
||||
query = query.order_by(persistence_monitoring.MonitoringTrace.started_at.desc()).limit(limit).offset(offset)
|
||||
result = await self.ap.persistence_mgr.execute_async(query)
|
||||
traces = [
|
||||
self._serialize_trace(row[0] if isinstance(row, tuple) else row)
|
||||
for row in result.all()
|
||||
]
|
||||
return traces, total
|
||||
|
||||
async def get_trace_details(self, trace_id: str) -> dict:
|
||||
"""Get a single trace and all spans in chronological order."""
|
||||
trace_query = sqlalchemy.select(persistence_monitoring.MonitoringTrace).where(
|
||||
persistence_monitoring.MonitoringTrace.trace_id == trace_id
|
||||
)
|
||||
trace_result = await self.ap.persistence_mgr.execute_async(trace_query)
|
||||
trace_row = trace_result.first()
|
||||
if not trace_row:
|
||||
return {'trace_id': trace_id, 'found': False}
|
||||
|
||||
trace = trace_row[0] if isinstance(trace_row, tuple) else trace_row
|
||||
span_query = (
|
||||
sqlalchemy.select(persistence_monitoring.MonitoringSpan)
|
||||
.where(persistence_monitoring.MonitoringSpan.trace_id == trace_id)
|
||||
.order_by(persistence_monitoring.MonitoringSpan.started_at.asc())
|
||||
)
|
||||
span_result = await self.ap.persistence_mgr.execute_async(span_query)
|
||||
spans = [
|
||||
self._serialize_span(row[0] if isinstance(row, tuple) else row)
|
||||
for row in span_result.all()
|
||||
]
|
||||
|
||||
return {
|
||||
'trace_id': trace_id,
|
||||
'found': True,
|
||||
'trace': self._serialize_trace(trace),
|
||||
'spans': spans,
|
||||
}
|
||||
|
||||
# ========== Export Methods ==========
|
||||
|
||||
Reference in New Issue
Block a user