From 67437c2f5a7cbd2a89e66ba3901fb935e265fcde Mon Sep 17 00:00:00 2001 From: huanghuoguoguo <60681390+huanghuoguoguo@users.noreply.github.com> Date: Thu, 25 Jun 2026 00:07:37 +0800 Subject: [PATCH] Add performance and reliability QA gates --- skills/schemas/case.schema.json | 34 +- skills/schemas/suite.schema.json | 12 +- skills/scripts/e2e/pipeline-debug-chat.mjs | 80 ++++- skills/skills.index.json | 231 +++++++++++++ skills/skills/langbot-testing/SKILL.md | 3 + .../langbot-fault-taxonomy-contract.yaml | 35 ++ .../cases/langbot-live-backend-latency.yaml | 42 +++ .../langbot-live-backend-log-health.yaml | 45 +++ .../cases/langbot-live-control-plane-api.yaml | 44 +++ .../langbot-overhead-accounting-contract.yaml | 37 +++ .../pipeline-debug-chat-performance.yaml | 75 +++++ .../plugins/qa-plugin-smoke/.gitignore | 4 +- .../dist/qa-plugin-smoke-0.1.0.lbpkg | Bin 0 -> 5160 bytes .../langbot-fault-taxonomy-contract.mjs | 159 +++++++++ .../probes/langbot-live-backend-latency.mjs | 212 ++++++++++++ .../langbot-live-backend-log-health.mjs | 205 ++++++++++++ .../probes/langbot-live-control-plane-api.mjs | 311 ++++++++++++++++++ .../langbot-overhead-accounting-contract.mjs | 162 +++++++++ .../performance-reliability-testing.md | 173 ++++++++++ .../suites/langbot-live-backend-gate.yaml | 14 + .../langbot-performance-contract-gate.yaml | 13 + .../langbot-performance-reliability-gate.yaml | 16 + .../langbot-user-path-performance-gate.yaml | 12 + skills/src/commands/env.ts | 35 ++ skills/src/commands/suite.ts | 47 ++- skills/src/commands/test.ts | 109 +++++- skills/src/commands/validate.ts | 3 + skills/src/constants.ts | 29 +- skills/src/log-guard.ts | 20 ++ skills/src/readiness.ts | 2 + skills/test/lbs-cli.test.ts | 160 ++++++++- 31 files changed, 2299 insertions(+), 25 deletions(-) create mode 100644 skills/skills/langbot-testing/cases/langbot-fault-taxonomy-contract.yaml create mode 100644 skills/skills/langbot-testing/cases/langbot-live-backend-latency.yaml create mode 100644 
skills/skills/langbot-testing/cases/langbot-live-backend-log-health.yaml create mode 100644 skills/skills/langbot-testing/cases/langbot-live-control-plane-api.yaml create mode 100644 skills/skills/langbot-testing/cases/langbot-overhead-accounting-contract.yaml create mode 100644 skills/skills/langbot-testing/cases/pipeline-debug-chat-performance.yaml create mode 100644 skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/dist/qa-plugin-smoke-0.1.0.lbpkg create mode 100644 skills/skills/langbot-testing/probes/langbot-fault-taxonomy-contract.mjs create mode 100644 skills/skills/langbot-testing/probes/langbot-live-backend-latency.mjs create mode 100644 skills/skills/langbot-testing/probes/langbot-live-backend-log-health.mjs create mode 100644 skills/skills/langbot-testing/probes/langbot-live-control-plane-api.mjs create mode 100644 skills/skills/langbot-testing/probes/langbot-overhead-accounting-contract.mjs create mode 100644 skills/skills/langbot-testing/references/performance-reliability-testing.md create mode 100644 skills/skills/langbot-testing/suites/langbot-live-backend-gate.yaml create mode 100644 skills/skills/langbot-testing/suites/langbot-performance-contract-gate.yaml create mode 100644 skills/skills/langbot-testing/suites/langbot-performance-reliability-gate.yaml create mode 100644 skills/skills/langbot-testing/suites/langbot-user-path-performance-gate.yaml diff --git a/skills/schemas/case.schema.json b/skills/schemas/case.schema.json index f6365c062..0d63d8dec 100644 --- a/skills/schemas/case.schema.json +++ b/skills/schemas/case.schema.json @@ -48,7 +48,18 @@ }, "type": { "type": "string", - "enum": ["smoke", "regression", "feature", "provider", "exploratory"] + "enum": [ + "smoke", + "regression", + "feature", + "provider", + "exploratory", + "contract", + "performance", + "reliability", + "chaos", + "security" + ] }, "priority": { "type": "string", @@ -102,7 +113,11 @@ "backend_log", "frontend_log", "api_diagnostic", - "filesystem" + 
"filesystem", + "metrics", + "trace", + "profile", + "resource_log" ] }, "minItems": 1 @@ -188,9 +203,24 @@ "type": "string", "enum": ["person", "group"] }, + "automation_debug_chat_response_p95_ms": { + "type": "string" + }, + "automation_debug_chat_max_error_rate": { + "type": "string" + }, "automation_filesystem_checks_json": { "type": "string" }, + "metrics_thresholds_json": { + "type": "string" + }, + "load_profile_json": { + "type": "string" + }, + "fault_model_json": { + "type": "string" + }, "automation_pipeline_url_env": { "type": "string", "pattern": "^[A-Z][A-Z0-9_]*$" diff --git a/skills/schemas/suite.schema.json b/skills/schemas/suite.schema.json index 3da1a3e85..4f3fa7c7a 100644 --- a/skills/schemas/suite.schema.json +++ b/skills/schemas/suite.schema.json @@ -18,7 +18,17 @@ }, "type": { "type": "string", - "enum": ["smoke", "regression", "release_gate", "exploratory"] + "enum": [ + "smoke", + "regression", + "release_gate", + "exploratory", + "contract", + "performance", + "reliability", + "chaos", + "security" + ] }, "priority": { "type": "string", diff --git a/skills/scripts/e2e/pipeline-debug-chat.mjs b/skills/scripts/e2e/pipeline-debug-chat.mjs index 87fe9ae79..4b20f7757 100755 --- a/skills/scripts/e2e/pipeline-debug-chat.mjs +++ b/skills/scripts/e2e/pipeline-debug-chat.mjs @@ -54,6 +54,7 @@ const debugChatSessionType = env.LANGBOT_E2E_DEBUG_CHAT_SESSION_TYPE || "person" const pipelineConfigDiagnosticPath = resolve(paths.evidenceDir, "pipeline-config-diagnostic.json"); const debugChatResetDiagnosticPath = resolve(paths.evidenceDir, "debug-chat-reset-diagnostic.json"); const pipelineConfigRestoreDiagnosticPath = resolve(paths.evidenceDir, "pipeline-config-restore-diagnostic.json"); +const metricsPath = resolve(paths.evidenceDir, "metrics.json"); const startedAt = new Date(); let browser; @@ -80,10 +81,11 @@ let result = { console_log: paths.consoleLog, network_log: paths.networkLog, screenshot: paths.screenshot, + metrics_json: metricsPath, 
automation_result_json: paths.automationResultJson, result_json: paths.resultJson, }, - evidence_collected: ["ui", "screenshot", "console", "network"], + evidence_collected: ["ui", "screenshot", "console", "network", "metrics"], }; function boolFromEnv(value, defaultValue) { @@ -103,6 +105,29 @@ function parseJsonEnv(key, fallback) { } } +function positiveNumberEnv(key, fallback) { /* Non-negative number read from env[key]; unset, blank, or invalid values yield fallback. */ + const value = Number(String(env[key] || "").trim() || Number.NaN); /* Number("") is 0, so a blank value must become NaN to reach the fallback. */ + return Number.isFinite(value) && value >= 0 ? value : fallback; +} + +function percentile(values, percentileValue) { /* Nearest-rank percentile rounded to 3 decimals; 0 for empty input. */ + if (values.length === 0) return 0; + const sorted = [...values].sort((a, b) => a - b); + const index = Math.min(sorted.length - 1, Math.max(0, Math.ceil((percentileValue / 100) * sorted.length) - 1)); /* Clamp so percentileValue 0 selects the minimum instead of index -1. */ + return Number(sorted[index].toFixed(3)); +} + +function stats(values) { /* min/p50/p95/p99/max summary of values; all zeros for empty input. */ + if (values.length === 0) return { min: 0, p50: 0, p95: 0, p99: 0, max: 0 }; + return { + min: Number(Math.min(...values).toFixed(3)), + p50: percentile(values, 50), + p95: percentile(values, 95), + p99: percentile(values, 99), + max: Number(Math.max(...values).toFixed(3)), + }; +} + function promptStepsFromEnv() { const rawSteps = parseJsonEnv("LANGBOT_E2E_PROMPTS_JSON", null); if (rawSteps === null) { @@ -658,6 +683,7 @@ try { } else { for (let index = 0; index < promptSteps.length; index += 1) { const step = promptSteps[index]; + const promptStartedAt = Date.now(); const chatResult = await runDebugChatPrompt(page, { prompt: step.prompt, expectedText: step.expectedText, @@ -665,11 +691,13 @@ try { imagePath: index === 0 ? imagePath : "", failureSignals: failureSignals.length > 0 ? 
failureSignals : undefined, }); + const promptDurationMs = Date.now() - promptStartedAt; result.chat_results.push({ index, expected_text: step.expectedText, status: chatResult.status, reason: chatResult.reason, + response_duration_ms: promptDurationMs, min_expected_count: chatResult.min_expected_count, final_count: chatResult.final_count, before_assistant_expected_count: chatResult.before_assistant_expected_count, @@ -714,6 +742,56 @@ try { const finishedAt = new Date(); result.finished_at = finishedAt.toISOString(); result.finished_at_local = localIsoWithOffset(finishedAt); + result.duration_ms = finishedAt.getTime() - startedAt.getTime(); + const responseDurations = result.chat_results + .map((item) => item.response_duration_ms) + .filter((value) => Number.isFinite(value)); + const passedPrompts = result.chat_results.filter((item) => item.status === "pass").length; + const attemptedPrompts = result.chat_results.length; + const errorRate = attemptedPrompts === 0 ? 1 : Number(((attemptedPrompts - passedPrompts) / attemptedPrompts).toFixed(4)); + const responseStats = stats(responseDurations); + const responseP95BudgetMs = positiveNumberEnv( + "LANGBOT_E2E_DEBUG_CHAT_RESPONSE_P95_MS", + positiveNumberEnv("LANGBOT_DEBUG_CHAT_RESPONSE_P95_MS", safeResponseTimeoutMs), + ); + const maxErrorRate = positiveNumberEnv("LANGBOT_E2E_DEBUG_CHAT_MAX_ERROR_RATE", 0); + const metrics = { + probe: caseId, + url: result.url, + prompt_count: result.prompt_count, + attempted_prompt_count: attemptedPrompts, + passed_prompt_count: passedPrompts, + error_rate: errorRate, + response_duration_ms: responseStats, + total_duration_ms: result.duration_ms, + chat_results: result.chat_results, + }; + result.metrics_summary = { + prompt_count: metrics.prompt_count, + attempted_prompt_count: metrics.attempted_prompt_count, + passed_prompt_count: metrics.passed_prompt_count, + error_rate: metrics.error_rate, + response_p50_ms: metrics.response_duration_ms.p50, + response_p95_ms: 
metrics.response_duration_ms.p95, + total_duration_ms: metrics.total_duration_ms, + }; + result.thresholds_summary = { + response_p95_ms: { + actual: metrics.response_duration_ms.p95, + max: responseP95BudgetMs, + pass: attemptedPrompts > 0 && metrics.response_duration_ms.p95 <= responseP95BudgetMs, + }, + error_rate: { + actual: metrics.error_rate, + max: maxErrorRate, + pass: metrics.error_rate <= maxErrorRate, + }, + }; + await writeFile(metricsPath, `${JSON.stringify(metrics, null, 2)}\n`, "utf8"); + if (result.status === "pass" && !Object.values(result.thresholds_summary).every((item) => item.pass)) { + result.status = "fail"; + result.reason = "Debug Chat performance breached response latency or error-rate thresholds."; + } const existingEvidence = {}; for (const [key, value] of Object.entries(result.evidence)) { if (typeof value !== "string") continue; diff --git a/skills/skills.index.json b/skills/skills.index.json index d56a84822..190cf1305 100644 --- a/skills/skills.index.json +++ b/skills/skills.index.json @@ -130,6 +130,7 @@ "references/local-agent-runner.md", "references/mcp-stdio-testing.md", "references/model-provider-testing.md", + "references/performance-reliability-testing.md", "references/pipeline-debug-chat.md", "references/plugin-e2e-smoke.md", "references/sandbox-skill-authoring.md", @@ -150,6 +151,11 @@ "agent-runner-release-preflight", "agent-runner-runtime-chaos", "dify-agent-debug-chat", + "langbot-fault-taxonomy-contract", + "langbot-live-backend-latency", + "langbot-live-backend-log-health", + "langbot-live-control-plane-api", + "langbot-overhead-accounting-contract", "langrag-kb-retrieve", "langrag-parser-golden-e2e", "langrag-sentinel-kb-discover", @@ -165,6 +171,7 @@ "mcp-stdio-register", "mcp-stdio-tool-call", "pipeline-debug-chat", + "pipeline-debug-chat-performance", "plugin-e2e-smoke", "provider-deepseek", "qa-plugin-smoke-live-install", @@ -486,6 +493,128 @@ "backend_log" ] }, + { + "id": "langbot-fault-taxonomy-contract", + 
"title": "LangBot fault taxonomy and cleanup contract", + "mode": "probe", + "area": "reliability", + "type": "chaos", + "priority": "p1", + "risk": "medium", + "ci_eligible": true, + "tags": [ + "reliability", + "chaos", + "contract", + "synthetic" + ], + "automation": "skills/langbot-testing/probes/langbot-fault-taxonomy-contract.mjs", + "setup_automation": [], + "setup_provides_env": [], + "evidence_required": [ + "metrics", + "filesystem" + ] + }, + { + "id": "langbot-live-backend-latency", + "title": "LangBot live backend basic latency probe", + "mode": "probe", + "area": "performance", + "type": "performance", + "priority": "p1", + "risk": "medium", + "ci_eligible": false, + "tags": [ + "performance", + "live-backend", + "latency", + "metrics" + ], + "automation": "skills/langbot-testing/probes/langbot-live-backend-latency.mjs", + "setup_automation": [], + "setup_provides_env": [], + "evidence_required": [ + "metrics", + "network", + "api_diagnostic", + "filesystem" + ] + }, + { + "id": "langbot-live-backend-log-health", + "title": "LangBot live backend log health probe", + "mode": "probe", + "area": "reliability", + "type": "reliability", + "priority": "p1", + "risk": "medium", + "ci_eligible": false, + "tags": [ + "reliability", + "live-backend", + "backend-log", + "metrics" + ], + "automation": "skills/langbot-testing/probes/langbot-live-backend-log-health.mjs", + "setup_automation": [], + "setup_provides_env": [], + "evidence_required": [ + "metrics", + "backend_log", + "filesystem" + ] + }, + { + "id": "langbot-live-control-plane-api", + "title": "LangBot live control-plane API probe", + "mode": "probe", + "area": "performance", + "type": "performance", + "priority": "p1", + "risk": "medium", + "ci_eligible": false, + "tags": [ + "performance", + "reliability", + "live-backend", + "control-plane", + "metrics" + ], + "automation": "skills/langbot-testing/probes/langbot-live-control-plane-api.mjs", + "setup_automation": [], + "setup_provides_env": [], + 
"evidence_required": [ + "metrics", + "network", + "api_diagnostic", + "filesystem" + ] + }, + { + "id": "langbot-overhead-accounting-contract", + "title": "LangBot overhead accounting metrics contract", + "mode": "probe", + "area": "performance", + "type": "performance", + "priority": "p1", + "risk": "medium", + "ci_eligible": true, + "tags": [ + "performance", + "metrics", + "contract", + "synthetic" + ], + "automation": "skills/langbot-testing/probes/langbot-overhead-accounting-contract.mjs", + "setup_automation": [], + "setup_provides_env": [], + "evidence_required": [ + "metrics", + "resource_log", + "filesystem" + ] + }, { "id": "langrag-kb-retrieve", "title": "LangRAG knowledge base ingests and retrieves a sentinel document", @@ -911,6 +1040,33 @@ "backend_log" ] }, + { + "id": "pipeline-debug-chat-performance", + "title": "Pipeline Debug Chat user-path performance probe", + "mode": "agent-browser", + "area": "pipeline", + "type": "performance", + "priority": "p1", + "risk": "medium", + "ci_eligible": false, + "tags": [ + "performance", + "pipeline", + "debug-chat", + "user-path", + "metrics" + ], + "automation": "scripts/e2e/pipeline-debug-chat.mjs", + "setup_automation": [], + "setup_provides_env": [], + "evidence_required": [ + "ui", + "screenshot", + "console", + "network", + "metrics" + ] + }, { "id": "plugin-e2e-smoke", "title": "Plugin system installs a local plugin and exposes tool/page APIs", @@ -1059,6 +1215,10 @@ "suites": [ "agent-runner-release-gate", "core-smoke", + "langbot-live-backend-gate", + "langbot-performance-contract-gate", + "langbot-performance-reliability-gate", + "langbot-user-path-performance-gate", "local-agent-gate" ], "suite_summaries": [ @@ -1121,6 +1281,77 @@ "local-agent-basic-debug-chat" ] }, + { + "id": "langbot-live-backend-gate", + "title": "LangBot live backend reliability gate", + "description": "Live backend control-plane responsiveness and runtime log health checks for a locally running LangBot instance.", + "type": 
"reliability", + "priority": "p1", + "tags": [ + "performance", + "reliability", + "live-backend", + "metrics" + ], + "cases": [ + "langbot-live-backend-latency", + "langbot-live-control-plane-api", + "langbot-live-backend-log-health" + ] + }, + { + "id": "langbot-performance-contract-gate", + "title": "LangBot performance contract gate", + "description": "Fast synthetic contract checks for performance metric accounting and non-destructive reliability fault taxonomy.", + "type": "contract", + "priority": "p1", + "tags": [ + "performance", + "reliability", + "contract", + "metrics" + ], + "cases": [ + "langbot-overhead-accounting-contract", + "langbot-fault-taxonomy-contract" + ] + }, + { + "id": "langbot-performance-reliability-gate", + "title": "LangBot performance and reliability starter gate", + "description": "Starter gate for LangBot performance accounting, live backend control-plane latency, and non-destructive fault taxonomy checks.", + "type": "reliability", + "priority": "p1", + "tags": [ + "performance", + "reliability", + "metrics", + "chaos" + ], + "cases": [ + "langbot-overhead-accounting-contract", + "langbot-fault-taxonomy-contract", + "langbot-live-backend-latency", + "langbot-live-control-plane-api", + "langbot-live-backend-log-health" + ] + }, + { + "id": "langbot-user-path-performance-gate", + "title": "LangBot user-path performance gate", + "description": "Browser-visible performance checks for user-facing LangBot paths such as Pipeline Debug Chat.", + "type": "performance", + "priority": "p1", + "tags": [ + "performance", + "browser", + "debug-chat", + "user-path" + ], + "cases": [ + "pipeline-debug-chat-performance" + ] + }, { "id": "local-agent-gate", "title": "Local Agent runner regression gate", diff --git a/skills/skills/langbot-testing/SKILL.md b/skills/skills/langbot-testing/SKILL.md index e9db1980f..748ae9b81 100644 --- a/skills/skills/langbot-testing/SKILL.md +++ b/skills/skills/langbot-testing/SKILL.md @@ -21,6 +21,7 @@ Use this skill 
when an agent needs to verify LangBot behavior through the WebUI - **Sandbox-backed skill authoring**: read `references/sandbox-skill-authoring.md`. - **LangRAG knowledge bases**: read `references/langrag-knowledge-base.md`. - **MCP stdio tool testing**: read `references/mcp-stdio-testing.md`. +- **Performance, reliability, or chaos probes**: read `references/performance-reliability-testing.md`. - **Drive a live instance over MCP (not raw HTTP)**: use the `langbot-mcp-ops` skill — the instance exposes an MCP server at `http://:5300/mcp` (reuses API keys). Useful for setting up bots/pipelines/models as test fixtures programmatically. - **Known failures and fixes**: read `references/troubleshooting.md`. - **Reusable test groups**: run `bin/lbs suite list` and `bin/lbs suite plan ` before manually assembling a case set. @@ -36,6 +37,8 @@ Use this skill when an agent needs to verify LangBot behavior through the WebUI - Use an authenticated browser profile prepared by `langbot-env-setup`. - Do not expose API keys, OAuth secrets, tokens, or localStorage token values in output. - A WebUI test is not complete until the visible UI result is checked against backend logs or network behavior. +- A performance result is not complete without `metrics` evidence and a clear split between LangBot overhead and external provider/tool/network time. +- A chaos or reliability result is not complete until the fault scope, cleanup, and recovery checks are recorded. - For a suite, use `bin/lbs suite start ` to create the suite evidence root, per-case directories, and `suite-start.json`/`suite-start.md` handoff files; use `bin/lbs test result ` to write final per-case `result.json`, then run `bin/lbs suite report --evidence-dir `. - Do not mark a case `pass` until `test result --evidence` covers every value in the case's `evidence_required`. 
- For runner-specific Debug Chat cases, use the case-specific pipeline env declared by `automation_pipeline_url_env` / `automation_pipeline_name_env`; do not silently reuse a generic `LANGBOT_PIPELINE_URL`. diff --git a/skills/skills/langbot-testing/cases/langbot-fault-taxonomy-contract.yaml b/skills/skills/langbot-testing/cases/langbot-fault-taxonomy-contract.yaml new file mode 100644 index 000000000..2b990f837 --- /dev/null +++ b/skills/skills/langbot-testing/cases/langbot-fault-taxonomy-contract.yaml @@ -0,0 +1,35 @@ +id: langbot-fault-taxonomy-contract +title: "LangBot fault taxonomy and cleanup contract" +mode: probe +area: reliability +type: chaos +priority: p1 +risk: medium +ci_eligible: true +tags: + - reliability + - chaos + - contract + - synthetic +skills: + - langbot-testing +automation: skills/langbot-testing/probes/langbot-fault-taxonomy-contract.mjs +fault_model_json: '{"kind":"taxonomy-contract","destructive":false,"scenarios":["provider-timeout","plugin-runtime-disconnect","mcp-stdio-server-exit","operator-missing-login","transient-marketplace-timeout"]}' +steps: + - "Run `rtk bin/lbs test run langbot-fault-taxonomy-contract --dry-run` first; remove `--dry-run` after checking the evidence directory." + - "Automation validates that representative fault scenarios declare target, injected fault, expected status, recovery check, and cleanup." + - "Review metrics.json, fault-model.json, and automation-result.json under LBS_EVIDENCE_DIR." +checks: + - "automation-result.json status is pass." + - "Every scenario has an expected status in pass, fail, blocked, env_issue, or flaky." + - "Every scenario declares a cleanup action and recovery check." +evidence_required: + - metrics + - filesystem +diagnostics: + - "This is a non-destructive taxonomy contract probe; it does not inject real runtime faults." + - "Use it as a gate before adding live chaos cases that kill runtimes, route traffic through a proxy, or disrupt a backend dependency." 
+success_patterns: + - "Fault taxonomy contract declares status" +failure_patterns: + - "missing required scenario fields" diff --git a/skills/skills/langbot-testing/cases/langbot-live-backend-latency.yaml b/skills/skills/langbot-testing/cases/langbot-live-backend-latency.yaml new file mode 100644 index 000000000..1922d06f0 --- /dev/null +++ b/skills/skills/langbot-testing/cases/langbot-live-backend-latency.yaml @@ -0,0 +1,42 @@ +id: langbot-live-backend-latency +title: "LangBot live backend basic latency probe" +mode: probe +area: performance +type: performance +priority: p1 +risk: medium +ci_eligible: false +tags: + - performance + - live-backend + - latency + - metrics +skills: + - langbot-testing +env: + - LANGBOT_BACKEND_URL +automation: skills/langbot-testing/probes/langbot-live-backend-latency.mjs +metrics_thresholds_json: '{"backend_p95_ms":{"max":1000},"error_rate":{"max":0}}' +load_profile_json: '{"requests":12,"concurrency":2,"endpoints":["/healthz"]}' +steps: + - "Confirm the selected LangBot backend is the intended test target." + - "Run `rtk bin/lbs test run langbot-live-backend-latency --dry-run` first; remove `--dry-run` after checking LANGBOT_BACKEND_URL and evidence directory." + - "Automation sends a small request batch to LANGBOT_BACKEND_URL/healthz and records latency, status counts, and network errors." +checks: + - "automation-result.json status is pass when the backend responds and p95/error-rate thresholds pass." + - "automation-result.json status is env_issue when the backend is not reachable." + - "metrics.json and network.log are written under LBS_EVIDENCE_DIR." +evidence_required: + - metrics + - network + - api_diagnostic + - filesystem +diagnostics: + - "This probe measures backend health endpoint reachability latency only; it does not cover model/provider, browser, Debug Chat, RAG, or plugin runtime latency." 
+success_patterns: + - "Live backend latency probe passed" +failure_patterns: + - "Backend did not respond" + - "breached latency or error-rate thresholds" +troubleshooting: + - socks-proxy-without-socksio diff --git a/skills/skills/langbot-testing/cases/langbot-live-backend-log-health.yaml b/skills/skills/langbot-testing/cases/langbot-live-backend-log-health.yaml new file mode 100644 index 000000000..8ff911371 --- /dev/null +++ b/skills/skills/langbot-testing/cases/langbot-live-backend-log-health.yaml @@ -0,0 +1,45 @@ +id: langbot-live-backend-log-health +title: "LangBot live backend log health probe" +mode: probe +area: reliability +type: reliability +priority: p1 +risk: medium +ci_eligible: false +tags: + - reliability + - live-backend + - backend-log + - metrics +skills: + - langbot-testing +env: + - LANGBOT_BACKEND_URL +automation: skills/langbot-testing/probes/langbot-live-backend-log-health.mjs +metrics_thresholds_json: '{"fail_count":{"max":0}}' +load_profile_json: '{"lookback_seconds":300,"log_source":"LANGBOT_BACKEND_LOG or latest LANGBOT_REPO/data/logs/langbot-*.log"}' +steps: + - "Confirm the selected LangBot backend log belongs to the intended test target." + - "Run `rtk bin/lbs test run langbot-live-backend-log-health --dry-run` first; remove `--dry-run` after checking evidence directory and log source." + - "Automation scans the recent backend log window for fail-severity runtime findings such as Traceback, ImportError, ERROR, unclosed sessions, and unawaited coroutines." +checks: + - "automation-result.json status is pass only when fail_count is 0." + - "metrics_summary includes scanned_line_count, fail_count, warning_count, and finding_count." + - "findings.json and scanned-backend.log are written under LBS_EVIDENCE_DIR." +evidence_required: + - metrics + - backend_log + - filesystem +diagnostics: + - "Set LANGBOT_BACKEND_LOG to an explicit log path when the latest log file is not the run target." 
+ - "Set LANGBOT_BACKEND_LOG_SINCE or LANGBOT_BACKEND_LOG_LOOKBACK_SECONDS to control the scan window." + - "This probe measures runtime log health; it does not prove user-facing Debug Chat, plugin, model, or RAG behavior." +success_patterns: + - "Live backend log health passed" +failure_patterns: + - "Traceback" + - "ImportError" + - "ERROR" + - "unclosed" +troubleshooting: + - socks-proxy-without-socksio diff --git a/skills/skills/langbot-testing/cases/langbot-live-control-plane-api.yaml b/skills/skills/langbot-testing/cases/langbot-live-control-plane-api.yaml new file mode 100644 index 000000000..2cd8ee2c7 --- /dev/null +++ b/skills/skills/langbot-testing/cases/langbot-live-control-plane-api.yaml @@ -0,0 +1,44 @@ +id: langbot-live-control-plane-api +title: "LangBot live control-plane API probe" +mode: probe +area: performance +type: performance +priority: p1 +risk: medium +ci_eligible: false +tags: + - performance + - reliability + - live-backend + - control-plane + - metrics +skills: + - langbot-testing +env: + - LANGBOT_BACKEND_URL +automation: skills/langbot-testing/probes/langbot-live-control-plane-api.mjs +metrics_thresholds_json: '{"error_rate":{"max":0},"response_shape_failures":{"max":0},"healthz_p95_ms":{"max":500},"system_info_p95_ms":{"max":1000}}' +load_profile_json: '{"requests":20,"concurrency":4,"endpoints":["/healthz","/api/v1/system/info"],"auth_required":false}' +steps: + - "Confirm the selected LangBot backend is the intended test target." + - "Run `rtk bin/lbs test run langbot-live-control-plane-api --dry-run` first; remove `--dry-run` after checking LANGBOT_BACKEND_URL and evidence directory." + - "Automation sends a small request batch to /healthz and /api/v1/system/info, then validates status code, JSON shape, and latency budgets." +checks: + - "automation-result.json status is pass when every control-plane request returns HTTP 200, JSON code 0, and required response fields." 
+ - "metrics_summary includes per-endpoint p50/p95 latency, error rate, status counts, and response_shape_failures." + - "thresholds_summary shows error_rate, response_shape_failures, healthz_p95_ms, and system_info_p95_ms all pass." +evidence_required: + - metrics + - network + - api_diagnostic + - filesystem +diagnostics: + - "This probe measures unauthenticated backend control-plane readiness; it does not cover authenticated UI flows, Debug Chat, model calls, plugins, or RAG." + - "A system_info shape failure usually means the API contract or startup state changed and should be investigated before treating latency as healthy." +success_patterns: + - "Live control-plane API probe passed" +failure_patterns: + - "Backend did not respond" + - "breached shape, latency, or error-rate thresholds" +troubleshooting: + - socks-proxy-without-socksio diff --git a/skills/skills/langbot-testing/cases/langbot-overhead-accounting-contract.yaml b/skills/skills/langbot-testing/cases/langbot-overhead-accounting-contract.yaml new file mode 100644 index 000000000..650dfe7d9 --- /dev/null +++ b/skills/skills/langbot-testing/cases/langbot-overhead-accounting-contract.yaml @@ -0,0 +1,37 @@ +id: langbot-overhead-accounting-contract +title: "LangBot overhead accounting metrics contract" +mode: probe +area: performance +type: performance +priority: p1 +risk: medium +ci_eligible: true +tags: + - performance + - metrics + - contract + - synthetic +skills: + - langbot-testing +automation: skills/langbot-testing/probes/langbot-overhead-accounting-contract.mjs +metrics_thresholds_json: '{"sample_count":{"min":50},"langbot_overhead_p95_ms":{"max":25},"accounting_gap_max_ms":{"max":0.001}}' +load_profile_json: '{"kind":"synthetic-overhead-accounting","samples":80,"external_latency_segments":["provider","external_tool","network"]}' +steps: + - "Run `rtk bin/lbs test run langbot-overhead-accounting-contract --dry-run` first; remove `--dry-run` after checking the evidence directory." 
+ - "Automation generates deterministic message-path latency samples and separates LangBot overhead from provider/tool/network latency." + - "Review metrics.json, thresholds.json, resource-log.json, and automation-result.json under LBS_EVIDENCE_DIR." +checks: + - "automation-result.json status is pass." + - "metrics_summary includes sample_count, langbot_overhead_p95_ms, e2e_latency_p95_ms, external_latency_p95_ms, and accounting_gap_max_ms." + - "thresholds_summary shows sample_count, langbot_overhead_p95_ms, and accounting_gap_max_ms all pass." +evidence_required: + - metrics + - resource_log + - filesystem +diagnostics: + - "This is a synthetic contract probe for the QA harness; it is not live product performance." + - "Use it to verify that reports can carry overhead accounting metrics before running live backend or browser performance probes." +success_patterns: + - "Overhead accounting contract passed" +failure_patterns: + - "breached one or more thresholds" diff --git a/skills/skills/langbot-testing/cases/pipeline-debug-chat-performance.yaml b/skills/skills/langbot-testing/cases/pipeline-debug-chat-performance.yaml new file mode 100644 index 000000000..a1a4944b5 --- /dev/null +++ b/skills/skills/langbot-testing/cases/pipeline-debug-chat-performance.yaml @@ -0,0 +1,75 @@ +id: pipeline-debug-chat-performance +title: "Pipeline Debug Chat user-path performance probe" +mode: agent-browser +area: pipeline +type: performance +priority: p1 +risk: medium +ci_eligible: false +tags: + - performance + - pipeline + - debug-chat + - user-path + - metrics +skills: + - langbot-env-setup + - langbot-testing +env: + - LANGBOT_FRONTEND_URL + - LANGBOT_BACKEND_URL +env_any: + - LANGBOT_PIPELINE_URL|LANGBOT_PIPELINE_NAME +automation: scripts/e2e/pipeline-debug-chat.mjs +automation_env: + - LANGBOT_FRONTEND_URL + - LANGBOT_BACKEND_URL + - LANGBOT_BROWSER_PROFILE + - LANGBOT_CHROMIUM_EXECUTABLE + - LANGBOT_E2E_PROMPT + - LANGBOT_E2E_EXPECTED_TEXT + - 
LANGBOT_E2E_RESPONSE_TIMEOUT_MS +automation_env_any: + - LANGBOT_PIPELINE_URL|LANGBOT_PIPELINE_NAME +automation_prompt: "请只回复 OK,用于性能测试。" +automation_expected_text: "OK" +automation_response_timeout_ms: "120000" +automation_reset_debug_chat: "true" +automation_debug_chat_response_p95_ms: "120000" +automation_debug_chat_max_error_rate: "0" +metrics_thresholds_json: '{"response_p95_ms":{"max":120000},"error_rate":{"max":0}}' +load_profile_json: '{"prompts":1,"browser":true,"path":"Pipeline Debug Chat","metric":"send-to-visible-completion"}' +preconditions: + - "LANGBOT_PIPELINE_URL or LANGBOT_PIPELINE_NAME points to the pipeline intended for this Debug Chat performance run." + - "The target pipeline is safe to reset Debug Chat history for this run." + - "The target pipeline has a known-good runner/model; provider latency should be interpreted separately from LangBot overhead." +steps: + - "Open LANGBOT_FRONTEND_URL with the prepared browser profile." + - "Open the target pipeline and select Debug Chat." + - "Reset Debug Chat history through the backend API when configured." + - "Send the deterministic prompt and wait for the expected assistant response." +checks: + - "automation-result.json status is pass when the expected assistant response appears." + - "metrics_summary includes response_p50_ms, response_p95_ms, error_rate, and total_duration_ms." + - "thresholds_summary shows response_p95_ms and error_rate pass." +evidence_required: + - ui + - screenshot + - console + - network + - metrics +diagnostics: + - "This case measures browser-visible send-to-completion latency; it does not split provider latency from LangBot overhead." + - "Use backend logs and provider diagnostics to explain slow runs before calling them LangBot regressions." 
+success_patterns: + - "Processing request from person_websocket" + - "Streaming completed" +failure_patterns: + - "Action invoke_llm_stream call timed out" + - "Task exception was never retrieved" + - "All models failed during streaming setup" +troubleshooting: + - debug-chat-history-contaminates-automation + - local-agent-model-route-unavailable + - plugin-runtime-timeout + - proxy-env-mismatch diff --git a/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/.gitignore b/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/.gitignore index 849ddff3b..89d8e500c 100644 --- a/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/.gitignore +++ b/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/.gitignore @@ -1 +1,3 @@ -dist/ +dist/* +!dist/ +!dist/qa-plugin-smoke-0.1.0.lbpkg diff --git a/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/dist/qa-plugin-smoke-0.1.0.lbpkg b/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/dist/qa-plugin-smoke-0.1.0.lbpkg new file mode 100644 index 0000000000000000000000000000000000000000..a4a50f803e0af7218ec8800d3cadff9e64d9b60b GIT binary patch literal 5160 zcmaJ_1yq#X+8s&+W(Y|U1w?5EB&BmGr9n`-q+=-Ql2JmsyCft8q(g+ErDf<+q#TeG zQ1ZUs?|zr--24AC>#VcZtl955=h^#t&a+?j2ROJCAQ0#($hmmR;5?VCIuQ#5+Q9~a zAi%E$y~;?vXZtzDBz={5HXt7%@i zNhDen^GS0(*JMUNCz%eL;C^yVh?uHvb98K?#dV<=DQM}Lgx~(`W1Ep)oOi0WcNv`U ziZtEv+&-D!)`=lAr@jsY1M&qb4?2lq8uytI_6f(8CYsmkBQQDE zoz>i8AE?O8r#kZ?#q(25ArW`<@xn=Kec)GO;yR~S&&M(yi?vk?>eSPvkBM_Lo6|&k zYwZs2WV6m%v@u}^b9M{FYFgdyuhL|sAn|M8Qx5vf(2p(RyFnN2b)N`H5ET!Pp^CLy z-J(z6GP}W(w9IZ4lq^>Rx)*|(`jsbk?7BJ$Z=>Urvdw)2L?Vc z987H-q0U~gsC&dn9x&`E18HXaav7w^;|vPv#JW8uk~rSUrX-RuHpVY=I(Q)&Fk3*= z_M|>ZAhx#6H0%v1yfYY)Z&6@iDfaP4co)<^Qx6Qamv`YbU>uuUJcRIx6DzO@cn(~j z`3Z&>ZqXlE;WhI(@wOS^E6{tNt0(t0-}*thrBMvm85az5p0jV`*9s=eSSX&w7nvkF z?o~r$Y=5JzE(Zl{TLz%vDlpraJ2^sKJ**Hh?W!F-;1}CIg#8)ycOd#Ww#-#yP`U=^ z?N4P@is|f-{?Fg~Kq{gPjE~>lJvs638}vng5&H?BSl3Lub-iD>;++9vsyf_t=p!C7 
z%zgn;El*4*cvL`Yw)2PyVmU8wo7^g{SFyW7zI^z&@Q}9dQ&6#RI(P;qY6_IN*pAu9FMPV{HEsHp5L`7V_~V=C`Ez%8 z(+bFQbzkVhgXttnTZa#=A)`aK0C|rJAsDqQ^y1b|61b9I6ck}TvKZReug@HHVx%i8 zdaIFjx~(6F-@v)LvCOBFi1bP7BJo{kq|UfYfpj3)w_g6Zi)it#Jor;EVKdsu+H>rN znbLTd--`cox%?M~6LQHwPEqynn~bCjjh0E4G&pZKCB5xlQTSI0RB5>!1hI0}UvGaY z6v6>bc$)VjaF#~g9kj351}0jnHKl(l+&H;x^jeWsao8A(W!&azy#G z3TGQ=Nj)bU5u{#k6IFaV?W&>23h_G9fqNZAxGk(5vu&uen_HDOuImhT|a4Od{C#>KDJMKkvh#&Vz}4yWH%l5n!4cTGMej1Jfvd~^@U3Dcg$txh6>z+wu%4;9*~TpLay z@r3k0onk*&8@f9$vKafr7MAK}i};u~dckir6qv(QAM%-eEj<`=Cmm z!gApun?b%>-6PuLHcUiH-~4^OaTiRlntuD&Nk~|WBM-N%sOjC^TAzjwvwjsGB=tX! zz8&sXHC-4K{#qAdPwOg`{4;xAnL=WdfUKDS1{r4d{Jw&9qV5r5-y?;cWSFTkOchdd za^j8(4r~$MX-I|iHEeoga}Y&;{(|dddQD>r9kA-R8UWssyVn>b@XF5JW6oykRp&cb zrBF6@wcwkCws>vpO0uEPNzC5yCq#=2qNoX_?4$4@C$y}ZI3QiSy54VUMn`KWa4k0V zQc4w+(d&o3(>xJpO`95DI5MrCc9%Eu6Re~!MDD{xD_+bv=9sd{1D}0sYnu)Q45kd6bB zHp^==x^Fm16*)~iksvAFAV%=x5O~R)xViX%7*-(f%shdq502ms;?Jy+btyFhdJ7A> zli9?oBi%3T_LNJX`xBPu6GVQF(_gwr8g=zB5i4`W586&BAoWs2^H4MZwpx9JHsihU zOz`x|}P)YMz#V*`Lpxelp?<=pc%4|cfOmj%dv%q`f zfr85Uj8Pv*asjlo`g{B`;>&)Svv!{pjywM8rD+bpd70*KQJ zVmi}|`Yv?vr#iDiR>QaD`4O}ha@`f(8>onLj7!O(<&_l?5a3y0{7x%CkZPbb0unx@>&>pSmonxzJ^*y@MyaC#K&+03Q7?yU_VQ9jxI3QmITvQIgBJsgk#M zeDl|7*)a0+;P|&RX(>g%8#I~;_cr88s7fn{YY@Hv{xkLd#IIHE1MWlg)Oo73gyROu z0dIN(>ukMAtWZPnDpj1VPt24r&zwcsT-l0s11GNmN}WQ-T|HmP_yrFI^}IdFaM2`JY=b#X<`p#CZvq+jd`g3ak4}P2J4P?G&V!(nmMiEpW>4v4|Xi@M(@paZxdG z>_lCxztX)7A(`EwH4v1QgV!AQKIBhUQg!Q0;sI9Ffkx}vAFR0ATUt8*uC!VzPIC+t zE&Cig3l#<`w1YMiEq^ z4(=1zBAHrY>3NDnsa`YuSvF^Ht|Z1_%fqkND&Zb~K1Wk^Ji2*L9xcUeIvdB)O=I^e zoXlV1JcTkFoHzX4GwWj-R;2LF7+vdxd%oR!q1H{d0sHL5j1wbh=Cg1%ZQbX%7G z#53m3vAq@zV+C+L$^kM{hzI@h1EN^Fpd|f1R_3eyjV~KBhCfm~tMXuYYc54aJoC2H zlKKu-8OoMmV(SeuJdm@GbPwg||Oyd-_zN9kV1g$X{TW02CeoiS^rlUzVi5 zDEysS4w~X>boWTVAL$N|^k(KLRvR`YNaNe+wjMaIeXfnPqJzItr%k(h8Xg{505ML%I!hg-IirV30z5%7LM*JQf$JxYOHyBV7GetA;#TDJ{*B=c$Xg zP|cY~Ws5{Wi-@C|Q_>XSE?pt>}dFg6h<*{ 
/**
 * Build a filename-safe UTC timestamp, e.g. "2026-06-25T00-07-37Z".
 *
 * The ISO string loses its millisecond part, every run of characters that is
 * not a letter or digit collapses to a single "-", and a dash left at either
 * end is trimmed.
 *
 * @param {Date} [date] - Instant to format; defaults to the current time.
 * @returns {string} Slug containing only letters, digits and dashes.
 */
function timestampSlug(date = new Date()) {
  const isoToSeconds = date.toISOString().replace(/\.\d{3}Z$/, "Z");
  const dashed = isoToSeconds.replace(/[^0-9A-Za-z]+/g, "-");
  return dashed.replace(/^-|-$/g, "");
}
/**
 * Check one fault scenario against the taxonomy contract.
 *
 * A scenario passes when every required field holds a truthy value and its
 * expected_status is one of the recognised run statuses.
 *
 * @param {object} scenario - Scenario record to inspect.
 * @returns {{id: *, pass: boolean, missing: string[], expected_status: *}}
 *   Verdict plus the names of the required fields that were absent or falsy.
 */
function validateScenario(scenario) {
  const requiredKeys = ["id", "target", "injected_fault", "expected_status", "recovery_check", "cleanup"];
  const knownStatuses = ["pass", "fail", "blocked", "env_issue", "flaky"];

  const missing = [];
  for (const key of requiredKeys) {
    if (!scenario[key]) {
      missing.push(key);
    }
  }

  const declaredStatus = scenario.expected_status;
  const statusIsKnown = knownStatuses.includes(declaredStatus);
  return {
    id: scenario.id,
    pass: missing.length === 0 && statusIsKnown,
    missing,
    expected_status: declaredStatus,
  };
}
statusCounts[scenario.expected_status] = (statusCounts[scenario.expected_status] || 0) + 1; + } + const metrics = { + probe: caseId, + scenario_count: scenarios.length, + status_counts: statusCounts, + scenarios, + validations, + }; + const thresholds = { + scenario_count: { actual: scenarios.length, min: 5, pass: scenarios.length >= 5 }, + invalid_scenario_count: { + actual: validations.filter((item) => !item.pass).length, + max: 0, + pass: validations.every((item) => item.pass), + }, + cleanup_declared_count: { + actual: scenarios.filter((item) => item.cleanup).length, + min: scenarios.length, + pass: scenarios.every((item) => item.cleanup), + }, + }; + const status = Object.values(thresholds).every((item) => item.pass) ? "pass" : "fail"; + const metricsPath = join(evidenceDir, "metrics.json"); + const faultModelPath = join(evidenceDir, "fault-model.json"); + const automationResultPath = join(evidenceDir, "automation-result.json"); + const resultPath = join(evidenceDir, "result.json"); + + await writeFile(metricsPath, `${JSON.stringify(metrics, null, 2)}\n`, "utf8"); + await writeFile(faultModelPath, `${JSON.stringify({ scenarios }, null, 2)}\n`, "utf8"); + + const finishedAt = new Date(); + const result = { + source: "automation", + case_id: caseId, + run_id: runId, + status, + reason: status === "pass" + ? "Fault taxonomy contract declares status, recovery, and cleanup for every scenario." 
/**
 * Left-pad a value with zeros up to a minimum width.
 *
 * @param {*} value - Value to render; converted with String().
 * @param {number} [size=2] - Minimum length of the result.
 * @returns {string} The text form of value, zero-padded on the left.
 */
function pad(value, size = 2) {
  const text = String(value);
  return text.padStart(size, "0");
}
/**
 * Nearest-rank percentile of a list of numbers, rounded to three decimals.
 *
 * @param {number[]} values - Samples; left untouched (a sorted copy is used).
 * @param {number} percentileValue - Percentile on a 0-100 scale.
 * @returns {number} The selected sample, or 0 when values is empty.
 */
function percentile(values, percentileValue) {
  if (values.length === 0) return 0;
  const sorted = [...values].sort((a, b) => a - b);
  const rank = Math.ceil((percentileValue / 100) * sorted.length) - 1;
  // Clamp at both ends: rank is -1 for a percentile of 0 (or below), which
  // previously indexed past the start of the array and threw on .toFixed().
  const index = Math.min(sorted.length - 1, Math.max(0, rank));
  return Number(sorted[index].toFixed(3));
}
/**
 * Issue totalRequests GET probes in sequential batches of at most
 * `concurrency` parallel requests, cycling round-robin through urls.
 *
 * @param {string[]} urls - Target URLs; request i goes to urls[i % urls.length].
 * @param {number} totalRequests - Number of requests to issue overall.
 * @param {number} concurrency - Requested batch size. Values that are not a
 *   number >= 1 (0, negatives, fractions below 1, NaN) fall back to 1.
 * @param {number} timeoutMs - Per-request timeout handed to fetchOnce.
 * @returns {Promise<object[]>} One fetchOnce result per request, in issue order.
 */
async function runBatches(urls, totalRequests, concurrency, timeoutMs) {
  const queue = Array.from({ length: totalRequests }, (_, index) => urls[index % urls.length]);
  // The batch size comes straight from an environment variable. A size that
  // truncates to 0 used to make splice() remove nothing, so the loop never
  // finished; force at least one request per batch instead.
  const batchSize = concurrency >= 1 ? Math.floor(concurrency) : 1;
  const results = [];
  for (let offset = 0; offset < queue.length; offset += batchSize) {
    const batch = queue.slice(offset, offset + batchSize);
    results.push(...await Promise.all(batch.map((url) => fetchOnce(url, timeoutMs))));
  }
  return results;
}
maxErrorRate = Number(env.LANGBOT_PERF_MAX_ERROR_RATE || "0"); + const metricsPath = join(evidenceDir, "metrics.json"); + const networkLogPath = join(evidenceDir, "network.log"); + const automationResultPath = join(evidenceDir, "automation-result.json"); + const resultPath = join(evidenceDir, "result.json"); + + let status = "fail"; + let reason = ""; + let results = []; + if (!backendUrl) { + status = "env_issue"; + reason = "LANGBOT_BACKEND_URL is not configured."; + } else { + const urls = endpoints.map((path) => joinUrl(backendUrl, path)); + results = await runBatches(urls, totalRequests, concurrency, timeoutMs); + const okCount = results.filter((item) => item.ok).length; + const errorCount = results.length - okCount; + const errorRate = results.length === 0 ? 1 : errorCount / results.length; + const latencies = results.filter((item) => item.ok).map((item) => item.latency_ms); + const latencyStats = stats(latencies); + const allConnectionFailures = results.length > 0 && results.every((item) => item.status === 0); + if (allConnectionFailures) { + status = "env_issue"; + reason = `Backend did not respond at ${backendUrl}.`; + } else if (latencyStats.p95 <= p95BudgetMs && errorRate <= maxErrorRate) { + status = "pass"; + reason = "Live backend latency probe passed all thresholds."; + } else { + status = "fail"; + reason = "Live backend latency probe breached latency or error-rate thresholds."; + } + } + + const statusCounts = {}; + for (const item of results) { + const key = item.status === 0 ? "network_error" : String(item.status); + statusCounts[key] = (statusCounts[key] || 0) + 1; + } + const okResults = results.filter((item) => item.ok); + const metrics = { + probe: caseId, + backend_url: backendUrl, + endpoints, + total_requests: totalRequests, + concurrency, + timeout_ms: timeoutMs, + ok_count: okResults.length, + error_count: results.length - okResults.length, + error_rate: results.length === 0 ? 
1 : Number(((results.length - okResults.length) / results.length).toFixed(4)), + latency_ms: stats(okResults.map((item) => item.latency_ms)), + status_counts: statusCounts, + }; + const thresholds = { + backend_p95_ms: { actual: metrics.latency_ms.p95, max: p95BudgetMs, pass: metrics.latency_ms.p95 <= p95BudgetMs }, + error_rate: { actual: metrics.error_rate, max: maxErrorRate, pass: metrics.error_rate <= maxErrorRate }, + }; + + await writeFile(metricsPath, `${JSON.stringify({ ...metrics, samples: results }, null, 2)}\n`, "utf8"); + await writeFile(networkLogPath, results.map((item) => JSON.stringify(item)).join("\n") + (results.length > 0 ? "\n" : ""), "utf8"); + + const finishedAt = new Date(); + const result = { + source: "automation", + case_id: caseId, + run_id: runId, + status, + reason, + started_at: startedAt.toISOString(), + started_at_local: localIsoWithOffset(startedAt), + finished_at: finishedAt.toISOString(), + finished_at_local: localIsoWithOffset(finishedAt), + duration_ms: finishedAt.getTime() - startedAt.getTime(), + url: backendUrl, + metrics_summary: { + requests: metrics.total_requests, + concurrency: metrics.concurrency, + ok_count: metrics.ok_count, + error_rate: metrics.error_rate, + latency_p50_ms: metrics.latency_ms.p50, + latency_p95_ms: metrics.latency_ms.p95, + status_counts: metrics.status_counts, + }, + thresholds_summary: thresholds, + artifacts: { + metrics_json: metricsPath, + network_log: networkLogPath, + automation_result_json: automationResultPath, + result_json: resultPath, + }, + evidence_collected: ["metrics", "network", "api_diagnostic", "filesystem"], + }; + + const resultText = `${JSON.stringify(result, null, 2)}\n`; + await writeFile(automationResultPath, resultText, "utf8"); + await writeFile(resultPath, resultText, "utf8"); + console.log(JSON.stringify(result, null, 2)); + exit(status === "pass" ? 0 : status === "env_issue" ? 
/**
 * Render a date as local wall-clock time with an explicit UTC offset,
 * e.g. "2026-06-25T00:07:37.045+08:00".
 *
 * @param {Date} [date] - Instant to format; defaults to the current time.
 * @returns {string} ISO-8601 style local timestamp with a +HH:MM / -HH:MM suffix.
 */
function localIsoWithOffset(date = new Date()) {
  // getTimezoneOffset() is positive west of UTC, so flip the sign for display.
  const minutesEastOfUtc = -date.getTimezoneOffset();
  const magnitude = Math.abs(minutesEastOfUtc);
  const direction = minutesEastOfUtc >= 0 ? "+" : "-";

  const day = `${date.getFullYear()}-${pad(date.getMonth() + 1)}-${pad(date.getDate())}`;
  const clock = `${pad(date.getHours())}:${pad(date.getMinutes())}:${pad(date.getSeconds())}`;
  const millis = pad(date.getMilliseconds(), 3);
  const zone = `${direction}${pad(Math.floor(magnitude / 60))}:${pad(magnitude % 60)}`;

  return `${day}T${clock}.${millis}${zone}`;
}
/**
 * Extract the leading timestamp of a backend log line.
 *
 * Two prefixes are recognised:
 *   - "[MM-DD HH:MM:SS.mmm]"        application log; carries no year and no zone
 *   - "[YYYY-MM-DD HH:MM:SS +ZZZZ]" access log; fully qualified
 *
 * @param {string} line - Raw log line.
 * @param {number|string} year - Year to assume for application-log lines.
 * @returns {Date|null} Parsed instant, or null when the line has no
 *   recognised timestamp prefix (for example a traceback continuation line).
 */
function parseTimestamp(line, year) {
  const localMatch = line.match(/^\[(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})\.(\d{3})\]/);
  if (localMatch) {
    const [, month, day, hour, minute, second, millisecond] = localMatch;
    // The stamp has no zone, so read it as wall-clock time of this host: an
    // offset-less ISO date-time string is parsed as local time. A fixed
    // "+08:00" suffix shifted every line on hosts in any other zone.
    // NOTE(review): the year is supplied by the caller, so lines written in a
    // previous calendar year are dated one year ahead - confirm whether the
    // scan window ever crosses New Year.
    return new Date(`${year}-${month}-${day}T${hour}:${minute}:${second}.${millisecond}`);
  }

  const accessMatch = line.match(/^\[(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2}) ([+-]\d{4})\]/);
  if (accessMatch) {
    const [, fullYear, month, day, hour, minute, second, offset] = accessMatch;
    // "+0800" -> "+08:00", the form accepted inside an ISO date-time string.
    const normalizedOffset = `${offset.slice(0, 3)}:${offset.slice(3)}`;
    return new Date(`${fullYear}-${month}-${day}T${hour}:${minute}:${second}${normalizedOffset}`);
  }

  return null;
}
/**
 * Walk a log file's text and collect findings from the lines inside the
 * scan window.
 *
 * A timestamped line opens the window when its time is at or after `since`
 * and closes it otherwise. Lines without a timestamp inherit the state of
 * the most recent timestamped line; lines before the first timestamp are
 * outside the window.
 *
 * @param {string} text - Whole log file content.
 * @param {Date} since - Earliest instant to include.
 * @param {number} year - Year passed through to parseTimestamp.
 * @returns {{findings: object[], scanned: {number: number, text: string}[], total_lines: number}}
 *   Findings, the scanned lines with 1-based line numbers, and the raw line count.
 */
function scanLines(text, since, year) {
  const rows = text.split(/\r?\n/);
  const scanned = [];
  const findings = [];
  let insideWindow = false;

  rows.forEach((row, position) => {
    const stamp = parseTimestamp(row, year);
    if (stamp) {
      insideWindow = stamp >= since;
    }
    if (insideWindow) {
      const number = position + 1;
      scanned.push({ number, text: row });
      const hit = findingForLine(row, number);
      if (hit) {
        findings.push(hit);
      }
    }
  });

  return { findings, scanned, total_lines: rows.length };
}
scan = { findings: [], scanned: [], total_lines: 0 }; + if (!logPath || !existsSync(logPath)) { + status = "env_issue"; + reason = "No LangBot backend log file was found. Set LANGBOT_BACKEND_LOG or LANGBOT_REPO."; + } else { + const text = await readFile(logPath, "utf8"); + scan = scanLines(text, since, startedAt.getFullYear()); + const failCount = scan.findings.filter((item) => item.severity === "fail").length; + status = failCount === 0 ? "pass" : "fail"; + reason = status === "pass" + ? "Live backend log health passed; no fail-severity findings in the scanned window." + : "Live backend log health found fail-severity backend log findings."; + } + + const warningCount = scan.findings.filter((item) => item.severity === "warning").length; + const failCount = scan.findings.filter((item) => item.severity === "fail").length; + const metrics = { + probe: caseId, + backend_log: logPath, + since: since.toISOString(), + scanned_line_count: scan.scanned.length, + total_line_count: scan.total_lines, + fail_count: failCount, + warning_count: warningCount, + finding_count: scan.findings.length, + }; + const thresholds = { + fail_count: { actual: failCount, max: 0, pass: failCount === 0 }, + }; + + await writeFile(metricsPath, `${JSON.stringify(metrics, null, 2)}\n`, "utf8"); + await writeFile(findingsPath, `${JSON.stringify(scan.findings, null, 2)}\n`, "utf8"); + await writeFile(scannedLogPath, scan.scanned.map((item) => `${item.number}: ${item.text}`).join("\n") + (scan.scanned.length > 0 ? 
/**
 * Zero-pad the text form of a value on the left.
 *
 * @param {*} value - Value to render; converted with String().
 * @param {number} [size=2] - Minimum length of the result.
 * @returns {string} The padded text; values already long enough come back unchanged.
 */
function pad(value, size = 2) {
  const digits = String(value);
  if (digits.length >= size) {
    return digits;
  }
  return digits.padStart(size, "0");
}
/**
 * Nearest-rank percentile of a list of numbers, rounded to three decimals.
 *
 * @param {number[]} values - Samples; left untouched (a sorted copy is used).
 * @param {number} percentileValue - Percentile on a 0-100 scale.
 * @returns {number} The selected sample, or 0 when values is empty.
 */
function percentile(values, percentileValue) {
  if (values.length === 0) return 0;
  const ordered = [...values].sort((a, b) => a - b);
  const lastIndex = ordered.length - 1;
  const rank = Math.ceil((percentileValue / 100) * ordered.length) - 1;
  // A percentile of 0 (or below) yields rank -1; without the lower clamp the
  // lookup returned undefined and .toFixed() threw.
  const index = Math.max(0, Math.min(lastIndex, rank));
  return Number(ordered[index].toFixed(3));
}
/**
 * Describe the control-plane endpoints the probe exercises.
 *
 * Each entry names the path, the HTTP status and JSON `code` a healthy
 * response must carry, the p95 latency budget in milliseconds, and the keys
 * that must be present in the response's `data` object. Budgets are read
 * from the environment on every call, so a fresh array of fresh objects is
 * returned each time.
 *
 * @returns {object[]} Endpoint descriptors for healthz and system_info.
 */
function controlPlaneEndpoints() {
  // An unset or empty variable falls back to the default budget.
  const budgetFromEnv = (variable, fallback) => Number(env[variable] || fallback);

  const healthz = {
    id: "healthz",
    path: "/healthz",
    expected_status: 200,
    expected_code: 0,
    p95_budget_ms: budgetFromEnv("LANGBOT_PERF_HEALTHZ_P95_MS", "500"),
    required_data_fields: [],
  };

  const systemInfo = {
    id: "system_info",
    path: "/api/v1/system/info",
    expected_status: 200,
    expected_code: 0,
    p95_budget_ms: budgetFromEnv("LANGBOT_PERF_SYSTEM_INFO_P95_MS", "1000"),
    required_data_fields: ["version", "edition", "enable_marketplace"],
  };

  return [healthz, systemInfo];
}
/**
 * Probe the control-plane endpoints in sequential batches of at most
 * `concurrency` parallel requests, cycling round-robin through endpoints.
 *
 * @param {string} backendUrl - Base URL handed to fetchEndpoint.
 * @param {object[]} endpoints - Endpoint descriptors; request i uses
 *   endpoints[i % endpoints.length].
 * @param {number} totalRequests - Number of requests to issue overall.
 * @param {number} concurrency - Requested batch size. Values that are not a
 *   number >= 1 (0, negatives, fractions below 1, NaN) fall back to 1.
 * @param {number} timeoutMs - Per-request timeout handed to fetchEndpoint.
 * @returns {Promise<object[]>} One fetchEndpoint result per request, in issue order.
 */
async function runBatches(backendUrl, endpoints, totalRequests, concurrency, timeoutMs) {
  const queue = Array.from({ length: totalRequests }, (_, index) => endpoints[index % endpoints.length]);
  // The batch size is read from the environment. A size that truncates to 0
  // used to make splice() remove nothing, so the loop never finished; force
  // at least one request per batch instead.
  const batchSize = concurrency >= 1 ? Math.floor(concurrency) : 1;
  const results = [];
  for (let offset = 0; offset < queue.length; offset += batchSize) {
    const batch = queue.slice(offset, offset + batchSize);
    results.push(...await Promise.all(batch.map((endpoint) => fetchEndpoint(backendUrl, endpoint, timeoutMs))));
  }
  return results;
}
1 : Number(((samples.length - okSamples.length) / samples.length).toFixed(4)), + latency_ms: stats(okSamples.map((item) => item.latency_ms)), + p95_budget_ms: endpoint.p95_budget_ms, + }, + ]; + })); +} + +async function main() { + const root = resolve(env.LBS_ROOT || process.cwd()); + const caseId = "langbot-live-control-plane-api"; + const runId = env.LBS_RUN_ID || `${timestampSlug()}-${caseId}`; + const evidenceDir = resolve(env.LBS_EVIDENCE_DIR || join(root, "reports", "evidence", runId)); + await mkdir(evidenceDir, { recursive: true }); + + const startedAt = new Date(); + const backendUrl = env.LANGBOT_BACKEND_URL || ""; + const endpoints = controlPlaneEndpoints(); + const configuredBudgets = parseJsonObject(env.LANGBOT_CONTROL_PLANE_P95_BUDGETS_JSON, {}); + for (const endpoint of endpoints) { + const budget = configuredBudgets[endpoint.id]; + if (typeof budget === "number" && Number.isFinite(budget)) endpoint.p95_budget_ms = budget; + } + const totalRequests = Number(env.LANGBOT_CONTROL_PLANE_REQUESTS || "20"); + const concurrency = Number(env.LANGBOT_CONTROL_PLANE_CONCURRENCY || "4"); + const timeoutMs = Number(env.LANGBOT_CONTROL_PLANE_TIMEOUT_MS || "5000"); + const maxErrorRate = Number(env.LANGBOT_CONTROL_PLANE_MAX_ERROR_RATE || "0"); + const metricsPath = join(evidenceDir, "metrics.json"); + const endpointsPath = join(evidenceDir, "endpoints.json"); + const networkLogPath = join(evidenceDir, "network.log"); + const automationResultPath = join(evidenceDir, "automation-result.json"); + const resultPath = join(evidenceDir, "result.json"); + + let status = "fail"; + let reason = ""; + let results = []; + if (!backendUrl) { + status = "env_issue"; + reason = "LANGBOT_BACKEND_URL is not configured."; + } else { + results = await runBatches(backendUrl, endpoints, totalRequests, concurrency, timeoutMs); + const allConnectionFailures = results.length > 0 && results.every((item) => item.status === 0); + if (allConnectionFailures) { + status = "env_issue"; + reason 
= `Backend did not respond at ${backendUrl}.`; + } + } + + const okResults = results.filter((item) => item.ok); + const statusCounts = {}; + for (const item of results) { + const key = item.status === 0 ? "network_error" : String(item.status); + statusCounts[key] = (statusCounts[key] || 0) + 1; + } + const perEndpoint = endpointMetrics(endpoints, results); + const responseShapeFailures = results.filter((item) => !item.json_valid || item.missing_fields.length > 0 || !item.code_ok).length; + const errorRate = results.length === 0 ? 1 : Number(((results.length - okResults.length) / results.length).toFixed(4)); + const thresholds = { + error_rate: { actual: errorRate, max: maxErrorRate, pass: errorRate <= maxErrorRate }, + response_shape_failures: { actual: responseShapeFailures, max: 0, pass: responseShapeFailures === 0 }, + }; + for (const endpoint of endpoints) { + const actual = perEndpoint[endpoint.id].latency_ms.p95; + thresholds[`${endpoint.id}_p95_ms`] = { + actual, + max: endpoint.p95_budget_ms, + pass: actual <= endpoint.p95_budget_ms, + }; + } + + if (status !== "env_issue") { + const passed = Object.values(thresholds).every((item) => item.pass); + status = passed ? "pass" : "fail"; + reason = passed + ? "Live control-plane API probe passed all thresholds." 
+ : "Live control-plane API probe breached shape, latency, or error-rate thresholds."; + } + + const metrics = { + probe: caseId, + backend_url: backendUrl, + total_requests: totalRequests, + concurrency, + timeout_ms: timeoutMs, + ok_count: okResults.length, + error_count: results.length - okResults.length, + error_rate: errorRate, + status_counts: statusCounts, + response_shape_failures: responseShapeFailures, + endpoints: perEndpoint, + }; + + await writeFile(metricsPath, `${JSON.stringify({ ...metrics, samples: results }, null, 2)}\n`, "utf8"); + await writeFile(endpointsPath, `${JSON.stringify(endpoints, null, 2)}\n`, "utf8"); + await writeFile(networkLogPath, results.map((item) => JSON.stringify(item)).join("\n") + (results.length > 0 ? "\n" : ""), "utf8"); + + const finishedAt = new Date(); + const result = { + source: "automation", + case_id: caseId, + run_id: runId, + status, + reason, + started_at: startedAt.toISOString(), + started_at_local: localIsoWithOffset(startedAt), + finished_at: finishedAt.toISOString(), + finished_at_local: localIsoWithOffset(finishedAt), + duration_ms: finishedAt.getTime() - startedAt.getTime(), + url: backendUrl, + metrics_summary: { + requests: metrics.total_requests, + concurrency: metrics.concurrency, + ok_count: metrics.ok_count, + error_rate: metrics.error_rate, + response_shape_failures: metrics.response_shape_failures, + endpoints: Object.fromEntries(Object.entries(metrics.endpoints).map(([id, value]) => [ + id, + { + path: value.path, + ok_count: value.ok_count, + error_rate: value.error_rate, + latency_p50_ms: value.latency_ms.p50, + latency_p95_ms: value.latency_ms.p95, + }, + ])), + status_counts: metrics.status_counts, + }, + thresholds_summary: thresholds, + artifacts: { + metrics_json: metricsPath, + endpoints_json: endpointsPath, + network_log: networkLogPath, + automation_result_json: automationResultPath, + result_json: resultPath, + }, + evidence_collected: ["metrics", "network", "api_diagnostic", 
"filesystem"], + }; + + const resultText = `${JSON.stringify(result, null, 2)}\n`; + await writeFile(automationResultPath, resultText, "utf8"); + await writeFile(resultPath, resultText, "utf8"); + console.log(JSON.stringify(result, null, 2)); + exit(status === "pass" ? 0 : status === "env_issue" ? 2 : 1); +} + +await main(); diff --git a/skills/skills/langbot-testing/probes/langbot-overhead-accounting-contract.mjs b/skills/skills/langbot-testing/probes/langbot-overhead-accounting-contract.mjs new file mode 100644 index 000000000..5338df003 --- /dev/null +++ b/skills/skills/langbot-testing/probes/langbot-overhead-accounting-contract.mjs @@ -0,0 +1,162 @@ +#!/usr/bin/env node + +import { mkdir, writeFile } from "node:fs/promises"; +import { join, resolve } from "node:path"; +import { env, exit } from "node:process"; + +function pad(value, size = 2) { + return String(value).padStart(size, "0"); +} + +function localIsoWithOffset(date = new Date()) { + const offsetMinutes = -date.getTimezoneOffset(); + const sign = offsetMinutes >= 0 ? 
"+" : "-"; + const absolute = Math.abs(offsetMinutes); + return [ + `${date.getFullYear()}-${pad(date.getMonth() + 1)}-${pad(date.getDate())}`, + `T${pad(date.getHours())}:${pad(date.getMinutes())}:${pad(date.getSeconds())}.${pad(date.getMilliseconds(), 3)}`, + `${sign}${pad(Math.floor(absolute / 60))}:${pad(absolute % 60)}`, + ].join(""); +} + +function timestampSlug(date = new Date()) { + return date.toISOString().replace(/\.\d{3}Z$/, "Z").replace(/[^0-9A-Za-z]+/g, "-").replace(/^-|-$/g, ""); +} + +function percentile(values, percentileValue) { + if (values.length === 0) return 0; + const sorted = [...values].sort((a, b) => a - b); + const index = Math.min(sorted.length - 1, Math.ceil((percentileValue / 100) * sorted.length) - 1); + return Number(sorted[index].toFixed(3)); +} + +function stats(values) { + return { + min: Number(Math.min(...values).toFixed(3)), + p50: percentile(values, 50), + p95: percentile(values, 95), + p99: percentile(values, 99), + max: Number(Math.max(...values).toFixed(3)), + }; +} + +function threshold(actual, limit, operator) { + const pass = operator === "<=" ? actual <= limit : actual >= limit; + return { actual, [operator === "<=" ? "max" : "min"]: limit, pass }; +} + +function makeSample(index) { + const ingress = 1 + (index % 5) * 0.22; + const pipeline = 2.8 + (index % 7) * 0.31; + const persistence = 1.1 + (index % 4) * 0.2; + const pluginIpc = 1.9 + (index % 6) * 0.27; + const rag = index % 3 === 0 ? 4.4 : 0.8 + (index % 5) * 0.18; + const streaming = 1.5 + (index % 8) * 0.24; + const provider = 80 + (index % 13) * 11; + const externalTool = index % 4 === 0 ? 
25 + (index % 9) * 3 : 0; + const network = 8 + (index % 10) * 1.7; + const overhead = ingress + pipeline + persistence + pluginIpc + rag + streaming; + const external = provider + externalTool + network; + const total = overhead + external; + return { + index, + segments_ms: { + ingress, + pipeline, + persistence, + plugin_ipc: pluginIpc, + rag, + streaming, + provider, + external_tool: externalTool, + network, + }, + langbot_overhead_ms: Number(overhead.toFixed(3)), + external_latency_ms: Number(external.toFixed(3)), + e2e_latency_ms: Number(total.toFixed(3)), + accounting_gap_ms: Number((total - external - overhead).toFixed(6)), + }; +} + +async function main() { + const root = resolve(env.LBS_ROOT || process.cwd()); + const caseId = "langbot-overhead-accounting-contract"; + const runId = env.LBS_RUN_ID || `${timestampSlug()}-${caseId}`; + const evidenceDir = resolve(env.LBS_EVIDENCE_DIR || join(root, "reports", "evidence", runId)); + await mkdir(evidenceDir, { recursive: true }); + + const startedAt = new Date(); + const sampleCount = Number(env.LANGBOT_PERF_CONTRACT_SAMPLES || "80"); + const overheadP95BudgetMs = Number(env.LANGBOT_PERF_OVERHEAD_P95_MS || "25"); + const samples = Array.from({ length: sampleCount }, (_, index) => makeSample(index)); + const overheads = samples.map((sample) => sample.langbot_overhead_ms); + const e2e = samples.map((sample) => sample.e2e_latency_ms); + const external = samples.map((sample) => sample.external_latency_ms); + const gaps = samples.map((sample) => Math.abs(sample.accounting_gap_ms)); + const memory = process.memoryUsage(); + + const metrics = { + probe: caseId, + sample_count: sampleCount, + langbot_overhead_ms: stats(overheads), + e2e_latency_ms: stats(e2e), + external_latency_ms: stats(external), + accounting_gap_max_ms: Number(Math.max(...gaps).toFixed(6)), + samples, + }; + const thresholds = { + sample_count: threshold(sampleCount, 50, ">="), + langbot_overhead_p95_ms: threshold(metrics.langbot_overhead_ms.p95, 
overheadP95BudgetMs, "<="), + accounting_gap_max_ms: threshold(metrics.accounting_gap_max_ms, 0.001, "<="), + }; + const status = Object.values(thresholds).every((item) => item.pass) ? "pass" : "fail"; + const metricsPath = join(evidenceDir, "metrics.json"); + const thresholdsPath = join(evidenceDir, "thresholds.json"); + const resourceLogPath = join(evidenceDir, "resource-log.json"); + const automationResultPath = join(evidenceDir, "automation-result.json"); + const resultPath = join(evidenceDir, "result.json"); + + await writeFile(metricsPath, `${JSON.stringify(metrics, null, 2)}\n`, "utf8"); + await writeFile(thresholdsPath, `${JSON.stringify(thresholds, null, 2)}\n`, "utf8"); + await writeFile(resourceLogPath, `${JSON.stringify({ memory, pid: process.pid }, null, 2)}\n`, "utf8"); + + const finishedAt = new Date(); + const result = { + source: "automation", + case_id: caseId, + run_id: runId, + status, + reason: status === "pass" + ? "Overhead accounting contract passed all thresholds." 
+ : "Overhead accounting contract breached one or more thresholds.", + started_at: startedAt.toISOString(), + started_at_local: localIsoWithOffset(startedAt), + finished_at: finishedAt.toISOString(), + finished_at_local: localIsoWithOffset(finishedAt), + duration_ms: finishedAt.getTime() - startedAt.getTime(), + metrics_summary: { + sample_count: metrics.sample_count, + langbot_overhead_p95_ms: metrics.langbot_overhead_ms.p95, + e2e_latency_p95_ms: metrics.e2e_latency_ms.p95, + external_latency_p95_ms: metrics.external_latency_ms.p95, + accounting_gap_max_ms: metrics.accounting_gap_max_ms, + }, + thresholds_summary: thresholds, + artifacts: { + metrics_json: metricsPath, + thresholds_json: thresholdsPath, + resource_log_json: resourceLogPath, + automation_result_json: automationResultPath, + result_json: resultPath, + }, + evidence_collected: ["metrics", "resource_log", "filesystem"], + }; + + const resultText = `${JSON.stringify(result, null, 2)}\n`; + await writeFile(automationResultPath, resultText, "utf8"); + await writeFile(resultPath, resultText, "utf8"); + console.log(JSON.stringify(result, null, 2)); + exit(status === "pass" ? 0 : 1); +} + +await main(); diff --git a/skills/skills/langbot-testing/references/performance-reliability-testing.md b/skills/skills/langbot-testing/references/performance-reliability-testing.md new file mode 100644 index 000000000..6517858d8 --- /dev/null +++ b/skills/skills/langbot-testing/references/performance-reliability-testing.md @@ -0,0 +1,173 @@ +# Performance And Reliability Testing + +Use this reference when a QA request asks whether LangBot is fast enough, +stable under load, or resilient to controlled faults. + +## Scope + +Treat `skills/` as the QA control plane: + +- Cases define intent, readiness, thresholds, and required evidence. +- Probe scripts collect metrics, traces, resource logs, and artifacts. +- Reports classify the same run as `pass`, `fail`, `blocked`, + `env_issue`, or `flaky`. 
+ +Do not turn `skills/` into a load generator or chaos engine. Call a focused +tool from a `mode: probe` case when the test needs one, for example k6, +Locust, pytest-benchmark, Playwright trace collection, Toxiproxy, Docker, or a +Kubernetes disruption tool. + +## LangBot Performance Model + +For LangBot, performance is the cost LangBot adds around external systems: + +```text +LangBot overhead = end-to-end latency - provider latency - external tool latency - network/fault injection latency +``` + +Measure user experience and internal composition separately: + +- WebUI load and interaction latency. +- Debug Chat send-to-first-visible-token and send-to-completion latency. +- Pipeline, RAG, plugin runtime, MCP, AgentRunner, and persistence segment + latency. +- Queue wait time, concurrency, throughput, timeout rate, and p95/p99 latency. +- Startup, plugin install, knowledge-base ingestion, migration, and recovery + time. + +Do not report a single message round-trip time as "LangBot performance" unless +the report also explains external provider/tool/network time. + +## Evidence Contract + +Performance and reliability cases should declare the evidence they need: + +- `metrics`: machine-readable latency, throughput, error-rate, or recovery + metrics, usually `metrics.json`. +- `resource_log`: CPU, memory, process, connection, queue, or file descriptor + samples. +- `trace`: browser, HTTP, database, or runtime trace artifacts. +- `profile`: CPU, memory, or flamegraph profile artifacts. +- `backend_log`, `network`, `api_diagnostic`, and `filesystem` as supporting + evidence when relevant. 
+ +Automation should write `automation-result.json` with these fields when +available: + +```json +{ + "status": "pass", + "reason": "Probe passed all thresholds.", + "metrics_summary": { + "langbot_overhead_p95_ms": 12.4, + "error_rate": 0 + }, + "thresholds_summary": { + "langbot_overhead_p95_ms": { "actual": 12.4, "max": 50, "pass": true } + }, + "artifacts": { + "metrics_json": "/path/to/metrics.json" + }, + "evidence_collected": ["metrics", "filesystem"] +} +``` + +Synthetic contract probes are useful for checking the QA harness, but they are +not live product performance results. Label them as contract probes in the case +title, checks, and report. + +## Chaos And Reliability Rules + +Chaos tests must be narrow and reversible: + +- Declare the fault model in `fault_model_json`. +- Record blast radius, target component, injection method, duration, and abort + conditions. +- Capture recovery checks and cleanup steps in the case. +- Classify unavailable dependencies as `env_issue` unless the target behavior + is LangBot's handling of that dependency failure. +- Do not run destructive fault injection against a shared or production-like + instance without explicit operator approval. + +Recommended first fault models: + +- Provider timeout or HTTP 429 from a fake provider endpoint. +- Plugin runtime disconnect/reconnect in a local instance. +- MCP stdio server exits mid-call. +- RAG parser fixture fails once and recovers on retry. +- Backend API endpoint returns 5xx from a controlled local proxy. + +## Starter Live Probes + +The starter gate separates QA-harness contracts from live product checks: + +- `langbot-overhead-accounting-contract` verifies that reports can carry + overhead accounting metrics. It uses deterministic synthetic samples and is + not live product performance. +- `langbot-fault-taxonomy-contract` verifies that fault scenarios declare + expected status, recovery, and cleanup before destructive chaos tests are + added. 
+- `langbot-live-backend-latency` checks the unauthenticated `/healthz` + endpoint for basic backend responsiveness. +- `langbot-live-control-plane-api` checks `/healthz` and + `/api/v1/system/info` for HTTP 200, JSON `code: 0`, response shape, and + per-endpoint p95 latency. +- `langbot-live-backend-log-health` scans the recent backend log window for + fail-severity runtime findings. It is the reliability guard that should fail + the gate when HTTP probes pass but backend logs contain Traceback, ImportError, + ERROR, unclosed sessions, or unawaited coroutine signals. + +Do not treat these starter live probes as Debug Chat or model-provider +performance. They are control-plane readiness checks; user-facing performance +needs browser/WebSocket/message-path measurements. + +## Gate Layers + +Use the smallest gate that answers the quality question: + +- `langbot-performance-contract-gate`: fast synthetic checks for report shape, + threshold accounting, and fault taxonomy. Good for PR feedback when no live + service is running. +- `langbot-live-backend-gate`: live backend `/healthz`, + `/api/v1/system/info`, and backend log health. Good after starting a local + LangBot backend. +- `langbot-user-path-performance-gate`: browser-visible user path performance, + starting with Pipeline Debug Chat send-to-visible-completion latency. Run it + only when the browser profile and target pipeline are ready. +- `langbot-performance-reliability-gate`: combined starter gate for synthetic + contracts plus live backend checks. + +Keep environment diagnostics separate from product regressions. For example, a +SOCKS proxy without Python `socksio` support should be fixed or clearly +classified by `bin/lbs env doctor`; do not hide the resulting backend +Traceback in reports. + +## Debug Chat Performance + +`pipeline-debug-chat-performance` reuses the browser Debug Chat automation, +writes a `metrics.json` artifact, and adds `metrics_summary` and +`thresholds_summary` to `automation-result.json`.
+ +Current metric: + +```text +response_duration_ms = prompt send -> expected assistant response visible and stable +``` + +This is a user-path metric, not pure LangBot overhead. If it regresses, inspect +provider latency, model route health, plugin/runtime logs, WebSocket behavior, +and browser console/network evidence before attributing the whole duration to +LangBot. + +## Running The First Gate + +Start with the reusable suite: + +```bash +rtk bin/lbs suite plan langbot-performance-reliability-gate +rtk bin/lbs suite start langbot-performance-reliability-gate --run-id langbot-perf-rel-local +``` + +Run synthetic contract probes first. Run live probes only after the selected +backend/frontend instance is reachable and the run owner accepts any fault +scope. diff --git a/skills/skills/langbot-testing/suites/langbot-live-backend-gate.yaml b/skills/skills/langbot-testing/suites/langbot-live-backend-gate.yaml new file mode 100644 index 000000000..58a978527 --- /dev/null +++ b/skills/skills/langbot-testing/suites/langbot-live-backend-gate.yaml @@ -0,0 +1,14 @@ +id: langbot-live-backend-gate +title: "LangBot live backend reliability gate" +description: "Live backend control-plane responsiveness and runtime log health checks for a locally running LangBot instance." 
+type: reliability +priority: p1 +tags: + - performance + - reliability + - live-backend + - metrics +cases: + - langbot-live-backend-latency + - langbot-live-control-plane-api + - langbot-live-backend-log-health diff --git a/skills/skills/langbot-testing/suites/langbot-performance-contract-gate.yaml b/skills/skills/langbot-testing/suites/langbot-performance-contract-gate.yaml new file mode 100644 index 000000000..b5a9eb47f --- /dev/null +++ b/skills/skills/langbot-testing/suites/langbot-performance-contract-gate.yaml @@ -0,0 +1,13 @@ +id: langbot-performance-contract-gate +title: "LangBot performance contract gate" +description: "Fast synthetic contract checks for performance metric accounting and non-destructive reliability fault taxonomy." +type: contract +priority: p1 +tags: + - performance + - reliability + - contract + - metrics +cases: + - langbot-overhead-accounting-contract + - langbot-fault-taxonomy-contract diff --git a/skills/skills/langbot-testing/suites/langbot-performance-reliability-gate.yaml b/skills/skills/langbot-testing/suites/langbot-performance-reliability-gate.yaml new file mode 100644 index 000000000..1e0d58d26 --- /dev/null +++ b/skills/skills/langbot-testing/suites/langbot-performance-reliability-gate.yaml @@ -0,0 +1,16 @@ +id: langbot-performance-reliability-gate +title: "LangBot performance and reliability starter gate" +description: "Starter gate for LangBot performance accounting, live backend control-plane latency, and non-destructive fault taxonomy checks." 
+type: reliability +priority: p1 +tags: + - performance + - reliability + - metrics + - chaos +cases: + - langbot-overhead-accounting-contract + - langbot-fault-taxonomy-contract + - langbot-live-backend-latency + - langbot-live-control-plane-api + - langbot-live-backend-log-health diff --git a/skills/skills/langbot-testing/suites/langbot-user-path-performance-gate.yaml b/skills/skills/langbot-testing/suites/langbot-user-path-performance-gate.yaml new file mode 100644 index 000000000..a6a138ec0 --- /dev/null +++ b/skills/skills/langbot-testing/suites/langbot-user-path-performance-gate.yaml @@ -0,0 +1,12 @@ +id: langbot-user-path-performance-gate +title: "LangBot user-path performance gate" +description: "Browser-visible performance checks for user-facing LangBot paths such as Pipeline Debug Chat." +type: performance +priority: p1 +tags: + - performance + - browser + - debug-chat + - user-path +cases: + - pipeline-debug-chat-performance diff --git a/skills/src/commands/env.ts b/skills/src/commands/env.ts index d5d1eeaf4..76ef33aec 100644 --- a/skills/src/commands/env.ts +++ b/skills/src/commands/env.ts @@ -1,5 +1,7 @@ import { existsSync } from "node:fs"; +import { spawnSync } from "node:child_process"; import { Socket } from "node:net"; +import { join } from "node:path"; import type { CommandContext } from "../types.ts"; import { parseOptions } from "../cli.ts"; import { loadEnv } from "../fs.ts"; @@ -88,6 +90,37 @@ function compareProxyPair(env: Record, upper: string, lower: str return null; } +function envValue(env: Record, key: string): string { + return process.env[key] ?? env[key] ?? 
""; +} + +function activeSocksProxy(env: Record): { key: string; value: string } | null { + for (const key of ["ALL_PROXY", "all_proxy", "HTTPS_PROXY", "https_proxy", "HTTP_PROXY", "http_proxy"]) { + const value = envValue(env, key); + if (/^socks/i.test(value)) return { key, value }; + } + return null; +} + +function checkSocksio(env: Record): string | null { + const proxy = activeSocksProxy(env); + if (!proxy) return null; + + const repo = env.LANGBOT_REPO; + const python = repo ? join(repo, ".venv", "bin", "python") : ""; + if (!python || !existsSync(python)) { + return `SOCKS proxy ${proxy.key} is configured (${redactEnvValue(proxy.key, proxy.value)}), but LangBot venv python was not found; after creating the venv, verify it can import socksio.`; + } + + const result = spawnSync(python, ["-c", "import socksio"], { + encoding: "utf8", + timeout: 5000, + }); + if (result.status === 0) return null; + + return `SOCKS proxy ${proxy.key} is configured (${redactEnvValue(proxy.key, proxy.value)}), but ${python} cannot import socksio; run \`${python} -m pip install socksio\` or start LangBot without SOCKS proxy env.`; +} + export async function commandEnvDoctor(ctx: CommandContext): Promise { const env = loadEnv(ctx.root); const failures: string[] = []; @@ -117,6 +150,8 @@ export async function commandEnvDoctor(ctx: CommandContext): Promise { ]) { if (mismatch) failures.push(mismatch); } + const socksioFailure = checkSocksio(env); + if (socksioFailure) failures.push(socksioFailure); for (const [label, result] of await Promise.all([ checkUrl("LANGBOT_BACKEND_URL", env.LANGBOT_BACKEND_URL).then((result) => ["LANGBOT_BACKEND_URL", result] as const), diff --git a/skills/src/commands/suite.ts b/skills/src/commands/suite.ts index 403156100..7ab556c5b 100644 --- a/skills/src/commands/suite.ts +++ b/skills/src/commands/suite.ts @@ -465,6 +465,41 @@ function outputTail(value: string | Buffer | null | undefined): string { return String(value ?? 
"").trim().slice(-4000); } +function exitStatusFromResultStatus(status: string): number { + if (status === "pass") return 0; + if (status === "blocked" || status === "env_issue" || status === "flaky") return 2; + return 1; +} + +function executionStatusFromExitStatus(status: number): string { + if (status === 0) return "ok"; + if (status === 2) return "classified"; + return "nonzero"; +} + +function executionFromCaseResultFile(caseItem: Record): Record | null { + const resultPath = join(String(caseItem.evidence_dir), "result.json"); + if (!existsSync(resultPath)) return null; + try { + const parsed = JSON.parse(readFileSync(resultPath, "utf8")) as Record; + if ( + parsed.case_id !== caseItem.id || + parsed.run_id !== caseItem.run_id || + typeof parsed.status !== "string" + ) return null; + const exitStatus = exitStatusFromResultStatus(parsed.status); + return { + status: executionStatusFromExitStatus(exitStatus), + exit_status: exitStatus, + reason: typeof parsed.reason === "string" ? parsed.reason : "result.json completed", + result_status: parsed.status, + result_json: resultPath, + }; + } catch { + return null; + } +} + function executionProblemStatus(executions: Array>): string { const statuses = executions.map((item) => String(item.status)); if (statuses.includes("nonzero")) return "fail"; @@ -523,12 +558,18 @@ export function commandSuiteRun(ctx: CommandContext): number { encoding: "utf8", stdio: options.json === true ? "pipe" : "inherit", }); - const status = result.error ? 1 : result.status ?? 1; + const fileExecution = result.error ? executionFromCaseResultFile(caseItem) : null; + const status = typeof fileExecution?.exit_status === "number" + ? fileExecution.exit_status + : result.error ? 1 : result.status ?? 1; executions.push({ id: caseItem.id, - status: status === 0 ? "ok" : "nonzero", + status: fileExecution?.status ?? executionStatusFromExitStatus(status), exit_status: status, - reason: result.error?.message || "", + reason: fileExecution?.reason ?? 
result.error?.message ?? "", + result_status: fileExecution?.result_status, + result_json: fileExecution?.result_json, + spawn_error: fileExecution && result.error ? result.error.message : undefined, stdout: outputTail(result.stdout), stderr: outputTail(result.stderr), }); diff --git a/skills/src/commands/test.ts b/skills/src/commands/test.ts index 2cce7a1e5..67ddc3122 100644 --- a/skills/src/commands/test.ts +++ b/skills/src/commands/test.ts @@ -271,7 +271,7 @@ function reportTemplate(mode: string): Record { target_tested: "Probe target, endpoint, file, command, or service actually checked", execution_path: "automation script | shell command | direct API | other", probe_result: "What the probe observed", - logs_or_artifacts: "Log, filesystem, API, or other artifact paths collected", + metrics_or_artifacts: "Metrics, logs, filesystem artifacts, traces, or profiles collected", diagnostics: "Extra diagnostics used, if any", matched_troubleshooting: "Troubleshooting ids matched, if any", assets_to_update: "New case/reference/troubleshooting entries to add", @@ -320,7 +320,7 @@ function manualEvidenceTemplate(mode: string): ManualEvidenceTemplate { target_tested: "TODO: probe target, endpoint, file, command, or service actually checked", execution_path: "TODO: automation script | shell command | direct API | other", probe_result: "TODO: observed probe result", - logs_or_artifacts: "TODO: evidence paths or skipped reason", + metrics_or_artifacts: "TODO: metrics, logs, filesystem artifacts, traces, or profiles collected", diagnostics: "TODO: additional diagnostics used, if any", matched_troubleshooting: "TODO: troubleshooting ids matched, if any", assets_to_update: "TODO: case/reference/troubleshooting updates to make", @@ -1099,6 +1099,41 @@ function executionTail(value: string | Buffer | null | undefined): string { return String(value ?? 
"").trim().slice(-4000); } +function exitStatusFromResultStatus(status: string): number { + if (status === "pass") return 0; + if (status === "blocked" || status === "env_issue" || status === "flaky") return 2; + return 1; +} + +function executionStatusFromExitStatus(status: number): string { + if (status === 0) return "ok"; + if (status === 2) return "classified"; + return "nonzero"; +} + +function executionFromAutomationResultFile( + evidenceDir: string, + caseId: string, + runId: string, +): { status: string; exit_status: number; reason: string; result_status: string; path: string } | null { + const resultPath = join(evidenceDir, "automation-result.json"); + if (!existsSync(resultPath)) return null; + try { + const parsed = JSON.parse(readFileSync(resultPath, "utf8")) as Record; + if (parsed.case_id !== caseId || parsed.run_id !== runId || typeof parsed.status !== "string") return null; + const exitStatus = exitStatusFromResultStatus(parsed.status); + return { + status: executionStatusFromExitStatus(exitStatus), + exit_status: exitStatus, + reason: typeof parsed.reason === "string" ? 
parsed.reason : "automation-result.json completed", + result_status: parsed.status, + path: resultPath, + }; + } catch { + return null; + } +} + function runSetupAutomation( ctx: CommandContext, item: StructuredItem, @@ -1224,6 +1259,30 @@ export function commandTestRun(ctx: CommandContext): number { }); if (result.error) { + const fileExecution = executionFromAutomationResultFile( + run.automation.evidence_dir, + String(run.case.id), + run.run_id, + ); + if (fileExecution) { + if (options.json !== true) { + console.error(`WARN: automation spawn reported an error, but ${fileExecution.path} completed: ${result.error.message}`); + } + if (options.json === true) { + console.log(JSON.stringify({ + run, + setup_executions: setupExecutions, + automation_execution: { + ...fileExecution, + spawn_error: result.error.message, + stdout: executionTail(result.stdout), + stderr: executionTail(result.stderr), + }, + exit_status: fileExecution.exit_status, + }, null, 2)); + } + return fileExecution.exit_status; + } if (options.json !== true) console.error(`ERROR: failed to run automation: ${result.error.message}`); if (options.json === true) { console.log(JSON.stringify({ @@ -1247,7 +1306,7 @@ export function commandTestRun(ctx: CommandContext): number { run, setup_executions: setupExecutions, automation_execution: { - status: status === 0 ? 
"ok" : "nonzero", + status: executionStatusFromExitStatus(status), exit_status: status, stdout: executionTail(result.stdout), stderr: executionTail(result.stderr), @@ -1311,6 +1370,7 @@ function renderMarkdownReport(report: TestReport): string { const environment = report.environment; const logGuard = report.log_guard; const troubleshooting = report.troubleshooting; + const automation = report.automation_result; const lines: string[] = []; lines.push(`# Test Report: ${reportCase.id}`); @@ -1323,20 +1383,41 @@ function renderMarkdownReport(report: TestReport): string { lines.push(`Type: ${reportCase.type}`); lines.push(""); lines.push("## Result"); - lines.push(`- result: ${evidence.result}`); - for (const [key, value] of Object.entries(evidence)) { - if (key !== "result") lines.push(`- ${key}: ${value}`); + if (automation.status === "loaded" && automation.result) { + lines.push(`- result: ${automation.result}`); + if (automation.reason) lines.push(`- reason: ${automation.reason}`); + if (automation.url) lines.push(`- target_tested: ${automation.url}`); + if (automation.path) lines.push(`- automation_result: ${automation.path}`); + if (automation.artifacts) lines.push(`- artifacts: ${JSON.stringify(automation.artifacts)}`); + } else { + lines.push(`- result: ${evidence.result}`); + for (const [key, value] of Object.entries(evidence)) { + if (key !== "result") lines.push(`- ${key}: ${value}`); + } } lines.push(""); lines.push("## Automation Result"); - lines.push(`- status: ${report.automation_result.status}`); - if (report.automation_result.path) lines.push(`- path: ${report.automation_result.path}`); - if (report.automation_result.result) lines.push(`- result: ${report.automation_result.result}`); - if (report.automation_result.reason) lines.push(`- reason: ${report.automation_result.reason}`); - if (report.automation_result.started_at_local) lines.push(`- started_at_local: ${report.automation_result.started_at_local}`); - if 
(report.automation_result.finished_at_local) lines.push(`- finished_at_local: ${report.automation_result.finished_at_local}`); - if (report.automation_result.url) lines.push(`- url: ${report.automation_result.url}`); - if (report.automation_result.expected_text) lines.push(`- expected_text: ${report.automation_result.expected_text}`); + lines.push(`- status: ${automation.status}`); + if (automation.path) lines.push(`- path: ${automation.path}`); + if (automation.result) lines.push(`- result: ${automation.result}`); + if (automation.reason) lines.push(`- reason: ${automation.reason}`); + if (automation.duration_ms !== undefined) lines.push(`- duration_ms: ${automation.duration_ms}`); + if (automation.started_at_local) lines.push(`- started_at_local: ${automation.started_at_local}`); + if (automation.finished_at_local) lines.push(`- finished_at_local: ${automation.finished_at_local}`); + if (automation.url) lines.push(`- url: ${automation.url}`); + if (automation.expected_text) lines.push(`- expected_text: ${automation.expected_text}`); + if (automation.metrics_summary) { + lines.push("- metrics_summary:"); + lines.push(` ${JSON.stringify(automation.metrics_summary)}`); + } + if (automation.thresholds_summary) { + lines.push("- thresholds_summary:"); + lines.push(` ${JSON.stringify(automation.thresholds_summary)}`); + } + if (automation.artifacts) { + lines.push("- artifacts:"); + lines.push(` ${JSON.stringify(automation.artifacts)}`); + } lines.push(""); lines.push("## Environment"); for (const [key, value] of Object.entries(environment)) lines.push(`- ${key}=${value}`); diff --git a/skills/src/commands/validate.ts b/skills/src/commands/validate.ts index 8b15d6344..1c0ef945d 100644 --- a/skills/src/commands/validate.ts +++ b/skills/src/commands/validate.ts @@ -126,6 +126,9 @@ function validateCaseItem(root: string, item: StructuredItem, skillNames: Set ( validateSetupAutomationEntry(root, entry, caseIds).map((error) => `${item.path}: ${error}`) )), diff --git 
a/skills/src/constants.ts b/skills/src/constants.ts index 015a9bd39..5cfe37f8a 100644 --- a/skills/src/constants.ts +++ b/skills/src/constants.ts @@ -9,7 +9,18 @@ export const requiredEnvKeys = [ ]; export const caseModeValues = ["agent-browser", "probe"]; -export const caseTypeValues = ["smoke", "regression", "feature", "provider", "exploratory"]; +export const caseTypeValues = [ + "smoke", + "regression", + "feature", + "provider", + "exploratory", + "contract", + "performance", + "reliability", + "chaos", + "security", +]; export const casePriorityValues = ["p0", "p1", "p2"]; export const caseRiskValues = ["low", "medium", "high"]; export const caseEvidenceValues = [ @@ -21,10 +32,24 @@ export const caseEvidenceValues = [ "frontend_log", "api_diagnostic", "filesystem", + "metrics", + "trace", + "profile", + "resource_log", ]; export const testResultStatusValues = ["pass", "fail", "blocked", "env_issue", "flaky"]; export const troubleshootingCategoryValues = ["product", "env_issue", "external_dependency", "blocked", "flaky"]; -export const suiteTypeValues = ["smoke", "regression", "release_gate", "exploratory"]; +export const suiteTypeValues = [ + "smoke", + "regression", + "release_gate", + "exploratory", + "contract", + "performance", + "reliability", + "chaos", + "security", +]; export const suiteRequiredStrings = ["id", "title", "description", "type", "priority"]; export const suiteRequiredLists = ["tags", "cases"]; diff --git a/skills/src/log-guard.ts b/skills/src/log-guard.ts index 253cb229e..6f7f541a7 100644 --- a/skills/src/log-guard.ts +++ b/skills/src/log-guard.ts @@ -91,6 +91,7 @@ export type AutomationResultEvidence = { path?: string; result?: string; reason?: string; + duration_ms?: number; started_at?: string; started_at_local?: string; finished_at?: string; @@ -98,6 +99,9 @@ export type AutomationResultEvidence = { url?: string; prompt?: string; expected_text?: string; + metrics_summary?: Record; + thresholds_summary?: Record; + artifacts?: Record; 
}; type MutableScanState = { @@ -594,6 +598,18 @@ function stringField(data: Record, key: string): string | undef return typeof value === "string" && value.trim() ? value : undefined; } +function numberField(data: Record, key: string): number | undefined { + const value = data[key]; + return typeof value === "number" && Number.isFinite(value) ? value : undefined; +} + +function objectField(data: Record, key: string): Record | undefined { + const value = data[key]; + return value && typeof value === "object" && !Array.isArray(value) + ? value as Record + : undefined; +} + function evidenceDirFromOptions(options: Record): string | undefined { const explicit = typeof options["evidence-dir"] === "string" ? options["evidence-dir"] : undefined; if (explicit) return resolve(explicit); @@ -628,6 +644,7 @@ export function readAutomationResultEvidence(options: Record { } }); +test("suite run preserves classified env_issue automation results", () => { + const tmp = mkdtempSync(join(tmpdir(), "lbs-suite-run-env-issue-")); + try { + const skillDir = join(tmp, "skills", "langbot-testing"); + const casesDir = join(skillDir, "cases"); + const suitesDir = join(skillDir, "suites"); + const scriptsDir = join(tmp, "scripts"); + mkdirSync(casesDir, { recursive: true }); + mkdirSync(suitesDir, { recursive: true }); + mkdirSync(scriptsDir, { recursive: true }); + writeFileSync(join(skillDir, "SKILL.md"), "---\nname: langbot-testing\ndescription: Testing.\n---\n\n# Testing\n"); + writeFileSync(join(tmp, "skills", ".env"), ""); + writeFileSync( + join(casesDir, "env-case.yaml"), + [ + "id: env-case", + "title: Env Case", + "mode: probe", + "area: qa", + "type: smoke", + "priority: p2", + "risk: low", + "ci_eligible: true", + "automation: scripts/env-issue.mjs", + "evidence_required:", + " - filesystem", + ].join("\n"), + ); + writeFileSync( + join(suitesDir, "mini.yaml"), + [ + "id: mini", + "title: Mini", + "description: Mini suite.", + "type: smoke", + "priority: p2", + "tags:", + " - 
qa", + "cases:", + " - env-case", + ].join("\n"), + ); + writeFileSync( + join(scriptsDir, "env-issue.mjs"), + [ + "import { mkdirSync, writeFileSync } from 'node:fs';", + "import { join } from 'node:path';", + "mkdirSync(process.env.LBS_EVIDENCE_DIR, { recursive: true });", + "const result = {", + " case_id: process.env.LBS_CASE_ID,", + " run_id: process.env.LBS_RUN_ID,", + " status: 'env_issue',", + " reason: 'backend not reachable',", + " evidence_collected: ['filesystem']", + "};", + "writeFileSync(join(process.env.LBS_EVIDENCE_DIR, 'result.json'), JSON.stringify(result));", + "writeFileSync(join(process.env.LBS_EVIDENCE_DIR, 'automation-result.json'), JSON.stringify({ ...result, source: 'automation' }));", + "process.exit(2);", + ].join("\n"), + ); + + const result = capture(() => commandSuiteRun({ + root: tmp, + args: ["suite", "run", "mini", "--run-id", "mini-run", "--evidence-dir", join(tmp, "evidence"), "--json"], + })); + + assert.equal(result.code, 2); + const payload = JSON.parse(result.output); + assert.equal(payload.executions[0].status, "classified"); + assert.equal(payload.report.status, "env_issue"); + assert.equal(payload.report.execution_status, "ok"); + } finally { + rmSync(tmp, { recursive: true, force: true }); + } +}); + test("suite run failure cannot be masked by stale pass result", () => { const tmp = mkdtempSync(join(tmpdir(), "lbs-suite-run-stale-pass-")); try { @@ -1369,6 +1445,56 @@ test("env doctor does not require proxy variables", async () => { } }); +test("env doctor reports missing socksio for active SOCKS proxy", async () => { + const tmp = mkdtempSync(join(tmpdir(), "lbs-env-doctor-socksio-")); + const originalAllProxy = process.env.ALL_PROXY; + const originalAllProxyLower = process.env.all_proxy; + try { + delete process.env.ALL_PROXY; + delete process.env.all_proxy; + const skillsDir = join(tmp, "skills"); + const repoDir = join(tmp, "LangBot"); + const webDir = join(repoDir, "web"); + const venvBin = join(repoDir, ".venv", 
"bin"); + const browserProfile = join(tmp, "browser-profile"); + const chromium = join(tmp, "chromium"); + mkdirSync(skillsDir, { recursive: true }); + mkdirSync(webDir, { recursive: true }); + mkdirSync(venvBin, { recursive: true }); + mkdirSync(browserProfile, { recursive: true }); + writeFileSync(chromium, ""); + const python = join(venvBin, "python"); + writeFileSync(python, "#!/bin/sh\nexit 1\n"); + chmodSync(python, 0o755); + writeFileSync( + join(skillsDir, ".env"), + [ + "LANGBOT_BACKEND_URL=http://127.0.0.1:59996", + "LANGBOT_FRONTEND_URL=http://127.0.0.1:59996", + "LANGBOT_DEV_FRONTEND_URL=http://127.0.0.1:59996", + `LANGBOT_REPO=${repoDir}`, + `LANGBOT_WEB_REPO=${webDir}`, + `LANGBOT_BROWSER_PROFILE=${browserProfile}`, + `LANGBOT_CHROMIUM_EXECUTABLE=${chromium}`, + "ALL_PROXY=socks5://127.0.0.1:7890", + ].join("\n"), + ); + + const result = await captureAsync(() => commandEnvDoctor({ root: tmp, args: ["env", "doctor"] })); + + assert.equal(result.code, 1); + assert.match(result.output, /FAIL: SOCKS proxy ALL_PROXY is configured/); + assert.match(result.output, /cannot import socksio/); + assert.match(result.output, /-m pip install socksio/); + } finally { + if (originalAllProxy === undefined) delete process.env.ALL_PROXY; + else process.env.ALL_PROXY = originalAllProxy; + if (originalAllProxyLower === undefined) delete process.env.all_proxy; + else process.env.all_proxy = originalAllProxyLower; + rmSync(tmp, { recursive: true, force: true }); + } +}); + test("env show redacts secret-like values by default", () => { const tmp = mkdtempSync(join(tmpdir(), "lbs-env-show-redact-")); try { @@ -2521,6 +2647,38 @@ test("test report renders a reusable evidence template", () => { assert.match(result.output, /no log files provided/); }); +test("test report promotes loaded automation evidence into result section", () => { + const tmp = mkdtempSync(join(tmpdir(), "lbs-report-automation-")); + try { + writeFileSync( + join(tmp, "automation-result.json"), + 
JSON.stringify({ + status: "pass", + reason: "latency thresholds passed", + url: "http://127.0.0.1:5300", + artifacts: { metrics_json: join(tmp, "metrics.json") }, + }), + ); + + const result = capture(() => commandTestReport(ctx([ + "test", + "report", + "langbot-live-backend-latency", + "--evidence-dir", + tmp, + "--no-auto-log", + ]))); + + assert.equal(result.code, 0); + assert.match(result.output, /## Result\n- result: pass\n- reason: latency thresholds passed/); + assert.match(result.output, /- target_tested: http:\/\/127\.0\.0\.1:5300/); + assert.doesNotMatch(result.output, /target_tested: TODO/); + assert.match(result.output, /## Automation Result/); + } finally { + rmSync(tmp, { recursive: true, force: true }); + } +}); + test("validate rejects dangling case references and missing automation scripts", () => { const tmp = mkdtempSync(join(tmpdir(), "lbs-validate-strict-")); try {