From 5b2826fa49d0a5062f0a41b9ea01d2c52cb28a67 Mon Sep 17 00:00:00 2001 From: huanghuoguoguo <1051233107@qq.com> Date: Thu, 25 Jun 2026 13:02:44 +0000 Subject: [PATCH] Add performance and reliability QA gates (#2283) * Add performance and reliability QA gates * test(skills): prepare user path performance gate * test(skills): add debug chat load gate * test(skills): extend fake provider load profiles * test(skills): add debug chat timing and isolation probes * test(skills): clarify manual QA perf gates --- .gitignore | 1 + skills/README.md | 3 +- skills/docs/user-guide.md | 171 ++++ skills/schemas/case.schema.json | 111 ++- skills/schemas/suite.schema.json | 12 +- .../ensure-fake-provider-cross-pipelines.mjs | 205 +++++ .../e2e/ensure-fake-provider-pipeline.mjs | 635 +++++++++++++ .../e2e/ensure-local-agent-pipeline.mjs | 325 ++++++- skills/scripts/e2e/fake-openai-provider.mjs | 496 ++++++++++ skills/scripts/e2e/lib/langbot-e2e.mjs | 3 +- skills/scripts/e2e/pipeline-debug-chat.mjs | 80 +- skills/skills.index.json | 476 ++++++++++ skills/skills/.env.example | 17 + .../references/service-startup.md | 6 +- skills/skills/langbot-testing/SKILL.md | 3 + ...r-debug-chat-cross-pipeline-isolation.yaml | 84 ++ ...ke-provider-debug-chat-fault-recovery.yaml | 95 ++ ...langbot-fake-provider-debug-chat-load.yaml | 81 ++ ...ot-fake-provider-debug-chat-slow-load.yaml | 88 ++ .../langbot-fault-taxonomy-contract.yaml | 35 + .../cases/langbot-live-backend-latency.yaml | 42 + .../langbot-live-backend-log-health.yaml | 45 + .../cases/langbot-live-control-plane-api.yaml | 44 + .../langbot-overhead-accounting-contract.yaml | 37 + ...ot-space-debug-chat-concurrency-smoke.yaml | 84 ++ .../pipeline-debug-chat-performance.yaml | 80 ++ .../plugins/qa-plugin-smoke/.gitignore | 4 +- .../dist/qa-plugin-smoke-0.1.0.lbpkg | Bin 0 -> 5160 bytes .../probes/langbot-debug-chat-concurrency.mjs | 837 +++++++++++++++++ ...ot-debug-chat-cross-pipeline-isolation.mjs | 861 ++++++++++++++++++ 
.../langbot-fault-taxonomy-contract.mjs | 159 ++++ .../probes/langbot-live-backend-latency.mjs | 212 +++++ .../langbot-live-backend-log-health.mjs | 205 +++++ .../probes/langbot-live-control-plane-api.mjs | 311 +++++++ .../langbot-overhead-accounting-contract.mjs | 162 ++++ .../probes/lib/fake-provider-timing.mjs | 134 +++ .../performance-reliability-testing.md | 285 ++++++ .../langbot-debug-chat-isolation-gate.yaml | 13 + .../suites/langbot-debug-chat-load-gate.yaml | 15 + .../suites/langbot-live-backend-gate.yaml | 14 + .../langbot-performance-contract-gate.yaml | 13 + .../langbot-performance-reliability-gate.yaml | 16 + .../langbot-user-path-performance-gate.yaml | 12 + .../telemetry-proxy-noise.yaml | 23 + skills/src/commands/env.ts | 35 + skills/src/commands/suite.ts | 47 +- skills/src/commands/test.ts | 109 ++- skills/src/commands/validate.ts | 55 ++ skills/src/constants.ts | 29 +- skills/src/log-guard.ts | 20 + skills/src/readiness.ts | 26 + skills/test/lbs-cli.test.ts | 160 +++- 52 files changed, 6974 insertions(+), 42 deletions(-) create mode 100644 skills/docs/user-guide.md create mode 100644 skills/scripts/e2e/ensure-fake-provider-cross-pipelines.mjs create mode 100644 skills/scripts/e2e/ensure-fake-provider-pipeline.mjs create mode 100644 skills/scripts/e2e/fake-openai-provider.mjs create mode 100644 skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-cross-pipeline-isolation.yaml create mode 100644 skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-fault-recovery.yaml create mode 100644 skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-load.yaml create mode 100644 skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-slow-load.yaml create mode 100644 skills/skills/langbot-testing/cases/langbot-fault-taxonomy-contract.yaml create mode 100644 skills/skills/langbot-testing/cases/langbot-live-backend-latency.yaml create mode 100644 
skills/skills/langbot-testing/cases/langbot-live-backend-log-health.yaml create mode 100644 skills/skills/langbot-testing/cases/langbot-live-control-plane-api.yaml create mode 100644 skills/skills/langbot-testing/cases/langbot-overhead-accounting-contract.yaml create mode 100644 skills/skills/langbot-testing/cases/langbot-space-debug-chat-concurrency-smoke.yaml create mode 100644 skills/skills/langbot-testing/cases/pipeline-debug-chat-performance.yaml create mode 100644 skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/dist/qa-plugin-smoke-0.1.0.lbpkg create mode 100644 skills/skills/langbot-testing/probes/langbot-debug-chat-concurrency.mjs create mode 100644 skills/skills/langbot-testing/probes/langbot-debug-chat-cross-pipeline-isolation.mjs create mode 100644 skills/skills/langbot-testing/probes/langbot-fault-taxonomy-contract.mjs create mode 100644 skills/skills/langbot-testing/probes/langbot-live-backend-latency.mjs create mode 100644 skills/skills/langbot-testing/probes/langbot-live-backend-log-health.mjs create mode 100644 skills/skills/langbot-testing/probes/langbot-live-control-plane-api.mjs create mode 100644 skills/skills/langbot-testing/probes/langbot-overhead-accounting-contract.mjs create mode 100644 skills/skills/langbot-testing/probes/lib/fake-provider-timing.mjs create mode 100644 skills/skills/langbot-testing/references/performance-reliability-testing.md create mode 100644 skills/skills/langbot-testing/suites/langbot-debug-chat-isolation-gate.yaml create mode 100644 skills/skills/langbot-testing/suites/langbot-debug-chat-load-gate.yaml create mode 100644 skills/skills/langbot-testing/suites/langbot-live-backend-gate.yaml create mode 100644 skills/skills/langbot-testing/suites/langbot-performance-contract-gate.yaml create mode 100644 skills/skills/langbot-testing/suites/langbot-performance-reliability-gate.yaml create mode 100644 skills/skills/langbot-testing/suites/langbot-user-path-performance-gate.yaml create mode 100644 
skills/skills/langbot-testing/troubleshooting/telemetry-proxy-noise.yaml diff --git a/.gitignore b/.gitignore index d0fe6acb6..97a64ba81 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,7 @@ coverage.xml .coverage src/langbot/web/ testsdk/ +.qa/ # Build artifacts /dist diff --git a/skills/README.md b/skills/README.md index f45b52859..091e3d3b1 100644 --- a/skills/README.md +++ b/skills/README.md @@ -26,7 +26,7 @@ and LangBot's own Local Agent) working with the LangBot ecosystem. ## Quick start (for an AI agent) -1. Read this README, `AGENTS.md`, and `qa-agent-docs/` to understand the layout. +1. Read this README, `AGENTS.md`, and `docs/user-guide.md` to understand the layout. 2. Read `skills/.env` for shared local defaults. On a new machine, copy `skills/.env.example` to `skills/.env.local` (gitignored) and override machine-specific values there. Never commit secrets. @@ -48,6 +48,7 @@ bin/lbs env show # inspect resolved env defaults (redacted) bin/lbs env doctor # diagnose local environment readiness bin/lbs case list --ready bin/lbs test plan +bin/lbs suite plan langbot-debug-chat-load-gate ``` ## Maintenance rule diff --git a/skills/docs/user-guide.md b/skills/docs/user-guide.md new file mode 100644 index 000000000..124d3af36 --- /dev/null +++ b/skills/docs/user-guide.md @@ -0,0 +1,171 @@ +# LangBot QA Skills User Guide + +Use this guide as the first operational path after reading `README.md` and +`AGENTS.md`. + +## 1. Configure Local Inputs + +Read `skills/.env`, then create `skills/.env.local` for machine-local values. +Do not commit `.env.local`, browser profiles, reports, tokens, API keys, OAuth +state, or provider credentials. 
+ +Minimum local fields for live browser QA: + +```bash +LANGBOT_REPO=/path/to/LangBot +LANGBOT_WEB_REPO=/path/to/LangBot/web +LANGBOT_BACKEND_URL=http://127.0.0.1:5300 +LANGBOT_FRONTEND_URL=http://127.0.0.1:3000 +LANGBOT_DEV_FRONTEND_URL=http://127.0.0.1:3000 +LANGBOT_BROWSER_PROFILE=/path/to/langbot-browser-profile +LANGBOT_CHROMIUM_EXECUTABLE=/path/to/chromium-or-playwright-chrome +LANGBOT_E2E_LOGIN_USER=qa-local@example.com +``` + +`LANGBOT_E2E_LOGIN_USER` is a local QA account. The setup automation uses the +LangBot recovery key from the active checkout to initialize or refresh that +local account and write a browser `localStorage` token. It does not need the +user's GitHub or Space credentials. + +## 2. Check Readiness + +From `skills/`: + +```bash +bin/lbs env show +bin/lbs env doctor +bin/lbs validate +bin/lbs index --check +``` + +`env doctor` should report reachable backend and frontend URLs before live +browser cases are run. Missing Space provider credentials are not a LangBot +product pass; classify them as `env_issue` and configure the local Space +provider before measuring Debug Chat performance. + +## 3. Start Services + +Start the backend from `LANGBOT_REPO`: + +```bash +cd "$LANGBOT_REPO" +uv run main.py +``` + +Start the standalone frontend from `LANGBOT_WEB_REPO` and point it at the +backend: + +```bash +cd "$LANGBOT_WEB_REPO" +VITE_API_BASE_URL="$LANGBOT_BACKEND_URL" pnpm dev --host 0.0.0.0 +``` + +If `VITE_API_BASE_URL` is missing, browser tests can load the Vite page but send +API requests to the frontend port, which produces false UI failures. + +## 4. 
Prepare User-Path Fixtures + +For local-agent Debug Chat cases and the user-path performance gate: + +```bash +node scripts/e2e/ensure-local-agent-pipeline.mjs --write-env +``` + +The script: + +- refreshes the local QA login and browser token; +- marks the local wizard as skipped; +- creates or updates a local QA pipeline; +- scans Space LLM models, tests candidates, and switches to the first working + Space model with tested fallback models; +- writes `LANGBOT_PIPELINE_URL`, `LANGBOT_PIPELINE_NAME`, and local-agent + pipeline/model variables into `skills/.env.local`; +- returns `env_issue` when no Space model can be scanned or tested. + +Useful model controls: + +```bash +LANGBOT_E2E_MODEL_TEST_LIMIT=8 +LANGBOT_E2E_MODEL_FALLBACK_COUNT=3 +LANGBOT_E2E_SKIP_MODEL_UUIDS=uuid-a,uuid-b +LANGBOT_E2E_SKIP_MODEL_NAMES=model-a,model-b +LANGBOT_E2E_SCAN_SPACE_MODELS=true +``` + +The setup writes a current-runtime compatibility `max-round` value into the +pipeline config because this backend still reads that field directly during +message truncation. Do not treat it as a long-term QA contract. + +## 5. 
Run Gates + +Fast contract gate, no live service required: + +```bash +bin/lbs suite run langbot-performance-contract-gate --run-id langbot-contract-local +``` + +Live backend gate: + +```bash +bin/lbs suite run langbot-live-backend-gate --run-id langbot-backend-local +``` + +Browser-visible user-path performance gate: + +```bash +bin/lbs suite plan langbot-user-path-performance-gate +bin/lbs suite run langbot-user-path-performance-gate --run-id langbot-user-path-local --include-manual-check +``` + +Controlled Debug Chat message-path load gate (manual/non-required; run fake-provider cases serially when they share `LANGBOT_FAKE_PROVIDER_URL`): + +```bash +bin/lbs suite plan langbot-debug-chat-load-gate +bin/lbs test run langbot-fake-provider-debug-chat-load --run-id langbot-fake-load-local +bin/lbs test run langbot-fake-provider-debug-chat-slow-load --run-id langbot-fake-slow-local +bin/lbs test run langbot-fake-provider-debug-chat-fault-recovery --run-id langbot-fake-fault-local +bin/lbs test run langbot-space-debug-chat-concurrency-smoke --run-id langbot-space-smoke-local +``` + +Cross-pipeline Debug Chat isolation is a separate manual regression gate because +current releases may fail it due to product bug #2286: + +```bash +bin/lbs suite plan langbot-debug-chat-isolation-gate +bin/lbs suite run langbot-debug-chat-isolation-gate --run-id langbot-debug-chat-isolation-local --include-manual-check +``` + +Start with `langbot-fake-provider-debug-chat-load`. It launches a local +OpenAI-compatible fake provider, creates the matching provider/model/pipeline, +then sends concurrent WebSocket Debug Chat messages through the real backend. +Use `langbot-fake-provider-debug-chat-slow-load` to measure the same path under +deterministic streaming latency. Use +`langbot-fake-provider-debug-chat-fault-recovery` to inject bounded provider +HTTP failures and confirm later Debug Chat requests recover. 
Use the separate +`langbot-debug-chat-isolation-gate` to verify that concurrent Debug Chat traffic +on two pipelines does not leak assistant responses across pipeline boundaries; +current releases may fail that gate because of #2286, so keep it out of the +normal load gate until the product fix lands. +Use `langbot-space-debug-chat-concurrency-smoke` only as a low-volume live +provider smoke; it includes Space/model/network latency and should be compared +against the fake-provider baseline before attributing failures to LangBot. + +`manual_check` means the agent must confirm the declared preconditions for that +run window. When setup automation is declared, run output may stop early with +`env_issue`; fix that environment input before treating the product path as +measured. + +## 6. Read Results + +Suite reports live under `skills/reports/`. Evidence lives under +`skills/reports/evidence/<run-id>/`. + +For performance cases, inspect: + +- `metrics.json` for p50/p95/p99, error rate, and total duration; +- `automation-result.json` for threshold decisions and artifacts; +- `console.log` and `network.log` for frontend/API failures; +- backend logs for provider, runner, WebSocket, or persistence failures. + +Do not call a user-path performance result a LangBot overhead regression until +provider/tool/network time has been separated or ruled out. 
diff --git a/skills/schemas/case.schema.json b/skills/schemas/case.schema.json index f6365c062..46601142a 100644 --- a/skills/schemas/case.schema.json +++ b/skills/schemas/case.schema.json @@ -48,7 +48,18 @@ }, "type": { "type": "string", - "enum": ["smoke", "regression", "feature", "provider", "exploratory"] + "enum": [ + "smoke", + "regression", + "feature", + "provider", + "exploratory", + "contract", + "performance", + "reliability", + "chaos", + "security" + ] }, "priority": { "type": "string", @@ -102,7 +113,11 @@ "backend_log", "frontend_log", "api_diagnostic", - "filesystem" + "filesystem", + "metrics", + "trace", + "profile", + "resource_log" ] }, "minItems": 1 @@ -188,9 +203,101 @@ "type": "string", "enum": ["person", "group"] }, + "automation_debug_chat_response_p95_ms": { + "type": "string" + }, + "automation_debug_chat_max_error_rate": { + "type": "string" + }, + "automation_debug_chat_load_requests": { + "type": "string" + }, + "automation_debug_chat_load_concurrency": { + "type": "string" + }, + "automation_debug_chat_load_timeout_ms": { + "type": "string" + }, + "automation_debug_chat_load_response_p95_ms": { + "type": "string" + }, + "automation_debug_chat_load_first_response_p95_ms": { + "type": "string" + }, + "automation_debug_chat_load_max_error_rate": { + "type": "string" + }, + "automation_debug_chat_load_min_error_rate": { + "type": "string" + }, + "automation_debug_chat_load_min_error_count": { + "type": "string" + }, + "automation_debug_chat_load_min_ok_count": { + "type": "string" + }, + "automation_debug_chat_load_min_provider_fault_count": { + "type": "string" + }, + "automation_debug_chat_load_expected_prefix": { + "type": "string" + }, + "automation_debug_chat_load_prompt_template": { + "type": "string" + }, + "automation_debug_chat_load_stream": { + "type": "string", + "enum": ["0", "1", "false", "true"] + }, + "automation_debug_chat_load_reset": { + "type": "string", + "enum": ["0", "1", "false", "true"] + }, + 
"automation_debug_chat_load_fail_on_final_mismatch": { + "type": "string", + "enum": ["0", "1", "false", "true"] + }, + "automation_fake_provider_response_text": { + "type": "string" + }, + "automation_fake_provider_first_token_delay_ms": { + "type": "string" + }, + "automation_fake_provider_chunk_delay_ms": { + "type": "string" + }, + "automation_fake_provider_chunk_count": { + "type": "string" + }, + "automation_fake_provider_fail_first_n": { + "type": "string" + }, + "automation_fake_provider_fail_every_n": { + "type": "string" + }, + "automation_fake_provider_fault_status": { + "type": "string" + }, + "automation_fake_provider_fail_after_first_chunk": { + "type": "string", + "enum": ["0", "1", "false", "true"] + }, + "automation_fake_provider_dynamic_response": { + "type": "string", + "enum": ["0", "1", "false", "true"] + }, "automation_filesystem_checks_json": { "type": "string" }, + "metrics_thresholds_json": { + "type": "string" + }, + "load_profile_json": { + "type": "string" + }, + "fault_model_json": { + "type": "string" + }, "automation_pipeline_url_env": { "type": "string", "pattern": "^[A-Z][A-Z0-9_]*$" diff --git a/skills/schemas/suite.schema.json b/skills/schemas/suite.schema.json index 3da1a3e85..4f3fa7c7a 100644 --- a/skills/schemas/suite.schema.json +++ b/skills/schemas/suite.schema.json @@ -18,7 +18,17 @@ }, "type": { "type": "string", - "enum": ["smoke", "regression", "release_gate", "exploratory"] + "enum": [ + "smoke", + "regression", + "release_gate", + "exploratory", + "contract", + "performance", + "reliability", + "chaos", + "security" + ] }, "priority": { "type": "string", diff --git a/skills/scripts/e2e/ensure-fake-provider-cross-pipelines.mjs b/skills/scripts/e2e/ensure-fake-provider-cross-pipelines.mjs new file mode 100644 index 000000000..592a7b7f9 --- /dev/null +++ b/skills/scripts/e2e/ensure-fake-provider-cross-pipelines.mjs @@ -0,0 +1,205 @@ +#!/usr/bin/env node + +import { spawn } from "node:child_process"; +import { mkdir, 
readFile, writeFile } from "node:fs/promises"; +import { dirname, resolve } from "node:path"; +import { env } from "node:process"; +import { + appendLine, + ensureEvidence, + evidencePaths, + loadEnvFiles, + redact, + writeResult, +} from "./lib/langbot-e2e.mjs"; + +const caseId = "ensure-fake-provider-cross-pipelines"; +const DEFAULT_PIPELINE_A_NAME = "LangBot QA Fake Provider Debug Chat A"; +const DEFAULT_PIPELINE_B_NAME = "LangBot QA Fake Provider Debug Chat B"; + +await loadEnvFiles(); +const paths = evidencePaths(caseId); +await ensureEvidence(paths); + +const writeEnv = process.argv.includes("--write-env"); +const envLocalPath = resolve("skills/.env.local"); +const pipelineAName = env.LANGBOT_FAKE_PROVIDER_PIPELINE_A_NAME || DEFAULT_PIPELINE_A_NAME; +const pipelineBName = env.LANGBOT_FAKE_PROVIDER_PIPELINE_B_NAME || DEFAULT_PIPELINE_B_NAME; + +const result = { + source: "setup_automation", + case_id: caseId, + run_id: paths.runId, + status: "fail", + reason: "", + pipeline_a: { + name: pipelineAName, + id: "", + url: "", + }, + pipeline_b: { + name: pipelineBName, + id: "", + url: "", + }, + fake_provider: { + url: "", + base_url: "", + pid: null, + }, + wrote_env: false, + evidence: { + console_log: paths.consoleLog, + automation_result_json: paths.automationResultJson, + result_json: paths.resultJson, + }, + evidence_collected: ["api_diagnostic", "filesystem"], +}; + +try { + console.error(`[langbot-qa] configuring cross-pipeline QA fixtures: pipeline_a=\"${pipelineAName}\", pipeline_b=\"${pipelineBName}\"`); + console.error("[langbot-qa] run these fake-provider setup/probe commands serially when they share LANGBOT_FAKE_PROVIDER_URL."); + if (pipelineAName === pipelineBName) { + throw new Error("LANGBOT_FAKE_PROVIDER_PIPELINE_A_NAME and LANGBOT_FAKE_PROVIDER_PIPELINE_B_NAME must be different."); + } + + const setupA = await runPipelineSetup(pipelineAName, "A"); + const setupB = await runPipelineSetup(pipelineBName, "B"); + result.pipeline_a = { + name: 
setupA.pipeline_name || pipelineAName, + id: setupA.pipeline_id || "", + url: setupA.pipeline_url || "", + }; + result.pipeline_b = { + name: setupB.pipeline_name || pipelineBName, + id: setupB.pipeline_id || "", + url: setupB.pipeline_url || "", + }; + result.fake_provider = { + url: setupB.fake_provider?.url || setupA.fake_provider?.url || "", + base_url: setupB.fake_provider?.base_url || setupA.fake_provider?.base_url || "", + pid: setupB.fake_provider?.pid ?? setupA.fake_provider?.pid ?? null, + }; + + if (!result.pipeline_a.url || !result.pipeline_b.url || !result.fake_provider.url) { + throw new Error("Cross-pipeline fake provider setup did not return both pipeline URLs and provider URL."); + } + + if (writeEnv) { + await upsertEnvLocal(envLocalPath, { + LANGBOT_FAKE_PROVIDER_URL: result.fake_provider.url, + LANGBOT_FAKE_PROVIDER_BASE_URL: result.fake_provider.base_url, + LANGBOT_FAKE_PROVIDER_PID: result.fake_provider.pid ? String(result.fake_provider.pid) : "", + LANGBOT_FAKE_PROVIDER_PIPELINE_A_URL: result.pipeline_a.url, + LANGBOT_FAKE_PROVIDER_PIPELINE_A_NAME: result.pipeline_a.name, + LANGBOT_FAKE_PROVIDER_PIPELINE_B_URL: result.pipeline_b.url, + LANGBOT_FAKE_PROVIDER_PIPELINE_B_NAME: result.pipeline_b.name, + }); + result.wrote_env = true; + } + + result.status = "pass"; + result.reason = "Fake provider cross-pipeline fixtures are configured."; +} catch (error) { + result.status = looksLikeEnvIssue(error) ? "env_issue" : "fail"; + result.reason = safeReason(error.message); +} finally { + await writeResult(paths, result); + console.log(JSON.stringify(result, null, 2)); +} + +process.exit(result.status === "pass" ? 0 : result.status === "env_issue" ? 
/**
 * Run `scripts/e2e/ensure-fake-provider-pipeline.mjs` in a child Node process
 * for one pipeline and resolve with the JSON result object it prints on stdout.
 *
 * Fault-injection variables are forced off ("0"/"false") and dynamic responses
 * are forced on for the child; delay/chunk/fault-status values fall back to
 * defaults when the parent environment does not set them.
 *
 * @param {string} pipelineName - Value passed as LANGBOT_FAKE_PROVIDER_PIPELINE_NAME.
 * @param {string} label - Short tag ("A"/"B") used in log lines and error text.
 * @returns {Promise<object>} Parsed setup result whose `status` is "pass".
 * @throws {Error} Rejects when the child cannot be spawned, exits non-zero, or
 *   reports a non-pass status. The error carries `setupStatus` (the child's
 *   reported status, or one derived from its exit code) so an `env_issue`
 *   from the child is not downgraded to a plain failure.
 */
function runPipelineSetup(pipelineName, label) {
  return new Promise((resolvePromise, rejectPromise) => {
    const child = spawn(process.execPath, ["scripts/e2e/ensure-fake-provider-pipeline.mjs"], {
      cwd: resolve("."),
      env: {
        ...env,
        LANGBOT_FAKE_PROVIDER_PIPELINE_NAME: pipelineName,
        LANGBOT_FAKE_PROVIDER_FIRST_TOKEN_DELAY_MS: env.LANGBOT_FAKE_PROVIDER_FIRST_TOKEN_DELAY_MS || "25",
        LANGBOT_FAKE_PROVIDER_CHUNK_DELAY_MS: env.LANGBOT_FAKE_PROVIDER_CHUNK_DELAY_MS || "10",
        LANGBOT_FAKE_PROVIDER_CHUNK_COUNT: env.LANGBOT_FAKE_PROVIDER_CHUNK_COUNT || "0",
        LANGBOT_FAKE_PROVIDER_FAIL_FIRST_N: "0",
        LANGBOT_FAKE_PROVIDER_FAIL_EVERY_N: "0",
        LANGBOT_FAKE_PROVIDER_FAULT_STATUS: env.LANGBOT_FAKE_PROVIDER_FAULT_STATUS || "500",
        LANGBOT_FAKE_PROVIDER_FAIL_AFTER_FIRST_CHUNK: "false",
        LANGBOT_FAKE_PROVIDER_DYNAMIC_RESPONSE: "true",
      },
      stdio: ["ignore", "pipe", "pipe"],
    });

    // Decode at the stream level so a multi-byte UTF-8 character that is split
    // across two chunks cannot corrupt the JSON result.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");

    let stdout = "";
    let stderr = "";
    child.stdout.on("data", (text) => {
      stdout += text;
      // Evidence logging is best-effort; a log write failure must not fail setup.
      appendLine(paths.consoleLog, `[setup ${label} stdout] ${text.trimEnd()}`).catch(() => {});
    });
    child.stderr.on("data", (text) => {
      stderr += text;
      appendLine(paths.consoleLog, `[setup ${label} stderr] ${text.trimEnd()}`).catch(() => {});
    });
    child.on("error", rejectPromise);
    child.on("close", (code) => {
      const parsed = parseJsonOutput(stdout);
      if (code !== 0 || parsed.status !== "pass") {
        const error = new Error(parsed.reason || stderr || `Fake provider pipeline setup ${label} exited with ${code}.`);
        // The child script exits with 2 for env_issue; keep that classification.
        error.setupStatus = parsed.status || (code === 2 ? "env_issue" : "fail");
        rejectPromise(error);
        return;
      }
      resolvePromise(parsed);
    });
  });
}

/**
 * Extract a JSON object from process output.
 *
 * Tries the whole trimmed text first, then the span from the first "{" to the
 * last "}" (so surrounding log lines are tolerated).
 *
 * @param {string} text - Raw stdout text.
 * @returns {object} The parsed value, or `{}` when nothing parseable is found
 *   or the parsed value is not an object (e.g. `null`).
 */
function parseJsonOutput(text) {
  const trimmed = String(text || "").trim();
  if (!trimmed) return {};

  const asObject = (value) => (value !== null && typeof value === "object" ? value : {});
  try {
    return asObject(JSON.parse(trimmed));
  } catch {
    const start = trimmed.indexOf("{");
    const end = trimmed.lastIndexOf("}");
    if (start < 0 || end <= start) return {};
    try {
      return asObject(JSON.parse(trimmed.slice(start, end + 1)));
    } catch {
      return {};
    }
  }
}

/**
 * Insert or replace `KEY=value` lines in an env file, preserving every other
 * line (comments, unrelated keys) in place.
 *
 * @param {string} path - Env file path; parent directories are created.
 * @param {Record<string, string>} updates - Keys to write. Entries whose value
 *   is `undefined` are ignored.
 * @returns {Promise<void>}
 * @throws Propagates read errors other than "file does not exist", so an
 *   unreadable file is never silently replaced with only the new keys.
 */
async function upsertEnvLocal(path, updates) {
  await mkdir(dirname(path), { recursive: true });

  let text = "";
  try {
    text = await readFile(path, "utf8");
  } catch (error) {
    // Only a missing file means "start from empty".
    if (error?.code !== "ENOENT") throw error;
  }

  // One entry per line: a value containing a line break must not inject extra entries.
  const formatEntry = (key) => `${key}=${String(updates[key]).replace(/[\r\n]+/g, " ")}`;

  const lines = text.split(/\r?\n/);
  // Drop trailing blank lines so appended keys follow the existing content
  // directly and a new file does not start with an empty line.
  while (lines.length > 0 && lines[lines.length - 1].trim() === "") lines.pop();

  const seen = new Set();
  const next = lines.map((line) => {
    const match = line.trim().match(/^([A-Z][A-Z0-9_]*)=/);
    if (!match || updates[match[1]] === undefined) return line;
    seen.add(match[1]);
    return formatEntry(match[1]);
  });
  for (const [key, value] of Object.entries(updates)) {
    if (value !== undefined && !seen.has(key)) next.push(formatEntry(key));
  }

  await writeFile(path, `${next.join("\n").replace(/\n+$/, "")}\n`, "utf8");
}

/**
 * Decide whether a setup failure is an environment problem rather than a
 * product failure.
 *
 * @param {unknown} error - Error (or message) raised during setup.
 * @returns {boolean} True when the child setup reported `env_issue`, or when
 *   the message matches a known connectivity/configuration pattern.
 */
function looksLikeEnvIssue(error) {
  if (error?.setupStatus === "env_issue") return true;
  const message = String(error?.message || error || "");
  return /fetch failed|ECONNREFUSED|ENOTFOUND|LANGBOT_.*not configured|Could not read recovery_key|Backend did not respond/i.test(message);
}

/**
 * Redact a failure reason via the shared `redact` helper and cap it at 1000
 * characters before it is written to the result.
 *
 * @param {unknown} value - Raw reason text.
 * @returns {string} Redacted, truncated reason.
 */
function safeReason(value) {
  return redact(String(value || "")).slice(0, 1000);
}
skills QA automation for controlled fake-provider Debug Chat tests. Safe to delete when local QA fixtures are no longer needed."; +const DEFAULT_MODEL_NAME = "gpt-4o-mini"; +const DEFAULT_REQUESTER = "openai-chat-completions"; + +const caseId = "ensure-fake-provider-pipeline"; + +await loadEnvFiles(); +const paths = evidencePaths(caseId); +await ensureEvidence(paths); + +const writeEnv = process.argv.includes("--write-env"); +const frontendUrl = env.LANGBOT_FRONTEND_URL || ""; +const backendUrl = env.LANGBOT_BACKEND_URL || ""; +const envLocalPath = resolve("skills/.env.local"); +const repoRoot = resolve(env.LANGBOT_REPO || ".."); +const fakeStateDir = resolve(env.LANGBOT_FAKE_PROVIDER_STATE_DIR || resolve(repoRoot, ".qa/fake-provider")); +const fakeStatePath = resolve(fakeStateDir, "state.json"); +const fakeStdoutPath = resolve(fakeStateDir, "fake-provider.stdout.log"); +const fakeStderrPath = resolve(fakeStateDir, "fake-provider.stderr.log"); +const pipelineName = env.LANGBOT_FAKE_PROVIDER_PIPELINE_NAME || DEFAULT_PIPELINE_NAME; +const providerName = env.LANGBOT_FAKE_PROVIDER_NAME || DEFAULT_PROVIDER_NAME; +const requester = env.LANGBOT_FAKE_PROVIDER_REQUESTER || DEFAULT_REQUESTER; +const modelName = env.LANGBOT_FAKE_PROVIDER_MODEL_NAME || DEFAULT_MODEL_NAME; + +const result = { + source: "automation", + case_id: caseId, + run_id: paths.runId, + status: "fail", + reason: "", + frontend_url: frontendUrl, + backend_url: backendUrl, + fake_provider: { + url: "", + base_url: "", + pid: null, + reused: false, + config: {}, + state_file: fakeStatePath, + stdout_log: fakeStdoutPath, + stderr_log: fakeStderrPath, + }, + provider: { + uuid: "", + name: providerName, + requester, + created: false, + updated: false, + }, + model: { + uuid: "", + name: modelName, + created: false, + updated: false, + test_status: "not_run", + test_reason: "", + }, + pipeline_id: "", + pipeline_name: pipelineName, + pipeline_url: "", + created: false, + updated: false, + wrote_env: false, + 
evidence: { + console_log: paths.consoleLog, + network_log: paths.networkLog, + automation_result_json: paths.automationResultJson, + result_json: paths.resultJson, + }, + evidence_collected: ["api_diagnostic", "network", "filesystem"], +}; + +try { + console.error(`[langbot-qa] configuring QA-owned fake-provider fixtures: provider=\"${providerName}\", pipeline=\"${pipelineName}\"`); + console.error("[langbot-qa] this setup may create or update local QA provider/model/pipeline resources on the selected backend."); + if (!backendUrl) { + result.status = "env_issue"; + throw new Error("LANGBOT_BACKEND_URL is not configured."); + } + if (!frontendUrl) { + result.status = "env_issue"; + throw new Error("LANGBOT_FRONTEND_URL is not configured."); + } + + const fakeProvider = await ensureFakeProvider(); + const setupConfig = await configureFakeProvider(fakeProvider.url, healthyFakeProviderConfig(), true); + result.fake_provider = { + ...result.fake_provider, + ...fakeProvider, + config: setupConfig.config || healthyFakeProviderConfig(), + }; + + const user = env.LANGBOT_E2E_LOGIN_USER || ""; + const password = env.LANGBOT_E2E_LOGIN_PASSWORD || DEFAULT_LOCAL_PASSWORD; + if (!user) { + result.status = "env_issue"; + throw new Error("LANGBOT_E2E_LOGIN_USER is required so this setup can create/update the fake provider pipeline."); + } + + const auth = await resetAndAuthLocalUser({ backendUrl, user, password }); + const wizard = await skipWizard({ backendUrl, token: auth.token }); + if (wizard.status !== "pass") { + result.status = "fail"; + throw new Error(wizard.reason || "Failed to mark the local QA wizard as skipped."); + } + + const provider = await ensureProvider({ + backendUrl, + token: auth.token, + name: providerName, + requester, + baseUrl: fakeProvider.base_url, + }); + result.provider = provider; + + const model = await ensureModel({ + backendUrl, + token: auth.token, + providerUuid: provider.uuid, + name: modelName, + }); + result.model = model; + + const 
pipeline = await ensurePipeline({ + backendUrl, + token: auth.token, + name: pipelineName, + modelUuid: model.uuid, + }); + Object.assign(result, pipeline); + result.pipeline_url = `${frontendUrl.replace(/\/$/, "")}/home/pipelines?id=${encodeURIComponent(pipeline.pipeline_id)}`; + + const runConfig = await configureFakeProvider(fakeProvider.url, targetFakeProviderConfig(), true); + result.fake_provider.config = runConfig.config || targetFakeProviderConfig(); + + if (writeEnv) { + await upsertEnvLocal(envLocalPath, { + LANGBOT_E2E_LOGIN_USER: user, + LANGBOT_FAKE_PROVIDER_URL: fakeProvider.url, + LANGBOT_FAKE_PROVIDER_BASE_URL: fakeProvider.base_url, + LANGBOT_FAKE_PROVIDER_PID: fakeProvider.pid ? String(fakeProvider.pid) : "", + LANGBOT_FAKE_PROVIDER_PROVIDER_UUID: provider.uuid, + LANGBOT_FAKE_PROVIDER_MODEL_UUID: model.uuid, + LANGBOT_FAKE_PROVIDER_PIPELINE_URL: result.pipeline_url, + LANGBOT_FAKE_PROVIDER_PIPELINE_NAME: pipelineName, + }); + result.wrote_env = true; + } + + result.status = "pass"; + result.reason = `Fake provider pipeline is configured with ${requester}/${modelName}.`; +} catch (error) { + result.status = result.status === "env_issue" ? "env_issue" : "fail"; + result.reason = result.reason || safeReason(error.message); +} finally { + await writeResult(paths, result); + console.log(JSON.stringify(result, null, 2)); +} + +process.exit(result.status === "pass" ? 0 : result.status === "env_issue" ? 
/**
 * Return a running, configurable fake OpenAI-compatible provider, reusing an
 * existing one when possible and otherwise spawning a detached child process.
 *
 * Lookup order: LANGBOT_FAKE_PROVIDER_URL, then the URL recorded in the state
 * file, then a fresh `fake-openai-provider.mjs` process.
 *
 * @returns {Promise<{url: string, base_url: string, pid: number|null, reused: boolean}>}
 * @throws {Error} When a newly spawned provider does not become healthy and
 *   configurable within 10 seconds.
 */
async function ensureFakeProvider() {
  const envUrl = normalizeProviderRootUrl(env.LANGBOT_FAKE_PROVIDER_URL || "");
  if (envUrl && await fakeProviderHealthy(envUrl) && await fakeProviderConfigurable(envUrl)) {
    return {
      url: envUrl,
      base_url: `${envUrl}/v1`,
      pid: null,
      reused: true,
    };
  }

  const state = await readState(fakeStatePath);
  const stateUrl = normalizeProviderRootUrl(state.url || "");
  if (stateUrl && await fakeProviderHealthy(stateUrl)) {
    if (await fakeProviderConfigurable(stateUrl)) {
      return {
        url: stateUrl,
        base_url: state.base_url || `${stateUrl}/v1`,
        pid: Number.isInteger(state.pid) ? state.pid : null,
        reused: true,
      };
    }
    // Healthy but without the config endpoint: stop it before starting a new one.
    if (Number.isInteger(state.pid)) await stopProcess(state.pid);
  }

  await mkdir(fakeStateDir, { recursive: true });
  await writeFile(fakeStatePath, `${JSON.stringify({ status: "starting", started_at: new Date().toISOString() }, null, 2)}\n`, "utf8");

  const scriptPath = resolve("scripts/e2e/fake-openai-provider.mjs");
  const host = env.LANGBOT_FAKE_PROVIDER_HOST || "127.0.0.1";
  const port = env.LANGBOT_FAKE_PROVIDER_PORT || "0";

  let child;
  const stdout = await open(fakeStdoutPath, "a");
  try {
    const stderr = await open(fakeStderrPath, "a");
    try {
      child = spawn(process.execPath, [
        scriptPath,
        `--host=${host}`,
        `--port=${port}`,
        `--state-file=${fakeStatePath}`,
      ], {
        cwd: resolve("."),
        detached: true,
        env: {
          ...env,
          LANGBOT_FAKE_PROVIDER_MODEL_NAME: modelName,
        },
        stdio: ["ignore", stdout.fd, stderr.fd],
      });
      child.unref();
    } finally {
      // The child holds its own copies of the descriptors; always release
      // ours, including when opening or spawning fails.
      await stderr.close();
    }
  } finally {
    await stdout.close();
  }

  const started = await waitForFakeProviderState(fakeStatePath, child.pid, 10_000);
  if (!started.url || !await fakeProviderHealthy(started.url) || !await fakeProviderConfigurable(started.url)) {
    // Do not leave a detached, unusable provider process behind.
    if (Number.isInteger(child.pid)) await stopProcess(child.pid);
    throw new Error(`Fake provider did not become healthy. See ${fakeStderrPath}`);
  }

  return {
    url: started.url,
    base_url: started.base_url || `${started.url}/v1`,
    pid: child.pid ?? started.pid ?? null,
    reused: false,
  };
}

/**
 * POST a new runtime configuration to the fake provider's `/__qa/config` endpoint.
 *
 * @param {string} rootUrl - Provider root URL (a trailing `/v1` is tolerated).
 * @param {object} config - Configuration object sent as `config`.
 * @param {boolean} resetRequestCount - Sent as `reset_request_count`.
 * @returns {Promise<object>} The JSON body returned by the provider.
 * @throws {Error} When the response is not OK or its body lacks `ok: true`.
 */
async function configureFakeProvider(rootUrl, config, resetRequestCount) {
  const response = await fetch(`${normalizeProviderRootUrl(rootUrl)}/__qa/config`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify({
      config,
      reset_request_count: resetRequestCount,
    }),
    signal: AbortSignal.timeout(3000),
  });
  const json = await response.json().catch(() => ({}));
  if (!response.ok || json.ok !== true) {
    throw new Error(`Fake provider config failed with HTTP ${response.status}.`);
  }
  return json;
}

/**
 * Probe `/healthz` on the provider.
 *
 * @param {string} rootUrl - Provider root URL.
 * @returns {Promise<boolean>} True only for an OK response whose JSON has `ok: true`;
 *   any network error or timeout (2 s) yields false.
 */
async function fakeProviderHealthy(rootUrl) {
  try {
    const response = await fetch(`${rootUrl.replace(/\/$/, "")}/healthz`, {
      signal: AbortSignal.timeout(2000),
    });
    if (!response.ok) return false;
    const json = await response.json().catch(() => ({}));
    return json.ok === true;
  } catch {
    return false;
  }
}

/**
 * Probe `GET /__qa/config` to confirm the provider exposes the QA config endpoint.
 *
 * @param {string} rootUrl - Provider root URL.
 * @returns {Promise<boolean>} True only when the response is OK, has `ok: true`,
 *   and carries a non-null `config` object; errors and timeouts yield false.
 */
async function fakeProviderConfigurable(rootUrl) {
  try {
    const response = await fetch(`${rootUrl.replace(/\/$/, "")}/__qa/config`, {
      signal: AbortSignal.timeout(2000),
    });
    if (!response.ok) return false;
    const json = await response.json().catch(() => ({}));
    return json.ok === true && json.config !== null && typeof json.config === "object";
  } catch {
    return false;
  }
}

/**
 * Send SIGTERM to a process and wait 500 ms; a failed signal (for example
 * because the process is already gone) is ignored.
 *
 * @param {number} pid - Process id to stop.
 * @returns {Promise<void>}
 */
async function stopProcess(pid) {
  try {
    process.kill(pid, "SIGTERM");
  } catch {
    return;
  }
  await sleep(500);
}

/**
 * Poll the state file until it contains a `url` (and, when `expectedPid` is
 * given, a matching `pid`) or the timeout elapses.
 *
 * @param {string} path - State file path.
 * @param {number|undefined} expectedPid - Pid the state must report, if known.
 * @param {number} timeoutMs - Maximum time to poll.
 * @returns {Promise<object>} The matching state, or the last state read
 *   (possibly `{}`) on timeout.
 */
async function waitForFakeProviderState(path, expectedPid, timeoutMs) {
  const startedAt = Date.now();
  let lastState = {};
  while (Date.now() - startedAt < timeoutMs) {
    const state = await readState(path);
    if (state.url && (!expectedPid || state.pid === expectedPid)) return state;
    lastState = state;
    await sleep(150);
  }
  return lastState;
}

/**
 * Read and parse the JSON state file.
 *
 * @param {string} path - State file path.
 * @returns {Promise<object>} The parsed object, or `{}` when the file is
 *   missing, unparseable, or does not contain a JSON object (so callers can
 *   read properties without a null check).
 */
async function readState(path) {
  try {
    const parsed = JSON.parse(await readFile(path, "utf8"));
    return parsed !== null && typeof parsed === "object" ? parsed : {};
  } catch {
    return {};
  }
}

/**
 * Normalize a provider URL to its root: trims whitespace, removes trailing
 * slashes, and removes a trailing `/v1` segment.
 *
 * @param {unknown} value - URL text; falsy values yield "".
 * @returns {string} Root URL without trailing slash or `/v1`.
 */
function normalizeProviderRootUrl(value) {
  const withoutSlashes = String(value || "").trim().replace(/\/+$/, "");
  const root = withoutSlashes.endsWith("/v1") ? withoutSlashes.slice(0, -3) : withoutSlashes;
  // Removing "/v1" can expose another trailing slash (e.g. "host//v1").
  return root.replace(/\/+$/, "");
}

/**
 * Fault-free provider configuration applied while the provider, model, and
 * pipeline are being created.
 *
 * @returns {object} Fixed configuration with all fault injection disabled.
 */
function healthyFakeProviderConfig() {
  return {
    response_text: "OK",
    first_token_delay_ms: 25,
    chunk_delay_ms: 10,
    chunk_count: 0,
    fault_status: 500,
    fail_first_n: 0,
    fail_every_n: 0,
    fail_after_first_chunk: false,
    dynamic_response: true,
  };
}

/**
 * Provider configuration for the actual test run, read from the
 * LANGBOT_FAKE_PROVIDER_* environment variables with the same defaults as
 * the healthy configuration.
 *
 * @returns {object} Configuration in the same shape as `healthyFakeProviderConfig()`.
 */
function targetFakeProviderConfig() {
  return {
    response_text: env.LANGBOT_FAKE_PROVIDER_RESPONSE_TEXT || "OK",
    first_token_delay_ms: nonNegativeInteger(env.LANGBOT_FAKE_PROVIDER_FIRST_TOKEN_DELAY_MS, 25),
    chunk_delay_ms: nonNegativeInteger(env.LANGBOT_FAKE_PROVIDER_CHUNK_DELAY_MS, 10),
    chunk_count: nonNegativeInteger(env.LANGBOT_FAKE_PROVIDER_CHUNK_COUNT, 0),
    fault_status: httpFaultStatus(env.LANGBOT_FAKE_PROVIDER_FAULT_STATUS, 500),
    fail_first_n: nonNegativeInteger(env.LANGBOT_FAKE_PROVIDER_FAIL_FIRST_N, 0),
    fail_every_n: nonNegativeInteger(env.LANGBOT_FAKE_PROVIDER_FAIL_EVERY_N, 0),
    fail_after_first_chunk: envBool(env.LANGBOT_FAKE_PROVIDER_FAIL_AFTER_FIRST_CHUNK, false),
    dynamic_response: envBool(env.LANGBOT_FAKE_PROVIDER_DYNAMIC_RESPONSE, true),
  };
}
: response.json.msg || "Wizard status update failed.", + }; +} + +async function ensureProvider({ backendUrl, token, name, requester, baseUrl }) { + const list = await apiJson(backendUrl, "/api/v1/provider/providers", { token }); + if (isApiFailure(list)) { + throw new Error(list.json.msg || "Failed to list providers."); + } + const providers = list.json.data?.providers || []; + const existing = providers.find((provider) => ( + provider.name === name + || (provider.requester === requester && String(provider.base_url || "").replace(/\/$/, "") === baseUrl.replace(/\/$/, "")) + )); + const body = { + name, + requester, + base_url: baseUrl, + api_keys: [env.LANGBOT_FAKE_PROVIDER_API_KEY || "langbot-fake-provider-key"], + }; + + if (existing?.uuid) { + const update = await apiJson(backendUrl, `/api/v1/provider/providers/${encodeURIComponent(existing.uuid)}`, { + method: "PUT", + token, + body, + }); + if (isApiFailure(update)) { + throw new Error(update.json.msg || "Failed to update fake provider."); + } + return { + uuid: existing.uuid, + name, + requester, + created: false, + updated: true, + }; + } + + const create = await apiJson(backendUrl, "/api/v1/provider/providers", { + method: "POST", + token, + body, + }); + const uuid = create.json.data?.uuid || ""; + if (isApiFailure(create) || !uuid) { + throw new Error(create.json.msg || "Failed to create fake provider."); + } + return { + uuid, + name, + requester, + created: true, + updated: false, + }; +} + +async function ensureModel({ backendUrl, token, providerUuid, name }) { + const list = await apiJson(backendUrl, `/api/v1/provider/models/llm?provider_uuid=${encodeURIComponent(providerUuid)}`, { token }); + if (isApiFailure(list)) { + throw new Error(list.json.msg || "Failed to list fake provider models."); + } + const models = list.json.data?.models || []; + const existing = models.find((model) => model.name === name); + const body = { + name, + provider_uuid: providerUuid, + abilities: [], + context_length: 
positiveInteger(env.LANGBOT_FAKE_PROVIDER_CONTEXT_LENGTH, 8192), + extra_args: {}, + prefered_ranking: 0, + }; + let modelUuid = existing?.uuid || ""; + let created = false; + let updated = false; + + if (modelUuid) { + const update = await apiJson(backendUrl, `/api/v1/provider/models/llm/${encodeURIComponent(modelUuid)}`, { + method: "PUT", + token, + body, + }); + if (isApiFailure(update)) { + throw new Error(update.json.msg || "Failed to update fake provider model."); + } + updated = true; + } else { + const create = await apiJson(backendUrl, "/api/v1/provider/models/llm", { + method: "POST", + token, + body, + }); + modelUuid = create.json.data?.uuid || ""; + if (isApiFailure(create) || !modelUuid) { + throw new Error(create.json.msg || "Failed to create fake provider model."); + } + created = true; + } + + const test = await apiJson(backendUrl, `/api/v1/provider/models/llm/${encodeURIComponent(modelUuid)}/test`, { + method: "POST", + token, + body: { extra_args: {} }, + }); + if (isApiFailure(test)) { + throw new Error(safeReason(test.json.msg || test.json.message || "Fake provider model test failed.")); + } + + return { + uuid: modelUuid, + name, + created, + updated, + test_status: "pass", + test_reason: "", + }; +} + +async function ensurePipeline({ backendUrl, token, name, modelUuid }) { + const list = await apiJson(backendUrl, "/api/v1/pipelines", { token }); + if (isApiFailure(list)) { + throw new Error(list.json.msg || "Failed to list pipelines."); + } + const pipelines = list.json.data?.pipelines || []; + let pipeline = pipelines.find((item) => item.name === name) || null; + let created = false; + + if (!pipeline) { + const create = await apiJson(backendUrl, "/api/v1/pipelines", { + method: "POST", + token, + body: { + name, + description: QA_RESOURCE_DESCRIPTION, + emoji: "QA", + }, + }); + const pipelineId = create.json.data?.uuid || ""; + if (isApiFailure(create) || !pipelineId) { + throw new Error(create.json.msg || "Failed to create fake provider 
pipeline."); + } + created = true; + pipeline = { uuid: pipelineId }; + } + + const loaded = await apiJson(backendUrl, `/api/v1/pipelines/${encodeURIComponent(pipeline.uuid)}`, { token }); + pipeline = loaded.json.data?.pipeline || null; + if (isApiFailure(loaded) || !pipeline?.uuid) { + throw new Error(loaded.json.msg || "Failed to load fake provider pipeline."); + } + + const config = pipeline.config && typeof pipeline.config === "object" ? pipeline.config : {}; + const ai = config.ai && typeof config.ai === "object" ? config.ai : {}; + const existingLocalAgentConfig = ai["local-agent"] && typeof ai["local-agent"] === "object" + ? ai["local-agent"] + : {}; + const localAgentConfig = { + timeout: 60, + prompt: [{ role: "system", content: "You are a deterministic QA assistant. Reply exactly as instructed." }], + "remove-think": false, + "knowledge-bases": [], + "box-session-id-template": "{launcher_type}_{launcher_id}", + "retrieval-top-k": 5, + "rerank-model": "", + "rerank-top-k": 5, + "max-tool-iterations": 20, + "tool-execution-mode": "parallel", + "max-tool-result-chars": 20000, + "context-history-fetch-limit": 20, + "context-window-tokens": 8192, + "context-reserve-tokens": 1024, + "context-keep-recent-tokens": 2048, + "context-summary-tokens": 1024, + ...existingLocalAgentConfig, + // Current backend truncation still reads this field directly. + "max-round": positiveInteger(existingLocalAgentConfig["max-round"], 10), + model: { + primary: modelUuid, + fallbacks: [], + }, + }; + const updatedConfig = { + ...config, + ai: { + ...ai, + runner: { + ...(ai.runner && typeof ai.runner === "object" ? 
ai.runner : {}), + id: RUNNER_ID, + runner: RUNNER_ID, + "expire-time": 0, + }, + "local-agent": localAgentConfig, + }, + }; + + const update = await apiJson(backendUrl, `/api/v1/pipelines/${encodeURIComponent(pipeline.uuid)}`, { + method: "PUT", + token, + body: { + name, + description: QA_RESOURCE_DESCRIPTION, + emoji: "QA", + config: updatedConfig, + }, + }); + if (isApiFailure(update)) { + throw new Error(update.json.msg || "Failed to update fake provider pipeline."); + } + + return { + pipeline_id: pipeline.uuid, + pipeline_name: name, + created, + updated: true, + }; +} + +function isApiFailure(response) { + return response.status >= 400 || (response.json.code !== undefined && response.json.code !== 0); +} + +function positiveInteger(value, fallback) { + const parsed = Number(value); + return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback; +} + +function nonNegativeInteger(value, fallback) { + const parsed = Number(value); + return Number.isInteger(parsed) && parsed >= 0 ? parsed : fallback; +} + +function httpFaultStatus(value, fallback) { + const parsed = Number(value); + return Number.isInteger(parsed) && parsed >= 400 && parsed <= 599 ? 
parsed : fallback; +} + +function envBool(value, fallback) { + if (value === undefined || value === "") return fallback; + if (/^(1|true|yes|on)$/i.test(String(value))) return true; + if (/^(0|false|no|off)$/i.test(String(value))) return false; + return fallback; +} + +function sleep(ms) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +function safeReason(value) { + return redact(String(value || "")).slice(0, 1000); +} + +async function upsertEnvLocal(path, updates) { + await mkdir(dirname(path), { recursive: true }); + let text = ""; + try { + text = await readFile(path, "utf8"); + } catch { + text = ""; + } + const lines = text.split(/\r?\n/); + const seen = new Set(); + const next = lines.map((line) => { + const trimmed = line.trim(); + const equals = trimmed.indexOf("="); + if (equals <= 0 || trimmed.startsWith("#")) return line; + const key = trimmed.slice(0, equals).trim(); + if (!(key in updates)) return line; + seen.add(key); + return `${key}=${updates[key]}`; + }); + for (const [key, value] of Object.entries(updates)) { + if (!seen.has(key)) next.push(`${key}=${value}`); + } + await writeFile(path, `${next.filter((line, index) => line !== "" || index < next.length - 1).join("\n")}\n`, "utf8"); +} diff --git a/skills/scripts/e2e/ensure-local-agent-pipeline.mjs b/skills/scripts/e2e/ensure-local-agent-pipeline.mjs index 0962c6bf5..da4336211 100644 --- a/skills/scripts/e2e/ensure-local-agent-pipeline.mjs +++ b/skills/scripts/e2e/ensure-local-agent-pipeline.mjs @@ -10,6 +10,7 @@ import { ensureEvidence, evidencePaths, loadEnvFiles, + redact, resetAndAuthLocalUser, safeScreenshot, setBrowserToken, @@ -17,9 +18,12 @@ import { writeResult, } from "./lib/langbot-e2e.mjs"; -const RUNNER_ID = "plugin:langbot/local-agent/default"; +const RUNNER_ID = "local-agent"; +const SPACE_PROVIDER_UUID = "00000000-0000-0000-0000-000000000000"; const DEFAULT_PIPELINE_NAME = "Agent QA Local Agent Debug Chat"; const DEFAULT_LOCAL_PASSWORD = 
"LangBotE2ELocalPass!2026"; +const DEFAULT_MODEL_TEST_LIMIT = 8; +const DEFAULT_MODEL_FALLBACK_COUNT = 3; const caseId = "ensure-local-agent-pipeline"; await loadEnvFiles(); @@ -45,11 +49,18 @@ const result = { pipeline_url: "", runner_id: RUNNER_ID, selected_model_id: "", + selected_model_name: "", + fallback_model_ids: [], model_count: 0, + space_model_count: 0, + scanned_space_model_count: 0, + tested_model_count: 0, + model_tests: [], created: false, updated: false, wrote_env: false, auth: null, + wizard: null, browser_token_check: null, page_signal: "", evidence: { @@ -71,6 +82,7 @@ try { const user = env.LANGBOT_E2E_LOGIN_USER || ""; const password = env.LANGBOT_E2E_LOGIN_PASSWORD || DEFAULT_LOCAL_PASSWORD; if (!user) { + result.status = "env_issue"; throw new Error("LANGBOT_E2E_LOGIN_USER is required so this setup can create/update the pipeline via backend API."); } @@ -81,6 +93,13 @@ try { backend_token_check: auth.check, }; + const wizard = await skipWizard({ backendUrl, token: auth.token }); + result.wizard = wizard; + if (wizard.status !== "pass") { + result.status = "fail"; + throw new Error(wizard.reason || "Failed to mark the local QA wizard as skipped."); + } + const prepared = await ensureLocalAgentPipeline({ backendUrl, token: auth.token, @@ -99,6 +118,10 @@ try { LANGBOT_PIPELINE_NAME: result.pipeline_name || pipelineName, LANGBOT_LOCAL_AGENT_PIPELINE_URL: result.pipeline_url, LANGBOT_LOCAL_AGENT_PIPELINE_NAME: result.pipeline_name || pipelineName, + ...(result.selected_model_id ? { + LANGBOT_LOCAL_AGENT_MODEL_UUID: result.selected_model_id, + LANGBOT_E2E_MODEL_UUID: result.selected_model_id, + } : {}), }); result.wrote_env = true; } @@ -127,6 +150,21 @@ try { process.exit(result.status === "pass" ? 0 : result.status === "env_issue" ? 
2 : 1); +async function skipWizard({ backendUrl, token }) { + const response = await apiJson(backendUrl, "/api/v1/system/wizard/completed", { + method: "POST", + token, + body: { status: "skipped" }, + }); + const ok = response.status < 400 && response.json.code === 0; + return { + status: ok ? "pass" : "fail", + http_status: response.status, + code: response.json.code ?? null, + reason: ok ? "Wizard marked skipped for local QA." : response.json.msg || "Wizard status update failed.", + }; +} + async function ensureLocalAgentPipeline({ backendUrl, token, pipelineName, runnerId }) { const [pipelineList, modelList] = await Promise.all([ apiJson(backendUrl, "/api/v1/pipelines", { token }), @@ -149,7 +187,19 @@ async function ensureLocalAgentPipeline({ backendUrl, token, pipelineName, runne } const models = modelList.json.data?.models || []; - const selectedModel = models.find((model) => model.uuid) || null; + const skippedModelIds = new Set( + String(env.LANGBOT_E2E_SKIP_MODEL_UUIDS || "") + .split(",") + .map((item) => item.trim()) + .filter(Boolean), + ); + const skippedModelNames = new Set( + String(env.LANGBOT_E2E_SKIP_MODEL_NAMES || "") + .split(",") + .map((item) => item.trim()) + .filter(Boolean), + ); + const spaceModels = models.filter((model) => isSpaceModel(model) && !skippedModelIds.has(model.uuid)); const pipelines = pipelineList.json.data?.pipelines || []; let pipeline = pipelines.find((item) => item.name === pipelineName) || null; let created = false; @@ -170,6 +220,7 @@ async function ensureLocalAgentPipeline({ backendUrl, token, pipelineName, runne reason: createdResponse.json.msg || "Failed to create pipeline.", create_status: createdResponse.status, model_count: models.length, + space_model_count: spaceModels.length, }; } const pipelineId = createdResponse.json.data?.uuid || ""; @@ -183,6 +234,7 @@ async function ensureLocalAgentPipeline({ backendUrl, token, pipelineName, runne status: "fail", reason: "Pipeline was not created or resolved.", 
model_count: models.length, + space_model_count: spaceModels.length, }; } @@ -194,27 +246,37 @@ async function ensureLocalAgentPipeline({ backendUrl, token, pipelineName, runne get_status: loaded.status, pipeline_id: pipeline.uuid, model_count: models.length, + space_model_count: spaceModels.length, }; } pipeline = loaded.json.data.pipeline; const config = pipeline.config && typeof pipeline.config === "object" ? pipeline.config : {}; const ai = config.ai && typeof config.ai === "object" ? config.ai : {}; - const runnerConfig = ai.runner_config && typeof ai.runner_config === "object" ? ai.runner_config : {}; - const rawExistingLocalAgentConfig = runnerConfig[runnerId] && typeof runnerConfig[runnerId] === "object" - ? runnerConfig[runnerId] + const rawExistingLocalAgentConfig = ai["local-agent"] && typeof ai["local-agent"] === "object" + ? ai["local-agent"] : {}; const existingLocalAgentConfig = rawExistingLocalAgentConfig; const existingModel = existingLocalAgentConfig.model && typeof existingLocalAgentConfig.model === "object" ? existingLocalAgentConfig.model : {}; const requestedModelId = env.LANGBOT_LOCAL_AGENT_MODEL_UUID || env.LANGBOT_E2E_MODEL_UUID || ""; - const selectedModelId = requestedModelId || existingModel.primary || selectedModel?.uuid || ""; + const selected = await selectWorkingSpaceModel({ + backendUrl, + token, + models, + skippedModelIds, + skippedModelNames, + requestedModelId, + existingModelId: existingModel.primary || "", + }); + const selectedModelId = selected.selected_model_id || ""; const localAgentConfig = { timeout: 300, prompt: [{ role: "system", content: "You are a helpful assistant." 
}], "remove-think": false, "knowledge-bases": [], + "box-session-id-template": "{launcher_type}_{launcher_id}", "retrieval-top-k": 5, "rerank-model": "", "rerank-top-k": 5, @@ -227,9 +289,11 @@ async function ensureLocalAgentPipeline({ backendUrl, token, pipelineName, runne "context-keep-recent-tokens": 20000, "context-summary-tokens": 8000, ...existingLocalAgentConfig, + // Current backend truncation still reads this field directly. + "max-round": positiveInteger(existingLocalAgentConfig["max-round"], 10), model: { primary: selectedModelId, - fallbacks: requestedModelId ? [] : Array.isArray(existingModel.fallbacks) ? existingModel.fallbacks : [], + fallbacks: selected.fallback_model_ids || [], }, }; const updatedConfig = { @@ -239,12 +303,10 @@ async function ensureLocalAgentPipeline({ backendUrl, token, pipelineName, runne runner: { ...(ai.runner && typeof ai.runner === "object" ? ai.runner : {}), id: runnerId, + runner: runnerId, "expire-time": 0, }, - runner_config: { - ...runnerConfig, - [runnerId]: localAgentConfig, - }, + "local-agent": localAgentConfig, }, }; @@ -265,19 +327,31 @@ async function ensureLocalAgentPipeline({ backendUrl, token, pipelineName, runne update_status: updateResponse.status, pipeline_id: pipeline.uuid, model_count: models.length, + space_model_count: spaceModels.length, + scanned_space_model_count: selected.scanned_space_model_count, + tested_model_count: selected.tested_model_count, + model_tests: selected.model_tests, selected_model_id: selectedModelId, + selected_model_name: selected.selected_model_name, + fallback_model_ids: selected.fallback_model_ids, }; } return { status: selectedModelId ? "pass" : "env_issue", reason: selectedModelId - ? "Local-agent pipeline is configured for Debug Chat." - : "Pipeline was created but no LLM model is configured in this LangBot instance.", + ? 
`Local-agent pipeline is configured for Debug Chat with Space model ${selected.selected_model_name || selectedModelId} and ${selected.fallback_model_ids.length} fallback(s).` + : selected.reason || "No working Space LLM model is configured in this LangBot instance.", pipeline_id: pipeline.uuid, - pipeline_name: pipeline.name, + pipeline_name: pipelineName, model_count: models.length, + space_model_count: spaceModels.length, + scanned_space_model_count: selected.scanned_space_model_count, + tested_model_count: selected.tested_model_count, + model_tests: selected.model_tests, selected_model_id: selectedModelId, + selected_model_name: selected.selected_model_name, + fallback_model_ids: selected.fallback_model_ids, created, updated: true, }; @@ -287,6 +361,229 @@ function isApiFailure(response) { return response.status >= 400 || (response.json.code !== undefined && response.json.code !== 0); } +function isSpaceModel(model) { + const provider = model?.provider && typeof model.provider === "object" ? model.provider : {}; + return model?.provider_uuid === SPACE_PROVIDER_UUID + || provider.uuid === SPACE_PROVIDER_UUID + || provider.requester === "space-chat-completions" + || provider.name === "LangBot Models"; +} + +async function selectWorkingSpaceModel({ + backendUrl, + token, + models, + skippedModelIds, + skippedModelNames, + requestedModelId, + existingModelId, +}) { + const modelTests = []; + const testLimit = positiveInteger(env.LANGBOT_E2E_MODEL_TEST_LIMIT, DEFAULT_MODEL_TEST_LIMIT); + const fallbackCount = positiveInteger(env.LANGBOT_E2E_MODEL_FALLBACK_COUNT, DEFAULT_MODEL_FALLBACK_COUNT); + const workingModels = []; + const spaceModels = rankModels(models.filter((model) => ( + model.uuid + && isSpaceModel(model) + && !skippedModelIds.has(model.uuid) + && !skippedModelNames.has(model.name) + ))); + const requestedModel = requestedModelId + ? spaceModels.find((model) => model.uuid === requestedModelId) || null + : null; + const existingModel = existingModelId + ? 
spaceModels.find((model) => model.uuid === existingModelId) || null + : null; + const candidates = uniqueCandidates([ + ...(requestedModel ? [existingCandidate(requestedModel, "requested")] : []), + ...(existingModel ? [existingCandidate(existingModel, "existing-pipeline")] : []), + ...spaceModels.map((model) => existingCandidate(model, "configured-space")), + ]); + + let scanResult = { status: "skipped", models: [], reason: "" }; + if (env.LANGBOT_E2E_SCAN_SPACE_MODELS !== "false") { + scanResult = await scanSpaceModels({ backendUrl, token }); + if (scanResult.status === "pass") { + const knownNames = new Set(spaceModels.map((model) => model.name)); + candidates.push(...scanResult.models + .filter((model) => model.name && !knownNames.has(model.name) && !skippedModelNames.has(model.name)) + .map((model) => scannedCandidate(model))); + } + } + + const unique = uniqueCandidates(candidates); + for (const candidate of unique.slice(0, testLimit)) { + const test = await ensureAndTestModel({ backendUrl, token, candidate }); + modelTests.push(test); + if (test.status === "pass" && test.model_uuid) { + workingModels.push(test); + if (workingModels.length >= fallbackCount + 1) break; + } + } + + if (workingModels.length > 0) { + const [primary, ...fallbacks] = workingModels; + return { + status: "pass", + reason: "", + selected_model_id: primary.model_uuid, + selected_model_name: primary.model_name, + fallback_model_ids: fallbacks.map((model) => model.model_uuid), + scanned_space_model_count: scanResult.models.length, + tested_model_count: modelTests.length, + model_tests: modelTests, + }; + } + + const baseReason = unique.length === 0 + ? scanResult.reason || "No Space LLM model candidates are available." + : `No working Space LLM model found after testing ${modelTests.length} candidate(s).`; + return { + status: "env_issue", + reason: requestedModelId && !requestedModel + ? 
`Requested Space LLM model ${requestedModelId} is missing or skipped; ${baseReason}` + : baseReason, + selected_model_id: "", + selected_model_name: "", + fallback_model_ids: [], + scanned_space_model_count: scanResult.models.length, + tested_model_count: modelTests.length, + model_tests: modelTests, + }; +} + +async function scanSpaceModels({ backendUrl, token }) { + const response = await apiJson( + backendUrl, + `/api/v1/provider/providers/${encodeURIComponent(SPACE_PROVIDER_UUID)}/scan-models?type=llm`, + { token }, + ); + if (isApiFailure(response)) { + return { + status: "env_issue", + models: [], + reason: safeReason(response.json.msg || response.json.message || "Failed to scan Space LLM models."), + }; + } + return { + status: "pass", + models: response.json.data?.models || [], + reason: "", + }; +} + +async function ensureAndTestModel({ backendUrl, token, candidate }) { + let modelUuid = candidate.uuid || ""; + let created = false; + if (!modelUuid) { + const create = await apiJson(backendUrl, "/api/v1/provider/models/llm", { + method: "POST", + token, + body: { + name: candidate.name, + provider_uuid: SPACE_PROVIDER_UUID, + abilities: candidate.abilities || [], + context_length: candidate.context_length ?? 
null, + extra_args: {}, + prefered_ranking: positiveInteger(candidate.prefered_ranking, 0), + }, + }); + modelUuid = create.json.data?.uuid || ""; + if (isApiFailure(create) || !modelUuid) { + return modelTestResult(candidate, { + status: "fail", + reason: safeReason(create.json.msg || "Failed to create scanned Space model."), + http_status: create.status, + }); + } + created = true; + } + + const test = await apiJson(backendUrl, `/api/v1/provider/models/llm/${encodeURIComponent(modelUuid)}/test`, { + method: "POST", + token, + body: { extra_args: {} }, + }); + const passed = !isApiFailure(test); + if (!passed && created) { + await apiJson(backendUrl, `/api/v1/provider/models/llm/${encodeURIComponent(modelUuid)}`, { + method: "DELETE", + token, + }).catch(() => {}); + } + return modelTestResult(candidate, { + status: passed ? "pass" : "fail", + reason: passed ? "" : safeReason(test.json.msg || test.json.message || "Space model test failed."), + http_status: test.status, + model_uuid: modelUuid, + created, + }); +} + +function modelTestResult(candidate, details) { + return { + source: candidate.source, + model_uuid: details.model_uuid || candidate.uuid || "", + model_name: candidate.name, + status: details.status, + reason: details.reason || "", + http_status: details.http_status ?? null, + created: Boolean(details.created), + }; +} + +function existingCandidate(model, source) { + return { + source, + uuid: model.uuid, + name: model.name, + abilities: model.abilities || [], + context_length: model.context_length, + prefered_ranking: model.prefered_ranking, + }; +} + +function scannedCandidate(model) { + return { + source: "scanned-space", + uuid: "", + name: model.name || model.id, + abilities: model.abilities || [], + context_length: model.context_length, + prefered_ranking: model.prefered_ranking, + }; +} + +function uniqueCandidates(candidates) { + const seen = new Set(); + const result = []; + for (const candidate of candidates) { + const key = candidate.uuid ? 
`uuid:${candidate.uuid}` : `name:${candidate.name}`; + if (!candidate.name || seen.has(key)) continue; + seen.add(key); + result.push(candidate); + } + return result; +} + +function rankModels(models) { + return [...models].sort((left, right) => { + const leftRank = Number.isFinite(Number(left.prefered_ranking)) ? Number(left.prefered_ranking) : 9999; + const rightRank = Number.isFinite(Number(right.prefered_ranking)) ? Number(right.prefered_ranking) : 9999; + if (leftRank !== rightRank) return leftRank - rightRank; + return String(left.name || "").localeCompare(String(right.name || "")); + }); +} + +function positiveInteger(value, fallback) { + const parsed = Number(value); + return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback; +} + +function safeReason(value) { + return redact(String(value || "")).slice(0, 1000); +} + async function upsertEnvLocal(path, updates) { let text = ""; try { diff --git a/skills/scripts/e2e/fake-openai-provider.mjs b/skills/scripts/e2e/fake-openai-provider.mjs new file mode 100644 index 000000000..1cca9c46b --- /dev/null +++ b/skills/scripts/e2e/fake-openai-provider.mjs @@ -0,0 +1,496 @@ +#!/usr/bin/env node + +import { createServer } from "node:http"; +import { mkdir, writeFile } from "node:fs/promises"; +import { dirname, resolve } from "node:path"; +import { env, exit } from "node:process"; + +const args = parseArgs(process.argv.slice(2)); +const host = args.host || env.LANGBOT_FAKE_PROVIDER_HOST || "127.0.0.1"; +const port = integer(args.port ?? 
env.LANGBOT_FAKE_PROVIDER_PORT, 0); +const stateFile = args["state-file"] || env.LANGBOT_FAKE_PROVIDER_STATE_FILE || ""; +const modelName = env.LANGBOT_FAKE_PROVIDER_MODEL_NAME || "gpt-4o-mini"; +const config = { + response_text: env.LANGBOT_FAKE_PROVIDER_RESPONSE_TEXT || "OK", + first_token_delay_ms: integer(env.LANGBOT_FAKE_PROVIDER_FIRST_TOKEN_DELAY_MS, 25), + chunk_delay_ms: integer(env.LANGBOT_FAKE_PROVIDER_CHUNK_DELAY_MS, 10), + chunk_count: integer(env.LANGBOT_FAKE_PROVIDER_CHUNK_COUNT, 0), + fault_status: integer(env.LANGBOT_FAKE_PROVIDER_FAULT_STATUS, 500), + fail_first_n: integer(env.LANGBOT_FAKE_PROVIDER_FAIL_FIRST_N, 0), + fail_every_n: integer(env.LANGBOT_FAKE_PROVIDER_FAIL_EVERY_N, 0), + fail_after_first_chunk: bool(env.LANGBOT_FAKE_PROVIDER_FAIL_AFTER_FIRST_CHUNK, false), + dynamic_response: !/^(0|false|no|off)$/i.test(env.LANGBOT_FAKE_PROVIDER_DYNAMIC_RESPONSE || ""), + request_log_limit: integer(env.LANGBOT_FAKE_PROVIDER_REQUEST_LOG_LIMIT, 500), +}; + +let requestCount = 0; +const recentRequests = []; + +const server = createServer(async (request, response) => { + const startedAt = Date.now(); + const startedPerf = performance.now(); + let requestRecord = null; + const url = new URL(request.url || "/", `http://${request.headers.host || `${host}:${port}`}`); + try { + if (request.method === "GET" && url.pathname === "/healthz") { + sendJson(response, 200, { + ok: true, + model: modelName, + config, + request_count: requestCount, + recent_request_count: recentRequests.length, + }); + return; + } + + if (request.method === "GET" && url.pathname === "/__qa/config") { + sendJson(response, 200, { + ok: true, + model: modelName, + config, + request_count: requestCount, + recent_requests: recentRequests, + }); + return; + } + + if (request.method === "POST" && url.pathname === "/__qa/config") { + const body = await readJson(request); + applyConfig(body.config && typeof body.config === "object" ? 
body.config : body); + if (body.reset_request_count !== false) resetRequestState(); + sendJson(response, 200, { + ok: true, + model: modelName, + config, + request_count: requestCount, + }); + return; + } + + if (request.method === "POST" && url.pathname === "/__qa/reset") { + resetRequestState(); + sendJson(response, 200, { + ok: true, + model: modelName, + config, + request_count: requestCount, + }); + return; + } + + if (request.method === "GET" && ["/models", "/v1/models"].includes(url.pathname)) { + sendJson(response, 200, { + object: "list", + data: [ + { + id: modelName, + object: "model", + created: 1, + owned_by: "langbot-qa", + type: "llm", + }, + ], + }); + return; + } + + if (request.method === "POST" && ["/chat/completions", "/v1/chat/completions"].includes(url.pathname)) { + requestCount += 1; + const body = await readJson(request); + const requestId = `chatcmpl-langbot-fake-${requestCount}`; + const shouldFail = requestCount <= config.fail_first_n + || (config.fail_every_n > 0 && requestCount % config.fail_every_n === 0); + const replyText = responseTextForBody(body); + requestRecord = recordRequest({ + id: requestId, + request_number: requestCount, + path: url.pathname, + stream: Boolean(body.stream), + model: body.model || "", + message_count: Array.isArray(body.messages) ? 
body.messages.length : 0, + should_fail: shouldFail, + status: "running", + http_status: null, + expected_text: replyText, + response_text_preview: previewText(replyText), + started_at: new Date(startedAt).toISOString(), + started_epoch_ms: startedAt, + configured_first_token_delay_ms: config.first_token_delay_ms, + configured_chunk_delay_ms: config.chunk_delay_ms, + configured_chunk_count: config.chunk_count, + }); + + if (shouldFail) { + await sleep(config.first_token_delay_ms); + sendJson(response, config.fault_status, { + error: { + message: `LangBot fake provider injected HTTP ${config.fault_status}`, + type: "fake_provider_fault", + code: "fake_provider_fault", + }, + }); + finishRequestRecord(requestRecord, startedPerf, { + status: "http_fault", + http_status: config.fault_status, + }); + return; + } + + if (body.stream) { + await streamCompletion(response, { + requestId, + model: body.model || modelName, + content: replyText, + failAfterFirstChunk: config.fail_after_first_chunk, + requestRecord, + startedPerf, + }); + } else { + await sleep(config.first_token_delay_ms + config.chunk_delay_ms); + sendJson(response, 200, completionPayload({ + requestId, + model: body.model || modelName, + content: replyText, + })); + markRequestTiming(requestRecord, "first_chunk", startedPerf); + markRequestTiming(requestRecord, "first_content_chunk", startedPerf); + requestRecord.content_chunk_count = 1; + finishRequestRecord(requestRecord, startedPerf, { + status: "ok", + http_status: 200, + }); + } + return; + } + + sendJson(response, 404, { + error: { + message: `No fake provider route for ${request.method} ${url.pathname}`, + type: "not_found", + }, + }); + } catch (error) { + if (requestRecord) { + finishRequestRecord(requestRecord, startedPerf, { + status: "fake_provider_error", + http_status: 500, + error: error instanceof Error ? error.message : String(error), + }); + } + sendJson(response, 500, { + error: { + message: error instanceof Error ? 
error.message : String(error), + type: "fake_provider_error", + }, + }); + } finally { + const durationMs = Date.now() - startedAt; + if (url.pathname !== "/healthz") { + console.log(JSON.stringify({ + at: new Date().toISOString(), + method: request.method, + path: url.pathname, + duration_ms: durationMs, + })); + } + } +}); + +server.listen(port, host, async () => { + const address = server.address(); + const selectedPort = typeof address === "object" && address ? address.port : port; + const url = `http://${host}:${selectedPort}`; + const state = { + status: "ready", + pid: process.pid, + url, + base_url: `${url}/v1`, + model: modelName, + started_at: new Date().toISOString(), + }; + if (stateFile) { + const path = resolve(stateFile); + await mkdir(dirname(path), { recursive: true }); + await writeFile(path, `${JSON.stringify(state, null, 2)}\n`, "utf8"); + } + console.log(JSON.stringify(state)); +}); + +server.on("error", (error) => { + console.error(JSON.stringify({ + status: "error", + reason: error instanceof Error ? error.message : String(error), + })); + exit(1); +}); + +process.on("SIGTERM", () => { + server.close(() => exit(0)); +}); + +function parseArgs(argv) { + const result = {}; + for (const item of argv) { + const match = item.match(/^--([^=]+)(?:=(.*))?$/); + if (!match) continue; + result[match[1]] = match[2] ?? "1"; + } + return result; +} + +function integer(value, fallback) { + const parsed = Number.parseInt(String(value ?? ""), 10); + return Number.isFinite(parsed) && parsed >= 0 ? 
parsed : fallback; +} + +function bool(value, fallback) { + if (value === undefined || value === "") return fallback; + if (/^(1|true|yes|on)$/i.test(String(value))) return true; + if (/^(0|false|no|off)$/i.test(String(value))) return false; + return fallback; +} + +function sleep(ms) { + return new Promise((resolve) => setTimeout(resolve, Math.max(0, ms))); +} + +async function readJson(request) { + let text = ""; + for await (const chunk of request) text += chunk.toString(); + if (!text) return {}; + return JSON.parse(text); +} + +function sendJson(response, status, payload) { + const text = `${JSON.stringify(payload)}\n`; + response.writeHead(status, { + "content-type": "application/json", + "content-length": Buffer.byteLength(text), + }); + response.end(text); +} + +function completionPayload({ requestId, model, content }) { + const completionTokens = tokenEstimate(content); + return { + id: requestId, + object: "chat.completion", + created: Math.floor(Date.now() / 1000), + model, + choices: [ + { + index: 0, + message: { + role: "assistant", + content, + }, + finish_reason: "stop", + }, + ], + usage: { + prompt_tokens: 8, + completion_tokens: completionTokens, + total_tokens: 8 + completionTokens, + }, + }; +} + +async function streamCompletion(response, { + requestId, + model, + content, + failAfterFirstChunk: failMidStream, + requestRecord, + startedPerf, +}) { + response.writeHead(200, { + "content-type": "text/event-stream; charset=utf-8", + "cache-control": "no-cache", + "connection": "keep-alive", + }); + + await sleep(config.first_token_delay_ms); + markRequestTiming(requestRecord, "first_chunk", startedPerf); + writeSse(response, { + id: requestId, + object: "chat.completion.chunk", + created: Math.floor(Date.now() / 1000), + model, + choices: [{ index: 0, delta: { role: "assistant" }, finish_reason: null }], + }); + + const chunks = splitContent(content); + for (let index = 0; index < chunks.length; index += 1) { + await sleep(config.chunk_delay_ms); 
+ if (index === 0) markRequestTiming(requestRecord, "first_content_chunk", startedPerf); + requestRecord.content_chunk_count = (requestRecord.content_chunk_count || 0) + 1; + writeSse(response, { + id: requestId, + object: "chat.completion.chunk", + created: Math.floor(Date.now() / 1000), + model, + choices: [{ index: 0, delta: { content: chunks[index] }, finish_reason: null }], + }); + if (failMidStream && index === 0) { + finishRequestRecord(requestRecord, startedPerf, { + status: "mid_stream_disconnect", + http_status: 200, + }); + response.destroy(new Error("LangBot fake provider injected mid-stream disconnect")); + return; + } + } + + await sleep(config.chunk_delay_ms); + const completionTokens = tokenEstimate(content); + writeSse(response, { + id: requestId, + object: "chat.completion.chunk", + created: Math.floor(Date.now() / 1000), + model, + choices: [{ index: 0, delta: {}, finish_reason: "stop" }], + usage: { + prompt_tokens: 8, + completion_tokens: completionTokens, + total_tokens: 8 + completionTokens, + }, + }); + response.write("data: [DONE]\n\n"); + response.end(); + finishRequestRecord(requestRecord, startedPerf, { + status: "ok", + http_status: 200, + }); +} + +function writeSse(response, payload) { + response.write(`data: ${JSON.stringify(payload)}\n\n`); +} + +function splitContent(content) { + const text = String(content); + const requested = config.chunk_count; + if (requested <= 1 || text.length <= 1) return [text]; + const chunkSize = Math.max(1, Math.ceil(text.length / requested)); + const chunks = []; + for (let index = 0; index < text.length; index += chunkSize) { + chunks.push(text.slice(index, index + chunkSize)); + } + return chunks; +} + +function tokenEstimate(content) { + return Math.max(1, Math.ceil(String(content || "").length / 4)); +} + +function responseTextForBody(body) { + if (!config.dynamic_response) { + return config.response_text; + } + const messages = Array.isArray(body.messages) ? 
body.messages : []; + const lastUser = [...messages].reverse().find((message) => message?.role === "user"); + const text = flattenContent(lastUser?.content || ""); + const quoted = text.match(/["'“”](.{1,80}?)["'“”]/); + if (quoted?.[1]) return quoted[1].trim(); + const exact = text.match(/(?:reply|回复|输出|return)\s+(?:exactly\s+)?([A-Za-z0-9_.:@-]{1,80})/i); + if (exact?.[1]) return exact[1].trim().replace(/[。.!?]+$/, ""); + const only = text.match(/只回复\s*([A-Za-z0-9_.:@-]{1,80})/); + if (only?.[1]) return only[1].trim().replace(/[。.!?]+$/, ""); + return config.response_text; +} + +function flattenContent(content) { + if (typeof content === "string") return content; + if (Array.isArray(content)) { + return content + .map((item) => { + if (typeof item === "string") return item; + if (item && typeof item === "object") return item.text || ""; + return ""; + }) + .join("\n"); + } + return ""; +} + +function recordRequest(entry) { + const item = { + ...entry, + at: new Date().toISOString(), + finished_at: null, + finished_epoch_ms: null, + duration_ms: null, + first_chunk_at: null, + first_chunk_epoch_ms: null, + first_chunk_ms: null, + first_content_chunk_at: null, + first_content_chunk_epoch_ms: null, + first_content_chunk_ms: null, + content_chunk_count: 0, + }; + recentRequests.push(item); + while (recentRequests.length > config.request_log_limit) recentRequests.shift(); + return item; +} + +function markRequestTiming(entry, key, startedPerf) { + if (!entry || entry[`${key}_at`]) return; + const now = Date.now(); + entry[`${key}_at`] = new Date(now).toISOString(); + entry[`${key}_epoch_ms`] = now; + entry[`${key}_ms`] = rounded(performance.now() - startedPerf); +} + +function finishRequestRecord(entry, startedPerf, updates = {}) { + if (!entry || entry.finished_at) return; + const now = Date.now(); + Object.assign(entry, updates); + entry.finished_at = new Date(now).toISOString(); + entry.finished_epoch_ms = now; + entry.duration_ms = rounded(performance.now() - 
startedPerf); +} + +function rounded(value) { + return Number(value.toFixed(3)); +} + +function previewText(value) { + return String(value || "").slice(0, 120); +} + +function resetRequestState() { + requestCount = 0; + recentRequests.length = 0; +} + +function applyConfig(updates) { + if (!updates || typeof updates !== "object") return; + assignString(updates, "response_text"); + assignNonNegativeInteger(updates, "first_token_delay_ms"); + assignNonNegativeInteger(updates, "chunk_delay_ms"); + assignNonNegativeInteger(updates, "chunk_count"); + assignNonNegativeInteger(updates, "fail_first_n"); + assignNonNegativeInteger(updates, "fail_every_n"); + assignNonNegativeInteger(updates, "request_log_limit"); + if (updates.fault_status !== undefined) { + const parsed = Number.parseInt(String(updates.fault_status), 10); + if (Number.isInteger(parsed) && parsed >= 400 && parsed <= 599) config.fault_status = parsed; + } + assignBoolean(updates, "fail_after_first_chunk"); + assignBoolean(updates, "dynamic_response"); +} + +function assignString(updates, key) { + if (updates[key] !== undefined) config[key] = String(updates[key]); +} + +function assignNonNegativeInteger(updates, key) { + if (updates[key] === undefined) return; + const parsed = Number.parseInt(String(updates[key]), 10); + if (Number.isInteger(parsed) && parsed >= 0) config[key] = parsed; +} + +function assignBoolean(updates, key) { + if (updates[key] === undefined) return; + config[key] = bool(updates[key], config[key]); +} diff --git a/skills/scripts/e2e/lib/langbot-e2e.mjs b/skills/scripts/e2e/lib/langbot-e2e.mjs index fc7a52e4f..a7584c904 100644 --- a/skills/scripts/e2e/lib/langbot-e2e.mjs +++ b/skills/scripts/e2e/lib/langbot-e2e.mjs @@ -72,6 +72,7 @@ export async function writeResult(paths, result) { } export async function loadEnvFiles(paths = ["skills/.env", "skills/.env.local"]) { + const processEnvKeys = new Set(Object.keys(env)); for (const path of paths) { let text = ""; try { @@ -86,7 +87,7 @@ 
export async function loadEnvFiles(paths = ["skills/.env", "skills/.env.local"]) if (equals <= 0) continue; const key = trimmed.slice(0, equals).trim(); const value = trimmed.slice(equals + 1).trim().replace(/^["']|["']$/g, ""); - if (!(key in env)) env[key] = value; + if (!processEnvKeys.has(key)) env[key] = value; } } } diff --git a/skills/scripts/e2e/pipeline-debug-chat.mjs b/skills/scripts/e2e/pipeline-debug-chat.mjs index 87fe9ae79..4b20f7757 100755 --- a/skills/scripts/e2e/pipeline-debug-chat.mjs +++ b/skills/scripts/e2e/pipeline-debug-chat.mjs @@ -54,6 +54,7 @@ const debugChatSessionType = env.LANGBOT_E2E_DEBUG_CHAT_SESSION_TYPE || "person" const pipelineConfigDiagnosticPath = resolve(paths.evidenceDir, "pipeline-config-diagnostic.json"); const debugChatResetDiagnosticPath = resolve(paths.evidenceDir, "debug-chat-reset-diagnostic.json"); const pipelineConfigRestoreDiagnosticPath = resolve(paths.evidenceDir, "pipeline-config-restore-diagnostic.json"); +const metricsPath = resolve(paths.evidenceDir, "metrics.json"); const startedAt = new Date(); let browser; @@ -80,10 +81,11 @@ let result = { console_log: paths.consoleLog, network_log: paths.networkLog, screenshot: paths.screenshot, + metrics_json: metricsPath, automation_result_json: paths.automationResultJson, result_json: paths.resultJson, }, - evidence_collected: ["ui", "screenshot", "console", "network"], + evidence_collected: ["ui", "screenshot", "console", "network", "metrics"], }; function boolFromEnv(value, defaultValue) { @@ -103,6 +105,29 @@ function parseJsonEnv(key, fallback) { } } +function positiveNumberEnv(key, fallback) { + const value = Number(env[key] || ""); + return Number.isFinite(value) && value >= 0 ? 
value : fallback; +} + +function percentile(values, percentileValue) { + if (values.length === 0) return 0; + const sorted = [...values].sort((a, b) => a - b); + const index = Math.min(sorted.length - 1, Math.ceil((percentileValue / 100) * sorted.length) - 1); + return Number(sorted[index].toFixed(3)); +} + +function stats(values) { + if (values.length === 0) return { min: 0, p50: 0, p95: 0, p99: 0, max: 0 }; + return { + min: Number(Math.min(...values).toFixed(3)), + p50: percentile(values, 50), + p95: percentile(values, 95), + p99: percentile(values, 99), + max: Number(Math.max(...values).toFixed(3)), + }; +} + function promptStepsFromEnv() { const rawSteps = parseJsonEnv("LANGBOT_E2E_PROMPTS_JSON", null); if (rawSteps === null) { @@ -658,6 +683,7 @@ try { } else { for (let index = 0; index < promptSteps.length; index += 1) { const step = promptSteps[index]; + const promptStartedAt = Date.now(); const chatResult = await runDebugChatPrompt(page, { prompt: step.prompt, expectedText: step.expectedText, @@ -665,11 +691,13 @@ try { imagePath: index === 0 ? imagePath : "", failureSignals: failureSignals.length > 0 ? 
failureSignals : undefined, }); + const promptDurationMs = Date.now() - promptStartedAt; result.chat_results.push({ index, expected_text: step.expectedText, status: chatResult.status, reason: chatResult.reason, + response_duration_ms: promptDurationMs, min_expected_count: chatResult.min_expected_count, final_count: chatResult.final_count, before_assistant_expected_count: chatResult.before_assistant_expected_count, @@ -714,6 +742,56 @@ try { const finishedAt = new Date(); result.finished_at = finishedAt.toISOString(); result.finished_at_local = localIsoWithOffset(finishedAt); + result.duration_ms = finishedAt.getTime() - startedAt.getTime(); + const responseDurations = result.chat_results + .map((item) => item.response_duration_ms) + .filter((value) => Number.isFinite(value)); + const passedPrompts = result.chat_results.filter((item) => item.status === "pass").length; + const attemptedPrompts = result.chat_results.length; + const errorRate = attemptedPrompts === 0 ? 1 : Number(((attemptedPrompts - passedPrompts) / attemptedPrompts).toFixed(4)); + const responseStats = stats(responseDurations); + const responseP95BudgetMs = positiveNumberEnv( + "LANGBOT_E2E_DEBUG_CHAT_RESPONSE_P95_MS", + positiveNumberEnv("LANGBOT_DEBUG_CHAT_RESPONSE_P95_MS", safeResponseTimeoutMs), + ); + const maxErrorRate = positiveNumberEnv("LANGBOT_E2E_DEBUG_CHAT_MAX_ERROR_RATE", 0); + const metrics = { + probe: caseId, + url: result.url, + prompt_count: result.prompt_count, + attempted_prompt_count: attemptedPrompts, + passed_prompt_count: passedPrompts, + error_rate: errorRate, + response_duration_ms: responseStats, + total_duration_ms: result.duration_ms, + chat_results: result.chat_results, + }; + result.metrics_summary = { + prompt_count: metrics.prompt_count, + attempted_prompt_count: metrics.attempted_prompt_count, + passed_prompt_count: metrics.passed_prompt_count, + error_rate: metrics.error_rate, + response_p50_ms: metrics.response_duration_ms.p50, + response_p95_ms: 
metrics.response_duration_ms.p95, + total_duration_ms: metrics.total_duration_ms, + }; + result.thresholds_summary = { + response_p95_ms: { + actual: metrics.response_duration_ms.p95, + max: responseP95BudgetMs, + pass: attemptedPrompts > 0 && metrics.response_duration_ms.p95 <= responseP95BudgetMs, + }, + error_rate: { + actual: metrics.error_rate, + max: maxErrorRate, + pass: metrics.error_rate <= maxErrorRate, + }, + }; + await writeFile(metricsPath, `${JSON.stringify(metrics, null, 2)}\n`, "utf8"); + if (result.status === "pass" && !Object.values(result.thresholds_summary).every((item) => item.pass)) { + result.status = "fail"; + result.reason = "Debug Chat performance breached response latency or error-rate thresholds."; + } const existingEvidence = {}; for (const [key, value] of Object.entries(result.evidence)) { if (typeof value !== "string") continue; diff --git a/skills/skills.index.json b/skills/skills.index.json index d56a84822..640996adc 100644 --- a/skills/skills.index.json +++ b/skills/skills.index.json @@ -130,6 +130,7 @@ "references/local-agent-runner.md", "references/mcp-stdio-testing.md", "references/model-provider-testing.md", + "references/performance-reliability-testing.md", "references/pipeline-debug-chat.md", "references/plugin-e2e-smoke.md", "references/sandbox-skill-authoring.md", @@ -150,6 +151,16 @@ "agent-runner-release-preflight", "agent-runner-runtime-chaos", "dify-agent-debug-chat", + "langbot-fake-provider-debug-chat-cross-pipeline-isolation", + "langbot-fake-provider-debug-chat-fault-recovery", + "langbot-fake-provider-debug-chat-load", + "langbot-fake-provider-debug-chat-slow-load", + "langbot-fault-taxonomy-contract", + "langbot-live-backend-latency", + "langbot-live-backend-log-health", + "langbot-live-control-plane-api", + "langbot-overhead-accounting-contract", + "langbot-space-debug-chat-concurrency-smoke", "langrag-kb-retrieve", "langrag-parser-golden-e2e", "langrag-sentinel-kb-discover", @@ -165,6 +176,7 @@ 
"mcp-stdio-register", "mcp-stdio-tool-call", "pipeline-debug-chat", + "pipeline-debug-chat-performance", "plugin-e2e-smoke", "provider-deepseek", "qa-plugin-smoke-live-install", @@ -486,6 +498,316 @@ "backend_log" ] }, + { + "id": "langbot-fake-provider-debug-chat-cross-pipeline-isolation", + "title": "LangBot Debug Chat fake-provider cross-pipeline isolation probe", + "mode": "probe", + "area": "reliability", + "type": "reliability", + "priority": "p1", + "risk": "high", + "ci_eligible": false, + "tags": [ + "reliability", + "debug-chat", + "websocket", + "fake-provider", + "isolation", + "concurrency", + "metrics" + ], + "automation": "skills/langbot-testing/probes/langbot-debug-chat-cross-pipeline-isolation.mjs", + "setup_automation": [ + "node:scripts/e2e/ensure-fake-provider-cross-pipelines.mjs --write-env" + ], + "setup_provides_env": [ + "LANGBOT_FAKE_PROVIDER_URL", + "LANGBOT_FAKE_PROVIDER_BASE_URL", + "LANGBOT_FAKE_PROVIDER_PID", + "LANGBOT_FAKE_PROVIDER_PIPELINE_A_URL", + "LANGBOT_FAKE_PROVIDER_PIPELINE_A_NAME", + "LANGBOT_FAKE_PROVIDER_PIPELINE_B_URL", + "LANGBOT_FAKE_PROVIDER_PIPELINE_B_NAME" + ], + "evidence_required": [ + "metrics", + "network", + "api_diagnostic", + "filesystem" + ] + }, + { + "id": "langbot-fake-provider-debug-chat-fault-recovery", + "title": "LangBot Debug Chat fake-provider fault recovery probe", + "mode": "probe", + "area": "reliability", + "type": "chaos", + "priority": "p1", + "risk": "high", + "ci_eligible": false, + "tags": [ + "reliability", + "chaos", + "debug-chat", + "websocket", + "fake-provider", + "fault-injection", + "metrics" + ], + "automation": "skills/langbot-testing/probes/langbot-debug-chat-concurrency.mjs", + "setup_automation": [ + "node:scripts/e2e/ensure-fake-provider-pipeline.mjs --write-env" + ], + "setup_provides_env": [ + "LANGBOT_FAKE_PROVIDER_URL", + "LANGBOT_FAKE_PROVIDER_BASE_URL", + "LANGBOT_FAKE_PROVIDER_PID", + "LANGBOT_FAKE_PROVIDER_PROVIDER_UUID", + "LANGBOT_FAKE_PROVIDER_MODEL_UUID", + 
"LANGBOT_FAKE_PROVIDER_PIPELINE_URL", + "LANGBOT_FAKE_PROVIDER_PIPELINE_NAME" + ], + "evidence_required": [ + "metrics", + "network", + "api_diagnostic", + "filesystem" + ] + }, + { + "id": "langbot-fake-provider-debug-chat-load", + "title": "LangBot Debug Chat controlled fake-provider load probe", + "mode": "probe", + "area": "performance", + "type": "performance", + "priority": "p1", + "risk": "medium", + "ci_eligible": false, + "tags": [ + "performance", + "debug-chat", + "websocket", + "fake-provider", + "load", + "metrics" + ], + "automation": "skills/langbot-testing/probes/langbot-debug-chat-concurrency.mjs", + "setup_automation": [ + "node:scripts/e2e/ensure-fake-provider-pipeline.mjs --write-env" + ], + "setup_provides_env": [ + "LANGBOT_FAKE_PROVIDER_URL", + "LANGBOT_FAKE_PROVIDER_BASE_URL", + "LANGBOT_FAKE_PROVIDER_PID", + "LANGBOT_FAKE_PROVIDER_PROVIDER_UUID", + "LANGBOT_FAKE_PROVIDER_MODEL_UUID", + "LANGBOT_FAKE_PROVIDER_PIPELINE_URL", + "LANGBOT_FAKE_PROVIDER_PIPELINE_NAME" + ], + "evidence_required": [ + "metrics", + "network", + "api_diagnostic", + "filesystem" + ] + }, + { + "id": "langbot-fake-provider-debug-chat-slow-load", + "title": "LangBot Debug Chat slow fake-provider load probe", + "mode": "probe", + "area": "performance", + "type": "performance", + "priority": "p1", + "risk": "medium", + "ci_eligible": false, + "tags": [ + "performance", + "debug-chat", + "websocket", + "fake-provider", + "slow-provider", + "load", + "metrics" + ], + "automation": "skills/langbot-testing/probes/langbot-debug-chat-concurrency.mjs", + "setup_automation": [ + "node:scripts/e2e/ensure-fake-provider-pipeline.mjs --write-env" + ], + "setup_provides_env": [ + "LANGBOT_FAKE_PROVIDER_URL", + "LANGBOT_FAKE_PROVIDER_BASE_URL", + "LANGBOT_FAKE_PROVIDER_PID", + "LANGBOT_FAKE_PROVIDER_PROVIDER_UUID", + "LANGBOT_FAKE_PROVIDER_MODEL_UUID", + "LANGBOT_FAKE_PROVIDER_PIPELINE_URL", + "LANGBOT_FAKE_PROVIDER_PIPELINE_NAME" + ], + "evidence_required": [ + "metrics", + "network", 
+ "api_diagnostic", + "filesystem" + ] + }, + { + "id": "langbot-fault-taxonomy-contract", + "title": "LangBot fault taxonomy and cleanup contract", + "mode": "probe", + "area": "reliability", + "type": "chaos", + "priority": "p1", + "risk": "medium", + "ci_eligible": true, + "tags": [ + "reliability", + "chaos", + "contract", + "synthetic" + ], + "automation": "skills/langbot-testing/probes/langbot-fault-taxonomy-contract.mjs", + "setup_automation": [], + "setup_provides_env": [], + "evidence_required": [ + "metrics", + "filesystem" + ] + }, + { + "id": "langbot-live-backend-latency", + "title": "LangBot live backend basic latency probe", + "mode": "probe", + "area": "performance", + "type": "performance", + "priority": "p1", + "risk": "medium", + "ci_eligible": false, + "tags": [ + "performance", + "live-backend", + "latency", + "metrics" + ], + "automation": "skills/langbot-testing/probes/langbot-live-backend-latency.mjs", + "setup_automation": [], + "setup_provides_env": [], + "evidence_required": [ + "metrics", + "network", + "api_diagnostic", + "filesystem" + ] + }, + { + "id": "langbot-live-backend-log-health", + "title": "LangBot live backend log health probe", + "mode": "probe", + "area": "reliability", + "type": "reliability", + "priority": "p1", + "risk": "medium", + "ci_eligible": false, + "tags": [ + "reliability", + "live-backend", + "backend-log", + "metrics" + ], + "automation": "skills/langbot-testing/probes/langbot-live-backend-log-health.mjs", + "setup_automation": [], + "setup_provides_env": [], + "evidence_required": [ + "metrics", + "backend_log", + "filesystem" + ] + }, + { + "id": "langbot-live-control-plane-api", + "title": "LangBot live control-plane API probe", + "mode": "probe", + "area": "performance", + "type": "performance", + "priority": "p1", + "risk": "medium", + "ci_eligible": false, + "tags": [ + "performance", + "reliability", + "live-backend", + "control-plane", + "metrics" + ], + "automation": 
"skills/langbot-testing/probes/langbot-live-control-plane-api.mjs", + "setup_automation": [], + "setup_provides_env": [], + "evidence_required": [ + "metrics", + "network", + "api_diagnostic", + "filesystem" + ] + }, + { + "id": "langbot-overhead-accounting-contract", + "title": "LangBot overhead accounting metrics contract", + "mode": "probe", + "area": "performance", + "type": "performance", + "priority": "p1", + "risk": "medium", + "ci_eligible": true, + "tags": [ + "performance", + "metrics", + "contract", + "synthetic" + ], + "automation": "skills/langbot-testing/probes/langbot-overhead-accounting-contract.mjs", + "setup_automation": [], + "setup_provides_env": [], + "evidence_required": [ + "metrics", + "resource_log", + "filesystem" + ] + }, + { + "id": "langbot-space-debug-chat-concurrency-smoke", + "title": "LangBot Debug Chat real Space-provider concurrency smoke", + "mode": "probe", + "area": "performance", + "type": "performance", + "priority": "p1", + "risk": "high", + "ci_eligible": false, + "tags": [ + "performance", + "debug-chat", + "websocket", + "space", + "live-provider", + "smoke", + "metrics" + ], + "automation": "skills/langbot-testing/probes/langbot-debug-chat-concurrency.mjs", + "setup_automation": [ + "node:scripts/e2e/ensure-local-agent-pipeline.mjs --write-env" + ], + "setup_provides_env": [ + "LANGBOT_PIPELINE_URL", + "LANGBOT_PIPELINE_NAME", + "LANGBOT_LOCAL_AGENT_PIPELINE_URL", + "LANGBOT_LOCAL_AGENT_PIPELINE_NAME", + "LANGBOT_LOCAL_AGENT_MODEL_UUID", + "LANGBOT_E2E_MODEL_UUID" + ], + "evidence_required": [ + "metrics", + "network", + "api_diagnostic", + "filesystem" + ] + }, { "id": "langrag-kb-retrieve", "title": "LangRAG knowledge base ingests and retrieves a sentinel document", @@ -911,6 +1233,38 @@ "backend_log" ] }, + { + "id": "pipeline-debug-chat-performance", + "title": "Pipeline Debug Chat user-path performance probe", + "mode": "agent-browser", + "area": "pipeline", + "type": "performance", + "priority": "p1", + "risk": 
"medium", + "ci_eligible": false, + "tags": [ + "performance", + "pipeline", + "debug-chat", + "user-path", + "metrics" + ], + "automation": "scripts/e2e/pipeline-debug-chat.mjs", + "setup_automation": [ + "node:scripts/e2e/ensure-local-agent-pipeline.mjs --write-env" + ], + "setup_provides_env": [ + "LANGBOT_PIPELINE_URL", + "LANGBOT_PIPELINE_NAME" + ], + "evidence_required": [ + "ui", + "screenshot", + "console", + "network", + "metrics" + ] + }, { "id": "plugin-e2e-smoke", "title": "Plugin system installs a local plugin and exposes tool/page APIs", @@ -1059,6 +1413,12 @@ "suites": [ "agent-runner-release-gate", "core-smoke", + "langbot-debug-chat-isolation-gate", + "langbot-debug-chat-load-gate", + "langbot-live-backend-gate", + "langbot-performance-contract-gate", + "langbot-performance-reliability-gate", + "langbot-user-path-performance-gate", "local-agent-gate" ], "suite_summaries": [ @@ -1121,6 +1481,113 @@ "local-agent-basic-debug-chat" ] }, + { + "id": "langbot-debug-chat-isolation-gate", + "title": "LangBot Debug Chat isolation gate", + "description": "Manual/non-required cross-pipeline Debug Chat isolation gate. Current releases may fail this gate because of product bug #2286; use it as regression evidence after the routing fix lands.", + "type": "reliability", + "priority": "p1", + "tags": [ + "reliability", + "debug-chat", + "websocket", + "isolation", + "concurrency" + ], + "cases": [ + "langbot-fake-provider-debug-chat-cross-pipeline-isolation" + ] + }, + { + "id": "langbot-debug-chat-load-gate", + "title": "LangBot Debug Chat load gate", + "description": "Manual/non-required message-path load checks for Pipeline Debug Chat: controlled fake-provider baseline, slow-provider and fault-recovery profiles, plus optional real Space-provider smoke. 
Cross-pipeline isolation is split into langbot-debug-chat-isolation-gate because current releases may fail it due to product bug #2286.", + "type": "performance", + "priority": "p1", + "tags": [ + "performance", + "debug-chat", + "websocket", + "load" + ], + "cases": [ + "langbot-fake-provider-debug-chat-load", + "langbot-fake-provider-debug-chat-slow-load", + "langbot-fake-provider-debug-chat-fault-recovery", + "langbot-space-debug-chat-concurrency-smoke" + ] + }, + { + "id": "langbot-live-backend-gate", + "title": "LangBot live backend reliability gate", + "description": "Live backend control-plane responsiveness and runtime log health checks for a locally running LangBot instance.", + "type": "reliability", + "priority": "p1", + "tags": [ + "performance", + "reliability", + "live-backend", + "metrics" + ], + "cases": [ + "langbot-live-backend-latency", + "langbot-live-control-plane-api", + "langbot-live-backend-log-health" + ] + }, + { + "id": "langbot-performance-contract-gate", + "title": "LangBot performance contract gate", + "description": "Fast synthetic contract checks for performance metric accounting and non-destructive reliability fault taxonomy.", + "type": "contract", + "priority": "p1", + "tags": [ + "performance", + "reliability", + "contract", + "metrics" + ], + "cases": [ + "langbot-overhead-accounting-contract", + "langbot-fault-taxonomy-contract" + ] + }, + { + "id": "langbot-performance-reliability-gate", + "title": "LangBot performance and reliability starter gate", + "description": "Starter gate for LangBot performance accounting, live backend control-plane latency, and non-destructive fault taxonomy checks.", + "type": "reliability", + "priority": "p1", + "tags": [ + "performance", + "reliability", + "metrics", + "chaos" + ], + "cases": [ + "langbot-overhead-accounting-contract", + "langbot-fault-taxonomy-contract", + "langbot-live-backend-latency", + "langbot-live-control-plane-api", + "langbot-live-backend-log-health" + ] + }, + { + "id": 
"langbot-user-path-performance-gate", + "title": "LangBot user-path performance gate", + "description": "Browser-visible performance checks for user-facing LangBot paths such as Pipeline Debug Chat.", + "type": "performance", + "priority": "p1", + "tags": [ + "performance", + "browser", + "debug-chat", + "user-path" + ], + "cases": [ + "pipeline-debug-chat-performance" + ] + }, { "id": "local-agent-gate", "title": "Local Agent runner regression gate", @@ -1265,6 +1732,7 @@ "sandbox-native-tools-unavailable", "socks-proxy-without-socksio", "survey-widget-blocks-debug-chat", + "telemetry-proxy-noise", "tool-name-collision-between-mcp-and-plugin", "uv-run-resyncs-local-sdk" ], @@ -1449,6 +1917,14 @@ "mcp-stdio-tool-call" ] }, + { + "id": "telemetry-proxy-noise", + "title": "Telemetry posting fails through the proxy while the target flow succeeds", + "category": "env_issue", + "related_cases": [ + "langbot-space-debug-chat-concurrency-smoke" + ] + }, { "id": "tool-name-collision-between-mcp-and-plugin", "title": "MCP and plugin expose the same tool name", diff --git a/skills/skills/.env.example b/skills/skills/.env.example index a8f5ebf09..888c5721d 100644 --- a/skills/skills/.env.example +++ b/skills/skills/.env.example @@ -26,6 +26,23 @@ LANGBOT_NO_PROXY=localhost,127.0.0.1,::1 LANGBOT_PIPELINE_URL= LANGBOT_PIPELINE_NAME= +# Optional fake OpenAI-compatible provider controls for Debug Chat load tests. +# Leave URL empty to let setup automation start a local provider and write the +# selected URL to skills/.env.local. 
+LANGBOT_FAKE_PROVIDER_URL= +LANGBOT_FAKE_PROVIDER_HOST=127.0.0.1 +LANGBOT_FAKE_PROVIDER_PORT= +LANGBOT_FAKE_PROVIDER_MODEL_NAME=gpt-4o-mini +LANGBOT_FAKE_PROVIDER_RESPONSE_TEXT=OK +LANGBOT_FAKE_PROVIDER_FIRST_TOKEN_DELAY_MS=25 +LANGBOT_FAKE_PROVIDER_CHUNK_DELAY_MS=10 +LANGBOT_FAKE_PROVIDER_CHUNK_COUNT=0 +LANGBOT_FAKE_PROVIDER_FAIL_FIRST_N=0 +LANGBOT_FAKE_PROVIDER_FAIL_EVERY_N=0 +LANGBOT_FAKE_PROVIDER_FAULT_STATUS=500 +LANGBOT_FAKE_PROVIDER_FAIL_AFTER_FIRST_CHUNK=false +LANGBOT_FAKE_PROVIDER_DYNAMIC_RESPONSE=true + # Optional case-specific runner targets. Prefer these for runner-specific cases # so the automation cannot silently test the wrong runner. LANGBOT_LOCAL_AGENT_PIPELINE_URL= diff --git a/skills/skills/langbot-env-setup/references/service-startup.md b/skills/skills/langbot-env-setup/references/service-startup.md index 4f7b3ec27..b63960cdb 100644 --- a/skills/skills/langbot-env-setup/references/service-startup.md +++ b/skills/skills/langbot-env-setup/references/service-startup.md @@ -53,7 +53,7 @@ Start the new frontend from the web repo: ```bash cd "$LANGBOT_WEB_REPO" -npm run dev +VITE_API_BASE_URL="$LANGBOT_BACKEND_URL" pnpm dev --host 0.0.0.0 ``` Healthy startup includes: @@ -68,6 +68,10 @@ Quick check: curl -I --max-time 3 "$LANGBOT_FRONTEND_URL" ``` +If `VITE_API_BASE_URL` is missing, Vite still serves the page but frontend API +calls may go to the frontend port instead of the backend port. That produces +false browser failures in login, wizard, pipeline, and Debug Chat cases. + ## Completion Signal Environment setup is not complete until the required frontend/backend URLs are reachable and the chosen browser-control path can open the WebUI. 
diff --git a/skills/skills/langbot-testing/SKILL.md b/skills/skills/langbot-testing/SKILL.md index e9db1980f..748ae9b81 100644 --- a/skills/skills/langbot-testing/SKILL.md +++ b/skills/skills/langbot-testing/SKILL.md @@ -21,6 +21,7 @@ Use this skill when an agent needs to verify LangBot behavior through the WebUI - **Sandbox-backed skill authoring**: read `references/sandbox-skill-authoring.md`. - **LangRAG knowledge bases**: read `references/langrag-knowledge-base.md`. - **MCP stdio tool testing**: read `references/mcp-stdio-testing.md`. +- **Performance, reliability, or chaos probes**: read `references/performance-reliability-testing.md`. - **Drive a live instance over MCP (not raw HTTP)**: use the `langbot-mcp-ops` skill — the instance exposes an MCP server at `http://:5300/mcp` (reuses API keys). Useful for setting up bots/pipelines/models as test fixtures programmatically. - **Known failures and fixes**: read `references/troubleshooting.md`. - **Reusable test groups**: run `bin/lbs suite list` and `bin/lbs suite plan ` before manually assembling a case set. @@ -36,6 +37,8 @@ Use this skill when an agent needs to verify LangBot behavior through the WebUI - Use an authenticated browser profile prepared by `langbot-env-setup`. - Do not expose API keys, OAuth secrets, tokens, or localStorage token values in output. - A WebUI test is not complete until the visible UI result is checked against backend logs or network behavior. +- A performance result is not complete without `metrics` evidence and a clear split between LangBot overhead and external provider/tool/network time. +- A chaos or reliability result is not complete until the fault scope, cleanup, and recovery checks are recorded. - For a suite, use `bin/lbs suite start ` to create the suite evidence root, per-case directories, and `suite-start.json`/`suite-start.md` handoff files; use `bin/lbs test result ` to write final per-case `result.json`, then run `bin/lbs suite report --evidence-dir `. 
- Do not mark a case `pass` until `test result --evidence` covers every value in the case's `evidence_required`. - For runner-specific Debug Chat cases, use the case-specific pipeline env declared by `automation_pipeline_url_env` / `automation_pipeline_name_env`; do not silently reuse a generic `LANGBOT_PIPELINE_URL`. diff --git a/skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-cross-pipeline-isolation.yaml b/skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-cross-pipeline-isolation.yaml new file mode 100644 index 000000000..9e8e09af0 --- /dev/null +++ b/skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-cross-pipeline-isolation.yaml @@ -0,0 +1,84 @@ +id: langbot-fake-provider-debug-chat-cross-pipeline-isolation +title: "LangBot Debug Chat fake-provider cross-pipeline isolation probe" +mode: probe +area: reliability +type: reliability +priority: p1 +risk: high +ci_eligible: false +tags: + - reliability + - debug-chat + - websocket + - fake-provider + - isolation + - concurrency + - metrics +skills: + - langbot-env-setup + - langbot-testing +env: + - LANGBOT_BACKEND_URL + - LANGBOT_FRONTEND_URL + - LANGBOT_E2E_LOGIN_USER +automation: skills/langbot-testing/probes/langbot-debug-chat-cross-pipeline-isolation.mjs +automation_env: + - LANGBOT_BACKEND_URL + - LANGBOT_E2E_LOGIN_USER + - LANGBOT_FAKE_PROVIDER_URL + - LANGBOT_FAKE_PROVIDER_PIPELINE_A_URL + - LANGBOT_FAKE_PROVIDER_PIPELINE_A_NAME + - LANGBOT_FAKE_PROVIDER_PIPELINE_B_URL + - LANGBOT_FAKE_PROVIDER_PIPELINE_B_NAME +automation_debug_chat_load_requests: "6" +automation_debug_chat_load_concurrency: "4" +automation_debug_chat_load_timeout_ms: "30000" +automation_debug_chat_load_response_p95_ms: "5000" +automation_debug_chat_load_max_error_rate: "0" +automation_debug_chat_load_prompt_template: '请只回复 "{expected}",不要解释,不要添加其他字符。' +automation_debug_chat_load_stream: "true" +automation_debug_chat_load_reset: "true" +metrics_thresholds_json: 
'{"cross_pipeline_leak_count":{"max":0},"response_p95_ms":{"max":5000},"error_rate":{"max":0}}' +load_profile_json: '{"requests_per_pipeline":6,"pipelines":2,"concurrency":4,"path":"Pipeline Debug Chat WebSocket","provider":"controlled fake OpenAI-compatible provider","metric":"cross-pipeline response isolation and send-to-final-assistant-response"}' +setup_automation: + - "node:scripts/e2e/ensure-fake-provider-cross-pipelines.mjs --write-env" +setup_provides_env: + - LANGBOT_FAKE_PROVIDER_URL + - LANGBOT_FAKE_PROVIDER_BASE_URL + - LANGBOT_FAKE_PROVIDER_PID + - LANGBOT_FAKE_PROVIDER_PIPELINE_A_URL + - LANGBOT_FAKE_PROVIDER_PIPELINE_A_NAME + - LANGBOT_FAKE_PROVIDER_PIPELINE_B_URL + - LANGBOT_FAKE_PROVIDER_PIPELINE_B_NAME +steps: + - "Start or reuse the local fake OpenAI-compatible provider." + - "Create or update two local-agent pipelines that both point at the controlled fake provider." + - "Reset both Debug Chat sessions and the fake-provider request log." + - "Open concurrent WebSocket Debug Chat connections to both pipelines and send unique pipeline-scoped response tokens." +checks: + - "automation-result.json status is pass only when every request receives its own expected token and cross_pipeline_leak_count is zero." + - "metrics_summary includes by_pipeline status counts, fake-provider request count, and LangBot/provider timing estimates." + - "samples.json contains per-request pipeline labels so any leak can be attributed to the receiving pipeline." +evidence_required: + - metrics + - network + - api_diagnostic + - filesystem +diagnostics: + - "This probe targets Debug Chat isolation under concurrent traffic from two pipelines." + - "It is designed to expose regressions where global pipeline state causes one pipeline's assistant response to be delivered to another pipeline's Debug Chat session." 
+ - "Same-pipeline foreign responses are tolerated because Debug Chat intentionally broadcasts within the same pipeline/session; cross-pipeline tokens are never tolerated." + - "Known product bug: current releases may fail this probe because Debug Chat replies can read singleton WebSocket proxy pipeline state after another pipeline overwrites it. See https://github.com/langbot-app/LangBot/issues/2286." +expected_failures: + - "https://github.com/langbot-app/LangBot/issues/2286" +success_patterns: + - "Debug Chat cross-pipeline isolation probe passed" +failure_patterns: + - "cross_pipeline_leak" + - "Timed out after" + - "WebSocket connection error" + - "Final assistant response did not include" +troubleshooting: + - backend-not-listening + - debug-chat-history-contaminates-automation + - local-agent-model-route-unavailable diff --git a/skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-fault-recovery.yaml b/skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-fault-recovery.yaml new file mode 100644 index 000000000..7dfa45c91 --- /dev/null +++ b/skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-fault-recovery.yaml @@ -0,0 +1,95 @@ +id: langbot-fake-provider-debug-chat-fault-recovery +title: "LangBot Debug Chat fake-provider fault recovery probe" +mode: probe +area: reliability +type: chaos +priority: p1 +risk: high +ci_eligible: false +tags: + - reliability + - chaos + - debug-chat + - websocket + - fake-provider + - fault-injection + - metrics +skills: + - langbot-env-setup + - langbot-testing +env: + - LANGBOT_BACKEND_URL + - LANGBOT_FRONTEND_URL + - LANGBOT_E2E_LOGIN_USER +automation: skills/langbot-testing/probes/langbot-debug-chat-concurrency.mjs +automation_env: + - LANGBOT_BACKEND_URL + - LANGBOT_E2E_LOGIN_USER + - LANGBOT_FAKE_PROVIDER_PIPELINE_URL + - LANGBOT_FAKE_PROVIDER_PIPELINE_NAME +automation_pipeline_url_env: LANGBOT_FAKE_PROVIDER_PIPELINE_URL +automation_pipeline_name_env: 
LANGBOT_FAKE_PROVIDER_PIPELINE_NAME +automation_debug_chat_load_requests: "6" +automation_debug_chat_load_concurrency: "1" +automation_debug_chat_load_timeout_ms: "15000" +automation_debug_chat_load_response_p95_ms: "5000" +automation_debug_chat_load_max_error_rate: "0" +automation_debug_chat_load_min_ok_count: "6" +automation_debug_chat_load_min_provider_fault_count: "2" +automation_debug_chat_load_expected_prefix: "FAULTQA" +automation_debug_chat_load_prompt_template: '请只回复 "{expected}",不要解释,不要添加其他字符。' +automation_debug_chat_load_stream: "true" +automation_debug_chat_load_reset: "true" +automation_debug_chat_load_fail_on_final_mismatch: "true" +automation_fake_provider_first_token_delay_ms: "25" +automation_fake_provider_chunk_delay_ms: "10" +automation_fake_provider_chunk_count: "0" +automation_fake_provider_fail_first_n: "2" +automation_fake_provider_fail_every_n: "0" +automation_fake_provider_fault_status: "503" +metrics_thresholds_json: '{"response_p95_ms":{"max":5000},"error_rate":{"max":0},"ok_count_min":{"min":6},"fake_provider_fault_count_min":{"min":2}}' +fault_model_json: '{"provider_fault":"HTTP 503 for first 2 fake-provider chat completions after reset","expected_behavior":"LangBot retries or otherwise recovers from bounded provider failures so every Debug Chat request receives its expected response without backend crash."}' +load_profile_json: '{"requests":6,"concurrency":1,"path":"Pipeline Debug Chat WebSocket","provider":"controlled fake OpenAI-compatible provider","classification":"fault-recovery-not-throughput-benchmark"}' +setup_automation: + - "node:scripts/e2e/ensure-fake-provider-pipeline.mjs --write-env" +setup_provides_env: + - LANGBOT_FAKE_PROVIDER_URL + - LANGBOT_FAKE_PROVIDER_BASE_URL + - LANGBOT_FAKE_PROVIDER_PID + - LANGBOT_FAKE_PROVIDER_PROVIDER_UUID + - LANGBOT_FAKE_PROVIDER_MODEL_UUID + - LANGBOT_FAKE_PROVIDER_PIPELINE_URL + - LANGBOT_FAKE_PROVIDER_PIPELINE_NAME +steps: + - "Configure the local fake provider to return HTTP 503 for 
the first two chat completions after reset." + - "Create or update the LangBot provider, model, and local-agent pipeline that points at the fake provider." + - "Reset the target Debug Chat session and fake-provider request counter." + - "Send a sequential Debug Chat batch and verify later requests recover after the injected provider faults." +checks: + - "automation-result.json status is pass when the fake provider records at least two injected faults, every Debug Chat request succeeds, and total user-visible error rate stays at zero." + - "metrics_summary includes fake_provider_fault_count and status_counts for the same run window." + - "backend logs show request handling for the same run window without unexpected Traceback or task-leak findings." +evidence_required: + - metrics + - network + - api_diagnostic + - filesystem +diagnostics: + - "This is a fault-recovery probe, not a throughput benchmark." + - "Provider faults may be retried inside the provider/requester path; judge this case by fake_provider_fault_count plus user-visible success/error metrics." + - "The profile uses concurrency 1 because Debug Chat broadcasts assistant responses to every connection in a session, and failed responses do not carry the unique success token needed for concurrent attribution." 
+success_patterns: + - "Debug Chat WebSocket concurrency probe passed" + - "Streaming completed" +failure_patterns: + - "fake_provider_fault" + - "HTTP 503" + - "Timed out after" + - "All models failed during streaming setup" +expected_failures: + - "fake_provider_fault" + - "HTTP 503" +troubleshooting: + - backend-not-listening + - debug-chat-history-contaminates-automation + - local-agent-model-route-unavailable diff --git a/skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-load.yaml b/skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-load.yaml new file mode 100644 index 000000000..8a71c3558 --- /dev/null +++ b/skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-load.yaml @@ -0,0 +1,81 @@ +id: langbot-fake-provider-debug-chat-load +title: "LangBot Debug Chat controlled fake-provider load probe" +mode: probe +area: performance +type: performance +priority: p1 +risk: medium +ci_eligible: false +tags: + - performance + - debug-chat + - websocket + - fake-provider + - load + - metrics +skills: + - langbot-env-setup + - langbot-testing +env: + - LANGBOT_BACKEND_URL + - LANGBOT_FRONTEND_URL + - LANGBOT_E2E_LOGIN_USER +automation: skills/langbot-testing/probes/langbot-debug-chat-concurrency.mjs +automation_env: + - LANGBOT_BACKEND_URL + - LANGBOT_E2E_LOGIN_USER + - LANGBOT_FAKE_PROVIDER_PIPELINE_URL + - LANGBOT_FAKE_PROVIDER_PIPELINE_NAME +automation_pipeline_url_env: LANGBOT_FAKE_PROVIDER_PIPELINE_URL +automation_pipeline_name_env: LANGBOT_FAKE_PROVIDER_PIPELINE_NAME +automation_debug_chat_load_requests: "12" +automation_debug_chat_load_concurrency: "4" +automation_debug_chat_load_timeout_ms: "30000" +automation_debug_chat_load_response_p95_ms: "5000" +automation_debug_chat_load_first_response_p95_ms: "3000" +automation_debug_chat_load_max_error_rate: "0" +automation_debug_chat_load_expected_prefix: "FAKEQA" +automation_debug_chat_load_prompt_template: '请只回复 "{expected}",不要解释,不要添加其他字符。' 
+automation_debug_chat_load_stream: "true" +automation_debug_chat_load_reset: "true" +metrics_thresholds_json: '{"response_p95_ms":{"max":5000},"first_response_p95_ms":{"max":3000},"error_rate":{"max":0}}' +load_profile_json: '{"requests":12,"concurrency":4,"path":"Pipeline Debug Chat WebSocket","provider":"controlled fake OpenAI-compatible provider","metric":"send-to-final-assistant-response"}' +setup_automation: + - "node:scripts/e2e/ensure-fake-provider-pipeline.mjs --write-env" +setup_provides_env: + - LANGBOT_FAKE_PROVIDER_URL + - LANGBOT_FAKE_PROVIDER_BASE_URL + - LANGBOT_FAKE_PROVIDER_PID + - LANGBOT_FAKE_PROVIDER_PROVIDER_UUID + - LANGBOT_FAKE_PROVIDER_MODEL_UUID + - LANGBOT_FAKE_PROVIDER_PIPELINE_URL + - LANGBOT_FAKE_PROVIDER_PIPELINE_NAME +steps: + - "Start or reuse the local fake OpenAI-compatible provider." + - "Create or update the LangBot provider, model, and local-agent pipeline that points at the fake provider." + - "Reset the target Debug Chat session." + - "Open concurrent WebSocket Debug Chat connections and send unique deterministic prompts through the real backend pipeline." +checks: + - "automation-result.json status is pass when every request receives its own expected assistant response." + - "metrics_summary includes request count, concurrency, p50/p95 response latency, first response latency, throughput, and error rate." + - "thresholds_summary shows response_p95_ms, first_response_p95_ms, and error_rate pass." +evidence_required: + - metrics + - network + - api_diagnostic + - filesystem +diagnostics: + - "This probe removes external model latency from the measurement; it still exercises the live LangBot backend, provider requester, local-agent runner, pipeline, and Debug Chat WebSocket adapter." + - "Use this as the repeatable message-path baseline before comparing against Space or another real provider." 
+success_patterns: + - "Debug Chat WebSocket concurrency probe passed" + - "Streaming completed" +failure_patterns: + - "WebSocket connection error" + - "Timed out after" + - "Final assistant response did not include" + - "All models failed during streaming setup" +troubleshooting: + - backend-not-listening + - debug-chat-history-contaminates-automation + - local-agent-model-route-unavailable diff --git a/skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-slow-load.yaml b/skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-slow-load.yaml new file mode 100644 index 000000000..afa7de154 --- /dev/null +++ b/skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-slow-load.yaml @@ -0,0 +1,88 @@ +id: langbot-fake-provider-debug-chat-slow-load +title: "LangBot Debug Chat slow fake-provider load probe" +mode: probe +area: performance +type: performance +priority: p1 +risk: medium +ci_eligible: false +tags: + - performance + - debug-chat + - websocket + - fake-provider + - slow-provider + - load + - metrics +skills: + - langbot-env-setup + - langbot-testing +env: + - LANGBOT_BACKEND_URL + - LANGBOT_FRONTEND_URL + - LANGBOT_E2E_LOGIN_USER +automation: skills/langbot-testing/probes/langbot-debug-chat-concurrency.mjs +automation_env: + - LANGBOT_BACKEND_URL + - LANGBOT_E2E_LOGIN_USER + - LANGBOT_FAKE_PROVIDER_PIPELINE_URL + - LANGBOT_FAKE_PROVIDER_PIPELINE_NAME +automation_pipeline_url_env: LANGBOT_FAKE_PROVIDER_PIPELINE_URL +automation_pipeline_name_env: LANGBOT_FAKE_PROVIDER_PIPELINE_NAME +automation_debug_chat_load_requests: "8" +automation_debug_chat_load_concurrency: "4" +automation_debug_chat_load_timeout_ms: "45000" +automation_debug_chat_load_response_p95_ms: "10000" +automation_debug_chat_load_first_response_p95_ms: "7000" +automation_debug_chat_load_max_error_rate: "0" +automation_debug_chat_load_expected_prefix: "SLOWQA" +automation_debug_chat_load_prompt_template: '请只回复 "{expected}",不要解释,不要添加其他字符。' 
+automation_debug_chat_load_stream: "true" +automation_debug_chat_load_reset: "true" +automation_fake_provider_first_token_delay_ms: "1000" +automation_fake_provider_chunk_delay_ms: "250" +automation_fake_provider_chunk_count: "4" +automation_fake_provider_fail_first_n: "0" +automation_fake_provider_fail_every_n: "0" +automation_fake_provider_fault_status: "500" +metrics_thresholds_json: '{"response_p95_ms":{"max":10000},"first_response_p95_ms":{"max":7000},"error_rate":{"max":0}}' +load_profile_json: '{"requests":8,"concurrency":4,"path":"Pipeline Debug Chat WebSocket","provider":"controlled slow fake OpenAI-compatible provider","metric":"send-to-final-assistant-response","provider_profile":{"first_token_delay_ms":1000,"chunk_delay_ms":250,"chunk_count":4}}' +setup_automation: + - "node:scripts/e2e/ensure-fake-provider-pipeline.mjs --write-env" +setup_provides_env: + - LANGBOT_FAKE_PROVIDER_URL + - LANGBOT_FAKE_PROVIDER_BASE_URL + - LANGBOT_FAKE_PROVIDER_PID + - LANGBOT_FAKE_PROVIDER_PROVIDER_UUID + - LANGBOT_FAKE_PROVIDER_MODEL_UUID + - LANGBOT_FAKE_PROVIDER_PIPELINE_URL + - LANGBOT_FAKE_PROVIDER_PIPELINE_NAME +steps: + - "Configure the local fake provider with deterministic slow streaming latency." + - "Create or update the LangBot provider, model, and local-agent pipeline that points at the fake provider." + - "Reset the target Debug Chat session." + - "Open concurrent WebSocket Debug Chat connections and send unique deterministic prompts through the real backend pipeline." +checks: + - "automation-result.json status is pass when every request receives its own expected assistant response." + - "metrics_summary shows zero errors under the slow-provider profile." + - "thresholds_summary shows response_p95_ms, first_response_p95_ms, and error_rate pass." 
+evidence_required: + - metrics + - network + - api_diagnostic + - filesystem +diagnostics: + - "This probe keeps the model deterministic while injecting provider latency, so it catches backend timeout, streaming, and WebSocket backpressure issues without Space variability." + - "Compare with langbot-fake-provider-debug-chat-load to separate fixed LangBot overhead from provider-latency amplification." +success_patterns: + - "Debug Chat WebSocket concurrency probe passed" + - "Streaming completed" +failure_patterns: + - "WebSocket connection error" + - "Timed out after" + - "Final assistant response did not include" + - "All models failed during streaming setup" +troubleshooting: + - backend-not-listening + - debug-chat-history-contaminates-automation + - local-agent-model-route-unavailable diff --git a/skills/skills/langbot-testing/cases/langbot-fault-taxonomy-contract.yaml b/skills/skills/langbot-testing/cases/langbot-fault-taxonomy-contract.yaml new file mode 100644 index 000000000..2b990f837 --- /dev/null +++ b/skills/skills/langbot-testing/cases/langbot-fault-taxonomy-contract.yaml @@ -0,0 +1,35 @@ +id: langbot-fault-taxonomy-contract +title: "LangBot fault taxonomy and cleanup contract" +mode: probe +area: reliability +type: chaos +priority: p1 +risk: medium +ci_eligible: true +tags: + - reliability + - chaos + - contract + - synthetic +skills: + - langbot-testing +automation: skills/langbot-testing/probes/langbot-fault-taxonomy-contract.mjs +fault_model_json: '{"kind":"taxonomy-contract","destructive":false,"scenarios":["provider-timeout","plugin-runtime-disconnect","mcp-stdio-server-exit","operator-missing-login","transient-marketplace-timeout"]}' +steps: + - "Run `bin/lbs test run langbot-fault-taxonomy-contract --dry-run` first; remove `--dry-run` after checking the evidence directory." + - "Automation validates that representative fault scenarios declare target, injected fault, expected status, recovery check, and cleanup."
+ - "Review metrics.json, fault-model.json, and automation-result.json under LBS_EVIDENCE_DIR." +checks: + - "automation-result.json status is pass." + - "Every scenario has an expected status in pass, fail, blocked, env_issue, or flaky." + - "Every scenario declares a cleanup action and recovery check." +evidence_required: + - metrics + - filesystem +diagnostics: + - "This is a non-destructive taxonomy contract probe; it does not inject real runtime faults." + - "Use it as a gate before adding live chaos cases that kill runtimes, route traffic through a proxy, or disrupt a backend dependency." +success_patterns: + - "Fault taxonomy contract declares status" +failure_patterns: + - "missing required scenario fields" diff --git a/skills/skills/langbot-testing/cases/langbot-live-backend-latency.yaml b/skills/skills/langbot-testing/cases/langbot-live-backend-latency.yaml new file mode 100644 index 000000000..1922d06f0 --- /dev/null +++ b/skills/skills/langbot-testing/cases/langbot-live-backend-latency.yaml @@ -0,0 +1,42 @@ +id: langbot-live-backend-latency +title: "LangBot live backend basic latency probe" +mode: probe +area: performance +type: performance +priority: p1 +risk: medium +ci_eligible: false +tags: + - performance + - live-backend + - latency + - metrics +skills: + - langbot-testing +env: + - LANGBOT_BACKEND_URL +automation: skills/langbot-testing/probes/langbot-live-backend-latency.mjs +metrics_thresholds_json: '{"backend_p95_ms":{"max":1000},"error_rate":{"max":0}}' +load_profile_json: '{"requests":12,"concurrency":2,"endpoints":["/healthz"]}' +steps: + - "Confirm the selected LangBot backend is the intended test target." + - "Run `bin/lbs test run langbot-live-backend-latency --dry-run` first; remove `--dry-run` after checking LANGBOT_BACKEND_URL and evidence directory." + - "Automation sends a small request batch to LANGBOT_BACKEND_URL/healthz and records latency, status counts, and network errors."
+checks: + - "automation-result.json status is pass when the backend responds and p95/error-rate thresholds pass." + - "automation-result.json status is env_issue when the backend is not reachable." + - "metrics.json and network.log are written under LBS_EVIDENCE_DIR." +evidence_required: + - metrics + - network + - api_diagnostic + - filesystem +diagnostics: + - "This probe measures backend health endpoint reachability latency only; it does not cover model/provider, browser, Debug Chat, RAG, or plugin runtime latency." +success_patterns: + - "Live backend latency probe passed" +failure_patterns: + - "Backend did not respond" + - "breached latency or error-rate thresholds" +troubleshooting: + - socks-proxy-without-socksio diff --git a/skills/skills/langbot-testing/cases/langbot-live-backend-log-health.yaml b/skills/skills/langbot-testing/cases/langbot-live-backend-log-health.yaml new file mode 100644 index 000000000..8ff911371 --- /dev/null +++ b/skills/skills/langbot-testing/cases/langbot-live-backend-log-health.yaml @@ -0,0 +1,45 @@ +id: langbot-live-backend-log-health +title: "LangBot live backend log health probe" +mode: probe +area: reliability +type: reliability +priority: p1 +risk: medium +ci_eligible: false +tags: + - reliability + - live-backend + - backend-log + - metrics +skills: + - langbot-testing +env: + - LANGBOT_BACKEND_URL +automation: skills/langbot-testing/probes/langbot-live-backend-log-health.mjs +metrics_thresholds_json: '{"fail_count":{"max":0}}' +load_profile_json: '{"lookback_seconds":300,"log_source":"LANGBOT_BACKEND_LOG or latest LANGBOT_REPO/data/logs/langbot-*.log"}' +steps: + - "Confirm the selected LangBot backend log belongs to the intended test target." + - "Run `bin/lbs test run langbot-live-backend-log-health --dry-run` first; remove `--dry-run` after checking evidence directory and log source."
+ - "Automation scans the recent backend log window for fail-severity runtime findings such as Traceback, ImportError, ERROR, unclosed sessions, and unawaited coroutines." +checks: + - "automation-result.json status is pass only when fail_count is 0." + - "metrics_summary includes scanned_line_count, fail_count, warning_count, and finding_count." + - "findings.json and scanned-backend.log are written under LBS_EVIDENCE_DIR." +evidence_required: + - metrics + - backend_log + - filesystem +diagnostics: + - "Set LANGBOT_BACKEND_LOG to an explicit log path when the latest log file is not the run target." + - "Set LANGBOT_BACKEND_LOG_SINCE or LANGBOT_BACKEND_LOG_LOOKBACK_SECONDS to control the scan window." + - "This probe measures runtime log health; it does not prove user-facing Debug Chat, plugin, model, or RAG behavior." +success_patterns: + - "Live backend log health passed" +failure_patterns: + - "Traceback" + - "ImportError" + - "ERROR" + - "unclosed" +troubleshooting: + - socks-proxy-without-socksio diff --git a/skills/skills/langbot-testing/cases/langbot-live-control-plane-api.yaml b/skills/skills/langbot-testing/cases/langbot-live-control-plane-api.yaml new file mode 100644 index 000000000..2cd8ee2c7 --- /dev/null +++ b/skills/skills/langbot-testing/cases/langbot-live-control-plane-api.yaml @@ -0,0 +1,44 @@ +id: langbot-live-control-plane-api +title: "LangBot live control-plane API probe" +mode: probe +area: performance +type: performance +priority: p1 +risk: medium +ci_eligible: false +tags: + - performance + - reliability + - live-backend + - control-plane + - metrics +skills: + - langbot-testing +env: + - LANGBOT_BACKEND_URL +automation: skills/langbot-testing/probes/langbot-live-control-plane-api.mjs +metrics_thresholds_json: '{"error_rate":{"max":0},"response_shape_failures":{"max":0},"healthz_p95_ms":{"max":500},"system_info_p95_ms":{"max":1000}}' +load_profile_json: 
'{"requests":20,"concurrency":4,"endpoints":["/healthz","/api/v1/system/info"],"auth_required":false}' +steps: + - "Confirm the selected LangBot backend is the intended test target." + - "Run `bin/lbs test run langbot-live-control-plane-api --dry-run` first; remove `--dry-run` after checking LANGBOT_BACKEND_URL and evidence directory." + - "Automation sends a small request batch to /healthz and /api/v1/system/info, then validates status code, JSON shape, and latency budgets." +checks: + - "automation-result.json status is pass when every control-plane request returns HTTP 200, JSON code 0, and required response fields." + - "metrics_summary includes per-endpoint p50/p95 latency, error rate, status counts, and response_shape_failures." + - "thresholds_summary shows error_rate, response_shape_failures, healthz_p95_ms, and system_info_p95_ms all pass." +evidence_required: + - metrics + - network + - api_diagnostic + - filesystem +diagnostics: + - "This probe measures unauthenticated backend control-plane readiness; it does not cover authenticated UI flows, Debug Chat, model calls, plugins, or RAG." + - "A system_info shape failure usually means the API contract or startup state changed and should be investigated before treating latency as healthy."
+success_patterns: + - "Live control-plane API probe passed" +failure_patterns: + - "Backend did not respond" + - "breached shape, latency, or error-rate thresholds" +troubleshooting: + - socks-proxy-without-socksio diff --git a/skills/skills/langbot-testing/cases/langbot-overhead-accounting-contract.yaml b/skills/skills/langbot-testing/cases/langbot-overhead-accounting-contract.yaml new file mode 100644 index 000000000..650dfe7d9 --- /dev/null +++ b/skills/skills/langbot-testing/cases/langbot-overhead-accounting-contract.yaml @@ -0,0 +1,37 @@ +id: langbot-overhead-accounting-contract +title: "LangBot overhead accounting metrics contract" +mode: probe +area: performance +type: performance +priority: p1 +risk: medium +ci_eligible: true +tags: + - performance + - metrics + - contract + - synthetic +skills: + - langbot-testing +automation: skills/langbot-testing/probes/langbot-overhead-accounting-contract.mjs +metrics_thresholds_json: '{"sample_count":{"min":50},"langbot_overhead_p95_ms":{"max":25},"accounting_gap_max_ms":{"max":0.001}}' +load_profile_json: '{"kind":"synthetic-overhead-accounting","samples":80,"external_latency_segments":["provider","external_tool","network"]}' +steps: + - "Run `bin/lbs test run langbot-overhead-accounting-contract --dry-run` first; remove `--dry-run` after checking the evidence directory." + - "Automation generates deterministic message-path latency samples and separates LangBot overhead from provider/tool/network latency." + - "Review metrics.json, thresholds.json, resource-log.json, and automation-result.json under LBS_EVIDENCE_DIR." +checks: + - "automation-result.json status is pass." + - "metrics_summary includes sample_count, langbot_overhead_p95_ms, e2e_latency_p95_ms, external_latency_p95_ms, and accounting_gap_max_ms." + - "thresholds_summary shows sample_count, langbot_overhead_p95_ms, and accounting_gap_max_ms all pass."
+evidence_required: + - metrics + - resource_log + - filesystem +diagnostics: + - "This is a synthetic contract probe for the QA harness; it is not live product performance." + - "Use it to verify that reports can carry overhead accounting metrics before running live backend or browser performance probes." +success_patterns: + - "Overhead accounting contract passed" +failure_patterns: + - "breached one or more thresholds" diff --git a/skills/skills/langbot-testing/cases/langbot-space-debug-chat-concurrency-smoke.yaml b/skills/skills/langbot-testing/cases/langbot-space-debug-chat-concurrency-smoke.yaml new file mode 100644 index 000000000..4f9fc779b --- /dev/null +++ b/skills/skills/langbot-testing/cases/langbot-space-debug-chat-concurrency-smoke.yaml @@ -0,0 +1,84 @@ +id: langbot-space-debug-chat-concurrency-smoke +title: "LangBot Debug Chat real Space-provider concurrency smoke" +mode: probe +area: performance +type: performance +priority: p1 +risk: high +ci_eligible: false +tags: + - performance + - debug-chat + - websocket + - space + - live-provider + - smoke + - metrics +skills: + - langbot-env-setup + - langbot-testing +env: + - LANGBOT_BACKEND_URL + - LANGBOT_FRONTEND_URL + - LANGBOT_E2E_LOGIN_USER +automation: skills/langbot-testing/probes/langbot-debug-chat-concurrency.mjs +automation_env: + - LANGBOT_BACKEND_URL + - LANGBOT_E2E_LOGIN_USER + - LANGBOT_LOCAL_AGENT_PIPELINE_URL + - LANGBOT_LOCAL_AGENT_PIPELINE_NAME +automation_pipeline_url_env: LANGBOT_LOCAL_AGENT_PIPELINE_URL +automation_pipeline_name_env: LANGBOT_LOCAL_AGENT_PIPELINE_NAME +automation_debug_chat_load_requests: "3" +automation_debug_chat_load_concurrency: "2" +automation_debug_chat_load_timeout_ms: "120000" +automation_debug_chat_load_response_p95_ms: "120000" +automation_debug_chat_load_max_error_rate: "0" +automation_debug_chat_load_expected_prefix: "SPACEQA" +automation_debug_chat_load_prompt_template: '请只回复 "{expected}",不要解释,不要添加其他字符。' +automation_debug_chat_load_stream: "true" 
+automation_debug_chat_load_reset: "true" +metrics_thresholds_json: '{"response_p95_ms":{"max":120000},"error_rate":{"max":0}}' +load_profile_json: '{"requests":3,"concurrency":2,"path":"Pipeline Debug Chat WebSocket","provider":"LangBot Space model route","metric":"send-to-final-assistant-response","classification":"smoke-not-benchmark"}' +setup_automation: + - "node:scripts/e2e/ensure-local-agent-pipeline.mjs --write-env" +setup_provides_env: + - LANGBOT_PIPELINE_URL + - LANGBOT_PIPELINE_NAME + - LANGBOT_LOCAL_AGENT_PIPELINE_URL + - LANGBOT_LOCAL_AGENT_PIPELINE_NAME + - LANGBOT_LOCAL_AGENT_MODEL_UUID + - LANGBOT_E2E_MODEL_UUID +preconditions: + - "The selected local LangBot instance is safe for a low-volume real Space model smoke run." + - "Treat Space/provider/network failures as environment or dependency findings until fake-provider baseline evidence separates LangBot overhead." +steps: + - "Prepare a local-agent pipeline with a tested Space model and fallback models." + - "Reset the target Debug Chat session." + - "Open a small number of concurrent WebSocket Debug Chat connections and send unique deterministic prompts through the live Space provider path." +checks: + - "automation-result.json status is pass when every request receives its own expected assistant response." + - "metrics_summary includes request count, concurrency, p95 response latency, throughput, and error rate." + - "The report classifies the result as a live-provider smoke, not a stable LangBot overhead benchmark." +evidence_required: + - metrics + - network + - api_diagnostic + - filesystem +diagnostics: + - "This probe measures real user-path latency through Space and includes provider latency, model behavior, and network effects." + - "Compare with langbot-fake-provider-debug-chat-load before attributing slow or failed runs to LangBot itself." 
+success_patterns: + - "Debug Chat WebSocket concurrency probe passed" + - "Streaming completed" +failure_patterns: + - "invalid api key" + - "WebSocket connection error" + - "Timed out after" + - "Final assistant response did not include" + - "All models failed during streaming setup" +troubleshooting: + - local-agent-model-route-unavailable + - marketplace-network-flaky + - proxy-env-mismatch + - telemetry-proxy-noise diff --git a/skills/skills/langbot-testing/cases/pipeline-debug-chat-performance.yaml b/skills/skills/langbot-testing/cases/pipeline-debug-chat-performance.yaml new file mode 100644 index 000000000..266cbb57d --- /dev/null +++ b/skills/skills/langbot-testing/cases/pipeline-debug-chat-performance.yaml @@ -0,0 +1,80 @@ +id: pipeline-debug-chat-performance +title: "Pipeline Debug Chat user-path performance probe" +mode: agent-browser +area: pipeline +type: performance +priority: p1 +risk: medium +ci_eligible: false +tags: + - performance + - pipeline + - debug-chat + - user-path + - metrics +skills: + - langbot-env-setup + - langbot-testing +env: + - LANGBOT_FRONTEND_URL + - LANGBOT_BACKEND_URL +env_any: + - LANGBOT_PIPELINE_URL|LANGBOT_PIPELINE_NAME +automation: scripts/e2e/pipeline-debug-chat.mjs +automation_env: + - LANGBOT_FRONTEND_URL + - LANGBOT_BACKEND_URL + - LANGBOT_BROWSER_PROFILE + - LANGBOT_CHROMIUM_EXECUTABLE + - LANGBOT_E2E_PROMPT + - LANGBOT_E2E_EXPECTED_TEXT + - LANGBOT_E2E_RESPONSE_TIMEOUT_MS +automation_env_any: + - LANGBOT_PIPELINE_URL|LANGBOT_PIPELINE_NAME +automation_prompt: "请只回复 OK,用于性能测试。" +automation_expected_text: "OK" +automation_response_timeout_ms: "120000" +automation_reset_debug_chat: "true" +automation_debug_chat_response_p95_ms: "120000" +automation_debug_chat_max_error_rate: "0" +metrics_thresholds_json: '{"response_p95_ms":{"max":120000},"error_rate":{"max":0}}' +load_profile_json: '{"prompts":1,"browser":true,"path":"Pipeline Debug Chat","metric":"send-to-visible-completion"}' +setup_automation: + - 
"node:scripts/e2e/ensure-local-agent-pipeline.mjs --write-env" +setup_provides_env: + - LANGBOT_PIPELINE_URL + - LANGBOT_PIPELINE_NAME +preconditions: + - "LANGBOT_PIPELINE_URL or LANGBOT_PIPELINE_NAME points to the pipeline intended for this Debug Chat performance run." + - "The target pipeline is safe to reset Debug Chat history for this run." + - "The target pipeline has a known-good runner/model; provider latency should be interpreted separately from LangBot overhead." +steps: + - "Open LANGBOT_FRONTEND_URL with the prepared browser profile." + - "Open the target pipeline and select Debug Chat." + - "Reset Debug Chat history through the backend API when configured." + - "Send the deterministic prompt and wait for the expected assistant response." +checks: + - "automation-result.json status is pass when the expected assistant response appears." + - "metrics_summary includes response_p50_ms, response_p95_ms, error_rate, and total_duration_ms." + - "thresholds_summary shows response_p95_ms and error_rate pass." +evidence_required: + - ui + - screenshot + - console + - network + - metrics +diagnostics: + - "This case measures browser-visible send-to-completion latency; it does not split provider latency from LangBot overhead." + - "Use backend logs and provider diagnostics to explain slow runs before calling them LangBot regressions." 
+success_patterns: + - "Processing request from person_websocket" + - "Streaming completed" +failure_patterns: + - "Action invoke_llm_stream call timed out" + - "Task exception was never retrieved" + - "All models failed during streaming setup" +troubleshooting: + - debug-chat-history-contaminates-automation + - local-agent-model-route-unavailable + - plugin-runtime-timeout + - proxy-env-mismatch diff --git a/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/.gitignore b/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/.gitignore index 849ddff3b..89d8e500c 100644 --- a/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/.gitignore +++ b/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/.gitignore @@ -1 +1,3 @@ -dist/ +dist/* +!dist/ +!dist/qa-plugin-smoke-0.1.0.lbpkg diff --git a/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/dist/qa-plugin-smoke-0.1.0.lbpkg b/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/dist/qa-plugin-smoke-0.1.0.lbpkg new file mode 100644 index 0000000000000000000000000000000000000000..a4a50f803e0af7218ec8800d3cadff9e64d9b60b GIT binary patch literal 5160 zcmaJ_1yq#X+8s&+W(Y|U1w?5EB&BmGr9n`-q+=-Ql2JmsyCft8q(g+ErDf<+q#TeG zQ1ZUs?|zr--24AC>#VcZtl955=h^#t&a+?j2ROJCAQ0#($hmmR;5?VCIuQ#5+Q9~a zAi%E$y~;?vXZtzDBz={5HXt7%@i zNhDen^GS0(*JMUNCz%eL;C^yVh?uHvb98K?#dV<=DQM}Lgx~(`W1Ep)oOi0WcNv`U ziZtEv+&-D!)`=lAr@jsY1M&qb4?2lq8uytI_6f(8CYsmkBQQDE zoz>i8AE?O8r#kZ?#q(25ArW`<@xn=Kec)GO;yR~S&&M(yi?vk?>eSPvkBM_Lo6|&k zYwZs2WV6m%v@u}^b9M{FYFgdyuhL|sAn|M8Qx5vf(2p(RyFnN2b)N`H5ET!Pp^CLy z-J(z6GP}W(w9IZ4lq^>Rx)*|(`jsbk?7BJ$Z=>Urvdw)2L?Vc z987H-q0U~gsC&dn9x&`E18HXaav7w^;|vPv#JW8uk~rSUrX-RuHpVY=I(Q)&Fk3*= z_M|>ZAhx#6H0%v1yfYY)Z&6@iDfaP4co)<^Qx6Qamv`YbU>uuUJcRIx6DzO@cn(~j z`3Z&>ZqXlE;WhI(@wOS^E6{tNt0(t0-}*thrBMvm85az5p0jV`*9s=eSSX&w7nvkF z?o~r$Y=5JzE(Zl{TLz%vDlpraJ2^sKJ**Hh?W!F-;1}CIg#8)ycOd#Ww#-#yP`U=^ z?N4P@is|f-{?Fg~Kq{gPjE~>lJvs638}vng5&H?BSl3Lub-iD>;++9vsyf_t=p!C7 
z%zgn;El*4*cvL`Yw)2PyVmU8wo7^g{SFyW7zI^z&@Q}9dQ&6#RI(P;qY6_IN*pAu9FMPV{HEsHp5L`7V_~V=C`Ez%8 z(+bFQbzkVhgXttnTZa#=A)`aK0C|rJAsDqQ^y1b|61b9I6ck}TvKZReug@HHVx%i8 zdaIFjx~(6F-@v)LvCOBFi1bP7BJo{kq|UfYfpj3)w_g6Zi)it#Jor;EVKdsu+H>rN znbLTd--`cox%?M~6LQHwPEqynn~bCjjh0E4G&pZKCB5xlQTSI0RB5>!1hI0}UvGaY z6v6>bc$)VjaF#~g9kj351}0jnHKl(l+&H;x^jeWsao8A(W!&azy#G z3TGQ=Nj)bU5u{#k6IFaV?W&>23h_G9fqNZAxGk(5vu&uen_HDOuImhT|a4Od{C#>KDJMKkvh#&Vz}4yWH%l5n!4cTGMej1Jfvd~^@U3Dcg$txh6>z+wu%4;9*~TpLay z@r3k0onk*&8@f9$vKafr7MAK}i};u~dckir6qv(QAM%-eEj<`=Cmm z!gApun?b%>-6PuLHcUiH-~4^OaTiRlntuD&Nk~|WBM-N%sOjC^TAzjwvwjsGB=tX! zz8&sXHC-4K{#qAdPwOg`{4;xAnL=WdfUKDS1{r4d{Jw&9qV5r5-y?;cWSFTkOchdd za^j8(4r~$MX-I|iHEeoga}Y&;{(|dddQD>r9kA-R8UWssyVn>b@XF5JW6oykRp&cb zrBF6@wcwkCws>vpO0uEPNzC5yCq#=2qNoX_?4$4@C$y}ZI3QiSy54VUMn`KWa4k0V zQc4w+(d&o3(>xJpO`95DI5MrCc9%Eu6Re~!MDD{xD_+bv=9sd{1D}0sYnu)Q45kd6bB zHp^==x^Fm16*)~iksvAFAV%=x5O~R)xViX%7*-(f%shdq502ms;?Jy+btyFhdJ7A> zli9?oBi%3T_LNJX`xBPu6GVQF(_gwr8g=zB5i4`W586&BAoWs2^H4MZwpx9JHsihU zOz`x|}P)YMz#V*`Lpxelp?<=pc%4|cfOmj%dv%q`f zfr85Uj8Pv*asjlo`g{B`;>&)Svv!{pjywM8rD+bpd70*KQJ zVmi}|`Yv?vr#iDiR>QaD`4O}ha@`f(8>onLj7!O(<&_l?5a3y0{7x%CkZPbb0unx@>&>pSmonxzJ^*y@MyaC#K&+03Q7?yU_VQ9jxI3QmITvQIgBJsgk#M zeDl|7*)a0+;P|&RX(>g%8#I~;_cr88s7fn{YY@Hv{xkLd#IIHE1MWlg)Oo73gyROu z0dIN(>ukMAtWZPnDpj1VPt24r&zwcsT-l0s11GNmN}WQ-T|HmP_yrFI^}IdFaM2`JY=b#X<`p#CZvq+jd`g3ak4}P2J4P?G&V!(nmMiEpW>4v4|Xi@M(@paZxdG z>_lCxztX)7A(`EwH4v1QgV!AQKIBhUQg!Q0;sI9Ffkx}vAFR0ATUt8*uC!VzPIC+t zE&Cig3l#<`w1YMiEq^ z4(=1zBAHrY>3NDnsa`YuSvF^Ht|Z1_%fqkND&Zb~K1Wk^Ji2*L9xcUeIvdB)O=I^e zoXlV1JcTkFoHzX4GwWj-R;2LF7+vdxd%oR!q1H{d0sHL5j1wbh=Cg1%ZQbX%7G z#53m3vAq@zV+C+L$^kM{hzI@h1EN^Fpd|f1R_3eyjV~KBhCfm~tMXuYYc54aJoC2H zlKKu-8OoMmV(SeuJdm@GbPwg||Oyd-_zN9kV1g$X{TW02CeoiS^rlUzVi5 zDEysS4w~X>boWTVAL$N|^k(KLRvR`YNaNe+wjMaIeXfnPqJzItr%k(h8Xg{505ML%I!hg-IirV30z5%7LM*JQf$JxYOHyBV7GetA;#TDJ{*B=c$Xg zP|cY~Ws5{Wi-@C|Q_>XSE?pt>}dFg6h<*{ 
z8yFxm{gG*|ZC)EA)yP)R6_1W*>)+NT@pTJ2UQ>5c?Kv(=mBYT~tzea~Bc5k2+lfIubn34Yk_U>o=3YHaWp^%`7=HQI%Smo`NFf*i4oJ+BQX ziu`)+T(QOcoI0%IV`E;jd}gvdYU<{uwpH8Q!&fPEQnYf5t&S`o8%xI|6>&Fy8{{;S2sTp05Ly@Znj ztsVyaf31&M>n|Tr2)v-Xe8At8A9Dkkh2|1(1Nd`H{9il3yYl;AVBk9lhzIt&Wdi*J D9m=s2 literal 0 HcmV?d00001 diff --git a/skills/skills/langbot-testing/probes/langbot-debug-chat-concurrency.mjs b/skills/skills/langbot-testing/probes/langbot-debug-chat-concurrency.mjs new file mode 100644 index 000000000..af5153dbf --- /dev/null +++ b/skills/skills/langbot-testing/probes/langbot-debug-chat-concurrency.mjs @@ -0,0 +1,837 @@ +#!/usr/bin/env node + +import crypto from "node:crypto"; +import net from "node:net"; +import tls from "node:tls"; +import { mkdir, writeFile } from "node:fs/promises"; +import { join, resolve } from "node:path"; +import { env, exit } from "node:process"; +import { + apiJson, + appendLine, + ensureEvidence, + evidencePaths, + loadEnvFiles, + localIsoWithOffset, + redact, + resetAndAuthLocalUser, + writeResult, +} from "../../../scripts/e2e/lib/langbot-e2e.mjs"; +import { + buildProviderTimingMetrics, + summarizeFakeProviderState, +} from "./lib/fake-provider-timing.mjs"; + +const DEFAULT_LOCAL_PASSWORD = "LangBotE2ELocalPass!2026"; + +await loadEnvFiles(); +const caseId = env.LBS_CASE_ID || "langbot-debug-chat-concurrency"; +const paths = evidencePaths(caseId); +await ensureEvidence(paths); + +const startedAt = new Date(); +const metricsPath = resolve(paths.evidenceDir, "metrics.json"); +const samplesPath = resolve(paths.evidenceDir, "samples.json"); +const fakeProviderStatePath = resolve(paths.evidenceDir, "fake-provider-state.json"); +const resetDiagnosticPath = resolve(paths.evidenceDir, "debug-chat-reset-diagnostic.json"); +const backendUrl = env.LANGBOT_BACKEND_URL || ""; +const fakeProviderUrl = env.LANGBOT_FAKE_PROVIDER_URL || ""; +const pipelineUrl = env.LANGBOT_E2E_PIPELINE_URL || env.LANGBOT_PIPELINE_URL || ""; +const pipelineName = env.LANGBOT_E2E_PIPELINE_NAME || 
env.LANGBOT_PIPELINE_NAME || ""; +const sessionType = env.LANGBOT_DEBUG_CHAT_LOAD_SESSION_TYPE || env.LANGBOT_E2E_DEBUG_CHAT_SESSION_TYPE || "person"; +const totalRequests = positiveInteger(env.LANGBOT_DEBUG_CHAT_LOAD_REQUESTS, defaultRequests(caseId)); +const concurrency = Math.min(totalRequests, positiveInteger(env.LANGBOT_DEBUG_CHAT_LOAD_CONCURRENCY, defaultConcurrency(caseId))); +const timeoutMs = positiveInteger(env.LANGBOT_DEBUG_CHAT_LOAD_TIMEOUT_MS, defaultTimeout(caseId)); +const expectedPrefix = env.LANGBOT_DEBUG_CHAT_LOAD_EXPECTED_PREFIX || "LBQA"; +const promptTemplate = env.LANGBOT_DEBUG_CHAT_LOAD_PROMPT_TEMPLATE + || "请只回复 \"{expected}\",不要解释,不要添加其他字符。"; +const stream = bool(env.LANGBOT_DEBUG_CHAT_LOAD_STREAM, true); +const resetBeforeRun = bool(env.LANGBOT_DEBUG_CHAT_LOAD_RESET, true); +const responseP95BudgetMs = positiveNumber(env.LANGBOT_DEBUG_CHAT_LOAD_RESPONSE_P95_MS, defaultP95Budget(caseId)); +const firstResponseP95BudgetMs = positiveNumber(env.LANGBOT_DEBUG_CHAT_LOAD_FIRST_RESPONSE_P95_MS, 0); +const maxErrorRate = positiveNumber(env.LANGBOT_DEBUG_CHAT_LOAD_MAX_ERROR_RATE, 0); +const minErrorRate = positiveNumber(env.LANGBOT_DEBUG_CHAT_LOAD_MIN_ERROR_RATE, 0); +const minErrorCount = nonNegativeInteger(env.LANGBOT_DEBUG_CHAT_LOAD_MIN_ERROR_COUNT, 0); +const minOkCount = nonNegativeInteger(env.LANGBOT_DEBUG_CHAT_LOAD_MIN_OK_COUNT, 0); +const minProviderFaultCount = nonNegativeInteger(env.LANGBOT_DEBUG_CHAT_LOAD_MIN_PROVIDER_FAULT_COUNT, 0); +const failOnFinalMismatch = bool(env.LANGBOT_DEBUG_CHAT_LOAD_FAIL_ON_FINAL_MISMATCH, false); +const failureSignals = textList(env.LANGBOT_E2E_FAILURE_SIGNALS || env.LANGBOT_DEBUG_CHAT_LOAD_FAILURE_SIGNALS || ""); + +const result = { + source: "automation", + case_id: caseId, + run_id: paths.runId, + status: "fail", + reason: "", + started_at: startedAt.toISOString(), + started_at_local: localIsoWithOffset(startedAt), + finished_at: "", + finished_at_local: "", + duration_ms: 0, + backend_url: backendUrl, + 
pipeline_url: pipelineUrl, + pipeline_name: pipelineName, + pipeline_id: "", + session_type: sessionType, + load_profile: { + requests: totalRequests, + concurrency, + timeout_ms: timeoutMs, + stream, + reset_before_run: resetBeforeRun, + fail_on_final_mismatch: failOnFinalMismatch, + }, + evidence: { + network_log: paths.networkLog, + metrics_json: metricsPath, + samples_json: samplesPath, + fake_provider_state_json: fakeProviderStatePath, + debug_chat_reset_diagnostic_json: resetDiagnosticPath, + automation_result_json: paths.automationResultJson, + result_json: paths.resultJson, + }, + evidence_collected: ["metrics", "network", "api_diagnostic", "filesystem"], +}; + +try { + if (!backendUrl) { + result.status = "env_issue"; + throw new Error("LANGBOT_BACKEND_URL is not configured."); + } + if (!["person", "group"].includes(sessionType)) { + throw new Error(`LANGBOT_DEBUG_CHAT_LOAD_SESSION_TYPE must be person or group, got ${sessionType}.`); + } + const backendReady = await backendReachable(backendUrl); + if (!backendReady) { + result.status = "env_issue"; + throw new Error(`Backend did not respond at ${backendUrl}.`); + } + + const user = env.LANGBOT_E2E_LOGIN_USER || ""; + const password = env.LANGBOT_E2E_LOGIN_PASSWORD || DEFAULT_LOCAL_PASSWORD; + if (!user) { + result.status = "env_issue"; + throw new Error("LANGBOT_E2E_LOGIN_USER is required so this probe can resolve/reset the Debug Chat session."); + } + const auth = await resetAndAuthLocalUser({ backendUrl, user, password }); + + const pipeline = await resolvePipeline({ backendUrl, token: auth.token, pipelineUrl, pipelineName }); + result.pipeline_id = pipeline.id; + result.pipeline_name = pipeline.name || pipelineName; + if (!result.pipeline_url && env.LANGBOT_FRONTEND_URL) { + result.pipeline_url = `${env.LANGBOT_FRONTEND_URL.replace(/\/$/, "")}/home/pipelines?id=${encodeURIComponent(pipeline.id)}`; + } + + if (resetBeforeRun) { + const reset = await apiJson(backendUrl, 
`/api/v1/pipelines/${encodeURIComponent(pipeline.id)}/ws/reset/${encodeURIComponent(sessionType)}`, { + method: "POST", + token: auth.token, + }); + const resetDiagnostic = { + status: isApiFailure(reset) ? "fail" : "ready", + http_status: reset.status, + code: reset.json.code ?? null, + reason: isApiFailure(reset) ? reset.json.msg || "Debug Chat reset failed." : "Debug Chat session reset.", + }; + await writeFile(resetDiagnosticPath, `${JSON.stringify(resetDiagnostic, null, 2)}\n`, "utf8"); + if (resetDiagnostic.status === "fail") { + throw new Error(resetDiagnostic.reason); + } + } + + const wsUrl = websocketUrl(backendUrl, pipeline.id, sessionType); + const loadStartedAt = performance.now(); + const samples = await runLoad({ + wsUrl, + totalRequests, + concurrency, + timeoutMs, + promptTemplate, + expectedPrefix, + stream, + failOnFinalMismatch, + failureSignals, + }); + const loadDurationMs = performance.now() - loadStartedAt; + const fakeProviderState = await readFakeProviderState(fakeProviderUrl); + if (fakeProviderState) { + await writeFile(fakeProviderStatePath, `${JSON.stringify(fakeProviderState, null, 2)}\n`, "utf8"); + } + const metrics = buildMetrics({ + samples, + totalRequests, + concurrency, + timeoutMs, + loadDurationMs, + backendUrl, + pipelineId: pipeline.id, + sessionType, + fakeProviderState, + }); + const thresholds = buildThresholds(metrics); + const passed = Object.values(thresholds).every((item) => item.pass); + result.status = passed ? "pass" : "fail"; + result.reason = passed + ? "Debug Chat WebSocket concurrency probe passed all thresholds." 
+ : "Debug Chat WebSocket concurrency probe breached latency or error-rate thresholds."; + result.metrics_summary = { + requests: metrics.total_requests, + concurrency: metrics.concurrency, + ok_count: metrics.ok_count, + error_count: metrics.error_count, + timeout_count: metrics.timeout_count, + error_rate: metrics.error_rate, + response_p50_ms: metrics.response_duration_ms.p50, + response_p95_ms: metrics.response_duration_ms.p95, + first_assistant_event_p95_ms: metrics.first_assistant_event_ms.p95, + first_assistant_content_p95_ms: metrics.first_assistant_content_ms.p95, + first_response_p95_ms: metrics.first_response_ms.p95, + throughput_rps: metrics.throughput_rps, + status_counts: metrics.status_counts, + fake_provider_request_count: metrics.fake_provider?.request_count ?? null, + fake_provider_fault_count: metrics.fake_provider?.fault_count ?? null, + fake_provider_duration_p95_ms: metrics.provider_timing?.provider_duration_ms.p95 ?? null, + langbot_overhead_estimate_p95_ms: metrics.provider_timing?.langbot_overhead_estimate_ms.p95 ?? null, + send_to_provider_start_p95_ms: metrics.provider_timing?.send_to_provider_start_ms.p95 ?? null, + provider_finish_to_ws_final_p95_ms: metrics.provider_timing?.provider_finish_to_ws_final_ms.p95 ?? null, + provider_timing_matched_request_count: metrics.provider_timing?.matched_request_count ?? null, + }; + result.thresholds_summary = thresholds; + result.artifacts = { + metrics_json: metricsPath, + samples_json: samplesPath, + fake_provider_state_json: fakeProviderState ? fakeProviderStatePath : "", + network_log: paths.networkLog, + automation_result_json: paths.automationResultJson, + result_json: paths.resultJson, + }; + + await writeFile(metricsPath, `${JSON.stringify({ ...metrics, thresholds }, null, 2)}\n`, "utf8"); + await writeFile(samplesPath, `${JSON.stringify(samples, null, 2)}\n`, "utf8"); +} catch (error) { + if (!["env_issue", "blocked"].includes(result.status)) { + result.status = looksLikeEnvIssue(error) ? 
/**
 * Parse a non-negative numeric setting, falling back when it is absent or invalid.
 *
 * Despite the name, zero is accepted (the `>= 0` check), which lets callers
 * express budgets such as a maximum error rate of "0".
 *
 * @param {unknown} value - Raw setting, typically an environment variable string or undefined.
 * @param {number} fallback - Returned when `value` is missing, blank, non-finite, or negative.
 * @returns {number} The parsed number, or `fallback`.
 */
function positiveNumber(value, fallback) {
  // Number("") and Number("   ") both evaluate to 0, so a missing or blank
  // setting has to be rejected before conversion; otherwise it would silently
  // replace the caller's fallback with 0.
  const text = String(value ?? "").trim();
  if (text === "") return fallback;
  const parsed = Number(text);
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback;
}
/**
 * Derive the Debug Chat WebSocket endpoint for a pipeline from the backend base URL.
 *
 * The scheme becomes `wss:` for an `https:` backend and `ws:` otherwise. The
 * path and query of the base URL are replaced by the connect route and the
 * `session_type` parameter.
 *
 * @param {string} baseUrl - Backend base URL; must be parseable by `URL`.
 * @param {string} pipelineId - Pipeline identifier, URI-encoded into the path.
 * @param {string} sessionType - Session type, URI-encoded into the query string.
 * @returns {string} The serialized WebSocket URL.
 */
function websocketUrl(baseUrl, pipelineId, sessionType) {
  const endpoint = new URL(baseUrl);
  const scheme = endpoint.protocol === "https:" ? "wss:" : "ws:";
  const encodedPipeline = encodeURIComponent(pipelineId);
  const encodedSession = encodeURIComponent(sessionType);
  endpoint.protocol = scheme;
  endpoint.pathname = `/api/v1/pipelines/${encodedPipeline}/ws/connect`;
  endpoint.search = `?session_type=${encodedSession}`;
  return endpoint.href;
}
}; + let closed = false; + let connectedAt = 0; + let sentAt = 0; + const startedAt = performance.now(); + let client = null; + const timer = setTimeout(() => { + finish("timeout", `Timed out after ${timeoutMs} ms.`); + }, timeoutMs); + + client = openRawWebSocket(wsUrl, { + onOpen() { + connectedAt = performance.now(); + const now = Date.now(); + sample.connected_at = new Date(now).toISOString(); + sample.connected_epoch_ms = now; + sample.connected_ms = rounded(connectedAt - startedAt); + }, + onMessage(text) { + sample.event_count += 1; + let data; + try { + data = JSON.parse(String(text || "")); + } catch (error) { + finish("error", `Invalid WebSocket JSON: ${error.message}`); + return; + } + appendLine(paths.networkLog, JSON.stringify({ + request_index: index, + type: data.type, + session_type: data.session_type || "", + role: data.data?.role || "", + is_final: data.data?.is_final ?? null, + content_preview: redact(String(data.data?.content || data.message || "").slice(0, 200)), + })).catch(() => {}); + + if (data.type === "connected") { + sentAt = performance.now(); + const now = Date.now(); + sample.sent_at = new Date(now).toISOString(); + sample.sent_epoch_ms = now; + client.send(JSON.stringify({ + type: "message", + message: [{ type: "Plain", text: prompt }], + stream, + })); + return; + } + if (data.type === "error") { + finish("error", data.message || "WebSocket error message."); + return; + } + if (data.type !== "response" || data.data?.role !== "assistant") return; + + const content = String(data.data.content || ""); + markFirstAssistantEvent(sample, sentAt); + if (content) sample.response_text = content; + if (content) markFirstAssistantContent(sample, sentAt); + if (content.includes(expected) && sample.first_response_ms === null && sentAt > 0) { + const now = Date.now(); + sample.first_response_at = new Date(now).toISOString(); + sample.first_response_epoch_ms = now; + sample.first_response_ms = rounded(performance.now() - sentAt); + } + if 
(data.data.is_final === true) { + const ok = sample.response_text.includes(expected); + if (ok) { + if (sample.first_response_ms === null && sentAt > 0) { + sample.first_response_ms = rounded(performance.now() - sentAt); + } + finish("pass", ""); + } else if (matchesFailureSignal(sample.response_text, failureSignals)) { + finish("app_error", `Assistant final response matched a failure signal: ${sample.response_text}`); + } else if (failOnFinalMismatch && !containsLoadToken(sample.response_text, expectedPrefix)) { + finish("mismatch", `Final assistant response did not include ${expected}: ${sample.response_text}`); + } else { + sample.foreign_response_count += 1; + sample.last_foreign_response_text = sample.response_text; + } + } + }, + onError(error) { + finish("connection_error", `WebSocket connection error: ${error.message}`); + }, + onClose(event) { + sample.close_code = event.code; + sample.close_reason = event.reason || ""; + if (!closed) finish("closed", `WebSocket closed before final assistant response: ${event.code}`); + }, + }); + + function finish(status, reason) { + if (closed) return; + closed = true; + clearTimeout(timer); + sample.status = status; + sample.ok = status === "pass"; + sample.error = status === "timeout" && sample.foreign_response_count > 0 + ? `${reason || ""} Saw ${sample.foreign_response_count} foreign assistant response(s); last=${sample.last_foreign_response_text}` + : reason || ""; + if (sentAt > 0) sample.response_duration_ms = rounded(performance.now() - sentAt); + else sample.response_duration_ms = rounded(performance.now() - startedAt); + const now = Date.now(); + sample.finished_at = new Date(now).toISOString(); + sample.finished_epoch_ms = now; + try { + client?.close(); + } catch { + // Closing a failed socket should not hide the sample result. 
/**
 * Report whether `text` contains a load token of the form `<prefix>-NNNN`
 * (the prefix, a hyphen, then at least four decimal digits).
 *
 * The prefix is matched literally: regular-expression metacharacters in it are
 * escaped before the pattern is built.
 *
 * @param {unknown} text - Text to scan; falsy values are treated as an empty string.
 * @param {unknown} prefix - Token prefix, coerced to a string.
 * @returns {boolean} True when a token with this prefix appears anywhere in `text`.
 */
function containsLoadToken(text, prefix) {
  const literalPrefix = String(prefix).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const tokenPattern = new RegExp(`${literalPrefix}-\\d{4}`);
  const haystack = String(text || "");
  return haystack.search(tokenPattern) !== -1;
}
"https" : "http"; + const request = [ + `GET ${path} HTTP/1.1`, + `Host: ${parsed.host}`, + "Upgrade: websocket", + "Connection: Upgrade", + `Sec-WebSocket-Key: ${key}`, + "Sec-WebSocket-Version: 13", + `Origin: ${originProtocol}://${parsed.host}`, + "", + "", + ].join("\r\n"); + socket.write(request); + }); + socket.on("data", (chunk) => { + buffer = Buffer.concat([buffer, chunk]); + if (!opened) { + const headerEnd = buffer.indexOf("\r\n\r\n"); + if (headerEnd === -1) return; + const headerText = buffer.slice(0, headerEnd).toString("utf8"); + buffer = buffer.slice(headerEnd + 4); + if (!/^HTTP\/1\.1 101\b/i.test(headerText)) { + handlers.onError(new Error(`Handshake failed: ${headerText.split("\r\n")[0] || "missing status"}`)); + socket.destroy(); + return; + } + opened = true; + handlers.onOpen(); + } + processFrames(); + }); + socket.on("error", (error) => { + if (!closed) handlers.onError(error); + }); + socket.on("close", () => { + if (closed) return; + closed = true; + handlers.onClose({ code: null, reason: "" }); + }); + + function processFrames() { + while (true) { + const frame = readFrame(buffer); + if (!frame) return; + buffer = buffer.slice(frame.consumed); + if (frame.opcode === 0x1) { + handlers.onMessage(frame.payload.toString("utf8")); + } else if (frame.opcode === 0x8) { + const code = frame.payload.length >= 2 ? frame.payload.readUInt16BE(0) : null; + const reason = frame.payload.length > 2 ? 
/**
 * Try to decode one WebSocket frame from the start of `buffer`.
 *
 * Handles the 7-bit, 16-bit (code 126) and 64-bit (code 127) payload length
 * encodings and an optional 4-byte masking key. The input buffer is never
 * modified: a masked payload is unmasked into a fresh buffer, while an
 * unmasked payload is returned as a view onto `buffer`.
 *
 * @param {Buffer} buffer - Bytes received so far.
 * @returns {{opcode: number, payload: Buffer, consumed: number} | null}
 *   The decoded frame and the number of bytes it occupied, or null when
 *   `buffer` does not yet hold a complete frame.
 */
function readFrame(buffer) {
  if (buffer.length < 2) return null;

  const opcode = buffer[0] & 0x0f;
  const hasMask = (buffer[1] & 0x80) !== 0;
  const lengthCode = buffer[1] & 0x7f;

  let cursor = 2;
  let payloadLength = lengthCode;
  if (lengthCode === 126) {
    if (buffer.length < cursor + 2) return null;
    payloadLength = buffer.readUInt16BE(cursor);
    cursor += 2;
  } else if (lengthCode === 127) {
    if (buffer.length < cursor + 8) return null;
    // The 64-bit length is read as two 32-bit halves and recombined.
    const upper = buffer.readUInt32BE(cursor);
    const lower = buffer.readUInt32BE(cursor + 4);
    payloadLength = upper * 2 ** 32 + lower;
    cursor += 8;
  }

  let maskKey = null;
  if (hasMask) {
    if (buffer.length < cursor + 4) return null;
    maskKey = buffer.subarray(cursor, cursor + 4);
    cursor += 4;
  }

  const end = cursor + payloadLength;
  if (buffer.length < end) return null;

  const raw = buffer.subarray(cursor, end);
  let payload = raw;
  if (maskKey !== null) {
    // Unmask into a copy so the caller's receive buffer stays untouched.
    payload = Buffer.alloc(raw.length);
    for (const [position, byte] of raw.entries()) {
      payload[position] = byte ^ maskKey[position % 4];
    }
  }

  return { opcode, payload, consumed: end };
}
/**
 * Nearest-rank percentile of a list of numbers, rounded via `rounded`.
 *
 * @param {number[]} values - Samples; not modified (a sorted copy is used).
 * @param {number} percentileValue - Percentile in the range 0-100.
 * @returns {number} The selected sample, or 0 when `values` is empty.
 */
function percentile(values, percentileValue) {
  if (values.length === 0) return 0;
  const sorted = [...values].sort((a, b) => a - b);
  const rank = Math.ceil((percentileValue / 100) * sorted.length) - 1;
  // Clamp at both ends: a percentile of 0 (or below) would otherwise produce
  // index -1 and read `undefined`, which makes `rounded` throw.
  const index = Math.min(sorted.length - 1, Math.max(0, rank));
  return rounded(sorted[index]);
}
/**
 * Evaluate the collected load metrics against the probe's configured budgets.
 *
 * The budgets (`maxErrorRate`, `responseP95BudgetMs`, `minErrorRate`,
 * `minErrorCount`, `minOkCount`, `minProviderFaultCount`,
 * `firstResponseP95BudgetMs`) are module-level values read by this function.
 * The two baseline checks are always present; every other check is added only
 * when its budget is greater than zero.
 *
 * @param {object} metrics - Metrics object as produced by `buildMetrics`.
 * @returns {Record<string, { actual: number, max?: number, min?: number, pass: boolean }>}
 *   One entry per active threshold, each carrying the observed value, the limit
 *   and whether the check passed.
 */
function buildThresholds(metrics) {
  // Lower-bound check: passes when the observed value reaches the minimum.
  const atLeast = (actual, min) => ({ actual, min, pass: actual >= min });
  // Latency budget: requires at least one successful sample to be meaningful.
  const withinLatencyBudget = (actual, max) => ({
    actual,
    max,
    pass: metrics.ok_count > 0 && actual <= max,
  });

  const checks = {};
  checks.error_rate = {
    actual: metrics.error_rate,
    max: maxErrorRate,
    pass: metrics.error_rate <= maxErrorRate,
  };
  checks.response_p95_ms = withinLatencyBudget(metrics.response_duration_ms.p95, responseP95BudgetMs);

  if (minErrorRate > 0) {
    checks.error_rate_min = atLeast(metrics.error_rate, minErrorRate);
  }
  if (minErrorCount > 0) {
    checks.error_count_min = atLeast(metrics.error_count, minErrorCount);
  }
  if (minOkCount > 0) {
    checks.ok_count_min = atLeast(metrics.ok_count, minOkCount);
  }
  if (minProviderFaultCount > 0) {
    // Fake-provider diagnostics are optional; a missing count is treated as zero.
    const observedFaults = metrics.fake_provider?.fault_count ?? 0;
    checks.fake_provider_fault_count_min = atLeast(observedFaults, minProviderFaultCount);
  }
  if (firstResponseP95BudgetMs > 0) {
    checks.first_response_p95_ms = withinLatencyBudget(metrics.first_response_ms.p95, firstResponseP95BudgetMs);
  }
  return checks;
}
"metrics.json"); +const samplesPath = resolve(paths.evidenceDir, "samples.json"); +const fakeProviderStatePath = resolve(paths.evidenceDir, "fake-provider-state.json"); +const resetDiagnosticPath = resolve(paths.evidenceDir, "debug-chat-reset-diagnostic.json"); +const backendUrl = env.LANGBOT_BACKEND_URL || ""; +const fakeProviderUrl = env.LANGBOT_FAKE_PROVIDER_URL || ""; +const sessionType = env.LANGBOT_DEBUG_CHAT_LOAD_SESSION_TYPE || env.LANGBOT_E2E_DEBUG_CHAT_SESSION_TYPE || "person"; +const requestsPerPipeline = positiveInteger(env.LANGBOT_DEBUG_CHAT_LOAD_REQUESTS, 6); +const concurrency = Math.min(requestsPerPipeline * 2, positiveInteger(env.LANGBOT_DEBUG_CHAT_LOAD_CONCURRENCY, 4)); +const timeoutMs = positiveInteger(env.LANGBOT_DEBUG_CHAT_LOAD_TIMEOUT_MS, 30_000); +const stream = bool(env.LANGBOT_DEBUG_CHAT_LOAD_STREAM, true); +const resetBeforeRun = bool(env.LANGBOT_DEBUG_CHAT_LOAD_RESET, true); +const responseP95BudgetMs = positiveNumber(env.LANGBOT_DEBUG_CHAT_LOAD_RESPONSE_P95_MS, 5_000); +const maxErrorRate = positiveNumber(env.LANGBOT_DEBUG_CHAT_LOAD_MAX_ERROR_RATE, 0); +const promptTemplate = env.LANGBOT_DEBUG_CHAT_LOAD_PROMPT_TEMPLATE + || "请只回复 \"{expected}\",不要解释,不要添加其他字符。"; +const failureSignals = textList(env.LANGBOT_E2E_FAILURE_SIGNALS || env.LANGBOT_DEBUG_CHAT_LOAD_FAILURE_SIGNALS || ""); + +const pipelineTargets = [ + { + label: "A", + expectedPrefix: "PIPEA", + otherPrefix: "PIPEB", + url: env.LANGBOT_FAKE_PROVIDER_PIPELINE_A_URL || "", + name: env.LANGBOT_FAKE_PROVIDER_PIPELINE_A_NAME || "", + }, + { + label: "B", + expectedPrefix: "PIPEB", + otherPrefix: "PIPEA", + url: env.LANGBOT_FAKE_PROVIDER_PIPELINE_B_URL || "", + name: env.LANGBOT_FAKE_PROVIDER_PIPELINE_B_NAME || "", + }, +]; + +const result = { + source: "automation", + case_id: caseId, + run_id: paths.runId, + status: "fail", + reason: "", + started_at: startedAt.toISOString(), + started_at_local: localIsoWithOffset(startedAt), + finished_at: "", + finished_at_local: "", + 
duration_ms: 0, + backend_url: backendUrl, + session_type: sessionType, + pipelines: [], + load_profile: { + requests_per_pipeline: requestsPerPipeline, + total_requests: requestsPerPipeline * 2, + concurrency, + timeout_ms: timeoutMs, + stream, + reset_before_run: resetBeforeRun, + }, + evidence: { + network_log: paths.networkLog, + metrics_json: metricsPath, + samples_json: samplesPath, + fake_provider_state_json: fakeProviderStatePath, + debug_chat_reset_diagnostic_json: resetDiagnosticPath, + automation_result_json: paths.automationResultJson, + result_json: paths.resultJson, + }, + evidence_collected: ["metrics", "network", "api_diagnostic", "filesystem"], +}; + +try { + if (!backendUrl) { + result.status = "env_issue"; + throw new Error("LANGBOT_BACKEND_URL is not configured."); + } + if (!["person", "group"].includes(sessionType)) { + throw new Error(`LANGBOT_DEBUG_CHAT_LOAD_SESSION_TYPE must be person or group, got ${sessionType}.`); + } + for (const target of pipelineTargets) { + if (!target.url && !target.name) { + result.status = "env_issue"; + throw new Error(`Set LANGBOT_FAKE_PROVIDER_PIPELINE_${target.label}_URL or LANGBOT_FAKE_PROVIDER_PIPELINE_${target.label}_NAME.`); + } + } + + const backendReady = await backendReachable(backendUrl); + if (!backendReady) { + result.status = "env_issue"; + throw new Error(`Backend did not respond at ${backendUrl}.`); + } + + const user = env.LANGBOT_E2E_LOGIN_USER || ""; + const password = env.LANGBOT_E2E_LOGIN_PASSWORD || DEFAULT_LOCAL_PASSWORD; + if (!user) { + result.status = "env_issue"; + throw new Error("LANGBOT_E2E_LOGIN_USER is required so this probe can resolve/reset Debug Chat sessions."); + } + const auth = await resetAndAuthLocalUser({ backendUrl, user, password }); + const pipelines = []; + for (const target of pipelineTargets) { + const pipeline = await resolvePipeline({ + backendUrl, + token: auth.token, + pipelineUrl: target.url, + pipelineName: target.name, + }); + pipelines.push({ + ...target, + 
id: pipeline.id, + name: pipeline.name || target.name, + wsUrl: websocketUrl(backendUrl, pipeline.id, sessionType), + }); + } + result.pipelines = pipelines.map((pipeline) => ({ + label: pipeline.label, + id: pipeline.id, + name: pipeline.name, + url: pipeline.url, + })); + + if (resetBeforeRun) { + const resetDiagnostics = []; + for (const pipeline of pipelines) { + const reset = await apiJson(backendUrl, `/api/v1/pipelines/${encodeURIComponent(pipeline.id)}/ws/reset/${encodeURIComponent(sessionType)}`, { + method: "POST", + token: auth.token, + }); + resetDiagnostics.push({ + pipeline_label: pipeline.label, + pipeline_id: pipeline.id, + status: isApiFailure(reset) ? "fail" : "ready", + http_status: reset.status, + code: reset.json.code ?? null, + reason: isApiFailure(reset) ? reset.json.msg || "Debug Chat reset failed." : "Debug Chat session reset.", + }); + } + await writeFile(resetDiagnosticPath, `${JSON.stringify(resetDiagnostics, null, 2)}\n`, "utf8"); + const failedReset = resetDiagnostics.find((item) => item.status === "fail"); + if (failedReset) throw new Error(failedReset.reason); + } + await resetFakeProvider(fakeProviderUrl); + + const jobs = []; + for (let index = 0; index < requestsPerPipeline; index += 1) { + for (const pipeline of pipelines) { + jobs.push({ ...pipeline, index }); + } + } + + const loadStartedAt = performance.now(); + const samples = await runLoad({ + jobs, + concurrency, + timeoutMs, + promptTemplate, + stream, + failureSignals, + }); + const loadDurationMs = performance.now() - loadStartedAt; + const fakeProviderState = await readFakeProviderState(fakeProviderUrl); + if (fakeProviderState) { + await writeFile(fakeProviderStatePath, `${JSON.stringify(fakeProviderState, null, 2)}\n`, "utf8"); + } + const metrics = buildMetrics({ + samples, + requestsPerPipeline, + concurrency, + timeoutMs, + loadDurationMs, + backendUrl, + sessionType, + fakeProviderState, + }); + const thresholds = buildThresholds(metrics); + const passed = 
Object.values(thresholds).every((item) => item.pass); + result.status = passed ? "pass" : "fail"; + result.reason = passed + ? "Debug Chat cross-pipeline isolation probe passed all thresholds." + : "Debug Chat cross-pipeline isolation probe found leaks, errors, or latency threshold breaches."; + result.metrics_summary = { + requests_per_pipeline: metrics.requests_per_pipeline, + total_requests: metrics.total_requests, + concurrency: metrics.concurrency, + ok_count: metrics.ok_count, + error_count: metrics.error_count, + cross_pipeline_leak_count: metrics.cross_pipeline_leak_count, + timeout_count: metrics.timeout_count, + error_rate: metrics.error_rate, + response_p95_ms: metrics.response_duration_ms.p95, + first_response_p95_ms: metrics.first_response_ms.p95, + throughput_rps: metrics.throughput_rps, + status_counts: metrics.status_counts, + by_pipeline: metrics.by_pipeline, + fake_provider_request_count: metrics.fake_provider?.request_count ?? null, + fake_provider_duration_p95_ms: metrics.provider_timing?.provider_duration_ms.p95 ?? null, + langbot_overhead_estimate_p95_ms: metrics.provider_timing?.langbot_overhead_estimate_ms.p95 ?? null, + send_to_provider_start_p95_ms: metrics.provider_timing?.send_to_provider_start_ms.p95 ?? null, + provider_finish_to_ws_final_p95_ms: metrics.provider_timing?.provider_finish_to_ws_final_ms.p95 ?? null, + }; + result.thresholds_summary = thresholds; + result.artifacts = { + metrics_json: metricsPath, + samples_json: samplesPath, + fake_provider_state_json: fakeProviderState ? fakeProviderStatePath : "", + network_log: paths.networkLog, + automation_result_json: paths.automationResultJson, + result_json: paths.resultJson, + }; + + await writeFile(metricsPath, `${JSON.stringify({ ...metrics, thresholds }, null, 2)}\n`, "utf8"); + await writeFile(samplesPath, `${JSON.stringify(samples, null, 2)}\n`, "utf8"); +} catch (error) { + if (!["env_issue", "blocked"].includes(result.status)) { + result.status = looksLikeEnvIssue(error) ? 
"env_issue" : "fail"; + } + result.reason = result.reason || safeReason(error.message); +} finally { + const finishedAt = new Date(); + result.finished_at = finishedAt.toISOString(); + result.finished_at_local = localIsoWithOffset(finishedAt); + result.duration_ms = finishedAt.getTime() - startedAt.getTime(); + await mkdir(paths.evidenceDir, { recursive: true }); + await writeResult(paths, result); + console.log(JSON.stringify(result, null, 2)); +} + +exit(result.status === "pass" ? 0 : result.status === "env_issue" || result.status === "blocked" ? 2 : 1); + +async function backendReachable(baseUrl) { + try { + const response = await fetch(`${baseUrl.replace(/\/$/, "")}/healthz`, { + signal: AbortSignal.timeout(3000), + }); + return response.status < 500; + } catch { + return false; + } +} + +async function resetFakeProvider(rootUrl) { + if (!rootUrl) return; + try { + await fetch(`${normalizeProviderRootUrl(rootUrl)}/__qa/reset`, { + method: "POST", + signal: AbortSignal.timeout(3000), + }); + } catch { + // Missing fake-provider diagnostics should not hide the isolation result. + } +} + +async function readFakeProviderState(rootUrl) { + if (!rootUrl) return null; + try { + const response = await fetch(`${normalizeProviderRootUrl(rootUrl)}/__qa/config`, { + signal: AbortSignal.timeout(3000), + }); + const json = await response.json().catch(() => ({})); + return { + status: response.ok && json.ok === true ? "loaded" : "unavailable", + url: normalizeProviderRootUrl(rootUrl), + http_status: response.status, + model: json.model || "", + config: json.config || {}, + request_count: Number.isFinite(json.request_count) ? json.request_count : null, + recent_requests: Array.isArray(json.recent_requests) ? 
/**
 * Derive the Debug Chat WebSocket endpoint for a pipeline from the backend URL.
 *
 * The scheme becomes `wss:` for an `https:` backend and `ws:` otherwise. Any
 * path or query string present on `baseUrl` is replaced by the connect route.
 *
 * @param {string} baseUrl - Backend base URL.
 * @param {string} pipelineId - Pipeline identifier; URL-encoded into the path.
 * @param {string} sessionTypeValue - Value for the `session_type` query parameter.
 * @returns {string} The absolute WebSocket URL.
 */
function websocketUrl(baseUrl, pipelineId, sessionTypeValue) {
  const endpoint = new URL(baseUrl);
  const isTls = endpoint.protocol === "https:";
  const encodedPipeline = encodeURIComponent(pipelineId);
  const encodedSession = encodeURIComponent(sessionTypeValue);

  if (isTls) {
    endpoint.protocol = "wss:";
  } else {
    endpoint.protocol = "ws:";
  }
  endpoint.pathname = `/api/v1/pipelines/${encodedPipeline}/ws/connect`;
  endpoint.search = `?session_type=${encodedSession}`;
  return endpoint.toString();
}
response_duration_ms: null, + finished_at: null, + finished_epoch_ms: null, + event_count: 0, + same_pipeline_foreign_response_count: 0, + cross_pipeline_leak_count: 0, + last_foreign_response_text: "", + error: "", + close_code: null, + close_reason: "", + }; + let closed = false; + let connectedAt = 0; + let sentAt = 0; + const startedPerf = performance.now(); + let client = null; + const timer = setTimeout(() => { + finish("timeout", `Timed out after ${timeoutMs} ms.`); + }, timeoutMs); + + client = openRawWebSocket(job.wsUrl, { + onOpen() { + connectedAt = performance.now(); + const now = Date.now(); + sample.connected_at = new Date(now).toISOString(); + sample.connected_epoch_ms = now; + sample.connected_ms = rounded(connectedAt - startedPerf); + }, + onMessage(text) { + sample.event_count += 1; + let data; + try { + data = JSON.parse(String(text || "")); + } catch (error) { + finish("error", `Invalid WebSocket JSON: ${error.message}`); + return; + } + appendLine(paths.networkLog, JSON.stringify({ + pipeline_label: job.label, + request_index: job.index, + type: data.type, + session_type: data.session_type || "", + role: data.data?.role || "", + is_final: data.data?.is_final ?? 
null, + content_preview: redact(String(data.data?.content || data.message || "").slice(0, 200)), + })).catch(() => {}); + + if (data.type === "connected") { + sentAt = performance.now(); + const now = Date.now(); + sample.sent_at = new Date(now).toISOString(); + sample.sent_epoch_ms = now; + client.send(JSON.stringify({ + type: "message", + message: [{ type: "Plain", text: prompt }], + stream, + })); + return; + } + if (data.type === "error") { + finish("error", data.message || "WebSocket error message."); + return; + } + if (data.type !== "response" || data.data?.role !== "assistant") return; + + const content = String(data.data.content || ""); + markFirstAssistantEvent(sample, sentAt); + if (content) sample.response_text = content; + if (content) markFirstAssistantContent(sample, sentAt); + if (containsPipelineToken(content, job.otherPrefix)) { + sample.cross_pipeline_leak_count += 1; + finish("cross_pipeline_leak", `Pipeline ${job.label} received response from ${job.otherPrefix}: ${content}`); + return; + } + if (content.includes(expected) && sample.first_response_ms === null && sentAt > 0) { + const now = Date.now(); + sample.first_response_at = new Date(now).toISOString(); + sample.first_response_epoch_ms = now; + sample.first_response_ms = rounded(performance.now() - sentAt); + } + if (data.data.is_final === true) { + const ok = sample.response_text.includes(expected); + if (ok) { + if (sample.first_response_ms === null && sentAt > 0) { + const now = Date.now(); + sample.first_response_at = new Date(now).toISOString(); + sample.first_response_epoch_ms = now; + sample.first_response_ms = rounded(performance.now() - sentAt); + } + finish("pass", ""); + } else if (matchesFailureSignal(sample.response_text, failureSignals)) { + finish("app_error", `Assistant final response matched a failure signal: ${sample.response_text}`); + } else if (containsPipelineToken(sample.response_text, job.expectedPrefix)) { + sample.same_pipeline_foreign_response_count += 1; + 
sample.last_foreign_response_text = sample.response_text; + } else { + finish("mismatch", `Final assistant response did not include ${expected}: ${sample.response_text}`); + } + } + }, + onError(error) { + finish("connection_error", `WebSocket connection error: ${error.message}`); + }, + onClose(event) { + sample.close_code = event.code; + sample.close_reason = event.reason || ""; + if (!closed) finish("closed", `WebSocket closed before final assistant response: ${event.code}`); + }, + }); + + function finish(status, reason) { + if (closed) return; + closed = true; + clearTimeout(timer); + sample.status = status; + sample.ok = status === "pass"; + sample.error = status === "timeout" && sample.same_pipeline_foreign_response_count > 0 + ? `${reason || ""} Saw ${sample.same_pipeline_foreign_response_count} same-pipeline foreign assistant response(s); last=${sample.last_foreign_response_text}` + : reason || ""; + if (sentAt > 0) sample.response_duration_ms = rounded(performance.now() - sentAt); + else sample.response_duration_ms = rounded(performance.now() - startedPerf); + const now = Date.now(); + sample.finished_at = new Date(now).toISOString(); + sample.finished_epoch_ms = now; + try { + client?.close(); + } catch { + // Closing a failed socket should not hide the sample result. 
/**
 * Report whether `text` contains a pipeline token of the form `<prefix>-NNNN`
 * (the prefix, a hyphen, then four digits).
 *
 * @param {unknown} text - Text to scan; falsy values are treated as empty.
 * @param {unknown} prefix - Pipeline prefix, matched literally.
 * @returns {boolean} `true` when such a token occurs anywhere in `text`.
 */
function containsPipelineToken(text, prefix) {
  const haystack = String(text || "");
  // Escape regex metacharacters so the prefix is matched as plain text.
  const literalPrefix = String(prefix).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const tokenPattern = new RegExp(`${literalPrefix}-\\d{4}`);
  return tokenPattern.test(haystack);
}
"https" : "http"; + const request = [ + `GET ${path} HTTP/1.1`, + `Host: ${parsed.host}`, + "Upgrade: websocket", + "Connection: Upgrade", + `Sec-WebSocket-Key: ${key}`, + "Sec-WebSocket-Version: 13", + `Origin: ${originProtocol}://${parsed.host}`, + "", + "", + ].join("\r\n"); + socket.write(request); + }); + socket.on("data", (chunk) => { + buffer = Buffer.concat([buffer, chunk]); + if (!opened) { + const headerEnd = buffer.indexOf("\r\n\r\n"); + if (headerEnd === -1) return; + const headerText = buffer.slice(0, headerEnd).toString("utf8"); + buffer = buffer.slice(headerEnd + 4); + if (!/^HTTP\/1\.1 101\b/i.test(headerText)) { + handlers.onError(new Error(`Handshake failed: ${headerText.split("\r\n")[0] || "missing status"}`)); + socket.destroy(); + return; + } + opened = true; + handlers.onOpen(); + } + processFrames(); + }); + socket.on("error", (error) => { + if (!closed) handlers.onError(error); + }); + socket.on("close", () => { + if (closed) return; + closed = true; + handlers.onClose({ code: null, reason: "" }); + }); + + function processFrames() { + while (true) { + const frame = readFrame(buffer); + if (!frame) return; + buffer = buffer.slice(frame.consumed); + if (frame.opcode === 0x1) { + handlers.onMessage(frame.payload.toString("utf8")); + } else if (frame.opcode === 0x8) { + const code = frame.payload.length >= 2 ? frame.payload.readUInt16BE(0) : null; + const reason = frame.payload.length > 2 ? 
/**
 * Send one masked WebSocket frame with the FIN bit set.
 *
 * The payload length is encoded in 7 bits, 16 bits (marker 126) or 64 bits
 * (marker 127) depending on its size, followed by a random 4-byte masking key
 * and the payload XOR-ed with that key. The frame is written with a single
 * `socket.write` call.
 *
 * @param {{ write(chunk: Buffer): unknown }} socket - Destination socket.
 * @param {number} opcode - Frame opcode (e.g. 0x1 text, 0x8 close, 0xA pong).
 * @param {Buffer|string|undefined} payload - Frame body; a falsy value sends an
 *   empty payload and non-Buffer values are converted with `Buffer.from`.
 * @returns {void}
 */
function writeFrame(socket, opcode, payload) {
  const data = Buffer.isBuffer(payload) ? payload : Buffer.from(payload || "");
  const size = data.length;

  // Number of extended-length bytes that follow the two fixed header bytes.
  let extendedLengthBytes = 0;
  if (size >= 126) {
    extendedLengthBytes = size <= 0xffff ? 2 : 8;
  }
  const maskOffset = 2 + extendedLengthBytes;
  const payloadOffset = maskOffset + 4;
  const frame = Buffer.alloc(payloadOffset + size);

  frame[0] = 0x80 | opcode;
  if (extendedLengthBytes === 0) {
    frame[1] = 0x80 | size;
  } else if (extendedLengthBytes === 2) {
    frame[1] = 0x80 | 126;
    frame.writeUInt16BE(size, 2);
  } else {
    frame[1] = 0x80 | 127;
    frame.writeUInt32BE(Math.floor(size / 2 ** 32), 2);
    frame.writeUInt32BE(size >>> 0, 6);
  }

  const maskKey = crypto.randomBytes(4);
  maskKey.copy(frame, maskOffset);
  // Mask into the frame buffer; the caller's payload is left untouched.
  for (let position = 0; position < size; position += 1) {
    frame[payloadOffset + position] = data[position] ^ maskKey[position % 4];
  }
  socket.write(frame);
}
/**
 * Parse a non-negative number from an environment-style value.
 *
 * Despite the name, zero is accepted (a zero budget is a valid setting).
 * A missing, empty or whitespace-only value now yields `fallback`. Previously
 * such input was coerced through `Number("")` to 0, which passed the `>= 0`
 * check, so an unset variable silently became 0 instead of the default.
 *
 * @param {unknown} value - Raw setting, typically a string or `undefined`.
 * @param {number} fallback - Value returned when `value` is absent or invalid.
 * @returns {number} The parsed finite number >= 0, otherwise `fallback`.
 */
function positiveNumber(value, fallback) {
  const text = String(value ?? "").trim();
  // Number("") is 0, so an absent setting must be rejected before parsing.
  if (text === "") return fallback;
  const parsed = Number(text);
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback;
}
-date.getTimezoneOffset(); + const sign = offsetMinutes >= 0 ? "+" : "-"; + const absolute = Math.abs(offsetMinutes); + return [ + `${date.getFullYear()}-${pad(date.getMonth() + 1)}-${pad(date.getDate())}`, + `T${pad(date.getHours())}:${pad(date.getMinutes())}:${pad(date.getSeconds())}.${pad(date.getMilliseconds(), 3)}`, + `${sign}${pad(Math.floor(absolute / 60))}:${pad(absolute % 60)}`, + ].join(""); +} + +function timestampSlug(date = new Date()) { + return date.toISOString().replace(/\.\d{3}Z$/, "Z").replace(/[^0-9A-Za-z]+/g, "-").replace(/^-|-$/g, ""); +} + +const scenarios = [ + { + id: "provider-timeout", + target: "provider", + injected_fault: "fake provider request exceeds the configured timeout", + expected_status: "env_issue", + recovery_check: "provider route is reachable or the case remains outside product pass/fail", + cleanup: "stop fake provider or reset proxy route", + }, + { + id: "plugin-runtime-disconnect", + target: "plugin-runtime", + injected_fault: "runtime control channel disconnects during an action", + expected_status: "fail", + recovery_check: "runtime reconnects and a deterministic plugin action succeeds", + cleanup: "restart the local plugin runtime process", + }, + { + id: "mcp-stdio-server-exit", + target: "mcp", + injected_fault: "stdio server exits mid-call", + expected_status: "fail", + recovery_check: "server can be registered again and exposes the expected tool", + cleanup: "remove temporary MCP server registration", + }, + { + id: "operator-missing-login", + target: "webui", + injected_fault: "browser profile is not authenticated", + expected_status: "blocked", + recovery_check: "authenticated profile can open the same WebUI origin", + cleanup: "no product cleanup; refresh local login state", + }, + { + id: "transient-marketplace-timeout", + target: "marketplace", + injected_fault: "marketplace request times out once and then succeeds", + expected_status: "flaky", + recovery_check: "rerun passes with the same product revision 
and no code change", + cleanup: "clear retry-only evidence and keep the run classified as flaky", + }, +]; + +function validateScenario(scenario) { + const missing = ["id", "target", "injected_fault", "expected_status", "recovery_check", "cleanup"] + .filter((key) => !scenario[key]); + const allowedStatuses = new Set(["pass", "fail", "blocked", "env_issue", "flaky"]); + return { + id: scenario.id, + pass: missing.length === 0 && allowedStatuses.has(scenario.expected_status), + missing, + expected_status: scenario.expected_status, + }; +} + +async function main() { + const root = resolve(env.LBS_ROOT || process.cwd()); + const caseId = "langbot-fault-taxonomy-contract"; + const runId = env.LBS_RUN_ID || `${timestampSlug()}-${caseId}`; + const evidenceDir = resolve(env.LBS_EVIDENCE_DIR || join(root, "reports", "evidence", runId)); + await mkdir(evidenceDir, { recursive: true }); + + const startedAt = new Date(); + const validations = scenarios.map(validateScenario); + const statusCounts = {}; + for (const scenario of scenarios) { + statusCounts[scenario.expected_status] = (statusCounts[scenario.expected_status] || 0) + 1; + } + const metrics = { + probe: caseId, + scenario_count: scenarios.length, + status_counts: statusCounts, + scenarios, + validations, + }; + const thresholds = { + scenario_count: { actual: scenarios.length, min: 5, pass: scenarios.length >= 5 }, + invalid_scenario_count: { + actual: validations.filter((item) => !item.pass).length, + max: 0, + pass: validations.every((item) => item.pass), + }, + cleanup_declared_count: { + actual: scenarios.filter((item) => item.cleanup).length, + min: scenarios.length, + pass: scenarios.every((item) => item.cleanup), + }, + }; + const status = Object.values(thresholds).every((item) => item.pass) ? 
"pass" : "fail"; + const metricsPath = join(evidenceDir, "metrics.json"); + const faultModelPath = join(evidenceDir, "fault-model.json"); + const automationResultPath = join(evidenceDir, "automation-result.json"); + const resultPath = join(evidenceDir, "result.json"); + + await writeFile(metricsPath, `${JSON.stringify(metrics, null, 2)}\n`, "utf8"); + await writeFile(faultModelPath, `${JSON.stringify({ scenarios }, null, 2)}\n`, "utf8"); + + const finishedAt = new Date(); + const result = { + source: "automation", + case_id: caseId, + run_id: runId, + status, + reason: status === "pass" + ? "Fault taxonomy contract declares status, recovery, and cleanup for every scenario." + : "Fault taxonomy contract is missing required scenario fields.", + started_at: startedAt.toISOString(), + started_at_local: localIsoWithOffset(startedAt), + finished_at: finishedAt.toISOString(), + finished_at_local: localIsoWithOffset(finishedAt), + duration_ms: finishedAt.getTime() - startedAt.getTime(), + metrics_summary: { + scenario_count: metrics.scenario_count, + status_counts: metrics.status_counts, + invalid_scenario_count: thresholds.invalid_scenario_count.actual, + }, + thresholds_summary: thresholds, + artifacts: { + metrics_json: metricsPath, + fault_model_json: faultModelPath, + automation_result_json: automationResultPath, + result_json: resultPath, + }, + evidence_collected: ["metrics", "filesystem"], + }; + + const resultText = `${JSON.stringify(result, null, 2)}\n`; + await writeFile(automationResultPath, resultText, "utf8"); + await writeFile(resultPath, resultText, "utf8"); + console.log(JSON.stringify(result, null, 2)); + exit(status === "pass" ? 
0 : 1); +} + +await main(); diff --git a/skills/skills/langbot-testing/probes/langbot-live-backend-latency.mjs b/skills/skills/langbot-testing/probes/langbot-live-backend-latency.mjs new file mode 100644 index 000000000..747c84c6a --- /dev/null +++ b/skills/skills/langbot-testing/probes/langbot-live-backend-latency.mjs @@ -0,0 +1,212 @@ +#!/usr/bin/env node + +import { mkdir, writeFile } from "node:fs/promises"; +import { join, resolve } from "node:path"; +import { env, exit } from "node:process"; + +function pad(value, size = 2) { + return String(value).padStart(size, "0"); +} + +function localIsoWithOffset(date = new Date()) { + const offsetMinutes = -date.getTimezoneOffset(); + const sign = offsetMinutes >= 0 ? "+" : "-"; + const absolute = Math.abs(offsetMinutes); + return [ + `${date.getFullYear()}-${pad(date.getMonth() + 1)}-${pad(date.getDate())}`, + `T${pad(date.getHours())}:${pad(date.getMinutes())}:${pad(date.getSeconds())}.${pad(date.getMilliseconds(), 3)}`, + `${sign}${pad(Math.floor(absolute / 60))}:${pad(absolute % 60)}`, + ].join(""); +} + +function timestampSlug(date = new Date()) { + return date.toISOString().replace(/\.\d{3}Z$/, "Z").replace(/[^0-9A-Za-z]+/g, "-").replace(/^-|-$/g, ""); +} + +function percentile(values, percentileValue) { + if (values.length === 0) return 0; + const sorted = [...values].sort((a, b) => a - b); + const index = Math.min(sorted.length - 1, Math.ceil((percentileValue / 100) * sorted.length) - 1); + return Number(sorted[index].toFixed(3)); +} + +function stats(values) { + if (values.length === 0) return { min: 0, p50: 0, p95: 0, p99: 0, max: 0 }; + return { + min: Number(Math.min(...values).toFixed(3)), + p50: percentile(values, 50), + p95: percentile(values, 95), + p99: percentile(values, 99), + max: Number(Math.max(...values).toFixed(3)), + }; +} + +function parseJsonList(value, fallback) { + if (!value) return fallback; + try { + const parsed = JSON.parse(value); + return Array.isArray(parsed) && parsed.every((item) 
/**
 * Join a backend base URL and an endpoint path with exactly one slash between them.
 *
 * @param {string} baseUrl - Base URL; any trailing slashes are stripped.
 * @param {string} path - Endpoint path, with or without a leading slash.
 * @returns {string} The combined URL.
 */
function joinUrl(baseUrl, path) {
  const base = baseUrl.replace(/\/+$/, "");
  const suffix = path.startsWith("/") ? path : `/${path}`;
  return `${base}${suffix}`;
}

/**
 * Issue a single GET request and record its outcome and wall-clock latency.
 *
 * Never rejects: network errors and timeouts are reported as a sample with
 * `status: 0`, `ok: false` and the error message, so callers can aggregate
 * failures alongside successes.
 *
 * @param {string} url - Absolute URL to request.
 * @param {number} timeoutMs - Abort the request after this many milliseconds.
 * @returns {Promise<{url: string, ok: boolean, status: number, latency_ms: number, error: string}>}
 *   `ok` is true for any HTTP status below 500.
 */
async function fetchOnce(url, timeoutMs) {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
  const started = performance.now();
  try {
    const response = await fetch(url, { method: "GET", signal: controller.signal });
    // Drain the body so the measured latency covers the whole response, not just the headers.
    await response.arrayBuffer();
    const latencyMs = performance.now() - started;
    return {
      url,
      ok: response.status < 500,
      status: response.status,
      latency_ms: Number(latencyMs.toFixed(3)),
      error: "",
    };
  } catch (error) {
    const latencyMs = performance.now() - started;
    return {
      url,
      ok: false,
      status: 0,
      latency_ms: Number(latencyMs.toFixed(3)),
      error: error instanceof Error ? error.message : String(error),
    };
  } finally {
    // Always release the timer so a finished request cannot keep the process alive.
    clearTimeout(timeout);
  }
}

/**
 * Send `totalRequests` GET requests, cycling through `urls`, in sequential
 * batches of at most `concurrency` parallel requests.
 *
 * @param {string[]} urls - URLs to cycle through (round-robin by request index).
 * @param {number} totalRequests - Total number of requests to send.
 * @param {number} concurrency - Maximum requests in flight per batch. Values
 *   that are not at least 1 (0, negatives, fractions below 1, NaN) fall back
 *   to 1; fractional values are floored.
 * @param {number} timeoutMs - Per-request timeout in milliseconds.
 * @returns {Promise<object[]>} One sample per request, in request order.
 */
async function runBatches(urls, totalRequests, concurrency, timeoutMs) {
  // splice(0, n) removes nothing when n is 0, negative, below 1, or NaN, which
  // would leave the queue full and spin this loop forever. Clamp to >= 1.
  const batchSize = concurrency >= 1 ? Math.floor(concurrency) : 1;
  const queue = Array.from({ length: totalRequests }, (_, index) => urls[index % urls.length]);
  const results = [];
  while (queue.length > 0) {
    const batch = queue.splice(0, batchSize);
    results.push(...await Promise.all(batch.map((url) => fetchOnce(url, timeoutMs))));
  }
  return results;
}
concurrency = Number(env.LANGBOT_PERF_CONCURRENCY || "2"); + const timeoutMs = Number(env.LANGBOT_PERF_TIMEOUT_MS || "5000"); + const p95BudgetMs = Number(env.LANGBOT_PERF_BACKEND_P95_MS || "1000"); + const maxErrorRate = Number(env.LANGBOT_PERF_MAX_ERROR_RATE || "0"); + const metricsPath = join(evidenceDir, "metrics.json"); + const networkLogPath = join(evidenceDir, "network.log"); + const automationResultPath = join(evidenceDir, "automation-result.json"); + const resultPath = join(evidenceDir, "result.json"); + + let status = "fail"; + let reason = ""; + let results = []; + if (!backendUrl) { + status = "env_issue"; + reason = "LANGBOT_BACKEND_URL is not configured."; + } else { + const urls = endpoints.map((path) => joinUrl(backendUrl, path)); + results = await runBatches(urls, totalRequests, concurrency, timeoutMs); + const okCount = results.filter((item) => item.ok).length; + const errorCount = results.length - okCount; + const errorRate = results.length === 0 ? 1 : errorCount / results.length; + const latencies = results.filter((item) => item.ok).map((item) => item.latency_ms); + const latencyStats = stats(latencies); + const allConnectionFailures = results.length > 0 && results.every((item) => item.status === 0); + if (allConnectionFailures) { + status = "env_issue"; + reason = `Backend did not respond at ${backendUrl}.`; + } else if (latencyStats.p95 <= p95BudgetMs && errorRate <= maxErrorRate) { + status = "pass"; + reason = "Live backend latency probe passed all thresholds."; + } else { + status = "fail"; + reason = "Live backend latency probe breached latency or error-rate thresholds."; + } + } + + const statusCounts = {}; + for (const item of results) { + const key = item.status === 0 ? 
"network_error" : String(item.status); + statusCounts[key] = (statusCounts[key] || 0) + 1; + } + const okResults = results.filter((item) => item.ok); + const metrics = { + probe: caseId, + backend_url: backendUrl, + endpoints, + total_requests: totalRequests, + concurrency, + timeout_ms: timeoutMs, + ok_count: okResults.length, + error_count: results.length - okResults.length, + error_rate: results.length === 0 ? 1 : Number(((results.length - okResults.length) / results.length).toFixed(4)), + latency_ms: stats(okResults.map((item) => item.latency_ms)), + status_counts: statusCounts, + }; + const thresholds = { + backend_p95_ms: { actual: metrics.latency_ms.p95, max: p95BudgetMs, pass: metrics.latency_ms.p95 <= p95BudgetMs }, + error_rate: { actual: metrics.error_rate, max: maxErrorRate, pass: metrics.error_rate <= maxErrorRate }, + }; + + await writeFile(metricsPath, `${JSON.stringify({ ...metrics, samples: results }, null, 2)}\n`, "utf8"); + await writeFile(networkLogPath, results.map((item) => JSON.stringify(item)).join("\n") + (results.length > 0 ? 
"\n" : ""), "utf8"); + + const finishedAt = new Date(); + const result = { + source: "automation", + case_id: caseId, + run_id: runId, + status, + reason, + started_at: startedAt.toISOString(), + started_at_local: localIsoWithOffset(startedAt), + finished_at: finishedAt.toISOString(), + finished_at_local: localIsoWithOffset(finishedAt), + duration_ms: finishedAt.getTime() - startedAt.getTime(), + url: backendUrl, + metrics_summary: { + requests: metrics.total_requests, + concurrency: metrics.concurrency, + ok_count: metrics.ok_count, + error_rate: metrics.error_rate, + latency_p50_ms: metrics.latency_ms.p50, + latency_p95_ms: metrics.latency_ms.p95, + status_counts: metrics.status_counts, + }, + thresholds_summary: thresholds, + artifacts: { + metrics_json: metricsPath, + network_log: networkLogPath, + automation_result_json: automationResultPath, + result_json: resultPath, + }, + evidence_collected: ["metrics", "network", "api_diagnostic", "filesystem"], + }; + + const resultText = `${JSON.stringify(result, null, 2)}\n`; + await writeFile(automationResultPath, resultText, "utf8"); + await writeFile(resultPath, resultText, "utf8"); + console.log(JSON.stringify(result, null, 2)); + exit(status === "pass" ? 0 : status === "env_issue" ? 
2 : 1); +} + +await main(); diff --git a/skills/skills/langbot-testing/probes/langbot-live-backend-log-health.mjs b/skills/skills/langbot-testing/probes/langbot-live-backend-log-health.mjs new file mode 100644 index 000000000..38a31c389 --- /dev/null +++ b/skills/skills/langbot-testing/probes/langbot-live-backend-log-health.mjs @@ -0,0 +1,205 @@ +#!/usr/bin/env node + +import { existsSync, readdirSync, statSync } from "node:fs"; +import { mkdir, readFile, writeFile } from "node:fs/promises"; +import { join, resolve } from "node:path"; +import { env, exit } from "node:process"; + +function pad(value, size = 2) { + return String(value).padStart(size, "0"); +} + +function localIsoWithOffset(date = new Date()) { + const offsetMinutes = -date.getTimezoneOffset(); + const sign = offsetMinutes >= 0 ? "+" : "-"; + const absolute = Math.abs(offsetMinutes); + return [ + `${date.getFullYear()}-${pad(date.getMonth() + 1)}-${pad(date.getDate())}`, + `T${pad(date.getHours())}:${pad(date.getMinutes())}:${pad(date.getSeconds())}.${pad(date.getMilliseconds(), 3)}`, + `${sign}${pad(Math.floor(absolute / 60))}:${pad(absolute % 60)}`, + ].join(""); +} + +function timestampSlug(date = new Date()) { + return date.toISOString().replace(/\.\d{3}Z$/, "Z").replace(/[^0-9A-Za-z]+/g, "-").replace(/^-|-$/g, ""); +} + +function repoRootFromEnv(root) { + return env.LANGBOT_REPO ? 
/**
 * Locate the backend log file to scan.
 *
 * `LANGBOT_BACKEND_LOG` wins when set. Otherwise the most recently modified
 * regular file matching `langbot-*.log` under `<repo>/data/logs` is chosen.
 *
 * @param {string} root - Skills root, passed to `repoRootFromEnv` to find the repo.
 * @returns {string} Absolute path of the log file, or "" when none is found.
 */
function latestBackendLog(root) {
  const explicit = env.LANGBOT_BACKEND_LOG;
  if (explicit) return resolve(explicit);

  const logsDir = join(repoRootFromEnv(root), "data", "logs");
  if (!existsSync(logsDir)) return "";

  let newestPath = "";
  let newestMtimeMs = -Infinity;
  for (const name of readdirSync(logsDir)) {
    if (!/^langbot-.*\.log$/.test(name)) continue;
    const path = join(logsDir, name);
    let info;
    try {
      // Stat each candidate exactly once; a log rotated away mid-scan is skipped
      // instead of throwing from inside a sort comparator.
      info = statSync(path);
    } catch {
      continue;
    }
    if (!info.isFile()) continue;
    // Strict ">" keeps the first-listed file on mtime ties, like the stable sort it replaces.
    if (info.mtimeMs > newestMtimeMs) {
      newestPath = path;
      newestMtimeMs = info.mtimeMs;
    }
  }
  return newestPath;
}

/**
 * Compute the start of the log window to scan.
 *
 * Uses `LANGBOT_BACKEND_LOG_SINCE` when it parses as a date; otherwise looks
 * back `LANGBOT_BACKEND_LOG_LOOKBACK_SECONDS` (default 300) from `startedAt`.
 * Unparseable or negative values fall back to the defaults, because an Invalid
 * Date makes every `timestamp >= since` comparison false and cannot be
 * serialised with `toISOString()`.
 *
 * @param {Date} startedAt - Probe start time.
 * @returns {Date} A valid Date marking the beginning of the scan window.
 */
function parseSince(startedAt) {
  const explicit = env.LANGBOT_BACKEND_LOG_SINCE;
  if (explicit) {
    const parsed = new Date(explicit);
    if (!Number.isNaN(parsed.getTime())) return parsed;
  }
  const configured = Number(env.LANGBOT_BACKEND_LOG_LOOKBACK_SECONDS || "300");
  const lookbackSeconds = Number.isFinite(configured) && configured >= 0 ? configured : 300;
  return new Date(startedAt.getTime() - lookbackSeconds * 1000);
}

/**
 * Extract the timestamp at the start of a backend log line.
 *
 * Two prefixes are recognised:
 *   - `[MM-DD HH:MM:SS.mmm]`          (no year, no offset)
 *   - `[YYYY-MM-DD HH:MM:SS +ZZZZ]`   (access-log style with explicit offset)
 *
 * @param {string} line - One log line.
 * @param {number} year - Year to assume for the year-less format.
 * @returns {Date|null} Parsed timestamp, or null when the line has no recognised prefix.
 */
function parseTimestamp(line, year) {
  const localMatch = line.match(/^\[(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})\.(\d{3})\]/);
  if (localMatch) {
    const [month, day, hour, minute, second, millisecond] = localMatch.slice(1).map(Number);
    // Interpret the fields in the host's local time zone rather than a fixed
    // +08:00, so the scan window is right on hosts outside UTC+8.
    // NOTE(review): assumes the backend writes these lines in host-local time — confirm.
    return new Date(year, month - 1, day, hour, minute, second, millisecond);
  }

  const accessMatch = line.match(/^\[(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2}) ([+-]\d{4})\]/);
  if (accessMatch) {
    const [, fullYear, month, day, hour, minute, second, offset] = accessMatch;
    // "+0800" -> "+08:00" so the string is valid ISO 8601.
    const normalizedOffset = `${offset.slice(0, 3)}:${offset.slice(3)}`;
    return new Date(`${fullYear}-${month}-${day}T${hour}:${minute}:${second}${normalizedOffset}`);
  }

  return null;
}
}, + { severity: "fail", kind: "unclosed_client_session", pattern: /Unclosed client session/i }, + { severity: "fail", kind: "unclosed_connector", pattern: /Unclosed connector/i }, + { severity: "fail", kind: "import_error", pattern: /\bImportError\b/i }, + { severity: "fail", kind: "error_log", pattern: /\b(?:ERROR|CRITICAL)\b/ }, + { severity: "warning", kind: "warning_log", pattern: /\bWARNING\b/ }, + ]; + + for (const rule of rules) { + if (rule.pattern.test(line)) { + return { + severity: rule.severity, + kind: rule.kind, + line: number, + excerpt: line, + }; + } + } + return null; +} + +function scanLines(text, since, year) { + const findings = []; + const scanned = []; + let includeContinuation = false; + const lines = text.split(/\r?\n/); + for (const [index, line] of lines.entries()) { + const number = index + 1; + const timestamp = parseTimestamp(line, year); + if (timestamp) includeContinuation = timestamp >= since; + if (!includeContinuation) continue; + scanned.push({ number, text: line }); + const finding = findingForLine(line, number); + if (finding) findings.push(finding); + } + return { findings, scanned, total_lines: lines.length }; +} + +async function main() { + const root = resolve(env.LBS_ROOT || process.cwd()); + const caseId = "langbot-live-backend-log-health"; + const runId = env.LBS_RUN_ID || `${timestampSlug()}-${caseId}`; + const evidenceDir = resolve(env.LBS_EVIDENCE_DIR || join(root, "reports", "evidence", runId)); + await mkdir(evidenceDir, { recursive: true }); + + const startedAt = new Date(); + const since = parseSince(startedAt); + const logPath = latestBackendLog(root); + const metricsPath = join(evidenceDir, "metrics.json"); + const findingsPath = join(evidenceDir, "findings.json"); + const scannedLogPath = join(evidenceDir, "scanned-backend.log"); + const automationResultPath = join(evidenceDir, "automation-result.json"); + const resultPath = join(evidenceDir, "result.json"); + + let status = "fail"; + let reason = ""; + let 
scan = { findings: [], scanned: [], total_lines: 0 }; + if (!logPath || !existsSync(logPath)) { + status = "env_issue"; + reason = "No LangBot backend log file was found. Set LANGBOT_BACKEND_LOG or LANGBOT_REPO."; + } else { + const text = await readFile(logPath, "utf8"); + scan = scanLines(text, since, startedAt.getFullYear()); + const failCount = scan.findings.filter((item) => item.severity === "fail").length; + status = failCount === 0 ? "pass" : "fail"; + reason = status === "pass" + ? "Live backend log health passed; no fail-severity findings in the scanned window." + : "Live backend log health found fail-severity backend log findings."; + } + + const warningCount = scan.findings.filter((item) => item.severity === "warning").length; + const failCount = scan.findings.filter((item) => item.severity === "fail").length; + const metrics = { + probe: caseId, + backend_log: logPath, + since: since.toISOString(), + scanned_line_count: scan.scanned.length, + total_line_count: scan.total_lines, + fail_count: failCount, + warning_count: warningCount, + finding_count: scan.findings.length, + }; + const thresholds = { + fail_count: { actual: failCount, max: 0, pass: failCount === 0 }, + }; + + await writeFile(metricsPath, `${JSON.stringify(metrics, null, 2)}\n`, "utf8"); + await writeFile(findingsPath, `${JSON.stringify(scan.findings, null, 2)}\n`, "utf8"); + await writeFile(scannedLogPath, scan.scanned.map((item) => `${item.number}: ${item.text}`).join("\n") + (scan.scanned.length > 0 ? 
"\n" : ""), "utf8"); + + const finishedAt = new Date(); + const result = { + source: "automation", + case_id: caseId, + run_id: runId, + status, + reason, + started_at: startedAt.toISOString(), + started_at_local: localIsoWithOffset(startedAt), + finished_at: finishedAt.toISOString(), + finished_at_local: localIsoWithOffset(finishedAt), + duration_ms: finishedAt.getTime() - startedAt.getTime(), + url: logPath, + metrics_summary: { + scanned_line_count: metrics.scanned_line_count, + fail_count: metrics.fail_count, + warning_count: metrics.warning_count, + finding_count: metrics.finding_count, + }, + thresholds_summary: thresholds, + artifacts: { + metrics_json: metricsPath, + findings_json: findingsPath, + scanned_backend_log: scannedLogPath, + automation_result_json: automationResultPath, + result_json: resultPath, + }, + evidence_collected: ["metrics", "backend_log", "filesystem"], + }; + + const resultText = `${JSON.stringify(result, null, 2)}\n`; + await writeFile(automationResultPath, resultText, "utf8"); + await writeFile(resultPath, resultText, "utf8"); + console.log(JSON.stringify(result, null, 2)); + exit(status === "pass" ? 0 : status === "env_issue" ? 2 : 1); +} + +await main(); diff --git a/skills/skills/langbot-testing/probes/langbot-live-control-plane-api.mjs b/skills/skills/langbot-testing/probes/langbot-live-control-plane-api.mjs new file mode 100644 index 000000000..8232d1fc3 --- /dev/null +++ b/skills/skills/langbot-testing/probes/langbot-live-control-plane-api.mjs @@ -0,0 +1,311 @@ +#!/usr/bin/env node + +import { mkdir, writeFile } from "node:fs/promises"; +import { join, resolve } from "node:path"; +import { env, exit } from "node:process"; + +function pad(value, size = 2) { + return String(value).padStart(size, "0"); +} + +function localIsoWithOffset(date = new Date()) { + const offsetMinutes = -date.getTimezoneOffset(); + const sign = offsetMinutes >= 0 ? 
"+" : "-"; + const absolute = Math.abs(offsetMinutes); + return [ + `${date.getFullYear()}-${pad(date.getMonth() + 1)}-${pad(date.getDate())}`, + `T${pad(date.getHours())}:${pad(date.getMinutes())}:${pad(date.getSeconds())}.${pad(date.getMilliseconds(), 3)}`, + `${sign}${pad(Math.floor(absolute / 60))}:${pad(absolute % 60)}`, + ].join(""); +} + +function timestampSlug(date = new Date()) { + return date.toISOString().replace(/\.\d{3}Z$/, "Z").replace(/[^0-9A-Za-z]+/g, "-").replace(/^-|-$/g, ""); +} + +function percentile(values, percentileValue) { + if (values.length === 0) return 0; + const sorted = [...values].sort((a, b) => a - b); + const index = Math.min(sorted.length - 1, Math.ceil((percentileValue / 100) * sorted.length) - 1); + return Number(sorted[index].toFixed(3)); +} + +function stats(values) { + if (values.length === 0) return { min: 0, p50: 0, p95: 0, p99: 0, max: 0 }; + return { + min: Number(Math.min(...values).toFixed(3)), + p50: percentile(values, 50), + p95: percentile(values, 95), + p99: percentile(values, 99), + max: Number(Math.max(...values).toFixed(3)), + }; +} + +function joinUrl(baseUrl, path) { + const base = baseUrl.replace(/\/+$/, ""); + const suffix = path.startsWith("/") ? path : `/${path}`; + return `${base}${suffix}`; +} + +function parseJsonObject(value, fallback) { + if (!value) return fallback; + try { + const parsed = JSON.parse(value); + return parsed && typeof parsed === "object" && !Array.isArray(parsed) ? 
parsed : fallback; + } catch { + return fallback; + } +} + +function controlPlaneEndpoints() { + return [ + { + id: "healthz", + path: "/healthz", + expected_status: 200, + expected_code: 0, + p95_budget_ms: Number(env.LANGBOT_PERF_HEALTHZ_P95_MS || "500"), + required_data_fields: [], + }, + { + id: "system_info", + path: "/api/v1/system/info", + expected_status: 200, + expected_code: 0, + p95_budget_ms: Number(env.LANGBOT_PERF_SYSTEM_INFO_P95_MS || "1000"), + required_data_fields: ["version", "edition", "enable_marketplace"], + }, + ]; +} + +async function fetchEndpoint(backendUrl, endpoint, timeoutMs) { + const url = joinUrl(backendUrl, endpoint.path); + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), timeoutMs); + const started = performance.now(); + let bodyText = ""; + let json = null; + let jsonValid = false; + let error = ""; + + try { + const response = await fetch(url, { + method: "GET", + headers: { "accept": "application/json" }, + signal: controller.signal, + }); + bodyText = await response.text(); + try { + json = bodyText ? JSON.parse(bodyText) : null; + jsonValid = json !== null; + } catch (parseError) { + error = parseError instanceof Error ? parseError.message : String(parseError); + } + + const data = json && typeof json === "object" && json.data && typeof json.data === "object" ? json.data : {}; + const missingFields = endpoint.required_data_fields.filter((field) => !(field in data)); + const statusOk = response.status === endpoint.expected_status; + const codeOk = !json || typeof json !== "object" ? 
false : json.code === endpoint.expected_code; + const shapeOk = jsonValid && missingFields.length === 0; + const latencyMs = performance.now() - started; + return { + endpoint_id: endpoint.id, + path: endpoint.path, + url, + status: response.status, + ok: statusOk && codeOk && shapeOk, + status_ok: statusOk, + code_ok: codeOk, + json_valid: jsonValid, + missing_fields: missingFields, + response_code: json && typeof json === "object" ? json.code : null, + latency_ms: Number(latencyMs.toFixed(3)), + error, + }; + } catch (fetchError) { + const latencyMs = performance.now() - started; + return { + endpoint_id: endpoint.id, + path: endpoint.path, + url, + status: 0, + ok: false, + status_ok: false, + code_ok: false, + json_valid: false, + missing_fields: endpoint.required_data_fields, + response_code: null, + latency_ms: Number(latencyMs.toFixed(3)), + error: fetchError instanceof Error ? fetchError.message : String(fetchError), + }; + } finally { + clearTimeout(timeout); + } +} + +async function runBatches(backendUrl, endpoints, totalRequests, concurrency, timeoutMs) { + const queue = Array.from({ length: totalRequests }, (_, index) => endpoints[index % endpoints.length]); + const results = []; + while (queue.length > 0) { + const batch = queue.splice(0, concurrency); + results.push(...await Promise.all(batch.map((endpoint) => fetchEndpoint(backendUrl, endpoint, timeoutMs)))); + } + return results; +} + +function endpointMetrics(endpoints, results) { + return Object.fromEntries(endpoints.map((endpoint) => { + const samples = results.filter((item) => item.endpoint_id === endpoint.id); + const okSamples = samples.filter((item) => item.ok); + return [ + endpoint.id, + { + path: endpoint.path, + requests: samples.length, + ok_count: okSamples.length, + error_rate: samples.length === 0 ? 
1 : Number(((samples.length - okSamples.length) / samples.length).toFixed(4)), + latency_ms: stats(okSamples.map((item) => item.latency_ms)), + p95_budget_ms: endpoint.p95_budget_ms, + }, + ]; + })); +} + +async function main() { + const root = resolve(env.LBS_ROOT || process.cwd()); + const caseId = "langbot-live-control-plane-api"; + const runId = env.LBS_RUN_ID || `${timestampSlug()}-${caseId}`; + const evidenceDir = resolve(env.LBS_EVIDENCE_DIR || join(root, "reports", "evidence", runId)); + await mkdir(evidenceDir, { recursive: true }); + + const startedAt = new Date(); + const backendUrl = env.LANGBOT_BACKEND_URL || ""; + const endpoints = controlPlaneEndpoints(); + const configuredBudgets = parseJsonObject(env.LANGBOT_CONTROL_PLANE_P95_BUDGETS_JSON, {}); + for (const endpoint of endpoints) { + const budget = configuredBudgets[endpoint.id]; + if (typeof budget === "number" && Number.isFinite(budget)) endpoint.p95_budget_ms = budget; + } + const totalRequests = Number(env.LANGBOT_CONTROL_PLANE_REQUESTS || "20"); + const concurrency = Number(env.LANGBOT_CONTROL_PLANE_CONCURRENCY || "4"); + const timeoutMs = Number(env.LANGBOT_CONTROL_PLANE_TIMEOUT_MS || "5000"); + const maxErrorRate = Number(env.LANGBOT_CONTROL_PLANE_MAX_ERROR_RATE || "0"); + const metricsPath = join(evidenceDir, "metrics.json"); + const endpointsPath = join(evidenceDir, "endpoints.json"); + const networkLogPath = join(evidenceDir, "network.log"); + const automationResultPath = join(evidenceDir, "automation-result.json"); + const resultPath = join(evidenceDir, "result.json"); + + let status = "fail"; + let reason = ""; + let results = []; + if (!backendUrl) { + status = "env_issue"; + reason = "LANGBOT_BACKEND_URL is not configured."; + } else { + results = await runBatches(backendUrl, endpoints, totalRequests, concurrency, timeoutMs); + const allConnectionFailures = results.length > 0 && results.every((item) => item.status === 0); + if (allConnectionFailures) { + status = "env_issue"; + reason 
= `Backend did not respond at ${backendUrl}.`; + } + } + + const okResults = results.filter((item) => item.ok); + const statusCounts = {}; + for (const item of results) { + const key = item.status === 0 ? "network_error" : String(item.status); + statusCounts[key] = (statusCounts[key] || 0) + 1; + } + const perEndpoint = endpointMetrics(endpoints, results); + const responseShapeFailures = results.filter((item) => !item.json_valid || item.missing_fields.length > 0 || !item.code_ok).length; + const errorRate = results.length === 0 ? 1 : Number(((results.length - okResults.length) / results.length).toFixed(4)); + const thresholds = { + error_rate: { actual: errorRate, max: maxErrorRate, pass: errorRate <= maxErrorRate }, + response_shape_failures: { actual: responseShapeFailures, max: 0, pass: responseShapeFailures === 0 }, + }; + for (const endpoint of endpoints) { + const actual = perEndpoint[endpoint.id].latency_ms.p95; + thresholds[`${endpoint.id}_p95_ms`] = { + actual, + max: endpoint.p95_budget_ms, + pass: actual <= endpoint.p95_budget_ms, + }; + } + + if (status !== "env_issue") { + const passed = Object.values(thresholds).every((item) => item.pass); + status = passed ? "pass" : "fail"; + reason = passed + ? "Live control-plane API probe passed all thresholds." 
+ : "Live control-plane API probe breached shape, latency, or error-rate thresholds."; + } + + const metrics = { + probe: caseId, + backend_url: backendUrl, + total_requests: totalRequests, + concurrency, + timeout_ms: timeoutMs, + ok_count: okResults.length, + error_count: results.length - okResults.length, + error_rate: errorRate, + status_counts: statusCounts, + response_shape_failures: responseShapeFailures, + endpoints: perEndpoint, + }; + + await writeFile(metricsPath, `${JSON.stringify({ ...metrics, samples: results }, null, 2)}\n`, "utf8"); + await writeFile(endpointsPath, `${JSON.stringify(endpoints, null, 2)}\n`, "utf8"); + await writeFile(networkLogPath, results.map((item) => JSON.stringify(item)).join("\n") + (results.length > 0 ? "\n" : ""), "utf8"); + + const finishedAt = new Date(); + const result = { + source: "automation", + case_id: caseId, + run_id: runId, + status, + reason, + started_at: startedAt.toISOString(), + started_at_local: localIsoWithOffset(startedAt), + finished_at: finishedAt.toISOString(), + finished_at_local: localIsoWithOffset(finishedAt), + duration_ms: finishedAt.getTime() - startedAt.getTime(), + url: backendUrl, + metrics_summary: { + requests: metrics.total_requests, + concurrency: metrics.concurrency, + ok_count: metrics.ok_count, + error_rate: metrics.error_rate, + response_shape_failures: metrics.response_shape_failures, + endpoints: Object.fromEntries(Object.entries(metrics.endpoints).map(([id, value]) => [ + id, + { + path: value.path, + ok_count: value.ok_count, + error_rate: value.error_rate, + latency_p50_ms: value.latency_ms.p50, + latency_p95_ms: value.latency_ms.p95, + }, + ])), + status_counts: metrics.status_counts, + }, + thresholds_summary: thresholds, + artifacts: { + metrics_json: metricsPath, + endpoints_json: endpointsPath, + network_log: networkLogPath, + automation_result_json: automationResultPath, + result_json: resultPath, + }, + evidence_collected: ["metrics", "network", "api_diagnostic", 
/**
 * Left-pads a value with "0" characters.
 *
 * @param {unknown} value - Value to render; converted with String().
 * @param {number} [size=2] - Minimum width of the returned string.
 * @returns {string} The value as text, zero-padded to at least `size` characters.
 */
function pad(value, size = 2) {
  const text = String(value);
  return text.padStart(size, "0");
}

/**
 * Renders a date using its local wall-clock fields followed by the local UTC
 * offset, e.g. `2024-01-02T03:04:05.006+08:00`.
 *
 * @param {Date} [date=new Date()] - Instant to format.
 * @returns {string} Local date, time with milliseconds, and `±HH:MM` offset.
 */
function localIsoWithOffset(date = new Date()) {
  // getTimezoneOffset() reports minutes behind UTC, so the sign is flipped to
  // obtain the conventional "minutes ahead of UTC" value.
  const minutesAhead = -date.getTimezoneOffset();
  const offsetSign = minutesAhead >= 0 ? "+" : "-";
  const magnitude = Math.abs(minutesAhead);

  const datePart = [date.getFullYear(), pad(date.getMonth() + 1), pad(date.getDate())].join("-");
  const clockPart = [pad(date.getHours()), pad(date.getMinutes()), pad(date.getSeconds())].join(":");
  const millisPart = pad(date.getMilliseconds(), 3);
  const offsetPart = `${offsetSign}${pad(Math.floor(magnitude / 60))}:${pad(magnitude % 60)}`;

  return `${datePart}T${clockPart}.${millisPart}${offsetPart}`;
}
/**
 * Builds a filesystem-friendly UTC timestamp such as `2024-01-02T03-04-05Z`.
 *
 * @param {Date} [date=new Date()] - Instant to encode.
 * @returns {string} ISO timestamp without milliseconds, with every run of
 *   non-alphanumeric characters collapsed to a single "-".
 */
function timestampSlug(date = new Date()) {
  return date
    .toISOString()
    .replace(/\.\d{3}Z$/, "Z")
    .replace(/[^0-9A-Za-z]+/g, "-")
    .replace(/^-|-$/g, "");
}

/**
 * Nearest-rank percentile of a numeric list, rounded to three decimals.
 *
 * @param {number[]} values - Samples; not mutated.
 * @param {number} percentileValue - Percentile in the 0..100 range.
 * @returns {number} The selected sample, or 0 when `values` is empty.
 */
function percentile(values, percentileValue) {
  if (values.length === 0) return 0;
  const sorted = [...values].sort((a, b) => a - b);
  const rank = Math.ceil((percentileValue / 100) * sorted.length) - 1;
  // Clamp on both sides: a percentile of 0 would otherwise yield index -1 and
  // crash on `undefined.toFixed`.
  const index = Math.min(sorted.length - 1, Math.max(0, rank));
  return Number(sorted[index].toFixed(3));
}

/**
 * Summarises a numeric list as min / p50 / p95 / p99 / max (three decimals).
 *
 * @param {number[]} values - Samples; not mutated.
 * @returns {{min: number, p50: number, p95: number, p99: number, max: number}}
 *   All zeros for an empty list, so the summary stays JSON-serialisable
 *   (Infinity would be written as `null`).
 */
function stats(values) {
  if (values.length === 0) return { min: 0, p50: 0, p95: 0, p99: 0, max: 0 };
  // Sorting once gives min/max without spreading the whole array into
  // Math.min/Math.max, which has an argument-count limit.
  const sorted = [...values].sort((a, b) => a - b);
  return {
    min: Number(sorted[0].toFixed(3)),
    p50: percentile(sorted, 50),
    p95: percentile(sorted, 95),
    p99: percentile(sorted, 99),
    max: Number(sorted[sorted.length - 1].toFixed(3)),
  };
}

/**
 * Evaluates one threshold and returns it in report shape.
 *
 * @param {number} actual - Measured value.
 * @param {number} limit - Budget to compare against.
 * @param {string} operator - "<=" for an upper bound; any other value is
 *   treated as a lower bound (">=").
 * @returns {{actual: number, max?: number, min?: number, pass: boolean}}
 *   The limit is stored under `max` for "<=" and under `min` otherwise.
 */
function threshold(actual, limit, operator) {
  const isUpperBound = operator === "<=";
  const pass = isUpperBound ? actual <= limit : actual >= limit;
  return { actual, [isUpperBound ? "max" : "min"]: limit, pass };
}
/**
 * Produces one deterministic synthetic latency sample for the accounting
 * contract. Every segment is a pure function of `index`, so repeated runs
 * yield identical data.
 *
 * @param {number} index - Zero-based sample number.
 * @returns {object} The sample: raw per-segment milliseconds plus the rounded
 *   LangBot-side total, external total, end-to-end total, and accounting gap.
 */
function makeSample(index) {
  const segments = {
    ingress: 1 + (index % 5) * 0.22,
    pipeline: 2.8 + (index % 7) * 0.31,
    persistence: 1.1 + (index % 4) * 0.2,
    plugin_ipc: 1.9 + (index % 6) * 0.27,
    // Every third sample uses a fixed, larger RAG cost.
    rag: index % 3 === 0 ? 4.4 : 0.8 + (index % 5) * 0.18,
    streaming: 1.5 + (index % 8) * 0.24,
    provider: 80 + (index % 13) * 11,
    // Only every fourth sample includes an external tool call.
    external_tool: index % 4 === 0 ? 25 + (index % 9) * 3 : 0,
    network: 8 + (index % 10) * 1.7,
  };

  const langbotSide =
    segments.ingress
    + segments.pipeline
    + segments.persistence
    + segments.plugin_ipc
    + segments.rag
    + segments.streaming;
  const externalSide = segments.provider + segments.external_tool + segments.network;
  const endToEnd = langbotSide + externalSide;
  // The end-to-end total is defined as the sum of both sides, so this gap only
  // captures floating-point residue.
  const gap = endToEnd - externalSide - langbotSide;

  return {
    index,
    segments_ms: segments,
    langbot_overhead_ms: Number(langbotSide.toFixed(3)),
    external_latency_ms: Number(externalSide.toFixed(3)),
    e2e_latency_ms: Number(endToEnd.toFixed(3)),
    accounting_gap_ms: Number(gap.toFixed(6)),
  };
}
/**
 * Runs the synthetic overhead-accounting contract probe.
 *
 * Generates deterministic samples, evaluates the thresholds, writes
 * `metrics.json`, `thresholds.json`, `resource-log.json`,
 * `automation-result.json`, and `result.json` into the evidence directory,
 * prints the result JSON to stdout, and sets the process exit code
 * (0 = pass, 1 = fail).
 *
 * Environment inputs read here: LBS_ROOT, LBS_RUN_ID, LBS_EVIDENCE_DIR,
 * LANGBOT_PERF_CONTRACT_SAMPLES (default "80"),
 * LANGBOT_PERF_OVERHEAD_P95_MS (default "25").
 *
 * @returns {Promise<void>}
 */
async function main() {
  const root = resolve(env.LBS_ROOT || process.cwd());
  const caseId = "langbot-overhead-accounting-contract";
  const runId = env.LBS_RUN_ID || `${timestampSlug()}-${caseId}`;
  const evidenceDir = resolve(env.LBS_EVIDENCE_DIR || join(root, "reports", "evidence", runId));
  await mkdir(evidenceDir, { recursive: true });

  const startedAt = new Date();
  // Normalise the requested count: a non-numeric, negative, or fractional
  // value would otherwise leak NaN / a non-integer into the report. Invalid
  // input becomes 0, which fails the sample_count threshold visibly.
  const requestedSamples = Number(env.LANGBOT_PERF_CONTRACT_SAMPLES || "80");
  const sampleCount = Number.isFinite(requestedSamples) && requestedSamples > 0
    ? Math.floor(requestedSamples)
    : 0;
  const overheadP95BudgetMs = Number(env.LANGBOT_PERF_OVERHEAD_P95_MS || "25");
  const samples = Array.from({ length: sampleCount }, (_, index) => makeSample(index));
  const overheads = samples.map((sample) => sample.langbot_overhead_ms);
  const e2e = samples.map((sample) => sample.e2e_latency_ms);
  const external = samples.map((sample) => sample.external_latency_ms);
  const gaps = samples.map((sample) => Math.abs(sample.accounting_gap_ms));
  const memory = process.memoryUsage();

  const metrics = {
    probe: caseId,
    sample_count: sampleCount,
    langbot_overhead_ms: stats(overheads),
    e2e_latency_ms: stats(e2e),
    external_latency_ms: stats(external),
    // Math.max() of an empty list is -Infinity, which would pass the "<="
    // threshold and serialise as null; report 0 when there are no samples.
    accounting_gap_max_ms: gaps.length === 0 ? 0 : Number(Math.max(...gaps).toFixed(6)),
    samples,
  };
  const thresholds = {
    sample_count: threshold(sampleCount, 50, ">="),
    langbot_overhead_p95_ms: threshold(metrics.langbot_overhead_ms.p95, overheadP95BudgetMs, "<="),
    accounting_gap_max_ms: threshold(metrics.accounting_gap_max_ms, 0.001, "<="),
  };
  const status = Object.values(thresholds).every((item) => item.pass) ? "pass" : "fail";
  const metricsPath = join(evidenceDir, "metrics.json");
  const thresholdsPath = join(evidenceDir, "thresholds.json");
  const resourceLogPath = join(evidenceDir, "resource-log.json");
  const automationResultPath = join(evidenceDir, "automation-result.json");
  const resultPath = join(evidenceDir, "result.json");

  await writeFile(metricsPath, `${JSON.stringify(metrics, null, 2)}\n`, "utf8");
  await writeFile(thresholdsPath, `${JSON.stringify(thresholds, null, 2)}\n`, "utf8");
  await writeFile(resourceLogPath, `${JSON.stringify({ memory, pid: process.pid }, null, 2)}\n`, "utf8");

  const finishedAt = new Date();
  const result = {
    source: "automation",
    case_id: caseId,
    run_id: runId,
    status,
    reason: status === "pass"
      ? "Overhead accounting contract passed all thresholds."
      : "Overhead accounting contract breached one or more thresholds.",
    started_at: startedAt.toISOString(),
    started_at_local: localIsoWithOffset(startedAt),
    finished_at: finishedAt.toISOString(),
    finished_at_local: localIsoWithOffset(finishedAt),
    duration_ms: finishedAt.getTime() - startedAt.getTime(),
    metrics_summary: {
      sample_count: metrics.sample_count,
      langbot_overhead_p95_ms: metrics.langbot_overhead_ms.p95,
      e2e_latency_p95_ms: metrics.e2e_latency_ms.p95,
      external_latency_p95_ms: metrics.external_latency_ms.p95,
      accounting_gap_max_ms: metrics.accounting_gap_max_ms,
    },
    thresholds_summary: thresholds,
    artifacts: {
      metrics_json: metricsPath,
      thresholds_json: thresholdsPath,
      resource_log_json: resourceLogPath,
      automation_result_json: automationResultPath,
      result_json: resultPath,
    },
    evidence_collected: ["metrics", "resource_log", "filesystem"],
  };

  const resultText = `${JSON.stringify(result, null, 2)}\n`;
  await writeFile(automationResultPath, resultText, "utf8");
  await writeFile(resultPath, resultText, "utf8");
  console.log(JSON.stringify(result, null, 2));
  // Set the exit code instead of calling exit(): a forced exit can drop
  // stdout that is still buffered when the output is piped to the harness.
  process.exitCode = status === "pass" ? 0 : 1;
}
/**
 * Condenses a fake-provider state snapshot into report-friendly counters and
 * latency statistics.
 *
 * Only requests whose `path` contains "/chat/completions" are counted as chat
 * requests; first-chunk and content-chunk statistics use the subset whose
 * `status` is "ok".
 *
 * @param {object|null|undefined} state - Snapshot read from the fake provider.
 * @returns {object|null} Summary object, or null when `state` is falsy.
 */
export function summarizeFakeProviderState(state) {
  if (!state) return null;

  const history = Array.isArray(state.recent_requests) ? state.recent_requests : [];
  const chat = history.filter((entry) => String(entry?.path || "").includes("/chat/completions"));
  const succeeded = chat.filter((entry) => entry?.status === "ok");

  // A request is a fault when it was told to fail, was recorded as an HTTP
  // fault, or carries a finite HTTP status of 400 or above.
  const isFault = (entry) => {
    if (entry?.should_fail === true) return true;
    if (entry?.status === "http_fault") return true;
    return Number.isFinite(entry?.http_status) && entry.http_status >= 400;
  };

  // Statistics over one numeric field, ignoring entries that are not finite.
  const fieldStats = (requests, field) => {
    const numbers = requests.map((entry) => numberOrNull(entry?.[field]));
    return stats(numbers.filter((value) => Number.isFinite(value)));
  };

  return {
    status: state.status || "unknown",
    url: state.url || "",
    request_count: Number.isFinite(state.request_count) ? state.request_count : history.length,
    recent_request_count: history.length,
    chat_request_count: chat.length,
    fault_count: chat.filter((entry) => isFault(entry)).length,
    streamed_request_count: chat.filter((entry) => entry?.stream === true).length,
    duration_ms: fieldStats(chat, "duration_ms"),
    successful_duration_ms: fieldStats(succeeded, "duration_ms"),
    first_chunk_ms: fieldStats(succeeded, "first_chunk_ms"),
    first_content_chunk_ms: fieldStats(succeeded, "first_content_chunk_ms"),
    content_chunk_count: fieldStats(succeeded, "content_chunk_count"),
    config: state.config || {},
  };
}
/**
 * Pairs each probe sample with a fake-provider request that has the same
 * `expected_text` and aggregates the resulting timing segments.
 *
 * Requests sharing a token are consumed in recorded order, so each provider
 * request is matched to at most one sample.
 *
 * @param {Iterable<object>} samples - Probe samples; entries without an
 *   `expected_text` are ignored.
 * @param {object|null|undefined} state - Fake-provider state snapshot; its
 *   `recent_requests` array is the match source.
 * @returns {object} Match counts, up to 20 unmatched tokens, per-segment
 *   statistics, and the raw `segments` list.
 */
export function buildProviderTimingMetrics(samples, state) {
  const history = Array.isArray(state?.recent_requests) ? state.recent_requests : [];

  // Queue of provider requests per expected-text token, in recorded order.
  const queues = new Map();
  for (const entry of history) {
    const token = String(entry?.expected_text || "");
    if (!token) continue;
    const queue = queues.get(token);
    if (queue) {
      queue.push(entry);
    } else {
      queues.set(token, [entry]);
    }
  }

  const segments = [];
  const unmatchedTokens = [];
  for (const sample of samples) {
    const token = String(sample?.expected_text || "");
    if (!token) continue;
    const queue = queues.get(token);
    const match = queue ? queue.shift() : undefined;
    if (!match) {
      unmatchedTokens.push(token);
      continue;
    }
    const segment = buildTimingSegment(sample, match);
    if (segment) segments.push(segment);
  }

  // Statistics over one segment field, skipping values that are not finite.
  const segmentStats = (key) => stats(
    segments.map((segment) => numberOrNull(segment[key])).filter((value) => Number.isFinite(value)),
  );

  return {
    matched_request_count: segments.length,
    missing_provider_match_count: unmatchedTokens.length,
    missing_expected_text: unmatchedTokens.slice(0, 20),
    send_to_provider_start_ms: segmentStats("send_to_provider_start_ms"),
    provider_duration_ms: segmentStats("provider_duration_ms"),
    provider_finish_to_ws_final_ms: segmentStats("provider_finish_to_ws_final_ms"),
    langbot_overhead_estimate_ms: segmentStats("langbot_overhead_estimate_ms"),
    e2e_minus_provider_ms: segmentStats("e2e_minus_provider_ms"),
    provider_first_content_to_ws_first_content_ms: segmentStats("provider_first_content_to_ws_first_content_ms"),
    segments,
  };
}
/**
 * Derives per-request timing segments from one probe sample and its matched
 * fake-provider request.
 *
 * @param {object} sample - Probe-side record (send/finish epochs, first
 *   assistant content epoch, response duration, index, labels).
 * @param {object} request - Provider-side record (start/finish epochs, first
 *   content chunk epoch, duration, id, status).
 * @returns {object} Segment with durations in milliseconds; any delta whose
 *   inputs are not both finite is null.
 */
function buildTimingSegment(sample, request) {
  // Probe-side observations.
  const clientSentAt = numberOrNull(sample.sent_epoch_ms);
  const clientFinishedAt = numberOrNull(sample.finished_epoch_ms);
  const clientFirstContentAt = numberOrNull(sample.first_assistant_content_epoch_ms);
  const endToEndMs = numberOrNull(sample.response_duration_ms);

  // Provider-side observations.
  const providerStartedAt = numberOrNull(request.started_epoch_ms);
  const providerFinishedAt = numberOrNull(request.finished_epoch_ms);
  const providerFirstContentAt = numberOrNull(request.first_content_chunk_epoch_ms);
  const providerMs = numberOrNull(request.duration_ms);

  const inboundMs = finiteDelta(providerStartedAt, clientSentAt);
  const outboundMs = finiteDelta(clientFinishedAt, providerFinishedAt);

  let nonProviderMs = null;
  if (Number.isFinite(endToEndMs) && Number.isFinite(providerMs)) {
    nonProviderMs = rounded(endToEndMs - providerMs);
  }

  // Prefer the sum of the two wall-clock legs; fall back to the
  // duration-based difference when either leg is unavailable.
  let overheadMs = nonProviderMs;
  if (Number.isFinite(inboundMs) && Number.isFinite(outboundMs)) {
    overheadMs = rounded(inboundMs + outboundMs);
  }

  return {
    sample_index: sample.index,
    pipeline_label: sample.pipeline_label || "",
    expected_text: sample.expected_text || "",
    provider_request_id: request.id || "",
    provider_request_number: request.request_number ?? null,
    response_duration_ms: endToEndMs,
    provider_duration_ms: providerMs,
    send_to_provider_start_ms: inboundMs,
    provider_finish_to_ws_final_ms: outboundMs,
    langbot_overhead_estimate_ms: overheadMs,
    e2e_minus_provider_ms: nonProviderMs,
    provider_first_content_to_ws_first_content_ms: finiteDelta(clientFirstContentAt, providerFirstContentAt),
    provider_status: request.status || "",
    provider_http_status: request.http_status ?? null,
  };
}
/**
 * Difference `left - right`, rounded to three decimals.
 *
 * @param {unknown} left - Minuend.
 * @param {unknown} right - Subtrahend.
 * @returns {number|null} The rounded difference, or null unless both inputs
 *   are finite numbers.
 */
function finiteDelta(left, right) {
  return Number.isFinite(left) && Number.isFinite(right) ? rounded(left - right) : null;
}

/**
 * Summarises a numeric list as min / p50 / p95 / p99 / max.
 *
 * @param {number[]} values - Finite samples; not mutated.
 * @returns {{min: number, p50: number, p95: number, p99: number, max: number}}
 *   All zeros when `values` is empty.
 */
export function stats(values) {
  if (values.length === 0) return { min: 0, p50: 0, p95: 0, p99: 0, max: 0 };
  return {
    min: rounded(Math.min(...values)),
    p50: percentile(values, 50),
    p95: percentile(values, 95),
    p99: percentile(values, 99),
    max: rounded(Math.max(...values)),
  };
}

/**
 * Nearest-rank percentile, rounded to three decimals.
 *
 * @param {number[]} values - Samples; not mutated.
 * @param {number} percentileValue - Percentile in the 0..100 range.
 * @returns {number} The selected sample, or 0 when `values` is empty.
 */
export function percentile(values, percentileValue) {
  if (values.length === 0) return 0;
  const sorted = [...values].sort((a, b) => a - b);
  const rank = Math.ceil((percentileValue / 100) * sorted.length) - 1;
  // Clamp the lower bound too: percentile 0 would otherwise select index -1
  // and throw inside rounded().
  const index = Math.min(sorted.length - 1, Math.max(0, rank));
  return rounded(sorted[index]);
}

/**
 * Rounds a number to three decimal places.
 *
 * @param {number} value - Number to round; must provide `toFixed`.
 * @returns {number} The rounded value.
 */
export function rounded(value) {
  return Number(value.toFixed(3));
}

/**
 * Converts a loosely-typed field to a finite number.
 *
 * Missing values (null, undefined, blank strings) yield null instead of 0.
 * `Number(null)` and `Number("")` are both 0, which would otherwise turn an
 * absent timestamp or duration into a real-looking 0 ms measurement.
 *
 * @param {unknown} value - Raw field value.
 * @returns {number|null} The finite number, or null when absent or non-finite.
 */
function numberOrNull(value) {
  if (value === null || value === undefined) return null;
  if (typeof value === "string" && value.trim() === "") return null;
  const number = Number(value);
  return Number.isFinite(number) ? number : null;
}
Call a focused +tool from a `mode: probe` case when the test needs one, for example k6, +Locust, pytest-benchmark, Playwright trace collection, Toxiproxy, Docker, or a +Kubernetes disruption tool. + +## LangBot Performance Model + +For LangBot, performance is the cost LangBot adds around external systems: + +```text +LangBot overhead = end-to-end latency - provider latency - external tool latency - network/fault injection latency +``` + +Measure user experience and internal composition separately: + +- WebUI load and interaction latency. +- Debug Chat send-to-first-visible-token and send-to-completion latency. +- Pipeline, RAG, plugin runtime, MCP, AgentRunner, and persistence segment + latency. +- Queue wait time, concurrency, throughput, timeout rate, and p95/p99 latency. +- Startup, plugin install, knowledge-base ingestion, migration, and recovery + time. + +Do not report a single message round-trip time as "LangBot performance" unless +the report also explains external provider/tool/network time. + +## Evidence Contract + +Performance and reliability cases should declare the evidence they need: + +- `metrics`: machine-readable latency, throughput, error-rate, or recovery + metrics, usually `metrics.json`. +- `resource_log`: CPU, memory, process, connection, queue, or file descriptor + samples. +- `trace`: browser, HTTP, database, or runtime trace artifacts. +- `profile`: CPU, memory, or flamegraph profile artifacts. +- `backend_log`, `network`, `api_diagnostic`, and `filesystem` as supporting + evidence when relevant. 
+ +Automation should write `automation-result.json` with these fields when +available: + +```json +{ + "status": "pass", + "reason": "Probe passed all thresholds.", + "metrics_summary": { + "langbot_overhead_p95_ms": 12.4, + "error_rate": 0 + }, + "thresholds_summary": { + "langbot_overhead_p95_ms": { "actual": 12.4, "max": 50, "pass": true } + }, + "artifacts": { + "metrics_json": "/path/to/metrics.json" + }, + "evidence_collected": ["metrics", "filesystem"] +} +``` + +Synthetic contract probes are useful for checking the QA harness, but they are +not live product performance results. Label them as contract probes in the case +title, checks, and report. + +## Chaos And Reliability Rules + +Chaos tests must be narrow and reversible: + +- Declare the fault model in `fault_model_json`. +- Record blast radius, target component, injection method, duration, and abort + conditions. +- Capture recovery checks and cleanup steps in the case. +- Classify unavailable dependencies as `env_issue` unless the target behavior + is LangBot's handling of that dependency failure. +- Do not run destructive fault injection against a shared or production-like + instance without explicit operator approval. + +Recommended first fault models: + +- Provider timeout or HTTP 429 from a fake provider endpoint. +- Plugin runtime disconnect/reconnect in a local instance. +- MCP stdio server exits mid-call. +- RAG parser fixture fails once and recovers on retry. +- Backend API endpoint returns 5xx from a controlled local proxy. + +## Starter Live Probes + +The starter gate separates QA-harness contracts from live product checks: + +- `langbot-overhead-accounting-contract` verifies that reports can carry + overhead accounting metrics. It uses deterministic synthetic samples and is + not live product performance. +- `langbot-fault-taxonomy-contract` verifies that fault scenarios declare + expected status, recovery, and cleanup before destructive chaos tests are + added. 
+- `langbot-live-backend-latency` checks the unauthenticated `/healthz` + endpoint for basic backend responsiveness. +- `langbot-live-control-plane-api` checks `/healthz` and + `/api/v1/system/info` for HTTP 200, JSON `code: 0`, response shape, and + per-endpoint p95 latency. +- `langbot-live-backend-log-health` scans the recent backend log window for + fail-severity runtime findings. It is the reliability guard that should fail + the gate when HTTP probes pass but backend logs contain Traceback, ImportError, + ERROR, unclosed sessions, or unawaited coroutine signals. + +Do not treat these starter live probes as Debug Chat or model-provider +performance. They are control-plane readiness checks; user-facing performance +needs browser/WebSocket/message-path measurements. + +## Debug Chat Load And Fake Provider Baseline + +Use `langbot-fake-provider-debug-chat-load` before real-provider load checks. +The setup automation starts a local OpenAI-compatible fake provider, registers +it as a normal LangBot provider/model, configures a local-agent pipeline, resets +Debug Chat, and then drives concurrent WebSocket messages through the live +backend. + +This is not a mocked backend test. It still exercises: + +- provider/model persistence and runtime reload; +- LiteLLM OpenAI-compatible requester path; +- local-agent runner selection and pipeline execution; +- Debug Chat WebSocket adapter and broadcast behavior; +- backend concurrency, timeout, and error-rate accounting. + +The fake provider is deterministic and can inject controlled latency or faults +with `LANGBOT_FAKE_PROVIDER_*` variables, so it is the baseline for LangBot +message-path overhead. A fake-provider process keeps process-global config, +request counters, and recent request history; run fake-provider probes serially +or give each run its own provider instance. Concurrent probes against the same +fake-provider URL can reset or reconfigure each other's metrics. 
+ +The probe uses unique expected response tokens per +request because Debug Chat broadcasts messages to every connection in the same +session; unique tokens prevent one connection from counting another +connection's response as its own. + +When the fake provider is used, reports also include provider-side timing in +`metrics.json`: + +- `fake_provider.duration_ms` and `fake_provider.first_content_chunk_ms` + measure the controlled provider itself. +- `provider_timing.send_to_provider_start_ms` estimates WebSocket ingress, + pipeline dispatch, runner setup, and requester time before the provider + receives the request. +- `provider_timing.provider_finish_to_ws_final_ms` estimates the path from + provider completion back to the final Debug Chat WebSocket response. +- `provider_timing.langbot_overhead_estimate_ms` is the sum of those two + LangBot-side segments when wall-clock timestamps can be matched by the + unique expected response token. + +After the baseline passes, run `langbot-fake-provider-debug-chat-slow-load` to +keep the same live backend path while injecting deterministic streaming latency. +Run `langbot-fake-provider-debug-chat-fault-recovery` to inject bounded HTTP +provider failures and require both observed failures and later successful +requests. The fault-recovery case is deliberately sequential because failed +Debug Chat responses do not carry a unique success token that can be attributed +to one concurrent connection. + +Run `langbot-fake-provider-debug-chat-cross-pipeline-isolation` separately via +`langbot-debug-chat-isolation-gate`. Current LangBot releases may fail it because +of product bug [#2286](https://github.com/langbot-app/LangBot/issues/2286), where +Debug Chat replies can read singleton WebSocket proxy pipeline state after a +later message overwrites it. Treat that failure as regression evidence for the +product fix rather than as a fake-provider latency finding. 
+ +Use `langbot-space-debug-chat-concurrency-smoke` after the fake-provider +baseline. It runs a deliberately small real Space-provider batch and reports +user-visible latency, not pure LangBot overhead. Space/model/network failures +are dependency findings until the fake baseline shows the same symptom. +If a Space smoke passes but log guard finds telemetry posting Tracebacks, +classify that separately as `telemetry-proxy-noise` instead of clearing the +proxy or treating the Debug Chat path as failed. + +Useful commands: + +```bash +rtk bin/lbs test run langbot-fake-provider-debug-chat-load --run-id langbot-fake-load-local +rtk bin/lbs test run langbot-fake-provider-debug-chat-slow-load --run-id langbot-fake-slow-local +rtk bin/lbs test run langbot-fake-provider-debug-chat-fault-recovery --run-id langbot-fake-fault-local +rtk bin/lbs suite run langbot-debug-chat-isolation-gate --run-id langbot-debug-chat-isolation-local --include-manual-check +rtk bin/lbs test run langbot-space-debug-chat-concurrency-smoke --run-id langbot-space-smoke-local +rtk bin/lbs suite run langbot-debug-chat-load-gate --run-id langbot-debug-chat-load-local --include-manual-check +``` + +## Gate Layers + +Use the smallest gate that answers the quality question: + +- `langbot-performance-contract-gate`: fast synthetic checks for report shape, + threshold accounting, and fault taxonomy. Good for PR feedback when no live + service is running. +- `langbot-live-backend-gate`: live backend `/healthz`, + `/api/v1/system/info`, and backend log health. Good after starting a local + LangBot backend. +- `langbot-user-path-performance-gate`: browser-visible user path performance, + starting with Pipeline Debug Chat send-to-visible-completion latency. Run it + only when the browser profile and target pipeline are ready. 
+- `langbot-debug-chat-load-gate`: manual WebSocket Debug Chat load checks, + starting with controlled fake-provider baseline, slow-provider, and + fault-recovery profiles, plus an optional low-volume real Space-provider + smoke. Run fake-provider cases serially when they share a provider URL. +- `langbot-debug-chat-isolation-gate`: manual cross-pipeline Debug Chat + isolation regression gate. Current releases may fail because of #2286; keep it + separate from the normal load gate until that product fix lands. +- `langbot-performance-reliability-gate`: combined starter gate for synthetic + contracts plus live backend checks. + +Keep environment diagnostics separate from product regressions. For example, a +SOCKS proxy without Python `socksio` support should be fixed or clearly +classified by `bin/lbs env doctor`; do not hide the resulting backend +Traceback in reports. + +## Debug Chat Performance + +`pipeline-debug-chat-performance` reuses the browser Debug Chat automation and +adds `metrics.json`, `metrics_summary`, and `thresholds_summary` to +`automation-result.json`. + +Current metric: + +```text +response_duration_ms = prompt send -> expected assistant response visible and stable +``` + +This is a user-path metric, not pure LangBot overhead. If it regresses, inspect +provider latency, model route health, plugin/runtime logs, WebSocket behavior, +and browser console/network evidence before attributing the whole duration to +LangBot. + +### User-Path Gate Runbook + +1. Start the backend and frontend. The frontend must be launched with + `VITE_API_BASE_URL="$LANGBOT_BACKEND_URL"` so browser API calls reach the + backend. +2. Run `node scripts/e2e/ensure-local-agent-pipeline.mjs --write-env`. The + setup refreshes the local QA login, skips the wizard, prepares a Debug Chat + pipeline, scans Space models, tests candidates, writes tested fallback + models, and writes the selected pipeline/model env values to + `skills/.env.local`. +3. 
If setup returns `env_issue`, read `model_tests` and provider errors first. + A missing Space key, failed Space scan, or unavailable model route is not a + LangBot performance regression. +4. Run + `bin/lbs suite run langbot-user-path-performance-gate --include-manual-check`. +5. Interpret `response_p95_ms` as browser-visible send-to-completion time. It + includes provider latency; use backend logs and model test evidence to + separate LangBot overhead from the external model route. + +The setup keeps a `max-round` value in the generated pipeline config only +because the current backend truncator still reads that field directly. Do not +use it as a quality requirement for future local-agent behavior. + +## Running The First Gate + +Start with the reusable suite: + +```bash +rtk bin/lbs suite plan langbot-performance-reliability-gate +rtk bin/lbs suite start langbot-performance-reliability-gate --run-id langbot-perf-rel-local +``` + +Run synthetic contract probes first. Run live probes only after the selected +backend/frontend instance is reachable and the run owner accepts any fault +scope. diff --git a/skills/skills/langbot-testing/suites/langbot-debug-chat-isolation-gate.yaml b/skills/skills/langbot-testing/suites/langbot-debug-chat-isolation-gate.yaml new file mode 100644 index 000000000..d2b31dd32 --- /dev/null +++ b/skills/skills/langbot-testing/suites/langbot-debug-chat-isolation-gate.yaml @@ -0,0 +1,13 @@ +id: langbot-debug-chat-isolation-gate +title: "LangBot Debug Chat isolation gate" +description: "Manual/non-required cross-pipeline Debug Chat isolation gate. Current releases may fail this gate because of product bug #2286; use it as regression evidence after the routing fix lands." 
+type: reliability +priority: p1 +tags: + - reliability + - debug-chat + - websocket + - isolation + - concurrency +cases: + - langbot-fake-provider-debug-chat-cross-pipeline-isolation diff --git a/skills/skills/langbot-testing/suites/langbot-debug-chat-load-gate.yaml b/skills/skills/langbot-testing/suites/langbot-debug-chat-load-gate.yaml new file mode 100644 index 000000000..5b4950f16 --- /dev/null +++ b/skills/skills/langbot-testing/suites/langbot-debug-chat-load-gate.yaml @@ -0,0 +1,15 @@ +id: langbot-debug-chat-load-gate +title: "LangBot Debug Chat load gate" +description: "Manual/non-required message-path load checks for Pipeline Debug Chat: controlled fake-provider baseline, slow-provider and fault-recovery profiles, plus optional real Space-provider smoke. Cross-pipeline isolation is split into langbot-debug-chat-isolation-gate because current releases may fail it due to product bug #2286." +type: performance +priority: p1 +tags: + - performance + - debug-chat + - websocket + - load +cases: + - langbot-fake-provider-debug-chat-load + - langbot-fake-provider-debug-chat-slow-load + - langbot-fake-provider-debug-chat-fault-recovery + - langbot-space-debug-chat-concurrency-smoke diff --git a/skills/skills/langbot-testing/suites/langbot-live-backend-gate.yaml b/skills/skills/langbot-testing/suites/langbot-live-backend-gate.yaml new file mode 100644 index 000000000..58a978527 --- /dev/null +++ b/skills/skills/langbot-testing/suites/langbot-live-backend-gate.yaml @@ -0,0 +1,14 @@ +id: langbot-live-backend-gate +title: "LangBot live backend reliability gate" +description: "Live backend control-plane responsiveness and runtime log health checks for a locally running LangBot instance." 
+type: reliability +priority: p1 +tags: + - performance + - reliability + - live-backend + - metrics +cases: + - langbot-live-backend-latency + - langbot-live-control-plane-api + - langbot-live-backend-log-health diff --git a/skills/skills/langbot-testing/suites/langbot-performance-contract-gate.yaml b/skills/skills/langbot-testing/suites/langbot-performance-contract-gate.yaml new file mode 100644 index 000000000..b5a9eb47f --- /dev/null +++ b/skills/skills/langbot-testing/suites/langbot-performance-contract-gate.yaml @@ -0,0 +1,13 @@ +id: langbot-performance-contract-gate +title: "LangBot performance contract gate" +description: "Fast synthetic contract checks for performance metric accounting and non-destructive reliability fault taxonomy." +type: contract +priority: p1 +tags: + - performance + - reliability + - contract + - metrics +cases: + - langbot-overhead-accounting-contract + - langbot-fault-taxonomy-contract diff --git a/skills/skills/langbot-testing/suites/langbot-performance-reliability-gate.yaml b/skills/skills/langbot-testing/suites/langbot-performance-reliability-gate.yaml new file mode 100644 index 000000000..1e0d58d26 --- /dev/null +++ b/skills/skills/langbot-testing/suites/langbot-performance-reliability-gate.yaml @@ -0,0 +1,16 @@ +id: langbot-performance-reliability-gate +title: "LangBot performance and reliability starter gate" +description: "Starter gate for LangBot performance accounting, live backend control-plane latency, and non-destructive fault taxonomy checks." 
+type: reliability +priority: p1 +tags: + - performance + - reliability + - metrics + - chaos +cases: + - langbot-overhead-accounting-contract + - langbot-fault-taxonomy-contract + - langbot-live-backend-latency + - langbot-live-control-plane-api + - langbot-live-backend-log-health diff --git a/skills/skills/langbot-testing/suites/langbot-user-path-performance-gate.yaml b/skills/skills/langbot-testing/suites/langbot-user-path-performance-gate.yaml new file mode 100644 index 000000000..a6a138ec0 --- /dev/null +++ b/skills/skills/langbot-testing/suites/langbot-user-path-performance-gate.yaml @@ -0,0 +1,12 @@ +id: langbot-user-path-performance-gate +title: "LangBot user-path performance gate" +description: "Browser-visible performance checks for user-facing LangBot paths such as Pipeline Debug Chat." +type: performance +priority: p1 +tags: + - performance + - browser + - debug-chat + - user-path +cases: + - pipeline-debug-chat-performance diff --git a/skills/skills/langbot-testing/troubleshooting/telemetry-proxy-noise.yaml b/skills/skills/langbot-testing/troubleshooting/telemetry-proxy-noise.yaml new file mode 100644 index 000000000..945109029 --- /dev/null +++ b/skills/skills/langbot-testing/troubleshooting/telemetry-proxy-noise.yaml @@ -0,0 +1,23 @@ +id: telemetry-proxy-noise +title: "Telemetry posting fails through the proxy while the target flow succeeds" +date: 2026-06-25 +category: env_issue +symptoms: + - "The target Debug Chat or provider smoke request completes successfully." + - "The same log window contains a Traceback for telemetry posting." + - "The traceback references the Space telemetry endpoint." +patterns: + - "Failed to post telemetry" + - "https://space.langbot.app/api/v1/telemetry" + - "httpx.ConnectError" +likely_causes: + - "The backend process inherited proxy settings that are required for model/provider access but unreliable for telemetry posting." + - "The telemetry endpoint is temporarily unreachable through the local proxy route." 
+ - "TLS or proxy negotiation failed for the non-critical telemetry request." +fix_steps: + - "Keep the proxy configuration needed for model/provider access; do not clear it only to hide telemetry noise." + - "Check that uppercase and lowercase proxy variables are consistent before rerunning a live Space smoke." + - "Classify the target flow and log-health result separately: a successful Debug Chat run can still have an environment log-health finding." +verification: "A rerun shows the target case success patterns and no telemetry Traceback in the scanned log window, or the report explicitly records the telemetry issue as environment noise." +related_cases: + - langbot-space-debug-chat-concurrency-smoke diff --git a/skills/src/commands/env.ts b/skills/src/commands/env.ts index d5d1eeaf4..76ef33aec 100644 --- a/skills/src/commands/env.ts +++ b/skills/src/commands/env.ts @@ -1,5 +1,7 @@ import { existsSync } from "node:fs"; +import { spawnSync } from "node:child_process"; import { Socket } from "node:net"; +import { join } from "node:path"; import type { CommandContext } from "../types.ts"; import { parseOptions } from "../cli.ts"; import { loadEnv } from "../fs.ts"; @@ -88,6 +90,37 @@ function compareProxyPair(env: Record, upper: string, lower: str return null; } +function envValue(env: Record, key: string): string { + return process.env[key] ?? env[key] ?? ""; +} + +function activeSocksProxy(env: Record): { key: string; value: string } | null { + for (const key of ["ALL_PROXY", "all_proxy", "HTTPS_PROXY", "https_proxy", "HTTP_PROXY", "http_proxy"]) { + const value = envValue(env, key); + if (/^socks/i.test(value)) return { key, value }; + } + return null; +} + +function checkSocksio(env: Record): string | null { + const proxy = activeSocksProxy(env); + if (!proxy) return null; + + const repo = env.LANGBOT_REPO; + const python = repo ? 
join(repo, ".venv", "bin", "python") : ""; + if (!python || !existsSync(python)) { + return `SOCKS proxy ${proxy.key} is configured (${redactEnvValue(proxy.key, proxy.value)}), but LangBot venv python was not found; after creating the venv, verify it can import socksio.`; + } + + const result = spawnSync(python, ["-c", "import socksio"], { + encoding: "utf8", + timeout: 5000, + }); + if (result.status === 0) return null; + + return `SOCKS proxy ${proxy.key} is configured (${redactEnvValue(proxy.key, proxy.value)}), but ${python} cannot import socksio; run \`${python} -m pip install socksio\` or start LangBot without SOCKS proxy env.`; +} + export async function commandEnvDoctor(ctx: CommandContext): Promise { const env = loadEnv(ctx.root); const failures: string[] = []; @@ -117,6 +150,8 @@ export async function commandEnvDoctor(ctx: CommandContext): Promise { ]) { if (mismatch) failures.push(mismatch); } + const socksioFailure = checkSocksio(env); + if (socksioFailure) failures.push(socksioFailure); for (const [label, result] of await Promise.all([ checkUrl("LANGBOT_BACKEND_URL", env.LANGBOT_BACKEND_URL).then((result) => ["LANGBOT_BACKEND_URL", result] as const), diff --git a/skills/src/commands/suite.ts b/skills/src/commands/suite.ts index 403156100..7ab556c5b 100644 --- a/skills/src/commands/suite.ts +++ b/skills/src/commands/suite.ts @@ -465,6 +465,41 @@ function outputTail(value: string | Buffer | null | undefined): string { return String(value ?? 
"").trim().slice(-4000); } +function exitStatusFromResultStatus(status: string): number { + if (status === "pass") return 0; + if (status === "blocked" || status === "env_issue" || status === "flaky") return 2; + return 1; +} + +function executionStatusFromExitStatus(status: number): string { + if (status === 0) return "ok"; + if (status === 2) return "classified"; + return "nonzero"; +} + +function executionFromCaseResultFile(caseItem: Record): Record | null { + const resultPath = join(String(caseItem.evidence_dir), "result.json"); + if (!existsSync(resultPath)) return null; + try { + const parsed = JSON.parse(readFileSync(resultPath, "utf8")) as Record; + if ( + parsed.case_id !== caseItem.id || + parsed.run_id !== caseItem.run_id || + typeof parsed.status !== "string" + ) return null; + const exitStatus = exitStatusFromResultStatus(parsed.status); + return { + status: executionStatusFromExitStatus(exitStatus), + exit_status: exitStatus, + reason: typeof parsed.reason === "string" ? parsed.reason : "result.json completed", + result_status: parsed.status, + result_json: resultPath, + }; + } catch { + return null; + } +} + function executionProblemStatus(executions: Array>): string { const statuses = executions.map((item) => String(item.status)); if (statuses.includes("nonzero")) return "fail"; @@ -523,12 +558,18 @@ export function commandSuiteRun(ctx: CommandContext): number { encoding: "utf8", stdio: options.json === true ? "pipe" : "inherit", }); - const status = result.error ? 1 : result.status ?? 1; + const fileExecution = result.error ? executionFromCaseResultFile(caseItem) : null; + const status = typeof fileExecution?.exit_status === "number" + ? fileExecution.exit_status + : result.error ? 1 : result.status ?? 1; executions.push({ id: caseItem.id, - status: status === 0 ? "ok" : "nonzero", + status: fileExecution?.status ?? executionStatusFromExitStatus(status), exit_status: status, - reason: result.error?.message || "", + reason: fileExecution?.reason ?? 
result.error?.message ?? "", + result_status: fileExecution?.result_status, + result_json: fileExecution?.result_json, + spawn_error: fileExecution && result.error ? result.error.message : undefined, stdout: outputTail(result.stdout), stderr: outputTail(result.stderr), }); diff --git a/skills/src/commands/test.ts b/skills/src/commands/test.ts index 2cce7a1e5..67ddc3122 100644 --- a/skills/src/commands/test.ts +++ b/skills/src/commands/test.ts @@ -271,7 +271,7 @@ function reportTemplate(mode: string): Record { target_tested: "Probe target, endpoint, file, command, or service actually checked", execution_path: "automation script | shell command | direct API | other", probe_result: "What the probe observed", - logs_or_artifacts: "Log, filesystem, API, or other artifact paths collected", + metrics_or_artifacts: "Metrics, logs, filesystem artifacts, traces, or profiles collected", diagnostics: "Extra diagnostics used, if any", matched_troubleshooting: "Troubleshooting ids matched, if any", assets_to_update: "New case/reference/troubleshooting entries to add", @@ -320,7 +320,7 @@ function manualEvidenceTemplate(mode: string): ManualEvidenceTemplate { target_tested: "TODO: probe target, endpoint, file, command, or service actually checked", execution_path: "TODO: automation script | shell command | direct API | other", probe_result: "TODO: observed probe result", - logs_or_artifacts: "TODO: evidence paths or skipped reason", + metrics_or_artifacts: "TODO: metrics, logs, filesystem artifacts, traces, or profiles collected", diagnostics: "TODO: additional diagnostics used, if any", matched_troubleshooting: "TODO: troubleshooting ids matched, if any", assets_to_update: "TODO: case/reference/troubleshooting updates to make", @@ -1099,6 +1099,41 @@ function executionTail(value: string | Buffer | null | undefined): string { return String(value ?? 
"").trim().slice(-4000); } +function exitStatusFromResultStatus(status: string): number { + if (status === "pass") return 0; + if (status === "blocked" || status === "env_issue" || status === "flaky") return 2; + return 1; +} + +function executionStatusFromExitStatus(status: number): string { + if (status === 0) return "ok"; + if (status === 2) return "classified"; + return "nonzero"; +} + +function executionFromAutomationResultFile( + evidenceDir: string, + caseId: string, + runId: string, +): { status: string; exit_status: number; reason: string; result_status: string; path: string } | null { + const resultPath = join(evidenceDir, "automation-result.json"); + if (!existsSync(resultPath)) return null; + try { + const parsed = JSON.parse(readFileSync(resultPath, "utf8")) as Record; + if (parsed.case_id !== caseId || parsed.run_id !== runId || typeof parsed.status !== "string") return null; + const exitStatus = exitStatusFromResultStatus(parsed.status); + return { + status: executionStatusFromExitStatus(exitStatus), + exit_status: exitStatus, + reason: typeof parsed.reason === "string" ? 
parsed.reason : "automation-result.json completed", + result_status: parsed.status, + path: resultPath, + }; + } catch { + return null; + } +} + function runSetupAutomation( ctx: CommandContext, item: StructuredItem, @@ -1224,6 +1259,30 @@ export function commandTestRun(ctx: CommandContext): number { }); if (result.error) { + const fileExecution = executionFromAutomationResultFile( + run.automation.evidence_dir, + String(run.case.id), + run.run_id, + ); + if (fileExecution) { + if (options.json !== true) { + console.error(`WARN: automation spawn reported an error, but ${fileExecution.path} completed: ${result.error.message}`); + } + if (options.json === true) { + console.log(JSON.stringify({ + run, + setup_executions: setupExecutions, + automation_execution: { + ...fileExecution, + spawn_error: result.error.message, + stdout: executionTail(result.stdout), + stderr: executionTail(result.stderr), + }, + exit_status: fileExecution.exit_status, + }, null, 2)); + } + return fileExecution.exit_status; + } if (options.json !== true) console.error(`ERROR: failed to run automation: ${result.error.message}`); if (options.json === true) { console.log(JSON.stringify({ @@ -1247,7 +1306,7 @@ export function commandTestRun(ctx: CommandContext): number { run, setup_executions: setupExecutions, automation_execution: { - status: status === 0 ? 
"ok" : "nonzero", + status: executionStatusFromExitStatus(status), exit_status: status, stdout: executionTail(result.stdout), stderr: executionTail(result.stderr), @@ -1311,6 +1370,7 @@ function renderMarkdownReport(report: TestReport): string { const environment = report.environment; const logGuard = report.log_guard; const troubleshooting = report.troubleshooting; + const automation = report.automation_result; const lines: string[] = []; lines.push(`# Test Report: ${reportCase.id}`); @@ -1323,20 +1383,41 @@ function renderMarkdownReport(report: TestReport): string { lines.push(`Type: ${reportCase.type}`); lines.push(""); lines.push("## Result"); - lines.push(`- result: ${evidence.result}`); - for (const [key, value] of Object.entries(evidence)) { - if (key !== "result") lines.push(`- ${key}: ${value}`); + if (automation.status === "loaded" && automation.result) { + lines.push(`- result: ${automation.result}`); + if (automation.reason) lines.push(`- reason: ${automation.reason}`); + if (automation.url) lines.push(`- target_tested: ${automation.url}`); + if (automation.path) lines.push(`- automation_result: ${automation.path}`); + if (automation.artifacts) lines.push(`- artifacts: ${JSON.stringify(automation.artifacts)}`); + } else { + lines.push(`- result: ${evidence.result}`); + for (const [key, value] of Object.entries(evidence)) { + if (key !== "result") lines.push(`- ${key}: ${value}`); + } } lines.push(""); lines.push("## Automation Result"); - lines.push(`- status: ${report.automation_result.status}`); - if (report.automation_result.path) lines.push(`- path: ${report.automation_result.path}`); - if (report.automation_result.result) lines.push(`- result: ${report.automation_result.result}`); - if (report.automation_result.reason) lines.push(`- reason: ${report.automation_result.reason}`); - if (report.automation_result.started_at_local) lines.push(`- started_at_local: ${report.automation_result.started_at_local}`); - if 
(report.automation_result.finished_at_local) lines.push(`- finished_at_local: ${report.automation_result.finished_at_local}`); - if (report.automation_result.url) lines.push(`- url: ${report.automation_result.url}`); - if (report.automation_result.expected_text) lines.push(`- expected_text: ${report.automation_result.expected_text}`); + lines.push(`- status: ${automation.status}`); + if (automation.path) lines.push(`- path: ${automation.path}`); + if (automation.result) lines.push(`- result: ${automation.result}`); + if (automation.reason) lines.push(`- reason: ${automation.reason}`); + if (automation.duration_ms !== undefined) lines.push(`- duration_ms: ${automation.duration_ms}`); + if (automation.started_at_local) lines.push(`- started_at_local: ${automation.started_at_local}`); + if (automation.finished_at_local) lines.push(`- finished_at_local: ${automation.finished_at_local}`); + if (automation.url) lines.push(`- url: ${automation.url}`); + if (automation.expected_text) lines.push(`- expected_text: ${automation.expected_text}`); + if (automation.metrics_summary) { + lines.push("- metrics_summary:"); + lines.push(` ${JSON.stringify(automation.metrics_summary)}`); + } + if (automation.thresholds_summary) { + lines.push("- thresholds_summary:"); + lines.push(` ${JSON.stringify(automation.thresholds_summary)}`); + } + if (automation.artifacts) { + lines.push("- artifacts:"); + lines.push(` ${JSON.stringify(automation.artifacts)}`); + } lines.push(""); lines.push("## Environment"); for (const [key, value] of Object.entries(environment)) lines.push(`- ${key}=${value}`); diff --git a/skills/src/commands/validate.ts b/skills/src/commands/validate.ts index 8b15d6344..590032ef8 100644 --- a/skills/src/commands/validate.ts +++ b/skills/src/commands/validate.ts @@ -126,6 +126,9 @@ function validateCaseItem(root: string, item: StructuredItem, skillNames: Set ( validateSetupAutomationEntry(root, entry, caseIds).map((error) => `${item.path}: ${error}`) )), @@ -183,10 
+186,62 @@ function validateCaseItem(root: string, item: StructuredItem, skillNames: Set 599) { + errors.push(`${item.path}: 'automation_fake_provider_fault_status' must be an HTTP 4xx or 5xx status string`); + } + } const streamOutput = scalar(item.fields, "automation_stream_output"); if (streamOutput && !["0", "1", "false", "true"].includes(streamOutput)) { errors.push(`${item.path}: 'automation_stream_output' must be one of 0, 1, false, or true`); } + for (const key of [ + "automation_debug_chat_load_stream", + "automation_debug_chat_load_reset", + "automation_debug_chat_load_fail_on_final_mismatch", + "automation_fake_provider_fail_after_first_chunk", + "automation_fake_provider_dynamic_response", + ]) { + const value = scalar(item.fields, key); + if (value && !["0", "1", "false", "true"].includes(value)) { + errors.push(`${item.path}: '${key}' must be one of 0, 1, false, or true`); + } + } const imageBase64Fixture = scalar(item.fields, "automation_image_base64_fixture"); if (imageBase64Fixture && !existsSync(join(root, imageBase64Fixture))) { errors.push(`${item.path}: automation image fixture does not exist: ${imageBase64Fixture}`); diff --git a/skills/src/constants.ts b/skills/src/constants.ts index 015a9bd39..5cfe37f8a 100644 --- a/skills/src/constants.ts +++ b/skills/src/constants.ts @@ -9,7 +9,18 @@ export const requiredEnvKeys = [ ]; export const caseModeValues = ["agent-browser", "probe"]; -export const caseTypeValues = ["smoke", "regression", "feature", "provider", "exploratory"]; +export const caseTypeValues = [ + "smoke", + "regression", + "feature", + "provider", + "exploratory", + "contract", + "performance", + "reliability", + "chaos", + "security", +]; export const casePriorityValues = ["p0", "p1", "p2"]; export const caseRiskValues = ["low", "medium", "high"]; export const caseEvidenceValues = [ @@ -21,10 +32,24 @@ export const caseEvidenceValues = [ "frontend_log", "api_diagnostic", "filesystem", + "metrics", + "trace", + "profile", + 
"resource_log", ]; export const testResultStatusValues = ["pass", "fail", "blocked", "env_issue", "flaky"]; export const troubleshootingCategoryValues = ["product", "env_issue", "external_dependency", "blocked", "flaky"]; -export const suiteTypeValues = ["smoke", "regression", "release_gate", "exploratory"]; +export const suiteTypeValues = [ + "smoke", + "regression", + "release_gate", + "exploratory", + "contract", + "performance", + "reliability", + "chaos", + "security", +]; export const suiteRequiredStrings = ["id", "title", "description", "type", "priority"]; export const suiteRequiredLists = ["tags", "cases"]; diff --git a/skills/src/log-guard.ts b/skills/src/log-guard.ts index 253cb229e..6f7f541a7 100644 --- a/skills/src/log-guard.ts +++ b/skills/src/log-guard.ts @@ -91,6 +91,7 @@ export type AutomationResultEvidence = { path?: string; result?: string; reason?: string; + duration_ms?: number; started_at?: string; started_at_local?: string; finished_at?: string; @@ -98,6 +99,9 @@ export type AutomationResultEvidence = { url?: string; prompt?: string; expected_text?: string; + metrics_summary?: Record; + thresholds_summary?: Record; + artifacts?: Record; }; type MutableScanState = { @@ -594,6 +598,18 @@ function stringField(data: Record, key: string): string | undef return typeof value === "string" && value.trim() ? value : undefined; } +function numberField(data: Record, key: string): number | undefined { + const value = data[key]; + return typeof value === "number" && Number.isFinite(value) ? value : undefined; +} + +function objectField(data: Record, key: string): Record | undefined { + const value = data[key]; + return value && typeof value === "object" && !Array.isArray(value) + ? value as Record + : undefined; +} + function evidenceDirFromOptions(options: Record): string | undefined { const explicit = typeof options["evidence-dir"] === "string" ? 
options["evidence-dir"] : undefined; if (explicit) return resolve(explicit); @@ -628,6 +644,7 @@ export function readAutomationResultEvidence(options: Record { } }); +test("suite run preserves classified env_issue automation results", () => { + const tmp = mkdtempSync(join(tmpdir(), "lbs-suite-run-env-issue-")); + try { + const skillDir = join(tmp, "skills", "langbot-testing"); + const casesDir = join(skillDir, "cases"); + const suitesDir = join(skillDir, "suites"); + const scriptsDir = join(tmp, "scripts"); + mkdirSync(casesDir, { recursive: true }); + mkdirSync(suitesDir, { recursive: true }); + mkdirSync(scriptsDir, { recursive: true }); + writeFileSync(join(skillDir, "SKILL.md"), "---\nname: langbot-testing\ndescription: Testing.\n---\n\n# Testing\n"); + writeFileSync(join(tmp, "skills", ".env"), ""); + writeFileSync( + join(casesDir, "env-case.yaml"), + [ + "id: env-case", + "title: Env Case", + "mode: probe", + "area: qa", + "type: smoke", + "priority: p2", + "risk: low", + "ci_eligible: true", + "automation: scripts/env-issue.mjs", + "evidence_required:", + " - filesystem", + ].join("\n"), + ); + writeFileSync( + join(suitesDir, "mini.yaml"), + [ + "id: mini", + "title: Mini", + "description: Mini suite.", + "type: smoke", + "priority: p2", + "tags:", + " - qa", + "cases:", + " - env-case", + ].join("\n"), + ); + writeFileSync( + join(scriptsDir, "env-issue.mjs"), + [ + "import { mkdirSync, writeFileSync } from 'node:fs';", + "import { join } from 'node:path';", + "mkdirSync(process.env.LBS_EVIDENCE_DIR, { recursive: true });", + "const result = {", + " case_id: process.env.LBS_CASE_ID,", + " run_id: process.env.LBS_RUN_ID,", + " status: 'env_issue',", + " reason: 'backend not reachable',", + " evidence_collected: ['filesystem']", + "};", + "writeFileSync(join(process.env.LBS_EVIDENCE_DIR, 'result.json'), JSON.stringify(result));", + "writeFileSync(join(process.env.LBS_EVIDENCE_DIR, 'automation-result.json'), JSON.stringify({ ...result, source: 'automation' 
}));", + "process.exit(2);", + ].join("\n"), + ); + + const result = capture(() => commandSuiteRun({ + root: tmp, + args: ["suite", "run", "mini", "--run-id", "mini-run", "--evidence-dir", join(tmp, "evidence"), "--json"], + })); + + assert.equal(result.code, 2); + const payload = JSON.parse(result.output); + assert.equal(payload.executions[0].status, "classified"); + assert.equal(payload.report.status, "env_issue"); + assert.equal(payload.report.execution_status, "ok"); + } finally { + rmSync(tmp, { recursive: true, force: true }); + } +}); + test("suite run failure cannot be masked by stale pass result", () => { const tmp = mkdtempSync(join(tmpdir(), "lbs-suite-run-stale-pass-")); try { @@ -1369,6 +1445,56 @@ test("env doctor does not require proxy variables", async () => { } }); +test("env doctor reports missing socksio for active SOCKS proxy", async () => { + const tmp = mkdtempSync(join(tmpdir(), "lbs-env-doctor-socksio-")); + const originalAllProxy = process.env.ALL_PROXY; + const originalAllProxyLower = process.env.all_proxy; + try { + delete process.env.ALL_PROXY; + delete process.env.all_proxy; + const skillsDir = join(tmp, "skills"); + const repoDir = join(tmp, "LangBot"); + const webDir = join(repoDir, "web"); + const venvBin = join(repoDir, ".venv", "bin"); + const browserProfile = join(tmp, "browser-profile"); + const chromium = join(tmp, "chromium"); + mkdirSync(skillsDir, { recursive: true }); + mkdirSync(webDir, { recursive: true }); + mkdirSync(venvBin, { recursive: true }); + mkdirSync(browserProfile, { recursive: true }); + writeFileSync(chromium, ""); + const python = join(venvBin, "python"); + writeFileSync(python, "#!/bin/sh\nexit 1\n"); + chmodSync(python, 0o755); + writeFileSync( + join(skillsDir, ".env"), + [ + "LANGBOT_BACKEND_URL=http://127.0.0.1:59996", + "LANGBOT_FRONTEND_URL=http://127.0.0.1:59996", + "LANGBOT_DEV_FRONTEND_URL=http://127.0.0.1:59996", + `LANGBOT_REPO=${repoDir}`, + `LANGBOT_WEB_REPO=${webDir}`, + 
`LANGBOT_BROWSER_PROFILE=${browserProfile}`, + `LANGBOT_CHROMIUM_EXECUTABLE=${chromium}`, + "ALL_PROXY=socks5://127.0.0.1:7890", + ].join("\n"), + ); + + const result = await captureAsync(() => commandEnvDoctor({ root: tmp, args: ["env", "doctor"] })); + + assert.equal(result.code, 1); + assert.match(result.output, /FAIL: SOCKS proxy ALL_PROXY is configured/); + assert.match(result.output, /cannot import socksio/); + assert.match(result.output, /-m pip install socksio/); + } finally { + if (originalAllProxy === undefined) delete process.env.ALL_PROXY; + else process.env.ALL_PROXY = originalAllProxy; + if (originalAllProxyLower === undefined) delete process.env.all_proxy; + else process.env.all_proxy = originalAllProxyLower; + rmSync(tmp, { recursive: true, force: true }); + } +}); + test("env show redacts secret-like values by default", () => { const tmp = mkdtempSync(join(tmpdir(), "lbs-env-show-redact-")); try { @@ -2521,6 +2647,38 @@ test("test report renders a reusable evidence template", () => { assert.match(result.output, /no log files provided/); }); +test("test report promotes loaded automation evidence into result section", () => { + const tmp = mkdtempSync(join(tmpdir(), "lbs-report-automation-")); + try { + writeFileSync( + join(tmp, "automation-result.json"), + JSON.stringify({ + status: "pass", + reason: "latency thresholds passed", + url: "http://127.0.0.1:5300", + artifacts: { metrics_json: join(tmp, "metrics.json") }, + }), + ); + + const result = capture(() => commandTestReport(ctx([ + "test", + "report", + "langbot-live-backend-latency", + "--evidence-dir", + tmp, + "--no-auto-log", + ]))); + + assert.equal(result.code, 0); + assert.match(result.output, /## Result\n- result: pass\n- reason: latency thresholds passed/); + assert.match(result.output, /- target_tested: http:\/\/127\.0\.0\.1:5300/); + assert.doesNotMatch(result.output, /target_tested: TODO/); + assert.match(result.output, /## Automation Result/); + } finally { + rmSync(tmp, { 
recursive: true, force: true }); + } +}); + test("validate rejects dangling case references and missing automation scripts", () => { const tmp = mkdtempSync(join(tmpdir(), "lbs-validate-strict-")); try {