From 67437c2f5a7cbd2a89e66ba3901fb935e265fcde Mon Sep 17 00:00:00 2001
From: huanghuoguoguo <60681390+huanghuoguoguo@users.noreply.github.com>
Date: Thu, 25 Jun 2026 00:07:37 +0800
Subject: [PATCH] Add performance and reliability QA gates

---
 skills/schemas/case.schema.json               |  34 +-
 skills/schemas/suite.schema.json              |  12 +-
 skills/scripts/e2e/pipeline-debug-chat.mjs    |  80 ++++-
 skills/skills.index.json                      | 231 +++++++++++++
 skills/skills/langbot-testing/SKILL.md        |   3 +
 .../langbot-fault-taxonomy-contract.yaml      |  35 ++
 .../cases/langbot-live-backend-latency.yaml   |  42 +++
 .../langbot-live-backend-log-health.yaml      |  45 +++
 .../cases/langbot-live-control-plane-api.yaml |  44 +++
 .../langbot-overhead-accounting-contract.yaml |  37 +++
 .../pipeline-debug-chat-performance.yaml      |  75 +++++
 .../plugins/qa-plugin-smoke/.gitignore        |   4 +-
 .../dist/qa-plugin-smoke-0.1.0.lbpkg          | Bin 0 -> 5160 bytes
 .../langbot-fault-taxonomy-contract.mjs       | 159 +++++++++
 .../probes/langbot-live-backend-latency.mjs   | 212 ++++++++++++
 .../langbot-live-backend-log-health.mjs       | 205 ++++++++++++
 .../probes/langbot-live-control-plane-api.mjs | 311 ++++++++++++++++++
 .../langbot-overhead-accounting-contract.mjs  | 162 +++++++++
 .../performance-reliability-testing.md        | 173 ++++++++++
 .../suites/langbot-live-backend-gate.yaml     |  14 +
 .../langbot-performance-contract-gate.yaml    |  13 +
 .../langbot-performance-reliability-gate.yaml |  16 +
 .../langbot-user-path-performance-gate.yaml   |  12 +
 skills/src/commands/env.ts                    |  35 ++
 skills/src/commands/suite.ts                  |  47 ++-
 skills/src/commands/test.ts                   | 109 +++++-
 skills/src/commands/validate.ts               |   3 +
 skills/src/constants.ts                       |  29 +-
 skills/src/log-guard.ts                       |  20 ++
 skills/src/readiness.ts                       |   2 +
 skills/test/lbs-cli.test.ts                   | 160 ++++++++-
 31 files changed, 2299 insertions(+), 25 deletions(-)
 create mode 100644 skills/skills/langbot-testing/cases/langbot-fault-taxonomy-contract.yaml
 create mode 100644 skills/skills/langbot-testing/cases/langbot-live-backend-latency.yaml
 create mode 100644 skills/skills/langbot-testing/cases/langbot-live-backend-log-health.yaml
 create mode 100644 skills/skills/langbot-testing/cases/langbot-live-control-plane-api.yaml
 create mode 100644 skills/skills/langbot-testing/cases/langbot-overhead-accounting-contract.yaml
 create mode 100644 skills/skills/langbot-testing/cases/pipeline-debug-chat-performance.yaml
 create mode 100644 skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/dist/qa-plugin-smoke-0.1.0.lbpkg
 create mode 100644 skills/skills/langbot-testing/probes/langbot-fault-taxonomy-contract.mjs
 create mode 100644 skills/skills/langbot-testing/probes/langbot-live-backend-latency.mjs
 create mode 100644 skills/skills/langbot-testing/probes/langbot-live-backend-log-health.mjs
 create mode 100644 skills/skills/langbot-testing/probes/langbot-live-control-plane-api.mjs
 create mode 100644 skills/skills/langbot-testing/probes/langbot-overhead-accounting-contract.mjs
 create mode 100644 skills/skills/langbot-testing/references/performance-reliability-testing.md
 create mode 100644 skills/skills/langbot-testing/suites/langbot-live-backend-gate.yaml
 create mode 100644 skills/skills/langbot-testing/suites/langbot-performance-contract-gate.yaml
 create mode 100644 skills/skills/langbot-testing/suites/langbot-performance-reliability-gate.yaml
 create mode 100644 skills/skills/langbot-testing/suites/langbot-user-path-performance-gate.yaml

diff --git a/skills/schemas/case.schema.json b/skills/schemas/case.schema.json
index f6365c062..0d63d8dec 100644
--- a/skills/schemas/case.schema.json
+++ b/skills/schemas/case.schema.json
@@ -48,7 +48,18 @@
     },
     "type": {
       "type": "string",
-      "enum": ["smoke", "regression", "feature", "provider", "exploratory"]
+      "enum": [
+        "smoke",
+        "regression",
+        "feature",
+        "provider",
+        "exploratory",
+        "contract",
+        "performance",
+        "reliability",
+        "chaos",
+        "security"
+      ]
     },
     "priority": {
       "type": "string",
@@ -102,7 +113,11 @@
           "backend_log",
           "frontend_log",
           "api_diagnostic",
-          "filesystem"
+          "filesystem",
+          "metrics",
+          "trace",
+          "profile",
+          "resource_log"
         ]
       },
       "minItems": 1
@@ -188,9 +203,24 @@
       "type": "string",
       "enum": ["person", "group"]
     },
+    "automation_debug_chat_response_p95_ms": {
+      "type": "string"
+    },
+    "automation_debug_chat_max_error_rate": {
+      "type": "string"
+    },
     "automation_filesystem_checks_json": {
       "type": "string"
     },
+    "metrics_thresholds_json": {
+      "type": "string"
+    },
+    "load_profile_json": {
+      "type": "string"
+    },
+    "fault_model_json": {
+      "type": "string"
+    },
     "automation_pipeline_url_env": {
       "type": "string",
       "pattern": "^[A-Z][A-Z0-9_]*$"
diff --git a/skills/schemas/suite.schema.json b/skills/schemas/suite.schema.json
index 3da1a3e85..4f3fa7c7a 100644
--- a/skills/schemas/suite.schema.json
+++ b/skills/schemas/suite.schema.json
@@ -18,7 +18,17 @@
     },
     "type": {
       "type": "string",
-      "enum": ["smoke", "regression", "release_gate", "exploratory"]
+      "enum": [
+        "smoke",
+        "regression",
+        "release_gate",
+        "exploratory",
+        "contract",
+        "performance",
+        "reliability",
+        "chaos",
+        "security"
+      ]
     },
     "priority": {
       "type": "string",
diff --git a/skills/scripts/e2e/pipeline-debug-chat.mjs b/skills/scripts/e2e/pipeline-debug-chat.mjs
index 87fe9ae79..4b20f7757 100755
--- a/skills/scripts/e2e/pipeline-debug-chat.mjs
+++ b/skills/scripts/e2e/pipeline-debug-chat.mjs
@@ -54,6 +54,7 @@ const debugChatSessionType = env.LANGBOT_E2E_DEBUG_CHAT_SESSION_TYPE || "person"
 const pipelineConfigDiagnosticPath = resolve(paths.evidenceDir, "pipeline-config-diagnostic.json");
 const debugChatResetDiagnosticPath = resolve(paths.evidenceDir, "debug-chat-reset-diagnostic.json");
 const pipelineConfigRestoreDiagnosticPath = resolve(paths.evidenceDir, "pipeline-config-restore-diagnostic.json");
+const metricsPath = resolve(paths.evidenceDir, "metrics.json");
 const startedAt = new Date();
 
 let browser;
@@ -80,10 +81,11 @@ let result = {
     console_log: paths.consoleLog,
     network_log: paths.networkLog,
     screenshot: paths.screenshot,
+    metrics_json: metricsPath,
     automation_result_json: paths.automationResultJson,
     result_json: paths.resultJson,
   },
-  evidence_collected: ["ui", "screenshot", "console", "network"],
+  evidence_collected: ["ui", "screenshot", "console", "network", "metrics"],
 };
 
 function boolFromEnv(value, defaultValue) {
@@ -103,6 +105,29 @@ function parseJsonEnv(key, fallback) {
   }
 }
 
+function positiveNumberEnv(key, fallback) { // Non-negative finite number read from env[key]; unset, blank, or invalid input yields fallback.
+  const value = String(env[key] || "").trim() === "" ? Number.NaN : Number(env[key]); // Number("") is 0, so unset/blank must not parse as 0.
+  return Number.isFinite(value) && value >= 0 ? value : fallback; // an explicit "0" is still honoured (e.g. a zero error-rate budget)
+}
+
+function percentile(values, percentileValue) { // Nearest-rank percentile of values, rounded to 3 decimals; returns 0 for an empty array.
+  if (values.length === 0) return 0;
+  const sorted = [...values].sort((a, b) => a - b); // sort a copy so the caller's array keeps its order
+  const rank = Math.ceil((percentileValue / 100) * sorted.length) - 1; // nearest-rank index; -1 when percentileValue <= 0
+  return Number(sorted[Math.min(sorted.length - 1, Math.max(0, rank))].toFixed(3)); // clamp both ends so 0 and >100 stay in bounds
+}
+
+function stats(values) {
+  // Summarize samples as min/p50/p95/p99/max, each rounded to 3 decimals; an empty sample set reports all zeros.
+  const summary = { min: 0, p50: 0, p95: 0, p99: 0, max: 0 };
+  if (values.length === 0) return summary;
+  const round3 = (sample) => Number(sample.toFixed(3));
+  summary.min = round3(Math.min(...values));
+  for (const rank of [50, 95, 99]) summary[`p${rank}`] = percentile(values, rank);
+  summary.max = round3(Math.max(...values));
+  return summary;
+}
+
 function promptStepsFromEnv() {
   const rawSteps = parseJsonEnv("LANGBOT_E2E_PROMPTS_JSON", null);
   if (rawSteps === null) {
@@ -658,6 +683,7 @@ try {
       } else {
         for (let index = 0; index < promptSteps.length; index += 1) {
           const step = promptSteps[index];
+          const promptStartedAt = Date.now();
           const chatResult = await runDebugChatPrompt(page, {
             prompt: step.prompt,
             expectedText: step.expectedText,
@@ -665,11 +691,13 @@ try {
             imagePath: index === 0 ? imagePath : "",
             failureSignals: failureSignals.length > 0 ? failureSignals : undefined,
           });
+          const promptDurationMs = Date.now() - promptStartedAt;
           result.chat_results.push({
             index,
             expected_text: step.expectedText,
             status: chatResult.status,
             reason: chatResult.reason,
+            response_duration_ms: promptDurationMs,
             min_expected_count: chatResult.min_expected_count,
             final_count: chatResult.final_count,
             before_assistant_expected_count: chatResult.before_assistant_expected_count,
@@ -714,6 +742,56 @@ try {
   const finishedAt = new Date();
   result.finished_at = finishedAt.toISOString();
   result.finished_at_local = localIsoWithOffset(finishedAt);
+  result.duration_ms = finishedAt.getTime() - startedAt.getTime();
+  const responseDurations = result.chat_results
+    .map((item) => item.response_duration_ms)
+    .filter((value) => Number.isFinite(value));
+  const passedPrompts = result.chat_results.filter((item) => item.status === "pass").length;
+  const attemptedPrompts = result.chat_results.length;
+  const errorRate = attemptedPrompts === 0 ? 1 : Number(((attemptedPrompts - passedPrompts) / attemptedPrompts).toFixed(4));
+  const responseStats = stats(responseDurations);
+  const responseP95BudgetMs = positiveNumberEnv(
+    "LANGBOT_E2E_DEBUG_CHAT_RESPONSE_P95_MS",
+    positiveNumberEnv("LANGBOT_DEBUG_CHAT_RESPONSE_P95_MS", safeResponseTimeoutMs),
+  );
+  const maxErrorRate = positiveNumberEnv("LANGBOT_E2E_DEBUG_CHAT_MAX_ERROR_RATE", 0);
+  const metrics = {
+    probe: caseId,
+    url: result.url,
+    prompt_count: result.prompt_count,
+    attempted_prompt_count: attemptedPrompts,
+    passed_prompt_count: passedPrompts,
+    error_rate: errorRate,
+    response_duration_ms: responseStats,
+    total_duration_ms: result.duration_ms,
+    chat_results: result.chat_results,
+  };
+  result.metrics_summary = {
+    prompt_count: metrics.prompt_count,
+    attempted_prompt_count: metrics.attempted_prompt_count,
+    passed_prompt_count: metrics.passed_prompt_count,
+    error_rate: metrics.error_rate,
+    response_p50_ms: metrics.response_duration_ms.p50,
+    response_p95_ms: metrics.response_duration_ms.p95,
+    total_duration_ms: metrics.total_duration_ms,
+  };
+  result.thresholds_summary = {
+    response_p95_ms: {
+      actual: metrics.response_duration_ms.p95,
+      max: responseP95BudgetMs,
+      pass: attemptedPrompts > 0 && metrics.response_duration_ms.p95 <= responseP95BudgetMs,
+    },
+    error_rate: {
+      actual: metrics.error_rate,
+      max: maxErrorRate,
+      pass: metrics.error_rate <= maxErrorRate,
+    },
+  };
+  await writeFile(metricsPath, `${JSON.stringify(metrics, null, 2)}\n`, "utf8");
+  if (result.status === "pass" && !Object.values(result.thresholds_summary).every((item) => item.pass)) {
+    result.status = "fail";
+    result.reason = "Debug Chat performance breached response latency or error-rate thresholds.";
+  }
   const existingEvidence = {};
   for (const [key, value] of Object.entries(result.evidence)) {
     if (typeof value !== "string") continue;
diff --git a/skills/skills.index.json b/skills/skills.index.json
index d56a84822..190cf1305 100644
--- a/skills/skills.index.json
+++ b/skills/skills.index.json
@@ -130,6 +130,7 @@
         "references/local-agent-runner.md",
         "references/mcp-stdio-testing.md",
         "references/model-provider-testing.md",
+        "references/performance-reliability-testing.md",
         "references/pipeline-debug-chat.md",
         "references/plugin-e2e-smoke.md",
         "references/sandbox-skill-authoring.md",
@@ -150,6 +151,11 @@
         "agent-runner-release-preflight",
         "agent-runner-runtime-chaos",
         "dify-agent-debug-chat",
+        "langbot-fault-taxonomy-contract",
+        "langbot-live-backend-latency",
+        "langbot-live-backend-log-health",
+        "langbot-live-control-plane-api",
+        "langbot-overhead-accounting-contract",
         "langrag-kb-retrieve",
         "langrag-parser-golden-e2e",
         "langrag-sentinel-kb-discover",
@@ -165,6 +171,7 @@
         "mcp-stdio-register",
         "mcp-stdio-tool-call",
         "pipeline-debug-chat",
+        "pipeline-debug-chat-performance",
         "plugin-e2e-smoke",
         "provider-deepseek",
         "qa-plugin-smoke-live-install",
@@ -486,6 +493,128 @@
             "backend_log"
           ]
         },
+        {
+          "id": "langbot-fault-taxonomy-contract",
+          "title": "LangBot fault taxonomy and cleanup contract",
+          "mode": "probe",
+          "area": "reliability",
+          "type": "chaos",
+          "priority": "p1",
+          "risk": "medium",
+          "ci_eligible": true,
+          "tags": [
+            "reliability",
+            "chaos",
+            "contract",
+            "synthetic"
+          ],
+          "automation": "skills/langbot-testing/probes/langbot-fault-taxonomy-contract.mjs",
+          "setup_automation": [],
+          "setup_provides_env": [],
+          "evidence_required": [
+            "metrics",
+            "filesystem"
+          ]
+        },
+        {
+          "id": "langbot-live-backend-latency",
+          "title": "LangBot live backend basic latency probe",
+          "mode": "probe",
+          "area": "performance",
+          "type": "performance",
+          "priority": "p1",
+          "risk": "medium",
+          "ci_eligible": false,
+          "tags": [
+            "performance",
+            "live-backend",
+            "latency",
+            "metrics"
+          ],
+          "automation": "skills/langbot-testing/probes/langbot-live-backend-latency.mjs",
+          "setup_automation": [],
+          "setup_provides_env": [],
+          "evidence_required": [
+            "metrics",
+            "network",
+            "api_diagnostic",
+            "filesystem"
+          ]
+        },
+        {
+          "id": "langbot-live-backend-log-health",
+          "title": "LangBot live backend log health probe",
+          "mode": "probe",
+          "area": "reliability",
+          "type": "reliability",
+          "priority": "p1",
+          "risk": "medium",
+          "ci_eligible": false,
+          "tags": [
+            "reliability",
+            "live-backend",
+            "backend-log",
+            "metrics"
+          ],
+          "automation": "skills/langbot-testing/probes/langbot-live-backend-log-health.mjs",
+          "setup_automation": [],
+          "setup_provides_env": [],
+          "evidence_required": [
+            "metrics",
+            "backend_log",
+            "filesystem"
+          ]
+        },
+        {
+          "id": "langbot-live-control-plane-api",
+          "title": "LangBot live control-plane API probe",
+          "mode": "probe",
+          "area": "performance",
+          "type": "performance",
+          "priority": "p1",
+          "risk": "medium",
+          "ci_eligible": false,
+          "tags": [
+            "performance",
+            "reliability",
+            "live-backend",
+            "control-plane",
+            "metrics"
+          ],
+          "automation": "skills/langbot-testing/probes/langbot-live-control-plane-api.mjs",
+          "setup_automation": [],
+          "setup_provides_env": [],
+          "evidence_required": [
+            "metrics",
+            "network",
+            "api_diagnostic",
+            "filesystem"
+          ]
+        },
+        {
+          "id": "langbot-overhead-accounting-contract",
+          "title": "LangBot overhead accounting metrics contract",
+          "mode": "probe",
+          "area": "performance",
+          "type": "performance",
+          "priority": "p1",
+          "risk": "medium",
+          "ci_eligible": true,
+          "tags": [
+            "performance",
+            "metrics",
+            "contract",
+            "synthetic"
+          ],
+          "automation": "skills/langbot-testing/probes/langbot-overhead-accounting-contract.mjs",
+          "setup_automation": [],
+          "setup_provides_env": [],
+          "evidence_required": [
+            "metrics",
+            "resource_log",
+            "filesystem"
+          ]
+        },
         {
           "id": "langrag-kb-retrieve",
           "title": "LangRAG knowledge base ingests and retrieves a sentinel document",
@@ -911,6 +1040,33 @@
             "backend_log"
           ]
         },
+        {
+          "id": "pipeline-debug-chat-performance",
+          "title": "Pipeline Debug Chat user-path performance probe",
+          "mode": "agent-browser",
+          "area": "pipeline",
+          "type": "performance",
+          "priority": "p1",
+          "risk": "medium",
+          "ci_eligible": false,
+          "tags": [
+            "performance",
+            "pipeline",
+            "debug-chat",
+            "user-path",
+            "metrics"
+          ],
+          "automation": "scripts/e2e/pipeline-debug-chat.mjs",
+          "setup_automation": [],
+          "setup_provides_env": [],
+          "evidence_required": [
+            "ui",
+            "screenshot",
+            "console",
+            "network",
+            "metrics"
+          ]
+        },
         {
           "id": "plugin-e2e-smoke",
           "title": "Plugin system installs a local plugin and exposes tool/page APIs",
@@ -1059,6 +1215,10 @@
       "suites": [
         "agent-runner-release-gate",
         "core-smoke",
+        "langbot-live-backend-gate",
+        "langbot-performance-contract-gate",
+        "langbot-performance-reliability-gate",
+        "langbot-user-path-performance-gate",
         "local-agent-gate"
       ],
       "suite_summaries": [
@@ -1121,6 +1281,77 @@
             "local-agent-basic-debug-chat"
           ]
         },
+        {
+          "id": "langbot-live-backend-gate",
+          "title": "LangBot live backend reliability gate",
+          "description": "Live backend control-plane responsiveness and runtime log health checks for a locally running LangBot instance.",
+          "type": "reliability",
+          "priority": "p1",
+          "tags": [
+            "performance",
+            "reliability",
+            "live-backend",
+            "metrics"
+          ],
+          "cases": [
+            "langbot-live-backend-latency",
+            "langbot-live-control-plane-api",
+            "langbot-live-backend-log-health"
+          ]
+        },
+        {
+          "id": "langbot-performance-contract-gate",
+          "title": "LangBot performance contract gate",
+          "description": "Fast synthetic contract checks for performance metric accounting and non-destructive reliability fault taxonomy.",
+          "type": "contract",
+          "priority": "p1",
+          "tags": [
+            "performance",
+            "reliability",
+            "contract",
+            "metrics"
+          ],
+          "cases": [
+            "langbot-overhead-accounting-contract",
+            "langbot-fault-taxonomy-contract"
+          ]
+        },
+        {
+          "id": "langbot-performance-reliability-gate",
+          "title": "LangBot performance and reliability starter gate",
+          "description": "Starter gate for LangBot performance accounting, live backend control-plane latency, and non-destructive fault taxonomy checks.",
+          "type": "reliability",
+          "priority": "p1",
+          "tags": [
+            "performance",
+            "reliability",
+            "metrics",
+            "chaos"
+          ],
+          "cases": [
+            "langbot-overhead-accounting-contract",
+            "langbot-fault-taxonomy-contract",
+            "langbot-live-backend-latency",
+            "langbot-live-control-plane-api",
+            "langbot-live-backend-log-health"
+          ]
+        },
+        {
+          "id": "langbot-user-path-performance-gate",
+          "title": "LangBot user-path performance gate",
+          "description": "Browser-visible performance checks for user-facing LangBot paths such as Pipeline Debug Chat.",
+          "type": "performance",
+          "priority": "p1",
+          "tags": [
+            "performance",
+            "browser",
+            "debug-chat",
+            "user-path"
+          ],
+          "cases": [
+            "pipeline-debug-chat-performance"
+          ]
+        },
         {
           "id": "local-agent-gate",
           "title": "Local Agent runner regression gate",
diff --git a/skills/skills/langbot-testing/SKILL.md b/skills/skills/langbot-testing/SKILL.md
index e9db1980f..748ae9b81 100644
--- a/skills/skills/langbot-testing/SKILL.md
+++ b/skills/skills/langbot-testing/SKILL.md
@@ -21,6 +21,7 @@ Use this skill when an agent needs to verify LangBot behavior through the WebUI
 - **Sandbox-backed skill authoring**: read `references/sandbox-skill-authoring.md`.
 - **LangRAG knowledge bases**: read `references/langrag-knowledge-base.md`.
 - **MCP stdio tool testing**: read `references/mcp-stdio-testing.md`.
+- **Performance, reliability, or chaos probes**: read `references/performance-reliability-testing.md`.
 - **Drive a live instance over MCP (not raw HTTP)**: use the `langbot-mcp-ops` skill — the instance exposes an MCP server at `http://<host>:5300/mcp` (reuses API keys). Useful for setting up bots/pipelines/models as test fixtures programmatically.
 - **Known failures and fixes**: read `references/troubleshooting.md`.
 - **Reusable test groups**: run `bin/lbs suite list` and `bin/lbs suite plan <suite-id>` before manually assembling a case set.
@@ -36,6 +37,8 @@ Use this skill when an agent needs to verify LangBot behavior through the WebUI
 - Use an authenticated browser profile prepared by `langbot-env-setup`.
 - Do not expose API keys, OAuth secrets, tokens, or localStorage token values in output.
 - A WebUI test is not complete until the visible UI result is checked against backend logs or network behavior.
+- A performance result is not complete without `metrics` evidence and a clear split between LangBot overhead and external provider/tool/network time.
+- A chaos or reliability result is not complete until the fault scope, cleanup, and recovery checks are recorded.
 - For a suite, use `bin/lbs suite start <suite-id>` to create the suite evidence root, per-case directories, and `suite-start.json`/`suite-start.md` handoff files; use `bin/lbs test result <case-id>` to write final per-case `result.json`, then run `bin/lbs suite report <suite-id> --evidence-dir <dir>`.
 - Do not mark a case `pass` until `test result --evidence` covers every value in the case's `evidence_required`.
 - For runner-specific Debug Chat cases, use the case-specific pipeline env declared by `automation_pipeline_url_env` / `automation_pipeline_name_env`; do not silently reuse a generic `LANGBOT_PIPELINE_URL`.
diff --git a/skills/skills/langbot-testing/cases/langbot-fault-taxonomy-contract.yaml b/skills/skills/langbot-testing/cases/langbot-fault-taxonomy-contract.yaml
new file mode 100644
index 000000000..2b990f837
--- /dev/null
+++ b/skills/skills/langbot-testing/cases/langbot-fault-taxonomy-contract.yaml
@@ -0,0 +1,35 @@
+id: langbot-fault-taxonomy-contract
+title: "LangBot fault taxonomy and cleanup contract"
+mode: probe
+area: reliability
+type: chaos
+priority: p1
+risk: medium
+ci_eligible: true
+tags:
+  - reliability
+  - chaos
+  - contract
+  - synthetic
+skills:
+  - langbot-testing
+automation: skills/langbot-testing/probes/langbot-fault-taxonomy-contract.mjs
+fault_model_json: '{"kind":"taxonomy-contract","destructive":false,"scenarios":["provider-timeout","plugin-runtime-disconnect","mcp-stdio-server-exit","operator-missing-login","transient-marketplace-timeout"]}'
+steps:
+  - "Run `rtk bin/lbs test run langbot-fault-taxonomy-contract --dry-run` first; remove `--dry-run` after checking the evidence directory."
+  - "Automation validates that representative fault scenarios declare target, injected fault, expected status, recovery check, and cleanup."
+  - "Review metrics.json, fault-model.json, and automation-result.json under LBS_EVIDENCE_DIR."
+checks:
+  - "automation-result.json status is pass."
+  - "Every scenario has an expected status in pass, fail, blocked, env_issue, or flaky."
+  - "Every scenario declares a cleanup action and recovery check."
+evidence_required:
+  - metrics
+  - filesystem
+diagnostics:
+  - "This is a non-destructive taxonomy contract probe; it does not inject real runtime faults."
+  - "Use it as a gate before adding live chaos cases that kill runtimes, route traffic through a proxy, or disrupt a backend dependency."
+success_patterns:
+  - "Fault taxonomy contract declares status"
+failure_patterns:
+  - "missing required scenario fields"
diff --git a/skills/skills/langbot-testing/cases/langbot-live-backend-latency.yaml b/skills/skills/langbot-testing/cases/langbot-live-backend-latency.yaml
new file mode 100644
index 000000000..1922d06f0
--- /dev/null
+++ b/skills/skills/langbot-testing/cases/langbot-live-backend-latency.yaml
@@ -0,0 +1,42 @@
+id: langbot-live-backend-latency
+title: "LangBot live backend basic latency probe"
+mode: probe
+area: performance
+type: performance
+priority: p1
+risk: medium
+ci_eligible: false
+tags:
+  - performance
+  - live-backend
+  - latency
+  - metrics
+skills:
+  - langbot-testing
+env:
+  - LANGBOT_BACKEND_URL
+automation: skills/langbot-testing/probes/langbot-live-backend-latency.mjs
+metrics_thresholds_json: '{"backend_p95_ms":{"max":1000},"error_rate":{"max":0}}'
+load_profile_json: '{"requests":12,"concurrency":2,"endpoints":["/healthz"]}'
+steps:
+  - "Confirm the selected LangBot backend is the intended test target."
+  - "Run `rtk bin/lbs test run langbot-live-backend-latency --dry-run` first; remove `--dry-run` after checking LANGBOT_BACKEND_URL and evidence directory."
+  - "Automation sends a small request batch to LANGBOT_BACKEND_URL/healthz and records latency, status counts, and network errors."
+checks:
+  - "automation-result.json status is pass when the backend responds and p95/error-rate thresholds pass."
+  - "automation-result.json status is env_issue when the backend is not reachable."
+  - "metrics.json and network.log are written under LBS_EVIDENCE_DIR."
+evidence_required:
+  - metrics
+  - network
+  - api_diagnostic
+  - filesystem
+diagnostics:
+  - "This probe measures backend health endpoint reachability latency only; it does not cover model/provider, browser, Debug Chat, RAG, or plugin runtime latency."
+success_patterns:
+  - "Live backend latency probe passed"
+failure_patterns:
+  - "Backend did not respond"
+  - "breached latency or error-rate thresholds"
+troubleshooting:
+  - socks-proxy-without-socksio
diff --git a/skills/skills/langbot-testing/cases/langbot-live-backend-log-health.yaml b/skills/skills/langbot-testing/cases/langbot-live-backend-log-health.yaml
new file mode 100644
index 000000000..8ff911371
--- /dev/null
+++ b/skills/skills/langbot-testing/cases/langbot-live-backend-log-health.yaml
@@ -0,0 +1,45 @@
+id: langbot-live-backend-log-health
+title: "LangBot live backend log health probe"
+mode: probe
+area: reliability
+type: reliability
+priority: p1
+risk: medium
+ci_eligible: false
+tags:
+  - reliability
+  - live-backend
+  - backend-log
+  - metrics
+skills:
+  - langbot-testing
+env:
+  - LANGBOT_BACKEND_URL
+automation: skills/langbot-testing/probes/langbot-live-backend-log-health.mjs
+metrics_thresholds_json: '{"fail_count":{"max":0}}'
+load_profile_json: '{"lookback_seconds":300,"log_source":"LANGBOT_BACKEND_LOG or latest LANGBOT_REPO/data/logs/langbot-*.log"}'
+steps:
+  - "Confirm the selected LangBot backend log belongs to the intended test target."
+  - "Run `rtk bin/lbs test run langbot-live-backend-log-health --dry-run` first; remove `--dry-run` after checking evidence directory and log source."
+  - "Automation scans the recent backend log window for fail-severity runtime findings such as Traceback, ImportError, ERROR, unclosed sessions, and unawaited coroutines."
+checks:
+  - "automation-result.json status is pass only when fail_count is 0."
+  - "metrics_summary includes scanned_line_count, fail_count, warning_count, and finding_count."
+  - "findings.json and scanned-backend.log are written under LBS_EVIDENCE_DIR."
+evidence_required:
+  - metrics
+  - backend_log
+  - filesystem
+diagnostics:
+  - "Set LANGBOT_BACKEND_LOG to an explicit log path when the latest log file is not the run target."
+  - "Set LANGBOT_BACKEND_LOG_SINCE or LANGBOT_BACKEND_LOG_LOOKBACK_SECONDS to control the scan window."
+  - "This probe measures runtime log health; it does not prove user-facing Debug Chat, plugin, model, or RAG behavior."
+success_patterns:
+  - "Live backend log health passed"
+failure_patterns:
+  - "Traceback"
+  - "ImportError"
+  - "ERROR"
+  - "unclosed"
+troubleshooting:
+  - socks-proxy-without-socksio
diff --git a/skills/skills/langbot-testing/cases/langbot-live-control-plane-api.yaml b/skills/skills/langbot-testing/cases/langbot-live-control-plane-api.yaml
new file mode 100644
index 000000000..2cd8ee2c7
--- /dev/null
+++ b/skills/skills/langbot-testing/cases/langbot-live-control-plane-api.yaml
@@ -0,0 +1,44 @@
+id: langbot-live-control-plane-api
+title: "LangBot live control-plane API probe"
+mode: probe
+area: performance
+type: performance
+priority: p1
+risk: medium
+ci_eligible: false
+tags:
+  - performance
+  - reliability
+  - live-backend
+  - control-plane
+  - metrics
+skills:
+  - langbot-testing
+env:
+  - LANGBOT_BACKEND_URL
+automation: skills/langbot-testing/probes/langbot-live-control-plane-api.mjs
+metrics_thresholds_json: '{"error_rate":{"max":0},"response_shape_failures":{"max":0},"healthz_p95_ms":{"max":500},"system_info_p95_ms":{"max":1000}}'
+load_profile_json: '{"requests":20,"concurrency":4,"endpoints":["/healthz","/api/v1/system/info"],"auth_required":false}'
+steps:
+  - "Confirm the selected LangBot backend is the intended test target."
+  - "Run `rtk bin/lbs test run langbot-live-control-plane-api --dry-run` first; remove `--dry-run` after checking LANGBOT_BACKEND_URL and evidence directory."
+  - "Automation sends a small request batch to /healthz and /api/v1/system/info, then validates status code, JSON shape, and latency budgets."
+checks:
+  - "automation-result.json status is pass when every control-plane request returns HTTP 200, JSON code 0, and required response fields."
+  - "metrics_summary includes per-endpoint p50/p95 latency, error rate, status counts, and response_shape_failures."
+  - "thresholds_summary shows error_rate, response_shape_failures, healthz_p95_ms, and system_info_p95_ms all pass."
+evidence_required:
+  - metrics
+  - network
+  - api_diagnostic
+  - filesystem
+diagnostics:
+  - "This probe measures unauthenticated backend control-plane readiness; it does not cover authenticated UI flows, Debug Chat, model calls, plugins, or RAG."
+  - "A system_info shape failure usually means the API contract or startup state changed and should be investigated before treating latency as healthy."
+success_patterns:
+  - "Live control-plane API probe passed"
+failure_patterns:
+  - "Backend did not respond"
+  - "breached shape, latency, or error-rate thresholds"
+troubleshooting:
+  - socks-proxy-without-socksio
diff --git a/skills/skills/langbot-testing/cases/langbot-overhead-accounting-contract.yaml b/skills/skills/langbot-testing/cases/langbot-overhead-accounting-contract.yaml
new file mode 100644
index 000000000..650dfe7d9
--- /dev/null
+++ b/skills/skills/langbot-testing/cases/langbot-overhead-accounting-contract.yaml
@@ -0,0 +1,37 @@
+id: langbot-overhead-accounting-contract
+title: "LangBot overhead accounting metrics contract"
+mode: probe
+area: performance
+type: performance
+priority: p1
+risk: medium
+ci_eligible: true
+tags:
+  - performance
+  - metrics
+  - contract
+  - synthetic
+skills:
+  - langbot-testing
+automation: skills/langbot-testing/probes/langbot-overhead-accounting-contract.mjs
+metrics_thresholds_json: '{"sample_count":{"min":50},"langbot_overhead_p95_ms":{"max":25},"accounting_gap_max_ms":{"max":0.001}}'
+load_profile_json: '{"kind":"synthetic-overhead-accounting","samples":80,"external_latency_segments":["provider","external_tool","network"]}'
+steps:
+  - "Run `rtk bin/lbs test run langbot-overhead-accounting-contract --dry-run` first; remove `--dry-run` after checking the evidence directory."
+  - "Automation generates deterministic message-path latency samples and separates LangBot overhead from provider/tool/network latency."
+  - "Review metrics.json, thresholds.json, resource-log.json, and automation-result.json under LBS_EVIDENCE_DIR."
+checks:
+  - "automation-result.json status is pass."
+  - "metrics_summary includes sample_count, langbot_overhead_p95_ms, e2e_latency_p95_ms, external_latency_p95_ms, and accounting_gap_max_ms."
+  - "thresholds_summary shows sample_count, langbot_overhead_p95_ms, and accounting_gap_max_ms all pass."
+evidence_required:
+  - metrics
+  - resource_log
+  - filesystem
+diagnostics:
+  - "This is a synthetic contract probe for the QA harness; it is not live product performance."
+  - "Use it to verify that reports can carry overhead accounting metrics before running live backend or browser performance probes."
+success_patterns:
+  - "Overhead accounting contract passed"
+failure_patterns:
+  - "breached one or more thresholds"
diff --git a/skills/skills/langbot-testing/cases/pipeline-debug-chat-performance.yaml b/skills/skills/langbot-testing/cases/pipeline-debug-chat-performance.yaml
new file mode 100644
index 000000000..a1a4944b5
--- /dev/null
+++ b/skills/skills/langbot-testing/cases/pipeline-debug-chat-performance.yaml
@@ -0,0 +1,75 @@
+id: pipeline-debug-chat-performance
+title: "Pipeline Debug Chat user-path performance probe"
+mode: agent-browser
+area: pipeline
+type: performance
+priority: p1
+risk: medium
+ci_eligible: false
+tags:
+  - performance
+  - pipeline
+  - debug-chat
+  - user-path
+  - metrics
+skills:
+  - langbot-env-setup
+  - langbot-testing
+env:
+  - LANGBOT_FRONTEND_URL
+  - LANGBOT_BACKEND_URL
+env_any:
+  - LANGBOT_PIPELINE_URL|LANGBOT_PIPELINE_NAME
+automation: scripts/e2e/pipeline-debug-chat.mjs
+automation_env:
+  - LANGBOT_FRONTEND_URL
+  - LANGBOT_BACKEND_URL
+  - LANGBOT_BROWSER_PROFILE
+  - LANGBOT_CHROMIUM_EXECUTABLE
+  - LANGBOT_E2E_PROMPT
+  - LANGBOT_E2E_EXPECTED_TEXT
+  - LANGBOT_E2E_RESPONSE_TIMEOUT_MS
+automation_env_any:
+  - LANGBOT_PIPELINE_URL|LANGBOT_PIPELINE_NAME
+automation_prompt: "请只回复 OK，用于性能测试。"
+automation_expected_text: "OK"
+automation_response_timeout_ms: "120000"
+automation_reset_debug_chat: "true"
+automation_debug_chat_response_p95_ms: "120000"
+automation_debug_chat_max_error_rate: "0"
+metrics_thresholds_json: '{"response_p95_ms":{"max":120000},"error_rate":{"max":0}}'
+load_profile_json: '{"prompts":1,"browser":true,"path":"Pipeline Debug Chat","metric":"send-to-visible-completion"}'
+preconditions:
+  - "LANGBOT_PIPELINE_URL or LANGBOT_PIPELINE_NAME points to the pipeline intended for this Debug Chat performance run."
+  - "The target pipeline is safe to reset Debug Chat history for this run."
+  - "The target pipeline has a known-good runner/model; provider latency should be interpreted separately from LangBot overhead."
+steps:
+  - "Open LANGBOT_FRONTEND_URL with the prepared browser profile."
+  - "Open the target pipeline and select Debug Chat."
+  - "Reset Debug Chat history through the backend API when configured."
+  - "Send the deterministic prompt and wait for the expected assistant response."
+checks:
+  - "automation-result.json status is pass when the expected assistant response appears."
+  - "metrics_summary includes response_p50_ms, response_p95_ms, error_rate, and total_duration_ms."
+  - "thresholds_summary shows response_p95_ms and error_rate pass."
+evidence_required:
+  - ui
+  - screenshot
+  - console
+  - network
+  - metrics
+diagnostics:
+  - "This case measures browser-visible send-to-completion latency; it does not split provider latency from LangBot overhead."
+  - "Use backend logs and provider diagnostics to explain slow runs before calling them LangBot regressions."
+success_patterns:
+  - "Processing request from person_websocket"
+  - "Streaming completed"
+failure_patterns:
+  - "Action invoke_llm_stream call timed out"
+  - "Task exception was never retrieved"
+  - "All models failed during streaming setup"
+troubleshooting:
+  - debug-chat-history-contaminates-automation
+  - local-agent-model-route-unavailable
+  - plugin-runtime-timeout
+  - proxy-env-mismatch
diff --git a/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/.gitignore b/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/.gitignore
index 849ddff3b..89d8e500c 100644
--- a/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/.gitignore
+++ b/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/.gitignore
@@ -1 +1,3 @@
-dist/
+dist/*
+!dist/
+!dist/qa-plugin-smoke-0.1.0.lbpkg
diff --git a/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/dist/qa-plugin-smoke-0.1.0.lbpkg b/skills/skills/langbot-testing/fixtures/plugins/qa-plugin-smoke/dist/qa-plugin-smoke-0.1.0.lbpkg
new file mode 100644
index 0000000000000000000000000000000000000000..a4a50f803e0af7218ec8800d3cadff9e64d9b60b
GIT binary patch
literal 5160
zcmaJ_1yq#X+8s&+W(Y|U1w?5EB&BmGr9n`-q+=-Ql2JmsyCft8q(g+ErDf<+q#TeG
zQ1ZUs?|zr--24AC>#VcZtl955=h^#t&a+?j2ROJCAQ0#($hmmR;5?VCIuQ#5+Q9~a
zAi%E<rj9mGEnVH9UZxKAiE+cWvr?qp+uq?`SPwj+d>$y~;?vXZtzDBz={5HXt7%@i
zNhDen^GS0(*JMUNCz%eL;C^yVh?uHvb98K?#dV<=DQM}Lgx~(`W1Ep)oOi0WcNv`U
ziZtEv+&-D!)`=lAr@jsY1<G#OahL=eu?{E(vS~q*ziI6i-p$XnaOlB49_S8Z%_Dot
zT%cY;r!^g^sofX0Y8@C!J?=Fx$7hdFadPYHdPEC-$c?3Vb9<Z6dV@x5y^oB9i_0f4
zt8V0QlWVXRb_4e=T?$=urA5X;v}B%(8D&;L_Nm|L`-aq^)&<Iq<i4e`ieG2-0b|(_
z^ppGjv(qc{n`qO+UuucIA|G~gDAZWbDoQg4R}`MluOKFf_qj{a?k&;14GlaZ3Y&Q{
zAy74JD^}=~(UHaN&pX8wsiWZ(&FCqT#>M&qb4?2lq8uytI<E3k0Vy1?i*Z?^LPB!1
z4lYB-s9@^t3HEvM^08-()6^|Wn<g^bw`q-c2;odQWL+KJ(1EGp6&D%yhc7e{PYpzR
z?Kj|W{RjBe|KP9iZVpy1z?&ezAptOJ!ldO?U{D8(f=FqiU@20>_6f(8CYsmkBQQDE
zoz>i8AE?O8r#kZ?#q(25ArW`<@xn=Kec)GO;yR~S&&M(yi?vk?>eSPvkBM_Lo6|&k
zYwZs2WV6m%v@u}^b9M{FYFgdyuhL|sAn|M8Qx5vf(2p(RyFnN2b)N`H5ET!Pp^CLy
z-J(z6GP}W(w9IZ4lq^<D_M#P!QO~J;GRN>>Rx)*|(`jsbk?7BJ$Z=>Urvdw)2L?Vc
z987H-q0U~gsC&dn9x&`E18HXaav7w^;|vPv#JW8uk~rSUrX-RuHpVY=I(Q)&Fk3*=
z_M|>ZAhx#6H0%v1yfYY)Z&6@iDfaP4co)<^Qx6Qamv`YbU>uuUJcRIx6DzO@cn(~j
z`3Z&>ZqXlE;WhI(@wOS^E6{tNt0(t0-}*thrBMvm85az5p0jV`*9s=eSSX&w7nvkF
z?o~r$Y=5JzE(Zl{TLz%vDlpraJ2^sKJ**Hh?W!F-;1}CIg#8)ycOd#Ww#-#yP`U=^
z?N4P@is|f-{?Fg~Kq{gPjE~>lJvs638}vng5&H?BSl3Lub-iD>;++9vsyf_t=p!C7
z%zgn;El*4*cvL`Yw)2PyVmU8wo7^g{SFyW7z<jYVc6Fb0fis4H$&nhQQo}4=2o;W`
z4yr~q|KRIcqM}5``oiZ37`1hnPF4#Ah1dBdi16NdkoZFc%91U;DQ&q?Yn1Wmk(=Y(
zPg!Z5+9_OXrFrNJO5<K@oM*b?C?0T+7)Vqb`eOY_95-$BG#L&EGzh4V1UR{~i<5)1
zn=vq%Spoz`>I^z&@Q}9dQ&6#RI(P;qY6_IN*pAu9FMPV{HEsHp5L`7V_~V=C`Ez%8
z(+bFQbzkVhgXttnTZa#=A)`aK0C|rJAsDqQ^y1b|61b9I6ck}TvKZReug@HHVx%i8
zdaIFjx~(6F-@v)LvCOBFi1bP7BJo{kq|UfYfpj3)w_g6Zi)it#Jor;EVKdsu+H>rN
znbLTd--`cox%?M~6LQHwPEqynn~bCjjh0E4G&pZKCB5xlQTSI0RB5>!1hI0}UvGaY
z6v6>bc$)VjaF#~g9kj351}0jnHKl(l+&H;x^<H?GkApS^Wosv`X1$VX5MN|fYUBOz
z&5Nj=X%x~7DOEh0tS|y|#1%g456EAL!Ko0qJHpOz)$+-M3g>jeWsao8A(W!&azy#G
z3TGQ=Nj)bU5u{#k6IFaV?W&>23h_G9fqNZAxGk(5vu&uen_HDO<S;^qgL1vxLmgMt
z&X5hEEiQQh$Eys<Vknm<Z%|7aOz=0)>uImhT|a4Od{C#<jk;sIG~=1@Nlu(SX2?e<
z#NW5hG)h?%kv(^tn8jt=9amU~Wwp05e~P<XAWTu^xei}tZ~6EArq$%&$B*xQBWjFH
z>>KDJMKkvh#&Vz}4yWH%l5n!4cTGMej1Jfvd~^@U3Dcg$txh6>z+wu%4;9*~TpLay
z@r3k0onk*&8@f9$vKafr7MAK}i};u~dck<Q(}SNNjNBw4R*4lOEbbb}=T^)MUk%;9
zqSDc4;gAyaR=<#9`Y~7Q5vid#tY(j8hgbRVi8<E=1QA?U#dobf_y(Q;aWAZ&Pt1^T
zZ*-H(IOTrLXMOd=cBZy$TG@_%52asA(%kIS*Ztd0y*C4*-8SD=Se<`-n`4tRaf+)u
zB(())`6Aoo7Cw^V#^y%Vr&WM#n3}Ss*K6Q;<aX|2#8Fn>ir6qv(QAM%-eEj<`=Cmm
z!gApun?b%>-6PuLHcUiH-~4^OaTiRlntuD&Nk~|WBM-N%sOjC^TAzjwvwjsGB=tX!
zz8&sXHC-4K{#qAdPwOg`{4;xAnL=WdfUKDS1{r4d{Jw&9qV5r5-y?;cWSFTkOchdd
za^j8(4r~$MX-I|iHEeoga}Y&;{(|dddQD>r9kA-R8UWssyVn>b@XF5JW6oykRp&cb
zrBF6@wcwkCws>vpO0uEPNzC5yCq#=2qNoX_?4$4@C$y}ZI3QiSy54VUMn`KWa4k0V
zQc4w+(d&o3(>xJpO`95<?;I#smDTT@XN@PVER+0s2)aQ+N&=wpjRS)aI0W3(*wWnE
z3CMnTjegZm9&pPVM_JaBH-q@v*fu1-ATy_{uxUXqvN!ROhH{9X&*D7vV@M_L4|$5k
z_mH09Dns~&M_KVa2qk!}<%ZMB6|R}}(isJJ!MC%W8^5T2_1rBBPFIM~#OLtg)2s2)
z^N`pbWr(-fo=@|LbS=3US7cIZk7*g0fMd)4*sc?TXEVzt&`FVRa=CYekY<j_?Ra{x
z8~Giwxt2V^V&mtUk`}q+Hq+?R<q`vz_1g@N^3J33p55hxm+fa7qm4JmOV7SSv+E9C
zdS|K_Bz)Je?7d+i?Mk9lE3RlSL9@mC$Z^7j-+<~m)Rr~8M5K81^{@<0c&TF@i=FkJ
zQ{mabW`&F|&YuD6cyqHz2nZGq&~$<SBVhmRxip%A8-rqgpYjO5xHByhD~wV@k4dPJ
zzi>DI5MrCc9%Eu6Re~!MDD{xD_+bv=9sd{1D}0sYnu)Q45<Qb^@YGcwYffF>kd6bB
zHp^==x^Fm16*)~iksvAFAV%=x5O~R)xViX%7*-(f%shdq502ms;?Jy+btyFhdJ7A>
zli9?oBi%3T_LNJX`xBPu6GVQF(_gwr8g=zB5i4`W586&BAoWs2^H4MZwpx9JHsihU
z<WOUYh<}MbI6YZ;7i%b{+W$>Oz`x|}P)YMz#V*`Lpxelp?<=pc%4|cfOmj%dv%q`f
zfr85Uj8Pv*as<b-X0aZ03qJ`iWz5}>jlo`g{B`;>&)Svv!{pjywM8rD+bpd70*KQJ
zVmi}|`Yv?vr#iDiR>QaD`4O}ha@`f(8>onLj7!O(<&_l?5a3y0{7x%Ck<Rw+RyK~8
z!faRW;vofunUwX!dLIi9@oeYqy2_p9S{q1Vddub`&ZQ6OH5EFfN0mxXt*iw5{y$w-
z0J>ZPbb0unx@>&>pSmonxzJ^*y@MyaC#K&+03Q7?yU_VQ9jxI3QmITvQIgBJsgk#M
zeDl|7*)a0+;P|&RX(>g%8#I~;_cr88s7fn{YY@Hv{xkLd#IIHE1MWlg)Oo73gyROu
z0dIN(>ukMAtWZPnDpj1VPt24r&zwcsT-l0s11GNmN}WQ-T|HmP_yrFI^}IdF<jC^k
z!;{o$W5mxZe@ZjNF^0hsh?)u@P4fSa+;4@t#|<iXNKqh;yuwegxf%wC5T7-xnjm98
z>aM2`JY=b#X<`p<xCVpC!JHYJVuEjQgdXz<ezUtT8L8cF*syl)>#CZvq+jd`g<TbW
zAe|iX=qiM5j6v3nOP&)9!tLh?xfP*3v+Yb~nxUwr`j}j+6elNCP)J74_=UhEHb*O-
z33u3|)ZWZTx3X0ijlwr(_x3tMxdzM!DW^9g^fx(I<(W~}sl`}s;d}`+N!G#!i^+58
z((KobgDanxT}TM~U1VfR9+!?yRxkakOPQm_k6&8ul^xjX?)q6mC!Rq4a%FIA(ps|e
z0av8i3@1Nsqg8$`@2%cd!R}>3<p+qg`$IR{7#}T;OR1(s45`6)`$j6&ZWBzXs=ck6
zT%*+=PfDA$DOauTr5m8|@;MhM>ak4}P2J4P?G&V!(nmMiEpW>4v4|Xi@M(@paZxdG
z>_lCxztX)7A(`EwH4v1QgV!AQKIBhUQg!Q0;sI9Ffkx}vAFR0ATUt8*uC!VzPIC+t
zE&Ci<zQ{Og2jcg7wgq(FPKm6^B{F4a_PJ&Zf$6n*r$-W}wQnDN3EbszT6hI9bjDGf
zj<f-@-d%5Rj*)u25vM#e-8=Q7t50yI3Yz@7&u-S~hk|Gop3uib>g3l#<`w1YMiEq^
z4(=1zBAHrY>3NDnsa`YuSvF^Ht|Z1_%fqkND&Zb~K1Wk^Ji2*L9xcUeIvdB)O=I^e
zoXlV1JcTkFoHzX4GwWj-R;2LF7+vdxd%oR!q1H{d0sHL5j1wbh=Cg1%ZQ<i>bX%7G
z#53m3vAq@zV+C+L$^kM{hzI@h1EN^Fpd|f1R_3eyjV~KBhCfm~tMXuYYc54aJoC2H
zlKKu-8Oo<Q&q|SY&n2(|vc&vJ+BHh3ExRoZ{h2+ftURKL(I0J!f~ECak(6S-WNUjS
z<P~8J2?_K}mMMD_T|p%a2@BC#p}&TAn?W25kxe;|5#RE+I<L9a$A(139ai?VJ&+=Y
z7cSadd40n)m&0h>MmV(SeuJdm@GbPwg||Oyd-_zN9kV1g$X{TW02CeoiS^rlUzVi5
zDEysS4w~X>boWTVAL$N|^k(KLRvR`YNaNe+wjMaIeXfnPqJzItr%k(h8<!&RT0veI
zDJ29?^2Ho-IQ08G3}4C8OJbo#Xg@@Yel-4FVaV`CR}}BN5EVQ3Q3fJ5y#%~KGDuP-
z2YXS+!_;v8ya+f~VX+n#D9zSR6WJ~LGse#3(<I0@Rd*Yan_(Xvbul*&-FD303g~_<
zSMuH^W3>Xg{505ML%I!hg-IirV30z5%7LM*JQf$JxYOHyBV7GetA;#TDJ{*B=c$Xg
zP|cY~Ws5{Wi-@C|Q_><y_Lf=_wdAb;!yc37fbp!GHOU|74?j>XSE?pt>}dFg6h<*{
z8yFxm{gG*|ZC)EA)yP)R6_1W*>)+NT@pTJ2UQ>5c?Kv(=mBYT~t<!uFM$TBDae10P
zy}v~khn!>zea~Bc5k2+lfIubn34Yk_U>o=3YHaWp^%`7=HQI%Smo`NFf*i4oJ+BQX
ziu`)+T(QOcoI0%IV`E;jd}gvdYU<{uwpH8Q!&fP<JFni0B%13G0Yl+LStS1O;I)hU
zuA5ZCY$<*&hHJeKx%5xC)gNGCQ{epHuTcPk|M{s2`aS*eQUw!=>EQnYf<QGvZGc?<
z?dLIJn1=Zh778frU-lUjglSVRK?*=+`P;N&qA)GqB}y9izYHHH7_-B@1P1^<{|~qL
zI|VS|n7!pCTn_Nrmz^di4ztC&#JK}T^>5t&S`o8%xI|6>&Fy8{{;S2sTp05Ly@Znj
ztsVyaf31&M>n|Tr2)v-Xe8At8A9Dkkh2|1(1Nd`H{9il3yYl;AVBk9lhzIt&Wdi*J
D9m=s2

literal 0
HcmV?d00001

diff --git a/skills/skills/langbot-testing/probes/langbot-fault-taxonomy-contract.mjs b/skills/skills/langbot-testing/probes/langbot-fault-taxonomy-contract.mjs
new file mode 100644
index 000000000..8c9628e58
--- /dev/null
+++ b/skills/skills/langbot-testing/probes/langbot-fault-taxonomy-contract.mjs
@@ -0,0 +1,159 @@
+#!/usr/bin/env node
+
+import { mkdir, writeFile } from "node:fs/promises";
+import { join, resolve } from "node:path";
+import { env, exit } from "node:process";
+
+function pad(value, size = 2) {
+  return String(value).padStart(size, "0");
+}
+
+function localIsoWithOffset(date = new Date()) {
+  const offsetMinutes = -date.getTimezoneOffset();
+  const sign = offsetMinutes >= 0 ? "+" : "-";
+  const absolute = Math.abs(offsetMinutes);
+  return [
+    `${date.getFullYear()}-${pad(date.getMonth() + 1)}-${pad(date.getDate())}`,
+    `T${pad(date.getHours())}:${pad(date.getMinutes())}:${pad(date.getSeconds())}.${pad(date.getMilliseconds(), 3)}`,
+    `${sign}${pad(Math.floor(absolute / 60))}:${pad(absolute % 60)}`,
+  ].join("");
+}
+
+function timestampSlug(date = new Date()) {
+  return date.toISOString().replace(/\.\d{3}Z$/, "Z").replace(/[^0-9A-Za-z]+/g, "-").replace(/^-|-$/g, "");
+}
+
+const scenarios = [
+  {
+    id: "provider-timeout",
+    target: "provider",
+    injected_fault: "fake provider request exceeds the configured timeout",
+    expected_status: "env_issue",
+    recovery_check: "provider route is reachable or the case remains outside product pass/fail",
+    cleanup: "stop fake provider or reset proxy route",
+  },
+  {
+    id: "plugin-runtime-disconnect",
+    target: "plugin-runtime",
+    injected_fault: "runtime control channel disconnects during an action",
+    expected_status: "fail",
+    recovery_check: "runtime reconnects and a deterministic plugin action succeeds",
+    cleanup: "restart the local plugin runtime process",
+  },
+  {
+    id: "mcp-stdio-server-exit",
+    target: "mcp",
+    injected_fault: "stdio server exits mid-call",
+    expected_status: "fail",
+    recovery_check: "server can be registered again and exposes the expected tool",
+    cleanup: "remove temporary MCP server registration",
+  },
+  {
+    id: "operator-missing-login",
+    target: "webui",
+    injected_fault: "browser profile is not authenticated",
+    expected_status: "blocked",
+    recovery_check: "authenticated profile can open the same WebUI origin",
+    cleanup: "no product cleanup; refresh local login state",
+  },
+  {
+    id: "transient-marketplace-timeout",
+    target: "marketplace",
+    injected_fault: "marketplace request times out once and then succeeds",
+    expected_status: "flaky",
+    recovery_check: "rerun passes with the same product revision and no code change",
+    cleanup: "clear retry-only evidence and keep the run classified as flaky",
+  },
+];
+
+function validateScenario(scenario) {
+  const missing = ["id", "target", "injected_fault", "expected_status", "recovery_check", "cleanup"]
+    .filter((key) => !scenario[key]);
+  const allowedStatuses = new Set(["pass", "fail", "blocked", "env_issue", "flaky"]);
+  return {
+    id: scenario.id,
+    pass: missing.length === 0 && allowedStatuses.has(scenario.expected_status),
+    missing,
+    expected_status: scenario.expected_status,
+  };
+}
+
+async function main() {
+  const root = resolve(env.LBS_ROOT || process.cwd());
+  const caseId = "langbot-fault-taxonomy-contract";
+  const runId = env.LBS_RUN_ID || `${timestampSlug()}-${caseId}`;
+  const evidenceDir = resolve(env.LBS_EVIDENCE_DIR || join(root, "reports", "evidence", runId));
+  await mkdir(evidenceDir, { recursive: true });
+
+  const startedAt = new Date();
+  const validations = scenarios.map(validateScenario);
+  const statusCounts = {};
+  for (const scenario of scenarios) {
+    statusCounts[scenario.expected_status] = (statusCounts[scenario.expected_status] || 0) + 1;
+  }
+  const metrics = {
+    probe: caseId,
+    scenario_count: scenarios.length,
+    status_counts: statusCounts,
+    scenarios,
+    validations,
+  };
+  const thresholds = {
+    scenario_count: { actual: scenarios.length, min: 5, pass: scenarios.length >= 5 },
+    invalid_scenario_count: {
+      actual: validations.filter((item) => !item.pass).length,
+      max: 0,
+      pass: validations.every((item) => item.pass),
+    },
+    cleanup_declared_count: {
+      actual: scenarios.filter((item) => item.cleanup).length,
+      min: scenarios.length,
+      pass: scenarios.every((item) => item.cleanup),
+    },
+  };
+  const status = Object.values(thresholds).every((item) => item.pass) ? "pass" : "fail";
+  const metricsPath = join(evidenceDir, "metrics.json");
+  const faultModelPath = join(evidenceDir, "fault-model.json");
+  const automationResultPath = join(evidenceDir, "automation-result.json");
+  const resultPath = join(evidenceDir, "result.json");
+
+  await writeFile(metricsPath, `${JSON.stringify(metrics, null, 2)}\n`, "utf8");
+  await writeFile(faultModelPath, `${JSON.stringify({ scenarios }, null, 2)}\n`, "utf8");
+
+  const finishedAt = new Date();
+  const result = {
+    source: "automation",
+    case_id: caseId,
+    run_id: runId,
+    status,
+    reason: status === "pass"
+      ? "Fault taxonomy contract declares status, recovery, and cleanup for every scenario."
+      : "Fault taxonomy contract is missing required scenario fields.",
+    started_at: startedAt.toISOString(),
+    started_at_local: localIsoWithOffset(startedAt),
+    finished_at: finishedAt.toISOString(),
+    finished_at_local: localIsoWithOffset(finishedAt),
+    duration_ms: finishedAt.getTime() - startedAt.getTime(),
+    metrics_summary: {
+      scenario_count: metrics.scenario_count,
+      status_counts: metrics.status_counts,
+      invalid_scenario_count: thresholds.invalid_scenario_count.actual,
+    },
+    thresholds_summary: thresholds,
+    artifacts: {
+      metrics_json: metricsPath,
+      fault_model_json: faultModelPath,
+      automation_result_json: automationResultPath,
+      result_json: resultPath,
+    },
+    evidence_collected: ["metrics", "filesystem"],
+  };
+
+  const resultText = `${JSON.stringify(result, null, 2)}\n`;
+  await writeFile(automationResultPath, resultText, "utf8");
+  await writeFile(resultPath, resultText, "utf8");
+  console.log(JSON.stringify(result, null, 2));
+  exit(status === "pass" ? 0 : 1);
+}
+
+await main();
diff --git a/skills/skills/langbot-testing/probes/langbot-live-backend-latency.mjs b/skills/skills/langbot-testing/probes/langbot-live-backend-latency.mjs
new file mode 100644
index 000000000..747c84c6a
--- /dev/null
+++ b/skills/skills/langbot-testing/probes/langbot-live-backend-latency.mjs
@@ -0,0 +1,212 @@
+#!/usr/bin/env node
+
+import { mkdir, writeFile } from "node:fs/promises";
+import { join, resolve } from "node:path";
+import { env, exit } from "node:process";
+
+function pad(value, size = 2) {
+  return String(value).padStart(size, "0");
+}
+
+function localIsoWithOffset(date = new Date()) {
+  const offsetMinutes = -date.getTimezoneOffset();
+  const sign = offsetMinutes >= 0 ? "+" : "-";
+  const absolute = Math.abs(offsetMinutes);
+  return [
+    `${date.getFullYear()}-${pad(date.getMonth() + 1)}-${pad(date.getDate())}`,
+    `T${pad(date.getHours())}:${pad(date.getMinutes())}:${pad(date.getSeconds())}.${pad(date.getMilliseconds(), 3)}`,
+    `${sign}${pad(Math.floor(absolute / 60))}:${pad(absolute % 60)}`,
+  ].join("");
+}
+
+function timestampSlug(date = new Date()) {
+  return date.toISOString().replace(/\.\d{3}Z$/, "Z").replace(/[^0-9A-Za-z]+/g, "-").replace(/^-|-$/g, "");
+}
+
+function percentile(values, percentileValue) { // nearest-rank percentile of `values`, rounded to 3 decimals; 0 for an empty list
+  if (values.length === 0) return 0;
+  const sorted = [...values].sort((a, b) => a - b); // copy first: sort() mutates its receiver
+  const index = Math.min(sorted.length - 1, Math.max(0, Math.ceil((percentileValue / 100) * sorted.length) - 1)); // clamp both ends: percentile 0 would otherwise index -1 and throw
+  return Number(sorted[index].toFixed(3));
+}
+
+function stats(values) {
+  if (values.length === 0) return { min: 0, p50: 0, p95: 0, p99: 0, max: 0 };
+  return {
+    min: Number(Math.min(...values).toFixed(3)),
+    p50: percentile(values, 50),
+    p95: percentile(values, 95),
+    p99: percentile(values, 99),
+    max: Number(Math.max(...values).toFixed(3)),
+  };
+}
+
+function parseJsonList(value, fallback) { // value: JSON text from an env var; fallback: list returned whenever value is unusable
+  if (!value) return fallback; // unset or empty: keep the caller's default list
+  try {
+    const parsed = JSON.parse(value); // "[]" is rejected below: with no endpoints every request would target `undefined`
+    return Array.isArray(parsed) && parsed.length > 0 && parsed.every((item) => typeof item === "string") ? parsed : fallback;
+  } catch {
+    return fallback; // malformed JSON is treated the same as an unusable list
+  }
+}
+
+function joinUrl(baseUrl, path) {
+  const base = baseUrl.replace(/\/+$/, "");
+  const suffix = path.startsWith("/") ? path : `/${path}`;
+  return `${base}${suffix}`;
+}
+
+async function fetchOnce(url, timeoutMs) { // one timed GET; fetch/timeout failures are returned as { ok: false, status: 0 } rather than thrown
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), timeoutMs); // abort the request once timeoutMs elapses
+  const started = performance.now();
+  try {
+    const response = await fetch(url, { method: "GET", signal: controller.signal });
+    await response.arrayBuffer(); // drain the body so the measured latency covers the full response
+    const latencyMs = performance.now() - started;
+    return {
+      url,
+      ok: response.status < 500, // NOTE(review): 4xx responses count as ok here — confirm that is intended for this probe
+      status: response.status,
+      latency_ms: Number(latencyMs.toFixed(3)),
+      error: "",
+    };
+  } catch (error) {
+    const latencyMs = performance.now() - started; // time spent until the failure, including aborts
+    return {
+      url,
+      ok: false,
+      status: 0, // 0 marks "no HTTP response received"
+      latency_ms: Number(latencyMs.toFixed(3)),
+      error: error instanceof Error ? error.message : String(error),
+    };
+  } finally {
+    clearTimeout(timeout); // always cancel the pending abort timer
+  }
+}
+
+async function runBatches(urls, totalRequests, concurrency, timeoutMs) { // round-robin GETs over urls, one settled batch at a time
+  const batchSize = concurrency >= 1 ? Math.floor(concurrency) : 1; // 0, negative or NaN would make splice() remove nothing and loop forever
+  const queue = Array.from({ length: totalRequests }, (_, index) => urls[index % urls.length]);
+  const results = [];
+  while (queue.length > 0) {
+    results.push(...await Promise.all(queue.splice(0, batchSize).map((url) => fetchOnce(url, timeoutMs))));
+  }
+  return results; // fetchOnce outcomes, in request order
+}
+
+async function main() {
+  const root = resolve(env.LBS_ROOT || process.cwd());
+  const caseId = "langbot-live-backend-latency";
+  const runId = env.LBS_RUN_ID || `${timestampSlug()}-${caseId}`;
+  const evidenceDir = resolve(env.LBS_EVIDENCE_DIR || join(root, "reports", "evidence", runId));
+  await mkdir(evidenceDir, { recursive: true });
+
+  const startedAt = new Date();
+  const backendUrl = env.LANGBOT_BACKEND_URL || "";
+  const endpoints = parseJsonList(env.LANGBOT_PERF_ENDPOINTS_JSON, ["/healthz"]);
+  const totalRequests = Number(env.LANGBOT_PERF_REQUESTS || "12");
+  const concurrency = Number(env.LANGBOT_PERF_CONCURRENCY || "2");
+  const timeoutMs = Number(env.LANGBOT_PERF_TIMEOUT_MS || "5000");
+  const p95BudgetMs = Number(env.LANGBOT_PERF_BACKEND_P95_MS || "1000");
+  const maxErrorRate = Number(env.LANGBOT_PERF_MAX_ERROR_RATE || "0");
+  const metricsPath = join(evidenceDir, "metrics.json");
+  const networkLogPath = join(evidenceDir, "network.log");
+  const automationResultPath = join(evidenceDir, "automation-result.json");
+  const resultPath = join(evidenceDir, "result.json");
+
+  let status = "fail";
+  let reason = "";
+  let results = [];
+  if (!backendUrl) {
+    status = "env_issue";
+    reason = "LANGBOT_BACKEND_URL is not configured.";
+  } else {
+    const urls = endpoints.map((path) => joinUrl(backendUrl, path));
+    results = await runBatches(urls, totalRequests, concurrency, timeoutMs);
+    const okCount = results.filter((item) => item.ok).length;
+    const errorCount = results.length - okCount;
+    const errorRate = results.length === 0 ? 1 : errorCount / results.length;
+    const latencies = results.filter((item) => item.ok).map((item) => item.latency_ms);
+    const latencyStats = stats(latencies);
+    const allConnectionFailures = results.length > 0 && results.every((item) => item.status === 0);
+    if (allConnectionFailures) {
+      status = "env_issue";
+      reason = `Backend did not respond at ${backendUrl}.`;
+    } else if (latencyStats.p95 <= p95BudgetMs && errorRate <= maxErrorRate) {
+      status = "pass";
+      reason = "Live backend latency probe passed all thresholds.";
+    } else {
+      status = "fail";
+      reason = "Live backend latency probe breached latency or error-rate thresholds.";
+    }
+  }
+
+  const statusCounts = {};
+  for (const item of results) {
+    const key = item.status === 0 ? "network_error" : String(item.status);
+    statusCounts[key] = (statusCounts[key] || 0) + 1;
+  }
+  const okResults = results.filter((item) => item.ok);
+  const metrics = {
+    probe: caseId,
+    backend_url: backendUrl,
+    endpoints,
+    total_requests: totalRequests,
+    concurrency,
+    timeout_ms: timeoutMs,
+    ok_count: okResults.length,
+    error_count: results.length - okResults.length,
+    error_rate: results.length === 0 ? 1 : Number(((results.length - okResults.length) / results.length).toFixed(4)),
+    latency_ms: stats(okResults.map((item) => item.latency_ms)),
+    status_counts: statusCounts,
+  };
+  const thresholds = {
+    backend_p95_ms: { actual: metrics.latency_ms.p95, max: p95BudgetMs, pass: metrics.latency_ms.p95 <= p95BudgetMs },
+    error_rate: { actual: metrics.error_rate, max: maxErrorRate, pass: metrics.error_rate <= maxErrorRate },
+  };
+
+  await writeFile(metricsPath, `${JSON.stringify({ ...metrics, samples: results }, null, 2)}\n`, "utf8");
+  await writeFile(networkLogPath, results.map((item) => JSON.stringify(item)).join("\n") + (results.length > 0 ? "\n" : ""), "utf8");
+
+  const finishedAt = new Date();
+  const result = {
+    source: "automation",
+    case_id: caseId,
+    run_id: runId,
+    status,
+    reason,
+    started_at: startedAt.toISOString(),
+    started_at_local: localIsoWithOffset(startedAt),
+    finished_at: finishedAt.toISOString(),
+    finished_at_local: localIsoWithOffset(finishedAt),
+    duration_ms: finishedAt.getTime() - startedAt.getTime(),
+    url: backendUrl,
+    metrics_summary: {
+      requests: metrics.total_requests,
+      concurrency: metrics.concurrency,
+      ok_count: metrics.ok_count,
+      error_rate: metrics.error_rate,
+      latency_p50_ms: metrics.latency_ms.p50,
+      latency_p95_ms: metrics.latency_ms.p95,
+      status_counts: metrics.status_counts,
+    },
+    thresholds_summary: thresholds,
+    artifacts: {
+      metrics_json: metricsPath,
+      network_log: networkLogPath,
+      automation_result_json: automationResultPath,
+      result_json: resultPath,
+    },
+    evidence_collected: ["metrics", "network", "api_diagnostic", "filesystem"],
+  };
+
+  const resultText = `${JSON.stringify(result, null, 2)}\n`;
+  await writeFile(automationResultPath, resultText, "utf8");
+  await writeFile(resultPath, resultText, "utf8");
+  console.log(JSON.stringify(result, null, 2));
+  exit(status === "pass" ? 0 : status === "env_issue" ? 2 : 1);
+}
+
+await main();
diff --git a/skills/skills/langbot-testing/probes/langbot-live-backend-log-health.mjs b/skills/skills/langbot-testing/probes/langbot-live-backend-log-health.mjs
new file mode 100644
index 000000000..38a31c389
--- /dev/null
+++ b/skills/skills/langbot-testing/probes/langbot-live-backend-log-health.mjs
@@ -0,0 +1,205 @@
+#!/usr/bin/env node
+
+import { existsSync, readdirSync, statSync } from "node:fs";
+import { mkdir, readFile, writeFile } from "node:fs/promises";
+import { join, resolve } from "node:path";
+import { env, exit } from "node:process";
+
+function pad(value, size = 2) {
+  return String(value).padStart(size, "0");
+}
+
+function localIsoWithOffset(date = new Date()) {
+  const offsetMinutes = -date.getTimezoneOffset();
+  const sign = offsetMinutes >= 0 ? "+" : "-";
+  const absolute = Math.abs(offsetMinutes);
+  return [
+    `${date.getFullYear()}-${pad(date.getMonth() + 1)}-${pad(date.getDate())}`,
+    `T${pad(date.getHours())}:${pad(date.getMinutes())}:${pad(date.getSeconds())}.${pad(date.getMilliseconds(), 3)}`,
+    `${sign}${pad(Math.floor(absolute / 60))}:${pad(absolute % 60)}`,
+  ].join("");
+}
+
+function timestampSlug(date = new Date()) {
+  return date.toISOString().replace(/\.\d{3}Z$/, "Z").replace(/[^0-9A-Za-z]+/g, "-").replace(/^-|-$/g, "");
+}
+
+function repoRootFromEnv(root) {
+  return env.LANGBOT_REPO ? resolve(env.LANGBOT_REPO) : resolve(root, "..");
+}
+
+function latestBackendLog(root) { // path of the backend log to scan, or "" when none can be found
+  const explicit = env.LANGBOT_BACKEND_LOG;
+  if (explicit) return resolve(explicit); // an explicit path always wins; its existence is checked by the caller
+
+  const logsDir = join(repoRootFromEnv(root), "data", "logs");
+  if (!existsSync(logsDir)) return "";
+  const candidates = readdirSync(logsDir).filter((name) => /^langbot-.*\.log$/.test(name))
+    .map((name) => join(logsDir, name))
+    .flatMap((path) => {
+      try {
+        const info = statSync(path); // stat once per file so sorting cannot throw on a log rotated away mid-scan
+        return info.isFile() ? [{ path, mtimeMs: info.mtimeMs }] : [];
+      } catch {
+        return []; // unreadable or vanished entries are simply not candidates
+      }
+    })
+    .sort((left, right) => right.mtimeMs - left.mtimeMs); // newest first
+  return candidates[0]?.path || "";
+}
+
+function parseSince(startedAt) { // start of the log window to scan
+  if (env.LANGBOT_BACKEND_LOG_SINCE) return new Date(env.LANGBOT_BACKEND_LOG_SINCE); // explicit start wins; NOTE(review): an unparseable value yields an Invalid Date — confirm callers tolerate that
+  const lookbackSeconds = Number(env.LANGBOT_BACKEND_LOG_LOOKBACK_SECONDS || "300"); // default window: 300 seconds before startedAt
+  return new Date(startedAt.getTime() - lookbackSeconds * 1000);
+}
+
+function parseTimestamp(line, year) {
+  // Application lines look like "[MM-DD HH:MM:SS.mmm]": no year and no UTC offset.
+  const localMatch = line.match(/^\[(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})\.(\d{3})\]/);
+  if (localMatch) {
+    const [month, day, hour, minute, second, millisecond] = localMatch.slice(1).map(Number);
+    // Read them in this machine's local zone instead of a fixed +08:00 (assumes the backend logged on a host in the same zone — TODO confirm).
+    return new Date(year, month - 1, day, hour, minute, second, millisecond);
+  }
+  const accessMatch = line.match(/^\[(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2}) ([+-]\d{4})\]/);
+  if (accessMatch) {
+    const [, fullYear, month, day, hour, minute, second, offset] = accessMatch;
+    const normalizedOffset = `${offset.slice(0, 3)}:${offset.slice(3)}`; // "+0800" -> "+08:00" so the string parses as ISO 8601
+    return new Date(`${fullYear}-${month}-${day}T${hour}:${minute}:${second}${normalizedOffset}`);
+  }
+  return null; // no recognised timestamp prefix on this line
+}
+
+function findingForLine(line, number) {
+  const rules = [
+    { severity: "fail", kind: "python_traceback", pattern: /\bTraceback(?: \(most recent call last\))?/i },
+    { severity: "fail", kind: "unretrieved_task_exception", pattern: /Task exception was never retrieved/i },
+    { severity: "fail", kind: "unawaited_coroutine", pattern: /RuntimeWarning:\s+coroutine .* was never awaited/i },
+    { severity: "fail", kind: "unclosed_client_session", pattern: /Unclosed client session/i },
+    { severity: "fail", kind: "unclosed_connector", pattern: /Unclosed connector/i },
+    { severity: "fail", kind: "import_error", pattern: /\bImportError\b/i },
+    { severity: "fail", kind: "error_log", pattern: /\b(?:ERROR|CRITICAL)\b/ },
+    { severity: "warning", kind: "warning_log", pattern: /\bWARNING\b/ },
+  ];
+
+  for (const rule of rules) {
+    if (rule.pattern.test(line)) {
+      return {
+        severity: rule.severity,
+        kind: rule.kind,
+        line: number,
+        excerpt: line,
+      };
+    }
+  }
+  return null;
+}
+
+function scanLines(text, since, year) { // collect rule findings from log lines stamped at or after `since`
+  const findings = [];
+  const scanned = [];
+  let includeContinuation = false; // lines before the first recognised timestamp are skipped
+  const lines = text.split(/\r?\n/);
+  for (const [index, line] of lines.entries()) {
+    const number = index + 1; // 1-based line number within the log file
+    const timestamp = parseTimestamp(line, year);
+    if (timestamp) includeContinuation = timestamp >= since; // un-stamped lines inherit the decision made for the last stamped line
+    if (!includeContinuation) continue;
+    scanned.push({ number, text: line });
+    const finding = findingForLine(line, number);
+    if (finding) findings.push(finding);
+  }
+  return { findings, scanned, total_lines: lines.length };
+}
+
+async function main() {
+  const root = resolve(env.LBS_ROOT || process.cwd());
+  const caseId = "langbot-live-backend-log-health";
+  const runId = env.LBS_RUN_ID || `${timestampSlug()}-${caseId}`;
+  const evidenceDir = resolve(env.LBS_EVIDENCE_DIR || join(root, "reports", "evidence", runId));
+  await mkdir(evidenceDir, { recursive: true });
+
+  const startedAt = new Date();
+  const since = parseSince(startedAt);
+  const logPath = latestBackendLog(root);
+  const metricsPath = join(evidenceDir, "metrics.json");
+  const findingsPath = join(evidenceDir, "findings.json");
+  const scannedLogPath = join(evidenceDir, "scanned-backend.log");
+  const automationResultPath = join(evidenceDir, "automation-result.json");
+  const resultPath = join(evidenceDir, "result.json");
+
+  let status = "fail";
+  let reason = "";
+  let scan = { findings: [], scanned: [], total_lines: 0 };
+  if (!logPath || !existsSync(logPath)) {
+    status = "env_issue";
+    reason = "No LangBot backend log file was found. Set LANGBOT_BACKEND_LOG or LANGBOT_REPO.";
+  } else {
+    const text = await readFile(logPath, "utf8");
+    scan = scanLines(text, since, startedAt.getFullYear());
+    const failCount = scan.findings.filter((item) => item.severity === "fail").length;
+    status = failCount === 0 ? "pass" : "fail";
+    reason = status === "pass"
+      ? "Live backend log health passed; no fail-severity findings in the scanned window."
+      : "Live backend log health found fail-severity backend log findings.";
+  }
+
+  const warningCount = scan.findings.filter((item) => item.severity === "warning").length;
+  const failCount = scan.findings.filter((item) => item.severity === "fail").length;
+  const metrics = {
+    probe: caseId,
+    backend_log: logPath,
+    since: since.toISOString(),
+    scanned_line_count: scan.scanned.length,
+    total_line_count: scan.total_lines,
+    fail_count: failCount,
+    warning_count: warningCount,
+    finding_count: scan.findings.length,
+  };
+  const thresholds = {
+    fail_count: { actual: failCount, max: 0, pass: failCount === 0 },
+  };
+
+  await writeFile(metricsPath, `${JSON.stringify(metrics, null, 2)}\n`, "utf8");
+  await writeFile(findingsPath, `${JSON.stringify(scan.findings, null, 2)}\n`, "utf8");
+  await writeFile(scannedLogPath, scan.scanned.map((item) => `${item.number}: ${item.text}`).join("\n") + (scan.scanned.length > 0 ? "\n" : ""), "utf8");
+
+  const finishedAt = new Date();
+  const result = {
+    source: "automation",
+    case_id: caseId,
+    run_id: runId,
+    status,
+    reason,
+    started_at: startedAt.toISOString(),
+    started_at_local: localIsoWithOffset(startedAt),
+    finished_at: finishedAt.toISOString(),
+    finished_at_local: localIsoWithOffset(finishedAt),
+    duration_ms: finishedAt.getTime() - startedAt.getTime(),
+    url: logPath,
+    metrics_summary: {
+      scanned_line_count: metrics.scanned_line_count,
+      fail_count: metrics.fail_count,
+      warning_count: metrics.warning_count,
+      finding_count: metrics.finding_count,
+    },
+    thresholds_summary: thresholds,
+    artifacts: {
+      metrics_json: metricsPath,
+      findings_json: findingsPath,
+      scanned_backend_log: scannedLogPath,
+      automation_result_json: automationResultPath,
+      result_json: resultPath,
+    },
+    evidence_collected: ["metrics", "backend_log", "filesystem"],
+  };
+
+  const resultText = `${JSON.stringify(result, null, 2)}\n`;
+  await writeFile(automationResultPath, resultText, "utf8");
+  await writeFile(resultPath, resultText, "utf8");
+  console.log(JSON.stringify(result, null, 2));
+  exit(status === "pass" ? 0 : status === "env_issue" ? 2 : 1);
+}
+
+await main();
diff --git a/skills/skills/langbot-testing/probes/langbot-live-control-plane-api.mjs b/skills/skills/langbot-testing/probes/langbot-live-control-plane-api.mjs
new file mode 100644
index 000000000..8232d1fc3
--- /dev/null
+++ b/skills/skills/langbot-testing/probes/langbot-live-control-plane-api.mjs
@@ -0,0 +1,311 @@
+#!/usr/bin/env node
+
+import { mkdir, writeFile } from "node:fs/promises";
+import { join, resolve } from "node:path";
+import { env, exit } from "node:process";
+
+/**
+ * Left-pads a value with "0" characters.
+ *
+ * @param {*} value - Value to stringify and pad.
+ * @param {number} [size=2] - Minimum width of the returned string.
+ * @returns {string} The stringified value, zero-padded on the left.
+ */
+function pad(value, size = 2) {
+  const text = String(value);
+  return text.padStart(size, "0");
+}
+
+/**
+ * Formats a date as a local-time ISO-8601 string with an explicit UTC offset,
+ * in the shape `YYYY-MM-DDTHH:mm:ss.SSS+HH:MM`.
+ *
+ * @param {Date} [date=new Date()] - Instant to format.
+ * @returns {string} Local wall-clock timestamp followed by the signed offset.
+ */
+function localIsoWithOffset(date = new Date()) {
+  // getTimezoneOffset() is positive west of UTC, so negate it to get the ISO sign.
+  const offsetMinutes = -date.getTimezoneOffset();
+  const absolute = Math.abs(offsetMinutes);
+  const sign = offsetMinutes >= 0 ? "+" : "-";
+
+  const datePart = `${date.getFullYear()}-${pad(date.getMonth() + 1)}-${pad(date.getDate())}`;
+  const clockPart = `${pad(date.getHours())}:${pad(date.getMinutes())}:${pad(date.getSeconds())}`;
+  const millisPart = pad(date.getMilliseconds(), 3);
+  const offsetPart = `${sign}${pad(Math.floor(absolute / 60))}:${pad(absolute % 60)}`;
+
+  return `${datePart}T${clockPart}.${millisPart}${offsetPart}`;
+}
+
+/**
+ * Builds a filesystem-friendly slug from a date's UTC ISO timestamp.
+ *
+ * @param {Date} [date=new Date()] - Instant to encode.
+ * @returns {string} The ISO string without milliseconds, with every run of
+ *   non-alphanumeric characters replaced by a single "-".
+ */
+function timestampSlug(date = new Date()) {
+  const withoutMillis = date.toISOString().replace(/\.\d{3}Z$/, "Z");
+  const dashed = withoutMillis.replace(/[^0-9A-Za-z]+/g, "-");
+  return dashed.replace(/^-|-$/g, "");
+}
+
+/**
+ * Nearest-rank percentile of a list of numbers, rounded to three decimals.
+ *
+ * @param {number[]} values - Samples; the array is copied before sorting, so the caller's order is untouched.
+ * @param {number} percentileValue - Percentile to read, on a 0-100 scale.
+ * @returns {number} The selected sample, or 0 when `values` is empty.
+ */
+function percentile(values, percentileValue) {
+  if (values.length === 0) return 0;
+  const sorted = [...values].sort((a, b) => a - b);
+  const rank = Math.ceil((percentileValue / 100) * sorted.length) - 1;
+  // Clamp both ends: a percentile of 0 (or below) would otherwise yield rank -1
+  // and crash on `undefined.toFixed`; values above 100 still map to the maximum.
+  const index = Math.max(0, Math.min(sorted.length - 1, rank));
+  return Number(sorted[index].toFixed(3));
+}
+
+/**
+ * Summarizes a list of numbers as min / p50 / p95 / p99 / max.
+ *
+ * @param {number[]} values - Samples to summarize.
+ * @returns {{min: number, p50: number, p95: number, p99: number, max: number}}
+ *   Summary rounded to three decimals; all zeros when `values` is empty.
+ */
+function stats(values) {
+  if (values.length === 0) {
+    return { min: 0, p50: 0, p95: 0, p99: 0, max: 0 };
+  }
+  const smallest = Math.min(...values);
+  const largest = Math.max(...values);
+  return {
+    min: Number(smallest.toFixed(3)),
+    p50: percentile(values, 50),
+    p95: percentile(values, 95),
+    p99: percentile(values, 99),
+    max: Number(largest.toFixed(3)),
+  };
+}
+
+/**
+ * Joins a base URL and a path with exactly one "/" between them.
+ *
+ * @param {string} baseUrl - Base URL; any trailing slashes are dropped.
+ * @param {string} path - Path, with or without a leading "/".
+ * @returns {string} The concatenated URL.
+ */
+function joinUrl(baseUrl, path) {
+  const trimmedBase = baseUrl.replace(/\/+$/, "");
+  if (path.startsWith("/")) {
+    return `${trimmedBase}${path}`;
+  }
+  return `${trimmedBase}/${path}`;
+}
+
+/**
+ * Parses a JSON string and accepts it only when it decodes to a plain object.
+ *
+ * @param {string | undefined} value - Raw JSON text, for example from an environment variable.
+ * @param {*} fallback - Returned when `value` is empty, is not valid JSON, or
+ *   decodes to anything other than a non-array object.
+ * @returns {*} The decoded object, or `fallback`.
+ */
+function parseJsonObject(value, fallback) {
+  if (!value) return fallback;
+  let decoded;
+  try {
+    decoded = JSON.parse(value);
+  } catch {
+    return fallback;
+  }
+  const isPlainObject = Boolean(decoded) && typeof decoded === "object" && !Array.isArray(decoded);
+  return isPlainObject ? decoded : fallback;
+}
+
+/**
+ * Describes the control-plane endpoints this probe exercises.
+ *
+ * The default p95 budgets are read from LANGBOT_PERF_HEALTHZ_P95_MS (default 500)
+ * and LANGBOT_PERF_SYSTEM_INFO_P95_MS (default 1000).
+ *
+ * @returns {object[]} Fresh endpoint descriptors: id, path, expected HTTP status,
+ *   expected JSON `code`, p95 budget in milliseconds, and the fields required under `data`.
+ */
+function controlPlaneEndpoints() {
+  const healthzBudgetMs = Number(env.LANGBOT_PERF_HEALTHZ_P95_MS || "500");
+  const systemInfoBudgetMs = Number(env.LANGBOT_PERF_SYSTEM_INFO_P95_MS || "1000");
+
+  const healthz = {
+    id: "healthz",
+    path: "/healthz",
+    expected_status: 200,
+    expected_code: 0,
+    p95_budget_ms: healthzBudgetMs,
+    required_data_fields: [],
+  };
+  const systemInfo = {
+    id: "system_info",
+    path: "/api/v1/system/info",
+    expected_status: 200,
+    expected_code: 0,
+    p95_budget_ms: systemInfoBudgetMs,
+    required_data_fields: ["version", "edition", "enable_marketplace"],
+  };
+
+  return [healthz, systemInfo];
+}
+
+/**
+ * Sends one GET request to an endpoint and turns the outcome into a sample record.
+ *
+ * Errors raised while fetching or reading the body (including the abort fired
+ * after `timeoutMs`) are caught and reported as a sample with `status: 0`
+ * instead of being thrown.
+ *
+ * @param {string} backendUrl - Base URL of the backend.
+ * @param {object} endpoint - Descriptor with `id`, `path`, `expected_status`,
+ *   `expected_code` and `required_data_fields`.
+ * @param {number} timeoutMs - Milliseconds before the request is aborted.
+ * @returns {Promise<object>} Sample with HTTP status, the individual check
+ *   flags, missing data fields, latency in milliseconds and an error message.
+ */
+async function fetchEndpoint(backendUrl, endpoint, timeoutMs) {
+  const url = joinUrl(backendUrl, endpoint.path);
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), timeoutMs);
+  const started = performance.now();
+  let bodyText = "";
+  let json = null;
+  let jsonValid = false;
+  let error = "";
+
+  try {
+    const response = await fetch(url, {
+      method: "GET",
+      headers: { "accept": "application/json" },
+      signal: controller.signal,
+    });
+    bodyText = await response.text();
+    try {
+      // An empty body, or a body that is the JSON literal null, leaves jsonValid false.
+      json = bodyText ? JSON.parse(bodyText) : null;
+      jsonValid = json !== null;
+    } catch (parseError) {
+      // A malformed body is recorded on the sample rather than aborting the probe.
+      error = parseError instanceof Error ? parseError.message : String(parseError);
+    }
+
+    // Required fields are looked up under `data`; a missing or non-object `data` counts as empty.
+    const data = json && typeof json === "object" && json.data && typeof json.data === "object" ? json.data : {};
+    const missingFields = endpoint.required_data_fields.filter((field) => !(field in data));
+    const statusOk = response.status === endpoint.expected_status;
+    const codeOk = !json || typeof json !== "object" ? false : json.code === endpoint.expected_code;
+    const shapeOk = jsonValid && missingFields.length === 0;
+    // Latency is taken here, so it covers the body read, JSON parse and field checks.
+    const latencyMs = performance.now() - started;
+    return {
+      endpoint_id: endpoint.id,
+      path: endpoint.path,
+      url,
+      status: response.status,
+      ok: statusOk && codeOk && shapeOk,
+      status_ok: statusOk,
+      code_ok: codeOk,
+      json_valid: jsonValid,
+      missing_fields: missingFields,
+      response_code: json && typeof json === "object" ? json.code : null,
+      latency_ms: Number(latencyMs.toFixed(3)),
+      error,
+    };
+  } catch (fetchError) {
+    // Network failure or abort: status 0 marks the sample as "no HTTP response".
+    const latencyMs = performance.now() - started;
+    return {
+      endpoint_id: endpoint.id,
+      path: endpoint.path,
+      url,
+      status: 0,
+      ok: false,
+      status_ok: false,
+      code_ok: false,
+      json_valid: false,
+      missing_fields: endpoint.required_data_fields,
+      response_code: null,
+      latency_ms: Number(latencyMs.toFixed(3)),
+      error: fetchError instanceof Error ? fetchError.message : String(fetchError),
+    };
+  } finally {
+    // Always cancel the abort timer so it cannot fire after the request settled.
+    clearTimeout(timeout);
+  }
+}
+
+/**
+ * Issues `totalRequests` probes in sequential batches, cycling through the
+ * endpoints round-robin. Each batch is awaited in full before the next one
+ * starts, so at most one batch of requests is in flight at a time.
+ *
+ * @param {string} backendUrl - Base URL of the backend.
+ * @param {object[]} endpoints - Endpoint descriptors to cycle through.
+ * @param {number} totalRequests - Total number of requests to send.
+ * @param {number} concurrency - Requests per batch. NaN and values below 1 fall back to 1.
+ * @param {number} timeoutMs - Per-request timeout in milliseconds.
+ * @returns {Promise<object[]>} One fetchEndpoint() sample per request, in request order.
+ */
+async function runBatches(backendUrl, endpoints, totalRequests, concurrency, timeoutMs) {
+  // splice(0, n) removes nothing when n is NaN, 0, negative, or a fraction below 1
+  // (e.g. LANGBOT_CONTROL_PLANE_CONCURRENCY=abc), which would spin this loop forever.
+  const batchSize = concurrency >= 1 ? Math.floor(concurrency) : 1;
+  const queue = Array.from({ length: totalRequests }, (_, index) => endpoints[index % endpoints.length]);
+  const results = [];
+  while (queue.length > 0) {
+    const batch = queue.splice(0, batchSize);
+    results.push(...await Promise.all(batch.map((endpoint) => fetchEndpoint(backendUrl, endpoint, timeoutMs))));
+  }
+  return results;
+}
+
+/**
+ * Aggregates the raw samples into one metrics record per endpoint.
+ *
+ * @param {object[]} endpoints - Endpoint descriptors (`id`, `path`, `p95_budget_ms`).
+ * @param {object[]} results - Samples carrying `endpoint_id`, `ok` and `latency_ms`.
+ * @returns {Object<string, object>} Metrics keyed by endpoint id. Latency stats
+ *   use successful samples only; an endpoint with no samples reports `error_rate` 1.
+ */
+function endpointMetrics(endpoints, results) {
+  const entries = [];
+  for (const endpoint of endpoints) {
+    const samples = results.filter((item) => item.endpoint_id === endpoint.id);
+    const okSamples = samples.filter((item) => item.ok);
+    const failedCount = samples.length - okSamples.length;
+    const errorRate = samples.length === 0 ? 1 : Number((failedCount / samples.length).toFixed(4));
+    const okLatencies = okSamples.map((item) => item.latency_ms);
+    entries.push([
+      endpoint.id,
+      {
+        path: endpoint.path,
+        requests: samples.length,
+        ok_count: okSamples.length,
+        error_rate: errorRate,
+        latency_ms: stats(okLatencies),
+        p95_budget_ms: endpoint.p95_budget_ms,
+      },
+    ]);
+  }
+  return Object.fromEntries(entries);
+}
+
+/**
+ * Runs the live control-plane API probe end to end.
+ *
+ * Sends batched GET requests to the configured backend, evaluates error-rate,
+ * response-shape and per-endpoint p95 thresholds, writes the evidence files
+ * into the evidence directory, prints the result JSON to stdout and exits the
+ * process with 0 (pass), 2 (env_issue) or 1 (anything else).
+ *
+ * Environment variables read here: LBS_ROOT, LBS_RUN_ID, LBS_EVIDENCE_DIR,
+ * LANGBOT_BACKEND_URL, LANGBOT_CONTROL_PLANE_P95_BUDGETS_JSON,
+ * LANGBOT_CONTROL_PLANE_REQUESTS, LANGBOT_CONTROL_PLANE_CONCURRENCY,
+ * LANGBOT_CONTROL_PLANE_TIMEOUT_MS and LANGBOT_CONTROL_PLANE_MAX_ERROR_RATE.
+ */
+async function main() {
+  const root = resolve(env.LBS_ROOT || process.cwd());
+  const caseId = "langbot-live-control-plane-api";
+  const runId = env.LBS_RUN_ID || `${timestampSlug()}-${caseId}`;
+  const evidenceDir = resolve(env.LBS_EVIDENCE_DIR || join(root, "reports", "evidence", runId));
+  await mkdir(evidenceDir, { recursive: true });
+
+  const startedAt = new Date();
+  const backendUrl = env.LANGBOT_BACKEND_URL || "";
+  const endpoints = controlPlaneEndpoints();
+  // Optional per-endpoint overrides, keyed by endpoint id; only finite numbers are applied.
+  const configuredBudgets = parseJsonObject(env.LANGBOT_CONTROL_PLANE_P95_BUDGETS_JSON, {});
+  for (const endpoint of endpoints) {
+    const budget = configuredBudgets[endpoint.id];
+    if (typeof budget === "number" && Number.isFinite(budget)) endpoint.p95_budget_ms = budget;
+  }
+  const totalRequests = Number(env.LANGBOT_CONTROL_PLANE_REQUESTS || "20");
+  const concurrency = Number(env.LANGBOT_CONTROL_PLANE_CONCURRENCY || "4");
+  const timeoutMs = Number(env.LANGBOT_CONTROL_PLANE_TIMEOUT_MS || "5000");
+  const maxErrorRate = Number(env.LANGBOT_CONTROL_PLANE_MAX_ERROR_RATE || "0");
+  const metricsPath = join(evidenceDir, "metrics.json");
+  const endpointsPath = join(evidenceDir, "endpoints.json");
+  const networkLogPath = join(evidenceDir, "network.log");
+  const automationResultPath = join(evidenceDir, "automation-result.json");
+  const resultPath = join(evidenceDir, "result.json");
+
+  let status = "fail";
+  let reason = "";
+  let results = [];
+  if (!backendUrl) {
+    // No backend configured: report an environment issue without sending any request.
+    status = "env_issue";
+    reason = "LANGBOT_BACKEND_URL is not configured.";
+  } else {
+    results = await runBatches(backendUrl, endpoints, totalRequests, concurrency, timeoutMs);
+    // status 0 means the request never produced an HTTP response; if that holds for
+    // every sample the backend is treated as unreachable rather than as failing.
+    const allConnectionFailures = results.length > 0 && results.every((item) => item.status === 0);
+    if (allConnectionFailures) {
+      status = "env_issue";
+      reason = `Backend did not respond at ${backendUrl}.`;
+    }
+  }
+
+  const okResults = results.filter((item) => item.ok);
+  const statusCounts = {};
+  for (const item of results) {
+    const key = item.status === 0 ? "network_error" : String(item.status);
+    statusCounts[key] = (statusCounts[key] || 0) + 1;
+  }
+  const perEndpoint = endpointMetrics(endpoints, results);
+  // Samples with invalid JSON, missing data fields or an unexpected `code` (network errors included).
+  const responseShapeFailures = results.filter((item) => !item.json_valid || item.missing_fields.length > 0 || !item.code_ok).length;
+  const errorRate = results.length === 0 ? 1 : Number(((results.length - okResults.length) / results.length).toFixed(4));
+  const thresholds = {
+    error_rate: { actual: errorRate, max: maxErrorRate, pass: errorRate <= maxErrorRate },
+    response_shape_failures: { actual: responseShapeFailures, max: 0, pass: responseShapeFailures === 0 },
+  };
+  for (const endpoint of endpoints) {
+    // p95 is computed over successful samples only and is 0 when there are none,
+    // so this threshold passes for an endpoint that never succeeded.
+    const actual = perEndpoint[endpoint.id].latency_ms.p95;
+    thresholds[`${endpoint.id}_p95_ms`] = {
+      actual,
+      max: endpoint.p95_budget_ms,
+      pass: actual <= endpoint.p95_budget_ms,
+    };
+  }
+
+  // An env_issue verdict set above is final; otherwise the thresholds decide pass/fail.
+  if (status !== "env_issue") {
+    const passed = Object.values(thresholds).every((item) => item.pass);
+    status = passed ? "pass" : "fail";
+    reason = passed
+      ? "Live control-plane API probe passed all thresholds."
+      : "Live control-plane API probe breached shape, latency, or error-rate thresholds.";
+  }
+
+  const metrics = {
+    probe: caseId,
+    backend_url: backendUrl,
+    total_requests: totalRequests,
+    concurrency,
+    timeout_ms: timeoutMs,
+    ok_count: okResults.length,
+    error_count: results.length - okResults.length,
+    error_rate: errorRate,
+    status_counts: statusCounts,
+    response_shape_failures: responseShapeFailures,
+    endpoints: perEndpoint,
+  };
+
+  // metrics.json also carries the raw samples; network.log holds one JSON sample per line.
+  await writeFile(metricsPath, `${JSON.stringify({ ...metrics, samples: results }, null, 2)}\n`, "utf8");
+  await writeFile(endpointsPath, `${JSON.stringify(endpoints, null, 2)}\n`, "utf8");
+  await writeFile(networkLogPath, results.map((item) => JSON.stringify(item)).join("\n") + (results.length > 0 ? "\n" : ""), "utf8");
+
+  const finishedAt = new Date();
+  const result = {
+    source: "automation",
+    case_id: caseId,
+    run_id: runId,
+    status,
+    reason,
+    started_at: startedAt.toISOString(),
+    started_at_local: localIsoWithOffset(startedAt),
+    finished_at: finishedAt.toISOString(),
+    finished_at_local: localIsoWithOffset(finishedAt),
+    duration_ms: finishedAt.getTime() - startedAt.getTime(),
+    url: backendUrl,
+    metrics_summary: {
+      requests: metrics.total_requests,
+      concurrency: metrics.concurrency,
+      ok_count: metrics.ok_count,
+      error_rate: metrics.error_rate,
+      response_shape_failures: metrics.response_shape_failures,
+      endpoints: Object.fromEntries(Object.entries(metrics.endpoints).map(([id, value]) => [
+        id,
+        {
+          path: value.path,
+          ok_count: value.ok_count,
+          error_rate: value.error_rate,
+          latency_p50_ms: value.latency_ms.p50,
+          latency_p95_ms: value.latency_ms.p95,
+        },
+      ])),
+      status_counts: metrics.status_counts,
+    },
+    thresholds_summary: thresholds,
+    artifacts: {
+      metrics_json: metricsPath,
+      endpoints_json: endpointsPath,
+      network_log: networkLogPath,
+      automation_result_json: automationResultPath,
+      result_json: resultPath,
+    },
+    evidence_collected: ["metrics", "network", "api_diagnostic", "filesystem"],
+  };
+
+  // The same payload is written to both result files and echoed on stdout.
+  const resultText = `${JSON.stringify(result, null, 2)}\n`;
+  await writeFile(automationResultPath, resultText, "utf8");
+  await writeFile(resultPath, resultText, "utf8");
+  console.log(JSON.stringify(result, null, 2));
+  // Exit code: 0 = pass, 2 = env_issue, 1 = any other status.
+  exit(status === "pass" ? 0 : status === "env_issue" ? 2 : 1);
+}
+
+// Entry point: run the probe. main() terminates the process through exit() once the result is written.
+await main();
diff --git a/skills/skills/langbot-testing/probes/langbot-overhead-accounting-contract.mjs b/skills/skills/langbot-testing/probes/langbot-overhead-accounting-contract.mjs
new file mode 100644
index 000000000..5338df003
--- /dev/null
+++ b/skills/skills/langbot-testing/probes/langbot-overhead-accounting-contract.mjs
@@ -0,0 +1,162 @@
+#!/usr/bin/env node
+
+import { mkdir, writeFile } from "node:fs/promises";
+import { join, resolve } from "node:path";
+import { env, exit } from "node:process";
+
+/**
+ * Zero-pads a value on the left.
+ *
+ * @param {*} value - Value to convert to a string.
+ * @param {number} [size=2] - Target minimum length.
+ * @returns {string} `value` as a string, prefixed with "0" up to `size` characters.
+ */
+function pad(value, size = 2) {
+  const asText = String(value);
+  return asText.padStart(size, "0");
+}
+
+/**
+ * Renders a date in local time as `YYYY-MM-DDTHH:mm:ss.SSS` followed by the
+ * signed UTC offset (`+HH:MM` or `-HH:MM`).
+ *
+ * @param {Date} [date=new Date()] - Instant to format.
+ * @returns {string} Local ISO-8601 timestamp with offset.
+ */
+function localIsoWithOffset(date = new Date()) {
+  // getTimezoneOffset() is minutes *behind* UTC, so flip the sign for ISO notation.
+  const offsetMinutes = -date.getTimezoneOffset();
+  const absolute = Math.abs(offsetMinutes);
+  const sign = offsetMinutes >= 0 ? "+" : "-";
+
+  const day = `${date.getFullYear()}-${pad(date.getMonth() + 1)}-${pad(date.getDate())}`;
+  const clock = `${pad(date.getHours())}:${pad(date.getMinutes())}:${pad(date.getSeconds())}`;
+  const millis = pad(date.getMilliseconds(), 3);
+  const offset = `${sign}${pad(Math.floor(absolute / 60))}:${pad(absolute % 60)}`;
+
+  return `${day}T${clock}.${millis}${offset}`;
+}
+
+/**
+ * Turns a date's UTC ISO timestamp into a slug usable in file and run names.
+ *
+ * @param {Date} [date=new Date()] - Instant to encode.
+ * @returns {string} ISO timestamp without milliseconds, with runs of
+ *   non-alphanumeric characters collapsed to "-" and no leading or trailing "-".
+ */
+function timestampSlug(date = new Date()) {
+  const secondsPrecision = date.toISOString().replace(/\.\d{3}Z$/, "Z");
+  const slug = secondsPrecision.replace(/[^0-9A-Za-z]+/g, "-");
+  return slug.replace(/^-|-$/g, "");
+}
+
+/**
+ * Nearest-rank percentile of a list of numbers, rounded to three decimals.
+ *
+ * @param {number[]} values - Samples; sorted on a copy, the input is not mutated.
+ * @param {number} percentileValue - Percentile to read, on a 0-100 scale.
+ * @returns {number} The selected sample, or 0 when `values` is empty.
+ */
+function percentile(values, percentileValue) {
+  if (values.length === 0) return 0;
+  const sorted = [...values].sort((a, b) => a - b);
+  const rank = Math.ceil((percentileValue / 100) * sorted.length) - 1;
+  // Clamp to the valid index range: percentile 0 would otherwise give rank -1
+  // and throw on `undefined.toFixed`.
+  const index = Math.max(0, Math.min(sorted.length - 1, rank));
+  return Number(sorted[index].toFixed(3));
+}
+
+/**
+ * Summarizes a list of numbers as min / p50 / p95 / p99 / max, each rounded
+ * to three decimals.
+ *
+ * @param {number[]} values - Samples to summarize.
+ * @returns {{min: number, p50: number, p95: number, p99: number, max: number}}
+ *   The summary; all zeros when `values` is empty.
+ */
+function stats(values) {
+  // Math.min()/Math.max() of nothing are +/-Infinity, which JSON.stringify writes
+  // as null; report zeros instead so an empty run still yields numeric metrics.
+  if (values.length === 0) return { min: 0, p50: 0, p95: 0, p99: 0, max: 0 };
+  return {
+    min: Number(Math.min(...values).toFixed(3)),
+    p50: percentile(values, 50),
+    p95: percentile(values, 95),
+    p99: percentile(values, 99),
+    max: Number(Math.max(...values).toFixed(3)),
+  };
+}
+
+/**
+ * Builds a threshold record comparing a measured value against a limit.
+ *
+ * @param {number} actual - Measured value.
+ * @param {number} limit - Budget to compare against.
+ * @param {string} operator - "<=" treats `limit` as a maximum; any other value
+ *   treats it as a minimum (`actual >= limit`).
+ * @returns {{actual: number, max?: number, min?: number, pass: boolean}} The record.
+ */
+function threshold(actual, limit, operator) {
+  if (operator === "<=") {
+    return { actual, max: limit, pass: actual <= limit };
+  }
+  return { actual, min: limit, pass: actual >= limit };
+}
+
+/**
+ * Produces one deterministic synthetic latency sample for the accounting contract.
+ *
+ * Segment values are derived purely from `index`; nothing is measured.
+ *
+ * @param {number} index - Sample index that drives every segment value.
+ * @returns {object} Sample with the raw per-segment milliseconds plus rounded
+ *   LangBot overhead, external latency, end-to-end latency and accounting gap.
+ */
+function makeSample(index) {
+  let rag;
+  if (index % 3 === 0) {
+    rag = 4.4;
+  } else {
+    rag = 0.8 + (index % 5) * 0.18;
+  }
+  let externalTool = 0;
+  if (index % 4 === 0) {
+    externalTool = 25 + (index % 9) * 3;
+  }
+
+  const segments = {
+    ingress: 1 + (index % 5) * 0.22,
+    pipeline: 2.8 + (index % 7) * 0.31,
+    persistence: 1.1 + (index % 4) * 0.2,
+    plugin_ipc: 1.9 + (index % 6) * 0.27,
+    rag,
+    streaming: 1.5 + (index % 8) * 0.24,
+    provider: 80 + (index % 13) * 11,
+    external_tool: externalTool,
+    network: 8 + (index % 10) * 1.7,
+  };
+
+  // Summation order is kept fixed so the floating-point totals are reproducible.
+  const overhead = segments.ingress + segments.pipeline + segments.persistence + segments.plugin_ipc + segments.rag + segments.streaming;
+  const external = segments.provider + segments.external_tool + segments.network;
+  const total = overhead + external;
+
+  return {
+    index,
+    segments_ms: segments,
+    langbot_overhead_ms: Number(overhead.toFixed(3)),
+    external_latency_ms: Number(external.toFixed(3)),
+    e2e_latency_ms: Number(total.toFixed(3)),
+    accounting_gap_ms: Number((total - external - overhead).toFixed(6)),
+  };
+}
+
+/**
+ * Runs the overhead-accounting contract probe.
+ *
+ * Generates synthetic samples with makeSample(), checks them against the
+ * sample-count, overhead-p95 and accounting-gap thresholds, writes the
+ * evidence files, prints the result JSON to stdout and exits with 0 on pass
+ * or 1 otherwise. No backend is contacted.
+ *
+ * Environment variables read here: LBS_ROOT, LBS_RUN_ID, LBS_EVIDENCE_DIR,
+ * LANGBOT_PERF_CONTRACT_SAMPLES (default 80) and LANGBOT_PERF_OVERHEAD_P95_MS
+ * (default 25).
+ */
+async function main() {
+  const root = resolve(env.LBS_ROOT || process.cwd());
+  const caseId = "langbot-overhead-accounting-contract";
+  const runId = env.LBS_RUN_ID || `${timestampSlug()}-${caseId}`;
+  const evidenceDir = resolve(env.LBS_EVIDENCE_DIR || join(root, "reports", "evidence", runId));
+  await mkdir(evidenceDir, { recursive: true });
+
+  const startedAt = new Date();
+  const sampleCount = Number(env.LANGBOT_PERF_CONTRACT_SAMPLES || "80");
+  const overheadP95BudgetMs = Number(env.LANGBOT_PERF_OVERHEAD_P95_MS || "25");
+  const samples = Array.from({ length: sampleCount }, (_, index) => makeSample(index));
+  const overheads = samples.map((sample) => sample.langbot_overhead_ms);
+  const e2e = samples.map((sample) => sample.e2e_latency_ms);
+  const external = samples.map((sample) => sample.external_latency_ms);
+  // Absolute gaps, so the maximum below catches deviations in either direction.
+  const gaps = samples.map((sample) => Math.abs(sample.accounting_gap_ms));
+  // Memory snapshot of this probe process, taken after the samples were generated.
+  const memory = process.memoryUsage();
+
+  const metrics = {
+    probe: caseId,
+    sample_count: sampleCount,
+    langbot_overhead_ms: stats(overheads),
+    e2e_latency_ms: stats(e2e),
+    external_latency_ms: stats(external),
+    accounting_gap_max_ms: Number(Math.max(...gaps).toFixed(6)),
+    samples,
+  };
+  // Contract: at least 50 samples, overhead p95 within budget, accounting gap at most 0.001 ms.
+  const thresholds = {
+    sample_count: threshold(sampleCount, 50, ">="),
+    langbot_overhead_p95_ms: threshold(metrics.langbot_overhead_ms.p95, overheadP95BudgetMs, "<="),
+    accounting_gap_max_ms: threshold(metrics.accounting_gap_max_ms, 0.001, "<="),
+  };
+  const status = Object.values(thresholds).every((item) => item.pass) ? "pass" : "fail";
+  const metricsPath = join(evidenceDir, "metrics.json");
+  const thresholdsPath = join(evidenceDir, "thresholds.json");
+  const resourceLogPath = join(evidenceDir, "resource-log.json");
+  const automationResultPath = join(evidenceDir, "automation-result.json");
+  const resultPath = join(evidenceDir, "result.json");
+
+  await writeFile(metricsPath, `${JSON.stringify(metrics, null, 2)}\n`, "utf8");
+  await writeFile(thresholdsPath, `${JSON.stringify(thresholds, null, 2)}\n`, "utf8");
+  await writeFile(resourceLogPath, `${JSON.stringify({ memory, pid: process.pid }, null, 2)}\n`, "utf8");
+
+  const finishedAt = new Date();
+  const result = {
+    source: "automation",
+    case_id: caseId,
+    run_id: runId,
+    status,
+    reason: status === "pass"
+      ? "Overhead accounting contract passed all thresholds."
+      : "Overhead accounting contract breached one or more thresholds.",
+    started_at: startedAt.toISOString(),
+    started_at_local: localIsoWithOffset(startedAt),
+    finished_at: finishedAt.toISOString(),
+    finished_at_local: localIsoWithOffset(finishedAt),
+    duration_ms: finishedAt.getTime() - startedAt.getTime(),
+    metrics_summary: {
+      sample_count: metrics.sample_count,
+      langbot_overhead_p95_ms: metrics.langbot_overhead_ms.p95,
+      e2e_latency_p95_ms: metrics.e2e_latency_ms.p95,
+      external_latency_p95_ms: metrics.external_latency_ms.p95,
+      accounting_gap_max_ms: metrics.accounting_gap_max_ms,
+    },
+    thresholds_summary: thresholds,
+    artifacts: {
+      metrics_json: metricsPath,
+      thresholds_json: thresholdsPath,
+      resource_log_json: resourceLogPath,
+      automation_result_json: automationResultPath,
+      result_json: resultPath,
+    },
+    evidence_collected: ["metrics", "resource_log", "filesystem"],
+  };
+
+  // The same payload is written to both result files and echoed on stdout.
+  const resultText = `${JSON.stringify(result, null, 2)}\n`;
+  await writeFile(automationResultPath, resultText, "utf8");
+  await writeFile(resultPath, resultText, "utf8");
+  console.log(JSON.stringify(result, null, 2));
+  // Exit code: 0 = pass, 1 = fail; this probe never reports env_issue.
+  exit(status === "pass" ? 0 : 1);
+}
+
+// Entry point: run the contract probe. main() terminates the process through exit().
+await main();
diff --git a/skills/skills/langbot-testing/references/performance-reliability-testing.md b/skills/skills/langbot-testing/references/performance-reliability-testing.md
new file mode 100644
index 000000000..6517858d8
--- /dev/null
+++ b/skills/skills/langbot-testing/references/performance-reliability-testing.md
@@ -0,0 +1,173 @@
+# Performance And Reliability Testing
+
+Use this reference when a QA request asks whether LangBot is fast enough,
+stable under load, or resilient to controlled faults.
+
+## Scope
+
+Treat `skills/` as the QA control plane:
+
+- Cases define intent, readiness, thresholds, and required evidence.
+- Probe scripts collect metrics, traces, resource logs, and artifacts.
+- Reports classify each run as `pass`, `fail`, `blocked`,
+  `env_issue`, or `flaky`.
+
+Do not turn `skills/` into a load generator or chaos engine. Call a focused
+tool from a `mode: probe` case when the test needs one, for example k6,
+Locust, pytest-benchmark, Playwright trace collection, Toxiproxy, Docker, or a
+Kubernetes disruption tool.
+
+## LangBot Performance Model
+
+For LangBot, performance is the cost LangBot adds around external systems:
+
+```text
+LangBot overhead = end-to-end latency - provider latency - external tool latency - network/fault injection latency
+```
+
+Measure user experience and internal composition separately:
+
+- WebUI load and interaction latency.
+- Debug Chat send-to-first-visible-token and send-to-completion latency.
+- Pipeline, RAG, plugin runtime, MCP, AgentRunner, and persistence segment
+  latency.
+- Queue wait time, concurrency, throughput, timeout rate, and p95/p99 latency.
+- Startup, plugin install, knowledge-base ingestion, migration, and recovery
+  time.
+
+Do not report a single message round-trip time as "LangBot performance" unless
+the report also explains external provider/tool/network time.
+
+## Evidence Contract
+
+Performance and reliability cases should declare the evidence they need:
+
+- `metrics`: machine-readable latency, throughput, error-rate, or recovery
+  metrics, usually `metrics.json`.
+- `resource_log`: CPU, memory, process, connection, queue, or file descriptor
+  samples.
+- `trace`: browser, HTTP, database, or runtime trace artifacts.
+- `profile`: CPU, memory, or flamegraph profile artifacts.
+- `backend_log`, `network`, `api_diagnostic`, and `filesystem` as supporting
+  evidence when relevant.
+
+Automation should write `automation-result.json` with these fields when
+available:
+
+```json
+{
+  "status": "pass",
+  "reason": "Probe passed all thresholds.",
+  "metrics_summary": {
+    "langbot_overhead_p95_ms": 12.4,
+    "error_rate": 0
+  },
+  "thresholds_summary": {
+    "langbot_overhead_p95_ms": { "actual": 12.4, "max": 50, "pass": true }
+  },
+  "artifacts": {
+    "metrics_json": "/path/to/metrics.json"
+  },
+  "evidence_collected": ["metrics", "filesystem"]
+}
+```
+
+Synthetic contract probes are useful for checking the QA harness, but they are
+not live product performance results. Label them as contract probes in the case
+title, checks, and report.
+
+## Chaos And Reliability Rules
+
+Chaos tests must be narrow and reversible:
+
+- Declare the fault model in `fault_model_json`.
+- Record blast radius, target component, injection method, duration, and abort
+  conditions.
+- Capture recovery checks and cleanup steps in the case.
+- Classify unavailable dependencies as `env_issue` unless the target behavior
+  is LangBot's handling of that dependency failure.
+- Do not run destructive fault injection against a shared or production-like
+  instance without explicit operator approval.
+
+Recommended first fault models:
+
+- Provider timeout or HTTP 429 from a fake provider endpoint.
+- Plugin runtime disconnect/reconnect in a local instance.
+- MCP stdio server exits mid-call.
+- RAG parser fixture fails once and recovers on retry.
+- Backend API endpoint returns 5xx from a controlled local proxy.
+
+## Starter Live Probes
+
+The starter gate separates QA-harness contracts from live product checks:
+
+- `langbot-overhead-accounting-contract` verifies that reports can carry
+  overhead accounting metrics. It uses deterministic synthetic samples and is
+  not live product performance.
+- `langbot-fault-taxonomy-contract` verifies that fault scenarios declare
+  expected status, recovery, and cleanup before destructive chaos tests are
+  added.
+- `langbot-live-backend-latency` checks the unauthenticated `/healthz`
+  endpoint for basic backend responsiveness.
+- `langbot-live-control-plane-api` checks `/healthz` and
+  `/api/v1/system/info` for HTTP 200, JSON `code: 0`, response shape, and
+  per-endpoint p95 latency.
+- `langbot-live-backend-log-health` scans the recent backend log window for
+  fail-severity runtime findings. It is the reliability guard that should fail
+  the gate when HTTP probes pass but backend logs contain Traceback, ImportError,
+  ERROR, unclosed sessions, or unawaited coroutine signals.
+
+Do not treat these starter live probes as Debug Chat or model-provider
+performance. They are control-plane readiness checks; user-facing performance
+needs browser/WebSocket/message-path measurements.
+
+## Gate Layers
+
+Use the smallest gate that answers the quality question:
+
+- `langbot-performance-contract-gate`: fast synthetic checks for report shape,
+  threshold accounting, and fault taxonomy. Good for PR feedback when no live
+  service is running.
+- `langbot-live-backend-gate`: live backend `/healthz`,
+  `/api/v1/system/info`, and backend log health. Good after starting a local
+  LangBot backend.
+- `langbot-user-path-performance-gate`: browser-visible user path performance,
+  starting with Pipeline Debug Chat send-to-visible-completion latency. Run it
+  only when the browser profile and target pipeline are ready.
+- `langbot-performance-reliability-gate`: combined starter gate for synthetic
+  contracts plus live backend checks.
+
+Keep environment diagnostics separate from product regressions. For example, a
+SOCKS proxy without Python `socksio` support should be fixed or clearly
+classified by `bin/lbs env doctor`; do not hide the resulting backend
+Traceback in reports.
+
+## Debug Chat Performance
+
+`pipeline-debug-chat-performance` reuses the browser Debug Chat automation and
+adds `metrics.json`, `metrics_summary`, and `thresholds_summary` to
+`automation-result.json`.
+
+Current metric:
+
+```text
+response_duration_ms = prompt send -> expected assistant response visible and stable
+```
+
+This is a user-path metric, not pure LangBot overhead. If it regresses, inspect
+provider latency, model route health, plugin/runtime logs, WebSocket behavior,
+and browser console/network evidence before attributing the whole duration to
+LangBot.
+
+## Running The First Gate
+
+Start with the reusable suite:
+
+```bash
+rtk bin/lbs suite plan langbot-performance-reliability-gate
+rtk bin/lbs suite start langbot-performance-reliability-gate --run-id langbot-perf-rel-local
+```
+
+Run synthetic contract probes first. Run live probes only after the selected
+backend/frontend instance is reachable and the run owner accepts any fault
+scope.
diff --git a/skills/skills/langbot-testing/suites/langbot-live-backend-gate.yaml b/skills/skills/langbot-testing/suites/langbot-live-backend-gate.yaml
new file mode 100644
index 000000000..58a978527
--- /dev/null
+++ b/skills/skills/langbot-testing/suites/langbot-live-backend-gate.yaml
@@ -0,0 +1,14 @@
+id: langbot-live-backend-gate
+title: "LangBot live backend reliability gate"
+description: "Live backend control-plane responsiveness and runtime log health checks for a locally running LangBot instance."
+type: reliability
+priority: p1
+tags:
+  - performance
+  - reliability
+  - live-backend
+  - metrics
+cases:
+  - langbot-live-backend-latency
+  - langbot-live-control-plane-api
+  - langbot-live-backend-log-health
diff --git a/skills/skills/langbot-testing/suites/langbot-performance-contract-gate.yaml b/skills/skills/langbot-testing/suites/langbot-performance-contract-gate.yaml
new file mode 100644
index 000000000..b5a9eb47f
--- /dev/null
+++ b/skills/skills/langbot-testing/suites/langbot-performance-contract-gate.yaml
@@ -0,0 +1,13 @@
+id: langbot-performance-contract-gate
+title: "LangBot performance contract gate"
+description: "Fast synthetic contract checks for performance metric accounting and non-destructive reliability fault taxonomy."
+type: contract
+priority: p1
+tags:
+  - performance
+  - reliability
+  - contract
+  - metrics
+cases:
+  - langbot-overhead-accounting-contract
+  - langbot-fault-taxonomy-contract
diff --git a/skills/skills/langbot-testing/suites/langbot-performance-reliability-gate.yaml b/skills/skills/langbot-testing/suites/langbot-performance-reliability-gate.yaml
new file mode 100644
index 000000000..1e0d58d26
--- /dev/null
+++ b/skills/skills/langbot-testing/suites/langbot-performance-reliability-gate.yaml
@@ -0,0 +1,16 @@
+id: langbot-performance-reliability-gate
+title: "LangBot performance and reliability starter gate"
+description: "Starter gate for LangBot performance accounting, live backend control-plane latency, and non-destructive fault taxonomy checks."
+type: reliability
+priority: p1
+tags:
+  - performance
+  - reliability
+  - metrics
+  - chaos
+cases:
+  - langbot-overhead-accounting-contract
+  - langbot-fault-taxonomy-contract
+  - langbot-live-backend-latency
+  - langbot-live-control-plane-api
+  - langbot-live-backend-log-health
diff --git a/skills/skills/langbot-testing/suites/langbot-user-path-performance-gate.yaml b/skills/skills/langbot-testing/suites/langbot-user-path-performance-gate.yaml
new file mode 100644
index 000000000..a6a138ec0
--- /dev/null
+++ b/skills/skills/langbot-testing/suites/langbot-user-path-performance-gate.yaml
@@ -0,0 +1,12 @@
+id: langbot-user-path-performance-gate
+title: "LangBot user-path performance gate"
+description: "Browser-visible performance checks for user-facing LangBot paths such as Pipeline Debug Chat."
+type: performance
+priority: p1
+tags:
+  - performance
+  - browser
+  - debug-chat
+  - user-path
+cases:
+  - pipeline-debug-chat-performance
diff --git a/skills/src/commands/env.ts b/skills/src/commands/env.ts
index d5d1eeaf4..76ef33aec 100644
--- a/skills/src/commands/env.ts
+++ b/skills/src/commands/env.ts
@@ -1,5 +1,7 @@
 import { existsSync } from "node:fs";
+import { spawnSync } from "node:child_process";
 import { Socket } from "node:net";
+import { join } from "node:path";
 import type { CommandContext } from "../types.ts";
 import { parseOptions } from "../cli.ts";
 import { loadEnv } from "../fs.ts";
@@ -88,6 +90,37 @@ function compareProxyPair(env: Record<string, string>, upper: string, lower: str
   return null;
 }
 
+/**
+ * Looks up a setting, letting the live process environment win over the loaded env map.
+ *
+ * @param env - Key/value map passed in by the caller.
+ * @param key - Variable name to look up.
+ * @returns The value from `process.env`, else from `env`, else an empty string.
+ */
+function envValue(env: Record<string, string>, key: string): string {
+  const fromProcess = process.env[key];
+  const resolved = fromProcess ?? env[key];
+  return resolved ?? "";
+}
+
+/**
+ * Finds the first proxy variable whose value starts with "socks" (case-insensitive).
+ *
+ * Variables are checked in this order: ALL_PROXY, all_proxy, HTTPS_PROXY,
+ * https_proxy, HTTP_PROXY, http_proxy.
+ *
+ * @param env - Key/value map passed to envValue() for each lookup.
+ * @returns The matching variable name and value, or null when none is a SOCKS proxy.
+ */
+function activeSocksProxy(env: Record<string, string>): { key: string; value: string } | null {
+  const proxyKeys = ["ALL_PROXY", "all_proxy", "HTTPS_PROXY", "https_proxy", "HTTP_PROXY", "http_proxy"];
+  const candidates = proxyKeys.map((key) => ({ key, value: envValue(env, key) }));
+  const match = candidates.find(({ value }) => /^socks/i.test(value));
+  return match ?? null;
+}
+
+/**
+ * Checks that the LangBot virtualenv can import `socksio` when a SOCKS proxy is configured.
+ *
+ * @param env - Key/value map; `LANGBOT_REPO` locates the `.venv/bin/python` interpreter.
+ * @returns null when no SOCKS proxy is configured or the import succeeds;
+ *   otherwise a message describing the problem.
+ */
+function checkSocksio(env: Record<string, string>): string | null {
+  const proxy = activeSocksProxy(env);
+  if (proxy === null) return null;
+
+  const repo = env.LANGBOT_REPO;
+  const python = repo ? join(repo, ".venv", "bin", "python") : "";
+  const pythonAvailable = python !== "" && existsSync(python);
+  if (!pythonAvailable) {
+    return `SOCKS proxy ${proxy.key} is configured (${redactEnvValue(proxy.key, proxy.value)}), but LangBot venv python was not found; after creating the venv, verify it can import socksio.`;
+  }
+
+  // A non-zero or missing exit status is treated as "socksio is not importable".
+  const { status } = spawnSync(python, ["-c", "import socksio"], {
+    encoding: "utf8",
+    timeout: 5000,
+  });
+  if (status !== 0) {
+    return `SOCKS proxy ${proxy.key} is configured (${redactEnvValue(proxy.key, proxy.value)}), but ${python} cannot import socksio; run \`${python} -m pip install socksio\` or start LangBot without SOCKS proxy env.`;
+  }
+  return null;
+}
+
 export async function commandEnvDoctor(ctx: CommandContext): Promise<number> {
   const env = loadEnv(ctx.root);
   const failures: string[] = [];
@@ -117,6 +150,8 @@ export async function commandEnvDoctor(ctx: CommandContext): Promise<number> {
   ]) {
     if (mismatch) failures.push(mismatch);
   }
+  const socksioFailure = checkSocksio(env);
+  if (socksioFailure) failures.push(socksioFailure);
 
   for (const [label, result] of await Promise.all([
     checkUrl("LANGBOT_BACKEND_URL", env.LANGBOT_BACKEND_URL).then((result) => ["LANGBOT_BACKEND_URL", result] as const),
diff --git a/skills/src/commands/suite.ts b/skills/src/commands/suite.ts
index 403156100..7ab556c5b 100644
--- a/skills/src/commands/suite.ts
+++ b/skills/src/commands/suite.ts
@@ -465,6 +465,41 @@ function outputTail(value: string | Buffer | null | undefined): string {
   return String(value ?? "").trim().slice(-4000);
 }
 
+// Exit code for a recorded result status: "pass" -> 0, blocked/env_issue/flaky -> 2, anything else -> 1.
+function exitStatusFromResultStatus(status: string): number {
+  if (status === "pass") return 0;
+  return ["blocked", "env_issue", "flaky"].includes(status) ? 2 : 1;
+}
+
+// Execution label for an exit code: 0 -> "ok", 2 -> "classified", every other code -> "nonzero".
+function executionStatusFromExitStatus(status: number): string {
+  if (status === 0) return "ok";
+  return status === 2 ? "classified" : "nonzero";
+}
+
+// Derive an execution record from <evidence_dir>/result.json, or null when the file is absent,
+// cannot be read or parsed, or does not carry this case's id, run id, and a string status.
+function executionFromCaseResultFile(caseItem: Record<string, unknown>): Record<string, unknown> | null {
+  const resultPath = join(String(caseItem.evidence_dir), "result.json");
+  if (!existsSync(resultPath)) return null;
+  try {
+    const parsed = JSON.parse(readFileSync(resultPath, "utf8")) as Record<string, unknown>;
+    const { case_id: caseId, run_id: runId, status: resultStatus, reason } = parsed;
+    const sameRun = caseId === caseItem.id && runId === caseItem.run_id;
+    if (!sameRun || typeof resultStatus !== "string") return null;
+    const exitStatus = exitStatusFromResultStatus(resultStatus);
+    return {
+      status: executionStatusFromExitStatus(exitStatus),
+      exit_status: exitStatus,
+      reason: typeof reason === "string" ? reason : "result.json completed",
+      result_status: resultStatus,
+      result_json: resultPath,
+    };
+  } catch {
+    return null;
+  }
+}
+
 function executionProblemStatus(executions: Array<Record<string, unknown>>): string {
   const statuses = executions.map((item) => String(item.status));
   if (statuses.includes("nonzero")) return "fail";
@@ -523,12 +558,18 @@ export function commandSuiteRun(ctx: CommandContext): number {
       encoding: "utf8",
       stdio: options.json === true ? "pipe" : "inherit",
     });
-    const status = result.error ? 1 : result.status ?? 1;
+    const fileExecution = result.error ? executionFromCaseResultFile(caseItem) : null;
+    const status = typeof fileExecution?.exit_status === "number"
+      ? fileExecution.exit_status
+      : result.error ? 1 : result.status ?? 1;
     executions.push({
       id: caseItem.id,
-      status: status === 0 ? "ok" : "nonzero",
+      status: fileExecution?.status ?? executionStatusFromExitStatus(status),
       exit_status: status,
-      reason: result.error?.message || "",
+      reason: fileExecution?.reason ?? result.error?.message ?? "",
+      result_status: fileExecution?.result_status,
+      result_json: fileExecution?.result_json,
+      spawn_error: fileExecution && result.error ? result.error.message : undefined,
       stdout: outputTail(result.stdout),
       stderr: outputTail(result.stderr),
     });
diff --git a/skills/src/commands/test.ts b/skills/src/commands/test.ts
index 2cce7a1e5..67ddc3122 100644
--- a/skills/src/commands/test.ts
+++ b/skills/src/commands/test.ts
@@ -271,7 +271,7 @@ function reportTemplate(mode: string): Record<string, string> {
       target_tested: "Probe target, endpoint, file, command, or service actually checked",
       execution_path: "automation script | shell command | direct API | other",
       probe_result: "What the probe observed",
-      logs_or_artifacts: "Log, filesystem, API, or other artifact paths collected",
+      metrics_or_artifacts: "Metrics, logs, filesystem artifacts, traces, or profiles collected",
       diagnostics: "Extra diagnostics used, if any",
       matched_troubleshooting: "Troubleshooting ids matched, if any",
       assets_to_update: "New case/reference/troubleshooting entries to add",
@@ -320,7 +320,7 @@ function manualEvidenceTemplate(mode: string): ManualEvidenceTemplate {
       target_tested: "TODO: probe target, endpoint, file, command, or service actually checked",
       execution_path: "TODO: automation script | shell command | direct API | other",
       probe_result: "TODO: observed probe result",
-      logs_or_artifacts: "TODO: evidence paths or skipped reason",
+      metrics_or_artifacts: "TODO: metrics, logs, filesystem artifacts, traces, or profiles collected",
       diagnostics: "TODO: additional diagnostics used, if any",
       matched_troubleshooting: "TODO: troubleshooting ids matched, if any",
       assets_to_update: "TODO: case/reference/troubleshooting updates to make",
@@ -1099,6 +1099,41 @@ function executionTail(value: string | Buffer | null | undefined): string {
   return String(value ?? "").trim().slice(-4000);
 }
 
+// Translate an automation result status into an exit code: pass -> 0, blocked/env_issue/flaky -> 2, else 1.
+function exitStatusFromResultStatus(status: string): number {
+  const classified = status === "blocked" || status === "env_issue" || status === "flaky";
+  return status === "pass" ? 0 : classified ? 2 : 1;
+}
+
+// Name the execution outcome for an exit code: "ok" for 0, "classified" for 2, otherwise "nonzero".
+function executionStatusFromExitStatus(status: number): string {
+  if (status === 2) return "classified";
+  return status === 0 ? "ok" : "nonzero";
+}
+
+// Summarize <evidenceDir>/automation-result.json; null unless it parses and matches caseId and runId.
+function executionFromAutomationResultFile(
+  evidenceDir: string, caseId: string, runId: string,
+): { status: string; exit_status: number; reason: string; result_status: string; path: string } | null {
+  const path = join(evidenceDir, "automation-result.json");
+  if (!existsSync(path)) return null;
+  try {
+    const payload = JSON.parse(readFileSync(path, "utf8")) as Record<string, unknown>;
+    const { case_id: fileCaseId, run_id: fileRunId, status: resultStatus, reason } = payload;
+    if (fileCaseId !== caseId || fileRunId !== runId || typeof resultStatus !== "string") return null;
+    const exitStatus = exitStatusFromResultStatus(resultStatus);
+    return {
+      status: executionStatusFromExitStatus(exitStatus),
+      exit_status: exitStatus,
+      reason: typeof reason === "string" ? reason : "automation-result.json completed",
+      result_status: resultStatus,
+      path,
+    };
+  } catch {
+    return null;
+  }
+}
+
 function runSetupAutomation(
   ctx: CommandContext,
   item: StructuredItem,
@@ -1224,6 +1259,30 @@ export function commandTestRun(ctx: CommandContext): number {
   });
 
   if (result.error) {
+    const fileExecution = executionFromAutomationResultFile(
+      run.automation.evidence_dir,
+      String(run.case.id),
+      run.run_id,
+    );
+    if (fileExecution) {
+      if (options.json !== true) {
+        console.error(`WARN: automation spawn reported an error, but ${fileExecution.path} completed: ${result.error.message}`);
+      }
+      if (options.json === true) {
+        console.log(JSON.stringify({
+          run,
+          setup_executions: setupExecutions,
+          automation_execution: {
+            ...fileExecution,
+            spawn_error: result.error.message,
+            stdout: executionTail(result.stdout),
+            stderr: executionTail(result.stderr),
+          },
+          exit_status: fileExecution.exit_status,
+        }, null, 2));
+      }
+      return fileExecution.exit_status;
+    }
     if (options.json !== true) console.error(`ERROR: failed to run automation: ${result.error.message}`);
     if (options.json === true) {
       console.log(JSON.stringify({
@@ -1247,7 +1306,7 @@ export function commandTestRun(ctx: CommandContext): number {
       run,
       setup_executions: setupExecutions,
       automation_execution: {
-        status: status === 0 ? "ok" : "nonzero",
+        status: executionStatusFromExitStatus(status),
         exit_status: status,
         stdout: executionTail(result.stdout),
         stderr: executionTail(result.stderr),
@@ -1311,6 +1370,7 @@ function renderMarkdownReport(report: TestReport): string {
   const environment = report.environment;
   const logGuard = report.log_guard;
   const troubleshooting = report.troubleshooting;
+  const automation = report.automation_result;
   const lines: string[] = [];
 
   lines.push(`# Test Report: ${reportCase.id}`);
@@ -1323,20 +1383,41 @@ function renderMarkdownReport(report: TestReport): string {
   lines.push(`Type: ${reportCase.type}`);
   lines.push("");
   lines.push("## Result");
-  lines.push(`- result: ${evidence.result}`);
-  for (const [key, value] of Object.entries(evidence)) {
-    if (key !== "result") lines.push(`- ${key}: ${value}`);
+  if (automation.status === "loaded" && automation.result) {
+    lines.push(`- result: ${automation.result}`);
+    if (automation.reason) lines.push(`- reason: ${automation.reason}`);
+    if (automation.url) lines.push(`- target_tested: ${automation.url}`);
+    if (automation.path) lines.push(`- automation_result: ${automation.path}`);
+    if (automation.artifacts) lines.push(`- artifacts: ${JSON.stringify(automation.artifacts)}`);
+  } else {
+    lines.push(`- result: ${evidence.result}`);
+    for (const [key, value] of Object.entries(evidence)) {
+      if (key !== "result") lines.push(`- ${key}: ${value}`);
+    }
   }
   lines.push("");
   lines.push("## Automation Result");
-  lines.push(`- status: ${report.automation_result.status}`);
-  if (report.automation_result.path) lines.push(`- path: ${report.automation_result.path}`);
-  if (report.automation_result.result) lines.push(`- result: ${report.automation_result.result}`);
-  if (report.automation_result.reason) lines.push(`- reason: ${report.automation_result.reason}`);
-  if (report.automation_result.started_at_local) lines.push(`- started_at_local: ${report.automation_result.started_at_local}`);
-  if (report.automation_result.finished_at_local) lines.push(`- finished_at_local: ${report.automation_result.finished_at_local}`);
-  if (report.automation_result.url) lines.push(`- url: ${report.automation_result.url}`);
-  if (report.automation_result.expected_text) lines.push(`- expected_text: ${report.automation_result.expected_text}`);
+  lines.push(`- status: ${automation.status}`);
+  if (automation.path) lines.push(`- path: ${automation.path}`);
+  if (automation.result) lines.push(`- result: ${automation.result}`);
+  if (automation.reason) lines.push(`- reason: ${automation.reason}`);
+  if (automation.duration_ms !== undefined) lines.push(`- duration_ms: ${automation.duration_ms}`);
+  if (automation.started_at_local) lines.push(`- started_at_local: ${automation.started_at_local}`);
+  if (automation.finished_at_local) lines.push(`- finished_at_local: ${automation.finished_at_local}`);
+  if (automation.url) lines.push(`- url: ${automation.url}`);
+  if (automation.expected_text) lines.push(`- expected_text: ${automation.expected_text}`);
+  if (automation.metrics_summary) {
+    lines.push("- metrics_summary:");
+    lines.push(`  ${JSON.stringify(automation.metrics_summary)}`);
+  }
+  if (automation.thresholds_summary) {
+    lines.push("- thresholds_summary:");
+    lines.push(`  ${JSON.stringify(automation.thresholds_summary)}`);
+  }
+  if (automation.artifacts) {
+    lines.push("- artifacts:");
+    lines.push(`  ${JSON.stringify(automation.artifacts)}`);
+  }
   lines.push("");
   lines.push("## Environment");
   for (const [key, value] of Object.entries(environment)) lines.push(`- ${key}=${value}`);
diff --git a/skills/src/commands/validate.ts b/skills/src/commands/validate.ts
index 8b15d6344..1c0ef945d 100644
--- a/skills/src/commands/validate.ts
+++ b/skills/src/commands/validate.ts
@@ -126,6 +126,9 @@ function validateCaseItem(root: string, item: StructuredItem, skillNames: Set<st
     ...validateEnvKeyScalar(item, "automation_pipeline_url_env"),
     ...validateEnvKeyScalar(item, "automation_pipeline_name_env"),
     ...validateJsonScalar(item, "automation_filesystem_checks_json"),
+    ...validateJsonScalar(item, "metrics_thresholds_json"),
+    ...validateJsonScalar(item, "load_profile_json"),
+    ...validateJsonScalar(item, "fault_model_json"),
     ...listValue(item.fields, "setup_automation").flatMap((entry) => (
       validateSetupAutomationEntry(root, entry, caseIds).map((error) => `${item.path}: ${error}`)
     )),
diff --git a/skills/src/constants.ts b/skills/src/constants.ts
index 015a9bd39..5cfe37f8a 100644
--- a/skills/src/constants.ts
+++ b/skills/src/constants.ts
@@ -9,7 +9,18 @@ export const requiredEnvKeys = [
 ];
 
 export const caseModeValues = ["agent-browser", "probe"];
-export const caseTypeValues = ["smoke", "regression", "feature", "provider", "exploratory"];
+export const caseTypeValues = [
+  "smoke",
+  "regression",
+  "feature",
+  "provider",
+  "exploratory",
+  "contract",
+  "performance",
+  "reliability",
+  "chaos",
+  "security",
+];
 export const casePriorityValues = ["p0", "p1", "p2"];
 export const caseRiskValues = ["low", "medium", "high"];
 export const caseEvidenceValues = [
@@ -21,10 +32,24 @@ export const caseEvidenceValues = [
   "frontend_log",
   "api_diagnostic",
   "filesystem",
+  "metrics",
+  "trace",
+  "profile",
+  "resource_log",
 ];
 export const testResultStatusValues = ["pass", "fail", "blocked", "env_issue", "flaky"];
 export const troubleshootingCategoryValues = ["product", "env_issue", "external_dependency", "blocked", "flaky"];
-export const suiteTypeValues = ["smoke", "regression", "release_gate", "exploratory"];
+export const suiteTypeValues = [
+  "smoke",
+  "regression",
+  "release_gate",
+  "exploratory",
+  "contract",
+  "performance",
+  "reliability",
+  "chaos",
+  "security",
+];
 export const suiteRequiredStrings = ["id", "title", "description", "type", "priority"];
 export const suiteRequiredLists = ["tags", "cases"];
 
diff --git a/skills/src/log-guard.ts b/skills/src/log-guard.ts
index 253cb229e..6f7f541a7 100644
--- a/skills/src/log-guard.ts
+++ b/skills/src/log-guard.ts
@@ -91,6 +91,7 @@ export type AutomationResultEvidence = {
   path?: string;
   result?: string;
   reason?: string;
+  duration_ms?: number;
   started_at?: string;
   started_at_local?: string;
   finished_at?: string;
@@ -98,6 +99,9 @@ export type AutomationResultEvidence = {
   url?: string;
   prompt?: string;
   expected_text?: string;
+  metrics_summary?: Record<string, unknown>;
+  thresholds_summary?: Record<string, unknown>;
+  artifacts?: Record<string, unknown>;
 };
 
 type MutableScanState = {
@@ -594,6 +598,18 @@ function stringField(data: Record<string, unknown>, key: string): string | undef
   return typeof value === "string" && value.trim() ? value : undefined;
 }
 
+// Read data[key] as a finite number; anything else (including NaN and ±Infinity) yields undefined.
+function numberField(data: Record<string, unknown>, key: string): number | undefined {
+  return Number.isFinite(data[key]) ? (data[key] as number) : undefined;
+}
+
+// Read data[key] as a non-null, non-array object; every other value yields undefined.
+function objectField(data: Record<string, unknown>, key: string): Record<string, unknown> | undefined {
+  const candidate = data[key];
+  if (candidate === null || typeof candidate !== "object" || Array.isArray(candidate)) return undefined;
+  return candidate as Record<string, unknown>;
+}
+
 function evidenceDirFromOptions(options: Record<string, string | boolean>): string | undefined {
   const explicit = typeof options["evidence-dir"] === "string" ? options["evidence-dir"] : undefined;
   if (explicit) return resolve(explicit);
@@ -628,6 +644,7 @@ export function readAutomationResultEvidence(options: Record<string, string | bo
       path: resultPath,
       result: stringField(result, "status"),
       reason: stringField(result, "reason"),
+      duration_ms: numberField(result, "duration_ms"),
       started_at: stringField(result, "started_at"),
       started_at_local: stringField(result, "started_at_local"),
       finished_at: stringField(result, "finished_at"),
@@ -635,6 +652,9 @@ export function readAutomationResultEvidence(options: Record<string, string | bo
       url: stringField(result, "url"),
       prompt: redactSecrets(stringField(result, "prompt") ?? ""),
       expected_text: stringField(result, "expected_text"),
+      metrics_summary: objectField(result, "metrics_summary"),
+      thresholds_summary: objectField(result, "thresholds_summary"),
+      artifacts: objectField(result, "artifacts"),
     };
   } catch (error) {
     return { status: "invalid", path: resultPath, reason: String(error) };
diff --git a/skills/src/readiness.ts b/skills/src/readiness.ts
index d72a09104..e96f889b4 100644
--- a/skills/src/readiness.ts
+++ b/skills/src/readiness.ts
@@ -114,6 +114,8 @@ export function automationEnvDefaults(item: StructuredItem, env: EnvSource = pro
     ["automation_expected_runner_id", "LANGBOT_E2E_EXPECTED_RUNNER_ID"],
     ["automation_reset_debug_chat", "LANGBOT_E2E_RESET_DEBUG_CHAT"],
     ["automation_debug_chat_session_type", "LANGBOT_E2E_DEBUG_CHAT_SESSION_TYPE"],
+    ["automation_debug_chat_response_p95_ms", "LANGBOT_E2E_DEBUG_CHAT_RESPONSE_P95_MS"],
+    ["automation_debug_chat_max_error_rate", "LANGBOT_E2E_DEBUG_CHAT_MAX_ERROR_RATE"],
     ["automation_filesystem_checks_json", "LANGBOT_E2E_FILESYSTEM_CHECKS_JSON"],
     ["automation_plugin_package", "LANGBOT_E2E_PLUGIN_PACKAGE"],
     ["automation_expected_plugin_id", "LANGBOT_E2E_EXPECTED_PLUGIN_ID"],
diff --git a/skills/test/lbs-cli.test.ts b/skills/test/lbs-cli.test.ts
index d0fac2ee6..66c0411f2 100644
--- a/skills/test/lbs-cli.test.ts
+++ b/skills/test/lbs-cli.test.ts
@@ -1,6 +1,6 @@
 import assert from "node:assert/strict";
 import { test } from "node:test";
-import { appendFileSync, existsSync, mkdtempSync, mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
+import { appendFileSync, chmodSync, existsSync, mkdtempSync, mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
 import { spawnSync } from "node:child_process";
 import { tmpdir } from "node:os";
 import { join } from "node:path";
@@ -676,6 +676,82 @@ test("suite run JSON captures failed case output", () => {
   }
 });
 
+test("suite run preserves classified env_issue automation results", () => {
+  const tmp = mkdtempSync(join(tmpdir(), "lbs-suite-run-env-issue-")); // throwaway workspace, removed in finally
+  try {
+    const skillDir = join(tmp, "skills", "langbot-testing");
+    const casesDir = join(skillDir, "cases");
+    const suitesDir = join(skillDir, "suites");
+    const scriptsDir = join(tmp, "scripts");
+    mkdirSync(casesDir, { recursive: true });
+    mkdirSync(suitesDir, { recursive: true });
+    mkdirSync(scriptsDir, { recursive: true });
+    writeFileSync(join(skillDir, "SKILL.md"), "---\nname: langbot-testing\ndescription: Testing.\n---\n\n# Testing\n");
+    writeFileSync(join(tmp, "skills", ".env"), ""); // empty env file
+    writeFileSync(
+      join(casesDir, "env-case.yaml"),
+      [
+        "id: env-case",
+        "title: Env Case",
+        "mode: probe",
+        "area: qa",
+        "type: smoke",
+        "priority: p2",
+        "risk: low",
+        "ci_eligible: true",
+        "automation: scripts/env-issue.mjs",
+        "evidence_required:",
+        "  - filesystem",
+      ].join("\n"),
+    );
+    writeFileSync(
+      join(suitesDir, "mini.yaml"),
+      [
+        "id: mini",
+        "title: Mini",
+        "description: Mini suite.",
+        "type: smoke",
+        "priority: p2",
+        "tags:",
+        "  - qa",
+        "cases:",
+        "  - env-case",
+      ].join("\n"),
+    );
+    writeFileSync(
+      join(scriptsDir, "env-issue.mjs"), // automation stub: writes env_issue result files, then exits 2
+      [
+        "import { mkdirSync, writeFileSync } from 'node:fs';",
+        "import { join } from 'node:path';",
+        "mkdirSync(process.env.LBS_EVIDENCE_DIR, { recursive: true });",
+        "const result = {",
+        "  case_id: process.env.LBS_CASE_ID,",
+        "  run_id: process.env.LBS_RUN_ID,",
+        "  status: 'env_issue',",
+        "  reason: 'backend not reachable',",
+        "  evidence_collected: ['filesystem']",
+        "};",
+        "writeFileSync(join(process.env.LBS_EVIDENCE_DIR, 'result.json'), JSON.stringify(result));",
+        "writeFileSync(join(process.env.LBS_EVIDENCE_DIR, 'automation-result.json'), JSON.stringify({ ...result, source: 'automation' }));",
+        "process.exit(2);",
+      ].join("\n"),
+    );
+
+    const result = capture(() => commandSuiteRun({
+      root: tmp,
+      args: ["suite", "run", "mini", "--run-id", "mini-run", "--evidence-dir", join(tmp, "evidence"), "--json"],
+    }));
+
+    assert.equal(result.code, 2); // the suite command itself must return 2 for this run
+    const payload = JSON.parse(result.output);
+    assert.equal(payload.executions[0].status, "classified"); // exit code 2 is labelled "classified", not "nonzero"
+    assert.equal(payload.report.status, "env_issue");
+    assert.equal(payload.report.execution_status, "ok");
+  } finally {
+    rmSync(tmp, { recursive: true, force: true });
+  }
+});
+
 test("suite run failure cannot be masked by stale pass result", () => {
   const tmp = mkdtempSync(join(tmpdir(), "lbs-suite-run-stale-pass-"));
   try {
@@ -1369,6 +1445,56 @@ test("env doctor does not require proxy variables", async () => {
   }
 });
 
+test("env doctor reports missing socksio for active SOCKS proxy", async () => {
+  const tmp = mkdtempSync(join(tmpdir(), "lbs-env-doctor-socksio-"));
+  const originalAllProxy = process.env.ALL_PROXY; // saved so finally can restore the caller's environment
+  const originalAllProxyLower = process.env.all_proxy;
+  try {
+    delete process.env.ALL_PROXY; // drop ambient values; envValue() prefers process.env over the .env file
+    delete process.env.all_proxy;
+    const skillsDir = join(tmp, "skills");
+    const repoDir = join(tmp, "LangBot");
+    const webDir = join(repoDir, "web");
+    const venvBin = join(repoDir, ".venv", "bin");
+    const browserProfile = join(tmp, "browser-profile");
+    const chromium = join(tmp, "chromium");
+    mkdirSync(skillsDir, { recursive: true });
+    mkdirSync(webDir, { recursive: true });
+    mkdirSync(venvBin, { recursive: true });
+    mkdirSync(browserProfile, { recursive: true });
+    writeFileSync(chromium, "");
+    const python = join(venvBin, "python");
+    writeFileSync(python, "#!/bin/sh\nexit 1\n"); // stub "python" that always exits 1, so the socksio import check fails
+    chmodSync(python, 0o755); // make the stub executable
+    writeFileSync(
+      join(skillsDir, ".env"),
+      [
+        "LANGBOT_BACKEND_URL=http://127.0.0.1:59996",
+        "LANGBOT_FRONTEND_URL=http://127.0.0.1:59996",
+        "LANGBOT_DEV_FRONTEND_URL=http://127.0.0.1:59996",
+        `LANGBOT_REPO=${repoDir}`,
+        `LANGBOT_WEB_REPO=${webDir}`,
+        `LANGBOT_BROWSER_PROFILE=${browserProfile}`,
+        `LANGBOT_CHROMIUM_EXECUTABLE=${chromium}`,
+        "ALL_PROXY=socks5://127.0.0.1:7890",
+      ].join("\n"),
+    );
+
+    const result = await captureAsync(() => commandEnvDoctor({ root: tmp, args: ["env", "doctor"] }));
+
+    assert.equal(result.code, 1); // doctor must exit with a failure
+    assert.match(result.output, /FAIL: SOCKS proxy ALL_PROXY is configured/);
+    assert.match(result.output, /cannot import socksio/);
+    assert.match(result.output, /-m pip install socksio/); // the remediation hint is part of the message
+  } finally {
+    if (originalAllProxy === undefined) delete process.env.ALL_PROXY;
+    else process.env.ALL_PROXY = originalAllProxy;
+    if (originalAllProxyLower === undefined) delete process.env.all_proxy;
+    else process.env.all_proxy = originalAllProxyLower;
+    rmSync(tmp, { recursive: true, force: true });
+  }
+});
+
 test("env show redacts secret-like values by default", () => {
   const tmp = mkdtempSync(join(tmpdir(), "lbs-env-show-redact-"));
   try {
@@ -2521,6 +2647,38 @@ test("test report renders a reusable evidence template", () => {
   assert.match(result.output, /no log files provided/);
 });
 
+test("test report promotes loaded automation evidence into result section", () => {
+  const tmp = mkdtempSync(join(tmpdir(), "lbs-report-automation-")); // doubles as the evidence dir passed to the report
+  try {
+    writeFileSync(
+      join(tmp, "automation-result.json"),
+      JSON.stringify({
+        status: "pass",
+        reason: "latency thresholds passed",
+        url: "http://127.0.0.1:5300",
+        artifacts: { metrics_json: join(tmp, "metrics.json") },
+      }),
+    );
+
+    const result = capture(() => commandTestReport(ctx([
+      "test",
+      "report",
+      "langbot-live-backend-latency",
+      "--evidence-dir",
+      tmp,
+      "--no-auto-log",
+    ])));
+
+    assert.equal(result.code, 0);
+    assert.match(result.output, /## Result\n- result: pass\n- reason: latency thresholds passed/); // automation status and reason lead the Result section
+    assert.match(result.output, /- target_tested: http:\/\/127\.0\.0\.1:5300/); // the automation url is rendered as target_tested
+    assert.doesNotMatch(result.output, /target_tested: TODO/); // no TODO placeholder for target_tested
+    assert.match(result.output, /## Automation Result/);
+  } finally {
+    rmSync(tmp, { recursive: true, force: true });
+  }
+});
+
 test("validate rejects dangling case references and missing automation scripts", () => {
   const tmp = mkdtempSync(join(tmpdir(), "lbs-validate-strict-"));
   try {