test(skills): clarify manual QA perf gates

2026-06-25 23:14:20 +00:00 · 2026-06-25 20:46:31 +08:00
parent 9b0f5b36f3
commit 9d877b41c2
8 changed files with 87 additions and 25 deletions
@@ -117,27 +117,35 @@ bin/lbs suite plan langbot-user-path-performance-gate
 bin/lbs suite run langbot-user-path-performance-gate --run-id langbot-user-path-local --include-manual-check
 ```

-Controlled Debug Chat message-path load gate:
+Controlled Debug Chat message-path load gate (manual/non-required; run fake-provider cases serially when they share `LANGBOT_FAKE_PROVIDER_URL`):

 ```bash
 bin/lbs suite plan langbot-debug-chat-load-gate
 bin/lbs test run langbot-fake-provider-debug-chat-load --run-id langbot-fake-load-local
 bin/lbs test run langbot-fake-provider-debug-chat-slow-load --run-id langbot-fake-slow-local
-bin/lbs test run langbot-fake-provider-debug-chat-cross-pipeline-isolation --run-id langbot-fake-cross-pipeline-local
 bin/lbs test run langbot-fake-provider-debug-chat-fault-recovery --run-id langbot-fake-fault-local
 bin/lbs test run langbot-space-debug-chat-concurrency-smoke --run-id langbot-space-smoke-local
 ```

+Cross-pipeline Debug Chat isolation is a separate manual regression gate because
+current releases may fail it due to product bug #2286:
+
+```bash
+bin/lbs suite plan langbot-debug-chat-isolation-gate
+bin/lbs suite run langbot-debug-chat-isolation-gate --run-id langbot-debug-chat-isolation-local --include-manual-check
+```
+
 Start with `langbot-fake-provider-debug-chat-load`. It launches a local
 OpenAI-compatible fake provider, creates the matching provider/model/pipeline,
 then sends concurrent WebSocket Debug Chat messages through the real backend.
 Use `langbot-fake-provider-debug-chat-slow-load` to measure the same path under
 deterministic streaming latency. Use
-`langbot-fake-provider-debug-chat-cross-pipeline-isolation` to verify that
-concurrent Debug Chat traffic on two pipelines does not leak assistant
-responses across pipeline boundaries. Use
 `langbot-fake-provider-debug-chat-fault-recovery` to inject bounded provider
-HTTP failures and confirm later Debug Chat requests recover.
+HTTP failures and confirm later Debug Chat requests recover. Use the separate
+`langbot-debug-chat-isolation-gate` to verify that concurrent Debug Chat traffic
+on two pipelines does not leak assistant responses across pipeline boundaries;
+current releases may fail that gate because of #2286, so keep it out of the
+normal load gate until the product fix lands.
 Use `langbot-space-debug-chat-concurrency-smoke` only as a low-volume live
 provider smoke; it includes Space/model/network latency and should be compared
 against the fake-provider baseline before attributing failures to LangBot.
@@ -14,8 +14,8 @@ import {
 } from "./lib/langbot-e2e.mjs";

 const caseId = "ensure-fake-provider-cross-pipelines";
-const DEFAULT_PIPELINE_A_NAME = "Agent QA Fake Provider Debug Chat A";
-const DEFAULT_PIPELINE_B_NAME = "Agent QA Fake Provider Debug Chat B";
+const DEFAULT_PIPELINE_A_NAME = "LangBot QA Fake Provider Debug Chat A";
+const DEFAULT_PIPELINE_B_NAME = "LangBot QA Fake Provider Debug Chat B";

 await loadEnvFiles();
 const paths = evidencePaths(caseId);
@@ -57,6 +57,8 @@ const result = {
 };

 try {
+  console.error(`[langbot-qa] configuring cross-pipeline QA fixtures: pipeline_a=\"${pipelineAName}\", pipeline_b=\"${pipelineBName}\"`);
+  console.error("[langbot-qa] run these fake-provider setup/probe commands serially when they share LANGBOT_FAKE_PROVIDER_URL.");
  if (pipelineAName === pipelineBName) {
    throw new Error("LANGBOT_FAKE_PROVIDER_PIPELINE_A_NAME and LANGBOT_FAKE_PROVIDER_PIPELINE_B_NAME must be different.");
  }
@@ -16,8 +16,9 @@ import {

 const RUNNER_ID = "local-agent";
 const DEFAULT_LOCAL_PASSWORD = "LangBotE2ELocalPass!2026";
-const DEFAULT_PIPELINE_NAME = "Agent QA Fake Provider Debug Chat";
+const DEFAULT_PIPELINE_NAME = "LangBot QA Fake Provider Debug Chat";
 const DEFAULT_PROVIDER_NAME = "LangBot QA Fake OpenAI Provider";
+const QA_RESOURCE_DESCRIPTION = "Managed by LangBot skills QA automation for controlled fake-provider Debug Chat tests. Safe to delete when local QA fixtures are no longer needed.";
 const DEFAULT_MODEL_NAME = "gpt-4o-mini";
 const DEFAULT_REQUESTER = "openai-chat-completions";

@@ -90,6 +91,8 @@ const result = {
 };

 try {
+  console.error(`[langbot-qa] configuring QA-owned fake-provider fixtures: provider=\"${providerName}\", pipeline=\"${pipelineName}\"`);
+  console.error("[langbot-qa] this setup may create or update local QA provider/model/pipeline resources on the selected backend.");
  if (!backendUrl) {
    result.status = "env_issue";
    throw new Error("LANGBOT_BACKEND_URL is not configured.");
@@ -488,7 +491,7 @@ async function ensurePipeline({ backendUrl, token, name, modelUuid }) {
      token,
      body: {
        name,
-        description: "Local QA pipeline for controlled fake-provider Debug Chat load tests.",
+        description: QA_RESOURCE_DESCRIPTION,
        emoji: "QA",
      },
    });
@@ -555,7 +558,7 @@ async function ensurePipeline({ backendUrl, token, name, modelUuid }) {
    token,
    body: {
      name,
-      description: "Local QA pipeline for controlled fake-provider Debug Chat load tests.",
+      description: QA_RESOURCE_DESCRIPTION,
      emoji: "QA",
      config: updatedConfig,
    },
@@ -1413,6 +1413,7 @@
      "suites": [
        "agent-runner-release-gate",
        "core-smoke",
+        "langbot-debug-chat-isolation-gate",
        "langbot-debug-chat-load-gate",
        "langbot-live-backend-gate",
        "langbot-performance-contract-gate",
@@ -1480,10 +1481,27 @@
            "local-agent-basic-debug-chat"
          ]
        },
+        {
+          "id": "langbot-debug-chat-isolation-gate",
+          "title": "LangBot Debug Chat isolation gate",
+          "description": "Manual/non-required cross-pipeline Debug Chat isolation gate. Current releases may fail this gate because of product bug #2286; use it as regression evidence after the routing fix lands.",
+          "type": "reliability",
+          "priority": "p1",
+          "tags": [
+            "reliability",
+            "debug-chat",
+            "websocket",
+            "isolation",
+            "concurrency"
+          ],
+          "cases": [
+            "langbot-fake-provider-debug-chat-cross-pipeline-isolation"
+          ]
+        },
        {
          "id": "langbot-debug-chat-load-gate",
          "title": "LangBot Debug Chat load gate",
-          "description": "Message-path load checks for Pipeline Debug Chat: controlled fake-provider baseline, slow-provider and fault-recovery profiles, plus optional real Space-provider smoke.",
+          "description": "Manual/non-required message-path load checks for Pipeline Debug Chat: controlled fake-provider baseline, slow-provider and fault-recovery profiles, plus optional real Space-provider smoke. Cross-pipeline isolation is split into langbot-debug-chat-isolation-gate because current releases may fail it due to product bug #2286.",
          "type": "performance",
          "priority": "p1",
          "tags": [
@@ -1495,7 +1513,6 @@
          "cases": [
            "langbot-fake-provider-debug-chat-load",
            "langbot-fake-provider-debug-chat-slow-load",
-            "langbot-fake-provider-debug-chat-cross-pipeline-isolation",
            "langbot-fake-provider-debug-chat-fault-recovery",
            "langbot-space-debug-chat-concurrency-smoke"
          ]
@@ -68,6 +68,9 @@ diagnostics:
  - "This probe targets Debug Chat isolation under concurrent traffic from two pipelines."
  - "It is designed to expose regressions where global pipeline state causes one pipeline's assistant response to be delivered to another pipeline's Debug Chat session."
  - "Same-pipeline foreign responses are tolerated because Debug Chat intentionally broadcasts within the same pipeline/session; cross-pipeline tokens are never tolerated."
+  - "Known product bug: current releases may fail this probe because Debug Chat replies can read singleton WebSocket proxy pipeline state after another pipeline overwrites it. See https://github.com/langbot-app/LangBot/issues/2286."
+expected_failures:
+  - "https://github.com/langbot-app/LangBot/issues/2286"
 success_patterns:
  - "Debug Chat cross-pipeline isolation probe passed"
 failure_patterns:
@@ -3,6 +3,12 @@
 Use this reference when a QA request asks whether LangBot is fast enough,
 stable under load, or resilient to controlled faults.

+These probes are manual/non-required QA gates unless a case or suite explicitly
+states otherwise. They depend on a live local backend, mutable QA fixtures, and
+operator-selected environment variables, so do not promote them to required CI
+checks until fake-provider isolation, ownership markers, and cleanup are in
+place.
+
 ## Scope

 Treat `skills/` as the QA control plane:
@@ -139,7 +145,12 @@ This is not a mocked backend test. It still exercises:

 The fake provider is deterministic and can inject controlled latency or faults
 with `LANGBOT_FAKE_PROVIDER_*` variables, so it is the baseline for LangBot
-message-path overhead. The probe uses unique expected response tokens per
+message-path overhead. A fake-provider process keeps process-global config,
+request counters, and recent request history; run fake-provider probes serially
+or give each run its own provider instance. Concurrent probes against the same
+fake-provider URL can reset or reconfigure each other's metrics.
+
+The probe uses unique expected response tokens per
 request because Debug Chat broadcasts messages to every connection in the same
 session; unique tokens prevent one connection from counting another
 connection's response as its own.
@@ -160,16 +171,19 @@ When the fake provider is used, reports also include provider-side timing in

 After the baseline passes, run `langbot-fake-provider-debug-chat-slow-load` to
 keep the same live backend path while injecting deterministic streaming latency.
-Run `langbot-fake-provider-debug-chat-cross-pipeline-isolation` to open
-concurrent Debug Chat connections against two fake-provider pipelines and fail
-if one pipeline receives the other pipeline's response token. This targets
-global pipeline-state regressions in the WebSocket Debug Chat path.
 Run `langbot-fake-provider-debug-chat-fault-recovery` to inject bounded HTTP
 provider failures and require both observed failures and later successful
 requests. The fault-recovery case is deliberately sequential because failed
 Debug Chat responses do not carry a unique success token that can be attributed
 to one concurrent connection.

+Run `langbot-fake-provider-debug-chat-cross-pipeline-isolation` separately via
+`langbot-debug-chat-isolation-gate`. Current LangBot releases may fail it because
+of product bug [#2286](https://github.com/langbot-app/LangBot/issues/2286), where
+Debug Chat replies can read singleton WebSocket proxy pipeline state after
+another pipeline overwrites it. Until the product fix lands, treat a failure as
+confirmation of that known bug rather than as a fake-provider latency finding;
+after the fix lands, use the gate as regression evidence.
+
 Use `langbot-space-debug-chat-concurrency-smoke` after the fake-provider
 baseline. It runs a deliberately small real Space-provider batch and reports
 user-visible latency, not pure LangBot overhead. Space/model/network failures
@@ -183,8 +197,8 @@ Useful commands:
 ```bash
 rtk bin/lbs test run langbot-fake-provider-debug-chat-load --run-id langbot-fake-load-local
 rtk bin/lbs test run langbot-fake-provider-debug-chat-slow-load --run-id langbot-fake-slow-local
-rtk bin/lbs test run langbot-fake-provider-debug-chat-cross-pipeline-isolation --run-id langbot-fake-cross-pipeline-local
 rtk bin/lbs test run langbot-fake-provider-debug-chat-fault-recovery --run-id langbot-fake-fault-local
+rtk bin/lbs suite run langbot-debug-chat-isolation-gate --run-id langbot-debug-chat-isolation-local --include-manual-check
 rtk bin/lbs test run langbot-space-debug-chat-concurrency-smoke --run-id langbot-space-smoke-local
 rtk bin/lbs suite run langbot-debug-chat-load-gate --run-id langbot-debug-chat-load-local --include-manual-check
 ```
@@ -202,10 +216,13 @@ Use the smallest gate that answers the quality question:
 - `langbot-user-path-performance-gate`: browser-visible user path performance,
  starting with Pipeline Debug Chat send-to-visible-completion latency. Run it
  only when the browser profile and target pipeline are ready.
-- `langbot-debug-chat-load-gate`: WebSocket Debug Chat load checks, starting
-  with controlled fake-provider baseline, slow-provider, cross-pipeline
-  isolation, and fault-recovery profiles, plus an optional low-volume real
-  Space-provider smoke.
+- `langbot-debug-chat-load-gate`: manual WebSocket Debug Chat load checks,
+  starting with controlled fake-provider baseline, slow-provider, and
+  fault-recovery profiles, plus an optional low-volume real Space-provider
+  smoke. Run fake-provider cases serially when they share a provider URL.
+- `langbot-debug-chat-isolation-gate`: manual cross-pipeline Debug Chat
+  isolation regression gate. Current releases may fail because of #2286; keep it
+  separate from the normal load gate until that product fix lands.
 - `langbot-performance-reliability-gate`: combined starter gate for synthetic
  contracts plus live backend checks.

@@ -0,0 +1,13 @@
+id: langbot-debug-chat-isolation-gate
+title: "LangBot Debug Chat isolation gate"
+description: "Manual/non-required cross-pipeline Debug Chat isolation gate. Current releases may fail this gate because of product bug #2286; use it as regression evidence after the routing fix lands."
+type: reliability
+priority: p1
+tags:
+  - reliability
+  - debug-chat
+  - websocket
+  - isolation
+  - concurrency
+cases:
+  - langbot-fake-provider-debug-chat-cross-pipeline-isolation
@@ -1,6 +1,6 @@
 id: langbot-debug-chat-load-gate
 title: "LangBot Debug Chat load gate"
-description: "Message-path load checks for Pipeline Debug Chat: controlled fake-provider baseline, slow-provider and fault-recovery profiles, plus optional real Space-provider smoke."
+description: "Manual/non-required message-path load checks for Pipeline Debug Chat: controlled fake-provider baseline, slow-provider and fault-recovery profiles, plus optional real Space-provider smoke. Cross-pipeline isolation is split into langbot-debug-chat-isolation-gate because current releases may fail it due to product bug #2286."
 type: performance
 priority: p1
 tags:
@@ -11,6 +11,5 @@ tags:
 cases:
  - langbot-fake-provider-debug-chat-load
  - langbot-fake-provider-debug-chat-slow-load
-  - langbot-fake-provider-debug-chat-cross-pipeline-isolation
  - langbot-fake-provider-debug-chat-fault-recovery
  - langbot-space-debug-chat-concurrency-smoke