test(skills): clarify manual QA perf gates

This commit is contained in:
huanghuoguoguo
2026-06-25 20:46:31 +08:00
parent 9b0f5b36f3
commit 9d877b41c2
8 changed files with 87 additions and 25 deletions
+14 -6
View File
@@ -117,27 +117,35 @@ bin/lbs suite plan langbot-user-path-performance-gate
bin/lbs suite run langbot-user-path-performance-gate --run-id langbot-user-path-local --include-manual-check
```
Controlled Debug Chat message-path load gate:
Controlled Debug Chat message-path load gate (manual/non-required; run fake-provider cases serially when they share `LANGBOT_FAKE_PROVIDER_URL`):
```bash
bin/lbs suite plan langbot-debug-chat-load-gate
bin/lbs test run langbot-fake-provider-debug-chat-load --run-id langbot-fake-load-local
bin/lbs test run langbot-fake-provider-debug-chat-slow-load --run-id langbot-fake-slow-local
bin/lbs test run langbot-fake-provider-debug-chat-cross-pipeline-isolation --run-id langbot-fake-cross-pipeline-local
bin/lbs test run langbot-fake-provider-debug-chat-fault-recovery --run-id langbot-fake-fault-local
bin/lbs test run langbot-space-debug-chat-concurrency-smoke --run-id langbot-space-smoke-local
```
Cross-pipeline Debug Chat isolation is a separate manual regression gate because
current releases may fail it due to product bug #2286:
```bash
bin/lbs suite plan langbot-debug-chat-isolation-gate
bin/lbs suite run langbot-debug-chat-isolation-gate --run-id langbot-debug-chat-isolation-local --include-manual-check
```
Start with `langbot-fake-provider-debug-chat-load`. It launches a local
OpenAI-compatible fake provider, creates the matching provider/model/pipeline,
then sends concurrent WebSocket Debug Chat messages through the real backend.
Use `langbot-fake-provider-debug-chat-slow-load` to measure the same path under
deterministic streaming latency. Use
`langbot-fake-provider-debug-chat-cross-pipeline-isolation` to verify that
concurrent Debug Chat traffic on two pipelines does not leak assistant
responses across pipeline boundaries. Use
`langbot-fake-provider-debug-chat-fault-recovery` to inject bounded provider
HTTP failures and confirm later Debug Chat requests recover.
HTTP failures and confirm later Debug Chat requests recover. Use the separate
`langbot-debug-chat-isolation-gate` to verify that concurrent Debug Chat traffic
on two pipelines does not leak assistant responses across pipeline boundaries;
current releases may fail that gate because of #2286, so keep it out of the
normal load gate until the product fix lands.
Use `langbot-space-debug-chat-concurrency-smoke` only as a low-volume
live-provider smoke test; its results include Space/model/network latency and
should be compared against the fake-provider baseline before attributing
failures to LangBot.
@@ -14,8 +14,8 @@ import {
} from "./lib/langbot-e2e.mjs";
const caseId = "ensure-fake-provider-cross-pipelines";
const DEFAULT_PIPELINE_A_NAME = "Agent QA Fake Provider Debug Chat A";
const DEFAULT_PIPELINE_B_NAME = "Agent QA Fake Provider Debug Chat B";
const DEFAULT_PIPELINE_A_NAME = "LangBot QA Fake Provider Debug Chat A";
const DEFAULT_PIPELINE_B_NAME = "LangBot QA Fake Provider Debug Chat B";
await loadEnvFiles();
const paths = evidencePaths(caseId);
@@ -57,6 +57,8 @@ const result = {
};
try {
console.error(`[langbot-qa] configuring cross-pipeline QA fixtures: pipeline_a=\"${pipelineAName}\", pipeline_b=\"${pipelineBName}\"`);
console.error("[langbot-qa] run these fake-provider setup/probe commands serially when they share LANGBOT_FAKE_PROVIDER_URL.");
if (pipelineAName === pipelineBName) {
throw new Error("LANGBOT_FAKE_PROVIDER_PIPELINE_A_NAME and LANGBOT_FAKE_PROVIDER_PIPELINE_B_NAME must be different.");
}
@@ -16,8 +16,9 @@ import {
const RUNNER_ID = "local-agent";
const DEFAULT_LOCAL_PASSWORD = "LangBotE2ELocalPass!2026";
const DEFAULT_PIPELINE_NAME = "Agent QA Fake Provider Debug Chat";
const DEFAULT_PIPELINE_NAME = "LangBot QA Fake Provider Debug Chat";
const DEFAULT_PROVIDER_NAME = "LangBot QA Fake OpenAI Provider";
const QA_RESOURCE_DESCRIPTION = "Managed by LangBot skills QA automation for controlled fake-provider Debug Chat tests. Safe to delete when local QA fixtures are no longer needed.";
const DEFAULT_MODEL_NAME = "gpt-4o-mini";
const DEFAULT_REQUESTER = "openai-chat-completions";
@@ -90,6 +91,8 @@ const result = {
};
try {
console.error(`[langbot-qa] configuring QA-owned fake-provider fixtures: provider=\"${providerName}\", pipeline=\"${pipelineName}\"`);
console.error("[langbot-qa] this setup may create or update local QA provider/model/pipeline resources on the selected backend.");
if (!backendUrl) {
result.status = "env_issue";
throw new Error("LANGBOT_BACKEND_URL is not configured.");
@@ -488,7 +491,7 @@ async function ensurePipeline({ backendUrl, token, name, modelUuid }) {
token,
body: {
name,
description: "Local QA pipeline for controlled fake-provider Debug Chat load tests.",
description: QA_RESOURCE_DESCRIPTION,
emoji: "QA",
},
});
@@ -555,7 +558,7 @@ async function ensurePipeline({ backendUrl, token, name, modelUuid }) {
token,
body: {
name,
description: "Local QA pipeline for controlled fake-provider Debug Chat load tests.",
description: QA_RESOURCE_DESCRIPTION,
emoji: "QA",
config: updatedConfig,
},
+19 -2
View File
@@ -1413,6 +1413,7 @@
"suites": [
"agent-runner-release-gate",
"core-smoke",
"langbot-debug-chat-isolation-gate",
"langbot-debug-chat-load-gate",
"langbot-live-backend-gate",
"langbot-performance-contract-gate",
@@ -1480,10 +1481,27 @@
"local-agent-basic-debug-chat"
]
},
{
"id": "langbot-debug-chat-isolation-gate",
"title": "LangBot Debug Chat isolation gate",
"description": "Manual/non-required cross-pipeline Debug Chat isolation gate. Current releases may fail this gate because of product bug #2286; use it as regression evidence after the routing fix lands.",
"type": "reliability",
"priority": "p1",
"tags": [
"reliability",
"debug-chat",
"websocket",
"isolation",
"concurrency"
],
"cases": [
"langbot-fake-provider-debug-chat-cross-pipeline-isolation"
]
},
{
"id": "langbot-debug-chat-load-gate",
"title": "LangBot Debug Chat load gate",
"description": "Message-path load checks for Pipeline Debug Chat: controlled fake-provider baseline, slow-provider and fault-recovery profiles, plus optional real Space-provider smoke.",
"description": "Manual/non-required message-path load checks for Pipeline Debug Chat: controlled fake-provider baseline, slow-provider and fault-recovery profiles, plus optional real Space-provider smoke. Cross-pipeline isolation is split into langbot-debug-chat-isolation-gate because current releases may fail it due to product bug #2286.",
"type": "performance",
"priority": "p1",
"tags": [
@@ -1495,7 +1513,6 @@
"cases": [
"langbot-fake-provider-debug-chat-load",
"langbot-fake-provider-debug-chat-slow-load",
"langbot-fake-provider-debug-chat-cross-pipeline-isolation",
"langbot-fake-provider-debug-chat-fault-recovery",
"langbot-space-debug-chat-concurrency-smoke"
]
@@ -68,6 +68,9 @@ diagnostics:
- "This probe targets Debug Chat isolation under concurrent traffic from two pipelines."
- "It is designed to expose regressions where global pipeline state causes one pipeline's assistant response to be delivered to another pipeline's Debug Chat session."
- "Same-pipeline foreign responses are tolerated because Debug Chat intentionally broadcasts within the same pipeline/session; cross-pipeline tokens are never tolerated."
- "Known product bug: current releases may fail this probe because Debug Chat replies can read singleton WebSocket proxy pipeline state after another pipeline overwrites it. See https://github.com/langbot-app/LangBot/issues/2286."
expected_failures:
- "https://github.com/langbot-app/LangBot/issues/2286"
success_patterns:
- "Debug Chat cross-pipeline isolation probe passed"
failure_patterns:
@@ -3,6 +3,12 @@
Use this reference when a QA request asks whether LangBot is fast enough,
stable under load, or resilient to controlled faults.
These probes are manual/non-required QA gates unless a case or suite explicitly
states otherwise. They depend on a live local backend, mutable QA fixtures, and
operator-selected environment variables, so do not promote them to required CI
checks until fake-provider isolation, ownership markers, and cleanup are in
place.
## Scope
Treat `skills/` as the QA control plane:
@@ -139,7 +145,12 @@ This is not a mocked backend test. It still exercises:
The fake provider is deterministic and can inject controlled latency or faults
with `LANGBOT_FAKE_PROVIDER_*` variables, so it is the baseline for LangBot
message-path overhead. The probe uses unique expected response tokens per
message-path overhead. A fake-provider process keeps process-global config,
request counters, and recent request history, so run fake-provider probes
serially or give each run its own provider instance. Concurrent probes against
the same fake-provider URL can overwrite each other's configuration and reset
each other's request counters and history.
The probe uses unique expected response tokens per
request because Debug Chat broadcasts messages to every connection in the same
session; unique tokens prevent one connection from counting another
connection's response as its own.
@@ -160,16 +171,19 @@ When the fake provider is used, reports also include provider-side timing in
After the baseline passes, run `langbot-fake-provider-debug-chat-slow-load` to
keep the same live backend path while injecting deterministic streaming latency.
Run `langbot-fake-provider-debug-chat-cross-pipeline-isolation` to open
concurrent Debug Chat connections against two fake-provider pipelines and fail
if one pipeline receives the other pipeline's response token. This targets
global pipeline-state regressions in the WebSocket Debug Chat path.
Run `langbot-fake-provider-debug-chat-fault-recovery` to inject bounded HTTP
provider failures and require both observed failures and later successful
requests. The fault-recovery case is deliberately sequential because failed
Debug Chat responses do not carry a unique success token that can be attributed
to one concurrent connection.
Run `langbot-fake-provider-debug-chat-cross-pipeline-isolation` separately via
`langbot-debug-chat-isolation-gate`. Current LangBot releases may fail it because
of product bug [#2286](https://github.com/langbot-app/LangBot/issues/2286), where
Debug Chat replies can read singleton WebSocket proxy pipeline state after a
later message from another pipeline overwrites it. Treat that failure as
evidence of the known product bug, and as a regression check once the fix
lands, rather than as a fake-provider latency finding.
Use `langbot-space-debug-chat-concurrency-smoke` after the fake-provider
baseline. It runs a deliberately small real Space-provider batch and reports
user-visible latency, not pure LangBot overhead. Space/model/network failures
@@ -183,8 +197,8 @@ Useful commands:
```bash
rtk bin/lbs test run langbot-fake-provider-debug-chat-load --run-id langbot-fake-load-local
rtk bin/lbs test run langbot-fake-provider-debug-chat-slow-load --run-id langbot-fake-slow-local
rtk bin/lbs test run langbot-fake-provider-debug-chat-cross-pipeline-isolation --run-id langbot-fake-cross-pipeline-local
rtk bin/lbs test run langbot-fake-provider-debug-chat-fault-recovery --run-id langbot-fake-fault-local
rtk bin/lbs suite run langbot-debug-chat-isolation-gate --run-id langbot-debug-chat-isolation-local --include-manual-check
rtk bin/lbs test run langbot-space-debug-chat-concurrency-smoke --run-id langbot-space-smoke-local
rtk bin/lbs suite run langbot-debug-chat-load-gate --run-id langbot-debug-chat-load-local --include-manual-check
```
@@ -202,10 +216,13 @@ Use the smallest gate that answers the quality question:
- `langbot-user-path-performance-gate`: browser-visible user path performance,
starting with Pipeline Debug Chat send-to-visible-completion latency. Run it
only when the browser profile and target pipeline are ready.
- `langbot-debug-chat-load-gate`: WebSocket Debug Chat load checks, starting
with controlled fake-provider baseline, slow-provider, cross-pipeline
isolation, and fault-recovery profiles, plus an optional low-volume real
Space-provider smoke.
- `langbot-debug-chat-load-gate`: manual WebSocket Debug Chat load checks,
starting with controlled fake-provider baseline, slow-provider, and
fault-recovery profiles, plus an optional low-volume real Space-provider
smoke. Run fake-provider cases serially when they share a provider URL.
- `langbot-debug-chat-isolation-gate`: manual cross-pipeline Debug Chat
isolation regression gate. Current releases may fail because of #2286; keep it
separate from the normal load gate until that product fix lands.
- `langbot-performance-reliability-gate`: combined starter gate for synthetic
contracts plus live backend checks.
@@ -0,0 +1,13 @@
id: langbot-debug-chat-isolation-gate
title: "LangBot Debug Chat isolation gate"
description: "Manual/non-required cross-pipeline Debug Chat isolation gate. Current releases may fail this gate because of product bug #2286; use it as regression evidence after the routing fix lands."
type: reliability
priority: p1
tags:
- reliability
- debug-chat
- websocket
- isolation
- concurrency
cases:
- langbot-fake-provider-debug-chat-cross-pipeline-isolation
@@ -1,6 +1,6 @@
id: langbot-debug-chat-load-gate
title: "LangBot Debug Chat load gate"
description: "Message-path load checks for Pipeline Debug Chat: controlled fake-provider baseline, slow-provider and fault-recovery profiles, plus optional real Space-provider smoke."
description: "Manual/non-required message-path load checks for Pipeline Debug Chat: controlled fake-provider baseline, slow-provider and fault-recovery profiles, plus optional real Space-provider smoke. Cross-pipeline isolation is split into langbot-debug-chat-isolation-gate because current releases may fail it due to product bug #2286."
type: performance
priority: p1
tags:
@@ -11,6 +11,5 @@ tags:
cases:
- langbot-fake-provider-debug-chat-load
- langbot-fake-provider-debug-chat-slow-load
- langbot-fake-provider-debug-chat-cross-pipeline-isolation
- langbot-fake-provider-debug-chat-fault-recovery
- langbot-space-debug-chat-concurrency-smoke