diff --git a/skills/docs/user-guide.md b/skills/docs/user-guide.md index ae39ef294..124d3af36 100644 --- a/skills/docs/user-guide.md +++ b/skills/docs/user-guide.md @@ -117,27 +117,35 @@ bin/lbs suite plan langbot-user-path-performance-gate bin/lbs suite run langbot-user-path-performance-gate --run-id langbot-user-path-local --include-manual-check ``` -Controlled Debug Chat message-path load gate: +Controlled Debug Chat message-path load gate (manual/non-required; run fake-provider cases serially when they share `LANGBOT_FAKE_PROVIDER_URL`): ```bash bin/lbs suite plan langbot-debug-chat-load-gate bin/lbs test run langbot-fake-provider-debug-chat-load --run-id langbot-fake-load-local bin/lbs test run langbot-fake-provider-debug-chat-slow-load --run-id langbot-fake-slow-local -bin/lbs test run langbot-fake-provider-debug-chat-cross-pipeline-isolation --run-id langbot-fake-cross-pipeline-local bin/lbs test run langbot-fake-provider-debug-chat-fault-recovery --run-id langbot-fake-fault-local bin/lbs test run langbot-space-debug-chat-concurrency-smoke --run-id langbot-space-smoke-local ``` +Cross-pipeline Debug Chat isolation is a separate manual regression gate because +current releases may fail it due to product bug #2286: + +```bash +bin/lbs suite plan langbot-debug-chat-isolation-gate +bin/lbs suite run langbot-debug-chat-isolation-gate --run-id langbot-debug-chat-isolation-local --include-manual-check +``` + Start with `langbot-fake-provider-debug-chat-load`. It launches a local OpenAI-compatible fake provider, creates the matching provider/model/pipeline, then sends concurrent WebSocket Debug Chat messages through the real backend. Use `langbot-fake-provider-debug-chat-slow-load` to measure the same path under deterministic streaming latency. Use -`langbot-fake-provider-debug-chat-cross-pipeline-isolation` to verify that -concurrent Debug Chat traffic on two pipelines does not leak assistant -responses across pipeline boundaries. Use `langbot-fake-provider-debug-chat-fault-recovery` to inject bounded provider -HTTP failures and confirm later Debug Chat requests recover. +HTTP failures and confirm later Debug Chat requests recover. Use the separate +`langbot-debug-chat-isolation-gate` to verify that concurrent Debug Chat traffic +on two pipelines does not leak assistant responses across pipeline boundaries; +current releases may fail that gate because of #2286, so keep it out of the +normal load gate until the product fix lands. Use `langbot-space-debug-chat-concurrency-smoke` only as a low-volume live provider smoke; it includes Space/model/network latency and should be compared against the fake-provider baseline before attributing failures to LangBot. diff --git a/skills/scripts/e2e/ensure-fake-provider-cross-pipelines.mjs b/skills/scripts/e2e/ensure-fake-provider-cross-pipelines.mjs index 9d2ff295b..592a7b7f9 100644 --- a/skills/scripts/e2e/ensure-fake-provider-cross-pipelines.mjs +++ b/skills/scripts/e2e/ensure-fake-provider-cross-pipelines.mjs @@ -14,8 +14,8 @@ import { } from "./lib/langbot-e2e.mjs"; const caseId = "ensure-fake-provider-cross-pipelines"; -const DEFAULT_PIPELINE_A_NAME = "Agent QA Fake Provider Debug Chat A"; -const DEFAULT_PIPELINE_B_NAME = "Agent QA Fake Provider Debug Chat B"; +const DEFAULT_PIPELINE_A_NAME = "LangBot QA Fake Provider Debug Chat A"; +const DEFAULT_PIPELINE_B_NAME = "LangBot QA Fake Provider Debug Chat B"; await loadEnvFiles(); const paths = evidencePaths(caseId); @@ -57,6 +57,8 @@ const result = { }; try { + console.error(`[langbot-qa] configuring cross-pipeline QA fixtures: pipeline_a=\"${pipelineAName}\", pipeline_b=\"${pipelineBName}\"`); + console.error("[langbot-qa] run these fake-provider setup/probe commands serially when they share LANGBOT_FAKE_PROVIDER_URL."); if (pipelineAName === pipelineBName) { throw new Error("LANGBOT_FAKE_PROVIDER_PIPELINE_A_NAME and LANGBOT_FAKE_PROVIDER_PIPELINE_B_NAME must be different."); } diff --git a/skills/scripts/e2e/ensure-fake-provider-pipeline.mjs b/skills/scripts/e2e/ensure-fake-provider-pipeline.mjs index 9b709be0a..73f2465fd 100644 --- a/skills/scripts/e2e/ensure-fake-provider-pipeline.mjs +++ b/skills/scripts/e2e/ensure-fake-provider-pipeline.mjs @@ -16,8 +16,9 @@ import { const RUNNER_ID = "local-agent"; const DEFAULT_LOCAL_PASSWORD = "LangBotE2ELocalPass!2026"; -const DEFAULT_PIPELINE_NAME = "Agent QA Fake Provider Debug Chat"; +const DEFAULT_PIPELINE_NAME = "LangBot QA Fake Provider Debug Chat"; const DEFAULT_PROVIDER_NAME = "LangBot QA Fake OpenAI Provider"; +const QA_RESOURCE_DESCRIPTION = "Managed by LangBot skills QA automation for controlled fake-provider Debug Chat tests. Safe to delete when local QA fixtures are no longer needed."; const DEFAULT_MODEL_NAME = "gpt-4o-mini"; const DEFAULT_REQUESTER = "openai-chat-completions"; @@ -90,6 +91,8 @@ const result = { }; try { + console.error(`[langbot-qa] configuring QA-owned fake-provider fixtures: provider=\"${providerName}\", pipeline=\"${pipelineName}\"`); + console.error("[langbot-qa] this setup may create or update local QA provider/model/pipeline resources on the selected backend."); if (!backendUrl) { result.status = "env_issue"; throw new Error("LANGBOT_BACKEND_URL is not configured."); @@ -488,7 +491,7 @@ async function ensurePipeline({ backendUrl, token, name, modelUuid }) { token, body: { name, - description: "Local QA pipeline for controlled fake-provider Debug Chat load tests.", + description: QA_RESOURCE_DESCRIPTION, emoji: "QA", }, }); @@ -555,7 +558,7 @@ async function ensurePipeline({ backendUrl, token, name, modelUuid }) { token, body: { name, - description: "Local QA pipeline for controlled fake-provider Debug Chat load tests.", + description: QA_RESOURCE_DESCRIPTION, emoji: "QA", config: updatedConfig, }, diff --git a/skills/skills.index.json b/skills/skills.index.json index 39acc97f1..640996adc 100644 --- a/skills/skills.index.json +++ b/skills/skills.index.json @@ -1413,6 +1413,7 @@ "suites": [ "agent-runner-release-gate", "core-smoke", + "langbot-debug-chat-isolation-gate", "langbot-debug-chat-load-gate", "langbot-live-backend-gate", "langbot-performance-contract-gate", @@ -1480,10 +1481,27 @@ "local-agent-basic-debug-chat" ] }, + { + "id": "langbot-debug-chat-isolation-gate", + "title": "LangBot Debug Chat isolation gate", + "description": "Manual/non-required cross-pipeline Debug Chat isolation gate. Current releases may fail this gate because of product bug #2286; use it as regression evidence after the routing fix lands.", + "type": "reliability", + "priority": "p1", + "tags": [ + "reliability", + "debug-chat", + "websocket", + "isolation", + "concurrency" + ], + "cases": [ + "langbot-fake-provider-debug-chat-cross-pipeline-isolation" + ] + }, { "id": "langbot-debug-chat-load-gate", "title": "LangBot Debug Chat load gate", - "description": "Message-path load checks for Pipeline Debug Chat: controlled fake-provider baseline, slow-provider and fault-recovery profiles, plus optional real Space-provider smoke.", + "description": "Manual/non-required message-path load checks for Pipeline Debug Chat: controlled fake-provider baseline, slow-provider and fault-recovery profiles, plus optional real Space-provider smoke. Cross-pipeline isolation is split into langbot-debug-chat-isolation-gate because current releases may fail it due to product bug #2286.", "type": "performance", "priority": "p1", "tags": [ @@ -1495,7 +1513,6 @@ "cases": [ "langbot-fake-provider-debug-chat-load", "langbot-fake-provider-debug-chat-slow-load", - "langbot-fake-provider-debug-chat-cross-pipeline-isolation", "langbot-fake-provider-debug-chat-fault-recovery", "langbot-space-debug-chat-concurrency-smoke" ] diff --git a/skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-cross-pipeline-isolation.yaml b/skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-cross-pipeline-isolation.yaml index acf1d7f0f..9e8e09af0 100644 --- a/skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-cross-pipeline-isolation.yaml +++ b/skills/skills/langbot-testing/cases/langbot-fake-provider-debug-chat-cross-pipeline-isolation.yaml @@ -68,6 +68,9 @@ diagnostics: - "This probe targets Debug Chat isolation under concurrent traffic from two pipelines." - "It is designed to expose regressions where global pipeline state causes one pipeline's assistant response to be delivered to another pipeline's Debug Chat session." - "Same-pipeline foreign responses are tolerated because Debug Chat intentionally broadcasts within the same pipeline/session; cross-pipeline tokens are never tolerated." + - "Known product bug: current releases may fail this probe because Debug Chat replies can read singleton WebSocket proxy pipeline state after another pipeline overwrites it. See https://github.com/langbot-app/LangBot/issues/2286." +expected_failures: + - "https://github.com/langbot-app/LangBot/issues/2286" success_patterns: - "Debug Chat cross-pipeline isolation probe passed" failure_patterns: diff --git a/skills/skills/langbot-testing/references/performance-reliability-testing.md b/skills/skills/langbot-testing/references/performance-reliability-testing.md index 8bbaa2fb4..42aaa0467 100644 --- a/skills/skills/langbot-testing/references/performance-reliability-testing.md +++ b/skills/skills/langbot-testing/references/performance-reliability-testing.md @@ -3,6 +3,12 @@ Use this reference when a QA request asks whether LangBot is fast enough, stable under load, or resilient to controlled faults. +These probes are manual/non-required QA gates unless a case or suite explicitly +states otherwise. They depend on a live local backend, mutable QA fixtures, and +operator-selected environment variables, so do not promote them to required CI +checks until fake-provider isolation, ownership markers, and cleanup are in +place. + ## Scope Treat `skills/` as the QA control plane: @@ -139,7 +145,12 @@ This is not a mocked backend test. It still exercises: The fake provider is deterministic and can inject controlled latency or faults with `LANGBOT_FAKE_PROVIDER_*` variables, so it is the baseline for LangBot -message-path overhead. The probe uses unique expected response tokens per +message-path overhead. A fake-provider process keeps process-global config, +request counters, and recent request history; run fake-provider probes serially +or give each run its own provider instance. Concurrent probes against the same +fake-provider URL can reset or reconfigure each other's metrics. + +The probe uses unique expected response tokens per request because Debug Chat broadcasts messages to every connection in the same session; unique tokens prevent one connection from counting another connection's response as its own. @@ -160,16 +171,19 @@ When the fake provider is used, reports also include provider-side timing in After the baseline passes, run `langbot-fake-provider-debug-chat-slow-load` to keep the same live backend path while injecting deterministic streaming latency. -Run `langbot-fake-provider-debug-chat-cross-pipeline-isolation` to open -concurrent Debug Chat connections against two fake-provider pipelines and fail -if one pipeline receives the other pipeline's response token. This targets -global pipeline-state regressions in the WebSocket Debug Chat path. Run `langbot-fake-provider-debug-chat-fault-recovery` to inject bounded HTTP provider failures and require both observed failures and later successful requests. The fault-recovery case is deliberately sequential because failed Debug Chat responses do not carry a unique success token that can be attributed to one concurrent connection. +Run `langbot-fake-provider-debug-chat-cross-pipeline-isolation` separately via +`langbot-debug-chat-isolation-gate`. Current LangBot releases may fail it because +of product bug [#2286](https://github.com/langbot-app/LangBot/issues/2286), where +Debug Chat replies can read singleton WebSocket proxy pipeline state after a +later message overwrites it. Treat that failure as regression evidence for the +product fix rather than as a fake-provider latency finding. + Use `langbot-space-debug-chat-concurrency-smoke` after the fake-provider baseline. It runs a deliberately small real Space-provider batch and reports user-visible latency, not pure LangBot overhead. Space/model/network failures @@ -183,8 +197,8 @@ Useful commands: ```bash rtk bin/lbs test run langbot-fake-provider-debug-chat-load --run-id langbot-fake-load-local rtk bin/lbs test run langbot-fake-provider-debug-chat-slow-load --run-id langbot-fake-slow-local -rtk bin/lbs test run langbot-fake-provider-debug-chat-cross-pipeline-isolation --run-id langbot-fake-cross-pipeline-local rtk bin/lbs test run langbot-fake-provider-debug-chat-fault-recovery --run-id langbot-fake-fault-local +rtk bin/lbs suite run langbot-debug-chat-isolation-gate --run-id langbot-debug-chat-isolation-local --include-manual-check rtk bin/lbs test run langbot-space-debug-chat-concurrency-smoke --run-id langbot-space-smoke-local rtk bin/lbs suite run langbot-debug-chat-load-gate --run-id langbot-debug-chat-load-local --include-manual-check ``` @@ -202,10 +216,13 @@ Use the smallest gate that answers the quality question: - `langbot-user-path-performance-gate`: browser-visible user path performance, starting with Pipeline Debug Chat send-to-visible-completion latency. Run it only when the browser profile and target pipeline are ready. -- `langbot-debug-chat-load-gate`: WebSocket Debug Chat load checks, starting - with controlled fake-provider baseline, slow-provider, cross-pipeline - isolation, and fault-recovery profiles, plus an optional low-volume real - Space-provider smoke. +- `langbot-debug-chat-load-gate`: manual WebSocket Debug Chat load checks, + starting with controlled fake-provider baseline, slow-provider, and + fault-recovery profiles, plus an optional low-volume real Space-provider + smoke. Run fake-provider cases serially when they share a provider URL. +- `langbot-debug-chat-isolation-gate`: manual cross-pipeline Debug Chat + isolation regression gate. Current releases may fail because of #2286; keep it + separate from the normal load gate until that product fix lands. - `langbot-performance-reliability-gate`: combined starter gate for synthetic contracts plus live backend checks. diff --git a/skills/skills/langbot-testing/suites/langbot-debug-chat-isolation-gate.yaml b/skills/skills/langbot-testing/suites/langbot-debug-chat-isolation-gate.yaml new file mode 100644 index 000000000..d2b31dd32 --- /dev/null +++ b/skills/skills/langbot-testing/suites/langbot-debug-chat-isolation-gate.yaml @@ -0,0 +1,13 @@ +id: langbot-debug-chat-isolation-gate +title: "LangBot Debug Chat isolation gate" +description: "Manual/non-required cross-pipeline Debug Chat isolation gate. Current releases may fail this gate because of product bug #2286; use it as regression evidence after the routing fix lands." +type: reliability +priority: p1 +tags: + - reliability + - debug-chat + - websocket + - isolation + - concurrency +cases: + - langbot-fake-provider-debug-chat-cross-pipeline-isolation diff --git a/skills/skills/langbot-testing/suites/langbot-debug-chat-load-gate.yaml b/skills/skills/langbot-testing/suites/langbot-debug-chat-load-gate.yaml index 4d9d90510..5b4950f16 100644 --- a/skills/skills/langbot-testing/suites/langbot-debug-chat-load-gate.yaml +++ b/skills/skills/langbot-testing/suites/langbot-debug-chat-load-gate.yaml @@ -1,6 +1,6 @@ id: langbot-debug-chat-load-gate title: "LangBot Debug Chat load gate" -description: "Message-path load checks for Pipeline Debug Chat: controlled fake-provider baseline, slow-provider and fault-recovery profiles, plus optional real Space-provider smoke." +description: "Manual/non-required message-path load checks for Pipeline Debug Chat: controlled fake-provider baseline, slow-provider and fault-recovery profiles, plus optional real Space-provider smoke. Cross-pipeline isolation is split into langbot-debug-chat-isolation-gate because current releases may fail it due to product bug #2286." type: performance priority: p1 tags: @@ -11,6 +11,5 @@ tags: cases: - langbot-fake-provider-debug-chat-load - langbot-fake-provider-debug-chat-slow-load - - langbot-fake-provider-debug-chat-cross-pipeline-isolation - langbot-fake-provider-debug-chat-fault-recovery - langbot-space-debug-chat-concurrency-smoke