Add performance and reliability QA gates

2026-06-26 23:44:19 +00:00 · 2026-06-25 00:07:37 +08:00
parent 74a18191dd
commit 67437c2f5a
31 changed files with 2299 additions and 25 deletions
@@ -1,5 +1,7 @@
 import { existsSync } from "node:fs";
+import { spawnSync } from "node:child_process";
 import { Socket } from "node:net";
+import { join } from "node:path";
 import type { CommandContext } from "../types.ts";
 import { parseOptions } from "../cli.ts";
 import { loadEnv } from "../fs.ts";
@@ -88,6 +90,37 @@ function compareProxyPair(env: Record<string, string>, upper: string, lower: str
  return null;
 }

+function envValue(env: Record<string, string>, key: string): string { // Look up `key` in process.env first, then in the supplied env map.
+  return process.env[key] ?? env[key] ?? ""; // `??` only falls through on null/undefined, so an empty-string process.env value is returned as-is; "" when absent from both.
+}
+
+function activeSocksProxy(env: Record<string, string>): { key: string; value: string } | null { // Find the first proxy variable whose value starts with "socks"; null when none does.
+  for (const key of ["ALL_PROXY", "all_proxy", "HTTPS_PROXY", "https_proxy", "HTTP_PROXY", "http_proxy"]) { // Checked in this order; the first match is returned.
+    const value = envValue(env, key);
+    if (/^socks/i.test(value)) return { key, value }; // Case-insensitive prefix test on the resolved value.
+  }
+  return null;
+}
+
+function checkSocksio(env: Record<string, string>): string | null { // Returns a failure message when a SOCKS proxy is set but <LANGBOT_REPO>/.venv/bin/python cannot be shown to import socksio; null otherwise.
+  const proxy = activeSocksProxy(env);
+  if (!proxy) return null; // No SOCKS proxy configured: nothing to verify.
+  const prefix = `SOCKS proxy ${proxy.key} is configured (${redactEnvValue(proxy.key, proxy.value)})`; // Shared lead-in; the proxy value is passed through redactEnvValue before printing.
+
+  const repo = env.LANGBOT_REPO;
+  const python = repo ? join(repo, ".venv", "bin", "python") : "";
+  if (!python || !existsSync(python)) {
+    return `${prefix}, but LangBot venv python was not found; after creating the venv, verify it can import socksio.`;
+  }
+
+  const result = spawnSync(python, ["-c", "import socksio"], { encoding: "utf8", timeout: 5000 });
+  if (result.status === 0) return null; // Import succeeded.
+  if (result.error) { // Spawn failure or timeout: the import was never evaluated, so do not suggest `pip install`.
+    return `${prefix}, but the socksio import check with ${python} did not complete: ${result.error.message}`;
+  }
+  return `${prefix}, but ${python} cannot import socksio; run \`${python} -m pip install socksio\` or start LangBot without SOCKS proxy env.`;
+}
+
 export async function commandEnvDoctor(ctx: CommandContext): Promise<number> {
  const env = loadEnv(ctx.root);
  const failures: string[] = [];
@@ -117,6 +150,8 @@ export async function commandEnvDoctor(ctx: CommandContext): Promise<number> {
  ]) {
    if (mismatch) failures.push(mismatch);
  }
+  const socksioFailure = checkSocksio(env);
+  if (socksioFailure) failures.push(socksioFailure);

  for (const [label, result] of await Promise.all([
    checkUrl("LANGBOT_BACKEND_URL", env.LANGBOT_BACKEND_URL).then((result) => ["LANGBOT_BACKEND_URL", result] as const),
@@ -465,6 +465,41 @@ function outputTail(value: string | Buffer | null | undefined): string {
  return String(value ?? "").trim().slice(-4000);
 }

+function exitStatusFromResultStatus(status: string): number { // Map a result-file status string to an exit code.
+  if (status === "pass") return 0;
+  if (status === "blocked" || status === "env_issue" || status === "flaky") return 2; // These three statuses share exit code 2, which executionStatusFromExitStatus labels "classified".
+  return 1; // Every other string, including "fail" and unrecognized values, maps to 1.
+}
+
+function executionStatusFromExitStatus(status: number): string { // Label an exit code: 0 -> "ok", 2 -> "classified", anything else -> "nonzero".
+  if (status === 0) return "ok";
+  if (status === 2) return "classified";
+  return "nonzero";
+}
+
+function executionFromCaseResultFile(caseItem: Record<string, unknown>): Record<string, unknown> | null { // Build an execution record from <evidence_dir>/result.json; null when the file is missing, unreadable, or does not match this case and run.
+  const resultPath = join(String(caseItem.evidence_dir), "result.json");
+  if (!existsSync(resultPath)) return null;
+  try {
+    const parsed = JSON.parse(readFileSync(resultPath, "utf8")) as Record<string, unknown>;
+    if ( // Reject a result whose ids differ from caseItem's, or whose status is not a string.
+      parsed.case_id !== caseItem.id ||
+      parsed.run_id !== caseItem.run_id ||
+      typeof parsed.status !== "string"
+    ) return null;
+    const exitStatus = exitStatusFromResultStatus(parsed.status);
+    return {
+      status: executionStatusFromExitStatus(exitStatus),
+      exit_status: exitStatus,
+      reason: typeof parsed.reason === "string" ? parsed.reason : "result.json completed", // Fallback text when the file carries no string reason.
+      result_status: parsed.status,
+      result_json: resultPath,
+    };
+  } catch { // Any error thrown while reading or parsing yields null, the same as a missing file.
+    return null;
+  }
+}
+
 function executionProblemStatus(executions: Array<Record<string, unknown>>): string {
  const statuses = executions.map((item) => String(item.status));
  if (statuses.includes("nonzero")) return "fail";
@@ -523,12 +558,18 @@ export function commandSuiteRun(ctx: CommandContext): number {
      encoding: "utf8",
      stdio: options.json === true ? "pipe" : "inherit",
    });
-    const status = result.error ? 1 : result.status ?? 1;
+    const fileExecution = result.error ? executionFromCaseResultFile(caseItem) : null;
+    const status = typeof fileExecution?.exit_status === "number"
+      ? fileExecution.exit_status
+      : result.error ? 1 : result.status ?? 1;
    executions.push({
      id: caseItem.id,
-      status: status === 0 ? "ok" : "nonzero",
+      status: fileExecution?.status ?? executionStatusFromExitStatus(status),
      exit_status: status,
-      reason: result.error?.message || "",
+      reason: fileExecution?.reason ?? result.error?.message ?? "",
+      result_status: fileExecution?.result_status,
+      result_json: fileExecution?.result_json,
+      spawn_error: fileExecution && result.error ? result.error.message : undefined,
      stdout: outputTail(result.stdout),
      stderr: outputTail(result.stderr),
    });
@@ -271,7 +271,7 @@ function reportTemplate(mode: string): Record<string, string> {
      target_tested: "Probe target, endpoint, file, command, or service actually checked",
      execution_path: "automation script | shell command | direct API | other",
      probe_result: "What the probe observed",
-      logs_or_artifacts: "Log, filesystem, API, or other artifact paths collected",
+      metrics_or_artifacts: "Metrics, logs, filesystem artifacts, traces, or profiles collected",
      diagnostics: "Extra diagnostics used, if any",
      matched_troubleshooting: "Troubleshooting ids matched, if any",
      assets_to_update: "New case/reference/troubleshooting entries to add",
@@ -320,7 +320,7 @@ function manualEvidenceTemplate(mode: string): ManualEvidenceTemplate {
      target_tested: "TODO: probe target, endpoint, file, command, or service actually checked",
      execution_path: "TODO: automation script | shell command | direct API | other",
      probe_result: "TODO: observed probe result",
-      logs_or_artifacts: "TODO: evidence paths or skipped reason",
+      metrics_or_artifacts: "TODO: metrics, logs, filesystem artifacts, traces, or profiles collected",
      diagnostics: "TODO: additional diagnostics used, if any",
      matched_troubleshooting: "TODO: troubleshooting ids matched, if any",
      assets_to_update: "TODO: case/reference/troubleshooting updates to make",
@@ -1099,6 +1099,41 @@ function executionTail(value: string | Buffer | null | undefined): string {
  return String(value ?? "").trim().slice(-4000);
 }

+function exitStatusFromResultStatus(status: string): number { // Map a result-file status string to an exit code.
+  if (status === "pass") return 0;
+  if (status === "blocked" || status === "env_issue" || status === "flaky") return 2; // These three statuses share exit code 2, which executionStatusFromExitStatus labels "classified".
+  return 1; // Every other string, including "fail" and unrecognized values, maps to 1.
+}
+
+function executionStatusFromExitStatus(status: number): string { // Label an exit code: 0 -> "ok", 2 -> "classified", anything else -> "nonzero".
+  if (status === 0) return "ok";
+  if (status === 2) return "classified";
+  return "nonzero";
+}
+
+function executionFromAutomationResultFile( // Build an execution summary from <evidenceDir>/automation-result.json; null when the file is missing, unreadable, or written for a different case or run.
+  evidenceDir: string,
+  caseId: string,
+  runId: string,
+): { status: string; exit_status: number; reason: string; result_status: string; path: string } | null {
+  const resultPath = join(evidenceDir, "automation-result.json");
+  if (!existsSync(resultPath)) return null;
+  try {
+    const parsed = JSON.parse(readFileSync(resultPath, "utf8")) as Record<string, unknown>;
+    if (parsed.case_id !== caseId || parsed.run_id !== runId || typeof parsed.status !== "string") return null; // Ids must match the arguments and status must be a string.
+    const exitStatus = exitStatusFromResultStatus(parsed.status);
+    return {
+      status: executionStatusFromExitStatus(exitStatus),
+      exit_status: exitStatus,
+      reason: typeof parsed.reason === "string" ? parsed.reason : "automation-result.json completed", // Fallback text when the file carries no string reason.
+      result_status: parsed.status,
+      path: resultPath,
+    };
+  } catch { // Any error thrown while reading or parsing yields null, the same as a missing file.
+    return null;
+  }
+}
+
 function runSetupAutomation(
  ctx: CommandContext,
  item: StructuredItem,
@@ -1224,6 +1259,30 @@ export function commandTestRun(ctx: CommandContext): number {
  });

  if (result.error) {
+    const fileExecution = executionFromAutomationResultFile(
+      run.automation.evidence_dir,
+      String(run.case.id),
+      run.run_id,
+    );
+    if (fileExecution) {
+      if (options.json !== true) {
+        console.error(`WARN: automation spawn reported an error, but ${fileExecution.path} completed: ${result.error.message}`);
+      }
+      if (options.json === true) {
+        console.log(JSON.stringify({
+          run,
+          setup_executions: setupExecutions,
+          automation_execution: {
+            ...fileExecution,
+            spawn_error: result.error.message,
+            stdout: executionTail(result.stdout),
+            stderr: executionTail(result.stderr),
+          },
+          exit_status: fileExecution.exit_status,
+        }, null, 2));
+      }
+      return fileExecution.exit_status;
+    }
    if (options.json !== true) console.error(`ERROR: failed to run automation: ${result.error.message}`);
    if (options.json === true) {
      console.log(JSON.stringify({
@@ -1247,7 +1306,7 @@ export function commandTestRun(ctx: CommandContext): number {
      run,
      setup_executions: setupExecutions,
      automation_execution: {
-        status: status === 0 ? "ok" : "nonzero",
+        status: executionStatusFromExitStatus(status),
        exit_status: status,
        stdout: executionTail(result.stdout),
        stderr: executionTail(result.stderr),
@@ -1311,6 +1370,7 @@ function renderMarkdownReport(report: TestReport): string {
  const environment = report.environment;
  const logGuard = report.log_guard;
  const troubleshooting = report.troubleshooting;
+  const automation = report.automation_result;
  const lines: string[] = [];

  lines.push(`# Test Report: ${reportCase.id}`);
@@ -1323,20 +1383,41 @@ function renderMarkdownReport(report: TestReport): string {
  lines.push(`Type: ${reportCase.type}`);
  lines.push("");
  lines.push("## Result");
-  lines.push(`- result: ${evidence.result}`);
-  for (const [key, value] of Object.entries(evidence)) {
-    if (key !== "result") lines.push(`- ${key}: ${value}`);
+  if (automation.status === "loaded" && automation.result) {
+    lines.push(`- result: ${automation.result}`);
+    if (automation.reason) lines.push(`- reason: ${automation.reason}`);
+    if (automation.url) lines.push(`- target_tested: ${automation.url}`);
+    if (automation.path) lines.push(`- automation_result: ${automation.path}`);
+    if (automation.artifacts) lines.push(`- artifacts: ${JSON.stringify(automation.artifacts)}`);
+  } else {
+    lines.push(`- result: ${evidence.result}`);
+    for (const [key, value] of Object.entries(evidence)) {
+      if (key !== "result") lines.push(`- ${key}: ${value}`);
+    }
  }
  lines.push("");
  lines.push("## Automation Result");
-  lines.push(`- status: ${report.automation_result.status}`);
-  if (report.automation_result.path) lines.push(`- path: ${report.automation_result.path}`);
-  if (report.automation_result.result) lines.push(`- result: ${report.automation_result.result}`);
-  if (report.automation_result.reason) lines.push(`- reason: ${report.automation_result.reason}`);
-  if (report.automation_result.started_at_local) lines.push(`- started_at_local: ${report.automation_result.started_at_local}`);
-  if (report.automation_result.finished_at_local) lines.push(`- finished_at_local: ${report.automation_result.finished_at_local}`);
-  if (report.automation_result.url) lines.push(`- url: ${report.automation_result.url}`);
-  if (report.automation_result.expected_text) lines.push(`- expected_text: ${report.automation_result.expected_text}`);
+  lines.push(`- status: ${automation.status}`);
+  if (automation.path) lines.push(`- path: ${automation.path}`);
+  if (automation.result) lines.push(`- result: ${automation.result}`);
+  if (automation.reason) lines.push(`- reason: ${automation.reason}`);
+  if (automation.duration_ms !== undefined) lines.push(`- duration_ms: ${automation.duration_ms}`);
+  if (automation.started_at_local) lines.push(`- started_at_local: ${automation.started_at_local}`);
+  if (automation.finished_at_local) lines.push(`- finished_at_local: ${automation.finished_at_local}`);
+  if (automation.url) lines.push(`- url: ${automation.url}`);
+  if (automation.expected_text) lines.push(`- expected_text: ${automation.expected_text}`);
+  if (automation.metrics_summary) {
+    lines.push("- metrics_summary:");
+    lines.push(`  ${JSON.stringify(automation.metrics_summary)}`);
+  }
+  if (automation.thresholds_summary) {
+    lines.push("- thresholds_summary:");
+    lines.push(`  ${JSON.stringify(automation.thresholds_summary)}`);
+  }
+  if (automation.artifacts) {
+    lines.push("- artifacts:");
+    lines.push(`  ${JSON.stringify(automation.artifacts)}`);
+  }
  lines.push("");
  lines.push("## Environment");
  for (const [key, value] of Object.entries(environment)) lines.push(`- ${key}=${value}`);
@@ -126,6 +126,9 @@ function validateCaseItem(root: string, item: StructuredItem, skillNames: Set<st
    ...validateEnvKeyScalar(item, "automation_pipeline_url_env"),
    ...validateEnvKeyScalar(item, "automation_pipeline_name_env"),
    ...validateJsonScalar(item, "automation_filesystem_checks_json"),
+    ...validateJsonScalar(item, "metrics_thresholds_json"),
+    ...validateJsonScalar(item, "load_profile_json"),
+    ...validateJsonScalar(item, "fault_model_json"),
    ...listValue(item.fields, "setup_automation").flatMap((entry) => (
      validateSetupAutomationEntry(root, entry, caseIds).map((error) => `${item.path}: ${error}`)
    )),
@@ -9,7 +9,18 @@ export const requiredEnvKeys = [
 ];

 export const caseModeValues = ["agent-browser", "probe"];
-export const caseTypeValues = ["smoke", "regression", "feature", "provider", "exploratory"];
+export const caseTypeValues = [
+  "smoke",
+  "regression",
+  "feature",
+  "provider",
+  "exploratory",
+  "contract",
+  "performance",
+  "reliability",
+  "chaos",
+  "security",
+];
 export const casePriorityValues = ["p0", "p1", "p2"];
 export const caseRiskValues = ["low", "medium", "high"];
 export const caseEvidenceValues = [
@@ -21,10 +32,24 @@ export const caseEvidenceValues = [
  "frontend_log",
  "api_diagnostic",
  "filesystem",
+  "metrics",
+  "trace",
+  "profile",
+  "resource_log",
 ];
 export const testResultStatusValues = ["pass", "fail", "blocked", "env_issue", "flaky"];
 export const troubleshootingCategoryValues = ["product", "env_issue", "external_dependency", "blocked", "flaky"];
-export const suiteTypeValues = ["smoke", "regression", "release_gate", "exploratory"];
+export const suiteTypeValues = [
+  "smoke",
+  "regression",
+  "release_gate",
+  "exploratory",
+  "contract",
+  "performance",
+  "reliability",
+  "chaos",
+  "security",
+];
 export const suiteRequiredStrings = ["id", "title", "description", "type", "priority"];
 export const suiteRequiredLists = ["tags", "cases"];

@@ -91,6 +91,7 @@ export type AutomationResultEvidence = {
  path?: string;
  result?: string;
  reason?: string;
+  duration_ms?: number;
  started_at?: string;
  started_at_local?: string;
  finished_at?: string;
@@ -98,6 +99,9 @@ export type AutomationResultEvidence = {
  url?: string;
  prompt?: string;
  expected_text?: string;
+  metrics_summary?: Record<string, unknown>;
+  thresholds_summary?: Record<string, unknown>;
+  artifacts?: Record<string, unknown>;
 };

 type MutableScanState = {
@@ -594,6 +598,18 @@ function stringField(data: Record<string, unknown>, key: string): string | undef
  return typeof value === "string" && value.trim() ? value : undefined;
 }

+function numberField(data: Record<string, unknown>, key: string): number | undefined { // Read data[key] as a finite number; undefined otherwise.
+  const value = data[key];
+  return typeof value === "number" && Number.isFinite(value) ? value : undefined; // Number.isFinite rejects NaN and both infinities.
+}
+
+function objectField(data: Record<string, unknown>, key: string): Record<string, unknown> | undefined { // Read data[key] as a non-null, non-array object; undefined otherwise.
+  const value = data[key];
+  return value && typeof value === "object" && !Array.isArray(value) // The truthiness test excludes null, whose typeof is also "object".
+    ? value as Record<string, unknown>
+    : undefined;
+}
+
 function evidenceDirFromOptions(options: Record<string, string | boolean>): string | undefined {
  const explicit = typeof options["evidence-dir"] === "string" ? options["evidence-dir"] : undefined;
  if (explicit) return resolve(explicit);
@@ -628,6 +644,7 @@ export function readAutomationResultEvidence(options: Record<string, string | bo
      path: resultPath,
      result: stringField(result, "status"),
      reason: stringField(result, "reason"),
+      duration_ms: numberField(result, "duration_ms"),
      started_at: stringField(result, "started_at"),
      started_at_local: stringField(result, "started_at_local"),
      finished_at: stringField(result, "finished_at"),
@@ -635,6 +652,9 @@ export function readAutomationResultEvidence(options: Record<string, string | bo
      url: stringField(result, "url"),
      prompt: redactSecrets(stringField(result, "prompt") ?? ""),
      expected_text: stringField(result, "expected_text"),
+      metrics_summary: objectField(result, "metrics_summary"),
+      thresholds_summary: objectField(result, "thresholds_summary"),
+      artifacts: objectField(result, "artifacts"),
    };
  } catch (error) {
    return { status: "invalid", path: resultPath, reason: String(error) };
@@ -114,6 +114,8 @@ export function automationEnvDefaults(item: StructuredItem, env: EnvSource = pro
    ["automation_expected_runner_id", "LANGBOT_E2E_EXPECTED_RUNNER_ID"],
    ["automation_reset_debug_chat", "LANGBOT_E2E_RESET_DEBUG_CHAT"],
    ["automation_debug_chat_session_type", "LANGBOT_E2E_DEBUG_CHAT_SESSION_TYPE"],
+    ["automation_debug_chat_response_p95_ms", "LANGBOT_E2E_DEBUG_CHAT_RESPONSE_P95_MS"],
+    ["automation_debug_chat_max_error_rate", "LANGBOT_E2E_DEBUG_CHAT_MAX_ERROR_RATE"],
    ["automation_filesystem_checks_json", "LANGBOT_E2E_FILESYSTEM_CHECKS_JSON"],
    ["automation_plugin_package", "LANGBOT_E2E_PLUGIN_PACKAGE"],
    ["automation_expected_plugin_id", "LANGBOT_E2E_EXPECTED_PLUGIN_ID"],