fix(benchmarks): address review feedback on error handling and validation

- headless.ts: emit an `error` field on tool_result when the output starts with `Error:`
- test-multi-model.ts: errored/timed-out models are now shown in RED and cause exit(1)
- test-multi-model.ts: validate the --timeout arg (reject NaN, zero, and negative values)
- test-edge-cases.ts: use an exact match instead of trim() for the whitespace test
- test-edge-cases.ts: skip file pre-creation for the create-via-append test

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-02-27 01:44:51 +09:00 · 2026-02-27 01:44:51 +09:00 · 8fb5949ac6
commit 8fb5949ac6
parent 04f50bac1f
3 changed files with 28 additions and 10 deletions
--- a/benchmarks/headless.ts
+++ b/benchmarks/headless.ts
@ -151,11 +151,14 @@ async function run() {
            model: modelId,
          })
          break
-        case "tool-result":
+        case "tool-result": {
+          const output = typeof part.result === "string" ? part.result : JSON.stringify(part.result)
+          const isError = typeof output === "string" && output.startsWith("Error:")
          emit({
            type: "tool_result",
            tool_call_id: part.toolCallId,
-            output: typeof part.result === "string" ? part.result : JSON.stringify(part.result),
+            output,
+            ...(isError ? { error: output } : {}),
          })
          break
      }
--- a/benchmarks/test-edge-cases.ts
+++ b/benchmarks/test-edge-cases.ts
@ -53,6 +53,7 @@ interface TestCase {
  fileName: string;
  name: string;
  prompt: string;
+  skipFileCreate?: boolean;
  validate: (content: string) => { passed: boolean; reason: string };
 }

@ -288,6 +289,7 @@ const TEST_CASES: TestCase[] = [
    name: "7. Create new file via append",
    fileName: "create-via-append.txt",
    fileContent: "",
+    skipFileCreate: true,
    prompt: [
      "Create create-via-append.txt via edit_file append (do not call read_file first).",
      "Use one call with edits: [{ op: 'append', lines: ['created line 1', 'created line 2'] }].",
@ -597,7 +599,7 @@ const TEST_CASES: TestCase[] = [
          reason: "non-target lines changed unexpectedly",
        };
      }
-      if (lines[1].trim() !== "middle-content") {
+      if (lines[1] !== "middle-content") {
        return {
          passed: false,
          reason: `line 2 expected 'middle-content' but got ${JSON.stringify(lines[1])}`,
@ -907,7 +909,9 @@ async function runTestCase(
  duration: number;
 }> {
  const testFile = join(testDir, tc.fileName);
-  writeFileSync(testFile, tc.fileContent, "utf-8");
+  if (!tc.skipFileCreate) {
+    writeFileSync(testFile, tc.fileContent, "utf-8");
+  }

  const headlessScript = resolve(import.meta.dir, "headless.ts");
  const headlessArgs = [
--- a/benchmarks/test-multi-model.ts
+++ b/benchmarks/test-multi-model.ts
@ -25,9 +25,13 @@ let perModelTimeoutSec = 900; // 15 min default per model (5 tests)
 const rawArgs = process.argv.slice(2);
 for (let i = 0; i < rawArgs.length; i++) {
  if (rawArgs[i] === "--timeout" && i + 1 < rawArgs.length) {
-    perModelTimeoutSec = Number.parseInt(rawArgs[i + 1], 10);
+    const parsed = Number.parseInt(rawArgs[i + 1], 10);
+    if (Number.isNaN(parsed) || parsed <= 0) {
+      console.error(`Invalid --timeout value: ${rawArgs[i + 1]}`);
+      process.exit(1);
+    }
+    perModelTimeoutSec = parsed;
    i++;
-  }
 }

 // ── Colors ────────────────────────────────────────────────────
@ -228,8 +232,9 @@ const main = async () => {
  // Per-model results
  for (const r of allResults) {
    const timeStr = `${(r.durationMs / 1000).toFixed(0)}s`;
-    const color = r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? YELLOW : RED;
-    console.log(`  ${r.modelShort.padEnd(8)} ${color}${r.totalPassed}/${r.totalTests}${RESET} (${timeStr})`);
+    const color = r.error ? RED : r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? YELLOW : RED;
+    const label = r.error ? `ERROR: ${r.error}` : `${r.totalPassed}/${r.totalTests}`;
+    console.log(`  ${r.modelShort.padEnd(8)} ${color}${label}${RESET} (${timeStr})`);
    for (const t of r.tests) {
      const icon = t.passed ? `${GREEN}✓${RESET}` : `${RED}✗${RESET}`;
      console.log(`    ${icon} ${t.name}`);
@ -240,8 +245,9 @@ const main = async () => {

  // Overall
  const totalModels = allResults.length;
+  const erroredModels = allResults.filter((r) => r.error).length;
  const perfectModels = allResults.filter(
-    (r) => r.totalPassed === r.totalTests
+    (r) => !r.error && r.totalPassed === r.totalTests && r.totalTests > 0
  ).length;
  console.log(
    `${BOLD}Models with 100%: ${perfectModels}/${totalModels}${RESET}`
@ -255,7 +261,12 @@ const main = async () => {

  console.log();

-  if (perfectModels === totalModels) {
+  if (erroredModels > 0) {
+    console.log(
+      `${BOLD}${RED}${erroredModels} model(s) errored. See details above.${RESET}\n`
+    );
+    process.exit(1);
+  } else if (perfectModels === totalModels) {
    console.log(`${BOLD}${GREEN}🎉 ALL MODELS PASSED ALL TESTS!${RESET}\n`);
    process.exit(0);
  } else {