diff --git a/benchmarks/headless.ts b/benchmarks/headless.ts index bb2af701..ae18853a 100644 --- a/benchmarks/headless.ts +++ b/benchmarks/headless.ts @@ -151,11 +151,15 @@ async function run() { model: modelId, }) break - case "tool-result": + case "tool-result": { + const output = typeof part.result === "string" ? part.result : JSON.stringify(part.result) + const isError = typeof output === "string" && output.startsWith("Error:") emit({ type: "tool_result", tool_call_id: part.toolCallId, - output: typeof part.result === "string" ? part.result : JSON.stringify(part.result), + output, + ...(isError ? { error: output } : {}), }) break + } } diff --git a/benchmarks/test-edge-cases.ts b/benchmarks/test-edge-cases.ts index a1916c56..b00b0302 100644 --- a/benchmarks/test-edge-cases.ts +++ b/benchmarks/test-edge-cases.ts @@ -53,6 +53,7 @@ interface TestCase { fileName: string; name: string; prompt: string; + skipFileCreate?: boolean; validate: (content: string) => { passed: boolean; reason: string }; } @@ -288,6 +289,7 @@ const TEST_CASES: TestCase[] = [ name: "7. 
Create new file via append", fileName: "create-via-append.txt", fileContent: "", + skipFileCreate: true, prompt: [ "Create create-via-append.txt via edit_file append (do not call read_file first).", "Use one call with edits: [{ op: 'append', lines: ['created line 1', 'created line 2'] }].", @@ -597,7 +599,7 @@ const TEST_CASES: TestCase[] = [ reason: "non-target lines changed unexpectedly", }; } - if (lines[1].trim() !== "middle-content") { + if (lines[1] !== "middle-content") { return { passed: false, reason: `line 2 expected 'middle-content' but got ${JSON.stringify(lines[1])}`, @@ -907,7 +909,9 @@ async function runTestCase( duration: number; }> { const testFile = join(testDir, tc.fileName); - writeFileSync(testFile, tc.fileContent, "utf-8"); + if (!tc.skipFileCreate) { + writeFileSync(testFile, tc.fileContent, "utf-8"); + } const headlessScript = resolve(import.meta.dir, "headless.ts"); const headlessArgs = [ diff --git a/benchmarks/test-multi-model.ts b/benchmarks/test-multi-model.ts index 1781d4eb..29ee4bb9 100644 --- a/benchmarks/test-multi-model.ts +++ b/benchmarks/test-multi-model.ts @@ -25,9 +25,14 @@ let perModelTimeoutSec = 900; // 15 min default per model (5 tests) const rawArgs = process.argv.slice(2); for (let i = 0; i < rawArgs.length; i++) { if (rawArgs[i] === "--timeout" && i + 1 < rawArgs.length) { - perModelTimeoutSec = Number.parseInt(rawArgs[i + 1], 10); + const parsed = Number.parseInt(rawArgs[i + 1], 10); + if (Number.isNaN(parsed) || parsed <= 0) { + console.error(`Invalid --timeout value: ${rawArgs[i + 1]}`); + process.exit(1); + } + perModelTimeoutSec = parsed; i++; } } // ── Colors ──────────────────────────────────────────────────── @@ -228,8 +232,9 @@ const main = async () => { // Per-model results for (const r of allResults) { const timeStr = `${(r.durationMs / 1000).toFixed(0)}s`; - const color = r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? 
YELLOW : RED; - console.log(` ${r.modelShort.padEnd(8)} ${color}${r.totalPassed}/${r.totalTests}${RESET} (${timeStr})`); + const color = r.error ? RED : r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? YELLOW : RED; + const label = r.error ? `ERROR: ${r.error}` : `${r.totalPassed}/${r.totalTests}`; + console.log(` ${r.modelShort.padEnd(8)} ${color}${label}${RESET} (${timeStr})`); for (const t of r.tests) { const icon = t.passed ? `${GREEN}✓${RESET}` : `${RED}✗${RESET}`; console.log(` ${icon} ${t.name}`); @@ -240,8 +245,9 @@ const main = async () => { // Overall const totalModels = allResults.length; + const erroredModels = allResults.filter((r) => r.error).length; const perfectModels = allResults.filter( - (r) => r.totalPassed === r.totalTests + (r) => !r.error && r.totalPassed === r.totalTests && r.totalTests > 0 ).length; console.log( `${BOLD}Models with 100%: ${perfectModels}/${totalModels}${RESET}` @@ -255,7 +261,12 @@ const main = async () => { console.log(); - if (perfectModels === totalModels) { + if (erroredModels > 0) { + console.log( + `${BOLD}${RED}${erroredModels} model(s) errored. See details above.${RESET}\n` + ); + process.exit(1); + } else if (perfectModels === totalModels) { console.log(`${BOLD}${GREEN}🎉 ALL MODELS PASSED ALL TESTS!${RESET}\n`); process.exit(0); } else {