fix(benchmarks): address review feedback on error handling and validation

- headless.ts: emit an `error` field on tool_result when the output starts with "Error:"
- test-multi-model.ts: errored/timed-out models are now shown in RED and cause the run to exit(1)
- test-multi-model.ts: validate --timeout arg (reject NaN, zero, and negative values)
- test-edge-cases.ts: use exact match instead of trim() for whitespace test
- test-edge-cases.ts: skip file pre-creation for create-via-append test

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
minpeter 2026-02-27 01:44:51 +09:00
parent 04f50bac1f
commit 8fb5949ac6
3 changed files with 28 additions and 10 deletions

View File

@ -151,11 +151,14 @@ async function run() {
model: modelId, model: modelId,
}) })
break break
case "tool-result": case "tool-result": {
const output = typeof part.result === "string" ? part.result : JSON.stringify(part.result)
const isError = typeof output === "string" && output.startsWith("Error:")
emit({ emit({
type: "tool_result", type: "tool_result",
tool_call_id: part.toolCallId, tool_call_id: part.toolCallId,
output: typeof part.result === "string" ? part.result : JSON.stringify(part.result), output,
...(isError ? { error: output } : {}),
}) })
break break
} }

View File

@ -53,6 +53,7 @@ interface TestCase {
fileName: string; fileName: string;
name: string; name: string;
prompt: string; prompt: string;
skipFileCreate?: boolean;
validate: (content: string) => { passed: boolean; reason: string }; validate: (content: string) => { passed: boolean; reason: string };
} }
@ -288,6 +289,7 @@ const TEST_CASES: TestCase[] = [
name: "7. Create new file via append", name: "7. Create new file via append",
fileName: "create-via-append.txt", fileName: "create-via-append.txt",
fileContent: "", fileContent: "",
skipFileCreate: true,
prompt: [ prompt: [
"Create create-via-append.txt via edit_file append (do not call read_file first).", "Create create-via-append.txt via edit_file append (do not call read_file first).",
"Use one call with edits: [{ op: 'append', lines: ['created line 1', 'created line 2'] }].", "Use one call with edits: [{ op: 'append', lines: ['created line 1', 'created line 2'] }].",
@ -597,7 +599,7 @@ const TEST_CASES: TestCase[] = [
reason: "non-target lines changed unexpectedly", reason: "non-target lines changed unexpectedly",
}; };
} }
if (lines[1].trim() !== "middle-content") { if (lines[1] !== "middle-content") {
return { return {
passed: false, passed: false,
reason: `line 2 expected 'middle-content' but got ${JSON.stringify(lines[1])}`, reason: `line 2 expected 'middle-content' but got ${JSON.stringify(lines[1])}`,
@ -907,7 +909,9 @@ async function runTestCase(
duration: number; duration: number;
}> { }> {
const testFile = join(testDir, tc.fileName); const testFile = join(testDir, tc.fileName);
writeFileSync(testFile, tc.fileContent, "utf-8"); if (!tc.skipFileCreate) {
writeFileSync(testFile, tc.fileContent, "utf-8");
}
const headlessScript = resolve(import.meta.dir, "headless.ts"); const headlessScript = resolve(import.meta.dir, "headless.ts");
const headlessArgs = [ const headlessArgs = [

View File

@ -25,9 +25,13 @@ let perModelTimeoutSec = 900; // 15 min default per model (5 tests)
const rawArgs = process.argv.slice(2); const rawArgs = process.argv.slice(2);
for (let i = 0; i < rawArgs.length; i++) { for (let i = 0; i < rawArgs.length; i++) {
if (rawArgs[i] === "--timeout" && i + 1 < rawArgs.length) { if (rawArgs[i] === "--timeout" && i + 1 < rawArgs.length) {
perModelTimeoutSec = Number.parseInt(rawArgs[i + 1], 10); const parsed = Number.parseInt(rawArgs[i + 1], 10);
if (Number.isNaN(parsed) || parsed <= 0) {
console.error(`Invalid --timeout value: ${rawArgs[i + 1]}`);
process.exit(1);
}
perModelTimeoutSec = parsed;
i++; i++;
}
} }
// ── Colors ──────────────────────────────────────────────────── // ── Colors ────────────────────────────────────────────────────
@ -228,8 +232,9 @@ const main = async () => {
// Per-model results // Per-model results
for (const r of allResults) { for (const r of allResults) {
const timeStr = `${(r.durationMs / 1000).toFixed(0)}s`; const timeStr = `${(r.durationMs / 1000).toFixed(0)}s`;
const color = r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? YELLOW : RED; const color = r.error ? RED : r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? YELLOW : RED;
console.log(` ${r.modelShort.padEnd(8)} ${color}${r.totalPassed}/${r.totalTests}${RESET} (${timeStr})`); const label = r.error ? `ERROR: ${r.error}` : `${r.totalPassed}/${r.totalTests}`;
console.log(` ${r.modelShort.padEnd(8)} ${color}${label}${RESET} (${timeStr})`);
for (const t of r.tests) { for (const t of r.tests) {
const icon = t.passed ? `${GREEN}${RESET}` : `${RED}${RESET}`; const icon = t.passed ? `${GREEN}${RESET}` : `${RED}${RESET}`;
console.log(` ${icon} ${t.name}`); console.log(` ${icon} ${t.name}`);
@ -240,8 +245,9 @@ const main = async () => {
// Overall // Overall
const totalModels = allResults.length; const totalModels = allResults.length;
const erroredModels = allResults.filter((r) => r.error).length;
const perfectModels = allResults.filter( const perfectModels = allResults.filter(
(r) => r.totalPassed === r.totalTests (r) => !r.error && r.totalPassed === r.totalTests && r.totalTests > 0
).length; ).length;
console.log( console.log(
`${BOLD}Models with 100%: ${perfectModels}/${totalModels}${RESET}` `${BOLD}Models with 100%: ${perfectModels}/${totalModels}${RESET}`
@ -255,7 +261,12 @@ const main = async () => {
console.log(); console.log();
if (perfectModels === totalModels) { if (erroredModels > 0) {
console.log(
`${BOLD}${RED}${erroredModels} model(s) errored. See details above.${RESET}\n`
);
process.exit(1);
} else if (perfectModels === totalModels) {
console.log(`${BOLD}${GREEN}🎉 ALL MODELS PASSED ALL TESTS!${RESET}\n`); console.log(`${BOLD}${GREEN}🎉 ALL MODELS PASSED ALL TESTS!${RESET}\n`);
process.exit(0); process.exit(0);
} else { } else {