fix(benchmarks): address review feedback on error handling and validation

- headless.ts: emit an `error` field on tool_result events when the output starts with "Error:"
- test-multi-model.ts: errored/timed-out models are now shown in RED and cause the run to exit(1)
- test-multi-model.ts: validate --timeout arg (reject NaN, zero, and negative values)
- test-edge-cases.ts: use exact match instead of trim() for whitespace test
- test-edge-cases.ts: skip file pre-creation for create-via-append test

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
minpeter 2026-02-27 01:44:51 +09:00
parent 04f50bac1f
commit 8fb5949ac6
3 changed files with 28 additions and 10 deletions

View File

@ -151,11 +151,14 @@ async function run() {
model: modelId,
})
break
case "tool-result":
case "tool-result": {
const output = typeof part.result === "string" ? part.result : JSON.stringify(part.result)
const isError = typeof output === "string" && output.startsWith("Error:")
emit({
type: "tool_result",
tool_call_id: part.toolCallId,
output: typeof part.result === "string" ? part.result : JSON.stringify(part.result),
output,
...(isError ? { error: output } : {}),
})
break
}

View File

@ -53,6 +53,7 @@ interface TestCase {
fileName: string;
name: string;
prompt: string;
skipFileCreate?: boolean;
validate: (content: string) => { passed: boolean; reason: string };
}
@ -288,6 +289,7 @@ const TEST_CASES: TestCase[] = [
name: "7. Create new file via append",
fileName: "create-via-append.txt",
fileContent: "",
skipFileCreate: true,
prompt: [
"Create create-via-append.txt via edit_file append (do not call read_file first).",
"Use one call with edits: [{ op: 'append', lines: ['created line 1', 'created line 2'] }].",
@ -597,7 +599,7 @@ const TEST_CASES: TestCase[] = [
reason: "non-target lines changed unexpectedly",
};
}
if (lines[1].trim() !== "middle-content") {
if (lines[1] !== "middle-content") {
return {
passed: false,
reason: `line 2 expected 'middle-content' but got ${JSON.stringify(lines[1])}`,
@ -907,7 +909,9 @@ async function runTestCase(
duration: number;
}> {
const testFile = join(testDir, tc.fileName);
writeFileSync(testFile, tc.fileContent, "utf-8");
if (!tc.skipFileCreate) {
writeFileSync(testFile, tc.fileContent, "utf-8");
}
const headlessScript = resolve(import.meta.dir, "headless.ts");
const headlessArgs = [

View File

@ -25,9 +25,13 @@ let perModelTimeoutSec = 900; // 15 min default per model (5 tests)
const rawArgs = process.argv.slice(2);
for (let i = 0; i < rawArgs.length; i++) {
if (rawArgs[i] === "--timeout" && i + 1 < rawArgs.length) {
perModelTimeoutSec = Number.parseInt(rawArgs[i + 1], 10);
const parsed = Number.parseInt(rawArgs[i + 1], 10);
if (Number.isNaN(parsed) || parsed <= 0) {
console.error(`Invalid --timeout value: ${rawArgs[i + 1]}`);
process.exit(1);
}
perModelTimeoutSec = parsed;
i++;
}
}
// ── Colors ────────────────────────────────────────────────────
@ -228,8 +232,9 @@ const main = async () => {
// Per-model results
for (const r of allResults) {
const timeStr = `${(r.durationMs / 1000).toFixed(0)}s`;
const color = r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? YELLOW : RED;
console.log(` ${r.modelShort.padEnd(8)} ${color}${r.totalPassed}/${r.totalTests}${RESET} (${timeStr})`);
const color = r.error ? RED : r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? YELLOW : RED;
const label = r.error ? `ERROR: ${r.error}` : `${r.totalPassed}/${r.totalTests}`;
console.log(` ${r.modelShort.padEnd(8)} ${color}${label}${RESET} (${timeStr})`);
for (const t of r.tests) {
const icon = t.passed ? `${GREEN}${RESET}` : `${RED}${RESET}`;
console.log(` ${icon} ${t.name}`);
@ -240,8 +245,9 @@ const main = async () => {
// Overall
const totalModels = allResults.length;
const erroredModels = allResults.filter((r) => r.error).length;
const perfectModels = allResults.filter(
(r) => r.totalPassed === r.totalTests
(r) => !r.error && r.totalPassed === r.totalTests && r.totalTests > 0
).length;
console.log(
`${BOLD}Models with 100%: ${perfectModels}/${totalModels}${RESET}`
@ -255,7 +261,12 @@ const main = async () => {
console.log();
if (perfectModels === totalModels) {
if (erroredModels > 0) {
console.log(
`${BOLD}${RED}${erroredModels} model(s) errored. See details above.${RESET}\n`
);
process.exit(1);
} else if (perfectModels === totalModels) {
console.log(`${BOLD}${GREEN}🎉 ALL MODELS PASSED ALL TESTS!${RESET}\n`);
process.exit(0);
} else {