fix(benchmarks): address review feedback on error handling and validation
- headless.ts: emit an `error` field on tool_result when the output starts with "Error:"
- test-multi-model.ts: errored/timed-out models are now shown in RED and the run exits with code 1
- test-multi-model.ts: validate the --timeout arg (reject NaN, zero, and negative values)
- test-edge-cases.ts: use an exact match instead of trim() for the whitespace test
- test-edge-cases.ts: skip file pre-creation for the create-via-append test

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
04f50bac1f
commit
8fb5949ac6
@ -151,11 +151,14 @@ async function run() {
|
|||||||
model: modelId,
|
model: modelId,
|
||||||
})
|
})
|
||||||
break
|
break
|
||||||
case "tool-result":
|
case "tool-result": {
|
||||||
|
const output = typeof part.result === "string" ? part.result : JSON.stringify(part.result)
|
||||||
|
const isError = typeof output === "string" && output.startsWith("Error:")
|
||||||
emit({
|
emit({
|
||||||
type: "tool_result",
|
type: "tool_result",
|
||||||
tool_call_id: part.toolCallId,
|
tool_call_id: part.toolCallId,
|
||||||
output: typeof part.result === "string" ? part.result : JSON.stringify(part.result),
|
output,
|
||||||
|
...(isError ? { error: output } : {}),
|
||||||
})
|
})
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|||||||
@ -53,6 +53,7 @@ interface TestCase {
|
|||||||
fileName: string;
|
fileName: string;
|
||||||
name: string;
|
name: string;
|
||||||
prompt: string;
|
prompt: string;
|
||||||
|
skipFileCreate?: boolean;
|
||||||
validate: (content: string) => { passed: boolean; reason: string };
|
validate: (content: string) => { passed: boolean; reason: string };
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -288,6 +289,7 @@ const TEST_CASES: TestCase[] = [
|
|||||||
name: "7. Create new file via append",
|
name: "7. Create new file via append",
|
||||||
fileName: "create-via-append.txt",
|
fileName: "create-via-append.txt",
|
||||||
fileContent: "",
|
fileContent: "",
|
||||||
|
skipFileCreate: true,
|
||||||
prompt: [
|
prompt: [
|
||||||
"Create create-via-append.txt via edit_file append (do not call read_file first).",
|
"Create create-via-append.txt via edit_file append (do not call read_file first).",
|
||||||
"Use one call with edits: [{ op: 'append', lines: ['created line 1', 'created line 2'] }].",
|
"Use one call with edits: [{ op: 'append', lines: ['created line 1', 'created line 2'] }].",
|
||||||
@ -597,7 +599,7 @@ const TEST_CASES: TestCase[] = [
|
|||||||
reason: "non-target lines changed unexpectedly",
|
reason: "non-target lines changed unexpectedly",
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
if (lines[1].trim() !== "middle-content") {
|
if (lines[1] !== "middle-content") {
|
||||||
return {
|
return {
|
||||||
passed: false,
|
passed: false,
|
||||||
reason: `line 2 expected 'middle-content' but got ${JSON.stringify(lines[1])}`,
|
reason: `line 2 expected 'middle-content' but got ${JSON.stringify(lines[1])}`,
|
||||||
@ -907,7 +909,9 @@ async function runTestCase(
|
|||||||
duration: number;
|
duration: number;
|
||||||
}> {
|
}> {
|
||||||
const testFile = join(testDir, tc.fileName);
|
const testFile = join(testDir, tc.fileName);
|
||||||
writeFileSync(testFile, tc.fileContent, "utf-8");
|
if (!tc.skipFileCreate) {
|
||||||
|
writeFileSync(testFile, tc.fileContent, "utf-8");
|
||||||
|
}
|
||||||
|
|
||||||
const headlessScript = resolve(import.meta.dir, "headless.ts");
|
const headlessScript = resolve(import.meta.dir, "headless.ts");
|
||||||
const headlessArgs = [
|
const headlessArgs = [
|
||||||
|
|||||||
@ -25,9 +25,13 @@ let perModelTimeoutSec = 900; // 15 min default per model (5 tests)
|
|||||||
const rawArgs = process.argv.slice(2);
|
const rawArgs = process.argv.slice(2);
|
||||||
for (let i = 0; i < rawArgs.length; i++) {
|
for (let i = 0; i < rawArgs.length; i++) {
|
||||||
if (rawArgs[i] === "--timeout" && i + 1 < rawArgs.length) {
|
if (rawArgs[i] === "--timeout" && i + 1 < rawArgs.length) {
|
||||||
perModelTimeoutSec = Number.parseInt(rawArgs[i + 1], 10);
|
const parsed = Number.parseInt(rawArgs[i + 1], 10);
|
||||||
|
if (Number.isNaN(parsed) || parsed <= 0) {
|
||||||
|
console.error(`Invalid --timeout value: ${rawArgs[i + 1]}`);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
perModelTimeoutSec = parsed;
|
||||||
i++;
|
i++;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Colors ────────────────────────────────────────────────────
|
// ── Colors ────────────────────────────────────────────────────
|
||||||
@ -228,8 +232,9 @@ const main = async () => {
|
|||||||
// Per-model results
|
// Per-model results
|
||||||
for (const r of allResults) {
|
for (const r of allResults) {
|
||||||
const timeStr = `${(r.durationMs / 1000).toFixed(0)}s`;
|
const timeStr = `${(r.durationMs / 1000).toFixed(0)}s`;
|
||||||
const color = r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? YELLOW : RED;
|
const color = r.error ? RED : r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? YELLOW : RED;
|
||||||
console.log(` ${r.modelShort.padEnd(8)} ${color}${r.totalPassed}/${r.totalTests}${RESET} (${timeStr})`);
|
const label = r.error ? `ERROR: ${r.error}` : `${r.totalPassed}/${r.totalTests}`;
|
||||||
|
console.log(` ${r.modelShort.padEnd(8)} ${color}${label}${RESET} (${timeStr})`);
|
||||||
for (const t of r.tests) {
|
for (const t of r.tests) {
|
||||||
const icon = t.passed ? `${GREEN}✓${RESET}` : `${RED}✗${RESET}`;
|
const icon = t.passed ? `${GREEN}✓${RESET}` : `${RED}✗${RESET}`;
|
||||||
console.log(` ${icon} ${t.name}`);
|
console.log(` ${icon} ${t.name}`);
|
||||||
@ -240,8 +245,9 @@ const main = async () => {
|
|||||||
|
|
||||||
// Overall
|
// Overall
|
||||||
const totalModels = allResults.length;
|
const totalModels = allResults.length;
|
||||||
|
const erroredModels = allResults.filter((r) => r.error).length;
|
||||||
const perfectModels = allResults.filter(
|
const perfectModels = allResults.filter(
|
||||||
(r) => r.totalPassed === r.totalTests
|
(r) => !r.error && r.totalPassed === r.totalTests && r.totalTests > 0
|
||||||
).length;
|
).length;
|
||||||
console.log(
|
console.log(
|
||||||
`${BOLD}Models with 100%: ${perfectModels}/${totalModels}${RESET}`
|
`${BOLD}Models with 100%: ${perfectModels}/${totalModels}${RESET}`
|
||||||
@ -255,7 +261,12 @@ const main = async () => {
|
|||||||
|
|
||||||
console.log();
|
console.log();
|
||||||
|
|
||||||
if (perfectModels === totalModels) {
|
if (erroredModels > 0) {
|
||||||
|
console.log(
|
||||||
|
`${BOLD}${RED}${erroredModels} model(s) errored. See details above.${RESET}\n`
|
||||||
|
);
|
||||||
|
process.exit(1);
|
||||||
|
} else if (perfectModels === totalModels) {
|
||||||
console.log(`${BOLD}${GREEN}🎉 ALL MODELS PASSED ALL TESTS!${RESET}\n`);
|
console.log(`${BOLD}${GREEN}🎉 ALL MODELS PASSED ALL TESTS!${RESET}\n`);
|
||||||
process.exit(0);
|
process.exit(0);
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user