fix(benchmarks): address review feedback on error handling and validation
- headless.ts: emit error field on tool_result when output starts with Error:
- test-multi-model.ts: errored/timed-out models now shown as RED and exit(1)
- test-multi-model.ts: validate --timeout arg (reject NaN and non-positive values)
- test-edge-cases.ts: use exact match instead of trim() for whitespace test
- test-edge-cases.ts: skip file pre-creation for create-via-append test

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
04f50bac1f
commit
8fb5949ac6
@ -151,11 +151,14 @@ async function run() {
|
||||
model: modelId,
|
||||
})
|
||||
break
|
||||
case "tool-result":
|
||||
case "tool-result": {
|
||||
const output = typeof part.result === "string" ? part.result : JSON.stringify(part.result)
|
||||
const isError = typeof output === "string" && output.startsWith("Error:")
|
||||
emit({
|
||||
type: "tool_result",
|
||||
tool_call_id: part.toolCallId,
|
||||
output: typeof part.result === "string" ? part.result : JSON.stringify(part.result),
|
||||
output,
|
||||
...(isError ? { error: output } : {}),
|
||||
})
|
||||
break
|
||||
}
|
||||
|
||||
@ -53,6 +53,7 @@ interface TestCase {
|
||||
fileName: string;
|
||||
name: string;
|
||||
prompt: string;
|
||||
skipFileCreate?: boolean;
|
||||
validate: (content: string) => { passed: boolean; reason: string };
|
||||
}
|
||||
|
||||
@ -288,6 +289,7 @@ const TEST_CASES: TestCase[] = [
|
||||
name: "7. Create new file via append",
|
||||
fileName: "create-via-append.txt",
|
||||
fileContent: "",
|
||||
skipFileCreate: true,
|
||||
prompt: [
|
||||
"Create create-via-append.txt via edit_file append (do not call read_file first).",
|
||||
"Use one call with edits: [{ op: 'append', lines: ['created line 1', 'created line 2'] }].",
|
||||
@ -597,7 +599,7 @@ const TEST_CASES: TestCase[] = [
|
||||
reason: "non-target lines changed unexpectedly",
|
||||
};
|
||||
}
|
||||
if (lines[1].trim() !== "middle-content") {
|
||||
if (lines[1] !== "middle-content") {
|
||||
return {
|
||||
passed: false,
|
||||
reason: `line 2 expected 'middle-content' but got ${JSON.stringify(lines[1])}`,
|
||||
@ -907,7 +909,9 @@ async function runTestCase(
|
||||
duration: number;
|
||||
}> {
|
||||
const testFile = join(testDir, tc.fileName);
|
||||
writeFileSync(testFile, tc.fileContent, "utf-8");
|
||||
if (!tc.skipFileCreate) {
|
||||
writeFileSync(testFile, tc.fileContent, "utf-8");
|
||||
}
|
||||
|
||||
const headlessScript = resolve(import.meta.dir, "headless.ts");
|
||||
const headlessArgs = [
|
||||
|
||||
@ -25,9 +25,13 @@ let perModelTimeoutSec = 900; // 15 min default per model (5 tests)
|
||||
const rawArgs = process.argv.slice(2);
|
||||
for (let i = 0; i < rawArgs.length; i++) {
|
||||
if (rawArgs[i] === "--timeout" && i + 1 < rawArgs.length) {
|
||||
perModelTimeoutSec = Number.parseInt(rawArgs[i + 1], 10);
|
||||
const parsed = Number.parseInt(rawArgs[i + 1], 10);
|
||||
if (Number.isNaN(parsed) || parsed <= 0) {
|
||||
console.error(`Invalid --timeout value: ${rawArgs[i + 1]}`);
|
||||
process.exit(1);
|
||||
}
|
||||
perModelTimeoutSec = parsed;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
// ── Colors ────────────────────────────────────────────────────
|
||||
@ -228,8 +232,9 @@ const main = async () => {
|
||||
// Per-model results
|
||||
for (const r of allResults) {
|
||||
const timeStr = `${(r.durationMs / 1000).toFixed(0)}s`;
|
||||
const color = r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? YELLOW : RED;
|
||||
console.log(` ${r.modelShort.padEnd(8)} ${color}${r.totalPassed}/${r.totalTests}${RESET} (${timeStr})`);
|
||||
const color = r.error ? RED : r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? YELLOW : RED;
|
||||
const label = r.error ? `ERROR: ${r.error}` : `${r.totalPassed}/${r.totalTests}`;
|
||||
console.log(` ${r.modelShort.padEnd(8)} ${color}${label}${RESET} (${timeStr})`);
|
||||
for (const t of r.tests) {
|
||||
const icon = t.passed ? `${GREEN}✓${RESET}` : `${RED}✗${RESET}`;
|
||||
console.log(` ${icon} ${t.name}`);
|
||||
@ -240,8 +245,9 @@ const main = async () => {
|
||||
|
||||
// Overall
|
||||
const totalModels = allResults.length;
|
||||
const erroredModels = allResults.filter((r) => r.error).length;
|
||||
const perfectModels = allResults.filter(
|
||||
(r) => r.totalPassed === r.totalTests
|
||||
(r) => !r.error && r.totalPassed === r.totalTests && r.totalTests > 0
|
||||
).length;
|
||||
console.log(
|
||||
`${BOLD}Models with 100%: ${perfectModels}/${totalModels}${RESET}`
|
||||
@ -255,7 +261,12 @@ const main = async () => {
|
||||
|
||||
console.log();
|
||||
|
||||
if (perfectModels === totalModels) {
|
||||
if (erroredModels > 0) {
|
||||
console.log(
|
||||
`${BOLD}${RED}${erroredModels} model(s) errored. See details above.${RESET}\n`
|
||||
);
|
||||
process.exit(1);
|
||||
} else if (perfectModels === totalModels) {
|
||||
console.log(`${BOLD}${GREEN}🎉 ALL MODELS PASSED ALL TESTS!${RESET}\n`);
|
||||
process.exit(0);
|
||||
} else {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user