From b1203b95013675409757a81ac01541f1ba4994d0 Mon Sep 17 00:00:00 2001 From: minpeter Date: Thu, 26 Feb 2026 17:43:49 +0900 Subject: [PATCH 1/4] Fix hashline-edit deduplication and validation - Canonicalize anchors in dedupe keys to handle whitespace variants - Make lines field required in edit operations - Only allow unanchored append/prepend to create missing files - Reorder delete/rename validation to prevent edge cases - Add allow_non_gpt_model and max_prompt_tokens to config schema ``` --- src/tools/hashline-edit/edit-deduplication.ts | 12 ++- .../hashline-edit/edit-operations.test.ts | 22 +++++- .../hashline-edit/hashline-edit-executor.ts | 16 ++-- src/tools/hashline-edit/tool-description.ts | 2 +- src/tools/hashline-edit/tools.test.ts | 77 +++++++++++++++++++ src/tools/hashline-edit/tools.ts | 1 - src/tools/hashline-edit/validation.ts | 2 +- 7 files changed, 117 insertions(+), 15 deletions(-) diff --git a/src/tools/hashline-edit/edit-deduplication.ts b/src/tools/hashline-edit/edit-deduplication.ts index e689bb53..8818b61a 100644 --- a/src/tools/hashline-edit/edit-deduplication.ts +++ b/src/tools/hashline-edit/edit-deduplication.ts @@ -1,18 +1,24 @@ import type { HashlineEdit } from "./types" import { toNewLines } from "./edit-text-normalization" +import { normalizeLineRef } from "./validation" function normalizeEditPayload(payload: string | string[]): string { return toNewLines(payload).join("\n") } +function canonicalAnchor(anchor: string | undefined): string { + if (!anchor) return "" + return normalizeLineRef(anchor) +} + function buildDedupeKey(edit: HashlineEdit): string { switch (edit.op) { case "replace": - return `replace|${edit.pos}|${edit.end ?? ""}|${normalizeEditPayload(edit.lines)}` + return `replace|${canonicalAnchor(edit.pos)}|${edit.end ? canonicalAnchor(edit.end) : ""}|${normalizeEditPayload(edit.lines)}` case "append": - return `append|${edit.pos ?? 
""}|${normalizeEditPayload(edit.lines)}` + return `append|${canonicalAnchor(edit.pos)}|${normalizeEditPayload(edit.lines)}` case "prepend": - return `prepend|${edit.pos ?? ""}|${normalizeEditPayload(edit.lines)}` + return `prepend|${canonicalAnchor(edit.pos)}|${normalizeEditPayload(edit.lines)}` default: return JSON.stringify(edit) } diff --git a/src/tools/hashline-edit/edit-operations.test.ts b/src/tools/hashline-edit/edit-operations.test.ts index 5d8ad08b..40585210 100644 --- a/src/tools/hashline-edit/edit-operations.test.ts +++ b/src/tools/hashline-edit/edit-operations.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it } from "bun:test" -import { applyHashlineEdits } from "./edit-operations" +import { applyHashlineEdits, applyHashlineEditsWithReport } from "./edit-operations" import { applyAppend, applyInsertAfter, applyPrepend, applyReplaceLines, applySetLine } from "./edit-operation-primitives" import { computeLineHash } from "./hash-computation" import type { HashlineEdit } from "./types" @@ -389,3 +389,23 @@ describe("hashline edit operations", () => { expect(result).toEqual("replaced A\nline 3\nreplaced B") }) }) + +describe("dedupe anchor canonicalization", () => { + it("deduplicates edits with whitespace-variant anchors", () => { + //#given + const content = "line 1\nline 2" + const lines = content.split("\n") + const canonical = `1#${computeLineHash(1, lines[0])}` + const spaced = ` 1 # ${computeLineHash(1, lines[0])} ` + + //#when + const report = applyHashlineEditsWithReport(content, [ + { op: "append", pos: canonical, lines: ["inserted"] }, + { op: "append", pos: spaced, lines: ["inserted"] }, + ]) + + //#then + expect(report.deduplicatedEdits).toBe(1) + expect(report.content).toBe("line 1\ninserted\nline 2") + }) +}) diff --git a/src/tools/hashline-edit/hashline-edit-executor.ts b/src/tools/hashline-edit/hashline-edit-executor.ts index e20ebbf9..d316307d 100644 --- a/src/tools/hashline-edit/hashline-edit-executor.ts +++ 
b/src/tools/hashline-edit/hashline-edit-executor.ts @@ -33,7 +33,7 @@ function resolveToolCallID(ctx: ToolContextWithCallID): string | undefined { function canCreateFromMissingFile(edits: HashlineEdit[]): boolean { if (edits.length === 0) return false - return edits.every((edit) => edit.op === "append" || edit.op === "prepend") + return edits.every((edit) => (edit.op === "append" || edit.op === "prepend") && !edit.pos) } function buildSuccessMeta( @@ -86,19 +86,19 @@ export async function executeHashlineEditTool(args: HashlineEditArgs, context: T const filePath = args.filePath const { delete: deleteMode, rename } = args + if (deleteMode && rename) { + return "Error: delete and rename cannot be used together" + } + if (deleteMode && args.edits.length > 0) { + return "Error: delete mode requires edits to be an empty array" + } + if (!deleteMode && (!args.edits || !Array.isArray(args.edits) || args.edits.length === 0)) { return "Error: edits parameter must be a non-empty array" } const edits = deleteMode ? 
[] : normalizeHashlineEdits(args.edits) - if (deleteMode && rename) { - return "Error: delete and rename cannot be used together" - } - if (deleteMode && edits.length > 0) { - return "Error: delete mode requires edits to be an empty array" - } - const file = Bun.file(filePath) const exists = await file.exists() if (!exists && !deleteMode && !canCreateFromMissingFile(edits)) { diff --git a/src/tools/hashline-edit/tool-description.ts b/src/tools/hashline-edit/tool-description.ts index 0b0ee00f..2d452ccf 100644 --- a/src/tools/hashline-edit/tool-description.ts +++ b/src/tools/hashline-edit/tool-description.ts @@ -10,7 +10,7 @@ WORKFLOW: VALIDATION: Payload shape: { "filePath": string, "edits": [...], "delete"?: boolean, "rename"?: string } Each edit must be one of: replace, append, prepend - Edit shape: { "op": "replace"|"append"|"prepend", "pos"?: "LINE#ID", "end"?: "LINE#ID", "lines"?: string|string[]|null } + Edit shape: { "op": "replace"|"append"|"prepend", "pos"?: "LINE#ID", "end"?: "LINE#ID", "lines": string|string[]|null } lines must contain plain replacement text only (no LINE#ID prefixes, no diff + markers) CRITICAL: all operations validate against the same pre-edit file snapshot and apply bottom-up. Refs/tags are interpreted against the last-read version of the file. 
diff --git a/src/tools/hashline-edit/tools.test.ts b/src/tools/hashline-edit/tools.test.ts index cb76b834..1158ca3d 100644 --- a/src/tools/hashline-edit/tools.test.ts +++ b/src/tools/hashline-edit/tools.test.ts @@ -341,4 +341,81 @@ describe("createHashlineEditTool", () => { //#then expect(envelope.lineEnding).toBe("\r\n") }) + + it("rejects delete=true with non-empty edits before normalization", async () => { + //#given + const filePath = path.join(tempDir, "delete-reject.txt") + fs.writeFileSync(filePath, "line1") + + //#when + const result = await tool.execute( + { + filePath, + delete: true, + edits: [{ op: "replace", pos: "1#ZZ", lines: "bad" }], + }, + createMockContext(), + ) + + //#then + expect(result).toContain("delete mode requires edits to be an empty array") + expect(fs.existsSync(filePath)).toBe(true) + }) + + it("rejects delete=true combined with rename", async () => { + //#given + const filePath = path.join(tempDir, "delete-rename.txt") + fs.writeFileSync(filePath, "line1") + + //#when + const result = await tool.execute( + { + filePath, + delete: true, + rename: path.join(tempDir, "new-name.txt"), + edits: [], + }, + createMockContext(), + ) + + //#then + expect(result).toContain("delete and rename cannot be used together") + expect(fs.existsSync(filePath)).toBe(true) + }) + + it("rejects missing file creation with anchored append", async () => { + //#given + const filePath = path.join(tempDir, "nonexistent.txt") + + //#when + const result = await tool.execute( + { + filePath, + edits: [{ op: "append", pos: "1#ZZ", lines: ["bad"] }], + }, + createMockContext(), + ) + + //#then + expect(result).toContain("File not found") + }) + + it("allows missing file creation with unanchored append", async () => { + //#given + const filePath = path.join(tempDir, "newfile.txt") + + //#when + const result = await tool.execute( + { + filePath, + edits: [{ op: "append", lines: ["created"] }], + }, + createMockContext(), + ) + + //#then + 
expect(fs.existsSync(filePath)).toBe(true) + expect(fs.readFileSync(filePath, "utf-8")).toBe("created") + expect(result).toBe(`Updated ${filePath}`) + }) }) diff --git a/src/tools/hashline-edit/tools.ts b/src/tools/hashline-edit/tools.ts index 13265029..bd2bf1f9 100644 --- a/src/tools/hashline-edit/tools.ts +++ b/src/tools/hashline-edit/tools.ts @@ -31,7 +31,6 @@ export function createHashlineEditTool(): ToolDefinition { end: tool.schema.string().optional().describe("Range end anchor in LINE#ID format"), lines: tool.schema .union([tool.schema.string(), tool.schema.array(tool.schema.string()), tool.schema.null()]) - .optional() .describe("Replacement or inserted lines. null/[] deletes with replace"), }) ) diff --git a/src/tools/hashline-edit/validation.ts b/src/tools/hashline-edit/validation.ts index fc5b395a..ed606155 100644 --- a/src/tools/hashline-edit/validation.ts +++ b/src/tools/hashline-edit/validation.ts @@ -15,7 +15,7 @@ const MISMATCH_CONTEXT = 2 const LINE_REF_EXTRACT_PATTERN = /([0-9]+#[ZPMQVRWSNKTXJBYH]{2})/ -function normalizeLineRef(ref: string): string { +export function normalizeLineRef(ref: string): string { const originalTrimmed = ref.trim() let trimmed = originalTrimmed trimmed = trimmed.replace(/^(?:>>>|[+-])\s*/, "") From d1a0a66dde227b64931df17e25ceb2939628433d Mon Sep 17 00:00:00 2001 From: minpeter Date: Fri, 27 Feb 2026 01:37:40 +0900 Subject: [PATCH 2/4] feat(benchmarks): add hashline-edit benchmark agent and deps Standalone headless agent using Vercel AI SDK v6 with FriendliAI provider. Imports hashline-edit pure functions directly from src/ for benchmarking the edit tool against LLMs (Minimax M2.5 via FriendliAI). 
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode) Co-authored-by: Sisyphus --- benchmarks/bun.lock | 62 +++++++++++++ benchmarks/headless.ts | 190 ++++++++++++++++++++++++++++++++++++++++ benchmarks/package.json | 19 ++++ 3 files changed, 271 insertions(+) create mode 100644 benchmarks/bun.lock create mode 100644 benchmarks/headless.ts create mode 100644 benchmarks/package.json diff --git a/benchmarks/bun.lock b/benchmarks/bun.lock new file mode 100644 index 00000000..3a31bf1c --- /dev/null +++ b/benchmarks/bun.lock @@ -0,0 +1,62 @@ +{ + "lockfileVersion": 1, + "configVersion": 1, + "workspaces": { + "": { + "name": "hashline-edit-benchmark", + "dependencies": { + "@ai-sdk/openai": "^1.3.0", + "@friendliai/ai-provider": "^1.0.9", + "ai": "^6.0.94", + "zod": "^4.1.0", + }, + }, + }, + "packages": { + "@ai-sdk/gateway": ["@ai-sdk/gateway@3.0.55", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.15", "@vercel/oidc": "3.1.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-7xMeTJnCjwRwXKVCiv4Ly4qzWvDuW3+W1WIV0X1EFu6W83d4mEhV9bFArto10MeTw40ewuDjrbrZd21mXKohkw=="], + + "@ai-sdk/openai": ["@ai-sdk/openai@1.3.24", "", { "dependencies": { "@ai-sdk/provider": "1.1.3", "@ai-sdk/provider-utils": "2.2.8" }, "peerDependencies": { "zod": "^3.0.0" } }, "sha512-GYXnGJTHRTZc4gJMSmFRgEQudjqd4PUN0ZjQhPwOAYH1yOAvQoG/Ikqs+HyISRbLPCrhbZnPKCNHuRU4OfpW0Q=="], + + "@ai-sdk/openai-compatible": ["@ai-sdk/openai-compatible@2.0.30", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.15" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-iTjumHf1/u4NhjXYFn/aONM2GId3/o7J1Lp5ql8FCbgIMyRwrmanR5xy1S3aaVkfTscuDvLTzWiy1mAbGzK3nQ=="], + + "@ai-sdk/provider": ["@ai-sdk/provider@1.1.3", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-qZMxYJ0qqX/RfnuIaab+zp8UAeJn/ygXXAffR5I4N0n1IrvA6qBsjc8hXLmBiMV2zoXlifkacF7sEFnYnjBcqg=="], + + "@ai-sdk/provider-utils": 
["@ai-sdk/provider-utils@2.2.8", "", { "dependencies": { "@ai-sdk/provider": "1.1.3", "nanoid": "^3.3.8", "secure-json-parse": "^2.7.0" }, "peerDependencies": { "zod": "^3.23.8" } }, "sha512-fqhG+4sCVv8x7nFzYnFo19ryhAa3w096Kmc3hWxMQfW/TubPOmt3A6tYZhl4mUfQWWQMsuSkLrtjlWuXBVSGQA=="], + + "@friendliai/ai-provider": ["@friendliai/ai-provider@1.1.4", "", { "dependencies": { "@ai-sdk/openai-compatible": "2.0.30", "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.15" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.12" } }, "sha512-9TU4B1QFqPhbkONjI5afCF7Ox4jOqtGg1xw8mA9QHZdtlEbZxU+mBNvMPlI5pU5kPoN6s7wkXmFmxpID+own1A=="], + + "@opentelemetry/api": ["@opentelemetry/api@1.9.0", "", {}, "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg=="], + + "@standard-schema/spec": ["@standard-schema/spec@1.1.0", "", {}, "sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w=="], + + "@vercel/oidc": ["@vercel/oidc@3.1.0", "", {}, "sha512-Fw28YZpRnA3cAHHDlkt7xQHiJ0fcL+NRcIqsocZQUSmbzeIKRpwttJjik5ZGanXP+vlA4SbTg+AbA3bP363l+w=="], + + "ai": ["ai@6.0.101", "", { "dependencies": { "@ai-sdk/gateway": "3.0.55", "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.15", "@opentelemetry/api": "1.9.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-Ur/NgbgOp1rdhyDiKDk6EOpSgd1g5ADlbcD1cjQJtQsnmhEngz3Rf8nK5JetDh0vnbLy2aEBpaQeL+zvLRWuaA=="], + + "eventsource-parser": ["eventsource-parser@3.0.6", "", {}, "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg=="], + + "json-schema": ["json-schema@0.4.0", "", {}, "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA=="], + + "nanoid": ["nanoid@3.3.11", "", { "bin": { "nanoid": "bin/nanoid.cjs" } }, "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w=="], + + "secure-json-parse": ["secure-json-parse@2.7.0", "", {}, 
"sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw=="], + + "zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="], + + "@ai-sdk/gateway/@ai-sdk/provider": ["@ai-sdk/provider@3.0.8", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-oGMAgGoQdBXbZqNG0Ze56CHjDZ1IDYOwGYxYjO5KLSlz5HiNQ9udIXsPZ61VWaHGZ5XW/jyjmr6t2xz2jGVwbQ=="], + + "@ai-sdk/gateway/@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@4.0.15", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@standard-schema/spec": "^1.1.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-8XiKWbemmCbvNN0CLR9u3PQiet4gtEVIrX4zzLxnCj06AwsEDJwJVBbKrEI4t6qE8XRSIvU2irka0dcpziKW6w=="], + + "@ai-sdk/openai-compatible/@ai-sdk/provider": ["@ai-sdk/provider@3.0.8", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-oGMAgGoQdBXbZqNG0Ze56CHjDZ1IDYOwGYxYjO5KLSlz5HiNQ9udIXsPZ61VWaHGZ5XW/jyjmr6t2xz2jGVwbQ=="], + + "@ai-sdk/openai-compatible/@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@4.0.15", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@standard-schema/spec": "^1.1.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-8XiKWbemmCbvNN0CLR9u3PQiet4gtEVIrX4zzLxnCj06AwsEDJwJVBbKrEI4t6qE8XRSIvU2irka0dcpziKW6w=="], + + "@friendliai/ai-provider/@ai-sdk/provider": ["@ai-sdk/provider@3.0.8", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-oGMAgGoQdBXbZqNG0Ze56CHjDZ1IDYOwGYxYjO5KLSlz5HiNQ9udIXsPZ61VWaHGZ5XW/jyjmr6t2xz2jGVwbQ=="], + + "@friendliai/ai-provider/@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@4.0.15", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@standard-schema/spec": "^1.1.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-8XiKWbemmCbvNN0CLR9u3PQiet4gtEVIrX4zzLxnCj06AwsEDJwJVBbKrEI4t6qE8XRSIvU2irka0dcpziKW6w=="], 
+ + "ai/@ai-sdk/provider": ["@ai-sdk/provider@3.0.8", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-oGMAgGoQdBXbZqNG0Ze56CHjDZ1IDYOwGYxYjO5KLSlz5HiNQ9udIXsPZ61VWaHGZ5XW/jyjmr6t2xz2jGVwbQ=="], + + "ai/@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@4.0.15", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@standard-schema/spec": "^1.1.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-8XiKWbemmCbvNN0CLR9u3PQiet4gtEVIrX4zzLxnCj06AwsEDJwJVBbKrEI4t6qE8XRSIvU2irka0dcpziKW6w=="], + } +} diff --git a/benchmarks/headless.ts b/benchmarks/headless.ts new file mode 100644 index 00000000..bb2af701 --- /dev/null +++ b/benchmarks/headless.ts @@ -0,0 +1,190 @@ +#!/usr/bin/env bun +import { readFile, writeFile, mkdir } from "node:fs/promises" +import { join, dirname } from "node:path" +import { stepCountIs, streamText, type CoreMessage } from "ai" +import { tool } from "ai" +import { createFriendli } from "@friendliai/ai-provider" +import { z } from "zod" +import { formatHashLines } from "../src/tools/hashline-edit/hash-computation" +import { normalizeHashlineEdits } from "../src/tools/hashline-edit/normalize-edits" +import { applyHashlineEditsWithReport } from "../src/tools/hashline-edit/edit-operations" +import { canonicalizeFileText, restoreFileText } from "../src/tools/hashline-edit/file-text-canonicalization" + +const DEFAULT_MODEL = "MiniMaxAI/MiniMax-M2.5" +const MAX_STEPS = 50 +const sessionId = `bench-${Date.now()}-${Math.random().toString(36).slice(2, 8)}` + +const emit = (event: Record) => + console.log(JSON.stringify({ sessionId, timestamp: new Date().toISOString(), ...event })) + +// ── CLI ────────────────────────────────────────────────────── +function parseArgs(): { prompt: string; modelId: string } { + const args = process.argv.slice(2) + let prompt = "" + let modelId = DEFAULT_MODEL + for (let i = 0; i < args.length; i++) { + if ((args[i] === "-p" || args[i] === "--prompt") && args[i + 1]) { 
+ prompt = args[++i] + } else if ((args[i] === "-m" || args[i] === "--model") && args[i + 1]) { + modelId = args[++i] + } else if (args[i] === "--reasoning-mode" && args[i + 1]) { + i++ // consume + } + // --no-translate, --think consumed silently + } + if (!prompt) { + console.error("Usage: bun run benchmarks/headless.ts -p [-m ]") + process.exit(1) + } + return { prompt, modelId } +} + +// ── Tools ──────────────────────────────────────────────────── +const readFileTool = tool({ + description: "Read a file with hashline-tagged content (LINE#ID format)", + inputSchema: z.object({ path: z.string().describe("File path") }), + execute: async ({ path }) => { + const fullPath = join(process.cwd(), path) + try { + const content = await readFile(fullPath, "utf-8") + const lines = content.split("\n") + const tagged = formatHashLines(content) + return `OK - read file\npath: ${path}\nlines: ${lines.length}\n\n${tagged}` + } catch { + return `Error: File not found: ${path}` + } + }, +}) + +const editFileTool = tool({ + description: "Edit a file using hashline anchors (LINE#ID format)", + inputSchema: z.object({ + path: z.string(), + edits: z.array( + z.object({ + op: z.enum(["replace", "append", "prepend"]), + pos: z.string().optional(), + end: z.string().optional(), + lines: z.union([z.array(z.string()), z.string(), z.null()]), + }) + ).min(1), + }), + execute: async ({ path, edits }) => { + const fullPath = join(process.cwd(), path) + try { + let rawContent = "" + let exists = true + try { + rawContent = await readFile(fullPath, "utf-8") + } catch { + exists = false + } + + const normalized = normalizeHashlineEdits(edits) + + if (!exists) { + const canCreate = normalized.every( + (e) => (e.op === "append" || e.op === "prepend") && !e.pos + ) + if (!canCreate) return `Error: File not found: ${path}` + } + + const envelope = canonicalizeFileText(rawContent) + const result = applyHashlineEditsWithReport(envelope.content, normalized) + + if (result.content === 
envelope.content) { + return `Error: No changes made to ${path}. The edits produced identical content.` + } + + const writeContent = restoreFileText(result.content, envelope) + await mkdir(dirname(fullPath), { recursive: true }) + await writeFile(fullPath, writeContent, "utf-8") + + const oldLineCount = rawContent.split("\n").length + const newLineCount = writeContent.split("\n").length + const delta = newLineCount - oldLineCount + const sign = delta > 0 ? "+" : "" + const action = exists ? "Updated" : "Created" + return `${action} ${path}\n${edits.length} edit(s) applied, ${sign}${delta} line(s)` + } catch (error) { + return `Error: ${error instanceof Error ? error.message : String(error)}` + } + }, +}) + +// ── Agent Loop ─────────────────────────────────────────────── +async function run() { + const { prompt, modelId } = parseArgs() + + const friendli = createFriendli({ apiKey: process.env.FRIENDLI_TOKEN! }) + const model = friendli(modelId) + const tools = { read_file: readFileTool, edit_file: editFileTool } + + emit({ type: "user", content: prompt }) + + const messages: CoreMessage[] = [{ role: "user", content: prompt }] + const system = + "You are a code editing assistant. Use read_file to read files and edit_file to edit them. " + + "Always read a file before editing it to get fresh LINE#ID anchors." + + for (let step = 0; step < MAX_STEPS; step++) { + const stream = streamText({ + model, + tools, + messages, + system, + stopWhen: stepCountIs(1), + }) + + let currentText = "" + for await (const part of stream.fullStream) { + switch (part.type) { + case "text-delta": + currentText += part.text + break + case "tool-call": + emit({ + type: "tool_call", + tool_call_id: part.toolCallId, + tool_name: part.toolName, + tool_input: part.args, + model: modelId, + }) + break + case "tool-result": + emit({ + type: "tool_result", + tool_call_id: part.toolCallId, + output: typeof part.result === "string" ? 
part.result : JSON.stringify(part.result), + }) + break + } + } + + const response = await stream.response + messages.push(...response.messages) + + const finishReason = await stream.finishReason + if (finishReason !== "tool-calls") { + if (currentText.trim()) { + emit({ type: "assistant", content: currentText, model: modelId }) + } + break + } + } +} + +// ── Signal + Startup ───────────────────────────────────────── +process.once("SIGINT", () => process.exit(0)) +process.once("SIGTERM", () => process.exit(143)) + +const startTime = Date.now() +run() + .catch((error) => { + emit({ type: "error", error: error instanceof Error ? error.message : String(error) }) + process.exit(1) + }) + .then(() => { + const elapsed = ((Date.now() - startTime) / 1000).toFixed(2) + console.error(`[headless] Completed in ${elapsed}s`) + }) diff --git a/benchmarks/package.json b/benchmarks/package.json new file mode 100644 index 00000000..bbddfed8 --- /dev/null +++ b/benchmarks/package.json @@ -0,0 +1,19 @@ +{ + "name": "hashline-edit-benchmark", + "version": "0.1.0", + "private": true, + "type": "module", + "description": "Hashline edit tool benchmark using Vercel AI SDK with FriendliAI provider", + "scripts": { + "bench:basic": "bun run test-edit-ops.ts", + "bench:edge": "bun run test-edge-cases.ts", + "bench:multi": "bun run test-multi-model.ts", + "bench:all": "bun run bench:basic && bun run bench:edge" + }, + "dependencies": { + "ai": "^6.0.94", + "@ai-sdk/openai": "^1.3.0", + "@friendliai/ai-provider": "^1.0.9", + "zod": "^4.1.0" + } +} From 04f50bac1ffe260c67b5a2ccd4c99f2d7ef27654 Mon Sep 17 00:00:00 2001 From: minpeter Date: Fri, 27 Feb 2026 01:37:49 +0900 Subject: [PATCH 3/4] feat(benchmarks): add hashline-edit test suites (46 tests) Ported from code-editing-agent benchmark: - test-edit-ops.ts: 21 basic edit operations (replace, append, prepend, delete, batch, range) - test-edge-cases.ts: 25 edge cases (unicode, long lines, whitespace, special chars, file creation) - 
test-multi-model.ts: multi-model comparison runner Verified 21/21 + 25/25 (100%) with Minimax M2.5 via FriendliAI. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode) Co-authored-by: Sisyphus --- benchmarks/test-edge-cases.ts | 1117 ++++++++++++++++++++++++++++++++ benchmarks/test-edit-ops.ts | 808 +++++++++++++++++++++++ benchmarks/test-multi-model.ts | 269 ++++++++ 3 files changed, 2194 insertions(+) create mode 100644 benchmarks/test-edge-cases.ts create mode 100644 benchmarks/test-edit-ops.ts create mode 100644 benchmarks/test-multi-model.ts diff --git a/benchmarks/test-edge-cases.ts b/benchmarks/test-edge-cases.ts new file mode 100644 index 00000000..a1916c56 --- /dev/null +++ b/benchmarks/test-edge-cases.ts @@ -0,0 +1,1117 @@ +#!/usr/bin/env bun +/** + * Comprehensive headless edit_file stress test: 25 edge cases + * + * Tests: 5 basic ops + 14 creative cases + 6 whitespace cases + * Each runs via headless mode with its own demo file + prompt. + * + * Usage: + * bun run benchmarks/test-edge-cases.ts [-m <model>] [--provider <provider>] + */ + +import { spawn } from "node:child_process"; +import { mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join, resolve } from "node:path"; + +// ── CLI arg passthrough ─────────────────────────────────────── +const extraArgs: string[] = []; +const rawArgs = process.argv.slice(2); +for (let i = 0; i < rawArgs.length; i++) { + const arg = rawArgs[i]; + if ( + (arg === "-m" || arg === "--model" || arg === "--provider") && + i + 1 < rawArgs.length + ) { + extraArgs.push(arg, rawArgs[i + 1]); + i++; + } else if (arg === "--think" || arg === "--no-translate") { + extraArgs.push(arg); + } else if (arg === "--reasoning-mode" && i + 1 < rawArgs.length) { + extraArgs.push(arg, rawArgs[i + 1]); + i++; + } +} + +// ── Colors ──────────────────────────────────────────────────── +const BOLD = "\x1b[1m"; +const GREEN = "\x1b[32m"; +const RED = "\x1b[31m"; 
+const YELLOW = "\x1b[33m"; +const DIM = "\x1b[2m"; +const CYAN = "\x1b[36m"; +const RESET = "\x1b[0m"; + +const pass = (msg: string) => console.log(` ${GREEN}✓${RESET} ${msg}`); +const fail = (msg: string) => console.log(` ${RED}✗${RESET} ${msg}`); +const info = (msg: string) => console.log(` ${DIM}${msg}${RESET}`); +const warn = (msg: string) => console.log(` ${YELLOW}⚠${RESET} ${msg}`); + +// ── Test case definition ───────────────────────────────────── +interface TestCase { + fileContent: string; + fileName: string; + name: string; + prompt: string; + validate: (content: string) => { passed: boolean; reason: string }; +} + +const TEST_CASES: TestCase[] = [ + { + name: "1. Single-line file — replace only line", + fileName: "single-line.txt", + fileContent: "only_line_original", + prompt: [ + "Read single-line.txt with read_file.", + "Replace the only line using edit_file with edits: [{ op: 'replace', pos: '', lines: ['only_line_updated'] }].", + "Expected final content exactly one line: only_line_updated.", + ].join(" "), + validate: (content) => { + const normalized = content.replace(/\r/g, "").trimEnd(); + const lines = normalized.split("\n"); + if (lines.length === 1 && lines[0] === "only_line_updated") { + return { passed: true, reason: "single line replaced correctly" }; + } + if (normalized.includes("only_line_original")) { + return { passed: false, reason: "original line still present" }; + } + return { + passed: false, + reason: `expected one line 'only_line_updated', got ${lines.length} lines`, + }; + }, + }, + { + name: "2. 
Large file (20 lines) — replace middle line 11", + fileName: "twenty-lines.txt", + fileContent: Array.from( + { length: 20 }, + (_, i) => `line${String(i + 1).padStart(2, "0")}: value-${i + 1}` + ).join("\n"), + prompt: [ + "Read twenty-lines.txt with read_file.", + "Replace line 11 using edit_file with edits: [{ op: 'replace', pos: '', lines: ['line11: UPDATED-MIDDLE'] }].", + "Keep all other lines unchanged.", + ].join(" "), + validate: (content) => { + const lines = content.replace(/\r/g, "").trimEnd().split("\n"); + if (lines.length !== 20) { + return { + passed: false, + reason: `expected 20 lines, got ${lines.length}`, + }; + } + if (lines[10] !== "line11: UPDATED-MIDDLE") { + return { + passed: false, + reason: `line 11 mismatch: '${lines[10] ?? ""}'`, + }; + } + if (lines[9] !== "line10: value-10" || lines[11] !== "line12: value-12") { + return { + passed: false, + reason: "neighboring lines changed unexpectedly", + }; + } + return { + passed: true, + reason: "line 11 replaced and surrounding lines preserved", + }; + }, + }, + { + name: "3. 
Range replace entire file (first→last to one line)", + fileName: "range-all.txt", + fileContent: ["first", "second", "third", "fourth", "fifth"].join("\n"), + prompt: [ + "Read range-all.txt with read_file.", + "Replace the full file from first line to last line using one range edit: edits: [{ op: 'replace', pos: '', end: '', lines: ['collapsed-to-one-line'] }].", + "Expected final content exactly: collapsed-to-one-line.", + ].join(" "), + validate: (content) => { + const normalized = content.replace(/\r/g, "").trimEnd(); + if (normalized === "collapsed-to-one-line") { + return { + passed: true, + reason: "entire file collapsed to single replacement line", + }; + } + if (normalized.includes("first") || normalized.includes("fifth")) { + return { + passed: false, + reason: "original range content still present", + }; + } + return { + passed: false, + reason: `unexpected final content: '${normalized.slice(0, 120)}'`, + }; + }, + }, + { + name: "4. Mixed ops in one call (replace + append + prepend)", + fileName: "mixed-one-call.txt", + fileContent: ["alpha", "beta", "gamma"].join("\n"), + prompt: [ + "Read mixed-one-call.txt with read_file.", + "Call edit_file exactly once with three edits in one edits array:", + "edits: [", + "{ op: 'replace', pos: '', lines: ['BETA'] },", + "{ op: 'append', pos: '', lines: ['delta'] },", + "{ op: 'prepend', pos: '', lines: ['start'] }", + "].", + "Expected final content: start, alpha, BETA, gamma, delta.", + ].join(" "), + validate: (content) => { + const lines = content.replace(/\r/g, "").trimEnd().split("\n"); + const expected = ["start", "alpha", "BETA", "gamma", "delta"]; + if (lines.length !== expected.length) { + return { + passed: false, + reason: `expected ${expected.length} lines, got ${lines.length}`, + }; + } + for (let i = 0; i < expected.length; i++) { + if (lines[i] !== expected[i]) { + return { + passed: false, + reason: `line ${i + 1} expected '${expected[i]}' but got '${lines[i]}'`, + }; + } + } + return { + passed: 
true, + reason: "single call applied replace, append, and prepend", + }; + }, + }, + { + name: "5. Large batch (5 replaces) in one call", + fileName: "batch-five.txt", + fileContent: [ + "row-1", + "row-2", + "row-3", + "row-4", + "row-5", + "row-6", + "row-7", + "row-8", + "row-9", + "row-10", + ].join("\n"), + prompt: [ + "Read batch-five.txt with read_file.", + "Call edit_file once with five replace edits in one edits array:", + "edits: [", + "{ op: 'replace', pos: '', lines: ['ROW-1'] },", + "{ op: 'replace', pos: '', lines: ['ROW-3'] },", + "{ op: 'replace', pos: '', lines: ['ROW-5'] },", + "{ op: 'replace', pos: '', lines: ['ROW-7'] },", + "{ op: 'replace', pos: '', lines: ['ROW-10'] }", + "].", + ].join(" "), + validate: (content) => { + const lines = content.replace(/\r/g, "").trimEnd().split("\n"); + if (lines.length !== 10) { + return { + passed: false, + reason: `expected 10 lines, got ${lines.length}`, + }; + } + const checks: [number, string][] = [ + [0, "ROW-1"], + [2, "ROW-3"], + [4, "ROW-5"], + [6, "ROW-7"], + [9, "ROW-10"], + ]; + for (const [idx, expected] of checks) { + if (lines[idx] !== expected) { + return { + passed: false, + reason: `line ${idx + 1} expected '${expected}' but got '${lines[idx]}'`, + }; + } + } + if ( + lines[1] !== "row-2" || + lines[3] !== "row-4" || + lines[8] !== "row-9" + ) { + return { + passed: false, + reason: "unchanged lines were unexpectedly modified", + }; + } + return { + passed: true, + reason: "all 5 replacements succeeded in one edit_file call", + }; + }, + }, + { + name: "6. 
Consecutive edits (read→edit→read→edit)", + fileName: "consecutive.txt", + fileContent: ["stage: one", "value: 1", "status: draft"].join("\n"), + prompt: [ + "Read consecutive.txt with read_file.", + "First call edit_file with edits: [{ op: 'replace', pos: '', lines: ['value: 2'] }].", + "Then read consecutive.txt with read_file again.", + "Second, call edit_file again with edits: [{ op: 'replace', pos: '', lines: ['status: final'] }].", + "Expected final content: stage: one, value: 2, status: final.", + ].join(" "), + validate: (content) => { + const lines = content.replace(/\r/g, "").trimEnd().split("\n"); + const expected = ["stage: one", "value: 2", "status: final"]; + if (lines.length !== expected.length) { + return { + passed: false, + reason: `expected ${expected.length} lines, got ${lines.length}`, + }; + } + for (let i = 0; i < expected.length; i++) { + if (lines[i] !== expected[i]) { + return { + passed: false, + reason: `line ${i + 1} expected '${expected[i]}' but got '${lines[i]}'`, + }; + } + } + return { + passed: true, + reason: "two sequential edit_file calls produced expected final state", + }; + }, + }, + { + name: "7. Create new file via append", + fileName: "create-via-append.txt", + fileContent: "", + prompt: [ + "Create create-via-append.txt via edit_file append (do not call read_file first).", + "Use one call with edits: [{ op: 'append', lines: ['created line 1', 'created line 2'] }].", + "Expected final content exactly two lines: created line 1 and created line 2.", + ].join(" "), + validate: (content) => { + const normalized = content.replace(/\r/g, "").trimEnd(); + const lines = normalized === "" ? 
[] : normalized.split("\n"); + if (lines.length !== 2) { + return { + passed: false, + reason: `expected 2 lines, got ${lines.length}`, + }; + } + if (lines[0] !== "created line 1" || lines[1] !== "created line 2") { + return { + passed: false, + reason: `unexpected file content: '${normalized.slice(0, 120)}'`, + }; + } + return { + passed: true, + reason: "append created expected two-line content", + }; + }, + }, + { + name: "8. Unicode/emoji line replacement", + fileName: "unicode.txt", + fileContent: ["status: pending", "message: old"].join("\n"), + prompt: [ + "Read unicode.txt with read_file.", + "Replace line 2 with Unicode content using edit_file and edits: [{ op: 'replace', pos: '', lines: ['message: 🎉🚀 한국어 테스트 완료'] }].", + "Expected line 2 exactly: message: 🎉🚀 한국어 테스트 완료.", + ].join(" "), + validate: (content) => { + const lines = content.replace(/\r/g, "").trimEnd().split("\n"); + if (lines[1] !== "message: 🎉🚀 한국어 테스트 완료") { + return { + passed: false, + reason: `line 2 mismatch: '${lines[1] ?? ""}'`, + }; + } + if (content.includes("message: old")) { + return { passed: false, reason: "old message still present" }; + } + return { + passed: true, + reason: "Unicode and emoji content replaced correctly", + }; + }, + }, + { + name: "9. Backticks/template literal content", + fileName: "template.ts", + fileContent: ["const name = 'dev';", "const msg = 'old';"].join("\n"), + prompt: [ + "Read template.ts with read_file.", + "Replace line 2 using edit_file with edits: [{ op: 'replace', pos: '', lines: ['const msg = `hello \u0024{name}`;'] }].", + "Expected line 2 exactly: const msg = `hello \u0024{name}`;", + ].join(" "), + validate: (content) => { + const expected = "const msg = `hello \u0024{name}`;"; + const lines = content.replace(/\r/g, "").trimEnd().split("\n"); + if (lines[1] !== expected) { + return { + passed: false, + reason: `line 2 expected '${expected}' but got '${lines[1] ?? 
""}'`, + }; + } + if (content.includes("const msg = 'old';")) { + return { passed: false, reason: "old msg assignment still present" }; + } + return { + passed: true, + reason: "template literal with backticks preserved", + }; + }, + }, + { + name: "10. Regex pattern content", + fileName: "regex.ts", + fileContent: ["const re = /old/;", "const ok = true;"].join("\n"), + prompt: [ + "Read regex.ts with read_file.", + "Replace line 1 using edit_file with edits: [{ op: 'replace', pos: '', lines: ['const re = /^[a-z]+\\d{2,}$/gi;'] }].", + "Expected line 1 exactly: const re = /^[a-z]+\\d{2,}$/gi;", + ].join(" "), + validate: (content) => { + const expected = "const re = /^[a-z]+\\d{2,}$/gi;"; + const lines = content.replace(/\r/g, "").trimEnd().split("\n"); + if (lines[0] !== expected) { + return { + passed: false, + reason: `regex line mismatch: '${lines[0] ?? ""}'`, + }; + } + if (content.includes("const re = /old/;")) { + return { passed: false, reason: "old regex still present" }; + } + return { + passed: true, + reason: "regex pattern replacement preserved escaping", + }; + }, + }, + { + name: "11. Escaped quotes and backslashes", + fileName: "path.cfg", + fileContent: ['path = "/tmp/file.txt"', "mode = rw"].join("\n"), + prompt: [ + "Read path.cfg with read_file.", + "Replace line 1 using edit_file with edits: [{ op: 'replace', pos: '', lines: ['path = \"C:\\\\Users\\\\admin\\\\file.txt\"'] }].", + 'The file should contain a Windows-style path with backslashes: C:\\Users\\admin\\file.txt.', + ].join(" "), + validate: (content) => { + const lines = content.replace(/\r/g, "").trimEnd().split("\n"); + const line1 = lines[0] ?? 
""; + // Accept either single or double backslashes — both are valid model interpretations + const hasSingleBS = line1.includes('C:\\Users\\admin\\file.txt'); + const hasDoubleBS = line1.includes('C:\\\\Users\\\\admin\\\\file.txt'); + const hasPath = hasSingleBS || hasDoubleBS; + const hasQuotes = line1.includes('"'); + if (hasPath && hasQuotes) { + return { + passed: true, + reason: "backslash path content preserved correctly", + }; + } + return { + passed: false, + reason: `expected Windows path with backslashes but got '${line1}'`, + }; + }, + }, + { + name: "12. HTML tags in content", + fileName: "html-snippet.txt", + fileContent: ["snippet: old", "done: true"].join("\n"), + prompt: [ + "Read html-snippet.txt with read_file.", + "Replace line 1 using edit_file with edits: [{ op: 'replace', pos: '', lines: ['

Hello

'] }].", + 'Expected line 1 exactly:

Hello

.', + ].join(" "), + validate: (content) => { + const expected = '

Hello

'; + const lines = content.replace(/\r/g, "").trimEnd().split("\n"); + if (lines[0] !== expected) { + return { + passed: false, + reason: `HTML line mismatch: '${lines[0] ?? ""}'`, + }; + } + if (content.includes("snippet: old")) { + return { passed: false, reason: "old snippet line still present" }; + } + return { passed: true, reason: "HTML tag content inserted exactly" }; + }, + }, + { + name: "13. Very long line (180 chars)", + fileName: "long-line.txt", + fileContent: ["line-1", "short-line"].join("\n"), + prompt: [ + "Read long-line.txt with read_file.", + `Replace line 2 using edit_file with edits: [{ op: 'replace', pos: '', lines: ['${"L".repeat(180)}'] }].`, + "Expected line 2 to be exactly 180 characters.", + ].join(" "), + validate: (content) => { + const expected = "L".repeat(180); + const lines = content.replace(/\r/g, "").trimEnd().split("\n"); + if (!lines[1]) { + return { passed: false, reason: "line 2 is missing" }; + } + if (Math.abs(lines[1].length - 180) > 2) { + return { + passed: false, + reason: `line 2 length expected ~180 but got ${lines[1].length}`, + }; + } + if (!lines[1].startsWith("LLLL")) { + return { + passed: false, + reason: "line 2 content does not match expected repeated-L string", + }; + } + return { passed: true, reason: `long line replaced (${lines[1].length} chars)` }; + }, + }, + { + name: "14. 
SQL query content", + fileName: "sql-content.txt", + fileContent: ["SELECT 1;", "done"].join("\n"), + prompt: [ + "Read sql-content.txt with read_file.", + "Replace line 1 using edit_file with edits: [{ op: 'replace', pos: '', lines: ['SELECT u.name, o.total FROM users u JOIN orders o ON u.id = o.user_id WHERE o.total > 100;'] }].", + "Expected line 1 exactly the provided SQL query.", + ].join(" "), + validate: (content) => { + const expected = + "SELECT u.name, o.total FROM users u JOIN orders o ON u.id = o.user_id WHERE o.total > 100;"; + const lines = content.replace(/\r/g, "").trimEnd().split("\n"); + if (lines[0] !== expected) { + return { + passed: false, + reason: `SQL line mismatch: '${lines[0] ?? ""}'`, + }; + } + return { passed: true, reason: "SQL query line replaced exactly" }; + }, + }, + { + name: "15. Mixed indentation (tab -> spaces)", + fileName: "mixed-indent.ts", + fileContent: [ + "function run() {", + "\tconst tabIndented = true;", + " const twoSpaces = true;", + "}", + ].join("\n"), + prompt: [ + "Read mixed-indent.ts with read_file.", + "Replace the tab-indented line 2 using edit_file with edits: [{ op: 'replace', pos: '', lines: [' const tabIndented = true;'] }].", + "Expected line 2 to be 4 spaces + const tabIndented = true;", + ].join(" "), + validate: (content) => { + const normalized = content.replace(/\r/g, ""); + const lines = normalized.endsWith("\n") + ? normalized.slice(0, -1).split("\n") + : normalized.split("\n"); + if (lines[1] !== " const tabIndented = true;") { + return { + passed: false, + reason: `line 2 mismatch: '${lines[1] ?? ""}'`, + }; + } + if (lines[1].includes("\t")) { + return { + passed: false, + reason: "line 2 still contains a tab character", + }; + } + if (lines[2] !== " const twoSpaces = true;") { + return { passed: false, reason: "line 3 changed unexpectedly" }; + } + return { + passed: true, + reason: "tab-indented line replaced with space-indented line", + }; + }, + }, + { + name: "16. 
Trailing whitespace preservation", + fileName: "trailing-whitespace.txt", + fileContent: ["start", "text ", "end"].join("\n"), + prompt: [ + "Read trailing-whitespace.txt with read_file.", + "Replace line 2 using edit_file with edits: [{ op: 'replace', pos: '', lines: ['new_text '] }].", + "Keep exactly three trailing spaces after new_text.", + ].join(" "), + validate: (content) => { + const normalized = content.replace(/\r/g, ""); + const lines = normalized.endsWith("\n") + ? normalized.slice(0, -1).split("\n") + : normalized.split("\n"); + if (!lines[1]) { + return { passed: false, reason: "line 2 missing" }; + } + if (lines[1] === "new_text ") { + return { + passed: true, + reason: "trailing spaces preserved on replaced line", + }; + } + if (lines[1] === "new_text") { + return { passed: false, reason: "trailing spaces were stripped" }; + } + return { + passed: false, + reason: `line 2 unexpected value: ${JSON.stringify(lines[1])}`, + }; + }, + }, + { + name: "17. Replace line containing only spaces", + fileName: "spaces-only-line.txt", + fileContent: ["alpha", " ", "omega"].join("\n"), + prompt: [ + "Read spaces-only-line.txt with read_file.", + "Replace the line that contains only 4 spaces (line 2) using edit_file with edits: [{ op: 'replace', pos: '', lines: ['middle-content'] }].", + "Expected final content: alpha, middle-content, omega.", + ].join(" "), + validate: (content) => { + const normalized = content.replace(/\r/g, ""); + const lines = normalized.endsWith("\n") + ? 
normalized.slice(0, -1).split("\n") + : normalized.split("\n"); + if (lines.length !== 3) { + return { + passed: false, + reason: `expected 3 lines, got ${lines.length}`, + }; + } + if (lines[0] !== "alpha" || lines[2] !== "omega") { + return { + passed: false, + reason: "non-target lines changed unexpectedly", + }; + } + if (lines[1].trim() !== "middle-content") { + return { + passed: false, + reason: `line 2 expected 'middle-content' but got ${JSON.stringify(lines[1])}`, + }; + } + return { + passed: true, + reason: "4-space-only line replaced with content", + }; + }, + }, + { + name: "18. Delete middle blank from consecutive blank lines", + fileName: "consecutive-blanks.txt", + fileContent: ["top", "", "", "", "bottom"].join("\n"), + prompt: [ + "Read consecutive-blanks.txt with read_file.", + "Delete only the middle blank line (line 3 of 5) using edit_file with edits: [{ op: 'replace', pos: '', lines: [] }].", + "Keep the other two blank lines intact.", + ].join(" "), + validate: (content) => { + const normalized = content.replace(/\r/g, ""); + const lines = normalized.endsWith("\n") + ? normalized.slice(0, -1).split("\n") + : normalized.split("\n"); + const expected = ["top", "", "", "bottom"]; + if (lines.length !== expected.length) { + return { + passed: false, + reason: `expected ${expected.length} lines after deleting one blank, got ${lines.length}`, + }; + } + for (let i = 0; i < expected.length; i++) { + if (lines[i] !== expected[i]) { + return { + passed: false, + reason: `line ${i + 1} expected ${JSON.stringify(expected[i])} but got ${JSON.stringify(lines[i])}`, + }; + } + } + return { passed: true, reason: "only the middle blank line was deleted" }; + }, + }, + { + name: "19. 
Indentation increase (2 spaces -> 8 spaces)", + fileName: "indent-increase.js", + fileContent: ["if (flag) {", " execute();", "}"].join("\n"), + prompt: [ + "Read indent-increase.js with read_file.", + "Replace line 2 using edit_file with edits: [{ op: 'replace', pos: '', lines: [' execute();'] }].", + "Expected line 2 indentation increased from 2 spaces to 8 spaces.", + ].join(" "), + validate: (content) => { + const normalized = content.replace(/\r/g, ""); + const lines = normalized.endsWith("\n") + ? normalized.slice(0, -1).split("\n") + : normalized.split("\n"); + if (lines.length !== 3) { + return { + passed: false, + reason: `expected 3 lines, got ${lines.length}`, + }; + } + if (lines[1] !== " execute();") { + return { + passed: false, + reason: `line 2 expected 8-space indentation, got ${JSON.stringify(lines[1])}`, + }; + } + if (lines[0] !== "if (flag) {" || lines[2] !== "}") { + return { passed: false, reason: "outer lines changed unexpectedly" }; + } + return { + passed: true, + reason: "indentation increased to 8 spaces as expected", + }; + }, + }, + { + name: "20. Content that resembles hashline format", + fileName: "hashline-content.txt", + fileContent: ["anchor: old", "tail"].join("\n"), + prompt: [ + "Read hashline-content.txt with read_file.", + "Replace line 1 using edit_file with edits: [{ op: 'replace', pos: '', lines: ['anchor: 1#AB format is used'] }].", + "Expected line 1 exactly: anchor: 1#AB format is used.", + ].join(" "), + validate: (content) => { + const lines = content.replace(/\r/g, "").trimEnd().split("\n"); + if (lines[0] !== "anchor: 1#AB format is used") { + return { + passed: false, + reason: `line 1 mismatch: '${lines[0] ?? ""}'`, + }; + } + return { + passed: true, + reason: "hashline-like literal content preserved correctly", + }; + }, + }, + { + name: "21. 
Literal backslash-n content", + fileName: "literal-backslash-n.txt", + fileContent: ["placeholder", "tail"].join("\n"), + prompt: [ + "Read literal-backslash-n.txt with read_file.", + "Replace line 1 using edit_file with edits: [{ op: 'replace', pos: '', lines: ['line1\\nline2 (literal backslash-n, not newline)'] }].", + "Expected first line to contain literal \\n characters, not an actual newline split.", + ].join(" "), + validate: (content) => { + const expected = "line1\\nline2 (literal backslash-n, not newline)"; + const lines = content.replace(/\r/g, "").trimEnd().split("\n"); + if (lines.length !== 2) { + return { + passed: false, + reason: `expected 2 lines total, got ${lines.length}`, + }; + } + if (lines[0] !== expected) { + return { + passed: false, + reason: `line 1 expected '${expected}' but got '${lines[0] ?? ""}'`, + }; + } + return { + passed: true, + reason: "literal \\n sequence preserved in a single line", + }; + }, + }, + { + name: "22. Append multiple lines at once", + fileName: "append-multi.txt", + fileContent: ["header", "anchor-line", "footer"].join("\n"), + prompt: [ + "Read append-multi.txt with read_file.", + "Append three lines after anchor-line (line 2) using edit_file with edits: [{ op: 'append', pos: '', lines: ['item-a', 'item-b', 'item-c'] }].", + "Expected final order: header, anchor-line, item-a, item-b, item-c, footer.", + ].join(" "), + validate: (content) => { + const lines = content.replace(/\r/g, "").trimEnd().split("\n"); + const expected = [ + "header", + "anchor-line", + "item-a", + "item-b", + "item-c", + "footer", + ]; + if (lines.length !== expected.length) { + return { + passed: false, + reason: `expected ${expected.length} lines, got ${lines.length}`, + }; + } + for (let i = 0; i < expected.length; i++) { + if (lines[i] !== expected[i]) { + return { + passed: false, + reason: `line ${i + 1} expected '${expected[i]}' but got '${lines[i]}'`, + }; + } + } + return { + passed: true, + reason: "three lines appended in a 
single append edit", + }; + }, + }, + { + name: "23. Replace long line with single short word", + fileName: "shrink-line.txt", + fileContent: [ + "prefix", + "this line is intentionally very long so that replacing it with one short token verifies a major length reduction edge case", + "suffix", + ].join("\n"), + prompt: [ + "Read shrink-line.txt with read_file.", + "Replace the long line 2 using edit_file with edits: [{ op: 'replace', pos: '', lines: ['short'] }].", + "Expected final line 2 exactly: short.", + ].join(" "), + validate: (content) => { + const lines = content.replace(/\r/g, "").trimEnd().split("\n"); + if (lines[1] !== "short") { + return { + passed: false, + reason: `line 2 expected 'short' but got '${lines[1] ?? ""}'`, + }; + } + if (content.includes("intentionally very long")) { + return { passed: false, reason: "old long line text still present" }; + } + return { + passed: true, + reason: "long line replaced by single short word", + }; + }, + }, + { + name: "24. Edit file with no trailing newline", + fileName: "no-trailing-newline.txt", + fileContent: "first\nsecond\nthird", + prompt: [ + "Read no-trailing-newline.txt with read_file.", + "Replace line 2 using edit_file with edits: [{ op: 'replace', pos: '', lines: ['SECOND'] }].", + "Expected final content lines: first, SECOND, third, and no trailing newline at EOF.", + ].join(" "), + validate: (content) => { + const normalized = content.replace(/\r/g, ""); + const lines = normalized.split("\n"); + if (lines.length !== 3) { + return { + passed: false, + reason: `expected 3 lines, got ${lines.length}`, + }; + } + if ( + lines[0] !== "first" || + lines[1] !== "SECOND" || + lines[2] !== "third" + ) { + return { + passed: false, + reason: `unexpected lines: ${JSON.stringify(lines)}`, + }; + } + if (normalized.endsWith("\n")) { + return { + passed: false, + reason: "file now has trailing newline but should not", + }; + } + return { + passed: true, + reason: "edited correctly without introducing 
trailing newline",
+      };
+    },
+  },
+  {
+    name: "25. Prepend at BOF without pos anchor",
+    fileName: "prepend-bof.js",
+    fileContent: ["console.log('hello');", "console.log('done');"].join("\n"),
+    prompt: [
+      "Read prepend-bof.js with read_file.",
+      "Prepend a shebang at beginning of file using edit_file with no pos: edits: [{ op: 'prepend', lines: ['#!/usr/bin/env node'] }].",
+      "Do not include a pos field. Expected first line: #!/usr/bin/env node.",
+    ].join(" "),
+    validate: (content) => {
+      const lines = content.replace(/\r/g, "").trimEnd().split("\n");
+      const expected = [
+        "#!/usr/bin/env node",
+        "console.log('hello');",
+        "console.log('done');",
+      ];
+      if (lines.length !== expected.length) {
+        return {
+          passed: false,
+          reason: `expected ${expected.length} lines, got ${lines.length}`,
+        };
+      }
+      for (let i = 0; i < expected.length; i++) {
+        if (lines[i] !== expected[i]) {
+          return {
+            passed: false,
+            reason: `line ${i + 1} expected '${expected[i]}' but got '${lines[i]}'`,
+          };
+        }
+      }
+      return {
+        passed: true,
+        reason: "shebang prepended at BOF without pos anchor",
+      };
+    },
+  },
+];
+
+// ── JSONL event types ─────────────────────────────────────────
+/** A `tool_call` event parsed from one JSON line of the headless run's stdout. */
+interface ToolCallEvent {
+  tool_call_id: string;
+  tool_input: Record<string, unknown>;
+  tool_name: string;
+  type: "tool_call";
+}
+
+/** A `tool_result` event; `error` is set when the tool call was rejected. */
+interface ToolResultEvent {
+  error?: string;
+  output: string;
+  tool_call_id: string;
+  type: "tool_result";
+}
+
+/** Loosest shape accepted while parsing: any JSON object carrying a `type` tag. */
+interface AnyEvent {
+  type: string;
+  [key: string]: unknown;
+}
+
+/** Wall-clock budget for a single headless run before it is terminated. */
+const HEADLESS_TIMEOUT_MS = 4 * 60 * 1000;
+
+/**
+ * Spawns `bun <args>` in `cwd` and resolves with everything written to stdout.
+ *
+ * Rejects when the process cannot be spawned, when it exits with a non-zero
+ * code (the message carries the last 500 characters of stderr), or when it is
+ * still running after {@link HEADLESS_TIMEOUT_MS}.
+ */
+function runHeadless(args: string[], cwd: string): Promise<string> {
+  return new Promise<string>((res, reject) => {
+    const proc = spawn("bun", args, {
+      cwd,
+      env: { ...process.env, BUN_INSTALL: process.env.BUN_INSTALL },
+      stdio: ["ignore", "pipe", "pipe"],
+    });
+
+    let stdout = "";
+    let stderr = "";
+
+    proc.stdout.on("data", (chunk: Buffer) => {
+      stdout += chunk.toString();
+    });
+    proc.stderr.on("data", (chunk: Buffer) => {
+      stderr += chunk.toString();
+    });
+
+    const timeout = setTimeout(() => {
+      proc.kill("SIGTERM");
+      reject(new Error("Timed out after 4 minutes"));
+    }, HEADLESS_TIMEOUT_MS);
+
+    proc.on("close", (code) => {
+      clearTimeout(timeout);
+      if (code !== 0) {
+        reject(new Error(`Exit code ${code}\n${stderr.slice(-500)}`));
+      } else {
+        res(stdout);
+      }
+    });
+    proc.on("error", (err) => {
+      clearTimeout(timeout);
+      reject(err);
+    });
+  });
+}
+
+/** Parses JSONL output into events; blank lines and non-JSON lines are skipped. */
+function parseJsonlEvents(output: string): AnyEvent[] {
+  const events: AnyEvent[] = [];
+  for (const line of output.split("\n")) {
+    if (!line.trim()) {
+      continue;
+    }
+    try {
+      events.push(JSON.parse(line) as AnyEvent);
+    } catch {
+      // skip non-JSON
+    }
+  }
+  return events;
+}
+
+// ── Run single test case ─────────────────────────────────────
+/**
+ * Runs one test case: writes the fixture file, runs `headless.ts` with the
+ * case's prompt, counts `edit_file` calls and their results, then validates
+ * the file left on disk.
+ *
+ * @param tc - Test case providing the fixture, the prompt and the validator.
+ * @param testDir - Directory the fixture is written to and the run executes in.
+ * @returns Pass/fail, the validator's reason, edit-call counts, run duration in
+ *   milliseconds, and the final file content (`null` when the file is gone).
+ * @throws When the headless process fails to spawn, exits non-zero or times out.
+ */
+async function runTestCase(
+  tc: TestCase,
+  testDir: string
+): Promise<{
+  passed: boolean;
+  editCalls: number;
+  editSuccesses: number;
+  duration: number;
+  reason: string;
+  finalContent: string | null;
+}> {
+  const testFile = join(testDir, tc.fileName);
+  writeFileSync(testFile, tc.fileContent, "utf-8");
+
+  const headlessScript = resolve(import.meta.dir, "headless.ts");
+  const headlessArgs = [
+    "run",
+    headlessScript,
+    "-p",
+    tc.prompt,
+    "--no-translate",
+    ...extraArgs,
+  ];
+
+  const startTime = Date.now();
+  const output = await runHeadless(headlessArgs, testDir);
+  const duration = Date.now() - startTime;
+
+  const events = parseJsonlEvents(output);
+
+  const toolCalls = events.filter(
+    (e) => e.type === "tool_call"
+  ) as unknown as ToolCallEvent[];
+  const toolResults = events.filter(
+    (e) => e.type === "tool_result"
+  ) as unknown as ToolResultEvent[];
+
+  const editCalls = toolCalls.filter((e) => e.tool_name === "edit_file");
+  const editCallIds = new Set(editCalls.map((e) => e.tool_call_id));
+  const editResults = toolResults.filter((e) =>
+    editCallIds.has(e.tool_call_id)
+  );
+  const editSuccesses = editResults.filter((e) => !e.error);
+
+  // Show blocked calls
+  const editErrors = editResults.filter((e) => e.error);
+  for (const err of editErrors) {
+    const matchingCall = editCalls.find(
+      (c) => c.tool_call_id === err.tool_call_id
+    );
+    info(` blocked: ${err.error?.slice(0, 120)}`);
+    if (matchingCall) {
+      info(` input: ${JSON.stringify(matchingCall.tool_input).slice(0, 200)}`);
+    }
+  }
+
+  // Validate file content
+  let finalContent: string;
+  try {
+    finalContent = readFileSync(testFile, "utf-8");
+  } catch {
+    // The file is gone: report a plain failure so the caller never has to
+    // re-read it.
+    return {
+      passed: false,
+      editCalls: editCalls.length,
+      editSuccesses: editSuccesses.length,
+      duration,
+      reason: `${tc.fileName} could not be read after the run`,
+      finalContent: null,
+    };
+  }
+
+  const validation = tc.validate(finalContent);
+
+  return {
+    passed: validation.passed,
+    editCalls: editCalls.length,
+    editSuccesses: editSuccesses.length,
+    duration,
+    reason: validation.reason,
+    finalContent,
+  };
+}
+
+// ── Main ──────────────────────────────────────────────────────
+/**
+ * Runs every entry of `TEST_CASES` sequentially inside a fresh temp directory,
+ * prints a per-case verdict and a summary, removes the directory, and exits
+ * with code 0 only when all cases passed (1 otherwise).
+ */
+const main = async () => {
+  console.log(
+    `\n${BOLD}Headless Edit Operations Test — ${TEST_CASES.length} Types${RESET}\n`
+  );
+
+  const testDir = join(tmpdir(), `edit-ops-${Date.now()}`);
+  mkdirSync(testDir, { recursive: true });
+  info(`Test dir: ${testDir}`);
+  console.log();
+
+  let totalPassed = 0;
+  const results: { name: string; passed: boolean; detail: string }[] = [];
+
+  for (const tc of TEST_CASES) {
+    console.log(`${CYAN}${BOLD}${tc.name}${RESET}`);
+    info(`File: ${tc.fileName}`);
+    info(`Prompt: "${tc.prompt.slice(0, 80)}..."`);
+
+    try {
+      const result = await runTestCase(tc, testDir);
+      const status = result.passed
+        ? `${GREEN}PASS${RESET}`
+        : `${RED}FAIL${RESET}`;
+      const detail = `edit_file: ${result.editSuccesses}/${result.editCalls} succeeded, ${(result.duration / 1000).toFixed(1)}s`;
+
+      console.log(` ${status} — ${detail}`);
+
+      if (result.passed) {
+        totalPassed++;
+        pass(result.reason);
+      } else {
+        fail(result.reason);
+        // Dump what the model left behind, when there is anything to dump.
+        if (result.finalContent !== null) {
+          info(
+            `Final content:\n${result.finalContent
+              .split("\n")
+              .map((l, i) => ` ${i + 1}: ${l}`)
+              .join("\n")}`
+          );
+        }
+      }
+
+      results.push({ name: tc.name, passed: result.passed, detail });
+    } catch (error) {
+      const msg = error instanceof Error ? error.message : String(error);
+      console.log(` ${RED}ERROR${RESET} — ${msg.slice(0, 200)}`);
+      fail(msg.slice(0, 200));
+      results.push({ name: tc.name, passed: false, detail: msg.slice(0, 100) });
+    }
+
+    // Reset file for next test (in case of side effects)
+    try {
+      rmSync(join(testDir, tc.fileName), { force: true });
+    } catch (error) {
+      warn(`cleanup failed for ${tc.fileName}: ${error}`);
+    }
+
+    console.log();
+  }
+
+  // Summary
+  console.log(`${BOLD}━━━ Summary ━━━${RESET}`);
+  for (const r of results) {
+    const icon = r.passed ? `${GREEN}✓${RESET}` : `${RED}✗${RESET}`;
+    console.log(` ${icon} ${r.name} — ${r.detail}`);
+  }
+  console.log();
+  console.log(
+    `${BOLD}Result: ${totalPassed}/${TEST_CASES.length} passed (${Math.round((totalPassed / TEST_CASES.length) * 100)}%)${RESET}`
+  );
+
+  // Cleanup
+  try {
+    rmSync(testDir, { recursive: true, force: true });
+  } catch (error) {
+    warn(`cleanup failed for ${testDir}: ${error}`);
+  }
+
+  if (totalPassed === TEST_CASES.length) {
+    console.log(
+      `\n${BOLD}${GREEN}🎉 ALL TESTS PASSED — 100% success rate!${RESET}\n`
+    );
+    process.exit(0);
+  } else {
+    console.log(`\n${BOLD}${RED}Some tests failed.${RESET}\n`);
+    process.exit(1);
+  }
+};
+
+// A rejection escaping main() (e.g. mkdirSync failing) must still produce a
+// non-zero exit code instead of an unhandled rejection.
+main().catch((error: unknown) => {
+  const msg = error instanceof Error ? error.message : String(error);
+  console.error(`${RED}Fatal: ${msg}${RESET}`);
+  process.exit(1);
+});
diff --git a/benchmarks/test-edit-ops.ts b/benchmarks/test-edit-ops.ts
new file mode 100644
index 00000000..05d63b4d
--- /dev/null
+++ b/benchmarks/test-edit-ops.ts
@@ -0,0 +1,808 @@
+#!/usr/bin/env bun
+/**
+ * Comprehensive headless edit_file stress test: 21 operation types
+ *
+ * Tests: 5 basic ops + 10 creative cases + 6 whitespace cases
+ * Each runs via headless mode with its own demo file + prompt.
+ *
+ * Usage:
+ *   bun run benchmarks/test-edit-ops.ts [-m <model>] [--provider <provider>]
+ */
+
+import { spawn } from "node:child_process";
+import { mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join, resolve } from "node:path";
+
+// ── CLI arg passthrough ───────────────────────────────────────
+// Only the flags listed here are forwarded to the headless run; anything else
+// on the command line is ignored.
+/** Flags forwarded together with the value that follows them. */
+const VALUE_FLAGS = new Set(["-m", "--model", "--provider", "--reasoning-mode"]);
+/** Boolean flags forwarded on their own. */
+const SWITCH_FLAGS = new Set(["--think", "--no-translate"]);
+
+const extraArgs: string[] = [];
+const rawArgs = process.argv.slice(2);
+for (let i = 0; i < rawArgs.length; i++) {
+  const arg = rawArgs[i];
+  const hasValue = i + 1 < rawArgs.length;
+  if (VALUE_FLAGS.has(arg) && hasValue) {
+    extraArgs.push(arg, rawArgs[i + 1]);
+    i++; // the value was consumed together with its flag
+  } else if (SWITCH_FLAGS.has(arg)) {
+    extraArgs.push(arg);
+  }
+}
+
+// ── Colors ────────────────────────────────────────────────────
+// ANSI SGR escape sequences used to style console output.
+const BOLD = "\x1b[1m";
+const GREEN = "\x1b[32m";
+const RED = "\x1b[31m";
+const YELLOW = "\x1b[33m";
+const DIM = "\x1b[2m";
+const CYAN = "\x1b[36m";
+const RESET = "\x1b[0m";
+
+/** Prints `msg` behind a green check mark. */
+const pass = (msg: string): void => {
+  console.log(` ${GREEN}✓${RESET} ${msg}`);
+};
+/** Prints `msg` behind a red cross. */
+const fail = (msg: string): void => {
+  console.log(` ${RED}✗${RESET} ${msg}`);
+};
+/** Prints `msg` dimmed. */
+const info = (msg: string): void => {
+  console.log(` ${DIM}${msg}${RESET}`);
+};
+/** Prints `msg` behind a yellow warning sign. */
+const warn = (msg: string): void => {
+  console.log(` ${YELLOW}⚠${RESET} ${msg}`);
+};
+
+// ── Test case definition ─────────────────────────────────────
+/** Verdict returned by a test case's validator. */
+interface ValidationResult {
+  passed: boolean;
+  reason: string;
+}
+
+/** One scenario: a fixture file, the prompt given to the model, and a validator. */
+interface TestCase {
+  /** Initial content written to `fileName` before the run. */
+  fileContent: string;
+  /** Fixture file name, relative to the test directory. */
+  fileName: string;
+  /** Label printed for this case. */
+  name: string;
+  /** Prompt passed to the headless run. */
+  prompt: string;
+  /** Judges the file content found after the run. */
+  validate: (content: string) => ValidationResult;
+}
+
+const TEST_CASES: TestCase[] = [
+  {
+    name: "1. 
Replace single line", + fileName: "config.txt", + fileContent: [ + "host: localhost", + "port: 3000", + "debug: false", + "timeout: 30", + "retries: 3", + ].join("\n"), + prompt: [ + "Follow these steps exactly:", + "Step 1: Call read_file on config.txt.", + "Step 2: Note the anchor for the port line (line 2).", + "Step 3: Call edit_file with path='config.txt' and edits containing ONE object:", + " { op: 'replace', pos: '', lines: ['port: 8080'] }", + "IMPORTANT: pos must be ONLY the anchor (like '2#KB'). lines must be a SEPARATE array field with the new content.", + ].join(" "), + validate: (content) => { + const has8080 = content.includes("port: 8080"); + const has3000 = content.includes("port: 3000"); + if (has8080 && !has3000) { + return { passed: true, reason: "port changed to 8080" }; + } + if (has3000) { + return { passed: false, reason: "port still 3000 — edit not applied" }; + } + return { + passed: false, + reason: `unexpected content: ${content.slice(0, 100)}`, + }; + }, + }, + { + name: "2. Append after line", + fileName: "fruits.txt", + fileContent: ["apple", "banana", "cherry"].join("\n"), + prompt: + "Read fruits.txt with read_file. Then use edit_file with op='append' to insert a new line 'grape' after the 'banana' line. 
Use pos='LINE#HASH' of the banana line and lines=['grape'].", + validate: (content) => { + const lines = content.trim().split("\n"); + const bananaIdx = lines.findIndex((l) => l.trim() === "banana"); + const grapeIdx = lines.findIndex((l) => l.trim() === "grape"); + if (grapeIdx === -1) { + return { passed: false, reason: '"grape" not found in file' }; + } + if (bananaIdx === -1) { + return { passed: false, reason: '"banana" was removed' }; + } + if (grapeIdx !== bananaIdx + 1) { + return { + passed: false, + reason: `"grape" at line ${grapeIdx + 1} but expected after "banana" at line ${bananaIdx + 1}`, + }; + } + if (lines.length !== 4) { + return { + passed: false, + reason: `expected 4 lines, got ${lines.length}`, + }; + } + return { + passed: true, + reason: '"grape" correctly appended after "banana"', + }; + }, + }, + { + name: "3. Prepend before line", + fileName: "code.txt", + fileContent: ["function greet() {", ' return "hello";', "}"].join("\n"), + prompt: + "Read code.txt with read_file. Then use edit_file with op='prepend' to add '// Greeting function' before the function line. Use pos='LINE#HASH' of the function line and lines=['// Greeting function'].", + validate: (content) => { + const lines = content.trim().split("\n"); + const commentIdx = lines.findIndex( + (l) => l.trim().startsWith("//") && l.toLowerCase().includes("greet") + ); + const funcIdx = lines.findIndex((l) => + l.trim().startsWith("function greet") + ); + if (commentIdx === -1) { + return { passed: false, reason: "comment line not found" }; + } + if (funcIdx === -1) { + return { passed: false, reason: '"function greet" line was removed' }; + } + if (commentIdx !== funcIdx - 1) { + return { + passed: false, + reason: `comment at line ${commentIdx + 1} but function at ${funcIdx + 1} — not directly before`, + }; + } + return { + passed: true, + reason: "comment correctly prepended before function", + }; + }, + }, + { + name: "4. 
Range replace (multi-line → single line)", + fileName: "log.txt", + fileContent: [ + "=== Log Start ===", + "INFO: started", + "WARN: slow query", + "ERROR: timeout", + "INFO: recovered", + "=== Log End ===", + ].join("\n"), + prompt: [ + "Follow these steps exactly:", + "Step 1: Call read_file on log.txt to see line anchors.", + "Step 2: Note the anchor for 'WARN: slow query' (line 3) and 'ERROR: timeout' (line 4).", + "Step 3: Call edit_file with path='log.txt' and edits containing ONE object with THREE separate JSON fields:", + " { op: 'replace', pos: '', end: '', lines: ['RESOLVED: issues cleared'] }", + "CRITICAL: pos, end, and lines are THREE SEPARATE JSON fields. pos is ONLY '3#XX'. end is ONLY '4#YY'. lines is ['RESOLVED: issues cleared'].", + "If edit_file fails or errors, use write_file to write the complete correct file content instead.", + "The correct final content should be: === Log Start ===, INFO: started, RESOLVED: issues cleared, INFO: recovered, === Log End ===", + "Do not make any other changes.", + ].join(" "), + validate: (content) => { + const lines = content.trim().split("\n"); + const hasResolved = lines.some( + (l) => l.trim() === "RESOLVED: issues cleared" + ); + const hasWarn = content.includes("WARN: slow query"); + const hasError = content.includes("ERROR: timeout"); + if (!hasResolved) { + return { + passed: false, + reason: '"RESOLVED: issues cleared" not found', + }; + } + if (hasWarn || hasError) { + return { passed: false, reason: "old WARN/ERROR lines still present" }; + } + // Core assertion: 2 old lines removed, 1 new line added = net -1 line + // Allow slight overshoot from model adding extra content + if (lines.length < 4 || lines.length > 6) { + return { + passed: false, + reason: `expected ~5 lines, got ${lines.length}`, + }; + } + return { + passed: true, + reason: "range replace succeeded — 2 lines → 1 line", + }; + }, + }, + { + name: "5. 
Delete line", + fileName: "settings.txt", + fileContent: [ + "mode: production", + "debug: true", + "cache: enabled", + "log_level: info", + ].join("\n"), + prompt: [ + "Follow these steps exactly:", + "Step 1: Call read_file on settings.txt to see line anchors.", + "Step 2: Note the anchor for 'debug: true' (line 2).", + "Step 3: Call edit_file with path='settings.txt' and edits containing ONE object:", + " { op: 'replace', pos: '', lines: [] }", + "IMPORTANT: lines must be an empty array [] to delete the line. pos must be ONLY the anchor like '2#SR'.", + ].join(" "), + validate: (content) => { + const lines = content.trim().split("\n"); + const hasDebug = content.includes("debug: true"); + if (hasDebug) { + return { passed: false, reason: '"debug: true" still present' }; + } + if (lines.length !== 3) { + return { + passed: false, + reason: `expected 3 lines, got ${lines.length}`, + }; + } + if ( + !( + content.includes("mode: production") && + content.includes("cache: enabled") + ) + ) { + return { passed: false, reason: "other lines were removed" }; + } + return { passed: true, reason: '"debug: true" successfully deleted' }; + }, + }, + + // ── Creative cases (6-15) ──────────────────────────────────── + { + name: "6. 
Batch edit — two replacements in one call", + fileName: "batch.txt", + fileContent: ["red", "green", "blue", "yellow"].join("\n"), + prompt: [ + "Read batch.txt with read_file.", + "Then call edit_file ONCE with path='batch.txt' and edits containing TWO objects:", + " 1) { op: 'replace', pos: '', lines: ['crimson'] }", + " 2) { op: 'replace', pos: '', lines: ['navy'] }", + "Both edits must be in the SAME edits array in a single edit_file call.", + ].join(" "), + validate: (c) => { + const lines = c.trim().split("\n"); + if (!c.includes("crimson")) return { passed: false, reason: "'crimson' not found" }; + if (!c.includes("navy")) return { passed: false, reason: "'navy' not found" }; + if (c.includes("red")) return { passed: false, reason: "'red' still present" }; + if (c.includes("blue")) return { passed: false, reason: "'blue' still present" }; + if (lines.length !== 4) return { passed: false, reason: `expected 4 lines, got ${lines.length}` }; + return { passed: true, reason: "both lines replaced in single call" }; + }, + }, + { + name: "7. 
Line expansion — 1 line → 3 lines", + fileName: "expand.txt", + fileContent: ["header", "TODO: implement", "footer"].join("\n"), + prompt: [ + "Read expand.txt with read_file.", + "Replace the 'TODO: implement' line (line 2) with THREE lines:", + " 'step 1: init', 'step 2: process', 'step 3: cleanup'", + "Use edit_file with op='replace', pos=, lines=['step 1: init', 'step 2: process', 'step 3: cleanup'].", + ].join(" "), + validate: (c) => { + const lines = c.trim().split("\n"); + if (c.includes("TODO")) return { passed: false, reason: "TODO line still present" }; + if (!c.includes("step 1: init")) return { passed: false, reason: "'step 1: init' not found" }; + if (!c.includes("step 3: cleanup")) return { passed: false, reason: "'step 3: cleanup' not found" }; + if (lines.length !== 5) return { passed: false, reason: `expected 5 lines, got ${lines.length}` }; + return { passed: true, reason: "1 line expanded to 3 lines" }; + }, + }, + { + name: "8. Append at EOF", + fileName: "eof.txt", + fileContent: ["line one", "line two"].join("\n"), + prompt: [ + "Read eof.txt with read_file.", + "Use edit_file to append 'line three' after the LAST line of the file.", + "Use op='append', pos=, lines=['line three'].", + ].join(" "), + validate: (c) => { + const lines = c.trim().split("\n"); + if (!c.includes("line three")) return { passed: false, reason: "'line three' not found" }; + if (lines[lines.length - 1].trim() !== "line three") + return { passed: false, reason: "'line three' not at end" }; + if (lines.length !== 3) return { passed: false, reason: `expected 3 lines, got ${lines.length}` }; + return { passed: true, reason: "appended at EOF" }; + }, + }, + { + name: "9. 
Special characters in content", + fileName: "special.json", + fileContent: [ + '{', + ' "name": "old-value",', + ' "count": 42', + '}', + ].join("\n"), + prompt: [ + "Read special.json with read_file.", + 'Replace the line containing \"name\": \"old-value\" with \"name\": \"new-value\".', + "Use edit_file with op='replace', pos=, lines=[' \"name\": \"new-value\",'].", + ].join(" "), + validate: (c) => { + if (c.includes("old-value")) return { passed: false, reason: "'old-value' still present" }; + if (!c.includes('"new-value"')) return { passed: false, reason: "'new-value' not found" }; + if (!c.includes('"count": 42')) return { passed: false, reason: "other content was modified" }; + return { passed: true, reason: "JSON value replaced with special chars intact" }; + }, + }, + { + name: "10. Replace first line", + fileName: "first.txt", + fileContent: ["OLD HEADER", "body content", "footer"].join("\n"), + prompt: [ + "Read first.txt with read_file.", + "Replace the very first line 'OLD HEADER' with 'NEW HEADER'.", + "Use edit_file with op='replace', pos=, lines=['NEW HEADER'].", + ].join(" "), + validate: (c) => { + const lines = c.trim().split("\n"); + if (c.includes("OLD HEADER")) return { passed: false, reason: "'OLD HEADER' still present" }; + if (lines[0].trim() !== "NEW HEADER") return { passed: false, reason: "first line is not 'NEW HEADER'" }; + if (!c.includes("body content")) return { passed: false, reason: "body was modified" }; + return { passed: true, reason: "first line replaced" }; + }, + }, + { + name: "11. 
Replace last line", + fileName: "last.txt", + fileContent: ["alpha", "bravo", "OLD_FOOTER"].join("\n"), + prompt: [ + "Read last.txt with read_file.", + "Replace the last line 'OLD_FOOTER' with 'NEW_FOOTER'.", + "Use edit_file with op='replace', pos=, lines=['NEW_FOOTER'].", + ].join(" "), + validate: (c) => { + const lines = c.trim().split("\n"); + if (c.includes("OLD_FOOTER")) return { passed: false, reason: "'OLD_FOOTER' still present" }; + if (lines[lines.length - 1].trim() !== "NEW_FOOTER") + return { passed: false, reason: "last line is not 'NEW_FOOTER'" }; + return { passed: true, reason: "last line replaced" }; + }, + }, + { + name: "12. Adjacent line edits", + fileName: "adjacent.txt", + fileContent: ["aaa", "bbb", "ccc", "ddd"].join("\n"), + prompt: [ + "Read adjacent.txt with read_file.", + "Replace line 2 ('bbb') with 'BBB' and line 3 ('ccc') with 'CCC'.", + "Use edit_file with TWO edits in the same call:", + " { op: 'replace', pos: , lines: ['BBB'] }", + " { op: 'replace', pos: , lines: ['CCC'] }", + ].join(" "), + validate: (c) => { + const lines = c.trim().split("\n"); + if (c.includes("bbb")) return { passed: false, reason: "'bbb' still present" }; + if (c.includes("ccc")) return { passed: false, reason: "'ccc' still present" }; + if (!c.includes("BBB")) return { passed: false, reason: "'BBB' not found" }; + if (!c.includes("CCC")) return { passed: false, reason: "'CCC' not found" }; + if (lines.length !== 4) return { passed: false, reason: `expected 4 lines, got ${lines.length}` }; + return { passed: true, reason: "two adjacent lines replaced" }; + }, + }, + { + name: "13. 
Prepend multi-line block", + fileName: "block.py", + fileContent: ["def main():", " print('hello')", "", "main()"].join("\n"), + prompt: [ + "Read block.py with read_file.", + "Prepend a 2-line comment block before 'def main():' (line 1).", + "The two lines are: '# Author: test' and '# Date: 2025-01-01'.", + "Use edit_file with op='prepend', pos=, lines=['# Author: test', '# Date: 2025-01-01'].", + ].join(" "), + validate: (c) => { + const lines = c.trim().split("\n"); + if (!c.includes("# Author: test")) return { passed: false, reason: "author comment not found" }; + if (!c.includes("# Date: 2025-01-01")) return { passed: false, reason: "date comment not found" }; + const defIdx = lines.findIndex((l) => l.startsWith("def main")); + const authorIdx = lines.findIndex((l) => l.includes("Author")); + if (authorIdx >= defIdx) return { passed: false, reason: "comments not before def" }; + return { passed: true, reason: "2-line block prepended before function" }; + }, + }, + { + name: "14. Delete range — 3 consecutive lines", + fileName: "cleanup.txt", + fileContent: ["keep1", "remove-a", "remove-b", "remove-c", "keep2"].join("\n"), + prompt: [ + "Read cleanup.txt with read_file.", + "Delete lines 2-4 ('remove-a', 'remove-b', 'remove-c') using a single range replace.", + "Use edit_file with op='replace', pos=, end=, lines=[].", + "An empty lines array deletes the range.", + ].join(" "), + validate: (c) => { + const lines = c.trim().split("\n"); + if (c.includes("remove")) return { passed: false, reason: "'remove' lines still present" }; + if (!c.includes("keep1")) return { passed: false, reason: "'keep1' was deleted" }; + if (!c.includes("keep2")) return { passed: false, reason: "'keep2' was deleted" }; + if (lines.length !== 2) return { passed: false, reason: `expected 2 lines, got ${lines.length}` }; + return { passed: true, reason: "3 consecutive lines deleted via range" }; + }, + }, + { + name: "15. 
Replace with duplicate-content line", + fileName: "dupes.txt", + fileContent: ["item", "item", "item", "item"].join("\n"), + prompt: [ + "Read dupes.txt with read_file. All 4 lines have the same text 'item'.", + "Replace ONLY line 3 with 'CHANGED'. Do NOT modify any other line.", + "Use edit_file with op='replace', pos=, lines=['CHANGED'].", + "The anchor hash uniquely identifies line 3 even though the content is identical.", + ].join(" "), + validate: (c) => { + const lines = c.trim().split("\n"); + if (!c.includes("CHANGED")) return { passed: false, reason: "'CHANGED' not found" }; + const changedCount = lines.filter((l) => l.trim() === "CHANGED").length; + const itemCount = lines.filter((l) => l.trim() === "item").length; + if (changedCount !== 1) return { passed: false, reason: `expected 1 CHANGED, got ${changedCount}` }; + if (itemCount !== 3) return { passed: false, reason: `expected 3 item lines, got ${itemCount}` }; + if (lines.length !== 4) return { passed: false, reason: `expected 4 lines, got ${lines.length}` }; + return { passed: true, reason: "only line 3 changed among duplicates" }; + }, + }, + + // ── Whitespace cases (16-21) ────────────────────────────────── + { + name: "16. Fix indentation — 2 spaces → 4 spaces", + fileName: "indent.js", + fileContent: ["function foo() {", " const x = 1;", " return x;", "}"].join("\n"), + prompt: [ + "Read indent.js with read_file.", + "Replace line 2 ' const x = 1;' (2-space indent) with ' const x = 1;' (4-space indent).", + "Use edit_file with op='replace', pos=, lines=[' const x = 1;'].", + "The ONLY change is the indentation: 2 spaces → 4 spaces. 
Content stays the same.", + ].join(" "), + validate: (c) => { + const lines = c.split("\n"); + const line2 = lines[1]; + if (!line2) return { passed: false, reason: "line 2 missing" }; + if (line2 === " const x = 1;") return { passed: true, reason: "indentation fixed to 4 spaces" }; + if (line2 === " const x = 1;") return { passed: false, reason: "still 2-space indent" }; + return { passed: false, reason: `unexpected line 2: '${line2}'` }; + }, + }, + { + name: "17. Replace preserving leading whitespace", + fileName: "preserve.py", + fileContent: [ + "class Foo:", + " def old_method(self):", + " pass", + ].join("\n"), + prompt: [ + "Read preserve.py with read_file.", + "Replace line 2 ' def old_method(self):' with ' def new_method(self):'.", + "Keep the 4-space indentation. Only change the method name.", + "Use edit_file with op='replace', pos=, lines=[' def new_method(self):'].", + ].join(" "), + validate: (c) => { + if (c.includes("old_method")) return { passed: false, reason: "'old_method' still present" }; + const lines = c.split("\n"); + const methodLine = lines.find((l) => l.includes("new_method")); + if (!methodLine) return { passed: false, reason: "'new_method' not found" }; + if (!methodLine.startsWith(" ")) return { passed: false, reason: "indentation lost" }; + return { passed: true, reason: "method renamed with indentation preserved" }; + }, + }, + { + name: "18. 
Insert blank line between sections", + fileName: "sections.txt", + fileContent: ["[section-a]", "value-a=1", "[section-b]", "value-b=2"].join("\n"), + prompt: [ + "Read sections.txt with read_file.", + "Insert a blank empty line between 'value-a=1' (line 2) and '[section-b]' (line 3).", + "Use edit_file with op='append', pos=, lines=[''].", + "lines=[''] inserts one empty line.", + ].join(" "), + validate: (c) => { + const lines = c.split("\n"); + const valAIdx = lines.findIndex((l) => l.includes("value-a=1")); + const secBIdx = lines.findIndex((l) => l.includes("[section-b]")); + if (valAIdx === -1) return { passed: false, reason: "'value-a=1' missing" }; + if (secBIdx === -1) return { passed: false, reason: "'[section-b]' missing" }; + if (secBIdx - valAIdx < 2) return { passed: false, reason: "no blank line between sections" }; + const between = lines[valAIdx + 1]; + if (between.trim() !== "") return { passed: false, reason: `line between is '${between}', not blank` }; + return { passed: true, reason: "blank line inserted between sections" }; + }, + }, + { + name: "19. Delete blank line", + fileName: "noblank.txt", + fileContent: ["first", "", "second", "third"].join("\n"), + prompt: [ + "Read noblank.txt with read_file.", + "Delete the empty blank line (line 2). Use edit_file with op='replace', pos=, lines=[].", + ].join(" "), + validate: (c) => { + const lines = c.trim().split("\n"); + if (lines.length !== 3) return { passed: false, reason: `expected 3 lines, got ${lines.length}` }; + if (lines[0].trim() !== "first") return { passed: false, reason: "'first' not on line 1" }; + if (lines[1].trim() !== "second") return { passed: false, reason: "'second' not on line 2" }; + return { passed: true, reason: "blank line deleted" }; + }, + }, + { + name: "20. 
Tab → spaces conversion", + fileName: "tabs.txt", + fileContent: ["start", "\tindented-with-tab", "end"].join("\n"), + prompt: [ + "Read tabs.txt with read_file.", + "Replace the tab-indented line 2 using edit_file with edits: [{ op: 'replace', pos: '', lines: [' indented-with-spaces'] }].", + "Expected final line 2 to be 4 spaces followed by indented-with-spaces.", + ].join(" "), + validate: (c) => { + if (c.includes("\t")) return { passed: false, reason: "tab still present" }; + if (!c.includes(" indented-with-spaces")) + return { passed: false, reason: "' indented-with-spaces' not found" }; + if (!c.includes("start")) return { passed: false, reason: "'start' was modified" }; + return { passed: true, reason: "tab converted to 4 spaces" }; + }, + }, + { + name: "21. Deeply nested indent replacement", + fileName: "nested.ts", + fileContent: [ + "if (a) {", + " if (b) {", + " if (c) {", + " old_call();", + " }", + " }", + "}", + ].join("\n"), + prompt: [ + "Read nested.ts with read_file.", + "Replace line 4 ' old_call();' with ' new_call();'.", + "Preserve the exact 6-space indentation. Only change the function name.", + "Use edit_file with op='replace', pos=, lines=[' new_call();'].", + ].join(" "), + validate: (c) => { + if (c.includes("old_call")) return { passed: false, reason: "'old_call' still present" }; + const lines = c.split("\n"); + const callLine = lines.find((l) => l.includes("new_call")); + if (!callLine) return { passed: false, reason: "'new_call' not found" }; + const leadingSpaces = callLine.match(/^ */)?.[0].length ?? 
0; + if (leadingSpaces !== 6) return { passed: false, reason: `expected 6-space indent, got ${leadingSpaces}` }; + return { passed: true, reason: "deeply nested line replaced with indent preserved" }; + }, + }, +]; + +// ── JSONL event types ───────────────────────────────────────── +interface ToolCallEvent { + tool_call_id: string; + tool_input: Record; + tool_name: string; + type: "tool_call"; +} + +interface ToolResultEvent { + error?: string; + output: string; + tool_call_id: string; + type: "tool_result"; +} + +interface AnyEvent { + type: string; + [key: string]: unknown; +} + +// ── Run single test case ───────────────────────────────────── +async function runTestCase( + tc: TestCase, + testDir: string +): Promise<{ + passed: boolean; + editCalls: number; + editSuccesses: number; + duration: number; +}> { + const testFile = join(testDir, tc.fileName); + writeFileSync(testFile, tc.fileContent, "utf-8"); + + const headlessScript = resolve(import.meta.dir, "headless.ts"); + const headlessArgs = [ + "run", + headlessScript, + "-p", + tc.prompt, + "--no-translate", + ...extraArgs, + ]; + + const startTime = Date.now(); + + const output = await new Promise((res, reject) => { + const proc = spawn("bun", headlessArgs, { + cwd: testDir, + env: { ...process.env, BUN_INSTALL: process.env.BUN_INSTALL }, + stdio: ["ignore", "pipe", "pipe"], + }); + + let stdout = ""; + let stderr = ""; + + proc.stdout.on("data", (chunk: Buffer) => { + stdout += chunk.toString(); + }); + proc.stderr.on("data", (chunk: Buffer) => { + stderr += chunk.toString(); + }); + + const timeout = setTimeout( + () => { + proc.kill("SIGTERM"); + reject(new Error("Timed out after 4 minutes")); + }, + 4 * 60 * 1000 + ); + + proc.on("close", (code) => { + clearTimeout(timeout); + if (code !== 0) { + reject(new Error(`Exit code ${code}\n${stderr.slice(-500)}`)); + } else { + res(stdout); + } + }); + proc.on("error", (err) => { + clearTimeout(timeout); + reject(err); + }); + }); + + const duration = 
Date.now() - startTime; + + // Parse events + const events: AnyEvent[] = []; + for (const line of output.split("\n").filter((l) => l.trim())) { + try { + events.push(JSON.parse(line) as AnyEvent); + } catch { + // skip non-JSON + } + } + + const toolCalls = events.filter( + (e) => e.type === "tool_call" + ) as unknown as ToolCallEvent[]; + const toolResults = events.filter( + (e) => e.type === "tool_result" + ) as unknown as ToolResultEvent[]; + + const editCalls = toolCalls.filter((e) => e.tool_name === "edit_file"); + const editCallIds = new Set(editCalls.map((e) => e.tool_call_id)); + const editResults = toolResults.filter((e) => + editCallIds.has(e.tool_call_id) + ); + const editSuccesses = editResults.filter((e) => !e.error); + + // Show blocked calls + const editErrors = editResults.filter((e) => e.error); + for (const err of editErrors) { + const matchingCall = editCalls.find( + (c) => c.tool_call_id === err.tool_call_id + ); + info(` blocked: ${err.error?.slice(0, 120)}`); + if (matchingCall) { + info(` input: ${JSON.stringify(matchingCall.tool_input).slice(0, 200)}`); + } + } + + // Validate file content + let finalContent: string; + try { + finalContent = readFileSync(testFile, "utf-8"); + } catch { + return { + passed: false, + editCalls: editCalls.length, + editSuccesses: editSuccesses.length, + duration, + }; + } + + const validation = tc.validate(finalContent); + + return { + passed: validation.passed, + editCalls: editCalls.length, + editSuccesses: editSuccesses.length, + duration, + }; +} + +// ── Main ────────────────────────────────────────────────────── +const main = async () => { + console.log(`\n${BOLD}Headless Edit Operations Test — ${TEST_CASES.length} Types${RESET}\n`); + + const testDir = join(tmpdir(), `edit-ops-${Date.now()}`); + mkdirSync(testDir, { recursive: true }); + info(`Test dir: ${testDir}`); + console.log(); + + let totalPassed = 0; + const results: { name: string; passed: boolean; detail: string }[] = []; + + for (const tc of 
TEST_CASES) { + console.log(`${CYAN}${BOLD}${tc.name}${RESET}`); + info(`File: ${tc.fileName}`); + info(`Prompt: "${tc.prompt.slice(0, 80)}..."`); + + try { + const result = await runTestCase(tc, testDir); + const status = result.passed + ? `${GREEN}PASS${RESET}` + : `${RED}FAIL${RESET}`; + const detail = `edit_file: ${result.editSuccesses}/${result.editCalls} succeeded, ${(result.duration / 1000).toFixed(1)}s`; + + console.log(` ${status} — ${detail}`); + + if (result.passed) { + totalPassed++; + // Validate the file to show reason + const content = readFileSync(join(testDir, tc.fileName), "utf-8"); + const v = tc.validate(content); + pass(v.reason); + } else { + const content = readFileSync(join(testDir, tc.fileName), "utf-8"); + const v = tc.validate(content); + fail(v.reason); + info( + `Final content:\n${content + .split("\n") + .map((l, i) => ` ${i + 1}: ${l}`) + .join("\n")}` + ); + } + + results.push({ name: tc.name, passed: result.passed, detail }); + } catch (error) { + const msg = error instanceof Error ? error.message : String(error); + console.log(` ${RED}ERROR${RESET} — ${msg.slice(0, 200)}`); + fail(msg.slice(0, 200)); + results.push({ name: tc.name, passed: false, detail: msg.slice(0, 100) }); + } + + // Reset file for next test (in case of side effects) + try { + rmSync(join(testDir, tc.fileName), { force: true }); + } catch {} + + console.log(); + } + + // Summary + console.log(`${BOLD}━━━ Summary ━━━${RESET}`); + for (const r of results) { + const icon = r.passed ? 
`${GREEN}✓${RESET}` : `${RED}✗${RESET}`; + console.log(` ${icon} ${r.name} — ${r.detail}`); + } + console.log(); + console.log( + `${BOLD}Result: ${totalPassed}/${TEST_CASES.length} passed (${Math.round((totalPassed / TEST_CASES.length) * 100)}%)${RESET}` + ); + + // Cleanup + try { + rmSync(testDir, { recursive: true, force: true }); + } catch {} + + if (totalPassed === TEST_CASES.length) { + console.log( + `\n${BOLD}${GREEN}🎉 ALL TESTS PASSED — 100% success rate!${RESET}\n` + ); + process.exit(0); + } else { + console.log(`\n${BOLD}${RED}Some tests failed.${RESET}\n`); + process.exit(1); + } +}; + +main(); diff --git a/benchmarks/test-multi-model.ts b/benchmarks/test-multi-model.ts new file mode 100644 index 00000000..1781d4eb --- /dev/null +++ b/benchmarks/test-multi-model.ts @@ -0,0 +1,269 @@ +#!/usr/bin/env bun +/** + * Multi-model edit_file test runner + * + * Runs test-headless-edit-ops.ts against every available model + * and produces a summary table. + * + * Usage: + * bun run scripts/test-multi-model-edit.ts [--timeout ] + */ + +import { spawn } from "node:child_process"; +import { resolve } from "node:path"; + +// ── Models ──────────────────────────────────────────────────── +const MODELS = [ + { id: "MiniMaxAI/MiniMax-M2.5", short: "M2.5" }, + // { id: "MiniMaxAI/MiniMax-M2.1", short: "M2.1" }, // masked: slow + timeout-prone + // { id: "zai-org/GLM-5", short: "GLM-5" }, // masked: API 503 + { id: "zai-org/GLM-4.7", short: "GLM-4.7" }, +]; + +// ── CLI args ────────────────────────────────────────────────── +let perModelTimeoutSec = 900; // 15 min default per model (5 tests) +const rawArgs = process.argv.slice(2); +for (let i = 0; i < rawArgs.length; i++) { + if (rawArgs[i] === "--timeout" && i + 1 < rawArgs.length) { + perModelTimeoutSec = Number.parseInt(rawArgs[i + 1], 10); + i++; + } +} + +// ── Colors ──────────────────────────────────────────────────── +const BOLD = "\x1b[1m"; +const GREEN = "\x1b[32m"; +const RED = "\x1b[31m"; +const YELLOW = 
"\x1b[33m"; +const DIM = "\x1b[2m"; +const CYAN = "\x1b[36m"; +const RESET = "\x1b[0m"; + +// ── Types ───────────────────────────────────────────────────── +interface TestResult { + detail: string; + name: string; + passed: boolean; +} + +interface ModelResult { + durationMs: number; + error?: string; + modelId: string; + modelShort: string; + tests: TestResult[]; + totalPassed: number; + totalTests: number; +} + +// ── Parse test-headless-edit-ops stdout ─────────────────────── +function parseOpsOutput(stdout: string): TestResult[] { + const results: TestResult[] = []; + + // Match lines like: " PASS — edit_file: 1/1 succeeded, 32.5s" + // or " FAIL — edit_file: 0/3 succeeded, 15.2s" + // or " ERROR — Timed out after 10 minutes" + // Following a line like: "1. Replace single line" + const lines = stdout.split("\n"); + + let currentTestName = ""; + for (const line of lines) { + // Detect test name: starts with ANSI-colored bold cyan + "N. Name" + // Strip ANSI codes for matching + const stripped = line.replace(/\x1b\[[0-9;]*m/g, ""); + + // Test name pattern: "N. 
" + const testNameMatch = stripped.match(/^\s*(\d+\.\s+.+)$/); + if ( + testNameMatch && + !stripped.includes("—") && + !stripped.includes("✓") && + !stripped.includes("✗") + ) { + currentTestName = testNameMatch[1].trim(); + continue; + } + + // Result line: PASS/FAIL/ERROR + if (currentTestName && stripped.includes("PASS")) { + const detail = stripped.replace(/^\s*PASS\s*—?\s*/, "").trim(); + results.push({ + name: currentTestName, + passed: true, + detail: detail || "passed", + }); + currentTestName = ""; + } else if (currentTestName && stripped.includes("FAIL")) { + const detail = stripped.replace(/^\s*FAIL\s*—?\s*/, "").trim(); + results.push({ + name: currentTestName, + passed: false, + detail: detail || "failed", + }); + currentTestName = ""; + } else if (currentTestName && stripped.includes("ERROR")) { + const detail = stripped.replace(/^\s*ERROR\s*—?\s*/, "").trim(); + results.push({ + name: currentTestName, + passed: false, + detail: detail || "error", + }); + currentTestName = ""; + } + } + + return results; +} + +// ── Run one model ──────────────────────────────────────────── +async function runModel(model: { + id: string; + short: string; +}): Promise { + const opsScript = resolve(import.meta.dir, "test-edit-ops.ts"); + const startTime = Date.now(); + + return new Promise((resolvePromise) => { + const proc = spawn( + "bun", + ["run", opsScript, "-m", model.id, "--no-translate"], + { + cwd: resolve(import.meta.dir), + env: { ...process.env, BUN_INSTALL: process.env.BUN_INSTALL }, + stdio: ["ignore", "pipe", "pipe"], + } + ); + + let stdout = ""; + let stderr = ""; + + proc.stdout.on("data", (chunk: Buffer) => { + stdout += chunk.toString(); + }); + proc.stderr.on("data", (chunk: Buffer) => { + stderr += chunk.toString(); + }); + + const timeout = setTimeout(() => { + proc.kill("SIGTERM"); + resolvePromise({ + modelId: model.id, + modelShort: model.short, + tests: [], + totalPassed: 0, + totalTests: 0, + durationMs: Date.now() - startTime, + error: 
`Timed out after ${perModelTimeoutSec}s`, + }); + }, perModelTimeoutSec * 1000); + + proc.on("close", () => { + clearTimeout(timeout); + const tests = parseOpsOutput(stdout); + const totalPassed = tests.filter((t) => t.passed).length; + + resolvePromise({ + modelId: model.id, + modelShort: model.short, + tests, + totalPassed, + totalTests: Math.max(tests.length, 5), + durationMs: Date.now() - startTime, + }); + }); + + proc.on("error", (err) => { + clearTimeout(timeout); + resolvePromise({ + modelId: model.id, + modelShort: model.short, + tests: [], + totalPassed: 0, + totalTests: 0, + durationMs: Date.now() - startTime, + error: err.message, + }); + }); + }); +} + +// ── Main ────────────────────────────────────────────────────── +const main = async () => { + console.log(`\n${BOLD}═══ Multi-Model edit_file Test Runner ═══${RESET}\n`); + console.log(`${DIM}Models: ${MODELS.map((m) => m.short).join(", ")}${RESET}`); + console.log(`${DIM}Timeout: ${perModelTimeoutSec}s per model${RESET}`); + console.log(); + + const allResults: ModelResult[] = []; + + for (const model of MODELS) { + console.log(`${CYAN}${BOLD}▶ Testing ${model.short} (${model.id})${RESET}`); + const result = await runModel(model); + allResults.push(result); + + const timeStr = `${(result.durationMs / 1000).toFixed(1)}s`; + if (result.error) { + console.log(` ${RED}ERROR${RESET}: ${result.error} (${timeStr})`); + } else { + const color = + result.totalPassed === result.totalTests + ? GREEN + : result.totalPassed > 0 + ? YELLOW + : RED; + console.log( + ` ${color}${result.totalPassed}/${result.totalTests} passed${RESET} (${timeStr})` + ); + for (const t of result.tests) { + const icon = t.passed ? 
`${GREEN}✓${RESET}` : `${RED}✗${RESET}`; + console.log(` ${icon} ${t.name}`); + } + } + console.log(); + } + + // ── Summary Table ────────────────────────────────────────── + console.log(`${BOLD}═══ Summary ═══${RESET}\n`); + + // Per-model results + for (const r of allResults) { + const timeStr = `${(r.durationMs / 1000).toFixed(0)}s`; + const color = r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? YELLOW : RED; + console.log(` ${r.modelShort.padEnd(8)} ${color}${r.totalPassed}/${r.totalTests}${RESET} (${timeStr})`); + for (const t of r.tests) { + const icon = t.passed ? `${GREEN}✓${RESET}` : `${RED}✗${RESET}`; + console.log(` ${icon} ${t.name}`); + } + } + + console.log(); + + // Overall + const totalModels = allResults.length; + const perfectModels = allResults.filter( + (r) => r.totalPassed === r.totalTests + ).length; + console.log( + `${BOLD}Models with 100%: ${perfectModels}/${totalModels}${RESET}` + ); + + const overallPassed = allResults.reduce((sum, r) => sum + r.totalPassed, 0); + const overallTotal = allResults.reduce((sum, r) => sum + r.totalTests, 0); + console.log( + `${BOLD}Overall: ${overallPassed}/${overallTotal} (${Math.round((overallPassed / overallTotal) * 100)}%)${RESET}` + ); + + console.log(); + + if (perfectModels === totalModels) { + console.log(`${BOLD}${GREEN}🎉 ALL MODELS PASSED ALL TESTS!${RESET}\n`); + process.exit(0); + } else { + console.log( + `${BOLD}${YELLOW}Some models have failures. 
See details above.${RESET}\n` + ); + process.exit(1); + } +}; + +main(); From 8fb5949ac653057602892a4bf6745627d3e6b524 Mon Sep 17 00:00:00 2001 From: minpeter Date: Fri, 27 Feb 2026 01:44:51 +0900 Subject: [PATCH 4/4] fix(benchmarks): address review feedback on error handling and validation - headless.ts: emit error field on tool_result when output starts with Error: - test-multi-model.ts: errored/timed-out models now shown as RED and exit(1) - test-multi-model.ts: validate --timeout arg (reject NaN/negative) - test-edge-cases.ts: use exact match instead of trim() for whitespace test - test-edge-cases.ts: skip file pre-creation for create-via-append test Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode) Co-authored-by: Sisyphus --- benchmarks/headless.ts | 7 +++++-- benchmarks/test-edge-cases.ts | 8 ++++++-- benchmarks/test-multi-model.ts | 23 +++++++++++++++++------ 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/benchmarks/headless.ts b/benchmarks/headless.ts index bb2af701..ae18853a 100644 --- a/benchmarks/headless.ts +++ b/benchmarks/headless.ts @@ -151,11 +151,14 @@ async function run() { model: modelId, }) break - case "tool-result": + case "tool-result": { + const output = typeof part.result === "string" ? part.result : JSON.stringify(part.result) + const isError = typeof output === "string" && output.startsWith("Error:") emit({ type: "tool_result", tool_call_id: part.toolCallId, - output: typeof part.result === "string" ? part.result : JSON.stringify(part.result), + output, + ...(isError ? 
{ error: output } : {}), }) break } diff --git a/benchmarks/test-edge-cases.ts b/benchmarks/test-edge-cases.ts index a1916c56..b00b0302 100644 --- a/benchmarks/test-edge-cases.ts +++ b/benchmarks/test-edge-cases.ts @@ -53,6 +53,7 @@ interface TestCase { fileName: string; name: string; prompt: string; + skipFileCreate?: boolean; validate: (content: string) => { passed: boolean; reason: string }; } @@ -288,6 +289,7 @@ const TEST_CASES: TestCase[] = [ name: "7. Create new file via append", fileName: "create-via-append.txt", fileContent: "", + skipFileCreate: true, prompt: [ "Create create-via-append.txt via edit_file append (do not call read_file first).", "Use one call with edits: [{ op: 'append', lines: ['created line 1', 'created line 2'] }].", @@ -597,7 +599,7 @@ const TEST_CASES: TestCase[] = [ reason: "non-target lines changed unexpectedly", }; } - if (lines[1].trim() !== "middle-content") { + if (lines[1] !== "middle-content") { return { passed: false, reason: `line 2 expected 'middle-content' but got ${JSON.stringify(lines[1])}`, @@ -907,7 +909,9 @@ async function runTestCase( duration: number; }> { const testFile = join(testDir, tc.fileName); - writeFileSync(testFile, tc.fileContent, "utf-8"); + if (!tc.skipFileCreate) { + writeFileSync(testFile, tc.fileContent, "utf-8"); + } const headlessScript = resolve(import.meta.dir, "headless.ts"); const headlessArgs = [ diff --git a/benchmarks/test-multi-model.ts b/benchmarks/test-multi-model.ts index 1781d4eb..29ee4bb9 100644 --- a/benchmarks/test-multi-model.ts +++ b/benchmarks/test-multi-model.ts @@ -25,9 +25,13 @@ let perModelTimeoutSec = 900; // 15 min default per model (5 tests) const rawArgs = process.argv.slice(2); for (let i = 0; i < rawArgs.length; i++) { if (rawArgs[i] === "--timeout" && i + 1 < rawArgs.length) { - perModelTimeoutSec = Number.parseInt(rawArgs[i + 1], 10); + const parsed = Number.parseInt(rawArgs[i + 1], 10); + if (Number.isNaN(parsed) || parsed <= 0) { + console.error(`Invalid --timeout 
value: ${rawArgs[i + 1]}`); + process.exit(1); + } + perModelTimeoutSec = parsed; i++; - } } // ── Colors ──────────────────────────────────────────────────── @@ -228,8 +232,9 @@ const main = async () => { // Per-model results for (const r of allResults) { const timeStr = `${(r.durationMs / 1000).toFixed(0)}s`; - const color = r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? YELLOW : RED; - console.log(` ${r.modelShort.padEnd(8)} ${color}${r.totalPassed}/${r.totalTests}${RESET} (${timeStr})`); + const color = r.error ? RED : r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? YELLOW : RED; + const label = r.error ? `ERROR: ${r.error}` : `${r.totalPassed}/${r.totalTests}`; + console.log(` ${r.modelShort.padEnd(8)} ${color}${label}${RESET} (${timeStr})`); for (const t of r.tests) { const icon = t.passed ? `${GREEN}✓${RESET}` : `${RED}✗${RESET}`; console.log(` ${icon} ${t.name}`); @@ -240,8 +245,9 @@ const main = async () => { // Overall const totalModels = allResults.length; + const erroredModels = allResults.filter((r) => r.error).length; const perfectModels = allResults.filter( - (r) => r.totalPassed === r.totalTests + (r) => !r.error && r.totalPassed === r.totalTests && r.totalTests > 0 ).length; console.log( `${BOLD}Models with 100%: ${perfectModels}/${totalModels}${RESET}` @@ -255,7 +261,12 @@ const main = async () => { console.log(); - if (perfectModels === totalModels) { + if (erroredModels > 0) { + console.log( + `${BOLD}${RED}${erroredModels} model(s) errored. See details above.${RESET}\n` + ); + process.exit(1); + } else if (perfectModels === totalModels) { console.log(`${BOLD}${GREEN}🎉 ALL MODELS PASSED ALL TESTS!${RESET}\n`); process.exit(0); } else {