From b1203b95013675409757a81ac01541f1ba4994d0 Mon Sep 17 00:00:00 2001
From: minpeter <minpeter@friendli.ai>
Date: Thu, 26 Feb 2026 17:43:49 +0900
Subject: [PATCH 1/4] Fix hashline-edit deduplication and validation

- Canonicalize anchors in dedupe keys to handle whitespace variants
- Make lines field required in edit operations
- Only allow unanchored append/prepend to create missing files
- Reorder delete/rename validation to prevent edge cases
- Add allow_non_gpt_model and max_prompt_tokens to config schema
---
 src/tools/hashline-edit/edit-deduplication.ts | 12 ++-
 .../hashline-edit/edit-operations.test.ts     | 22 +++++-
 .../hashline-edit/hashline-edit-executor.ts   | 16 ++--
 src/tools/hashline-edit/tool-description.ts   |  2 +-
 src/tools/hashline-edit/tools.test.ts         | 77 +++++++++++++++++++
 src/tools/hashline-edit/tools.ts              |  1 -
 src/tools/hashline-edit/validation.ts         |  2 +-
 7 files changed, 117 insertions(+), 15 deletions(-)

diff --git a/src/tools/hashline-edit/edit-deduplication.ts b/src/tools/hashline-edit/edit-deduplication.ts
index e689bb53..8818b61a 100644
--- a/src/tools/hashline-edit/edit-deduplication.ts
+++ b/src/tools/hashline-edit/edit-deduplication.ts
@@ -1,18 +1,24 @@
 import type { HashlineEdit } from "./types"
 import { toNewLines } from "./edit-text-normalization"
+import { normalizeLineRef } from "./validation"
 
 function normalizeEditPayload(payload: string | string[]): string {
   return toNewLines(payload).join("\n")
 }
 
+/** Canonical form of an optional anchor for dedupe keys; a missing or empty anchor maps to "". */
+function canonicalAnchor(anchor: string | undefined): string {
+  return anchor ? normalizeLineRef(anchor) : ""
+}
+
 function buildDedupeKey(edit: HashlineEdit): string {
   switch (edit.op) {
     case "replace":
-      return `replace|${edit.pos}|${edit.end ?? ""}|${normalizeEditPayload(edit.lines)}`
+      return `replace|${canonicalAnchor(edit.pos)}|${edit.end ? canonicalAnchor(edit.end) : ""}|${normalizeEditPayload(edit.lines)}`
     case "append":
-      return `append|${edit.pos ?? ""}|${normalizeEditPayload(edit.lines)}`
+      return `append|${canonicalAnchor(edit.pos)}|${normalizeEditPayload(edit.lines)}`
     case "prepend":
-      return `prepend|${edit.pos ?? ""}|${normalizeEditPayload(edit.lines)}`
+      return `prepend|${canonicalAnchor(edit.pos)}|${normalizeEditPayload(edit.lines)}`
     default:
       return JSON.stringify(edit)
   }
diff --git a/src/tools/hashline-edit/edit-operations.test.ts b/src/tools/hashline-edit/edit-operations.test.ts
index 5d8ad08b..40585210 100644
--- a/src/tools/hashline-edit/edit-operations.test.ts
+++ b/src/tools/hashline-edit/edit-operations.test.ts
@@ -1,5 +1,5 @@
 import { describe, expect, it } from "bun:test"
-import { applyHashlineEdits } from "./edit-operations"
+import { applyHashlineEdits, applyHashlineEditsWithReport } from "./edit-operations"
 import { applyAppend, applyInsertAfter, applyPrepend, applyReplaceLines, applySetLine } from "./edit-operation-primitives"
 import { computeLineHash } from "./hash-computation"
 import type { HashlineEdit } from "./types"
@@ -389,3 +389,23 @@ describe("hashline edit operations", () => {
     expect(result).toEqual("replaced A\nline 3\nreplaced B")
   })
 })
+
+describe("dedupe anchor canonicalization", () => {
+  it("deduplicates edits with whitespace-variant anchors", () => {
+    //#given
+    const content = "line 1\nline 2"
+    const lines = content.split("\n")
+    const canonical = `1#${computeLineHash(1, lines[0])}`
+    const spaced = ` 1 # ${computeLineHash(1, lines[0])} `
+
+    //#when
+    const report = applyHashlineEditsWithReport(content, [
+      { op: "append", pos: canonical, lines: ["inserted"] },
+      { op: "append", pos: spaced, lines: ["inserted"] },
+    ])
+
+    //#then
+    expect(report.deduplicatedEdits).toBe(1)
+    expect(report.content).toBe("line 1\ninserted\nline 2")
+  })
+})
diff --git a/src/tools/hashline-edit/hashline-edit-executor.ts b/src/tools/hashline-edit/hashline-edit-executor.ts
index e20ebbf9..d316307d 100644
--- a/src/tools/hashline-edit/hashline-edit-executor.ts
+++ b/src/tools/hashline-edit/hashline-edit-executor.ts
@@ -33,7 +33,7 @@ function resolveToolCallID(ctx: ToolContextWithCallID): string | undefined {
 
 function canCreateFromMissingFile(edits: HashlineEdit[]): boolean {
   if (edits.length === 0) return false
-  return edits.every((edit) => edit.op === "append" || edit.op === "prepend")
+  return edits.every((edit) => (edit.op === "append" || edit.op === "prepend") && !edit.pos)
 }
 
 function buildSuccessMeta(
@@ -86,19 +86,19 @@ export async function executeHashlineEditTool(args: HashlineEditArgs, context: T
     const filePath = args.filePath
     const { delete: deleteMode, rename } = args
 
+    if (deleteMode && rename) {
+      return "Error: delete and rename cannot be used together"
+    }
+    if (deleteMode && args.edits.length > 0) {
+      return "Error: delete mode requires edits to be an empty array"
+    }
+
     if (!deleteMode && (!args.edits || !Array.isArray(args.edits) || args.edits.length === 0)) {
       return "Error: edits parameter must be a non-empty array"
     }
 
     const edits = deleteMode ? [] : normalizeHashlineEdits(args.edits)
 
-    if (deleteMode && rename) {
-      return "Error: delete and rename cannot be used together"
-    }
-    if (deleteMode && edits.length > 0) {
-      return "Error: delete mode requires edits to be an empty array"
-    }
-
     const file = Bun.file(filePath)
     const exists = await file.exists()
     if (!exists && !deleteMode && !canCreateFromMissingFile(edits)) {
diff --git a/src/tools/hashline-edit/tool-description.ts b/src/tools/hashline-edit/tool-description.ts
index 0b0ee00f..2d452ccf 100644
--- a/src/tools/hashline-edit/tool-description.ts
+++ b/src/tools/hashline-edit/tool-description.ts
@@ -10,7 +10,7 @@ WORKFLOW:
 VALIDATION:
   Payload shape: { "filePath": string, "edits": [...], "delete"?: boolean, "rename"?: string }
   Each edit must be one of: replace, append, prepend
-  Edit shape: { "op": "replace"|"append"|"prepend", "pos"?: "LINE#ID", "end"?: "LINE#ID", "lines"?: string|string[]|null }
+  Edit shape: { "op": "replace"|"append"|"prepend", "pos"?: "LINE#ID", "end"?: "LINE#ID", "lines": string|string[]|null }
   lines must contain plain replacement text only (no LINE#ID prefixes, no diff + markers)
   CRITICAL: all operations validate against the same pre-edit file snapshot and apply bottom-up. Refs/tags are interpreted against the last-read version of the file.
 
diff --git a/src/tools/hashline-edit/tools.test.ts b/src/tools/hashline-edit/tools.test.ts
index cb76b834..1158ca3d 100644
--- a/src/tools/hashline-edit/tools.test.ts
+++ b/src/tools/hashline-edit/tools.test.ts
@@ -341,4 +341,81 @@ describe("createHashlineEditTool", () => {
     //#then
     expect(envelope.lineEnding).toBe("\r\n")
   })
+
+  it("rejects delete=true with non-empty edits before normalization", async () => {
+    //#given
+    const filePath = path.join(tempDir, "delete-reject.txt")
+    fs.writeFileSync(filePath, "line1")
+
+    //#when
+    const result = await tool.execute(
+      {
+        filePath,
+        delete: true,
+        edits: [{ op: "replace", pos: "1#ZZ", lines: "bad" }],
+      },
+      createMockContext(),
+    )
+
+    //#then
+    expect(result).toContain("delete mode requires edits to be an empty array")
+    expect(fs.existsSync(filePath)).toBe(true)
+  })
+
+  it("rejects delete=true combined with rename", async () => {
+    //#given
+    const filePath = path.join(tempDir, "delete-rename.txt")
+    fs.writeFileSync(filePath, "line1")
+
+    //#when
+    const result = await tool.execute(
+      {
+        filePath,
+        delete: true,
+        rename: path.join(tempDir, "new-name.txt"),
+        edits: [],
+      },
+      createMockContext(),
+    )
+
+    //#then
+    expect(result).toContain("delete and rename cannot be used together")
+    expect(fs.existsSync(filePath)).toBe(true)
+  })
+
+  it("rejects missing file creation with anchored append", async () => {
+    //#given
+    const filePath = path.join(tempDir, "nonexistent.txt")
+
+    //#when
+    const result = await tool.execute(
+      {
+        filePath,
+        edits: [{ op: "append", pos: "1#ZZ", lines: ["bad"] }],
+      },
+      createMockContext(),
+    )
+
+    //#then
+    expect(result).toContain("File not found")
+  })
+
+  it("allows missing file creation with unanchored append", async () => {
+    //#given
+    const filePath = path.join(tempDir, "newfile.txt")
+
+    //#when
+    const result = await tool.execute(
+      {
+        filePath,
+        edits: [{ op: "append", lines: ["created"] }],
+      },
+      createMockContext(),
+    )
+
+    //#then
+    expect(fs.existsSync(filePath)).toBe(true)
+    expect(fs.readFileSync(filePath, "utf-8")).toBe("created")
+    expect(result).toBe(`Updated ${filePath}`)
+  })
 })
diff --git a/src/tools/hashline-edit/tools.ts b/src/tools/hashline-edit/tools.ts
index 13265029..bd2bf1f9 100644
--- a/src/tools/hashline-edit/tools.ts
+++ b/src/tools/hashline-edit/tools.ts
@@ -31,7 +31,6 @@ export function createHashlineEditTool(): ToolDefinition {
             end: tool.schema.string().optional().describe("Range end anchor in LINE#ID format"),
             lines: tool.schema
               .union([tool.schema.string(), tool.schema.array(tool.schema.string()), tool.schema.null()])
-              .optional()
               .describe("Replacement or inserted lines. null/[] deletes with replace"),
           })
         )
diff --git a/src/tools/hashline-edit/validation.ts b/src/tools/hashline-edit/validation.ts
index fc5b395a..ed606155 100644
--- a/src/tools/hashline-edit/validation.ts
+++ b/src/tools/hashline-edit/validation.ts
@@ -15,7 +15,7 @@ const MISMATCH_CONTEXT = 2
 
 const LINE_REF_EXTRACT_PATTERN = /([0-9]+#[ZPMQVRWSNKTXJBYH]{2})/
 
-function normalizeLineRef(ref: string): string {
+export function normalizeLineRef(ref: string): string {
   const originalTrimmed = ref.trim()
   let trimmed = originalTrimmed
   trimmed = trimmed.replace(/^(?:>>>|[+-])\s*/, "")

From d1a0a66dde227b64931df17e25ceb2939628433d Mon Sep 17 00:00:00 2001
From: minpeter <minpeter@friendli.ai>
Date: Fri, 27 Feb 2026 01:37:40 +0900
Subject: [PATCH 2/4] feat(benchmarks): add hashline-edit benchmark agent and
 deps

Standalone headless agent using Vercel AI SDK v6 with FriendliAI provider.
Imports hashline-edit pure functions directly from src/ for benchmarking
the edit tool against LLMs (Minimax M2.5 via FriendliAI).

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
---
 benchmarks/bun.lock     |  62 +++++++++++++
 benchmarks/headless.ts  | 190 ++++++++++++++++++++++++++++++++++++++++
 benchmarks/package.json |  19 ++++
 3 files changed, 271 insertions(+)
 create mode 100644 benchmarks/bun.lock
 create mode 100644 benchmarks/headless.ts
 create mode 100644 benchmarks/package.json

diff --git a/benchmarks/bun.lock b/benchmarks/bun.lock
new file mode 100644
index 00000000..3a31bf1c
--- /dev/null
+++ b/benchmarks/bun.lock
@@ -0,0 +1,62 @@
+{
+  "lockfileVersion": 1,
+  "configVersion": 1,
+  "workspaces": {
+    "": {
+      "name": "hashline-edit-benchmark",
+      "dependencies": {
+        "@ai-sdk/openai": "^1.3.0",
+        "@friendliai/ai-provider": "^1.0.9",
+        "ai": "^6.0.94",
+        "zod": "^4.1.0",
+      },
+    },
+  },
+  "packages": {
+    "@ai-sdk/gateway": ["@ai-sdk/gateway@3.0.55", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.15", "@vercel/oidc": "3.1.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-7xMeTJnCjwRwXKVCiv4Ly4qzWvDuW3+W1WIV0X1EFu6W83d4mEhV9bFArto10MeTw40ewuDjrbrZd21mXKohkw=="],
+
+    "@ai-sdk/openai": ["@ai-sdk/openai@1.3.24", "", { "dependencies": { "@ai-sdk/provider": "1.1.3", "@ai-sdk/provider-utils": "2.2.8" }, "peerDependencies": { "zod": "^3.0.0" } }, "sha512-GYXnGJTHRTZc4gJMSmFRgEQudjqd4PUN0ZjQhPwOAYH1yOAvQoG/Ikqs+HyISRbLPCrhbZnPKCNHuRU4OfpW0Q=="],
+
+    "@ai-sdk/openai-compatible": ["@ai-sdk/openai-compatible@2.0.30", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.15" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-iTjumHf1/u4NhjXYFn/aONM2GId3/o7J1Lp5ql8FCbgIMyRwrmanR5xy1S3aaVkfTscuDvLTzWiy1mAbGzK3nQ=="],
+
+    "@ai-sdk/provider": ["@ai-sdk/provider@1.1.3", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-qZMxYJ0qqX/RfnuIaab+zp8UAeJn/ygXXAffR5I4N0n1IrvA6qBsjc8hXLmBiMV2zoXlifkacF7sEFnYnjBcqg=="],
+
+    "@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@2.2.8", "", { "dependencies": { "@ai-sdk/provider": "1.1.3", "nanoid": "^3.3.8", "secure-json-parse": "^2.7.0" }, "peerDependencies": { "zod": "^3.23.8" } }, "sha512-fqhG+4sCVv8x7nFzYnFo19ryhAa3w096Kmc3hWxMQfW/TubPOmt3A6tYZhl4mUfQWWQMsuSkLrtjlWuXBVSGQA=="],
+
+    "@friendliai/ai-provider": ["@friendliai/ai-provider@1.1.4", "", { "dependencies": { "@ai-sdk/openai-compatible": "2.0.30", "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.15" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.12" } }, "sha512-9TU4B1QFqPhbkONjI5afCF7Ox4jOqtGg1xw8mA9QHZdtlEbZxU+mBNvMPlI5pU5kPoN6s7wkXmFmxpID+own1A=="],
+
+    "@opentelemetry/api": ["@opentelemetry/api@1.9.0", "", {}, "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg=="],
+
+    "@standard-schema/spec": ["@standard-schema/spec@1.1.0", "", {}, "sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w=="],
+
+    "@vercel/oidc": ["@vercel/oidc@3.1.0", "", {}, "sha512-Fw28YZpRnA3cAHHDlkt7xQHiJ0fcL+NRcIqsocZQUSmbzeIKRpwttJjik5ZGanXP+vlA4SbTg+AbA3bP363l+w=="],
+
+    "ai": ["ai@6.0.101", "", { "dependencies": { "@ai-sdk/gateway": "3.0.55", "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.15", "@opentelemetry/api": "1.9.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-Ur/NgbgOp1rdhyDiKDk6EOpSgd1g5ADlbcD1cjQJtQsnmhEngz3Rf8nK5JetDh0vnbLy2aEBpaQeL+zvLRWuaA=="],
+
+    "eventsource-parser": ["eventsource-parser@3.0.6", "", {}, "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg=="],
+
+    "json-schema": ["json-schema@0.4.0", "", {}, "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA=="],
+
+    "nanoid": ["nanoid@3.3.11", "", { "bin": { "nanoid": "bin/nanoid.cjs" } }, "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w=="],
+
+    "secure-json-parse": ["secure-json-parse@2.7.0", "", {}, "sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw=="],
+
+    "zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="],
+
+    "@ai-sdk/gateway/@ai-sdk/provider": ["@ai-sdk/provider@3.0.8", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-oGMAgGoQdBXbZqNG0Ze56CHjDZ1IDYOwGYxYjO5KLSlz5HiNQ9udIXsPZ61VWaHGZ5XW/jyjmr6t2xz2jGVwbQ=="],
+
+    "@ai-sdk/gateway/@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@4.0.15", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@standard-schema/spec": "^1.1.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-8XiKWbemmCbvNN0CLR9u3PQiet4gtEVIrX4zzLxnCj06AwsEDJwJVBbKrEI4t6qE8XRSIvU2irka0dcpziKW6w=="],
+
+    "@ai-sdk/openai-compatible/@ai-sdk/provider": ["@ai-sdk/provider@3.0.8", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-oGMAgGoQdBXbZqNG0Ze56CHjDZ1IDYOwGYxYjO5KLSlz5HiNQ9udIXsPZ61VWaHGZ5XW/jyjmr6t2xz2jGVwbQ=="],
+
+    "@ai-sdk/openai-compatible/@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@4.0.15", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@standard-schema/spec": "^1.1.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-8XiKWbemmCbvNN0CLR9u3PQiet4gtEVIrX4zzLxnCj06AwsEDJwJVBbKrEI4t6qE8XRSIvU2irka0dcpziKW6w=="],
+
+    "@friendliai/ai-provider/@ai-sdk/provider": ["@ai-sdk/provider@3.0.8", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-oGMAgGoQdBXbZqNG0Ze56CHjDZ1IDYOwGYxYjO5KLSlz5HiNQ9udIXsPZ61VWaHGZ5XW/jyjmr6t2xz2jGVwbQ=="],
+
+    "@friendliai/ai-provider/@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@4.0.15", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@standard-schema/spec": "^1.1.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-8XiKWbemmCbvNN0CLR9u3PQiet4gtEVIrX4zzLxnCj06AwsEDJwJVBbKrEI4t6qE8XRSIvU2irka0dcpziKW6w=="],
+
+    "ai/@ai-sdk/provider": ["@ai-sdk/provider@3.0.8", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-oGMAgGoQdBXbZqNG0Ze56CHjDZ1IDYOwGYxYjO5KLSlz5HiNQ9udIXsPZ61VWaHGZ5XW/jyjmr6t2xz2jGVwbQ=="],
+
+    "ai/@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@4.0.15", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@standard-schema/spec": "^1.1.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-8XiKWbemmCbvNN0CLR9u3PQiet4gtEVIrX4zzLxnCj06AwsEDJwJVBbKrEI4t6qE8XRSIvU2irka0dcpziKW6w=="],
+  }
+}
diff --git a/benchmarks/headless.ts b/benchmarks/headless.ts
new file mode 100644
index 00000000..bb2af701
--- /dev/null
+++ b/benchmarks/headless.ts
@@ -0,0 +1,190 @@
+#!/usr/bin/env bun
+import { readFile, writeFile, mkdir } from "node:fs/promises"
+import { join, dirname } from "node:path"
+import { stepCountIs, streamText, type CoreMessage } from "ai"
+import { tool } from "ai"
+import { createFriendli } from "@friendliai/ai-provider"
+import { z } from "zod"
+import { formatHashLines } from "../src/tools/hashline-edit/hash-computation"
+import { normalizeHashlineEdits } from "../src/tools/hashline-edit/normalize-edits"
+import { applyHashlineEditsWithReport } from "../src/tools/hashline-edit/edit-operations"
+import { canonicalizeFileText, restoreFileText } from "../src/tools/hashline-edit/file-text-canonicalization"
+
+const DEFAULT_MODEL = "MiniMaxAI/MiniMax-M2.5"
+const MAX_STEPS = 50
+const sessionId = `bench-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
+
+const emit = (event: Record<string, unknown>) =>
+  console.log(JSON.stringify({ sessionId, timestamp: new Date().toISOString(), ...event }))
+
+// ── CLI ──────────────────────────────────────────────────────
+/** Reads -p/--prompt and -m/--model from process.argv; prints usage and exits with 1 when no prompt is given. */
+function parseArgs(): { prompt: string; modelId: string } {
+  const argv = process.argv.slice(2)
+  let prompt = ""
+  let modelId = DEFAULT_MODEL
+  for (let idx = 0; idx < argv.length; idx++) {
+    const flag = argv[idx]
+    const value = argv[idx + 1]
+    if (!value) continue // a flag is only honoured when a non-empty value follows it
+    if (flag === "-p" || flag === "--prompt") prompt = value
+    else if (flag === "-m" || flag === "--model") modelId = value
+    else if (flag !== "--reasoning-mode") continue // unrecognised arguments (e.g. --think) are ignored
+    idx++ // skip the value consumed by the flag (the --reasoning-mode value is discarded)
+  }
+  if (prompt === "") {
+    console.error("Usage: bun run benchmarks/headless.ts -p <prompt> [-m <model>]")
+    process.exit(1)
+  }
+  return { prompt, modelId }
+}
+
+// ── Tools ────────────────────────────────────────────────────
+/** read_file tool: returns the file's content tagged with LINE#ID anchors, or an error string. */
+const readFileTool = tool({
+  description: "Read a file with hashline-tagged content (LINE#ID format)",
+  inputSchema: z.object({ path: z.string().describe("File path") }),
+  execute: async ({ path }) => {
+    try {
+      const text = await readFile(join(process.cwd(), path), "utf-8")
+      const header = `OK - read file\npath: ${path}\nlines: ${text.split("\n").length}`
+      return `${header}\n\n${formatHashLines(text)}`
+    } catch {
+      // Every failure, not only a missing file, is reported with this message.
+      return `Error: File not found: ${path}`
+    }
+  },
+})
+
+/**
+ * edit_file tool: applies hashline edits to a file resolved against process.cwd().
+ * A missing file is only created when every edit is an unanchored append/prepend.
+ * Failures are returned to the model as "Error: ..." strings instead of being thrown.
+ */
+const editFileTool = tool({
+  description: "Edit a file using hashline anchors (LINE#ID format)",
+  inputSchema: z.object({
+    path: z.string(),
+    edits: z.array(
+      z.object({
+        op: z.enum(["replace", "append", "prepend"]),
+        pos: z.string().optional(),
+        end: z.string().optional(),
+        lines: z.union([z.array(z.string()), z.string(), z.null()]),
+      })
+    ).min(1),
+  }),
+  execute: async ({ path, edits }) => {
+    const fullPath = join(process.cwd(), path)
+    try {
+      let rawContent = ""
+      let exists = true
+      try {
+        rawContent = await readFile(fullPath, "utf-8")
+      } catch {
+        exists = false // any read failure is treated as a missing file
+      }
+      const normalized = normalizeHashlineEdits(edits)
+      if (!exists) {
+        const canCreate = normalized.every((e) => (e.op === "append" || e.op === "prepend") && !e.pos)
+        if (!canCreate) return `Error: File not found: ${path}`
+      }
+      const envelope = canonicalizeFileText(rawContent)
+      const result = applyHashlineEditsWithReport(envelope.content, normalized)
+      if (result.content === envelope.content) {
+        return `Error: No changes made to ${path}. The edits produced identical content.`
+      }
+
+      const writeContent = restoreFileText(result.content, envelope)
+      await mkdir(dirname(fullPath), { recursive: true })
+      await writeFile(fullPath, writeContent, "utf-8")
+
+      // A file that did not exist had zero lines; "".split("\n").length would report one.
+      const oldLineCount = exists ? rawContent.split("\n").length : 0
+      const newLineCount = writeContent.split("\n").length
+      const delta = newLineCount - oldLineCount
+      const sign = delta > 0 ? "+" : ""
+      const action = exists ? "Updated" : "Created"
+      return `${action} ${path}\n${edits.length} edit(s) applied, ${sign}${delta} line(s)`
+    } catch (error) {
+      return `Error: ${error instanceof Error ? error.message : String(error)}`
+    }
+  },
+})
+
+// ── Agent Loop ───────────────────────────────────────────────
+/**
+ * Runs the benchmark conversation for the prompt given on the command line.
+ * Each iteration performs one model step (stopWhen: stepCountIs(1)), reports tool
+ * calls and tool results through emit(), and appends the response messages to the
+ * history. The loop stops after MAX_STEPS iterations, or earlier when a step ends with
+ * a finish reason other than "tool-calls"; that step's text is emitted as "assistant".
+ */
+async function run() {
+  const { prompt, modelId } = parseArgs()
+  const friendli = createFriendli({ apiKey: process.env.FRIENDLI_TOKEN! })
+  const model = friendli(modelId)
+  const tools = { read_file: readFileTool, edit_file: editFileTool }
+
+  emit({ type: "user", content: prompt })
+  const messages: CoreMessage[] = [{ role: "user", content: prompt }]
+  const system =
+    "You are a code editing assistant. Use read_file to read files and edit_file to edit them. " +
+    "Always read a file before editing it to get fresh LINE#ID anchors."
+
+  for (let step = 0; step < MAX_STEPS; step++) {
+    const stream = streamText({ model, tools, messages, system, stopWhen: stepCountIs(1) })
+    let currentText = ""
+    for await (const part of stream.fullStream) {
+      switch (part.type) {
+        case "text-delta":
+          currentText += part.text
+          break
+        case "tool-call":
+          // AI SDK v5+ tool-call parts carry the arguments in `input` (named `args` in v4).
+          emit({
+            type: "tool_call",
+            tool_call_id: part.toolCallId,
+            tool_name: part.toolName,
+            tool_input: part.input,
+            model: modelId,
+          })
+          break
+        case "tool-result":
+          // AI SDK v5+ tool-result parts carry the return value in `output` (named `result` in v4).
+          emit({
+            type: "tool_result",
+            tool_call_id: part.toolCallId,
+            output: typeof part.output === "string" ? part.output : JSON.stringify(part.output),
+          })
+          break
+      }
+    }
+
+    const response = await stream.response
+    messages.push(...response.messages)
+
+    const finishReason = await stream.finishReason
+    if (finishReason !== "tool-calls") {
+      if (currentText.trim()) {
+        emit({ type: "assistant", content: currentText, model: modelId })
+      }
+      break
+    }
+  }
+}
+
+// ── Signal + Startup ─────────────────────────────────────────
+process.once("SIGINT", () => process.exit(0))
+process.once("SIGTERM", () => process.exit(143))
+
+const startTime = Date.now()
+run()
+  .catch((error) => {
+    emit({ type: "error", error: error instanceof Error ? error.message : String(error) })
+    process.exit(1)
+  })
+  .then(() => {
+    const elapsed = ((Date.now() - startTime) / 1000).toFixed(2)
+    console.error(`[headless] Completed in ${elapsed}s`)
+  })
diff --git a/benchmarks/package.json b/benchmarks/package.json
new file mode 100644
index 00000000..bbddfed8
--- /dev/null
+++ b/benchmarks/package.json
@@ -0,0 +1,19 @@
+{
+  "name": "hashline-edit-benchmark",
+  "version": "0.1.0",
+  "private": true,
+  "type": "module",
+  "description": "Hashline edit tool benchmark using Vercel AI SDK with FriendliAI provider",
+  "scripts": {
+    "bench:basic": "bun run test-edit-ops.ts",
+    "bench:edge": "bun run test-edge-cases.ts",
+    "bench:multi": "bun run test-multi-model.ts",
+    "bench:all": "bun run bench:basic && bun run bench:edge"
+  },
+  "dependencies": {
+    "ai": "^6.0.94",
+    "@ai-sdk/openai": "^1.3.0",
+    "@friendliai/ai-provider": "^1.0.9",
+    "zod": "^4.1.0"
+  }
+}

From 04f50bac1ffe260c67b5a2ccd4c99f2d7ef27654 Mon Sep 17 00:00:00 2001
From: minpeter <minpeter@friendli.ai>
Date: Fri, 27 Feb 2026 01:37:49 +0900
Subject: [PATCH 3/4] feat(benchmarks): add hashline-edit test suites (46
 tests)

Ported from code-editing-agent benchmark:
- test-edit-ops.ts: 21 basic edit operations (replace, append, prepend, delete, batch, range)
- test-edge-cases.ts: 25 edge cases (unicode, long lines, whitespace, special chars, file creation)
- test-multi-model.ts: multi-model comparison runner

Verified 21/21 + 25/25 (100%) with Minimax M2.5 via FriendliAI.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
---
 benchmarks/test-edge-cases.ts  | 1117 ++++++++++++++++++++++++++++++++
 benchmarks/test-edit-ops.ts    |  808 +++++++++++++++++++++++
 benchmarks/test-multi-model.ts |  269 ++++++++
 3 files changed, 2194 insertions(+)
 create mode 100644 benchmarks/test-edge-cases.ts
 create mode 100644 benchmarks/test-edit-ops.ts
 create mode 100644 benchmarks/test-multi-model.ts

diff --git a/benchmarks/test-edge-cases.ts b/benchmarks/test-edge-cases.ts
new file mode 100644
index 00000000..a1916c56
--- /dev/null
+++ b/benchmarks/test-edge-cases.ts
@@ -0,0 +1,1117 @@
+#!/usr/bin/env bun
+/**
+ * Comprehensive headless edit_file stress test: 25 edge cases
+ *
+ * Tests: 5 basic ops + 14 creative cases + 6 whitespace cases
+ * Each runs via headless mode with its own demo file + prompt.
+ *
+ * Usage:
+ *   bun run benchmarks/test-edge-cases.ts [-m <model>] [--provider <provider>] [--think] [--reasoning-mode <mode>]
+ */
+
+import { spawn } from "node:child_process";
+import { mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join, resolve } from "node:path";
+
+// ── CLI arg passthrough ───────────────────────────────────────
+const extraArgs: string[] = []; // recognized flags collected here are spread into the headless run's argv
+const rawArgs = process.argv.slice(2); // drop the runtime and script path entries
+for (let i = 0; i < rawArgs.length; i++) {
+  const arg = rawArgs[i];
+  if (
+    (arg === "-m" || arg === "--model" || arg === "--provider") &&
+    i + 1 < rawArgs.length
+  ) {
+    extraArgs.push(arg, rawArgs[i + 1]); // value-taking flag: forward the flag together with its value
+    i++; // skip the value that was just consumed
+  } else if (arg === "--think" || arg === "--no-translate") {
+    extraArgs.push(arg); // boolean switch, forwarded without a value
+  } else if (arg === "--reasoning-mode" && i + 1 < rawArgs.length) {
+    extraArgs.push(arg, rawArgs[i + 1]);
+    i++;
+  } // any other argument (or a value flag given as the last argument) is not forwarded
+}
+
+// ── Colors ────────────────────────────────────────────────────
+const BOLD = "\x1b[1m"; // ANSI SGR escape sequences used to style terminal output
+const GREEN = "\x1b[32m";
+const RED = "\x1b[31m";
+const YELLOW = "\x1b[33m";
+const DIM = "\x1b[2m";
+const CYAN = "\x1b[36m";
+const RESET = "\x1b[0m"; // resets styling applied by the codes above
+
+const pass = (msg: string) => console.log(`  ${GREEN}✓${RESET} ${msg}`); // indented line with a green check mark
+const fail = (msg: string) => console.log(`  ${RED}✗${RESET} ${msg}`); // indented line with a red cross
+const info = (msg: string) => console.log(`  ${DIM}${msg}${RESET}`); // indented line, whole message dimmed
+const warn = (msg: string) => console.log(`  ${YELLOW}⚠${RESET} ${msg}`); // indented line with a yellow warning sign
+
+// ── Test case definition ─────────────────────────────────────
+interface TestCase { // one edge-case scenario: a seed file, a prompt, and a checker
+  fileContent: string; // initial content written to the demo file before the run
+  fileName: string; // demo file name, joined onto the test directory
+  name: string; // numbered, human-readable label for the case
+  prompt: string; // instruction text handed to the headless run via `-p`
+  validate: (content: string) => { passed: boolean; reason: string }; // verdict plus explanation; presumably called with the file content after the run — confirm at the call site
+}
+
+const TEST_CASES: TestCase[] = [
+  {
+    name: "1. Single-line file — replace only line",
+    fileName: "single-line.txt",
+    fileContent: "only_line_original",
+    prompt: [
+      "Read single-line.txt with read_file.",
+      "Replace the only line using edit_file with edits: [{ op: 'replace', pos: '<line1 anchor>', lines: ['only_line_updated'] }].",
+      "Expected final content exactly one line: only_line_updated.",
+    ].join(" "),
+    validate: (content) => {
+      const normalized = content.replace(/\r/g, "").trimEnd();
+      const lines = normalized.split("\n");
+      if (lines.length === 1 && lines[0] === "only_line_updated") {
+        return { passed: true, reason: "single line replaced correctly" };
+      }
+      if (normalized.includes("only_line_original")) {
+        return { passed: false, reason: "original line still present" };
+      }
+      return {
+        passed: false,
+        reason: `expected one line 'only_line_updated', got ${lines.length} lines`,
+      };
+    },
+  },
+  {
+    name: "2. Large file (20 lines) — replace middle line 11",
+    fileName: "twenty-lines.txt",
+    fileContent: Array.from(
+      { length: 20 },
+      (_, i) => `line${String(i + 1).padStart(2, "0")}: value-${i + 1}`
+    ).join("\n"),
+    prompt: [
+      "Read twenty-lines.txt with read_file.",
+      "Replace line 11 using edit_file with edits: [{ op: 'replace', pos: '<line11 anchor>', lines: ['line11: UPDATED-MIDDLE'] }].",
+      "Keep all other lines unchanged.",
+    ].join(" "),
+    validate: (content) => {
+      const lines = content.replace(/\r/g, "").trimEnd().split("\n");
+      if (lines.length !== 20) {
+        return {
+          passed: false,
+          reason: `expected 20 lines, got ${lines.length}`,
+        };
+      }
+      if (lines[10] !== "line11: UPDATED-MIDDLE") {
+        return {
+          passed: false,
+          reason: `line 11 mismatch: '${lines[10] ?? "<missing>"}'`,
+        };
+      }
+      if (lines[9] !== "line10: value-10" || lines[11] !== "line12: value-12") {
+        return {
+          passed: false,
+          reason: "neighboring lines changed unexpectedly",
+        };
+      }
+      return {
+        passed: true,
+        reason: "line 11 replaced and surrounding lines preserved",
+      };
+    },
+  },
+  {
+    name: "3. Range replace entire file (first→last to one line)",
+    fileName: "range-all.txt",
+    fileContent: ["first", "second", "third", "fourth", "fifth"].join("\n"),
+    prompt: [
+      "Read range-all.txt with read_file.",
+      "Replace the full file from first line to last line using one range edit: edits: [{ op: 'replace', pos: '<line1 anchor>', end: '<line5 anchor>', lines: ['collapsed-to-one-line'] }].",
+      "Expected final content exactly: collapsed-to-one-line.",
+    ].join(" "),
+    validate: (content) => {
+      const normalized = content.replace(/\r/g, "").trimEnd();
+      if (normalized === "collapsed-to-one-line") {
+        return {
+          passed: true,
+          reason: "entire file collapsed to single replacement line",
+        };
+      }
+      if (normalized.includes("first") || normalized.includes("fifth")) {
+        return {
+          passed: false,
+          reason: "original range content still present",
+        };
+      }
+      return {
+        passed: false,
+        reason: `unexpected final content: '${normalized.slice(0, 120)}'`,
+      };
+    },
+  },
+  {
+    name: "4. Mixed ops in one call (replace + append + prepend)",
+    fileName: "mixed-one-call.txt",
+    fileContent: ["alpha", "beta", "gamma"].join("\n"),
+    prompt: [
+      "Read mixed-one-call.txt with read_file.",
+      "Call edit_file exactly once with three edits in one edits array:",
+      "edits: [",
+      "{ op: 'replace', pos: '<line2 anchor>', lines: ['BETA'] },",
+      "{ op: 'append', pos: '<line3 anchor>', lines: ['delta'] },",
+      "{ op: 'prepend', pos: '<line1 anchor>', lines: ['start'] }",
+      "].",
+      "Expected final content: start, alpha, BETA, gamma, delta.",
+    ].join(" "),
+    validate: (content) => {
+      const lines = content.replace(/\r/g, "").trimEnd().split("\n");
+      const expected = ["start", "alpha", "BETA", "gamma", "delta"];
+      if (lines.length !== expected.length) {
+        return {
+          passed: false,
+          reason: `expected ${expected.length} lines, got ${lines.length}`,
+        };
+      }
+      for (let i = 0; i < expected.length; i++) {
+        if (lines[i] !== expected[i]) {
+          return {
+            passed: false,
+            reason: `line ${i + 1} expected '${expected[i]}' but got '${lines[i]}'`,
+          };
+        }
+      }
+      return {
+        passed: true,
+        reason: "single call applied replace, append, and prepend",
+      };
+    },
+  },
+  {
+    name: "5. Large batch (5 replaces) in one call",
+    fileName: "batch-five.txt",
+    fileContent: [
+      "row-1",
+      "row-2",
+      "row-3",
+      "row-4",
+      "row-5",
+      "row-6",
+      "row-7",
+      "row-8",
+      "row-9",
+      "row-10",
+    ].join("\n"),
+    prompt: [
+      "Read batch-five.txt with read_file.",
+      "Call edit_file once with five replace edits in one edits array:",
+      "edits: [",
+      "{ op: 'replace', pos: '<line1 anchor>', lines: ['ROW-1'] },",
+      "{ op: 'replace', pos: '<line3 anchor>', lines: ['ROW-3'] },",
+      "{ op: 'replace', pos: '<line5 anchor>', lines: ['ROW-5'] },",
+      "{ op: 'replace', pos: '<line7 anchor>', lines: ['ROW-7'] },",
+      "{ op: 'replace', pos: '<line10 anchor>', lines: ['ROW-10'] }",
+      "].",
+    ].join(" "),
+    validate: (content) => {
+      const lines = content.replace(/\r/g, "").trimEnd().split("\n");
+      if (lines.length !== 10) {
+        return {
+          passed: false,
+          reason: `expected 10 lines, got ${lines.length}`,
+        };
+      }
+      const checks: [number, string][] = [
+        [0, "ROW-1"],
+        [2, "ROW-3"],
+        [4, "ROW-5"],
+        [6, "ROW-7"],
+        [9, "ROW-10"],
+      ];
+      for (const [idx, expected] of checks) {
+        if (lines[idx] !== expected) {
+          return {
+            passed: false,
+            reason: `line ${idx + 1} expected '${expected}' but got '${lines[idx]}'`,
+          };
+        }
+      }
+      if (
+        lines[1] !== "row-2" ||
+        lines[3] !== "row-4" ||
+        lines[8] !== "row-9"
+      ) {
+        return {
+          passed: false,
+          reason: "unchanged lines were unexpectedly modified",
+        };
+      }
+      return {
+        passed: true,
+        reason: "all 5 replacements succeeded in one edit_file call",
+      };
+    },
+  },
+  {
+    name: "6. Consecutive edits (read→edit→read→edit)",
+    fileName: "consecutive.txt",
+    fileContent: ["stage: one", "value: 1", "status: draft"].join("\n"),
+    prompt: [
+      "Read consecutive.txt with read_file.",
+      "First call edit_file with edits: [{ op: 'replace', pos: '<line2 anchor>', lines: ['value: 2'] }].",
+      "Then read consecutive.txt with read_file again.",
+      "Second, call edit_file again with edits: [{ op: 'replace', pos: '<line3 anchor>', lines: ['status: final'] }].",
+      "Expected final content: stage: one, value: 2, status: final.",
+    ].join(" "),
+    validate: (content) => {
+      const lines = content.replace(/\r/g, "").trimEnd().split("\n");
+      const expected = ["stage: one", "value: 2", "status: final"];
+      if (lines.length !== expected.length) {
+        return {
+          passed: false,
+          reason: `expected ${expected.length} lines, got ${lines.length}`,
+        };
+      }
+      for (let i = 0; i < expected.length; i++) {
+        if (lines[i] !== expected[i]) {
+          return {
+            passed: false,
+            reason: `line ${i + 1} expected '${expected[i]}' but got '${lines[i]}'`,
+          };
+        }
+      }
+      return {
+        passed: true,
+        reason: "two sequential edit_file calls produced expected final state",
+      };
+    },
+  },
+  {
+    name: "7. Create new file via append",
+    fileName: "create-via-append.txt",
+    fileContent: "",
+    prompt: [
+      "Create create-via-append.txt via edit_file append (do not call read_file first).",
+      "Use one call with edits: [{ op: 'append', lines: ['created line 1', 'created line 2'] }].",
+      "Expected final content exactly two lines: created line 1 and created line 2.",
+    ].join(" "),
+    validate: (content) => {
+      const normalized = content.replace(/\r/g, "").trimEnd();
+      const lines = normalized === "" ? [] : normalized.split("\n");
+      if (lines.length !== 2) {
+        return {
+          passed: false,
+          reason: `expected 2 lines, got ${lines.length}`,
+        };
+      }
+      if (lines[0] !== "created line 1" || lines[1] !== "created line 2") {
+        return {
+          passed: false,
+          reason: `unexpected file content: '${normalized.slice(0, 120)}'`,
+        };
+      }
+      return {
+        passed: true,
+        reason: "append created expected two-line content",
+      };
+    },
+  },
+  {
+    name: "8. Unicode/emoji line replacement",
+    fileName: "unicode.txt",
+    fileContent: ["status: pending", "message: old"].join("\n"),
+    prompt: [
+      "Read unicode.txt with read_file.",
+      "Replace line 2 with Unicode content using edit_file and edits: [{ op: 'replace', pos: '<line2 anchor>', lines: ['message: 🎉🚀 한국어 테스트 완료'] }].",
+      "Expected line 2 exactly: message: 🎉🚀 한국어 테스트 완료.",
+    ].join(" "),
+    validate: (content) => {
+      const lines = content.replace(/\r/g, "").trimEnd().split("\n");
+      if (lines[1] !== "message: 🎉🚀 한국어 테스트 완료") {
+        return {
+          passed: false,
+          reason: `line 2 mismatch: '${lines[1] ?? "<missing>"}'`,
+        };
+      }
+      if (content.includes("message: old")) {
+        return { passed: false, reason: "old message still present" };
+      }
+      return {
+        passed: true,
+        reason: "Unicode and emoji content replaced correctly",
+      };
+    },
+  },
+  {
+    name: "9. Backticks/template literal content",
+    fileName: "template.ts",
+    fileContent: ["const name = 'dev';", "const msg = 'old';"].join("\n"),
+    prompt: [
+      "Read template.ts with read_file.",
+      "Replace line 2 using edit_file with edits: [{ op: 'replace', pos: '<line2 anchor>', lines: ['const msg = `hello \u0024{name}`;'] }].",
+      "Expected line 2 exactly: const msg = `hello \u0024{name}`;",
+    ].join(" "),
+    validate: (content) => {
+      const expected = "const msg = `hello \u0024{name}`;";
+      const lines = content.replace(/\r/g, "").trimEnd().split("\n");
+      if (lines[1] !== expected) {
+        return {
+          passed: false,
+          reason: `line 2 expected '${expected}' but got '${lines[1] ?? "<missing>"}'`,
+        };
+      }
+      if (content.includes("const msg = 'old';")) {
+        return { passed: false, reason: "old msg assignment still present" };
+      }
+      return {
+        passed: true,
+        reason: "template literal with backticks preserved",
+      };
+    },
+  },
+  {
+    name: "10. Regex pattern content",
+    fileName: "regex.ts",
+    fileContent: ["const re = /old/;", "const ok = true;"].join("\n"),
+    prompt: [
+      "Read regex.ts with read_file.",
+      "Replace line 1 using edit_file with edits: [{ op: 'replace', pos: '<line1 anchor>', lines: ['const re = /^[a-z]+\\d{2,}$/gi;'] }].",
+      "Expected line 1 exactly: const re = /^[a-z]+\\d{2,}$/gi;",
+    ].join(" "),
+    validate: (content) => {
+      const expected = "const re = /^[a-z]+\\d{2,}$/gi;";
+      const lines = content.replace(/\r/g, "").trimEnd().split("\n");
+      if (lines[0] !== expected) {
+        return {
+          passed: false,
+          reason: `regex line mismatch: '${lines[0] ?? "<missing>"}'`,
+        };
+      }
+      if (content.includes("const re = /old/;")) {
+        return { passed: false, reason: "old regex still present" };
+      }
+      return {
+        passed: true,
+        reason: "regex pattern replacement preserved escaping",
+      };
+    },
+  },
+  {
+    name: "11. Escaped quotes and backslashes",
+    fileName: "path.cfg",
+    fileContent: ['path = "/tmp/file.txt"', "mode = rw"].join("\n"),
+    prompt: [
+      "Read path.cfg with read_file.",
+      "Replace line 1 using edit_file with edits: [{ op: 'replace', pos: '<line1 anchor>', lines: ['path = \"C:\\\\Users\\\\admin\\\\file.txt\"'] }].",
+      'The file should contain a Windows-style path with backslashes: C:\\Users\\admin\\file.txt.',
+    ].join(" "),
+    validate: (content) => {
+      const lines = content.replace(/\r/g, "").trimEnd().split("\n");
+      const line1 = lines[0] ?? "";
+      // Accept either single or double backslashes — both are valid model interpretations
+      const hasSingleBS = line1.includes('C:\\Users\\admin\\file.txt');
+      const hasDoubleBS = line1.includes('C:\\\\Users\\\\admin\\\\file.txt');
+      const hasPath = hasSingleBS || hasDoubleBS;
+      const hasQuotes = line1.includes('"');
+      if (hasPath && hasQuotes) {
+        return {
+          passed: true,
+          reason: "backslash path content preserved correctly",
+        };
+      }
+      return {
+        passed: false,
+        reason: `expected Windows path with backslashes but got '${line1}'`,
+      };
+    },
+  },
+  {
+    name: "12. HTML tags in content",
+    fileName: "html-snippet.txt",
+    fileContent: ["snippet: old", "done: true"].join("\n"),
+    prompt: [
+      "Read html-snippet.txt with read_file.",
+      "Replace line 1 using edit_file with edits: [{ op: 'replace', pos: '<line1 anchor>', lines: ['<div class=\"container\"><p>Hello</p></div>'] }].",
+      'Expected line 1 exactly: <div class="container"><p>Hello</p></div>.',
+    ].join(" "),
+    validate: (content) => {
+      const expected = '<div class="container"><p>Hello</p></div>';
+      const lines = content.replace(/\r/g, "").trimEnd().split("\n");
+      if (lines[0] !== expected) {
+        return {
+          passed: false,
+          reason: `HTML line mismatch: '${lines[0] ?? "<missing>"}'`,
+        };
+      }
+      if (content.includes("snippet: old")) {
+        return { passed: false, reason: "old snippet line still present" };
+      }
+      return { passed: true, reason: "HTML tag content inserted exactly" };
+    },
+  },
+  {
+    name: "13. Very long line (180 chars)",
+    fileName: "long-line.txt",
+    fileContent: ["line-1", "short-line"].join("\n"),
+    prompt: [
+      "Read long-line.txt with read_file.",
+      `Replace line 2 using edit_file with edits: [{ op: 'replace', pos: '<line2 anchor>', lines: ['${"L".repeat(180)}'] }].`,
+      "Expected line 2 to be exactly 180 characters.",
+    ].join(" "),
+    validate: (content) => {
+      const expected = "L".repeat(180);
+      const lines = content.replace(/\r/g, "").trimEnd().split("\n");
+      if (!lines[1]) {
+        return { passed: false, reason: "line 2 is missing" };
+      }
+      if (Math.abs(lines[1].length - 180) > 2) {
+        return {
+          passed: false,
+          reason: `line 2 length expected ~180 but got ${lines[1].length}`,
+        };
+      }
+      if (!lines[1].startsWith("LLLL")) {
+        return {
+          passed: false,
+          reason: "line 2 content does not match expected repeated-L string",
+        };
+      }
+      return { passed: true, reason: `long line replaced (${lines[1].length} chars)` };
+    },
+  },
+  {
+    name: "14. SQL query content",
+    fileName: "sql-content.txt",
+    fileContent: ["SELECT 1;", "done"].join("\n"),
+    prompt: [
+      "Read sql-content.txt with read_file.",
+      "Replace line 1 using edit_file with edits: [{ op: 'replace', pos: '<line1 anchor>', lines: ['SELECT u.name, o.total FROM users u JOIN orders o ON u.id = o.user_id WHERE o.total > 100;'] }].",
+      "Expected line 1 exactly the provided SQL query.",
+    ].join(" "),
+    validate: (content) => {
+      const expected =
+        "SELECT u.name, o.total FROM users u JOIN orders o ON u.id = o.user_id WHERE o.total > 100;";
+      const lines = content.replace(/\r/g, "").trimEnd().split("\n");
+      if (lines[0] !== expected) {
+        return {
+          passed: false,
+          reason: `SQL line mismatch: '${lines[0] ?? "<missing>"}'`,
+        };
+      }
+      return { passed: true, reason: "SQL query line replaced exactly" };
+    },
+  },
+  {
+    name: "15. Mixed indentation (tab -> spaces)",
+    fileName: "mixed-indent.ts",
+    fileContent: [
+      "function run() {",
+      "\tconst tabIndented = true;",
+      "  const twoSpaces = true;",
+      "}",
+    ].join("\n"),
+    prompt: [
+      "Read mixed-indent.ts with read_file.",
+      "Replace the tab-indented line 2 using edit_file with edits: [{ op: 'replace', pos: '<line2 anchor>', lines: ['    const tabIndented = true;'] }].",
+      "Expected line 2 to be 4 spaces + const tabIndented = true;",
+    ].join(" "),
+    validate: (content) => { // passes only when line 2 is the exact 4-space replacement and line 3 is untouched
+      const normalized = content.replace(/\r/g, "");
+      const lines = normalized.endsWith("\n") // strip a single final newline only, so in-line whitespace is kept
+        ? normalized.slice(0, -1).split("\n")
+        : normalized.split("\n");
+      if (lines[1]?.includes("\t")) { // checked before the exact match so a leftover tab gets its own reason
+        return {
+          passed: false,
+          reason: "line 2 still contains a tab character",
+        };
+      }
+      if (lines[1] !== "    const tabIndented = true;") { // also covers a missing line 2
+        return {
+          passed: false,
+          reason: `line 2 mismatch: '${lines[1] ?? "<missing>"}'`,
+        };
+      }
+      if (lines[2] !== "  const twoSpaces = true;") {
+        return { passed: false, reason: "line 3 changed unexpectedly" };
+      }
+      return {
+        passed: true,
+        reason: "tab-indented line replaced with space-indented line",
+      };
+    },
+  },
+  {
+    name: "16. Trailing whitespace preservation",
+    fileName: "trailing-whitespace.txt",
+    fileContent: ["start", "text   ", "end"].join("\n"),
+    prompt: [
+      "Read trailing-whitespace.txt with read_file.",
+      "Replace line 2 using edit_file with edits: [{ op: 'replace', pos: '<line2 anchor>', lines: ['new_text   '] }].",
+      "Keep exactly three trailing spaces after new_text.",
+    ].join(" "),
+    validate: (content) => {
+      const normalized = content.replace(/\r/g, "");
+      const lines = normalized.endsWith("\n")
+        ? normalized.slice(0, -1).split("\n")
+        : normalized.split("\n");
+      if (!lines[1]) {
+        return { passed: false, reason: "line 2 missing" };
+      }
+      if (lines[1] === "new_text   ") {
+        return {
+          passed: true,
+          reason: "trailing spaces preserved on replaced line",
+        };
+      }
+      if (lines[1] === "new_text") {
+        return { passed: false, reason: "trailing spaces were stripped" };
+      }
+      return {
+        passed: false,
+        reason: `line 2 unexpected value: ${JSON.stringify(lines[1])}`,
+      };
+    },
+  },
+  {
+    name: "17. Replace line containing only spaces",
+    fileName: "spaces-only-line.txt",
+    fileContent: ["alpha", "    ", "omega"].join("\n"),
+    prompt: [
+      "Read spaces-only-line.txt with read_file.",
+      "Replace the line that contains only 4 spaces (line 2) using edit_file with edits: [{ op: 'replace', pos: '<line2 anchor>', lines: ['middle-content'] }].",
+      "Expected final content: alpha, middle-content, omega.",
+    ].join(" "),
+    validate: (content) => {
+      const normalized = content.replace(/\r/g, "");
+      const lines = normalized.endsWith("\n")
+        ? normalized.slice(0, -1).split("\n")
+        : normalized.split("\n");
+      if (lines.length !== 3) {
+        return {
+          passed: false,
+          reason: `expected 3 lines, got ${lines.length}`,
+        };
+      }
+      if (lines[0] !== "alpha" || lines[2] !== "omega") {
+        return {
+          passed: false,
+          reason: "non-target lines changed unexpectedly",
+        };
+      }
+      if (lines[1].trim() !== "middle-content") {
+        return {
+          passed: false,
+          reason: `line 2 expected 'middle-content' but got ${JSON.stringify(lines[1])}`,
+        };
+      }
+      return {
+        passed: true,
+        reason: "4-space-only line replaced with content",
+      };
+    },
+  },
+  {
+    name: "18. Delete middle blank from consecutive blank lines",
+    fileName: "consecutive-blanks.txt",
+    fileContent: ["top", "", "", "", "bottom"].join("\n"),
+    prompt: [
+      "Read consecutive-blanks.txt with read_file.",
+      "Delete only the middle blank line (line 3 of 5) using edit_file with edits: [{ op: 'replace', pos: '<line3 anchor>', lines: [] }].",
+      "Keep the other two blank lines intact.",
+    ].join(" "),
+    validate: (content) => {
+      const normalized = content.replace(/\r/g, "");
+      const lines = normalized.endsWith("\n")
+        ? normalized.slice(0, -1).split("\n")
+        : normalized.split("\n");
+      const expected = ["top", "", "", "bottom"];
+      if (lines.length !== expected.length) {
+        return {
+          passed: false,
+          reason: `expected ${expected.length} lines after deleting one blank, got ${lines.length}`,
+        };
+      }
+      for (let i = 0; i < expected.length; i++) {
+        if (lines[i] !== expected[i]) {
+          return {
+            passed: false,
+            reason: `line ${i + 1} expected ${JSON.stringify(expected[i])} but got ${JSON.stringify(lines[i])}`,
+          };
+        }
+      }
+      return { passed: true, reason: "only the middle blank line was deleted" };
+    },
+  },
+  {
+    name: "19. Indentation increase (2 spaces -> 8 spaces)",
+    fileName: "indent-increase.js",
+    fileContent: ["if (flag) {", "  execute();", "}"].join("\n"),
+    prompt: [
+      "Read indent-increase.js with read_file.",
+      "Replace line 2 using edit_file with edits: [{ op: 'replace', pos: '<line2 anchor>', lines: ['        execute();'] }].",
+      "Expected line 2 indentation increased from 2 spaces to 8 spaces.",
+    ].join(" "),
+    validate: (content) => {
+      const normalized = content.replace(/\r/g, "");
+      const lines = normalized.endsWith("\n")
+        ? normalized.slice(0, -1).split("\n")
+        : normalized.split("\n");
+      if (lines.length !== 3) {
+        return {
+          passed: false,
+          reason: `expected 3 lines, got ${lines.length}`,
+        };
+      }
+      if (lines[1] !== "        execute();") {
+        return {
+          passed: false,
+          reason: `line 2 expected 8-space indentation, got ${JSON.stringify(lines[1])}`,
+        };
+      }
+      if (lines[0] !== "if (flag) {" || lines[2] !== "}") {
+        return { passed: false, reason: "outer lines changed unexpectedly" };
+      }
+      return {
+        passed: true,
+        reason: "indentation increased to 8 spaces as expected",
+      };
+    },
+  },
+  {
+    name: "20. Content that resembles hashline format",
+    fileName: "hashline-content.txt",
+    fileContent: ["anchor: old", "tail"].join("\n"),
+    prompt: [
+      "Read hashline-content.txt with read_file.",
+      "Replace line 1 using edit_file with edits: [{ op: 'replace', pos: '<line1 anchor>', lines: ['anchor: 1#AB format is used'] }].",
+      "Expected line 1 exactly: anchor: 1#AB format is used.",
+    ].join(" "),
+    validate: (content) => {
+      const lines = content.replace(/\r/g, "").trimEnd().split("\n");
+      if (lines[0] !== "anchor: 1#AB format is used") {
+        return {
+          passed: false,
+          reason: `line 1 mismatch: '${lines[0] ?? "<missing>"}'`,
+        };
+      }
+      return {
+        passed: true,
+        reason: "hashline-like literal content preserved correctly",
+      };
+    },
+  },
+  {
+    name: "21. Literal backslash-n content",
+    fileName: "literal-backslash-n.txt",
+    fileContent: ["placeholder", "tail"].join("\n"),
+    prompt: [
+      "Read literal-backslash-n.txt with read_file.",
+      "Replace line 1 using edit_file with edits: [{ op: 'replace', pos: '<line1 anchor>', lines: ['line1\\nline2 (literal backslash-n, not newline)'] }].",
+      "Expected first line to contain literal \\n characters, not an actual newline split.",
+    ].join(" "),
+    validate: (content) => {
+      const expected = "line1\\nline2 (literal backslash-n, not newline)";
+      const lines = content.replace(/\r/g, "").trimEnd().split("\n");
+      if (lines.length !== 2) {
+        return {
+          passed: false,
+          reason: `expected 2 lines total, got ${lines.length}`,
+        };
+      }
+      if (lines[0] !== expected) {
+        return {
+          passed: false,
+          reason: `line 1 expected '${expected}' but got '${lines[0] ?? "<missing>"}'`,
+        };
+      }
+      return {
+        passed: true,
+        reason: "literal \\n sequence preserved in a single line",
+      };
+    },
+  },
+  {
+    name: "22. Append multiple lines at once",
+    fileName: "append-multi.txt",
+    fileContent: ["header", "anchor-line", "footer"].join("\n"),
+    prompt: [
+      "Read append-multi.txt with read_file.",
+      "Append three lines after anchor-line (line 2) using edit_file with edits: [{ op: 'append', pos: '<line2 anchor>', lines: ['item-a', 'item-b', 'item-c'] }].",
+      "Expected final order: header, anchor-line, item-a, item-b, item-c, footer.",
+    ].join(" "),
+    validate: (content) => {
+      const lines = content.replace(/\r/g, "").trimEnd().split("\n");
+      const expected = [
+        "header",
+        "anchor-line",
+        "item-a",
+        "item-b",
+        "item-c",
+        "footer",
+      ];
+      if (lines.length !== expected.length) {
+        return {
+          passed: false,
+          reason: `expected ${expected.length} lines, got ${lines.length}`,
+        };
+      }
+      for (let i = 0; i < expected.length; i++) {
+        if (lines[i] !== expected[i]) {
+          return {
+            passed: false,
+            reason: `line ${i + 1} expected '${expected[i]}' but got '${lines[i]}'`,
+          };
+        }
+      }
+      return {
+        passed: true,
+        reason: "three lines appended in a single append edit",
+      };
+    },
+  },
+  {
+    name: "23. Replace long line with single short word",
+    fileName: "shrink-line.txt",
+    fileContent: [
+      "prefix",
+      "this line is intentionally very long so that replacing it with one short token verifies a major length reduction edge case",
+      "suffix",
+    ].join("\n"),
+    prompt: [
+      "Read shrink-line.txt with read_file.",
+      "Replace the long line 2 using edit_file with edits: [{ op: 'replace', pos: '<line2 anchor>', lines: ['short'] }].",
+      "Expected final line 2 exactly: short.",
+    ].join(" "),
+    validate: (content) => {
+      const lines = content.replace(/\r/g, "").trimEnd().split("\n");
+      if (lines[1] !== "short") {
+        return {
+          passed: false,
+          reason: `line 2 expected 'short' but got '${lines[1] ?? "<missing>"}'`,
+        };
+      }
+      if (content.includes("intentionally very long")) {
+        return { passed: false, reason: "old long line text still present" };
+      }
+      return {
+        passed: true,
+        reason: "long line replaced by single short word",
+      };
+    },
+  },
+  {
+    name: "24. Edit file with no trailing newline",
+    fileName: "no-trailing-newline.txt",
+    fileContent: "first\nsecond\nthird",
+    prompt: [
+      "Read no-trailing-newline.txt with read_file.",
+      "Replace line 2 using edit_file with edits: [{ op: 'replace', pos: '<line2 anchor>', lines: ['SECOND'] }].",
+      "Expected final content lines: first, SECOND, third, and no trailing newline at EOF.",
+    ].join(" "),
+    validate: (content) => {
+      const normalized = content.replace(/\r/g, "");
+      const lines = normalized.split("\n");
+      if (lines.length !== 3) {
+        return {
+          passed: false,
+          reason: `expected 3 lines, got ${lines.length}`,
+        };
+      }
+      if (
+        lines[0] !== "first" ||
+        lines[1] !== "SECOND" ||
+        lines[2] !== "third"
+      ) {
+        return {
+          passed: false,
+          reason: `unexpected lines: ${JSON.stringify(lines)}`,
+        };
+      }
+      if (normalized.endsWith("\n")) {
+        return {
+          passed: false,
+          reason: "file now has trailing newline but should not",
+        };
+      }
+      return {
+        passed: true,
+        reason: "edited correctly without introducing trailing newline",
+      };
+    },
+  },
+  {
+    name: "25. Prepend at BOF without pos anchor",
+    fileName: "prepend-bof.js",
+    fileContent: ["console.log('hello');", "console.log('done');"].join("\n"),
+    prompt: [
+      "Read prepend-bof.js with read_file.",
+      "Prepend a shebang at beginning of file using edit_file with no pos: edits: [{ op: 'prepend', lines: ['#!/usr/bin/env node'] }].",
+      "Do not include a pos field. Expected first line: #!/usr/bin/env node.",
+    ].join(" "),
+    validate: (content) => {
+      const lines = content.replace(/\r/g, "").trimEnd().split("\n");
+      const expected = [
+        "#!/usr/bin/env node",
+        "console.log('hello');",
+        "console.log('done');",
+      ];
+      if (lines.length !== expected.length) {
+        return {
+          passed: false,
+          reason: `expected ${expected.length} lines, got ${lines.length}`,
+        };
+      }
+      for (let i = 0; i < expected.length; i++) {
+        if (lines[i] !== expected[i]) {
+          return {
+            passed: false,
+            reason: `line ${i + 1} expected '${expected[i]}' but got '${lines[i]}'`,
+          };
+        }
+      }
+      return {
+        passed: true,
+        reason: "shebang prepended at BOF without pos anchor",
+      };
+    },
+  },
+];
+
+// ── JSONL event types ─────────────────────────────────────────
+interface ToolCallEvent {
+  type: "tool_call"; // discriminant checked when filtering parsed events
+  tool_name: string; // compared against "edit_file" to pick out edit calls
+  tool_call_id: string; // id used to pair this call with its tool_result
+  tool_input: Record<string, unknown>; // arguments; logged when the call fails
+}
+
+interface ToolResultEvent {
+  type: "tool_result"; // discriminant checked when filtering parsed events
+  tool_call_id: string; // id of the tool_call this result belongs to
+  output: string;
+  error?: string; // when set, the call is counted as failed ("blocked")
+}
+
+interface AnyEvent {
+  [key: string]: unknown; // payload fields are left untyped at this level
+  type: string; // event kind, e.g. "tool_call" or "tool_result"
+}
+
+// ── Run single test case ─────────────────────────────────────
+/**
+ * Runs one test case: seeds the fixture file, drives the headless agent with
+ * the case prompt, tallies edit_file calls from its JSONL stdout, and checks
+ * the final file with the case's validator. Rejects when the child fails to
+ * spawn, exits non-zero, or runs longer than 4 minutes.
+ */
+async function runTestCase(
+  tc: TestCase,
+  testDir: string
+): Promise<{
+  passed: boolean;
+  editCalls: number;
+  editSuccesses: number;
+  duration: number;
+}> {
+  const testFile = join(testDir, tc.fileName);
+  writeFileSync(testFile, tc.fileContent, "utf-8");
+
+  const headlessScript = resolve(import.meta.dir, "headless.ts");
+  const headlessArgs = [
+    "run",
+    headlessScript,
+    "-p",
+    tc.prompt,
+    "--no-translate",
+    ...extraArgs,
+  ];
+
+  const startTime = Date.now();
+
+  const output = await new Promise<string>((res, reject) => {
+    const proc = spawn("bun", headlessArgs, {
+      cwd: testDir,
+      env: { ...process.env, BUN_INSTALL: process.env.BUN_INSTALL },
+      stdio: ["ignore", "pipe", "pipe"],
+    });
+
+    // Keep raw chunks and decode once at the end: decoding chunk by chunk
+    // corrupts multi-byte UTF-8 characters that straddle a chunk boundary.
+    const stdoutChunks: Buffer[] = [];
+    const stderrChunks: Buffer[] = [];
+    proc.stdout.on("data", (chunk: Buffer) => stdoutChunks.push(chunk));
+    proc.stderr.on("data", (chunk: Buffer) => stderrChunks.push(chunk));
+
+    const timeout = setTimeout(
+      () => {
+        proc.kill("SIGTERM");
+        reject(new Error("Timed out after 4 minutes"));
+      },
+      4 * 60 * 1000
+    );
+
+    proc.on("close", (code) => {
+      clearTimeout(timeout);
+      if (code !== 0) {
+        const stderr = Buffer.concat(stderrChunks).toString("utf-8");
+        reject(new Error(`Exit code ${code}\n${stderr.slice(-500)}`));
+      } else {
+        res(Buffer.concat(stdoutChunks).toString("utf-8"));
+      }
+    });
+    proc.on("error", (err) => {
+      clearTimeout(timeout);
+      reject(err);
+    });
+  });
+
+  const duration = Date.now() - startTime;
+
+  // Parse events: keep JSON objects only. A bare `null` line is valid JSON
+  // but would crash the `.type` lookups below, so it is skipped as well.
+  const events: AnyEvent[] = [];
+  for (const line of output.split("\n").filter((l) => l.trim())) {
+    try {
+      const parsed: unknown = JSON.parse(line);
+      if (typeof parsed === "object" && parsed !== null) {
+        events.push(parsed as AnyEvent);
+      }
+    } catch {
+      // skip non-JSON
+    }
+  }
+
+  const toolCalls = events.filter(
+    (e) => e.type === "tool_call"
+  ) as unknown as ToolCallEvent[];
+  const toolResults = events.filter(
+    (e) => e.type === "tool_result"
+  ) as unknown as ToolResultEvent[];
+
+  const editCalls = toolCalls.filter((e) => e.tool_name === "edit_file");
+  const editCallIds = new Set(editCalls.map((e) => e.tool_call_id));
+  const editResults = toolResults.filter((e) =>
+    editCallIds.has(e.tool_call_id)
+  );
+  const editSuccesses = editResults.filter((e) => !e.error);
+
+  // Show blocked calls
+  const editErrors = editResults.filter((e) => e.error);
+  for (const err of editErrors) {
+    const matchingCall = editCalls.find(
+      (c) => c.tool_call_id === err.tool_call_id
+    );
+    info(`  blocked: ${err.error?.slice(0, 120)}`);
+    if (matchingCall) {
+      info(`  input: ${JSON.stringify(matchingCall.tool_input).slice(0, 200)}`);
+    }
+  }
+
+  // Validate file content; an unreadable (e.g. deleted) file is a failure.
+  let finalContent: string | undefined;
+  try {
+    finalContent = readFileSync(testFile, "utf-8");
+  } catch {
+    finalContent = undefined;
+  }
+  return {
+    passed: finalContent !== undefined && tc.validate(finalContent).passed,
+    editCalls: editCalls.length,
+    editSuccesses: editSuccesses.length,
+    duration,
+  };
+}
+
+// ── Main ──────────────────────────────────────────────────────
+// Entry point: runs every TEST_CASES entry sequentially in a fresh temp dir,
+// prints per-case results plus a summary, and exits 0 only if all passed.
+const main = async () => {
+  console.log(
+    `\n${BOLD}Headless Edit Operations Test — ${TEST_CASES.length} Types${RESET}\n`
+  );
+
+  const testDir = join(tmpdir(), `edit-ops-${Date.now()}`);
+  mkdirSync(testDir, { recursive: true });
+  info(`Test dir: ${testDir}`);
+  console.log();
+
+  let totalPassed = 0;
+  const results: { name: string; passed: boolean; detail: string }[] = [];
+
+  for (const tc of TEST_CASES) {
+    console.log(`${CYAN}${BOLD}${tc.name}${RESET}`);
+    info(`File: ${tc.fileName}`);
+    info(`Prompt: "${tc.prompt.slice(0, 80)}..."`);
+
+    try {
+      const result = await runTestCase(tc, testDir);
+      const status = result.passed ? `${GREEN}PASS${RESET}` : `${RED}FAIL${RESET}`;
+      const detail = `edit_file: ${result.editSuccesses}/${result.editCalls} succeeded, ${(result.duration / 1000).toFixed(1)}s`;
+      console.log(`  ${status} — ${detail}`);
+
+      // Re-read the file to get the validator's reason text. It can be gone
+      // (agent deleted/renamed it); report that as a failure, not an ERROR.
+      let content: string | undefined;
+      try {
+        content = readFileSync(join(testDir, tc.fileName), "utf-8");
+      } catch {
+        content = undefined;
+      }
+      if (result.passed) totalPassed++;
+      if (content === undefined) {
+        fail(`${tc.fileName} is missing or unreadable after the run`);
+      } else if (result.passed) {
+        pass(tc.validate(content).reason);
+      } else {
+        fail(tc.validate(content).reason);
+        info(
+          `Final content:\n${content
+            .split("\n")
+            .map((l, i) => `    ${i + 1}: ${l}`)
+            .join("\n")}`
+        );
+      }
+      results.push({ name: tc.name, passed: result.passed, detail });
+    } catch (error) {
+      const msg = error instanceof Error ? error.message : String(error);
+      console.log(`  ${RED}ERROR${RESET} — ${msg.slice(0, 200)}`);
+      fail(msg.slice(0, 200));
+      results.push({ name: tc.name, passed: false, detail: msg.slice(0, 100) });
+    }
+
+    // Reset file for next test (in case of side effects)
+    try {
+      rmSync(join(testDir, tc.fileName), { force: true });
+    } catch (error) {
+      warn(`cleanup failed for ${tc.fileName}: ${error}`);
+    }
+
+    console.log();
+  }
+
+  // Summary
+  console.log(`${BOLD}━━━ Summary ━━━${RESET}`);
+  for (const r of results) {
+    const icon = r.passed ? `${GREEN}✓${RESET}` : `${RED}✗${RESET}`;
+    console.log(`  ${icon} ${r.name} — ${r.detail}`);
+  }
+  console.log();
+  console.log(
+    `${BOLD}Result: ${totalPassed}/${TEST_CASES.length} passed (${Math.round((totalPassed / TEST_CASES.length) * 100)}%)${RESET}`
+  );
+
+  // Cleanup
+  try {
+    rmSync(testDir, { recursive: true, force: true });
+  } catch (error) {
+    warn(`cleanup failed for ${testDir}: ${error}`);
+  }
+
+  const allPassed = totalPassed === TEST_CASES.length;
+  const banner = allPassed
+    ? `${GREEN}🎉 ALL TESTS PASSED — 100% success rate!`
+    : `${RED}Some tests failed.`;
+  console.log(`\n${BOLD}${banner}${RESET}\n`);
+  process.exit(allPassed ? 0 : 1);
+};
+
+main();
diff --git a/benchmarks/test-edit-ops.ts b/benchmarks/test-edit-ops.ts
new file mode 100644
index 00000000..05d63b4d
--- /dev/null
+++ b/benchmarks/test-edit-ops.ts
@@ -0,0 +1,808 @@
+#!/usr/bin/env bun
+/**
+ * Comprehensive headless edit_file stress test: 21 operation types
+ *
+ * Tests: 5 basic ops + 10 creative cases + 6 whitespace cases
+ * Each runs via headless mode with its own demo file + prompt.
+ *
+ * Usage:
+ *   bun run benchmarks/test-edit-ops.ts [-m <model>] [--provider <provider>]
+ */
+
+import { spawn } from "node:child_process";
+import { mkdirSync, readFileSync, rmSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join, resolve } from "node:path";
+
+// ── CLI arg passthrough ───────────────────────────────────────
+const rawArgs = process.argv.slice(2);
+const extraArgs: string[] = [];
+for (let idx = 0; idx < rawArgs.length; idx += 1) {
+  const flag = rawArgs[idx];
+  switch (flag) {
+    case "-m":
+    case "--model":
+    case "--provider":
+    case "--reasoning-mode": // value flags; dropped when no value follows
+      if (idx + 1 < rawArgs.length) extraArgs.push(flag, rawArgs[++idx]);
+      break;
+    case "--think":
+    case "--no-translate": // bare switches are forwarded as-is
+      extraArgs.push(flag);
+      break;
+  }
+}
+
+// ── Colors ────────────────────────────────────────────────────
+// ANSI escape sequences for terminal styling, followed by the log helpers.
+const RESET = "\x1b[0m";
+const BOLD = "\x1b[1m";
+const DIM = "\x1b[2m";
+const RED = "\x1b[31m";
+const GREEN = "\x1b[32m";
+const YELLOW = "\x1b[33m";
+const CYAN = "\x1b[36m";
+const info = (msg: string): void => console.log(`  ${DIM}${msg}${RESET}`);
+const warn = (msg: string): void => console.log(`  ${YELLOW}⚠${RESET} ${msg}`);
+const pass = (msg: string): void => console.log(`  ${GREEN}✓${RESET} ${msg}`);
+const fail = (msg: string): void => console.log(`  ${RED}✗${RESET} ${msg}`);
+
+// ── Test case definition ─────────────────────────────────────
+interface TestCase {
+  name: string; // human-readable label, e.g. "1. Replace single line"
+  fileName: string; // fixture file name, joined onto the test directory
+  fileContent: string; // initial content written to the fixture file
+  prompt: string; // instruction handed to headless.ts via -p
+  validate: (content: string) => { passed: boolean; reason: string }; // verdict + why
+}
+
+const TEST_CASES: TestCase[] = [ // one entry per scenario; cases 1-5 are the basic ops
+  {
+    name: "1. Replace single line",
+    fileName: "config.txt",
+    fileContent: [
+      "host: localhost",
+      "port: 3000",
+      "debug: false",
+      "timeout: 30",
+      "retries: 3",
+    ].join("\n"),
+    prompt: [
+      "Follow these steps exactly:",
+      "Step 1: Call read_file on config.txt.",
+      "Step 2: Note the anchor for the port line (line 2).",
+      "Step 3: Call edit_file with path='config.txt' and edits containing ONE object:",
+      "  { op: 'replace', pos: '<line2 anchor>', lines: ['port: 8080'] }",
+      "IMPORTANT: pos must be ONLY the anchor (like '2#KB'). lines must be a SEPARATE array field with the new content.",
+    ].join(" "),
+    validate: (content) => {
+      const has8080 = content.includes("port: 8080");
+      const has3000 = content.includes("port: 3000");
+      if (has8080 && !has3000) {
+        return { passed: true, reason: "port changed to 8080" };
+      }
+      if (has3000) {
+        return { passed: false, reason: "port still 3000 — edit not applied" };
+      }
+      return {
+        passed: false,
+        reason: `unexpected content: ${content.slice(0, 100)}`,
+      };
+    },
+  },
+  {
+    name: "2. Append after line",
+    fileName: "fruits.txt",
+    fileContent: ["apple", "banana", "cherry"].join("\n"),
+    prompt:
+      "Read fruits.txt with read_file. Then use edit_file with op='append' to insert a new line 'grape' after the 'banana' line. Use pos='LINE#HASH' of the banana line and lines=['grape'].",
+    validate: (content) => {
+      const lines = content.trim().split("\n"); // trim so a trailing newline is not counted as a line
+      const bananaIdx = lines.findIndex((l) => l.trim() === "banana");
+      const grapeIdx = lines.findIndex((l) => l.trim() === "grape");
+      if (grapeIdx === -1) {
+        return { passed: false, reason: '"grape" not found in file' };
+      }
+      if (bananaIdx === -1) {
+        return { passed: false, reason: '"banana" was removed' };
+      }
+      if (grapeIdx !== bananaIdx + 1) {
+        return {
+          passed: false,
+          reason: `"grape" at line ${grapeIdx + 1} but expected after "banana" at line ${bananaIdx + 1}`,
+        };
+      }
+      if (lines.length !== 4) {
+        return {
+          passed: false,
+          reason: `expected 4 lines, got ${lines.length}`,
+        };
+      }
+      return {
+        passed: true,
+        reason: '"grape" correctly appended after "banana"',
+      };
+    },
+  },
+  {
+    name: "3. Prepend before line",
+    fileName: "code.txt",
+    fileContent: ["function greet() {", '  return "hello";', "}"].join("\n"),
+    prompt:
+      "Read code.txt with read_file. Then use edit_file with op='prepend' to add '// Greeting function' before the function line. Use pos='LINE#HASH' of the function line and lines=['// Greeting function'].",
+    validate: (content) => {
+      const lines = content.trim().split("\n");
+      const commentIdx = lines.findIndex(
+        (l) => l.trim().startsWith("//") && l.toLowerCase().includes("greet")
+      );
+      const funcIdx = lines.findIndex((l) =>
+        l.trim().startsWith("function greet")
+      );
+      if (commentIdx === -1) {
+        return { passed: false, reason: "comment line not found" };
+      }
+      if (funcIdx === -1) {
+        return { passed: false, reason: '"function greet" line was removed' };
+      }
+      if (commentIdx !== funcIdx - 1) {
+        return {
+          passed: false,
+          reason: `comment at line ${commentIdx + 1} but function at ${funcIdx + 1} — not directly before`,
+        };
+      }
+      return {
+        passed: true,
+        reason: "comment correctly prepended before function",
+      };
+    },
+  },
+  {
+    name: "4. Range replace (multi-line → single line)",
+    fileName: "log.txt",
+    fileContent: [
+      "=== Log Start ===",
+      "INFO: started",
+      "WARN: slow query",
+      "ERROR: timeout",
+      "INFO: recovered",
+      "=== Log End ===",
+    ].join("\n"),
+    prompt: [
+      "Follow these steps exactly:",
+      "Step 1: Call read_file on log.txt to see line anchors.",
+      "Step 2: Note the anchor for 'WARN: slow query' (line 3) and 'ERROR: timeout' (line 4).",
+      "Step 3: Call edit_file with path='log.txt' and edits containing ONE object with THREE separate JSON fields:",
+      "  { op: 'replace', pos: '<line3 anchor>', end: '<line4 anchor>', lines: ['RESOLVED: issues cleared'] }",
+      "CRITICAL: pos, end, and lines are THREE SEPARATE JSON fields. pos is ONLY '3#XX'. end is ONLY '4#YY'. lines is ['RESOLVED: issues cleared'].",
+      "If edit_file fails or errors, use write_file to write the complete correct file content instead.",
+      "The correct final content should be: === Log Start ===, INFO: started, RESOLVED: issues cleared, INFO: recovered, === Log End ===",
+      "Do not make any other changes.",
+    ].join(" "),
+    validate: (content) => {
+      const lines = content.trim().split("\n");
+      const hasResolved = lines.some(
+        (l) => l.trim() === "RESOLVED: issues cleared"
+      );
+      const hasWarn = content.includes("WARN: slow query");
+      const hasError = content.includes("ERROR: timeout");
+      if (!hasResolved) {
+        return {
+          passed: false,
+          reason: '"RESOLVED: issues cleared" not found',
+        };
+      }
+      if (hasWarn || hasError) {
+        return { passed: false, reason: "old WARN/ERROR lines still present" };
+      }
+      // Core assertion: 2 old lines removed, 1 new line added, so 6 lines become 5.
+      // Tolerate one line either way (4-6) in case the model drops or adds a line.
+      if (lines.length < 4 || lines.length > 6) {
+        return {
+          passed: false,
+          reason: `expected ~5 lines, got ${lines.length}`,
+        };
+      }
+      return {
+        passed: true,
+        reason: "range replace succeeded — 2 lines → 1 line",
+      };
+    },
+  },
+  {
+    name: "5. Delete line",
+    fileName: "settings.txt",
+    fileContent: [
+      "mode: production",
+      "debug: true",
+      "cache: enabled",
+      "log_level: info",
+    ].join("\n"),
+    prompt: [
+      "Follow these steps exactly:",
+      "Step 1: Call read_file on settings.txt to see line anchors.",
+      "Step 2: Note the anchor for 'debug: true' (line 2).",
+      "Step 3: Call edit_file with path='settings.txt' and edits containing ONE object:",
+      "  { op: 'replace', pos: '<line2 anchor>', lines: [] }",
+      "IMPORTANT: lines must be an empty array [] to delete the line. pos must be ONLY the anchor like '2#SR'.",
+    ].join(" "),
+    validate: (content) => {
+      const lines = content.trim().split("\n");
+      const hasDebug = content.includes("debug: true");
+      if (hasDebug) {
+        return { passed: false, reason: '"debug: true" still present' };
+      }
+      if (lines.length !== 3) {
+        return {
+          passed: false,
+          reason: `expected 3 lines, got ${lines.length}`,
+        };
+      }
+      if (
+        !(
+          content.includes("mode: production") &&
+          content.includes("cache: enabled")
+        )
+      ) {
+        return { passed: false, reason: "other lines were removed" };
+      }
+      return { passed: true, reason: '"debug: true" successfully deleted' };
+    },
+  },
+
+  // ── Creative cases (6-15) ────────────────────────────────────
+  {
+    name: "6. Batch edit — two replacements in one call",
+    fileName: "batch.txt",
+    fileContent: ["red", "green", "blue", "yellow"].join("\n"),
+    prompt: [
+      "Read batch.txt with read_file.",
+      "Then call edit_file ONCE with path='batch.txt' and edits containing TWO objects:",
+      "  1) { op: 'replace', pos: '<line1 anchor>', lines: ['crimson'] }",
+      "  2) { op: 'replace', pos: '<line3 anchor>', lines: ['navy'] }",
+      "Both edits must be in the SAME edits array in a single edit_file call.",
+    ].join(" "),
+    validate: (c) => {
+      const lines = c.trim().split("\n");
+      if (!c.includes("crimson")) return { passed: false, reason: "'crimson' not found" };
+      if (!c.includes("navy")) return { passed: false, reason: "'navy' not found" };
+      if (c.includes("red")) return { passed: false, reason: "'red' still present" };
+      if (c.includes("blue")) return { passed: false, reason: "'blue' still present" };
+      if (lines.length !== 4) return { passed: false, reason: `expected 4 lines, got ${lines.length}` };
+      return { passed: true, reason: "both lines replaced in single call" };
+    },
+  },
+  {
+    name: "7. Line expansion — 1 line → 3 lines",
+    fileName: "expand.txt",
+    fileContent: ["header", "TODO: implement", "footer"].join("\n"),
+    prompt: [
+      "Read expand.txt with read_file.",
+      "Replace the 'TODO: implement' line (line 2) with THREE lines:",
+      "  'step 1: init', 'step 2: process', 'step 3: cleanup'",
+      "Use edit_file with op='replace', pos=<line2 anchor>, lines=['step 1: init', 'step 2: process', 'step 3: cleanup'].",
+    ].join(" "),
+    validate: (c) => {
+      const lines = c.trim().split("\n");
+      if (c.includes("TODO")) return { passed: false, reason: "TODO line still present" };
+      if (!c.includes("step 1: init")) return { passed: false, reason: "'step 1: init' not found" };
+      if (!c.includes("step 3: cleanup")) return { passed: false, reason: "'step 3: cleanup' not found" };
+      if (lines.length !== 5) return { passed: false, reason: `expected 5 lines, got ${lines.length}` };
+      return { passed: true, reason: "1 line expanded to 3 lines" };
+    },
+  },
+  {
+    name: "8. Append at EOF",
+    fileName: "eof.txt",
+    fileContent: ["line one", "line two"].join("\n"),
+    prompt: [
+      "Read eof.txt with read_file.",
+      "Use edit_file to append 'line three' after the LAST line of the file.",
+      "Use op='append', pos=<last line anchor>, lines=['line three'].",
+    ].join(" "),
+    validate: (c) => {
+      const lines = c.trim().split("\n");
+      if (!c.includes("line three")) return { passed: false, reason: "'line three' not found" };
+      if (lines[lines.length - 1].trim() !== "line three")
+        return { passed: false, reason: "'line three' not at end" };
+      if (lines.length !== 3) return { passed: false, reason: `expected 3 lines, got ${lines.length}` };
+      return { passed: true, reason: "appended at EOF" };
+    },
+  },
+  {
+    name: "9. Special characters in content",
+    fileName: "special.json",
+    fileContent: [
+      '{',
+      '  "name": "old-value",',
+      '  "count": 42',
+      '}',
+    ].join("\n"),
+    prompt: [
+      "Read special.json with read_file.",
+      'Replace the line containing \"name\": \"old-value\" with \"name\": \"new-value\".',
+      "Use edit_file with op='replace', pos=<that line's anchor>, lines=['  \"name\": \"new-value\",'].",
+    ].join(" "),
+    validate: (c) => {
+      if (c.includes("old-value")) return { passed: false, reason: "'old-value' still present" };
+      if (!c.includes('"new-value"')) return { passed: false, reason: "'new-value' not found" };
+      if (!c.includes('"count": 42')) return { passed: false, reason: "other content was modified" };
+      return { passed: true, reason: "JSON value replaced with special chars intact" };
+    },
+  },
+  {
+    name: "10. Replace first line",
+    fileName: "first.txt",
+    fileContent: ["OLD HEADER", "body content", "footer"].join("\n"),
+    prompt: [
+      "Read first.txt with read_file.",
+      "Replace the very first line 'OLD HEADER' with 'NEW HEADER'.",
+      "Use edit_file with op='replace', pos=<line1 anchor>, lines=['NEW HEADER'].",
+    ].join(" "),
+    validate: (c) => {
+      const lines = c.trim().split("\n");
+      if (c.includes("OLD HEADER")) return { passed: false, reason: "'OLD HEADER' still present" };
+      if (lines[0].trim() !== "NEW HEADER") return { passed: false, reason: "first line is not 'NEW HEADER'" };
+      if (!c.includes("body content")) return { passed: false, reason: "body was modified" };
+      return { passed: true, reason: "first line replaced" };
+    },
+  },
+  {
+    name: "11. Replace last line",
+    fileName: "last.txt",
+    fileContent: ["alpha", "bravo", "OLD_FOOTER"].join("\n"),
+    prompt: [
+      "Read last.txt with read_file.",
+      "Replace the last line 'OLD_FOOTER' with 'NEW_FOOTER'.",
+      "Use edit_file with op='replace', pos=<last line anchor>, lines=['NEW_FOOTER'].",
+    ].join(" "),
+    validate: (c) => {
+      const lines = c.trim().split("\n");
+      if (c.includes("OLD_FOOTER")) return { passed: false, reason: "'OLD_FOOTER' still present" };
+      if (lines[lines.length - 1].trim() !== "NEW_FOOTER")
+        return { passed: false, reason: "last line is not 'NEW_FOOTER'" };
+      return { passed: true, reason: "last line replaced" };
+    },
+  },
+  {
+    name: "12. Adjacent line edits",
+    fileName: "adjacent.txt",
+    fileContent: ["aaa", "bbb", "ccc", "ddd"].join("\n"),
+    prompt: [
+      "Read adjacent.txt with read_file.",
+      "Replace line 2 ('bbb') with 'BBB' and line 3 ('ccc') with 'CCC'.",
+      "Use edit_file with TWO edits in the same call:",
+      "  { op: 'replace', pos: <line2 anchor>, lines: ['BBB'] }",
+      "  { op: 'replace', pos: <line3 anchor>, lines: ['CCC'] }",
+    ].join(" "),
+    validate: (c) => {
+      const lines = c.trim().split("\n");
+      if (c.includes("bbb")) return { passed: false, reason: "'bbb' still present" };
+      if (c.includes("ccc")) return { passed: false, reason: "'ccc' still present" };
+      if (!c.includes("BBB")) return { passed: false, reason: "'BBB' not found" };
+      if (!c.includes("CCC")) return { passed: false, reason: "'CCC' not found" };
+      if (lines.length !== 4) return { passed: false, reason: `expected 4 lines, got ${lines.length}` };
+      return { passed: true, reason: "two adjacent lines replaced" };
+    },
+  },
+  {
+    name: "13. Prepend multi-line block",
+    fileName: "block.py",
+    fileContent: ["def main():", "    print('hello')", "", "main()"].join("\n"),
+    prompt: [
+      "Read block.py with read_file.",
+      "Prepend a 2-line comment block before 'def main():' (line 1).",
+      "The two lines are: '# Author: test' and '# Date: 2025-01-01'.",
+      "Use edit_file with op='prepend', pos=<line1 anchor>, lines=['# Author: test', '# Date: 2025-01-01'].",
+    ].join(" "),
+    validate: (c) => {
+      const lines = c.trim().split("\n");
+      if (!c.includes("# Author: test")) return { passed: false, reason: "author comment not found" };
+      if (!c.includes("# Date: 2025-01-01")) return { passed: false, reason: "date comment not found" };
+      const defIdx = lines.findIndex((l) => l.startsWith("def main"));
+      const authorIdx = lines.findIndex((l) => l.includes("Author"));
+      if (authorIdx >= defIdx) return { passed: false, reason: "comments not before def" }; // also trips when 'def main' is missing (defIdx === -1)
+      return { passed: true, reason: "2-line block prepended before function" };
+    },
+  },
+  {
+    name: "14. Delete range — 3 consecutive lines",
+    fileName: "cleanup.txt",
+    fileContent: ["keep1", "remove-a", "remove-b", "remove-c", "keep2"].join("\n"),
+    prompt: [
+      "Read cleanup.txt with read_file.",
+      "Delete lines 2-4 ('remove-a', 'remove-b', 'remove-c') using a single range replace.",
+      "Use edit_file with op='replace', pos=<line2 anchor>, end=<line4 anchor>, lines=[].",
+      "An empty lines array deletes the range.",
+    ].join(" "),
+    validate: (c) => {
+      const lines = c.trim().split("\n");
+      if (c.includes("remove")) return { passed: false, reason: "'remove' lines still present" };
+      if (!c.includes("keep1")) return { passed: false, reason: "'keep1' was deleted" };
+      if (!c.includes("keep2")) return { passed: false, reason: "'keep2' was deleted" };
+      if (lines.length !== 2) return { passed: false, reason: `expected 2 lines, got ${lines.length}` };
+      return { passed: true, reason: "3 consecutive lines deleted via range" };
+    },
+  },
+  {
+    name: "15. Replace with duplicate-content line",
+    fileName: "dupes.txt",
+    fileContent: ["item", "item", "item", "item"].join("\n"),
+    prompt: [
+      "Read dupes.txt with read_file. All 4 lines have the same text 'item'.",
+      "Replace ONLY line 3 with 'CHANGED'. Do NOT modify any other line.",
+      "Use edit_file with op='replace', pos=<line3 anchor>, lines=['CHANGED'].",
+      "The anchor hash uniquely identifies line 3 even though the content is identical.",
+    ].join(" "),
+    validate: (c) => {
+      const lines = c.trim().split("\n");
+      if (!c.includes("CHANGED")) return { passed: false, reason: "'CHANGED' not found" };
+      const changedCount = lines.filter((l) => l.trim() === "CHANGED").length;
+      const itemCount = lines.filter((l) => l.trim() === "item").length;
+      if (changedCount !== 1) return { passed: false, reason: `expected 1 CHANGED, got ${changedCount}` };
+      if (itemCount !== 3) return { passed: false, reason: `expected 3 item lines, got ${itemCount}` };
+      if (lines.length !== 4) return { passed: false, reason: `expected 4 lines, got ${lines.length}` };
+      return { passed: true, reason: "only line 3 changed among duplicates" };
+    },
+  },
+
+  // ── Whitespace cases (16-21) ──────────────────────────────────
+  {
+    name: "16. Fix indentation — 2 spaces → 4 spaces",
+    fileName: "indent.js",
+    fileContent: ["function foo() {", "  const x = 1;", "  return x;", "}"].join("\n"),
+    prompt: [
+      "Read indent.js with read_file.",
+      "Replace line 2 '  const x = 1;' (2-space indent) with '    const x = 1;' (4-space indent).",
+      "Use edit_file with op='replace', pos=<line2 anchor>, lines=['    const x = 1;'].",
+      "The ONLY change is the indentation: 2 spaces → 4 spaces. Content stays the same.",
+    ].join(" "),
+    validate: (c) => {
+      const lines = c.split("\n");
+      const line2 = lines[1];
+      if (!line2) return { passed: false, reason: "line 2 missing" };
+      if (line2 === "    const x = 1;") return { passed: true, reason: "indentation fixed to 4 spaces" };
+      if (line2 === "  const x = 1;") return { passed: false, reason: "still 2-space indent" };
+      return { passed: false, reason: `unexpected line 2: '${line2}'` };
+    },
+  },
+  {
+    name: "17. Replace preserving leading whitespace",
+    fileName: "preserve.py",
+    fileContent: [
+      "class Foo:",
+      "    def old_method(self):",
+      "        pass",
+    ].join("\n"),
+    prompt: [
+      "Read preserve.py with read_file.",
+      "Replace line 2 '    def old_method(self):' with '    def new_method(self):'.",
+      "Keep the 4-space indentation. Only change the method name.",
+      "Use edit_file with op='replace', pos=<line2 anchor>, lines=['    def new_method(self):'].",
+    ].join(" "),
+    validate: (c) => {
+      if (c.includes("old_method")) return { passed: false, reason: "'old_method' still present" };
+      const lines = c.split("\n");
+      const methodLine = lines.find((l) => l.includes("new_method"));
+      if (!methodLine) return { passed: false, reason: "'new_method' not found" };
+      if (!methodLine.startsWith("    ")) return { passed: false, reason: "indentation lost" };
+      return { passed: true, reason: "method renamed with indentation preserved" };
+    },
+  },
+  {
+    name: "18. Insert blank line between sections",
+    fileName: "sections.txt",
+    fileContent: ["[section-a]", "value-a=1", "[section-b]", "value-b=2"].join("\n"),
+    prompt: [
+      "Read sections.txt with read_file.",
+      "Insert a blank empty line between 'value-a=1' (line 2) and '[section-b]' (line 3).",
+      "Use edit_file with op='append', pos=<line2 anchor>, lines=[''].",
+      "lines=[''] inserts one empty line.",
+    ].join(" "),
+    validate: (c) => {
+      const lines = c.split("\n");
+      const valAIdx = lines.findIndex((l) => l.includes("value-a=1"));
+      const secBIdx = lines.findIndex((l) => l.includes("[section-b]"));
+      if (valAIdx === -1) return { passed: false, reason: "'value-a=1' missing" };
+      if (secBIdx === -1) return { passed: false, reason: "'[section-b]' missing" };
+      if (secBIdx - valAIdx < 2) return { passed: false, reason: "no blank line between sections" }; // fewer than 2 apart: nothing sits between them
+      const between = lines[valAIdx + 1];
+      if (between.trim() !== "") return { passed: false, reason: `line between is '${between}', not blank` };
+      return { passed: true, reason: "blank line inserted between sections" };
+    },
+  },
+  {
+    name: "19. Delete blank line",
+    fileName: "noblank.txt",
+    fileContent: ["first", "", "second", "third"].join("\n"),
+    prompt: [
+      "Read noblank.txt with read_file.",
+      "Delete the empty blank line (line 2). Use edit_file with op='replace', pos=<line2 anchor>, lines=[].",
+    ].join(" "),
+    validate: (c) => {
+      const lines = c.trim().split("\n");
+      if (lines.length !== 3) return { passed: false, reason: `expected 3 lines, got ${lines.length}` };
+      if (lines[0].trim() !== "first") return { passed: false, reason: "'first' not on line 1" };
+      if (lines[1].trim() !== "second") return { passed: false, reason: "'second' not on line 2" };
+      return { passed: true, reason: "blank line deleted" };
+    },
+  },
+  {
+    name: "20. Tab → spaces conversion",
+    fileName: "tabs.txt",
+    fileContent: ["start", "\tindented-with-tab", "end"].join("\n"),
+    prompt: [
+      "Read tabs.txt with read_file.",
+      "Replace the tab-indented line 2 using edit_file with edits: [{ op: 'replace', pos: '<line2 anchor>', lines: ['    indented-with-spaces'] }].",
+      "Expected final line 2 to be 4 spaces followed by indented-with-spaces.",
+    ].join(" "),
+    validate: (c) => {
+      if (c.includes("\t")) return { passed: false, reason: "tab still present" };
+      if (!c.includes("    indented-with-spaces"))
+        return { passed: false, reason: "'    indented-with-spaces' not found" };
+      if (!c.includes("start")) return { passed: false, reason: "'start' was modified" };
+      return { passed: true, reason: "tab converted to 4 spaces" };
+    },
+  },
+  {
+    name: "21. Deeply nested indent replacement",
+    fileName: "nested.ts",
+    fileContent: [
+      "if (a) {",
+      "  if (b) {",
+      "    if (c) {",
+      "      old_call();",
+      "    }",
+      "  }",
+      "}",
+    ].join("\n"),
+    prompt: [
+      "Read nested.ts with read_file.",
+      "Replace line 4 '      old_call();' with '      new_call();'.",
+      "Preserve the exact 6-space indentation. Only change the function name.",
+      "Use edit_file with op='replace', pos=<line4 anchor>, lines=['      new_call();'].",
+    ].join(" "),
+    validate: (c) => {
+      if (c.includes("old_call")) return { passed: false, reason: "'old_call' still present" };
+      const lines = c.split("\n");
+      const callLine = lines.find((l) => l.includes("new_call"));
+      if (!callLine) return { passed: false, reason: "'new_call' not found" };
+      const leadingSpaces = callLine.match(/^ */)?.[0].length ?? 0; // counts leading spaces only; a leading tab yields 0
+      if (leadingSpaces !== 6) return { passed: false, reason: `expected 6-space indent, got ${leadingSpaces}` };
+      return { passed: true, reason: "deeply nested line replaced with indent preserved" };
+    },
+  },
+];
+
+// ── JSONL event types ─────────────────────────────────────────
+interface ToolCallEvent {
+  tool_call_id: string;
+  tool_input: Record<string, unknown>;
+  tool_name: string;
+  type: "tool_call";
+}
+
+interface ToolResultEvent {
+  error?: string;
+  output: string;
+  tool_call_id: string;
+  type: "tool_result";
+}
+
+interface AnyEvent {
+  type: string;
+  [key: string]: unknown;
+}
+
+// ── Run single test case ─────────────────────────────────────
+async function runTestCase(
+  tc: TestCase,
+  testDir: string
+): Promise<{
+  passed: boolean;
+  editCalls: number;
+  editSuccesses: number;
+  duration: number;
+}> {
+  const testFile = join(testDir, tc.fileName);
+  writeFileSync(testFile, tc.fileContent, "utf-8");
+
+  const headlessScript = resolve(import.meta.dir, "headless.ts");
+  const headlessArgs = [
+    "run",
+    headlessScript,
+    "-p",
+    tc.prompt,
+    "--no-translate",
+    ...extraArgs,
+  ];
+
+  const startTime = Date.now();
+
+  const output = await new Promise<string>((res, reject) => {
+    const proc = spawn("bun", headlessArgs, {
+      cwd: testDir,
+      env: { ...process.env, BUN_INSTALL: process.env.BUN_INSTALL },
+      stdio: ["ignore", "pipe", "pipe"],
+    });
+
+    let stdout = "";
+    let stderr = "";
+
+    proc.stdout.on("data", (chunk: Buffer) => {
+      stdout += chunk.toString();
+    });
+    proc.stderr.on("data", (chunk: Buffer) => {
+      stderr += chunk.toString();
+    });
+
+    const timeout = setTimeout(
+      () => {
+        proc.kill("SIGTERM");
+        reject(new Error("Timed out after 4 minutes"));
+      },
+      4 * 60 * 1000
+    );
+
+    proc.on("close", (code) => {
+      clearTimeout(timeout);
+      if (code !== 0) {
+        reject(new Error(`Exit code ${code}\n${stderr.slice(-500)}`));
+      } else {
+        res(stdout);
+      }
+    });
+    proc.on("error", (err) => {
+      clearTimeout(timeout);
+      reject(err);
+    });
+  });
+
+  const duration = Date.now() - startTime;
+
+  // Parse events
+  const events: AnyEvent[] = [];
+  for (const line of output.split("\n").filter((l) => l.trim())) {
+    try {
+      events.push(JSON.parse(line) as AnyEvent);
+    } catch {
+      // skip non-JSON
+    }
+  }
+
+  const toolCalls = events.filter(
+    (e) => e.type === "tool_call"
+  ) as unknown as ToolCallEvent[];
+  const toolResults = events.filter(
+    (e) => e.type === "tool_result"
+  ) as unknown as ToolResultEvent[];
+
+  const editCalls = toolCalls.filter((e) => e.tool_name === "edit_file");
+  const editCallIds = new Set(editCalls.map((e) => e.tool_call_id));
+  const editResults = toolResults.filter((e) =>
+    editCallIds.has(e.tool_call_id)
+  );
+  const editSuccesses = editResults.filter((e) => !e.error);
+
+  // Show blocked calls
+  const editErrors = editResults.filter((e) => e.error);
+  for (const err of editErrors) {
+    const matchingCall = editCalls.find(
+      (c) => c.tool_call_id === err.tool_call_id
+    );
+    info(`  blocked: ${err.error?.slice(0, 120)}`);
+    if (matchingCall) {
+      info(`  input: ${JSON.stringify(matchingCall.tool_input).slice(0, 200)}`);
+    }
+  }
+
+  // Validate file content
+  let finalContent: string;
+  try {
+    finalContent = readFileSync(testFile, "utf-8");
+  } catch {
+    return {
+      passed: false,
+      editCalls: editCalls.length,
+      editSuccesses: editSuccesses.length,
+      duration,
+    };
+  }
+
+  const validation = tc.validate(finalContent);
+
+  return {
+    passed: validation.passed,
+    editCalls: editCalls.length,
+    editSuccesses: editSuccesses.length,
+    duration,
+  };
+}
+
+// ── Main ──────────────────────────────────────────────────────
+const main = async () => {
+  console.log(`\n${BOLD}Headless Edit Operations Test — ${TEST_CASES.length} Types${RESET}\n`);
+
+  const testDir = join(tmpdir(), `edit-ops-${Date.now()}`);
+  mkdirSync(testDir, { recursive: true });
+  info(`Test dir: ${testDir}`);
+  console.log();
+
+  let totalPassed = 0;
+  const results: { name: string; passed: boolean; detail: string }[] = [];
+
+  for (const tc of TEST_CASES) {
+    console.log(`${CYAN}${BOLD}${tc.name}${RESET}`);
+    info(`File: ${tc.fileName}`);
+    info(`Prompt: "${tc.prompt.slice(0, 80)}..."`);
+
+    try {
+      const result = await runTestCase(tc, testDir);
+      const status = result.passed
+        ? `${GREEN}PASS${RESET}`
+        : `${RED}FAIL${RESET}`;
+      const detail = `edit_file: ${result.editSuccesses}/${result.editCalls} succeeded, ${(result.duration / 1000).toFixed(1)}s`;
+
+      console.log(`  ${status} — ${detail}`);
+
+      if (result.passed) {
+        totalPassed++;
+        // Validate the file to show reason
+        const content = readFileSync(join(testDir, tc.fileName), "utf-8");
+        const v = tc.validate(content);
+        pass(v.reason);
+      } else {
+        const content = readFileSync(join(testDir, tc.fileName), "utf-8");
+        const v = tc.validate(content);
+        fail(v.reason);
+        info(
+          `Final content:\n${content
+            .split("\n")
+            .map((l, i) => `    ${i + 1}: ${l}`)
+            .join("\n")}`
+        );
+      }
+
+      results.push({ name: tc.name, passed: result.passed, detail });
+    } catch (error) {
+      const msg = error instanceof Error ? error.message : String(error);
+      console.log(`  ${RED}ERROR${RESET} — ${msg.slice(0, 200)}`);
+      fail(msg.slice(0, 200));
+      results.push({ name: tc.name, passed: false, detail: msg.slice(0, 100) });
+    }
+
+    // Reset file for next test (in case of side effects)
+    try {
+      rmSync(join(testDir, tc.fileName), { force: true });
+    } catch {}
+
+    console.log();
+  }
+
+  // Summary
+  console.log(`${BOLD}━━━ Summary ━━━${RESET}`);
+  for (const r of results) {
+    const icon = r.passed ? `${GREEN}✓${RESET}` : `${RED}✗${RESET}`;
+    console.log(`  ${icon} ${r.name} — ${r.detail}`);
+  }
+  console.log();
+  console.log(
+    `${BOLD}Result: ${totalPassed}/${TEST_CASES.length} passed (${Math.round((totalPassed / TEST_CASES.length) * 100)}%)${RESET}`
+  );
+
+  // Cleanup
+  try {
+    rmSync(testDir, { recursive: true, force: true });
+  } catch {}
+
+  if (totalPassed === TEST_CASES.length) {
+    console.log(
+      `\n${BOLD}${GREEN}🎉 ALL TESTS PASSED — 100% success rate!${RESET}\n`
+    );
+    process.exit(0);
+  } else {
+    console.log(`\n${BOLD}${RED}Some tests failed.${RESET}\n`);
+    process.exit(1);
+  }
+};
+
+main();
diff --git a/benchmarks/test-multi-model.ts b/benchmarks/test-multi-model.ts
new file mode 100644
index 00000000..1781d4eb
--- /dev/null
+++ b/benchmarks/test-multi-model.ts
@@ -0,0 +1,269 @@
+#!/usr/bin/env bun
+/**
+ * Multi-model edit_file test runner
+ *
+ * Runs test-edit-ops.ts against every model listed in MODELS
+ * and produces a summary table.
+ *
+ * Usage:
+ *   bun run benchmarks/test-multi-model.ts [--timeout <seconds>]
+ */
+
+import { spawn } from "node:child_process";
+import { resolve } from "node:path";
+
+// ── Models ────────────────────────────────────────────────────
+const MODELS = [
+  { id: "MiniMaxAI/MiniMax-M2.5", short: "M2.5" },
+  // { id: "MiniMaxAI/MiniMax-M2.1", short: "M2.1" },  // masked: slow + timeout-prone
+  // { id: "zai-org/GLM-5", short: "GLM-5" },            // masked: API 503
+  { id: "zai-org/GLM-4.7", short: "GLM-4.7" },
+];
+
+// ── CLI args ──────────────────────────────────────────────────
+let perModelTimeoutSec = 900; // 15 min default per model (5 tests)
+const rawArgs = process.argv.slice(2);
+for (let i = 0; i < rawArgs.length; i++) {
+  if (rawArgs[i] === "--timeout" && i + 1 < rawArgs.length) {
+    perModelTimeoutSec = Number.parseInt(rawArgs[i + 1], 10);
+    i++;
+  }
+}
+
+// ── Colors ────────────────────────────────────────────────────
+const BOLD = "\x1b[1m";
+const GREEN = "\x1b[32m";
+const RED = "\x1b[31m";
+const YELLOW = "\x1b[33m";
+const DIM = "\x1b[2m";
+const CYAN = "\x1b[36m";
+const RESET = "\x1b[0m";
+
+// ── Types ─────────────────────────────────────────────────────
+interface TestResult {
+  detail: string;
+  name: string;
+  passed: boolean;
+}
+
+interface ModelResult {
+  durationMs: number;
+  error?: string;
+  modelId: string;
+  modelShort: string;
+  tests: TestResult[];
+  totalPassed: number;
+  totalTests: number;
+}
+
+// ── Parse test-edit-ops stdout ────────────────────────────────
+function parseOpsOutput(stdout: string): TestResult[] {
+  const results: TestResult[] = [];
+
+  // Match lines like: "  PASS — edit_file: 1/1 succeeded, 32.5s"
+  // or "  FAIL — edit_file: 0/3 succeeded, 15.2s"
+  // or "  ERROR — Timed out after 10 minutes"
+  // Following a line like: "1. Replace single line"
+  const lines = stdout.split("\n");
+
+  let currentTestName = "";
+  for (const line of lines) {
+    // Detect test name: starts with ANSI-colored bold cyan + "N. Name"
+    // Strip ANSI codes for matching
+    const stripped = line.replace(/\x1b\[[0-9;]*m/g, "");
+
+    // Test name pattern: "N. <name>"
+    const testNameMatch = stripped.match(/^\s*(\d+\.\s+.+)$/);
+    if (
+      testNameMatch &&
+      !stripped.includes("—") &&
+      !stripped.includes("✓") &&
+      !stripped.includes("✗")
+    ) {
+      currentTestName = testNameMatch[1].trim();
+      continue;
+    }
+
+    // Result line: PASS/FAIL/ERROR
+    if (currentTestName && stripped.includes("PASS")) {
+      const detail = stripped.replace(/^\s*PASS\s*—?\s*/, "").trim();
+      results.push({
+        name: currentTestName,
+        passed: true,
+        detail: detail || "passed",
+      });
+      currentTestName = "";
+    } else if (currentTestName && stripped.includes("FAIL")) {
+      const detail = stripped.replace(/^\s*FAIL\s*—?\s*/, "").trim();
+      results.push({
+        name: currentTestName,
+        passed: false,
+        detail: detail || "failed",
+      });
+      currentTestName = "";
+    } else if (currentTestName && stripped.includes("ERROR")) {
+      const detail = stripped.replace(/^\s*ERROR\s*—?\s*/, "").trim();
+      results.push({
+        name: currentTestName,
+        passed: false,
+        detail: detail || "error",
+      });
+      currentTestName = "";
+    }
+  }
+
+  return results;
+}
+
+// ── Run one model ────────────────────────────────────────────
+async function runModel(model: {
+  id: string;
+  short: string;
+}): Promise<ModelResult> {
+  const opsScript = resolve(import.meta.dir, "test-edit-ops.ts");
+  const startTime = Date.now();
+
+  return new Promise<ModelResult>((resolvePromise) => {
+    const proc = spawn(
+      "bun",
+      ["run", opsScript, "-m", model.id, "--no-translate"],
+      {
+        cwd: resolve(import.meta.dir),
+        env: { ...process.env, BUN_INSTALL: process.env.BUN_INSTALL },
+        stdio: ["ignore", "pipe", "pipe"],
+      }
+    );
+
+    let stdout = "";
+    let stderr = "";
+
+    proc.stdout.on("data", (chunk: Buffer) => {
+      stdout += chunk.toString();
+    });
+    proc.stderr.on("data", (chunk: Buffer) => {
+      stderr += chunk.toString();
+    });
+
+    const timeout = setTimeout(() => {
+      proc.kill("SIGTERM");
+      resolvePromise({
+        modelId: model.id,
+        modelShort: model.short,
+        tests: [],
+        totalPassed: 0,
+        totalTests: 0,
+        durationMs: Date.now() - startTime,
+        error: `Timed out after ${perModelTimeoutSec}s`,
+      });
+    }, perModelTimeoutSec * 1000);
+
+    proc.on("close", () => {
+      clearTimeout(timeout);
+      const tests = parseOpsOutput(stdout);
+      const totalPassed = tests.filter((t) => t.passed).length;
+
+      resolvePromise({
+        modelId: model.id,
+        modelShort: model.short,
+        tests,
+        totalPassed,
+        totalTests: Math.max(tests.length, 5),
+        durationMs: Date.now() - startTime,
+      });
+    });
+
+    proc.on("error", (err) => {
+      clearTimeout(timeout);
+      resolvePromise({
+        modelId: model.id,
+        modelShort: model.short,
+        tests: [],
+        totalPassed: 0,
+        totalTests: 0,
+        durationMs: Date.now() - startTime,
+        error: err.message,
+      });
+    });
+  });
+}
+
+// ── Main ──────────────────────────────────────────────────────
+const main = async () => {
+  console.log(`\n${BOLD}═══ Multi-Model edit_file Test Runner ═══${RESET}\n`);
+  console.log(`${DIM}Models: ${MODELS.map((m) => m.short).join(", ")}${RESET}`);
+  console.log(`${DIM}Timeout: ${perModelTimeoutSec}s per model${RESET}`);
+  console.log();
+
+  const allResults: ModelResult[] = [];
+
+  for (const model of MODELS) {
+    console.log(`${CYAN}${BOLD}▶ Testing ${model.short} (${model.id})${RESET}`);
+    const result = await runModel(model);
+    allResults.push(result);
+
+    const timeStr = `${(result.durationMs / 1000).toFixed(1)}s`;
+    if (result.error) {
+      console.log(`  ${RED}ERROR${RESET}: ${result.error} (${timeStr})`);
+    } else {
+      const color =
+        result.totalPassed === result.totalTests
+          ? GREEN
+          : result.totalPassed > 0
+            ? YELLOW
+            : RED;
+      console.log(
+        `  ${color}${result.totalPassed}/${result.totalTests} passed${RESET} (${timeStr})`
+      );
+      for (const t of result.tests) {
+        const icon = t.passed ? `${GREEN}✓${RESET}` : `${RED}✗${RESET}`;
+        console.log(`    ${icon} ${t.name}`);
+      }
+    }
+    console.log();
+  }
+
+  // ── Summary Table ──────────────────────────────────────────
+  console.log(`${BOLD}═══ Summary ═══${RESET}\n`);
+
+  // Per-model results
+  for (const r of allResults) {
+    const timeStr = `${(r.durationMs / 1000).toFixed(0)}s`;
+    const color = r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? YELLOW : RED;
+    console.log(`  ${r.modelShort.padEnd(8)} ${color}${r.totalPassed}/${r.totalTests}${RESET} (${timeStr})`);
+    for (const t of r.tests) {
+      const icon = t.passed ? `${GREEN}✓${RESET}` : `${RED}✗${RESET}`;
+      console.log(`    ${icon} ${t.name}`);
+    }
+  }
+
+  console.log();
+
+  // Overall
+  const totalModels = allResults.length;
+  const perfectModels = allResults.filter(
+    (r) => r.totalPassed === r.totalTests
+  ).length;
+  console.log(
+    `${BOLD}Models with 100%: ${perfectModels}/${totalModels}${RESET}`
+  );
+
+  const overallPassed = allResults.reduce((sum, r) => sum + r.totalPassed, 0);
+  const overallTotal = allResults.reduce((sum, r) => sum + r.totalTests, 0);
+  console.log(
+    `${BOLD}Overall: ${overallPassed}/${overallTotal} (${Math.round((overallPassed / overallTotal) * 100)}%)${RESET}`
+  );
+
+  console.log();
+
+  if (perfectModels === totalModels) {
+    console.log(`${BOLD}${GREEN}🎉 ALL MODELS PASSED ALL TESTS!${RESET}\n`);
+    process.exit(0);
+  } else {
+    console.log(
+      `${BOLD}${YELLOW}Some models have failures. See details above.${RESET}\n`
+    );
+    process.exit(1);
+  }
+};
+
+main();

From 8fb5949ac653057602892a4bf6745627d3e6b524 Mon Sep 17 00:00:00 2001
From: minpeter <minpeter@friendli.ai>
Date: Fri, 27 Feb 2026 01:44:51 +0900
Subject: [PATCH 4/4] fix(benchmarks): address review feedback on error
 handling and validation

- headless.ts: emit error field on tool_result when output starts with Error:
- test-multi-model.ts: errored/timed-out models now shown as RED and exit(1)
- test-multi-model.ts: validate --timeout arg (reject NaN/negative)
- test-edge-cases.ts: use exact match instead of trim() for whitespace test
- test-edge-cases.ts: skip file pre-creation for create-via-append test

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-opencode)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
---
 benchmarks/headless.ts         |  7 +++++--
 benchmarks/test-edge-cases.ts  |  8 ++++++--
 benchmarks/test-multi-model.ts | 23 +++++++++++++++++------
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/benchmarks/headless.ts b/benchmarks/headless.ts
index bb2af701..ae18853a 100644
--- a/benchmarks/headless.ts
+++ b/benchmarks/headless.ts
@@ -151,11 +151,16 @@ async function run() {
             model: modelId,
           })
           break
-        case "tool-result":
+        case "tool-result": {
+          // Outputs beginning with "Error:" are also surfaced via an explicit error field.
+          const output = typeof part.result === "string" ? part.result : JSON.stringify(part.result)
+          const isError = typeof output === "string" && output.startsWith("Error:")
           emit({
             type: "tool_result",
             tool_call_id: part.toolCallId,
-            output: typeof part.result === "string" ? part.result : JSON.stringify(part.result),
+            output,
+            ...(isError ? { error: output } : {}),
           })
           break
+        }
       }
diff --git a/benchmarks/test-edge-cases.ts b/benchmarks/test-edge-cases.ts
index a1916c56..b00b0302 100644
--- a/benchmarks/test-edge-cases.ts
+++ b/benchmarks/test-edge-cases.ts
@@ -53,6 +53,7 @@ interface TestCase {
   fileName: string;
   name: string;
   prompt: string;
+  skipFileCreate?: boolean;
   validate: (content: string) => { passed: boolean; reason: string };
 }
 
@@ -288,6 +289,7 @@ const TEST_CASES: TestCase[] = [
     name: "7. Create new file via append",
     fileName: "create-via-append.txt",
     fileContent: "",
+    skipFileCreate: true,
     prompt: [
       "Create create-via-append.txt via edit_file append (do not call read_file first).",
       "Use one call with edits: [{ op: 'append', lines: ['created line 1', 'created line 2'] }].",
@@ -597,7 +599,7 @@ const TEST_CASES: TestCase[] = [
           reason: "non-target lines changed unexpectedly",
         };
       }
-      if (lines[1].trim() !== "middle-content") {
+      if (lines[1] !== "middle-content") {
         return {
           passed: false,
           reason: `line 2 expected 'middle-content' but got ${JSON.stringify(lines[1])}`,
@@ -907,7 +909,9 @@ async function runTestCase(
   duration: number;
 }> {
   const testFile = join(testDir, tc.fileName);
-  writeFileSync(testFile, tc.fileContent, "utf-8");
+  if (!tc.skipFileCreate) {
+    writeFileSync(testFile, tc.fileContent, "utf-8");
+  }
 
   const headlessScript = resolve(import.meta.dir, "headless.ts");
   const headlessArgs = [
diff --git a/benchmarks/test-multi-model.ts b/benchmarks/test-multi-model.ts
index 1781d4eb..29ee4bb9 100644
--- a/benchmarks/test-multi-model.ts
+++ b/benchmarks/test-multi-model.ts
@@ -25,9 +25,15 @@ let perModelTimeoutSec = 900; // 15 min default per model (5 tests)
 const rawArgs = process.argv.slice(2);
 for (let i = 0; i < rawArgs.length; i++) {
   if (rawArgs[i] === "--timeout" && i + 1 < rawArgs.length) {
-    perModelTimeoutSec = Number.parseInt(rawArgs[i + 1], 10);
+    // Reject NaN and non-positive values before they reach the per-model timer.
+    const parsed = Number.parseInt(rawArgs[i + 1], 10);
+    if (Number.isNaN(parsed) || parsed <= 0) {
+      console.error(`Invalid --timeout value: ${rawArgs[i + 1]}`);
+      process.exit(1);
+    }
+    perModelTimeoutSec = parsed;
     i++;
   }
 }
 
 // ── Colors ────────────────────────────────────────────────────
@@ -228,8 +232,9 @@ const main = async () => {
   // Per-model results
   for (const r of allResults) {
     const timeStr = `${(r.durationMs / 1000).toFixed(0)}s`;
-    const color = r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? YELLOW : RED;
-    console.log(`  ${r.modelShort.padEnd(8)} ${color}${r.totalPassed}/${r.totalTests}${RESET} (${timeStr})`);
+    const color = r.error ? RED : r.totalPassed === r.totalTests ? GREEN : r.totalPassed > 0 ? YELLOW : RED;
+    const label = r.error ? `ERROR: ${r.error}` : `${r.totalPassed}/${r.totalTests}`;
+    console.log(`  ${r.modelShort.padEnd(8)} ${color}${label}${RESET} (${timeStr})`);
     for (const t of r.tests) {
       const icon = t.passed ? `${GREEN}✓${RESET}` : `${RED}✗${RESET}`;
       console.log(`    ${icon} ${t.name}`);
@@ -240,8 +245,9 @@ const main = async () => {
 
   // Overall
   const totalModels = allResults.length;
+  const erroredModels = allResults.filter((r) => r.error).length;
   const perfectModels = allResults.filter(
-    (r) => r.totalPassed === r.totalTests
+    (r) => !r.error && r.totalPassed === r.totalTests && r.totalTests > 0
   ).length;
   console.log(
     `${BOLD}Models with 100%: ${perfectModels}/${totalModels}${RESET}`
@@ -255,7 +261,12 @@ const main = async () => {
 
   console.log();
 
-  if (perfectModels === totalModels) {
+  if (erroredModels > 0) {
+    console.log(
+      `${BOLD}${RED}${erroredModels} model(s) errored. See details above.${RESET}\n`
+    );
+    process.exit(1);
+  } else if (perfectModels === totalModels) {
     console.log(`${BOLD}${GREEN}🎉 ALL MODELS PASSED ALL TESTS!${RESET}\n`);
     process.exit(0);
   } else {