From dcf5668b27e2dc477ff2494b6eae88c601004653 Mon Sep 17 00:00:00 2001 From: Affaan Mustafa Date: Tue, 12 May 2026 17:04:39 -0400 Subject: [PATCH] docs: add evaluator rag prototype (#1824) --- docs/ECC-2.0-GA-ROADMAP.md | 14 +- docs/ECC-2.0-REFERENCE-ARCHITECTURE.md | 7 + docs/architecture/evaluator-rag-prototype.md | 122 +++++++++++++++ .../candidate-playbook.md | 41 +++++ examples/evaluator-rag-prototype/report.json | 35 +++++ .../evaluator-rag-prototype/scenario.json | 56 +++++++ examples/evaluator-rag-prototype/trace.json | 46 ++++++ .../verifier-result.json | 35 +++++ tests/docs/evaluator-rag-prototype.test.js | 142 ++++++++++++++++++ 9 files changed, 494 insertions(+), 4 deletions(-) create mode 100644 docs/architecture/evaluator-rag-prototype.md create mode 100644 examples/evaluator-rag-prototype/candidate-playbook.md create mode 100644 examples/evaluator-rag-prototype/report.json create mode 100644 examples/evaluator-rag-prototype/scenario.json create mode 100644 examples/evaluator-rag-prototype/trace.json create mode 100644 examples/evaluator-rag-prototype/verifier-result.json create mode 100644 tests/docs/evaluator-rag-prototype.test.js diff --git a/docs/ECC-2.0-GA-ROADMAP.md b/docs/ECC-2.0-GA-ROADMAP.md index 0c1114bf..b9dbf801 100644 --- a/docs/ECC-2.0-GA-ROADMAP.md +++ b/docs/ECC-2.0-GA-ROADMAP.md @@ -54,6 +54,11 @@ As of 2026-05-12: dry-run without `--force`, local marketplace discovery, temp-home local install, enabled plugin listing, and clean uninstall for `ecc@ecc` `2.0.0-rc.1`. +- `docs/architecture/evaluator-rag-prototype.md` and + `examples/evaluator-rag-prototype/` define the first read-only + self-improving harness prototype: scenario spec, trace, report, candidate + playbook, verifier result, accepted maintainer-salvage candidate, and + rejected blind-translation candidate. - The npm package surface now excludes Python bytecode/cache artifacts through package `files` negation rules and a publish-surface regression test. 
- `docs/legacy-artifact-inventory.md` records that no `_legacy-documents-*` @@ -194,7 +199,7 @@ is not complete unless the evidence column exists and has been freshly verified. | AgentShield enterprise iteration | Policy gates, SARIF, packs, provenance, corpus, HTML reports, exception lifecycle audit | PRs #53, #55-#62 landed with test evidence | Needs PDF/export decision or next enterprise signal | | ECC Tools next-level app | Billing audit, PR checks, deep analyzer, sync backlog | PRs #26-#39 landed with test evidence | Needs capacity-backed Linear rollout / broader evaluator corpus | | GitGuardian/Dependabot/CodeRabbit-style checks | Non-blocking taxonomy and deterministic follow-up checks | ECC-Tools risk taxonomy check plus follow-up signals landed, including Skill Quality, Deep Analyzer Evidence, Analyzer Corpus Evidence, RAG/Evaluator Evidence, and PR Review/Salvage Evidence | Partially complete | -| Harness-agnostic learning system | Audit, adapter matrix, observability, traces, promotion loop | Audit/adapters/observability gates exist | Needs evaluation/RAG prototype | +| Harness-agnostic learning system | Audit, adapter matrix, observability, traces, promotion loop | Audit/adapters/observability gates plus `docs/architecture/evaluator-rag-prototype.md` and `examples/evaluator-rag-prototype/` define the first read-only scenario, trace, report, playbook, and verifier result | Needs broader evaluator corpus | | Linear roadmap is detailed | Linear project status plus repo mirror | Repo mirror exists; issue creation was retried on 2026-05-12 and remains blocked by the workspace free issue limit | Needs recurring status updates after each merge batch | | Flow separation and progress tracking | Flow lanes with owner artifacts and update cadence | This roadmap defines lanes below | Active | | Realtime Linear sync | Project updates while issue limit is blocked; issues later | ECC-Tools #39 implements opt-in Linear API sync for deferred follow-up backlog items | 
Needs workspace capacity/config rollout | @@ -213,7 +218,7 @@ back to the repo evidence and merge commits. | Queue hygiene and salvage | GitHub PR/issue state, salvage ledger | Append ledger entries for any future stale closures | Every cleanup batch | | Release and publication | rc.1 release docs, publication readiness doc | Naming matrix and plugin submission/contact checklist | Before any tag | | Harness OS core | Audit, adapter matrix, observability docs, `ecc2/` | HUD/session-control acceptance spec | Weekly until GA | -| Evaluation and RAG | Reference-set validation, harness audit, traces | Read-only evaluator/RAG prototype design | Before deep analyzer expansion | +| Evaluation and RAG | Reference-set validation, harness audit, traces | Read-only evaluator/RAG prototype plus fixture contract | Expand to CI, billing, harness-config, and AgentShield scenarios | | AgentShield enterprise | AgentShield PR evidence and roadmap notes | PDF-export decision or next enterprise signal | After value decision | | ECC Tools app | ECC-Tools PR evidence, billing audit, risk taxonomy | Capacity-backed Linear rollout or broader evaluator/RAG corpus slice | Next implementation batch | | Linear progress | Linear project status updates and this mirror | Status update with queue/evidence/missing gates | Every significant merge batch | @@ -407,5 +412,6 @@ Acceptance: executive report, corpus benchmark output, and exception lifecycle audit. 2. Enable/configure the merged Linear backlog sync path after workspace issue capacity clears or the Linear workspace is upgraded. -3. Expand the evaluator/RAG corpus with real cleanup-batch cases as future - maintainer-owned examples land. +3. Expand the evaluator/RAG corpus beyond the first stale-salvage prototype to + CI failure diagnosis, harness-config drift, billing readiness, and + AgentShield policy exception scenarios. 
diff --git a/docs/ECC-2.0-REFERENCE-ARCHITECTURE.md b/docs/ECC-2.0-REFERENCE-ARCHITECTURE.md index 643b09dc..b08c3158 100644 --- a/docs/ECC-2.0-REFERENCE-ARCHITECTURE.md +++ b/docs/ECC-2.0-REFERENCE-ARCHITECTURE.md @@ -136,6 +136,13 @@ Repo work: - `agentshield`: feed prompt-injection and config-risk findings into regression suites. +Current prototype: + +- `docs/architecture/evaluator-rag-prototype.md` defines the read-only + evaluator/RAG artifact contract. +- `examples/evaluator-rag-prototype/` records the first scenario spec, trace, + report, candidate playbook, and verifier result for stale-PR salvage. + Verification: - read-only prototype that emits a trace, report, candidate playbook, and diff --git a/docs/architecture/evaluator-rag-prototype.md b/docs/architecture/evaluator-rag-prototype.md new file mode 100644 index 00000000..abd002b2 --- /dev/null +++ b/docs/architecture/evaluator-rag-prototype.md @@ -0,0 +1,122 @@ +# Evaluator RAG Prototype + +ECC 2.0 needs a self-improving harness loop that can learn from real work +without blindly mutating a user's Claude, Codex, OpenCode, dmux, Zed, or +terminal setup. This prototype defines the smallest read-only artifact set for +that loop. + +The fixture set lives in +[`examples/evaluator-rag-prototype/`](../../examples/evaluator-rag-prototype/). +It uses the May 2026 stale-PR cleanup and salvage lane as the first concrete +scenario because that lane has real inputs, real accepted work, and real +rejected work. + +## Reference Pressure + +- Meta-Harness: treat the harness itself as an experiment with scenario specs, + verifier results, and promoted playbooks. +- Autocontext: store traces, reports, artifacts, and reusable improvements + before changing installed agent assets. +- Claude HUD: expose context, tools, todos, agent activity, checks, and risk so + an evaluator can judge a run after the fact. 
+- Hermes Agent: keep skills, memories, scheduler-like follow-ups, and terminal + gateway behavior explicit instead of hiding local commands. +- dmux, Orca, Superset, and Ghast: preserve worktree/session state so parallel + agent work can be compared, resumed, or closed cleanly. +- ECC Tools: route evaluator findings into PR comments, check runs, and Linear + backlog items without flooding GitHub. + +## Artifact Contract + +Every evaluator/RAG run is read-only until a verifier promotes a playbook. + +| Artifact | Purpose | Fixture | +| --- | --- | --- | +| Scenario spec | Declares the objective, allowed evidence, forbidden actions, and pass/fail gates. | `scenario.json` | +| Trace | Captures observation, retrieval, proposal, verification, and promotion events. | `trace.json` | +| Report | Summarizes scores, evidence coverage, risks, and recommended next action. | `report.json` | +| Candidate playbook | Describes the maintainer-owned workflow that could be reused later. | `candidate-playbook.md` | +| Verifier result | Accepts or rejects candidates with concrete reasons and rollback notes. | `verifier-result.json` | + +The prototype deliberately separates retrieval from action. A run can retrieve +closed PR diffs, Linear status, CI history, and local docs, but it cannot close, +merge, publish, tag, or rewrite configs as part of the evaluator pass. + +## Phase Model + +1. Observe the current queue, dirty worktrees, branch state, open PRs/issues, + discussions, CI state, and release gates. +2. Retrieve relevant reference evidence: stale-salvage ledger rows, prior + maintainer PRs, current docs, analyzer findings, CI failures, and harness + adapter rules. +3. Propose one or more playbooks with source attribution and expected + validation gates. +4. Verify each playbook against explicit acceptance and rejection rules. +5. Promote only the candidate that improves the scenario without widening blast + radius. +6. Record rollback guidance and unresolved manual-review tails. 
+ +## First Scenario + +The first scenario is `stale-pr-salvage-maintainer-branch`. + +It models the rule Affaan set during the May 2026 cleanup: stale closure is +queue hygiene, not loss of useful work. Useful closed PR work should be ported +into maintainer-owned PRs with attribution/backlinks, while generated churn, +bulk localization, and ambiguous translator work stay out of blind +cherry-picks. + +The verifier accepts a maintainer salvage branch that: + +- credits source PRs; +- avoids raw private context and personal paths; +- does not import stale bulk localization without translator review; +- records a durable ledger update; +- runs the same validation gates as a normal code, docs, or catalog change; +- leaves release publication actions approval-gated. + +The verifier rejects a blind cherry-pick proposal that: + +- imports stale translation/doc churn wholesale; +- skips the current catalog/install architecture; +- lacks attribution; +- lacks tests or ledger updates; +- mutates release or plugin publication state. + +## ECC Tools Mapping + +ECC Tools already flags missing RAG/evaluator evidence for retrieval, +embedding, ranking, and evaluator changes. This prototype gives those checks a +target shape: + +- `scenario.json` maps to analyzer corpus inputs. +- `trace.json` maps to golden traces and run telemetry. +- `report.json` maps to PR comment summaries and Linear backlog summaries. +- `candidate-playbook.md` maps to the suggested follow-up PR body. +- `verifier-result.json` maps to pass/fail check-run evidence. + +Future ECC Tools work should consume these artifacts as fixture shape before it +adds hosted retrieval or model-backed judging. The local prototype is enough to +prove the contract before any paid API or vector store is introduced. 
+ +## Promotion Rules + +A candidate can be promoted only when: + +- the verifier result is `accepted`; +- at least one rejected candidate proves the verifier can say no; +- every source PR or reference artifact has attribution; +- the proposed action is maintainer-owned and reversible; +- validation commands are named; +- unresolved translator, release, billing, or publication items remain blocked + until separately approved. + +## Next Expansion + +The next evaluator/RAG corpus should add: + +- a CI-failure diagnosis scenario with captured logs and a known fix; +- a harness-config quality scenario covering MCP/plugin/hook drift; +- a billing-readiness scenario that separates verified Marketplace claims from + launch-copy assumptions; +- an AgentShield policy exception scenario with SARIF and report evidence. diff --git a/examples/evaluator-rag-prototype/candidate-playbook.md b/examples/evaluator-rag-prototype/candidate-playbook.md new file mode 100644 index 00000000..125c7c6f --- /dev/null +++ b/examples/evaluator-rag-prototype/candidate-playbook.md @@ -0,0 +1,41 @@ +# Candidate Playbook: Maintainer-Owned Stale Salvage + +Candidate id: `maintainer-salvage-branch` + +## Use When + +- A stale or conflicted PR was closed to keep the public queue usable. +- The closed diff contains a useful focused idea, skill, command, doc, test, or + bug fix. +- The contributor may not have time or interest to rebase. + +## Steps + +1. Record the source PR, author, useful concept, and closure reason in + `docs/stale-pr-salvage-ledger.md`. +2. Re-read the closed PR diff against current `main`. +3. Decide whether the patch can be cherry-picked safely. Prefer reimplementation + when current architecture has moved. +4. Create a maintainer-owned branch with one focused salvage unit. +5. Preserve attribution in the PR body and, when useful, in the commit body. +6. Update the catalog, docs, tests, or release evidence required by the touched + surface. +7. 
Run the same validation gates a normal change would require. +8. After merge, update the ledger from pending/salvage-branch to landed, + already-present, superseded, skipped, or translator/manual review. + +## Reject Conditions + +- The patch is bulk generated churn. +- The patch is stale localization that needs translator/manual review. +- The patch imports personal paths, secrets, local settings, or private operator context. +- The patch bypasses current install, catalog, plugin, or release architecture. +- The branch would mix unrelated salvage units into one PR. + +## Minimum Validation + +- Targeted test for the touched surface. +- `git diff --check`. +- Markdown lint when docs are touched. +- Catalog/install validation when skills, agents, commands, or plugin surfaces + are touched. diff --git a/examples/evaluator-rag-prototype/report.json b/examples/evaluator-rag-prototype/report.json new file mode 100644 index 00000000..25922783 --- /dev/null +++ b/examples/evaluator-rag-prototype/report.json @@ -0,0 +1,35 @@ +{ + "schema_version": "ecc.evaluator-rag.report.v1", + "scenario_id": "stale-pr-salvage-maintainer-branch", + "run_id": "2026-05-12-cleanup-salvage-prototype", + "result": "prototype_passed", + "read_only": true, + "scores": { + "source_attribution": 1, + "blast_radius_control": 1, + "manual_review_respected": 1, + "validation_specificity": 0.8, + "publication_safety": 1 + }, + "findings": [ + { + "id": "salvage-policy-usable", + "severity": "info", + "summary": "The stale-salvage ledger and maintainer PR examples provide enough evidence to promote a reusable maintainer-owned salvage playbook." + }, + { + "id": "translation-tail-blocked", + "severity": "warning", + "summary": "Localization tails remain useful but must stay translator/manual-review only." 
+ }, + { + "id": "release-actions-blocked", + "severity": "warning", + "summary": "Release, npm, plugin, billing, and announcement actions remain outside this evaluator run and require separate approval." + } + ], + "recommended_next_action": { + "candidate_id": "maintainer-salvage-branch", + "action": "Use the promoted playbook for future stale cleanup batches and add additional evaluator/RAG scenarios for CI failure diagnosis, harness-config drift, billing readiness, and AgentShield policy exceptions." + } +} diff --git a/examples/evaluator-rag-prototype/scenario.json b/examples/evaluator-rag-prototype/scenario.json new file mode 100644 index 00000000..4c3bc43e --- /dev/null +++ b/examples/evaluator-rag-prototype/scenario.json @@ -0,0 +1,56 @@ +{ + "schema_version": "ecc.evaluator-rag.scenario.v1", + "scenario_id": "stale-pr-salvage-maintainer-branch", + "title": "Recover useful stale PR work through maintainer-owned branches", + "mode": "read_only_prototype", + "objective": "Given a closed stale PR batch, identify useful work, reject unsafe bulk imports, and promote only a maintainer-owned salvage playbook with attribution and validation.", + "sources": [ + { + "kind": "repo_doc", + "path": "docs/stale-pr-salvage-ledger.md", + "purpose": "Durable source-to-disposition mapping for stale PR cleanup" + }, + { + "kind": "repo_doc", + "path": "docs/legacy-artifact-inventory.md", + "purpose": "Import guardrails for legacy and private-context material" + }, + { + "kind": "roadmap", + "path": "docs/ECC-2.0-GA-ROADMAP.md", + "purpose": "Operating rule and current execution lane" + }, + { + "kind": "github_pr", + "url": "https://github.com/affaan-m/everything-claude-code/pull/1815", + "purpose": "Example maintainer-owned stale salvage PR with attribution" + }, + { + "kind": "github_pr", + "url": "https://github.com/affaan-m/everything-claude-code/pull/1818", + "purpose": "Example gap pass classifying already-present and skipped stale work" + } + ], + 
"retrieval_questions": [ + "Which closed PRs contain useful work that is not already present?", + "Which files or concepts are unsafe to cherry-pick without manual review?", + "Which current docs, skills, commands, or tests are the correct integration points?", + "Which validation gates are required before the salvage work can merge?" + ], + "forbidden_actions": [ + "closing, reopening, or commenting on PRs", + "merging PRs", + "creating release tags", + "publishing packages or plugins", + "copying private paths, secrets, or raw personal context", + "blindly cherry-picking bulk localization" + ], + "acceptance_gates": [ + "source attribution is preserved", + "salvage ledger or equivalent tracker is updated", + "translation/manual-review tails remain blocked", + "candidate action is reversible and maintainer-owned", + "validation commands are named", + "at least one unsafe candidate is rejected" + ] +} diff --git a/examples/evaluator-rag-prototype/trace.json b/examples/evaluator-rag-prototype/trace.json new file mode 100644 index 00000000..a43b2fbf --- /dev/null +++ b/examples/evaluator-rag-prototype/trace.json @@ -0,0 +1,46 @@ +{ + "schema_version": "ecc.evaluator-rag.trace.v1", + "scenario_id": "stale-pr-salvage-maintainer-branch", + "run_id": "2026-05-12-cleanup-salvage-prototype", + "read_only": true, + "events": [ + { + "phase": "observation", + "summary": "Public PR, issue, and discussion queues are clear; release publication remains approval-gated; stale-salvage ledger has landed, skipped, superseded, and manual-review states.", + "evidence": [ + "docs/ECC-2.0-GA-ROADMAP.md", + "docs/stale-pr-salvage-ledger.md" + ] + }, + { + "phase": "retrieval", + "summary": "Retrieved stale PR source mappings, existing maintainer salvage examples, legacy import rules, and manual-review localization tails.", + "evidence": [ + "docs/stale-pr-salvage-ledger.md", + "docs/legacy-artifact-inventory.md", + "https://github.com/affaan-m/everything-claude-code/pull/1815", + 
"https://github.com/affaan-m/everything-claude-code/pull/1818" + ] + }, + { + "phase": "proposal", + "summary": "Generated two candidate playbooks: maintainer-owned salvage branch with attribution, and blind cherry-pick of stale translations.", + "candidate_ids": [ + "maintainer-salvage-branch", + "blind-cherry-pick-translations" + ] + }, + { + "phase": "verification", + "summary": "Accepted the maintainer-owned salvage branch and rejected blind translation cherry-picking because it violates manual-review and attribution gates.", + "evidence": [ + "examples/evaluator-rag-prototype/verifier-result.json" + ] + }, + { + "phase": "promotion", + "summary": "Promoted only the maintainer-owned salvage branch playbook as a reusable process. No repository, GitHub, release, billing, or plugin publication action is performed by this prototype.", + "promoted_candidate_id": "maintainer-salvage-branch" + } + ] +} diff --git a/examples/evaluator-rag-prototype/verifier-result.json b/examples/evaluator-rag-prototype/verifier-result.json new file mode 100644 index 00000000..027eeab6 --- /dev/null +++ b/examples/evaluator-rag-prototype/verifier-result.json @@ -0,0 +1,35 @@ +{ + "schema_version": "ecc.evaluator-rag.verifier.v1", + "scenario_id": "stale-pr-salvage-maintainer-branch", + "run_id": "2026-05-12-cleanup-salvage-prototype", + "read_only": true, + "candidates": [ + { + "candidate_id": "maintainer-salvage-branch", + "decision": "accepted", + "score": 0.94, + "reasons": [ + "preserves source PR attribution", + "keeps work on a fresh maintainer-owned branch", + "updates the salvage ledger", + "names validation gates", + "does not perform release or publication actions" + ], + "rollback": "Close the maintainer PR or revert its merge commit; source PR state remains unchanged." 
+ }, + { + "candidate_id": "blind-cherry-pick-translations", + "decision": "rejected", + "score": 0.21, + "reasons": [ + "bulk localization requires translator/manual review", + "does not preserve enough source attribution", + "could import stale generated docs", + "does not name validation gates", + "risks bypassing current catalog and install architecture" + ], + "rollback": "Do not create this branch; keep the localization tail in translator/manual-review state." + } + ], + "promoted_candidate_id": "maintainer-salvage-branch" +} diff --git a/tests/docs/evaluator-rag-prototype.test.js b/tests/docs/evaluator-rag-prototype.test.js new file mode 100644 index 00000000..80dc5cc1 --- /dev/null +++ b/tests/docs/evaluator-rag-prototype.test.js @@ -0,0 +1,142 @@ +'use strict'; + +const assert = require('assert'); +const fs = require('fs'); +const path = require('path'); + /* repoRoot is two directories above the directory of this test file; fixtureRoot is examples/evaluator-rag-prototype beneath it. */ +const repoRoot = path.resolve(__dirname, '..', '..'); +const fixtureRoot = path.join(repoRoot, 'examples', 'evaluator-rag-prototype'); + +let passed = 0; +let failed = 0; + /* test(name, fn): calls fn synchronously inside try/catch. On success it logs the name with a check mark and increments passed; when fn throws it logs the name and error.message and increments failed, so one failing check does not stop the later ones. */ +function test(name, fn) { + try { + fn(); + console.log(` ✓ ${name}`); + passed++; + } catch (error) { + console.log(` ✗ ${name}`); + console.log(` Error: ${error.message}`); + failed++; + } +} + /* read(relativePath): returns the UTF-8 contents of the file at relativePath joined onto repoRoot. */ +function read(relativePath) { + return fs.readFileSync(path.join(repoRoot, relativePath), 'utf8'); +} + /* readJson(fileName): reads fileName from fixtureRoot as UTF-8 and returns the JSON.parse result; a read or parse error propagates to the caller. */ +function readJson(fileName) { + return JSON.parse(fs.readFileSync(path.join(fixtureRoot, fileName), 'utf8')); +} + +console.log('\n=== Testing evaluator RAG prototype ===\n'); + +test('architecture doc records the artifact contract and reference pressure', () => { + const source = read('docs/architecture/evaluator-rag-prototype.md'); + + for (const required of [ + 'Scenario spec', + 'Trace', + 'Report', + 'Candidate playbook', + 'Verifier result', + 'Meta-Harness', + 'Autocontext', + 'Claude HUD', + 'Hermes Agent', + 'dmux, Orca, Superset, and Ghast', + 'ECC Tools' + ]) { + assert.ok(source.includes(required), `Missing doc requirement: 
${required}`); + } +}); + +test('fixtures use one scenario id and declare read-only behavior', () => { + const scenario = readJson('scenario.json'); + const trace = readJson('trace.json'); + const report = readJson('report.json'); + const verifier = readJson('verifier-result.json'); + + assert.strictEqual(scenario.schema_version, 'ecc.evaluator-rag.scenario.v1'); + assert.strictEqual(trace.schema_version, 'ecc.evaluator-rag.trace.v1'); + assert.strictEqual(report.schema_version, 'ecc.evaluator-rag.report.v1'); + assert.strictEqual(verifier.schema_version, 'ecc.evaluator-rag.verifier.v1'); + /* scenario supplies the reference scenario_id; the other three artifacts must repeat it and set read_only to true. */ + for (const artifact of [trace, report, verifier]) { + assert.strictEqual(artifact.scenario_id, scenario.scenario_id); + assert.strictEqual(artifact.read_only, true); + } +}); + +test('trace covers the full self-improving harness loop', () => { + const trace = readJson('trace.json'); + const phases = trace.events.map(event => event.phase); + + for (const phase of ['observation', 'retrieval', 'proposal', 'verification', 'promotion']) { + assert.ok(phases.includes(phase), `Missing trace phase ${phase}`); + } + + assert.ok(trace.events.some(event => event.promoted_candidate_id === 'maintainer-salvage-branch')); +}); + +test('scenario blocks unsafe write actions and release actions', () => { + const scenario = readJson('scenario.json'); + const forbidden = scenario.forbidden_actions.join('\n'); + /* forbidden is the list joined into a single string, so each expected action is matched as a substring. */ + for (const blocked of [ + 'closing, reopening, or commenting on PRs', + 'merging PRs', + 'creating release tags', + 'publishing packages or plugins', + 'copying private paths, secrets, or raw personal context', + 'blindly cherry-picking bulk localization' + ]) { + assert.ok(forbidden.includes(blocked), `Missing forbidden action: ${blocked}`); + } +}); + +test('verifier accepts maintainer salvage and rejects blind translation imports', () => { + const verifier = readJson('verifier-result.json'); + const accepted = verifier.candidates.find(candidate => candidate.candidate_id === 
'maintainer-salvage-branch'); + const rejected = verifier.candidates.find(candidate => candidate.candidate_id === 'blind-cherry-pick-translations'); + /* find returns undefined when no candidate matches, so presence is asserted before decision or score is read. */ + assert.ok(accepted, 'Missing accepted maintainer salvage candidate'); + assert.ok(rejected, 'Missing rejected blind cherry-pick candidate'); + assert.strictEqual(accepted.decision, 'accepted'); + assert.strictEqual(rejected.decision, 'rejected'); + assert.strictEqual(verifier.promoted_candidate_id, accepted.candidate_id); + assert.ok(accepted.score > rejected.score); + assert.ok(rejected.reasons.join('\n').includes('translator/manual review')); +}); + +test('candidate playbook preserves stale-salvage operating rules', () => { + const playbook = read('examples/evaluator-rag-prototype/candidate-playbook.md'); + + for (const required of [ + 'docs/stale-pr-salvage-ledger.md', + 'source PR', + 'maintainer-owned branch', + 'Preserve attribution', + 'translator/manual review', + 'private operator context', + 'git diff --check' + ]) { + assert.ok(playbook.includes(required), `Missing playbook rule: ${required}`); + } +}); + +test('roadmap points to the evaluator RAG prototype and keeps broader corpus work open', () => { + const roadmap = read('docs/ECC-2.0-GA-ROADMAP.md'); + + assert.ok(roadmap.includes('docs/architecture/evaluator-rag-prototype.md')); + assert.ok(roadmap.includes('examples/evaluator-rag-prototype/')); + assert.ok(roadmap.includes('Needs broader evaluator corpus')); +}); + /* Summary: when any check failed, print the failure count and exit with status 1; otherwise fall through and print the pass count. */ +if (failed > 0) { + console.log(`\nFailed: ${failed}`); + process.exit(1); +} + +console.log(`\nPassed: ${passed}`);