From 3dddfc8270155d7625573d58a351e4a10daa3423 Mon Sep 17 00:00:00 2001 From: Affaan Mustafa Date: Tue, 12 May 2026 17:48:21 -0400 Subject: [PATCH] docs: add evaluator harness config scenario --- docs/ECC-2.0-GA-ROADMAP.md | 8 +-- docs/architecture/evaluator-rag-prototype.md | 8 ++- .../candidate-playbook.md | 49 ++++++++++++++++ .../harness-config-quality/report.json | 35 ++++++++++++ .../harness-config-quality/scenario.json | 57 +++++++++++++++++++ .../harness-config-quality/trace.json | 45 +++++++++++++++ .../verifier-result.json | 35 ++++++++++++ tests/docs/evaluator-rag-prototype.test.js | 48 +++++++++++++++- 8 files changed, 278 insertions(+), 7 deletions(-) create mode 100644 examples/evaluator-rag-prototype/harness-config-quality/candidate-playbook.md create mode 100644 examples/evaluator-rag-prototype/harness-config-quality/report.json create mode 100644 examples/evaluator-rag-prototype/harness-config-quality/scenario.json create mode 100644 examples/evaluator-rag-prototype/harness-config-quality/trace.json create mode 100644 examples/evaluator-rag-prototype/harness-config-quality/verifier-result.json diff --git a/docs/ECC-2.0-GA-ROADMAP.md b/docs/ECC-2.0-GA-ROADMAP.md index 3033f07f..b6e94b92 100644 --- a/docs/ECC-2.0-GA-ROADMAP.md +++ b/docs/ECC-2.0-GA-ROADMAP.md @@ -58,8 +58,8 @@ As of 2026-05-12: `examples/evaluator-rag-prototype/` define the first read-only self-improving harness prototype: scenario specs, traces, reports, candidate playbooks, verifier results, accepted maintainer-salvage, - billing-readiness, and CI-failure-diagnosis candidates, plus rejected - unsafe candidates. + billing-readiness, CI-failure-diagnosis, and harness-config-quality + candidates, plus rejected unsafe candidates. - The npm package surface now excludes Python bytecode/cache artifacts through package `files` negation rules and a publish-surface regression test. - `docs/legacy-artifact-inventory.md` records that no `_legacy-documents-*` @@ -200,7 +200,7 @@ is not complete unless the evidence column exists and has been freshly verified. | AgentShield enterprise iteration | Policy gates, SARIF, packs, provenance, corpus, HTML reports, exception lifecycle audit | PRs #53, #55-#62 landed with test evidence | Needs PDF/export decision or next enterprise signal | | ECC Tools next-level app | Billing audit, PR checks, deep analyzer, sync backlog | PRs #26-#39 landed with test evidence | Needs capacity-backed Linear rollout / broader evaluator corpus | | GitGuardian/Dependabot/CodeRabbit-style checks | Non-blocking taxonomy and deterministic follow-up checks | ECC-Tools risk taxonomy check plus follow-up signals landed, including Skill Quality, Deep Analyzer Evidence, Analyzer Corpus Evidence, RAG/Evaluator Evidence, and PR Review/Salvage Evidence | Partially complete | -| Harness-agnostic learning system | Audit, adapter matrix, observability, traces, promotion loop | Audit/adapters/observability gates plus `docs/architecture/evaluator-rag-prototype.md` and `examples/evaluator-rag-prototype/` define read-only stale-salvage, billing-readiness, and CI-failure-diagnosis scenarios with trace, report, playbook, and verifier result artifacts | Needs broader evaluator corpus | +| Harness-agnostic learning system | Audit, adapter matrix, observability, traces, promotion loop | Audit/adapters/observability gates plus `docs/architecture/evaluator-rag-prototype.md` and `examples/evaluator-rag-prototype/` define read-only stale-salvage, billing-readiness, CI-failure-diagnosis, and harness-config-quality scenarios with trace, report, playbook, and verifier result artifacts | Needs AgentShield policy exception corpus | | Linear roadmap is detailed | Linear project status plus repo mirror | Repo mirror exists; issue creation was retried on 2026-05-12 and remains blocked by the workspace free issue limit | Needs recurring status updates after each merge batch | | Flow separation and progress tracking | Flow lanes with owner artifacts and update cadence | This roadmap defines lanes below | Active | | Realtime Linear sync | Project updates while issue limit is blocked; issues later | ECC-Tools #39 implements opt-in Linear API sync for deferred follow-up backlog items | Needs workspace capacity/config rollout | @@ -219,7 +219,7 @@ back to the repo evidence and merge commits. | Queue hygiene and salvage | GitHub PR/issue state, salvage ledger | Append ledger entries for any future stale closures | Every cleanup batch | | Release and publication | rc.1 release docs, publication readiness doc | Naming matrix and plugin submission/contact checklist | Before any tag | | Harness OS core | Audit, adapter matrix, observability docs, `ecc2/` | HUD/session-control acceptance spec | Weekly until GA | -| Evaluation and RAG | Reference-set validation, harness audit, traces | Read-only evaluator/RAG prototype plus stale-salvage and billing-readiness fixtures | Expand to CI, harness-config, and AgentShield scenarios | +| Evaluation and RAG | Reference-set validation, harness audit, traces | Read-only evaluator/RAG prototype plus stale-salvage, billing-readiness, CI-failure-diagnosis, and harness-config-quality fixtures | Expand to AgentShield policy exception scenario | | AgentShield enterprise | AgentShield PR evidence and roadmap notes | PDF-export decision or next enterprise signal | After value decision | | ECC Tools app | ECC-Tools PR evidence, billing audit, risk taxonomy | Capacity-backed Linear rollout or broader evaluator/RAG corpus slice | Next implementation batch | | Linear progress | Linear project status updates and this mirror | Status update with queue/evidence/missing gates | Every significant merge batch | diff --git a/docs/architecture/evaluator-rag-prototype.md b/docs/architecture/evaluator-rag-prototype.md index 60bb7a32..feba9d0f 100644 --- a/docs/architecture/evaluator-rag-prototype.md +++ b/docs/architecture/evaluator-rag-prototype.md @@ -12,7 +12,9 @@ lane has real inputs, real accepted work, and real rejected work. The corpus now also includes a billing/Marketplace readiness scenario so launch copy cannot treat dry-run release evidence or roadmap intent as live billing state. A CI-failure diagnosis scenario adds the log-first workflow needed before an -agent proposes fixes for red checks. +agent proposes fixes for red checks. A harness-config quality scenario keeps +MCP, plugin, hook, command, agent, and adapter recommendations tied to the +adapter matrix before they mutate setup guidance. ## Reference Pressure @@ -100,6 +102,9 @@ Current corpus: launch claims before public copy says they are live. - `ci-failure-diagnosis`: requires failed-job logs, changed-file scope, and a named regression command before a CI fix playbook can be promoted. +- `harness-config-quality`: requires adapter state, install/onramp path, + verification commands, risk notes, and config-preservation behavior before a + harness setup recommendation can be promoted. ## ECC Tools Mapping @@ -133,5 +138,4 @@ A candidate can be promoted only when: The next evaluator/RAG corpus should add: -- a harness-config quality scenario covering MCP/plugin/hook drift; - an AgentShield policy exception scenario with SARIF and report evidence. diff --git a/examples/evaluator-rag-prototype/harness-config-quality/candidate-playbook.md b/examples/evaluator-rag-prototype/harness-config-quality/candidate-playbook.md new file mode 100644 index 00000000..3065e360 --- /dev/null +++ b/examples/evaluator-rag-prototype/harness-config-quality/candidate-playbook.md @@ -0,0 +1,49 @@ +# Harness Config Quality Playbook + +Candidate id: `adapter-matrix-backed-drift-check` + +Use this playbook when a PR, install change, or setup recommendation touches +MCP, plugins, hooks, commands, agents, rules, install targets, or harness +adapter surfaces. + +## Accepted Path + +1. Identify the touched harness/config surface. +2. Retrieve the adapter state from + `docs/architecture/harness-adapter-compliance.md` or + `scripts/lib/harness-adapter-compliance.js`. +3. Record whether the harness is `Native`, `Adapter-backed`, + `Instruction-backed`, or `Reference-only`. +4. Name the install/onramp path and verification command from the matrix. +5. Preserve existing user and project config by using merge, dry-run, or + explicit no-overwrite behavior. +6. Run the relevant validation gate: + - `npm run harness:adapters -- --check` + - `npm run harness:audit -- --format json` + - `node tests/lib/install-targets.test.js` + - `node tests/opencode-plugin-hooks.test.js` + - `node tests/docs/mcp-management-docs.test.js` +7. Promote a config recommendation only when the evidence matches the harness + state and the config preservation behavior is explicit. + +## Rejected Path + +Do not claim Claude hook parity for Codex, Gemini, Zed, OpenCode, or other +harnesses unless the adapter matrix and tests prove it. + +Do not overwrite `settings.json`, MCP configs, plugin manifests, rule files, or +command surfaces without a merge/dry-run path and a rollback note. + +Do not toggle live MCP servers, publish plugins, or edit user-level harness +config from the evaluator run. + +## Minimum Validation + +- `npm run harness:adapters -- --check` +- `npm run harness:audit -- --format json` +- Focused install, plugin, MCP, or hook test for the changed surface +- `git diff --check` +- Markdown lint when docs are touched + +Record the adapter state, risk note, validation commands, and config +preservation behavior in the maintainer PR body or handoff. diff --git a/examples/evaluator-rag-prototype/harness-config-quality/report.json b/examples/evaluator-rag-prototype/harness-config-quality/report.json new file mode 100644 index 00000000..8d50f494 --- /dev/null +++ b/examples/evaluator-rag-prototype/harness-config-quality/report.json @@ -0,0 +1,35 @@ +{ + "schema_version": "ecc.evaluator-rag.report.v1", + "scenario_id": "harness-config-quality", + "run_id": "2026-05-12-harness-config-quality-prototype", + "result": "prototype_passed", + "read_only": true, + "scores": { + "adapter_evidence": 0.94, + "config_preservation": 0.88, + "verification_specificity": 0.9, + "parity_claim_safety": 1, + "publication_safety": 1 + }, + "findings": [ + { + "id": "adapter-state-required", + "severity": "warning", + "summary": "Harness recommendations must retrieve the adapter state before claiming native support or runtime enforcement." + }, + { + "id": "config-overwrite-risk", + "severity": "warning", + "summary": "MCP, hook, plugin, command, and rule changes must preserve existing user/project config and use dry-run or merge behavior when available." + }, + { + "id": "verification-command-needed", + "severity": "info", + "summary": "The accepted playbook names harness adapter, harness audit, install-target, or plugin-hook regression gates before a config change can merge." + } + ], + "recommended_next_action": { + "candidate_id": "adapter-matrix-backed-drift-check", + "action": "Use the promoted harness-config quality playbook for PRs or setup work touching MCP, plugin, hook, command, agent, rule, or adapter surfaces." + } +} diff --git a/examples/evaluator-rag-prototype/harness-config-quality/scenario.json b/examples/evaluator-rag-prototype/harness-config-quality/scenario.json new file mode 100644 index 00000000..90dccb94 --- /dev/null +++ b/examples/evaluator-rag-prototype/harness-config-quality/scenario.json @@ -0,0 +1,57 @@ +{ + "schema_version": "ecc.evaluator-rag.scenario.v1", + "scenario_id": "harness-config-quality", + "title": "Detect harness config drift before changing adapters or installs", + "mode": "read_only_prototype", + "objective": "Given a change to MCP, plugin, hook, command, agent, or harness adapter surfaces, retrieve the adapter matrix and validation evidence before promoting a setup recommendation or config change.", + "sources": [ + { + "kind": "repo_doc", + "path": "docs/architecture/harness-adapter-compliance.md", + "purpose": "Public adapter matrix that names harness state, install/onramp paths, verification commands, and risk notes" + }, + { + "kind": "repo_source", + "path": "scripts/lib/harness-adapter-compliance.js", + "purpose": "Structured source of truth for the adapter compliance matrix" + }, + { + "kind": "repo_config", + "path": "hooks/hooks.json", + "purpose": "Claude hook surface that must not be assumed portable without adapter evidence" + }, + { + "kind": "repo_config", + "path": "mcp-configs/mcp-servers.json", + "purpose": "Reference MCP config that can drift from harness-specific runtime semantics" + }, + { + "kind": "repo_test", + "command": "npm run harness:adapters -- --check", + "purpose": "Adapter matrix consistency gate" + } + ], + "retrieval_questions": [ + "Which harness or config surface changed: MCP, plugin, hook, command, agent, rule, or adapter?", + "Does the adapter matrix classify this harness as native, adapter-backed, instruction-backed, or reference-only?", + "Which install path, verification command, risk note, owner, and source doc apply?", + "Does the recommendation preserve existing user config rather than overwriting it?", + "Which compatibility regression or harness audit command proves the setup still works?" + ], + "forbidden_actions": [ + "claiming native support for instruction-backed or reference-only harnesses", + "copying Claude hook semantics into Codex, Gemini, Zed, or OpenCode without adapter evidence", + "silently overwriting existing user MCP, hook, plugin, command, or rule config", + "disabling or enabling live MCP servers from a read-only evaluator run", + "shipping an adapter change without a verification command", + "publishing packages or plugins from this evaluator run" + ], + "acceptance_gates": [ + "adapter state is retrieved from the matrix", + "install or onramp path is named", + "verification command is named", + "risk note is preserved", + "config-preservation behavior is explicit", + "at least one unsupported parity claim is rejected" + ] +} diff --git a/examples/evaluator-rag-prototype/harness-config-quality/trace.json b/examples/evaluator-rag-prototype/harness-config-quality/trace.json new file mode 100644 index 00000000..3a7c29b4 --- /dev/null +++ b/examples/evaluator-rag-prototype/harness-config-quality/trace.json @@ -0,0 +1,45 @@ +{ + "schema_version": "ecc.evaluator-rag.trace.v1", + "scenario_id": "harness-config-quality", + "run_id": "2026-05-12-harness-config-quality-prototype", + "read_only": true, + "events": [ + { + "phase": "observation", + "summary": "A setup recommendation or PR touches MCP, plugin, hook, command, agent, rule, or adapter surfaces. The evaluator records the surface without editing local or user-level config.", + "evidence": [ + "docs/architecture/harness-adapter-compliance.md", + "scripts/lib/harness-adapter-compliance.js" + ] + }, + { + "phase": "retrieval", + "summary": "Retrieved the adapter state, install/onramp path, verification commands, risk notes, and config-preservation tests for the affected harness.", + "evidence": [ + "npm run harness:adapters -- --check", + "npm run harness:audit -- --format json", + "node tests/lib/install-targets.test.js" + ] + }, + { + "phase": "proposal", + "summary": "Generated two candidate playbooks: adapter-matrix-backed drift check, and unsupported hook parity claim that copies Claude semantics into every harness.", + "candidate_ids": [ + "adapter-matrix-backed-drift-check", + "unsupported-hook-parity-claim" + ] + }, + { + "phase": "verification", + "summary": "Accepted the matrix-backed drift check because it names state, install path, verification, and preservation behavior. Rejected unsupported hook parity because it overclaims portability.", + "evidence": [ + "examples/evaluator-rag-prototype/harness-config-quality/verifier-result.json" + ] + }, + { + "phase": "promotion", + "summary": "Promoted only the read-only harness-config quality playbook. The evaluator does not overwrite configs, toggle MCP servers, publish plugins, or claim native support.", + "promoted_candidate_id": "adapter-matrix-backed-drift-check" + } + ] +} diff --git a/examples/evaluator-rag-prototype/harness-config-quality/verifier-result.json b/examples/evaluator-rag-prototype/harness-config-quality/verifier-result.json new file mode 100644 index 00000000..d0c78391 --- /dev/null +++ b/examples/evaluator-rag-prototype/harness-config-quality/verifier-result.json @@ -0,0 +1,35 @@ +{ + "schema_version": "ecc.evaluator-rag.verifier.v1", + "scenario_id": "harness-config-quality", + "run_id": "2026-05-12-harness-config-quality-prototype", + "read_only": true, + "candidates": [ + { + "candidate_id": "adapter-matrix-backed-drift-check", + "decision": "accepted", + "score": 0.92, + "reasons": [ + "retrieves adapter state before making a support claim", + "names install or onramp path and verification commands", + "preserves existing user and project config", + "keeps runtime MCP toggles and plugin publication out of the evaluator run", + "requires focused compatibility regression coverage" + ], + "rollback": "Revert the future adapter/config PR or restore the prior config merge behavior; no live user config is changed by this read-only playbook." + }, + { + "candidate_id": "unsupported-hook-parity-claim", + "decision": "rejected", + "score": 0.16, + "reasons": [ + "claims native support without adapter matrix evidence", + "copies Claude hook semantics into instruction-backed harnesses", + "does not name a verification command", + "does not preserve existing MCP or hook config", + "risks publishing or installing unsupported plugin behavior" + ], + "rollback": "Do not publish this setup recommendation; restart from adapter state, risk note, and config-preservation evidence." + } + ], + "promoted_candidate_id": "adapter-matrix-backed-drift-check" +} diff --git a/tests/docs/evaluator-rag-prototype.test.js b/tests/docs/evaluator-rag-prototype.test.js index 7c853131..9714d10f 100644 --- a/tests/docs/evaluator-rag-prototype.test.js +++ b/tests/docs/evaluator-rag-prototype.test.js @@ -135,7 +135,7 @@ test('roadmap points to the evaluator RAG prototype and keeps broader corpus wor assert.ok(roadmap.includes('docs/architecture/evaluator-rag-prototype.md')); assert.ok(roadmap.includes('examples/evaluator-rag-prototype/')); - assert.ok(roadmap.includes('Needs broader evaluator corpus')); + assert.ok(roadmap.includes('Needs AgentShield policy exception corpus')); }); test('billing readiness scenario rejects launch copy overclaims', () => { @@ -221,6 +221,52 @@ test('ci failure diagnosis scenario rejects rerun-only fixes', () => { assert.ok(playbook.includes('Full required GitHub Actions matrix before merge')); }); +test('harness config quality scenario rejects unsupported parity claims', () => { + const scenario = readFixtureJson('harness-config-quality/scenario.json'); + const trace = readFixtureJson('harness-config-quality/trace.json'); + const report = readFixtureJson('harness-config-quality/report.json'); + const verifier = readFixtureJson('harness-config-quality/verifier-result.json'); + const playbook = read('examples/evaluator-rag-prototype/harness-config-quality/candidate-playbook.md'); + + assert.strictEqual(scenario.scenario_id, 'harness-config-quality'); + assert.strictEqual(trace.scenario_id, scenario.scenario_id); + assert.strictEqual(report.scenario_id, scenario.scenario_id); + assert.strictEqual(verifier.scenario_id, scenario.scenario_id); + assert.strictEqual(trace.read_only, true); + assert.strictEqual(report.read_only, true); + assert.strictEqual(verifier.read_only, true); + + for (const blocked of [ + 'claiming native support for instruction-backed or reference-only harnesses', + 'copying Claude hook semantics into Codex, Gemini, Zed, or OpenCode without adapter evidence', + 'silently overwriting existing user MCP, hook, plugin, command, or rule config', + 'publishing packages or plugins from this evaluator run' + ]) { + assert.ok(scenario.forbidden_actions.includes(blocked), `Missing harness forbidden action: ${blocked}`); + } + + for (const required of [ + 'adapter state is retrieved from the matrix', + 'install or onramp path is named', + 'verification command is named', + 'config-preservation behavior is explicit' + ]) { + assert.ok(scenario.acceptance_gates.includes(required), `Missing harness acceptance gate: ${required}`); + } + + const accepted = verifier.candidates.find(candidate => candidate.candidate_id === 'adapter-matrix-backed-drift-check'); + const rejected = verifier.candidates.find(candidate => candidate.candidate_id === 'unsupported-hook-parity-claim'); + + assert.ok(accepted, 'Missing accepted adapter-matrix candidate'); + assert.ok(rejected, 'Missing rejected unsupported parity candidate'); + assert.strictEqual(accepted.decision, 'accepted'); + assert.strictEqual(rejected.decision, 'rejected'); + assert.strictEqual(verifier.promoted_candidate_id, accepted.candidate_id); + assert.ok(rejected.reasons.join('\n').includes('native support')); + assert.ok(playbook.includes('npm run harness:adapters -- --check')); + assert.ok(playbook.includes('node tests/docs/mcp-management-docs.test.js')); +}); + if (failed > 0) { console.log(`\nFailed: ${failed}`); process.exit(1);