mirror of
https://github.com/affaan-m/everything-claude-code.git
synced 2026-05-14 02:10:07 +08:00
docs: add evaluator rag prototype (#1824)
This commit is contained in:
parent
f2deedcf3d
commit
dcf5668b27
@ -54,6 +54,11 @@ As of 2026-05-12:
|
|||||||
dry-run without `--force`, local marketplace discovery, temp-home local
|
dry-run without `--force`, local marketplace discovery, temp-home local
|
||||||
install, enabled plugin listing, and clean uninstall for `ecc@ecc`
|
install, enabled plugin listing, and clean uninstall for `ecc@ecc`
|
||||||
`2.0.0-rc.1`.
|
`2.0.0-rc.1`.
|
||||||
|
- `docs/architecture/evaluator-rag-prototype.md` and
|
||||||
|
`examples/evaluator-rag-prototype/` define the first read-only
|
||||||
|
self-improving harness prototype: scenario spec, trace, report, candidate
|
||||||
|
playbook, verifier result, accepted maintainer-salvage candidate, and
|
||||||
|
rejected blind-translation candidate.
|
||||||
- The npm package surface now excludes Python bytecode/cache artifacts through
|
- The npm package surface now excludes Python bytecode/cache artifacts through
|
||||||
package `files` negation rules and a publish-surface regression test.
|
package `files` negation rules and a publish-surface regression test.
|
||||||
- `docs/legacy-artifact-inventory.md` records that no `_legacy-documents-*`
|
- `docs/legacy-artifact-inventory.md` records that no `_legacy-documents-*`
|
||||||
@ -194,7 +199,7 @@ is not complete unless the evidence column exists and has been freshly verified.
|
|||||||
| AgentShield enterprise iteration | Policy gates, SARIF, packs, provenance, corpus, HTML reports, exception lifecycle audit | PRs #53, #55-#62 landed with test evidence | Needs PDF/export decision or next enterprise signal |
|
| AgentShield enterprise iteration | Policy gates, SARIF, packs, provenance, corpus, HTML reports, exception lifecycle audit | PRs #53, #55-#62 landed with test evidence | Needs PDF/export decision or next enterprise signal |
|
||||||
| ECC Tools next-level app | Billing audit, PR checks, deep analyzer, sync backlog | PRs #26-#39 landed with test evidence | Needs capacity-backed Linear rollout / broader evaluator corpus |
|
| ECC Tools next-level app | Billing audit, PR checks, deep analyzer, sync backlog | PRs #26-#39 landed with test evidence | Needs capacity-backed Linear rollout / broader evaluator corpus |
|
||||||
| GitGuardian/Dependabot/CodeRabbit-style checks | Non-blocking taxonomy and deterministic follow-up checks | ECC-Tools risk taxonomy check plus follow-up signals landed, including Skill Quality, Deep Analyzer Evidence, Analyzer Corpus Evidence, RAG/Evaluator Evidence, and PR Review/Salvage Evidence | Partially complete |
|
| GitGuardian/Dependabot/CodeRabbit-style checks | Non-blocking taxonomy and deterministic follow-up checks | ECC-Tools risk taxonomy check plus follow-up signals landed, including Skill Quality, Deep Analyzer Evidence, Analyzer Corpus Evidence, RAG/Evaluator Evidence, and PR Review/Salvage Evidence | Partially complete |
|
||||||
| Harness-agnostic learning system | Audit, adapter matrix, observability, traces, promotion loop | Audit/adapters/observability gates exist | Needs evaluation/RAG prototype |
|
| Harness-agnostic learning system | Audit, adapter matrix, observability, traces, promotion loop | Audit/adapters/observability gates exist; `docs/architecture/evaluator-rag-prototype.md` and `examples/evaluator-rag-prototype/` define the first read-only scenario, trace, report, playbook, and verifier result | Needs broader evaluator corpus |
|
||||||
| Linear roadmap is detailed | Linear project status plus repo mirror | Repo mirror exists; issue creation was retried on 2026-05-12 and remains blocked by the workspace free issue limit | Needs recurring status updates after each merge batch |
|
| Linear roadmap is detailed | Linear project status plus repo mirror | Repo mirror exists; issue creation was retried on 2026-05-12 and remains blocked by the workspace free issue limit | Needs recurring status updates after each merge batch |
|
||||||
| Flow separation and progress tracking | Flow lanes with owner artifacts and update cadence | This roadmap defines lanes below | Active |
|
| Flow separation and progress tracking | Flow lanes with owner artifacts and update cadence | This roadmap defines lanes below | Active |
|
||||||
| Realtime Linear sync | Project updates while issue limit is blocked; issues later | ECC-Tools #39 implements opt-in Linear API sync for deferred follow-up backlog items | Needs workspace capacity/config rollout |
|
| Realtime Linear sync | Project updates while issue limit is blocked; issues later | ECC-Tools #39 implements opt-in Linear API sync for deferred follow-up backlog items | Needs workspace capacity/config rollout |
|
||||||
@ -213,7 +218,7 @@ back to the repo evidence and merge commits.
|
|||||||
| Queue hygiene and salvage | GitHub PR/issue state, salvage ledger | Append ledger entries for any future stale closures | Every cleanup batch |
|
| Queue hygiene and salvage | GitHub PR/issue state, salvage ledger | Append ledger entries for any future stale closures | Every cleanup batch |
|
||||||
| Release and publication | rc.1 release docs, publication readiness doc | Naming matrix and plugin submission/contact checklist | Before any tag |
|
| Release and publication | rc.1 release docs, publication readiness doc | Naming matrix and plugin submission/contact checklist | Before any tag |
|
||||||
| Harness OS core | Audit, adapter matrix, observability docs, `ecc2/` | HUD/session-control acceptance spec | Weekly until GA |
|
| Harness OS core | Audit, adapter matrix, observability docs, `ecc2/` | HUD/session-control acceptance spec | Weekly until GA |
|
||||||
| Evaluation and RAG | Reference-set validation, harness audit, traces | Read-only evaluator/RAG prototype design | Before deep analyzer expansion |
|
| Evaluation and RAG | Reference-set validation, harness audit, traces | Read-only evaluator/RAG prototype plus fixture contract | Expand to CI, billing, harness-config, and AgentShield scenarios |
|
||||||
| AgentShield enterprise | AgentShield PR evidence and roadmap notes | PDF-export decision or next enterprise signal | After value decision |
|
| AgentShield enterprise | AgentShield PR evidence and roadmap notes | PDF-export decision or next enterprise signal | After value decision |
|
||||||
| ECC Tools app | ECC-Tools PR evidence, billing audit, risk taxonomy | Capacity-backed Linear rollout or broader evaluator/RAG corpus slice | Next implementation batch |
|
| ECC Tools app | ECC-Tools PR evidence, billing audit, risk taxonomy | Capacity-backed Linear rollout or broader evaluator/RAG corpus slice | Next implementation batch |
|
||||||
| Linear progress | Linear project status updates and this mirror | Status update with queue/evidence/missing gates | Every significant merge batch |
|
| Linear progress | Linear project status updates and this mirror | Status update with queue/evidence/missing gates | Every significant merge batch |
|
||||||
@ -407,5 +412,6 @@ Acceptance:
|
|||||||
executive report, corpus benchmark output, and exception lifecycle audit.
|
executive report, corpus benchmark output, and exception lifecycle audit.
|
||||||
2. Enable/configure the merged Linear backlog sync path after workspace issue
|
2. Enable/configure the merged Linear backlog sync path after workspace issue
|
||||||
capacity clears or the Linear workspace is upgraded.
|
capacity clears or the Linear workspace is upgraded.
|
||||||
3. Expand the evaluator/RAG corpus with real cleanup-batch cases as future
|
3. Expand the evaluator/RAG corpus beyond the first stale-salvage prototype to
|
||||||
maintainer-owned examples land.
|
CI failure diagnosis, harness-config drift, billing readiness, and
|
||||||
|
AgentShield policy exception scenarios.
|
||||||
|
|||||||
@ -136,6 +136,13 @@ Repo work:
|
|||||||
- `agentshield`: feed prompt-injection and config-risk findings into regression
|
- `agentshield`: feed prompt-injection and config-risk findings into regression
|
||||||
suites.
|
suites.
|
||||||
|
|
||||||
|
Current prototype:
|
||||||
|
|
||||||
|
- `docs/architecture/evaluator-rag-prototype.md` defines the read-only
|
||||||
|
evaluator/RAG artifact contract.
|
||||||
|
- `examples/evaluator-rag-prototype/` records the first scenario spec, trace,
|
||||||
|
report, candidate playbook, and verifier result for stale-PR salvage.
|
||||||
|
|
||||||
Verification:
|
Verification:
|
||||||
|
|
||||||
- read-only prototype that emits a trace, report, candidate playbook, and
|
- read-only prototype that emits a trace, report, candidate playbook, and
|
||||||
|
|||||||
122
docs/architecture/evaluator-rag-prototype.md
Normal file
122
docs/architecture/evaluator-rag-prototype.md
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
# Evaluator RAG Prototype
|
||||||
|
|
||||||
|
ECC 2.0 needs a self-improving harness loop that can learn from real work
|
||||||
|
without blindly mutating a user's Claude, Codex, OpenCode, dmux, Zed, or
|
||||||
|
terminal setup. This prototype defines the smallest read-only artifact set for
|
||||||
|
that loop.
|
||||||
|
|
||||||
|
The fixture set lives in
|
||||||
|
[`examples/evaluator-rag-prototype/`](../../examples/evaluator-rag-prototype/).
|
||||||
|
It uses the May 2026 stale-PR cleanup and salvage lane as the first concrete
|
||||||
|
scenario because that lane has real inputs, real accepted work, and real
|
||||||
|
rejected work.
|
||||||
|
|
||||||
|
## Reference Pressure
|
||||||
|
|
||||||
|
- Meta-Harness: treat the harness itself as an experiment with scenario specs,
|
||||||
|
verifier results, and promoted playbooks.
|
||||||
|
- Autocontext: store traces, reports, artifacts, and reusable improvements
|
||||||
|
before changing installed agent assets.
|
||||||
|
- Claude HUD: expose context, tools, todos, agent activity, checks, and risk so
|
||||||
|
an evaluator can judge a run after the fact.
|
||||||
|
- Hermes Agent: keep skills, memories, scheduler-like follow-ups, and terminal
|
||||||
|
gateway behavior explicit instead of hiding local commands.
|
||||||
|
- dmux, Orca, Superset, and Ghast: preserve worktree/session state so parallel
|
||||||
|
agent work can be compared, resumed, or closed cleanly.
|
||||||
|
- ECC Tools: route evaluator findings into PR comments, check runs, and Linear
|
||||||
|
backlog items without flooding GitHub.
|
||||||
|
|
||||||
|
## Artifact Contract
|
||||||
|
|
||||||
|
Every evaluator/RAG run is read-only until a verifier promotes a playbook.
|
||||||
|
|
||||||
|
| Artifact | Purpose | Fixture |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| Scenario spec | Declares the objective, allowed evidence, forbidden actions, and pass/fail gates. | `scenario.json` |
|
||||||
|
| Trace | Captures observation, retrieval, proposal, verification, and promotion events. | `trace.json` |
|
||||||
|
| Report | Summarizes scores, evidence coverage, risks, and recommended next action. | `report.json` |
|
||||||
|
| Candidate playbook | Describes the maintainer-owned workflow that could be reused later. | `candidate-playbook.md` |
|
||||||
|
| Verifier result | Accepts or rejects candidates with concrete reasons and rollback notes. | `verifier-result.json` |
|
||||||
|
|
||||||
|
The prototype deliberately separates retrieval from action. A run can retrieve
|
||||||
|
closed PR diffs, Linear status, CI history, and local docs, but it cannot close,
|
||||||
|
merge, publish, tag, or rewrite configs as part of the evaluator pass.
|
||||||
|
|
||||||
|
## Phase Model
|
||||||
|
|
||||||
|
1. Observe the current queue, dirty worktrees, branch state, open PRs/issues,
|
||||||
|
discussions, CI state, and release gates.
|
||||||
|
2. Retrieve relevant reference evidence: stale-salvage ledger rows, prior
|
||||||
|
maintainer PRs, current docs, analyzer findings, CI failures, and harness
|
||||||
|
adapter rules.
|
||||||
|
3. Propose one or more playbooks with source attribution and expected
|
||||||
|
validation gates.
|
||||||
|
4. Verify each playbook against explicit acceptance and rejection rules.
|
||||||
|
5. Promote only the candidate that improves the scenario without widening blast
|
||||||
|
radius.
|
||||||
|
6. Record rollback guidance and unresolved manual-review tails.
|
||||||
|
|
||||||
|
## First Scenario
|
||||||
|
|
||||||
|
The first scenario is `stale-pr-salvage-maintainer-branch`.
|
||||||
|
|
||||||
|
It models the rule Affaan set during the May 2026 cleanup: stale closure is
|
||||||
|
queue hygiene, not loss of useful work. Useful closed PR work should be ported
|
||||||
|
into maintainer-owned PRs with attribution/backlinks, while generated churn,
|
||||||
|
bulk localization, and ambiguous translator work stay out of blind
|
||||||
|
cherry-picks.
|
||||||
|
|
||||||
|
The verifier accepts a maintainer salvage branch that:
|
||||||
|
|
||||||
|
- credits source PRs;
|
||||||
|
- avoids raw private context and personal paths;
|
||||||
|
- does not import stale bulk localization without translator review;
|
||||||
|
- records a durable ledger update;
|
||||||
|
- runs the same validation gates as a normal code, docs, or catalog change;
|
||||||
|
- leaves release publication actions approval-gated.
|
||||||
|
|
||||||
|
The verifier rejects a blind cherry-pick proposal that:
|
||||||
|
|
||||||
|
- imports stale translation/doc churn wholesale;
|
||||||
|
- skips the current catalog/install architecture;
|
||||||
|
- lacks attribution;
|
||||||
|
- lacks tests or ledger updates;
|
||||||
|
- mutates release or plugin publication state.
|
||||||
|
|
||||||
|
## ECC Tools Mapping
|
||||||
|
|
||||||
|
ECC Tools already flags missing RAG/evaluator evidence for retrieval,
|
||||||
|
embedding, ranking, and evaluator changes. This prototype gives those checks a
|
||||||
|
target shape:
|
||||||
|
|
||||||
|
- `scenario.json` maps to analyzer corpus inputs.
|
||||||
|
- `trace.json` maps to golden traces and run telemetry.
|
||||||
|
- `report.json` maps to PR comment summaries and Linear backlog summaries.
|
||||||
|
- `candidate-playbook.md` maps to the suggested follow-up PR body.
|
||||||
|
- `verifier-result.json` maps to pass/fail check-run evidence.
|
||||||
|
|
||||||
|
Future ECC Tools work should consume these artifacts as the fixture shape before it
|
||||||
|
adds hosted retrieval or model-backed judging. The local prototype is enough to
|
||||||
|
prove the contract before any paid API or vector store is introduced.
|
||||||
|
|
||||||
|
## Promotion Rules
|
||||||
|
|
||||||
|
A candidate can be promoted only when:
|
||||||
|
|
||||||
|
- the verifier result is `accepted`;
|
||||||
|
- at least one rejected candidate proves the verifier can say no;
|
||||||
|
- every source PR or reference artifact has attribution;
|
||||||
|
- the proposed action is maintainer-owned and reversible;
|
||||||
|
- validation commands are named;
|
||||||
|
- unresolved translator, release, billing, or publication items remain blocked
|
||||||
|
until separately approved.
|
||||||
|
|
||||||
|
## Next Expansion
|
||||||
|
|
||||||
|
The next evaluator/RAG corpus should add:
|
||||||
|
|
||||||
|
- a CI-failure diagnosis scenario with captured logs and a known fix;
|
||||||
|
- a harness-config quality scenario covering MCP/plugin/hook drift;
|
||||||
|
- a billing-readiness scenario that separates verified Marketplace claims from
|
||||||
|
launch-copy assumptions;
|
||||||
|
- an AgentShield policy exception scenario with SARIF and report evidence.
|
||||||
41
examples/evaluator-rag-prototype/candidate-playbook.md
Normal file
41
examples/evaluator-rag-prototype/candidate-playbook.md
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
# Candidate Playbook: Maintainer-Owned Stale Salvage
|
||||||
|
|
||||||
|
Candidate id: `maintainer-salvage-branch`
|
||||||
|
|
||||||
|
## Use When
|
||||||
|
|
||||||
|
- A stale or conflicted PR was closed to keep the public queue usable.
|
||||||
|
- The closed diff contains a useful focused idea, skill, command, doc, test, or
|
||||||
|
bug fix.
|
||||||
|
- The contributor may not have time or interest to rebase.
|
||||||
|
|
||||||
|
## Steps
|
||||||
|
|
||||||
|
1. Record the source PR, author, useful concept, and closure reason in
|
||||||
|
`docs/stale-pr-salvage-ledger.md`.
|
||||||
|
2. Re-read the closed PR diff against current `main`.
|
||||||
|
3. Decide whether the patch can be cherry-picked safely. Prefer reimplementation
|
||||||
|
when current architecture has moved.
|
||||||
|
4. Create a maintainer-owned branch with one focused salvage unit.
|
||||||
|
5. Preserve attribution in the PR body and, when useful, in the commit body.
|
||||||
|
6. Update the catalog, docs, tests, or release evidence required by the touched
|
||||||
|
surface.
|
||||||
|
7. Run the same validation gates a normal change would require.
|
||||||
|
8. After merge, update the ledger from pending/salvage-branch to landed,
|
||||||
|
already-present, superseded, skipped, or translator/manual review.
|
||||||
|
|
||||||
|
## Reject Conditions
|
||||||
|
|
||||||
|
- The patch is bulk generated churn.
|
||||||
|
- The patch is stale localization that needs translator/manual review.
|
||||||
|
- The patch imports personal paths, secrets, local settings, or private operator context.
|
||||||
|
- The patch bypasses current install, catalog, plugin, or release architecture.
|
||||||
|
- The branch would mix unrelated salvage units into one PR.
|
||||||
|
|
||||||
|
## Minimum Validation
|
||||||
|
|
||||||
|
- Targeted test for the touched surface.
|
||||||
|
- `git diff --check`.
|
||||||
|
- Markdown lint when docs are touched.
|
||||||
|
- Catalog/install validation when skills, agents, commands, or plugin surfaces
|
||||||
|
are touched.
|
||||||
35
examples/evaluator-rag-prototype/report.json
Normal file
35
examples/evaluator-rag-prototype/report.json
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
{
|
||||||
|
"schema_version": "ecc.evaluator-rag.report.v1",
|
||||||
|
"scenario_id": "stale-pr-salvage-maintainer-branch",
|
||||||
|
"run_id": "2026-05-12-cleanup-salvage-prototype",
|
||||||
|
"result": "prototype_passed",
|
||||||
|
"read_only": true,
|
||||||
|
"scores": {
|
||||||
|
"source_attribution": 1,
|
||||||
|
"blast_radius_control": 1,
|
||||||
|
"manual_review_respected": 1,
|
||||||
|
"validation_specificity": 0.8,
|
||||||
|
"publication_safety": 1
|
||||||
|
},
|
||||||
|
"findings": [
|
||||||
|
{
|
||||||
|
"id": "salvage-policy-usable",
|
||||||
|
"severity": "info",
|
||||||
|
"summary": "The stale-salvage ledger and maintainer PR examples provide enough evidence to promote a reusable maintainer-owned salvage playbook."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "translation-tail-blocked",
|
||||||
|
"severity": "warning",
|
||||||
|
"summary": "Localization tails remain useful but must stay translator/manual-review only."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "release-actions-blocked",
|
||||||
|
"severity": "warning",
|
||||||
|
"summary": "Release, npm, plugin, billing, and announcement actions remain outside this evaluator run and require separate approval."
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"recommended_next_action": {
|
||||||
|
"candidate_id": "maintainer-salvage-branch",
|
||||||
|
"action": "Use the promoted playbook for future stale cleanup batches and add additional evaluator/RAG scenarios for CI failure diagnosis, harness-config drift, billing readiness, and AgentShield policy exceptions."
|
||||||
|
}
|
||||||
|
}
|
||||||
56
examples/evaluator-rag-prototype/scenario.json
Normal file
56
examples/evaluator-rag-prototype/scenario.json
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
{
|
||||||
|
"schema_version": "ecc.evaluator-rag.scenario.v1",
|
||||||
|
"scenario_id": "stale-pr-salvage-maintainer-branch",
|
||||||
|
"title": "Recover useful stale PR work through maintainer-owned branches",
|
||||||
|
"mode": "read_only_prototype",
|
||||||
|
"objective": "Given a closed stale PR batch, identify useful work, reject unsafe bulk imports, and promote only a maintainer-owned salvage playbook with attribution and validation.",
|
||||||
|
"sources": [
|
||||||
|
{
|
||||||
|
"kind": "repo_doc",
|
||||||
|
"path": "docs/stale-pr-salvage-ledger.md",
|
||||||
|
"purpose": "Durable source-to-disposition mapping for stale PR cleanup"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind": "repo_doc",
|
||||||
|
"path": "docs/legacy-artifact-inventory.md",
|
||||||
|
"purpose": "Import guardrails for legacy and private-context material"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind": "roadmap",
|
||||||
|
"path": "docs/ECC-2.0-GA-ROADMAP.md",
|
||||||
|
"purpose": "Operating rule and current execution lane"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind": "github_pr",
|
||||||
|
"url": "https://github.com/affaan-m/everything-claude-code/pull/1815",
|
||||||
|
"purpose": "Example maintainer-owned stale salvage PR with attribution"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"kind": "github_pr",
|
||||||
|
"url": "https://github.com/affaan-m/everything-claude-code/pull/1818",
|
||||||
|
"purpose": "Example gap pass classifying already-present and skipped stale work"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"retrieval_questions": [
|
||||||
|
"Which closed PRs contain useful work that is not already present?",
|
||||||
|
"Which files or concepts are unsafe to cherry-pick without manual review?",
|
||||||
|
"Which current docs, skills, commands, or tests are the correct integration points?",
|
||||||
|
"Which validation gates are required before the salvage work can merge?"
|
||||||
|
],
|
||||||
|
"forbidden_actions": [
|
||||||
|
"closing, reopening, or commenting on PRs",
|
||||||
|
"merging PRs",
|
||||||
|
"creating release tags",
|
||||||
|
"publishing packages or plugins",
|
||||||
|
"copying private paths, secrets, or raw personal context",
|
||||||
|
"blindly cherry-picking bulk localization"
|
||||||
|
],
|
||||||
|
"acceptance_gates": [
|
||||||
|
"source attribution is preserved",
|
||||||
|
"salvage ledger or equivalent tracker is updated",
|
||||||
|
"translation/manual-review tails remain blocked",
|
||||||
|
"candidate action is reversible and maintainer-owned",
|
||||||
|
"validation commands are named",
|
||||||
|
"at least one unsafe candidate is rejected"
|
||||||
|
]
|
||||||
|
}
|
||||||
46
examples/evaluator-rag-prototype/trace.json
Normal file
46
examples/evaluator-rag-prototype/trace.json
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
{
|
||||||
|
"schema_version": "ecc.evaluator-rag.trace.v1",
|
||||||
|
"scenario_id": "stale-pr-salvage-maintainer-branch",
|
||||||
|
"run_id": "2026-05-12-cleanup-salvage-prototype",
|
||||||
|
"read_only": true,
|
||||||
|
"events": [
|
||||||
|
{
|
||||||
|
"phase": "observation",
|
||||||
|
"summary": "Public PR, issue, and discussion queues are clear; release publication remains approval-gated; stale-salvage ledger has landed, skipped, superseded, and manual-review states.",
|
||||||
|
"evidence": [
|
||||||
|
"docs/ECC-2.0-GA-ROADMAP.md",
|
||||||
|
"docs/stale-pr-salvage-ledger.md"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"phase": "retrieval",
|
||||||
|
"summary": "Retrieved stale PR source mappings, existing maintainer salvage examples, legacy import rules, and manual-review localization tails.",
|
||||||
|
"evidence": [
|
||||||
|
"docs/stale-pr-salvage-ledger.md",
|
||||||
|
"docs/legacy-artifact-inventory.md",
|
||||||
|
"https://github.com/affaan-m/everything-claude-code/pull/1815",
|
||||||
|
"https://github.com/affaan-m/everything-claude-code/pull/1818"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"phase": "proposal",
|
||||||
|
"summary": "Generated two candidate playbooks: maintainer-owned salvage branch with attribution, and blind cherry-pick of stale translations.",
|
||||||
|
"candidate_ids": [
|
||||||
|
"maintainer-salvage-branch",
|
||||||
|
"blind-cherry-pick-translations"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"phase": "verification",
|
||||||
|
"summary": "Accepted the maintainer-owned salvage branch and rejected blind translation cherry-picking because it violates manual-review and attribution gates.",
|
||||||
|
"evidence": [
|
||||||
|
"examples/evaluator-rag-prototype/verifier-result.json"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"phase": "promotion",
|
||||||
|
"summary": "Promoted only the maintainer-owned salvage branch playbook as a reusable process. No repository, GitHub, release, billing, or plugin publication action is performed by this prototype.",
|
||||||
|
"promoted_candidate_id": "maintainer-salvage-branch"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
35
examples/evaluator-rag-prototype/verifier-result.json
Normal file
35
examples/evaluator-rag-prototype/verifier-result.json
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
{
|
||||||
|
"schema_version": "ecc.evaluator-rag.verifier.v1",
|
||||||
|
"scenario_id": "stale-pr-salvage-maintainer-branch",
|
||||||
|
"run_id": "2026-05-12-cleanup-salvage-prototype",
|
||||||
|
"read_only": true,
|
||||||
|
"candidates": [
|
||||||
|
{
|
||||||
|
"candidate_id": "maintainer-salvage-branch",
|
||||||
|
"decision": "accepted",
|
||||||
|
"score": 0.94,
|
||||||
|
"reasons": [
|
||||||
|
"preserves source PR attribution",
|
||||||
|
"keeps work on a fresh maintainer-owned branch",
|
||||||
|
"updates the salvage ledger",
|
||||||
|
"names validation gates",
|
||||||
|
"does not perform release or publication actions"
|
||||||
|
],
|
||||||
|
"rollback": "Close the maintainer PR or revert its merge commit; source PR state remains unchanged."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"candidate_id": "blind-cherry-pick-translations",
|
||||||
|
"decision": "rejected",
|
||||||
|
"score": 0.21,
|
||||||
|
"reasons": [
|
||||||
|
"bulk localization requires translator/manual review",
|
||||||
|
"does not preserve enough source attribution",
|
||||||
|
"could import stale generated docs",
|
||||||
|
"does not name validation gates",
|
||||||
|
"risks bypassing current catalog and install architecture"
|
||||||
|
],
|
||||||
|
"rollback": "Do not create this branch; keep the localization tail in translator/manual-review state."
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"promoted_candidate_id": "maintainer-salvage-branch"
|
||||||
|
}
|
||||||
142
tests/docs/evaluator-rag-prototype.test.js
Normal file
142
tests/docs/evaluator-rag-prototype.test.js
Normal file
@ -0,0 +1,142 @@
|
|||||||
|
'use strict';
|
||||||
|
|
||||||
|
const assert = require('assert');
|
||||||
|
const fs = require('fs');
|
||||||
|
const path = require('path');
|
||||||
|
|
||||||
|
const repoRoot = path.resolve(__dirname, '..', '..');
|
||||||
|
const fixtureRoot = path.join(repoRoot, 'examples', 'evaluator-rag-prototype');
|
||||||
|
|
||||||
|
let passed = 0;
|
||||||
|
let failed = 0;
|
||||||
|
|
||||||
|
/**
 * Runs a single named check and records its outcome.
 *
 * The check passes when `fn` returns without throwing. A thrown error marks the
 * check as failed and its message is printed under the check name. The
 * module-level `passed` and `failed` counters are incremented accordingly.
 *
 * @param {string} name - Label printed beside the pass/fail marker.
 * @param {Function} fn - Check body, invoked synchronously with no arguments.
 */
function test(name, fn) {
  try {
    fn();
  } catch (err) {
    // Report the failure and keep going so later checks still run.
    console.log(` ✗ ${name}`);
    console.log(` Error: ${err.message}`);
    failed += 1;
    return;
  }

  console.log(` ✓ ${name}`);
  passed += 1;
}
|
||||||
|
|
||||||
|
/**
 * Loads a repository file as UTF-8 text.
 *
 * @param {string} relativePath - Path of the file relative to `repoRoot`.
 * @returns {string} The file contents.
 */
function read(relativePath) {
  const absolutePath = path.join(repoRoot, relativePath);
  return fs.readFileSync(absolutePath, 'utf8');
}
|
||||||
|
|
||||||
|
/**
 * Loads and parses one JSON fixture from the evaluator RAG prototype directory.
 *
 * Parse failures are rethrown with the fixture name prefixed to the message.
 * The test harness in this file prints only `error.message`, and the bare
 * `JSON.parse` text does not say which fixture is malformed.
 *
 * @param {string} fileName - Fixture file name relative to `fixtureRoot`.
 * @returns {*} The parsed JSON value.
 * @throws {SyntaxError} When the file content is not valid JSON.
 */
function readJson(fileName) {
  const raw = fs.readFileSync(path.join(fixtureRoot, fileName), 'utf8');

  try {
    return JSON.parse(raw);
  } catch (error) {
    // Reuse the original error object so its type and stack are preserved;
    // only the message is enriched with the fixture name.
    error.message = `Invalid JSON in fixture ${fileName}: ${error.message}`;
    throw error;
  }
}
|
||||||
|
|
||||||
|
console.log('\n=== Testing evaluator RAG prototype ===\n');
|
||||||
|
|
||||||
|
// The architecture doc must name every artifact in the contract table and every
// reference project it lists.
test('architecture doc records the artifact contract and reference pressure', () => {
  const doc = read('docs/architecture/evaluator-rag-prototype.md');

  const requiredPhrases = [
    'Scenario spec',
    'Trace',
    'Report',
    'Candidate playbook',
    'Verifier result',
    'Meta-Harness',
    'Autocontext',
    'Claude HUD',
    'Hermes Agent',
    'dmux, Orca, Superset, and Ghast',
    'ECC Tools'
  ];

  requiredPhrases.forEach(phrase => {
    assert.ok(doc.includes(phrase), `Missing doc requirement: ${phrase}`);
  });
});
|
||||||
|
|
||||||
|
// Each fixture must carry its own schema version, and every run artifact must
// point back at the scenario and declare itself read-only.
test('fixtures use one scenario id and declare read-only behavior', () => {
  const expectedSchemas = [
    ['scenario.json', 'ecc.evaluator-rag.scenario.v1'],
    ['trace.json', 'ecc.evaluator-rag.trace.v1'],
    ['report.json', 'ecc.evaluator-rag.report.v1'],
    ['verifier-result.json', 'ecc.evaluator-rag.verifier.v1']
  ];

  // Load all four fixtures before asserting anything about them.
  const loaded = expectedSchemas.map(([fileName]) => readJson(fileName));

  loaded.forEach((fixture, index) => {
    assert.strictEqual(fixture.schema_version, expectedSchemas[index][1]);
  });

  const [scenario, ...runArtifacts] = loaded;
  runArtifacts.forEach(artifact => {
    assert.strictEqual(artifact.scenario_id, scenario.scenario_id);
    assert.strictEqual(artifact.read_only, true);
  });
});
|
||||||
|
|
||||||
|
// The trace fixture must contain an event for each loop phase and must record
// which candidate was promoted.
test('trace covers the full self-improving harness loop', () => {
  const trace = readJson('trace.json');
  const phases = trace.events.map(event => event.phase);

  for (const phase of ['observation', 'retrieval', 'proposal', 'verification', 'promotion']) {
    assert.ok(phases.includes(phase), `Missing trace phase ${phase}`);
  }

  // Explicit message so a failure names the missing promotion record, matching
  // the other assertions in this file.
  assert.ok(
    trace.events.some(event => event.promoted_candidate_id === 'maintainer-salvage-branch'),
    'Missing trace event promoting maintainer-salvage-branch'
  );
});
|
||||||
|
|
||||||
|
// The scenario spec must list every write, release, and privacy-sensitive
// action that the read-only evaluator pass is not allowed to take.
test('scenario blocks unsafe write actions and release actions', () => {
  const { forbidden_actions: forbiddenActions } = readJson('scenario.json');
  const forbiddenText = forbiddenActions.join('\n');

  const mustBeBlocked = [
    'closing, reopening, or commenting on PRs',
    'merging PRs',
    'creating release tags',
    'publishing packages or plugins',
    'copying private paths, secrets, or raw personal context',
    'blindly cherry-picking bulk localization'
  ];

  mustBeBlocked.forEach(action => {
    assert.ok(forbiddenText.includes(action), `Missing forbidden action: ${action}`);
  });
});
|
||||||
|
|
||||||
|
// The verifier fixture must accept the maintainer salvage candidate, reject the
// blind translation cherry-pick, and promote only the accepted candidate.
test('verifier accepts maintainer salvage and rejects blind translation imports', () => {
  const verifier = readJson('verifier-result.json');
  const accepted = verifier.candidates.find(candidate => candidate.candidate_id === 'maintainer-salvage-branch');
  const rejected = verifier.candidates.find(candidate => candidate.candidate_id === 'blind-cherry-pick-translations');

  assert.ok(accepted, 'Missing accepted maintainer salvage candidate');
  assert.ok(rejected, 'Missing rejected blind cherry-pick candidate');
  assert.strictEqual(accepted.decision, 'accepted', 'Maintainer salvage candidate must be accepted');
  assert.strictEqual(rejected.decision, 'rejected', 'Blind cherry-pick candidate must be rejected');
  assert.strictEqual(
    verifier.promoted_candidate_id,
    accepted.candidate_id,
    'Promoted candidate must be the accepted maintainer salvage candidate'
  );

  // Explicit messages so a failure states which rule was violated, consistent
  // with the other assertions in this file.
  assert.ok(
    accepted.score > rejected.score,
    `Accepted score ${accepted.score} must exceed rejected score ${rejected.score}`
  );
  assert.ok(
    rejected.reasons.join('\n').includes('translator/manual review'),
    'Rejected candidate must cite translator/manual review'
  );
});
|
||||||
|
|
||||||
|
// The candidate playbook must keep the ledger path, attribution, review, and
// validation phrases that the stale-salvage process depends on.
test('candidate playbook preserves stale-salvage operating rules', () => {
  const playbookText = read('examples/evaluator-rag-prototype/candidate-playbook.md');

  const requiredRules = [
    'docs/stale-pr-salvage-ledger.md',
    'source PR',
    'maintainer-owned branch',
    'Preserve attribution',
    'translator/manual review',
    'private operator context',
    'git diff --check'
  ];

  requiredRules.forEach(rule => {
    assert.ok(playbookText.includes(rule), `Missing playbook rule: ${rule}`);
  });
});
|
||||||
|
|
||||||
|
// The roadmap must link both prototype locations and still list the broader
// evaluator corpus as open work.
test('roadmap points to the evaluator RAG prototype and keeps broader corpus work open', () => {
  const roadmap = read('docs/ECC-2.0-GA-ROADMAP.md');

  // Each check carries a message naming the missing text, consistent with the
  // other assertions in this file.
  for (const required of [
    'docs/architecture/evaluator-rag-prototype.md',
    'examples/evaluator-rag-prototype/',
    'Needs broader evaluator corpus'
  ]) {
    assert.ok(roadmap.includes(required), `Missing roadmap reference: ${required}`);
  }
});
|
||||||
|
|
||||||
|
// Final summary: print the pass count on a clean run, otherwise print the
// failure count and exit with status 1.
if (failed === 0) {
  console.log(`\nPassed: ${passed}`);
} else {
  console.log(`\nFailed: ${failed}`);
  process.exit(1);
}
|
||||||
Loading…
x
Reference in New Issue
Block a user