docs: add evaluator billing readiness scenario (#1825)

This commit is contained in:
Affaan Mustafa 2026-05-12 17:24:34 -04:00 committed by GitHub
parent dcf5668b27
commit 863519eecf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 279 additions and 10 deletions

View File

@ -199,7 +199,7 @@ is not complete unless the evidence column exists and has been freshly verified.
| AgentShield enterprise iteration | Policy gates, SARIF, packs, provenance, corpus, HTML reports, exception lifecycle audit | PRs #53, #55-#62 landed with test evidence | Needs PDF/export decision or next enterprise signal | | AgentShield enterprise iteration | Policy gates, SARIF, packs, provenance, corpus, HTML reports, exception lifecycle audit | PRs #53, #55-#62 landed with test evidence | Needs PDF/export decision or next enterprise signal |
| ECC Tools next-level app | Billing audit, PR checks, deep analyzer, sync backlog | PRs #26-#39 landed with test evidence | Needs capacity-backed Linear rollout / broader evaluator corpus | | ECC Tools next-level app | Billing audit, PR checks, deep analyzer, sync backlog | PRs #26-#39 landed with test evidence | Needs capacity-backed Linear rollout / broader evaluator corpus |
| GitGuardian/Dependabot/CodeRabbit-style checks | Non-blocking taxonomy and deterministic follow-up checks | ECC-Tools risk taxonomy check plus follow-up signals landed, including Skill Quality, Deep Analyzer Evidence, Analyzer Corpus Evidence, RAG/Evaluator Evidence, and PR Review/Salvage Evidence | Partially complete | | GitGuardian/Dependabot/CodeRabbit-style checks | Non-blocking taxonomy and deterministic follow-up checks | ECC-Tools risk taxonomy check plus follow-up signals landed, including Skill Quality, Deep Analyzer Evidence, Analyzer Corpus Evidence, RAG/Evaluator Evidence, and PR Review/Salvage Evidence | Partially complete |
| Harness-agnostic learning system | Audit, adapter matrix, observability, traces, promotion loop | Audit/adapters/observability gates plus `docs/architecture/evaluator-rag-prototype.md` and `examples/evaluator-rag-prototype/` define the first read-only scenario, trace, report, playbook, and verifier result | Needs broader evaluator corpus | | Harness-agnostic learning system | Audit, adapter matrix, observability, traces, promotion loop | Audit/adapters/observability gates plus `docs/architecture/evaluator-rag-prototype.md` and `examples/evaluator-rag-prototype/` define read-only stale-salvage and billing-readiness scenarios with trace, report, playbook, and verifier result artifacts | Needs broader evaluator corpus |
| Linear roadmap is detailed | Linear project status plus repo mirror | Repo mirror exists; issue creation was retried on 2026-05-12 and remains blocked by the workspace free issue limit | Needs recurring status updates after each merge batch | | Linear roadmap is detailed | Linear project status plus repo mirror | Repo mirror exists; issue creation was retried on 2026-05-12 and remains blocked by the workspace free issue limit | Needs recurring status updates after each merge batch |
| Flow separation and progress tracking | Flow lanes with owner artifacts and update cadence | This roadmap defines lanes below | Active | | Flow separation and progress tracking | Flow lanes with owner artifacts and update cadence | This roadmap defines lanes below | Active |
| Realtime Linear sync | Project updates while issue limit is blocked; issues later | ECC-Tools #39 implements opt-in Linear API sync for deferred follow-up backlog items | Needs workspace capacity/config rollout | | Realtime Linear sync | Project updates while issue limit is blocked; issues later | ECC-Tools #39 implements opt-in Linear API sync for deferred follow-up backlog items | Needs workspace capacity/config rollout |
@ -218,7 +218,7 @@ back to the repo evidence and merge commits.
| Queue hygiene and salvage | GitHub PR/issue state, salvage ledger | Append ledger entries for any future stale closures | Every cleanup batch | | Queue hygiene and salvage | GitHub PR/issue state, salvage ledger | Append ledger entries for any future stale closures | Every cleanup batch |
| Release and publication | rc.1 release docs, publication readiness doc | Naming matrix and plugin submission/contact checklist | Before any tag | | Release and publication | rc.1 release docs, publication readiness doc | Naming matrix and plugin submission/contact checklist | Before any tag |
| Harness OS core | Audit, adapter matrix, observability docs, `ecc2/` | HUD/session-control acceptance spec | Weekly until GA | | Harness OS core | Audit, adapter matrix, observability docs, `ecc2/` | HUD/session-control acceptance spec | Weekly until GA |
| Evaluation and RAG | Reference-set validation, harness audit, traces | Read-only evaluator/RAG prototype plus fixture contract | Expand to CI, billing, harness-config, and AgentShield scenarios | | Evaluation and RAG | Reference-set validation, harness audit, traces | Read-only evaluator/RAG prototype plus stale-salvage and billing-readiness fixtures | Expand to CI, harness-config, and AgentShield scenarios |
| AgentShield enterprise | AgentShield PR evidence and roadmap notes | PDF-export decision or next enterprise signal | After value decision | | AgentShield enterprise | AgentShield PR evidence and roadmap notes | PDF-export decision or next enterprise signal | After value decision |
| ECC Tools app | ECC-Tools PR evidence, billing audit, risk taxonomy | Capacity-backed Linear rollout or broader evaluator/RAG corpus slice | Next implementation batch | | ECC Tools app | ECC-Tools PR evidence, billing audit, risk taxonomy | Capacity-backed Linear rollout or broader evaluator/RAG corpus slice | Next implementation batch |
| Linear progress | Linear project status updates and this mirror | Status update with queue/evidence/missing gates | Every significant merge batch | | Linear progress | Linear project status updates and this mirror | Status update with queue/evidence/missing gates | Every significant merge batch |
@ -356,6 +356,11 @@ Acceptance:
Manifest Integrity, CI/CD Recommendation, Cost/Token Risk, Reference Set Manifest Integrity, CI/CD Recommendation, Cost/Token Risk, Reference Set
Validation, Deep Analyzer Evidence, RAG/Evaluator Evidence, Validation, Deep Analyzer Evidence, RAG/Evaluator Evidence,
PR Review/Salvage Evidence, Skill Quality, and Agent Config Review. PR Review/Salvage Evidence, Skill Quality, and Agent Config Review.
- Evaluator/RAG billing readiness fixture
`examples/evaluator-rag-prototype/billing-marketplace-readiness/` records the
read-only claim-verification path for Marketplace, App, subscription, seat,
entitlement, and plan language before launch copy can treat those claims as
live.
- Cost/token-risk predictive follow-ups flag AI routing, model-call, usage, - Cost/token-risk predictive follow-ups flag AI routing, model-call, usage,
quota, and budget changes when budget evidence is missing. quota, and budget changes when budget evidence is missing.
- Reference-set validation follow-ups flag analyzer, skill, agent, command, and - Reference-set validation follow-ups flag analyzer, skill, agent, command, and
@ -412,6 +417,6 @@ Acceptance:
executive report, corpus benchmark output, and exception lifecycle audit. executive report, corpus benchmark output, and exception lifecycle audit.
2. Enable/configure the merged Linear backlog sync path after workspace issue 2. Enable/configure the merged Linear backlog sync path after workspace issue
capacity clears or the Linear workspace is upgraded. capacity clears or the Linear workspace is upgraded.
3. Expand the evaluator/RAG corpus beyond the first stale-salvage prototype to
CI failure diagnosis, harness-config drift, billing readiness, and
AgentShield policy exception scenarios.
3. Expand the evaluator/RAG corpus beyond the stale-salvage and billing
prototypes to CI failure diagnosis, harness-config drift, and AgentShield
policy exception scenarios.

View File

@ -7,9 +7,10 @@ that loop.
The fixture set lives in The fixture set lives in
[`examples/evaluator-rag-prototype/`](../../examples/evaluator-rag-prototype/). [`examples/evaluator-rag-prototype/`](../../examples/evaluator-rag-prototype/).
It uses the May 2026 stale-PR cleanup and salvage lane as the first concrete
scenario because that lane has real inputs, real accepted work, and real
rejected work.
It started with the May 2026 stale-PR cleanup and salvage lane because that
lane has real inputs, real accepted work, and real rejected work. The corpus now
also includes a billing/Marketplace readiness scenario so launch copy cannot
treat dry-run release evidence or roadmap intent as live billing state.
## Reference Pressure ## Reference Pressure
@ -83,6 +84,19 @@ The verifier rejects a blind cherry-pick proposal that:
- lacks tests or ledger updates; - lacks tests or ledger updates;
- mutates release or plugin publication state. - mutates release or plugin publication state.
## Corpus Fixtures
The root fixture files preserve the original
`stale-pr-salvage-maintainer-branch` prototype. Additional scenarios can live in
subdirectories when they reuse the same five-artifact contract.
Current corpus:
- `stale-pr-salvage-maintainer-branch`: recovers useful closed PR work through
maintainer-owned branches with attribution and validation.
- `billing-marketplace-readiness`: verifies billing, App, and Marketplace
launch claims before public copy says they are live.
## ECC Tools Mapping ## ECC Tools Mapping
ECC Tools already flags missing RAG/evaluator evidence for retrieval, ECC Tools already flags missing RAG/evaluator evidence for retrieval,
@ -117,6 +131,4 @@ The next evaluator/RAG corpus should add:
- a CI-failure diagnosis scenario with captured logs and a known fix; - a CI-failure diagnosis scenario with captured logs and a known fix;
- a harness-config quality scenario covering MCP/plugin/hook drift; - a harness-config quality scenario covering MCP/plugin/hook drift;
- a billing-readiness scenario that separates verified Marketplace claims from
launch-copy assumptions;
- an AgentShield policy exception scenario with SARIF and report evidence. - an AgentShield policy exception scenario with SARIF and report evidence.

View File

@ -0,0 +1,41 @@
# Billing Marketplace Readiness Playbook
Use this playbook when release copy or roadmap text mentions ECC Tools
billing, Marketplace availability, account recovery, plans, seats,
entitlements, or subscription state.
## Accepted Path
1. Start from `docs/releases/2.0.0-rc.1/publication-readiness.md`.
2. Check the current repo and public listing surfaces:
- `gh api repos/ECC-Tools/ECC-Tools`
- `https://github.com/marketplace/ecc-tools`
3. Classify every billing or Marketplace claim as:
- `verified`
- `blocked`
- `remove-before-publication`
4. Keep roadmap acceptance criteria separate from live product claims.
5. Update release copy only after the evidence points to a live URL or command
result.
6. Leave tag creation, npm publish, plugin submission, marketplace edits,
subscription changes, and announcement posting approval-gated.
## Rejected Path
Do not say billing is live because a roadmap item exists, a dry run passed, or a
Marketplace URL is known. Roadmap intent and dry-run publication evidence do
not establish live billing state.
Do not edit plan limits, subscriptions, seats, entitlements, or Marketplace
metadata from the evaluator run. Those are product/operator actions and require
their own approval path.
## Validation Gates
- `rg -n -i "billing|marketplace|subscription|seat|entitlement|plan" README.md docs/releases/2.0.0-rc.1 docs/ECC-2.0-GA-ROADMAP.md`
- `gh api repos/ECC-Tools/ECC-Tools`
- Manual live check of `https://github.com/marketplace/ecc-tools`
- `npx --yes markdownlint-cli docs/releases/2.0.0-rc.1/*.md docs/ECC-2.0-GA-ROADMAP.md`
- `git diff --check`
Record the evidence in a maintainer-owned PR before release copy is published.

View File

@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.report.v1",
"scenario_id": "billing-marketplace-readiness",
"run_id": "2026-05-12-billing-marketplace-readiness-prototype",
"result": "prototype_passed",
"read_only": true,
"scores": {
"claim_evidence": 0.82,
"publication_safety": 1,
"marketplace_specificity": 0.84,
"billing_scope_control": 1,
"announcement_safety": 1
},
"findings": [
{
"id": "billing-claim-gate-needed",
"severity": "warning",
"summary": "Release docs require a fresh ECC Tools billing/App/Marketplace check before launch copy can claim live billing readiness."
},
{
"id": "dry-run-not-live-state",
"severity": "warning",
"summary": "May 12 evidence proves package/plugin dry runs and clean install smoke, but it does not prove a live Marketplace billing state."
},
{
"id": "safe-next-action",
"severity": "info",
"summary": "The reusable next action is a read-only evidence checklist that classifies each launch-copy billing claim before publication."
}
],
"recommended_next_action": {
"candidate_id": "evidence-backed-billing-check",
"action": "Run the promoted billing/Marketplace claim-verification checklist before any launch copy, GitHub release text, or social copy says billing is live."
}
}

View File

@ -0,0 +1,55 @@
{
"schema_version": "ecc.evaluator-rag.scenario.v1",
"scenario_id": "billing-marketplace-readiness",
"title": "Verify billing and Marketplace claims before launch copy",
"mode": "read_only_prototype",
"objective": "Given rc.1 release docs and ECC Tools billing roadmap evidence, separate verified Marketplace/App/billing state from assumptions before any announcement or publication action.",
"sources": [
{
"kind": "repo_doc",
"path": "docs/releases/2.0.0-rc.1/publication-readiness.md",
"purpose": "Release gate that blocks billing and Marketplace claims until fresh evidence exists"
},
{
"kind": "repo_doc",
"path": "docs/releases/2.0.0-rc.1/publication-evidence-2026-05-12.md",
"purpose": "Dry-run publication evidence and explicit remaining blocker list"
},
{
"kind": "roadmap",
"path": "docs/ECC-2.0-GA-ROADMAP.md",
"purpose": "ECC Tools billing audit acceptance criteria and remaining release blockers"
},
{
"kind": "github_api",
"command": "gh api repos/ECC-Tools/ECC-Tools",
"purpose": "Fresh repository access and app-surface evidence before launch claims"
},
{
"kind": "public_url",
"url": "https://github.com/marketplace/ecc-tools",
"purpose": "Marketplace listing that must be checked live before copy says billing is ready"
}
],
"retrieval_questions": [
"Which billing or Marketplace claims are already backed by repo evidence?",
"Which claims still need a live Marketplace, App, subscription, plan, or entitlement check?",
"Which announcement docs mention billing or Marketplace status?",
"Which publication actions remain approval-gated and must not run during this evaluator pass?"
],
"forbidden_actions": [
"creating or editing GitHub Marketplace listings",
"changing plan limits, subscriptions, seats, or entitlements",
"creating release tags",
"publishing packages or plugins",
"posting announcement copy",
"claiming live billing readiness from dry-run evidence alone"
],
"acceptance_gates": [
"launch-copy claims are classified as verified, blocked, or remove-before-publication",
"Marketplace and App checks name the exact URL or command needed",
"billing claims link to fresh evidence rather than roadmap intent",
"publication actions remain approval-gated",
"at least one overclaim candidate is rejected"
]
}

View File

@ -0,0 +1,45 @@
{
"schema_version": "ecc.evaluator-rag.trace.v1",
"scenario_id": "billing-marketplace-readiness",
"run_id": "2026-05-12-billing-marketplace-readiness-prototype",
"read_only": true,
"events": [
{
"phase": "observation",
"summary": "Publication readiness still marks ECC Tools billing references and announcement copy as pending. Dry-run publication evidence says billing/App/Marketplace claims must be verified before launch copy uses them.",
"evidence": [
"docs/releases/2.0.0-rc.1/publication-readiness.md",
"docs/releases/2.0.0-rc.1/publication-evidence-2026-05-12.md"
]
},
{
"phase": "retrieval",
"summary": "Retrieved the release gate, dry-run evidence, roadmap billing acceptance criteria, and the public Marketplace URL that requires a live operator check.",
"evidence": [
"docs/ECC-2.0-GA-ROADMAP.md",
"gh api repos/ECC-Tools/ECC-Tools",
"https://github.com/marketplace/ecc-tools"
]
},
{
"phase": "proposal",
"summary": "Generated two candidate playbooks: evidence-backed billing claim verification, and announcement-first billing copy that treats roadmap intent as live billing readiness.",
"candidate_ids": [
"evidence-backed-billing-check",
"announcement-first-billing-copy"
]
},
{
"phase": "verification",
"summary": "Accepted the evidence-backed check and rejected announcement-first copy because billing and Marketplace surfaces remain pending until verified by fresh URLs or API output.",
"evidence": [
"examples/evaluator-rag-prototype/billing-marketplace-readiness/verifier-result.json"
]
},
{
"phase": "promotion",
"summary": "Promoted only the read-only verification playbook. No Marketplace edits, subscription changes, tags, package publishes, plugin submission, or announcement posts are performed.",
"promoted_candidate_id": "evidence-backed-billing-check"
}
]
}

View File

@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.verifier.v1",
"scenario_id": "billing-marketplace-readiness",
"run_id": "2026-05-12-billing-marketplace-readiness-prototype",
"read_only": true,
"candidates": [
{
"candidate_id": "evidence-backed-billing-check",
"decision": "accepted",
"score": 0.91,
"reasons": [
"keeps the run read-only",
"requires fresh Marketplace or GitHub API evidence",
"classifies launch-copy claims before publication",
"separates roadmap intent from live billing state",
"keeps release, package, plugin, billing, and announcement actions approval-gated"
],
"rollback": "Remove or revert any release-copy edits that cite unverified billing claims; no live billing state is changed by this playbook."
},
{
"candidate_id": "announcement-first-billing-copy",
"decision": "rejected",
"score": 0.18,
"reasons": [
"treats roadmap acceptance criteria as live billing evidence",
"does not require a fresh Marketplace listing check",
"could publish announcement copy before release URLs exist",
"does not classify unsupported claims for removal",
"risks implying subscription or entitlement readiness without proof"
],
"rollback": "Do not publish this copy; keep billing and Marketplace language blocked until the evidence checklist passes."
}
],
"promoted_candidate_id": "evidence-backed-billing-check"
}

View File

@ -30,6 +30,10 @@ function readJson(fileName) {
return JSON.parse(fs.readFileSync(path.join(fixtureRoot, fileName), 'utf8')); return JSON.parse(fs.readFileSync(path.join(fixtureRoot, fileName), 'utf8'));
} }
/**
 * Load a JSON fixture located under `fixtureRoot` and return its parsed value.
 *
 * @param {string} relativePath - Path relative to `fixtureRoot`; may include
 *   subdirectories (e.g. `billing-marketplace-readiness/scenario.json`).
 * @returns {*} The value produced by `JSON.parse` for the file contents.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
function readFixtureJson(relativePath) {
  const fixturePath = path.join(fixtureRoot, relativePath);
  const rawText = fs.readFileSync(fixturePath, 'utf8');
  return JSON.parse(rawText);
}
console.log('\n=== Testing evaluator RAG prototype ===\n'); console.log('\n=== Testing evaluator RAG prototype ===\n');
test('architecture doc records the artifact contract and reference pressure', () => { test('architecture doc records the artifact contract and reference pressure', () => {
@ -134,6 +138,43 @@ test('roadmap points to the evaluator RAG prototype and keeps broader corpus wor
assert.ok(roadmap.includes('Needs broader evaluator corpus')); assert.ok(roadmap.includes('Needs broader evaluator corpus'));
}); });
// Billing/Marketplace readiness fixture: the four JSON artifacts must agree on
// the scenario id, stay read-only, forbid the billing-related actions, and the
// verifier must promote the evidence-backed candidate while rejecting the
// announcement-first overclaim.
test('billing readiness scenario rejects launch copy overclaims', () => {
  const scenario = readFixtureJson('billing-marketplace-readiness/scenario.json');
  const trace = readFixtureJson('billing-marketplace-readiness/trace.json');
  const report = readFixtureJson('billing-marketplace-readiness/report.json');
  const verifier = readFixtureJson('billing-marketplace-readiness/verifier-result.json');
  const playbook = read('examples/evaluator-rag-prototype/billing-marketplace-readiness/candidate-playbook.md');

  // Every downstream artifact must point back at the same scenario.
  const downstreamArtifacts = [trace, report, verifier];
  assert.strictEqual(scenario.scenario_id, 'billing-marketplace-readiness');
  for (const artifact of downstreamArtifacts) {
    assert.strictEqual(artifact.scenario_id, scenario.scenario_id);
  }
  for (const artifact of downstreamArtifacts) {
    assert.strictEqual(artifact.read_only, true);
  }

  // The scenario must explicitly forbid each of these actions.
  const requiredForbiddenActions = [
    'creating or editing GitHub Marketplace listings',
    'changing plan limits, subscriptions, seats, or entitlements',
    'posting announcement copy',
    'claiming live billing readiness from dry-run evidence alone'
  ];
  requiredForbiddenActions.forEach(blocked => {
    assert.ok(scenario.forbidden_actions.includes(blocked), `Missing billing forbidden action: ${blocked}`);
  });

  // Look up a verifier candidate by its id; yields undefined when absent.
  const findCandidate = id => verifier.candidates.find(({ candidate_id }) => candidate_id === id);
  const accepted = findCandidate('evidence-backed-billing-check');
  const rejected = findCandidate('announcement-first-billing-copy');

  assert.ok(accepted, 'Missing accepted billing evidence candidate');
  assert.ok(rejected, 'Missing rejected announcement-overclaim candidate');
  assert.strictEqual(accepted.decision, 'accepted');
  assert.strictEqual(rejected.decision, 'rejected');
  assert.strictEqual(verifier.promoted_candidate_id, accepted.candidate_id);

  // The rejection must cite the roadmap-as-evidence overclaim.
  const rejectionReasons = rejected.reasons.join('\n');
  assert.ok(rejectionReasons.includes('roadmap acceptance criteria'));

  // The playbook must carry the removal classification and the listing URL.
  assert.ok(playbook.includes('remove-before-publication'));
  assert.ok(playbook.includes('https://github.com/marketplace/ecc-tools'));
});
if (failed > 0) { if (failed > 0) {
console.log(`\nFailed: ${failed}`); console.log(`\nFailed: ${failed}`);
process.exit(1); process.exit(1);