mirror of
https://github.com/affaan-m/everything-claude-code.git
synced 2026-06-16 16:36:53 +08:00
Add structured 5-axis self-evaluation framework for agent output quality: - Accuracy, Completeness, Clarity, Actionability, Conciseness - Evidence-based scoring with concrete improvement suggestions - Standalone Python evaluator script with keyword heuristics - Detailed scoring anchors reference guide - High-score and low-score annotated examples - Reusable evaluation report template - Optional hook integration for session-stop evaluation Agent persona (agent-evaluator) provides a dedicated subagent for applying the rubric to agent output with tool-backed verification. All files tested: Python script runs, examples score correctly (high 4.2, low 3.4), frontmatter parses clean, 183 lines (under 500).
372 lines
13 KiB
Python
Executable File
372 lines
13 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Standalone agent output evaluator using the 5-axis rubric.
|
|
|
|
Reads a task description and agent output from stdin or files,
|
|
scores each axis, and prints a structured evaluation report.
|
|
|
|
Usage:
|
|
# Pipe output directly
|
|
echo "Task: Add retry logic" | evaluate.py --output response.txt
|
|
|
|
# From files
|
|
evaluate.py --task task.txt --output response.txt
|
|
|
|
# Interactive (reads task from prompt, output from stdin)
|
|
evaluate.py --interactive
|
|
|
|
The evaluator uses keyword heuristics + structural checks as a first pass.
|
|
For production use, pair with an LLM judge for semantic understanding.
|
|
"""
|
|
|
|
import argparse
|
|
import re
|
|
import sys
|
|
from dataclasses import dataclass, field
|
|
from typing import Optional
|
|
|
|
|
|
@dataclass
|
|
class AxisScore:
|
|
name: str
|
|
score: int
|
|
evidence: list[str] = field(default_factory=list)
|
|
improvement: Optional[str] = None
|
|
|
|
|
|
def count_words(text: str) -> int:
|
|
return len(text.split())
|
|
|
|
|
|
def check_accuracy(text: str) -> AxisScore:
|
|
"""Check for verifiable claims, tool output references, error signs."""
|
|
evidence = []
|
|
deductions = 0
|
|
score = 5
|
|
|
|
# Positive signals: verified claims
|
|
verified_patterns = [
|
|
(r"(?i)(tests?\s+pass|all\s+tests?\s+passing|\d+\s+passed)", "Tests passing"),
|
|
(r"(?i)(exit\s+code\s*[:=]?\s*0|exited\s+with\s+0)", "Clean exit code"),
|
|
(r"(?i)(lint.*clean|no\s+lint\s+errors|0\s+errors)", "Lint clean"),
|
|
(r"(?i)(verified|confirmed|validated)\s+(with|against|using|by)", "Explicit verification"),
|
|
(r"(?i)(grep|rg)\s+.*\b(found|matched|returned)", "Grep confirmed"),
|
|
]
|
|
for pattern, label in verified_patterns:
|
|
if re.search(pattern, text):
|
|
evidence.append(f"+ {label}")
|
|
|
|
# Negative signals: unverified claims
|
|
danger_patterns = [
|
|
(r"(?i)(should\s+work|probably\s+fine|should\s+be\s+ok)", "Hedged claim without verification"),
|
|
(r"(?i)(I\s+think|I\s+believe|I\s+assume|might\s+be)", "Speculation without evidence"),
|
|
(r"(?i)(untested|not\s+tested|haven'?t\s+tested)", "Explicitly untested"),
|
|
(r"(?i)(TODO|FIXME|HACK|WORKAROUND)", "Unresolved TODO/FIXME"),
|
|
]
|
|
for pattern, label in danger_patterns:
|
|
if re.search(pattern, text):
|
|
deductions += 1
|
|
evidence.append(f"- {label}")
|
|
|
|
if deductions >= 3:
|
|
score = 2
|
|
elif deductions == 2:
|
|
score = 3
|
|
elif deductions == 1:
|
|
score = 4
|
|
|
|
if not evidence:
|
|
evidence.append("No verification signals detected — score assumes correctness")
|
|
|
|
result = AxisScore(name="Accuracy", score=score, evidence=evidence)
|
|
if score < 5:
|
|
result.improvement = "Cite specific tool outputs (test results, exit codes, grep findings) to back claims"
|
|
return result
|
|
|
|
|
|
def check_completeness(text: str, task: Optional[str] = None) -> AxisScore:
|
|
"""Check for requirement coverage, edge cases, error handling."""
|
|
evidence = []
|
|
score = 5
|
|
|
|
# Positive signals
|
|
completeness_signals = [
|
|
(r"(?i)(edge\s*cases?|corner\s*cases?)", "Edge cases addressed"),
|
|
(r"(?i)(error\s*handling|exception\s*handling|try/except|try\s*{)", "Error handling present"),
|
|
(r"(?i)(all\s+\w+\s+(methods|endpoints|routes))", "Full coverage claimed"),
|
|
(r"(?i)(verification|verified\s+that|confirmed\s+that)", "Verification step present"),
|
|
]
|
|
for pattern, label in completeness_signals:
|
|
if re.search(pattern, text):
|
|
evidence.append(f"+ {label}")
|
|
|
|
# Gaps
|
|
gap_signals = [
|
|
(r"(?i)(not\s+covered|not\s+handled|out\s+of\s+scope)", "Explicit gap acknowledged"),
|
|
(r"(?i)(only\s+(works|handles|supports)\s+\w+)", "Limited scope noted"),
|
|
(r"(?i)(assume[sd]?\s+that|assuming\s+the)", "Assumption without verification"),
|
|
]
|
|
deductions = 0
|
|
for pattern, label in gap_signals:
|
|
if re.search(pattern, text):
|
|
deductions += 1
|
|
evidence.append(f"- {label}")
|
|
|
|
if deductions >= 2:
|
|
score = 3
|
|
elif deductions == 1:
|
|
score = 4
|
|
|
|
if not evidence:
|
|
evidence.append("No completeness signals — unable to assess coverage")
|
|
|
|
result = AxisScore(name="Completeness", score=score, evidence=evidence)
|
|
if score < 5:
|
|
result.improvement = "List what was covered AND what was intentionally excluded, with reasoning"
|
|
return result
|
|
|
|
|
|
def check_clarity(text: str) -> AxisScore:
|
|
"""Check for structure, readability, jargon handling."""
|
|
evidence = []
|
|
score = 5
|
|
deductions = 0
|
|
|
|
# Positive signals
|
|
if re.search(r"^#{1,3}\s+", text, re.MULTILINE):
|
|
evidence.append("+ Uses headings for structure")
|
|
if re.search(r"```", text):
|
|
evidence.append("+ Uses code blocks")
|
|
if re.search(r"^\s*[-*]\s+", text, re.MULTILINE):
|
|
evidence.append("+ Uses bullet points")
|
|
|
|
# Negative signals
|
|
# Wall of text: long paragraph without breaks
|
|
paragraphs = [p for p in text.split("\n\n") if p.strip()]
|
|
for p in paragraphs:
|
|
if count_words(p) > 200:
|
|
deductions += 1
|
|
evidence.append("- Wall-of-text paragraph (>200 words without break)")
|
|
break
|
|
|
|
# Jargon without definition
|
|
jargon = [
|
|
(r"\b(idempotent|race condition|deadlock|thundering herd)\b", "concurrency"),
|
|
(r"\b(exponential backoff|circuit breaker|bulkhead)\b", "resilience"),
|
|
(r"\b(ACID|CAP|eventual consistency|linearizability)\b", "database theory"),
|
|
]
|
|
for pattern, domain in jargon:
|
|
if re.search(pattern, text, re.IGNORECASE):
|
|
if not re.search(rf"(?i)({domain}|means|refers to|i\.e\.|in other words)", text):
|
|
deductions += 1
|
|
evidence.append(f"- Domain term used without explanation ({domain})")
|
|
break
|
|
|
|
if not any(t in text[:100].lower() for t in ["summary", "tldr", "overview", "in short"]):
|
|
# No early summary — penalize only if text is long
|
|
if count_words(text) > 300:
|
|
deductions += 1
|
|
evidence.append("- No summary/TLDR in first 100 words (text is 300+ words)")
|
|
|
|
if deductions >= 3:
|
|
score = 2
|
|
elif deductions == 2:
|
|
score = 3
|
|
elif deductions == 1:
|
|
score = 4
|
|
|
|
if not evidence:
|
|
evidence.append("+ Well-structured with no clarity issues detected")
|
|
|
|
result = AxisScore(name="Clarity", score=score, evidence=evidence)
|
|
if score < 5:
|
|
result.improvement = "Add headings, break long paragraphs, define domain terms on first use"
|
|
return result
|
|
|
|
|
|
def check_actionability(text: str) -> AxisScore:
|
|
"""Check if the user can act on the output immediately."""
|
|
evidence = []
|
|
score = 5
|
|
deductions = 0
|
|
|
|
# Positive signals
|
|
actionable_signals = [
|
|
(r"(?i)(merge|PR|pull request).*?(created|ready|open)", "PR created"),
|
|
(r"(?i)(run|execute)\s+[`\"']?[\w./-]+", "Specific run command given"),
|
|
(r"(?i)(next\s+steps?|follow[- ]up|what\s+to\s+do)", "Next steps provided"),
|
|
(r"(?i)(file\s+(created|written|modified|updated)\s+at)", "File path specified"),
|
|
]
|
|
for pattern, label in actionable_signals:
|
|
if re.search(pattern, text):
|
|
evidence.append(f"+ {label}")
|
|
|
|
# Negative signals
|
|
vague_signals = [
|
|
(r"(?i)(you\s+(should|could|might\s+want\s+to))\s+\w+", "Vague suggestion without specifics"),
|
|
(r"(?i)(consider|maybe|perhaps)\s+\w+ing", "Non-committal suggestion"),
|
|
(r"(?i)(figure\s+out|look\s+into|investigate)\s", "Defers work to user"),
|
|
]
|
|
for pattern, label in vague_signals:
|
|
if re.search(pattern, text):
|
|
deductions += 1
|
|
evidence.append(f"- {label}")
|
|
|
|
if deductions >= 3:
|
|
score = 2
|
|
elif deductions == 2:
|
|
score = 3
|
|
elif deductions == 1:
|
|
score = 4
|
|
|
|
if not evidence:
|
|
evidence.append("No actionability signals — user may need to ask 'what now?'")
|
|
|
|
result = AxisScore(name="Actionability", score=score, evidence=evidence)
|
|
if score < 5:
|
|
result.improvement = "End with a single clear action: 'Merge this PR', 'Run ./deploy.sh', or 'Review the 3 changed files'"
|
|
return result
|
|
|
|
|
|
def check_concision(text: str, task: Optional[str] = None) -> AxisScore:
|
|
"""Check for redundancy, filler, information density."""
|
|
evidence = []
|
|
score = 5
|
|
wc = count_words(text)
|
|
|
|
# Heuristic: task-to-output ratio
|
|
if task:
|
|
task_wc = count_words(task)
|
|
ratio = wc / max(task_wc, 1)
|
|
if ratio > 15:
|
|
evidence.append(f"- Output is {ratio:.0f}x longer than task description (high ratio)")
|
|
score = min(score, 3)
|
|
elif ratio > 8:
|
|
evidence.append(f"- Output is {ratio:.0f}x longer than task description")
|
|
score = min(score, 4)
|
|
|
|
# Redundancy signals
|
|
redundancy_checks = [
|
|
(r"(?i)(as\s+(I|we)\s+(mentioned|said|noted|discussed)\s+(earlier|above|before))",
|
|
"Refers back to earlier statement (possible repetition)"),
|
|
(r"(?i)(to\s+summarize|in\s+summary|in\s+conclusion|to\s+conclude)",
|
|
"Has explicit summary (good if needed, flag if redundant)"),
|
|
(r"(?i)(let\s+me\s+(explain|break\s+this\s+down|walk\s+you\s+through))",
|
|
"Meta-commentary adds words without information"),
|
|
]
|
|
redundant_count = 0
|
|
for pattern, label in redundancy_checks:
|
|
matches = re.findall(pattern, text)
|
|
if len(matches) > 2:
|
|
redundant_count += 1
|
|
evidence.append(f"- '{label}' appears {len(matches)} times")
|
|
|
|
if redundant_count >= 2:
|
|
score = min(score, 3)
|
|
elif redundant_count == 1:
|
|
score = min(score, 4)
|
|
|
|
if not evidence and score == 5:
|
|
evidence.append("+ No redundancy detected. Information density appears good.")
|
|
|
|
result = AxisScore(name="Conciseness", score=score, evidence=evidence)
|
|
if score < 5:
|
|
result.improvement = "Cut meta-commentary, remove repeated points, trim examples to one representative case"
|
|
return result
|
|
|
|
|
|
def evaluate(task: Optional[str], output: str) -> list[AxisScore]:
|
|
"""Run all 5 axis checks and return scored results."""
|
|
return [
|
|
check_accuracy(output),
|
|
check_completeness(output, task),
|
|
check_clarity(output),
|
|
check_actionability(output),
|
|
check_concision(output, task),
|
|
]
|
|
|
|
|
|
def format_report(scores: list[AxisScore]) -> str:
|
|
"""Format scores into a readable evaluation report."""
|
|
avg = sum(s.score for s in scores) / len(scores)
|
|
lines = []
|
|
lines.append("=" * 60)
|
|
lines.append("AGENT SELF-EVALUATION REPORT")
|
|
lines.append("=" * 60)
|
|
lines.append("")
|
|
|
|
for s in scores:
|
|
bar = "█" * s.score + "░" * (5 - s.score)
|
|
lines.append(f" {s.name:<15} {bar} {s.score}/5")
|
|
for e in s.evidence:
|
|
lines.append(f" {e}")
|
|
if s.improvement:
|
|
lines.append(f" → {s.improvement}")
|
|
lines.append("")
|
|
|
|
lines.append(f" {'OVERALL':<15} {avg:.1f}/5")
|
|
lines.append("")
|
|
|
|
# Top improvements
|
|
improvements = [(s, s.improvement) for s in scores if s.improvement and s.score < 4]
|
|
if improvements:
|
|
lines.append("TOP IMPROVEMENTS (axes scoring < 4):")
|
|
for s, imp in sorted(improvements, key=lambda x: x[0].score):
|
|
lines.append(f" [{s.name}] {imp}")
|
|
else:
|
|
lines.append("No axes below 4. Strong output across all dimensions.")
|
|
|
|
return "\n".join(lines)
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
description="Evaluate agent output against the 5-axis rubric"
|
|
)
|
|
parser.add_argument("--task", help="Task description (file path or inline text)")
|
|
parser.add_argument("--output", help="Agent output to evaluate (file path)")
|
|
parser.add_argument("--interactive", action="store_true", help="Prompt for task and read output from stdin")
|
|
args = parser.parse_args()
|
|
|
|
task = None
|
|
output = None
|
|
|
|
if args.interactive:
|
|
task = input("Task description: ").strip()
|
|
print("Paste agent output (Ctrl+D to finish):")
|
|
output = sys.stdin.read()
|
|
elif args.task and args.output:
|
|
# Read task
|
|
try:
|
|
with open(args.task) as f:
|
|
task = f.read()
|
|
except FileNotFoundError:
|
|
task = args.task # Treat as inline text
|
|
|
|
# Read output
|
|
try:
|
|
with open(args.output) as f:
|
|
output = f.read()
|
|
except FileNotFoundError:
|
|
print(f"Error: output file '{args.output}' not found", file=sys.stderr)
|
|
sys.exit(1)
|
|
else:
|
|
# Pipe mode: read output from stdin
|
|
output = sys.stdin.read()
|
|
if args.task:
|
|
try:
|
|
with open(args.task) as f:
|
|
task = f.read()
|
|
except FileNotFoundError:
|
|
task = args.task
|
|
|
|
if not output:
|
|
print("Error: no output to evaluate", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
scores = evaluate(task, output)
|
|
print(format_report(scores))
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|