#!/usr/bin/env python3
"""Standalone agent output evaluator using the 5-axis rubric.

Reads a task description and agent output from stdin or files, scores each
axis, and prints a structured evaluation report.

Usage:
    # Task piped on stdin, agent output read from a file
    echo "Task: Add retry logic" | evaluate.py --output response.txt

    # From files
    evaluate.py --task task.txt --output response.txt

    # Agent output piped on stdin, task given inline or as a file path
    cat response.txt | evaluate.py --task "Add retry logic"

    # Interactive (reads task from prompt, output from stdin)
    evaluate.py --interactive

The evaluator uses keyword heuristics + structural checks as a first pass.
For production use, pair with an LLM judge for semantic understanding.
"""

import argparse
import os
import re
import sys
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class AxisScore:
    """Result of scoring one rubric axis."""

    # Display name of the axis (e.g. "Accuracy").
    name: str
    # Score out of 5; the report renders it as a 5-cell bar.
    score: int
    # Human-readable signals; "+ ..." entries are positive, "- ..." negative.
    evidence: list[str] = field(default_factory=list)
    # Suggested fix, set by the checks only when the score is below 5.
    improvement: Optional[str] = None


def _compile(table):
    """Compile a list of ``(regex, label)`` pairs once, at import time."""
    return tuple((re.compile(pattern), label) for pattern, label in table)


def _matched_labels(signals, text: str) -> list[str]:
    """Return the label of every compiled signal that occurs in *text*."""
    return [label for pattern, label in signals if pattern.search(text)]


def _penalized_score(deductions: int, floor: int = 2) -> int:
    """Map a deduction count to a score: 5 minus deductions, never below *floor*."""
    return max(floor, 5 - deductions)


# --- Accuracy signals -------------------------------------------------------
# Word boundaries keep "10 errors" from counting as "0 errors" and
# "exit code 01" from counting as a clean exit.
_VERIFIED_SIGNALS = _compile([
    (r"(?i)(tests?\s+pass|all\s+tests?\s+passing|\d+\s+passed)", "Tests passing"),
    (r"(?i)(exit\s+code\s*[:=]?\s*0|exited\s+with\s+0)\b", "Clean exit code"),
    (r"(?i)(lint.*clean|no\s+lint\s+errors|\b0\s+errors)", "Lint clean"),
    (r"(?i)(verified|confirmed|validated)\s+(with|against|using|by)", "Explicit verification"),
    (r"(?i)(grep|rg)\s+.*\b(found|matched|returned)", "Grep confirmed"),
])

# Word boundaries keep "shack" from matching HACK and "Hawaii think" from
# matching "I think".
_UNVERIFIED_SIGNALS = _compile([
    (r"(?i)(should\s+work|probably\s+fine|should\s+be\s+ok)", "Hedged claim without verification"),
    (r"(?i)\b(I\s+think|I\s+believe|I\s+assume|might\s+be)\b", "Speculation without evidence"),
    (r"(?i)(untested|not\s+tested|haven'?t\s+tested)", "Explicitly untested"),
    (r"(?i)\b(TODO|FIXME|HACK|WORKAROUND)S?\b", "Unresolved TODO/FIXME"),
])

# --- Completeness signals ---------------------------------------------------
_COMPLETENESS_SIGNALS = _compile([
    (r"(?i)(edge\s*cases?|corner\s*cases?)", "Edge cases addressed"),
    (r"(?i)(error\s*handling|exception\s*handling|try/except|try\s*{)", "Error handling present"),
    (r"(?i)(all\s+\w+\s+(methods|endpoints|routes))", "Full coverage claimed"),
    (r"(?i)(verification|verified\s+that|confirmed\s+that)", "Verification step present"),
])

_GAP_SIGNALS = _compile([
    (r"(?i)(not\s+covered|not\s+handled|out\s+of\s+scope)", "Explicit gap acknowledged"),
    (r"(?i)(only\s+(works|handles|supports)\s+\w+)", "Limited scope noted"),
    (r"(?i)(assume[sd]?\s+that|assuming\s+the)", "Assumption without verification"),
])

# --- Clarity signals --------------------------------------------------------
_JARGON = (
    (re.compile(r"\b(idempotent|race condition|deadlock|thundering herd)\b", re.IGNORECASE), "concurrency"),
    (re.compile(r"\b(exponential backoff|circuit breaker|bulkhead)\b", re.IGNORECASE), "resilience"),
    (re.compile(r"\b(ACID|CAP|eventual consistency|linearizability)\b", re.IGNORECASE), "database theory"),
)

_SUMMARY_MARKERS = ("summary", "tldr", "tl;dr", "overview", "in short")

# --- Actionability signals --------------------------------------------------
# The leading alternatives are word-bounded so the case-insensitive "PR" does
# not fire on "improve" or "process", nor "merge" on "emerge".
_ACTIONABLE_SIGNALS = _compile([
    (r"(?i)\b(merg(?:e[ds]?|ing)|PRs?|pull\s+requests?)\b.*?(created|ready|open)", "PR created"),
    (r"(?i)(run|execute)\s+[`\"']?[\w./-]+", "Specific run command given"),
    (r"(?i)(next\s+steps?|follow[- ]up|what\s+to\s+do)", "Next steps provided"),
    (r"(?i)(file\s+(created|written|modified|updated)\s+at)", "File path specified"),
])

_VAGUE_SIGNALS = _compile([
    (r"(?i)(you\s+(should|could|might\s+want\s+to))\s+\w+", "Vague suggestion without specifics"),
    (r"(?i)(consider|maybe|perhaps)\s+\w+ing", "Non-committal suggestion"),
    (r"(?i)(figure\s+out|look\s+into|investigate)\s", "Defers work to user"),
])

# --- Concision signals ------------------------------------------------------
_REDUNDANCY_SIGNALS = _compile([
    (r"(?i)(as\s+(I|we)\s+(mentioned|said|noted|discussed)\s+(earlier|above|before))",
     "Refers back to earlier statement (possible repetition)"),
    (r"(?i)(to\s+summarize|in\s+summary|in\s+conclusion|to\s+conclude)",
     "Has explicit summary (good if needed, flag if redundant)"),
    (r"(?i)(let\s+me\s+(explain|break\s+this\s+down|walk\s+you\s+through))",
     "Meta-commentary adds words without information"),
])


def count_words(text: str) -> int:
    """Return the number of whitespace-separated tokens in *text*."""
    return len(text.split())


def check_accuracy(text: str) -> AxisScore:
    """Score accuracy from verification signals and unverified-claim signals.

    Each distinct unverified-claim category found costs one point
    (minimum score 2). Positive signals are recorded as evidence only.
    """
    evidence = [f"+ {label}" for label in _matched_labels(_VERIFIED_SIGNALS, text)]
    risks = _matched_labels(_UNVERIFIED_SIGNALS, text)
    evidence.extend(f"- {label}" for label in risks)
    score = _penalized_score(len(risks))

    if not evidence:
        evidence.append("No verification signals detected — score assumes correctness")

    result = AxisScore(name="Accuracy", score=score, evidence=evidence)
    if score < 5:
        result.improvement = (
            "Cite specific tool outputs (test results, exit codes, grep findings) to back claims"
        )
    return result


def check_completeness(text: str, task: Optional[str] = None) -> AxisScore:
    """Score completeness from coverage signals and acknowledged gaps.

    Each distinct gap category found costs one point (minimum score 3).
    *task* is accepted for interface symmetry but is not inspected.
    """
    evidence = [f"+ {label}" for label in _matched_labels(_COMPLETENESS_SIGNALS, text)]
    gaps = _matched_labels(_GAP_SIGNALS, text)
    evidence.extend(f"- {label}" for label in gaps)
    score = _penalized_score(len(gaps), floor=3)

    if not evidence:
        evidence.append("No completeness signals — unable to assess coverage")

    result = AxisScore(name="Completeness", score=score, evidence=evidence)
    if score < 5:
        result.improvement = (
            "List what was covered AND what was intentionally excluded, with reasoning"
        )
    return result


def check_clarity(text: str) -> AxisScore:
    """Score clarity from structure, paragraph length, jargon and early summary.

    Deductions (one point each, minimum score 2): a paragraph over 200
    words, a domain term used with no explanatory phrasing anywhere in the
    text, and a 300+ word text with no summary marker in its first 100 words.
    """
    evidence = []
    deductions = 0

    # Positive signals
    if re.search(r"^#{1,3}\s+", text, re.MULTILINE):
        evidence.append("+ Uses headings for structure")
    if re.search(r"```", text):
        evidence.append("+ Uses code blocks")
    if re.search(r"^\s*[-*]\s+", text, re.MULTILINE):
        evidence.append("+ Uses bullet points")

    # Wall of text: any blank-line-delimited paragraph over 200 words.
    paragraphs = (p for p in text.split("\n\n") if p.strip())
    if any(count_words(p) > 200 for p in paragraphs):
        deductions += 1
        evidence.append("- Wall-of-text paragraph (>200 words without break)")

    # Jargon without definition: only the first offending domain is reported.
    for pattern, domain in _JARGON:
        if not pattern.search(text):
            continue
        explained = re.search(
            rf"(?i)({domain}|means|refers to|i\.e\.|in other words)", text
        )
        if not explained:
            deductions += 1
            evidence.append(f"- Domain term used without explanation ({domain})")
            break

    # Early summary: look at the first 100 *words* (as the message states),
    # and penalize only when the text is long.
    intro = " ".join(text.split()[:100]).lower()
    has_summary = any(marker in intro for marker in _SUMMARY_MARKERS)
    if not has_summary and count_words(text) > 300:
        deductions += 1
        evidence.append("- No summary/TLDR in first 100 words (text is 300+ words)")

    score = _penalized_score(deductions)

    if not evidence:
        evidence.append("+ Well-structured with no clarity issues detected")

    result = AxisScore(name="Clarity", score=score, evidence=evidence)
    if score < 5:
        result.improvement = (
            "Add headings, break long paragraphs, define domain terms on first use"
        )
    return result


def check_actionability(text: str) -> AxisScore:
    """Score whether the reader can act on the output immediately.

    Each distinct vague-suggestion category found costs one point
    (minimum score 2). Positive signals are recorded as evidence only.
    """
    evidence = [f"+ {label}" for label in _matched_labels(_ACTIONABLE_SIGNALS, text)]
    vague = _matched_labels(_VAGUE_SIGNALS, text)
    evidence.extend(f"- {label}" for label in vague)
    score = _penalized_score(len(vague))

    if not evidence:
        evidence.append("No actionability signals — user may need to ask 'what now?'")

    result = AxisScore(name="Actionability", score=score, evidence=evidence)
    if score < 5:
        result.improvement = (
            "End with a single clear action: 'Merge this PR', 'Run ./deploy.sh', "
            "or 'Review the 3 changed files'"
        )
    return result


def check_concision(text: str, task: Optional[str] = None) -> AxisScore:
    """Score concision from output/task length ratio and repeated filler phrases.

    When *task* is non-empty, an output more than 8x (15x) the task's word
    count caps the score at 4 (3). A filler phrase category counts as
    redundant only when it occurs more than twice; one such category caps
    the score at 4, two or more at 3.
    """
    evidence = []
    score = 5
    wc = count_words(text)

    # Heuristic: task-to-output ratio
    if task:
        ratio = wc / max(count_words(task), 1)
        if ratio > 15:
            evidence.append(
                f"- Output is {ratio:.0f}x longer than task description (high ratio)"
            )
            score = min(score, 3)
        elif ratio > 8:
            evidence.append(f"- Output is {ratio:.0f}x longer than task description")
            score = min(score, 4)

    # Redundancy signals
    redundant_count = 0
    for pattern, label in _REDUNDANCY_SIGNALS:
        occurrences = sum(1 for _ in pattern.finditer(text))
        if occurrences > 2:
            redundant_count += 1
            evidence.append(f"- '{label}' appears {occurrences} times")

    if redundant_count >= 2:
        score = min(score, 3)
    elif redundant_count == 1:
        score = min(score, 4)

    if not evidence and score == 5:
        evidence.append("+ No redundancy detected. Information density appears good.")

    result = AxisScore(name="Conciseness", score=score, evidence=evidence)
    if score < 5:
        result.improvement = (
            "Cut meta-commentary, remove repeated points, trim examples to one "
            "representative case"
        )
    return result


def evaluate(task: Optional[str], output: str) -> list[AxisScore]:
    """Run all 5 axis checks on *output* and return the scores in rubric order."""
    return [
        check_accuracy(output),
        check_completeness(output, task),
        check_clarity(output),
        check_actionability(output),
        check_concision(output, task),
    ]


def format_report(scores: list[AxisScore]) -> str:
    """Format scores into a readable evaluation report.

    Lists every axis with its bar, evidence and improvement hint, then the
    overall average, then the improvements for axes scoring below 4 (lowest
    score first). An empty *scores* list yields an overall of 0.0 rather
    than raising.
    """
    avg = sum(s.score for s in scores) / len(scores) if scores else 0.0

    lines = []
    lines.append("=" * 60)
    lines.append("AGENT SELF-EVALUATION REPORT")
    lines.append("=" * 60)
    lines.append("")

    for s in scores:
        bar = "█" * s.score + "░" * (5 - s.score)
        lines.append(f" {s.name:<15} {bar} {s.score}/5")
        for e in s.evidence:
            lines.append(f" {e}")
        if s.improvement:
            lines.append(f" → {s.improvement}")
        lines.append("")

    lines.append(f" {'OVERALL':<15} {avg:.1f}/5")
    lines.append("")

    # Top improvements
    improvements = [(s, s.improvement) for s in scores if s.improvement and s.score < 4]
    if improvements:
        lines.append("TOP IMPROVEMENTS (axes scoring < 4):")
        for s, imp in sorted(improvements, key=lambda x: x[0].score):
            lines.append(f" [{s.name}] {imp}")
    else:
        lines.append("No axes below 4. Strong output across all dimensions.")

    return "\n".join(lines)


def _read_task(value: str) -> str:
    """Return the contents of *value* if it names a file, else *value* itself.

    ``os.path.isfile`` is used instead of attempting ``open`` so that long or
    otherwise path-invalid inline text is treated as text rather than raising.
    """
    if os.path.isfile(value):
        with open(value, encoding="utf-8", errors="replace") as f:
            return f.read()
    return value


def main():
    """Parse CLI arguments, gather task and output, and print the report.

    Input resolution:
      * ``--interactive``: task from a prompt, output from stdin.
      * ``--output FILE``: output from the file; task from ``--task`` if
        given, otherwise from piped (non-tty) stdin if any.
      * no ``--output``: output from stdin; task from ``--task`` if given.

    Exits with status 1 if the output file cannot be read or the output is
    empty.
    """
    parser = argparse.ArgumentParser(
        description="Evaluate agent output against the 5-axis rubric"
    )
    parser.add_argument("--task", help="Task description (file path or inline text)")
    parser.add_argument("--output", help="Agent output to evaluate (file path)")
    parser.add_argument("--interactive", action="store_true",
                        help="Prompt for task and read output from stdin")
    args = parser.parse_args()

    task = None
    output = None

    if args.interactive:
        task = input("Task description: ").strip()
        print("Paste agent output (Ctrl+D to finish):")
        output = sys.stdin.read()
    else:
        if args.task:
            task = _read_task(args.task)

        if args.output:
            try:
                with open(args.output, encoding="utf-8", errors="replace") as f:
                    output = f.read()
            except FileNotFoundError:
                print(f"Error: output file '{args.output}' not found", file=sys.stderr)
                sys.exit(1)
            except OSError as exc:
                print(f"Error: cannot read output file '{args.output}': {exc}",
                      file=sys.stderr)
                sys.exit(1)
            # Documented pipe usage: the task arrives on stdin while the
            # output comes from --output. Never block on an interactive tty.
            if task is None and not sys.stdin.isatty():
                task = sys.stdin.read().strip() or None
        else:
            # Pipe mode: read output from stdin
            output = sys.stdin.read()

    if not output or not output.strip():
        print("Error: no output to evaluate", file=sys.stderr)
        sys.exit(1)

    scores = evaluate(task, output)
    print(format_report(scores))


if __name__ == "__main__":
    main()