diff --git a/agents/agent-evaluator.md b/agents/agent-evaluator.md
index f4b90a9b..3169382e 100644
--- a/agents/agent-evaluator.md
+++ b/agents/agent-evaluator.md
@@ -13,7 +13,7 @@ You are a quality evaluator for AI agent output. Your job is to assess agent res
 - Every score below 5 MUST cite specific evidence from the output
 - Provide concrete, actionable improvement suggestions
 - Maintain objectivity — evaluate the output, not the agent's effort or intent
-- Load the `agent-self-evaluation` skill for the detailed scoring rubric
+- Read `skills/agent-self-evaluation/SKILL.md` for the detailed scoring rubric. That file is a standard ECC `SKILL.md`: YAML frontmatter followed by Markdown sections such as `## When to Activate`, `## Core Concepts`, and `## Best Practices`.
 
 - DO NOT re-perform the original task
 - DO NOT suggest alternative approaches unless the current approach is factually wrong
@@ -60,6 +60,7 @@ Use this exact format (matches `scripts/evaluate.py` output):
 ============================================================
 AGENT SELF-EVALUATION REPORT
 ============================================================
+Summary: Overall score X.X/5 across 5 quality axes.
 
   Accuracy         █████ 5/5
     + [Evidence: passing tests, verified claims]
@@ -87,6 +88,8 @@ CRITICAL ISSUES (axes ≤ 2):
   [Axis] Score N/5 — specific fix needed
   (or "None" if no axis ≤ 2)
 
+Self-check: Would the user agree with this assessment? [Yes/No + brief justification]
+
 TOP IMPROVEMENTS:
   1. [Highest impact fix]
   2. [Second highest]
@@ -96,7 +99,7 @@ VERDICT: [Deliver as-is / Fix N issues then deliver / Redo from scratch]
 
 ## Output Format
 
-Always include the structured report above, matching the `scripts/evaluate.py` output format exactly. The report title is "AGENT SELF-EVALUATION REPORT" (not "AGENT EVALUATION REPORT").
+Always include the structured report above, matching the `scripts/evaluate.py` output format exactly. The report title is "AGENT SELF-EVALUATION REPORT".
 
 ## Examples
 
@@ -108,6 +111,7 @@ Task: Add retry logic to HTTP client. 3 retries, exponential backoff.
 ============================================================
 AGENT SELF-EVALUATION REPORT
 ============================================================
+Summary: Overall score X.X/5 across 5 quality axes.
 
   Accuracy         █████ 5/5
     + Tests passing
@@ -138,6 +142,8 @@ AGENT SELF-EVALUATION REPORT
 CRITICAL ISSUES (axes ≤ 2):
   None
 
+Self-check: Would the user agree with this assessment? Yes — the scores cite passing tests, grep verification, and the remaining gaps are minor.
+
 TOP IMPROVEMENTS:
   1. [Completeness] Add connection pool exhaustion to edge cases doc
   2. [Conciseness] Consolidate verification commands into a single script
@@ -153,6 +159,7 @@ Task: Same as above.
 ============================================================
 AGENT SELF-EVALUATION REPORT
 ============================================================
+Summary: Overall score X.X/5 across 5 quality axes.
 
   Accuracy         ██░░░ 2/5
     + Code block present
@@ -188,6 +195,8 @@ CRITICAL ISSUES (axes ≤ 2):
   [Accuracy] Score 2/5 — Wrong library. Use httpx.Retry, not urllib3.Retry.
   [Actionability] Score 2/5 — No deliverable. Create a PR with test file.
 
+Self-check: Would the user agree with this assessment? Yes — the report cites the wrong library, lack of tests, and missing deliverable.
+
 TOP IMPROVEMENTS:
   1. [Accuracy] Switch to httpx.Retry — grep the codebase first
   2. [Actionability] Create a PR with src/api_client.py + tests
diff --git a/skills/agent-self-evaluation/SKILL.md b/skills/agent-self-evaluation/SKILL.md
index 0aa3c986..96edc164 100644
--- a/skills/agent-self-evaluation/SKILL.md
+++ b/skills/agent-self-evaluation/SKILL.md
@@ -15,7 +15,7 @@ After completing a complex task, the agent pauses to rate its own output against
 - After a debugging session that involved 3+ attempts
 - After producing a design document, architecture decision, or written analysis
 - When the user asks "how good was that?" or "rate yourself"
-- At the end of any session Stop hook (if configured — see References)
+- At the end of any session Stop hook (if configured — see `references/hook-integration.md`)
 
 ## Core Concepts
 
diff --git a/skills/agent-self-evaluation/references/hook-integration.md b/skills/agent-self-evaluation/references/hook-integration.md
index 78246b37..260de2ca 100644
--- a/skills/agent-self-evaluation/references/hook-integration.md
+++ b/skills/agent-self-evaluation/references/hook-integration.md
@@ -1,13 +1,12 @@
 # Hook Integration for Session-Stop Self-Evaluation
 
-Add this hook to `hooks/hooks.json` to automatically trigger self-evaluation at the end of every session:
+Add this hook to `hooks/hooks.json` to remind the agent to self-evaluate at the end of every session:
 
 ```json
 {
   "hooks": {
     "Stop": [
       {
-        "matcher": "true",
         "hooks": [
           {
             "type": "command",
@@ -21,6 +20,8 @@ Add this hook to `hooks/hooks.json` to automatically trigger self-evaluation at
 }
 ```
 
+`Stop` events do not use a `matcher` field. Keep the hook object limited to `hooks` and metadata such as `description`.
+
 ## Integration with the Python Evaluator
 
 The `scripts/evaluate.py` script can be used as a standalone tool:
@@ -33,25 +34,27 @@ echo "Your agent response here" | python3 skills/agent-self-evaluation/scripts/e
 python3 skills/agent-self-evaluation/scripts/evaluate.py --task task.txt --output response.txt
 ```
 
-To integrate it into hooks, capture the last agent output to a file first, then run the evaluator:
+To integrate it into hooks, capture the last agent output to a file first, then run the evaluator. For lightweight reminders after shell-based verification, use a simple supported matcher string:
 
 ```json
 {
   "PostToolUse": [
     {
-      "matcher": "tool == \"Bash\" && tool_input.command matches \"(test|pytest|npm test|go test)\"",
+      "matcher": "Bash",
       "hooks": [
         {
           "type": "command",
-          "command": "echo '[Self-Eval] Tests completed. Consider running agent-self-evaluation.'"
+          "command": "echo '[Self-Eval] If this command completed verification for a non-trivial task, consider running agent-self-evaluation.'"
         }
       ],
-      "description": "Remind agent to self-evaluate after test runs"
+      "description": "Remind agent to self-evaluate after shell verification"
     }
   ]
 }
 ```
 
+Matching on the tool name alone keeps this example within supported matcher syntax; as a result the reminder fires after every Bash command, not only after test runs. If your harness supports command-level matcher expressions, prefer a word-boundary regex such as `\b(pytest|npm test|go test)\b` rather than a broad `test` substring.
+
 These hooks are opt-in. Add them to your local `hooks/hooks.json` if you want automated evaluation prompts.
 
 ## Manual Usage (Recommended)
diff --git a/skills/agent-self-evaluation/scripts/evaluate.py b/skills/agent-self-evaluation/scripts/evaluate.py
index 0446106b..566242a1 100755
--- a/skills/agent-self-evaluation/scripts/evaluate.py
+++ b/skills/agent-self-evaluation/scripts/evaluate.py
@@ -83,7 +83,7 @@ def check_accuracy(text: str) -> AxisScore:
     return result
 
 
-def check_completeness(text: str, task: Optional[str] = None) -> AxisScore:
+def check_completeness(text: str) -> AxisScore:
     """Check for requirement coverage, edge cases, error handling."""
     evidence = []
     score = 5
@@ -125,13 +125,36 @@ def check_completeness(text: str, task: Optional[str] = None) -> AxisScore:
     return result
 
 
+def _check_jargon(text: str) -> tuple[int, list[str]]:
+    """Return (1, [evidence]) for the first jargon family used in `text` with no explanation cue anywhere in `text`, else (0, [])."""
+    jargon = [  # (term regex, domain label); phrases match case-insensitively, acronyms only in upper case
+        (r"\b(?i:idempotent|race condition|deadlock|thundering herd)\b", "concurrency"),
+        (r"\b(?i:exponential backoff|circuit breaker|bulkhead)\b", "resilience"),
+        (r"\b(ACID|CAP|(?i:eventual consistency|linearizability))\b", "database theory"),
+    ]
+    explanation_pattern = r"(?i)({domain}|means|refers to|i\.e\.|in other words)"
+    for pattern, domain in jargon:
+        has_term = re.search(pattern, text)  # no re.IGNORECASE: plain words like "cap" or "acid" are not jargon
+        explains_term = re.search(explanation_pattern.format(domain=domain), text)
+        if has_term and not explains_term:
+            return 1, [f"- Domain term used without explanation ({domain})"]  # at most one deduction overall
+    return 0, []
+
+
+def _check_summary(text: str) -> tuple[int, list[str]]:
+    """Return (1, [evidence]) when `text` has more than 300 words and no summary cue among its first 100 words, else (0, [])."""
+    opening = " ".join(text.split()[:100]).lower()  # first 100 words (not characters), matching the evidence message
+    has_early_summary = any(term in opening for term in ("summary", "tldr", "tl;dr", "overview", "in short"))
+    if not has_early_summary and count_words(text) > 300:  # short outputs are never penalised
+        return 1, ["- No summary/TLDR in first 100 words (text is 300+ words)"]
+    return 0, []
+
+
 def check_clarity(text: str) -> AxisScore:
     """Check for structure, readability, jargon handling."""
     evidence = []
-    score = 5
     deductions = 0
 
-    # Positive signals
     if re.search(r"^#{1,3}\s+", text, re.MULTILINE):
         evidence.append("+ Uses headings for structure")
     if re.search(r"```", text):
@@ -139,33 +162,16 @@ def check_clarity(text: str) -> AxisScore:
     if re.search(r"^\s*[-*]\s+", text, re.MULTILINE):
         evidence.append("+ Uses bullet points")
 
-    # Negative signals
-    # Wall of text: long paragraph without breaks
-    paragraphs = [p for p in text.split("\n\n") if p.strip()]
-    for p in paragraphs:
-        if count_words(p) > 200:
+    for paragraph in [p for p in text.split("\n\n") if p.strip()]:
+        if count_words(paragraph) > 200:
             deductions += 1
             evidence.append("- Wall-of-text paragraph (>200 words without break)")
             break
 
-    # Jargon without definition
-    jargon = [
-        (r"\b(idempotent|race condition|deadlock|thundering herd)\b", "concurrency"),
-        (r"\b(exponential backoff|circuit breaker|bulkhead)\b", "resilience"),
-        (r"\b(ACID|CAP|eventual consistency|linearizability)\b", "database theory"),
-    ]
-    for pattern, domain in jargon:
-        if re.search(pattern, text, re.IGNORECASE):
-            if not re.search(rf"(?i)({domain}|means|refers to|i\.e\.|in other words)", text):
-                deductions += 1
-                evidence.append(f"- Domain term used without explanation ({domain})")
-                break
-
-    if not any(t in text[:100].lower() for t in ["summary", "tldr", "overview", "in short"]):
-        # No early summary — penalize only if text is long
-        if count_words(text) > 300:
-            deductions += 1
-            evidence.append("- No summary/TLDR in first 100 words (text is 300+ words)")
+    jargon_deductions, jargon_evidence = _check_jargon(text)
+    summary_deductions, summary_evidence = _check_summary(text)
+    deductions += jargon_deductions + summary_deductions
+    evidence.extend(jargon_evidence + summary_evidence)
 
     if deductions >= 3:
         score = 2
@@ -173,6 +179,8 @@ def check_clarity(text: str) -> AxisScore:
         score = 3
     elif deductions == 1:
         score = 4
+    else:
+        score = 5
 
     if not evidence:
         evidence.append("+ Well-structured with no clarity issues detected")
@@ -227,7 +235,7 @@ def check_actionability(text: str) -> AxisScore:
     return result
 
 
-def check_concision(text: str, task: Optional[str] = None) -> AxisScore:
+def check_conciseness(text: str, task: Optional[str] = None) -> AxisScore:
     """Check for redundancy, filler, information density."""
     evidence = []
     score = 5
@@ -278,10 +286,10 @@ def evaluate(task: Optional[str], output: str) -> list[AxisScore]:
     """Run all 5 axis checks and return scored results."""
     return [
         check_accuracy(output),
-        check_completeness(output, task),
+        check_completeness(output),
         check_clarity(output),
         check_actionability(output),
-        check_concision(output, task),
+        check_conciseness(output, task),
     ]
 
 
@@ -292,13 +300,13 @@ def format_report(scores: list[AxisScore]) -> str:
     lines.append("=" * 60)
     lines.append("AGENT SELF-EVALUATION REPORT")
     lines.append("=" * 60)
+    lines.append(f"Summary: Overall score {avg:.1f}/5 across 5 quality axes.")
     lines.append("")
 
     for s in scores:
         bar = "█" * s.score + "░" * (5 - s.score)
         lines.append(f"  {s.name:<15} {bar} {s.score}/5")
-        for e in s.evidence:
-            lines.append(f"    {e}")
+        lines.extend(f"    {e}" for e in s.evidence)
         if s.improvement:
             lines.append(f"    → {s.improvement}")
         lines.append("")
@@ -316,6 +324,8 @@ def format_report(scores: list[AxisScore]) -> str:
         lines.append("  None")
 
     lines.append("")
+    lines.append("Self-check: Would the user agree with this assessment? [Yes/No + brief justification]")
+    lines.append("")
 
     # Top improvements (axes scoring < 4, ranked by impact)
     improvements = [(s, s.improvement) for s in scores if s.improvement and s.score < 4]
@@ -344,6 +354,31 @@ def format_report(scores: list[AxisScore]) -> str:
     return "\n".join(lines)
 
 
+def _read_file_or_text(path: Optional[str], required: bool = False) -> Optional[str]:
+    """Return the contents of file `path`; if it cannot be opened, return `path` itself as inline text, or print an error and exit 1 when `required`."""
+    if path is None:
+        return None
+    try:
+        with open(path) as f:
+            return f.read()
+    except OSError as exc:  # FileNotFoundError, or e.g. ENAMETOOLONG when long inline text is tried as a file name
+        if required:
+            print(f"Error: output file '{path}' " + ("not found" if isinstance(exc, FileNotFoundError) else f"unreadable: {exc}"), file=sys.stderr)
+            sys.exit(1)
+        return path
+
+
+def _read_input(args: argparse.Namespace) -> tuple[Optional[str], str]:
+    """Return (task, output) taken from the interactive prompt, from the --output file, or from piped stdin."""
+    if args.interactive:
+        prompted_task = input("Task description: ").strip()
+        print("Paste agent output (Ctrl+D to finish):")
+        return prompted_task, sys.stdin.read()
+    task_text = _read_file_or_text(args.task)  # task is resolved before the output source is read, in both remaining modes
+    body = (_read_file_or_text(args.output, required=True) or "") if args.output else sys.stdin.read()
+    return task_text, body
+
+
 def main():
     parser = argparse.ArgumentParser(
         description="Evaluate agent output against the 5-axis rubric"
@@ -353,38 +388,7 @@ def main():
     parser.add_argument("--interactive", action="store_true", help="Prompt for task and read output from stdin")
     args = parser.parse_args()
 
-    task = None
-    output = None
-
-    if args.interactive:
-        task = input("Task description: ").strip()
-        print("Paste agent output (Ctrl+D to finish):")
-        output = sys.stdin.read()
-    elif args.task and args.output:
-        # Read task
-        try:
-            with open(args.task) as f:
-                task = f.read()
-        except FileNotFoundError:
-            task = args.task  # Treat as inline text
-
-        # Read output
-        try:
-            with open(args.output) as f:
-                output = f.read()
-        except FileNotFoundError:
-            print(f"Error: output file '{args.output}' not found", file=sys.stderr)
-            sys.exit(1)
-    else:
-        # Pipe mode: read output from stdin
-        output = sys.stdin.read()
-        if args.task:
-            try:
-                with open(args.task) as f:
-                    task = f.read()
-            except FileNotFoundError:
-                task = args.task
-
+    task, output = _read_input(args)
     if not output:
         print("Error: no output to evaluate", file=sys.stderr)
         sys.exit(1)
diff --git a/skills/agent-self-evaluation/templates/evaluation-report.md b/skills/agent-self-evaluation/templates/evaluation-report.md
index ee0513e2..46737092 100644
--- a/skills/agent-self-evaluation/templates/evaluation-report.md
+++ b/skills/agent-self-evaluation/templates/evaluation-report.md
@@ -6,6 +6,7 @@ Copy this template and fill in after completing a task. The format matches `scri
 ============================================================
 AGENT SELF-EVALUATION REPORT
 ============================================================
+Summary: Overall score X.X/5 across 5 quality axes.
 
   Accuracy         █████ 5/5    or    ███░░ 3/5
     + [Evidence: passing tests, verified claims]
@@ -38,6 +39,8 @@ CRITICAL ISSUES (axes ≤ 2):
   [Axis] Score N/5 — specific fix needed
   (or "None" if no axis ≤ 2)
 
+Self-check: Would the user agree with this assessment? [Yes/No + brief justification]
+
 TOP IMPROVEMENTS:
   1. [Highest impact fix]
   2. [Second highest]