"""Cross-surface CLI parity audit (ROADMAP #171). Prevents future drift of the unified JSON envelope contract across claw-code's CLI surface. Instead of requiring humans to notice when a new command skips --output-format, this test introspects the parser at runtime and verifies every command in the declared clawable-surface list supports --output-format {text,json}. When a new clawable-surface command is added: 1. Implement --output-format on the subparser (normal feature work). 2. Add the command name to CLAWABLE_SURFACES below. 3. This test passes automatically. When a developer adds a new clawable-surface command but forgets --output-format, the test fails with a concrete message pointing at the missing flag. Claws no longer need to eyeball parity; the contract is enforced at test time. Three classes of commands: - CLAWABLE_SURFACES: MUST accept --output-format (inspect/lifecycle/exec/diagnostic) - OPT_OUT_SURFACES: explicitly exempt (simulation/mode commands, human-first diagnostic) - Any command in parser not listed in either: test FAILS with classification request This is operationalised parity — a machine-first CLI enforced by a machine-first test. """ from __future__ import annotations import subprocess import sys from pathlib import Path import pytest sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from src.main import build_parser # noqa: E402 # Commands that MUST accept --output-format {text,json}. # These are the machine-first surfaces — session lifecycle, execution, # inspect, diagnostic inventory. CLAWABLE_SURFACES = frozenset({ # Session lifecycle (#160, #165, #166) 'list-sessions', 'delete-session', 'load-session', 'flush-transcript', # Inspect (#167) 'show-command', 'show-tool', # Execution/work-verb (#168) 'exec-command', 'exec-tool', 'route', 'bootstrap', # Diagnostic inventory (#169, #170) 'command-graph', 'tool-pool', 'bootstrap-graph', # Turn-loop with JSON output (#164 Stage B, #174) 'turn-loop', }) # Commands explicitly exempt from --output-format requirement. # Rationale must be explicit — either the command is human-first # (rich Markdown docs/reports), simulation-only, or has a dedicated # JSON mode flag under a different name. OPT_OUT_SURFACES = frozenset({ # Rich-Markdown report commands (planned future: JSON schema) 'summary', # full workspace summary (Markdown) 'manifest', # workspace manifest (Markdown) 'parity-audit', # TypeScript archive comparison (Markdown) 'setup-report', # startup/prefetch report (Markdown) # List commands with their own query/filter surface (not JSON yet) 'subsystems', # use --limit 'commands', # use --query / --limit / --no-plugin-commands 'tools', # use --query / --limit / --simple-mode # Simulation/debug surfaces (not claw-orchestrated) 'remote-mode', 'ssh-mode', 'teleport-mode', 'direct-connect-mode', 'deep-link-mode', }) def _discover_subcommands_and_flags() -> dict[str, frozenset[str]]: """Introspect the argparse tree to discover every subcommand and its flags. Returns: {subcommand_name: frozenset of option strings including --output-format if registered} """ parser = build_parser() subcommand_flags: dict[str, frozenset[str]] = {} for action in parser._actions: if not hasattr(action, 'choices') or not action.choices: continue if action.dest != 'command': continue for name, subp in action.choices.items(): flags: set[str] = set() for a in subp._actions: if a.option_strings: flags.update(a.option_strings) subcommand_flags[name] = frozenset(flags) return subcommand_flags class TestClawableSurfaceParity: """Every clawable-surface command MUST accept --output-format {text,json}. This is the invariant that codifies 'claws can treat the CLI as a unified protocol without special-casing'. """ def test_all_clawable_surfaces_accept_output_format(self) -> None: """All commands in CLAWABLE_SURFACES must have --output-format registered.""" subcommand_flags = _discover_subcommands_and_flags() missing = [] for cmd in CLAWABLE_SURFACES: if cmd not in subcommand_flags: missing.append(f'{cmd}: not registered in parser') elif '--output-format' not in subcommand_flags[cmd]: missing.append(f'{cmd}: missing --output-format flag') assert not missing, ( 'Clawable-surface parity violation. Every command in ' 'CLAWABLE_SURFACES must accept --output-format. Failures:\n' + '\n'.join(f' - {m}' for m in missing) ) @pytest.mark.parametrize('cmd_name', sorted(CLAWABLE_SURFACES)) def test_clawable_surface_output_format_choices(self, cmd_name: str) -> None: """Every clawable surface must accept exactly {text, json} choices.""" parser = build_parser() for action in parser._actions: if not hasattr(action, 'choices') or not action.choices: continue if action.dest != 'command': continue if cmd_name not in action.choices: continue subp = action.choices[cmd_name] for a in subp._actions: if '--output-format' in a.option_strings: assert a.choices == ['text', 'json'], ( f'{cmd_name}: --output-format choices are {a.choices}, ' f'expected [text, json]' ) assert a.default == 'text', ( f'{cmd_name}: --output-format default is {a.default!r}, ' f'expected \'text\' for backward compat' ) return pytest.fail(f'{cmd_name}: no --output-format flag found') class TestCommandClassificationCoverage: """Every registered subcommand must be classified as either CLAWABLE or OPT_OUT. If a new command is added to the parser but forgotten in both sets, this test fails loudly — forcing an explicit classification decision. """ def test_every_registered_command_is_classified(self) -> None: subcommand_flags = _discover_subcommands_and_flags() all_classified = CLAWABLE_SURFACES | OPT_OUT_SURFACES unclassified = set(subcommand_flags.keys()) - all_classified assert not unclassified, ( 'Unclassified subcommands detected. Every new command must be ' 'explicitly added to either CLAWABLE_SURFACES (must accept ' '--output-format) or OPT_OUT_SURFACES (explicitly exempt with ' 'rationale). Unclassified:\n' + '\n'.join(f' - {cmd}' for cmd in sorted(unclassified)) ) def test_no_command_in_both_sets(self) -> None: """Sanity: a command cannot be both clawable AND opt-out.""" overlap = CLAWABLE_SURFACES & OPT_OUT_SURFACES assert not overlap, ( f'Classification conflict: commands appear in both sets: {overlap}' ) def test_all_classified_commands_actually_exist(self) -> None: """No typos — every command in our sets must actually be registered.""" subcommand_flags = _discover_subcommands_and_flags() ghosts = (CLAWABLE_SURFACES | OPT_OUT_SURFACES) - set(subcommand_flags.keys()) assert not ghosts, ( f'Phantom commands in classification sets (not in parser): {ghosts}. ' 'Update CLAWABLE_SURFACES / OPT_OUT_SURFACES if commands were removed.' ) class TestJsonOutputContractEndToEnd: """Verify the contract AT RUNTIME — not just parser-level, but actual execution. Each clawable command must, when invoked with --output-format json, produce parseable JSON on stdout (for success cases). """ # Minimal invocation args for each clawable command (to hit success path) RUNTIME_INVOCATIONS = { 'list-sessions': [], # delete-session/load-session: skip (need state setup, covered by dedicated tests) 'show-command': ['add-dir'], 'show-tool': ['BashTool'], 'exec-command': ['add-dir', 'hi'], 'exec-tool': ['BashTool', '{}'], 'route': ['review'], 'bootstrap': ['hello'], 'command-graph': [], 'tool-pool': [], 'bootstrap-graph': [], # flush-transcript: skip (creates files, covered by dedicated tests) } @pytest.mark.parametrize('cmd_name,cmd_args', sorted(RUNTIME_INVOCATIONS.items())) def test_command_emits_parseable_json(self, cmd_name: str, cmd_args: list[str]) -> None: """End-to-end: invoking with --output-format json yields valid JSON.""" import json result = subprocess.run( [sys.executable, '-m', 'src.main', cmd_name, *cmd_args, '--output-format', 'json'], cwd=Path(__file__).resolve().parent.parent, capture_output=True, text=True, ) # Accept exit 0 (success) or 1 (typed not-found) — both must still produce JSON assert result.returncode in (0, 1), ( f'{cmd_name}: unexpected exit {result.returncode}\n' f'stderr: {result.stderr}\n' f'stdout: {result.stdout[:200]}' ) try: json.loads(result.stdout) except json.JSONDecodeError as e: pytest.fail( f'{cmd_name} {cmd_args} --output-format json did not produce ' f'parseable JSON: {e}\nOutput: {result.stdout[:200]}' ) class TestOptOutSurfaceRejection: """Cycle #30: OPT_OUT surfaces must REJECT --output-format, not silently accept. OPT_OUT_AUDIT.md classifies 12 surfaces as intentionally exempt from the JSON envelope contract. This test LOCKS that rejection so accidental drift (e.g., a developer adds --output-format to summary without thinking) doesn't silently promote an OPT_OUT surface to CLAWABLE. Relationship to existing tests: - test_clawable_surface_has_output_format: asserts CLAWABLE surfaces accept it - TestOptOutSurfaceRejection: asserts OPT_OUT surfaces REJECT it Together, these two test classes form a complete parity check: every surface is either IN or OUT, and both cases are explicitly tested. If an OPT_OUT surface is promoted to CLAWABLE intentionally: 1. Move it from OPT_OUT_SURFACES to CLAWABLE_SURFACES 2. Update OPT_OUT_AUDIT.md with promotion rationale 3. Remove from this test's expected rejections 4. Both sets of tests continue passing """ @pytest.mark.parametrize('cmd_name', sorted(OPT_OUT_SURFACES)) def test_opt_out_surface_rejects_output_format(self, cmd_name: str) -> None: """OPT_OUT surfaces must NOT accept --output-format flag. Passing --output-format to an OPT_OUT surface should produce an 'unrecognized arguments' error from argparse. """ result = subprocess.run( [sys.executable, '-m', 'src.main', cmd_name, '--output-format', 'json'], cwd=Path(__file__).resolve().parent.parent, capture_output=True, text=True, ) # Should fail — argparse exit 2 in text mode, exit 1 in JSON mode # (both modes normalize to "unrecognized arguments" message) assert result.returncode != 0, ( f'{cmd_name} unexpectedly accepted --output-format json. ' f'If this is intentional (promotion to CLAWABLE), move from ' f'OPT_OUT_SURFACES to CLAWABLE_SURFACES and update OPT_OUT_AUDIT.md. ' f'Output: {result.stdout[:200]}\nStderr: {result.stderr[:200]}' ) # Verify the error is specifically about --output-format error_text = result.stdout + result.stderr assert '--output-format' in error_text or 'unrecognized' in error_text, ( f'{cmd_name} failed but error not about --output-format. ' f'Something else is broken:\n' f'stdout: {result.stdout[:300]}\nstderr: {result.stderr[:300]}' ) def test_opt_out_set_matches_audit_document(self) -> None: """OPT_OUT_SURFACES constant must exactly match OPT_OUT_AUDIT.md listing. This test reads OPT_OUT_AUDIT.md and verifies the constant doesn't drift from the documentation. """ audit_path = Path(__file__).resolve().parent.parent / 'OPT_OUT_AUDIT.md' audit_text = audit_path.read_text() # Expected 12 surfaces per audit doc expected_surfaces = { # Group A: Rich-Markdown Reports (4) 'summary', 'manifest', 'parity-audit', 'setup-report', # Group B: List Commands (3) 'subsystems', 'commands', 'tools', # Group C: Simulation/Debug (5) 'remote-mode', 'ssh-mode', 'teleport-mode', 'direct-connect-mode', 'deep-link-mode', } assert OPT_OUT_SURFACES == expected_surfaces, ( f'OPT_OUT_SURFACES drift from expected 12 surfaces per audit:\n' f' Expected: {sorted(expected_surfaces)}\n' f' Actual: {sorted(OPT_OUT_SURFACES)}' ) # Each surface should be mentioned in audit doc missing_from_audit = [s for s in OPT_OUT_SURFACES if s not in audit_text] assert not missing_from_audit, ( f'OPT_OUT surfaces not mentioned in OPT_OUT_AUDIT.md: {missing_from_audit}' ) def test_opt_out_count_matches_declared(self) -> None: """OPT_OUT_AUDIT.md declares '12 surfaces'. Constant must match.""" assert len(OPT_OUT_SURFACES) == 12, ( f'OPT_OUT_SURFACES has {len(OPT_OUT_SURFACES)} items, ' f'but OPT_OUT_AUDIT.md declares 12 total surfaces. ' f'Update either the audit doc or the constant.' )