diff --git a/skills/continuous-learning-v2/hooks/observe.sh b/skills/continuous-learning-v2/hooks/observe.sh index 4fc51458..65d2f1d2 100755 --- a/skills/continuous-learning-v2/hooks/observe.sh +++ b/skills/continuous-learning-v2/hooks/observe.sh @@ -477,21 +477,82 @@ fi # which caused runaway parallel Claude analysis processes. SIGNAL_EVERY_N="${ECC_OBSERVER_SIGNAL_EVERY_N:-20}" SIGNAL_COUNTER_FILE="${PROJECT_DIR}/.observer-signal-counter" +SIGNAL_COUNTER_LOCK="${SIGNAL_COUNTER_FILE}.lock" ACTIVITY_FILE="${PROJECT_DIR}/.observer-last-activity" touch "$ACTIVITY_FILE" 2>/dev/null || true +# Serialize the throttle-counter read-modify-write. observe.sh runs on every +# tool call (which can fire every second), so concurrent invocations previously +# raced on this counter: both read the same value, both incremented, and one +# write was lost, signaling the observer at unpredictable intervals (#2296). +# Prefer flock (a kernel advisory lock the OS releases automatically if the hook +# is killed); fall back to the atomic mkdir lock this script already uses for +# the lazy-start path above. Both wrap the same read-modify-write below. should_signal=0 -if [ -f "$SIGNAL_COUNTER_FILE" ]; then - counter=$(cat "$SIGNAL_COUNTER_FILE" 2>/dev/null || echo 0) - counter=$((counter + 1)) - if [ "$counter" -ge "$SIGNAL_EVERY_N" ]; then - should_signal=1 - counter=0 + +_ecc_bump_signal_counter() { + if [ -f "$SIGNAL_COUNTER_FILE" ]; then + counter=$(cat "$SIGNAL_COUNTER_FILE" 2>/dev/null || echo 0) + # Guard against a corrupt counter file: a non-integer value would abort the + # hook under `set -e` at the arithmetic below. + case "$counter" in + ''|*[!0-9]*) counter=0 ;; + esac + counter=$((counter + 1)) + if [ "$counter" -ge "$SIGNAL_EVERY_N" ]; then + should_signal=1 + counter=0 + fi + echo "$counter" > "$SIGNAL_COUNTER_FILE" + else + echo "1" > "$SIGNAL_COUNTER_FILE" fi - echo "$counter" > "$SIGNAL_COUNTER_FILE" +} + +if command -v flock >/dev/null 2>&1 && exec 8>"$SIGNAL_COUNTER_LOCK" 2>/dev/null; then + # flock is auto-released when fd 8 closes or the process dies, so there is no + # stale lock and no lost increment. Use a bounded -w wait so the hook never + # blocks indefinitely, and only bump the counter while the lock is held -- on + # a timeout we skip the tick rather than doing an unlocked read-modify-write. + if flock -w 2 8 2>/dev/null; then + _ecc_bump_signal_counter + flock -u 8 2>/dev/null || true + fi + exec 8>&- 2>/dev/null || true else - echo "1" > "$SIGNAL_COUNTER_FILE" + # No flock (e.g. macOS): atomic mkdir lock with a bounded spin so the hook + # never blocks indefinitely. A trap releases the lock on every exit path -- + # including the async-timeout SIGTERM -- so a killed hook does not strand the + # directory. We deliberately do NOT hand-roll PID-based stale reclaim: + # re-verifying then removing another process's lock is racy and can delete a + # live re-acquirer's directory, reintroducing the very race this fixes. + _signal_lock_held=0 + _signal_lock_spins=0 + while [ "$_signal_lock_spins" -lt 100 ]; do + if mkdir "$SIGNAL_COUNTER_LOCK" 2>/dev/null; then + # EXIT cleans up on normal completion. INT/TERM must release AND exit: + # a signal trap that only released the lock would otherwise fall through + # and continue the read-modify-write without ownership. + trap 'rmdir "$SIGNAL_COUNTER_LOCK" 2>/dev/null || true' EXIT + trap 'rmdir "$SIGNAL_COUNTER_LOCK" 2>/dev/null || true; exit 130' INT + trap 'rmdir "$SIGNAL_COUNTER_LOCK" 2>/dev/null || true; exit 143' TERM + _signal_lock_held=1 + break + fi + _signal_lock_spins=$((_signal_lock_spins + 1)) + sleep 0.02 + done + if [ "$_signal_lock_held" -eq 1 ]; then + # Bump only under the held lock -- never an unlocked read-modify-write. + _ecc_bump_signal_counter + rmdir "$SIGNAL_COUNTER_LOCK" 2>/dev/null || true + trap - EXIT INT TERM + fi + # If the lock could not be acquired within the spin budget we skip this tick + # rather than racing on an unlocked counter. Dropping one throttle tick under + # extreme contention only delays the next observer signal slightly; it never + # corrupts the counter or signals spuriously. fi # Signal observer if running and throttle allows (check both project-scoped and global observer, deduplicate) diff --git a/tests/hooks/observe-signal-counter-race.test.js b/tests/hooks/observe-signal-counter-race.test.js new file mode 100644 index 00000000..2fb978aa --- /dev/null +++ b/tests/hooks/observe-signal-counter-race.test.js @@ -0,0 +1,271 @@ +/** + * Regression tests for the SIGUSR1 throttle-counter race in observe.sh (#2296) + * + * observe.sh runs on every tool call and bumps a throttle counter in + * ${PROJECT_DIR}/.observer-signal-counter so the observer is signaled only + * every N observations (#521). The bump used a plain read-modify-write with no + * locking, so concurrent hook invocations could read the same value, both + * increment, and lose a write — the observer then fired at unpredictable + * intervals. The fix serializes the read-modify-write with an atomic mkdir + * lock. + * + * These tests drive the real observe.sh (reusing the stub harness from + * observer-memory.test.js) and assert the lock's invariant: with the reset + * threshold set high enough that no reset fires, the final counter must equal + * the number of invocations — i.e. no increment is ever lost, even under heavy + * concurrency. + * + * Run with: node tests/hooks/observe-signal-counter-race.test.js + */ + +const assert = require('assert'); +const path = require('path'); +const fs = require('fs'); +const os = require('os'); +const { spawn, spawnSync } = require('child_process'); + +let passed = 0; +let failed = 0; + +function test(name, fn) { + try { + fn(); + console.log(` ✓ ${name}`); + passed++; + } catch (err) { + console.log(` ✗ ${name}`); + console.log(` Error: ${err.message}`); + failed++; + } +} + +async function asyncTest(name, fn) { + try { + await fn(); + console.log(` ✓ ${name}`); + passed++; + } catch (err) { + console.log(` ✗ ${name}`); + console.log(` Error: ${err.message}`); + failed++; + } +} + +function createTempDir() { + return fs.mkdtempSync(path.join(os.tmpdir(), 'ecc-signal-race-')); +} + +function cleanupDir(dir) { + try { + fs.rmSync(dir, { recursive: true, force: true }); + } catch { + // ignore cleanup errors + } +} + +const repoRoot = path.resolve(__dirname, '..', '..'); +const observeShPath = path.join(repoRoot, 'skills', 'continuous-learning-v2', 'hooks', 'observe.sh'); + +const isWindows = process.platform === 'win32'; +const hasPython = !isWindows && spawnSync('python3', ['--version']).status === 0; +// When the runner has flock the lock is exact (blocking, kernel auto-release); +// without it observe.sh uses a best-effort mkdir spin that may drop at most one +// increment under pathological contention. +const hasFlock = !isWindows && spawnSync('bash', ['-c', 'command -v flock']).status === 0; + +// Build a self-contained observe.sh sandbox (stub detect-project.sh + +// homunculus-dir.sh, SKILL_ROOT patched to the sandbox) and return its paths. +function buildSandbox() { + const testDir = createTempDir(); + const projectDir = path.join(testDir, 'project'); + fs.mkdirSync(projectDir, { recursive: true }); + + const skillRoot = path.join(testDir, 'skill'); + const scriptsDir = path.join(skillRoot, 'scripts'); + const scriptsLibDir = path.join(scriptsDir, 'lib'); + const hooksDir = path.join(skillRoot, 'hooks'); + fs.mkdirSync(scriptsDir, { recursive: true }); + fs.mkdirSync(scriptsLibDir, { recursive: true }); + fs.mkdirSync(hooksDir, { recursive: true }); + + fs.writeFileSync( + path.join(scriptsDir, 'detect-project.sh'), + [ + '#!/bin/bash', + 'PROJECT_ID="test-project"', + 'PROJECT_NAME="test-project"', + `PROJECT_ROOT="${projectDir}"`, + `PROJECT_DIR="${projectDir}"`, + 'CLV2_PYTHON_CMD="python3"', + '' + ].join('\n') + ); + fs.writeFileSync( + path.join(scriptsLibDir, 'homunculus-dir.sh'), + [ + '#!/bin/bash', + '_ecc_resolve_homunculus_dir() { printf "%s\\n" "$HOME/.local/share/ecc-homunculus"; }', + '' + ].join('\n') + ); + + let observeContent = fs.readFileSync(observeShPath, 'utf8'); + const skillRootMarker = 'SKILL_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"'; + // Fail fast if observe.sh's SKILL_ROOT definition drifts; otherwise the + // no-op replace would leave the sandbox pointing at the real skill tree and + // the test could pass spuriously. + assert.ok( + observeContent.includes(skillRootMarker), + 'observe.sh SKILL_ROOT definition changed; update the sandbox rewrite' + ); + observeContent = observeContent.replace( + skillRootMarker, + `SKILL_ROOT="${skillRoot}"` + ); + const testObserve = path.join(hooksDir, 'observe.sh'); + fs.writeFileSync(testObserve, observeContent, { mode: 0o755 }); + + return { testDir, projectDir, testObserve }; +} + +// Run observe.sh once against the sandbox. Resolves when the process exits. +function runObserve(testObserve, projectDir) { + const input = JSON.stringify({ + tool_name: 'Read', + tool_input: { file_path: '/tmp/test.txt' }, + session_id: 'test-session', + cwd: projectDir + }); + return new Promise((resolve, reject) => { + const child = spawn('bash', [testObserve, 'post'], { + env: { + ...process.env, + HOME: projectDir, + CLAUDE_CODE_ENTRYPOINT: 'cli', + ECC_HOOK_PROFILE: 'standard', + ECC_SKIP_OBSERVE: '0', + CLAUDE_PROJECT_DIR: projectDir, + // Reset threshold far above the invocation count, so no reset fires and + // the final counter equals the number of invocations. + ECC_OBSERVER_SIGNAL_EVERY_N: '100000' + }, + stdio: ['pipe', 'ignore', 'pipe'] + }); + let stderr = ''; + // Fail the test on a hung hook rather than waiting forever. + const timer = setTimeout(() => { + child.kill('SIGKILL'); + reject(new Error('observe.sh timed out')); + }, 20000); + child.stderr.on('data', (chunk) => { stderr += chunk; }); + // A broken observe.sh must fail the test, not be silently swallowed. + child.on('close', (code, signal) => { + clearTimeout(timer); + if (code === 0 && signal === null) { + resolve(); + } else { + reject(new Error(`observe.sh failed code=${code} signal=${signal}: ${stderr.trim()}`)); + } + }); + child.on('error', (err) => { + clearTimeout(timer); + reject(err); + }); + child.stdin.end(input); + }); +} + +function readCounter(projectDir) { + const counterFile = path.join(projectDir, '.observer-signal-counter'); + if (!fs.existsSync(counterFile)) { + return null; + } + return parseInt(fs.readFileSync(counterFile, 'utf8').trim(), 10); +} + +console.log('\n=== observe.sh signal-counter race regression (#2296) ===\n'); + +test('observe.sh uses a lock around the throttle-counter update', () => { + const content = fs.readFileSync(observeShPath, 'utf8'); + assert.ok( + content.includes('SIGNAL_COUNTER_LOCK'), + 'observe.sh should define a lock for the signal counter' + ); + assert.ok( + /flock 8\b/.test(content) || /mkdir "\$SIGNAL_COUNTER_LOCK"/.test(content), + 'observe.sh should acquire the counter lock via flock or an atomic mkdir' + ); +}); + +test('observe.sh guards against a corrupt (non-integer) counter file', () => { + const content = fs.readFileSync(observeShPath, 'utf8'); + assert.ok( + /''\|\*\[!0-9\]\*\) counter=0/.test(content), + 'observe.sh should reset a non-integer counter to 0 before incrementing' + ); +}); + +async function runSequential() { + const { testDir, projectDir, testObserve } = buildSandbox(); + try { + const N = 5; + for (let i = 0; i < N; i++) { + await runObserve(testObserve, projectDir); + } + const counter = readCounter(projectDir); + assert.notStrictEqual(counter, null, 'counter file should exist after runs'); + assert.strictEqual(counter, N, `sequential counter should be ${N}, got ${counter}`); + } finally { + cleanupDir(testDir); + } +} + +async function runConcurrent() { + const { testDir, projectDir, testObserve } = buildSandbox(); + try { + const K = 20; + // Spawn all K before awaiting any, so they genuinely contend on the counter. + const runs = []; + for (let i = 0; i < K; i++) { + runs.push(runObserve(testObserve, projectDir)); + } + await Promise.all(runs); + const counter = readCounter(projectDir); + assert.notStrictEqual(counter, null, 'counter file should exist after concurrent runs'); + if (hasFlock) { + // flock serializes every invocation, so no increment is ever lost. The + // pre-fix unlocked code drops increments under this same contention. + assert.strictEqual( + counter, + K, + `with flock the counter must be exactly ${K}, got ${counter}` + ); + } else { + // mkdir fallback is best-effort: it may drop at most one increment if its + // bounded spin is exhausted, but never the multi-increment loss the + // unlocked code exhibited. + assert.ok( + counter >= K - 1, + `mkdir fallback should keep the counter >= ${K - 1}, got ${counter}` + ); + } + } finally { + cleanupDir(testDir); + } +} + +(async () => { + if (!isWindows && hasPython) { + await asyncTest('sequential invocations increment the counter exactly once each', runSequential); + await asyncTest('concurrent invocations never lose a counter increment', runConcurrent); + } else { + console.log(' - skipping shell-execution tests (requires non-Windows + python3)'); + } + + console.log('\n=== Test Results ==='); + console.log(`Passed: ${passed}`); + console.log(`Failed: ${failed}`); + console.log(`Total: ${passed + failed}`); + + process.exit(failed > 0 ? 1 : 0); +})();