fix(clv2): serialize observer signal-counter to stop dropped increments (#2372)

observe.sh bumps the SIGUSR1 throttle counter in
${PROJECT_DIR}/.observer-signal-counter with an unlocked read-modify-write.
The hook runs on every tool call, so concurrent invocations read the same
value, both increment, and lose a write, signaling the observer at
unpredictable intervals and defeating the #521 throttle.

Serialize the read-modify-write under a lock, and only ever bump the counter
while that lock is held:

- Prefer flock with a bounded -w wait (the OS auto-releases it when the fd
  closes or the process dies, so there is no stale lock and no lost increment);
  on a timeout the tick is skipped rather than bumped unlocked.
- Fall back to an atomic mkdir lock on platforms without flock, with a bounded
  spin. An EXIT trap cleans up on normal completion; INT/TERM traps release the
  lock and exit, so a signal cannot drop the lock and then continue the
  read-modify-write without ownership. If the lock cannot be acquired in the
  budget the tick is skipped rather than raced. No hand-rolled PID stale-reclaim
  (which is racy and can delete a live re-acquirer's lock).
- Guard the counter read against a corrupt (non-integer) file that would abort
  the hook under set -e.

Add tests/hooks/observe-signal-counter-race.test.js: 20 concurrent observe.sh
invocations must not lose increments (exact under flock; at most one dropped on
the best-effort mkdir fallback), the runner rejects on any hook execution
failure or hang, plus content guards for the lock and the corrupt-counter
handling.

Fixes #2296
This commit is contained in:
Gaurav Dubey 2026-06-30 07:13:23 +05:30 committed by GitHub
parent f12b106c3c
commit a89b32c2b5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 340 additions and 8 deletions

View File

@ -477,13 +477,28 @@ fi
# which caused runaway parallel Claude analysis processes. # which caused runaway parallel Claude analysis processes.
SIGNAL_EVERY_N="${ECC_OBSERVER_SIGNAL_EVERY_N:-20}" SIGNAL_EVERY_N="${ECC_OBSERVER_SIGNAL_EVERY_N:-20}"
SIGNAL_COUNTER_FILE="${PROJECT_DIR}/.observer-signal-counter" SIGNAL_COUNTER_FILE="${PROJECT_DIR}/.observer-signal-counter"
SIGNAL_COUNTER_LOCK="${SIGNAL_COUNTER_FILE}.lock"
ACTIVITY_FILE="${PROJECT_DIR}/.observer-last-activity" ACTIVITY_FILE="${PROJECT_DIR}/.observer-last-activity"
touch "$ACTIVITY_FILE" 2>/dev/null || true touch "$ACTIVITY_FILE" 2>/dev/null || true
# Serialize the throttle-counter read-modify-write. observe.sh runs on every
# tool call (which can fire every second), so concurrent invocations previously
# raced on this counter: both read the same value, both incremented, and one
# write was lost, signaling the observer at unpredictable intervals (#2296).
# Prefer flock (a kernel advisory lock the OS releases automatically if the hook
# is killed); fall back to the atomic mkdir lock this script already uses for
# the lazy-start path above. Both wrap the same read-modify-write below.
should_signal=0 should_signal=0
_ecc_bump_signal_counter() {
if [ -f "$SIGNAL_COUNTER_FILE" ]; then if [ -f "$SIGNAL_COUNTER_FILE" ]; then
counter=$(cat "$SIGNAL_COUNTER_FILE" 2>/dev/null || echo 0) counter=$(cat "$SIGNAL_COUNTER_FILE" 2>/dev/null || echo 0)
# Guard against a corrupt counter file: a non-integer value would abort the
# hook under `set -e` at the arithmetic below.
case "$counter" in
''|*[!0-9]*) counter=0 ;;
esac
counter=$((counter + 1)) counter=$((counter + 1))
if [ "$counter" -ge "$SIGNAL_EVERY_N" ]; then if [ "$counter" -ge "$SIGNAL_EVERY_N" ]; then
should_signal=1 should_signal=1
@ -493,6 +508,52 @@ if [ -f "$SIGNAL_COUNTER_FILE" ]; then
else else
echo "1" > "$SIGNAL_COUNTER_FILE" echo "1" > "$SIGNAL_COUNTER_FILE"
fi fi
}
if command -v flock >/dev/null 2>&1 && exec 8>"$SIGNAL_COUNTER_LOCK" 2>/dev/null; then
# flock is auto-released when fd 8 closes or the process dies, so there is no
# stale lock and no lost increment. Use a bounded -w wait so the hook never
# blocks indefinitely, and only bump the counter while the lock is held -- on
# a timeout we skip the tick rather than doing an unlocked read-modify-write.
if flock -w 2 8 2>/dev/null; then
_ecc_bump_signal_counter
flock -u 8 2>/dev/null || true
fi
exec 8>&- 2>/dev/null || true
else
# No flock (e.g. macOS): atomic mkdir lock with a bounded spin so the hook
# never blocks indefinitely. A trap releases the lock on every exit path --
# including the async-timeout SIGTERM -- so a killed hook does not strand the
# directory. We deliberately do NOT hand-roll PID-based stale reclaim:
# re-verifying then removing another process's lock is racy and can delete a
# live re-acquirer's directory, reintroducing the very race this fixes.
_signal_lock_held=0
_signal_lock_spins=0
while [ "$_signal_lock_spins" -lt 100 ]; do
if mkdir "$SIGNAL_COUNTER_LOCK" 2>/dev/null; then
# EXIT cleans up on normal completion. INT/TERM must release AND exit:
# a signal trap that only released the lock would otherwise fall through
# and continue the read-modify-write without ownership.
trap 'rmdir "$SIGNAL_COUNTER_LOCK" 2>/dev/null || true' EXIT
trap 'rmdir "$SIGNAL_COUNTER_LOCK" 2>/dev/null || true; exit 130' INT
trap 'rmdir "$SIGNAL_COUNTER_LOCK" 2>/dev/null || true; exit 143' TERM
_signal_lock_held=1
break
fi
_signal_lock_spins=$((_signal_lock_spins + 1))
sleep 0.02
done
if [ "$_signal_lock_held" -eq 1 ]; then
# Bump only under the held lock -- never an unlocked read-modify-write.
_ecc_bump_signal_counter
rmdir "$SIGNAL_COUNTER_LOCK" 2>/dev/null || true
trap - EXIT INT TERM
fi
# If the lock could not be acquired within the spin budget we skip this tick
# rather than racing on an unlocked counter. Dropping one throttle tick under
# extreme contention only delays the next observer signal slightly; it never
# corrupts the counter or signals spuriously.
fi
# Signal observer if running and throttle allows (check both project-scoped and global observer, deduplicate) # Signal observer if running and throttle allows (check both project-scoped and global observer, deduplicate)
if [ "$should_signal" -eq 1 ]; then if [ "$should_signal" -eq 1 ]; then

View File

@ -0,0 +1,271 @@
/**
* Regression tests for the SIGUSR1 throttle-counter race in observe.sh (#2296)
*
* observe.sh runs on every tool call and bumps a throttle counter in
* ${PROJECT_DIR}/.observer-signal-counter so the observer is signaled only
* every N observations (#521). The bump used a plain read-modify-write with no
* locking, so concurrent hook invocations could read the same value, both
* increment, and lose a write the observer then fired at unpredictable
* intervals. The fix serializes the read-modify-write with an atomic mkdir
* lock.
*
* These tests drive the real observe.sh (reusing the stub harness from
* observer-memory.test.js) and assert the lock's invariant: with the reset
* threshold set high enough that no reset fires, the final counter must equal
* the number of invocations i.e. no increment is ever lost, even under heavy
* concurrency.
*
* Run with: node tests/hooks/observe-signal-counter-race.test.js
*/
const assert = require('assert');
const path = require('path');
const fs = require('fs');
const os = require('os');
const { spawn, spawnSync } = require('child_process');
let passed = 0;
let failed = 0;
function test(name, fn) {
try {
fn();
console.log(`${name}`);
passed++;
} catch (err) {
console.log(`${name}`);
console.log(` Error: ${err.message}`);
failed++;
}
}
async function asyncTest(name, fn) {
try {
await fn();
console.log(`${name}`);
passed++;
} catch (err) {
console.log(`${name}`);
console.log(` Error: ${err.message}`);
failed++;
}
}
function createTempDir() {
return fs.mkdtempSync(path.join(os.tmpdir(), 'ecc-signal-race-'));
}
function cleanupDir(dir) {
try {
fs.rmSync(dir, { recursive: true, force: true });
} catch {
// ignore cleanup errors
}
}
const repoRoot = path.resolve(__dirname, '..', '..');
const observeShPath = path.join(repoRoot, 'skills', 'continuous-learning-v2', 'hooks', 'observe.sh');
const isWindows = process.platform === 'win32';
const hasPython = !isWindows && spawnSync('python3', ['--version']).status === 0;
// When the runner has flock the lock is exact (blocking, kernel auto-release);
// without it observe.sh uses a best-effort mkdir spin that may drop at most one
// increment under pathological contention.
const hasFlock = !isWindows && spawnSync('bash', ['-c', 'command -v flock']).status === 0;
// Build a self-contained observe.sh sandbox (stub detect-project.sh +
// homunculus-dir.sh, SKILL_ROOT patched to the sandbox) and return its paths.
function buildSandbox() {
const testDir = createTempDir();
const projectDir = path.join(testDir, 'project');
fs.mkdirSync(projectDir, { recursive: true });
const skillRoot = path.join(testDir, 'skill');
const scriptsDir = path.join(skillRoot, 'scripts');
const scriptsLibDir = path.join(scriptsDir, 'lib');
const hooksDir = path.join(skillRoot, 'hooks');
fs.mkdirSync(scriptsDir, { recursive: true });
fs.mkdirSync(scriptsLibDir, { recursive: true });
fs.mkdirSync(hooksDir, { recursive: true });
fs.writeFileSync(
path.join(scriptsDir, 'detect-project.sh'),
[
'#!/bin/bash',
'PROJECT_ID="test-project"',
'PROJECT_NAME="test-project"',
`PROJECT_ROOT="${projectDir}"`,
`PROJECT_DIR="${projectDir}"`,
'CLV2_PYTHON_CMD="python3"',
''
].join('\n')
);
fs.writeFileSync(
path.join(scriptsLibDir, 'homunculus-dir.sh'),
[
'#!/bin/bash',
'_ecc_resolve_homunculus_dir() { printf "%s\\n" "$HOME/.local/share/ecc-homunculus"; }',
''
].join('\n')
);
let observeContent = fs.readFileSync(observeShPath, 'utf8');
const skillRootMarker = 'SKILL_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"';
// Fail fast if observe.sh's SKILL_ROOT definition drifts; otherwise the
// no-op replace would leave the sandbox pointing at the real skill tree and
// the test could pass spuriously.
assert.ok(
observeContent.includes(skillRootMarker),
'observe.sh SKILL_ROOT definition changed; update the sandbox rewrite'
);
observeContent = observeContent.replace(
skillRootMarker,
`SKILL_ROOT="${skillRoot}"`
);
const testObserve = path.join(hooksDir, 'observe.sh');
fs.writeFileSync(testObserve, observeContent, { mode: 0o755 });
return { testDir, projectDir, testObserve };
}
// Run observe.sh once against the sandbox. Resolves when the process exits.
function runObserve(testObserve, projectDir) {
const input = JSON.stringify({
tool_name: 'Read',
tool_input: { file_path: '/tmp/test.txt' },
session_id: 'test-session',
cwd: projectDir
});
return new Promise((resolve, reject) => {
const child = spawn('bash', [testObserve, 'post'], {
env: {
...process.env,
HOME: projectDir,
CLAUDE_CODE_ENTRYPOINT: 'cli',
ECC_HOOK_PROFILE: 'standard',
ECC_SKIP_OBSERVE: '0',
CLAUDE_PROJECT_DIR: projectDir,
// Reset threshold far above the invocation count, so no reset fires and
// the final counter equals the number of invocations.
ECC_OBSERVER_SIGNAL_EVERY_N: '100000'
},
stdio: ['pipe', 'ignore', 'pipe']
});
let stderr = '';
// Fail the test on a hung hook rather than waiting forever.
const timer = setTimeout(() => {
child.kill('SIGKILL');
reject(new Error('observe.sh timed out'));
}, 20000);
child.stderr.on('data', (chunk) => { stderr += chunk; });
// A broken observe.sh must fail the test, not be silently swallowed.
child.on('close', (code, signal) => {
clearTimeout(timer);
if (code === 0 && signal === null) {
resolve();
} else {
reject(new Error(`observe.sh failed code=${code} signal=${signal}: ${stderr.trim()}`));
}
});
child.on('error', (err) => {
clearTimeout(timer);
reject(err);
});
child.stdin.end(input);
});
}
function readCounter(projectDir) {
const counterFile = path.join(projectDir, '.observer-signal-counter');
if (!fs.existsSync(counterFile)) {
return null;
}
return parseInt(fs.readFileSync(counterFile, 'utf8').trim(), 10);
}
console.log('\n=== observe.sh signal-counter race regression (#2296) ===\n');
test('observe.sh uses a lock around the throttle-counter update', () => {
const content = fs.readFileSync(observeShPath, 'utf8');
assert.ok(
content.includes('SIGNAL_COUNTER_LOCK'),
'observe.sh should define a lock for the signal counter'
);
assert.ok(
/flock 8\b/.test(content) || /mkdir "\$SIGNAL_COUNTER_LOCK"/.test(content),
'observe.sh should acquire the counter lock via flock or an atomic mkdir'
);
});
test('observe.sh guards against a corrupt (non-integer) counter file', () => {
const content = fs.readFileSync(observeShPath, 'utf8');
assert.ok(
/''\|\*\[!0-9\]\*\) counter=0/.test(content),
'observe.sh should reset a non-integer counter to 0 before incrementing'
);
});
async function runSequential() {
const { testDir, projectDir, testObserve } = buildSandbox();
try {
const N = 5;
for (let i = 0; i < N; i++) {
await runObserve(testObserve, projectDir);
}
const counter = readCounter(projectDir);
assert.notStrictEqual(counter, null, 'counter file should exist after runs');
assert.strictEqual(counter, N, `sequential counter should be ${N}, got ${counter}`);
} finally {
cleanupDir(testDir);
}
}
async function runConcurrent() {
const { testDir, projectDir, testObserve } = buildSandbox();
try {
const K = 20;
// Spawn all K before awaiting any, so they genuinely contend on the counter.
const runs = [];
for (let i = 0; i < K; i++) {
runs.push(runObserve(testObserve, projectDir));
}
await Promise.all(runs);
const counter = readCounter(projectDir);
assert.notStrictEqual(counter, null, 'counter file should exist after concurrent runs');
if (hasFlock) {
// flock serializes every invocation, so no increment is ever lost. The
// pre-fix unlocked code drops increments under this same contention.
assert.strictEqual(
counter,
K,
`with flock the counter must be exactly ${K}, got ${counter}`
);
} else {
// mkdir fallback is best-effort: it may drop at most one increment if its
// bounded spin is exhausted, but never the multi-increment loss the
// unlocked code exhibited.
assert.ok(
counter >= K - 1,
`mkdir fallback should keep the counter >= ${K - 1}, got ${counter}`
);
}
} finally {
cleanupDir(testDir);
}
}
(async () => {
if (!isWindows && hasPython) {
await asyncTest('sequential invocations increment the counter exactly once each', runSequential);
await asyncTest('concurrent invocations never lose a counter increment', runConcurrent);
} else {
console.log(' - skipping shell-execution tests (requires non-Windows + python3)');
}
console.log('\n=== Test Results ===');
console.log(`Passed: ${passed}`);
console.log(`Failed: ${failed}`);
console.log(`Total: ${passed + failed}`);
process.exit(failed > 0 ? 1 : 0);
})();