From 51184b692e9db1a0bb6b89b4d46bf8a074845328 Mon Sep 17 00:00:00 2001 From: Affaan Mustafa Date: Thu, 18 Jun 2026 16:34:11 -0400 Subject: [PATCH] fix(continuous-learning): eliminate _SECRET_RE catastrophic backtracking + orphaned-CPU hang (#2278) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The observe hook's secret-scrub regex used a generic ([A-Za-z]+\s+)? group that overlapped the separator and value classes, causing exponential backtracking on identifier-dense tool I/O — an orphaned python child then pegged a core at ~100% CPU for days because the async hook timed out without killing it. - Rewrite _SECRET_RE as a linear matcher: bounded separator {1,8}, a fixed set of auth schemes (bearer|basic|token|bot) instead of [A-Za-z]+, and a bounded value {8,256}. Pathological input drops from hang to <1ms; real secrets still redact (verified incl. 'Bearer '). - Add a signal.alarm(8) self-timeout to both scrub blocks so any runaway child self-terminates before the 10s async-hook timeout can orphan it. --- .../continuous-learning-v2/hooks/observe.sh | 36 +++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/skills/continuous-learning-v2/hooks/observe.sh b/skills/continuous-learning-v2/hooks/observe.sh index 018d5db9..4fc51458 100755 --- a/skills/continuous-learning-v2/hooks/observe.sh +++ b/skills/continuous-learning-v2/hooks/observe.sh @@ -268,13 +268,25 @@ if [ "$PARSED_OK" != "True" ]; then echo "$INPUT_JSON" | "$PYTHON_CMD" -c ' import json, sys, os, re +# Linear-time secret matcher. Bounded quantifiers and a fixed set of auth +# schemes (instead of a generic [A-Za-z]+\s+ that overlapped the value class) +# prevent the catastrophic backtracking that pegged python at 100% CPU (#2278). _SECRET_RE = re.compile( r"(?i)(api[_-]?key|token|secret|password|authorization|credentials?|auth)" - r"""(["'"'"'\s:=]+)""" - r"([A-Za-z]+\s+)?" - r"([A-Za-z0-9_\-/.+=]{8,})" + r"""(["'"'"'\s:=]{1,8})""" + r"((?:bearer|basic|token|bot)\s+)?" + r"([A-Za-z0-9_\-/.+=]{8,256})" ) +import signal +def _ecc_bail(*_): + sys.exit(0) +try: + signal.signal(signal.SIGALRM, _ecc_bail) + signal.alarm(8) # self-terminate before the async hook 10s timeout can orphan us (#2278) +except Exception: + pass + raw = sys.stdin.read()[:2000] raw = _SECRET_RE.sub(lambda m: m.group(1) + m.group(2) + (m.group(3) or "") + "[REDACTED]", raw) print(json.dumps({"timestamp": os.environ["TIMESTAMP"], "event": "parse_error", "raw": raw})) @@ -302,6 +314,15 @@ export TIMESTAMP="$timestamp" echo "$PARSED" | "$PYTHON_CMD" -c ' import json, sys, os, re +import signal + +def _ecc_bail(*_): + sys.exit(0) +try: + signal.signal(signal.SIGALRM, _ecc_bail) + signal.alarm(8) # self-terminate before the async hook 10s timeout can orphan us (#2278) +except Exception: + pass parsed = json.load(sys.stdin) observation = { @@ -315,11 +336,14 @@ observation = { # Scrub secrets: match common key=value, key: value, and key"value patterns # Includes optional auth scheme (e.g., "Bearer", "Basic") before token +# Linear-time secret matcher. Bounded quantifiers and a fixed set of auth +# schemes (instead of a generic [A-Za-z]+\s+ that overlapped the value class) +# prevent the catastrophic backtracking that pegged python at 100% CPU (#2278). _SECRET_RE = re.compile( r"(?i)(api[_-]?key|token|secret|password|authorization|credentials?|auth)" - r"""(["'"'"'\s:=]+)""" - r"([A-Za-z]+\s+)?" - r"([A-Za-z0-9_\-/.+=]{8,})" + r"""(["'"'"'\s:=]{1,8})""" + r"((?:bearer|basic|token|bot)\s+)?" + r"([A-Za-z0-9_\-/.+=]{8,256})" ) def scrub(val):