fix: prevent zombie processes with proper process lifecycle management (#1306)
* fix: prevent zombie processes with proper process lifecycle management - Await proc.exited for fire-and-forget spawns in tmux-utils.ts - Remove competing process.exit() calls from LSP client and skill-mcp-manager signal handlers to let background-agent manager coordinate final exit - Await process exit after kill() in interactive-bash timeout handler - Await process exit after kill() in LSP client stop() method These changes ensure spawned processes are properly reaped and prevent orphan/zombie processes when running with tmux integration. * fix: address Copilot review comments on process cleanup - LSP cleanup: use async/sync split with Promise.allSettled for proper subprocess cleanup - LSP stop(): make idempotent by nulling proc before await to prevent race conditions - Interactive-bash timeout: use .then()/.catch() pattern instead of async callback to avoid unhandled rejections - Skill-mcp-manager: use void+catch pattern for fire-and-forget signal handlers * fix: address remaining Copilot review comments - interactive-bash: reject timeout immediately, fire-and-forget zombie cleanup - skill-mcp-manager: update comments to accurately describe signal handling strategy * fix: address additional Copilot review comments - LSP stop(): add 5s timeout to prevent indefinite hang on stuck processes - tmux-utils: log warnings when pane title setting fails (both spawn/replace) - BackgroundManager: delay process.exit() to next tick via setImmediate to allow other signal handlers to complete cleanup * fix: address code review findings - Increase exit delay from setImmediate to 100ms setTimeout to allow async cleanup - Use asyncCleanup for SIGBREAK on Windows for consistency with SIGINT/SIGTERM - Add try/catch around stderr read in spawnTmuxPane for consistency with replaceTmuxPane * fix: address latest Copilot review comments - LSP stop(): properly clear timeout when proc.exited wins the race - BackgroundManager: use process.exitCode before delayed exit for cleaner shutdown - spawnTmuxPane: remove redundant log import, reuse existing one * fix: address latest Copilot review comments - LSP stop(): escalate to SIGKILL on timeout, add logging - tmux spawnTmuxPane/replaceTmuxPane: drain stderr immediately to avoid backpressure * fix: address latest Copilot review comments - Add .catch() to asyncCleanup() signal handlers to prevent unhandled rejections - Await proc.exited after SIGKILL with 1s timeout to confirm termination * fix: increase exit delay to 6s to accommodate LSP cleanup LSP cleanup can take up to 5s (timeout) + 1s (SIGKILL wait), so the exit delay must be at least 6s to ensure child processes are properly reaped.
This commit is contained in:
parent
4a82ff40fb
commit
b03e463bde
@ -1396,7 +1396,10 @@ function registerProcessSignal(
|
|||||||
const listener = () => {
|
const listener = () => {
|
||||||
handler()
|
handler()
|
||||||
if (exitAfter) {
|
if (exitAfter) {
|
||||||
process.exit(0)
|
// Set exitCode and schedule exit after delay to allow other handlers to complete async cleanup
|
||||||
|
// Use 6s delay to accommodate LSP cleanup (5s timeout + 1s SIGKILL wait)
|
||||||
|
process.exitCode = 0
|
||||||
|
setTimeout(() => process.exit(), 6000)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
process.on(signal, listener)
|
process.on(signal, listener)
|
||||||
|
|||||||
@ -114,23 +114,15 @@ export class SkillMcpManager {
|
|||||||
this.pendingConnections.clear()
|
this.pendingConnections.clear()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Note: 'exit' event is synchronous-only in Node.js, so we use 'beforeExit' for async cleanup
|
// Note: Node's 'exit' event is synchronous-only, so we rely on signal handlers for async cleanup.
|
||||||
// However, 'beforeExit' is not emitted on explicit process.exit() calls
|
// Signal handlers invoke the async cleanup function and ignore errors so they don't block or throw.
|
||||||
// Signal handlers are made async to properly await cleanup
|
// Don't call process.exit() here - let the background-agent manager handle the final process exit.
|
||||||
|
// Use void + catch to trigger async cleanup without awaiting it in the signal handler.
|
||||||
|
|
||||||
process.on("SIGINT", async () => {
|
process.on("SIGINT", () => void cleanup().catch(() => {}))
|
||||||
await cleanup()
|
process.on("SIGTERM", () => void cleanup().catch(() => {}))
|
||||||
process.exit(0)
|
|
||||||
})
|
|
||||||
process.on("SIGTERM", async () => {
|
|
||||||
await cleanup()
|
|
||||||
process.exit(0)
|
|
||||||
})
|
|
||||||
if (process.platform === "win32") {
|
if (process.platform === "win32") {
|
||||||
process.on("SIGBREAK", async () => {
|
process.on("SIGBREAK", () => void cleanup().catch(() => {}))
|
||||||
await cleanup()
|
|
||||||
process.exit(0)
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -139,10 +139,22 @@ export async function spawnTmuxPane(
|
|||||||
}
|
}
|
||||||
|
|
||||||
const title = `omo-subagent-${description.slice(0, 20)}`
|
const title = `omo-subagent-${description.slice(0, 20)}`
|
||||||
spawn([tmux, "select-pane", "-t", paneId, "-T", title], {
|
const titleProc = spawn([tmux, "select-pane", "-t", paneId, "-T", title], {
|
||||||
stdout: "ignore",
|
stdout: "ignore",
|
||||||
stderr: "ignore",
|
stderr: "pipe",
|
||||||
})
|
})
|
||||||
|
// Drain stderr immediately to avoid backpressure
|
||||||
|
const stderrPromise = new Response(titleProc.stderr).text().catch(() => "")
|
||||||
|
const titleExitCode = await titleProc.exited
|
||||||
|
if (titleExitCode !== 0) {
|
||||||
|
const titleStderr = await stderrPromise
|
||||||
|
log("[spawnTmuxPane] WARNING: failed to set pane title", {
|
||||||
|
paneId,
|
||||||
|
title,
|
||||||
|
exitCode: titleExitCode,
|
||||||
|
stderr: titleStderr.trim(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
return { success: true, paneId }
|
return { success: true, paneId }
|
||||||
}
|
}
|
||||||
@ -217,10 +229,21 @@ export async function replaceTmuxPane(
|
|||||||
}
|
}
|
||||||
|
|
||||||
const title = `omo-subagent-${description.slice(0, 20)}`
|
const title = `omo-subagent-${description.slice(0, 20)}`
|
||||||
spawn([tmux, "select-pane", "-t", paneId, "-T", title], {
|
const titleProc = spawn([tmux, "select-pane", "-t", paneId, "-T", title], {
|
||||||
stdout: "ignore",
|
stdout: "ignore",
|
||||||
stderr: "ignore",
|
stderr: "pipe",
|
||||||
})
|
})
|
||||||
|
// Drain stderr immediately to avoid backpressure
|
||||||
|
const stderrPromise = new Response(titleProc.stderr).text().catch(() => "")
|
||||||
|
const titleExitCode = await titleProc.exited
|
||||||
|
if (titleExitCode !== 0) {
|
||||||
|
const titleStderr = await stderrPromise
|
||||||
|
log("[replaceTmuxPane] WARNING: failed to set pane title", {
|
||||||
|
paneId,
|
||||||
|
exitCode: titleExitCode,
|
||||||
|
stderr: titleStderr.trim(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
log("[replaceTmuxPane] SUCCESS", { paneId, sessionId })
|
log("[replaceTmuxPane] SUCCESS", { paneId, sessionId })
|
||||||
return { success: true, paneId }
|
return { success: true, paneId }
|
||||||
|
|||||||
@ -96,10 +96,19 @@ The Bash tool can execute these commands directly. Do NOT retry with interactive
|
|||||||
|
|
||||||
const timeoutPromise = new Promise<never>((_, reject) => {
|
const timeoutPromise = new Promise<never>((_, reject) => {
|
||||||
const id = setTimeout(() => {
|
const id = setTimeout(() => {
|
||||||
proc.kill()
|
const timeoutError = new Error(`Timeout after ${DEFAULT_TIMEOUT_MS}ms`)
|
||||||
reject(new Error(`Timeout after ${DEFAULT_TIMEOUT_MS}ms`))
|
try {
|
||||||
|
proc.kill()
|
||||||
|
// Fire-and-forget: wait for process exit in background to avoid zombies
|
||||||
|
void proc.exited.catch(() => {})
|
||||||
|
} catch {
|
||||||
|
// Ignore kill errors; we'll still reject with timeoutError below
|
||||||
|
}
|
||||||
|
reject(timeoutError)
|
||||||
}, DEFAULT_TIMEOUT_MS)
|
}, DEFAULT_TIMEOUT_MS)
|
||||||
proc.exited.then(() => clearTimeout(id))
|
proc.exited
|
||||||
|
.then(() => clearTimeout(id))
|
||||||
|
.catch(() => clearTimeout(id))
|
||||||
})
|
})
|
||||||
|
|
||||||
// Read stdout and stderr in parallel to avoid race conditions
|
// Read stdout and stderr in parallel to avoid race conditions
|
||||||
|
|||||||
@ -33,10 +33,12 @@ class LSPServerManager {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private registerProcessCleanup(): void {
|
private registerProcessCleanup(): void {
|
||||||
const cleanup = () => {
|
// Synchronous cleanup for 'exit' event (cannot await)
|
||||||
|
const syncCleanup = () => {
|
||||||
for (const [, managed] of this.clients) {
|
for (const [, managed] of this.clients) {
|
||||||
try {
|
try {
|
||||||
managed.client.stop()
|
// Fire-and-forget during sync exit - process is terminating
|
||||||
|
void managed.client.stop().catch(() => {})
|
||||||
} catch {}
|
} catch {}
|
||||||
}
|
}
|
||||||
this.clients.clear()
|
this.clients.clear()
|
||||||
@ -46,23 +48,30 @@ class LSPServerManager {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
process.on("exit", cleanup)
|
// Async cleanup for signal handlers - properly await all stops
|
||||||
|
const asyncCleanup = async () => {
|
||||||
|
const stopPromises: Promise<void>[] = []
|
||||||
|
for (const [, managed] of this.clients) {
|
||||||
|
stopPromises.push(managed.client.stop().catch(() => {}))
|
||||||
|
}
|
||||||
|
await Promise.allSettled(stopPromises)
|
||||||
|
this.clients.clear()
|
||||||
|
if (this.cleanupInterval) {
|
||||||
|
clearInterval(this.cleanupInterval)
|
||||||
|
this.cleanupInterval = null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
process.on("SIGINT", () => {
|
process.on("exit", syncCleanup)
|
||||||
cleanup()
|
|
||||||
process.exit(0)
|
|
||||||
})
|
|
||||||
|
|
||||||
process.on("SIGTERM", () => {
|
// Don't call process.exit() here - let other handlers complete their cleanup first
|
||||||
cleanup()
|
// The background-agent manager handles the final exit call
|
||||||
process.exit(0)
|
// Use async handlers to properly await LSP subprocess cleanup
|
||||||
})
|
process.on("SIGINT", () => void asyncCleanup().catch(() => {}))
|
||||||
|
process.on("SIGTERM", () => void asyncCleanup().catch(() => {}))
|
||||||
|
|
||||||
if (process.platform === "win32") {
|
if (process.platform === "win32") {
|
||||||
process.on("SIGBREAK", () => {
|
process.on("SIGBREAK", () => void asyncCleanup().catch(() => {}))
|
||||||
cleanup()
|
|
||||||
process.exit(0)
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -532,8 +541,34 @@ export class LSPClient {
|
|||||||
this.connection.dispose()
|
this.connection.dispose()
|
||||||
this.connection = null
|
this.connection = null
|
||||||
}
|
}
|
||||||
this.proc?.kill()
|
const proc = this.proc
|
||||||
this.proc = null
|
if (proc) {
|
||||||
|
this.proc = null
|
||||||
|
let exitedBeforeTimeout = false
|
||||||
|
try {
|
||||||
|
proc.kill()
|
||||||
|
// Wait for exit with timeout to prevent indefinite hang
|
||||||
|
let timeoutId: ReturnType<typeof setTimeout> | undefined
|
||||||
|
const timeoutPromise = new Promise<void>((resolve) => {
|
||||||
|
timeoutId = setTimeout(resolve, 5000)
|
||||||
|
})
|
||||||
|
await Promise.race([
|
||||||
|
proc.exited.then(() => { exitedBeforeTimeout = true }).finally(() => timeoutId && clearTimeout(timeoutId)),
|
||||||
|
timeoutPromise,
|
||||||
|
])
|
||||||
|
if (!exitedBeforeTimeout) {
|
||||||
|
log("[LSPClient] Process did not exit within timeout, escalating to SIGKILL")
|
||||||
|
try {
|
||||||
|
proc.kill("SIGKILL")
|
||||||
|
// Wait briefly for SIGKILL to take effect
|
||||||
|
await Promise.race([
|
||||||
|
proc.exited,
|
||||||
|
new Promise<void>((resolve) => setTimeout(resolve, 1000)),
|
||||||
|
])
|
||||||
|
} catch {}
|
||||||
|
}
|
||||||
|
} catch {}
|
||||||
|
}
|
||||||
this.processExited = true
|
this.processExited = true
|
||||||
this.diagnosticsStore.clear()
|
this.diagnosticsStore.clear()
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user