fix(run): add event watchdog and secondary timeout for hasReceivedMeaningfulWork
Implements fixes from issue #1880 and #1934 to prevent exit code 130 timeout in CI environments: - Add lastEventTimestamp to EventState for tracking when events were last received - Add event watchdog: if no events for 30s, verify session status via direct API call - Add secondary timeout: after 60s without meaningful work events, check for active children/todos and assume work is in progress This prevents the poll loop from waiting for full 600s timeout when: 1. Event stream drops silently (common in CI with network instability) 2. Main session delegates to children without producing meaningful work on main session
This commit is contained in:
parent
55b9ad60d8
commit
159ade05cc
@ -7,6 +7,8 @@ export interface EventState {
|
|||||||
currentTool: string | null
|
currentTool: string | null
|
||||||
/** Set to true when the main session has produced meaningful work (text, tool call, or tool result) */
|
/** Set to true when the main session has produced meaningful work (text, tool call, or tool result) */
|
||||||
hasReceivedMeaningfulWork: boolean
|
hasReceivedMeaningfulWork: boolean
|
||||||
|
/** Timestamp of the last received event (for watchdog detection) */
|
||||||
|
lastEventTimestamp: number
|
||||||
/** Count of assistant messages for the main session */
|
/** Count of assistant messages for the main session */
|
||||||
messageCount: number
|
messageCount: number
|
||||||
/** Current agent name from the latest assistant message */
|
/** Current agent name from the latest assistant message */
|
||||||
@ -54,6 +56,7 @@ export function createEventState(): EventState {
|
|||||||
lastPartText: "",
|
lastPartText: "",
|
||||||
currentTool: null,
|
currentTool: null,
|
||||||
hasReceivedMeaningfulWork: false,
|
hasReceivedMeaningfulWork: false,
|
||||||
|
lastEventTimestamp: Date.now(),
|
||||||
messageCount: 0,
|
messageCount: 0,
|
||||||
currentAgent: null,
|
currentAgent: null,
|
||||||
currentModel: null,
|
currentModel: null,
|
||||||
|
|||||||
@ -35,6 +35,9 @@ export async function processEvents(
|
|||||||
logEventVerbose(ctx, payload)
|
logEventVerbose(ctx, payload)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Update last event timestamp for watchdog detection
|
||||||
|
state.lastEventTimestamp = Date.now()
|
||||||
|
|
||||||
handleSessionError(ctx, payload, state)
|
handleSessionError(ctx, payload, state)
|
||||||
handleSessionIdle(ctx, payload, state)
|
handleSessionIdle(ctx, payload, state)
|
||||||
handleSessionStatus(ctx, payload, state)
|
handleSessionStatus(ctx, payload, state)
|
||||||
|
|||||||
@ -8,11 +8,15 @@ const DEFAULT_POLL_INTERVAL_MS = 500
|
|||||||
const DEFAULT_REQUIRED_CONSECUTIVE = 1
|
const DEFAULT_REQUIRED_CONSECUTIVE = 1
|
||||||
const ERROR_GRACE_CYCLES = 3
|
const ERROR_GRACE_CYCLES = 3
|
||||||
const MIN_STABILIZATION_MS = 1_000
|
const MIN_STABILIZATION_MS = 1_000
|
||||||
|
const DEFAULT_EVENT_WATCHDOG_MS = 30_000 // 30 seconds
|
||||||
|
const DEFAULT_SECONDARY_MEANINGFUL_WORK_TIMEOUT_MS = 60_000 // 60 seconds
|
||||||
|
|
||||||
export interface PollOptions {
|
export interface PollOptions {
|
||||||
pollIntervalMs?: number
|
pollIntervalMs?: number
|
||||||
requiredConsecutive?: number
|
requiredConsecutive?: number
|
||||||
minStabilizationMs?: number
|
minStabilizationMs?: number
|
||||||
|
eventWatchdogMs?: number
|
||||||
|
secondaryMeaningfulWorkTimeoutMs?: number
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function pollForCompletion(
|
export async function pollForCompletion(
|
||||||
@ -28,6 +32,11 @@ export async function pollForCompletion(
|
|||||||
options.minStabilizationMs ?? MIN_STABILIZATION_MS
|
options.minStabilizationMs ?? MIN_STABILIZATION_MS
|
||||||
const minStabilizationMs =
|
const minStabilizationMs =
|
||||||
rawMinStabilizationMs > 0 ? rawMinStabilizationMs : MIN_STABILIZATION_MS
|
rawMinStabilizationMs > 0 ? rawMinStabilizationMs : MIN_STABILIZATION_MS
|
||||||
|
const eventWatchdogMs =
|
||||||
|
options.eventWatchdogMs ?? DEFAULT_EVENT_WATCHDOG_MS
|
||||||
|
const secondaryMeaningfulWorkTimeoutMs =
|
||||||
|
options.secondaryMeaningfulWorkTimeoutMs ??
|
||||||
|
DEFAULT_SECONDARY_MEANINGFUL_WORK_TIMEOUT_MS
|
||||||
let consecutiveCompleteChecks = 0
|
let consecutiveCompleteChecks = 0
|
||||||
let errorCycleCount = 0
|
let errorCycleCount = 0
|
||||||
let firstWorkTimestamp: number | null = null
|
let firstWorkTimestamp: number | null = null
|
||||||
@ -59,6 +68,32 @@ export async function pollForCompletion(
|
|||||||
errorCycleCount = 0
|
errorCycleCount = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Watchdog: if no events received for N seconds, verify session status via API
|
||||||
|
if (eventState.lastEventTimestamp !== null) {
|
||||||
|
const timeSinceLastEvent = Date.now() - eventState.lastEventTimestamp
|
||||||
|
if (timeSinceLastEvent > eventWatchdogMs) {
|
||||||
|
// Events stopped coming - verify actual session state
|
||||||
|
console.log(
|
||||||
|
pc.yellow(
|
||||||
|
`\n No events for ${Math.round(
|
||||||
|
timeSinceLastEvent / 1000
|
||||||
|
)}s, verifying session status...`
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
// Force check session status directly
|
||||||
|
const directStatus = await getMainSessionStatus(ctx)
|
||||||
|
if (directStatus === "idle") {
|
||||||
|
eventState.mainSessionIdle = true
|
||||||
|
} else if (directStatus === "busy" || directStatus === "retry") {
|
||||||
|
eventState.mainSessionIdle = false
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reset timestamp to avoid repeated checks
|
||||||
|
eventState.lastEventTimestamp = Date.now()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const mainSessionStatus = await getMainSessionStatus(ctx)
|
const mainSessionStatus = await getMainSessionStatus(ctx)
|
||||||
if (mainSessionStatus === "busy" || mainSessionStatus === "retry") {
|
if (mainSessionStatus === "busy" || mainSessionStatus === "retry") {
|
||||||
eventState.mainSessionIdle = false
|
eventState.mainSessionIdle = false
|
||||||
@ -81,6 +116,47 @@ export async function pollForCompletion(
|
|||||||
consecutiveCompleteChecks = 0
|
consecutiveCompleteChecks = 0
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Secondary timeout: if we've been polling for reasonable time but haven't
|
||||||
|
// received meaningful work via events, check if there's active work via API
|
||||||
|
if (Date.now() - pollStartTimestamp > secondaryMeaningfulWorkTimeoutMs) {
|
||||||
|
// Check if session actually has pending work (children, todos, etc.)
|
||||||
|
const childrenRes = await ctx.client.session.children({
|
||||||
|
path: { id: ctx.sessionID },
|
||||||
|
query: { directory: ctx.directory },
|
||||||
|
})
|
||||||
|
const children = normalizeSDKResponse(childrenRes, [] as unknown[])
|
||||||
|
const todosRes = await ctx.client.session.todo({
|
||||||
|
path: { id: ctx.sessionID },
|
||||||
|
query: { directory: ctx.directory },
|
||||||
|
})
|
||||||
|
const todos = normalizeSDKResponse(todosRes, [] as unknown[])
|
||||||
|
|
||||||
|
const hasActiveWork =
|
||||||
|
Array.isArray(children)
|
||||||
|
? children.length > 0
|
||||||
|
: false || Array.isArray(todos)
|
||||||
|
? todos.some(
|
||||||
|
(
|
||||||
|
t: unknown
|
||||||
|
) =>
|
||||||
|
(t as { status?: string })?.status !== "completed" &&
|
||||||
|
(t as { status?: string })?.status !== "cancelled"
|
||||||
|
)
|
||||||
|
: false
|
||||||
|
|
||||||
|
if (hasActiveWork) {
|
||||||
|
// Assume meaningful work is happening even without events
|
||||||
|
eventState.hasReceivedMeaningfulWork = true
|
||||||
|
console.log(
|
||||||
|
pc.yellow(
|
||||||
|
`\n No meaningful work events for ${Math.round(
|
||||||
|
secondaryMeaningfulWorkTimeoutMs / 1000
|
||||||
|
)}s but session has active work - assuming in progress`
|
||||||
|
)
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// Track when first meaningful work was received
|
// Track when first meaningful work was received
|
||||||
if (firstWorkTimestamp === null) {
|
if (firstWorkTimestamp === null) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user