fix(background-agent): detect stale tasks that never received progress updates
Tasks with no progress.lastUpdate were silently skipped in checkAndInterruptStaleTasks, causing them to hang forever when the model hangs before its first tool call. Now falls back to checking startedAt against a configurable messageStalenessTimeoutMs (default: 10 minutes). Closes #1769
This commit is contained in:
parent
daf011c616
commit
f3ff32fd18
@ -6,6 +6,8 @@ export const BackgroundTaskConfigSchema = z.object({
|
||||
modelConcurrency: z.record(z.string(), z.number().min(0)).optional(),
|
||||
/** Stale timeout in milliseconds - interrupt tasks with no activity for this duration (default: 180000 = 3 minutes, minimum: 60000 = 1 minute) */
|
||||
staleTimeoutMs: z.number().min(60000).optional(),
|
||||
/** Timeout for tasks that never received any progress update, falling back to startedAt (default: 600000 = 10 minutes, minimum: 60000 = 1 minute) */
|
||||
messageStalenessTimeoutMs: z.number().min(60000).optional(),
|
||||
})
|
||||
|
||||
export type BackgroundTaskConfig = z.infer<typeof BackgroundTaskConfigSchema>
|
||||
|
||||
@ -4,6 +4,7 @@ import type { BackgroundTask, LaunchInput } from "./types"
|
||||
export const TASK_TTL_MS = 30 * 60 * 1000
|
||||
export const MIN_STABILITY_TIME_MS = 10 * 1000
|
||||
export const DEFAULT_STALE_TIMEOUT_MS = 180_000
|
||||
export const DEFAULT_MESSAGE_STALENESS_TIMEOUT_MS = 600_000
|
||||
export const MIN_RUNTIME_BEFORE_STALE_MS = 30_000
|
||||
export const MIN_IDLE_TIME_MS = 5000
|
||||
export const POLLING_INTERVAL_MS = 3000
|
||||
|
||||
192
src/features/background-agent/task-poller.test.ts
Normal file
192
src/features/background-agent/task-poller.test.ts
Normal file
@ -0,0 +1,192 @@
|
||||
import { describe, it, expect, mock } from "bun:test"
|
||||
|
||||
import { checkAndInterruptStaleTasks, pruneStaleTasksAndNotifications } from "./task-poller"
|
||||
import type { BackgroundTask } from "./types"
|
||||
|
||||
describe("checkAndInterruptStaleTasks", () => {
|
||||
const mockClient = {
|
||||
session: {
|
||||
abort: mock(() => Promise.resolve()),
|
||||
},
|
||||
}
|
||||
const mockConcurrencyManager = {
|
||||
release: mock(() => {}),
|
||||
}
|
||||
const mockNotify = mock(() => Promise.resolve())
|
||||
|
||||
function createRunningTask(overrides: Partial<BackgroundTask> = {}): BackgroundTask {
|
||||
return {
|
||||
id: "task-1",
|
||||
sessionID: "ses-1",
|
||||
parentSessionID: "parent-ses-1",
|
||||
parentMessageID: "msg-1",
|
||||
description: "test",
|
||||
prompt: "test",
|
||||
agent: "explore",
|
||||
status: "running",
|
||||
startedAt: new Date(Date.now() - 120_000),
|
||||
...overrides,
|
||||
}
|
||||
}
|
||||
|
||||
it("should interrupt tasks with lastUpdate exceeding stale timeout", async () => {
|
||||
//#given
|
||||
const task = createRunningTask({
|
||||
progress: {
|
||||
toolCalls: 1,
|
||||
lastUpdate: new Date(Date.now() - 200_000),
|
||||
},
|
||||
})
|
||||
|
||||
//#when
|
||||
await checkAndInterruptStaleTasks({
|
||||
tasks: [task],
|
||||
client: mockClient as never,
|
||||
config: { staleTimeoutMs: 180_000 },
|
||||
concurrencyManager: mockConcurrencyManager as never,
|
||||
notifyParentSession: mockNotify,
|
||||
})
|
||||
|
||||
//#then
|
||||
expect(task.status).toBe("cancelled")
|
||||
expect(task.error).toContain("Stale timeout")
|
||||
})
|
||||
|
||||
it("should NOT interrupt tasks with recent lastUpdate", async () => {
|
||||
//#given
|
||||
const task = createRunningTask({
|
||||
progress: {
|
||||
toolCalls: 1,
|
||||
lastUpdate: new Date(Date.now() - 10_000),
|
||||
},
|
||||
})
|
||||
|
||||
//#when
|
||||
await checkAndInterruptStaleTasks({
|
||||
tasks: [task],
|
||||
client: mockClient as never,
|
||||
config: { staleTimeoutMs: 180_000 },
|
||||
concurrencyManager: mockConcurrencyManager as never,
|
||||
notifyParentSession: mockNotify,
|
||||
})
|
||||
|
||||
//#then
|
||||
expect(task.status).toBe("running")
|
||||
})
|
||||
|
||||
it("should interrupt tasks with NO progress.lastUpdate that exceeded messageStalenessTimeoutMs since startedAt", async () => {
|
||||
//#given — task started 15 minutes ago, never received any progress update
|
||||
const task = createRunningTask({
|
||||
startedAt: new Date(Date.now() - 15 * 60 * 1000),
|
||||
progress: undefined,
|
||||
})
|
||||
|
||||
//#when
|
||||
await checkAndInterruptStaleTasks({
|
||||
tasks: [task],
|
||||
client: mockClient as never,
|
||||
config: { messageStalenessTimeoutMs: 600_000 },
|
||||
concurrencyManager: mockConcurrencyManager as never,
|
||||
notifyParentSession: mockNotify,
|
||||
})
|
||||
|
||||
//#then
|
||||
expect(task.status).toBe("cancelled")
|
||||
expect(task.error).toContain("no activity")
|
||||
})
|
||||
|
||||
it("should NOT interrupt tasks with NO progress.lastUpdate that are within messageStalenessTimeoutMs", async () => {
|
||||
//#given — task started 5 minutes ago, default timeout is 10 minutes
|
||||
const task = createRunningTask({
|
||||
startedAt: new Date(Date.now() - 5 * 60 * 1000),
|
||||
progress: undefined,
|
||||
})
|
||||
|
||||
//#when
|
||||
await checkAndInterruptStaleTasks({
|
||||
tasks: [task],
|
||||
client: mockClient as never,
|
||||
config: { messageStalenessTimeoutMs: 600_000 },
|
||||
concurrencyManager: mockConcurrencyManager as never,
|
||||
notifyParentSession: mockNotify,
|
||||
})
|
||||
|
||||
//#then
|
||||
expect(task.status).toBe("running")
|
||||
})
|
||||
|
||||
it("should use DEFAULT_MESSAGE_STALENESS_TIMEOUT_MS when messageStalenessTimeoutMs is not configured", async () => {
|
||||
//#given — task started 15 minutes ago, no config for messageStalenessTimeoutMs
|
||||
const task = createRunningTask({
|
||||
startedAt: new Date(Date.now() - 15 * 60 * 1000),
|
||||
progress: undefined,
|
||||
})
|
||||
|
||||
//#when — default is 10 minutes (600_000ms)
|
||||
await checkAndInterruptStaleTasks({
|
||||
tasks: [task],
|
||||
client: mockClient as never,
|
||||
config: undefined,
|
||||
concurrencyManager: mockConcurrencyManager as never,
|
||||
notifyParentSession: mockNotify,
|
||||
})
|
||||
|
||||
//#then
|
||||
expect(task.status).toBe("cancelled")
|
||||
expect(task.error).toContain("no activity")
|
||||
})
|
||||
|
||||
it("should release concurrency key when interrupting a never-updated task", async () => {
|
||||
//#given
|
||||
const releaseMock = mock(() => {})
|
||||
const task = createRunningTask({
|
||||
startedAt: new Date(Date.now() - 15 * 60 * 1000),
|
||||
progress: undefined,
|
||||
concurrencyKey: "anthropic/claude-opus-4-6",
|
||||
})
|
||||
|
||||
//#when
|
||||
await checkAndInterruptStaleTasks({
|
||||
tasks: [task],
|
||||
client: mockClient as never,
|
||||
config: { messageStalenessTimeoutMs: 600_000 },
|
||||
concurrencyManager: { release: releaseMock } as never,
|
||||
notifyParentSession: mockNotify,
|
||||
})
|
||||
|
||||
//#then
|
||||
expect(releaseMock).toHaveBeenCalledWith("anthropic/claude-opus-4-6")
|
||||
expect(task.concurrencyKey).toBeUndefined()
|
||||
})
|
||||
})
|
||||
|
||||
describe("pruneStaleTasksAndNotifications", () => {
|
||||
it("should prune tasks that exceeded TTL", () => {
|
||||
//#given
|
||||
const tasks = new Map<string, BackgroundTask>()
|
||||
const oldTask: BackgroundTask = {
|
||||
id: "old-task",
|
||||
parentSessionID: "parent",
|
||||
parentMessageID: "msg",
|
||||
description: "old",
|
||||
prompt: "old",
|
||||
agent: "explore",
|
||||
status: "running",
|
||||
startedAt: new Date(Date.now() - 31 * 60 * 1000),
|
||||
}
|
||||
tasks.set("old-task", oldTask)
|
||||
|
||||
const pruned: string[] = []
|
||||
const notifications = new Map<string, BackgroundTask[]>()
|
||||
|
||||
//#when
|
||||
pruneStaleTasksAndNotifications({
|
||||
tasks,
|
||||
notifications,
|
||||
onTaskPruned: (taskId) => pruned.push(taskId),
|
||||
})
|
||||
|
||||
//#then
|
||||
expect(pruned).toContain("old-task")
|
||||
})
|
||||
})
|
||||
@ -6,6 +6,7 @@ import type { ConcurrencyManager } from "./concurrency"
|
||||
import type { OpencodeClient } from "./opencode-client"
|
||||
|
||||
import {
|
||||
DEFAULT_MESSAGE_STALENESS_TIMEOUT_MS,
|
||||
DEFAULT_STALE_TIMEOUT_MS,
|
||||
MIN_RUNTIME_BEFORE_STALE_MS,
|
||||
TASK_TTL_MS,
|
||||
@ -67,15 +68,41 @@ export async function checkAndInterruptStaleTasks(args: {
|
||||
const staleTimeoutMs = config?.staleTimeoutMs ?? DEFAULT_STALE_TIMEOUT_MS
|
||||
const now = Date.now()
|
||||
|
||||
const messageStalenessMs = config?.messageStalenessTimeoutMs ?? DEFAULT_MESSAGE_STALENESS_TIMEOUT_MS
|
||||
|
||||
for (const task of tasks) {
|
||||
if (task.status !== "running") continue
|
||||
if (!task.progress?.lastUpdate) continue
|
||||
|
||||
const startedAt = task.startedAt
|
||||
const sessionID = task.sessionID
|
||||
if (!startedAt || !sessionID) continue
|
||||
|
||||
const runtime = now - startedAt.getTime()
|
||||
|
||||
if (!task.progress?.lastUpdate) {
|
||||
if (runtime <= messageStalenessMs) continue
|
||||
|
||||
const staleMinutes = Math.round(runtime / 60000)
|
||||
task.status = "cancelled"
|
||||
task.error = `Stale timeout (no activity for ${staleMinutes}min since start)`
|
||||
task.completedAt = new Date()
|
||||
|
||||
if (task.concurrencyKey) {
|
||||
concurrencyManager.release(task.concurrencyKey)
|
||||
task.concurrencyKey = undefined
|
||||
}
|
||||
|
||||
client.session.abort({ path: { id: sessionID } }).catch(() => {})
|
||||
log(`[background-agent] Task ${task.id} interrupted: no progress since start`)
|
||||
|
||||
try {
|
||||
await notifyParentSession(task)
|
||||
} catch (err) {
|
||||
log("[background-agent] Error in notifyParentSession for stale task:", { taskId: task.id, error: err })
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if (runtime < MIN_RUNTIME_BEFORE_STALE_MS) continue
|
||||
|
||||
const timeSinceLastUpdate = now - task.progress.lastUpdate.getTime()
|
||||
@ -92,10 +119,7 @@ export async function checkAndInterruptStaleTasks(args: {
|
||||
task.concurrencyKey = undefined
|
||||
}
|
||||
|
||||
client.session.abort({
|
||||
path: { id: sessionID },
|
||||
}).catch(() => {})
|
||||
|
||||
client.session.abort({ path: { id: sessionID } }).catch(() => {})
|
||||
log(`[background-agent] Task ${task.id} interrupted: stale timeout`)
|
||||
|
||||
try {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user