Codex プロセスのハングによる worker pool スロット占有を防止
Codex CLI プロセスが API 応答待ちで無応答になった場合、for await ループが永久にブロックし、worker pool のスロットを占有し続ける問題に対処。AbortSignal の伝播経路を整備し、2層のタイムアウトを導入した。
- Codex ストリームのアイドルタイムアウト(10分無応答で中断)
- タスクレベルのタイムアウト(並列実行時、1時間で中断)
- AbortSignal を worker pool → PieceEngine → AgentRunner → Codex SDK まで伝播
This commit is contained in:
parent
88f7b38796
commit
55559cc41c
@ -100,6 +100,7 @@ export class AgentRunner {
|
||||
): ProviderCallOptions {
|
||||
return {
|
||||
cwd: options.cwd,
|
||||
abortSignal: options.abortSignal,
|
||||
sessionId: options.sessionId,
|
||||
allowedTools: options.allowedTools ?? agentConfig?.allowedTools,
|
||||
mcpServers: options.mcpServers,
|
||||
|
||||
@ -10,6 +10,7 @@ export type { StreamCallback };
|
||||
/** Common options for running agents */
|
||||
export interface RunAgentOptions {
|
||||
cwd: string;
|
||||
abortSignal?: AbortSignal;
|
||||
sessionId?: string;
|
||||
model?: string;
|
||||
provider?: 'claude' | 'codex' | 'mock';
|
||||
|
||||
@ -33,6 +33,7 @@ export class OptionsBuilder {
|
||||
|
||||
return {
|
||||
cwd: this.getCwd(),
|
||||
abortSignal: this.engineOptions.abortSignal,
|
||||
personaPath: step.personaPath,
|
||||
provider: step.provider ?? this.engineOptions.personaProviders?.[step.personaDisplayName] ?? this.engineOptions.provider,
|
||||
model: step.model ?? this.engineOptions.model,
|
||||
|
||||
@ -153,6 +153,7 @@ export type IterationLimitCallback = (request: IterationLimitRequest) => Promise
|
||||
|
||||
/** Options for piece engine */
|
||||
export interface PieceEngineOptions {
|
||||
abortSignal?: AbortSignal;
|
||||
/** Callback for streaming real-time output */
|
||||
onStream?: StreamCallback;
|
||||
/** Callback for requesting user input when an agent is blocked */
|
||||
|
||||
@ -331,6 +331,7 @@ export async function executePiece(
|
||||
: undefined;
|
||||
|
||||
const engine = new PieceEngine(pieceConfig, cwd, task, {
|
||||
abortSignal: options.abortSignal,
|
||||
onStream: streamHandler,
|
||||
onUserInput,
|
||||
initialSessions: savedSessions,
|
||||
|
||||
@ -23,6 +23,7 @@ import { resolveTaskExecution } from './resolveTask.js';
|
||||
export type { TaskExecutionOptions, ExecuteTaskOptions };
|
||||
|
||||
const log = createLogger('task');
|
||||
const TASK_TIMEOUT_MS = 60 * 60 * 1000;
|
||||
|
||||
/**
|
||||
* Resolve a GitHub issue from task data's issue number.
|
||||
@ -107,12 +108,29 @@ export async function executeAndCompleteTask(
|
||||
): Promise<boolean> {
|
||||
const startedAt = new Date().toISOString();
|
||||
const executionLog: string[] = [];
|
||||
const taskAbortController = new AbortController();
|
||||
const externalAbortSignal = parallelOptions?.abortSignal;
|
||||
const taskTimeoutMs = externalAbortSignal ? TASK_TIMEOUT_MS : undefined;
|
||||
const taskAbortSignal = externalAbortSignal ? taskAbortController.signal : undefined;
|
||||
let timeoutId: ReturnType<typeof setTimeout> | undefined;
|
||||
|
||||
const onExternalAbort = (): void => {
|
||||
taskAbortController.abort();
|
||||
};
|
||||
|
||||
if (externalAbortSignal) {
|
||||
if (externalAbortSignal.aborted) {
|
||||
taskAbortController.abort();
|
||||
} else {
|
||||
externalAbortSignal.addEventListener('abort', onExternalAbort, { once: true });
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
const { execCwd, execPiece, isWorktree, branch, baseBranch, startMovement, retryNote, autoPr, issueNumber } = await resolveTaskExecution(task, cwd, pieceName);
|
||||
|
||||
// cwd is always the project root; pass it as projectCwd so reports/sessions go there
|
||||
const taskSuccess = await executeTask({
|
||||
const taskRunPromise = executeTask({
|
||||
task: task.content,
|
||||
cwd: execCwd,
|
||||
pieceIdentifier: execPiece,
|
||||
@ -120,10 +138,26 @@ export async function executeAndCompleteTask(
|
||||
agentOverrides: options,
|
||||
startMovement,
|
||||
retryNote,
|
||||
abortSignal: parallelOptions?.abortSignal,
|
||||
abortSignal: taskAbortSignal,
|
||||
taskPrefix: parallelOptions?.taskPrefix,
|
||||
taskColorIndex: parallelOptions?.taskColorIndex,
|
||||
});
|
||||
|
||||
const timeoutPromise = taskTimeoutMs && taskTimeoutMs > 0
|
||||
? new Promise<boolean>((_, reject) => {
|
||||
timeoutId = setTimeout(() => {
|
||||
taskAbortController.abort();
|
||||
reject(new Error(`Task timed out after ${Math.floor(taskTimeoutMs / 60000)} minutes`));
|
||||
}, taskTimeoutMs);
|
||||
})
|
||||
: undefined;
|
||||
|
||||
const taskSuccess = timeoutPromise
|
||||
? await Promise.race<boolean>([
|
||||
taskRunPromise,
|
||||
timeoutPromise,
|
||||
])
|
||||
: await taskRunPromise;
|
||||
const completedAt = new Date().toISOString();
|
||||
|
||||
if (taskSuccess && isWorktree) {
|
||||
@ -192,6 +226,13 @@ export async function executeAndCompleteTask(
|
||||
|
||||
error(`Task "${task.name}" error: ${getErrorMessage(err)}`);
|
||||
return false;
|
||||
} finally {
|
||||
if (timeoutId !== undefined) {
|
||||
clearTimeout(timeoutId);
|
||||
}
|
||||
if (externalAbortSignal) {
|
||||
externalAbortSignal.removeEventListener('abort', onExternalAbort);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -23,6 +23,7 @@ import {
|
||||
export type { CodexCallOptions } from './types.js';
|
||||
|
||||
const log = createLogger('codex-sdk');
|
||||
const CODEX_STREAM_IDLE_TIMEOUT_MS = 10 * 60 * 1000;
|
||||
|
||||
/**
|
||||
* Client for Codex SDK agent interactions.
|
||||
@ -55,6 +56,31 @@ export class CodexClient {
|
||||
? `${options.systemPrompt}\n\n${prompt}`
|
||||
: prompt;
|
||||
|
||||
let idleTimeoutId: ReturnType<typeof setTimeout> | undefined;
|
||||
const streamAbortController = new AbortController();
|
||||
const abortMessage = `Codex stream timed out after ${Math.floor(CODEX_STREAM_IDLE_TIMEOUT_MS / 60000)} minutes of inactivity`;
|
||||
|
||||
const resetIdleTimeout = (): void => {
|
||||
if (idleTimeoutId !== undefined) {
|
||||
clearTimeout(idleTimeoutId);
|
||||
}
|
||||
idleTimeoutId = setTimeout(() => {
|
||||
streamAbortController.abort();
|
||||
}, CODEX_STREAM_IDLE_TIMEOUT_MS);
|
||||
};
|
||||
|
||||
const onExternalAbort = (): void => {
|
||||
streamAbortController.abort();
|
||||
};
|
||||
|
||||
if (options.abortSignal) {
|
||||
if (options.abortSignal.aborted) {
|
||||
streamAbortController.abort();
|
||||
} else {
|
||||
options.abortSignal.addEventListener('abort', onExternalAbort, { once: true });
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
log.debug('Executing Codex thread', {
|
||||
agentType,
|
||||
@ -62,7 +88,10 @@ export class CodexClient {
|
||||
hasSystemPrompt: !!options.systemPrompt,
|
||||
});
|
||||
|
||||
const { events } = await thread.runStreamed(fullPrompt);
|
||||
const { events } = await thread.runStreamed(fullPrompt, {
|
||||
signal: streamAbortController.signal,
|
||||
});
|
||||
resetIdleTimeout();
|
||||
let content = '';
|
||||
const contentOffsets = new Map<string, number>();
|
||||
let success = true;
|
||||
@ -70,6 +99,7 @@ export class CodexClient {
|
||||
const state = createStreamTrackingState();
|
||||
|
||||
for await (const event of events as AsyncGenerator<CodexEvent>) {
|
||||
resetIdleTimeout();
|
||||
if (event.type === 'thread.started') {
|
||||
threadId = typeof event.thread_id === 'string' ? event.thread_id : threadId;
|
||||
emitInit(options.onStream, options.model, threadId);
|
||||
@ -172,15 +202,23 @@ export class CodexClient {
|
||||
};
|
||||
} catch (error) {
|
||||
const message = getErrorMessage(error);
|
||||
emitResult(options.onStream, false, message, threadId);
|
||||
const errorMessage = streamAbortController.signal.aborted ? abortMessage : message;
|
||||
emitResult(options.onStream, false, errorMessage, threadId);
|
||||
|
||||
return {
|
||||
persona: agentType,
|
||||
status: 'blocked',
|
||||
content: message,
|
||||
content: errorMessage,
|
||||
timestamp: new Date(),
|
||||
sessionId: threadId,
|
||||
};
|
||||
} finally {
|
||||
if (idleTimeoutId !== undefined) {
|
||||
clearTimeout(idleTimeoutId);
|
||||
}
|
||||
if (options.abortSignal) {
|
||||
options.abortSignal.removeEventListener('abort', onExternalAbort);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -21,6 +21,7 @@ export function mapToCodexSandboxMode(mode: PermissionMode): CodexSandboxMode {
|
||||
/** Options for calling Codex */
|
||||
export interface CodexCallOptions {
|
||||
cwd: string;
|
||||
abortSignal?: AbortSignal;
|
||||
sessionId?: string;
|
||||
model?: string;
|
||||
systemPrompt?: string;
|
||||
|
||||
@ -27,6 +27,7 @@ function isInsideGitRepo(cwd: string): boolean {
|
||||
function toCodexOptions(options: ProviderCallOptions): CodexCallOptions {
|
||||
return {
|
||||
cwd: options.cwd,
|
||||
abortSignal: options.abortSignal,
|
||||
sessionId: options.sessionId,
|
||||
model: options.model,
|
||||
permissionMode: options.permissionMode,
|
||||
|
||||
@ -20,6 +20,7 @@ export interface AgentSetup {
|
||||
/** Runtime options passed at call time */
|
||||
export interface ProviderCallOptions {
|
||||
cwd: string;
|
||||
abortSignal?: AbortSignal;
|
||||
sessionId?: string;
|
||||
model?: string;
|
||||
allowedTools?: string[];
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user