From 4919bc759f5be1e40c26d1a38f907c153797aec1 Mon Sep 17 00:00:00 2001 From: nrslib <38722970+nrslib@users.noreply.github.com> Date: Fri, 13 Feb 2026 06:11:06 +0900 Subject: [PATCH] =?UTF-8?q?=E5=88=A4=E5=AE=9A=E5=87=A6=E7=90=86=E3=81=AE?= =?UTF-8?q?=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- builtins/schemas/decomposition.json | 4 +- builtins/schemas/evaluation.json | 2 +- builtins/schemas/judgment.json | 2 +- docs/implements/structured-output.ja.md | 127 +++++++++++++++ e2e/fixtures/pieces/structured-output.yaml | 18 +++ e2e/specs/structured-output.e2e.ts | 96 +++++++++++ src/__tests__/agent-usecases.test.ts | 89 +++++----- src/__tests__/codex-structured-output.test.ts | 152 ++++++++---------- src/__tests__/engine-abort.test.ts | 2 +- src/__tests__/engine-arpeggio.test.ts | 2 +- src/__tests__/engine-blocked.test.ts | 2 +- src/__tests__/engine-error.test.ts | 2 +- src/__tests__/engine-happy-path.test.ts | 2 +- src/__tests__/engine-loop-monitors.test.ts | 2 +- src/__tests__/engine-parallel-failure.test.ts | 2 +- src/__tests__/engine-parallel.test.ts | 2 +- src/__tests__/engine-team-leader.test.ts | 2 +- src/__tests__/engine-test-helpers.ts | 2 +- src/__tests__/engine-worktree-report.test.ts | 2 +- src/__tests__/it-error-recovery.test.ts | 2 +- src/__tests__/it-piece-execution.test.ts | 2 +- src/__tests__/it-piece-patterns.test.ts | 2 +- src/__tests__/it-pipeline-modes.test.ts | 2 +- src/__tests__/it-pipeline.test.ts | 2 +- .../it-three-phase-execution.test.ts | 10 +- src/__tests__/parseStructuredOutput.test.ts | 86 ++++++++++ src/__tests__/report-phase-blocked.test.ts | 2 +- src/core/piece/agent-usecases.ts | 52 +++--- src/core/piece/engine/MovementExecutor.ts | 36 +++-- src/core/piece/engine/ParallelRunner.ts | 20 ++- .../instruction/StatusJudgmentBuilder.ts | 17 +- src/core/piece/phase-runner.ts | 2 +- src/core/piece/status-judgment-phase.ts | 89 ++++++---- src/infra/codex/client.ts | 30 +--- src/infra/opencode/client.ts | 34 ++-- .../prompts/en/perform_phase3_message.md | 17 +- .../prompts/ja/perform_phase3_message.md | 17 +- src/shared/utils/index.ts | 1 + src/shared/utils/structuredOutput.ts | 56 +++++++ vitest.config.e2e.provider.ts | 1 + vitest.config.e2e.structured-output.ts | 20 +++ 41 files changed, 729 insertions(+), 283 deletions(-) create mode 100644 docs/implements/structured-output.ja.md create mode 100644 e2e/fixtures/pieces/structured-output.yaml create mode 100644 e2e/specs/structured-output.e2e.ts create mode 100644 src/__tests__/parseStructuredOutput.test.ts create mode 100644 src/shared/utils/structuredOutput.ts create mode 100644 vitest.config.e2e.structured-output.ts diff --git a/builtins/schemas/decomposition.json b/builtins/schemas/decomposition.json index 5706f4b..e9116c7 100644 --- a/builtins/schemas/decomposition.json +++ b/builtins/schemas/decomposition.json @@ -19,11 +19,11 @@ "description": "Instruction for the part agent" }, "timeout_ms": { - "type": "integer", + "type": ["integer", "null"], "description": "Optional timeout in ms" } }, - "required": ["id", "title", "instruction"], + "required": ["id", "title", "instruction", "timeout_ms"], "additionalProperties": false } } diff --git a/builtins/schemas/evaluation.json b/builtins/schemas/evaluation.json index 269909c..1526206 100644 --- a/builtins/schemas/evaluation.json +++ b/builtins/schemas/evaluation.json @@ -10,6 +10,6 @@ "description": "Why this condition was matched" } }, - "required": ["matched_index"], + "required": ["matched_index", "reason"], "additionalProperties": false } diff --git a/builtins/schemas/judgment.json b/builtins/schemas/judgment.json index 42c21d0..a8d6aed 100644 --- a/builtins/schemas/judgment.json +++ b/builtins/schemas/judgment.json @@ -10,6 +10,6 @@ "description": "Brief justification for the decision" } }, - "required": ["step"], + "required": ["step", "reason"], "additionalProperties": false } diff --git a/docs/implements/structured-output.ja.md b/docs/implements/structured-output.ja.md new file mode 100644 index 0000000..660fbe3 --- /dev/null +++ b/docs/implements/structured-output.ja.md @@ -0,0 +1,127 @@ +# Structured Output — Phase 3 ステータス判定 + +## 概要 + +Phase 3(ステータス判定)において、エージェントの出力を structured output(JSON スキーマ)で取得し、ルールマッチングの精度と信頼性を向上させる。 + +## プロバイダ別の挙動 + +| プロバイダ | メソッド | 仕組み | +|-----------|---------|--------| +| Claude | `structured_output` | SDK が `StructuredOutput` ツールを自動追加。エージェントがツール経由で `{ step, reason }` を返す | +| Codex | `structured_output` | `TurnOptions.outputSchema` で API レベルの JSON 制約。テキストが JSON になる | +| OpenCode | `structured_output` | プロンプト末尾に JSON スキーマ付き出力指示を注入。テキストレスポンスから `parseStructuredOutput()` で JSON を抽出 | + +## フォールバックチェーン + +`judgeStatus()` は3段階の独立した LLM 呼び出しでルールをマッチする。 + +``` +Stage 1: structured_output — outputSchema 付き LLM 呼び出し → structuredOutput.step(1-based integer) +Stage 2: phase3_tag — outputSchema なし LLM 呼び出し → content 内の [MOVEMENT:N] タグ検出 +Stage 3: ai_judge — evaluateCondition() による AI 条件評価 +``` + +各ステージは専用のインストラクションで LLM に問い合わせる。Stage 1 は「ルール番号を JSON で返せ」、Stage 2 は「タグを1行で出力せよ」と聞き方が異なる。 + +セッションログには `toJudgmentMatchMethod()` で変換された値が記録される。 + +| 内部メソッド | セッションログ | +|-------------|--------------| +| `structured_output` | `structured_output` | +| `phase3_tag` / `phase1_tag` | `tag_fallback` | +| `ai_judge` / `ai_judge_fallback` | `ai_judge` | + +## インストラクション分岐 + +Phase 3 テンプレート(`perform_phase3_message`)は `structuredOutput` フラグで2つのモードを持つ。 + +### Structured Output モード(`structuredOutput: true`) + +主要指示: ルール番号(1-based)と理由を返せ。 +フォールバック指示: structured output が使えない場合はタグを出力せよ。 + +### タグモード(`structuredOutput: false`) + +従来の指示: 対応するタグを1行で出力せよ。 + +現在、Phase 3 は常に `structuredOutput: true` で実行される。 + +## アーキテクチャ + +``` +StatusJudgmentBuilder + └─ structuredOutput: true + ├─ criteriaTable: ルール条件テーブル(常に含む) + ├─ outputList: タグ一覧(フォールバック用に含む) + └─ テンプレート: "ルール番号と理由を返せ + タグはフォールバック" + +runStatusJudgmentPhase() + └─ judgeStatus() → JudgeStatusResult { ruleIndex, method } + └─ StatusJudgmentPhaseResult { tag, ruleIndex, method } + +MovementExecutor + ├─ Phase 3 あり → judgeStatus の結果を直接使用(method 伝搬) + └─ Phase 3 なし → detectMatchedRule() で Phase 1 コンテンツから検出 +``` + +## JSON スキーマ + +### judgment.json(judgeStatus 用) + +```json +{ + "type": "object", + "properties": { + "step": { "type": "integer", "description": "Matched rule number (1-based)" }, + "reason": { "type": "string", "description": "Brief justification" } + }, + "required": ["step", "reason"], + "additionalProperties": false +} +``` + +### evaluation.json(evaluateCondition 用) + +```json +{ + "type": "object", + "properties": { + "matched_index": { "type": "integer" }, + "reason": { "type": "string" } + }, + "required": ["matched_index", "reason"], + "additionalProperties": false +} +``` + +## parseStructuredOutput() — JSON 抽出 + +Codex と OpenCode はテキストレスポンスから JSON を抽出する。3段階のフォールバック戦略を持つ。 + +``` +1. Direct parse — テキスト全体が `{` で始まる JSON オブジェクト +2. Code block — ```json ... ``` または ``` ... ``` 内の JSON +3. Brace extraction — テキスト内の最初の `{` から最後の `}` までを切り出し +``` + +## OpenCode 固有の仕組み + +OpenCode SDK は `outputFormat` を型定義でサポートしていない。代わりにプロンプト末尾に JSON 出力指示を注入する。 + +``` +--- +IMPORTANT: You MUST respond with ONLY a valid JSON object matching this schema. No other text, no markdown code blocks, no explanation. +```json +{ "type": "object", ... } +``` +``` + +エージェントが返すテキストを `parseStructuredOutput()` でパースし、`AgentResponse.structuredOutput` に格納する。 + +## 注意事項 + +- OpenAI API(Codex)は `required` に全プロパティを含めないとエラーになる(`additionalProperties: false` 時) +- Codex SDK の `TurnCompletedEvent` には `finalResponse` フィールドがない。structured output は `AgentMessageItem.text` の JSON テキストから `parseStructuredOutput()` でパースする +- Claude SDK は `StructuredOutput` ツール方式のため、インストラクションでタグ出力を強調しすぎるとエージェントがツールを呼ばずタグを出力してしまう +- OpenCode のプロンプト注入方式はモデルの指示従順性に依存する。JSON 以外のテキストが混在する場合は `parseStructuredOutput()` の code block / brace extraction で回収する diff --git a/e2e/fixtures/pieces/structured-output.yaml b/e2e/fixtures/pieces/structured-output.yaml new file mode 100644 index 0000000..fcb7280 --- /dev/null +++ b/e2e/fixtures/pieces/structured-output.yaml @@ -0,0 +1,18 @@ +name: e2e-structured-output +description: E2E piece to verify structured output rule matching + +max_movements: 5 + +movements: + - name: execute + edit: false + persona: ../agents/test-coder.md + permission_mode: readonly + instruction_template: | + Reply with exactly: "Task completed successfully." + Do not do anything else. + rules: + - condition: Task completed + next: COMPLETE + - condition: Task failed + next: ABORT diff --git a/e2e/specs/structured-output.e2e.ts b/e2e/specs/structured-output.e2e.ts new file mode 100644 index 0000000..1742e63 --- /dev/null +++ b/e2e/specs/structured-output.e2e.ts @@ -0,0 +1,96 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { resolve, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { createIsolatedEnv, type IsolatedEnv } from '../helpers/isolated-env'; +import { createLocalRepo, type LocalRepo } from '../helpers/test-repo'; +import { runTakt } from '../helpers/takt-runner'; +import { readSessionRecords } from '../helpers/session-log'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = dirname(__filename); + +/** + * E2E: Structured output for status judgment (Phase 3). + * + * Verifies that real providers (Claude, Codex, OpenCode) can execute a piece + * where the status judgment phase uses structured output (`outputSchema`) + * internally via `judgeStatus()`. + * + * The piece has 2 rules per step, so `judgeStatus` cannot auto-select + * and must actually call the provider with an outputSchema to determine + * which rule matched. + * + * If structured output works correctly, `judgeStatus` extracts the step + * number from `response.structuredOutput.step` (recorded as `structured_output`). + * If the agent happens to output `[STEP:N]` tags, the RuleEvaluator detects + * them as `phase3_tag`/`phase1_tag` (recorded as `tag_fallback` in session log). + * The session log matchMethod is transformed by `toJudgmentMatchMethod()`. + * + * Run with: + * TAKT_E2E_PROVIDER=claude vitest run --config vitest.config.e2e.structured-output.ts + * TAKT_E2E_PROVIDER=codex vitest run --config vitest.config.e2e.structured-output.ts + * TAKT_E2E_PROVIDER=opencode TAKT_E2E_MODEL=openai/gpt-4 vitest run --config vitest.config.e2e.structured-output.ts + */ +describe('E2E: Structured output rule matching', () => { + let isolatedEnv: IsolatedEnv; + let repo: LocalRepo; + + beforeEach(() => { + isolatedEnv = createIsolatedEnv(); + repo = createLocalRepo(); + }); + + afterEach(() => { + try { repo.cleanup(); } catch { /* best-effort */ } + try { isolatedEnv.cleanup(); } catch { /* best-effort */ } + }); + + it('should complete piece via Phase 3 status judgment with 2-rule step', () => { + const piecePath = resolve(__dirname, '../fixtures/pieces/structured-output.yaml'); + + const result = runTakt({ + args: [ + '--task', 'Say hello', + '--piece', piecePath, + '--create-worktree', 'no', + ], + cwd: repo.path, + env: isolatedEnv.env, + timeout: 240_000, + }); + + if (result.exitCode !== 0) { + console.log('=== STDOUT ===\n', result.stdout); + console.log('=== STDERR ===\n', result.stderr); + } + + // Always log the matchMethod for diagnostic purposes + const allRecords = readSessionRecords(repo.path); + const sc = allRecords.find((r) => r.type === 'step_complete'); + console.log(`=== matchMethod: ${sc?.matchMethod ?? '(none)'} ===`); + + expect(result.exitCode).toBe(0); + expect(result.stdout).toContain('Piece completed'); + + // Verify session log has proper step_complete with matchMethod + const records = readSessionRecords(repo.path); + + const pieceComplete = records.find((r) => r.type === 'piece_complete'); + expect(pieceComplete).toBeDefined(); + + const stepComplete = records.find((r) => r.type === 'step_complete'); + expect(stepComplete).toBeDefined(); + + // matchMethod should be present — the 2-rule step required actual judgment + // (auto_select is only used for single-rule steps) + const matchMethod = stepComplete?.matchMethod as string | undefined; + expect(matchMethod).toBeDefined(); + + // Session log records transformed matchMethod via toJudgmentMatchMethod(): + // structured_output → structured_output (judgeStatus extracted from structuredOutput.step) + // phase3_tag / phase1_tag → tag_fallback (agent output [STEP:N] tag, detected by RuleEvaluator) + // ai_judge / ai_judge_fallback → ai_judge (AI evaluated conditions as fallback) + const validMethods = ['structured_output', 'tag_fallback', 'ai_judge']; + expect(validMethods).toContain(matchMethod); + }, 240_000); +}); diff --git a/src/__tests__/agent-usecases.test.ts b/src/__tests__/agent-usecases.test.ts index 570f368..91d0f40 100644 --- a/src/__tests__/agent-usecases.test.ts +++ b/src/__tests__/agent-usecases.test.ts @@ -40,6 +40,8 @@ function doneResponse(content: string, structuredOutput?: Record { beforeEach(() => { vi.clearAllMocks(); @@ -102,83 +104,90 @@ describe('agent-usecases', () => { expect(detectJudgeIndex).not.toHaveBeenCalled(); }); + // --- judgeStatus: 3-stage fallback --- + it('judgeStatus は単一ルール時に auto_select を返す', async () => { - const result = await judgeStatus('instruction', [{ condition: 'always', next: 'done' }], { - cwd: '/repo', - movementName: 'review', - }); + const result = await judgeStatus('structured', 'tag', [{ condition: 'always', next: 'done' }], judgeOptions); expect(result).toEqual({ ruleIndex: 0, method: 'auto_select' }); expect(runAgent).not.toHaveBeenCalled(); }); it('judgeStatus はルールが空ならエラー', async () => { - await expect(judgeStatus('instruction', [], { - cwd: '/repo', - movementName: 'review', - })).rejects.toThrow('judgeStatus requires at least one rule'); + await expect(judgeStatus('structured', 'tag', [], judgeOptions)) + .rejects.toThrow('judgeStatus requires at least one rule'); }); - it('judgeStatus は構造化出力 step を採用する', async () => { - vi.mocked(runAgent).mockResolvedValue(doneResponse('x', { step: 2 })); + it('judgeStatus は Stage 1 で構造化出力 step を採用する', async () => { + vi.mocked(runAgent).mockResolvedValueOnce(doneResponse('x', { step: 2 })); - const result = await judgeStatus('instruction', [ + const result = await judgeStatus('structured', 'tag', [ { condition: 'a', next: 'one' }, { condition: 'b', next: 'two' }, - ], { - cwd: '/repo', - movementName: 'review', - }); + ], judgeOptions); expect(result).toEqual({ ruleIndex: 1, method: 'structured_output' }); + expect(runAgent).toHaveBeenCalledTimes(1); + expect(runAgent).toHaveBeenCalledWith('conductor', 'structured', expect.objectContaining({ + outputSchema: { type: 'judgment' }, + })); }); - it('judgeStatus はタグフォールバックを使う', async () => { - vi.mocked(runAgent).mockResolvedValue(doneResponse('[REVIEW:2]')); + it('judgeStatus は Stage 2 でタグ検出を使う', async () => { + // Stage 1: structured output fails (no structuredOutput) + vi.mocked(runAgent).mockResolvedValueOnce(doneResponse('no match')); + // Stage 2: tag detection succeeds + vi.mocked(runAgent).mockResolvedValueOnce(doneResponse('[REVIEW:2]')); - const result = await judgeStatus('instruction', [ + const result = await judgeStatus('structured', 'tag', [ { condition: 'a', next: 'one' }, { condition: 'b', next: 'two' }, - ], { - cwd: '/repo', - movementName: 'review', - }); + ], judgeOptions); expect(result).toEqual({ ruleIndex: 1, method: 'phase3_tag' }); + expect(runAgent).toHaveBeenCalledTimes(2); + expect(runAgent).toHaveBeenNthCalledWith(1, 'conductor', 'structured', expect.objectContaining({ + outputSchema: { type: 'judgment' }, + })); + expect(runAgent).toHaveBeenNthCalledWith(2, 'conductor', 'tag', expect.not.objectContaining({ + outputSchema: expect.anything(), + })); }); - it('judgeStatus は最終手段として AI Judge を使う', async () => { - vi.mocked(runAgent) - .mockResolvedValueOnce(doneResponse('no match')) - .mockResolvedValueOnce(doneResponse('ignored', { matched_index: 2 })); + it('judgeStatus は Stage 3 で AI Judge を使う', async () => { + // Stage 1: structured output fails + vi.mocked(runAgent).mockResolvedValueOnce(doneResponse('no match')); + // Stage 2: tag detection fails + vi.mocked(runAgent).mockResolvedValueOnce(doneResponse('no tag')); + // Stage 3: evaluateCondition succeeds + vi.mocked(runAgent).mockResolvedValueOnce(doneResponse('ignored', { matched_index: 2 })); - const result = await judgeStatus('instruction', [ + const result = await judgeStatus('structured', 'tag', [ { condition: 'a', next: 'one' }, { condition: 'b', next: 'two' }, - ], { - cwd: '/repo', - movementName: 'review', - }); + ], judgeOptions); expect(result).toEqual({ ruleIndex: 1, method: 'ai_judge' }); - expect(runAgent).toHaveBeenCalledTimes(2); + expect(runAgent).toHaveBeenCalledTimes(3); }); it('judgeStatus は全ての判定に失敗したらエラー', async () => { - vi.mocked(runAgent) - .mockResolvedValueOnce(doneResponse('no match')) - .mockResolvedValueOnce(doneResponse('still no match')); + // Stage 1: structured output fails + vi.mocked(runAgent).mockResolvedValueOnce(doneResponse('no match')); + // Stage 2: tag detection fails + vi.mocked(runAgent).mockResolvedValueOnce(doneResponse('no tag')); + // Stage 3: evaluateCondition fails + vi.mocked(runAgent).mockResolvedValueOnce(doneResponse('still no match')); vi.mocked(detectJudgeIndex).mockReturnValue(-1); - await expect(judgeStatus('instruction', [ + await expect(judgeStatus('structured', 'tag', [ { condition: 'a', next: 'one' }, { condition: 'b', next: 'two' }, - ], { - cwd: '/repo', - movementName: 'review', - })).rejects.toThrow('Status not found for movement "review"'); + ], judgeOptions)).rejects.toThrow('Status not found for movement "review"'); }); + // --- decomposeTask --- + it('decomposeTask は構造化出力 parts を返す', async () => { vi.mocked(runAgent).mockResolvedValue(doneResponse('x', { parts: [ diff --git a/src/__tests__/codex-structured-output.test.ts b/src/__tests__/codex-structured-output.test.ts index 1ceb12c..a262b14 100644 --- a/src/__tests__/codex-structured-output.test.ts +++ b/src/__tests__/codex-structured-output.test.ts @@ -1,8 +1,12 @@ /** * Codex SDK layer structured output tests. * - * Tests CodexClient's extraction of structuredOutput from - * `turn.completed` events' `finalResponse` field. + * Tests CodexClient's extraction of structuredOutput by parsing + * JSON text from agent_message items when outputSchema is provided. + * + * Codex SDK returns structured output as JSON text in agent_message + * items (not via turn.completed.finalResponse which doesn't exist + * on TurnCompletedEvent). */ import { beforeEach, describe, expect, it, vi } from 'vitest'; @@ -42,34 +46,32 @@ describe('CodexClient — structuredOutput 抽出', () => { mockEvents = []; }); - it('turn.completed の finalResponse を structuredOutput として返す', async () => { + it('outputSchema 指定時に agent_message の JSON テキストを structuredOutput として返す', async () => { + const schema = { type: 'object', properties: { step: { type: 'integer' } } }; mockEvents = [ { type: 'thread.started', thread_id: 'thread-1' }, { type: 'item.completed', - item: { id: 'msg-1', type: 'agent_message', text: 'response text' }, - }, - { - type: 'turn.completed', - turn: { finalResponse: { step: 2, reason: 'approved' } }, + item: { id: 'msg-1', type: 'agent_message', text: '{"step": 2, "reason": "approved"}' }, }, + { type: 'turn.completed', usage: { input_tokens: 0, cached_input_tokens: 0, output_tokens: 0 } }, ]; const client = new CodexClient(); - const result = await client.call('coder', 'prompt', { cwd: '/tmp' }); + const result = await client.call('coder', 'prompt', { cwd: '/tmp', outputSchema: schema }); expect(result.status).toBe('done'); expect(result.structuredOutput).toEqual({ step: 2, reason: 'approved' }); }); - it('turn.completed に finalResponse がない場合は undefined', async () => { + it('outputSchema なしの場合はテキストを JSON パースしない', async () => { mockEvents = [ { type: 'thread.started', thread_id: 'thread-1' }, { type: 'item.completed', - item: { id: 'msg-1', type: 'agent_message', text: 'text' }, + item: { id: 'msg-1', type: 'agent_message', text: '{"step": 2}' }, }, - { type: 'turn.completed', turn: {} }, + { type: 'turn.completed', usage: { input_tokens: 0, cached_input_tokens: 0, output_tokens: 0 } }, ]; const client = new CodexClient(); @@ -79,86 +81,64 @@ describe('CodexClient — structuredOutput 抽出', () => { expect(result.structuredOutput).toBeUndefined(); }); - it('finalResponse が配列の場合は無視する', async () => { - mockEvents = [ - { type: 'thread.started', thread_id: 'thread-1' }, - { - type: 'item.completed', - item: { id: 'msg-1', type: 'agent_message', text: 'text' }, - }, - { type: 'turn.completed', turn: { finalResponse: [1, 2, 3] } }, - ]; - - const client = new CodexClient(); - const result = await client.call('coder', 'prompt', { cwd: '/tmp' }); - - expect(result.structuredOutput).toBeUndefined(); - }); - - it('finalResponse が null の場合は undefined', async () => { - mockEvents = [ - { type: 'thread.started', thread_id: 'thread-1' }, - { type: 'turn.completed', turn: { finalResponse: null } }, - ]; - - const client = new CodexClient(); - const result = await client.call('coder', 'prompt', { cwd: '/tmp' }); - - expect(result.structuredOutput).toBeUndefined(); - }); - - it('turn.completed がない場合は structuredOutput なし', async () => { - mockEvents = [ - { type: 'thread.started', thread_id: 'thread-1' }, - { - type: 'item.completed', - item: { id: 'msg-1', type: 'agent_message', text: 'response' }, - }, - ]; - - const client = new CodexClient(); - const result = await client.call('coder', 'prompt', { cwd: '/tmp' }); - - expect(result.status).toBe('done'); - expect(result.structuredOutput).toBeUndefined(); - }); - - it('outputSchema が runStreamed に渡される', async () => { + it('agent_message が JSON でない場合は undefined', async () => { const schema = { type: 'object', properties: { step: { type: 'integer' } } }; - const runStreamedSpy = vi.fn().mockResolvedValue({ - events: (async function* () { - yield { type: 'thread.started', thread_id: 'thread-1' }; - yield { - type: 'item.completed', - item: { id: 'msg-1', type: 'agent_message', text: 'ok' }, - }; - yield { - type: 'turn.completed', - turn: { finalResponse: { step: 1 } }, - }; - })(), - }); - - // Mock SDK で startThread が返す thread の runStreamed を spy に差し替え - const { Codex } = await import('@openai/codex-sdk'); - const codex = new Codex({} as never); - const thread = await codex.startThread(); - thread.runStreamed = runStreamedSpy; - - // CodexClient は内部で Codex を new するため、 - // SDK クラス自体のモックで startThread の返り値を制御 - // → mockEvents ベースの簡易テストでは runStreamed の引数を直接検証できない - // ここではプロバイダ層テスト (provider-structured-output.test.ts) で - // outputSchema パススルーを検証済みのため、SDK 内部の引数検証はスキップ - - // 代わりに、outputSchema 付きで呼び出して structuredOutput が返ることを確認 mockEvents = [ { type: 'thread.started', thread_id: 'thread-1' }, { type: 'item.completed', - item: { id: 'msg-1', type: 'agent_message', text: 'ok' }, + item: { id: 'msg-1', type: 'agent_message', text: 'plain text response' }, }, - { type: 'turn.completed', turn: { finalResponse: { step: 1 } } }, + { type: 'turn.completed', usage: { input_tokens: 0, cached_input_tokens: 0, output_tokens: 0 } }, + ]; + + const client = new CodexClient(); + const result = await client.call('coder', 'prompt', { cwd: '/tmp', outputSchema: schema }); + + expect(result.status).toBe('done'); + expect(result.structuredOutput).toBeUndefined(); + }); + + it('JSON が配列の場合は無視する', async () => { + const schema = { type: 'object', properties: { step: { type: 'integer' } } }; + mockEvents = [ + { type: 'thread.started', thread_id: 'thread-1' }, + { + type: 'item.completed', + item: { id: 'msg-1', type: 'agent_message', text: '[1, 2, 3]' }, + }, + { type: 'turn.completed', usage: { input_tokens: 0, cached_input_tokens: 0, output_tokens: 0 } }, + ]; + + const client = new CodexClient(); + const result = await client.call('coder', 'prompt', { cwd: '/tmp', outputSchema: schema }); + + expect(result.structuredOutput).toBeUndefined(); + }); + + it('agent_message がない場合は structuredOutput なし', async () => { + const schema = { type: 'object', properties: { step: { type: 'integer' } } }; + mockEvents = [ + { type: 'thread.started', thread_id: 'thread-1' }, + { type: 'turn.completed', usage: { input_tokens: 0, cached_input_tokens: 0, output_tokens: 0 } }, + ]; + + const client = new CodexClient(); + const result = await client.call('coder', 'prompt', { cwd: '/tmp', outputSchema: schema }); + + expect(result.status).toBe('done'); + expect(result.structuredOutput).toBeUndefined(); + }); + + it('outputSchema 付きで呼び出して structuredOutput が返る', async () => { + const schema = { type: 'object', properties: { step: { type: 'integer' } } }; + mockEvents = [ + { type: 'thread.started', thread_id: 'thread-1' }, + { + type: 'item.completed', + item: { id: 'msg-1', type: 'agent_message', text: '{"step": 1}' }, + }, + { type: 'turn.completed', usage: { input_tokens: 0, cached_input_tokens: 0, output_tokens: 0 } }, ]; const client = new CodexClient(); diff --git a/src/__tests__/engine-abort.test.ts b/src/__tests__/engine-abort.test.ts index dae845c..0d9fdff 100644 --- a/src/__tests__/engine-abort.test.ts +++ b/src/__tests__/engine-abort.test.ts @@ -25,7 +25,7 @@ vi.mock('../core/piece/evaluation/index.js', () => ({ vi.mock('../core/piece/phase-runner.js', () => ({ needsStatusJudgmentPhase: vi.fn().mockReturnValue(false), runReportPhase: vi.fn().mockResolvedValue(undefined), - runStatusJudgmentPhase: vi.fn().mockResolvedValue(''), + runStatusJudgmentPhase: vi.fn().mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }), })); vi.mock('../shared/utils/index.js', async (importOriginal) => ({ diff --git a/src/__tests__/engine-arpeggio.test.ts b/src/__tests__/engine-arpeggio.test.ts index 3523c60..35f55b2 100644 --- a/src/__tests__/engine-arpeggio.test.ts +++ b/src/__tests__/engine-arpeggio.test.ts @@ -21,7 +21,7 @@ vi.mock('../core/piece/evaluation/index.js', () => ({ vi.mock('../core/piece/phase-runner.js', () => ({ needsStatusJudgmentPhase: vi.fn().mockReturnValue(false), runReportPhase: vi.fn().mockResolvedValue(undefined), - runStatusJudgmentPhase: vi.fn().mockResolvedValue(''), + runStatusJudgmentPhase: vi.fn().mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }), })); vi.mock('../shared/utils/index.js', async () => { diff --git a/src/__tests__/engine-blocked.test.ts b/src/__tests__/engine-blocked.test.ts index 8cf10e6..02abc20 100644 --- a/src/__tests__/engine-blocked.test.ts +++ b/src/__tests__/engine-blocked.test.ts @@ -23,7 +23,7 @@ vi.mock('../core/piece/evaluation/index.js', () => ({ vi.mock('../core/piece/phase-runner.js', () => ({ needsStatusJudgmentPhase: vi.fn().mockReturnValue(false), runReportPhase: vi.fn().mockResolvedValue(undefined), - runStatusJudgmentPhase: vi.fn().mockResolvedValue(''), + runStatusJudgmentPhase: vi.fn().mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }), })); vi.mock('../shared/utils/index.js', async (importOriginal) => ({ diff --git a/src/__tests__/engine-error.test.ts b/src/__tests__/engine-error.test.ts index 28fa82c..553ec2f 100644 --- a/src/__tests__/engine-error.test.ts +++ b/src/__tests__/engine-error.test.ts @@ -24,7 +24,7 @@ vi.mock('../core/piece/evaluation/index.js', () => ({ vi.mock('../core/piece/phase-runner.js', () => ({ needsStatusJudgmentPhase: vi.fn().mockReturnValue(false), runReportPhase: vi.fn().mockResolvedValue(undefined), - runStatusJudgmentPhase: vi.fn().mockResolvedValue(''), + runStatusJudgmentPhase: vi.fn().mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }), })); vi.mock('../shared/utils/index.js', async (importOriginal) => ({ diff --git a/src/__tests__/engine-happy-path.test.ts b/src/__tests__/engine-happy-path.test.ts index d067fa4..c42e613 100644 --- a/src/__tests__/engine-happy-path.test.ts +++ b/src/__tests__/engine-happy-path.test.ts @@ -28,7 +28,7 @@ vi.mock('../core/piece/evaluation/index.js', () => ({ vi.mock('../core/piece/phase-runner.js', () => ({ needsStatusJudgmentPhase: vi.fn().mockReturnValue(false), runReportPhase: vi.fn().mockResolvedValue(undefined), - runStatusJudgmentPhase: vi.fn().mockResolvedValue(''), + runStatusJudgmentPhase: vi.fn().mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }), })); vi.mock('../shared/utils/index.js', async (importOriginal) => ({ diff --git a/src/__tests__/engine-loop-monitors.test.ts b/src/__tests__/engine-loop-monitors.test.ts index e363264..31aff5d 100644 --- a/src/__tests__/engine-loop-monitors.test.ts +++ b/src/__tests__/engine-loop-monitors.test.ts @@ -27,7 +27,7 @@ vi.mock('../core/piece/evaluation/index.js', () => ({ vi.mock('../core/piece/phase-runner.js', () => ({ needsStatusJudgmentPhase: vi.fn().mockReturnValue(false), runReportPhase: vi.fn().mockResolvedValue(undefined), - runStatusJudgmentPhase: vi.fn().mockResolvedValue(''), + runStatusJudgmentPhase: vi.fn().mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }), })); vi.mock('../shared/utils/index.js', async (importOriginal) => ({ diff --git a/src/__tests__/engine-parallel-failure.test.ts b/src/__tests__/engine-parallel-failure.test.ts index ef1569b..2ead682 100644 --- a/src/__tests__/engine-parallel-failure.test.ts +++ b/src/__tests__/engine-parallel-failure.test.ts @@ -23,7 +23,7 @@ vi.mock('../core/piece/evaluation/index.js', () => ({ vi.mock('../core/piece/phase-runner.js', () => ({ needsStatusJudgmentPhase: vi.fn().mockReturnValue(false), runReportPhase: vi.fn().mockResolvedValue(undefined), - runStatusJudgmentPhase: vi.fn().mockResolvedValue(''), + runStatusJudgmentPhase: vi.fn().mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }), })); vi.mock('../shared/utils/index.js', async (importOriginal) => ({ diff --git a/src/__tests__/engine-parallel.test.ts b/src/__tests__/engine-parallel.test.ts index bb5cf77..f86f1bf 100644 --- a/src/__tests__/engine-parallel.test.ts +++ b/src/__tests__/engine-parallel.test.ts @@ -24,7 +24,7 @@ vi.mock('../core/piece/evaluation/index.js', () => ({ vi.mock('../core/piece/phase-runner.js', () => ({ needsStatusJudgmentPhase: vi.fn().mockReturnValue(false), runReportPhase: vi.fn().mockResolvedValue(undefined), - runStatusJudgmentPhase: vi.fn().mockResolvedValue(''), + runStatusJudgmentPhase: vi.fn().mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }), })); vi.mock('../shared/utils/index.js', async (importOriginal) => ({ diff --git a/src/__tests__/engine-team-leader.test.ts b/src/__tests__/engine-team-leader.test.ts index cd35a8d..3d0de7e 100644 --- a/src/__tests__/engine-team-leader.test.ts +++ b/src/__tests__/engine-team-leader.test.ts @@ -17,7 +17,7 @@ vi.mock('../core/piece/evaluation/index.js', () => ({ vi.mock('../core/piece/phase-runner.js', () => ({ needsStatusJudgmentPhase: vi.fn().mockReturnValue(false), runReportPhase: vi.fn().mockResolvedValue(undefined), - runStatusJudgmentPhase: vi.fn().mockResolvedValue(''), + runStatusJudgmentPhase: vi.fn().mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }), })); vi.mock('../shared/utils/index.js', async (importOriginal) => ({ diff --git a/src/__tests__/engine-test-helpers.ts b/src/__tests__/engine-test-helpers.ts index 3513476..f17dc03 100644 --- a/src/__tests__/engine-test-helpers.ts +++ b/src/__tests__/engine-test-helpers.ts @@ -173,7 +173,7 @@ export function createTestTmpDir(): string { export function applyDefaultMocks(): void { vi.mocked(needsStatusJudgmentPhase).mockReturnValue(false); vi.mocked(runReportPhase).mockResolvedValue(undefined); - vi.mocked(runStatusJudgmentPhase).mockResolvedValue(''); + vi.mocked(runStatusJudgmentPhase).mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }); vi.mocked(generateReportDir).mockReturnValue('test-report-dir'); } diff --git a/src/__tests__/engine-worktree-report.test.ts b/src/__tests__/engine-worktree-report.test.ts index 32fa194..f90084f 100644 --- a/src/__tests__/engine-worktree-report.test.ts +++ b/src/__tests__/engine-worktree-report.test.ts @@ -24,7 +24,7 @@ vi.mock('../core/piece/evaluation/index.js', () => ({ vi.mock('../core/piece/phase-runner.js', () => ({ needsStatusJudgmentPhase: vi.fn().mockReturnValue(false), runReportPhase: vi.fn().mockResolvedValue(undefined), - runStatusJudgmentPhase: vi.fn().mockResolvedValue(''), + runStatusJudgmentPhase: vi.fn().mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }), })); vi.mock('../shared/utils/index.js', async (importOriginal) => ({ diff --git a/src/__tests__/it-error-recovery.test.ts b/src/__tests__/it-error-recovery.test.ts index 4375c31..75199ba 100644 --- a/src/__tests__/it-error-recovery.test.ts +++ b/src/__tests__/it-error-recovery.test.ts @@ -31,7 +31,7 @@ vi.mock('../agents/ai-judge.js', async (importOriginal) => { vi.mock('../core/piece/phase-runner.js', () => ({ needsStatusJudgmentPhase: vi.fn().mockReturnValue(false), runReportPhase: vi.fn().mockResolvedValue(undefined), - runStatusJudgmentPhase: vi.fn().mockResolvedValue(''), + runStatusJudgmentPhase: vi.fn().mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }), })); vi.mock('../shared/utils/index.js', async (importOriginal) => ({ diff --git a/src/__tests__/it-piece-execution.test.ts b/src/__tests__/it-piece-execution.test.ts index 78e2851..912fa8b 100644 --- a/src/__tests__/it-piece-execution.test.ts +++ b/src/__tests__/it-piece-execution.test.ts @@ -35,7 +35,7 @@ vi.mock('../agents/ai-judge.js', async (importOriginal) => { vi.mock('../core/piece/phase-runner.js', () => ({ needsStatusJudgmentPhase: vi.fn().mockReturnValue(false), runReportPhase: vi.fn().mockResolvedValue(undefined), - runStatusJudgmentPhase: vi.fn().mockResolvedValue(''), + runStatusJudgmentPhase: vi.fn().mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }), })); vi.mock('../shared/utils/index.js', async (importOriginal) => ({ diff --git a/src/__tests__/it-piece-patterns.test.ts b/src/__tests__/it-piece-patterns.test.ts index 081af86..bd99736 100644 --- a/src/__tests__/it-piece-patterns.test.ts +++ b/src/__tests__/it-piece-patterns.test.ts @@ -37,7 +37,7 @@ vi.mock('../agents/ai-judge.js', async (importOriginal) => { vi.mock('../core/piece/phase-runner.js', () => ({ needsStatusJudgmentPhase: vi.fn().mockReturnValue(false), runReportPhase: vi.fn().mockResolvedValue(undefined), - runStatusJudgmentPhase: vi.fn().mockResolvedValue(''), + runStatusJudgmentPhase: vi.fn().mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }), })); vi.mock('../shared/utils/index.js', async (importOriginal) => ({ diff --git a/src/__tests__/it-pipeline-modes.test.ts b/src/__tests__/it-pipeline-modes.test.ts index 0916bef..a381483 100644 --- a/src/__tests__/it-pipeline-modes.test.ts +++ b/src/__tests__/it-pipeline-modes.test.ts @@ -144,7 +144,7 @@ vi.mock('../shared/prompt/index.js', () => ({ vi.mock('../core/piece/phase-runner.js', () => ({ needsStatusJudgmentPhase: vi.fn().mockReturnValue(false), runReportPhase: vi.fn().mockResolvedValue(undefined), - runStatusJudgmentPhase: vi.fn().mockResolvedValue(''), + runStatusJudgmentPhase: vi.fn().mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }), })); // --- Imports (after mocks) --- diff --git a/src/__tests__/it-pipeline.test.ts b/src/__tests__/it-pipeline.test.ts index ad02723..5743f29 100644 --- a/src/__tests__/it-pipeline.test.ts +++ b/src/__tests__/it-pipeline.test.ts @@ -125,7 +125,7 @@ vi.mock('../shared/prompt/index.js', () => ({ vi.mock('../core/piece/phase-runner.js', () => ({ needsStatusJudgmentPhase: vi.fn().mockReturnValue(false), runReportPhase: vi.fn().mockResolvedValue(undefined), - runStatusJudgmentPhase: vi.fn().mockResolvedValue(''), + runStatusJudgmentPhase: vi.fn().mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }), })); // --- Imports (after mocks) --- diff --git a/src/__tests__/it-three-phase-execution.test.ts b/src/__tests__/it-three-phase-execution.test.ts index a6f7d6a..d5b173e 100644 --- a/src/__tests__/it-three-phase-execution.test.ts +++ b/src/__tests__/it-three-phase-execution.test.ts @@ -114,7 +114,7 @@ describe('Three-Phase Execution IT: phase1 only (no report, no tag rules)', () = // No tag rules needed → Phase 3 not needed mockNeedsStatusJudgmentPhase.mockReturnValue(false); mockRunReportPhase.mockResolvedValue(undefined); - mockRunStatusJudgmentPhase.mockResolvedValue(''); + mockRunStatusJudgmentPhase.mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }); }); afterEach(() => { @@ -166,7 +166,7 @@ describe('Three-Phase Execution IT: phase1 + phase2 (report defined)', () => { mockNeedsStatusJudgmentPhase.mockReturnValue(false); mockRunReportPhase.mockResolvedValue(undefined); - mockRunStatusJudgmentPhase.mockResolvedValue(''); + mockRunStatusJudgmentPhase.mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }); }); afterEach(() => { @@ -246,7 +246,7 @@ describe('Three-Phase Execution IT: phase1 + phase3 (tag rules defined)', () => mockNeedsStatusJudgmentPhase.mockReturnValue(true); mockRunReportPhase.mockResolvedValue(undefined); // Phase 3 returns content with a tag - mockRunStatusJudgmentPhase.mockResolvedValue('[STEP:1]'); + mockRunStatusJudgmentPhase.mockResolvedValue({ tag: '[STEP:1]', ruleIndex: 0, method: 'structured_output' }); }); afterEach(() => { @@ -298,7 +298,7 @@ describe('Three-Phase Execution IT: all three phases', () => { mockNeedsStatusJudgmentPhase.mockReturnValue(true); mockRunReportPhase.mockResolvedValue(undefined); - mockRunStatusJudgmentPhase.mockResolvedValue('[STEP:1]'); + mockRunStatusJudgmentPhase.mockResolvedValue({ tag: '[STEP:1]', ruleIndex: 0, method: 'structured_output' }); }); afterEach(() => { @@ -369,7 +369,7 @@ describe('Three-Phase Execution IT: phase3 tag → rule match', () => { ]); // Phase 3 returns rule 2 (ABORT) - mockRunStatusJudgmentPhase.mockResolvedValue('[STEP1:2]'); + mockRunStatusJudgmentPhase.mockResolvedValue({ tag: '[STEP1:2]', ruleIndex: 1, method: 'structured_output' }); const config: PieceConfig = { name: 'it-phase3-tag', diff --git a/src/__tests__/parseStructuredOutput.test.ts b/src/__tests__/parseStructuredOutput.test.ts new file mode 100644 index 0000000..7f247e7 --- /dev/null +++ b/src/__tests__/parseStructuredOutput.test.ts @@ -0,0 +1,86 @@ +import { describe, it, expect } from 'vitest'; +import { parseStructuredOutput } from '../shared/utils/structuredOutput.js'; + +describe('parseStructuredOutput', () => { + it('should return undefined when hasOutputSchema is false', () => { + expect(parseStructuredOutput('{"step":1}', false)).toBeUndefined(); + }); + + it('should return undefined for empty text', () => { + expect(parseStructuredOutput('', true)).toBeUndefined(); + }); + + // Strategy 1: Direct JSON parse + describe('direct JSON parse', () => { + it('should parse pure JSON object', () => { + expect(parseStructuredOutput('{"step":1,"reason":"done"}', true)) + .toEqual({ step: 1, reason: 'done' }); + }); + + it('should parse JSON with whitespace', () => { + expect(parseStructuredOutput(' { "step": 2, "reason": "ok" } ', true)) + .toEqual({ step: 2, reason: 'ok' }); + }); + + it('should ignore arrays', () => { + expect(parseStructuredOutput('[1,2,3]', true)).toBeUndefined(); + }); + + it('should ignore primitive JSON', () => { + expect(parseStructuredOutput('"hello"', true)).toBeUndefined(); + }); + }); + + // Strategy 2: Code block extraction + describe('code block extraction', () => { + it('should extract JSON from ```json code block', () => { + const text = 'Here is the result:\n```json\n{"step":1,"reason":"matched"}\n```'; + expect(parseStructuredOutput(text, true)) + .toEqual({ step: 1, reason: 'matched' }); + }); + + it('should extract JSON from ``` code block (no language)', () => { + const text = 'Result:\n```\n{"step":2,"reason":"fallback"}\n```'; + expect(parseStructuredOutput(text, true)) + .toEqual({ step: 2, reason: 'fallback' }); + }); + }); + + // Strategy 3: Brace extraction + describe('brace extraction', () => { + it('should extract JSON with preamble text', () => { + const text = 'The matched rule is: {"step":1,"reason":"condition met"}'; + expect(parseStructuredOutput(text, true)) + .toEqual({ step: 1, reason: 'condition met' }); + }); + + it('should extract JSON with postamble text', () => { + const text = '{"step":3,"reason":"done"}\nEnd of response.'; + expect(parseStructuredOutput(text, true)) + .toEqual({ step: 3, reason: 'done' }); + }); + + it('should extract JSON with both preamble and postamble', () => { + const text = 'Based on my analysis:\n{"matched_index":2,"reason":"test"}\nThat is my judgment.'; + expect(parseStructuredOutput(text, true)) + .toEqual({ matched_index: 2, reason: 'test' }); + }); + }); + + // Edge cases + describe('edge cases', () => { + it('should return undefined for text without JSON', () => { + expect(parseStructuredOutput('No JSON here at all.', true)).toBeUndefined(); + }); + + it('should return undefined for invalid JSON', () => { + expect(parseStructuredOutput('{invalid json}', true)).toBeUndefined(); + }); + + it('should handle nested objects', () => { + const text = '{"step":1,"reason":"ok","meta":{"detail":"extra"}}'; + expect(parseStructuredOutput(text, true)) + .toEqual({ step: 1, reason: 'ok', meta: { detail: 'extra' } }); + }); + }); +}); diff --git a/src/__tests__/report-phase-blocked.test.ts b/src/__tests__/report-phase-blocked.test.ts index fca6c2b..0241784 100644 --- a/src/__tests__/report-phase-blocked.test.ts +++ b/src/__tests__/report-phase-blocked.test.ts @@ -23,7 +23,7 @@ vi.mock('../core/piece/evaluation/index.js', () => ({ vi.mock('../core/piece/phase-runner.js', () => ({ needsStatusJudgmentPhase: vi.fn().mockReturnValue(false), runReportPhase: vi.fn().mockResolvedValue(undefined), - runStatusJudgmentPhase: vi.fn().mockResolvedValue(''), + runStatusJudgmentPhase: vi.fn().mockResolvedValue({ tag: '', ruleIndex: 0, method: 'auto_select' }), })); vi.mock('../shared/utils/index.js', async (importOriginal) => ({ diff --git a/src/core/piece/agent-usecases.ts b/src/core/piece/agent-usecases.ts index 0a0fe5e..2c20ee2 100644 --- a/src/core/piece/agent-usecases.ts +++ b/src/core/piece/agent-usecases.ts @@ -85,7 +85,8 @@ export async function evaluateCondition( } export async function judgeStatus( - instruction: string, + structuredInstruction: string, + tagInstruction: string, rules: PieceRule[], options: JudgeStatusOptions, ): Promise { @@ -94,48 +95,47 @@ export async function judgeStatus( } if (rules.length === 1) { - return { - ruleIndex: 0, - method: 'auto_select', - }; + return { ruleIndex: 0, method: 'auto_select' }; } - const response = await runAgent('conductor', instruction, { + const agentOptions = { cwd: options.cwd, maxTurns: 3, - permissionMode: 'readonly', + permissionMode: 'readonly' as const, language: options.language, + }; + + // Stage 1: Structured output + const structuredResponse = await runAgent('conductor', structuredInstruction, { + ...agentOptions, outputSchema: loadJudgmentSchema(), }); - if (response.status === 'done') { - const stepNumber = response.structuredOutput?.step; + if (structuredResponse.status === 'done') { + const stepNumber = structuredResponse.structuredOutput?.step; if (typeof stepNumber === 'number' && Number.isInteger(stepNumber)) { const ruleIndex = stepNumber - 1; if (ruleIndex >= 0 && ruleIndex < rules.length) { - return { - ruleIndex, - method: 'structured_output', - }; + return { ruleIndex, method: 'structured_output' }; } } - - const tagRuleIndex = detectRuleIndex(response.content, options.movementName); - if (tagRuleIndex >= 0 && tagRuleIndex < rules.length) { - return { - ruleIndex: tagRuleIndex, - method: 'phase3_tag', - }; - } } + // Stage 2: Tag detection (dedicated call, no outputSchema) + const tagResponse = await runAgent('conductor', tagInstruction, agentOptions); + + if (tagResponse.status === 'done') { + const tagRuleIndex = detectRuleIndex(tagResponse.content, options.movementName); + if (tagRuleIndex >= 0 && tagRuleIndex < rules.length) { + return { ruleIndex: tagRuleIndex, method: 'phase3_tag' }; + } + } + + // Stage 3: AI judge const conditions = rules.map((rule, index) => ({ index, text: rule.condition })); - const fallbackIndex = await evaluateCondition(instruction, conditions, { cwd: options.cwd }); + const fallbackIndex = await evaluateCondition(structuredInstruction, conditions, { cwd: options.cwd }); if (fallbackIndex >= 0 && fallbackIndex < rules.length) { - return { - ruleIndex: fallbackIndex, - method: 'ai_judge', - }; + return { ruleIndex: fallbackIndex, method: 'ai_judge' }; } throw new Error(`Status not found for movement "${options.movementName}"`); diff --git a/src/core/piece/engine/MovementExecutor.ts b/src/core/piece/engine/MovementExecutor.ts index ca4cd76..32a2b8a 100644 --- a/src/core/piece/engine/MovementExecutor.ts +++ b/src/core/piece/engine/MovementExecutor.ts @@ -220,22 +220,28 @@ export class MovementExecutor { } } - // Phase 3: status judgment (resume session, no tools, output status tag) - let tagContent = ''; - if (needsStatusJudgmentPhase(step)) { - tagContent = await runStatusJudgmentPhase(step, phaseCtx); - } + // Phase 3: status judgment (new session, no tools, determines matched rule) + const phase3Result = needsStatusJudgmentPhase(step) + ? await runStatusJudgmentPhase(step, phaseCtx) + : undefined; - const match = await detectMatchedRule(step, response.content, tagContent, { - state, - cwd: this.deps.getCwd(), - interactive: this.deps.getInteractive(), - detectRuleIndex: this.deps.detectRuleIndex, - callAiJudge: this.deps.callAiJudge, - }); - if (match) { - log.debug('Rule matched', { movement: step.name, ruleIndex: match.index, method: match.method }); - response = { ...response, matchedRuleIndex: match.index, matchedRuleMethod: match.method }; + if (phase3Result) { + // Phase 3 already determined the matched rule — use its result directly + log.debug('Rule matched (Phase 3)', { movement: step.name, ruleIndex: phase3Result.ruleIndex, method: phase3Result.method }); + response = { ...response, matchedRuleIndex: phase3Result.ruleIndex, matchedRuleMethod: phase3Result.method }; + } else { + // No Phase 3 — use rule evaluator with Phase 1 content + const match = await detectMatchedRule(step, response.content, '', { + state, + cwd: this.deps.getCwd(), + interactive: this.deps.getInteractive(), + detectRuleIndex: this.deps.detectRuleIndex, + callAiJudge: this.deps.callAiJudge, + }); + if (match) { + log.debug('Rule matched', { movement: step.name, ruleIndex: match.index, method: match.method }); + response = { ...response, matchedRuleIndex: match.index, matchedRuleMethod: match.method }; + } } state.movementOutputs.set(step.name, response); diff --git a/src/core/piece/engine/ParallelRunner.ts b/src/core/piece/engine/ParallelRunner.ts index 94dc5ae..1a5d865 100644 --- a/src/core/piece/engine/ParallelRunner.ts +++ b/src/core/piece/engine/ParallelRunner.ts @@ -114,15 +114,19 @@ export class ParallelRunner { } // Phase 3: status judgment for sub-movement - let subTagContent = ''; - if (needsStatusJudgmentPhase(subMovement)) { - subTagContent = await runStatusJudgmentPhase(subMovement, phaseCtx); - } + const subPhase3 = needsStatusJudgmentPhase(subMovement) + ? await runStatusJudgmentPhase(subMovement, phaseCtx) + : undefined; - const match = await detectMatchedRule(subMovement, subResponse.content, subTagContent, ruleCtx); - const finalResponse = match - ? { ...subResponse, matchedRuleIndex: match.index, matchedRuleMethod: match.method } - : subResponse; + let finalResponse: AgentResponse; + if (subPhase3) { + finalResponse = { ...subResponse, matchedRuleIndex: subPhase3.ruleIndex, matchedRuleMethod: subPhase3.method }; + } else { + const match = await detectMatchedRule(subMovement, subResponse.content, '', ruleCtx); + finalResponse = match + ? { ...subResponse, matchedRuleIndex: match.index, matchedRuleMethod: match.method } + : subResponse; + } state.movementOutputs.set(subMovement.name, finalResponse); this.deps.movementExecutor.emitMovementReports(subMovement); diff --git a/src/core/piece/instruction/StatusJudgmentBuilder.ts b/src/core/piece/instruction/StatusJudgmentBuilder.ts index e4f4099..9529050 100644 --- a/src/core/piece/instruction/StatusJudgmentBuilder.ts +++ b/src/core/piece/instruction/StatusJudgmentBuilder.ts @@ -27,8 +27,8 @@ export interface StatusJudgmentContext { lastResponse?: string; /** Input source type for fallback strategies */ inputSource?: 'report' | 'response'; - /** Structured output mode omits tag-format instructions */ - useStructuredOutput?: boolean; + /** When true, omit tag output instructions (structured output schema handles format) */ + structuredOutput?: boolean; } /** @@ -66,14 +66,17 @@ export class StatusJudgmentBuilder { contentToJudge = this.buildFromResponse(); } + const isStructured = this.context.structuredOutput ?? false; + return loadTemplate('perform_phase3_message', language, { reportContent: contentToJudge, criteriaTable: components.criteriaTable, - outputList: this.context.useStructuredOutput - ? '' - : components.outputList, - hasAppendix: components.hasAppendix, - appendixContent: components.appendixContent, + structuredOutput: isStructured, + ...(isStructured ? {} : { + outputList: components.outputList, + hasAppendix: components.hasAppendix, + appendixContent: components.appendixContent, + }), }); } diff --git a/src/core/piece/phase-runner.ts b/src/core/piece/phase-runner.ts index 7059d2a..6388e76 100644 --- a/src/core/piece/phase-runner.ts +++ b/src/core/piece/phase-runner.ts @@ -15,7 +15,7 @@ import { hasTagBasedRules, getReportFiles } from './evaluation/rule-utils.js'; import { executeAgent } from './agent-usecases.js'; import { createLogger } from '../../shared/utils/index.js'; import { buildSessionKey } from './session-key.js'; -export { runStatusJudgmentPhase } from './status-judgment-phase.js'; +export { runStatusJudgmentPhase, type StatusJudgmentPhaseResult } from './status-judgment-phase.js'; const log = createLogger('phase-runner'); diff --git a/src/core/piece/status-judgment-phase.ts b/src/core/piece/status-judgment-phase.ts index 1a7b501..3c5d899 100644 --- a/src/core/piece/status-judgment-phase.ts +++ b/src/core/piece/status-judgment-phase.ts @@ -1,75 +1,98 @@ import { existsSync, readFileSync } from 'node:fs'; import { resolve } from 'node:path'; -import type { PieceMovement } from '../models/types.js'; +import type { PieceMovement, RuleMatchMethod } from '../models/types.js'; import { judgeStatus } from './agent-usecases.js'; -import { StatusJudgmentBuilder } from './instruction/StatusJudgmentBuilder.js'; +import { StatusJudgmentBuilder, type StatusJudgmentContext } from './instruction/StatusJudgmentBuilder.js'; import { getReportFiles } from './evaluation/rule-utils.js'; import { createLogger } from '../../shared/utils/index.js'; import type { PhaseRunnerContext } from './phase-runner.js'; const log = createLogger('phase-runner'); +/** Result of Phase 3 status judgment, including the detection method. */ +export interface StatusJudgmentPhaseResult { + tag: string; + ruleIndex: number; + method: RuleMatchMethod; +} + /** - * Phase 3: Status judgment. - * Uses the 'conductor' agent in a new session to output a status tag. - * Implements multi-stage fallback logic to ensure judgment succeeds. - * Returns the Phase 3 response content (containing the status tag). + * Build the base context (shared by structured output and tag instructions). */ -export async function runStatusJudgmentPhase( +function buildBaseContext( step: PieceMovement, ctx: PhaseRunnerContext, -): Promise { - log.debug('Running status judgment phase', { movement: step.name }); - if (!step.rules || step.rules.length === 0) { - throw new Error(`Status judgment requires rules for movement "${step.name}"`); - } - +): Omit | undefined { const reportFiles = getReportFiles(step.outputContracts); - let instruction: string | undefined; if (reportFiles.length > 0) { const reports: string[] = []; for (const fileName of reportFiles) { const filePath = resolve(ctx.reportDir, fileName); - if (!existsSync(filePath)) { - continue; - } + if (!existsSync(filePath)) continue; const content = readFileSync(filePath, 'utf-8'); reports.push(`# ${fileName}\n\n${content}`); } if (reports.length > 0) { - instruction = new StatusJudgmentBuilder(step, { + return { language: ctx.language, reportContent: reports.join('\n\n---\n\n'), inputSource: 'report', - useStructuredOutput: true, - }).build(); + }; } } - if (instruction == null) { - if (!ctx.lastResponse) { - throw new Error(`Status judgment requires report or lastResponse for movement "${step.name}"`); - } + if (!ctx.lastResponse) return undefined; - instruction = new StatusJudgmentBuilder(step, { - language: ctx.language, - lastResponse: ctx.lastResponse, - inputSource: 'response', - useStructuredOutput: true, - }).build(); + return { + language: ctx.language, + lastResponse: ctx.lastResponse, + inputSource: 'response', + }; +} + +/** + * Phase 3: Status judgment. + * + * Builds two instructions from the same context: + * - Structured output instruction (JSON schema) + * - Tag instruction (free-form tag detection) + * + * `judgeStatus()` tries them in order: structured → tag → ai_judge. + */ +export async function runStatusJudgmentPhase( + step: PieceMovement, + ctx: PhaseRunnerContext, +): Promise { + log.debug('Running status judgment phase', { movement: step.name }); + if (!step.rules || step.rules.length === 0) { + throw new Error(`Status judgment requires rules for movement "${step.name}"`); } - ctx.onPhaseStart?.(step, 3, 'judge', instruction); + const baseContext = buildBaseContext(step, ctx); + if (!baseContext) { + throw new Error(`Status judgment requires report or lastResponse for movement "${step.name}"`); + } + + const structuredInstruction = new StatusJudgmentBuilder(step, { + ...baseContext, + structuredOutput: true, + }).build(); + + const tagInstruction = new StatusJudgmentBuilder(step, { + ...baseContext, + }).build(); + + ctx.onPhaseStart?.(step, 3, 'judge', structuredInstruction); try { - const result = await judgeStatus(instruction, step.rules, { + const result = await judgeStatus(structuredInstruction, tagInstruction, step.rules, { cwd: ctx.cwd, movementName: step.name, language: ctx.language, }); const tag = `[${step.name.toUpperCase()}:${result.ruleIndex + 1}]`; ctx.onPhaseComplete?.(step, 3, 'judge', tag, 'done'); - return tag; + return { tag, ruleIndex: result.ruleIndex, method: result.method }; } catch (error) { const errorMsg = error instanceof Error ? error.message : String(error); ctx.onPhaseComplete?.(step, 3, 'judge', '', 'error', errorMsg); diff --git a/src/infra/codex/client.ts b/src/infra/codex/client.ts index 0859842..0a0e045 100644 --- a/src/infra/codex/client.ts +++ b/src/infra/codex/client.ts @@ -4,9 +4,9 @@ * Uses @openai/codex-sdk for native TypeScript integration. */ -import { Codex } from '@openai/codex-sdk'; +import { Codex, type TurnOptions } from '@openai/codex-sdk'; import type { AgentResponse } from '../../core/models/index.js'; -import { createLogger, getErrorMessage, createStreamDiagnostics, type StreamDiagnostics } from '../../shared/utils/index.js'; +import { createLogger, getErrorMessage, createStreamDiagnostics, parseStructuredOutput, type StreamDiagnostics } from '../../shared/utils/index.js'; import { mapToCodexSandboxMode, type CodexCallOptions } from './types.js'; import { type CodexEvent, @@ -150,20 +150,15 @@ export class CodexClient { const diag = createStreamDiagnostics('codex-sdk', { agentType, model: options.model, attempt }); diagRef = diag; - const runOptions: Record = { + const turnOptions: TurnOptions = { signal: streamAbortController.signal, + ...(options.outputSchema ? { outputSchema: options.outputSchema } : {}), }; - if (options.outputSchema) { - runOptions.outputSchema = options.outputSchema; - } - // Codex SDK types do not yet expose outputSchema even though runtime accepts it. - const runStreamedOptions = runOptions as unknown as Parameters[1]; - const { events } = await thread.runStreamed(fullPrompt, runStreamedOptions); + const { events } = await thread.runStreamed(fullPrompt, turnOptions); resetIdleTimeout(); diag.onConnected(); let content = ''; - let structuredOutput: Record | undefined; const contentOffsets = new Map(); let success = true; let failureMessage = ''; @@ -196,20 +191,6 @@ export class CodexClient { break; } - if (event.type === 'turn.completed') { - const rawFinalResponse = (event as unknown as { - turn?: { finalResponse?: unknown }; - }).turn?.finalResponse; - if ( - rawFinalResponse - && typeof rawFinalResponse === 'object' - && !Array.isArray(rawFinalResponse) - ) { - structuredOutput = rawFinalResponse as Record; - } - continue; - } - if (event.type === 'item.started') { const item = event.item as CodexItem | undefined; if (item) { @@ -291,6 +272,7 @@ export class CodexClient { } const trimmed = content.trim(); + const structuredOutput = parseStructuredOutput(trimmed, !!options.outputSchema); emitResult(options.onStream, true, trimmed, currentThreadId); return { diff --git a/src/infra/opencode/client.ts b/src/infra/opencode/client.ts index 17133b7..0b04999 100644 --- a/src/infra/opencode/client.ts +++ b/src/infra/opencode/client.ts @@ -8,7 +8,7 @@ import { createOpencode } from '@opencode-ai/sdk/v2'; import { createServer } from 'node:net'; import type { AgentResponse } from '../../core/models/index.js'; -import { createLogger, getErrorMessage, createStreamDiagnostics, type StreamDiagnostics } from '../../shared/utils/index.js'; +import { createLogger, getErrorMessage, createStreamDiagnostics, parseStructuredOutput, type StreamDiagnostics } from '../../shared/utils/index.js'; import { parseProviderModel } from '../../shared/utils/providerModel.js'; import { buildOpenCodePermissionConfig, @@ -236,16 +236,34 @@ export class OpenCodeClient { }); } + /** Build a prompt suffix that instructs the agent to return JSON matching the schema */ + private buildStructuredOutputSuffix(schema: Record): string { + return [ + '', + '---', + 'IMPORTANT: You MUST respond with ONLY a valid JSON object matching this schema. No other text, no markdown code blocks, no explanation.', + '```', + JSON.stringify(schema, null, 2), + '```', + ].join('\n'); + } + /** Call OpenCode with an agent prompt */ async call( agentType: string, prompt: string, options: OpenCodeCallOptions, ): Promise { - const fullPrompt = options.systemPrompt + const basePrompt = options.systemPrompt ? `${options.systemPrompt}\n\n${prompt}` : prompt; + // OpenCode SDK does not natively support structured output via outputFormat. + // Inject JSON output instructions into the prompt to make the agent return JSON. + const fullPrompt = options.outputSchema + ? `${basePrompt}${this.buildStructuredOutputSuffix(options.outputSchema)}` + : basePrompt; + for (let attempt = 1; attempt <= OPENCODE_RETRY_MAX_ATTEMPTS; attempt++) { let idleTimeoutId: ReturnType | undefined; const streamAbortController = new AbortController(); @@ -580,17 +598,7 @@ export class OpenCodeClient { } const trimmed = content.trim(); - let structuredOutput: Record | undefined; - if (options.outputSchema && trimmed.startsWith('{')) { - try { - const parsed = JSON.parse(trimmed) as unknown; - if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) { - structuredOutput = parsed as Record; - } - } catch { - // Non-JSON response falls back to text path. - } - } + const structuredOutput = parseStructuredOutput(trimmed, !!options.outputSchema); emitResult(options.onStream, true, trimmed, sessionId); return { diff --git a/src/shared/prompts/en/perform_phase3_message.md b/src/shared/prompts/en/perform_phase3_message.md index a3aa41b..80e5839 100644 --- a/src/shared/prompts/en/perform_phase3_message.md +++ b/src/shared/prompts/en/perform_phase3_message.md @@ -1,10 +1,14 @@ +{{#if structuredOutput}} +**Review is already complete. Evaluate the report below and determine which numbered rule (1-based) best matches the result.** +{{else}} **Review is already complete. Output exactly one tag corresponding to the judgment result shown in the report below.** +{{/if}} {{reportContent}} @@ -12,12 +16,21 @@ {{criteriaTable}} +{{#if structuredOutput}} + +## Task + +Evaluate the report against the criteria above. Return the matched rule number (1-based integer) and a brief reason for your decision. +{{else}} + ## Output Format **Output the tag corresponding to the judgment shown in the report in one line:** {{outputList}} +{{/if}} {{#if hasAppendix}} ### Appendix Template -{{appendixContent}}{{/if}} +{{appendixContent}} +{{/if}} diff --git a/src/shared/prompts/ja/perform_phase3_message.md b/src/shared/prompts/ja/perform_phase3_message.md index becfa29..89299ef 100644 --- a/src/shared/prompts/ja/perform_phase3_message.md +++ b/src/shared/prompts/ja/perform_phase3_message.md @@ -1,10 +1,14 @@ +{{#if structuredOutput}} +**既にレビューは完了しています。以下のレポートを評価し、どの番号のルール(1始まり)が結果に最も合致するか判定してください。** +{{else}} **既にレビューは完了しています。以下のレポートで示された判定結果に対応するタグを1つだけ出力してください。** +{{/if}} {{reportContent}} @@ -12,12 +16,21 @@ {{criteriaTable}} +{{#if structuredOutput}} + +## タスク + +上記の判定基準に照らしてレポートを評価してください。合致するルール番号(1始まりの整数)と簡潔な理由を返してください。 +{{else}} + ## 出力フォーマット **レポートで示した判定に対応するタグを1行で出力してください:** {{outputList}} +{{/if}} {{#if hasAppendix}} ### 追加出力テンプレート -{{appendixContent}}{{/if}} +{{appendixContent}} +{{/if}} diff --git a/src/shared/utils/index.ts b/src/shared/utils/index.ts index 69050cb..ffa23c0 100644 --- a/src/shared/utils/index.ts +++ b/src/shared/utils/index.ts @@ -11,6 +11,7 @@ export * from './slackWebhook.js'; export * from './sleep.js'; export * from './slug.js'; export * from './streamDiagnostics.js'; +export * from './structuredOutput.js'; export * from './taskPaths.js'; export * from './text.js'; export * from './types.js'; diff --git a/src/shared/utils/structuredOutput.ts b/src/shared/utils/structuredOutput.ts new file mode 100644 index 0000000..e1b8838 --- /dev/null +++ b/src/shared/utils/structuredOutput.ts @@ -0,0 +1,56 @@ +/** + * Parse structured output from provider text response. + * + * Codex and OpenCode return structured output as JSON text in agent messages. + * This function extracts a JSON object from the text when outputSchema was requested. + * + * Extraction strategies (in order): + * 1. Direct JSON parse — text is pure JSON starting with `{` + * 2. Code block extraction — JSON inside ```json ... ``` or ``` ... ``` + * 3. Brace extraction — find outermost `{` ... `}` in the text + */ + +function tryParseJsonObject(text: string): Record | undefined { + try { + const parsed = JSON.parse(text) as unknown; + if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) { + return parsed as Record; + } + } catch { + // Not valid JSON + } + return undefined; +} + +export function parseStructuredOutput( + text: string, + hasOutputSchema: boolean, +): Record | undefined { + if (!hasOutputSchema || !text) return undefined; + + const trimmed = text.trim(); + + // Strategy 1: Direct JSON parse (text is pure JSON) + if (trimmed.startsWith('{')) { + const result = tryParseJsonObject(trimmed); + if (result) return result; + } + + // Strategy 2: Extract from markdown code block (```json\n{...}\n```) + const codeBlockMatch = trimmed.match(/```(?:json)?\s*\n(\{[\s\S]*?\})\s*\n```/); + if (codeBlockMatch?.[1]) { + const result = tryParseJsonObject(codeBlockMatch[1].trim()); + if (result) return result; + } + + // Strategy 3: Find first `{` and last `}` (handles preamble/postamble text) + const firstBrace = trimmed.indexOf('{'); + const lastBrace = trimmed.lastIndexOf('}'); + if (firstBrace >= 0 && lastBrace > firstBrace) { + const candidate = trimmed.slice(firstBrace, lastBrace + 1); + const result = tryParseJsonObject(candidate); + if (result) return result; + } + + return undefined; +} diff --git a/vitest.config.e2e.provider.ts b/vitest.config.e2e.provider.ts index 84c2932..cd00e37 100644 --- a/vitest.config.e2e.provider.ts +++ b/vitest.config.e2e.provider.ts @@ -7,6 +7,7 @@ export default defineConfig({ 'e2e/specs/worktree.e2e.ts', 'e2e/specs/pipeline.e2e.ts', 'e2e/specs/github-issue.e2e.ts', + 'e2e/specs/structured-output.e2e.ts', ], environment: 'node', globals: false, diff --git a/vitest.config.e2e.structured-output.ts b/vitest.config.e2e.structured-output.ts new file mode 100644 index 0000000..9926aa5 --- /dev/null +++ b/vitest.config.e2e.structured-output.ts @@ -0,0 +1,20 @@ +import { defineConfig } from 'vitest/config'; + +export default defineConfig({ + test: { + include: [ + 'e2e/specs/structured-output.e2e.ts', + ], + environment: 'node', + globals: false, + testTimeout: 240000, + hookTimeout: 60000, + teardownTimeout: 30000, + pool: 'threads', + poolOptions: { + threads: { + singleThread: true, + }, + }, + }, +});