fix(executor): surface the CLI stream error instead of the swallowed-stderr placeholder

When the `claude` CLI errors mid-stream, claude-agent-sdk throws a bare `Exception("Command failed with exit code 1 …")` whose only text is the useless `Check stderr output for details` placeholder — but the *actual* failure reason (model 404, rate limit, auth) arrived a moment earlier as a stream-json `ResultMessage(is_error=True)` carrying `result` text and `api_error_status`. That was thrown away. `_run_query` now captures `ResultMessage(is_error=True)` detail (and, as a fallback, the trailing AssistantMessage text) and re-attaches it to the raised exception as `_molecule_stream_detail`. `_format_process_error` surfaces it as `cli_stream_error=…` and, when present, skips the `_probe_claude_cli_error` re-probe (#160) — the probe can't replay the failing `--model`/`--system-prompt` argv, so it may even succeed and mislead. The probe stays as the last resort when there's nothing to salvage. Regression context: the 2026-05-10 dev-team incident — six lead workspaces 404ing on every turn (`--model claude-code` → `api_error_status=404`, "There's an issue with the selected model (claude-code)"), invisible for an hour because the CLI wrote nothing to stderr and this text was discarded. See internal#226 follow-up #5. Tests: tests/test_executor_error_detail.py — 6 cases (format surfaces the salvaged detail; format still probes when there's nothing salvaged; salvaged detail takes precedence over the probe; _run_query annotates from ResultMessage(is_error); _run_query falls back to assistant text; clean success path unaffected). `pytest tests/` → 87 passed. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-10 03:10:36 -07:00
2 changed files with 275 additions and 4 deletions
@@ -377,21 +377,37 @@ def _format_process_error(exc: BaseException) -> str:
    ``_probe_claude_cli_error`` so the operator sees the real failure
    reason (e.g. ``You've hit your limit · resets Apr 17``) instead of
    chasing ghosts in the workspace logs.
+
+    internal#226: prefer ``exc._molecule_stream_detail`` — the failure
+    reason ``_run_query`` salvaged from the CLI's stream-json
+    ``ResultMessage(is_error=True)`` (model 404, api_error_status, etc.)
+    before the SDK threw it away. That's the *exact* error for *this*
+    invocation; the ``_probe_claude_cli_error`` re-probe is a last resort
+    (it can't replay the failing ``--model``/``--system-prompt`` argv, so
+    it may even succeed and mislead — which is exactly what happened with
+    ``--model claude-code`` on 2026-05-10).
    """
    parts = [f"{type(exc).__name__}: {exc}"]
    exit_code = getattr(exc, "exit_code", None)
    if exit_code is not None:
        parts.append(f"exit_code={exit_code}")
+    stream_detail = getattr(exc, "_molecule_stream_detail", None)
+    if stream_detail:
+        trimmed = stream_detail[:_PROCESS_ERROR_STDERR_MAX_CHARS]
+        if len(stream_detail) > _PROCESS_ERROR_STDERR_MAX_CHARS:
+            trimmed += f"... [{len(stream_detail) - _PROCESS_ERROR_STDERR_MAX_CHARS} more chars truncated]"
+        parts.append(f"cli_stream_error={trimmed!r}")
    stderr = getattr(exc, "stderr", None)
    if stderr:
        trimmed = stderr[:_PROCESS_ERROR_STDERR_MAX_CHARS]
        if len(stderr) > _PROCESS_ERROR_STDERR_MAX_CHARS:
            trimmed += f"... [{len(stderr) - _PROCESS_ERROR_STDERR_MAX_CHARS} more chars truncated]"
        parts.append(f"stderr={trimmed!r}")
-    elif exit_code is None and _SWALLOWED_STDERR_MARKER in str(exc):
-        # #160: generic exception with the swallowed-stderr placeholder.
-        # Probe the CLI directly — this is the only way to surface the real
-        # error when the SDK lost it in translation.
+    elif exit_code is None and not stream_detail and _SWALLOWED_STDERR_MARKER in str(exc):
+        # #160: generic exception with the swallowed-stderr placeholder AND no
+        # stream detail to fall back on — probe the CLI directly as a last
+        # resort. (If _run_query salvaged a stream detail we already have the
+        # real error; the probe is unreliable since it can't replay the argv.)
        probed = _probe_claude_cli_error()
        if probed:
            parts.append(f"probed_cli_error={probed!r}")
@@ -586,6 +602,12 @@ class ClaudeSDKExecutor(AgentExecutor):
        assistant_chunks: list[str] = []
        result_text: str | None = None
        session_id: str | None = None
+        # Captured from a ResultMessage(is_error=True) — the CLI's stream-json
+        # carries the *actual* failure reason (model 404, rate limit, auth) in
+        # the result text + api_error_status BEFORE the SDK throws a bare
+        # "Command failed with exit code 1" that loses it. Stashed so the
+        # except arm below can re-attach it (see _format_process_error).
+        stream_error_detail: str | None = None
        self._active_stream = sdk.query(prompt=prompt, options=options)
        try:
            async for message in self._active_stream:
@@ -606,6 +628,34 @@ class ClaudeSDKExecutor(AgentExecutor):
                    if sid:
                        session_id = sid
                    result_text = getattr(message, "result", None)
+                    if getattr(message, "is_error", False):
+                        api_status = getattr(message, "api_error_status", None)
+                        stream_error_detail = (
+                            (f"api_error_status={api_status} " if api_status else "")
+                            + f"result={result_text!r}"
+                        )
+        except BaseException as exc:  # noqa: BLE001 — re-raised; we only annotate
+            # The claude-agent-sdk raises a bare Exception / ProcessError when
+            # the CLI subprocess errors mid-stream — but the actionable detail
+            # (model not found, rate limit, auth) arrived earlier as a
+            # ResultMessage(is_error) / synthetic AssistantMessage and is about
+            # to be discarded. Re-attach it so _format_process_error surfaces
+            # it instead of the useless "Check stderr output for details"
+            # placeholder. (The 2026-05-10 dev-team incident: `--model
+            # claude-code` → api_error_status=404, "There's an issue with the
+            # selected model (claude-code)" — invisible for an hour because
+            # the CLI wrote nothing to stderr and this text was thrown away.)
+            detail = stream_error_detail
+            if not detail:
+                last_assistant = "".join(assistant_chunks).strip()
+                if last_assistant:
+                    detail = last_assistant[:_PROCESS_ERROR_STDERR_MAX_CHARS]
+            if detail and getattr(exc, "_molecule_stream_detail", None) is None:
+                try:
+                    exc._molecule_stream_detail = detail  # type: ignore[attr-defined]
+                except Exception:  # pragma: no cover — exotic frozen exception
+                    pass
+            raise
        finally:
            self._active_stream = None
        text = result_text if result_text is not None else "".join(assistant_chunks)
@@ -0,0 +1,221 @@
+"""Pin the CLI-stream-error surfacing in _run_query + _format_process_error.
+
+When the `claude` CLI errors mid-stream, the claude-agent-sdk throws a bare
+``Exception("Command failed with exit code 1 …")`` whose only text is the
+useless ``Check stderr output for details`` placeholder — but the *actual*
+failure reason (model 404, rate limit, auth) arrived a moment earlier as a
+stream-json ``ResultMessage(is_error=True)`` carrying ``result`` text and
+``api_error_status``. ``_run_query`` salvages that onto the exception
+(``_molecule_stream_detail``); ``_format_process_error`` surfaces it.
+
+Regression context: the 2026-05-10 dev-team incident — six lead workspaces
+404ing on every turn (``--model claude-code`` → ``api_error_status=404``,
+"There's an issue with the selected model (claude-code)"), invisible for an
+hour because the CLI wrote nothing to stderr and that text was thrown away.
+See internal#226.
+
+Stub pattern mirrors test_runtime_wedge_mirror.py — same _ensure_module /
+_ensure_attr / _load_executor helpers so a real-package install on a
+workstation still wins over the stubs.
+"""
+
+import os
+import sys
+import types
+from unittest.mock import MagicMock
+
+import pytest
+
+
+# ---- Stubs (mirror test_runtime_wedge_mirror.py) ----
+
+
+def _ensure_module(dotted: str) -> types.ModuleType:
+    if dotted not in sys.modules:
+        sys.modules[dotted] = types.ModuleType(dotted)
+    return sys.modules[dotted]
+
+
+def _ensure_attr(mod: types.ModuleType, name: str, value: object) -> None:
+    if not hasattr(mod, name):
+        setattr(mod, name, value)
+
+
+def _install_executor_stubs():
+    sdk = _ensure_module("claude_agent_sdk")
+    _ensure_attr(sdk, "ClaudeAgentOptions", MagicMock(name="ClaudeAgentOptions"))
+    _ensure_attr(sdk, "AssistantMessage", type("AssistantMessage", (), {}))
+    _ensure_attr(sdk, "TextBlock", type("TextBlock", (), {}))
+    _ensure_attr(sdk, "ResultMessage", type("ResultMessage", (), {}))
+    _ensure_attr(sdk, "query", MagicMock(name="query"))
+
+    _ensure_module("a2a")
+    _ensure_module("a2a.server")
+    a2a_exec = _ensure_module("a2a.server.agent_execution")
+    _ensure_attr(a2a_exec, "AgentExecutor", type("AgentExecutor", (), {}))
+    _ensure_attr(a2a_exec, "RequestContext", type("RequestContext", (), {}))
+    a2a_events = _ensure_module("a2a.server.events")
+    _ensure_attr(a2a_events, "EventQueue", type("EventQueue", (), {}))
+    a2a_helpers = _ensure_module("a2a.helpers")
+    _ensure_attr(a2a_helpers, "new_text_message", lambda *_a, **_kw: None)
+
+    _ensure_module("molecule_runtime")
+    rw = _ensure_module("molecule_runtime.runtime_wedge")
+    _ensure_attr(rw, "mark_wedged", lambda *_a, **_kw: None)
+    _ensure_attr(rw, "clear_wedge", lambda *_a, **_kw: None)
+    helpers = _ensure_module("molecule_runtime.executor_helpers")
+    for name in (
+        "auto_push_hook", "brief_summary", "collect_outbound_files", "commit_memory",
+        "extract_attached_files", "extract_message_text", "get_a2a_instructions",
+        "get_hma_instructions", "read_delegation_results", "recall_memories",
+        "set_current_task",
+    ):
+        _ensure_attr(helpers, name, lambda *_a, **_kw: ("" if "instr" in name or "summary" in name else None))
+    _ensure_attr(helpers, "collect_outbound_files", lambda *_a, **_kw: [])
+    _ensure_attr(helpers, "extract_attached_files", lambda *_a, **_kw: [])
+    _ensure_attr(helpers, "get_mcp_server_path", lambda *_a, **_kw: "/dev/null")
+    _ensure_attr(helpers, "get_system_prompt", lambda *_a, **_kw: "")
+    _ensure_attr(helpers, "sanitize_agent_error", lambda e: str(e))
+    _ensure_attr(helpers, "CONFIG_MOUNT", "/configs")
+    _ensure_attr(helpers, "WORKSPACE_MOUNT", "/workspace")
+    _ensure_attr(helpers, "MEMORY_CONTENT_MAX_CHARS", 10000)
+
+
+def _load_executor():
+    _install_executor_stubs()
+    parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    if parent_dir not in sys.path:
+        sys.path.insert(0, parent_dir)
+    sys.modules.pop("claude_sdk_executor", None)
+    import claude_sdk_executor  # noqa: WPS433
+    return claude_sdk_executor
+
+
+def _async_stream(messages, raise_at_end=None):
+    """Build a fake `sdk.query(...)` return value: an async iterator that
+    yields ``messages`` then (optionally) raises — exactly the shape the
+    claude-agent-sdk produces when the CLI errors after emitting a
+    ResultMessage(is_error)."""
+    class _Stream:
+        def __aiter__(self):
+            return self
+
+        def __init__(self):
+            self._it = iter(messages)
+
+        async def __anext__(self):
+            try:
+                return next(self._it)
+            except StopIteration:
+                if raise_at_end is not None:
+                    raise raise_at_end
+                raise StopAsyncIteration
+
+    return _Stream()
+
+
+# ─── _format_process_error: surface the salvaged stream detail ───────────
+
+
+def test_format_process_error_surfaces_molecule_stream_detail():
+    mod = _load_executor()
+    exc = Exception("Command failed with exit code 1 — Check stderr output for details")
+    exc._molecule_stream_detail = (
+        'api_error_status=404 result="There\'s an issue with the selected '
+        'model (claude-code). It may not exist or you may not have access."'
+    )
+    out = mod._format_process_error(exc)
+    assert "cli_stream_error=" in out
+    assert "api_error_status=404" in out
+    assert "claude-code" in out
+    # When we already salvaged the real error, don't ALSO re-probe the CLI
+    # (the probe can't replay the failing --model argv and may mislead).
+    assert "probed_cli_error" not in out
+
+
+def test_format_process_error_still_probes_when_no_stream_detail(monkeypatch):
+    """The #160 fallback (probe the CLI when only the swallowed-stderr
+    placeholder is present) still fires when _run_query had nothing to
+    salvage."""
+    mod = _load_executor()
+    monkeypatch.setattr(mod, "_probe_claude_cli_error", lambda: "You've hit your limit · resets Apr 17")
+    exc = Exception("Command failed with exit code 1 — Check stderr output for details")
+    out = mod._format_process_error(exc)
+    assert "probed_cli_error=" in out
+    assert "hit your limit" in out
+
+
+def test_format_process_error_stream_detail_takes_precedence_over_probe(monkeypatch):
+    mod = _load_executor()
+    probe = MagicMock(name="_probe_claude_cli_error", return_value="<should not be called>")
+    monkeypatch.setattr(mod, "_probe_claude_cli_error", probe)
+    exc = Exception("Command failed with exit code 1 — Check stderr output for details")
+    exc._molecule_stream_detail = "api_error_status=429 result='rate limited'"
+    out = mod._format_process_error(exc)
+    assert "cli_stream_error=" in out
+    probe.assert_not_called()
+
+
+# ─── _run_query: salvage the detail onto the raised exception ────────────
+
+
+@pytest.mark.asyncio
+async def test_run_query_annotates_exception_from_is_error_result_message():
+    mod = _load_executor()
+    sdk = sys.modules["claude_agent_sdk"]
+
+    rm = sdk.ResultMessage()
+    rm.session_id = "sess-1"
+    rm.result = "There's an issue with the selected model (claude-code)."
+    rm.is_error = True
+    rm.api_error_status = 404
+
+    boom = Exception("Command failed with exit code 1 (exit code: 1)\nCheck stderr output for details")
+    sdk.query = MagicMock(return_value=_async_stream([rm], raise_at_end=boom))
+
+    ex = mod.ClaudeSDKExecutor(system_prompt="", config_path="/tmp", heartbeat=None, model="opus")
+    with pytest.raises(Exception) as ei:
+        await ex._run_query("hi", options=MagicMock())
+    detail = getattr(ei.value, "_molecule_stream_detail", None)
+    assert detail is not None
+    assert "api_error_status=404" in detail
+    assert "claude-code" in detail
+    # And it threads through the formatter the executor's error path uses.
+    assert "cli_stream_error=" in mod._format_process_error(ei.value)
+
+
+@pytest.mark.asyncio
+async def test_run_query_falls_back_to_assistant_text_when_no_error_result():
+    mod = _load_executor()
+    sdk = sys.modules["claude_agent_sdk"]
+
+    tb = sdk.TextBlock()
+    tb.text = "helpful pre-crash context from the model"
+    am = sdk.AssistantMessage()
+    am.content = [tb]
+
+    boom = Exception("Command failed with exit code 1 — Check stderr output for details")
+    sdk.query = MagicMock(return_value=_async_stream([am], raise_at_end=boom))
+
+    ex = mod.ClaudeSDKExecutor(system_prompt="", config_path="/tmp", heartbeat=None, model="opus")
+    with pytest.raises(Exception) as ei:
+        await ex._run_query("hi", options=MagicMock())
+    assert getattr(ei.value, "_molecule_stream_detail", None) == "helpful pre-crash context from the model"
+
+
+@pytest.mark.asyncio
+async def test_run_query_clean_success_unaffected():
+    """No exception → no annotation, normal QueryResult."""
+    mod = _load_executor()
+    sdk = sys.modules["claude_agent_sdk"]
+
+    rm = sdk.ResultMessage()
+    rm.session_id = "sess-ok"
+    rm.result = "done"
+    rm.is_error = False
+    sdk.query = MagicMock(return_value=_async_stream([rm]))
+
+    ex = mod.ClaudeSDKExecutor(system_prompt="", config_path="/tmp", heartbeat=None, model="opus")
+    res = await ex._run_query("hi", options=MagicMock())
+    assert res.text == "done"
+    assert res.session_id == "sess-ok"