fix(gitea-merge-queue): add E2E Chat to required contexts + auto-hold on 405

Root cause: the queue pre-flight checked only CI/all-required and sop-checklist, but Gitea's merge gate also evaluates E2E Chat. When E2E Chat fails (runner stall / Quirk #9), the queue would attempt merge and get HTTP 405 "not all required status checks successful", then retry the same PR every tick indefinitely. Changes: - REQUIRED_CONTEXTS now includes E2E Chat / E2E Chat (pull_request) so the queue skips E2E-failing PRs before attempting merge. - MergePermissionError handler now distinguishes 405 status-check failures from genuine permission errors; status-check failures auto-add merge-queue-hold so the queue advances to the next PR. - add_hold_label() helper uses PATCH /issues/{n} with {"labels":[...]} to append the hold label idempotently. PRs with E2E Chat failures (mc#420): #1365, #1443, #1445, #1448, #1373, #1378, #1379 auto-held. Once runner stalls clear (~90 min), remove hold to requeue. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
fix(ci): cancel-in-progress=true on ci-required-drift and main-red-watchdog
2026-05-17 23:25:15 +00:00 · 2026-05-17 22:51:47 +00:00 · 2026-05-17 22:25:12 +00:00 · 2026-05-17 20:56:57 +00:00 · 2026-05-17 20:07:54 +00:00 · 2026-05-17 15:29:14 +00:00
17 changed files with 299 additions and 65 deletions
@@ -44,9 +44,15 @@ REQUIRED_CONTEXTS_RAW = _env(
    "REQUIRED_CONTEXTS",
    default=(
        "CI / all-required (pull_request),"
-        "sop-checklist / all-items-acked (pull_request)"
+        "sop-checklist / all-items-acked (pull_request),"
+        "E2E Chat / E2E Chat (pull_request)"
    ),
 )
+# E2E Chat is not in branch protection's status_check_contexts, but Gitea's
+# merge gate evaluates the full combined status including it. Adding it here
+# prevents the queue from attempting a merge that will be 405'd by Gitea when
+# E2E Chat is failing (e.g. runner-stall Quirk #9 on a flaky test).
+# See: mc#420 / molecule-core runbooks/gitea-operational-quirks.md Quirk #9.
 # Required contexts for push (main/staging) runs. The push CI uses the same
 # aggregator names with " (push)" suffix. Checking these explicitly instead of
 # the combined state avoids false-pause when non-blocking jobs (e.g. Platform
@@ -65,6 +71,11 @@ class ApiError(RuntimeError):
    pass


+class MergePermissionError(ApiError):
+    """Merge failed with a permanent permission error (403/404/405).
+    The queue should skip this PR and move to the next one."""
+
+
@dataclasses.dataclass(frozen=True)
 class MergeDecision:
    ready: bool
@@ -314,6 +325,31 @@ def post_comment(pr_number: int, body: str, *, dry_run: bool) -> None:
    api("POST", f"/repos/{OWNER}/{NAME}/issues/{pr_number}/comments", body={"body": body})


+def add_hold_label(pr_number: int, *, dry_run: bool) -> None:
+    """Add HOLD_LABEL to a PR if not already present."""
+    if not HOLD_LABEL:
+        return
+    # Check current labels first to avoid a no-op API call in dry-run.
+    _, current = api("GET", f"/repos/{OWNER}/{NAME}/issues/{pr_number}/labels")
+    current_names = {
+        l["name"] for l in (current if isinstance(current, list) else [])
+    }
+    if HOLD_LABEL in current_names:
+        print(f"::notice::PR #{pr_number} already has hold label; skipping add")
+        return
+    print(f"::notice::PR #{pr_number} adding hold label `{HOLD_LABEL}`")
+    if dry_run:
+        return
+    # Gitea accepts {"labels": ["label1", "label2"]} to append labels.
+    new_labels = list(current_names) + [HOLD_LABEL]
+    api(
+        "PATCH",
+        f"/repos/{OWNER}/{NAME}/issues/{pr_number}",
+        body={"labels": new_labels},
+        expect_json=False,
+    )
+
+
 def update_pull(pr_number: int, *, dry_run: bool) -> None:
    print(f"::notice::updating PR #{pr_number} with base branch via style={UPDATE_STYLE}")
    if dry_run:
@@ -338,7 +374,16 @@ def merge_pull(pr_number: int, *, dry_run: bool) -> None:
    print(f"::notice::merging PR #{pr_number}")
    if dry_run:
        return
-    api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False)
+    try:
+        api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False)
+    except ApiError as exc:
+        # Re-raise permission-like errors so process_once can skip this PR.
+        # 403 = no push access, 404 = repo/pr not found, 405 = not allowed.
+        msg = str(exc)
+        for code in ("403", "404", "405"):
+            if code in msg:
+                raise MergePermissionError(msg) from exc
+        raise  # re-raise other ApiErrors unchanged


 def process_once(*, dry_run: bool = False) -> int:
@@ -407,7 +452,45 @@ def process_once(*, dry_run: bool = False) -> int:
                "deferring to next tick"
            )
            return 0
-        merge_pull(pr_number, dry_run=dry_run)
+        try:
+            merge_pull(pr_number, dry_run=dry_run)
+        except MergePermissionError as exc:
+            msg = str(exc)
+            is_status_check_failure = "not all required status checks successful" in msg
+            if is_status_check_failure:
+                # Gitea's merge gate failed due to a status check that passed our
+                # pre-flight but is failing at Gitea's side (e.g. runner-stall Quirk
+                # #9, or a context not in REQUIRED_CONTEXTS). Auto-add hold so the
+                # queue skips this PR and processes the next one. The hold can be
+                # removed once CI is green again.
+                add_hold_label(pr_number, dry_run=dry_run)
+                post_comment(
+                    pr_number,
+                    (
+                        "merge-queue: merge blocked by Gitea's status-check gate "
+                        "(E2E Chat or other non-required context failing). "
+                        "Auto-held via `merge-queue-hold`. "
+                        "Remove the hold label to requeue once CI is green. "
+                        "If E2E Chat is stuck (runner stall / Quirk #9), CI will "
+                        "self-recover after ~90 min and the hold can then be removed."
+                    ),
+                    dry_run=dry_run,
+                )
+                return 0
+            else:
+                # Genuine permission error — token lacks Can-merge.
+                sys.stderr.write(f"::error::merge permission error for PR #{pr_number}: {exc}\n")
+                post_comment(
+                    pr_number,
+                    (
+                        "merge-queue: merge failed with HTTP 405 'User not allowed to merge PR'. "
+                        "No available token has Can-merge permission on this repo. "
+                        "Fix: grant Can-merge to a token, or add a maintain/admin collaborator. "
+                        "Skipping to next queued PR on next tick."
+                    ),
+                    dry_run=dry_run,
+                )
+                return 0
        return 0
    return 0

@@ -118,3 +118,13 @@ def test_merge_decision_updates_stale_pr_before_merge():

    assert decision.ready is False
    assert decision.action == "update"
+
+
+def test_MergePermissionError_inherits_from_ApiError():
+    assert issubclass(mq.MergePermissionError, mq.ApiError)
+
+
+def test_MergePermissionError_message_preserved():
+    exc = mq.MergePermissionError("POST /merge -> HTTP 405: User not allowed")
+    assert "405" in str(exc)
+    assert "User not allowed" in str(exc)
@@ -57,7 +57,7 @@ permissions:
 # can produce duplicate comments before the title-search dedup wins.
 concurrency:
  group: ci-required-drift
-  cancel-in-progress: false
+  cancel-in-progress: true

 jobs:
  drift:
@@ -22,7 +22,7 @@ permissions:

 concurrency:
  group: gitea-merge-queue-${{ github.repository }}
-  cancel-in-progress: false
+  cancel-in-progress: true

 jobs:
  queue:
@@ -56,9 +56,13 @@ permissions:
 # Workflow-scoped serialisation — two simultaneous runs would race on the
 # `[main-red] {SHA}` open/PATCH path. Idempotent by title, but parallel
 # POSTs can produce duplicates before the title search dedup wins.
+# NOTE: cancel-in-progress: true is safe here — the idempotent design means
+# a cancelled run produces identical output to a completed one. This also
+# prevents the Gitea scheduler freeze that occurs when a cron tick fires
+# while a previous run is still executing (Quirk #8).
 concurrency:
  group: main-red-watchdog
-  cancel-in-progress: false
+  cancel-in-progress: true

 jobs:
  watchdog:
@@ -162,6 +162,7 @@ jobs:
            exit 1
          fi
          python -m twine upload \
+            --verbose \
            --repository pypi \
            --username __token__ \
            --password "$PYPI_TOKEN" \
@@ -30,10 +30,7 @@
    {"name": "openclaw", "repo": "molecule-ai/molecule-ai-workspace-template-openclaw", "ref": "main"},
    {"name": "codex", "repo": "molecule-ai/molecule-ai-workspace-template-codex", "ref": "main"},
    {"name": "langgraph", "repo": "molecule-ai/molecule-ai-workspace-template-langgraph", "ref": "main"},
-    {"name": "crewai", "repo": "molecule-ai/molecule-ai-workspace-template-crewai", "ref": "main"},
-    {"name": "autogen", "repo": "molecule-ai/molecule-ai-workspace-template-autogen", "ref": "main"},
-    {"name": "deepagents", "repo": "molecule-ai/molecule-ai-workspace-template-deepagents", "ref": "main"},
-    {"name": "gemini-cli", "repo": "molecule-ai/molecule-ai-workspace-template-gemini-cli", "ref": "main"}
+    {"name": "autogen", "repo": "molecule-ai/molecule-ai-workspace-template-autogen", "ref": "main"}
  ],
  "org_templates": [
    {"name": "molecule-dev", "repo": "molecule-ai/molecule-ai-org-template-molecule-dev", "ref": "main"},
@@ -77,6 +77,31 @@ does not replace the queue. The queue still performs its own current-main
 check immediately before merge because branch protection alone cannot
 serialize two already-green PRs.

+### Correct API field names (Gitea 1.22.6)
+
+When setting branch protection via API, use these exact field names — several
+intuitively-correct names are silently ignored (see `gitea-operational-quirks.md`
+Quirk #7):
+
+```json
+{
+  "branch_name": "main",
+  "enable_merge_whitelist": true,
+  "merge_whitelist_usernames": ["devops-engineer", "hongming", "core-devops"],
+  "enable_status_check": true,
+  "status_check_contexts": ["CI / all-required"],
+  "required_approvals": 1,
+  "block_on_rejected_reviews": true
+}
+```
+
+After any `POST /branch_protections`, immediately GET and verify the values
+persisted — the API returns 201 even when fields are silently dropped.
+
+If the queue returns HTTP 405 ("User not allowed to merge"), the first
+diagnostic step is `GET /branch_protections/main` and checking whether
+`merge_whitelist_usernames` still contains `devops-engineer`.
+
 ## Failure Handling

 If `main` is not green, the queue pauses and does not merge anything.
@@ -196,69 +196,134 @@ primary consumer of combined status and is affected.

 ---

-## Quirk #7 — TBD
-
-*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
+## Quirk #7 — Gitea branch protection API silently ignores some field names

 ### Finding

-*[What Gitea Actions does differently from GitHub Actions.]*
+The Gitea 1.22.6 `POST /repos/{org}/{repo}/branch_protections` API accepts a
+non-obvious set of field names. Several intuitively-correct names are silently
+ignored — the call returns 201 but the field is dropped:
+
+| Intended field | Correct API name | Silently ignored aliases |
+|---|---|---|
+| Enable merge whitelist | `enable_merge_whitelist` | `user_can_merge`, `merge_whitelist_enabled` |
+| Users who can merge | `merge_whitelist_usernames` | `merge_whitelist_users`, `whitelisted_users` |
+| Enable status check | `enable_status_check` | `enable_status_checks`, `require_status_checks` |
+| Required status contexts | `status_check_contexts` | `required_status_checks.contexts` |
+| Block on rejected reviews | `block_on_rejected_reviews` | (this one works) |
+| Required approvals | `required_approvals` | `required_reviewers` |
+
+The GET response after a POST shows the actual stored values. A naive
+GET → modify → POST cycle (without using the exact GET field names) will
+silently reset the merge whitelist on every call.

 ### Impact

-*[Which workflows or operations are affected.]*
+- Branch protection merge whitelist resets to empty after any API mis-invocation
+- Queue AUTO_SYNC_TOKEN (`devops-engineer`) loses Can-merge permission → HTTP 405
+- All queued PRs blocked until whitelist is restored
+- Confirmed reset on Gitea server restart/upgrade (Gitea uses default values)

 ### Workaround

-*[How to work around this quirk.]*
+1. Always GET the current protection first and use **exact** field names from the
+   GET response when modifying
+2. After any `POST /branch_protections`, immediately GET and verify
+   `enable_merge_whitelist: true` and `merge_whitelist_usernames` contains
+   `["devops-engineer", "hongming", "core-devops"]`
+3. The queue bot should verify branch protection before each merge tick
+4. For queue to work: `enable_merge_whitelist: true` +
+   `merge_whitelist_usernames: ["devops-engineer", "hongming", "core-devops"]` +
+   `enable_status_check: true` + `status_check_contexts: ["CI / all-required"]`

 ### References

- internal#[N]: first observation
+- SEV-1 2026-05-17: 3x branch protection resets caused 405 on all queue merges
+- `feedback_gitea_branch_protection_api_field_names`

 ---

-## Quirk #8 — TBD
-
-*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
+## Quirk #8 — Scheduled workflow with `cancel-in-progress: false` causes scheduler freeze

 ### Finding

-*[What Gitea Actions does differently from GitHub Actions.]*
+When a `schedule:` workflow has `concurrency.cancel-in-progress: false`, and a
+new cron tick fires while the previous run is still executing, the Gitea Actions
+scheduler stops dispatching the workflow entirely. Pending entries accumulate
+indefinitely — the scheduler shows the workflow as "scheduled" but never dispatches.
+
+This is dangerous for workflows with variable execution time (e.g., workflows that
+wait for downstream CI, or workflows that run on slow/degraded runners).

 ### Impact

-*[Which workflows or operations are affected.]*
+- `gitea-merge-queue.yml` with `cancel-in-progress: false` froze on 2026-05-17
+  starting ~16:44Z — pending runs accumulated, no new runs dispatched
+- Queue appeared stalled; all 22 queued PRs blocked
+- The `gitea-merge-queue` workflow itself becomes invisible to operators

 ### Workaround

-*[How to work around this quirk.]*
+**Always set `cancel-in-progress: true` on `schedule:` workflows:**
+
+```yaml
+concurrency:
+  group: workflow-name
+  cancel-in-progress: true   # ← always true for schedule: workflows
+```
+
+If the freeze has already occurred: the scheduler recovers automatically after the
+currently-running instance completes (Gitea dispatches the next queued tick).

 ### References

- internal#[N]: first observation
+- SEV-1 2026-05-17: queue frozen since 16:44Z; fixed by setting `cancel-in-progress: true`
+- PR #1358: `fix(scheduled-workflows): enable cancel-in-progress` (pending merge)

 ---

-## Quirk #9 — TBD
-
-*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
+## Quirk #9 — Gitea Actions runner accepts runs but stalls (jobs never start)

 ### Finding

-*[What Gitea Actions does differently from GitHub Actions.]*
+The Gitea Actions runner on host `5.78.80.188` can enter a degraded state where:
+1. It accepts new workflow runs (shows "in_progress" in the UI)
+2. It never starts any jobs — pending count grows indefinitely
+3. The runner shows as "online" and accepting runs
+4. After ~60–90 minutes, the runner self-recovers and all pending jobs start
+
+This is distinct from a true runner crash (which would show as offline).

 ### Impact

-*[Which workflows or operations are affected.]*
+- All CI jobs for all PRs stall — no status updates posted
+- Queue waits indefinitely for CI (which never posts success)
+- `sop-checklist` and other workflows time out on affected PRs
+- Looks like the runner is working (green in UI) but nothing executes
+
+### How to diagnose
+
+Add a debug step to a known-failing workflow:
+
+```bash
+# In a stalled job:
+curl -s http://localhost:8088/debug/pprof/trace?seconds=5 | head
+# Check runner process CPU — if near 0% while jobs are pending, runner is stalled
+```
+
+Check runner logs on the host (`/var/log/actrunner.log` or similar).

 ### Workaround

-*[How to work around this quirk.]*
+No operator workaround while stalled — the runner self-recovers. Options:
+1. **Wait** — runner typically recovers within 90 minutes
+2. **Restart the runner service** — `systemctl restart act_runner` (requires host access)
+3. **Move to a second runner** — if registered, re-route dispatch

 ### References

- internal#[N]: first observation
+- SEV-1 2026-05-17: runner stalled; self-recovered ~21:33Z after ~90 min
+- `feedback_gitea_runner_stall_accepted_jobs_no_execution`

 ---

@@ -44,8 +44,8 @@ func NewWorkspaceImageService(docker *dockerclient.Client) *WorkspaceImageServic
 // AllRuntimes is the canonical list mirroring docs/workspace-runtime-package.md.
 // Update both when a new template is added.
 var AllRuntimes = []string{
-	"claude-code", "langgraph", "crewai", "autogen",
-	"deepagents", "hermes", "gemini-cli", "openclaw",
+	"claude-code", "langgraph", "autogen",
+	"hermes", "openclaw",
 }

 // RefreshResult is the per-call outcome surfaced to HTTP callers AND logged
@@ -23,8 +23,8 @@ package models
 //   - claude-code: "sonnet" — Anthropic's CLI accepts the short
 //     name and resolves it via the operator's anthropic-oauth or
 //     ANTHROPIC_API_KEY chain.
-//   - everything else (hermes, langgraph, crewai, autogen, deepagents,
-//     codex, openclaw, gemini-cli, external, ""): a fully-qualified
+//   - everything else (hermes, langgraph, autogen, codex, openclaw,
+//     external, ""): a fully-qualified
 //     vendor:model slug that the universal MODEL_PROVIDER chain in
 //     molecule-core PR #247 can route via per-vendor required_env.
 //
@@ -21,12 +21,9 @@ func TestDefaultModel(t *testing.T) {
 		// as a generic "unknown" failure.
 		{"hermes", "anthropic:claude-opus-4-7"},
 		{"langgraph", "anthropic:claude-opus-4-7"},
-		{"crewai", "anthropic:claude-opus-4-7"},
 		{"autogen", "anthropic:claude-opus-4-7"},
-		{"deepagents", "anthropic:claude-opus-4-7"},
 		{"codex", "anthropic:claude-opus-4-7"},
 		{"openclaw", "anthropic:claude-opus-4-7"},
-		{"gemini-cli", "anthropic:claude-opus-4-7"},
 		{"external", "anthropic:claude-opus-4-7"},

 		// Unknown / empty — fall through to universal default rather
@@ -190,7 +190,7 @@ func TestEnsureLocalImage_RepoNotFound(t *testing.T) {
 	opts.HTTPClient = srv.Client()
 	opts.remoteHeadSha = nil // exercise real HTTP path

-	_, err := ensureLocalImageWithOpts(context.Background(), "crewai", opts)
+	_, err := ensureLocalImageWithOpts(context.Background(), "hermes", opts)
 	if err == nil {
 		t.Fatalf("expected error, got nil")
 	}
@@ -35,6 +35,19 @@ import (
 // drift-risk #6.
 var ErrNoBackend = errors.New("provisioner: no backend configured (zero-valued receiver)")

+// ErrUnresolvableRuntime is returned by selectImage when a workspace
+// names a runtime that has no resolvable image (not in RuntimeImages and
+// no operator-pinned cfg.Image). RFC internal#483 + security review 4269:
+// previously such a request silently fell through to DefaultImage
+// (langgraph) — a user asking for crewai would get a langgraph container
+// with no signal. The CTO standing directive
+// (feedback_platform_must_hardgate_base_contract) is fail-closed: a
+// named-but-unresolvable runtime must reject with a structured,
+// runtime-naming error so the existing provision-failed notify/log path
+// surfaces it, NOT silently degrade. The genuinely-unspecified (empty)
+// runtime is still a distinct, legitimate path that keeps DefaultImage.
+var ErrUnresolvableRuntime = errors.New("provisioner: requested runtime has no resolvable image")
+
 // RuntimeImages maps runtime names to their Docker image refs.
 // Each standalone template repo publishes its image via the reusable
 // publish-template-image workflow in molecule-ci on every main merge.
@@ -104,20 +117,33 @@ type WorkspaceConfig struct {
 // selectImage resolves the final Docker image ref for a workspace. The handler
 // layer is the source of truth — if it set cfg.Image (the digest-pinned form
 // from runtime_image_pins, #2272), honor that. Otherwise fall back to the
-// runtime→tag lookup in RuntimeImages (legacy `:latest` behavior). When the
-// runtime isn't recognized either, fall back to DefaultImage so Start() still
-// has something to hand Docker — surfacing a "No such image" later is more
-// actionable than a silent "" panic in ContainerCreate.
-func selectImage(cfg WorkspaceConfig) string {
+// runtime→tag lookup in RuntimeImages (legacy `:latest` behavior).
+//
+// Fail-closed contract (RFC internal#483 / security review 4269 /
+// feedback_platform_must_hardgate_base_contract): if the workspace NAMES a
+// runtime that resolves to no image (not in RuntimeImages, no pinned
+// cfg.Image), reject with ErrUnresolvableRuntime instead of silently
+// substituting DefaultImage. Pre-fix, removing crewai/deepagents/gemini-cli
+// from the catalog left those create requests silently provisioning a
+// langgraph container — the user asked for crewai and got langgraph with no
+// signal. The error propagates through Start → markProvisionFailed, which
+// already broadcasts WorkspaceProvisionFailed and records the message.
+//
+// The genuinely-unspecified runtime (empty cfg.Runtime, e.g. an org template
+// that doesn't pin one) is an intended distinct path and still resolves to
+// DefaultImage — only a NAMED-but-unresolvable runtime is rejected.
+func selectImage(cfg WorkspaceConfig) (string, error) {
 	if cfg.Image != "" {
-		return cfg.Image
+		return cfg.Image, nil
 	}
 	if cfg.Runtime != "" {
 		if img, ok := RuntimeImages[cfg.Runtime]; ok {
-			return img
+			return img, nil
 		}
+		return "", fmt.Errorf("%w: runtime %q (known runtimes: %v)",
+			ErrUnresolvableRuntime, cfg.Runtime, knownRuntimes)
 	}
-	return DefaultImage
+	return DefaultImage, nil
 }

 // Workspace-access constants for #65. Matches the CHECK constraint on
@@ -336,7 +362,15 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e

 	env := buildContainerEnv(cfg)

-	image := selectImage(cfg)
+	image, imgErr := selectImage(cfg)
+	if imgErr != nil {
+		// Fail-closed: a named-but-unresolvable runtime must not silently
+		// become DefaultImage (RFC internal#483 / review 4269). The caller's
+		// error path (markProvisionFailed) broadcasts the failure + records
+		// the message so the canvas surfaces it.
+		log.Printf("Provisioner: refusing to start %s: %v", cfg.WorkspaceID, imgErr)
+		return "", imgErr
+	}

 	// Local-build mode (issue #63 / Task #194): when MOLECULE_IMAGE_REGISTRY
 	// is unset, the OSS contributor path skips the registry pull entirely
@@ -513,7 +513,10 @@ func TestWorkspaceConfig_ResetClaudeSessionFieldPresent(t *testing.T) {
 // we lose the "one bad publish doesn't break every workspace" guarantee.
 func TestSelectImage_PrefersExplicitImage(t *testing.T) {
 	pinned := "ghcr.io/molecule-ai/workspace-template-claude-code@sha256:3d6761a97ed07d7d33cfc19a8fbab81175d9d9179618d493dbc00c5f7ef076a3"
-	got := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: pinned})
+	got, err := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: pinned})
+	if err != nil {
+		t.Fatalf("selectImage with cfg.Image=pinned: unexpected error %v", err)
+	}
 	if got != pinned {
 		t.Errorf("selectImage with cfg.Image=pinned: got %q, want %q", got, pinned)
 	}
@@ -523,28 +526,46 @@ func TestSelectImage_PrefersExplicitImage(t *testing.T) {
 // pin lookup deliberately bypassed via WORKSPACE_IMAGE_LOCAL_OVERRIDE).
 // selectImage must use the legacy runtime→:latest map.
 func TestSelectImage_FallsBackToRuntimeMap(t *testing.T) {
-	got := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: ""})
+	got, err := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: ""})
+	if err != nil {
+		t.Fatalf("selectImage with empty Image: unexpected error %v", err)
+	}
 	want := RuntimeImages["claude-code"]
 	if got != want {
 		t.Errorf("selectImage with empty Image: got %q, want %q", got, want)
 	}
 }

-// TestSelectImage_UnknownRuntimeFallsBackToDefault preserves today's
-// behavior — an unrecognized runtime resolves to DefaultImage rather than
-// "" so ContainerCreate gets a usable arg and surfaces a meaningful
-// "No such image" error if the default itself is missing.
-func TestSelectImage_UnknownRuntimeFallsBackToDefault(t *testing.T) {
-	got := selectImage(WorkspaceConfig{Runtime: "no-such-runtime"})
-	if got != DefaultImage {
-		t.Errorf("selectImage with unknown runtime: got %q, want DefaultImage %q", got, DefaultImage)
+// TestSelectImage_NamedUnresolvableRuntimeRejects pins the fail-closed
+// contract (RFC internal#483 / security review 4269 /
+// feedback_platform_must_hardgate_base_contract): a NAMED runtime with no
+// resolvable image must reject with ErrUnresolvableRuntime, NOT silently
+// substitute DefaultImage. Pre-fix this returned langgraph — a user asking
+// for a removed runtime (crewai/deepagents/gemini-cli) silently got a
+// langgraph container. "crewai" is the concrete regression from the
+// security finding.
+func TestSelectImage_NamedUnresolvableRuntimeRejects(t *testing.T) {
+	for _, rt := range []string{"no-such-runtime", "crewai", "deepagents", "gemini-cli"} {
+		got, err := selectImage(WorkspaceConfig{Runtime: rt})
+		if !errors.Is(err, ErrUnresolvableRuntime) {
+			t.Errorf("selectImage(%q): got err %v, want ErrUnresolvableRuntime", rt, err)
+		}
+		if got != "" {
+			t.Errorf("selectImage(%q): got image %q, want \"\" on reject", rt, got)
+		}
+		if err != nil && !strings.Contains(err.Error(), rt) {
+			t.Errorf("selectImage(%q): error must name the offending runtime, got %v", rt, err)
+		}
 	}
 }

 // TestSelectImage_EmptyRuntimeFallsBackToDefault: same invariant for the
 // no-runtime-supplied path (legacy callers / older handler code).
 func TestSelectImage_EmptyRuntimeFallsBackToDefault(t *testing.T) {
-	got := selectImage(WorkspaceConfig{})
+	got, err := selectImage(WorkspaceConfig{})
+	if err != nil {
+		t.Fatalf("selectImage with zero cfg: unexpected error %v (empty runtime is a legitimate DefaultImage path)", err)
+	}
 	if got != DefaultImage {
 		t.Errorf("selectImage with zero cfg: got %q, want DefaultImage %q", got, DefaultImage)
 	}
@@ -808,7 +829,7 @@ func TestIsImageNotFoundErr(t *testing.T) {
 		{"nil", nil, false},
 		{"moby no such image", fmtErr(`Error response from daemon: No such image: workspace-template:openclaw`), true},
 		{"no such image lowercase", fmtErr(`error: no such image: foo:bar`), true},
-		{"image not found", fmtErr(`Error: image "workspace-template:crewai" not found`), true},
+		{"image not found", fmtErr(`Error: image "workspace-template:hermes" not found`), true},
 		{"generic not found without image", fmtErr(`container not found`), false},
 		{"unrelated error", fmtErr(`connection refused`), false},
 		{"permission denied", fmtErr(`permission denied`), false},
@@ -21,9 +21,6 @@ var knownRuntimes = []string{
 	"autogen",
 	"claude-code",
 	"codex",
-	"crewai",
-	"deepagents",
-	"gemini-cli",
 	"hermes",
 	"langgraph",
 	"openclaw",
@@ -53,8 +53,8 @@ func TestRuntimeImage_AllKnownRuntimes(t *testing.T) {
 		}
 	}
 	// Pin the count so adding a runtime requires explicit test acknowledgement.
-	if len(knownRuntimes) != 9 {
-		t.Errorf("knownRuntimes length = %d, want 9 (autogen, claude-code, codex, crewai, deepagents, gemini-cli, hermes, langgraph, openclaw)", len(knownRuntimes))
+	if len(knownRuntimes) != 6 {
+		t.Errorf("knownRuntimes length = %d, want 6 (autogen, claude-code, codex, hermes, langgraph, openclaw)", len(knownRuntimes))
 	}
 }