Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| c04e75f1eb | |||
| 55d7b04a42 | |||
| b23e733a93 | |||
| 4c0cd6b705 | |||
| af7afc6112 | |||
| dc858ad164 | |||
| 2ffd44c694 | |||
| 4f5d683f4b | |||
| df4a0e3f9d | |||
| c3cfbea750 | |||
| a01d1d8f86 | |||
| 3508d738a9 |
@@ -44,9 +44,15 @@ REQUIRED_CONTEXTS_RAW = _env(
|
||||
"REQUIRED_CONTEXTS",
|
||||
default=(
|
||||
"CI / all-required (pull_request),"
|
||||
"sop-checklist / all-items-acked (pull_request)"
|
||||
"sop-checklist / all-items-acked (pull_request),"
|
||||
"E2E Chat / E2E Chat (pull_request)"
|
||||
),
|
||||
)
|
||||
# E2E Chat is not in branch protection's status_check_contexts, but Gitea's
|
||||
# merge gate evaluates the full combined status including it. Adding it here
|
||||
# prevents the queue from attempting a merge that will be 405'd by Gitea when
|
||||
# E2E Chat is failing (e.g. runner-stall Quirk #9 on a flaky test).
|
||||
# See: mc#420 / molecule-core runbooks/gitea-operational-quirks.md Quirk #9.
|
||||
# Required contexts for push (main/staging) runs. The push CI uses the same
|
||||
# aggregator names with " (push)" suffix. Checking these explicitly instead of
|
||||
# the combined state avoids false-pause when non-blocking jobs (e.g. Platform
|
||||
@@ -65,6 +71,11 @@ class ApiError(RuntimeError):
|
||||
pass
|
||||
|
||||
|
||||
class MergePermissionError(ApiError):
|
||||
"""Merge failed with a permanent permission error (403/404/405).
|
||||
The queue should skip this PR and move to the next one."""
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
|
||||
class MergeDecision:
|
||||
ready: bool
|
||||
@@ -314,6 +325,31 @@ def post_comment(pr_number: int, body: str, *, dry_run: bool) -> None:
|
||||
api("POST", f"/repos/{OWNER}/{NAME}/issues/{pr_number}/comments", body={"body": body})
|
||||
|
||||
|
||||
def add_hold_label(pr_number: int, *, dry_run: bool) -> None:
|
||||
"""Add HOLD_LABEL to a PR if not already present."""
|
||||
if not HOLD_LABEL:
|
||||
return
|
||||
# Check current labels first to avoid a no-op API call in dry-run.
|
||||
_, current = api("GET", f"/repos/{OWNER}/{NAME}/issues/{pr_number}/labels")
|
||||
current_names = {
|
||||
l["name"] for l in (current if isinstance(current, list) else [])
|
||||
}
|
||||
if HOLD_LABEL in current_names:
|
||||
print(f"::notice::PR #{pr_number} already has hold label; skipping add")
|
||||
return
|
||||
print(f"::notice::PR #{pr_number} adding hold label `{HOLD_LABEL}`")
|
||||
if dry_run:
|
||||
return
|
||||
# Gitea accepts {"labels": ["label1", "label2"]} to append labels.
|
||||
new_labels = list(current_names) + [HOLD_LABEL]
|
||||
api(
|
||||
"PATCH",
|
||||
f"/repos/{OWNER}/{NAME}/issues/{pr_number}",
|
||||
body={"labels": new_labels},
|
||||
expect_json=False,
|
||||
)
|
||||
|
||||
|
||||
def update_pull(pr_number: int, *, dry_run: bool) -> None:
|
||||
print(f"::notice::updating PR #{pr_number} with base branch via style={UPDATE_STYLE}")
|
||||
if dry_run:
|
||||
@@ -338,7 +374,16 @@ def merge_pull(pr_number: int, *, dry_run: bool) -> None:
|
||||
print(f"::notice::merging PR #{pr_number}")
|
||||
if dry_run:
|
||||
return
|
||||
api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False)
|
||||
try:
|
||||
api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False)
|
||||
except ApiError as exc:
|
||||
# Re-raise permission-like errors so process_once can skip this PR.
|
||||
# 403 = no push access, 404 = repo/pr not found, 405 = not allowed.
|
||||
msg = str(exc)
|
||||
for code in ("403", "404", "405"):
|
||||
if code in msg:
|
||||
raise MergePermissionError(msg) from exc
|
||||
raise # re-raise other ApiErrors unchanged
|
||||
|
||||
|
||||
def process_once(*, dry_run: bool = False) -> int:
|
||||
@@ -407,7 +452,45 @@ def process_once(*, dry_run: bool = False) -> int:
|
||||
"deferring to next tick"
|
||||
)
|
||||
return 0
|
||||
merge_pull(pr_number, dry_run=dry_run)
|
||||
try:
|
||||
merge_pull(pr_number, dry_run=dry_run)
|
||||
except MergePermissionError as exc:
|
||||
msg = str(exc)
|
||||
is_status_check_failure = "not all required status checks successful" in msg
|
||||
if is_status_check_failure:
|
||||
# Gitea's merge gate failed due to a status check that passed our
|
||||
# pre-flight but is failing at Gitea's side (e.g. runner-stall Quirk
|
||||
# #9, or a context not in REQUIRED_CONTEXTS). Auto-add hold so the
|
||||
# queue skips this PR and processes the next one. The hold can be
|
||||
# removed once CI is green again.
|
||||
add_hold_label(pr_number, dry_run=dry_run)
|
||||
post_comment(
|
||||
pr_number,
|
||||
(
|
||||
"merge-queue: merge blocked by Gitea's status-check gate "
|
||||
"(E2E Chat or other non-required context failing). "
|
||||
"Auto-held via `merge-queue-hold`. "
|
||||
"Remove the hold label to requeue once CI is green. "
|
||||
"If E2E Chat is stuck (runner stall / Quirk #9), CI will "
|
||||
"self-recover after ~90 min and the hold can then be removed."
|
||||
),
|
||||
dry_run=dry_run,
|
||||
)
|
||||
return 0
|
||||
else:
|
||||
# Genuine permission error — token lacks Can-merge.
|
||||
sys.stderr.write(f"::error::merge permission error for PR #{pr_number}: {exc}\n")
|
||||
post_comment(
|
||||
pr_number,
|
||||
(
|
||||
"merge-queue: merge failed with HTTP 405 'User not allowed to merge PR'. "
|
||||
"No available token has Can-merge permission on this repo. "
|
||||
"Fix: grant Can-merge to a token, or add a maintain/admin collaborator. "
|
||||
"Skipping to next queued PR on next tick."
|
||||
),
|
||||
dry_run=dry_run,
|
||||
)
|
||||
return 0
|
||||
return 0
|
||||
return 0
|
||||
|
||||
|
||||
@@ -118,3 +118,13 @@ def test_merge_decision_updates_stale_pr_before_merge():
|
||||
|
||||
assert decision.ready is False
|
||||
assert decision.action == "update"
|
||||
|
||||
|
||||
def test_MergePermissionError_inherits_from_ApiError():
|
||||
assert issubclass(mq.MergePermissionError, mq.ApiError)
|
||||
|
||||
|
||||
def test_MergePermissionError_message_preserved():
|
||||
exc = mq.MergePermissionError("POST /merge -> HTTP 405: User not allowed")
|
||||
assert "405" in str(exc)
|
||||
assert "User not allowed" in str(exc)
|
||||
|
||||
@@ -57,7 +57,7 @@ permissions:
|
||||
# can produce duplicate comments before the title-search dedup wins.
|
||||
concurrency:
|
||||
group: ci-required-drift
|
||||
cancel-in-progress: false
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
drift:
|
||||
|
||||
@@ -22,7 +22,7 @@ permissions:
|
||||
|
||||
concurrency:
|
||||
group: gitea-merge-queue-${{ github.repository }}
|
||||
cancel-in-progress: false
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
queue:
|
||||
|
||||
@@ -56,9 +56,13 @@ permissions:
|
||||
# Workflow-scoped serialisation — two simultaneous runs would race on the
|
||||
# `[main-red] {SHA}` open/PATCH path. Idempotent by title, but parallel
|
||||
# POSTs can produce duplicates before the title search dedup wins.
|
||||
# NOTE: cancel-in-progress: true is safe here — the idempotent design means
|
||||
# a cancelled run produces identical output to a completed one. This also
|
||||
# prevents the Gitea scheduler freeze that occurs when a cron tick fires
|
||||
# while a previous run is still executing (Quirk #8).
|
||||
concurrency:
|
||||
group: main-red-watchdog
|
||||
cancel-in-progress: false
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
watchdog:
|
||||
|
||||
@@ -162,6 +162,7 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
python -m twine upload \
|
||||
--verbose \
|
||||
--repository pypi \
|
||||
--username __token__ \
|
||||
--password "$PYPI_TOKEN" \
|
||||
|
||||
+1
-4
@@ -30,10 +30,7 @@
|
||||
{"name": "openclaw", "repo": "molecule-ai/molecule-ai-workspace-template-openclaw", "ref": "main"},
|
||||
{"name": "codex", "repo": "molecule-ai/molecule-ai-workspace-template-codex", "ref": "main"},
|
||||
{"name": "langgraph", "repo": "molecule-ai/molecule-ai-workspace-template-langgraph", "ref": "main"},
|
||||
{"name": "crewai", "repo": "molecule-ai/molecule-ai-workspace-template-crewai", "ref": "main"},
|
||||
{"name": "autogen", "repo": "molecule-ai/molecule-ai-workspace-template-autogen", "ref": "main"},
|
||||
{"name": "deepagents", "repo": "molecule-ai/molecule-ai-workspace-template-deepagents", "ref": "main"},
|
||||
{"name": "gemini-cli", "repo": "molecule-ai/molecule-ai-workspace-template-gemini-cli", "ref": "main"}
|
||||
{"name": "autogen", "repo": "molecule-ai/molecule-ai-workspace-template-autogen", "ref": "main"}
|
||||
],
|
||||
"org_templates": [
|
||||
{"name": "molecule-dev", "repo": "molecule-ai/molecule-ai-org-template-molecule-dev", "ref": "main"},
|
||||
|
||||
@@ -77,6 +77,31 @@ does not replace the queue. The queue still performs its own current-main
|
||||
check immediately before merge because branch protection alone cannot
|
||||
serialize two already-green PRs.
|
||||
|
||||
### Correct API field names (Gitea 1.22.6)
|
||||
|
||||
When setting branch protection via API, use these exact field names — several
|
||||
intuitively-correct names are silently ignored (see `gitea-operational-quirks.md`
|
||||
Quirk #7):
|
||||
|
||||
```json
|
||||
{
|
||||
"branch_name": "main",
|
||||
"enable_merge_whitelist": true,
|
||||
"merge_whitelist_usernames": ["devops-engineer", "hongming", "core-devops"],
|
||||
"enable_status_check": true,
|
||||
"status_check_contexts": ["CI / all-required"],
|
||||
"required_approvals": 1,
|
||||
"block_on_rejected_reviews": true
|
||||
}
|
||||
```
|
||||
|
||||
After any `POST /branch_protections`, immediately GET and verify the values
|
||||
persisted — the API returns 201 even when fields are silently dropped.
|
||||
|
||||
If the queue returns HTTP 405 ("User not allowed to merge"), the first
|
||||
diagnostic step is `GET /branch_protections/main` and checking whether
|
||||
`merge_whitelist_usernames` still contains `devops-engineer`.
|
||||
|
||||
## Failure Handling
|
||||
|
||||
If `main` is not green, the queue pauses and does not merge anything.
|
||||
|
||||
@@ -196,69 +196,134 @@ primary consumer of combined status and is affected.
|
||||
|
||||
---
|
||||
|
||||
## Quirk #7 — TBD
|
||||
|
||||
*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
|
||||
## Quirk #7 — Gitea branch protection API silently ignores some field names
|
||||
|
||||
### Finding
|
||||
|
||||
*[What Gitea Actions does differently from GitHub Actions.]*
|
||||
The Gitea 1.22.6 `POST /repos/{org}/{repo}/branch_protections` API accepts a
|
||||
non-obvious set of field names. Several intuitively-correct names are silently
|
||||
ignored — the call returns 201 but the field is dropped:
|
||||
|
||||
| Intended field | Correct API name | Silently ignored aliases |
|
||||
|---|---|---|
|
||||
| Enable merge whitelist | `enable_merge_whitelist` | `user_can_merge`, `merge_whitelist_enabled` |
|
||||
| Users who can merge | `merge_whitelist_usernames` | `merge_whitelist_users`, `whitelisted_users` |
|
||||
| Enable status check | `enable_status_check` | `enable_status_checks`, `require_status_checks` |
|
||||
| Required status contexts | `status_check_contexts` | `required_status_checks.contexts` |
|
||||
| Block on rejected reviews | `block_on_rejected_reviews` | (this one works) |
|
||||
| Required approvals | `required_approvals` | `required_reviewers` |
|
||||
|
||||
The GET response after a POST shows the actual stored values. A naive
|
||||
GET → modify → POST cycle (without using the exact GET field names) will
|
||||
silently reset the merge whitelist on every call.
|
||||
|
||||
### Impact
|
||||
|
||||
*[Which workflows or operations are affected.]*
|
||||
- Branch protection merge whitelist resets to empty after any API mis-invocation
|
||||
- Queue AUTO_SYNC_TOKEN (`devops-engineer`) loses Can-merge permission → HTTP 405
|
||||
- All queued PRs blocked until whitelist is restored
|
||||
- Confirmed reset on Gitea server restart/upgrade (Gitea uses default values)
|
||||
|
||||
### Workaround
|
||||
|
||||
*[How to work around this quirk.]*
|
||||
1. Always GET the current protection first and use **exact** field names from the
|
||||
GET response when modifying
|
||||
2. After any `POST /branch_protections`, immediately GET and verify
|
||||
`enable_merge_whitelist: true` and `merge_whitelist_usernames` contains
|
||||
`["devops-engineer", "hongming", "core-devops"]`
|
||||
3. The queue bot should verify branch protection before each merge tick
|
||||
4. For queue to work: `enable_merge_whitelist: true` +
|
||||
`merge_whitelist_usernames: ["devops-engineer", "hongming", "core-devops"]` +
|
||||
`enable_status_check: true` + `status_check_contexts: ["CI / all-required"]`
|
||||
|
||||
### References
|
||||
|
||||
- internal#[N]: first observation
|
||||
- SEV-1 2026-05-17: 3x branch protection resets caused 405 on all queue merges
|
||||
- `feedback_gitea_branch_protection_api_field_names`
|
||||
|
||||
---
|
||||
|
||||
## Quirk #8 — TBD
|
||||
|
||||
*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
|
||||
## Quirk #8 — Scheduled workflow with `cancel-in-progress: false` causes scheduler freeze
|
||||
|
||||
### Finding
|
||||
|
||||
*[What Gitea Actions does differently from GitHub Actions.]*
|
||||
When a `schedule:` workflow has `concurrency.cancel-in-progress: false`, and a
|
||||
new cron tick fires while the previous run is still executing, the Gitea Actions
|
||||
scheduler stops dispatching the workflow entirely. Pending entries accumulate
|
||||
indefinitely — the scheduler shows the workflow as "scheduled" but never dispatches.
|
||||
|
||||
This is dangerous for workflows with variable execution time (e.g., workflows that
|
||||
wait for downstream CI, or workflows that run on slow/degraded runners).
|
||||
|
||||
### Impact
|
||||
|
||||
*[Which workflows or operations are affected.]*
|
||||
- `gitea-merge-queue.yml` with `cancel-in-progress: false` froze on 2026-05-17
|
||||
starting ~16:44Z — pending runs accumulated, no new runs dispatched
|
||||
- Queue appeared stalled; all 22 queued PRs blocked
|
||||
- The `gitea-merge-queue` workflow itself becomes invisible to operators
|
||||
|
||||
### Workaround
|
||||
|
||||
*[How to work around this quirk.]*
|
||||
**Always set `cancel-in-progress: true` on `schedule:` workflows:**
|
||||
|
||||
```yaml
|
||||
concurrency:
|
||||
group: workflow-name
|
||||
cancel-in-progress: true # ← always true for schedule: workflows
|
||||
```
|
||||
|
||||
If the freeze has already occurred: the scheduler recovers automatically after the
|
||||
currently-running instance completes (Gitea dispatches the next queued tick).
|
||||
|
||||
### References
|
||||
|
||||
- internal#[N]: first observation
|
||||
- SEV-1 2026-05-17: queue frozen since 16:44Z; fixed by setting `cancel-in-progress: true`
|
||||
- PR #1358: `fix(scheduled-workflows): enable cancel-in-progress` (pending merge)
|
||||
|
||||
---
|
||||
|
||||
## Quirk #9 — TBD
|
||||
|
||||
*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
|
||||
## Quirk #9 — Gitea Actions runner accepts runs but stalls (jobs never start)
|
||||
|
||||
### Finding
|
||||
|
||||
*[What Gitea Actions does differently from GitHub Actions.]*
|
||||
The Gitea Actions runner on host `5.78.80.188` can enter a degraded state where:
|
||||
1. It accepts new workflow runs (shows "in_progress" in the UI)
|
||||
2. It never starts any jobs — pending count grows indefinitely
|
||||
3. The runner shows as "online" and accepting runs
|
||||
4. After ~60–90 minutes, the runner self-recovers and all pending jobs start
|
||||
|
||||
This is distinct from a true runner crash (which would show as offline).
|
||||
|
||||
### Impact
|
||||
|
||||
*[Which workflows or operations are affected.]*
|
||||
- All CI jobs for all PRs stall — no status updates posted
|
||||
- Queue waits indefinitely for CI (which never posts success)
|
||||
- `sop-checklist` and other workflows time out on affected PRs
|
||||
- Looks like the runner is working (green in UI) but nothing executes
|
||||
|
||||
### How to diagnose
|
||||
|
||||
Add a debug step to a known-failing workflow:
|
||||
|
||||
```bash
|
||||
# In a stalled job:
|
||||
curl -s http://localhost:8088/debug/pprof/trace?seconds=5 | head
|
||||
# Check runner process CPU — if near 0% while jobs are pending, runner is stalled
|
||||
```
|
||||
|
||||
Check runner logs on the host (`/var/log/actrunner.log` or similar).
|
||||
|
||||
### Workaround
|
||||
|
||||
*[How to work around this quirk.]*
|
||||
No operator workaround while stalled — the runner self-recovers. Options:
|
||||
1. **Wait** — runner typically recovers within 90 minutes
|
||||
2. **Restart the runner service** — `systemctl restart act_runner` (requires host access)
|
||||
3. **Move to a second runner** — if registered, re-route dispatch
|
||||
|
||||
### References
|
||||
|
||||
- internal#[N]: first observation
|
||||
- SEV-1 2026-05-17: runner stalled; self-recovered ~21:33Z after ~90 min
|
||||
- `feedback_gitea_runner_stall_accepted_jobs_no_execution`
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -44,8 +44,8 @@ func NewWorkspaceImageService(docker *dockerclient.Client) *WorkspaceImageServic
|
||||
// AllRuntimes is the canonical list mirroring docs/workspace-runtime-package.md.
|
||||
// Update both when a new template is added.
|
||||
var AllRuntimes = []string{
|
||||
"claude-code", "langgraph", "crewai", "autogen",
|
||||
"deepagents", "hermes", "gemini-cli", "openclaw",
|
||||
"claude-code", "langgraph", "autogen",
|
||||
"hermes", "openclaw",
|
||||
}
|
||||
|
||||
// RefreshResult is the per-call outcome surfaced to HTTP callers AND logged
|
||||
|
||||
@@ -23,8 +23,8 @@ package models
|
||||
// - claude-code: "sonnet" — Anthropic's CLI accepts the short
|
||||
// name and resolves it via the operator's anthropic-oauth or
|
||||
// ANTHROPIC_API_KEY chain.
|
||||
// - everything else (hermes, langgraph, crewai, autogen, deepagents,
|
||||
// codex, openclaw, gemini-cli, external, ""): a fully-qualified
|
||||
// - everything else (hermes, langgraph, autogen, codex, openclaw,
|
||||
// external, ""): a fully-qualified
|
||||
// vendor:model slug that the universal MODEL_PROVIDER chain in
|
||||
// molecule-core PR #247 can route via per-vendor required_env.
|
||||
//
|
||||
|
||||
@@ -21,12 +21,9 @@ func TestDefaultModel(t *testing.T) {
|
||||
// as a generic "unknown" failure.
|
||||
{"hermes", "anthropic:claude-opus-4-7"},
|
||||
{"langgraph", "anthropic:claude-opus-4-7"},
|
||||
{"crewai", "anthropic:claude-opus-4-7"},
|
||||
{"autogen", "anthropic:claude-opus-4-7"},
|
||||
{"deepagents", "anthropic:claude-opus-4-7"},
|
||||
{"codex", "anthropic:claude-opus-4-7"},
|
||||
{"openclaw", "anthropic:claude-opus-4-7"},
|
||||
{"gemini-cli", "anthropic:claude-opus-4-7"},
|
||||
{"external", "anthropic:claude-opus-4-7"},
|
||||
|
||||
// Unknown / empty — fall through to universal default rather
|
||||
|
||||
@@ -190,7 +190,7 @@ func TestEnsureLocalImage_RepoNotFound(t *testing.T) {
|
||||
opts.HTTPClient = srv.Client()
|
||||
opts.remoteHeadSha = nil // exercise real HTTP path
|
||||
|
||||
_, err := ensureLocalImageWithOpts(context.Background(), "crewai", opts)
|
||||
_, err := ensureLocalImageWithOpts(context.Background(), "hermes", opts)
|
||||
if err == nil {
|
||||
t.Fatalf("expected error, got nil")
|
||||
}
|
||||
|
||||
@@ -35,6 +35,19 @@ import (
|
||||
// drift-risk #6.
|
||||
var ErrNoBackend = errors.New("provisioner: no backend configured (zero-valued receiver)")
|
||||
|
||||
// ErrUnresolvableRuntime is returned by selectImage when a workspace
|
||||
// names a runtime that has no resolvable image (not in RuntimeImages and
|
||||
// no operator-pinned cfg.Image). RFC internal#483 + security review 4269:
|
||||
// previously such a request silently fell through to DefaultImage
|
||||
// (langgraph) — a user asking for crewai would get a langgraph container
|
||||
// with no signal. The CTO standing directive
|
||||
// (feedback_platform_must_hardgate_base_contract) is fail-closed: a
|
||||
// named-but-unresolvable runtime must reject with a structured,
|
||||
// runtime-naming error so the existing provision-failed notify/log path
|
||||
// surfaces it, NOT silently degrade. The genuinely-unspecified (empty)
|
||||
// runtime is still a distinct, legitimate path that keeps DefaultImage.
|
||||
var ErrUnresolvableRuntime = errors.New("provisioner: requested runtime has no resolvable image")
|
||||
|
||||
// RuntimeImages maps runtime names to their Docker image refs.
|
||||
// Each standalone template repo publishes its image via the reusable
|
||||
// publish-template-image workflow in molecule-ci on every main merge.
|
||||
@@ -104,20 +117,33 @@ type WorkspaceConfig struct {
|
||||
// selectImage resolves the final Docker image ref for a workspace. The handler
|
||||
// layer is the source of truth — if it set cfg.Image (the digest-pinned form
|
||||
// from runtime_image_pins, #2272), honor that. Otherwise fall back to the
|
||||
// runtime→tag lookup in RuntimeImages (legacy `:latest` behavior). When the
|
||||
// runtime isn't recognized either, fall back to DefaultImage so Start() still
|
||||
// has something to hand Docker — surfacing a "No such image" later is more
|
||||
// actionable than a silent "" panic in ContainerCreate.
|
||||
func selectImage(cfg WorkspaceConfig) string {
|
||||
// runtime→tag lookup in RuntimeImages (legacy `:latest` behavior).
|
||||
//
|
||||
// Fail-closed contract (RFC internal#483 / security review 4269 /
|
||||
// feedback_platform_must_hardgate_base_contract): if the workspace NAMES a
|
||||
// runtime that resolves to no image (not in RuntimeImages, no pinned
|
||||
// cfg.Image), reject with ErrUnresolvableRuntime instead of silently
|
||||
// substituting DefaultImage. Pre-fix, removing crewai/deepagents/gemini-cli
|
||||
// from the catalog left those create requests silently provisioning a
|
||||
// langgraph container — the user asked for crewai and got langgraph with no
|
||||
// signal. The error propagates through Start → markProvisionFailed, which
|
||||
// already broadcasts WorkspaceProvisionFailed and records the message.
|
||||
//
|
||||
// The genuinely-unspecified runtime (empty cfg.Runtime, e.g. an org template
|
||||
// that doesn't pin one) is an intended distinct path and still resolves to
|
||||
// DefaultImage — only a NAMED-but-unresolvable runtime is rejected.
|
||||
func selectImage(cfg WorkspaceConfig) (string, error) {
|
||||
if cfg.Image != "" {
|
||||
return cfg.Image
|
||||
return cfg.Image, nil
|
||||
}
|
||||
if cfg.Runtime != "" {
|
||||
if img, ok := RuntimeImages[cfg.Runtime]; ok {
|
||||
return img
|
||||
return img, nil
|
||||
}
|
||||
return "", fmt.Errorf("%w: runtime %q (known runtimes: %v)",
|
||||
ErrUnresolvableRuntime, cfg.Runtime, knownRuntimes)
|
||||
}
|
||||
return DefaultImage
|
||||
return DefaultImage, nil
|
||||
}
|
||||
|
||||
// Workspace-access constants for #65. Matches the CHECK constraint on
|
||||
@@ -336,7 +362,15 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
|
||||
|
||||
env := buildContainerEnv(cfg)
|
||||
|
||||
image := selectImage(cfg)
|
||||
image, imgErr := selectImage(cfg)
|
||||
if imgErr != nil {
|
||||
// Fail-closed: a named-but-unresolvable runtime must not silently
|
||||
// become DefaultImage (RFC internal#483 / review 4269). The caller's
|
||||
// error path (markProvisionFailed) broadcasts the failure + records
|
||||
// the message so the canvas surfaces it.
|
||||
log.Printf("Provisioner: refusing to start %s: %v", cfg.WorkspaceID, imgErr)
|
||||
return "", imgErr
|
||||
}
|
||||
|
||||
// Local-build mode (issue #63 / Task #194): when MOLECULE_IMAGE_REGISTRY
|
||||
// is unset, the OSS contributor path skips the registry pull entirely
|
||||
|
||||
@@ -513,7 +513,10 @@ func TestWorkspaceConfig_ResetClaudeSessionFieldPresent(t *testing.T) {
|
||||
// we lose the "one bad publish doesn't break every workspace" guarantee.
|
||||
func TestSelectImage_PrefersExplicitImage(t *testing.T) {
|
||||
pinned := "ghcr.io/molecule-ai/workspace-template-claude-code@sha256:3d6761a97ed07d7d33cfc19a8fbab81175d9d9179618d493dbc00c5f7ef076a3"
|
||||
got := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: pinned})
|
||||
got, err := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: pinned})
|
||||
if err != nil {
|
||||
t.Fatalf("selectImage with cfg.Image=pinned: unexpected error %v", err)
|
||||
}
|
||||
if got != pinned {
|
||||
t.Errorf("selectImage with cfg.Image=pinned: got %q, want %q", got, pinned)
|
||||
}
|
||||
@@ -523,28 +526,46 @@ func TestSelectImage_PrefersExplicitImage(t *testing.T) {
|
||||
// pin lookup deliberately bypassed via WORKSPACE_IMAGE_LOCAL_OVERRIDE).
|
||||
// selectImage must use the legacy runtime→:latest map.
|
||||
func TestSelectImage_FallsBackToRuntimeMap(t *testing.T) {
|
||||
got := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: ""})
|
||||
got, err := selectImage(WorkspaceConfig{Runtime: "claude-code", Image: ""})
|
||||
if err != nil {
|
||||
t.Fatalf("selectImage with empty Image: unexpected error %v", err)
|
||||
}
|
||||
want := RuntimeImages["claude-code"]
|
||||
if got != want {
|
||||
t.Errorf("selectImage with empty Image: got %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
// TestSelectImage_UnknownRuntimeFallsBackToDefault preserves today's
|
||||
// behavior — an unrecognized runtime resolves to DefaultImage rather than
|
||||
// "" so ContainerCreate gets a usable arg and surfaces a meaningful
|
||||
// "No such image" error if the default itself is missing.
|
||||
func TestSelectImage_UnknownRuntimeFallsBackToDefault(t *testing.T) {
|
||||
got := selectImage(WorkspaceConfig{Runtime: "no-such-runtime"})
|
||||
if got != DefaultImage {
|
||||
t.Errorf("selectImage with unknown runtime: got %q, want DefaultImage %q", got, DefaultImage)
|
||||
// TestSelectImage_NamedUnresolvableRuntimeRejects pins the fail-closed
|
||||
// contract (RFC internal#483 / security review 4269 /
|
||||
// feedback_platform_must_hardgate_base_contract): a NAMED runtime with no
|
||||
// resolvable image must reject with ErrUnresolvableRuntime, NOT silently
|
||||
// substitute DefaultImage. Pre-fix this returned langgraph — a user asking
|
||||
// for a removed runtime (crewai/deepagents/gemini-cli) silently got a
|
||||
// langgraph container. "crewai" is the concrete regression from the
|
||||
// security finding.
|
||||
func TestSelectImage_NamedUnresolvableRuntimeRejects(t *testing.T) {
|
||||
for _, rt := range []string{"no-such-runtime", "crewai", "deepagents", "gemini-cli"} {
|
||||
got, err := selectImage(WorkspaceConfig{Runtime: rt})
|
||||
if !errors.Is(err, ErrUnresolvableRuntime) {
|
||||
t.Errorf("selectImage(%q): got err %v, want ErrUnresolvableRuntime", rt, err)
|
||||
}
|
||||
if got != "" {
|
||||
t.Errorf("selectImage(%q): got image %q, want \"\" on reject", rt, got)
|
||||
}
|
||||
if err != nil && !strings.Contains(err.Error(), rt) {
|
||||
t.Errorf("selectImage(%q): error must name the offending runtime, got %v", rt, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestSelectImage_EmptyRuntimeFallsBackToDefault: same invariant for the
|
||||
// no-runtime-supplied path (legacy callers / older handler code).
|
||||
func TestSelectImage_EmptyRuntimeFallsBackToDefault(t *testing.T) {
|
||||
got := selectImage(WorkspaceConfig{})
|
||||
got, err := selectImage(WorkspaceConfig{})
|
||||
if err != nil {
|
||||
t.Fatalf("selectImage with zero cfg: unexpected error %v (empty runtime is a legitimate DefaultImage path)", err)
|
||||
}
|
||||
if got != DefaultImage {
|
||||
t.Errorf("selectImage with zero cfg: got %q, want DefaultImage %q", got, DefaultImage)
|
||||
}
|
||||
@@ -808,7 +829,7 @@ func TestIsImageNotFoundErr(t *testing.T) {
|
||||
{"nil", nil, false},
|
||||
{"moby no such image", fmtErr(`Error response from daemon: No such image: workspace-template:openclaw`), true},
|
||||
{"no such image lowercase", fmtErr(`error: no such image: foo:bar`), true},
|
||||
{"image not found", fmtErr(`Error: image "workspace-template:crewai" not found`), true},
|
||||
{"image not found", fmtErr(`Error: image "workspace-template:hermes" not found`), true},
|
||||
{"generic not found without image", fmtErr(`container not found`), false},
|
||||
{"unrelated error", fmtErr(`connection refused`), false},
|
||||
{"permission denied", fmtErr(`permission denied`), false},
|
||||
|
||||
@@ -21,9 +21,6 @@ var knownRuntimes = []string{
|
||||
"autogen",
|
||||
"claude-code",
|
||||
"codex",
|
||||
"crewai",
|
||||
"deepagents",
|
||||
"gemini-cli",
|
||||
"hermes",
|
||||
"langgraph",
|
||||
"openclaw",
|
||||
|
||||
@@ -53,8 +53,8 @@ func TestRuntimeImage_AllKnownRuntimes(t *testing.T) {
|
||||
}
|
||||
}
|
||||
// Pin the count so adding a runtime requires explicit test acknowledgement.
|
||||
if len(knownRuntimes) != 9 {
|
||||
t.Errorf("knownRuntimes length = %d, want 9 (autogen, claude-code, codex, crewai, deepagents, gemini-cli, hermes, langgraph, openclaw)", len(knownRuntimes))
|
||||
if len(knownRuntimes) != 6 {
|
||||
t.Errorf("knownRuntimes length = %d, want 6 (autogen, claude-code, codex, hermes, langgraph, openclaw)", len(knownRuntimes))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user