Merge pull request #2442 from Molecule-AI/staging

staging → main: auto-promote 5b70204
Merge pull request #2498 from Molecule-AI/auto-sync/main-76c604fb
2026-05-01 22:52:03 -07:00 · 2026-05-02 05:34:16 +00:00 · 2026-05-01 22:31:53 -07:00 · 2026-05-02 05:30:52 +00:00 · 2026-05-01 22:28:35 -07:00 · 2026-05-02 05:03:41 +00:00
228 changed files with 28490 additions and 1454 deletions
@@ -95,7 +95,39 @@ if [ -n "$STAGED_GO" ]; then
 fi

 # ──────────────────────────────────────────────────────────
-# 5. Secrets: No tokens/keys in staged files
+# 5. Go: build check — catches bot-generated structurally-invalid Go (#1770)
+# ──────────────────────────────────────────────────────────
+#
+# Background: bot agents have produced syntactically-broken Go that the
+# patch tool happily applied (e.g. PR #1769 commit 66ea0b64 — function
+# declaration nested inside another function's body). Compilation failed,
+# staging Platform(Go) was red for hours. CI catches this AT PR-time but
+# by then the malformed commit is already shared.
+#
+# Pre-commit guard: when ANY .go file in workspace-server/ is staged, run
+# `go build ./...` from workspace-server. If it fails, reject the commit.
+# Cost: ~5-10s on a warm cache; acceptable for the class of bug it
+# catches. Skip when go isn't available (CI runners that need to bypass).
+
+if [ -n "$STAGED_GO" ]; then
+  if command -v go >/dev/null 2>&1; then
+    if ! (cd workspace-server && go build ./... >/tmp/precommit-go-build.log 2>&1); then
+      echo "❌ GO BUILD FAILED — staged Go changes don't compile (workspace-server/)."
+      echo "   Output:"
+      sed 's/^/     /' /tmp/precommit-go-build.log | head -20
+      echo "   Fix the build error before committing. See #1770 for context."
+      ERRORS=$((ERRORS + 1))
+    fi
+  else
+    # Bots and CI runners may bypass when go isn't installed — surface a
+    # warning so the absence is visible, but don't block. Humans hit this
+    # only if they didn't run setup.sh.
+    echo "⚠️  go not installed — skipping go-build pre-commit check (#1770)"
+  fi
+fi
+
+# ──────────────────────────────────────────────────────────
+# 6. Secrets: No tokens/keys in staged files
 # ──────────────────────────────────────────────────────────

 ALL_STAGED=$(git diff --cached --name-only --diff-filter=ACM || true)
@@ -155,6 +155,20 @@ jobs:
          fi

          # Upstream is publish-workspace-server-image. Check E2E state.
+          # The jq filter must defend against TWO empty cases that gh
+          # CLI emits indistinguishably:
+          #   1. gh exits non-zero (network blip, auth issue) → handled
+          #      by the `|| echo "none/none"` fallback below.
+          #   2. gh exits zero but returns `[]` (no E2E run on this
+          #      main SHA — the common case for canvas-only / cmd-only
+          #      / sweep-only changes whose paths don't trigger E2E).
+          #      Without `(.[0] // {})`, jq sees `null` and emits
+          #      "null/none" — which the case statement below has no
+          #      branch for, so it falls into *) → exit 1.
+          # Surfaced 2026-04-30 the first time the App-token chain
+          # (#2389) actually fired auto-promote-on-e2e from a publish
+          # upstream — every prior run was E2E-upstream which
+          # short-circuits before this gate.
          RESULT=$(gh run list \
            --repo "$REPO" \
            --workflow e2e-staging-saas.yml \
@@ -162,7 +176,7 @@ jobs:
            --commit "$SHA" \
            --limit 1 \
            --json status,conclusion \
-            --jq '.[0] | "\(.status)/\(.conclusion // "none")"' \
+            --jq '(.[0] // {}) | "\(.status // "none")/\(.conclusion // "none")"' \
            2>/dev/null || echo "none/none")

          echo "E2E Staging SaaS for ${SHA:0:7}: $RESULT"
@@ -236,6 +250,135 @@ jobs:
            echo "  ok: $tag exists"
          done

+      - name: Ancestry check — refuse to promote :latest backwards
+        # #2244: workflow_run completions arrive in arbitrary order. If
+        # SHA-A and SHA-B both reach main within ~10 min and SHA-B's E2E
+        # completes before SHA-A's, this workflow can fire for SHA-A
+        # AFTER it already promoted SHA-B → :latest goes backwards. The
+        # orphan-reconciler "next run corrects it" doesn't apply: there's
+        # no auto-corrective re-promote, :latest stays wrong until the
+        # next main push lands.
+        #
+        # Detection: read current :latest's `org.opencontainers.image.revision`
+        # label (set by publish-workspace-server-image.yml at build time)
+        # and ask the GitHub compare API whether the candidate SHA is
+        # ahead-of / identical-to / behind / diverged-from current.
+        # Hard-fail on `behind` and `diverged` per the approved design —
+        # silent-bypass is the class we're moving away from. Workflow
+        # goes red, oncall sees it, operator decides how to recover
+        # (manual dispatch with the right SHA, force-promote, etc.).
+        #
+        # Manual dispatch skips this check — operator override semantics
+        # match the gate-check step above.
+        #
+        # Backward-compat: when current :latest carries no revision
+        # label (legacy image pre-publish-with-label), skip-with-warning.
+        # All :latest images on main are post-label as of 2026-04-29, so
+        # this branch will be dead within 90 days; remove then.
+        if: steps.gate.outputs.proceed == 'true' && github.event_name != 'workflow_dispatch'
+        id: ancestry
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO: ${{ github.repository }}
+          TARGET_SHA: ${{ steps.sha.outputs.full }}
+        run: |
+          set -euo pipefail
+
+          # Read the current :latest config and pull the revision label.
+          # `crane config` returns the OCI image config blob (not the manifest);
+          # labels live under `.config.Labels`. `// empty` makes jq return ""
+          # rather than the literal "null" so the test below works.
+          CURRENT_REVISION=$(crane config "${IMAGE_NAME}:latest" 2>/dev/null \
+            | jq -r '.config.Labels["org.opencontainers.image.revision"] // empty' \
+            || true)
+
+          if [ -z "$CURRENT_REVISION" ]; then
+            echo "decision=skip-no-label" >> "$GITHUB_OUTPUT"
+            {
+              echo "## ⚠ Ancestry check skipped — current :latest has no revision label"
+              echo
+              echo "Likely a legacy image built before \`org.opencontainers.image.revision\` was set."
+              echo "Falling through to retag. After all \`:latest\` images are post-label (TODO 90 days), this branch is dead and should be removed."
+            } >> "$GITHUB_STEP_SUMMARY"
+            echo "::warning::Current :latest carries no revision label — skipping ancestry check (legacy image)"
+            exit 0
+          fi
+
+          if [ "$CURRENT_REVISION" = "$TARGET_SHA" ]; then
+            echo "decision=identical" >> "$GITHUB_OUTPUT"
+            echo "::notice:::latest already at ${TARGET_SHA:0:7} — retag will be a no-op"
+            exit 0
+          fi
+
+          # Ask GitHub which side of the merge graph TARGET_SHA sits on
+          # relative to CURRENT_REVISION. Returns one of: ahead | identical
+          # | behind | diverged. Network or auth errors collapse to "error"
+          # via the explicit fallback so the case below always matches.
+          STATUS=$(gh api \
+            "repos/${REPO}/compare/${CURRENT_REVISION}...${TARGET_SHA}" \
+            --jq '.status' 2>/dev/null || echo "error")
+
+          echo "ancestry compare ${CURRENT_REVISION:0:7} → ${TARGET_SHA:0:7}: $STATUS"
+
+          case "$STATUS" in
+            ahead)
+              echo "decision=ahead" >> "$GITHUB_OUTPUT"
+              echo "::notice::Target ${TARGET_SHA:0:7} is ahead of current :latest (${CURRENT_REVISION:0:7}) — proceeding with retag"
+              ;;
+            identical)
+              echo "decision=identical" >> "$GITHUB_OUTPUT"
+              echo "::notice::Target identical to :latest — retag will be a no-op"
+              ;;
+            behind)
+              echo "decision=behind" >> "$GITHUB_OUTPUT"
+              {
+                echo "## ❌ Auto-promote refused — target is BEHIND current :latest"
+                echo
+                echo "| Field | Value |"
+                echo "|---|---|"
+                echo "| Target SHA | \`$TARGET_SHA\` |"
+                echo "| Current :latest revision | \`$CURRENT_REVISION\` |"
+                echo "| GitHub compare status | \`behind\` |"
+                echo
+                echo "This guard catches the workflow_run-completion-order race (#2244):"
+                echo "two rapid main pushes whose E2Es complete out-of-order can otherwise"
+                echo "promote \`:latest\` backwards. \`:latest\` stays on \`${CURRENT_REVISION:0:7}\`."
+                echo
+                echo "**Recovery:** if this is a legitimate revert that should land on \`:latest\`,"
+                echo "manually dispatch this workflow with the target sha as input — the manual-dispatch"
+                echo "path skips the ancestry check (operator override)."
+              } >> "$GITHUB_STEP_SUMMARY"
+              exit 1
+              ;;
+            diverged)
+              echo "decision=diverged" >> "$GITHUB_OUTPUT"
+              {
+                echo "## ❓ Auto-promote refused — history diverged"
+                echo
+                echo "| Field | Value |"
+                echo "|---|---|"
+                echo "| Target SHA | \`$TARGET_SHA\` |"
+                echo "| Current :latest revision | \`$CURRENT_REVISION\` |"
+                echo "| GitHub compare status | \`diverged\` |"
+                echo
+                echo "Likely cause: force-push rewrote main's history, leaving the previous"
+                echo "\`:latest\` revision orphaned. Needs human review before \`:latest\` advances."
+              } >> "$GITHUB_STEP_SUMMARY"
+              exit 1
+              ;;
+            error|*)
+              echo "decision=error" >> "$GITHUB_OUTPUT"
+              {
+                echo "## ❌ Auto-promote aborted — ancestry-check API error"
+                echo
+                echo "\`gh api repos/${REPO}/compare/${CURRENT_REVISION}...${TARGET_SHA}\` returned unexpected status: \`$STATUS\`"
+                echo
+                echo "Manual dispatch with the target sha bypasses this check."
+              } >> "$GITHUB_STEP_SUMMARY"
+              exit 1
+              ;;
+          esac
+
      - name: Retag platform :staging-<sha> → :latest
        if: steps.gate.outputs.proceed == 'true'
        run: |
@@ -76,6 +76,27 @@ on:
 permissions:
  contents: write
  pull-requests: write
+  # actions: write is needed by the post-merge dispatch tail step
+  # (#2358 / #2357) — `gh workflow run publish-workspace-server-image.yml`
+  # POSTs to /actions/workflows/.../dispatches which requires this scope.
+  # Without it the call 403s and the publish/canary/redeploy chain still
+  # doesn't run on staging→main promotions, undoing #2358.
+  actions: write
+
+# Serialize auto-promote runs. Multiple staging gate completions can land
+# in quick succession (CI + E2E + CodeQL all finish within seconds of
+# each other on a green PR) — without this, two parallel runs both:
+#   1. Open / re-use the same promote PR.
+#   2. Both call `gh pr merge --auto` (idempotent — fine).
+#   3. Both poll for the same mergedAt and both `gh workflow run` publish
+#      → 2× redundant publish builds racing for the same `:staging-latest`
+#      retag, and 2× canary-verify chains.
+# cancel-in-progress: false because we don't want a brand-new run to kill
+# a polling-tail that's about to dispatch — the polling tail's 30 min cap
+# is the right backstop, not workflow-level cancel.
+concurrency:
+  group: auto-promote-staging
+  cancel-in-progress: false

 jobs:
  check-all-gates-green:
@@ -240,3 +261,124 @@ jobs:
            echo
            echo "Merge queue lands the PR once required gates are green; no human action needed unless gates fail."
          } >> "$GITHUB_STEP_SUMMARY"
+
+          # Hand the PR number to the next step so we can dispatch the
+          # tenant-redeploy chain after the merge queue lands the merge.
+          echo "promote_pr_num=${PR_NUM}" >> "$GITHUB_OUTPUT"
+        id: promote_pr
+
+      # Mint a short-lived GitHub App installation token for the dispatch
+      # step below. We CANNOT use `secrets.GITHUB_TOKEN` to dispatch the
+      # downstream publish chain — workflow runs created by GITHUB_TOKEN
+      # do not fire `workflow_run` triggers on completion (the
+      # documented "no recursion" rule —
+      # https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow).
+      #
+      # Symptom this caused (root-caused on 2026-04-30): publish-image
+      # ran successfully twice (21313dc 14:41Z, 59dec57 15:21Z) but
+      # canary-verify and redeploy-tenants-on-main never chained,
+      # because the publish run's `triggering_actor` was
+      # `github-actions[bot]` (i.e. GITHUB_TOKEN). A manual dispatch
+      # earlier in the day with the operator's PAT (d850ec7 06:52Z) did
+      # chain — same workflow file, only the actor differed.
+      #
+      # An App token's triggering_actor is the App user (e.g.
+      # `molecule-ai[bot]`), which IS allowed to fire downstream
+      # workflow_run cascades.
+      - name: Mint App token for downstream dispatch
+        if: steps.promote_pr.outputs.promote_pr_num != ''
+        id: app-token
+        uses: actions/create-github-app-token@1b10c78c7865c340bc4f6099eb2f838309f1e8c3 # v3.1.1
+        with:
+          app-id: ${{ secrets.MOLECULE_AI_APP_ID }}
+          private-key: ${{ secrets.MOLECULE_AI_APP_PRIVATE_KEY }}
+
+      - name: Wait for promote merge, then dispatch publish + redeploy (#2357)
+        # GITHUB_TOKEN-initiated merges suppress downstream `push` events
+        # (https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow).
+        # Result: when the merge queue lands the promote PR, the resulting
+        # main-branch push DOES NOT fire publish-workspace-server-image,
+        # so canary-verify and redeploy-tenants-on-main never run and
+        # tenants stay on stale code (issue #2357).
+        #
+        # Workaround: poll for the merge to land, then explicitly
+        # `gh workflow run` publish-workspace-server-image. The dispatch
+        # MUST authenticate as the molecule-ai App (App token minted
+        # above) — not GITHUB_TOKEN — so that the resulting publish
+        # run's completion event can fire the workflow_run cascade
+        # into canary-verify + redeploy-tenants-on-main. See the prior
+        # step's comment for the GITHUB_TOKEN no-recursion details.
+        #
+        # Long-term fix: switch the auto-merge call above to use the
+        # same App token, so the merge's push event fires
+        # publish-workspace-server-image naturally and this polling tail
+        # becomes unnecessary. Tracked in #2357.
+        if: steps.promote_pr.outputs.promote_pr_num != ''
+        env:
+          GH_TOKEN: ${{ steps.app-token.outputs.token }}
+          REPO: ${{ github.repository }}
+          PR_NUM: ${{ steps.promote_pr.outputs.promote_pr_num }}
+        run: |
+          # Poll for merge — max 30 min (60 × 30s). The merge queue
+          # typically lands within 5-10 min when gates are green. Break
+          # early if the PR is closed without merging (operator action,
+          # gates flipped red post-approval, branch-protection rejection)
+          # so we don't tie up a runner for the full 30 min on a dead PR.
+          MERGED=""
+          STATE=""
+          for _ in $(seq 1 60); do
+            VIEW=$(gh pr view "$PR_NUM" --repo "$REPO" --json mergedAt,state)
+            MERGED=$(echo "$VIEW" | jq -r '.mergedAt // ""')
+            STATE=$(echo "$VIEW" | jq -r '.state // ""')
+            if [ -n "$MERGED" ] && [ "$MERGED" != "null" ]; then
+              echo "::notice::Promote PR #${PR_NUM} merged at ${MERGED}"
+              break
+            fi
+            if [ "$STATE" = "CLOSED" ]; then
+              echo "::warning::Promote PR #${PR_NUM} was closed without merging — skipping deploy dispatch."
+              exit 0
+            fi
+            sleep 30
+          done
+
+          if [ -z "$MERGED" ] || [ "$MERGED" = "null" ]; then
+            echo "::warning::Promote PR #${PR_NUM} didn't merge within 30min — skipping deploy dispatch (manually run \`gh workflow run publish-workspace-server-image.yml --ref main\` once it lands)."
+            exit 0
+          fi
+
+          # Dispatch publish on main using the App token. App-initiated
+          # workflow_dispatch DOES propagate the workflow_run cascade,
+          # unlike GITHUB_TOKEN-initiated dispatch.
+          # publish completes → canary-verify chains via workflow_run →
+          # redeploy-tenants-on-main chains via workflow_run + branches:[main].
+          if gh workflow run publish-workspace-server-image.yml \
+              --repo "$REPO" --ref main 2>&1; then
+            echo "::notice::Dispatched publish-workspace-server-image on ref=main as molecule-ai App — canary-verify and redeploy-tenants-on-main will chain via workflow_run."
+            {
+              echo "## 🚀 Tenant redeploy chain dispatched"
+              echo
+              echo "- publish-workspace-server-image (workflow_dispatch on \`main\`, actor: \`molecule-ai[bot]\`)"
+              echo "- canary-verify will chain on completion"
+              echo "- redeploy-tenants-on-main will chain on canary green"
+            } >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "::error::Failed to dispatch publish-workspace-server-image. Run manually: gh workflow run publish-workspace-server-image.yml --ref main"
+          fi
+
+          # ALSO dispatch auto-sync-main-to-staging.yml. Same root cause as
+          # publish above (issue #2357): the merge-queue-initiated push to
+          # main is by GITHUB_TOKEN → no `on: push` triggers fire downstream.
+          # Without this dispatch, every staging→main promote leaves staging
+          # one merge commit BEHIND main, which silently dead-locks the NEXT
+          # promote PR as `mergeStateStatus: BEHIND` because main's
+          # branch-protection has `strict: true`. Verified empirically on
+          # 2026-05-02 against PR #2442 (Phase 2 promote): only the explicit
+          # publish-workspace-server-image dispatch fired on the previous
+          # promote SHA 76c604fb, while auto-sync silently no-op'd, leaving
+          # staging behind for ~24h until manually bridged.
+          if gh workflow run auto-sync-main-to-staging.yml \
+              --repo "$REPO" --ref main 2>&1; then
+            echo "::notice::Dispatched auto-sync-main-to-staging on ref=main as molecule-ai App — staging will absorb the new main merge commit via PR + merge queue."
+          else
+            echo "::error::Failed to dispatch auto-sync-main-to-staging. Run manually: gh workflow run auto-sync-main-to-staging.yml --ref main"
+          fi
@@ -60,6 +60,24 @@ name: Auto-sync main → staging
 on:
  push:
    branches: [main]
+  # workflow_dispatch lets:
+  #   1. Operators manually backfill a missed sync (e.g. after a manual
+  #      UI merge that the runner missed).
+  #   2. auto-promote-staging.yml's polling tail explicitly invoke us
+  #      after the promote PR lands. This is load-bearing: when the
+  #      merge queue lands a promote-PR merge, the resulting push to
+  #      `main` is "by GITHUB_TOKEN", and per GitHub's no-recursion
+  #      rule (https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow)
+  #      that push event does NOT fire any downstream workflows. The
+  #      `on: push` trigger above is silently dead for the very pattern
+  #      we exist to handle. Verified empirically 2026-05-02 against
+  #      SHA 76c604fb (PR #2437 staging→main): only ONE workflow fired
+  #      (publish-workspace-server-image, dispatched explicitly by
+  #      auto-promote's polling tail with an App token). Every other
+  #      `on: push: branches: [main]` workflow — including this one —
+  #      was suppressed. Until the underlying merge call moves to an
+  #      App token, an explicit dispatch is the only reliable path.
+  workflow_dispatch:

 permissions:
  contents: write
@@ -71,8 +89,14 @@ concurrency:

 jobs:
  sync-staging:
-    # Self-hosted Mac mini matches the rest of this repo's workflows.
-    runs-on: [self-hosted, macos, arm64]
+    # ubuntu-latest matches every other workflow in this repo. The
+    # earlier `[self-hosted, macos, arm64]` was a copy-paste artefact
+    # from the molecule-controlplane repo (which IS private and uses a
+    # Mac runner) — molecule-core has no Mac runner registered, so the
+    # job sat unassigned whenever the trigger fired. Verified 2026-05-02:
+    # this is the ONLY workflow in molecule-core/.github/workflows/ with
+    # a non-ubuntu runs-on.
+    runs-on: ubuntu-latest
    steps:
      - name: Checkout staging
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
@@ -0,0 +1,58 @@
+name: Check migration collisions
+
+# Hard gate (#2341): fails a PR that adds a migration prefix already
+# claimed by the base branch or another open PR. Caught manually 2026-04-30
+# during PR #2276 rebase: 044_runtime_image_pins collided with
+# 044_platform_inbound_secret from RFC #2312. This workflow makes that
+# check automatic.
+#
+# Trigger model: pull_request only — there's no value running this on
+# pushes to staging or main (those are post-merge; the gate must fire
+# pre-merge to be useful). Path filter scopes to PRs that actually touch
+# migrations.
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths:
+      - 'workspace-server/migrations/**'
+      - 'scripts/ops/check_migration_collisions.py'
+      - '.github/workflows/check-migration-collisions.yml'
+
+permissions:
+  contents: read
+  # gh pr list/diff need read access to other PRs
+  pull-requests: read
+
+jobs:
+  check:
+    name: Migration version collision check
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+        with:
+          # Need history to diff against base ref
+          fetch-depth: 0
+
+      - name: Detect collisions
+        env:
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          BASE_REF: origin/${{ github.event.pull_request.base.ref }}
+          HEAD_REF: ${{ github.event.pull_request.head.sha }}
+          GITHUB_REPOSITORY: ${{ github.repository }}
+          # gh CLI uses GH_TOKEN from env
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          # Ensure the named base ref exists locally. checkout@v4 with
+          # fetch-depth=0 pulls full history, but the explicit fetch is
+          # cheap insurance against form-of-ref differences across runs.
+          #
+          # IMPORTANT: do NOT pass --depth=1 here. The script below uses
+          # `git diff origin/<base>...<head>` (three-dot, merge-base form),
+          # which fails with "fatal: no merge base" if the base ref is
+          # shallow. The auto-promote staging→main PR (#2361) was blocked
+          # by exactly this for ~5h on 2026-04-30 — the depth=1 fetch
+          # overwrote checkout@v4's full-history clone with a shallow tip.
+          git fetch origin "${{ github.event.pull_request.base.ref }}" || true
+          python3 scripts/ops/check_migration_collisions.py
@@ -63,29 +63,42 @@ jobs:
          echo "python=$(echo "$DIFF" | grep -qE '^workspace/|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"
          echo "scripts=$(echo "$DIFF" | grep -qE '^tests/e2e/|^scripts/|^infra/scripts/|^\.github/workflows/ci\.yml$' && echo true || echo false)" >> "$GITHUB_OUTPUT"

+  # Platform (Go) is a required check on staging. Always-run + per-step
+  # gating (see Canvas (Next.js) for the rationale and the failure mode
+  # this avoids).
  platform-build:
    name: Platform (Go)
    needs: changes
-    if: needs.changes.outputs.platform == 'true'
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: workspace-server
    steps:
-      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
-      - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
+      - if: needs.changes.outputs.platform != 'true'
+        working-directory: .
+        run: echo "No platform/** changes — skipping real build steps; this job always runs to satisfy the required-check name on branch protection."
+      - if: needs.changes.outputs.platform == 'true'
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+      - if: needs.changes.outputs.platform == 'true'
+        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
        with:
          go-version: 'stable'
-      - run: go mod download
-      - run: go build ./cmd/server
+      - if: needs.changes.outputs.platform == 'true'
+        run: go mod download
+      - if: needs.changes.outputs.platform == 'true'
+        run: go build ./cmd/server
      # CLI (molecli) moved to standalone repo: github.com/Molecule-AI/molecule-cli
-      - run: go vet ./... || true
-      - name: Run golangci-lint
+      - if: needs.changes.outputs.platform == 'true'
+        run: go vet ./... || true
+      - if: needs.changes.outputs.platform == 'true'
+        name: Run golangci-lint
        run: golangci-lint run --timeout 3m ./... || true
-      - name: Run tests with race detection and coverage
+      - if: needs.changes.outputs.platform == 'true'
+        name: Run tests with race detection and coverage
        run: go test -race -coverprofile=coverage.out ./...

-      - name: Per-file coverage report
+      - if: needs.changes.outputs.platform == 'true'
+        name: Per-file coverage report
        # Advisory — lists every source file with its coverage so reviewers
        # can see at-a-glance where gaps are. Sorted ascending so the worst
        # offenders float to the top. Does NOT fail the build; the hard
@@ -98,7 +111,8 @@ jobs:
                   END {for (f in s) printf "%6.1f%%  %s\n", s[f]/c[f], f}' \
            | sort -n

-      - name: Check coverage thresholds
+      - if: needs.changes.outputs.platform == 'true'
+        name: Check coverage thresholds
        # Enforces two gates from #1823 Layer 1:
        #   1. Total floor (25% — ratchet plan in COVERAGE_FLOOR.md).
        #   2. Per-file floor — non-test .go files in security-critical
@@ -178,23 +192,55 @@ jobs:
            exit 1
          fi

+  # Canvas (Next.js) — required check, always runs. See platform-build
+  # comment above for the rationale.
+  #
+  # Supersedes the canvas-build-noop pattern attempted in PR #2321: two
+  # jobs sharing `name:` doesn't actually satisfy branch protection
+  # because the SKIPPED check run sibling is treated as not-passed
+  # regardless of how many SUCCESS siblings it has. Verified empirically
+  # on PR #2314 — mergeStateStatus stayed BLOCKED until I collapsed to
+  # a single-job-with-conditional-steps shape.
  canvas-build:
    name: Canvas (Next.js)
    needs: changes
-    if: needs.changes.outputs.canvas == 'true'
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: canvas
    steps:
-      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
-      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+      - if: needs.changes.outputs.canvas != 'true'
+        working-directory: .
+        run: echo "No canvas/** changes — skipping real build steps; this job always runs to satisfy the required-check name on branch protection."
+      - if: needs.changes.outputs.canvas == 'true'
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+      - if: needs.changes.outputs.canvas == 'true'
+        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: '22'
-      - run: rm -f package-lock.json && npm install
-      - run: npm run build
-      - name: Run tests
-        run: npx vitest run
+      - if: needs.changes.outputs.canvas == 'true'
+        run: rm -f package-lock.json && npm install
+      - if: needs.changes.outputs.canvas == 'true'
+        run: npm run build
+      - if: needs.changes.outputs.canvas == 'true'
+        name: Run tests with coverage
+        # Coverage instrumentation is configured in canvas/vitest.config.ts
+        # (provider: v8, reporters: text + html + json-summary). Step 2 of
+        # #1815 — wires coverage into CI so we get a baseline visible on
+        # every PR. No threshold gate yet; thresholds dial in (Step 3, also
+        # tracked in #1815) after the team sees what current coverage is.
+        # Per the inline comment in vitest.config.ts: "first land
+        # observability so we can see the baseline, then dial in
+        # thresholds + a hard gate" — this PR ships the observability half.
+        run: npx vitest run --coverage
+      - name: Upload coverage summary as artifact
+        if: needs.changes.outputs.canvas == 'true' && always()
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: canvas-coverage-${{ github.run_id }}
+          path: canvas/coverage/
+          retention-days: 7
+          if-no-files-found: warn

  # MCP Server + SDK removed from CI — now in standalone repos:
  # - github.com/Molecule-AI/molecule-mcp-server (npm CI)
@@ -204,14 +250,19 @@ jobs:
  # It now has workflow-level concurrency (cancel-in-progress: false) so
  # new pushes queue the E2E run rather than cancelling it at the run level.

+  # Shellcheck (E2E scripts) — required check, always runs. See
+  # platform-build for the rationale.
  shellcheck:
    name: Shellcheck (E2E scripts)
    needs: changes
-    if: needs.changes.outputs.scripts == 'true'
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
-      - name: Run shellcheck on tests/e2e/*.sh and infra/scripts/*.sh
+      - if: needs.changes.outputs.scripts != 'true'
+        run: echo "No tests/e2e/ or infra/scripts/ changes — skipping real shellcheck; this job always runs to satisfy the required-check name on branch protection."
+      - if: needs.changes.outputs.scripts == 'true'
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+      - if: needs.changes.outputs.scripts == 'true'
+        name: Run shellcheck on tests/e2e/*.sh and infra/scripts/*.sh
        # shellcheck is pre-installed on ubuntu-latest runners (via apt).
        # infra/scripts/ is included because setup.sh + nuke.sh gate the
        # README quickstart — a shellcheck regression there silently breaks
@@ -265,10 +316,11 @@ jobs:
            "repos/${{ github.repository }}/commits/${{ github.sha }}/comments" \
            --field "body=@/tmp/deploy-reminder.md"

+  # Python Lint & Test — required check, always runs. See platform-build
+  # for the rationale.
  python-lint:
    name: Python Lint & Test
    needs: changes
-    if: needs.changes.outputs.python == 'true'
    runs-on: ubuntu-latest
    env:
      WORKSPACE_ID: test
@@ -276,16 +328,23 @@ jobs:
      run:
        working-directory: workspace
    steps:
-      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+      - if: needs.changes.outputs.python != 'true'
+        working-directory: .
+        run: echo "No workspace/** changes — skipping real lint+test; this job always runs to satisfy the required-check name on branch protection."
+      - if: needs.changes.outputs.python == 'true'
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+      - if: needs.changes.outputs.python == 'true'
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: '3.11'
          cache: pip
          cache-dependency-path: workspace/requirements.txt
-      - run: pip install -r requirements.txt pytest pytest-asyncio pytest-cov
+      - if: needs.changes.outputs.python == 'true'
+        run: pip install -r requirements.txt pytest pytest-asyncio pytest-cov
      # Coverage flags + fail-under floor moved into workspace/pytest.ini
      # (issue #1817) so local `pytest` and CI use identical config.
-      - run: python -m pytest --tb=short
+      - if: needs.changes.outputs.python == 'true'
+        run: python -m pytest --tb=short

      # SDK + plugin validation moved to standalone repo:
      # github.com/Molecule-AI/molecule-sdk-python
@@ -0,0 +1,160 @@
+name: Continuous synthetic E2E (staging)
+
+# Hard gate (#2342): cron-driven full-lifecycle E2E that catches
+# regressions visible only at runtime — schema drift, deployment-pipeline
+# gaps, vendor outages, env-var rotations, DNS / CF / Railway side-effects.
+#
+# Why this gate exists:
+#   PR-time CI catches code-level regressions but not deployment-time or
+#   integration-time ones. Today's empirical data:
+#     • #2345 (A2A v0.2 silent drop) — passed all unit tests, broke at
+#       JSON-RPC parse layer between sender and receiver. Visible only
+#       to a sender exercising the full path.
+#     • RFC #2312 chat upload — landed on staging-branch but never
+#       reached staging tenants because publish-workspace-server-image
+#       was main-only. Caught by manual dogfooding hours after deploy.
+#   Both would have surfaced within 15-20 min of regression if a
+#   continuous synth-E2E was running.
+#
+# Cadence: every 20 min (3x/hour). The script is conservatively
+# bounded at 10 min wall-clock; even on degraded staging it should
+# finish before the next firing. cron-overlap is guarded by the
+# concurrency group below.
+#
+# Cost: ~3 runs/hour × 5-10 min × $0.008/min GHA = ~$0.50-$1/day.
+# Plus a fresh tenant provisioned + torn down each run (Railway +
+# AWS pennies). Negligible.
+#
+# Failure handling: when the run fails, the workflow exits non-zero
+# and GitHub's standard email/notification path fires. Operators
+# can subscribe to this workflow's failure channel for paging-grade
+# alerting.
+
+on:
+  schedule:
+    # Every 20 minutes, on the :00 :20 :40. Offsets the existing :15
+    # sweep-cf-orphans and :45 sweep-cf-tunnels so the three
+    # operations don't all hit Cloudflare/AWS at the same minute.
+    - cron: '0,20,40 * * * *'
+  workflow_dispatch:
+    inputs:
+      runtime:
+        description: "Runtime to provision (langgraph = fastest, default; hermes = slower but covers SDK-native path; claude-code = needs OAUTH token in tenant env)"
+        required: false
+        default: "langgraph"
+        type: string
+      keep_org:
+        description: "Skip teardown for post-mortem debugging (only manual dispatch — never set this for cron runs)"
+        required: false
+        default: false
+        type: boolean
+
+permissions:
+  contents: read
+  # No issue-write here — failures surface as red runs in the workflow
+  # history. If you want auto-issue-on-fail, add a follow-up step that
+  # uses gh issue create gated on `if: failure()`. Keeping the surface
+  # minimal until that's actually wanted.
+
+# Serialize so two firings can never overlap. Cron firing every 20 min
+# but scripts conservatively bounded at 10 min — overlap shouldn't
+# happen in steady state, but if a run hangs we don't want N more
+# stacking up.
+concurrency:
+  group: continuous-synth-e2e
+  cancel-in-progress: false
+
+jobs:
+  synth:
+    name: Synthetic E2E against staging
+    runs-on: ubuntu-latest
+    timeout-minutes: 12
+    env:
+      # langgraph default keeps cold-start under 5 min on staging EC2.
+      # hermes is slower (~7-10 min) and isn't needed for the
+      # regression class this gate exists to catch (deployment-pipeline
+      # + schema-drift + integration). Operators can pick hermes via
+      # workflow_dispatch when they need to exercise the SDK-native
+      # session path.
+      E2E_RUNTIME: ${{ github.event.inputs.runtime || 'langgraph' }}
+      # Bound to 10 min so a stuck provision fails the run instead of
+      # holding up the next cron firing. 15-min default in the script
+      # is for the on-PR full lifecycle where we have more headroom.
+      E2E_PROVISION_TIMEOUT_SECS: '600'
+      # Slug suffix — namespaced "synth-" so these runs are
+      # distinguishable from PR-driven runs in CP admin.
+      E2E_RUN_ID: synth-${{ github.run_id }}
+      # Forced false for cron; respected for manual dispatch
+      E2E_KEEP_ORG: ${{ github.event.inputs.keep_org == 'true' && '1' || '' }}
+      MOLECULE_CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
+      MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+
+      - name: Verify required secret present
+        run: |
+          # Schedule-vs-dispatch hardening (mirrors the sweep-cf-* and
+          # redeploy-tenants-on-* workflows): hard-fail on missing secret
+          # for cron firing so a misconfigured-repo doesn't silently
+          # report green while doing nothing. Soft-skip on operator
+          # dispatch — operators can dispatch ad-hoc to verify a fix
+          # without setting up the secret first.
+          if [ -z "${MOLECULE_ADMIN_TOKEN:-}" ]; then
+            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+              echo "::warning::CP_STAGING_ADMIN_API_TOKEN not set — synth E2E cannot run"
+              echo "::warning::Set it at Settings → Secrets and Variables → Actions"
+              exit 0
+            fi
+            echo "::error::CP_STAGING_ADMIN_API_TOKEN secret missing — synth E2E cannot run"
+            echo "::error::Set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
+            exit 1
+          fi
+
+      - name: Install required tools
+        run: |
+          # The script depends on jq + curl (already on ubuntu-latest)
+          # and python3 (likewise). Verify they're all present so we
+          # fail fast on a runner image regression rather than mid-script.
+          for cmd in jq curl python3; do
+            command -v "$cmd" >/dev/null 2>&1 || {
+              echo "::error::required tool '$cmd' not on PATH — runner image regression?"
+              exit 1
+            }
+          done
+
+      - name: Run synthetic E2E
+        # The script handles its own teardown via EXIT trap; even on
+        # failure (timeout, assertion), the org is deprovisioned and
+        # leaks are reported. Exit code propagates from the script.
+        run: |
+          bash tests/e2e/test_staging_full_saas.sh
+
+      - name: Failure summary
+        # Runs only on failure. Adds a job summary so the workflow run
+        # page shows a quick "what happened" instead of forcing readers
+        # to scroll through script output.
+        if: failure()
+        run: |
+          {
+            echo "## Continuous synth E2E failed"
+            echo ""
+            echo "**Run ID:** ${{ github.run_id }}"
+            echo "**Trigger:** ${{ github.event_name }}"
+            echo "**Runtime:** ${E2E_RUNTIME}"
+            echo "**Slug:** synth-${{ github.run_id }}"
+            echo ""
+            echo "### What this means"
+            echo ""
+            echo "Staging just regressed on a path that previously worked. Likely classes:"
+            echo "- Schema mismatch between sender and receiver (#2345 class)"
+            echo "- Deployment-pipeline gap (RFC #2312 / staging-tenant-image-stale class)"
+            echo "- Vendor outage (Cloudflare, Railway, AWS, GHCR)"
+            echo "- Staging-CP env var rotation"
+            echo ""
+            echo "### Next steps"
+            echo ""
+            echo "1. Check the script output above for the assertion that failed"
+            echo "2. If it's a vendor outage, no action needed — next firing in ~20 min"
+            echo "3. If it's a code regression, find the causing PR via \`git log\` against last green run and revert/fix"
+            echo "4. Keep an eye on the next 1-2 firings — flake vs persistent fail differs in priority"
+          } >> "$GITHUB_STEP_SUMMARY"
@@ -2,22 +2,16 @@ name: E2E API Smoke Test
 # Extracted from ci.yml so workflow-level concurrency can protect this job
 # from run-level cancellation (issue #458).
 #
-# Trigger model (changed 2026-04-28 — see auto-promote gap below):
+# Trigger model (revised 2026-04-29):
 #
-# This workflow always FIRES on push/pull_request to staging+main, but
-# only does real work when paths under `workspace-server/`,
-# `tests/e2e/`, or this workflow file changed. The detect-changes job
-# uses dorny/paths-filter to decide; the e2e-api job runs only if
-# changes match. Otherwise the no-op job emits success so the workflow
-# always produces a `completed/success` run record.
-#
-# Why: auto-promote-staging.yml's gate-check (line 99) treats "workflow
-# didn't run" as failure, which dead-locked any platform-only or
-# test-only push to staging that didn't touch workspace-server paths.
-# Dropping the path filter on the trigger and gating real work
-# internally guarantees the workflow always emits a result that the
-# auto-promote chain can read. Same pattern applied to
-# e2e-staging-canvas.yml in the same PR.
+# Always FIRES on push/pull_request to staging+main. Real work is gated
+# per-step on `needs.detect-changes.outputs.api` — when paths under
+# `workspace-server/`, `tests/e2e/`, or this workflow file haven't
+# changed, the no-op step alone runs and emits SUCCESS for the
+# `E2E API Smoke Test` check, satisfying branch protection without
+# spending CI cycles. See the in-job comment on the `e2e-api` job for
+# why this is one job (not two-jobs-sharing-name) and the 2026-04-29
+# PR #2264 incident that drove the consolidation.

 on:
  push:
@@ -66,27 +60,20 @@ jobs:
            echo "api=${{ steps.filter.outputs.api }}" >> "$GITHUB_OUTPUT"
          fi

-  # Same `name:` as the real job below so the check-run produced by the
-  # no-op path is indistinguishable from the real one for branch
-  # protection purposes. Without this, the real job was always skipped on
-  # paths-filtered commits → branch protection on `main` saw "E2E API
-  # Smoke Test" as a missing required check → auto-promote-staging's
-  # `git push origin main` got rejected with GH006. Observed 2026-04-28
-  # 00:22 UTC blocking the staging→main promote despite all gates
-  # actually passing at the workflow level.
-  no-op:
-    needs: detect-changes
-    if: needs.detect-changes.outputs.api != 'true'
-    name: E2E API Smoke Test
-    runs-on: ubuntu-latest
-    steps:
-      - run: |
-          echo "No workspace-server / tests/e2e / workflow changes — E2E API gate satisfied without running tests."
-          echo "::notice::E2E API Smoke Test no-op pass (paths filter excluded this commit)."
-
+  # ONE job (no job-level `if:`) that always runs and reports under the
+  # required-check name `E2E API Smoke Test`. Real work is gated per-step
+  # on `needs.detect-changes.outputs.api`. Reason: GitHub registers a
+  # check run for every job that matches `name:`, and a job-level
+  # `if: false` produces a SKIPPED check run. Branch protection treats
+  # all check runs with a matching context name on the latest commit as a
+  # SET — any SKIPPED in the set fails the required-check eval, even with
+  # SUCCESS siblings. Verified 2026-04-29 on PR #2264 (staging→main):
+  # 4 check runs (2 SKIPPED + 2 SUCCESS) at the head SHA blocked
+  # promotion despite all real work succeeding. Collapsing to a single
+  # always-running job with conditional steps emits exactly one SUCCESS
+  # check run regardless of paths filter — branch-protection-clean.
  e2e-api:
    needs: detect-changes
-    if: needs.detect-changes.outputs.api == 'true'
    name: E2E API Smoke Test
    runs-on: ubuntu-latest
    timeout-minutes: 15
@@ -97,13 +84,21 @@ jobs:
      PG_CONTAINER: molecule-ci-postgres
      REDIS_CONTAINER: molecule-ci-redis
    steps:
-      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
-      - uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
+      - name: No-op pass (paths filter excluded this commit)
+        if: needs.detect-changes.outputs.api != 'true'
+        run: |
+          echo "No workspace-server / tests/e2e / workflow changes — E2E API gate satisfied without running tests."
+          echo "::notice::E2E API Smoke Test no-op pass (paths filter excluded this commit)."
+      - if: needs.detect-changes.outputs.api == 'true'
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+      - if: needs.detect-changes.outputs.api == 'true'
+        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
        with:
          go-version: 'stable'
          cache: true
          cache-dependency-path: workspace-server/go.sum
      - name: Start Postgres (docker)
+        if: needs.detect-changes.outputs.api == 'true'
        run: |
          docker rm -f "$PG_CONTAINER" 2>/dev/null || true
          docker run -d --name "$PG_CONTAINER" -e POSTGRES_USER=dev -e POSTGRES_PASSWORD=dev -e POSTGRES_DB=molecule -p 15432:5432 postgres:16
@@ -118,6 +113,7 @@ jobs:
          docker logs "$PG_CONTAINER" || true
          exit 1
      - name: Start Redis (docker)
+        if: needs.detect-changes.outputs.api == 'true'
        run: |
          docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
          docker run -d --name "$REDIS_CONTAINER" -p 16379:6379 redis:7
@@ -132,14 +128,17 @@ jobs:
          docker logs "$REDIS_CONTAINER" || true
          exit 1
      - name: Build platform
+        if: needs.detect-changes.outputs.api == 'true'
        working-directory: workspace-server
        run: go build -o platform-server ./cmd/server
      - name: Start platform (background)
+        if: needs.detect-changes.outputs.api == 'true'
        working-directory: workspace-server
        run: |
          ./platform-server > platform.log 2>&1 &
          echo $! > platform.pid
      - name: Wait for /health
+        if: needs.detect-changes.outputs.api == 'true'
        run: |
          for i in $(seq 1 30); do
            if curl -sf http://localhost:8080/health > /dev/null; then
@@ -152,6 +151,7 @@ jobs:
          cat workspace-server/platform.log || true
          exit 1
      - name: Assert migrations applied
+        if: needs.detect-changes.outputs.api == 'true'
        run: |
          tables=$(docker exec "$PG_CONTAINER" psql -U dev -d molecule -tAc "SELECT count(*) FROM information_schema.tables WHERE table_schema='public' AND table_name='workspaces'")
          if [ "$tables" != "1" ]; then
@@ -161,25 +161,28 @@ jobs:
          fi
          echo "Migrations OK"
      - name: Run E2E API tests
+        if: needs.detect-changes.outputs.api == 'true'
        run: bash tests/e2e/test_api.sh
      - name: Run notify-with-attachments E2E
+        if: needs.detect-changes.outputs.api == 'true'
        run: bash tests/e2e/test_notify_attachments_e2e.sh
      - name: Run priority-runtimes E2E (claude-code + hermes — skips when keys absent)
-        # Validates the test script itself runs cleanly even with no LLM
-        # keys (both phases skip gracefully). The wire-real coverage with
-        # actual keys runs in canary-staging.yml + e2e-staging-saas.yml.
+        if: needs.detect-changes.outputs.api == 'true'
        run: bash tests/e2e/test_priority_runtimes_e2e.sh
+      - name: Run poll-mode + since_id cursor E2E (#2339)
+        if: needs.detect-changes.outputs.api == 'true'
+        run: bash tests/e2e/test_poll_mode_e2e.sh
      - name: Dump platform log on failure
-        if: failure()
+        if: failure() && needs.detect-changes.outputs.api == 'true'
        run: cat workspace-server/platform.log || true
      - name: Stop platform
-        if: always()
+        if: always() && needs.detect-changes.outputs.api == 'true'
        run: |
          if [ -f workspace-server/platform.pid ]; then
            kill "$(cat workspace-server/platform.pid)" 2>/dev/null || true
          fi
      - name: Stop service containers
-        if: always()
+        if: always() && needs.detect-changes.outputs.api == 'true'
        run: |
          docker rm -f "$PG_CONTAINER" 2>/dev/null || true
          docker rm -f "$REDIS_CONTAINER" 2>/dev/null || true
@@ -13,19 +13,14 @@ name: E2E Staging Canvas (Playwright)
 # workflow — mirrors what PR #1891 does for e2e-api.yml.

 on:
-  # Trigger model (changed 2026-04-28 — see auto-promote gap below):
+  # Trigger model (revised 2026-04-29):
  #
-  # Always fires on push/pull_request; only does real work when canvas/
-  # or this workflow file changed. The detect-changes job uses
-  # dorny/paths-filter to decide; the playwright job runs only if
-  # changes match. Otherwise no-op emits success so the workflow always
-  # produces a `completed/success` run record.
-  #
-  # Why: auto-promote-staging.yml's gate-check (line 99) treats
-  # "workflow didn't run" as failure, which dead-locked platform-only
-  # pushes to staging. Dropping the trigger path filter and gating real
-  # work internally guarantees a result the auto-promote chain can
-  # read. Same pattern applied to e2e-api.yml in the same PR.
+  # Always fires on push/pull_request; real work is gated per-step on
+  # `needs.detect-changes.outputs.canvas`. When canvas/ paths haven't
+  # changed, the no-op step alone runs and emits SUCCESS for the
+  # `Canvas tabs E2E` check, satisfying branch protection without
+  # spending CI cycles. See e2e-api.yml for the rationale on why this
+  # is a single job rather than two-jobs-sharing-name.
  push:
    branches: [main, staging]
  pull_request:
@@ -82,23 +77,14 @@ jobs:
            echo "canvas=${{ steps.filter.outputs.canvas }}" >> "$GITHUB_OUTPUT"
          fi

-  # Same `name:` as the playwright job below so the check-run is
-  # indistinguishable from the real one for branch protection. Mirrors
-  # the e2e-api.yml fix in the same PR — see that file for the
-  # 2026-04-28 incident reference.
-  no-op:
-    needs: detect-changes
-    if: needs.detect-changes.outputs.canvas != 'true'
-    name: Canvas tabs E2E
-    runs-on: ubuntu-latest
-    steps:
-      - run: |
-          echo "No canvas / workflow changes — E2E Staging Canvas gate satisfied without running tests."
-          echo "::notice::E2E Staging Canvas no-op pass (paths filter excluded this commit)."
-
+  # ONE job (no job-level `if:`) that always runs and reports under the
+  # required-check name `Canvas tabs E2E`. Real work is gated per-step on
+  # `needs.detect-changes.outputs.canvas`. See e2e-api.yml for the full
+  # rationale — same path-filter check-name parity issue blocked PR #2264
+  # (staging→main) on 2026-04-29 because branch protection treats matching-
+  # name check runs as a SET, and any SKIPPED member fails the eval.
  playwright:
    needs: detect-changes
-    if: needs.detect-changes.outputs.canvas == 'true'
    name: Canvas tabs E2E
    runs-on: ubuntu-latest
    timeout-minutes: 40
@@ -113,9 +99,18 @@ jobs:
        working-directory: canvas

    steps:
-      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+      - name: No-op pass (paths filter excluded this commit)
+        if: needs.detect-changes.outputs.canvas != 'true'
+        working-directory: .
+        run: |
+          echo "No canvas / workflow changes — E2E Staging Canvas gate satisfied without running tests."
+          echo "::notice::E2E Staging Canvas no-op pass (paths filter excluded this commit)."
+
+      - if: needs.detect-changes.outputs.canvas == 'true'
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4

      - name: Verify admin token present
+        if: needs.detect-changes.outputs.canvas == 'true'
        run: |
          if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
            echo "::error::Missing MOLECULE_STAGING_ADMIN_TOKEN"
@@ -123,6 +118,7 @@ jobs:
          fi

      - name: Set up Node
+        if: needs.detect-changes.outputs.canvas == 'true'
        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: '20'
@@ -130,16 +126,19 @@ jobs:
          cache-dependency-path: canvas/package-lock.json

      - name: Install canvas deps
+        if: needs.detect-changes.outputs.canvas == 'true'
        run: npm ci

      - name: Install Playwright browsers
+        if: needs.detect-changes.outputs.canvas == 'true'
        run: npx playwright install --with-deps chromium

      - name: Run staging canvas E2E
+        if: needs.detect-changes.outputs.canvas == 'true'
        run: npx playwright test --config=playwright.staging.config.ts

      - name: Upload Playwright report on failure
-        if: failure()
+        if: failure() && needs.detect-changes.outputs.canvas == 'true'
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
          name: playwright-report-staging
@@ -147,50 +146,46 @@ jobs:
          retention-days: 14

      - name: Upload screenshots on failure
-        if: failure()
+        if: failure() && needs.detect-changes.outputs.canvas == 'true'
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
          name: playwright-screenshots
          path: canvas/test-results/
          retention-days: 14

-      # Safety-net teardown mirrors the bash-harness workflow — if
-      # globalTeardown didn't run (worker crash, runner cancel), this
-      # step sweeps any e2e-canvas-* org tagged with today's date.
+      # Safety-net teardown — fires only when Playwright's globalTeardown
+      # didn't (worker crash, runner cancel). Reads the slug from
+      # canvas/.playwright-staging-state.json (written by staging-setup
+      # as its first action, before any CP call) and deletes only that
+      # slug.
+      #
+      # Earlier versions of this step pattern-swept `e2e-canvas-<today>-*`
+      # orgs to compensate for setup-crash-before-state-file-write. That
+      # over-aggressive cleanup raced concurrent canvas-E2E runs and
+      # poisoned each other's tenants — observed 2026-04-30 when three
+      # real-test runs killed each other mid-test, surfacing as
+      # `getaddrinfo ENOTFOUND` once CP had cleaned up the just-deleted
+      # DNS record. Pattern-sweep removed; setup now writes the state
+      # file before any CP work, so the slug is always recoverable.
      - name: Teardown safety net
-        if: always()
+        if: always() && needs.detect-changes.outputs.canvas == 'true'
        env:
          ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
        run: |
          set +e
-          # Midnight-UTC rollover guard: a single-date filter misses
-          # orgs created on the prior UTC day when the run crosses
-          # midnight (incident 2026-04-26 23:46Z → 2026-04-27 00:12Z:
-          # slug `e2e-canvas-20260426-1u8nz3` survived because the
-          # safety-net step ran on the 27th, computed `today=20260427`,
-          # and the filter `e2e-canvas-20260427-` never matched). Sweep
-          # both today AND yesterday's dates so a cross-midnight run
-          # still cleans up its own slug.
-          orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
-            -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
-            | python3 -c "
-          import json, sys, datetime
-          d = json.load(sys.stdin)
-          today = datetime.date.today()
-          yesterday = today - datetime.timedelta(days=1)
-          prefixes = (
-              f'e2e-canvas-{today.strftime(\"%Y%m%d\")}-',
-              f'e2e-canvas-{yesterday.strftime(\"%Y%m%d\")}-',
-          )
-          candidates = [o['slug'] for o in d.get('orgs', [])
-                        if any(o.get('slug','').startswith(p) for p in prefixes)
-                        and o.get('status') not in ('purged',)]
-          print('\n'.join(candidates))
-          " 2>/dev/null)
-          for slug in $orgs; do
-            curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
-              -H "Authorization: Bearer $ADMIN_TOKEN" \
-              -H "Content-Type: application/json" \
-              -d "{\"confirm\":\"$slug\"}" >/dev/null || true
-          done
+          STATE_FILE=".playwright-staging-state.json"
+          if [ ! -f "$STATE_FILE" ]; then
+            echo "::notice::No state file at canvas/$STATE_FILE — Playwright globalTeardown handled it (or setup never ran)."
+            exit 0
+          fi
+          slug=$(python3 -c "import json; print(json.load(open('$STATE_FILE')).get('slug',''))")
+          if [ -z "$slug" ]; then
+            echo "::warning::State file present but slug missing; nothing to clean up."
+            exit 0
+          fi
+          echo "Deleting orphan tenant: $slug"
+          curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
+            -H "Authorization: Bearer $ADMIN_TOKEN" \
+            -H "Content-Type: application/json" \
+            -d "{\"confirm\":\"$slug\"}" >/dev/null || true
          exit 0
@@ -0,0 +1,164 @@
+name: E2E Staging External Runtime
+
+# Regression for the four/five workspaces.status=awaiting_agent transitions
+# that silently failed in production for five days before migration 046
+# extended the workspace_status enum (see
+# workspace-server/migrations/046_workspace_status_awaiting_agent.up.sql).
+#
+# Why this is its own workflow (not folded into e2e-staging-saas.yml):
+#   - The full-saas harness defaults to runtime=hermes, never exercises
+#     external-runtime. Adding an `external` parameter to that script
+#     would force every push to staging through both lifecycles in
+#     series, doubling the EC2 cold-start budget.
+#   - The external lifecycle has unique timing (REMOTE_LIVENESS_STALE_AFTER
+#     window, 90s default + sweep interval), which we wait through
+#     deliberately. Folding it into hermes would make the long path
+#     even longer.
+#   - It can run in parallel with the hermes E2E since both create
+#     fresh tenant orgs with distinct slug prefixes (`e2e-ext-...` vs
+#     `e2e-...`).
+#
+# Triggers:
+#   - Push to staging when any source affecting external runtime,
+#     hibernation, or the migration set changes.
+#   - PR review for the same set.
+#   - Manual workflow_dispatch.
+#   - Daily cron at 07:30 UTC (catches drift on quiet days; staggered
+#     30 min after e2e-staging-saas.yml's 07:00 UTC cron).
+#
+# Concurrency: serialized so two staging pushes don't fight for the
+# same EC2 quota window. cancel-in-progress=false so a half-rolled
+# tenant always finishes its teardown.
+
+on:
+  push:
+    branches: [staging, main]
+    paths:
+      - 'workspace-server/internal/handlers/workspace.go'
+      - 'workspace-server/internal/handlers/registry.go'
+      - 'workspace-server/internal/handlers/workspace_restart.go'
+      - 'workspace-server/internal/registry/healthsweep.go'
+      - 'workspace-server/internal/registry/liveness.go'
+      - 'workspace-server/migrations/**'
+      - 'workspace-server/internal/db/workspace_status_enum_drift_test.go'
+      - 'tests/e2e/test_staging_external_runtime.sh'
+      - '.github/workflows/e2e-staging-external.yml'
+  pull_request:
+    branches: [staging, main]
+    paths:
+      - 'workspace-server/internal/handlers/workspace.go'
+      - 'workspace-server/internal/handlers/registry.go'
+      - 'workspace-server/internal/handlers/workspace_restart.go'
+      - 'workspace-server/internal/registry/healthsweep.go'
+      - 'workspace-server/internal/registry/liveness.go'
+      - 'workspace-server/migrations/**'
+      - 'workspace-server/internal/db/workspace_status_enum_drift_test.go'
+      - 'tests/e2e/test_staging_external_runtime.sh'
+      - '.github/workflows/e2e-staging-external.yml'
+  workflow_dispatch:
+    inputs:
+      keep_org:
+        description: "Skip teardown for debugging (only via manual dispatch)"
+        required: false
+        type: boolean
+        default: false
+      stale_wait_secs:
+        description: "Seconds to wait for the heartbeat-staleness sweep (default 180 = 90s window + 90s buffer)"
+        required: false
+        default: "180"
+  schedule:
+    - cron: '30 7 * * *'
+
+concurrency:
+  group: e2e-staging-external
+  cancel-in-progress: false
+
+permissions:
+  contents: read
+
+jobs:
+  e2e-staging-external:
+    name: E2E Staging External Runtime
+    runs-on: ubuntu-latest
+    timeout-minutes: 25
+
+    env:
+      MOLECULE_CP_URL: https://staging-api.moleculesai.app
+      MOLECULE_ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
+      E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
+      E2E_KEEP_ORG: ${{ github.event.inputs.keep_org && '1' || '0' }}
+      E2E_STALE_WAIT_SECS: ${{ github.event.inputs.stale_wait_secs || '180' }}
+
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+
+      - name: Verify admin token present
+        run: |
+          if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
+            # Schedule + push triggers must hard-fail when the token is
+            # missing — silent skip would mask infra rot. Manual dispatch
+            # gets the same hard-fail; an operator running this on a fork
+            # without secrets configured needs to know up-front.
+            echo "::error::MOLECULE_STAGING_ADMIN_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
+            exit 2
+          fi
+          echo "Admin token present ✓"
+
+      - name: CP staging health preflight
+        run: |
+          code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health")
+          if [ "$code" != "200" ]; then
+            echo "::error::Staging CP unhealthy (got HTTP $code). Skipping — not a workspace bug."
+            exit 1
+          fi
+          echo "Staging CP healthy ✓"
+
+      - name: Run external-runtime E2E
+        id: e2e
+        run: bash tests/e2e/test_staging_external_runtime.sh
+
+      # Mirror the e2e-staging-saas.yml safety net: if the runner is
+      # cancelled (e.g. concurrent staging push), the test script's
+      # EXIT trap may not fire, so we sweep e2e-ext-* slugs scoped to
+      # *this* run id.
+      - name: Teardown safety net (runs on cancel/failure)
+        if: always()
+        env:
+          ADMIN_TOKEN: ${{ secrets.MOLECULE_STAGING_ADMIN_TOKEN }}
+        run: |
+          set +e
+          orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs" \
+            -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
+            | python3 -c "
+          import json, sys, os, datetime
+          run_id = os.environ.get('GITHUB_RUN_ID', '')
+          d = json.load(sys.stdin)
+          # Scope STRICTLY to this run id (e2e-ext-YYYYMMDD-<runid>-...)
+          # so concurrent runs and unrelated dev probes are not touched.
+          # Sweep today AND yesterday so a midnight-crossing run still
+          # cleans up its own slug.
+          today = datetime.date.today()
+          yesterday = today - datetime.timedelta(days=1)
+          dates = (today.strftime('%Y%m%d'), yesterday.strftime('%Y%m%d'))
+          if not run_id:
+              # Without a run id we cannot scope safely; bail rather
+              # than risk deleting unrelated tenants.
+              sys.exit(0)
+          prefixes = tuple(f'e2e-ext-{d}-{run_id}-' for d in dates)
+          for o in d.get('orgs', []):
+              s = o.get('slug', '')
+              if s.startswith(prefixes) and o.get('status') != 'purged':
+                  print(s)
+          " 2>/dev/null)
+          if [ -n "$orgs" ]; then
+            echo "Safety-net sweep: deleting leftover orgs:"
+            echo "$orgs"
+            for slug in $orgs; do
+              curl -sS -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
+                -H "Authorization: Bearer $ADMIN_TOKEN" \
+                -H "Content-Type: application/json" \
+                -d "{\"confirm\":\"$slug\"}" >/dev/null 2>&1
+            done
+          else
+            echo "Safety-net sweep: no leftover orgs to clean."
+          fi
@@ -0,0 +1,170 @@
+name: Harness Replays
+
+# Boots tests/harness (production-shape compose topology with TenantGuard,
+# /cp/* proxy, canvas proxy, real production Dockerfile.tenant) and runs
+# every replay under tests/harness/replays/. Fails the PR if any replay
+# fails.
+#
+# Why this exists: 2026-04-30 we shipped #2398 which added /buildinfo as
+# a public route in router.go but forgot to add it to TenantGuard's
+# allowlist. The handler-level test in buildinfo_test.go constructed a
+# minimal gin engine without TenantGuard — green. The harness's
+# buildinfo-stale-image.sh replay would have caught it (cf-proxy doesn't
+# inject X-Molecule-Org-Id, so the curl path is identical to production's
+# redeploy verifier), but no one ran the harness pre-merge. The bug
+# shipped; the redeploy verifier silently soft-warned every tenant as
+# "unreachable" for ~1 day before being noticed.
+#
+# This gate makes "did you actually run the harness?" a CI invariant
+# instead of a memory-discipline thing.
+#
+# Trigger model — match e2e-api.yml: always FIRES on push/pull_request
+# to staging+main, real work is gated per-step on detect-changes output.
+# One job → one check run → branch-protection-clean (the SKIPPED-in-set
+# trap from PR #2264 is documented in e2e-api.yml's e2e-api job comment).
+
+on:
+  push:
+    branches: [main, staging]
+    paths:
+      - 'workspace-server/**'
+      - 'canvas/**'
+      - 'tests/harness/**'
+      - '.github/workflows/harness-replays.yml'
+  pull_request:
+    branches: [main, staging]
+    paths:
+      - 'workspace-server/**'
+      - 'canvas/**'
+      - 'tests/harness/**'
+      - '.github/workflows/harness-replays.yml'
+  workflow_dispatch:
+  merge_group:
+    types: [checks_requested]
+
+concurrency:
+  # Per-SHA grouping. Per-ref kept hitting the auto-promote-staging
+  # cancellation deadlock — see e2e-api.yml's concurrency block for
+  # the 2026-04-28 incident that codified this pattern.
+  group: harness-replays-${{ github.event.pull_request.head.sha || github.sha }}
+  cancel-in-progress: false
+
+jobs:
+  detect-changes:
+    runs-on: ubuntu-latest
+    outputs:
+      run: ${{ steps.decide.outputs.run }}
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+      - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1
+        id: filter
+        with:
+          filters: |
+            run:
+              - 'workspace-server/**'
+              - 'canvas/**'
+              - 'tests/harness/**'
+              - '.github/workflows/harness-replays.yml'
+      - id: decide
+        run: |
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            echo "run=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "run=${{ steps.filter.outputs.run }}" >> "$GITHUB_OUTPUT"
+          fi
+
+  # ONE job that always runs. Real work is gated per-step on
+  # detect-changes.outputs.run so an unrelated PR (e.g. doc-only
+  # change to molecule-controlplane wired here later) emits the
+  # required check without spending CI cycles. Single-job pattern
+  # matches e2e-api.yml — see that workflow's comment for why a
+  # job-level `if: false` would block branch protection via the
+  # SKIPPED-in-set bug.
+  harness-replays:
+    needs: detect-changes
+    name: Harness Replays
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    steps:
+      - name: No-op pass (paths filter excluded this commit)
+        if: needs.detect-changes.outputs.run != 'true'
+        run: |
+          echo "No workspace-server / canvas / tests/harness / workflow changes — Harness Replays gate satisfied without running."
+          echo "::notice::Harness Replays no-op pass (paths filter excluded this commit)."
+
+      - if: needs.detect-changes.outputs.run == 'true'
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+
+      - name: Checkout sibling plugin repo
+        # Dockerfile.tenant copies molecule-ai-plugin-github-app-auth/
+        # at the build-context root (see workspace-server/Dockerfile.tenant
+        # line 19). PLUGIN_REPO_PAT pattern matches publish-workspace-server-image.yml.
+        if: needs.detect-changes.outputs.run == 'true'
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+        with:
+          repository: Molecule-AI/molecule-ai-plugin-github-app-auth
+          path: molecule-ai-plugin-github-app-auth
+          token: ${{ secrets.PLUGIN_REPO_PAT || secrets.GITHUB_TOKEN }}
+
+      - name: Install Python deps for replays
+        # peer-discovery-404 (and future replays) eval Python against the
+        # running tenant — importing workspace/a2a_client.py pulls in
+        # httpx. tests/harness/requirements.txt holds just the HTTP-client
+        # surface to keep CI install fast (~3s) vs the full
+        # workspace/requirements.txt (~30s).
+        if: needs.detect-changes.outputs.run == 'true'
+        run: pip install -r tests/harness/requirements.txt
+
+      - name: Run all replays against the harness
+        # run-all-replays.sh: boot via up.sh → seed via seed.sh → run
+        # every replays/*.sh → tear down via down.sh on EXIT (trap).
+        # Non-zero exit on any replay failure.
+        #
+        # KEEP_UP=1: without this, the script's trap-on-EXIT tears
+        # down containers immediately on failure, leaving the dump
+        # step below with nothing to dump (verified on PR #2410's
+        # first run — tenant became unhealthy, trap fired, dump
+        # step saw empty containers). Keeping them up lets the
+        # failure path collect tenant/cp-stub/cf-proxy logs. The
+        # always-run "Force teardown" step does the actual cleanup.
+        if: needs.detect-changes.outputs.run == 'true'
+        working-directory: tests/harness
+        env:
+          KEEP_UP: "1"
+        run: ./run-all-replays.sh
+
+      - name: Dump compose logs on failure
+        # SECRETS_ENCRYPTION_KEY: docker compose validates the entire compose
+        # file even for read-only `logs` calls. up.sh generates a per-run key
+        # and exports it to its OWN shell — this step runs in a fresh shell
+        # that wouldn't see it, so without a placeholder the validate step
+        # errors before logs print (verified against PR #2492's first run:
+        # "required variable SECRETS_ENCRYPTION_KEY is missing a value").
+        # A placeholder is fine — we're only reading log streams, not booting.
+        if: failure() && needs.detect-changes.outputs.run == 'true'
+        working-directory: tests/harness
+        env:
+          SECRETS_ENCRYPTION_KEY: dump-logs-placeholder
+        run: |
+          echo "=== docker compose ps ==="
+          docker compose -f compose.yml ps || true
+          echo "=== tenant-alpha logs ==="
+          docker compose -f compose.yml logs tenant-alpha || true
+          echo "=== tenant-beta logs ==="
+          docker compose -f compose.yml logs tenant-beta || true
+          echo "=== cp-stub logs ==="
+          docker compose -f compose.yml logs cp-stub || true
+          echo "=== cf-proxy logs ==="
+          docker compose -f compose.yml logs cf-proxy || true
+          echo "=== postgres-alpha logs (last 100) ==="
+          docker compose -f compose.yml logs --tail 100 postgres-alpha || true
+          echo "=== postgres-beta logs (last 100) ==="
+          docker compose -f compose.yml logs --tail 100 postgres-beta || true
+
+      - name: Force teardown
+        # We pass KEEP_UP=1 to run-all-replays.sh so the dump step
+        # above sees real containers — that means we own teardown
+        # explicitly here. Always run.
+        if: always() && needs.detect-changes.outputs.run == 'true'
+        working-directory: tests/harness
+        run: ./down.sh || true
@@ -154,139 +154,15 @@ jobs:

      - name: Verify package contents (sanity)
        working-directory: ${{ runner.temp }}/runtime-build
+        # Smoke logic lives in scripts/wheel_smoke.py so the same gate runs
+        # at both PR-time (runtime-prbuild-compat.yml) and publish-time
+        # (here). Splitting the smoke across two heredocs let them drift
+        # apart historically — one script keeps them locked.
        run: |
          python -m twine check dist/*
-          # Smoke-import the built wheel to catch import-rewrite mistakes
-          # before they hit PyPI. Asserts on STABLE INVARIANTS only —
-          # symbols + classes that are part of the package's public
-          # contract (BaseAdapter interface, the canonical a2a sentinel,
-          # core submodules). Don't add feature-flag-style assertions
-          # here — they fire false-positive every time staging is mid-
-          # release of that feature.
          python -m venv /tmp/smoke
          /tmp/smoke/bin/pip install --quiet dist/*.whl
-          WORKSPACE_ID=00000000-0000-0000-0000-000000000000 \
-          PLATFORM_URL=http://localhost:8080 \
-            /tmp/smoke/bin/python -c "
-          # Importing main is the strongest smoke test we can do here:
-          # main.py is the entry point and pulls every other module
-          # transitively. If the build script missed an import rewrite
-          # (e.g. left a bare \`from transcript_auth import ...\` instead
-          # of \`from molecule_runtime.transcript_auth import ...\` — the
-          # 0.1.16 incident), this fails with ModuleNotFoundError instead
-          # of shipping to PyPI and breaking every workspace startup.
-          # Import the entry-point target by NAME — not just the module.
-          # The wheel's pyproject.toml declares
-          # `molecule-runtime = molecule_runtime.main:main_sync` so if
-          # main_sync goes missing (it did in 0.1.16-0.1.18), every
-          # workspace startup fails with `ImportError: cannot import name
-          # 'main_sync'`. Plain `import molecule_runtime.main` doesn't
-          # catch that because the module loads fine.
-          from molecule_runtime.main import main_sync  # noqa: F401
-          from molecule_runtime import a2a_client, a2a_tools
-          from molecule_runtime.builtin_tools import memory
-          from molecule_runtime.adapters import get_adapter, BaseAdapter, AdapterConfig
-          # Stable invariants: package exports + BaseAdapter shape.
-          assert a2a_client._A2A_ERROR_PREFIX, 'a2a_client missing error sentinel'
-          assert callable(get_adapter), 'adapters.get_adapter must be callable'
-          assert hasattr(BaseAdapter, 'name'), 'BaseAdapter interface broken'
-          assert hasattr(AdapterConfig, '__init__'), 'AdapterConfig dataclass missing'
-
-          # Call-shape smoke for AgentCard. Pure imports don't catch
-          # field-shape regressions in upstream SDKs that only surface
-          # at construction time. Two bugs of this exact class shipped
-          # since the a2a-sdk 1.0 migration:
-          #   - state_transition_history=True (fixed in #2179)
-          #   - supported_protocols=[...] (the protobuf field is
-          #     supported_interfaces — caused every workspace boot
-          #     to crash with `ValueError: Protocol message AgentCard
-          #     has no "supported_protocols" field`; fixed alongside
-          #     this smoke)
-          #
-          # This block instantiates the EXACT classes main.py uses,
-          # with the EXACT keyword arguments. If a future a2a-sdk
-          # upgrade renames any of supported_interfaces / streaming /
-          # push_notifications / etc., the publish fails here instead
-          # of breaking every workspace startup. main.py and this
-          # smoke MUST stay in lockstep — adding a kwarg to one
-          # without mirroring it here is the regression vector.
-          from a2a.types import AgentCard, AgentCapabilities, AgentSkill, AgentInterface
-          AgentCard(
-              name='smoke-agent',
-              description='publish-runtime smoke test',
-              version='0.0.0-smoke',
-              supported_interfaces=[
-                  AgentInterface(protocol_binding='https://a2a.g/v1', url='http://localhost:8080'),
-              ],
-              capabilities=AgentCapabilities(
-                  streaming=True,
-                  push_notifications=False,
-              ),
-              skills=[
-                  AgentSkill(
-                      id='smoke-skill',
-                      name='Smoke',
-                      description='no-op',
-                      tags=['smoke'],
-                      examples=['noop'],
-                  ),
-              ],
-              default_input_modes=['text/plain', 'application/json'],
-              default_output_modes=['text/plain', 'application/json'],
-          )
-          print('✓ AgentCard call-shape smoke passed')
-
-          # Well-known agent-card path probe alignment. main.py's
-          # _send_initial_prompt() polls AGENT_CARD_WELL_KNOWN_PATH
-          # to know when the local A2A server is ready. If the SDK
-          # ever splits the constant value from the path that
-          # create_agent_card_routes() actually mounts at, every
-          # workspace silently drops its initial_prompt:
-          #   - Probe gets 404 every attempt.
-          #   - Falls through to 'server not ready after 30s,
-          #     skipping' even though the server is fine.
-          #   - The user hits a fresh chat with no kickoff context.
-          # This was the #2193 incident class — the v0.x → v1.x
-          # rename of /.well-known/agent.json → /.well-known/agent-card.json
-          # plus the constant itself moving to a2a.utils.constants.
-          # source-tree pytest (test_agent_card_well_known_path.py)
-          # catches main.py-side regressions; this catches the
-          # SDK-side ones BEFORE PyPI upload.
-          from a2a.utils.constants import AGENT_CARD_WELL_KNOWN_PATH
-          from a2a.server.routes import create_agent_card_routes
-          mounted_paths = [
-              getattr(r, 'path', None)
-              for r in create_agent_card_routes(
-                  AgentCard(
-                      name='wk-smoke',
-                      description='well-known mount alignment',
-                      version='0.0.0-smoke',
-                  )
-              )
-          ]
-          assert AGENT_CARD_WELL_KNOWN_PATH in mounted_paths, (
-              f'AGENT_CARD_WELL_KNOWN_PATH ({AGENT_CARD_WELL_KNOWN_PATH!r}) '
-              f'is NOT among paths mounted by create_agent_card_routes '
-              f'({mounted_paths!r}). The SDK constant and its own route '
-              f'factory have drifted — workspace probes will 404 forever, '
-              f'silently dropping every workspace initial_prompt.'
-          )
-          print(f'✓ well-known mount alignment OK ({AGENT_CARD_WELL_KNOWN_PATH})')
-
-          # Message helper smoke. a2a-sdk renamed
-          # new_agent_text_message → new_text_message in the v1.x
-          # protobuf-flat migration (per the v0→v1 cheat sheet). main.py
-          # and a2a_executor.py call new_text_message in hot paths; if
-          # the import breaks, every reply errors with ImportError before
-          # the message even leaves the workspace. Importing here
-          # catches a future v2.x rename at publish time.
-          from a2a.helpers import new_text_message
-          msg = new_text_message('smoke')
-          assert msg is not None, 'new_text_message returned None'
-          print('✓ message helper import + call OK')
-
-          print('✓ smoke import passed')
-          "
+          /tmp/smoke/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py"

      - name: Publish to PyPI (Trusted Publisher / OIDC)
        # PyPI side is configured: project molecule-ai-workspace-runtime →
@@ -1,19 +1,60 @@
 name: publish-workspace-server-image

-# Builds and pushes Docker images to GHCR when staging is promoted to main.
-# PRs target staging (default branch). Only main push triggers production builds.
+# Builds and pushes Docker images to GHCR on staging or main pushes.
 # EC2 tenant instances pull the tenant image from GHCR.
+#
+# Branch / tag policy (see Compute tags step for the per-branch logic):
+#
+#   staging push  → builds image, tags :staging-<sha> + :staging-latest.
+#                   staging-CP pins TENANT_IMAGE=:staging-latest, so it
+#                   picks up staging-branch code automatically. This is
+#                   what makes staging-CP actually test staging-branch
+#                   code instead of "yesterday's main" — pre-fix, this
+#                   workflow only ran on main, so staging tenants
+#                   silently served stale code (#2308 fix RFC #2312
+#                   landed on staging but never reached tenants because
+#                   staging→main was wedged on path-filter parity bugs).
+#
+#   main push     → builds image, tags :staging-<sha> + :staging-latest
+#                   (same as before). canary-verify.yml retags
+#                   :staging-<sha> → :latest after canary tenants
+#                   green-light the digest. The :staging-latest retag
+#                   on main push is intentional: when main lands AFTER a
+#                   staging push, staging-CP gets the post-promote code
+#                   (which equals what it had + any merge resolution),
+#                   so the canary-on-staging-CP step still runs against
+#                   the prod-bound digest.
+#
+# In the steady state both branches refresh :staging-latest; the
+# semantic is "most recent staging-or-main build of tenant code."
+# Drift between the two is bounded by the staging→main auto-promote
+# cadence and is corrected on the next staging push.

 on:
  push:
-    branches: [main]
+    branches: [staging, main]
    paths:
      - 'workspace-server/**'
      - 'canvas/**'
      - 'manifest.json'
-      - '.github/workflows/publish-platform-image.yml'
+      - '.github/workflows/publish-workspace-server-image.yml'
  workflow_dispatch:

+# Serialize per-branch so two rapid staging pushes don't race the same
+# :staging-latest tag retag. Allow staging and main to run in parallel
+# (different github.ref → different concurrency group) since they
+# produce different :staging-<sha> tags and last-write-wins on
+# :staging-latest is acceptable across branches (the post-promote
+# main code equals current staging code in a healthy flow).
+#
+# cancel-in-progress: false → in-flight builds finish; the next push's
+# build queues. This avoids a partially-pushed image and keeps the
+# canary fleet pin (:staging-<sha>) consistent with what was actually
+# tested at canary-verify time.
+concurrency:
+  group: publish-workspace-server-image-${{ github.ref }}
+  cancel-in-progress: false
+
 permissions:
  contents: read
  packages: write
@@ -63,29 +104,32 @@ jobs:
        run: |
          echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"

-      # Canary-gated release: we publish :staging-<sha> ONLY here. The
-      # :latest tag (which existing prod tenants auto-pull every 5 min)
-      # is promoted by .github/workflows/canary-verify.yml after the
-      # staging canary fleet green-lights this digest.
-      # That means:
-      #   - Every main merge produces a :staging-<sha> image
-      #   - Canary tenants (configured to pull :staging-<sha>) pick it up
-      #   - canary-verify.yml runs smoke tests against them
-      #   - On green → canary-verify retags :staging-<sha> → :latest
-      #   - On red → :latest stays on the prior good digest, prod is safe
-      # Every push of :staging-<sha> also retags the same digest as
-      # :staging-latest so staging CP (which pins TENANT_IMAGE at
-      # :staging-latest) picks up new builds automatically — no more manual
-      # Railway env-var edits. Prod's :latest retag still happens in
-      # canary-verify.yml after the canary fleet greenlights this digest;
-      # :staging-latest is strictly the "most recent main build," not a
-      # canary-verified promotion.
+      # Canary-gated release flow:
+      #   - This step always publishes :staging-<sha> + :staging-latest.
+      #   - On staging push, staging-CP picks up :staging-latest immediately
+      #     (its TENANT_IMAGE pin is :staging-latest) — so staging-branch
+      #     code reaches staging tenants without waiting for main.
+      #   - On main push, canary-verify.yml runs smoke tests against
+      #     canary tenants (which pin :staging-<sha>), and on green retags
+      #     :staging-<sha> → :latest. Prod tenants pull :latest.
+      #   - On red, :latest stays on the prior good digest — prod is safe.
      #
-      # Before this, TENANT_IMAGE on Railway staging was pinned to a static
-      # :staging-<sha> and drifted months behind (2026-04-24 incident:
-      # canary tenant ran :staging-a14cf86, 10 days stale, which lacked
-      # applyRuntimeModelEnv and caused every E2E to route hermes+openai
-      # through openrouter → 401). See issue filed with this PR.
+      # Why :staging-latest is retagged on main push too: when main lands
+      # after a staging promote, staging-CP gets the post-promote code so
+      # the canary-on-staging-CP step still runs against the prod-bound
+      # digest. In a healthy flow the post-promote main code == the
+      # current staging code, so this is effectively a no-op except for
+      # the canary fleet pin handoff.
+      #
+      # Pre-fix history: this workflow used to only trigger on main. That
+      # meant staging-CP served "yesterday's main" indefinitely whenever
+      # staging→main was wedged. The 2026-04-30 dogfooding session
+      # surfaced this when RFC #2312 (chat upload HTTP-forward) landed on
+      # staging but staging tenants kept failing chat upload because they
+      # were running pre-RFC code. Adding the staging trigger above closes
+      # that gap. Earlier 2026-04-24 incident: a static :staging-<sha> pin
+      # drifted 10 days behind staging — same class of bug, different
+      # mechanism.
      - name: Build & push platform image to GHCR (staging-<sha> + staging-latest)
        uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6
        with:
@@ -98,6 +142,13 @@ jobs:
            ${{ env.IMAGE_NAME }}:staging-latest
          cache-from: type=gha
          cache-to: type=gha,mode=max
+          # GIT_SHA bakes into the Go binary via -ldflags so /buildinfo
+          # returns it at runtime — see Dockerfile + buildinfo/buildinfo.go.
+          # This is the same value as the OCI revision label below; passing
+          # it twice is intentional, the OCI label is for registry tooling
+          # while /buildinfo is for the redeploy verification step.
+          build-args: |
+            GIT_SHA=${{ github.sha }}
          labels: |
            org.opencontainers.image.source=https://github.com/${{ github.repository }}
            org.opencontainers.image.revision=${{ github.sha }}
@@ -128,6 +179,7 @@ jobs:
          # NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080).
          build-args: |
            NEXT_PUBLIC_PLATFORM_URL=
+            GIT_SHA=${{ github.sha }}
          labels: |
            org.opencontainers.image.source=https://github.com/${{ github.repository }}
            org.opencontainers.image.revision=${{ github.sha }}
@@ -0,0 +1,207 @@
+name: Railway pin audit (drift detection)
+
+# Daily audit of Railway env vars for drift-prone image-tag pins —
+# automation-cadence layer over the detection script + regression test
+# shipped in PR #2168 (#2001 closure).
+#
+# Background: on 2026-04-24 a stale `:staging-a14cf86` SHA pin in CP's
+# TENANT_IMAGE caused 3+ hours of E2E failure with the appearance that
+# "every fix didn't propagate" — really the tenant image was so old it
+# didn't read the env vars those fixes produced. The audit script
+# (scripts/ops/audit-railway-sha-pins.sh) flags drift; this workflow
+# runs the same check unattended on a daily cron.
+#
+# Cadence: once a day, 13:00 UTC (06:00 PT). Daily is the right
+# cadence for variables-tier config — Railway env var changes are
+# deliberate operator actions, low-frequency. Hourly would risk
+# Railway API rate-limit surprises and is overkill for the change rate.
+#
+# Issue-on-failure: drift triggers a priority-high issue, mirroring
+# .github/workflows/e2e-staging-sanity.yml's pattern. Drift is
+# medium-priority "config slipped, fix at next ops window," not
+# active-outage paging.
+#
+# Secret hardening: per feedback_schedule_vs_dispatch_secrets_hardening,
+# the schedule trigger HARD-FAILS on missing RAILWAY_AUDIT_TOKEN
+# (silent-success on schedule was the failure-mode class that bit the
+# team before; cron firing without checking anything is worse than no
+# cron). The workflow_dispatch trigger SOFT-SKIPS on missing secret so
+# an operator can dry-run the workflow shape during initial provisioning
+# without tripping a fake red.
+
+on:
+  schedule:
+    - cron: '0 13 * * *'
+  workflow_dispatch:
+
+concurrency:
+  group: railway-pin-audit
+  cancel-in-progress: false
+
+permissions:
+  issues: write
+  contents: read
+
+jobs:
+  audit:
+    name: Audit Railway env vars for drift-prone pins
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+
+      - name: Verify RAILWAY_AUDIT_TOKEN present
+        # Schedule trigger: hard-fail when the secret is missing —
+        # otherwise the cron silently runs against the wrong scope (or
+        # exits 2 from the script and we issue-spam) without anyone
+        # noticing the token rot.
+        # Dispatch trigger: soft-skip — operator may be dry-running the
+        # workflow shape before provisioning the secret. Logged as a
+        # workflow notice, not a failure.
+        env:
+          RAILWAY_AUDIT_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
+          EVENT_NAME: ${{ github.event_name }}
+        id: secret_check
+        run: |
+          set -euo pipefail
+          if [ -n "${RAILWAY_AUDIT_TOKEN:-}" ]; then
+            echo "have_secret=true" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+          echo "have_secret=false" >> "$GITHUB_OUTPUT"
+          if [ "$EVENT_NAME" = "workflow_dispatch" ]; then
+            echo "::notice::RAILWAY_AUDIT_TOKEN not configured — soft-skipping (manual dispatch)"
+            exit 0
+          fi
+          echo "::error::RAILWAY_AUDIT_TOKEN secret missing — schedule trigger requires it. Provision the token (read-only \`variables\` scope on the molecule-platform Railway project) and store as repo secret RAILWAY_AUDIT_TOKEN."
+          exit 1
+
+      - name: Install Railway CLI
+        if: steps.secret_check.outputs.have_secret == 'true'
+        # Pinned hash matching the public install instructions; bump in
+        # tandem with the audit-script's documented Railway CLI version.
+        run: |
+          set -euo pipefail
+          curl -fsSL https://railway.com/install.sh | sh
+          # The installer drops the binary in ~/.railway/bin
+          echo "$HOME/.railway/bin" >> "$GITHUB_PATH"
+
+      - name: Verify Railway CLI authenticated
+        if: steps.secret_check.outputs.have_secret == 'true'
+        env:
+          RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
+        run: |
+          set -euo pipefail
+          # `railway whoami` exits non-zero when the token is
+          # unauthenticated or doesn't have any project access.
+          if ! railway whoami >/dev/null 2>&1; then
+            echo "::error::Railway CLI failed to authenticate with RAILWAY_AUDIT_TOKEN — token may be revoked or scoped incorrectly"
+            exit 2
+          fi
+
+      - name: Link molecule-platform project
+        if: steps.secret_check.outputs.have_secret == 'true'
+        env:
+          RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
+        # Project ID from reference_production_stack: molecule-platform
+        # / 7ccc8c68-61f4-42ab-9be5-586eeee11768. Linking is per-process,
+        # so we re-link in this CI shell (the audit script comment says
+        # it deliberately doesn't chdir for you because the linked
+        # project's identity matters).
+        run: |
+          set -euo pipefail
+          railway link --project 7ccc8c68-61f4-42ab-9be5-586eeee11768
+
+      - name: Run drift audit
+        if: steps.secret_check.outputs.have_secret == 'true'
+        id: audit
+        env:
+          RAILWAY_TOKEN: ${{ secrets.RAILWAY_AUDIT_TOKEN }}
+        run: |
+          set +e
+          bash scripts/ops/audit-railway-sha-pins.sh 2>&1 | tee /tmp/audit.log
+          rc=${PIPESTATUS[0]}
+          echo "rc=$rc" >> "$GITHUB_OUTPUT"
+          # Capture the audit log for the issue body.
+          {
+            echo 'log<<AUDIT_EOF'
+            cat /tmp/audit.log
+            echo 'AUDIT_EOF'
+          } >> "$GITHUB_OUTPUT"
+          # Exit codes from the script:
+          #   0 — no drift; workflow goes green
+          #   1 — drift detected; we'll file an issue and fail the run
+          #   2 — railway CLI unauthenticated / project unlinked; fail
+          # Anything else: also fail.
+          case "$rc" in
+            0) exit 0 ;;
+            1) echo "::warning::Drift-prone pin(s) detected — issue will be filed"; exit 1 ;;
+            2) echo "::error::Railway CLI auth/link failed mid-script — token or project ID drift"; exit 2 ;;
+            *) echo "::error::Unexpected audit rc=$rc"; exit 1 ;;
+          esac
+
+      - name: Open / update drift issue
+        if: failure() && steps.audit.outputs.rc == '1'
+        uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7
+        env:
+          AUDIT_LOG: ${{ steps.audit.outputs.log }}
+        with:
+          script: |
+            const title = "🚨 Railway env-var drift detected";
+            const runURL = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
+            const body =
+              `Daily Railway pin audit found drift-prone image-tag pins in the molecule-platform Railway project.\n\n` +
+              `**What this means:** an env var (likely on \`controlplane\`) is pinned to a SHA-shaped or semver tag instead of a floating tag. ` +
+              `Same pattern that caused the 2026-04-24 TENANT_IMAGE incident — fix-PRs land but the running service doesn't pick them up.\n\n` +
+              `**Recovery:** open the Railway dashboard, replace the flagged value with a floating tag (\`:staging-latest\`, \`:main\`) unless the pin is intentional and documented in the ops runbook.\n\n` +
+              `**Audit output:**\n\n\`\`\`\n${process.env.AUDIT_LOG || '(log unavailable)'}\n\`\`\`\n\n` +
+              `Run: ${runURL}\n\n` +
+              `Closes automatically when a subsequent daily run reports clean.`;
+
+            const { data: existing } = await github.rest.issues.listForRepo({
+              owner: context.repo.owner, repo: context.repo.repo,
+              state: 'open', labels: 'railway-drift',
+            });
+            const match = existing.find(i => i.title === title);
+            if (match) {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner, repo: context.repo.repo,
+                issue_number: match.number,
+                body: `Still drifting. ${runURL}\n\n\`\`\`\n${process.env.AUDIT_LOG || '(log unavailable)'}\n\`\`\``,
+              });
+            } else {
+              await github.rest.issues.create({
+                owner: context.repo.owner, repo: context.repo.repo,
+                title, body,
+                labels: ['railway-drift', 'bug', 'priority-high'],
+              });
+            }
+
+      - name: Close stale drift issue on clean run
+        # When a previously-flagged drift gets fixed by an operator,
+        # the next daily run goes green. Close any open `railway-drift`
+        # issue with a confirmation comment so the queue doesn't carry
+        # stale ones.
+        if: success() && steps.audit.outputs.rc == '0'
+        uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7
+        with:
+          script: |
+            const runURL = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
+            const { data: existing } = await github.rest.issues.listForRepo({
+              owner: context.repo.owner, repo: context.repo.repo,
+              state: 'open', labels: 'railway-drift',
+            });
+            for (const issue of existing) {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner, repo: context.repo.repo,
+                issue_number: issue.number,
+                body: `Daily audit clean — drift resolved. ${runURL}`,
+              });
+              await github.rest.issues.update({
+                owner: context.repo.owner, repo: context.repo.repo,
+                issue_number: issue.number,
+                state: 'closed',
+                state_reason: 'completed',
+              });
+            }
@@ -64,6 +64,20 @@ permissions:
  # No write scopes needed — the workflow hits an external CP endpoint,
  # not the GitHub API.

+# Serialize redeploys so two rapid main pushes' redeploys don't overlap
+# and cause confusing per-tenant SSM state. Without this, GitHub's
+# implicit workflow_run queueing would *probably* serialize them, but
+# the explicit block makes the invariant defensible. Mirrors the
+# concurrency block on redeploy-tenants-on-staging.yml for shape parity.
+#
+# cancel-in-progress: false → aborting a half-rolled-out fleet would
+# leave tenants stuck on whatever image they happened to be on when
+# cancelled. Better to finish the in-flight rollout before starting
+# the next one.
+concurrency:
+  group: redeploy-tenants-on-main
+  cancel-in-progress: false
+
 jobs:
  redeploy:
    # Skip the auto-trigger if publish-workspace-server-image didn't
@@ -161,4 +175,151 @@ jobs:
            echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
            exit 1
          fi
-          echo "::notice::Tenant fleet redeploy complete."
+          echo "::notice::Tenant fleet redeploy reported ssm_status=Success — verifying actual image roll on each tenant..."
+
+          # Stash the response for the verify step. $RUNNER_TEMP outlasts
+          # the step boundary; $HTTP_RESPONSE doesn't.
+          cp "$HTTP_RESPONSE" "$RUNNER_TEMP/redeploy-response.json"
+
+      - name: Verify each tenant /buildinfo matches published SHA
+        # ROOT FIX FOR #2395.
+        #
+        # `redeploy-fleet`'s `ssm_status=Success` means "the SSM RPC
+        # didn't error" — NOT "the new image is running on the tenant."
+        # `:latest` lives in the local Docker daemon's image cache; if
+        # the SSM document does `docker compose up -d` without an
+        # explicit `docker pull`, the daemon serves the previously-
+        # cached digest and the container restarts on stale code.
+        # 2026-04-30 incident: hongmingwang's tenant reported
+        # ssm_status=Success at 17:00:53Z but kept serving pre-501a42d7
+        # chat_files for 30+ min — the lazy-heal fix never reached the
+        # user despite green deploy + green redeploy.
+        #
+        # This step closes the gap by curling each tenant's /buildinfo
+        # endpoint (added in workspace-server/internal/buildinfo +
+        # /Dockerfile* GIT_SHA build-arg, this PR) and comparing the
+        # returned git_sha to the SHA the workflow expects. Mismatches
+        # fail the workflow, which is what `ok=true` should have
+        # guaranteed all along.
+        #
+        # When the redeploy was triggered by workflow_dispatch with a
+        # specific tag (target_tag != "latest"), the expected SHA may
+        # not equal ${{ github.sha }} — in that case we resolve via
+        # GHCR's manifest. For workflow_run (default :latest) the
+        # workflow_run.head_sha is the SHA that just published.
+        env:
+          EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
+          TARGET_TAG: ${{ inputs.target_tag || 'latest' }}
+          # Tenant subdomain template — slugs from the response are
+          # appended. Production CP issues `<slug>.moleculesai.app`;
+          # staging CP issues `<slug>.staging.moleculesai.app`. This
+          # workflow runs on main → prod CP → no `staging.` infix.
+          TENANT_DOMAIN: 'moleculesai.app'
+        run: |
+          set -euo pipefail
+
+          if [ "$TARGET_TAG" != "latest" ] && [ "$TARGET_TAG" != "$EXPECTED_SHA" ]; then
+            # workflow_dispatch with a pinned tag that isn't the head
+            # SHA — operator is rolling back / pinning. Skip the
+            # verification because we don't have the expected SHA in
+            # this context (would need to crane-inspect the GHCR
+            # manifest, which is a follow-up). Failing-open here is
+            # safe: the operator chose the tag deliberately.
+            echo "::notice::target_tag=$TARGET_TAG (operator-pinned) — skipping per-tenant SHA verification."
+            exit 0
+          fi
+
+          RESP="$RUNNER_TEMP/redeploy-response.json"
+          if [ ! -s "$RESP" ]; then
+            echo "::error::redeploy-response.json missing or empty — verify step ran without a response to read"
+            exit 1
+          fi
+
+          # Pull only successfully-redeployed tenants. Any tenant that
+          # halted the rollout already failed the previous step, so we
+          # don't double-count them here.
+          mapfile -t SLUGS < <(jq -r '.results[]? | select(.healthz_ok == true) | .slug' "$RESP")
+          if [ ${#SLUGS[@]} -eq 0 ]; then
+            echo "::warning::No tenants reported healthz_ok — nothing to verify"
+            exit 0
+          fi
+
+          echo "Verifying ${#SLUGS[@]} tenant(s) against EXPECTED_SHA=${EXPECTED_SHA:0:7}..."
+
+          # Two distinct failure modes — STALE (the #2395 bug class, hard-fail)
+          # vs UNREACHABLE (teardown race, soft-warn). See the staging variant's
+          # comment for the full rationale; same logic applies on prod even
+          # though prod has fewer ephemeral tenants — the asymmetry would be a
+          # gratuitous fork.
+          STALE_COUNT=0
+          UNREACHABLE_COUNT=0
+          STALE_LINES=()
+          UNREACHABLE_LINES=()
+          for slug in "${SLUGS[@]}"; do
+            URL="https://${slug}.${TENANT_DOMAIN}/buildinfo"
+            # 30s total: tenant just SSM-restarted, may still be coming
+            # up. Retry-on-empty rather than retry-on-status — we want
+            # to fail fast on "responded with wrong SHA", not "still
+            # warming up".
+            BODY=$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$URL" || true)
+            ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
+            if [ -z "$ACTUAL_SHA" ]; then
+              UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
+              UNREACHABLE_LINES+=("| $slug | (no /buildinfo response) | ${EXPECTED_SHA:0:7} | ⚠ unreachable (likely teardown race) |")
+              continue
+            fi
+            if [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then
+              echo "  $slug: ${ACTUAL_SHA:0:7} ✓"
+            else
+              STALE_COUNT=$((STALE_COUNT + 1))
+              STALE_LINES+=("| $slug | ${ACTUAL_SHA:0:7} | ${EXPECTED_SHA:0:7} | ❌ stale |")
+            fi
+          done
+
+          {
+            echo ""
+            echo "### Per-tenant /buildinfo verification"
+            echo ""
+            echo "Expected SHA: \`${EXPECTED_SHA:0:7}\`"
+            echo ""
+            if [ $STALE_COUNT -gt 0 ]; then
+              echo "**${STALE_COUNT} STALE tenant(s) — these did NOT pick up the new image despite ssm_status=Success:**"
+              echo ""
+              echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
+              echo "|------|----------------------|----------|--------|"
+              for line in "${STALE_LINES[@]}"; do echo "$line"; done
+              echo ""
+            fi
+            if [ $UNREACHABLE_COUNT -gt 0 ]; then
+              echo "**${UNREACHABLE_COUNT} unreachable tenant(s) — likely teardown race (soft-warn, not failing):**"
+              echo ""
+              echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
+              echo "|------|----------------------|----------|--------|"
+              for line in "${UNREACHABLE_LINES[@]}"; do echo "$line"; done
+              echo ""
+            fi
+            if [ $STALE_COUNT -eq 0 ] && [ $UNREACHABLE_COUNT -eq 0 ]; then
+              echo "All ${#SLUGS[@]} tenants returned matching SHA. ✓"
+            fi
+          } >> "$GITHUB_STEP_SUMMARY"
+
+          if [ $UNREACHABLE_COUNT -gt 0 ]; then
+            echo "::warning::$UNREACHABLE_COUNT tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages."
+          fi
+
+          # Belt-and-suspenders sanity floor: same logic as the staging
+          # variant — see that file's comment for the full rationale.
+          # Floor only applies when fleet >= 4; below that, canary-verify
+          # is the actual gate.
+          TOTAL_VERIFIED=${#SLUGS[@]}
+          if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then
+            echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race."
+            exit 1
+          fi
+
+          if [ $STALE_COUNT -gt 0 ]; then
+            echo "::error::$STALE_COUNT tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary."
+            exit 1
+          fi
+
+          echo "::notice::Tenant fleet redeploy complete — all reachable tenants on ${EXPECTED_SHA:0:7} (${UNREACHABLE_COUNT} unreachable, soft-warned)."
@@ -0,0 +1,310 @@
+name: redeploy-tenants-on-staging
+
+# Auto-refresh staging tenant EC2s after every staging-branch merge.
+#
+# Mirror of redeploy-tenants-on-main.yml, with the staging-CP host and
+# the :staging-latest tag. Sister workflow exists for prod (rolls
+# :latest after canary-verify). Both share the same shape — just
+# different CP_URL + target_tag + admin token secret.
+#
+# Why this workflow exists: publish-workspace-server-image now builds
+# on every staging-branch push (PR #2335), pushing
+# platform-tenant:staging-latest to GHCR. Existing tenants pulled
+# their image once at boot and never re-pull, so the new image just
+# sits unused until the tenant is reprovisioned.
+#
+# This workflow closes the gap by calling staging-CP's
+# /cp/admin/tenants/redeploy-fleet, which performs a canary-first,
+# batched, health-gated SSM redeploy across every live staging tenant.
+# Same endpoint shape as prod CP — only the host differs.
+#
+# Runtime ordering:
+#   1. publish-workspace-server-image completes on staging branch →
+#      new :staging-latest in GHCR.
+#   2. This workflow fires via workflow_run, waits 30s for GHCR's CDN
+#      to propagate the new tag.
+#   3. Calls redeploy-fleet with no canary (staging IS canary; we don't
+#      need a sub-canary inside it). Soak still applies to the first
+#      tenant in case of bad-deploy detection.
+#   4. Any failure aborts the rollout and leaves older tenants on the
+#      prior image — safer default than half-and-half state.
+#
+# Rollback path: re-run with workflow_dispatch + target_tag=staging-<sha>
+# of a known-good build.
+
+on:
+  workflow_run:
+    workflows: ['publish-workspace-server-image']
+    types: [completed]
+    branches: [staging]
+  workflow_dispatch:
+    inputs:
+      target_tag:
+        description: 'Tenant image tag to deploy (e.g. "staging-latest" or "staging-a59f1a6c"). Defaults to staging-latest when empty.'
+        required: false
+        type: string
+        default: 'staging-latest'
+      canary_slug:
+        description: 'Tenant slug to deploy first + soak (empty = skip canary, fan out immediately). Default empty for staging since staging itself is the canary.'
+        required: false
+        type: string
+        default: ''
+      soak_seconds:
+        description: 'Seconds to wait after canary before fanning out. Only meaningful if canary_slug is set.'
+        required: false
+        type: string
+        default: '60'
+      batch_size:
+        description: 'How many tenants SSM redeploys in parallel per batch.'
+        required: false
+        type: string
+        default: '3'
+      dry_run:
+        description: 'Plan only — do not actually redeploy.'
+        required: false
+        type: boolean
+        default: false
+
+permissions:
+  contents: read
+  # No write scopes needed — the workflow hits an external CP endpoint,
+  # not the GitHub API.
+
+# Serialize per-branch so two rapid staging pushes' redeploys don't
+# overlap and cause confusing per-tenant SSM state. cancel-in-progress
+# is false because aborting a half-rolled-out fleet leaves tenants
+# stuck on whatever image they happened to be on when cancelled.
+concurrency:
+  group: redeploy-tenants-on-staging
+  cancel-in-progress: false
+
+jobs:
+  redeploy:
+    # Skip the auto-trigger if publish-workspace-server-image didn't
+    # actually succeed. workflow_run fires on any completion state; we
+    # don't want to redeploy against a half-built image.
+    if: |
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
+    runs-on: ubuntu-latest
+    timeout-minutes: 25
+    steps:
+      - name: Wait for GHCR tag propagation
+        # GHCR's edge cache takes ~15-30s to consistently serve the new
+        # :staging-latest manifest after the registry accepts the push.
+        # Same rationale as redeploy-tenants-on-main.yml.
+        run: sleep 30
+
+      - name: Call staging-CP redeploy-fleet
+        # CP_STAGING_ADMIN_API_TOKEN must be set as a repo/org secret
+        # on Molecule-AI/molecule-core, matching staging-CP's
+        # CP_ADMIN_API_TOKEN env var (visible in Railway controlplane
+        # / staging environment). Stored separately from the prod
+        # CP_ADMIN_API_TOKEN so a leak of one doesn't auth the other.
+        env:
+          CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
+          CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
+          TARGET_TAG: ${{ inputs.target_tag || 'staging-latest' }}
+          CANARY_SLUG: ${{ inputs.canary_slug || '' }}
+          SOAK_SECONDS: ${{ inputs.soak_seconds || '60' }}
+          BATCH_SIZE: ${{ inputs.batch_size || '3' }}
+          DRY_RUN: ${{ inputs.dry_run || false }}
+        run: |
+          set -euo pipefail
+
+          # Schedule-vs-dispatch hardening (mirrors sweep-cf-orphans
+          # and sweep-cf-tunnels): hard-fail on auto-trigger when the
+          # secret is missing so a misconfigured-repo doesn't silently
+          # serve stale staging tenants. Soft-skip on operator dispatch.
+          if [ -z "${CP_STAGING_ADMIN_API_TOKEN:-}" ]; then
+            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+              echo "::warning::CP_STAGING_ADMIN_API_TOKEN secret not set — skipping redeploy"
+              echo "::warning::Set CP_STAGING_ADMIN_API_TOKEN in repo secrets to enable auto-redeploy."
+              echo "::notice::Pull the value from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
+              exit 0
+            fi
+            echo "::error::staging redeploy cannot run — CP_STAGING_ADMIN_API_TOKEN secret missing"
+            echo "::error::set it at Settings → Secrets and Variables → Actions; pull from staging-CP's CP_ADMIN_API_TOKEN env in Railway."
+            exit 1
+          fi
+
+          BODY=$(jq -nc \
+            --arg tag "$TARGET_TAG" \
+            --arg canary "$CANARY_SLUG" \
+            --argjson soak "$SOAK_SECONDS" \
+            --argjson batch "$BATCH_SIZE" \
+            --argjson dry "$DRY_RUN" \
+            '{
+              target_tag: $tag,
+              canary_slug: $canary,
+              soak_seconds: $soak,
+              batch_size: $batch,
+              dry_run: $dry
+            }')
+
+          echo "POST $CP_URL/cp/admin/tenants/redeploy-fleet"
+          echo "  body: $BODY"
+
+          HTTP_RESPONSE=$(mktemp)
+          HTTP_CODE=$(curl -sS -o "$HTTP_RESPONSE" -w '%{http_code}' \
+            -m 1200 \
+            -H "Authorization: Bearer $CP_STAGING_ADMIN_API_TOKEN" \
+            -H "Content-Type: application/json" \
+            -X POST "$CP_URL/cp/admin/tenants/redeploy-fleet" \
+            -d "$BODY" || echo "000")
+
+          echo "HTTP $HTTP_CODE"
+          cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
+
+          {
+            echo "## Staging tenant redeploy fleet"
+            echo ""
+            echo "**Target tag:** \`$TARGET_TAG\`"
+            echo "**Canary:** \`${CANARY_SLUG:-(none — staging is itself the canary)}\` (soak ${SOAK_SECONDS}s)"
+            echo "**Batch size:** $BATCH_SIZE"
+            echo "**Dry run:** $DRY_RUN"
+            echo "**HTTP:** $HTTP_CODE"
+            echo ""
+            echo "### Per-tenant result"
+            echo ""
+            echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
+            echo '|------|-------|------------|------|---------|-------|'
+            jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
+          } >> "$GITHUB_STEP_SUMMARY"
+
+          if [ "$HTTP_CODE" != "200" ]; then
+            echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
+            exit 1
+          fi
+          OK=$(jq -r '.ok' "$HTTP_RESPONSE")
+          if [ "$OK" != "true" ]; then
+            echo "::error::redeploy-fleet reported ok=false (see summary for which tenant halted the rollout)"
+            exit 1
+          fi
+          echo "::notice::Staging tenant fleet redeploy reported ssm_status=Success — verifying actual image roll on each tenant..."
+
+          cp "$HTTP_RESPONSE" "$RUNNER_TEMP/redeploy-response.json"
+
+      - name: Verify each staging tenant /buildinfo matches published SHA
+        # Mirror of the verify step in redeploy-tenants-on-main.yml — see
+        # there for the rationale (#2395 root fix). Staging has the same
+        # ssm_status-success-but-stale-image hazard and benefits from the
+        # same gate. Diff: TENANT_DOMAIN includes the `staging.` infix.
+        env:
+          EXPECTED_SHA: ${{ github.event.workflow_run.head_sha || github.sha }}
+          TARGET_TAG: ${{ inputs.target_tag || 'staging-latest' }}
+          TENANT_DOMAIN: 'staging.moleculesai.app'
+        run: |
+          set -euo pipefail
+
+          # staging-latest is the staging-side moving tag; treat it the
+          # same way main treats `latest`. Operator-pinned SHAs skip
+          # verification (see main variant for why).
+          if [ "$TARGET_TAG" != "staging-latest" ] && [ "$TARGET_TAG" != "latest" ] && [ "$TARGET_TAG" != "$EXPECTED_SHA" ]; then
+            echo "::notice::target_tag=$TARGET_TAG (operator-pinned) — skipping per-tenant SHA verification."
+            exit 0
+          fi
+
+          RESP="$RUNNER_TEMP/redeploy-response.json"
+          if [ ! -s "$RESP" ]; then
+            echo "::error::redeploy-response.json missing or empty"
+            exit 1
+          fi
+
+          mapfile -t SLUGS < <(jq -r '.results[]? | select(.healthz_ok == true) | .slug' "$RESP")
+          if [ ${#SLUGS[@]} -eq 0 ]; then
+            echo "::warning::No staging tenants reported healthz_ok — nothing to verify"
+            exit 0
+          fi
+
+          echo "Verifying ${#SLUGS[@]} staging tenant(s) against EXPECTED_SHA=${EXPECTED_SHA:0:7}..."
+
+          # Two distinct failure modes here:
+          #   STALE_COUNT      — tenant returned a SHA that doesn't match. THIS is
+          #                      the #2395 bug class: tenant up + serving old code.
+          #                      Always hard-fail the workflow.
+          #   UNREACHABLE_COUNT — tenant didn't respond. Almost always a benign
+          #                      teardown race: redeploy-fleet snapshot says
+          #                      healthz_ok=true, then the E2E suite tears the
+          #                      ephemeral tenant down before this step runs (the
+          #                      e2e-* fixtures churn 5-10/hour on staging). Soft-
+          #                      warn so we don't block staging→main on cleanup.
+          #                      Real "tenant up but unreachable" is caught by CP's
+          #                      own healthz monitor + the post-redeploy alert; we
+          #                      don't need to double-count it here.
+          STALE_COUNT=0
+          UNREACHABLE_COUNT=0
+          STALE_LINES=()
+          UNREACHABLE_LINES=()
+          for slug in "${SLUGS[@]}"; do
+            URL="https://${slug}.${TENANT_DOMAIN}/buildinfo"
+            BODY=$(curl -sS --max-time 30 --retry 3 --retry-delay 5 --retry-connrefused "$URL" || true)
+            ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
+            if [ -z "$ACTUAL_SHA" ]; then
+              UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
+              UNREACHABLE_LINES+=("| $slug | (no /buildinfo response) | ${EXPECTED_SHA:0:7} | ⚠ unreachable (likely teardown race) |")
+              continue
+            fi
+            if [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then
+              echo "  $slug: ${ACTUAL_SHA:0:7} ✓"
+            else
+              STALE_COUNT=$((STALE_COUNT + 1))
+              STALE_LINES+=("| $slug | ${ACTUAL_SHA:0:7} | ${EXPECTED_SHA:0:7} | ❌ stale |")
+            fi
+          done
+
+          {
+            echo ""
+            echo "### Per-tenant /buildinfo verification (staging)"
+            echo ""
+            echo "Expected SHA: \`${EXPECTED_SHA:0:7}\`"
+            echo ""
+            if [ $STALE_COUNT -gt 0 ]; then
+              echo "**${STALE_COUNT} STALE tenant(s) — these did NOT pick up the new image despite ssm_status=Success:**"
+              echo ""
+              echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
+              echo "|------|----------------------|----------|--------|"
+              for line in "${STALE_LINES[@]}"; do echo "$line"; done
+              echo ""
+            fi
+            if [ $UNREACHABLE_COUNT -gt 0 ]; then
+              echo "**${UNREACHABLE_COUNT} unreachable tenant(s) — likely E2E teardown race (soft-warn, not failing):**"
+              echo ""
+              echo "| Slug | Actual /buildinfo SHA | Expected | Status |"
+              echo "|------|----------------------|----------|--------|"
+              for line in "${UNREACHABLE_LINES[@]}"; do echo "$line"; done
+              echo ""
+            fi
+            if [ $STALE_COUNT -eq 0 ] && [ $UNREACHABLE_COUNT -eq 0 ]; then
+              echo "All ${#SLUGS[@]} staging tenants returned matching SHA. ✓"
+            fi
+          } >> "$GITHUB_STEP_SUMMARY"
+
+          if [ $UNREACHABLE_COUNT -gt 0 ]; then
+            echo "::warning::$UNREACHABLE_COUNT staging tenant(s) unreachable post-redeploy. Likely benign teardown race — CP healthz monitor catches real outages."
+          fi
+
+          # Belt-and-suspenders sanity floor: if MORE than half the fleet is
+          # unreachable AND the fleet is large enough that "half down" is
+          # statistically meaningful, this is a real outage (e.g. new image
+          # crashes on startup), not a teardown race. Hard-fail.
+          #
+          # Floor only applies when TOTAL_VERIFIED >= 4 — below that, the
+          # canary-verify step is the actual gate for "all tenants down"
+          # detection (it runs against the canary first and aborts the
+          # rollout if the canary fails to come up). Without the >=4 gate,
+          # a 1-tenant fleet (e.g. a single ephemeral e2e-* tenant on a
+          # quiet staging push) would re-flake on the exact teardown-race
+          # condition #2402 fixed: 1 of 1 unreachable = 100% > 50% → fail.
+          TOTAL_VERIFIED=${#SLUGS[@]}
+          if [ $TOTAL_VERIFIED -ge 4 ] && [ $UNREACHABLE_COUNT -gt $((TOTAL_VERIFIED / 2)) ]; then
+            echo "::error::$UNREACHABLE_COUNT of $TOTAL_VERIFIED staging tenant(s) unreachable — exceeds 50% threshold on a fleet large enough that this signals a real outage, not teardown race."
+            exit 1
+          fi
+
+          if [ $STALE_COUNT -gt 0 ]; then
+            echo "::error::$STALE_COUNT staging tenant(s) returned a stale SHA. ssm_status=Success was misleading — see job summary."
+            exit 1
+          fi
+
+          echo "::notice::Staging tenant fleet redeploy complete — all reachable tenants on ${EXPECTED_SHA:0:7} (${UNREACHABLE_COUNT} unreachable, soft-warned)."
@@ -23,53 +23,88 @@ name: Runtime PR-Built Compatibility
 #
 # By building from the PR's source and smoke-importing THAT wheel, we
 # fail at PR-time instead of after publish.
+#
+# Required-check shape (2026-05-01): the workflow runs on EVERY push +
+# PR + merge_group event with no top-level `paths:` filter, then uses a
+# detect-changes job + per-step `if:` gates inside ONE always-running
+# job named `PR-built wheel + import smoke`. PRs that don't touch
+# wheel-relevant paths get a no-op SUCCESS check run, satisfying branch
+# protection without re-running the heavy build. Same pattern as
+# e2e-api.yml — see its comment for the full rationale + the 2026-04-29
+# PR #2264 incident that motivated the always-run-with-if-gates shape.

 on:
  push:
    branches: [main, staging]
-    paths:
-      # Broad filter: this workflow's verdict can change whenever any
-      # workspace/ source file changes (because the wheel we build is
-      # produced from those files), or when the build script itself
-      # changes (it controls the wheel layout).
-      - 'workspace/**'
-      - 'scripts/build_runtime_package.py'
-      - '.github/workflows/runtime-prbuild-compat.yml'
  pull_request:
    branches: [main, staging]
-    paths:
-      - 'workspace/**'
-      - 'scripts/build_runtime_package.py'
-      - '.github/workflows/runtime-prbuild-compat.yml'
  workflow_dispatch:
-  # Required-check support: when this becomes a branch-protection gate,
-  # merge_group runs let the queue green-check this in addition to PRs.
  merge_group:
    types: [checks_requested]
-  # No cron: the same pre-merge run already covered the commit, and
-  # re-running daily wouldn't surface anything new (workspace/ doesn't
-  # change between cron firings unless a PR already passed this gate).

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.event.pull_request.head.sha || github.sha }}
  cancel-in-progress: true

 jobs:
+  detect-changes:
+    runs-on: ubuntu-latest
+    outputs:
+      wheel: ${{ steps.decide.outputs.wheel }}
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+      - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1
+        id: filter
+        with:
+          filters: |
+            wheel:
+              - 'workspace/**'
+              - 'scripts/build_runtime_package.py'
+              - 'scripts/wheel_smoke.py'
+              - '.github/workflows/runtime-prbuild-compat.yml'
+      - id: decide
+        # Always run real work for manual dispatch + merge_group — no
+        # diff-against-base in those contexts, and the gate exists to
+        # validate the to-be-merged state regardless of which paths it
+        # touched (paths-filter would default to "no changes" which is
+        # the wrong answer when the queue is composing many PRs).
+        run: |
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ] || [ "${{ github.event_name }}" = "merge_group" ]; then
+            echo "wheel=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "wheel=${{ steps.filter.outputs.wheel }}" >> "$GITHUB_OUTPUT"
+          fi
+
+  # ONE job (no job-level `if:`) that always runs and reports under the
+  # required-check name `PR-built wheel + import smoke`. Real work is
+  # gated per-step on `needs.detect-changes.outputs.wheel`. Same shape
+  # as e2e-api.yml's e2e-api job — see its comment block for the full
+  # rationale (SKIPPED check runs block branch protection even with
+  # SUCCESS siblings; collapsing to one always-run job emits exactly
+  # one SUCCESS check run).
  local-build-install:
-    # Builds the wheel from THIS PR's workspace/ + scripts/ and tests
-    # IT — the artifact that WOULD be published if this PR merges.
+    needs: detect-changes
    name: PR-built wheel + import smoke
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
-      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+      - name: No-op pass (paths filter excluded this commit)
+        if: needs.detect-changes.outputs.wheel != 'true'
+        run: |
+          echo "No workspace/ / scripts/{build_runtime_package,wheel_smoke}.py / workflow changes — wheel gate satisfied without rebuilding."
+          echo "::notice::PR-built wheel + import smoke no-op pass (paths filter excluded this commit)."
+      - if: needs.detect-changes.outputs.wheel == 'true'
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+      - if: needs.detect-changes.outputs.wheel == 'true'
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: '3.11'
          cache: pip
          cache-dependency-path: workspace/requirements.txt
      - name: Install build tooling
+        if: needs.detect-changes.outputs.wheel == 'true'
        run: pip install build
      - name: Build wheel from PR source (mirrors publish-runtime.yml)
+        if: needs.detect-changes.outputs.wheel == 'true'
        # Use a fixed test version so the wheel filename is predictable.
        # Doesn't reach PyPI — this build is local-only for the smoke.
        # Use the SAME build script with the SAME args as
@@ -86,6 +121,7 @@ jobs:
            --out /tmp/runtime-build
          cd /tmp/runtime-build && python -m build
      - name: Install built wheel + workspace requirements
+        if: needs.detect-changes.outputs.wheel == 'true'
        run: |
          python -m venv /tmp/venv-built
          /tmp/venv-built/bin/pip install --upgrade pip
@@ -94,7 +130,10 @@ jobs:
          /tmp/venv-built/bin/pip show molecule-ai-workspace-runtime a2a-sdk \
            | grep -E '^(Name|Version):'
      - name: Smoke import the PR-built wheel
-        env:
-          WORKSPACE_ID: 00000000-0000-0000-0000-000000000001
+        if: needs.detect-changes.outputs.wheel == 'true'
+        # Same script publish-runtime.yml runs against the to-be-PyPI wheel.
+        # Closes the PR-time vs publish-time gap: a PR adding a new SDK
+        # call-shape no longer passes here (narrow `import main_sync`) only
+        # to fail post-merge in publish-runtime's broader smoke.
        run: |
-          /tmp/venv-built/bin/python -c "from molecule_runtime.main import main_sync; print('PR-built runtime imports OK')"
+          /tmp/venv-built/bin/python "$GITHUB_WORKSPACE/scripts/wheel_smoke.py"
@@ -0,0 +1,112 @@
+name: Sweep stale Cloudflare Tunnels
+
+# Janitor for Cloudflare Tunnels whose backing tenant no longer
+# exists. Parallel-shape to sweep-cf-orphans.yml (which sweeps DNS
+# records); same justification, different CF resource.
+#
+# Why this exists separately from sweep-cf-orphans:
+#   - DNS records live on the zone (`/zones/<id>/dns_records`).
+#   - Tunnels live on the account (`/accounts/<id>/cfd_tunnel`).
+#   - Different CF API surface, different scopes; the existing CF
+#     token might not have `account:cloudflare_tunnel:edit`. Splitting
+#     the workflows keeps each one's secret-presence gate independent
+#     so neither silent-skips when the other's secret is missing.
+#   - Cleaner blast radius — operators can disable one without the
+#     other if a regression surfaces.
+#
+# Safety: the script's MAX_DELETE_PCT gate (default 90% — higher than
+# the DNS sweep's 50% because tenant-shaped tunnels are mostly
+# orphans by design) refuses to nuke past the threshold.
+
+on:
+  schedule:
+    # Hourly at :45 — offset from sweep-cf-orphans (:15) so the two
+    # janitors don't issue parallel CF API bursts at the same minute.
+    - cron: '45 * * * *'
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: "Dry run only — list what would be deleted, no deletion"
+        required: false
+        type: boolean
+        default: true
+      max_delete_pct:
+        description: "Override safety gate (default 90, set higher only for major cleanup)"
+        required: false
+        default: "90"
+
+# Don't let two sweeps race the same account.
+concurrency:
+  group: sweep-cf-tunnels
+  cancel-in-progress: false
+
+permissions:
+  contents: read
+
+jobs:
+  sweep:
+    name: Sweep CF tunnels
+    runs-on: ubuntu-latest
+    # 5 min surfaces hangs (CF API stall, slow pagination on busy
+    # accounts). Realistic worst case is ~3 min: 2 CP curls + N CF
+    # list pages + N×CF-DELETE, each capped at 10-15s by curl -m.
+    timeout-minutes: 5
+    env:
+      CF_API_TOKEN: ${{ secrets.CF_API_TOKEN }}
+      CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
+      CP_PROD_ADMIN_TOKEN: ${{ secrets.CP_PROD_ADMIN_TOKEN }}
+      CP_STAGING_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_TOKEN }}
+      MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '90' }}
+
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+
+      - name: Verify required secrets present
+        id: verify
+        # Schedule-vs-dispatch behaviour split mirrors sweep-cf-orphans
+        # (hardened 2026-04-28 after the silent-no-op incident: the
+        # janitor reported green while doing nothing because secrets
+        # were unset, masking a 152/200 zone-record leak). Same
+        # principle applies here:
+        #   - schedule → exit 1 on missing secrets (red CI surfaces it)
+        #   - workflow_dispatch → exit 0 with warning (operator-driven,
+        #     they already accepted the repo state)
+        run: |
+          missing=()
+          for var in CF_API_TOKEN CF_ACCOUNT_ID CP_PROD_ADMIN_TOKEN CP_STAGING_ADMIN_TOKEN; do
+            if [ -z "${!var:-}" ]; then
+              missing+=("$var")
+            fi
+          done
+          if [ ${#missing[@]} -gt 0 ]; then
+            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+              echo "::warning::skipping sweep — secrets not configured: ${missing[*]}"
+              echo "::warning::set them at Settings → Secrets and Variables → Actions, then rerun."
+              echo "::warning::CF_API_TOKEN must include account:cloudflare_tunnel:edit scope (separate from the zone:dns:edit scope used by sweep-cf-orphans)."
+              echo "skip=true" >> "$GITHUB_OUTPUT"
+              exit 0
+            fi
+            echo "::error::sweep cannot run — required secrets missing: ${missing[*]}"
+            echo "::error::set them at Settings → Secrets and Variables → Actions, or disable this workflow."
+            echo "::error::CF_API_TOKEN must include account:cloudflare_tunnel:edit scope."
+            exit 1
+          fi
+          echo "All required secrets present ✓"
+          echo "skip=false" >> "$GITHUB_OUTPUT"
+
+      - name: Run sweep
+        if: steps.verify.outputs.skip != 'true'
+        # Schedule-vs-dispatch dry-run asymmetry mirrors sweep-cf-orphans:
+        #   - Scheduled: input empty → "false" → --execute (the whole
+        #     point of an hourly janitor).
+        #   - Manual workflow_dispatch: input default true → dry-run;
+        #     operator must flip it to actually delete.
+        run: |
+          set -euo pipefail
+          if [ "${{ github.event.inputs.dry_run || 'false' }}" = "true" ]; then
+            echo "Running in dry-run mode — no deletions"
+            bash scripts/ops/sweep-cf-tunnels.sh
+          else
+            echo "Running with --execute — will delete identified orphans"
+            bash scripts/ops/sweep-cf-tunnels.sh --execute
+          fi
@@ -1,19 +1,27 @@
 name: Ops Scripts Tests

-# Runs the unittest suite for scripts/ops/ on every PR + push that touches
-# the directory. Kept separate from the main CI so a script-only change
-# doesn't trigger the heavier Go/Canvas/Python pipelines.
+# Runs the unittest suite for scripts/ on every PR + push that touches
+# anything under scripts/. Kept separate from the main CI so a script-only
+# change doesn't trigger the heavier Go/Canvas/Python pipelines.
+#
+# Discovery layout: tests sit alongside the code they test (see
+# scripts/ops/test_sweep_cf_decide.py for the pattern; scripts/
+# test_build_runtime_package.py for the rewriter coverage). The job
+# below runs `unittest discover` TWICE — once from `scripts/`, once
+# from `scripts/ops/` — because neither dir has an `__init__.py`, so
+# a single discover from `scripts/` doesn't recurse into the ops
+# subdir. Two passes is simpler than retrofitting namespace packages.

 on:
  push:
    branches: [main, staging]
    paths:
-      - 'scripts/ops/**'
+      - 'scripts/**'
      - '.github/workflows/test-ops-scripts.yml'
  pull_request:
    branches: [main, staging]
    paths:
-      - 'scripts/ops/**'
+      - 'scripts/**'
      - '.github/workflows/test-ops-scripts.yml'
  merge_group:
    types: [checks_requested]
@@ -31,6 +39,14 @@ jobs:
      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: '3.11'
-      - name: Run unittest
+      - name: Run scripts/ unittests (build_runtime_package, …)
+        # Top-level scripts/ tests live alongside their target file
+        # (e.g. scripts/test_build_runtime_package.py exercises
+        # scripts/build_runtime_package.py). discover from scripts/
+        # picks up only top-level test_*.py because scripts/ops/ has
+        # no __init__.py — that's intentional, so we run two passes.
+        working-directory: scripts
+        run: python -m unittest discover -t . -p 'test_*.py' -v
+      - name: Run scripts/ops/ unittests (sweep_cf_decide, …)
        working-directory: scripts/ops
        run: python -m unittest discover -p 'test_*.py' -v
@@ -146,3 +146,4 @@ backups/
 *-temp.txt
 /test-pmm-*.txt
 /tick-reflections-*.md
+tests/harness/cp-stub/cp-stub
@@ -53,6 +53,29 @@ cp .env.example .env

 See `CLAUDE.md` for a full list of environment variables and their purposes.

+## What goes where (content vs code)
+
+This repo is scoped to **code** (canvas, workspace, workspace-server, related
+infra). Public content (blog posts, marketing copy, OG images, SEO briefs,
+DevRel demos) lives in [`Molecule-AI/docs`](https://github.com/Molecule-AI/docs).
+The `Block forbidden paths` CI gate fails any PR that writes to `marketing/`
+or other removed paths — open against `Molecule-AI/docs` instead.
+
+| Content type | Target |
+|---|---|
+| Blog posts | `Molecule-AI/docs` → `content/blog/<YYYY-MM-DD-slug>/` |
+| Doc pages | `Molecule-AI/docs` → `content/docs/` |
+| Marketing copy / PMM positioning | `Molecule-AI/docs` → `marketing/` |
+| OG images, visual assets | `Molecule-AI/docs` → `app/` or `marketing/` |
+| SEO briefs | `Molecule-AI/docs` → `marketing/` |
+| DevRel demos (runnable code) | Standalone repo under `Molecule-AI/`, OR embedded in `Molecule-AI/docs` |
+| Launch checklists, internal tracking | GitHub Issues — **not** committed files |
+| Engineering docs (`docs/adr/`, `docs/architecture/`, `docs/incidents/`) | This repo (internal, not published) |
+| Live product pages (e.g. `canvas/src/app/pricing/page.tsx`) | This repo (these are app code, not marketing copy) |
+
+If a PR fails the `Block forbidden paths` check, the contents belong in
+`Molecule-AI/docs`. No CI drag, no Canvas E2E, content lands in minutes.
+
 ## Development Workflow

 ### Branch Naming
@@ -152,6 +175,17 @@ and run CI manually.
 - Type hints on public functions
 - pytest for all tests

+## External integrations
+
+Code in this repo lands in molecule-core. Some related runtime artifacts
+live in their own repos:
+
+- [`Molecule-AI/molecule-ai-workspace-runtime`](https://github.com/Molecule-AI/molecule-ai-workspace-runtime) — Python adapter SDK (`molecule_runtime`) that runs inside containerized Molecule workspaces. Bridges Claude Code SDK / hermes / langgraph / etc. → A2A queue.
+- [`Molecule-AI/molecule-sdk-python`](https://github.com/Molecule-AI/molecule-sdk-python) — `A2AServer` + `RemoteAgentClient` for external agents that register over the public `/registry/register` flow.
+- [`Molecule-AI/molecule-mcp-claude-channel`](https://github.com/Molecule-AI/molecule-mcp-claude-channel) — Claude Code channel plugin. Bridges A2A traffic into a running Claude Code session via MCP `notifications/claude/channel`. Polling-based (no tunnel required); install with `claude --channels plugin:molecule@Molecule-AI/molecule-mcp-claude-channel`.
+
+When extending the **A2A surface** in molecule-core (`workspace-server/internal/handlers/a2a_proxy.go` etc.), consider whether the change has a downstream impact on the runtime SDK or the channel plugin — they're versioned independently but share the wire shape.
+
 ## Architecture Overview

 See `CLAUDE.md` for detailed architecture documentation, including:
@@ -39,8 +39,8 @@
  <a href="./docs/agent-runtime/workspace-runtime.md"><strong>Workspace Runtime</strong></a>
 </p>

-[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/new/template?template=https://github.com/Molecule-AI/molecule-core)
-[![Deploy to Render](https://render.com/images/deploy-to-render-button.svg)](https://render.com/deploy?repo=https://github.com/Molecule-AI/molecule-core)
+[![Deploy on Railway](https://railway.app/button.svg)](https://railway.app/new/template?template=https://github.com/Molecule-AI/molecule-monorepo)
+[![Deploy to Render](https://render.com/images/deploy-to-render-button.svg)](https://render.com/deploy?repo=https://github.com/Molecule-AI/molecule-monorepo)

 </div>

@@ -249,8 +249,8 @@ Workspace Runtime (Python image with adapters)
 ## Quick Start

 ```bash
-git clone https://github.com/Molecule-AI/molecule-core.git
-cd molecule-core
+git clone https://github.com/Molecule-AI/molecule-monorepo.git
+cd molecule-monorepo

 cp .env.example .env
 # Defaults boot the stack locally out of the box. See .env.example for
@@ -111,6 +111,20 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
  const adminAuth = { Authorization: `Bearer ${ADMIN_TOKEN}` };
  console.log(`[staging-setup] Using slug=${slug}`);

+  // Write the state file FIRST, before any CP call. Teardown (both
+  // Playwright globalTeardown and the workflow safety-net) reads this
+  // file to identify the slug it must clean up. If we wait until the
+  // end of setup to write it (the previous behavior), a crash during
+  // any of steps 1-6 leaves the org orphaned in CP with no record on
+  // disk — forcing the workflow safety-net into a pattern-sweep over
+  // every `e2e-canvas-<date>-*` org, which races with concurrent
+  // canvas-E2E runs and deletes their live tenants. Race observed
+  // 2026-04-30 on PR #2264 staging→main: three real-test runs killed
+  // each other's tenants mid-test, surfacing as `getaddrinfo ENOTFOUND`
+  // when CP cleaned up the just-deleted DNS record.
+  const stateFile = join(process.cwd(), ".playwright-staging-state.json");
+  writeFileSync(stateFile, JSON.stringify({ slug }, null, 2));
+
  // 1. Create org via admin endpoint — no WorkOS session needed
  const create = await jsonFetch(`${CP_URL}/cp/admin/orgs`, {
    method: "POST",
@@ -245,8 +259,8 @@ export default async function globalSetup(_config: FullConfig): Promise<void> {
  );
  console.log(`[staging-setup] Workspace online`);

-  // 7. Hand state off to tests + teardown
-  const stateFile = join(process.cwd(), ".playwright-staging-state.json");
+  // 7. Hand state off to tests + teardown — overwrite the slug-only
+  // bootstrap state with the full state spec tests need.
  writeFileSync(
    stateFile,
    JSON.stringify({ slug, tenantURL, workspaceId, tenantToken }, null, 2),
@@ -24,7 +24,11 @@ export default async function globalTeardown(): Promise<void> {

  const stateFile = join(process.cwd(), ".playwright-staging-state.json");
  if (!existsSync(stateFile)) {
-    console.warn("[staging-teardown] no state file — setup must have failed before org create; nothing to tear down");
+    // staging-setup writes this file as its first action, before any
+    // CP call. Missing here means setup never ran (CANVAS_E2E_STAGING
+    // unset, or ran in a different cwd) — there's no slug we created
+    // that needs cleaning up.
+    console.warn("[staging-teardown] no state file — nothing to tear down");
    return;
  }

@@ -0,0 +1,48 @@
+/**
+ * Canvas /api/buildinfo — version-display endpoint mirroring
+ * workspace-server's /buildinfo. Lets `curl <url>/api/buildinfo`
+ * confirm which git SHA is live on a canvas deployment.
+ */
+import { describe, it, expect, beforeEach, afterEach } from "vitest";
+import { GET } from "../route";
+
+const ENV_KEYS = ["VERCEL_GIT_COMMIT_SHA", "VERCEL_GIT_COMMIT_REF", "VERCEL_ENV"];
+
+describe("GET /api/buildinfo", () => {
+  let saved: Record<string, string | undefined>;
+
+  beforeEach(() => {
+    saved = Object.fromEntries(ENV_KEYS.map((k) => [k, process.env[k]]));
+    for (const k of ENV_KEYS) delete process.env[k];
+  });
+
+  afterEach(() => {
+    for (const k of ENV_KEYS) {
+      if (saved[k] === undefined) delete process.env[k];
+      else process.env[k] = saved[k];
+    }
+  });
+
+  it("returns dev sentinel when Vercel env vars are unset", async () => {
+    const res = await GET();
+    const body = await res.json();
+    expect(body).toEqual({ git_sha: "dev", git_ref: "", vercel_env: "local" });
+  });
+
+  it("reports the SHA Vercel injected at build time", async () => {
+    process.env.VERCEL_GIT_COMMIT_SHA = "abc1234567890";
+    process.env.VERCEL_GIT_COMMIT_REF = "main";
+    process.env.VERCEL_ENV = "production";
+    const res = await GET();
+    const body = await res.json();
+    expect(body.git_sha).toBe("abc1234567890");
+    expect(body.git_ref).toBe("main");
+    expect(body.vercel_env).toBe("production");
+  });
+
+  it("returns 200 status and JSON content type", async () => {
+    const res = await GET();
+    expect(res.status).toBe(200);
+    expect(res.headers.get("content-type")).toContain("application/json");
+  });
+});
@@ -0,0 +1,18 @@
+import { NextResponse } from "next/server";
+
+// Mirror of workspace-server's GET /buildinfo (PR #2398). Lets a developer
+// confirm which git SHA is live on a canvas deployment with the same
+// `curl <url>/buildinfo` flow they use against tenant workspaces.
+//
+// Vercel injects VERCEL_GIT_COMMIT_SHA / _REF / VERCEL_ENV at build time
+// from the deploying commit; outside Vercel (local `next dev`, harness)
+// these are unset and the endpoint reports `git_sha: "dev"`. Same sentinel
+// the workspace-server uses pre-ldflags-injection so both surfaces speak
+// the same vocabulary.
+export async function GET() {
+  return NextResponse.json({
+    git_sha: process.env.VERCEL_GIT_COMMIT_SHA ?? "dev",
+    git_ref: process.env.VERCEL_GIT_COMMIT_REF ?? "",
+    vercel_env: process.env.VERCEL_ENV ?? "local",
+  });
+}
@@ -12,6 +12,19 @@ interface WorkspaceOption {
  tier: number;
 }

+// Subset of the /templates row used here. Mirrors the shape ConfigTab
+// reads. `providers` is the per-template declarative list of supported
+// LLM providers — sourced from the template's
+// runtime_config.providers (config.yaml). When present, it filters
+// the modal's provider <select> so an operator can only pick a
+// provider the template actually supports.
+interface TemplateSpec {
+  id: string;
+  name?: string;
+  runtime?: string;
+  providers?: string[];
+}
+
 interface HermesProvider {
  id: string;
  label: string;
@@ -55,6 +68,13 @@ export function CreateWorkspaceButton() {
  const [creating, setCreating] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [workspaces, setWorkspaces] = useState<WorkspaceOption[]>([]);
+  // Templates fetched from /api/templates — drives the dynamic provider
+  // filter below. Same data source ConfigTab uses (PR #2454). When the
+  // selected template declares `runtime_config.providers` in its
+  // config.yaml, the modal surfaces only those providers in the
+  // <select>. Empty/missing list falls back to the full HERMES_PROVIDERS
+  // catalog so older templates without the field keep working.
+  const [templateSpecs, setTemplateSpecs] = useState<TemplateSpec[]>([]);
  // External-runtime path: skip docker provision, mint a workspace_auth_token,
  // and surface the connection snippet in a modal after create. When
  // isExternal is true the template / model / hermes-provider fields are
@@ -130,6 +150,52 @@ export function CreateWorkspaceButton() {

  const isHermes = template.trim().toLowerCase() === "hermes";

+  // Resolve the selected template's spec from the /templates response.
+  // The `template` input is free-text; templates can be matched by id,
+  // name, or runtime so any of those work. Lower-cased compare keeps
+  // "Hermes" / "hermes" / "HERMES" interchangeable.
+  const selectedTemplateSpec = useMemo<TemplateSpec | null>(() => {
+    const t = template.trim().toLowerCase();
+    if (!t) return null;
+    return (
+      templateSpecs.find(
+        (s) =>
+          (s.id || "").toLowerCase() === t ||
+          (s.name || "").toLowerCase() === t ||
+          (s.runtime || "").toLowerCase() === t,
+      ) ?? null
+    );
+  }, [template, templateSpecs]);
+
+  // Filter HERMES_PROVIDERS by what the template declares it supports.
+  // Empty/missing declared list → fall back to the full catalog so
+  // templates that haven't migrated to the explicit `providers:` field
+  // (and self-hosted setups without /templates) keep working unchanged.
+  const availableProviders = useMemo<HermesProvider[]>(() => {
+    const declared = selectedTemplateSpec?.providers;
+    if (!declared || declared.length === 0) return HERMES_PROVIDERS;
+    const allowed = new Set(declared.map((p) => p.toLowerCase()));
+    const filtered = HERMES_PROVIDERS.filter((p) => allowed.has(p.id.toLowerCase()));
+    // Defensive: if the template's declared list doesn't match anything
+    // in our static catalog (e.g. brand-new provider id we don't have
+    // metadata for yet), fall back to the full list rather than render
+    // an empty <select>. Better to over-show than to lock the user out.
+    return filtered.length > 0 ? filtered : HERMES_PROVIDERS;
+  }, [selectedTemplateSpec]);
+
+  // If the currently-selected provider is filtered out by a template
+  // change, snap back to the first available. Without this, the
+  // hermesProvider state could refer to a provider not in the dropdown
+  // — confusing UI + the API key field's envVar would be wrong.
+  useEffect(() => {
+    if (!isHermes) return;
+    if (availableProviders.length === 0) return;
+    if (!availableProviders.some((p) => p.id === hermesProvider)) {
+      setHermesProvider(availableProviders[0].id);
+    }
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [availableProviders, isHermes]);
+
  // Auto-fill hermesModel with the provider's defaultModel whenever the
  // provider changes, but only if the user hasn't already typed their own
  // slug. Prevents the empty-model → "auto" → Anthropic-default 401 trap.
@@ -163,6 +229,10 @@ export function CreateWorkspaceButton() {
      .get<WorkspaceOption[]>("/workspaces")
      .then((ws) => setWorkspaces(ws))
      .catch(() => {});
+    api
+      .get<TemplateSpec[]>("/templates")
+      .then((rows) => setTemplateSpecs(Array.isArray(rows) ? rows : []))
+      .catch(() => { /* keep empty — HERMES_PROVIDERS fallback below */ });
    // defaultTier is stable for the session (derived from window.location),
    // safe to omit from deps.
    // eslint-disable-next-line react-hooks/exhaustive-deps
@@ -405,7 +475,7 @@ export function CreateWorkspaceButton() {
                  aria-label="Hermes provider"
                  className="w-full bg-zinc-800/60 border border-zinc-700/50 rounded-lg px-3 py-2 text-sm text-zinc-100 focus:outline-none focus:border-violet-500/60 focus:ring-1 focus:ring-violet-500/20 transition-colors"
                >
-                  {HERMES_PROVIDERS.map((p) => (
+                  {availableProviders.map((p) => (
                    <option key={p.id} value={p.id}>
                      {p.label}
                    </option>
@@ -1,3 +1,5 @@
+'use client';
+
 // ExternalConnectModal — shown once after creating a runtime="external"
 // workspace. Surfaces the workspace_auth_token + ready-to-paste snippets
 // so the operator can hand them to whoever runs their off-host agent
@@ -24,6 +26,20 @@ export interface ExternalConnectionInfo {
  heartbeat_endpoint: string;
  curl_register_template: string;
  python_snippet: string;
+  // Claude Code channel plugin snippet — for operators whose external
+  // agent IS a Claude Code session. Polling-based; no tunnel required.
+  // Optional in the type for backward compat with platforms that
+  // haven't shipped molecule-core PR #2304 yet (older response payload
+  // omits the field; tab is hidden if empty).
+  claude_code_channel_snippet?: string;
+  // Universal MCP snippet — runtime-agnostic outbound tool path via
+  // the `molecule-mcp` console script in the
+  // molecule-ai-workspace-runtime PyPI wheel. Works with any MCP-aware
+  // agent runtime (Claude Code, hermes, codex, third-party). Outbound-
+  // only: pair with claude_code_channel or python tabs for heartbeat
+  // + inbound. Optional for backward compat with platforms that
+  // haven't shipped PR #2413 yet.
+  universal_mcp_snippet?: string;
 }

 interface Props {
@@ -31,10 +47,14 @@ interface Props {
  onClose: () => void;
 }

-type Tab = "python" | "curl" | "fields";
+type Tab = "python" | "curl" | "claude" | "mcp" | "fields";

 export function ExternalConnectModal({ info, onClose }: Props) {
-  const [tab, setTab] = useState<Tab>("python");
+  // Default to Claude Code when the platform offers it — that's the
+  // newest + simplest path (no tunnel needed). Falls back to Python
+  // for older platform builds that don't ship the snippet.
+  const initialTab: Tab = info?.claude_code_channel_snippet ? "claude" : "python";
+  const [tab, setTab] = useState<Tab>(initialTab);
  const [copiedKey, setCopiedKey] = useState<string | null>(null);

  const copy = useCallback(async (value: string, key: string) => {
@@ -70,6 +90,24 @@ export function ExternalConnectModal({ info, onClose }: Props) {
    'WORKSPACE_AUTH_TOKEN="<paste from create response>"',
    `WORKSPACE_AUTH_TOKEN="${info.auth_token}"`,
  );
+  // The channel snippet asks the operator to paste the auth_token into
+  // the .env file's MOLECULE_WORKSPACE_TOKENS field. Stamp it server-side
+  // here so the copy-paste-block is truly ready-to-run.
+  const filledChannel = info.claude_code_channel_snippet?.replace(
+    'MOLECULE_WORKSPACE_TOKENS=<paste auth_token from create response>',
+    `MOLECULE_WORKSPACE_TOKENS=${info.auth_token}`,
+  );
+  // Universal MCP snippet uses MOLECULE_WORKSPACE_TOKEN as the env-var
+  // name passed through to molecule-mcp via `claude mcp add ... -- env
+  // MOLECULE_WORKSPACE_TOKEN=...`. The placeholder must match the
+  // template's literal — pre-2026-04-30 polish this looked for
+  // WORKSPACE_AUTH_TOKEN (carryover from the curl tab), which silently
+  // skipped the substitution and left "<paste from create response>"
+  // visible in the operator's clipboard.
+  const filledUniversalMcp = info.universal_mcp_snippet?.replace(
+    'MOLECULE_WORKSPACE_TOKEN="<paste from create response>"',
+    `MOLECULE_WORKSPACE_TOKEN="${info.auth_token}"`,
+  );

  return (
    <Dialog.Root open onOpenChange={(o) => !o && onClose()}>
@@ -91,7 +129,19 @@ export function ExternalConnectModal({ info, onClose }: Props) {
            aria-label="Connection snippet format"
            className="mt-4 flex gap-1 border-b border-zinc-800"
          >
-            {(["python", "curl", "fields"] as Tab[]).map((t) => (
+            {(() => {
+              // Build the tab order dynamically. Claude Code first
+              // (when offered) since it's the simplest setup; Python
+              // SDK second (full register+heartbeat+inbound); Universal
+              // MCP third (any MCP-aware runtime, outbound-only); curl
+              // for one-shot register; Fields for raw values.
+              const tabs: Tab[] = [];
+              if (filledChannel) tabs.push("claude");
+              tabs.push("python");
+              if (filledUniversalMcp) tabs.push("mcp");
+              tabs.push("curl", "fields");
+              return tabs;
+            })().map((t) => (
              <button
                key={t}
                type="button"
@@ -104,17 +154,34 @@ export function ExternalConnectModal({ info, onClose }: Props) {
                    : "border-transparent text-zinc-500 hover:text-zinc-300"
                }`}
              >
-                {t === "python" ? "Python SDK" : t === "curl" ? "curl" : "Fields"}
+                {t === "claude"
+                  ? "Claude Code"
+                  : t === "python"
+                  ? "Python SDK"
+                  : t === "mcp"
+                  ? "Universal MCP"
+                  : t === "curl"
+                  ? "curl"
+                  : "Fields"}
              </button>
            ))}
          </div>

          {/* Snippet area */}
          <div className="mt-3">
+            {tab === "claude" && filledChannel && (
+              <SnippetBlock
+                value={filledChannel}
+                label="Claude Code channel — polls workspace's A2A; no tunnel needed"
+                copyKey="claude"
+                copied={copiedKey === "claude"}
+                onCopy={() => copy(filledChannel, "claude")}
+              />
+            )}
            {tab === "python" && (
              <SnippetBlock
                value={filledPython}
-                label="Python (recommended — includes heartbeat loop)"
+                label="Python SDK — includes heartbeat loop (push-mode, needs public URL)"
                copyKey="python"
                copied={copiedKey === "python"}
                onCopy={() => copy(filledPython, "python")}
@@ -129,6 +196,15 @@ export function ExternalConnectModal({ info, onClose }: Props) {
                onCopy={() => copy(filledCurl, "curl")}
              />
            )}
+            {tab === "mcp" && filledUniversalMcp && (
+              <SnippetBlock
+                value={filledUniversalMcp}
+                label="Universal MCP — standalone register + heartbeat + tools for any MCP-aware runtime (Claude Code, hermes, codex). Pair with Python or Claude Code tab if you need inbound A2A delivery."
+                copyKey="mcp"
+                copied={copiedKey === "mcp"}
+                onCopy={() => copy(filledUniversalMcp, "mcp")}
+              />
+            )}
            {tab === "fields" && (
              <div className="space-y-2">
                <Field label="workspace_id" value={info.workspace_id} onCopy={() => copy(info.workspace_id, "wsid")} copied={copiedKey === "wsid"} />
@@ -16,14 +16,35 @@ interface Props {
  /** Runtime slug — used only for the "The <runtime> runtime …"
   *  headline; behavior is driven by providers/missingKeys. */
  runtime: string;
-  /** Called when all required keys for the chosen provider are saved. */
-  onKeysAdded: () => void;
+  /** Called when all required keys for the chosen provider are saved.
+   *  Receives the model slug if the modal collected one (template-deploy
+   *  flow); legacy callers ignore it. */
+  onKeysAdded: (model?: string) => void;
  /** Called when the user cancels the deploy. */
  onCancel: () => void;
  /** Optional — open the Settings Panel (Config tab → Secrets). */
  onOpenSettings?: () => void;
  /** If provided, secrets save at workspace scope instead of global. */
  workspaceId?: string;
+  /** Set of env var names already configured in the relevant scope
+   *  (global or workspace). When provided, entries whose key is already
+   *  in this set start as `saved: true` so the user can confirm without
+   *  re-entering. Used by the template-deploy "always ask" flow so a
+   *  user can pick a different provider even when global env covers
+   *  the default one. */
+  configuredKeys?: Set<string>;
+  /** Model slug suggestions (datalist) — populated from the template's
+   *  models[]. When non-empty the picker renders a model input above
+   *  the API-key fields. The picker passes the entered slug back via
+   *  onKeysAdded. */
+  modelSuggestions?: string[];
+  /** Pre-fill the model input. */
+  initialModel?: string;
+  /** Override the modal's title + description copy. The default
+   *  "Missing API Keys" title misreads when the modal is opened to
+   *  pick provider/model with keys already configured. */
+  title?: string;
+  description?: string;
 }

 interface KeyEntry {
@@ -60,6 +81,11 @@ export function MissingKeysModal({
  onCancel,
  onOpenSettings,
  workspaceId,
+  configuredKeys,
+  modelSuggestions,
+  initialModel,
+  title,
+  description,
 }: Props) {
  const pickerProviders = providers ?? [];
  const pickerMode = pickerProviders.length > 1;
@@ -74,6 +100,11 @@ export function MissingKeysModal({
        onCancel={onCancel}
        onOpenSettings={onOpenSettings}
        workspaceId={workspaceId}
+        configuredKeys={configuredKeys}
+        modelSuggestions={modelSuggestions}
+        initialModel={initialModel}
+        title={title}
+        description={description}
      />
    );
  }
@@ -108,17 +139,41 @@ function ProviderPickerModal({
  onCancel,
  onOpenSettings,
  workspaceId,
+  configuredKeys,
+  modelSuggestions,
+  initialModel,
+  title,
+  description,
 }: {
  open: boolean;
  providers: ProviderChoice[];
  runtime: string;
-  onKeysAdded: () => void;
+  onKeysAdded: (model?: string) => void;
  onCancel: () => void;
  onOpenSettings?: () => void;
  workspaceId?: string;
+  configuredKeys?: Set<string>;
+  modelSuggestions?: string[];
+  initialModel?: string;
+  title?: string;
+  description?: string;
 }) {
-  const [selectedId, setSelectedId] = useState(providers[0].id);
+  // Prefer the first provider whose env vars are already satisfied by
+  // the configured set — pre-selecting "the option the user already has
+  // keys for" matches expected UX. Falls back to providers[0] otherwise.
+  const initialSelected = useMemo(() => {
+    if (configuredKeys) {
+      const satisfied = providers.find((p) =>
+        p.envVars.every((k) => configuredKeys.has(k)),
+      );
+      if (satisfied) return satisfied.id;
+    }
+    return providers[0].id;
+  }, [providers, configuredKeys]);
+
+  const [selectedId, setSelectedId] = useState(initialSelected);
  const [entries, setEntries] = useState<KeyEntry[]>([]);
+  const [model, setModel] = useState(initialModel ?? "");
  const firstInputRef = useRef<HTMLInputElement>(null);

  const selected = useMemo(
@@ -126,10 +181,13 @@ function ProviderPickerModal({
    [providers, selectedId],
  );

+  const showModelInput = (modelSuggestions?.length ?? 0) > 0 || initialModel !== undefined;
+
  useEffect(() => {
    if (!open) return;
-    setSelectedId(providers[0].id);
-  }, [open, providers]);
+    setSelectedId(initialSelected);
+    setModel(initialModel ?? "");
+  }, [open, initialSelected, initialModel]);

  useEffect(() => {
    if (!open) return;
@@ -137,12 +195,15 @@ function ProviderPickerModal({
      selected.envVars.map((key) => ({
        key,
        value: "",
-        saved: false,
+        // Pre-mark as saved when the key is already in the configured
+        // set (global or workspace scope). Lets the user click Deploy
+        // without re-entering a key the platform already holds.
+        saved: configuredKeys?.has(key) ?? false,
        saving: false,
        error: null,
      })),
    );
-  }, [open, selected]);
+  }, [open, selected, configuredKeys]);

  useEffect(() => {
    if (!open) return;
@@ -243,16 +304,52 @@ function ProviderPickerModal({
              </svg>
            </div>
            <h3 id="missing-keys-title" className="text-sm font-semibold text-zinc-100">
-              Missing API Keys
+              {title ?? "Missing API Keys"}
            </h3>
          </div>
          <p className="text-[12px] text-zinc-400 leading-relaxed">
-            The <span className="text-amber-300 font-medium">{runtimeLabel}</span>{" "}
-            runtime supports multiple providers. Pick one and paste its API key.
+            {description ?? (
+              <>
+                The <span className="text-amber-300 font-medium">{runtimeLabel}</span>{" "}
+                runtime supports multiple providers. Pick one and paste its API key.
+              </>
+            )}
          </p>
        </div>

        <div className="px-5 py-4 space-y-3">
+          {showModelInput && (
+            <div>
+              <label
+                htmlFor="provider-picker-model-input"
+                className="text-[10px] uppercase tracking-wide text-zinc-500 font-semibold mb-1.5 block"
+              >
+                Model{" "}
+                <span aria-hidden="true" className="text-red-400">*</span>
+                <span className="sr-only"> (required)</span>
+              </label>
+              <input
+                id="provider-picker-model-input"
+                type="text"
+                value={model}
+                onChange={(e) => setModel(e.target.value)}
+                placeholder="e.g. minimax/MiniMax-M2.7"
+                aria-label="Model slug"
+                autoComplete="off"
+                spellCheck={false}
+                list="provider-picker-model-suggestions"
+                className="w-full bg-zinc-900 border border-zinc-600 rounded px-2 py-1.5 text-[11px] text-zinc-100 font-mono focus:outline-none focus:border-blue-500 focus:ring-1 focus:ring-blue-500/20 transition-colors"
+              />
+              <datalist id="provider-picker-model-suggestions">
+                {modelSuggestions?.map((m) => (
+                  <option key={m} value={m} />
+                ))}
+              </datalist>
+              <p className="text-[9px] text-zinc-500 mt-1 leading-relaxed">
+                Slug determines provider routing at install time.
+              </p>
+            </div>
+          )}
          <fieldset className="space-y-1.5">
            <legend className="text-[10px] uppercase tracking-wide text-zinc-500 font-semibold mb-1.5">
              Provider
@@ -364,8 +461,12 @@ function ProviderPickerModal({
              Cancel Deploy
            </button>
            <button
-              onClick={onKeysAdded}
-              disabled={!allSaved || anySaving}
+              onClick={() => onKeysAdded(showModelInput ? model.trim() : undefined)}
+              disabled={
+                !allSaved ||
+                anySaving ||
+                (showModelInput && model.trim() === "")
+              }
              className="px-3.5 py-1.5 text-[12px] bg-blue-600 hover:bg-blue-500 text-white rounded-lg transition-colors disabled:opacity-40"
            >
              {allSaved ? "Deploy" : entries.length > 1 ? "Add Keys" : "Add Key"}
@@ -190,6 +190,91 @@ describe("CreateWorkspaceDialog — Hermes provider picker", () => {
    expect(ids).toContain("hermes");
  });

+  // Pins the dynamic-providers behavior: when the matched template's
+  // /templates row declares `providers`, the dropdown filters to that
+  // subset instead of showing the full HERMES_PROVIDERS catalog. Same
+  // data source ConfigTab uses (PR #2454) — keeps the modal and the
+  // settings tab honest about which providers a template supports.
+  it("hermes provider dropdown filters to template-declared providers when /templates ships them", async () => {
+    // Per-URL mock: /workspaces returns the existing fixture, /templates
+    // returns a hermes row that only allows anthropic + minimax + openai.
+    mockGet.mockImplementation(async (url: string) => {
+      if (url === "/templates") {
+        return [
+          { id: "hermes", name: "Hermes", runtime: "hermes", providers: ["anthropic", "minimax", "openai"] },
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        ] as any;
+      }
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      return SAMPLE_WORKSPACES as any;
+    });
+
+    await openDialog();
+    await setTemplate("hermes");
+    await waitFor(() =>
+      expect(document.querySelector("[data-testid='hermes-provider-section']")).toBeTruthy()
+    );
+    const providerSelect = document.getElementById("hermes-provider-select") as HTMLSelectElement;
+    // Filtered list arrives async after /templates fetch resolves —
+    // keep waiting until the dropdown shrinks below the full catalog.
+    await waitFor(() => expect(providerSelect.options.length).toBe(3));
+    const ids = Array.from(providerSelect.options).map((o) => o.value);
+    expect(ids).toEqual(expect.arrayContaining(["anthropic", "minimax", "openai"]));
+    expect(ids).not.toContain("gemini");
+    expect(ids).not.toContain("deepseek");
+  });
+
+  // Back-compat: a template that hasn't migrated to runtime_config.providers
+  // (older templates, self-hosted setups without /templates server) keeps
+  // showing the full provider catalog. Operators picking from those
+  // templates can't be locked out of providers we know hermes supports.
+  it("hermes provider dropdown falls back to all providers when template declares no providers list", async () => {
+    mockGet.mockImplementation(async (url: string) => {
+      if (url === "/templates") {
+        // No `providers` field — empty/missing → fall back to full catalog.
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        return [{ id: "hermes", name: "Hermes", runtime: "hermes" }] as any;
+      }
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      return SAMPLE_WORKSPACES as any;
+    });
+
+    await openDialog();
+    await setTemplate("hermes");
+    await waitFor(() =>
+      expect(document.querySelector("[data-testid='hermes-provider-section']")).toBeTruthy()
+    );
+    const providerSelect = document.getElementById("hermes-provider-select") as HTMLSelectElement;
+    expect(providerSelect.options.length).toBe(HERMES_PROVIDERS.length);
+  });
+
+  // Defensive: a template's declared list with NO matches against our
+  // static catalog (e.g. a brand-new provider id we don't have label/
+  // envVar metadata for yet) must not render an empty <select> — the
+  // operator can't pick a provider, the form locks. Component falls
+  // back to the full catalog so the user can still proceed.
+  it("hermes provider dropdown falls back to all providers when template declares only unknown providers", async () => {
+    mockGet.mockImplementation(async (url: string) => {
+      if (url === "/templates") {
+        return [
+          { id: "hermes", name: "Hermes", runtime: "hermes", providers: ["totally-new-provider-2030"] },
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        ] as any;
+      }
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      return SAMPLE_WORKSPACES as any;
+    });
+
+    await openDialog();
+    await setTemplate("hermes");
+    await waitFor(() =>
+      expect(document.querySelector("[data-testid='hermes-provider-section']")).toBeTruthy()
+    );
+    const providerSelect = document.getElementById("hermes-provider-select") as HTMLSelectElement;
+    // Stays at full catalog length — no flapping to 0 then back.
+    expect(providerSelect.options.length).toBe(HERMES_PROVIDERS.length);
+  });
+
  it("hermes API key field is a password input (masked)", async () => {
    await openDialog();
    await setTemplate("hermes");
@@ -100,6 +100,42 @@ interface RuntimeOption {
  value: string;
  label: string;
  models: ModelSpec[];
+  // providers is the declarative provider list each template ships in
+  // its config.yaml under runtime_config.providers. The /templates API
+  // surfaces it (workspace-server templates.go) so canvas stays
+  // adapter-driven: hermes ships ~20 slugs, claude-code ships
+  // ["anthropic"], gemini-cli ships ["gemini"], etc. Empty list →
+  // canvas falls back to deriving unique vendor prefixes from
+  // models[].id (still adapter-driven, just inferred).
+  providers: string[];
+}
+
+// deriveProvidersFromModels — when a template doesn't ship an explicit
+// providers list, infer suggestions from the vendor prefixes of its
+// model slugs. e.g. ["anthropic:claude-opus-4-7", "openai:gpt-4o",
+// "anthropic:claude-sonnet-4-5"] → ["anthropic", "openai"].
+//
+// This keeps the dropdown adapter-driven for older templates that
+// haven't migrated to the explicit `providers:` field yet, AND
+// continues to be a useful fallback for any future runtime whose
+// derive-provider semantics happen to match the slug prefix.
+function deriveProvidersFromModels(models: ModelSpec[]): string[] {
+  const seen = new Set<string>();
+  const out: string[] = [];
+  for (const m of models) {
+    if (!m.id) continue;
+    // Both ":" (anthropic:claude-opus-4-7) and "/" (nousresearch/hermes-4-70b)
+    // are valid vendor separators in our slug taxonomy. Take whichever
+    // appears first and split there.
+    const sep = m.id.match(/[:/]/)?.index ?? -1;
+    if (sep <= 0) continue;
+    const vendor = m.id.slice(0, sep);
+    if (!seen.has(vendor)) {
+      seen.add(vendor);
+      out.push(vendor);
+    }
+  }
+  return out;
 }

 // Fallback used when /templates can't be fetched (offline, older backend).
@@ -118,14 +154,14 @@ interface RuntimeOption {
 const RUNTIMES_WITH_OWN_CONFIG = new Set<string>(["external"]);

 const FALLBACK_RUNTIME_OPTIONS: RuntimeOption[] = [
-  { value: "", label: "LangGraph (default)", models: [] },
-  { value: "claude-code", label: "Claude Code", models: [] },
-  { value: "crewai", label: "CrewAI", models: [] },
-  { value: "autogen", label: "AutoGen", models: [] },
-  { value: "deepagents", label: "DeepAgents", models: [] },
-  { value: "openclaw", label: "OpenClaw", models: [] },
-  { value: "hermes", label: "Hermes", models: [] },
-  { value: "gemini-cli", label: "Gemini CLI", models: [] },
+  { value: "", label: "LangGraph (default)", models: [], providers: [] },
+  { value: "claude-code", label: "Claude Code", models: [], providers: [] },
+  { value: "crewai", label: "CrewAI", models: [], providers: [] },
+  { value: "autogen", label: "AutoGen", models: [], providers: [] },
+  { value: "deepagents", label: "DeepAgents", models: [], providers: [] },
+  { value: "openclaw", label: "OpenClaw", models: [], providers: [] },
+  { value: "hermes", label: "Hermes", models: [], providers: [] },
+  { value: "gemini-cli", label: "Gemini CLI", models: [], providers: [] },
 ];

 export function ConfigTab({ workspaceId }: Props) {
@@ -138,6 +174,17 @@ export function ConfigTab({ workspaceId }: Props) {
  const [rawMode, setRawMode] = useState(false);
  const [rawDraft, setRawDraft] = useState("");
  const [runtimeOptions, setRuntimeOptions] = useState<RuntimeOption[]>(FALLBACK_RUNTIME_OPTIONS);
+  // Provider override (Option B PR-5): stored separately from config.yaml
+  // because the value lives in workspace_secrets (encrypted), not in the
+  // platform-managed config.yaml. The two endpoints are GET/PUT
+  // /workspaces/:id/provider on workspace-server (handlers/secrets.go).
+  // Empty = "auto-derive from model slug prefix" — pre-Option-B behavior
+  // and what most users want. Setting to a non-empty value writes
+  // LLM_PROVIDER into workspace_secrets and triggers an auto-restart so
+  // the workspace boots with the new provider in env (and via CP user-
+  // data, written into /configs/config.yaml on next provision too).
+  const [provider, setProvider] = useState("");
+  const [originalProvider, setOriginalProvider] = useState("");
  const successTimerRef = useRef<ReturnType<typeof setTimeout>>(undefined);

  useEffect(() => {
@@ -168,6 +215,22 @@ export function ConfigTab({ workspaceId }: Props) {
      wsMetadataModel = (m.model || "").trim();
    } catch { /* non-fatal */ }

+    // Load explicit provider override (Option B PR-5). Endpoint returns
+    // {provider: "", source: "default"} when no override is set, so the
+    // empty string is the legitimate "auto-derive" signal — don't treat
+    // it as a load error. Non-fatal: an older workspace-server that
+    // predates PR-2 returns 404 here; the form falls back to "" and
+    // Save just won't PUT the provider field.
+    try {
+      const p = await api.get<{ provider?: string }>(`/workspaces/${workspaceId}/provider`);
+      const loadedProvider = (p.provider || "").trim();
+      setProvider(loadedProvider);
+      setOriginalProvider(loadedProvider);
+    } catch {
+      setProvider("");
+      setOriginalProvider("");
+    }
+
    try {
      const res = await api.get<{ content: string }>(`/workspaces/${workspaceId}/files/config.yaml`);
      const parsed = parseYaml(res.content);
@@ -209,11 +272,11 @@ export function ConfigTab({ workspaceId }: Props) {

  useEffect(() => {
    let cancelled = false;
-    api.get<Array<{ id: string; name?: string; runtime?: string; models?: ModelSpec[] }>>("/templates")
+    api.get<Array<{ id: string; name?: string; runtime?: string; models?: ModelSpec[]; providers?: string[] }>>("/templates")
      .then((rows) => {
        if (cancelled || !Array.isArray(rows)) return;
        const byRuntime = new Map<string, RuntimeOption>();
-        byRuntime.set("", { value: "", label: "LangGraph (default)", models: [] });
+        byRuntime.set("", { value: "", label: "LangGraph (default)", models: [], providers: [] });
        for (const r of rows) {
          const v = (r.runtime || "").trim();
          if (!v || v === "langgraph") continue;
@@ -221,8 +284,9 @@ export function ConfigTab({ workspaceId }: Props) {
          // one with the richer models list is probably newer.
          const existing = byRuntime.get(v);
          const models = Array.isArray(r.models) ? r.models : [];
+          const providers = Array.isArray(r.providers) ? r.providers : [];
          if (!existing || models.length > existing.models.length) {
-            byRuntime.set(v, { value: v, label: r.name || v, models });
+            byRuntime.set(v, { value: v, label: r.name || v, models, providers });
          }
        }
        if (byRuntime.size > 1) setRuntimeOptions(Array.from(byRuntime.values()));
@@ -234,6 +298,16 @@ export function ConfigTab({ workspaceId }: Props) {
  // Models + env hints for the currently-selected runtime.
  const selectedRuntime = runtimeOptions.find((o) => o.value === (config.runtime || "")) ?? null;
  const availableModels: ModelSpec[] = selectedRuntime?.models ?? [];
+  // Provider suggestions: prefer the runtime's declarative providers
+  // list (sourced from its template config.yaml runtime_config.providers
+  // and surfaced via /templates), fall back to deriving from model slug
+  // prefixes when the template hasn't migrated to the explicit field
+  // yet. Either way the data flows from the adapter — no hardcoded
+  // canvas-side enum.
+  const providerSuggestions: string[] =
+    (selectedRuntime?.providers && selectedRuntime.providers.length > 0)
+      ? selectedRuntime.providers
+      : deriveProvidersFromModels(availableModels);
  const currentModelId = config.runtime_config?.model || config.model || "";
  const currentModelSpec = availableModels.find((m) => m.id === currentModelId) ?? null;

@@ -301,20 +375,57 @@ export function ConfigTab({ workspaceId }: Props) {
      // partial-save state — we report it as a user-visible warning
      // rather than lying "Saved" and letting the user discover the
      // revert on next reload.
-      const oldModel = (oldParsed.model as string) || "";
+      //
+      // Read from runtime_config.model first, then fall back to top-level
+      // model. The dropdown's onChange (above, ~line 475) writes to
+      // runtime_config.model whenever a runtime is selected (hermes,
+      // claude-code, etc.) and only falls back to top-level model when
+      // there's no runtime. handleSave used to diff against top-level
+      // model only, so for any runtime-bearing workspace the user's
+      // model selection never persisted — they'd Save & Restart, the
+      // EC2 would boot with HERMES_DEFAULT_MODEL empty, and hermes
+      // would fall back to nousresearch/hermes-4-70b → "No LLM provider
+      // configured" error in the chat. Caught 2026-04-30 on hongmingwang
+      // hermes workspace 32993ee7-…cb9d75d112a5.
+      const nextModelRaw = (nextSource.runtime_config as Record<string, unknown> | undefined)?.model;
+      const oldModelRaw = (oldParsed.runtime_config as Record<string, unknown> | undefined)?.model;
+      const nextModel =
+        typeof nextModelRaw === "string" && nextModelRaw
+          ? nextModelRaw
+          : typeof nextSource.model === "string"
+            ? nextSource.model
+            : "";
+      const oldModel =
+        typeof oldModelRaw === "string" && oldModelRaw
+          ? oldModelRaw
+          : (oldParsed.model as string) || "";
      let modelSaveError: string | null = null;
-      if (
-        typeof nextSource.model === "string" &&
-        nextSource.model &&
-        nextSource.model !== oldModel
-      ) {
+      if (nextModel && nextModel !== oldModel) {
        try {
-          await api.put(`/workspaces/${workspaceId}/model`, { model: nextSource.model });
+          await api.put(`/workspaces/${workspaceId}/model`, { model: nextModel });
        } catch (e) {
          modelSaveError = e instanceof Error ? e.message : "Model update was rejected";
        }
      }

+      // Provider override save (Option B PR-5). PUT only when the user
+      // changed the dropdown — otherwise an unrelated Save (e.g. tier
+      // edit) would re-write the provider unchanged and the server-
+      // side auto-restart would fire on every Save, costing the user a
+      // ~30s reboot for a no-op change. Server endpoint accepts an
+      // empty string to clear the override (deletes the
+      // workspace_secrets row); we forward whatever the form holds.
+      let providerSaveError: string | null = null;
+      const providerChanged = provider !== originalProvider;
+      if (providerChanged) {
+        try {
+          await api.put(`/workspaces/${workspaceId}/provider`, { provider });
+          setOriginalProvider(provider);
+        } catch (e) {
+          providerSaveError = e instanceof Error ? e.message : "Provider update was rejected";
+        }
+      }
+
      setOriginalYaml(content);
      if (rawMode) {
        const parsed = parseYaml(content);
@@ -322,16 +433,30 @@ export function ConfigTab({ workspaceId }: Props) {
      } else {
        setRawDraft(content);
      }
-      if (restart) {
+      // SetProvider on the server already triggers an auto-restart for
+      // the workspace whenever the value actually changed (see
+      // workspace-server/internal/handlers/secrets.go:SetProvider). If
+      // the user also clicked Save+Restart we'd kick off a SECOND
+      // restart here and the two would race in the canvas store —
+      // suppress the redundant call and rely on the server-side one.
+      const providerWillAutoRestart = providerChanged && !providerSaveError;
+      if (restart && !providerWillAutoRestart) {
        await useCanvasStore.getState().restartWorkspace(workspaceId);
-      } else {
-        useCanvasStore.getState().updateNodeData(workspaceId, { needsRestart: true });
+      } else if (!restart) {
+        useCanvasStore.getState().updateNodeData(workspaceId, { needsRestart: !providerWillAutoRestart });
      }
-      if (modelSaveError) {
-        // Partial-save UX: surface the model rejection instead of
-        // showing "Saved" — the user would otherwise watch the model
-        // field revert on next reload with no explanation.
-        setError(`Other fields saved, but model update failed: ${modelSaveError}`);
+      // Aggregate partial-save errors. Both modelSaveError and
+      // providerSaveError describe rejected updates from independent
+      // endpoints — show whichever fired so the user knows which
+      // field reverts on next reload (otherwise they'd see "Saved" and
+      // be confused why Provider snapped back).
+      const partialError = providerSaveError
+        ? `Other fields saved, but provider update failed: ${providerSaveError}`
+        : modelSaveError
+          ? `Other fields saved, but model update failed: ${modelSaveError}`
+          : null;
+      if (partialError) {
+        setError(partialError);
      } else {
        setSuccess(true);
        clearTimeout(successTimerRef.current);
@@ -352,7 +477,8 @@ export function ConfigTab({ workspaceId }: Props) {
  const taskBudgetId = useId();
  const sandboxBackendId = useId();

-  const isDirty = rawMode ? rawDraft !== originalYaml : toYaml(config) !== originalYaml;
+  const providerDirty = provider !== originalProvider;
+  const isDirty = (rawMode ? rawDraft !== originalYaml : toYaml(config) !== originalYaml) || providerDirty;

  if (loading) {
    return <div className="p-4 text-xs text-zinc-500">Loading config...</div>;
@@ -499,6 +625,51 @@ export function ConfigTab({ workspaceId }: Props) {
                )}
              </div>
            </div>
+            {/* Provider override (Option B PR-5). Free-text combobox so
+                operators can use any of the 30+ slugs hermes-agent's
+                derive-provider.sh recognizes — the suggestion list is
+                a hint, not a constraint. Empty = "auto-derive from
+                model slug prefix" which is correct for the common case
+                (model "anthropic:claude-opus-4-7" → provider derived
+                as "anthropic"). The override is needed when the model
+                alias has no clean vendor prefix (e.g. hermes default
+                "nousresearch/hermes-4-70b" → derive returns empty →
+                hermes errors "No LLM provider configured"). */}
+            <div>
+              <label htmlFor={`${runtimeId}-provider`} className="text-[10px] text-zinc-500 block mb-1">
+                Provider
+                <span className="ml-1 text-zinc-600">
+                  (override — leave empty to auto-derive from model slug)
+                </span>
+              </label>
+              <input
+                id={`${runtimeId}-provider`}
+                type="text"
+                list={providerSuggestions.length > 0 ? `${runtimeId}-providers` : undefined}
+                value={provider}
+                onChange={(e) => setProvider(e.target.value.trim())}
+                placeholder={
+                  providerSuggestions.length > 0
+                    ? `e.g. ${providerSuggestions.slice(0, 3).join(", ")} (empty = auto-derive)`
+                    : "empty = auto-derive from model slug"
+                }
+                aria-label="LLM provider override"
+                data-testid="provider-input"
+                className="w-full bg-zinc-800 border border-zinc-700 rounded px-2 py-1 text-xs text-zinc-200 font-mono focus:outline-none focus:border-blue-500"
+              />
+              {providerSuggestions.length > 0 && (
+                <datalist id={`${runtimeId}-providers`}>
+                  {providerSuggestions.map((p) => (
+                    <option key={p} value={p} />
+                  ))}
+                </datalist>
+              )}
+              {provider && provider !== originalProvider && (
+                <p className="text-[10px] text-amber-500 mt-1">
+                  Provider change → workspace will auto-restart on Save.
+                </p>
+              )}
+            </div>
            <TagList
              label={
                currentModelSpec?.required_env?.length &&
@@ -11,7 +11,7 @@
 // Each test pins one invariant. If any fails, the bug is back.

 import { describe, it, expect, vi, afterEach, beforeEach } from "vitest";
-import { render, screen, cleanup, waitFor } from "@testing-library/react";
+import { render, screen, cleanup, waitFor, fireEvent } from "@testing-library/react";
 import React from "react";

 afterEach(cleanup);
@@ -168,6 +168,116 @@ describe("ConfigTab — hermes workspace", () => {
  });
 });

+describe("ConfigTab — Save persists model under runtime_config.model (2026-04-30)", () => {
+  // The dropdown's onChange writes to config.runtime_config.model whenever
+  // a runtime is selected (hermes, claude-code, etc.) and only falls back
+  // to top-level config.model when no runtime is set. The Save handler used
+  // to diff against top-level model only, so for any runtime-bearing
+  // workspace the user's model selection never persisted — Save & Restart
+  // would reboot with HERMES_DEFAULT_MODEL empty, hermes would fall back
+  // to nousresearch/hermes-4-70b → "No LLM provider configured" in chat.
+  // Caught 2026-04-30 on hongmingwang hermes workspace.
+
+  it("PUTs /model when user picks a model on a hermes workspace", async () => {
+    apiGet.mockImplementation((path: string) => {
+      if (path === "/workspaces/ws-test") {
+        return Promise.resolve({ runtime: "hermes" });
+      }
+      if (path === "/workspaces/ws-test/model") {
+        return Promise.resolve({ model: "" });
+      }
+      if (path === "/workspaces/ws-test/files/config.yaml") {
+        return Promise.reject(new Error("not found"));
+      }
+      if (path === "/templates") {
+        return Promise.resolve([
+          {
+            id: "t-hermes",
+            name: "Hermes",
+            runtime: "hermes",
+            models: [
+              { id: "minimax/MiniMax-M2.7-highspeed", name: "MiniMax M2.7" },
+            ],
+          },
+        ]);
+      }
+      return Promise.reject(new Error(`unmocked api.get: ${path}`));
+    });
+    apiPut.mockResolvedValue({});
+    apiPatch.mockResolvedValue({});
+
+    render(<ConfigTab workspaceId="ws-test" />);
+
+    // Wait for the runtime dropdown to populate so the model textbox renders.
+    await waitFor(() =>
+      expect(
+        (screen.getByRole("combobox", { name: /runtime/i }) as HTMLSelectElement).value,
+      ).toBe("hermes"),
+    );
+
+    // The model input is a free-text input wired to a datalist of suggestions.
+    const modelInput = (await waitFor(() =>
+      screen.getByPlaceholderText(/anthropic:claude-sonnet/i),
+    )) as HTMLInputElement;
+
+    fireEvent.change(modelInput, {
+      target: { value: "minimax/MiniMax-M2.7-highspeed" },
+    });
+
+    // Click Save & Restart.
+    fireEvent.click(screen.getByRole("button", { name: /save & restart/i }));
+
+    await waitFor(() => {
+      expect(apiPut).toHaveBeenCalledWith("/workspaces/ws-test/model", {
+        model: "minimax/MiniMax-M2.7-highspeed",
+      });
+    });
+  });
+
+  it("does NOT PUT /model when the value is unchanged (no-op restart)", async () => {
+    apiGet.mockImplementation((path: string) => {
+      if (path === "/workspaces/ws-test") {
+        return Promise.resolve({ runtime: "hermes" });
+      }
+      if (path === "/workspaces/ws-test/model") {
+        return Promise.resolve({ model: "minimax/MiniMax-M2.7" });
+      }
+      if (path === "/workspaces/ws-test/files/config.yaml") {
+        return Promise.reject(new Error("not found"));
+      }
+      if (path === "/templates") {
+        return Promise.resolve([
+          { id: "t-hermes", runtime: "hermes", models: [] },
+        ]);
+      }
+      return Promise.reject(new Error(`unmocked api.get: ${path}`));
+    });
+    apiPut.mockResolvedValue({});
+
+    render(<ConfigTab workspaceId="ws-test" />);
+
+    // Wait for load.
+    await waitFor(() =>
+      expect(
+        (screen.getByRole("combobox", { name: /runtime/i }) as HTMLSelectElement).value,
+      ).toBe("hermes"),
+    );
+
+    // Force isDirty by toggling a field that doesn't affect model. (Save is
+    // disabled until isDirty=true; we want to prove that even when Save
+    // fires, /model isn't called for an unchanged model.) Skipped — easier
+    // to just verify apiPut wasn't called with the model URL.
+
+    // Without any user edit, Save & Restart is disabled, so /model is
+    // trivially not PUT. The asserts below verify no /model PUT happens
+    // at any point during load.
+    const modelPuts = apiPut.mock.calls.filter(
+      ([path]) => path === "/workspaces/ws-test/model",
+    );
+    expect(modelPuts).toHaveLength(0);
+  });
+});
+
 describe("ConfigTab — config.yaml on disk", () => {
  it("workspace metadata (DB) wins over config.yaml when both are present (#2061)", async () => {
    // Priority inversion in #2061: previously config.yaml overrode DB, so
@@ -0,0 +1,332 @@
+// @vitest-environment jsdom
+//
+// Regression tests for ConfigTab Provider override (Option B PR-5).
+//
+// What this pins: a free-text Provider combobox in the Runtime section
+// that lets the operator override the model→provider derivation hermes-
+// agent does internally. Without this UI, a fresh signup whose Hermes
+// workspace defaults to a model with no clean vendor prefix (e.g.
+// `nousresearch/hermes-4-70b`) hits the runtime's own preflight error:
+//   "No LLM provider configured. Run `hermes model` to select a
+//    provider, or run `hermes setup` for first-time configuration."
+// — even though tasks #195-198 wired the entire downstream pipe so a
+// non-empty provider WOULD flow through canvas → workspace-server →
+// CP user-data → workspace config.yaml → hermes adapter.
+//
+// Hongming Wang hit this on hongming.moleculesai.app at signup
+// 2026-05-01T17:35Z. Backend PRs were green, the gap was the missing
+// UI to set the value.
+//
+// Each test pins one invariant. If any fails, the bug is back.
+
+import { describe, it, expect, vi, afterEach, beforeEach } from "vitest";
+import { render, screen, cleanup, waitFor, fireEvent } from "@testing-library/react";
+import React from "react";
+
+afterEach(cleanup);
+
+const apiGet = vi.fn();
+const apiPatch = vi.fn();
+const apiPut = vi.fn();
+vi.mock("@/lib/api", () => ({
+  api: {
+    get: (path: string) => apiGet(path),
+    patch: (path: string, body: unknown) => apiPatch(path, body),
+    put: (path: string, body: unknown) => apiPut(path, body),
+    post: vi.fn(),
+    del: vi.fn(),
+  },
+}));
+
+vi.mock("@/store/canvas", () => ({
+  useCanvasStore: Object.assign(
+    (selector: (s: unknown) => unknown) => selector({ restartWorkspace: vi.fn(), updateNodeData: vi.fn() }),
+    { getState: () => ({ restartWorkspace: vi.fn(), updateNodeData: vi.fn() }) },
+  ),
+}));
+
+vi.mock("../AgentCardSection", () => ({
+  AgentCardSection: () => <div data-testid="agent-card-stub" />,
+}));
+
+import { ConfigTab } from "../ConfigTab";
+
+// wireApi — same shape as ConfigTab.hermes.test.tsx, extended with the
+// /provider endpoint. Each test sets `providerValue` to the value the
+// GET endpoint returns; "missing" means the endpoint rejects (older
+// workspace-server pre-PR-2 — must not crash the tab).
+function wireApi(opts: {
+  workspaceRuntime?: string;
+  workspaceModel?: string;
+  configYamlContent?: string | null;
+  templates?: Array<{ id: string; name?: string; runtime?: string; models?: unknown[]; providers?: string[] }>;
+  providerValue?: string | "missing";
+}) {
+  apiGet.mockImplementation((path: string) => {
+    if (path === `/workspaces/ws-test`) {
+      return Promise.resolve({ runtime: opts.workspaceRuntime ?? "" });
+    }
+    if (path === `/workspaces/ws-test/model`) {
+      return Promise.resolve({ model: opts.workspaceModel ?? "" });
+    }
+    if (path === `/workspaces/ws-test/provider`) {
+      if (opts.providerValue === "missing") {
+        return Promise.reject(new Error("404"));
+      }
+      return Promise.resolve({ provider: opts.providerValue ?? "", source: opts.providerValue ? "workspace_secrets" : "default" });
+    }
+    if (path === `/workspaces/ws-test/files/config.yaml`) {
+      if (opts.configYamlContent === null) return Promise.reject(new Error("not found"));
+      return Promise.resolve({ content: opts.configYamlContent ?? "" });
+    }
+    if (path === "/templates") {
+      return Promise.resolve(opts.templates ?? []);
+    }
+    return Promise.reject(new Error(`unmocked api.get: ${path}`));
+  });
+}
+
+beforeEach(() => {
+  apiGet.mockReset();
+  apiPatch.mockReset();
+  apiPut.mockReset();
+});
+
+describe("ConfigTab — Provider override (Option B PR-5)", () => {
+  // Empty provider on load is the legitimate default ("auto-derive
+  // from model slug prefix"), NOT an error. The endpoint returning
+  // {provider: "", source: "default"} is the documented happy-path
+  // shape — if the form treated that as "load failed" we'd lose the
+  // ability to render the input at all on fresh workspaces.
+  it("renders an empty Provider input when no override is set", async () => {
+    wireApi({
+      workspaceRuntime: "hermes",
+      workspaceModel: "nousresearch/hermes-4-70b",
+      configYamlContent: "name: ws\nruntime: hermes\n",
+      providerValue: "",
+    });
+
+    render(<ConfigTab workspaceId="ws-test" />);
+    const input = await screen.findByTestId("provider-input");
+    expect((input as HTMLInputElement).value).toBe("");
+  });
+
+  // Pre-existing override loads back into the field on mount. Without
+  // this, an operator who set provider=openrouter yesterday would see
+  // the field blank today, conclude the value didn't stick, and
+  // re-save — the resulting PUT-with-same-value would auto-restart
+  // the workspace for nothing.
+  it("loads an existing provider override from the server", async () => {
+    wireApi({
+      workspaceRuntime: "hermes",
+      workspaceModel: "nousresearch/hermes-4-70b",
+      configYamlContent: "name: ws\nruntime: hermes\n",
+      providerValue: "openrouter",
+    });
+
+    render(<ConfigTab workspaceId="ws-test" />);
+    const input = await screen.findByTestId("provider-input");
+    await waitFor(() => expect((input as HTMLInputElement).value).toBe("openrouter"));
+  });
+
+  // Old workspace-server (pre-PR-2) returns a 404 on /provider. The
+  // tab must keep loading — the fallback is "" (auto-derive), same as
+  // a fresh workspace.
+  it("falls back to empty provider when the endpoint is missing", async () => {
+    wireApi({
+      workspaceRuntime: "hermes",
+      workspaceModel: "nousresearch/hermes-4-70b",
+      configYamlContent: "name: ws\nruntime: hermes\n",
+      providerValue: "missing",
+    });
+
+    render(<ConfigTab workspaceId="ws-test" />);
+    const input = await screen.findByTestId("provider-input");
+    expect((input as HTMLInputElement).value).toBe("");
+    // Tab should be fully rendered, not stuck in loading or error state.
+    expect(screen.queryByText(/Loading config/i)).toBeNull();
+  });
+
+  // Setting a value + Save must PUT to the right endpoint with the
+  // right body shape. Server-side handler (workspace-server
+  // handlers/secrets.go:SetProvider) reads body.provider — any other
+  // key gets silently ignored and the workspace_secrets row stays
+  // unset. This regression would manifest as "Save → Restart →
+  // workspace still says No LLM provider configured."
+  it("PUTs the new provider to /workspaces/:id/provider on Save", async () => {
+    wireApi({
+      workspaceRuntime: "hermes",
+      workspaceModel: "nousresearch/hermes-4-70b",
+      configYamlContent: "name: ws\nruntime: hermes\n",
+      providerValue: "",
+    });
+    apiPut.mockResolvedValue({ status: "saved", provider: "anthropic" });
+
+    render(<ConfigTab workspaceId="ws-test" />);
+    const input = await screen.findByTestId("provider-input");
+
+    fireEvent.change(input, { target: { value: "anthropic" } });
+    expect((input as HTMLInputElement).value).toBe("anthropic");
+
+    const saveBtn = screen.getByRole("button", { name: /^save$/i });
+    fireEvent.click(saveBtn);
+
+    await waitFor(() => {
+      const providerCalls = apiPut.mock.calls.filter(([path]) => path === "/workspaces/ws-test/provider");
+      expect(providerCalls.length).toBe(1);
+      expect(providerCalls[0][1]).toEqual({ provider: "anthropic" });
+    });
+  });
+
+  // No-change Save must NOT PUT /provider. The server-side SetProvider
+  // auto-restarts the workspace on every successful PUT — re-writing
+  // an unchanged value would cost the user a ~30s reboot every time
+  // they tweak some other field.
+  it("does not PUT /provider when the value is unchanged", async () => {
+    wireApi({
+      workspaceRuntime: "hermes",
+      workspaceModel: "nousresearch/hermes-4-70b",
+      configYamlContent: "name: ws\nruntime: hermes\ntier: 2\n",
+      providerValue: "openrouter",
+    });
+    apiPut.mockResolvedValue({});
+
+    render(<ConfigTab workspaceId="ws-test" />);
+    await screen.findByTestId("provider-input");
+
+    // Click Save without touching the provider field. Trigger another
+    // dirty-marker (tier change) so Save is enabled — the test is
+    // about NOT touching /provider, not about Save being disabled.
+    const tierSelect = screen.getByLabelText(/tier/i) as HTMLSelectElement;
+    fireEvent.change(tierSelect, { target: { value: "3" } });
+
+    const saveBtn = screen.getByRole("button", { name: /^save$/i });
+    fireEvent.click(saveBtn);
+
+    await waitFor(() => {
+      // Some PUT(s) may fire (e.g. /model). Just assert /provider is NOT among them.
+      const providerCalls = apiPut.mock.calls.filter(([path]) => path === "/workspaces/ws-test/provider");
+      expect(providerCalls.length).toBe(0);
+    });
+  });
+
+  // The dropdown's suggestion list MUST come from the runtime's own
+  // template (via /templates → runtime_config.providers), not a
+  // hardcoded canvas-side enum. This is the "Native + pluggable
+  // runtime" invariant: a new runtime declaring its own provider
+  // taxonomy in its config.yaml gets a working dropdown without ANY
+  // canvas-side change.
+  //
+  // Pinned by checking that suggestions surfaced in the datalist
+  // exactly mirror what the templates endpoint returned for the
+  // matching runtime. If a future contributor reintroduces a
+  // PROVIDER_SUGGESTIONS-style hardcoded list and the datalist
+  // contents don't follow the template, this test fails.
+  it("populates the provider datalist from the matched runtime's templates entry", async () => {
+    wireApi({
+      workspaceRuntime: "hermes",
+      workspaceModel: "nousresearch/hermes-4-70b",
+      configYamlContent: "name: ws\nruntime: hermes\n",
+      providerValue: "",
+      templates: [
+        {
+          id: "hermes",
+          name: "Hermes",
+          runtime: "hermes",
+          models: [],
+          // The provider list every runtime adapter ships in its own
+          // config.yaml. Canvas must surface THIS, not its own list.
+          providers: ["nous", "openrouter", "anthropic", "minimax-cn"],
+        },
+      ],
+    });
+
+    render(<ConfigTab workspaceId="ws-test" />);
+    const input = await screen.findByTestId("provider-input");
+    const listId = (input as HTMLInputElement).getAttribute("list");
+    expect(listId).toBeTruthy();
+    await waitFor(() => {
+      const datalist = document.getElementById(listId!);
+      expect(datalist).not.toBeNull();
+      const optionValues = Array.from(datalist!.querySelectorAll("option")).map(
+        (o) => (o as HTMLOptionElement).value,
+      );
+      // Order matters — most-common-first is part of the contract so
+      // the demo flow lands on a working choice without scrolling.
+      expect(optionValues).toEqual(["nous", "openrouter", "anthropic", "minimax-cn"]);
+    });
+  });
+
+  // Fallback path: when a template hasn't migrated to the explicit
+  // `providers:` field yet, suggestions are derived from model slug
+  // prefixes. Still adapter-driven (the slugs come from the template's
+  // `models:` list), just inferred. This keeps existing templates
+  // working while the platform team migrates them one at a time.
+  it("falls back to model-slug prefixes when the runtime ships no providers list", async () => {
+    wireApi({
+      workspaceRuntime: "hermes",
+      workspaceModel: "anthropic:claude-opus-4-7",
+      configYamlContent: "name: ws\nruntime: hermes\n",
+      providerValue: "",
+      templates: [
+        {
+          id: "hermes",
+          name: "Hermes",
+          runtime: "hermes",
+          models: [
+            { id: "anthropic:claude-opus-4-7" },
+            { id: "openai:gpt-4o" },
+            { id: "anthropic:claude-sonnet-4-5" }, // dup vendor — must dedupe
+            { id: "nousresearch/hermes-4-70b" },   // "/" separator
+          ],
+          // No `providers:` field → fallback derivation kicks in.
+        },
+      ],
+    });
+
+    render(<ConfigTab workspaceId="ws-test" />);
+    const input = await screen.findByTestId("provider-input");
+    const listId = (input as HTMLInputElement).getAttribute("list");
+    expect(listId).toBeTruthy();
+    await waitFor(() => {
+      const datalist = document.getElementById(listId!);
+      const optionValues = Array.from(datalist!.querySelectorAll("option")).map(
+        (o) => (o as HTMLOptionElement).value,
+      );
+      // Order = first-appearance from models[]; dedup keeps anthropic
+      // once even though two model slugs use it.
+      expect(optionValues).toEqual(["anthropic", "openai", "nousresearch"]);
+    });
+  });
+
+  // Empty string is a legitimate save target — it clears the override
+  // (the server-side endpoint deletes the workspace_secrets row).
+  // Operators who picked "anthropic" yesterday and want to revert to
+  // auto-derive today should be able to do so by clearing the field
+  // and clicking Save. Without this PUT path, the only way to clear
+  // would be a direct DB edit.
+  it("PUTs an empty string when the operator clears a previously-set provider", async () => {
+    wireApi({
+      workspaceRuntime: "hermes",
+      workspaceModel: "anthropic:claude-opus-4-7",
+      configYamlContent: "name: ws\nruntime: hermes\n",
+      providerValue: "openrouter",
+    });
+    apiPut.mockResolvedValue({ status: "cleared" });
+
+    render(<ConfigTab workspaceId="ws-test" />);
+    const input = await screen.findByTestId("provider-input");
+    await waitFor(() => expect((input as HTMLInputElement).value).toBe("openrouter"));
+
+    fireEvent.change(input, { target: { value: "" } });
+
+    const saveBtn = screen.getByRole("button", { name: /^save$/i });
+    fireEvent.click(saveBtn);
+
+    await waitFor(() => {
+      const providerCalls = apiPut.mock.calls.filter(([path]) => path === "/workspaces/ws-test/provider");
+      expect(providerCalls.length).toBe(1);
+      expect(providerCalls[0][1]).toEqual({ provider: "" });
+    });
+  });
+});
@@ -27,16 +27,16 @@ import { renderHook } from "@testing-library/react";
 import type { Template } from "@/lib/deploy-preflight";

 // ── Hoisted mocks ────────────────────────────────────────────────────────────
-const { mockApiPost, mockCheckDeploySecrets, mockResolveRuntime } = vi.hoisted(
-  () => ({
+const { mockApiPost, mockApiGet, mockCheckDeploySecrets, mockResolveRuntime } =
+  vi.hoisted(() => ({
    mockApiPost: vi.fn(),
+    mockApiGet: vi.fn(),
    mockCheckDeploySecrets: vi.fn(),
    mockResolveRuntime: vi.fn(),
-  }),
-);
+  }));

 vi.mock("@/lib/api", () => ({
-  api: { post: mockApiPost },
+  api: { post: mockApiPost, get: mockApiGet },
 }));

 vi.mock("@/lib/deploy-preflight", async () => {
@@ -51,20 +51,44 @@ vi.mock("@/lib/deploy-preflight", async () => {
  };
 });

-// MissingKeysModal: render a minimal stand-in that exposes the two
-// callbacks the hook wires up. The real modal pulls in radix + the
-// secrets store, neither of which is relevant to this hook's behavior.
+// MissingKeysModal: render a minimal stand-in that exposes the
+// callbacks the hook wires up + dumps the new template-deploy props
+// (configuredKeys size, modelSuggestions, initialModel) into the
+// DOM so tests can assert on them. The real modal pulls in radix +
+// the secrets store, neither of which is relevant to this hook's
+// behavior.
 vi.mock("@/components/MissingKeysModal", () => ({
  MissingKeysModal: (props: {
    open: boolean;
-    onKeysAdded: () => void;
+    onKeysAdded: (model?: string) => void;
    onCancel: () => void;
+    configuredKeys?: Set<string>;
+    modelSuggestions?: string[];
+    initialModel?: string;
+    title?: string;
  }) =>
    props.open ? (
      <div data-testid="missing-keys-modal">
-        <button data-testid="modal-keys-added" onClick={props.onKeysAdded}>
+        <span data-testid="modal-configured-size">
+          {props.configuredKeys?.size ?? 0}
+        </span>
+        <span data-testid="modal-model-suggestions">
+          {(props.modelSuggestions ?? []).join(",")}
+        </span>
+        <span data-testid="modal-initial-model">{props.initialModel ?? ""}</span>
+        <span data-testid="modal-title">{props.title ?? ""}</span>
+        <button
+          data-testid="modal-keys-added"
+          onClick={() => props.onKeysAdded()}
+        >
          keys added
        </button>
+        <button
+          data-testid="modal-keys-added-with-model"
+          onClick={() => props.onKeysAdded("minimax/MiniMax-M2.7")}
+        >
+          keys added with model
+        </button>
        <button data-testid="modal-cancel" onClick={props.onCancel}>
          cancel
        </button>
@@ -95,6 +119,7 @@ function makeTemplate(over: Partial<Template> = {}): Template {

 beforeEach(() => {
  mockApiPost.mockReset();
+  mockApiGet.mockReset();
  mockCheckDeploySecrets.mockReset();
  mockResolveRuntime.mockReset();
  // Default: identity-mapped runtime, preflight passes.
@@ -104,8 +129,12 @@ beforeEach(() => {
    missingKeys: [],
    providers: [],
    runtime: "claude-code",
+    configuredKeys: new Set(),
  });
  mockApiPost.mockResolvedValue({ id: "ws-new" });
+  // Default: secrets endpoint returns nothing so the picker
+  // renders every entry as input. Multi-provider tests override.
+  mockApiGet.mockResolvedValue([]);
 });

 afterEach(() => {
@@ -114,14 +143,38 @@ afterEach(() => {

 // ── Tests ────────────────────────────────────────────────────────────────────

-describe("useTemplateDeploy — happy path", () => {
-  it("preflight ok → POST /workspaces → onDeployed fires with new id", async () => {
-    const onDeployed = vi.fn();
-    const { result } = renderHook(() => useTemplateDeploy({ onDeployed }));
+/**
+ * Drive the always-show-picker flow to completion: deploy() opens the
+ * modal, then we click "keys added" to fire the actual POST. Centralised
+ * here because as of the always-prompt change, every happy-path test
+ * must click through the modal before asserting on POST.
+ */
+async function deployThroughPicker<T>(
+  result: { current: ReturnType<typeof useTemplateDeploy> },
+  rerender: () => void,
+  template: Template,
+): Promise<void> {
+  await act(async () => {
+    await result.current.deploy(template);
+  });
+  rerender();
+  render(<>{result.current.modal}</>);
+  await act(async () => {
+    fireEvent.click(screen.getByTestId("modal-keys-added"));
+    // Let the fire-and-forget executeDeploy resolve.
+    await Promise.resolve();
+    await Promise.resolve();
+  });
+}

-    await act(async () => {
-      await result.current.deploy(makeTemplate());
-    });
+describe("useTemplateDeploy — happy path", () => {
+  it("preflight ok → modal opens → keys-added → POST /workspaces → onDeployed fires", async () => {
+    const onDeployed = vi.fn();
+    const { result, rerender } = renderHook(() =>
+      useTemplateDeploy({ onDeployed }),
+    );
+
+    await deployThroughPicker(result, rerender, makeTemplate());

    expect(mockCheckDeploySecrets).toHaveBeenCalledTimes(1);
    expect(mockApiPost).toHaveBeenCalledWith(
@@ -139,11 +192,11 @@ describe("useTemplateDeploy — happy path", () => {

  it("uses caller-supplied canvasCoords when provided", async () => {
    const canvasCoords = vi.fn(() => ({ x: 42, y: 99 }));
-    const { result } = renderHook(() => useTemplateDeploy({ canvasCoords }));
+    const { result, rerender } = renderHook(() =>
+      useTemplateDeploy({ canvasCoords }),
+    );

-    await act(async () => {
-      await result.current.deploy(makeTemplate());
-    });
+    await deployThroughPicker(result, rerender, makeTemplate());

    expect(canvasCoords).toHaveBeenCalledTimes(1);
    expect(mockApiPost).toHaveBeenCalledWith(
@@ -153,11 +206,9 @@ describe("useTemplateDeploy — happy path", () => {
  });

  it("falls back to random coords inside [100,500] × [100,400] when canvasCoords omitted", async () => {
-    const { result } = renderHook(() => useTemplateDeploy());
+    const { result, rerender } = renderHook(() => useTemplateDeploy());

-    await act(async () => {
-      await result.current.deploy(makeTemplate());
-    });
+    await deployThroughPicker(result, rerender, makeTemplate());

    const body = (mockApiPost as Mock).mock.calls[0]?.[1] as {
      canvas: { x: number; y: number };
@@ -204,6 +255,7 @@ describe("useTemplateDeploy — preflight failure modes", () => {
      missingKeys: ["ANTHROPIC_API_KEY"],
      providers: [],
      runtime: "claude-code",
+      configuredKeys: new Set(),
    });
    const onDeployed = vi.fn();

@@ -231,6 +283,7 @@ describe("useTemplateDeploy — modal lifecycle", () => {
      missingKeys: ["ANTHROPIC_API_KEY"],
      providers: [],
      runtime: "claude-code",
+      configuredKeys: new Set(),
    });
    const onDeployed = vi.fn();
    const { result, rerender } = renderHook(() =>
@@ -265,6 +318,7 @@ describe("useTemplateDeploy — modal lifecycle", () => {
      missingKeys: ["ANTHROPIC_API_KEY"],
      providers: [],
      runtime: "claude-code",
+      configuredKeys: new Set(),
    });
    const { result, rerender } = renderHook(() => useTemplateDeploy());

@@ -287,16 +341,190 @@ describe("useTemplateDeploy — modal lifecycle", () => {
  });
 });

-describe("useTemplateDeploy — POST failure", () => {
-  it("POST rejection sets error and clears deploying", async () => {
-    mockApiPost.mockRejectedValueOnce(new Error("server 500"));
+describe("useTemplateDeploy — multi-provider always-ask flow", () => {
+  // The user-reported bug: clicking a hermes template (which has
+  // multiple provider options) deployed silently when global env
+  // covered the API key, producing "No LLM provider configured" 500
+  // because the workspace booted with no explicit model. Fix:
+  // always open the picker for multi-provider templates so the
+  // user picks provider + model per workspace, even when keys are
+  // already saved.
+  function multiProviderTemplate(): Template {
+    return makeTemplate({
+      id: "hermes-template",
+      name: "Hermes",
+      runtime: "hermes",
+      model: "anthropic/claude-sonnet-4-5",
+      models: [
+        { id: "minimax/MiniMax-M2.7", required_env: ["MINIMAX_API_KEY"] },
+        { id: "anthropic/claude-sonnet-4-5", required_env: ["ANTHROPIC_API_KEY"] },
+      ],
+    });
+  }
+
+  it("opens picker even when preflight.ok=true (≥2 providers)", async () => {
+    mockCheckDeploySecrets.mockResolvedValueOnce({
+      ok: true, // every key is in global env
+      missingKeys: [],
+      providers: [
+        { id: "MINIMAX_API_KEY", label: "MiniMax", envVars: ["MINIMAX_API_KEY"] },
+        { id: "ANTHROPIC_API_KEY", label: "Anthropic", envVars: ["ANTHROPIC_API_KEY"] },
+      ],
+      runtime: "hermes",
+      configuredKeys: new Set(["MINIMAX_API_KEY", "ANTHROPIC_API_KEY"]),
+    });
+    const { result, rerender } = renderHook(() => useTemplateDeploy());
+
+    await act(async () => {
+      await result.current.deploy(multiProviderTemplate());
+    });
+
+    rerender();
+    render(<>{result.current.modal}</>);
+
+    expect(screen.getByTestId("missing-keys-modal")).toBeTruthy();
+    // Both global keys flowed into the modal as `configuredKeys` so
+    // entries can render as Saved without re-prompting.
+    expect(screen.getByTestId("modal-configured-size").textContent).toBe("2");
+    // Confirm POST has NOT fired yet — the user must explicitly
+    // confirm in the picker even though preflight passed.
+    expect(mockApiPost).not.toHaveBeenCalled();
+    // Title shifts to "Configure Workspace" since keys aren't missing.
+    expect(screen.getByTestId("modal-title").textContent).toBe(
+      "Configure Workspace",
+    );
+  });
+
+  it("threads template.models[].id as model suggestions + template.model as initial value", async () => {
+    mockCheckDeploySecrets.mockResolvedValueOnce({
+      ok: true,
+      missingKeys: [],
+      providers: [
+        { id: "MINIMAX_API_KEY", label: "MiniMax", envVars: ["MINIMAX_API_KEY"] },
+        { id: "ANTHROPIC_API_KEY", label: "Anthropic", envVars: ["ANTHROPIC_API_KEY"] },
+      ],
+      runtime: "hermes",
+      configuredKeys: new Set(),
+    });
+    const { result, rerender } = renderHook(() => useTemplateDeploy());
+
+    await act(async () => {
+      await result.current.deploy(multiProviderTemplate());
+    });
+
+    rerender();
+    render(<>{result.current.modal}</>);
+
+    expect(screen.getByTestId("modal-model-suggestions").textContent).toBe(
+      "minimax/MiniMax-M2.7,anthropic/claude-sonnet-4-5",
+    );
+    expect(screen.getByTestId("modal-initial-model").textContent).toBe(
+      "anthropic/claude-sonnet-4-5",
+    );
+  });
+
+  it("POST /workspaces includes model when picker confirms with one", async () => {
+    mockCheckDeploySecrets.mockResolvedValueOnce({
+      ok: true,
+      missingKeys: [],
+      providers: [
+        { id: "MINIMAX_API_KEY", label: "MiniMax", envVars: ["MINIMAX_API_KEY"] },
+        { id: "ANTHROPIC_API_KEY", label: "Anthropic", envVars: ["ANTHROPIC_API_KEY"] },
+      ],
+      runtime: "hermes",
+      configuredKeys: new Set(),
+    });
+    const { result, rerender } = renderHook(() => useTemplateDeploy());
+
+    await act(async () => {
+      await result.current.deploy(multiProviderTemplate());
+    });
+
+    rerender();
+    render(<>{result.current.modal}</>);
+
+    await act(async () => {
+      fireEvent.click(screen.getByTestId("modal-keys-added-with-model"));
+      await Promise.resolve();
+      await Promise.resolve();
+    });
+
+    expect(mockApiPost).toHaveBeenCalledWith(
+      "/workspaces",
+      expect.objectContaining({
+        template: "hermes-template",
+        model: "minimax/MiniMax-M2.7",
+      }),
+    );
+  });
+
+  it("single-provider template ALSO opens picker when preflight.ok (always-prompt rule)", async () => {
+    // Default preflight mock: ok=true, providers=[]. claude-code is
+    // single-provider, but the always-prompt rule means the user must
+    // still click through the picker to confirm provider+model — even
+    // when keys are saved and the runtime has only one provider option.
+    // Reason: the user needs an explicit chance to override the
+    // template's default model (e.g. opus vs sonnet vs haiku) before
+    // an EC2 boots and burns billing on the wrong tier.
    const onDeployed = vi.fn();
-    const { result } = renderHook(() => useTemplateDeploy({ onDeployed }));
+    const { result, rerender } = renderHook(() =>
+      useTemplateDeploy({ onDeployed }),
+    );

    await act(async () => {
      await result.current.deploy(makeTemplate());
    });

+    rerender();
+    render(<>{result.current.modal}</>);
+
+    expect(screen.getByTestId("missing-keys-modal")).toBeTruthy();
+    // POST does NOT fire until the user confirms in the picker.
+    expect(mockApiPost).not.toHaveBeenCalled();
+    expect(onDeployed).not.toHaveBeenCalled();
+    expect(result.current.deploying).toBeNull();
+  });
+
+  it("empty configuredKeys (preflight defensive fallback) still opens picker", async () => {
+    // checkDeploySecrets falls back to an empty Set when the
+    // /settings/secrets endpoint errors — the modal must still
+    // open so the user isn't blocked, just with every entry
+    // rendered as input rather than Saved.
+    mockCheckDeploySecrets.mockResolvedValueOnce({
+      ok: true,
+      missingKeys: [],
+      providers: [
+        { id: "MINIMAX_API_KEY", label: "MiniMax", envVars: ["MINIMAX_API_KEY"] },
+        { id: "ANTHROPIC_API_KEY", label: "Anthropic", envVars: ["ANTHROPIC_API_KEY"] },
+      ],
+      runtime: "hermes",
+      configuredKeys: new Set(),
+    });
+    const { result, rerender } = renderHook(() => useTemplateDeploy());
+
+    await act(async () => {
+      await result.current.deploy(multiProviderTemplate());
+    });
+
+    rerender();
+    render(<>{result.current.modal}</>);
+
+    expect(screen.getByTestId("missing-keys-modal")).toBeTruthy();
+    expect(screen.getByTestId("modal-configured-size").textContent).toBe("0");
+    expect(mockApiPost).not.toHaveBeenCalled();
+  });
+});
+
+describe("useTemplateDeploy — POST failure", () => {
+  it("POST rejection sets error and clears deploying", async () => {
+    mockApiPost.mockRejectedValueOnce(new Error("server 500"));
+    const onDeployed = vi.fn();
+    const { result, rerender } = renderHook(() =>
+      useTemplateDeploy({ onDeployed }),
+    );
+
+    await deployThroughPicker(result, rerender, makeTemplate());
+
    expect(result.current.error).toBe("server 500");
    expect(result.current.deploying).toBeNull();
    expect(onDeployed).not.toHaveBeenCalled();
@@ -304,11 +532,9 @@ describe("useTemplateDeploy — POST failure", () => {

  it("non-Error rejection still surfaces a message (defensive)", async () => {
    mockApiPost.mockRejectedValueOnce("plain string");
-    const { result } = renderHook(() => useTemplateDeploy());
+    const { result, rerender } = renderHook(() => useTemplateDeploy());

-    await act(async () => {
-      await result.current.deploy(makeTemplate());
-    });
+    await deployThroughPicker(result, rerender, makeTemplate());

    expect(result.current.error).toBe("Deploy failed");
    expect(result.current.deploying).toBeNull();
@@ -44,7 +44,11 @@ export interface UseTemplateDeployOptions {
 /** Paired template + preflight result carried through the "user
 *  clicked deploy → modal opens → keys saved → retry" loop. Named
 *  so the `useState` generic and any future signature change have
- *  a single place to track. */
+ *  a single place to track. `preflight.configuredKeys` lets the
+ *  modal mark pre-saved entries without re-prompting — the
+ *  template-deploy "always ask" flow surfaces the picker even when
+ *  preflight.ok is true so the user can pick a different provider
+ *  per workspace. */
 interface MissingKeysInfo {
  template: Template;
  preflight: PreflightResult;
@@ -81,9 +85,14 @@ export function useTemplateDeploy(

  /** Actually execute the POST /workspaces call. Split from `deploy`
   *  so the "modal → keys added → retry" path can reuse it without
-   *  re-running preflight (the user just proved the keys are now set). */
+   *  re-running preflight (the user just proved the keys are now set).
+   *
+   *  `model` (optional) is the user-picked model slug from the picker
+   *  modal. When the template is multi-provider, hermes-style routing
+   *  reads the slug prefix at install time to pick the upstream
+   *  endpoint, so the slug must reach the workspace verbatim. */
  const executeDeploy = useCallback(
-    async (template: Template) => {
+    async (template: Template, model?: string) => {
      setDeploying(template.id);
      setError(null);
      try {
@@ -98,6 +107,7 @@ export function useTemplateDeploy(
          template: template.id,
          tier: template.tier,
          canvas: coords,
+          ...(model ? { model } : {}),
        });
        onDeployed?.(ws.id);
      } catch (e) {
@@ -133,33 +143,70 @@ export function useTemplateDeploy(
        setDeploying(null);
        return;
      }
-      if (!preflight.ok) {
-        setMissingKeysInfo({ template, preflight });
-        setDeploying(null);
-        return;
-      }
-      await executeDeploy(template);
+      // Always open the picker — every deploy goes through an
+      // explicit confirm-provider/model step. Reasons:
+      //   1. Multi-provider templates (e.g. hermes) need a per-
+      //      workspace pick or the adapter falls back to its
+      //      compiled-in default and 500s with "No LLM provider
+      //      configured".
+      //   2. Single-provider templates (claude-code, langgraph)
+      //      still need the model field — the template's default
+      //      may be wrong for the user's billing tier or a model
+      //      they explicitly want (sonnet vs opus vs haiku).
+      //   3. Even when keys + model are pre-filled, surfacing the
+      //      modal one-click-away is the cheapest UX for catching
+      //      a misconfigured org BEFORE provisioning an EC2 that
+      //      will then sit in degraded.
+      // The picker handles the "all-keys-saved single-provider"
+      // case as a confirm-only prompt (provider radio is hidden,
+      // model input is pre-filled with template.model).
+      setMissingKeysInfo({ template, preflight });
+      setDeploying(null);
    },
-    [executeDeploy],
+    [],
  );

  // No useCallback here — consumers call this on every render anyway
  // (it's placed inline in JSX), and useCallback's deps would
  // invalidate on every state change, making the memoisation a wash.
  // Plain ReactNode is simpler and equally performant.
+  const isMultiProvider = (missingKeysInfo?.preflight.providers.length ?? 0) >= 2;
+  // Suggestions for the model field — pull declared model ids from the
+  // template. Templates without `models` declared (e.g. claude-code)
+  // pass [] which suppresses the model field entirely.
+  const modelSuggestions =
+    missingKeysInfo?.template.models?.map((m) => m.id) ?? [];
+  // Pre-fill the model input with the template's default `model` so
+  // confirming without changing it preserves today's behaviour.
+  const initialModel = missingKeysInfo?.template.model;
+  // When the user has keys configured (preflight.ok) we re-purpose the
+  // modal as a "confirm provider/model" prompt — adjust copy
+  // accordingly so it doesn't claim keys are missing.
+  const allConfigured = missingKeysInfo?.preflight.ok ?? false;
+  const modalTitle = allConfigured
+    ? "Configure Workspace"
+    : undefined;
+  const modalDescription = allConfigured
+    ? "Pick the provider and model for this workspace. Saved API keys are reused automatically."
+    : undefined;
  const modal: ReactNode = (
    <MissingKeysModal
      open={!!missingKeysInfo}
      missingKeys={missingKeysInfo?.preflight.missingKeys ?? []}
      providers={missingKeysInfo?.preflight.providers ?? []}
      runtime={missingKeysInfo?.preflight.runtime ?? ""}
-      onKeysAdded={() => {
+      configuredKeys={missingKeysInfo?.preflight.configuredKeys}
+      modelSuggestions={isMultiProvider ? modelSuggestions : undefined}
+      initialModel={isMultiProvider ? initialModel : undefined}
+      title={modalTitle}
+      description={modalDescription}
+      onKeysAdded={(model?: string) => {
        if (missingKeysInfo) {
          const template = missingKeysInfo.template;
          setMissingKeysInfo(null);
          // Intentional fire-and-forget — executeDeploy manages
          // its own error state via setError.
-          void executeDeploy(template);
+          void executeDeploy(template, model);
        }
      }}
      onCancel={() => setMissingKeysInfo(null)}
@@ -244,5 +244,26 @@ describe("checkDeploySecrets", () => {
    const result = await checkDeploySecrets(LANGGRAPH);
    expect(result.ok).toBe(false);
    expect(result.missingKeys).toEqual(["OPENAI_API_KEY"]);
+    // Empty Set on fetch failure — useTemplateDeploy relies on this
+    // so the picker still opens with every entry rendered as input.
+    expect(result.configuredKeys).toEqual(new Set());
+  });
+
+  it("surfaces configuredKeys (has_value=true entries only) so callers skip a second fetch", async () => {
+    (global.fetch as ReturnType<typeof vi.fn>).mockResolvedValueOnce({
+      ok: true,
+      json: () =>
+        Promise.resolve([
+          { key: "ANTHROPIC_API_KEY", has_value: true, created_at: "", updated_at: "" },
+          { key: "OPENROUTER_API_KEY", has_value: false, created_at: "", updated_at: "" },
+          { key: "RANDOM_OTHER_KEY", has_value: true, created_at: "", updated_at: "" },
+        ]),
+    } as Response);
+
+    const result = await checkDeploySecrets(HERMES);
+    // Only has_value=true entries belong in the set.
+    expect(result.configuredKeys).toEqual(
+      new Set(["ANTHROPIC_API_KEY", "RANDOM_OTHER_KEY"]),
+    );
  });
 });
@@ -91,6 +91,12 @@ export interface PreflightResult {
   *  required (AllKeysModal renders the N envVars inline). */
  providers: ProviderChoice[];
  runtime: string;
+  /** Set of env var names already configured (i.e. `has_value: true`) at
+   *  the relevant scope (workspace if `workspaceId` was passed, otherwise
+   *  global). Surfaced so callers can mark pre-saved entries in the
+   *  picker without making a second `/settings/secrets` round trip.
+   *  Empty Set on secrets-endpoint failure (treated as "nothing set"). */
+  configuredKeys: Set<string>;
 }

 /* ---------- Provider options ---------- */
@@ -235,7 +241,13 @@ export async function checkDeploySecrets(

  if (providers.length === 0) {
    // Template declares no env requirements — nothing to preflight.
-    return { ok: true, missingKeys: [], providers: [], runtime };
+    return {
+      ok: true,
+      missingKeys: [],
+      providers: [],
+      runtime,
+      configuredKeys: new Set(),
+    };
  }

  let configured: Set<string>;
@@ -254,7 +266,13 @@ export async function checkDeploySecrets(
  }

  if (findSatisfiedProvider(providers, configured)) {
-    return { ok: true, missingKeys: [], providers, runtime };
+    return {
+      ok: true,
+      missingKeys: [],
+      providers,
+      runtime,
+      configuredKeys: configured,
+    };
  }

  // Nothing configured — surface every candidate env var so the modal
@@ -262,5 +280,11 @@ export async function checkDeploySecrets(
  const missingKeys = Array.from(
    new Set(providers.flatMap((p) => p.envVars)),
  );
-  return { ok: false, missingKeys, providers, runtime };
+  return {
+    ok: false,
+    missingKeys,
+    providers,
+    runtime,
+    configuredKeys: configured,
+  };
 }
@@ -2,7 +2,7 @@

 **Status:** living document — update when you ship a feature that touches one backend.
 **Owner:** workspace-server + controlplane teams.
-**Last audit:** 2026-04-23 (Claude agent, PR #TBD).
+**Last audit:** 2026-05-02 (Claude agent, PR #TBD).

 ## Why this exists

@@ -37,6 +37,12 @@ This document is the canonical matrix. If you are landing a workspace-facing fea
 | **A2A proxy** | | | | |
 | Forward | `a2a_proxy.go` | `127.0.0.1:<port>` | EC2 private IP inside tenant VPC | ✅ parity |
 | Liveness | `a2a_proxy_helpers.go` | `provisioner.IsRunning()` | `cpProv.IsRunning()` (DB-backed) | ✅ parity |
+| Channel envelope enrichment (peer_name / peer_role / agent_card_url) | `a2a_proxy.go` + workspace-runtime channel emitter (PR #2471) | inbox row carries enriched fields | inbox row carries enriched fields | ✅ parity as of 2026-05-02 |
+| **MCP tools (a2a)** | | | | |
+| `chat_history` — fetch prior turns with a peer | `mcp_server.go` + workspace-runtime `a2a_mcp` (PR #2474) | runtime-served, backend-agnostic | runtime-served, backend-agnostic | ✅ parity as of 2026-05-02 |
+| **Activity API** | | | | |
+| `before_ts` paging on `/workspaces/:id/activity` | `activity.go` (PR #2476) | DB-driven | DB-driven | ✅ parity as of 2026-05-02 |
+| `peer_id` filter on `/workspaces/:id/activity` | `activity.go` (PR #2472) | DB-driven | DB-driven | ✅ parity as of 2026-05-02 |
 | **Config / template injection** | | | | |
 | Template copy at provision | `provisioner.go:553-648` | host walk → tar → `CopyToContainer(/configs)` | CP user-data bakes template into bootstrap script | ⚠️ divergent — sync (docker) vs async (EC2) |
 | Runtime config hot-reload | `templates.go` + handlers | no hot-reload — restart required | no hot-reload — restart required | ✅ parity (both require restart; acceptable) |
@@ -45,6 +51,9 @@ This document is the canonical matrix. If you are landing a workspace-facing fea
 | **Bootstrap signals** | | | | |
 | Ready detection | registry `/registry/register` | container heartbeat | tenant heartbeat + boot-event phone-home (CP `bootevents` table + `wait_platform_health=ok`) | ✅ parity as of molecule-controlplane#235 |
 | Console / log output | `workspace_bootstrap.go` | `docker logs` | `ec2:GetConsoleOutput` via CP proxy | 🟡 ec2-only (docker has `docker logs` directly; no unified API) |
+| `runtime_wedge` post-`execute()` smoke gate | workspace-runtime `smoke_mode.py` (PRs #2473 + #2475) | runtime-served, surfaces SDK-init wedges to wheel-smoke + container start | runtime-served, surfaces SDK-init wedges to wheel-smoke + container start | ✅ parity as of 2026-05-02 |
+| **Test infrastructure** | | | | |
+| Canvas-E2E `.playwright-staging-state.json` written before any CP call | `tools/e2e-staging-setup` (PR #2327, 2026-04-30) | n/a — staging-only safety net | required so workflow safety-net can find slug; pattern-sweeping by date prefix poisons concurrent runs | ✅ enforced (staging E2E) |
 | **Orphan cleanup** | | | | |
 | Detect + terminate stale | `healthsweep.go` + CP `DeprovisionInstance` | Docker daemon scan | CP OrgID-tag cascade (molecule-controlplane#234) | ✅ parity as of 2026-04-23 |
 | **Health / budget / schedules** | | | | |
@@ -16,7 +16,11 @@ workspace container running on it) over an [EC2 Instance Connect
 Endpoint](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-connect-setup-ec2-instance-connect-endpoint.html).
 End users see a terminal; no direct public SSH ingress is required.

-Tracking: [molecule-core#1528](https://github.com/Molecule-AI/molecule-core/issues/1528) (resolved 2026-04-22).
+Tracking: originally `molecule-core#1528` (resolved 2026-04-22). The
+`molecule-core` repo has since been renamed to `molecule-monorepo` and no
+longer accepts new issues under the old name; future terminal work is
+tracked in `molecule-monorepo` issues (workspace-server scope) and in
+`molecule-controlplane` issues for the EIC / per-tenant SG path.

 ## Where things are

@@ -17,6 +17,29 @@ distinct from the PyPI package) is no longer the source-of-truth and should
 be treated as a publish artifact only. It can be archived or used as a
 read-only mirror.

+## Where to make changes
+
+**All runtime edits land in `molecule-monorepo/workspace/`. Period.**
+
+The GitHub repo `Molecule-AI/molecule-ai-workspace-runtime` is **mirror-only**.
+It exists so external consumers (template repos, downstream operators) have a
+git-cloneable artifact that mirrors the PyPI wheel — nothing more.
+
+- **Direct PRs against `molecule-ai-workspace-runtime` are auto-rejected by
+  the `mirror-guard` CI check.** The check fails any push that did not come
+  from the publish pipeline. There is no opt-out — file the change against
+  `molecule-monorepo/workspace/` instead.
+- **The mirror + the PyPI wheel both auto-regenerate on every push to
+  `staging`** via `.github/workflows/publish-runtime.yml` (which calls
+  `scripts/build_runtime_package.py`, builds wheel + sdist, smoke-imports,
+  uploads to PyPI via Trusted Publisher, and force-pushes the rewritten tree
+  to the mirror repo). You never touch the mirror by hand.
+
+If you have an old local clone of the mirror and try to push a fix to it
+directly, expect a CI failure with a message pointing you here. Re-open the
+change against `molecule-monorepo/workspace/` and let the publish workflow
+do the rest.
+
 ## Why this shape

 The 8 workspace template repos (claude-code, langgraph, hermes, etc.) each
@@ -0,0 +1,49 @@
+# scripts/
+
+Operational and one-off scripts for molecule-core. Most are
+self-documenting — see the header comments in each file.
+
+## RFC #2251 coordinator task-bound harnesses
+
+There are three related scripts; pick the right one:
+
+| Script | Purpose | Targets |
+|---|---|---|
+| `measure-coordinator-task-bounds.sh` | **Canonical** v1 harness for the RFC #2251 / Issue 4 reproduction. Provisions a PM coordinator + Researcher child via `claude-code-default` + `langgraph` templates, sends a synthesis-heavy A2A kickoff, observes elapsed time + activity trace. | OSS-shape platform — localhost or any `/workspaces`-shaped endpoint. Has tenant/admin-token guards for non-localhost runs. |
+| `measure-coordinator-task-bounds-runner.sh` | Generalised runner for the same measurement contract but with **arbitrary template + secret + model combinations** (Hermes/MiniMax, etc.). Useful for cross-runtime variants without modifying the canonical harness. | Same as above (local or SaaS via `MODE=saas`). |
+| `measure-coordinator-task-bounds.sh` (in [molecule-controlplane](https://github.com/Molecule-AI/molecule-controlplane)) | **Production-shape** variant that bootstraps a real staging tenant via `POST /cp/admin/orgs`, then runs the same measurement against `<slug>.staging.moleculesai.app`. | Staging controlplane only — refuses to run against production. |
+
+See `reference_harness_pair_pattern` (auto-memory) for when to use which
+and the cross-repo design rationale.
+
+### Common safety pattern across all three
+
+- **Cleanup trap** on EXIT/INT/TERM auto-deletes provisioned resources.
+- **`DRY_RUN=1`** prints plan + auth fingerprint, exits before any
+  state mutation. Run this before pointing at staging or any shared
+  infrastructure.
+- **Non-target guard** refuses arbitrary endpoints (the controlplane
+  variant is locked to `staging-api.moleculesai.app`; the OSS variant
+  requires explicit auth + tenant scoping for non-localhost PLATFORM).
+- **Cleanup failures emit `cleanup_*_failed` events** with remediation
+  hints; no silenced curl. ADMIN_TOKEN expiring mid-run surfaces as a
+  structured event rather than a silent leak.
+
+### Activity trace caveat
+
+If `activity_trace.raw == "<endpoint_unavailable>"`, the per-workspace
+`/activity` endpoint isn't wired on the target build — the bound
+measurement is INCONCLUSIVE on the platform-ceiling question. Either
+wire the endpoint or replace with the equivalent Datadog query. Note
+that `/activity` accepts a `since_secs` query parameter; see the
+endpoint handler for the supported range.
+
+## Other scripts
+
+- `cleanup-rogue-workspaces.sh` — emergency teardown for leaked
+  workspaces. Prompts for confirmation. Pair with the harnesses if a
+  cleanup trap fails (see `cleanup_*_failed` events).
+- `canary-smoke.sh` — quick smoke test for canary releases.
+- `dev-start.sh` — local-dev platform bring-up.
+
+The rest are self-documenting in their header comments.
@@ -59,20 +59,27 @@ TOP_LEVEL_MODULES = {
    "agent",
    "agents_md",
    "config",
+    "configs_dir",
    "consolidation",
    "coordinator",
    "events",
    "executor_helpers",
    "heartbeat",
+    "inbox",
    "initial_prompt",
+    "internal_chat_uploads",
+    "internal_file_read",
    "main",
+    "mcp_cli",
    "molecule_ai_status",
    "platform_auth",
+    "platform_inbound_auth",
    "plugins",
    "preflight",
    "prompt",
    "runtime_wedge",
    "shared_runtime",
+    "smoke_mode",
    "transcript_auth",
    "watcher",
 }
@@ -145,6 +152,13 @@ def rewrite_imports(text: str, regex: re.Pattern) -> str:
    `import X`           → `import molecule_runtime.X as X`  (preserve binding)
    `from X import Y`    → `from molecule_runtime.X import Y`
    `from X.sub import Y` → `from molecule_runtime.X.sub import Y`
+
+    Rejects `import X as Y` because the rewrite would produce
+    `import molecule_runtime.X as X as Y`, a syntax error. The PR #2433
+    incident shipped this exact pattern past `Python Lint & Test` (which
+    runs against pre-rewrite source) but blew up the wheel-smoke gate.
+    Detecting it here turns the silent build failure into a build-time
+    error with a clear path: use `from X import …` or plain `import X`.
    """
    def repl(m: re.Match) -> str:
        indent, kw, mod, rest = m.group("indent"), m.group("kw"), m.group("mod"), m.group("rest")
@@ -158,6 +172,26 @@ def rewrite_imports(text: str, regex: re.Pattern) -> str:
            # `import X.sub` — rewrite as `import molecule_runtime.X.sub` and
            # leave the trailing dot pattern intact for the rest of the line.
            return f"{indent}import molecule_runtime.{mod}{rest}"
+        # Detect `import X as Y` — the regex's `rest` group captures only
+        # the immediate following char (whitespace, comma, or EOL), so we
+        # have to peek at the surrounding line context. The match start is
+        # at the line's `import` keyword; everything after the matched
+        # name on the same line is what the source author wrote.
+        line_start = text.rfind("\n", 0, m.start()) + 1
+        line_end = text.find("\n", m.end())
+        if line_end == -1:
+            line_end = len(text)
+        line_after = text[m.end() - len(rest):line_end]
+        # Strip comments from consideration so `import X  # noqa` doesn't trip.
+        line_after_no_comment = line_after.split("#", 1)[0]
+        if re.search(r"^\s*as\s+\w+", line_after_no_comment):
+            raise ValueError(
+                f"rewrite_imports: cannot rewrite 'import {mod} as <alias>' on a "
+                f"workspace module — the regex would produce "
+                f"'import molecule_runtime.{mod} as {mod} as <alias>', invalid syntax. "
+                f"Use 'from {mod} import …' or plain 'import {mod}' instead. "
+                f"Offending line: {text[line_start:line_end]!r}"
+            )
        # Plain `import X` — alias preserves the local name.
        return f"{indent}import molecule_runtime.{mod} as {mod}{rest}"
    return regex.sub(repl, text)
@@ -214,6 +248,7 @@ dependencies = [

 [project.scripts]
 molecule-runtime = "molecule_runtime.main:main_sync"
+molecule-mcp = "molecule_runtime.mcp_cli:main"

 [tool.setuptools.packages.find]
 where = ["."]
@@ -237,6 +272,31 @@ directory** by the `publish-runtime` GitHub Actions workflow on every
 `runtime-v*` tag push. **Do not edit this package directly** — edit
 `workspace/` in the monorepo.

+## External-runtime MCP server (`molecule-mcp`)
+
+Operators running an agent outside the platform's container fleet
+(any runtime that supports MCP stdio — Claude Code, hermes, codex,
+etc.) can install this wheel and run the universal MCP server
+locally:
+
+```sh
+pip install molecule-ai-workspace-runtime
+WORKSPACE_ID=<uuid> \\
+  PLATFORM_URL=https://<tenant>.staging.moleculesai.app \\
+  MOLECULE_WORKSPACE_TOKEN=<bearer> \\
+  molecule-mcp
+```
+
+That exposes the same 8 platform tools (`delegate_task`, `list_peers`,
+`send_message_to_user`, `commit_memory`, etc.) that container-bound
+runtimes already get via the workspace's auto-spawned MCP. Register
+the binary in your agent's MCP config (e.g. Claude Code's
+`claude mcp add molecule -- molecule-mcp` with the env above).
+
+The token comes from the canvas → Tokens tab. Restarting an external
+workspace from the canvas no longer revokes the token (PR #2412), so
+operator tokens persist across status nudges.
+
 See [`docs/workspace-runtime-package.md`](https://github.com/Molecule-AI/molecule-core/blob/main/docs/workspace-runtime-package.md)
 for the publish flow and architecture.
 """
@@ -0,0 +1,306 @@
+# Demo-day runbook
+
+Pre-, during-, and post-demo operational procedures for the molecule
+production stack. Updated 2026-05-01 ahead of the funding-demo on
+~2026-05-06.
+
+The whole stack:
+
+```
+Vercel canvas (app.moleculesai.app)
+  → Railway controlplane (api.moleculesai.app)
+    → CloudFront/Cloudflare per-tenant edge (<slug>.moleculesai.app)
+      → EC2 tenant instance running platform container
+        → Docker workspaces pulled from
+          ghcr.io/molecule-ai/workspace-template-<runtime>:latest
+```
+
+Every layer has its own deploy/rollback story. This runbook indexes
+them in the order an operator would touch them during an incident.
+
+## Pre-demo (T-48h to T-1h)
+
+### 1. Freeze the runtime + template image cascade
+
+A merge to `molecule-core/staging` that touches `workspace/**` triggers
+`publish-runtime.yml` → PyPI bump → repository_dispatch → 8 template
+repos rebuild and re-tag `:latest`. A merge to any template repo's
+`main` triggers the same final re-tag directly. Either path means a
+new workspace provision during the demo pulls whatever `:latest`
+resolved to seconds earlier.
+
+Capture current good digests + disable both cascade vectors:
+
+```bash
+# Dry-run first — verifies digests can be fetched and tooling is set up
+scripts/demo-freeze.sh
+
+# Apply
+scripts/demo-freeze.sh --execute
+```
+
+The script writes two receipts to `scripts/demo-freeze-snapshots/`:
+
+- `digests-<TS>.txt` — current `:latest` digest per template (rollback target if needed)
+- `disabled-workflows-<TS>.txt` — workflow paths to re-enable post-demo
+
+Verify the freeze landed:
+
+```bash
+gh workflow list -R Molecule-AI/molecule-core | grep publish-runtime
+# expect: status = disabled_manually
+```
+
+If a critical fix MUST ship during the freeze window:
+
+1. `gh workflow enable publish-runtime.yml -R Molecule-AI/molecule-core`
+2. Merge the fix
+3. Watch the cascade through to GHCR:latest manually
+4. Smoke-verify against a staging tenant (`scripts/api-smoke.sh` or
+   manual canvas walkthrough)
+5. `gh workflow disable publish-runtime.yml -R Molecule-AI/molecule-core` to re-freeze
+
+Don't auto-promote during the freeze — the value of the freeze is that
+nothing happens automatically.
+
+### 2. Confirm production CP is on the expected SHA
+
+```bash
+gh run list -R Molecule-AI/molecule-controlplane --branch main --limit 5
+# Last `ci` run should be SUCCESS with the SHA you intend to demo on
+```
+
+Railway auto-deploys from main. Spot-check `api.moleculesai.app`:
+
+```bash
+curl -fsS -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
+  https://api.moleculesai.app/cp/admin/orgs?limit=1
+# Expect: 200 + a JSON {"orgs": [...]}
+```
+
+### 3. Confirm production canvas (Vercel) is on main
+
+Vercel auto-deploys `main`. Verify in the Vercel dashboard the most
+recent prod deploy ran from the expected commit SHA.
+
+### 4. Pre-warm the demo tenant
+
+Cold-start times on workspace-template images:
+
+| Runtime | Cold-start (first boot) |
+|---|---|
+| claude-code | ~30-60s |
+| openclaw | ~1-2 min |
+| langgraph | ~1 min |
+| hermes | **~7 min** (large image) |
+
+If the demo will use `hermes`, provision the demo workspace at least
+10 min before. The cold-start clock starts when the workspace is
+created, not when it's used.
+
+## During demo — emergency rollback levers
+
+### Lever A: Platform-image rollback (canvas/CP layer regression)
+
+If the canvas or platform container shipped a regression, retag
+`:latest` to a prior staging SHA without rebuilding:
+
+```bash
+# Find a known-good SHA from staging history
+gh run list -R Molecule-AI/molecule-core --workflow=publish-canvas-image.yml --limit 5
+
+# Roll both platform + tenant images
+GITHUB_TOKEN=$(gh auth token) scripts/rollback-latest.sh <good-sha>
+```
+
+`rollback-latest.sh` retags both `ghcr.io/molecule-ai/platform:latest`
+and `ghcr.io/molecule-ai/platform-tenant:latest`. Existing tenants
+auto-pull `:latest` every 5 min — rollback propagates without manual
+restart.
+
+### Lever B: Workspace-template image rollback
+
+If a specific runtime template (claude-code, hermes, etc.) shipped a
+broken `:latest`:
+
+```bash
+# Get the demo's snapshotted-good digest from the freeze receipt
+grep claude-code scripts/demo-freeze-snapshots/digests-<TS>.txt
+
+# Retag :latest back to the snapshotted digest using crane
+crane auth login ghcr.io -u "$(gh api user --jq .login)" \
+  --password-stdin <<< "$(gh auth token)"
+crane tag \
+  ghcr.io/molecule-ai/workspace-template-claude-code@sha256:<digest> \
+  latest
+```
+
+The next workspace provision pulls the rolled-back image. Existing
+workspaces are unaffected (their image is already loaded into Docker).
+
+### Lever C: Wedged demo tenant — redeploy
+
+If the demo tenant's EC2 instance is wedged (boot succeeded but app
+not responding, or a stuck workspace), the controlplane has an admin
+redeploy endpoint:
+
+```bash
+# AWS-side: forces a fresh EC2 launch with current image. ~3 min.
+curl -fsS -X POST \
+  -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
+  https://api.moleculesai.app/cp/admin/orgs/<slug>/redeploy
+```
+
+WARNING per memory: this triggers real EC2 + SSM actions on production.
+Double-check `<slug>` against the demo tenant's slug before pressing
+return. The `/redeploy` endpoint is idempotent on the EC2 side but
+WILL drop active SSH sessions.
+
+### Lever D: Specific bad workspace — delete
+
+If a single workspace inside the demo tenant is misbehaving (e.g.
+hermes wedged on cold-start, claude-code returning the generic
+"Agent error (Exception)" message), kill it:
+
+```bash
+# Get the demo tenant's per-tenant ADMIN_TOKEN
+TENANT_ADMIN=$(curl -fsS -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
+  https://api.moleculesai.app/cp/admin/orgs/<slug>/admin-token \
+  | jq -r .admin_token)
+
+ORG_ID=$(curl -fsS -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
+  https://api.moleculesai.app/cp/admin/orgs?limit=20 \
+  | jq -r '.orgs[] | select(.slug=="<slug>") | .id')
+
+# Delete the bad workspace
+curl -fsS -X DELETE \
+  -H "Origin: https://<slug>.moleculesai.app" \
+  -H "Authorization: Bearer $TENANT_ADMIN" \
+  -H "X-Molecule-Org-Id: $ORG_ID" \
+  https://<slug>.moleculesai.app/workspaces/<workspace-id>
+```
+
+Then re-provision a fresh workspace from the canvas. Faster than
+debugging the wedged one.
+
+### Lever E: Railway production rollback (CP regression)
+
+If the last Railway deploy of CP introduced a regression that lever A
+can't fix (e.g. a logic bug, not a container issue):
+
+1. Open Railway dashboard → molecule-platform → controlplane → Deployments
+2. Find the previous-known-good deployment
+3. Click **Rollback to this deployment**
+
+Manual step — no CLI equivalent built. Takes ~30s to redeploy from
+the prior image. Note: rollback restores the prior code AND prior env
+var snapshot; don't expect any env var changes made since to persist.
+
+### Lever F: Vercel production rollback (canvas regression)
+
+If the canvas ships a regression:
+
+1. Open Vercel dashboard → molecule-app → Deployments
+2. Find the previous prod deployment
+3. **Promote to Production**
+
+Same pattern as Railway — fast revert, no rebuild.
+
+## Tenant-level read-only diagnostics (not actions)
+
+Useful during a "is this working?" moment without touching anything:
+
+```bash
+# Tenant infra state
+curl -fsS -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
+  "https://api.moleculesai.app/cp/admin/orgs?limit=20" \
+  | jq '.orgs[] | select(.slug=="<slug>")'
+
+# Tenant boot events (debug a stuck provision)
+curl -fsS -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
+  "https://api.moleculesai.app/cp/admin/tenants/<slug>/boot-events?limit=50" \
+  | jq
+
+# Workspace activity (debug an unresponsive agent)
+curl -fsS \
+  -H "Origin: https://<slug>.moleculesai.app" \
+  -H "Authorization: Bearer $TENANT_ADMIN" \
+  -H "X-Molecule-Org-Id: $ORG_ID" \
+  "https://<slug>.moleculesai.app/workspaces/<workspace-id>/activity?limit=20" \
+  | jq
+```
+
+## Post-demo (T+30m to T+24h)
+
+### 1. Thaw the cascades
+
+```bash
+# Find the freeze receipt
+ls scripts/demo-freeze-snapshots/
+
+# Thaw — pass the timestamp suffix
+scripts/demo-thaw.sh 20260506-180000
+```
+
+The next merge to `molecule-core/staging` (workspace/**) or any
+template repo's `main` will resume the auto-rebuild cascade.
+
+### 2. Audit what was held back
+
+If any merges queued during the freeze:
+
+```bash
+gh pr list -R Molecule-AI/molecule-core --base staging --state merged \
+  --search "merged:>=$(date -u -v-7d +%Y-%m-%d)"
+```
+
+Verify each merge's CI is green and dispatch the runtime cascade once
+to ensure all templates rebuild against the post-freeze HEAD.
+
+### 3. File a post-mortem if anything fired
+
+If any rollback lever was used during the demo, file a brief doc:
+
+- Which lever (A through F)
+- Which SHA was rolled back FROM and TO
+- Did the rollback fully resolve the issue or was a follow-up needed
+- Whether the underlying regression should have been caught by CI
+
+## Common issues + first-line fix
+
+| Symptom | First lever to try |
+|---|---|
+| Workspace boots but agent always errors | Lever D (delete + reprovision) |
+| Whole tenant unreachable | Lever C (redeploy) |
+| Canvas crashes on load | Lever F (Vercel rollback) |
+| Login broken / API errors | Lever E (Railway rollback) |
+| Specific runtime broken across tenants | Lever B (template image rollback) |
+| Platform container regression | Lever A (rollback-latest.sh) |
+| Mid-demo stray PR auto-published a bad image | Lever B + investigate why freeze didn't catch it |
+
+## Auth fingerprint (rotate post-demo)
+
+The freeze + rollback procedures assume:
+
+- `CP_ADMIN_API_TOKEN` available via `railway variables --kv --environment production`
+- `gh auth token` returns a working PAT with `workflow:write` + `write:packages`
+- `crane` installed (`brew install crane`)
+
+After the demo, **rotate** `CP_ADMIN_API_TOKEN` (it's the keys-to-the-kingdom
+token for production) — it likely got copy-pasted into shells during
+the demo.
+
+```bash
+# Generate a new admin token
+NEW_TOKEN=$(openssl rand -hex 32)
+
+# Update Railway production env var (and optionally staging)
+railway variables --set CP_ADMIN_API_TOKEN="$NEW_TOKEN" --environment production
+
+# Restart CP service to pick up the change
+# (Railway auto-restarts on env var change)
+
+# Verify
+curl -fsS -H "Authorization: Bearer $NEW_TOKEN" \
+  https://api.moleculesai.app/cp/admin/orgs?limit=1
+```
@@ -0,0 +1,6 @@
+# Generated by scripts/demo-freeze.sh — receipts are operational state,
+# not source. Tracked .gitignore + .gitkeep keep the directory itself
+# in version control so the freeze script's output dir always exists.
+*
+!.gitignore
+!.gitkeep
@@ -0,0 +1,214 @@
+#!/usr/bin/env bash
+# demo-freeze.sh — disable the runtime + template image publish cascades
+# during a demo-prep window so a stray staging merge can't auto-rebuild
+# `:latest` for the 8 workspace-template images mid-demo.
+#
+# Demo prep typically runs T-48h to T+1h. During that window:
+#
+#   PATH 1: any merge to molecule-core/staging that touches workspace/**
+#           → publish-runtime.yml fires
+#           → PyPI auto-bumps molecule-ai-workspace-runtime patch version
+#           → repository_dispatch fans out to 8 workspace-template-* repos
+#           → each template repo rebuilds and re-tags
+#             ghcr.io/molecule-ai/workspace-template-<runtime>:latest
+#
+#   PATH 2: any merge to a workspace-template-* repo's main branch
+#           → that repo's publish-image.yml fires
+#           → ghcr.io/molecule-ai/workspace-template-<runtime>:latest
+#             gets re-tagged
+#
+#   provisioner.go:296 RuntimeImages[runtime] reads `:latest` at every
+#   workspace boot. A new workspace provision during demo pulls whatever
+#   `:latest` resolved to seconds earlier — so a bad merge minutes
+#   before the demo can break a tenant the funder is about to see.
+#
+# This script captures the current good `:latest` digests for all 8
+# templates and disables both cascade vectors. The complementary
+# demo-thaw.sh re-enables them.
+#
+# Usage:
+#   scripts/demo-freeze.sh                # dry run — print what would happen
+#   scripts/demo-freeze.sh --execute      # actually disable workflows + snapshot
+#
+# Prereqs:
+#   - gh CLI authenticated with workflow:write scope on Molecule-AI org
+#   - curl + jq (for digest snapshot via GHCR anonymous registry API)
+#
+# Output:
+#   <snapshot dir>/digests-YYYYMMDD-HHMMSS.txt
+#     One line per template: "<runtime>: <digest>"
+#   <snapshot dir>/disabled-workflows-YYYYMMDD-HHMMSS.txt
+#     One line per disabled workflow: "<repo>: <workflow>"
+#
+# Exit codes:
+#   0 — freeze complete (or dry-run successful)
+#   1 — pre-flight failure (missing tooling, missing auth, etc.)
+#   2 — partial freeze (some workflows did not disable cleanly; see log)
+
+set -euo pipefail
+
+usage() {
+  cat <<'USAGE'
+demo-freeze.sh — disable the runtime + template image publish cascades
+during a demo-prep window.
+
+Captures current :latest digests for all 8 workspace-template-* images
+and disables the workflows that would otherwise re-tag them.
+
+Usage:
+  scripts/demo-freeze.sh                # dry run — print what would happen
+  scripts/demo-freeze.sh --execute      # actually disable workflows + snapshot
+
+See the comment block at the top of this script for the full procedure.
+USAGE
+}
+
+EXECUTE=0
+case "${1:-}" in
+  --execute)
+    EXECUTE=1
+    ;;
+  --help|-h)
+    usage
+    exit 0
+    ;;
+  "")
+    ;;
+  *)
+    echo "unknown arg: $1" >&2
+    usage >&2
+    exit 2
+    ;;
+esac
+
+# Templates and their GHCR repository slugs. Source of truth for the
+# runtime → image map is workspace-server/internal/provisioner/provisioner.go
+# RuntimeImages — keep this list in sync if a runtime is added.
+TEMPLATES=(
+  "claude-code"
+  "hermes"
+  "openclaw"
+  "langgraph"
+  "deepagents"
+  "crewai"
+  "autogen"
+  "gemini-cli"
+)
+
+# Pre-flight: required tooling.
+need() {
+  command -v "$1" >/dev/null || { echo "ERROR: missing required tool: $1" >&2; exit 1; }
+}
+need gh
+need curl
+need jq
+
+# Pre-flight: gh auth. Snapshot via anonymous GHCR token works without
+# org auth, but workflow disable needs an authenticated gh.
+if ! gh auth status >/dev/null 2>&1; then
+  echo "ERROR: gh not authenticated. Run 'gh auth login' first." >&2
+  exit 1
+fi
+
+# Snapshot location relative to this script. Keeping it under scripts/
+# rather than a temp dir means freeze receipts are easy to find again
+# during the actual demo.
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+SNAPSHOT_DIR="${SCRIPT_DIR}/demo-freeze-snapshots"
+mkdir -p "$SNAPSHOT_DIR"
+TS="$(date -u +%Y%m%d-%H%M%S)"
+DIGESTS_FILE="${SNAPSHOT_DIR}/digests-${TS}.txt"
+WORKFLOWS_FILE="${SNAPSHOT_DIR}/disabled-workflows-${TS}.txt"
+
+if [ $EXECUTE -eq 0 ]; then
+  echo "=== DRY RUN (no changes will be made; pass --execute to apply) ==="
+else
+  echo "=== EXECUTING FREEZE — workflows will be disabled ==="
+fi
+echo "Snapshot timestamp: $TS"
+echo "Digest log:    $DIGESTS_FILE"
+echo "Workflow log:  $WORKFLOWS_FILE"
+echo
+
+# Step 1: capture current :latest digest for each template.
+echo "→ Capturing current :latest digests"
+for tpl in "${TEMPLATES[@]}"; do
+  token=$(curl -fsS "https://ghcr.io/token?scope=repository:molecule-ai/workspace-template-${tpl}:pull" | jq -r .token 2>/dev/null || true)
+  if [ -z "$token" ] || [ "$token" = "null" ]; then
+    echo "  WARN: token fetch failed for $tpl — skipping digest capture"
+    continue
+  fi
+  digest=$(curl -fsSI \
+    -H "Authorization: Bearer $token" \
+    -H "Accept: application/vnd.oci.image.index.v1+json" \
+    -H "Accept: application/vnd.docker.distribution.manifest.v2+json" \
+    "https://ghcr.io/v2/molecule-ai/workspace-template-${tpl}/manifests/latest" 2>/dev/null \
+    | grep -i 'docker-content-digest' \
+    | awk '{print $2}' \
+    | tr -d '\r')
+  if [ -z "$digest" ]; then
+    echo "  WARN: digest fetch failed for $tpl"
+    continue
+  fi
+  echo "  $tpl: $digest"
+  if [ $EXECUTE -eq 1 ]; then
+    echo "$tpl: $digest" >> "$DIGESTS_FILE"
+  fi
+done
+echo
+
+# Step 2: disable publish-runtime.yml in molecule-core (PATH 1 source).
+echo "→ Disabling publish-runtime.yml in molecule-core (kills runtime → 8-template cascade)"
+if [ $EXECUTE -eq 1 ]; then
+  if gh workflow disable publish-runtime.yml -R Molecule-AI/molecule-core 2>/tmp/freeze.err; then
+    echo "  OK   molecule-core/publish-runtime.yml disabled"
+    echo "Molecule-AI/molecule-core: publish-runtime.yml" >> "$WORKFLOWS_FILE"
+  else
+    echo "  FAIL molecule-core/publish-runtime.yml: $(cat /tmp/freeze.err)" >&2
+  fi
+else
+  echo "  (dry-run) would disable: gh workflow disable publish-runtime.yml -R Molecule-AI/molecule-core"
+fi
+echo
+
+# Step 3: disable publish-image.yml in each of the 8 template repos (PATH 2 sources).
+echo "→ Disabling publish-image.yml in each workspace-template-* repo"
+PARTIAL_FAIL=0
+for tpl in "${TEMPLATES[@]}"; do
+  repo="Molecule-AI/molecule-ai-workspace-template-${tpl}"
+  if [ $EXECUTE -eq 1 ]; then
+    if gh workflow disable publish-image.yml -R "$repo" 2>/tmp/freeze.err; then
+      echo "  OK   $repo/publish-image.yml disabled"
+      echo "${repo}: publish-image.yml" >> "$WORKFLOWS_FILE"
+    else
+      echo "  FAIL $repo/publish-image.yml: $(cat /tmp/freeze.err)" >&2
+      PARTIAL_FAIL=1
+    fi
+  else
+    echo "  (dry-run) would disable: gh workflow disable publish-image.yml -R $repo"
+  fi
+done
+echo
+
+if [ $EXECUTE -eq 0 ]; then
+  echo "=== DRY RUN COMPLETE ==="
+  echo "Re-run with --execute to apply the freeze."
+  exit 0
+fi
+
+echo "=== FREEZE COMPLETE ==="
+echo "Receipts: $DIGESTS_FILE"
+echo "          $WORKFLOWS_FILE"
+echo
+echo "Next steps:"
+echo "  - Verify by running: gh workflow list -R Molecule-AI/molecule-core | grep publish-runtime"
+echo "    Status should be 'disabled_manually'."
+echo "  - Demo proceeds; new workspaces pull the snapshotted :latest digests."
+echo "  - Post-demo, run: scripts/demo-thaw.sh ${TS}"
+echo "    to re-enable every workflow this freeze disabled."
+echo
+if [ $PARTIAL_FAIL -ne 0 ]; then
+  echo "WARNING: one or more workflows did not disable cleanly. Re-run after fixing." >&2
+  exit 2
+fi
+exit 0
@@ -0,0 +1,124 @@
+#!/usr/bin/env bash
+# demo-thaw.sh — re-enable workflows that demo-freeze.sh disabled.
+#
+# Usage:
+#   scripts/demo-thaw.sh <freeze-timestamp>
+#   scripts/demo-thaw.sh 20260503-180000
+#
+# Reads disabled-workflows-<ts>.txt produced by demo-freeze.sh and
+# runs `gh workflow enable` for each entry. Idempotent — re-enabling
+# an already-enabled workflow is a no-op.
+#
+# Defaults to executing (the inverse of freeze, which defaults to
+# dry-run). Pass --dry-run to print without executing.
+#
+# Prereqs:
+#   - gh CLI authenticated with workflow:write scope on Molecule-AI org
+#
+# Exit codes:
+#   0 — all workflows re-enabled
+#   1 — pre-flight failure (missing receipt file, missing tooling)
+#   2 — partial thaw (some workflows did not enable; check output)
+
+set -euo pipefail
+
+usage() {
+  cat <<'USAGE'
+demo-thaw.sh — re-enable workflows that demo-freeze.sh disabled.
+
+Usage:
+  scripts/demo-thaw.sh <freeze-timestamp>            # apply
+  scripts/demo-thaw.sh <freeze-timestamp> --dry-run  # print without applying
+
+ts is the YYYYMMDD-HHMMSS suffix on
+scripts/demo-freeze-snapshots/disabled-workflows-*.txt produced by
+demo-freeze.sh.
+USAGE
+}
+
+DRY_RUN=0
+TS=""
+for arg in "$@"; do
+  case "$arg" in
+    --dry-run)
+      DRY_RUN=1
+      ;;
+    --help|-h)
+      usage
+      exit 0
+      ;;
+    *)
+      if [ -z "$TS" ]; then
+        TS="$arg"
+      else
+        echo "unknown arg: $arg" >&2
+        usage >&2
+        exit 2
+      fi
+      ;;
+  esac
+done
+
+if [ -z "$TS" ]; then
+  echo "usage: $0 <freeze-timestamp> [--dry-run]" >&2
+  echo "  e.g. $0 20260503-180000" >&2
+  echo "  ts is the YYYYMMDD-HHMMSS suffix on demo-freeze-snapshots/disabled-workflows-*.txt" >&2
+  exit 2
+fi
+
+command -v gh >/dev/null || { echo "ERROR: gh CLI required" >&2; exit 1; }
+if ! gh auth status >/dev/null 2>&1; then
+  echo "ERROR: gh not authenticated. Run 'gh auth login' first." >&2
+  exit 1
+fi
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+WORKFLOWS_FILE="${SCRIPT_DIR}/demo-freeze-snapshots/disabled-workflows-${TS}.txt"
+
+if [ ! -f "$WORKFLOWS_FILE" ]; then
+  echo "ERROR: receipt not found: $WORKFLOWS_FILE" >&2
+  echo "Available receipts:" >&2
+  ls "${SCRIPT_DIR}/demo-freeze-snapshots/" 2>/dev/null | grep '^disabled-workflows-' >&2 || echo "  (none)" >&2
+  exit 1
+fi
+
+if [ $DRY_RUN -eq 1 ]; then
+  echo "=== DRY RUN (no changes will be made) ==="
+else
+  echo "=== THAWING — re-enabling workflows ==="
+fi
+echo "Reading: $WORKFLOWS_FILE"
+echo
+
+PARTIAL_FAIL=0
+while IFS=': ' read -r repo workflow; do
+  [ -z "$repo" ] && continue
+  if [ $DRY_RUN -eq 1 ]; then
+    echo "  (dry-run) would enable: gh workflow enable $workflow -R $repo"
+  else
+    if gh workflow enable "$workflow" -R "$repo" 2>/tmp/thaw.err; then
+      echo "  OK   $repo/$workflow re-enabled"
+    else
+      echo "  FAIL $repo/$workflow: $(cat /tmp/thaw.err)" >&2
+      PARTIAL_FAIL=1
+    fi
+  fi
+done < "$WORKFLOWS_FILE"
+
+echo
+if [ $DRY_RUN -eq 1 ]; then
+  echo "=== DRY RUN COMPLETE ==="
+  echo "Re-run without --dry-run to apply."
+  exit 0
+fi
+
+echo "=== THAW COMPLETE ==="
+echo "Cascades restored. Next workspace/** push to molecule-core/staging will"
+echo "auto-publish the runtime wheel and fan out to template rebuilds as normal."
+if [ $PARTIAL_FAIL -ne 0 ]; then
+  echo
+  echo "WARNING: one or more workflows did not re-enable cleanly. Re-run or enable manually:" >&2
+  echo "  gh workflow list -R <repo>" >&2
+  exit 2
+fi
+exit 0
@@ -105,11 +105,43 @@ echo "==> Running infra/scripts/setup.sh (infra + template registry)"
 "$ROOT/infra/scripts/setup.sh"

 # ─────────────────────────────────────────────── 3. platform
+#
+# Two paths:
+#   (a) `go` is on PATH → run the platform directly via `go run`.
+#       Fast iteration, attaches to /tmp/molecule-platform.log.
+#   (b) `go` is NOT on PATH → fall back to the published platform
+#       container image. Slower first run (image pull) but the script
+#       still works on a fresh dev box without forcing the dev to
+#       install Go just to read logs.
+#
+# The earlier version of this script silently called `go run` and died
+# with `go: not found` on dev boxes where Go wasn't installed; the
+# script's own prerequisite list (line 13-21) said "Go 1.25+" but the
+# user had no signpost between "open the doc" and "command not found
+# at line 111." This branch makes the failure path either succeed
+# (fallback) or fail loud with explicit install guidance.

-echo "==> Starting Platform (Go :8080)"
-cd "$ROOT/workspace-server"
-go run ./cmd/server > /tmp/molecule-platform.log 2>&1 &
-PLATFORM_PID=$!
+if command -v go >/dev/null 2>&1; then
+    echo "==> Starting Platform (Go :8080)"
+    cd "$ROOT/workspace-server"
+    go run ./cmd/server > /tmp/molecule-platform.log 2>&1 &
+    PLATFORM_PID=$!
+else
+    echo "==> Go not found on PATH — falling back to docker-compose platform service"
+    echo "    (Install Go 1.25+ for faster iteration: https://go.dev/dl/)"
+    cd "$ROOT"
+    # Bring up just the platform service from docker-compose.yml. infra/setup.sh
+    # already brought up postgres+redis+etc on docker-compose.infra.yml; this
+    # adds the platform container on top, mapped to :8080 so the rest of this
+    # script's wait-for-/health loop works unchanged.
+    docker compose up -d --build platform > /tmp/molecule-platform.log 2>&1 || {
+        echo "    ✗ docker compose up platform failed — see /tmp/molecule-platform.log"
+        echo "    Either install Go 1.25+ (https://go.dev/dl/) and rerun, or fix the docker fallback."
+        exit 1
+    }
+    # PLATFORM_PID is unset on this path; cleanup() handles that with `kill ... 2>/dev/null || true`.
+    PLATFORM_PID=
+fi

 echo "    Waiting for Platform /health..."
 PLATFORM_READY=0
@@ -0,0 +1,271 @@
+#!/usr/bin/env bash
+# Standalone runner for Issue 4 reproduction (RFC #2251) — exists alongside
+# `measure-coordinator-task-bounds.sh` to support arbitrary template + secret
+# combinations without modifying the canonical harness. The canonical harness
+# stays focused on its v1 contract (claude-code-default + langgraph + OpenRouter);
+# this runner wraps the same workspace-server API calls but takes everything as
+# env-var inputs so a Hermes/MiniMax run can share the measurement code path.
+#
+# Two routing modes:
+#   MODE=local (default) — direct workspace-server API
+#   MODE=saas            — placeholder; populates same vars but expects
+#                          PLATFORM=<tenant-subdomain> with X-Tenant-Id +
+#                          Authorization headers from CP_ADMIN_API_TOKEN
+#
+# Required env:
+#   PLATFORM            workspace-server base URL (default http://localhost:8080)
+#   PM_TEMPLATE         template slug for coordinator
+#   CHILD_TEMPLATE      template slug for researcher child
+#   SECRET_NAME         workspace_secrets key (e.g. MINIMAX_API_KEY)
+#   SECRET_VALUE        the secret value (or read from $SECRET_NAME if unset)
+#
+# Optional:
+#   MODEL               PUT /workspaces/:id/model after provision
+#   SYNTHESIS_DEPTH=3   number of delegation rounds in the kickoff task
+#   A2A_TIMEOUT=600     ceiling on measurement-side wait (seconds)
+#   KEEP_WORKSPACES=0   skip cleanup-on-exit when 1 (for log inspection)
+#   MODE=local|saas     local-dev vs SaaS routing posture
+#   CP_ADMIN_API_TOKEN  required when MODE=saas; sent as Authorization bearer
+#   TENANT_ID           required when MODE=saas; sent as X-Tenant-Id
+#
+# Output: NDJSON event stream on stdout + a human summary on stderr.
+#
+set -euo pipefail
+
+PLATFORM="${PLATFORM:-http://localhost:8080}"
+MODE="${MODE:-local}"
+PM_TEMPLATE="${PM_TEMPLATE:?PM_TEMPLATE is required (e.g. claude-code-default, hermes)}"
+CHILD_TEMPLATE="${CHILD_TEMPLATE:?CHILD_TEMPLATE is required}"
+SECRET_NAME="${SECRET_NAME:?SECRET_NAME is required (e.g. MINIMAX_API_KEY)}"
+MODEL="${MODEL:-}"
+SYNTHESIS_DEPTH="${SYNTHESIS_DEPTH:-3}"
+A2A_TIMEOUT="${A2A_TIMEOUT:-600}"
+KEEP_WORKSPACES="${KEEP_WORKSPACES:-0}"
+
+# SaaS-mode auth chain: workspace-server (per-tenant Go binary on EC2)
+# requires BOTH headers:
+#   Authorization: Bearer <tenant-admin-token>      (per-tenant secret)
+#   X-Molecule-Org-Id:  <org-uuid>                  (TenantGuard middleware)
+# The tenant-admin-token is provisioned by controlplane and retrievable via:
+#   GET /cp/admin/orgs/<slug>/admin-token   (CP_ADMIN_API_TOKEN bearer-gated)
+# The runner can either:
+#   1. Take ORG_SLUG + CP_ADMIN_API_TOKEN and fetch the tenant token itself, or
+#   2. Take ORG_ID + TENANT_ADMIN_TOKEN directly.
+ORG_ID="${ORG_ID:-}"
+ORG_SLUG="${ORG_SLUG:-}"
+TENANT_ADMIN_TOKEN="${TENANT_ADMIN_TOKEN:-}"
+CP_ADMIN_API_TOKEN="${CP_ADMIN_API_TOKEN:-}"
+CP_API_URL="${CP_API_URL:-https://staging-api.moleculesai.app}"
+
+# Resolve secret value: ${SECRET_VALUE} > $${SECRET_NAME} > error.
+SECRET_VALUE="${SECRET_VALUE:-}"
+if [ -z "$SECRET_VALUE" ]; then
+  SECRET_VALUE="$(printenv "$SECRET_NAME" 2>/dev/null || true)"
+fi
+[ -n "$SECRET_VALUE" ] || { echo "ERROR: set \$$SECRET_NAME or \$SECRET_VALUE" >&2; exit 1; }
+
+# SaaS-mode preflight + format validation.
+# Validating ORG_ID + ORG_SLUG client-side gives an actionable error
+# before the request hits TenantGuard's intentionally-opaque 404
+# (which doesn't tell the operator whether the slug is wrong, the
+# UUID is wrong, or auth is wrong).
+if [ "$MODE" = "saas" ]; then
+  [ -n "$ORG_ID" ] || { echo "ERROR: MODE=saas requires ORG_ID (the org UUID)" >&2; exit 1; }
+  case "$ORG_ID" in
+    [0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f]-[0-9a-f][0-9a-f][0-9a-f][0-9a-f]-[0-9a-f][0-9a-f][0-9a-f][0-9a-f]-[0-9a-f][0-9a-f][0-9a-f][0-9a-f]-[0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f]) ;;
+    *) echo "ERROR: ORG_ID must be a UUID (got '$ORG_ID')" >&2; exit 1;;
+  esac
+  if [ -n "$ORG_SLUG" ]; then
+    case "$ORG_SLUG" in
+      *[!a-z0-9-]* | -* | *-) echo "ERROR: ORG_SLUG must match ^[a-z0-9][a-z0-9-]*[a-z0-9]\$ (got '$ORG_SLUG')" >&2; exit 1;;
+    esac
+  fi
+  if [ -z "$TENANT_ADMIN_TOKEN" ]; then
+    [ -n "$ORG_SLUG" ]          || { echo "ERROR: MODE=saas needs TENANT_ADMIN_TOKEN or ORG_SLUG (to fetch it via CP)" >&2; exit 1; }
+    [ -n "$CP_ADMIN_API_TOKEN" ] || { echo "ERROR: ORG_SLUG path needs CP_ADMIN_API_TOKEN to fetch tenant token from $CP_API_URL" >&2; exit 1; }
+    TENANT_ADMIN_TOKEN=$(curl -s -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
+      "$CP_API_URL/cp/admin/orgs/$ORG_SLUG/admin-token" \
+      | python3 -c "import sys,json; print(json.load(sys.stdin).get('admin_token',''))" 2>/dev/null || echo "")
+    [ -n "$TENANT_ADMIN_TOKEN" ] || { echo "ERROR: failed to resolve tenant admin token via $CP_API_URL/cp/admin/orgs/$ORG_SLUG/admin-token" >&2; exit 1; }
+  fi
+fi
+
+ts() { date -u +%Y-%m-%dT%H:%M:%S.%3NZ 2>/dev/null || date -u +%Y-%m-%dT%H:%M:%SZ; }
+emit() { printf '{"ts":"%s","event":"%s","data":%s}\n' "$(ts)" "$1" "${2:-null}"; }
+
+api() {
+  local args=()
+  if [ "$MODE" = "saas" ]; then
+    args+=(-H "Authorization: Bearer $TENANT_ADMIN_TOKEN")
+    args+=(-H "X-Molecule-Org-Id: $ORG_ID")
+  fi
+  curl -s ${args[@]+"${args[@]}"} "$@"
+}
+
+PM_ID=""
+CHILD_ID=""
+cleanup() {
+  local rc=$?
+  set +e
+  if [ "$KEEP_WORKSPACES" = "1" ]; then
+    emit "cleanup_skipped" "{\"reason\":\"KEEP_WORKSPACES=1\",\"pm_id\":\"$PM_ID\",\"child_id\":\"$CHILD_ID\"}"
+    return $rc
+  fi
+  for id in "$CHILD_ID" "$PM_ID"; do
+    [ -z "$id" ] && continue
+    code=$(api -o /dev/null -w '%{http_code}' -X DELETE "$PLATFORM/workspaces/$id" 2>/dev/null || echo "curl_err")
+    if [ "$code" = "200" ] || [ "$code" = "204" ] || [ "$code" = "404" ]; then
+      emit "cleanup_deleted" "{\"workspace_id\":\"$id\",\"http_code\":\"$code\"}"
+    else
+      emit "cleanup_failed" "{\"workspace_id\":\"$id\",\"http_code\":\"$code\"}"
+    fi
+  done
+  return $rc
+}
+trap cleanup EXIT INT TERM
+
+emit "run_started" "{\"platform\":\"$PLATFORM\",\"mode\":\"$MODE\",\"pm_template\":\"$PM_TEMPLATE\",\"child_template\":\"$CHILD_TEMPLATE\",\"model\":\"$MODEL\",\"secret_name\":\"$SECRET_NAME\",\"synthesis_depth\":$SYNTHESIS_DEPTH,\"a2a_timeout_secs\":$A2A_TIMEOUT}"
+
+# ---- Provision via JSON-encoded bodies (defends against templates/values
+# with embedded shell-special chars). ----
+pm_body=$(python3 -c '
+import json, sys
+print(json.dumps({"name":"PM","role":"Coordinator — delegates and synthesizes","tier":2,"template":sys.argv[1]}))' "$PM_TEMPLATE")
+child_body=$(python3 -c '
+import json, sys
+print(json.dumps({"name":"Researcher","role":"Returns short research findings","tier":2,"template":sys.argv[1]}))' "$CHILD_TEMPLATE")
+secret_body=$(python3 -c '
+import json, sys
+print(json.dumps({"key":sys.argv[1],"value":sys.argv[2]}))' "$SECRET_NAME" "$SECRET_VALUE")
+
+emit "provisioning_pm" "{\"template\":\"$PM_TEMPLATE\"}"
+R=$(api -X POST "$PLATFORM/workspaces" -H 'Content-Type: application/json' -d "$pm_body")
+PM_ID=$(echo "$R" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "")
+[ -n "$PM_ID" ] || { echo "ERROR: PM create failed — response: $R" >&2; exit 1; }
+emit "pm_provisioned" "{\"workspace_id\":\"$PM_ID\"}"
+
+emit "provisioning_child" "{\"template\":\"$CHILD_TEMPLATE\"}"
+R=$(api -X POST "$PLATFORM/workspaces" -H 'Content-Type: application/json' -d "$child_body")
+CHILD_ID=$(echo "$R" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null || echo "")
+[ -n "$CHILD_ID" ] || { echo "ERROR: child create failed — response: $R" >&2; exit 1; }
+emit "child_provisioned" "{\"workspace_id\":\"$CHILD_ID\"}"
+
+api -X PATCH "$PLATFORM/workspaces/$CHILD_ID" -H 'Content-Type: application/json' \
+  -d "{\"parent_id\":\"$PM_ID\"}" > /dev/null
+
+# Seed secret on BOTH workspaces. Hermes/MiniMax both sides need it; templates
+# that ignore unknown env vars treat extras as no-op.
+for id in "$PM_ID" "$CHILD_ID"; do
+  api -X POST "$PLATFORM/workspaces/$id/secrets" -H 'Content-Type: application/json' -d "$secret_body" > /dev/null
+done
+emit "secrets_seeded" "{\"key\":\"$SECRET_NAME\",\"workspaces\":[\"$PM_ID\",\"$CHILD_ID\"]}"
+
+if [ -n "$MODEL" ]; then
+  model_body=$(python3 -c 'import json,sys; print(json.dumps({"model":sys.argv[1]}))' "$MODEL")
+  for id in "$PM_ID" "$CHILD_ID"; do
+    api -X PUT "$PLATFORM/workspaces/$id/model" -H 'Content-Type: application/json' -d "$model_body" > /dev/null
+  done
+  emit "model_set" "{\"model\":\"$MODEL\",\"workspaces\":[\"$PM_ID\",\"$CHILD_ID\"]}"
+fi
+
+# ---- Wait for both online ----
+WAIT_ONLINE_SECS="${WAIT_ONLINE_SECS:-180}"
+wait_online() {
+  local id="$1" label="$2"
+  # Round up so a non-multiple-of-3 budget waits at least the requested
+  # seconds (200 → 67 polls × 3s = 201s, not 198s).
+  local polls=$(( (WAIT_ONLINE_SECS + 2) / 3 ))
+  local last_status=""
+  for i in $(seq 1 "$polls"); do
+    s=$(api "$PLATFORM/workspaces/$id" | python3 -c "import sys,json; print(json.load(sys.stdin).get('status',''))" 2>/dev/null || echo "")
+    if [ "$s" != "$last_status" ]; then
+      emit "status_change" "{\"workspace\":\"$label\",\"from\":\"$last_status\",\"to\":\"$s\",\"poll\":$i}"
+      last_status="$s"
+    fi
+    [ "$s" = "online" ] && { emit "online" "{\"workspace\":\"$label\",\"after_polls\":$i,\"after_secs\":$((i * 3))}"; return 0; }
+    [ "$s" = "failed" ] && { emit "failed" "{\"workspace\":\"$label\"}"; return 1; }
+    sleep 3
+  done
+  emit "online_timeout" "{\"workspace\":\"$label\",\"last_status\":\"$last_status\",\"waited_secs\":$WAIT_ONLINE_SECS}"
+  return 1
+}
+wait_online "$PM_ID"    "PM"    || exit 2
+wait_online "$CHILD_ID" "child" || exit 2
+
+# ---- Build a synthesis-heavy kickoff task ----
+TASK="You are coordinating a research analysis. Delegate $SYNTHESIS_DEPTH separate sub-questions to the Researcher (one at a time, sequentially — wait for each response before sending the next), then synthesize all findings into a single coherent report. Sub-questions: (a) historical context of distributed consensus, (b) modern Byzantine-fault-tolerant protocols, (c) practical trade-offs between Raft and Paxos. After all delegations complete, write a 600-word synthesis comparing the three responses and drawing one cross-cutting insight. Do not respond until the synthesis is complete."
+
+# ---- A2A kickoff round-trip ----
+emit "a2a_kickoff_sent" "{\"to\":\"$PM_ID\",\"task_chars\":${#TASK}}"
+START_NS=$(python3 -c 'import time; print(int(time.time_ns()))')
+
+a2a_body=$(python3 -c '
+import json, sys
+print(json.dumps({"method":"message/send","params":{"message":{"role":"user","parts":[{"type":"text","text":sys.argv[1]}]}}}))' "$TASK")
+
+RESP=$(api --max-time "$A2A_TIMEOUT" -X POST "$PLATFORM/workspaces/$PM_ID/a2a" \
+  -H "Content-Type: application/json" -d "$a2a_body" || echo "<curl_failed_or_timed_out>")
+
+END_NS=$(python3 -c 'import time; print(int(time.time_ns()))')
+ELAPSED_SECS=$(python3 -c "print(round(($END_NS - $START_NS) / 1e9, 2))")
+
+emit "a2a_response_observed" "{\"elapsed_secs\":$ELAPSED_SECS,\"response_chars\":${#RESP},\"response_head\":$(python3 -c "import json,sys; print(json.dumps(sys.argv[1][:200]))" "$RESP")}"
+
+# ---- Activity trace ----
+# Earlier versions of this runner called /workspaces/:id/heartbeat-history,
+# which doesn't exist on workspace-server. On local dev that returned 404,
+# on tenant builds the platform's canvas-proxy fallback intercepted it and
+# returned 28KB of Next.js HTML — neither of which is useful trace data.
+# /workspaces/:id/activity is the existing endpoint that reads the
+# activity_logs table (a2a_send / a2a_receive / task_update / agent_log /
+# error events with duration_ms + status). That's the data the RFC's
+# §V1.0 step 6 'platform-side transition' check actually needs.
+emit "fetching_activity_trace" "{\"mode\":\"$MODE\"}"
+ACTIVITY=$(api "$PLATFORM/workspaces/$PM_ID/activity?since_secs=$A2A_TIMEOUT" 2>&1 || echo "<endpoint_unavailable>")
+emit "activity_trace" "{\"raw\":$(python3 -c "import json,sys; print(json.dumps(sys.argv[1]))" "$ACTIVITY")}"
+
+# ---- rfc2251_phase log lines from the workspace container ----
+# Local Docker provisioner: workspace container name is workspace-<id>.
+# SaaS: container is on EC2 — skip log capture, fall back to heartbeat only.
+if [ "$MODE" = "local" ] && command -v docker >/dev/null 2>&1; then
+  for id in "$PM_ID"; do
+    container=$(docker ps --filter "name=workspace-$id" --format '{{.Names}}' | head -1)
+    if [ -n "$container" ]; then
+      phase_log=$(docker logs --since "${A2A_TIMEOUT}s" "$container" 2>&1 | grep 'rfc2251_phase=' || echo "<no rfc2251_phase log lines — container running stale image without #2255 instrumentation>")
+      emit "phase_log" "{\"workspace_id\":\"$id\",\"container\":\"$container\",\"raw\":$(python3 -c "import json,sys; print(json.dumps(sys.argv[1]))" "$phase_log")}"
+    fi
+  done
+fi
+
+emit "run_completed" "{\"elapsed_secs\":$ELAPSED_SECS,\"pm_id\":\"$PM_ID\",\"child_id\":\"$CHILD_ID\"}"
+
+cat <<EOF >&2
+
+=========================================
+  Measurement complete. (RFC #2251 / Issue 4 repro)
+  Mode:                  $MODE
+  Coordinator template:  $PM_TEMPLATE
+  Child template:        $CHILD_TEMPLATE
+  Model:                 ${MODEL:-<template default>}
+  Coordinator response:  ${ELAPSED_SECS}s
+  PM workspace:          $PM_ID
+  Child workspace:       $CHILD_ID
+=========================================
+
+Interpretation:
+
+  ELAPSED < 60   → Synthesis fast; not informative about platform bounds.
+                   Re-run with SYNTHESIS_DEPTH=8 for longer synthesis.
+
+  60 <= ELAPSED < 300 → Within DELEGATION_TIMEOUT. Doesn't prove or refute
+                   Issue 4 — HTTP-level timeout would be sufficient.
+
+  ELAPSED >= 300 → BUG CONFIRMED IF activity_trace shows no platform-side
+                   transition. Coordinator ran past DELEGATION_TIMEOUT without
+                   any platform ceiling kicking in — exactly the gap V1.0
+                   plans to close with MAX_TASK_EXECUTION_SECS.
+
+  curl_failed_or_timed_out → \$A2A_TIMEOUT exceeded. Coordinator likely hung
+                   or synthesis is just very slow.
+
+EOF
@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+# Check whether production tenants and canvas are running latest main.
+#
+# Usage:
+#   ./scripts/ops/check-prod-versions.sh                # production
+#   ENV=staging ./scripts/ops/check-prod-versions.sh    # staging tenants
+#
+# Outputs a table of {surface, current_sha, expected_sha, status}. Returns
+# non-zero if any surface is stale so this can be wired into a periodic
+# alert.
+#
+# Why this exists: every time someone hits a "is the fix live?" question,
+# they have to remember the curl pattern + cross-reference with
+# `git rev-parse origin/main`. This script does that check uniformly across
+# every public surface (workspace tenants + canvas) and gives a one-line
+# verdict instead of a stack of one-off curls.
+
+set -euo pipefail
+
+ENV="${ENV:-production}"
+EXPECTED_REF="${EXPECTED_REF:-main}"
+
+case "$ENV" in
+    production)
+        TENANT_DOMAIN="moleculesai.app"
+        CANVAS_URL="https://canvas.moleculesai.app"
+        # Default canary tenant for production. Override via TENANT_SLUGS=
+        # to cover a custom set.
+        DEFAULT_TENANTS="hongmingwang reno-stars"
+        ;;
+    staging)
+        TENANT_DOMAIN="staging.moleculesai.app"
+        CANVAS_URL="https://canvas-staging.moleculesai.app"
+        DEFAULT_TENANTS=""  # staging tenants are ephemeral; user must specify
+        ;;
+    *)
+        echo "Unknown ENV=$ENV (expected: production | staging)" >&2
+        exit 2
+        ;;
+esac
+
+TENANT_SLUGS="${TENANT_SLUGS:-$DEFAULT_TENANTS}"
+
+# Pull EXPECTED_SHA from GitHub. Falls back to local git if gh isn't
+# logged in — local main may lag origin but is usually close enough for
+# debugging, and we still report the comparison clearly.
+EXPECTED_SHA=""
+if command -v gh >/dev/null 2>&1; then
+    EXPECTED_SHA=$(gh api "repos/Molecule-AI/molecule-core/commits/${EXPECTED_REF}" --jq '.sha' 2>/dev/null || true)
+fi
+if [ -z "$EXPECTED_SHA" ]; then
+    if git rev-parse "origin/${EXPECTED_REF}" >/dev/null 2>&1; then
+        EXPECTED_SHA=$(git rev-parse "origin/${EXPECTED_REF}")
+        echo "[check-prod-versions] WARN: gh unavailable, using local origin/${EXPECTED_REF}=${EXPECTED_SHA:0:7} (may lag)"
+    else
+        echo "[check-prod-versions] ERROR: cannot resolve expected SHA — gh not logged in and origin/${EXPECTED_REF} not fetched" >&2
+        exit 2
+    fi
+fi
+EXPECTED_SHORT="${EXPECTED_SHA:0:7}"
+
+echo "Checking ${ENV} surfaces against ${EXPECTED_REF}=${EXPECTED_SHORT}"
+echo ""
+printf "%-25s  %-9s  %-9s  %s\n" "Surface" "Live" "Expected" "Status"
+printf "%-25s  %-9s  %-9s  %s\n" "-------" "----" "--------" "------"
+
+STALE_COUNT=0
+UNREACHABLE_COUNT=0
+
+# Tenant surfaces — workspace-server /buildinfo (added in PR #2398).
+for slug in $TENANT_SLUGS; do
+    URL="https://${slug}.${TENANT_DOMAIN}/buildinfo"
+    BODY=$(curl -sS --max-time 15 "$URL" 2>/dev/null || echo "")
+    ACTUAL_SHA=$(echo "$BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
+    if [ -z "$ACTUAL_SHA" ]; then
+        printf "%-25s  %-9s  %-9s  ⚠ unreachable\n" "tenant: $slug" "—" "$EXPECTED_SHORT"
+        UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
+    elif [ "$ACTUAL_SHA" = "$EXPECTED_SHA" ]; then
+        printf "%-25s  %-9s  %-9s  ✓ current\n" "tenant: $slug" "${ACTUAL_SHA:0:7}" "$EXPECTED_SHORT"
+    else
+        printf "%-25s  %-9s  %-9s  ✗ stale\n" "tenant: $slug" "${ACTUAL_SHA:0:7}" "$EXPECTED_SHORT"
+        STALE_COUNT=$((STALE_COUNT + 1))
+    fi
+done
+
+# Canvas — Next.js /api/buildinfo (PR #2407). Vercel injects
+# VERCEL_GIT_COMMIT_SHA at build time so this reflects the deployed
+# commit, not the request time.
+CANVAS_BODY=$(curl -sS --max-time 15 "${CANVAS_URL}/api/buildinfo" 2>/dev/null || echo "")
+CANVAS_SHA=$(echo "$CANVAS_BODY" | jq -r '.git_sha // ""' 2>/dev/null || echo "")
+if [ -z "$CANVAS_SHA" ]; then
+    printf "%-25s  %-9s  %-9s  ⚠ unreachable (route may not be deployed yet)\n" "canvas" "—" "$EXPECTED_SHORT"
+    UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
+elif [ "$CANVAS_SHA" = "dev" ]; then
+    printf "%-25s  %-9s  %-9s  ⚠ dev sentinel (Vercel env not injected — check VERCEL_GIT_COMMIT_SHA)\n" "canvas" "dev" "$EXPECTED_SHORT"
+    UNREACHABLE_COUNT=$((UNREACHABLE_COUNT + 1))
+elif [ "$CANVAS_SHA" = "$EXPECTED_SHA" ]; then
+    printf "%-25s  %-9s  %-9s  ✓ current\n" "canvas" "${CANVAS_SHA:0:7}" "$EXPECTED_SHORT"
+else
+    printf "%-25s  %-9s  %-9s  ✗ stale\n" "canvas" "${CANVAS_SHA:0:7}" "$EXPECTED_SHORT"
+    STALE_COUNT=$((STALE_COUNT + 1))
+fi
+
+echo ""
+if [ $STALE_COUNT -eq 0 ] && [ $UNREACHABLE_COUNT -eq 0 ]; then
+    echo "All surfaces current."
+    exit 0
+fi
+echo "Summary: ${STALE_COUNT} stale, ${UNREACHABLE_COUNT} unreachable."
+# Stale is a deploy gap; unreachable is operational (DNS, CF, route absent).
+# Both are signal — exit non-zero so cron / CI can alert.
+exit 1
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+"""check_migration_collisions.py — fail-loud detector for two open PRs adding
+the same migration version number.
+
+Why this exists: two PRs targeting staging can each add a migration with the
+same numeric prefix (e.g. 044_*.up.sql). Each passes CI independently. They
+collide at merge time. Worst-case the second migration silently doesn't apply
+and the schema drifts from what the code expects. Caught manually 2026-04-30
+during PR #2276 rebase: 044_runtime_image_pins collided with
+044_platform_inbound_secret from RFC #2312.
+
+This check runs on every PR and asserts the migration prefixes added by THIS
+PR don't collide with:
+
+    1. The base branch's tip (someone else already used this number)
+    2. Any other open PR (race-window collision — both pass CI independently)
+
+Exit codes:
+    0  — no collisions
+    1  — collision detected; output names the conflicting PR(s) for the author
+
+Designed to run from a GitHub Actions PR check. Reads PR metadata via the
+GitHub CLI (gh) which is preinstalled on ubuntu-latest runners. Runs in
+under 10s against a typical PR.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+MIGRATIONS_DIR = "workspace-server/migrations"
+MIGRATION_FILE_RE = re.compile(r"^(\d+)_[^/]+\.(up|down)\.sql$")
+
+
+def run(cmd: list[str], check: bool = True) -> str:
+    """Run a subprocess and return stdout. Raise on non-zero when check=True."""
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if check and result.returncode != 0:
+        sys.stderr.write(f"command failed: {' '.join(cmd)}\n{result.stderr}\n")
+        sys.exit(1)
+    return result.stdout
+
+
+def migrations_in_diff(base_ref: str, head_ref: str) -> set[int]:
+    """Return the set of migration prefixes added or modified between two refs.
+
+    Uses --diff-filter=AM (Added or Modified) so a deleted migration doesn't
+    count. Renames (--diff-filter=R) appear as A on the new path and D on the
+    old, so we'd catch a renumbering correctly.
+    """
+    out = run([
+        "git", "diff", "--name-only", "--diff-filter=AM",
+        f"{base_ref}...{head_ref}", "--", MIGRATIONS_DIR,
+    ])
+    prefixes: set[int] = set()
+    for line in out.splitlines():
+        path = Path(line.strip())
+        if not path.name:
+            continue
+        m = MIGRATION_FILE_RE.match(path.name)
+        if not m:
+            # Files like the workflow_checkpoints.up.sql with non-numeric
+            # prefix are intentional — skip without complaint.
+            continue
+        prefixes.add(int(m.group(1)))
+    return prefixes
+
+
+def migrations_on_ref(ref: str) -> set[int]:
+    """Return the set of numeric migration prefixes existing at the given git ref.
+
+    Walks the migrations dir at that ref via `git ls-tree`, not the working
+    tree, so it works against any branch / SHA without checking it out.
+    """
+    out = run([
+        "git", "ls-tree", "-r", "--name-only", ref, "--", MIGRATIONS_DIR,
+    ])
+    prefixes: set[int] = set()
+    for line in out.splitlines():
+        path = Path(line.strip())
+        if not path.name:
+            continue
+        m = MIGRATION_FILE_RE.match(path.name)
+        if not m:
+            continue
+        prefixes.add(int(m.group(1)))
+    return prefixes
+
+
+def open_prs_with_migration_prefix(
+    repo: str, prefix: int, exclude_pr: int
+) -> list[dict]:
+    """Return open PRs (other than `exclude_pr`) that add a migration with
+    `prefix`. Uses `gh pr diff` per PR — we only need to walk PRs that are
+    actually in flight, so the cost is bounded by open-PR count.
+    """
+    out = run([
+        "gh", "pr", "list", "--repo", repo, "--state", "open",
+        "--json", "number,headRefName", "--limit", "100",
+    ])
+    prs = json.loads(out)
+    matches: list[dict] = []
+    for pr in prs:
+        num = pr["number"]
+        if num == exclude_pr:
+            continue
+        try:
+            files = run([
+                "gh", "pr", "diff", str(num), "--repo", repo, "--name-only",
+            ], check=False)
+        except Exception:  # noqa: BLE001
+            continue
+        for raw in files.splitlines():
+            path = Path(raw.strip())
+            if not path.name:
+                continue
+            m = MIGRATION_FILE_RE.match(path.name)
+            if m and int(m.group(1)) == prefix:
+                matches.append(pr)
+                break
+    return matches
+
+
+def main() -> int:
+    pr_number_env = os.environ.get("PR_NUMBER", "").strip()
+    if not pr_number_env:
+        sys.stderr.write(
+            "PR_NUMBER not set — this script is intended to run from a PR "
+            "context. Set PR_NUMBER (e.g. ${{ github.event.pull_request.number }}) "
+            "and BASE_REF (target branch) and HEAD_REF (PR head SHA).\n"
+        )
+        return 1
+    pr_number = int(pr_number_env)
+    base_ref = os.environ.get("BASE_REF", "origin/staging")
+    head_ref = os.environ.get("HEAD_REF", "HEAD")
+    repo = os.environ.get("GITHUB_REPOSITORY", "Molecule-AI/molecule-core")
+
+    added = migrations_in_diff(base_ref, head_ref)
+    if not added:
+        print("no migrations added or modified by this PR — nothing to check")
+        return 0
+
+    print(f"this PR adds/modifies migrations: {sorted(added)}")
+
+    # Collision check 1: base branch already has this prefix on a different
+    # filename. This happens when the PR was branched off an old base and
+    # didn't rebase — base advanced and another PR landed the same number.
+    base_prefixes = migrations_on_ref(base_ref)
+    base_collisions = added & base_prefixes
+    # Filter to "different filename, same prefix" — same filename means the
+    # PR is updating an existing migration in place, which is fine.
+    real_base_collisions: set[int] = set()
+    for prefix in base_collisions:
+        # List filenames at base for this prefix
+        out = run([
+            "git", "ls-tree", "-r", "--name-only", base_ref, "--",
+            MIGRATIONS_DIR,
+        ])
+        base_names = {
+            Path(line).name for line in out.splitlines()
+            if (m := MIGRATION_FILE_RE.match(Path(line).name)) and int(m.group(1)) == prefix
+        }
+        # And in the PR
+        diff_out = run([
+            "git", "diff", "--name-only", "--diff-filter=AM",
+            f"{base_ref}...{head_ref}", "--", MIGRATIONS_DIR,
+        ])
+        pr_names = {
+            Path(line).name for line in diff_out.splitlines()
+            if (m := MIGRATION_FILE_RE.match(Path(line).name)) and int(m.group(1)) == prefix
+        }
+        if pr_names - base_names:
+            real_base_collisions.add(prefix)
+
+    # Collision check 2: another open PR claims the same prefix.
+    open_pr_collisions: dict[int, list[dict]] = {}
+    for prefix in added:
+        peers = open_prs_with_migration_prefix(repo, prefix, pr_number)
+        if peers:
+            open_pr_collisions[prefix] = peers
+
+    if not real_base_collisions and not open_pr_collisions:
+        print("no migration version collisions detected")
+        return 0
+
+    print()
+    print("::error::migration version collision detected")
+    if real_base_collisions:
+        print(f"::error::these prefixes already exist on {base_ref} with different filenames: "
+              f"{sorted(real_base_collisions)}")
+        print("::error::rebase onto current base and renumber to the next available prefix")
+    for prefix, peers in sorted(open_pr_collisions.items()):
+        peer_str = ", ".join(f"#{p['number']} ({p['headRefName']})" for p in peers)
+        print(f"::error::migration prefix {prefix:03d} also claimed by open PR(s): {peer_str}")
+        print(f"::error::rebase coordination needed — only one PR can land a given prefix; "
+              f"renumber yours or theirs")
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,257 @@
+#!/usr/bin/env bash
+# sweep-cf-tunnels.sh — safe, targeted sweep of Cloudflare Tunnels
+# whose corresponding tenant no longer exists.
+#
+# Why this exists: CP's tenant-delete cascade removes the DNS record
+# (caught by sweep-cf-orphans.sh as a backstop) but does NOT delete
+# the underlying Cloudflare Tunnel. Each E2E provision creates one
+# Tunnel named `tenant-<slug>`; without cleanup these accumulate
+# indefinitely on the account, consuming the account's tunnel quota
+# and cluttering the Cloudflare dashboard.
+#
+# Observed 2026-04-30: dozens of `tenant-e2e-canvas-*` tunnels in
+# Down state with zero replicas, weeks past their tenant's deletion.
+#
+# This script is a parallel-shape janitor to sweep-cf-orphans.sh:
+#   1. Query CP admin API to enumerate live org slugs (prod + staging)
+#   2. Enumerate Cloudflare Tunnels via the account-scoped API
+#   3. For each tunnel matching `tenant-<slug>`, check if <slug>
+#      appears in the live set
+#   4. Skip tunnels with active connections (defense-in-depth — never
+#      delete a healthy tunnel even if CP claims the org is gone)
+#   5. Only delete tunnels with NO live counterpart AND NO active
+#      connections
+#
+# Dry-run by default; must pass --execute to actually delete.
+#
+# Env vars required:
+#   CF_API_TOKEN        — Cloudflare token with
+#                          account:cloudflare_tunnel:edit scope.
+#                          (Same secret as sweep-cf-orphans, but the
+#                          token must include the tunnel scope.)
+#   CF_ACCOUNT_ID       — the account that owns the tunnels (visible
+#                          in dash.cloudflare.com URL path)
+#   CP_PROD_ADMIN_TOKEN — CP admin bearer for api.moleculesai.app
+#   CP_STAGING_ADMIN_TOKEN — CP admin bearer for staging-api.moleculesai.app
+#
+# Exit codes:
+#   0  — dry-run completed or sweep executed successfully
+#   1  — missing required env, API failure, or unexpected state
+#   2  — safety check failed (would delete >MAX_DELETE_PCT% of
+#         tenant-shaped tunnels; refusing)
+
+set -euo pipefail
+
+DRY_RUN=1
+# Tenant tunnels are short-lived by design — most of them at any
+# given moment are orphans from finished E2E runs. The default is
+# tuned higher than sweep-cf-orphans (50%) to reflect that the
+# steady-state for tenant-* tunnels is mostly-orphan, not mostly-live.
+MAX_DELETE_PCT="${MAX_DELETE_PCT:-90}"
+
+for arg in "$@"; do
+  case "$arg" in
+    --execute|--no-dry-run) DRY_RUN=0 ;;
+    --help|-h)
+      grep '^#' "$0" | head -45 | sed 's/^# \{0,1\}//'
+      exit 0
+      ;;
+    *)
+      echo "unknown arg: $arg (use --help)" >&2
+      exit 1
+      ;;
+  esac
+done
+
+need() {
+  local var="$1"
+  if [ -z "${!var:-}" ]; then
+    echo "ERROR: $var is required" >&2
+    exit 1
+  fi
+}
+need CF_API_TOKEN
+need CF_ACCOUNT_ID
+need CP_PROD_ADMIN_TOKEN
+need CP_STAGING_ADMIN_TOKEN
+
+log() { echo "[$(date -u +%H:%M:%S)] $*"; }
+
+# --- Gather live sets ------------------------------------------------------
+
+log "Fetching CP prod org slugs..."
+PROD_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_PROD_ADMIN_TOKEN" \
+  "https://api.moleculesai.app/cp/admin/orgs?limit=500" \
+  | python3 -c "import json,sys; print(' '.join(o['slug'] for o in json.load(sys.stdin).get('orgs',[])))")
+log "  prod orgs: $(echo "$PROD_SLUGS" | wc -w | tr -d ' ')"
+
+log "Fetching CP staging org slugs..."
+STAGING_SLUGS=$(curl -sS -m 15 -H "Authorization: Bearer $CP_STAGING_ADMIN_TOKEN" \
+  "https://staging-api.moleculesai.app/cp/admin/orgs?limit=500" \
+  | python3 -c "import json,sys; print(' '.join(o['slug'] for o in json.load(sys.stdin).get('orgs',[])))")
+log "  staging orgs: $(echo "$STAGING_SLUGS" | wc -w | tr -d ' ')"
+
+log "Fetching Cloudflare tunnels..."
+# The cfd_tunnel list endpoint is paginated; per_page max is 50.
+# Walk all pages so we don't silently miss orphans on busy accounts.
+PAGE=1
+TUNNEL_JSON='{"result":[]}'
+while :; do
+  page_json=$(curl -sS -m 15 -H "Authorization: Bearer $CF_API_TOKEN" \
+    "https://api.cloudflare.com/client/v4/accounts/$CF_ACCOUNT_ID/cfd_tunnel?per_page=50&page=$PAGE&is_deleted=false")
+  page_count=$(echo "$page_json" | python3 -c "import json,sys; print(len(json.load(sys.stdin).get('result') or []))")
+  if [ "$page_count" = "0" ]; then break; fi
+  # Merge pages
+  TUNNEL_JSON=$(python3 -c "
+import json, sys
+acc = json.loads(sys.argv[1])
+new = json.loads(sys.argv[2])
+acc['result'].extend(new.get('result') or [])
+print(json.dumps(acc))
+" "$TUNNEL_JSON" "$page_json")
+  PAGE=$((PAGE + 1))
+  if [ "$PAGE" -gt 20 ]; then
+    log "::warning::stopping pagination at page 20 (1000 tunnels) — re-run if more"
+    break
+  fi
+done
+TOTAL_TUNNELS=$(echo "$TUNNEL_JSON" | python3 -c "import json,sys; print(len(json.load(sys.stdin)['result']))")
+log "  total tunnels: $TOTAL_TUNNELS"
+
+# --- Compute orphans -------------------------------------------------------
+#
+# Rules (in order):
+#   1. Name doesn't match `tenant-<slug>` → keep (unknown — never sweep
+#      arbitrary tunnels that might belong to platform infra).
+#   2. Tunnel has active connections (status=healthy or non-empty
+#      connections array) → keep (defense-in-depth: don't kill a live
+#      tunnel even if CP forgot the org).
+#   3. Slug ∈ {prod_slugs ∪ staging_slugs} → keep (live tenant).
+#   4. Otherwise → delete (orphan).
+
+export PROD_SLUGS STAGING_SLUGS
+DECISIONS=$(echo "$TUNNEL_JSON" | python3 -c '
+import json, os, re, sys
+
+prod_slugs = set(os.environ["PROD_SLUGS"].split())
+staging_slugs = set(os.environ["STAGING_SLUGS"].split())
+all_slugs = prod_slugs | staging_slugs
+
+_TENANT_RE = re.compile(r"^tenant-(.+)$")
+
+def decide(t, all_slugs):
+    name = t.get("name", "")
+    tid = t.get("id", "")
+    status = t.get("status", "")
+    conns = t.get("connections") or []
+
+    m = _TENANT_RE.match(name)
+    if not m:
+        return ("keep", "not-a-tenant-tunnel", tid, name, status)
+
+    slug = m.group(1)
+
+    # Defense-in-depth: never delete a tunnel with live connectors.
+    # The CF tunnel "status" field is one of inactive/degraded/healthy/down.
+    # "down" with empty connections is the orphan state we sweep.
+    if status == "healthy" or len(conns) > 0:
+        return ("keep", "active-connections", tid, name, status)
+
+    if slug in all_slugs:
+        return ("keep", "live-tenant", tid, name, status)
+
+    return ("delete", "orphan-tenant", tid, name, status)
+
+d = json.loads(sys.stdin.read())
+for t in d.get("result", []):
+    action, reason, tid, name, status = decide(t, all_slugs)
+    print(json.dumps({"action": action, "reason": reason, "id": tid, "name": name, "status": status}))
+')
+
+# --- Summarize + safety gate ----------------------------------------------
+
+DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
+KEEP_COUNT=$((TOTAL_TUNNELS - DELETE_COUNT))
+TENANT_TUNNELS=$(echo "$DECISIONS" | python3 -c "
+import json, sys
+n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-tunnel')
+print(n)
+")
+
+log ""
+log "== Sweep plan =="
+log "  total tunnels:          $TOTAL_TUNNELS"
+log "  tenant-shaped tunnels:  $TENANT_TUNNELS"
+log "  would delete:           $DELETE_COUNT"
+log "  would keep:             $KEEP_COUNT"
+log ""
+
+# Per-reason breakdown of deletes
+echo "$DECISIONS" | python3 -c "
+import json,sys,collections
+c = collections.Counter()
+for l in sys.stdin:
+    d = json.loads(l)
+    if d['action'] == 'delete':
+        c[d['reason']] += 1
+for reason, n in c.most_common():
+    print(f'  delete/{reason}: {n}')
+"
+
+# Safety gate operates against the tenant-shaped subset (the reasonable
+# "all of these could conceivably be ours" denominator), not the total.
+# A miscount of platform-infra tunnels shouldn't relax the gate.
+if [ "$TENANT_TUNNELS" -gt 0 ]; then
+  PCT=$(( DELETE_COUNT * 100 / TENANT_TUNNELS ))
+  if [ "$PCT" -gt "$MAX_DELETE_PCT" ]; then
+    log ""
+    log "SAFETY: would delete $PCT% of tenant-shaped tunnels (threshold $MAX_DELETE_PCT%) — refusing."
+    log "  If this is expected (e.g. major cleanup after incident), rerun with"
+    log "  MAX_DELETE_PCT=$((PCT+5)) $0 $*"
+    exit 2
+  fi
+fi
+
+if [ "$DRY_RUN" = "1" ]; then
+  log ""
+  log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT tunnels."
+  log ""
+  log "First 20 tunnels that would be deleted:"
+  echo "$DECISIONS" | python3 -c "
+import json, sys
+shown = 0
+for l in sys.stdin:
+    d = json.loads(l)
+    if d['action'] == 'delete':
+        print(f\"  {d['reason']:25s}  {d['name']:40s}  status={d['status']}\")
+        shown += 1
+        if shown >= 20: break
+"
+  exit 0
+fi
+
+# --- Execute deletes -------------------------------------------------------
+
+log ""
+log "Executing $DELETE_COUNT deletions..."
+DELETED=0
+FAILED=0
+while IFS= read -r line; do
+  action=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['action'])")
+  [ "$action" = "delete" ] || continue
+  tid=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['id'])")
+  name=$(echo "$line" | python3 -c "import json,sys; print(json.loads(sys.stdin.read())['name'])")
+  if curl -sS -m 10 -X DELETE \
+      -H "Authorization: Bearer $CF_API_TOKEN" \
+      "https://api.cloudflare.com/client/v4/accounts/$CF_ACCOUNT_ID/cfd_tunnel/$tid" \
+      | grep -q '"success":true'; then
+    DELETED=$((DELETED+1))
+  else
+    FAILED=$((FAILED+1))
+    log "  FAILED: $name ($tid)"
+  fi
+done <<< "$DECISIONS"
+
+log ""
+log "Done. deleted=$DELETED failed=$FAILED"
+[ "$FAILED" -eq 0 ]
@@ -0,0 +1,65 @@
+"""Unit tests for check_migration_collisions.py — focuses on the regex
+classifier + the diff/base-set logic that runs without git.
+
+The end-to-end git diff + gh pr list path is exercised manually (running
+the workflow against test PRs). These tests pin the pure-logic surface
+so a regression in migration-name parsing fails immediately at PR time.
+
+Run locally: ``python3 -m unittest scripts/ops/test_check_migration_collisions.py -v``
+"""
+
+import importlib.util
+import unittest
+from pathlib import Path
+
+# Load the script as a module without invoking main(). We import the
+# regex + helpers directly so we can test them without setting up git.
+SCRIPT_PATH = Path(__file__).parent / "check_migration_collisions.py"
+spec = importlib.util.spec_from_file_location("ccm", SCRIPT_PATH)
+ccm = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(ccm)
+
+
+class TestMigrationFileRe(unittest.TestCase):
+    """The regex classifier — the load-bearing piece of the detector."""
+
+    def test_matches_standard_three_digit_prefix(self):
+        m = ccm.MIGRATION_FILE_RE.match("044_platform_inbound_secret.up.sql")
+        assert m is not None
+        assert int(m.group(1)) == 44
+        assert m.group(2) == "up"
+
+    def test_matches_down_migration(self):
+        m = ccm.MIGRATION_FILE_RE.match("044_platform_inbound_secret.down.sql")
+        assert m is not None
+        assert int(m.group(1)) == 44
+        assert m.group(2) == "down"
+
+    def test_matches_date_shaped_prefix(self):
+        # Real example from the repo: 20260417000000_workflow_checkpoints
+        m = ccm.MIGRATION_FILE_RE.match("20260417000000_workflow_checkpoints.up.sql")
+        assert m is not None
+        assert int(m.group(1)) == 20260417000000
+
+    def test_matches_long_compound_name(self):
+        m = ccm.MIGRATION_FILE_RE.match("042_a2a_queue.up.sql")
+        assert m is not None
+        assert int(m.group(1)) == 42
+
+    def test_rejects_no_prefix(self):
+        assert ccm.MIGRATION_FILE_RE.match("readme.md") is None
+
+    def test_rejects_alpha_prefix(self):
+        assert ccm.MIGRATION_FILE_RE.match("abc_migration.up.sql") is None
+
+    def test_rejects_wrong_extension(self):
+        assert ccm.MIGRATION_FILE_RE.match("044_test.sql") is None
+        assert ccm.MIGRATION_FILE_RE.match("044_test.up.txt") is None
+
+    def test_rejects_path_separator(self):
+        # Filename only — paths come pre-split via Path(line).name
+        assert ccm.MIGRATION_FILE_RE.match("044/test.up.sql") is None
+
+    def test_rejects_no_underscore(self):
+        # Naming convention requires <digits>_<name>
+        assert ccm.MIGRATION_FILE_RE.match("044.up.sql") is None
@@ -0,0 +1,201 @@
+"""Tests for scripts/build_runtime_package.py — the wheel-build import rewriter.
+
+Run locally: ``python3 -m unittest scripts/test_build_runtime_package.py -v``
+
+Why this exists: PR #2433 shipped ``import inbox as _inbox_module`` inside
+the workspace runtime, and the rewriter expanded it to
+``import molecule_runtime.inbox as inbox as _inbox_module`` — invalid
+Python. The wheel-smoke gate caught it post-merge but couldn't block
+the merge (not a required check yet — see PR #2439). PR #2436 added a
+build-time gate that raises ``ValueError`` on this pattern; this file
+locks the rewriter's documented contract under unit test so the gate
+itself can't silently regress.
+
+Coverage:
+- ``import X``                  → ``import molecule_runtime.X as X``
+- ``import X.sub``              → ``import molecule_runtime.X.sub``
+- ``import X``  + trailing comment is preserved
+- ``from X import Y``           → ``from molecule_runtime.X import Y``
+- ``from X.sub import Y``       → ``from molecule_runtime.X.sub import Y``
+- ``from X import Y, Z``        → ``from molecule_runtime.X import Y, Z``
+- ``import X as Y``             → raises ValueError (the rewriter would
+  produce ``import molecule_runtime.X as X as Y``, syntax error)
+- non-allowlist module names    → not rewritten (regex anchors on the closed set)
+- Indented imports (inside def/class) keep their indentation.
+"""
+from __future__ import annotations
+
+import os
+import sys
+import unittest
+
+# scripts/build_runtime_package.py lives at scripts/ — add scripts/ to sys.path
+# so the import works whether unittest is invoked from repo root or scripts/.
+HERE = os.path.dirname(os.path.abspath(__file__))
+if HERE not in sys.path:
+    sys.path.insert(0, HERE)
+
+import build_runtime_package as M  # noqa: E402
+
+
+def rewrite(text: str) -> str:
+    """Run the rewriter end-to-end so the test exercises the same path
+    used by the wheel build (regex compile + substitution)."""
+    regex = M.build_import_rewriter()
+    return M.rewrite_imports(text, regex)
+
+
+class TestBareImportRewriting(unittest.TestCase):
+    def test_plain_import_aliases_to_preserve_binding(self):
+        self.assertEqual(
+            rewrite("import inbox\n"),
+            "import molecule_runtime.inbox as inbox\n",
+        )
+
+    def test_plain_import_with_trailing_comment_is_preserved(self):
+        # Real-world shape from a2a_mcp_server.py — the comment must
+        # survive the rewrite without losing its leading-space buffer.
+        self.assertEqual(
+            rewrite("import inbox  # noqa: E402\n"),
+            "import molecule_runtime.inbox as inbox  # noqa: E402\n",
+        )
+
+    def test_import_dotted_keeps_dotted_form(self):
+        # `import X.sub` is rare for our modules but the rewriter must
+        # not double-alias — we want `import molecule_runtime.X.sub`,
+        # not `import molecule_runtime.X.sub as X.sub` (invalid).
+        self.assertEqual(
+            rewrite("import platform_tools.registry\n"),
+            "import molecule_runtime.platform_tools.registry\n",
+        )
+
+    def test_indented_import_preserves_indentation(self):
+        src = "def foo():\n    import inbox\n    return inbox.x\n"
+        out = rewrite(src)
+        self.assertIn("    import molecule_runtime.inbox as inbox\n", out)
+
+
+class TestFromImportRewriting(unittest.TestCase):
+    def test_from_module_import_simple(self):
+        self.assertEqual(
+            rewrite("from inbox import InboxState\n"),
+            "from molecule_runtime.inbox import InboxState\n",
+        )
+
+    def test_from_dotted_import(self):
+        self.assertEqual(
+            rewrite("from platform_tools.registry import TOOLS\n"),
+            "from molecule_runtime.platform_tools.registry import TOOLS\n",
+        )
+
+    def test_from_import_multiple_symbols(self):
+        # Multi-import statement — the rewriter only touches the module
+        # prefix, not the names being imported.
+        self.assertEqual(
+            rewrite("from a2a_tools import (foo, bar, baz)\n"),
+            "from molecule_runtime.a2a_tools import (foo, bar, baz)\n",
+        )
+
+    def test_from_import_block_form(self):
+        src = (
+            "from a2a_tools import (\n"
+            "    tool_check_task_status,\n"
+            "    tool_commit_memory,\n"
+            ")\n"
+        )
+        out = rewrite(src)
+        self.assertIn("from molecule_runtime.a2a_tools import (\n", out)
+        # Trailing names + closer are unchanged.
+        self.assertIn("    tool_check_task_status,\n", out)
+        self.assertIn(")\n", out)
+
+
+class TestImportAsAliasRejection(unittest.TestCase):
+    """The key regression class — the failure mode that shipped in PR #2433."""
+
+    def test_import_as_alias_raises_value_error(self):
+        with self.assertRaises(ValueError) as ctx:
+            rewrite("import inbox as _inbox_module\n")
+        msg = str(ctx.exception)
+        # Error must name the offending module + suggest the fix.
+        self.assertIn("inbox", msg)
+        self.assertIn("as <alias>", msg)
+        self.assertIn("from", msg)  # suggests `from X import …`
+
+    def test_import_as_alias_indented_still_rejected(self):
+        # Indented (inside def/class) — same hazard, same rejection.
+        with self.assertRaises(ValueError):
+            rewrite("def foo():\n    import inbox as _x\n")
+
+    def test_import_as_alias_with_trailing_comment_still_rejected(self):
+        with self.assertRaises(ValueError):
+            rewrite("import inbox as _x  # comment\n")
+
+    def test_plain_import_with_as_in_comment_does_not_trip(self):
+        # The detection strips comments before pattern-matching, so a
+        # comment containing "as foo" must NOT trigger the rejection.
+        self.assertEqual(
+            rewrite("import inbox  # rewriter produces alias as inbox\n"),
+            "import molecule_runtime.inbox as inbox  # rewriter produces alias as inbox\n",
+        )
+
+    def test_import_followed_by_comma_is_not_an_alias(self):
+        # `import inbox, os` — comma is not `as`, must not be rejected.
+        # Our regex captures `inbox` then `,` — only `inbox` gets prefixed.
+        # `os` is not in TOP_LEVEL_MODULES so it's left alone.
+        out = rewrite("import inbox, os\n")
+        # The first module is rewritten; the second (non-allowlist) is not.
+        self.assertIn("import molecule_runtime.inbox as inbox", out)
+
+
+class TestOutsideAllowlistModules(unittest.TestCase):
+    def test_third_party_imports_unchanged(self):
+        # `httpx`, `os`, `re` etc. are not in TOP_LEVEL_MODULES — the
+        # regex must not match them. This is the closed-list invariant
+        # that prevents accidental rewrites of stdlib / third-party.
+        src = "import httpx\nimport os\nfrom re import match\n"
+        self.assertEqual(rewrite(src), src)
+
+    def test_short_name_collision_avoided(self):
+        # `from a2a.server.X import Y` must not match the bare `a2a`
+        # prefix — `a2a` isn't in our allowlist (we allow `a2a_tools`,
+        # `a2a_client`, etc., but not bare `a2a`). Belt-and-suspenders.
+        src = "from a2a.server.routes import create_agent_card_routes\n"
+        self.assertEqual(rewrite(src), src)
+
+
+class TestEndToEndShape(unittest.TestCase):
+    """Reproduces the PR #2433 → #2436 incident shape."""
+
+    def test_pr_2433_pattern_now_rejected(self):
+        # The exact line PR #2433 added (inside main()), which produced
+        # `import molecule_runtime.inbox as inbox as _inbox_module` —
+        # invalid syntax in the published wheel.
+        with self.assertRaises(ValueError) as ctx:
+            rewrite(
+                "    import inbox as _inbox_module\n"
+                "    _inbox_module.set_notification_callback(_on_inbox_message)\n"
+            )
+        # Error message includes the offending line so the operator
+        # knows exactly where to fix.
+        self.assertIn("inbox", str(ctx.exception))
+
+    def test_pr_2436_fix_pattern_works(self):
+        # The fix-forward shape (#2436): top-level `import inbox`,
+        # bridge wired in main() via `inbox.set_notification_callback`.
+        src = (
+            "import inbox\n"
+            "\n"
+            "def main():\n"
+            "    inbox.set_notification_callback(cb)\n"
+        )
+        out = rewrite(src)
+        self.assertIn("import molecule_runtime.inbox as inbox\n", out)
+        # The callable reference inside main() is left alone — only
+        # imports get rewritten, not arbitrary `inbox.foo` callsites
+        # (those resolve via the module binding the rewrite preserves).
+        self.assertIn("    inbox.set_notification_callback(cb)\n", out)
+
+
+if __name__ == "__main__":
+    unittest.main()
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+"""Smoke-test an installed molecule-ai-workspace-runtime wheel.
+
+Runs the same invariant assertions in two workflows:
+  * publish-runtime.yml — after building dist/*.whl, before PyPI upload
+  * runtime-prbuild-compat.yml — after building the PR's wheel, before merge
+
+Splitting the smoke across two inline heredocs let PR-time and publish-time
+drift apart. After 2026-04 we kept hitting publish-time failures for
+regressions a PR-time check could have caught. One script, both gates.
+
+Failure here intentionally exits non-zero so the workflow's `run:` step fails.
+Each block prints a single ✓ line on success so the GH summary log stays
+readable; assertion errors propagate with their own message.
+
+Run directly: `python scripts/wheel_smoke.py` after `pip install <wheel>`.
+"""
+
+import os
+import sys
+
+
+def smoke_imports_and_invariants() -> None:
+    """Module imports + stable contract assertions.
+
+    Importing main_sync by name is the strongest pre-PyPI gate we have for
+    import-rewrite mistakes (the 0.1.16 incident, where main.py loaded but
+    main_sync was missing because the build script dropped a re-export).
+    """
+    from molecule_runtime.main import main_sync  # noqa: F401
+    from molecule_runtime import a2a_client, a2a_tools  # noqa: F401
+    from molecule_runtime.builtin_tools import memory  # noqa: F401
+    from molecule_runtime.adapters import get_adapter, BaseAdapter, AdapterConfig
+
+    # cli_main + mcp_cli.main are the molecule-mcp console-script entry
+    # points — the external-runtime universal MCP path. Same regression
+    # class as the 0.1.16 main_sync incident: a silent rename or missed
+    # rewrite here would break every external operator's MCP install on
+    # the next wheel publish. Pin both names because pyproject points
+    # at mcp_cli.main, which then imports a2a_mcp_server.cli_main.
+    from molecule_runtime.a2a_mcp_server import cli_main  # noqa: F401
+    from molecule_runtime.mcp_cli import main as mcp_cli_main  # noqa: F401
+    assert callable(cli_main), "a2a_mcp_server.cli_main must be callable"
+    assert callable(mcp_cli_main), "mcp_cli.main must be callable"
+
+    # inbox.activate / get_state / start_poller_thread form the inbound
+    # delivery path for the standalone molecule-mcp wrapper. mcp_cli.main
+    # imports + activates these at startup; if a wheel ships without
+    # them, the standalone agent silently loses the wait_for_message /
+    # inbox_peek / inbox_pop tools and reverts to outbound-only.
+    from molecule_runtime.inbox import (  # noqa: F401
+        InboxState,
+        activate as inbox_activate,
+        get_state as inbox_get_state,
+        set_notification_callback as inbox_set_notification_callback,
+        start_poller_thread as inbox_start_poller_thread,
+    )
+    assert callable(inbox_activate), "inbox.activate must be callable"
+    assert callable(inbox_get_state), "inbox.get_state must be callable"
+    assert callable(inbox_start_poller_thread), "inbox.start_poller_thread must be callable"
+    assert callable(inbox_set_notification_callback), "inbox.set_notification_callback must be callable"
+
+    assert a2a_client._A2A_ERROR_PREFIX, "a2a_client missing error sentinel"
+    assert callable(get_adapter), "adapters.get_adapter must be callable"
+    assert hasattr(BaseAdapter, "name"), "BaseAdapter interface broken"
+    assert hasattr(AdapterConfig, "__init__"), "AdapterConfig dataclass missing"
+    print("✓ module imports + invariants OK")
+
+
+def smoke_agent_card_call_shape() -> None:
+    """Construct AgentCard with the EXACT kwargs main.py uses.
+
+    Pure imports don't catch field-shape regressions in upstream SDKs that
+    only surface at construction time. Two bugs of this exact class shipped
+    since the a2a-sdk 1.0 migration:
+      - state_transition_history=True (#2179)
+      - supported_protocols=[...] (the protobuf field is supported_interfaces;
+        every workspace boot crashed with `ValueError: Protocol message
+        AgentCard has no "supported_protocols" field`)
+
+    main.py and this block MUST stay in lockstep — adding a kwarg there
+    without mirroring it here is the regression vector.
+    """
+    from a2a.types import AgentCard, AgentCapabilities, AgentSkill, AgentInterface
+
+    AgentCard(
+        name="smoke-agent",
+        description="wheel-smoke: AgentCard call-shape",
+        version="0.0.0-smoke",
+        supported_interfaces=[
+            AgentInterface(protocol_binding="https://a2a.g/v1", url="http://localhost:8080"),
+        ],
+        capabilities=AgentCapabilities(
+            streaming=True,
+            push_notifications=False,
+        ),
+        skills=[
+            AgentSkill(
+                id="smoke-skill",
+                name="Smoke",
+                description="no-op",
+                tags=["smoke"],
+                examples=["noop"],
+            ),
+        ],
+        default_input_modes=["text/plain", "application/json"],
+        default_output_modes=["text/plain", "application/json"],
+    )
+    print("✓ AgentCard call-shape smoke passed")
+
+
+def smoke_well_known_path_alignment() -> None:
+    """The SDK's published constant must match the path it actually mounts.
+
+    main.py polls AGENT_CARD_WELL_KNOWN_PATH to detect server readiness. If
+    the constant and create_agent_card_routes() drift, every workspace's
+    initial_prompt silently drops (probe 404s, falls through to "skipping").
+    This was the #2193 incident class.
+    """
+    from a2a.types import AgentCard
+    from a2a.utils.constants import AGENT_CARD_WELL_KNOWN_PATH
+    from a2a.server.routes import create_agent_card_routes
+
+    mounted_paths = [
+        getattr(r, "path", None)
+        for r in create_agent_card_routes(
+            AgentCard(
+                name="wk-smoke",
+                description="well-known mount alignment",
+                version="0.0.0-smoke",
+            )
+        )
+    ]
+    assert AGENT_CARD_WELL_KNOWN_PATH in mounted_paths, (
+        f"AGENT_CARD_WELL_KNOWN_PATH ({AGENT_CARD_WELL_KNOWN_PATH!r}) is NOT among "
+        f"paths mounted by create_agent_card_routes ({mounted_paths!r}). The SDK "
+        "constant and its own route factory have drifted — workspace probes will "
+        "404 forever, silently dropping every workspace initial_prompt."
+    )
+    print(f"✓ well-known mount alignment OK ({AGENT_CARD_WELL_KNOWN_PATH})")
+
+
+def smoke_message_helper() -> None:
+    """new_text_message is the v1.x rename of new_agent_text_message.
+
+    main.py and a2a_executor.py call new_text_message in hot paths; if the
+    import breaks, every reply errors with ImportError before the message
+    even leaves the workspace. Importing here catches a future v2.x rename
+    at publish time.
+    """
+    from a2a.helpers import new_text_message
+
+    msg = new_text_message("smoke")
+    assert msg is not None, "new_text_message returned None"
+    print("✓ message helper import + call OK")
+
+
+def main() -> int:
+    # main.py validates WORKSPACE_ID at module-import time via platform_auth.
+    # Set placeholders so the smoke doesn't trip on the env-var guard.
+    os.environ.setdefault("WORKSPACE_ID", "00000000-0000-0000-0000-000000000000")
+    os.environ.setdefault("PLATFORM_URL", "http://localhost:8080")
+
+    smoke_imports_and_invariants()
+    smoke_agent_card_call_shape()
+    smoke_well_known_path_alignment()
+    smoke_message_helper()
+    print("✓ wheel smoke passed")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
@@ -0,0 +1,275 @@
+#!/usr/bin/env bash
+# Staging E2E for #2307 — create fresh tenant, test peer visibility, tear down.
+#
+# Mirrors tests/e2e/test_staging_full_saas.sh's pattern (org create via
+# /cp/admin/orgs, EXIT-trap teardown via DELETE /cp/admin/tenants/:slug
+# with required {"confirm":slug} body).
+#
+# Required: MOLECULE_ADMIN_TOKEN exported (CP admin bearer).
+# Optional:
+#   MOLECULE_CP_URL  default https://staging-api.moleculesai.app
+#   PARENT_RUNTIME   default claude-code
+
+set -uo pipefail
+
+CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}"
+ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required}"
+PARENT_RUNTIME="${PARENT_RUNTIME:-claude-code}"
+
+RUN_ID=$(date +%s | tail -c 8)
+SLUG="e2e-2307-$RUN_ID"
+ORG_ID=""
+TENANT_URL=""
+TENANT_TOKEN=""
+PARENT=""
+CHILD=""
+CTOK=""
+
+admin_call() {
+    local method="$1" path="$2"
+    shift 2
+    curl -sS -X "$method" "$CP_URL$path" \
+        -H "Authorization: Bearer $ADMIN_TOKEN" \
+        -H "Content-Type: application/json" \
+        "$@"
+}
+
+tenant_call() {
+    local method="$1" path="$2"
+    shift 2
+    curl -sS -X "$method" "$TENANT_URL$path" \
+        -H "Authorization: Bearer $TENANT_TOKEN" \
+        -H "X-Molecule-Org-Id: $ORG_ID" \
+        -H "Content-Type: application/json" \
+        "$@"
+}
+
+teardown() {
+    local rc=$?
+    set +e
+    echo ""
+    echo "[teardown] DELETE /cp/admin/tenants/$SLUG ..."
+    admin_call DELETE "/cp/admin/tenants/$SLUG" \
+        --max-time 120 \
+        -d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1
+    # Poll up to 60s for purge
+    for j in $(seq 1 12); do
+        LIST=$(admin_call GET /cp/admin/orgs 2>/dev/null)
+        LEAK=$(echo "$LIST" | python3 -c "
+import sys, json
+try:
+    d = json.load(sys.stdin)
+except Exception:
+    print(1); sys.exit(0)
+orgs = d if isinstance(d, list) else d.get('orgs', [])
+n = sum(1 for o in orgs if o.get('slug') == '$SLUG' and o.get('status') != 'purged')
+print(n)
+" 2>/dev/null || echo 1)
+        if [ "$LEAK" = "0" ]; then
+            echo "  ✓ tenant purged (after ${j}x5s)"
+            exit $rc
+        fi
+        sleep 5
+    done
+    echo "  ⚠ LEAK: $SLUG still in /cp/admin/orgs after 60s — manual cleanup needed"
+    [ $rc -eq 0 ] && rc=4
+    exit $rc
+}
+trap teardown EXIT INT TERM
+
+# ─── 1. Create the org ────────────────────────────────────────────────
+echo "[1/8] POST /cp/admin/orgs — slug=$SLUG"
+CREATE=$(admin_call POST /cp/admin/orgs \
+    -d "{\"slug\":\"$SLUG\",\"name\":\"E2E #2307 $SLUG\",\"owner_user_id\":\"e2e-runner:$SLUG\"}")
+echo "  resp: $(echo "$CREATE" | head -c 300)"
+ORG_ID=$(echo "$CREATE" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null)
+[ -n "$ORG_ID" ] || { echo "  ✗ org creation failed"; exit 1; }
+echo "  ✓ ORG_ID=$ORG_ID"
+
+# ─── 2. Wait for tenant ready ─────────────────────────────────────────
+echo "[2/8] waiting for tenant to come up (cold-start ~5-10min)..."
+for i in $(seq 1 180); do
+    STATUS=$(admin_call GET /cp/admin/orgs 2>/dev/null | python3 -c "
+import sys, json
+try: d = json.load(sys.stdin)
+except Exception: sys.exit(0)
+orgs = d if isinstance(d, list) else d.get('orgs', [])
+for o in orgs:
+    if o.get('slug') == '$SLUG':
+        print(o.get('instance_status') or o.get('status') or 'unknown')
+        break
+" 2>/dev/null)
+    [ $((i % 6)) -eq 1 ] && echo "  attempt $i: status=$STATUS"
+    case "$STATUS" in running|online|ready) break ;; esac
+    sleep 5
+done
+case "$STATUS" in running|online|ready) ;;
+    *) echo "  ✗ tenant never came up (last=$STATUS)"; exit 2 ;; esac
+echo "  ✓ tenant status=$STATUS"
+
+# ─── 3. Per-tenant admin token ────────────────────────────────────────
+echo "[3/8] fetching per-tenant admin token..."
+TT_RESP=$(admin_call GET "/cp/admin/orgs/$SLUG/admin-token")
+TENANT_TOKEN=$(echo "$TT_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('admin_token',''))" 2>/dev/null)
+[ -n "$TENANT_TOKEN" ] || { echo "  ✗ tenant token fetch failed: $TT_RESP"; exit 2; }
+echo "  ✓ got tenant admin token (len ${#TENANT_TOKEN})"
+
+CP_HOST=$(echo "$CP_URL" | sed -E 's#^https?://##; s#/.*$##')
+case "$CP_HOST" in
+    api.*)         DERIVED_DOMAIN="${CP_HOST#api.}" ;;
+    staging-api.*) DERIVED_DOMAIN="staging.${CP_HOST#staging-api.}" ;;
+    *)             DERIVED_DOMAIN="$CP_HOST" ;;
+esac
+TENANT_URL="https://${SLUG}.${DERIVED_DOMAIN}"
+echo "  tenant url: $TENANT_URL"
+
+# ─── 4. Wait for tenant TLS/DNS readiness ─────────────────────────────
+echo "[4/8] waiting for tenant /health (TLS/DNS, up to 10min)..."
+for i in $(seq 1 120); do
+    if curl -fsS "$TENANT_URL/health" -m 5 -k >/dev/null 2>&1; then
+        echo "  ✓ /health ok (attempt $i)"
+        break
+    fi
+    sleep 5
+done
+
+# ─── 5. Provision parent CEO workspace ────────────────────────────────
+echo "[5/8] creating parent CEO ($PARENT_RUNTIME)..."
+P_RESP=$(tenant_call POST /workspaces \
+    -d "{\"name\":\"e2e-CEO\",\"runtime\":\"$PARENT_RUNTIME\",\"tier\":3}")
+echo "  parent resp: $(echo "$P_RESP" | head -c 300)"
+PARENT=$(echo "$P_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null)
+PTOK=$(echo "$P_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('auth_token',''))" 2>/dev/null)
+[ -n "$PARENT" ] || { echo "  ✗ parent create failed"; exit 3; }
+echo "  ✓ PARENT=$PARENT  (parent_token_returned=$([ -n "$PTOK" ] && echo yes || echo no))"
+
+# ─── 6. Wait for parent online ────────────────────────────────────────
+echo "[6/8] waiting for parent to come online (up to 12min)..."
+for i in $(seq 1 144); do
+    WS_JSON=$(tenant_call GET "/workspaces/$PARENT" 2>/dev/null)
+    S=$(echo "$WS_JSON" | python3 -c "
+import sys, json
+try: d = json.load(sys.stdin)
+except Exception: sys.exit(0)
+w = d.get('workspace') if isinstance(d.get('workspace'), dict) else d
+print(w.get('status') or '')
+" 2>/dev/null)
+    [ $((i % 6)) -eq 1 ] && echo "  attempt $i: parent status=$S"
+    [ "$S" = "online" ] && break
+    sleep 5
+done
+[ "$S" = "online" ] || { echo "  ✗ parent never online (last=$S)"; exit 3; }
+echo "  ✓ parent online"
+
+# ─── 7. Create external child + register URL ──────────────────────────
+echo "[7/8] creating external child + registering..."
+C_RESP=$(tenant_call POST /workspaces \
+    -d "{\"name\":\"e2e-Reno-Server\",\"runtime\":\"external\",\"external\":true,\"tier\":2,\"parent_id\":\"$PARENT\"}")
+echo "  child resp: $(echo "$C_RESP" | head -c 400)"
+CHILD=$(echo "$C_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null)
+# External-runtime token is nested under `connection.auth_token` (verified
+# 2026-04-29 against staging response shape). Fall back to top-level for
+# parity with older clients.
+CTOK=$(echo "$C_RESP" | python3 -c "
+import sys, json
+d = json.load(sys.stdin)
+print(d.get('connection', {}).get('auth_token') or d.get('auth_token') or '')
+" 2>/dev/null)
+[ -n "$CHILD" ] || { echo "  ✗ child create failed"; exit 3; }
+echo "  ✓ CHILD=$CHILD  (child_token_returned=$([ -n "$CTOK" ] && echo yes || echo no))"
+
+# Try register with child's own token (bootstrap path); fall back to tenant_call
+if [ -n "$CTOK" ]; then
+    REG_RESP=$(curl -sS -X POST "$TENANT_URL/registry/register" \
+        -H "Authorization: Bearer $CTOK" \
+        -H "X-Molecule-Org-Id: $ORG_ID" \
+        -H "Content-Type: application/json" \
+        -d "{\"id\":\"$CHILD\",\"url\":\"https://example.com/molecule-test\",\"agent_card\":{\"name\":\"Reno Server\",\"description\":\"Mock\",\"version\":\"0.1.0\"}}")
+else
+    REG_RESP=$(tenant_call POST /registry/register \
+        -d "{\"id\":\"$CHILD\",\"url\":\"https://example.com/molecule-test\",\"agent_card\":{\"name\":\"Reno Server\",\"description\":\"Mock\",\"version\":\"0.1.0\"}}")
+fi
+echo "  register resp: $(echo "$REG_RESP" | head -c 300)"
+
+# ─── 8. THE TEST — peer visibility ────────────────────────────────────
+echo ""
+echo "[8/8] === Verdict — does parent see external child? ==="
+echo ""
+echo "(a) DB shape via admin: GET /cp/admin/orgs/$SLUG (workspaces listing if exposed)"
+
+# Check children listing — most direct DB-shape signal we can get from outside
+LIST=$(tenant_call GET "/workspaces?parent_id=$PARENT")
+echo "  /workspaces?parent_id=$PARENT response: $(echo "$LIST" | head -c 500)"
+echo ""
+
+CHILD_LISTED=$(echo "$LIST" | python3 -c "
+import sys, json
+try:
+    d = json.load(sys.stdin)
+except Exception:
+    print('parse_error'); sys.exit(0)
+ws = d if isinstance(d, list) else d.get('workspaces', d.get('items', []))
+print('yes' if any(w.get('id') == '$CHILD' for w in ws) else 'no')
+" 2>/dev/null)
+echo "  child appears in parent's children listing: $CHILD_LISTED"
+
+# (b) /peers from PARENT side using PTOK if provided
+if [ -n "$PTOK" ]; then
+    PEERS=$(curl -sS "$TENANT_URL/registry/$PARENT/peers" \
+        -H "Authorization: Bearer $PTOK" \
+        -H "X-Molecule-Org-Id: $ORG_ID")
+    echo ""
+    echo "(b) GET /registry/$PARENT/peers (parent's bearer):"
+    echo "    $(echo "$PEERS" | head -c 600)"
+    if echo "$PEERS" | grep -q "$CHILD"; then
+        echo "  ✓ child IS in parent's /peers"
+        VERDICT_B=ok
+    else
+        echo "  ✗ child is NOT in parent's /peers — bug REPRODUCES at API layer"
+        VERDICT_B=fail
+    fi
+else
+    echo ""
+    echo "(b) parent's auth_token not exposed by /workspaces create — skipping direct /peers check"
+    VERDICT_B=skipped
+fi
+
+# (c) /peers from CHILD side using CTOK
+if [ -n "$CTOK" ]; then
+    PEERS_C=$(curl -sS "$TENANT_URL/registry/$CHILD/peers" \
+        -H "Authorization: Bearer $CTOK" \
+        -H "X-Molecule-Org-Id: $ORG_ID")
+    echo ""
+    echo "(c) GET /registry/$CHILD/peers (child's bearer):"
+    echo "    $(echo "$PEERS_C" | head -c 600)"
+    if echo "$PEERS_C" | grep -q "$PARENT"; then
+        echo "  ✓ parent IS in child's /peers"
+        VERDICT_C=ok
+    else
+        echo "  ✗ parent is NOT in child's /peers"
+        VERDICT_C=fail
+    fi
+else
+    VERDICT_C=skipped
+fi
+
+echo ""
+echo "=== SUMMARY for #2307 staging E2E ==="
+echo "  child listed under parent: $CHILD_LISTED"
+echo "  /peers parent→child:       $VERDICT_B"
+echo "  /peers child→parent:       $VERDICT_C"
+
+# Exit code: 0 if everything visible, 10 if bug reproduces, 11 if inconclusive
+if [ "$CHILD_LISTED" = "yes" ] && [ "$VERDICT_B" = "ok" ]; then
+    echo ""
+    echo "✓ STAGING: parent fully sees external child — bug is downstream (agent code, not platform API)"
+    exit 0
+elif [ "$VERDICT_B" = "fail" ] || [ "$CHILD_LISTED" = "no" ]; then
+    echo ""
+    echo "✗ STAGING: bug REPRODUCES at platform-API layer"
+    exit 10
+else
+    echo ""
+    echo "? STAGING: inconclusive (need parent token to call /peers definitively)"
+    exit 11
+fi
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+# E2E for the v2 chat upload path (RFC #2312):
+#
+#   POST /workspaces/:id/chat/uploads
+#       └─▶ platform Go workspace-server (proxies)
+#               └─▶ workspace's own /internal/chat/uploads/ingest
+#                       └─▶ writes to /workspace/.molecule/chat-uploads
+#
+# The same script runs against ANY environment because the architecture
+# is now uniform — local docker-compose, staging tenant, production
+# health-probe — all hit the same call site with the same expected
+# behavior. This is the design goal RFC #2312 set: "test local will
+# pretty much match production."
+#
+# Required env:
+#   BASE                   default http://localhost:8080
+#                          override to https://<id>.<tenant>.staging...
+#   WORKSPACE_RUNTIME      default langgraph (any internal runtime)
+#
+# Exit codes:
+#   0  upload + read-back round-trip succeeded
+#   1  setup failed (couldn't create workspace, never came online, etc.)
+#   2  upload returned non-2xx
+#   3  upload succeeded but the file isn't readable via download
+
+set -uo pipefail
+
+BASE="${BASE:-http://localhost:8080}"
+RUNTIME="${WORKSPACE_RUNTIME:-langgraph}"
+
+PARENT=""
+PARENT_TOK=""
+
+# shellcheck disable=SC1091
+source "$(dirname "$0")/_lib.sh"
+
+cleanup() {
+    local rc=$?
+    set +e
+    if [ -n "$PARENT" ]; then
+        curl -sS -X DELETE "$BASE/workspaces/$PARENT?confirm=true&purge=true" \
+            ${PARENT_TOK:+-H "Authorization: Bearer $PARENT_TOK"} >/dev/null 2>&1
+    fi
+    exit $rc
+}
+trap cleanup EXIT INT TERM
+
+# ─── 1. Create workspace ───────────────────────────────────────────────
+echo "[1/5] POST /workspaces (runtime=$RUNTIME)..."
+P_RESP=$(curl -sS -X POST "$BASE/workspaces" \
+    -H "Content-Type: application/json" \
+    -d "{\"name\":\"e2e-chat-upload\",\"runtime\":\"$RUNTIME\",\"tier\":2}")
+PARENT=$(echo "$P_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null)
+[ -n "$PARENT" ] || { echo "  ✗ workspace create failed: $P_RESP"; exit 1; }
+echo "  ✓ workspace=$PARENT"
+
+# ─── 2. Wait for online ────────────────────────────────────────────────
+echo "[2/5] waiting for workspace online (up to 5min)..."
+for i in $(seq 1 60); do
+    S=$(curl -sS "$BASE/workspaces/$PARENT" 2>/dev/null \
+        | python3 -c "import sys,json; d=json.load(sys.stdin); w=d.get('workspace') if isinstance(d.get('workspace'),dict) else d; print(w.get('status') or '')" 2>/dev/null)
+    [ $((i % 6)) -eq 1 ] && echo "  attempt $i: status=$S"
+    [ "$S" = "online" ] && break
+    sleep 5
+done
+[ "$S" = "online" ] || { echo "  ✗ workspace never online (last=$S)"; exit 1; }
+echo "  ✓ online"
+
+# Mint a workspace bearer for the test (the auth needed to call
+# /workspaces/:id/chat/uploads, which is wsAuth-gated).
+PARENT_TOK=$(e2e_mint_test_token "$PARENT") || {
+    echo "  ✗ couldn't mint test token (MOLECULE_ENV=production?)"
+    exit 1
+}
+
+# ─── 3. Upload a fixture ───────────────────────────────────────────────
+echo "[3/5] POST /workspaces/$PARENT/chat/uploads ..."
+FIXTURE=$(mktemp)
+echo "e2e fixture content $(date +%s)" > "$FIXTURE"
+EXPECTED=$(cat "$FIXTURE")
+
+UPLOAD=$(curl -sS -X POST "$BASE/workspaces/$PARENT/chat/uploads" \
+    -H "Authorization: Bearer $PARENT_TOK" \
+    -F "files=@$FIXTURE;filename=greeting.txt;type=text/plain" \
+    -w "\nHTTP_CODE=%{http_code}\n")
+CODE=$(echo "$UPLOAD" | grep -oE 'HTTP_CODE=[0-9]+' | cut -d= -f2)
+BODY=$(echo "$UPLOAD" | sed '/^HTTP_CODE=/,$d')
+echo "  status=$CODE"
+echo "  body=$(echo "$BODY" | head -c 300)"
+
+if [ "$CODE" != "200" ]; then
+    echo "  ✗ upload returned $CODE"
+    rm -f "$FIXTURE"
+    exit 2
+fi
+
+URI=$(echo "$BODY" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d['files'][0]['uri'])" 2>/dev/null)
+NAME=$(echo "$BODY" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d['files'][0]['name'])" 2>/dev/null)
+SIZE=$(echo "$BODY" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d['files'][0]['size'])" 2>/dev/null)
+[ -n "$URI" ] || { echo "  ✗ no URI in response"; rm -f "$FIXTURE"; exit 2; }
+[ "$NAME" = "greeting.txt" ] || { echo "  ✗ name mismatch: $NAME"; rm -f "$FIXTURE"; exit 2; }
+[ "$SIZE" = "$(wc -c <"$FIXTURE" | tr -d ' ')" ] || { echo "  ✗ size mismatch: $SIZE"; rm -f "$FIXTURE"; exit 2; }
+echo "  ✓ uri=$URI"
+echo "  ✓ name=$NAME size=$SIZE"
+
+# Extract the absolute path inside the workspace (strip workspace: scheme).
+PATH_IN_WS="${URI#workspace:}"
+
+# ─── 4. Read it back via /chat/download ────────────────────────────────
+echo "[4/5] GET /workspaces/$PARENT/chat/download?path=$PATH_IN_WS"
+DOWNLOADED=$(curl -sS "$BASE/workspaces/$PARENT/chat/download?path=$PATH_IN_WS" \
+    -H "Authorization: Bearer $PARENT_TOK")
+if [ "$DOWNLOADED" != "$EXPECTED" ]; then
+    echo "  ✗ content mismatch"
+    echo "    expected: $EXPECTED"
+    echo "    got:      $DOWNLOADED"
+    rm -f "$FIXTURE"
+    exit 3
+fi
+echo "  ✓ round-trip content matches"
+
+# ─── 5. Auth: bare upload without bearer is rejected ───────────────────
+echo "[5/5] POST without bearer must be 401..."
+NA_CODE=$(curl -sS -o /dev/null -w "%{http_code}" -X POST "$BASE/workspaces/$PARENT/chat/uploads" \
+    -F "files=@$FIXTURE")
+if [ "$NA_CODE" != "401" ]; then
+    echo "  ✗ expected 401 without bearer, got $NA_CODE"
+    rm -f "$FIXTURE"
+    exit 2
+fi
+echo "  ✓ 401 without bearer"
+
+rm -f "$FIXTURE"
+echo ""
+echo "✓ chat upload v2 (RFC #2312) end-to-end passed against $BASE"
@@ -0,0 +1,308 @@
+#!/usr/bin/env bash
+# E2E for delivery_mode=poll + since_id cursor (#2339).
+#
+# Round-trip: register a workspace as poll-mode (no URL) → POST A2A to it →
+# verify the proxy short-circuits to {status:"queued"} → verify the message
+# appears in /activity → verify the since_id cursor returns ONLY new events
+# in ASC order → verify a stale cursor returns 410.
+#
+# Requires: platform running on localhost:8080 with migrations applied.
+#   bash workspace-server/scripts/dev-start.sh
+#   bash workspace-server/scripts/run-migrations.sh
+#
+# Idempotent: each run uses fresh per-script workspace ids so reruns don't
+# collide. Does NOT call e2e_cleanup_all_workspaces — see
+# `feedback_never_run_cluster_cleanup_tests_on_live_platform.md`.
+
+set -euo pipefail
+
+source "$(dirname "$0")/_lib.sh"
+
+PASS=0
+FAIL=0
+TIMEOUT="${A2A_TIMEOUT:-30}"
+
+# Per-run unique ids — workspaces.id is a UUID column, so we generate
+# real v4 UUIDs. A "ws-<tag>" string fails the pq UUID cast and surfaces
+# as opaque "registration failed" (caught against this very test in CI
+# before merge — the failure mode that motivates the helper).
+gen_uuid() {
+  if command -v uuidgen >/dev/null 2>&1; then
+    uuidgen | tr '[:upper:]' '[:lower:]'
+  else
+    python3 -c 'import uuid; print(uuid.uuid4())'
+  fi
+}
+POLL_WS_ID="$(gen_uuid)"
+CALLER_WS_ID="$(gen_uuid)"
+# Phase 2 uses a separate UUID for its invalid-mode probe so a rerun
+# can't poison POLL_WS_ID's row with a bad upsert (the 400 path doesn't
+# touch DB, but defense in depth).
+INVALID_PROBE_ID="$(gen_uuid)"
+
+cleanup() {
+  local rc=$?
+  # Best-effort delete; non-fatal if the row was never created.
+  curl -s -X DELETE "$BASE/workspaces/$POLL_WS_ID" >/dev/null || true
+  curl -s -X DELETE "$BASE/workspaces/$CALLER_WS_ID" >/dev/null || true
+  exit $rc
+}
+trap cleanup EXIT
+
+check() {
+  local desc="$1"
+  local expected="$2"
+  local actual="$3"
+  if echo "$actual" | grep -qF -- "$expected"; then
+    echo "PASS: $desc"
+    PASS=$((PASS + 1))
+  else
+    echo "FAIL: $desc"
+    echo "  expected to contain: $expected"
+    echo "  got: $(echo "$actual" | head -10)"
+    FAIL=$((FAIL + 1))
+  fi
+}
+
+check_eq() {
+  local desc="$1"
+  local expected="$2"
+  local actual="$3"
+  if [ "$actual" = "$expected" ]; then
+    echo "PASS: $desc"
+    PASS=$((PASS + 1))
+  else
+    echo "FAIL: $desc"
+    echo "  expected: $expected"
+    echo "  got:      $actual"
+    FAIL=$((FAIL + 1))
+  fi
+}
+
+check_not_contains() {
+  local desc="$1"
+  local unexpected="$2"
+  local actual="$3"
+  if echo "$actual" | grep -qF -- "$unexpected"; then
+    echo "FAIL: $desc"
+    echo "  should NOT contain: $unexpected"
+    FAIL=$((FAIL + 1))
+  else
+    echo "PASS: $desc"
+    PASS=$((PASS + 1))
+  fi
+}
+
+echo "=== Poll-Mode + since_id Cursor E2E (#2339) ==="
+echo "  base: $BASE"
+echo "  poll workspace: $POLL_WS_ID"
+echo "  caller workspace: $CALLER_WS_ID"
+echo ""
+
+# ---------- Phase 1: register as poll-mode ----------
+echo "--- Phase 1: Register poll-mode workspace (no URL) ---"
+
+# A poll-mode workspace registers WITHOUT a URL — that's the contract from
+# PR 1 (#2348). The agent_card is required; everything else is optional.
+REG_RESP=$(curl -s -X POST "$BASE/registry/register" \
+  -H "Content-Type: application/json" \
+  -d "{
+    \"id\": \"$POLL_WS_ID\",
+    \"delivery_mode\": \"poll\",
+    \"agent_card\": {\"name\": \"poll-mode-test\"}
+  }")
+
+check "register accepts poll mode without URL" '"status":"registered"' "$REG_RESP"
+check "register response echoes delivery_mode=poll"  '"delivery_mode":"poll"' "$REG_RESP"
+
+# Capture the auth token for subsequent /activity reads (Phase 30.1).
+POLL_TOKEN=$(echo "$REG_RESP" | e2e_extract_token || true)
+if [ -z "$POLL_TOKEN" ]; then
+  echo "WARN: no auth_token in register response — token-required reads will fail"
+fi
+
+# ---------- Phase 2: invalid mode rejected ----------
+echo ""
+echo "--- Phase 2: Invalid delivery_mode rejected ---"
+
+INVALID_RESP=$(curl -s -w '\n%{http_code}' -X POST "$BASE/registry/register" \
+  -H "Content-Type: application/json" \
+  -d "{
+    \"id\": \"$INVALID_PROBE_ID\",
+    \"delivery_mode\": \"webhook\",
+    \"agent_card\": {\"name\": \"bad\"}
+  }")
+INVALID_CODE=$(printf '%s' "$INVALID_RESP" | tail -n1)
+INVALID_BODY=$(printf '%s' "$INVALID_RESP" | sed '$d')
+
+check_eq "register rejects unknown delivery_mode (HTTP 400)" "400" "$INVALID_CODE"
+check "error mentions delivery_mode" "delivery_mode" "$INVALID_BODY"
+
+# ---------- Phase 3: A2A short-circuits to {status:"queued"} ----------
+echo ""
+echo "--- Phase 3: A2A to poll-mode workspace short-circuits ---"
+
+A2A_RESP=$(curl -s --max-time "$TIMEOUT" -X POST "$BASE/workspaces/$POLL_WS_ID/a2a" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "jsonrpc": "2.0",
+    "id": "msg-1",
+    "method": "message/send",
+    "params": {
+      "message": {
+        "role": "user",
+        "parts": [{"type": "text", "text": "hello-from-e2e-1"}]
+      }
+    }
+  }')
+
+check "poll-mode A2A returns queued status" '"status":"queued"' "$A2A_RESP"
+check "queued response echoes delivery_mode=poll" '"delivery_mode":"poll"' "$A2A_RESP"
+check "queued response echoes the JSON-RPC method" '"method":"message/send"' "$A2A_RESP"
+
+# ---------- Phase 4: queued message appears in /activity ----------
+echo ""
+echo "--- Phase 4: Queued message visible via /activity ---"
+
+# The activity_logs INSERT runs in a goroutine — give it a moment.
+sleep 1
+
+# Use bearer token if we got one; some platforms require it on /activity.
+ACTIVITY_AUTH=()
+[ -n "${POLL_TOKEN:-}" ] && ACTIVITY_AUTH=(-H "Authorization: Bearer $POLL_TOKEN")
+
+ACT_RESP=$(curl -s --max-time "$TIMEOUT" "${ACTIVITY_AUTH[@]}" \
+  "$BASE/workspaces/$POLL_WS_ID/activity?type=a2a_receive&limit=10")
+
+check "activity feed has the queued message text" "hello-from-e2e-1" "$ACT_RESP"
+check "activity_type is a2a_receive"             '"activity_type":"a2a_receive"' "$ACT_RESP"
+check "method preserved on the activity row"     '"method":"message/send"' "$ACT_RESP"
+
+# Pull the most-recent activity_id for use as a cursor.
+FIRST_ACTIVITY_ID=$(echo "$ACT_RESP" | python3 -c "
+import json, sys
+rows = json.load(sys.stdin)
+if not rows:
+    print('')
+else:
+    # Default ordering is DESC (newest-first) when no since_id is set.
+    print(rows[0]['id'])
+")
+
+if [ -z "$FIRST_ACTIVITY_ID" ]; then
+  echo "FAIL: could not extract activity_id from /activity response"
+  FAIL=$((FAIL + 1))
+  exit 1
+fi
+echo "  cursor candidate: $FIRST_ACTIVITY_ID"
+
+# ---------- Phase 5: since_id returns only events strictly after ----------
+echo ""
+echo "--- Phase 5: since_id cursor returns ASC, strictly-after ---"
+
+# Send a SECOND A2A message; it must appear in the cursor-filtered feed,
+# the FIRST message must NOT (cursor is strictly-after).
+A2A_RESP2=$(curl -s --max-time "$TIMEOUT" -X POST "$BASE/workspaces/$POLL_WS_ID/a2a" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "jsonrpc": "2.0",
+    "id": "msg-2",
+    "method": "message/send",
+    "params": {
+      "message": {
+        "role": "user",
+        "parts": [{"type": "text", "text": "hello-from-e2e-2"}]
+      }
+    }
+  }')
+check "second A2A also queues" '"status":"queued"' "$A2A_RESP2"
+
+sleep 1
+
+CURSOR_RESP=$(curl -s --max-time "$TIMEOUT" "${ACTIVITY_AUTH[@]}" \
+  "$BASE/workspaces/$POLL_WS_ID/activity?type=a2a_receive&since_id=$FIRST_ACTIVITY_ID&limit=10")
+
+check              "since_id feed includes the new message"          "hello-from-e2e-2" "$CURSOR_RESP"
+check_not_contains "since_id feed excludes the cursor row itself"  "hello-from-e2e-1" "$CURSOR_RESP"
+
+# Verify ASC ordering: in a fresh cursor window with two new events the
+# array's first element must be the OLDER one (the test only sends one
+# event after the cursor, so this case is trivially "exactly one row";
+# the next sub-phase strengthens this with a second event).
+A2A_RESP3=$(curl -s --max-time "$TIMEOUT" -X POST "$BASE/workspaces/$POLL_WS_ID/a2a" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "jsonrpc": "2.0",
+    "id": "msg-3",
+    "method": "message/send",
+    "params": {
+      "message": {
+        "role": "user",
+        "parts": [{"type": "text", "text": "hello-from-e2e-3"}]
+      }
+    }
+  }')
+check "third A2A queues" '"status":"queued"' "$A2A_RESP3"
+
+sleep 1
+
+ASC_RESP=$(curl -s --max-time "$TIMEOUT" "${ACTIVITY_AUTH[@]}" \
+  "$BASE/workspaces/$POLL_WS_ID/activity?type=a2a_receive&since_id=$FIRST_ACTIVITY_ID&limit=10")
+
+# rows[0] should be msg-2 (older), rows[-1] should be msg-3 (newer) — that's
+# ASC. If the server still defaulted to DESC, rows[0] would be msg-3.
+ASC_FIRST=$(echo "$ASC_RESP" | python3 -c "
+import json, sys
+rows = json.load(sys.stdin)
+def text_of(r):
+    body = r.get('request_body') or {}
+    parts = (body.get('params') or {}).get('message', {}).get('parts') or []
+    return ''.join(p.get('text','') for p in parts if p.get('type')=='text')
+if len(rows) < 2:
+    print('NEED2_GOT_'+str(len(rows)))
+else:
+    print(text_of(rows[0]) + '|' + text_of(rows[-1]))
+")
+check_eq "since_id feed orders ASC (oldest-new first, newest-new last)" \
+  "hello-from-e2e-2|hello-from-e2e-3" "$ASC_FIRST"
+
+# ---------- Phase 6: stale cursor returns 410 ----------
+echo ""
+echo "--- Phase 6: Stale / unknown cursor returns 410 ---"
+
+GONE_RESP=$(curl -s -w '\n%{http_code}' --max-time "$TIMEOUT" "${ACTIVITY_AUTH[@]}" \
+  "$BASE/workspaces/$POLL_WS_ID/activity?since_id=00000000-0000-0000-0000-000000000000")
+GONE_CODE=$(printf '%s' "$GONE_RESP" | tail -n1)
+GONE_BODY=$(printf '%s' "$GONE_RESP" | sed '$d')
+
+check_eq "unknown since_id returns HTTP 410 Gone" "410" "$GONE_CODE"
+check "410 body explains how to recover" "since_id" "$GONE_BODY"
+
+# ---------- Phase 7: cross-workspace cursor isolation ----------
+echo ""
+echo "--- Phase 7: Cross-workspace cursor isolation ---"
+
+# Register a SECOND poll-mode workspace and try to read its activity
+# feed using a cursor from the FIRST workspace. Must 410 — the cursor
+# is workspace-scoped to prevent UUID-guessing peeks.
+REG2=$(curl -s -X POST "$BASE/registry/register" \
+  -H "Content-Type: application/json" \
+  -d "{
+    \"id\": \"$CALLER_WS_ID\",
+    \"delivery_mode\": \"poll\",
+    \"agent_card\": {\"name\": \"poll-cross-test\"}
+  }")
+check "second poll-mode workspace registers" '"status":"registered"' "$REG2"
+CALLER_TOKEN=$(echo "$REG2" | e2e_extract_token || true)
+CROSS_AUTH=()
+[ -n "${CALLER_TOKEN:-}" ] && CROSS_AUTH=(-H "Authorization: Bearer $CALLER_TOKEN")
+
+CROSS_RESP=$(curl -s -w '\n%{http_code}' --max-time "$TIMEOUT" "${CROSS_AUTH[@]}" \
+  "$BASE/workspaces/$CALLER_WS_ID/activity?since_id=$FIRST_ACTIVITY_ID")
+CROSS_CODE=$(printf '%s' "$CROSS_RESP" | tail -n1)
+check_eq "cross-workspace cursor blocked with 410 (no info leak)" "410" "$CROSS_CODE"
+
+# ---------- Results ----------
+echo ""
+echo "=== Results: $PASS passed, $FAIL failed ==="
+[ "$FAIL" -eq 0 ]
@@ -0,0 +1,348 @@
+#!/bin/bash
+# test_staging_external_runtime.sh — E2E regression for the
+# external-runtime workspace lifecycle on a real staging tenant.
+#
+# Why this test exists: the four/five sites that write 'awaiting_agent'
+# / 'hibernating' to workspaces.status had been silently failing in
+# production for five days (see migration 046) before a static drift
+# gate caught the enum gap. Unit tests passed because sqlmock matched
+# the SQL by regex but didn't enforce the live enum constraint, and
+# every existing E2E exercised hermes (not external) so the silent
+# failures never surfaced. This test pins the four awaiting_agent
+# transitions in real Postgres on a real staging tenant.
+#
+# Verification path:
+#   1. Provision a fresh tenant (test_staging_full_saas.sh harness shape).
+#   2. Create an external-runtime workspace with NO URL → assert
+#      response status == 'awaiting_agent' AND GET on the workspace
+#      returns the same. (Pre-fix the row stuck on 'provisioning'
+#      because the UPDATE in workspace.go:333 silently failed.)
+#   3. Register a fake URL via /registry/register → assert transition
+#      to 'online'. (Pre-fix this branch worked because it writes
+#      'online' which IS in the enum.)
+#   4. Stop heartbeating; wait past REMOTE_LIVENESS_STALE_AFTER (90s
+#      default) + a sweep interval → assert transition back to
+#      'awaiting_agent'. (Pre-fix the sweep UPDATE failed silently and
+#      the workspace stuck on 'online' indefinitely.)
+#
+# Hibernation is intentionally NOT covered here — it has its own timing
+# model (idle threshold) and warrants a separate harness.
+#
+# Required env (mirrors test_staging_full_saas.sh):
+#   MOLECULE_CP_URL          default: https://staging-api.moleculesai.app
+#   MOLECULE_ADMIN_TOKEN     CP admin bearer (Railway CP_ADMIN_API_TOKEN)
+#
+# Optional env:
+#   E2E_PROVISION_TIMEOUT_SECS  default 900 (15 min cold EC2 budget)
+#   E2E_KEEP_ORG                1 → skip teardown (debugging only)
+#   E2E_RUN_ID                  Slug suffix; CI: ${GITHUB_RUN_ID}
+#   E2E_STALE_WAIT_SECS         default 180 (90s window + 90s buffer)
+#   E2E_INTENTIONAL_FAILURE     1 → break a step on purpose to verify
+#                               the EXIT trap still tears down (mirrors
+#                               the full-saas harness's safety net).
+#
+# Exit codes: 0 happy, 1 generic, 2 missing env, 3 provision timeout,
+# 4 teardown leak.
+
+set -euo pipefail
+
+CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}"
+ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}"
+PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-900}"
+RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}"
+STALE_WAIT_SECS="${E2E_STALE_WAIT_SECS:-180}"
+
+SLUG="e2e-ext-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
+SLUG=$(echo "$SLUG" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-' | head -c 32)
+
+log()  { echo "[$(date +%H:%M:%S)] $*"; }
+fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; }
+ok()   { echo "[$(date +%H:%M:%S)] ✅ $*"; }
+
+CURL_COMMON=(-sS --fail-with-body --max-time 30)
+
+# ─── cleanup trap (mirrors full-saas) ────────────────────────────────────
+CLEANUP_DONE=0
+cleanup_org() {
+  local entry_rc=$?
+  if [ "$CLEANUP_DONE" = "1" ]; then return 0; fi
+  CLEANUP_DONE=1
+
+  if [ "${E2E_KEEP_ORG:-0}" = "1" ]; then
+    log "E2E_KEEP_ORG=1 → leaving $SLUG behind for inspection"
+    return 0
+  fi
+
+  log "Cleanup: deleting tenant $SLUG..."
+  curl "${CURL_COMMON[@]}" --max-time 120 -X DELETE "$CP_URL/cp/admin/tenants/$SLUG" \
+    -H "Authorization: Bearer $ADMIN_TOKEN" \
+    -H "Content-Type: application/json" \
+    -d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1 \
+    && ok "Teardown request accepted" \
+    || log "Teardown returned non-2xx (may already be gone)"
+
+  local leak_count=1 elapsed=0
+  while [ "$elapsed" -lt 60 ]; do
+    leak_count=$(curl "${CURL_COMMON[@]}" "$CP_URL/cp/admin/orgs" \
+      -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
+      | python3 -c "import json,sys; d=json.load(sys.stdin); print(sum(1 for o in d.get('orgs', []) if o.get('slug')=='$SLUG' and o.get('status') != 'purged'))" \
+      2>/dev/null || echo 1)
+    [ "$leak_count" = "0" ] && break
+    sleep 5
+    elapsed=$((elapsed + 5))
+  done
+
+  if [ "$leak_count" != "0" ]; then
+    echo "⚠️  LEAK: org $SLUG still present post-teardown (count=$leak_count)" >&2
+    exit 4
+  fi
+  ok "Teardown clean — no orphan resources for $SLUG (${elapsed}s)"
+
+  case "$entry_rc" in
+    0|1|2|3|4) ;;
+    *) exit 1 ;;
+  esac
+}
+trap cleanup_org EXIT INT TERM
+
+# ─── 0. Preflight ───────────────────────────────────────────────────────
+log "═══════════════════════════════════════════════════════════════════"
+log " Staging external-runtime E2E (regression for migration 046)"
+log "   CP:    $CP_URL"
+log "   Slug:  $SLUG"
+log "   Stale: ${STALE_WAIT_SECS}s wait window"
+log "═══════════════════════════════════════════════════════════════════"
+
+curl "${CURL_COMMON[@]}" "$CP_URL/health" >/dev/null || fail "CP health check failed"
+ok "CP reachable"
+
+admin_call() {
+  local method="$1"; shift; local path="$1"; shift
+  curl "${CURL_COMMON[@]}" -X "$method" "$CP_URL$path" \
+    -H "Authorization: Bearer $ADMIN_TOKEN" \
+    -H "Content-Type: application/json" "$@"
+}
+
+# ─── 1. Create org ──────────────────────────────────────────────────────
+log "1/8 Creating org $SLUG..."
+CREATE_RESP=$(admin_call POST /cp/admin/orgs \
+  -d "{\"slug\":\"$SLUG\",\"name\":\"E2E ext $SLUG\",\"owner_user_id\":\"e2e-runner:$SLUG\"}")
+ORG_ID=$(echo "$CREATE_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))")
+[ -z "$ORG_ID" ] && fail "Org create response missing 'id'"
+ok "Org created (id=$ORG_ID)"
+
+# ─── 2. Wait for tenant provisioning ────────────────────────────────────
+# Terminal status from /cp/admin/orgs is 'running' (org_instances.status),
+# NOT 'ready' — same field the full-saas harness polls. 'failed' surfaces
+# diagnostic dump and aborts. See test_staging_full_saas.sh step 2 for
+# the field-bugfix history (2026-04-21, last_error path).
+log "2/8 Waiting for tenant (up to ${PROVISION_TIMEOUT_SECS}s)..."
+DEADLINE=$(( $(date +%s) + PROVISION_TIMEOUT_SECS ))
+LAST_STATUS=""
+while true; do
+  if [ "$(date +%s)" -gt "$DEADLINE" ]; then
+    fail "Tenant provisioning timed out (last: $LAST_STATUS)"
+  fi
+  LIST_JSON=$(admin_call GET /cp/admin/orgs 2>/dev/null || echo '{"orgs":[]}')
+  STATUS=$(echo "$LIST_JSON" | python3 -c "
+import json, sys
+d = json.load(sys.stdin)
+for o in d.get('orgs', []):
+    if o.get('slug') == '$SLUG':
+        print(o.get('instance_status', ''))
+        sys.exit(0)
+print('')
+" 2>/dev/null || echo "")
+  if [ "$STATUS" != "$LAST_STATUS" ]; then
+    log "   instance_status: $STATUS"
+    LAST_STATUS="$STATUS"
+  fi
+  case "$STATUS" in
+    running) break ;;
+    failed)
+      log "── DIAGNOSTIC BURST (step 2 — tenant provisioning failed) ──"
+      echo "$LIST_JSON" | python3 -c "
+import json, sys
+d = json.load(sys.stdin)
+for o in d.get('orgs', []):
+    if o.get('slug') == '$SLUG':
+        print(json.dumps(o, indent=2))
+        sys.exit(0)
+print('(no org row found for slug=$SLUG — DB drift?)')
+" 2>&1 | sed 's/^/  /'
+      log "── END DIAGNOSTIC ──"
+      fail "Tenant provisioning failed for $SLUG (see diagnostic above)"
+      ;;
+    *) sleep 15 ;;
+  esac
+done
+ok "Tenant provisioning complete"
+
+# Derive tenant URL the same way the full-saas harness does.
+CP_HOST=$(echo "$CP_URL" | sed -E 's#^https?://##; s#/.*$##')
+case "$CP_HOST" in
+  api.*)         DERIVED_DOMAIN="${CP_HOST#api.}" ;;
+  staging-api.*) DERIVED_DOMAIN="staging.${CP_HOST#staging-api.}" ;;
+  *)             DERIVED_DOMAIN="$CP_HOST" ;;
+esac
+TENANT_DOMAIN="${MOLECULE_TENANT_DOMAIN:-$DERIVED_DOMAIN}"
+TENANT_URL="https://$SLUG.$TENANT_DOMAIN"
+log "    TENANT_URL=$TENANT_URL"
+
+# ─── 3. Per-tenant admin token + TLS readiness ──────────────────────────
+log "3/8 Fetching per-tenant admin token..."
+TENANT_TOKEN_RESP=$(admin_call GET "/cp/admin/orgs/$SLUG/admin-token")
+TENANT_TOKEN=$(echo "$TENANT_TOKEN_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('admin_token',''))")
+[ -z "$TENANT_TOKEN" ] && fail "Could not retrieve per-tenant admin token"
+ok "Token retrieved (len=${#TENANT_TOKEN})"
+
+log "Waiting for tenant TLS / DNS..."
+TLS_DEADLINE=$(( $(date +%s) + 15 * 60 ))
+while true; do
+  if curl -sSfk --max-time 5 "$TENANT_URL/health" >/dev/null 2>&1; then break; fi
+  if [ "$(date +%s)" -gt "$TLS_DEADLINE" ]; then
+    fail "Tenant URL never responded 2xx on /health within 15min"
+  fi
+  sleep 5
+done
+ok "Tenant reachable"
+
+tenant_call() {
+  local method="$1"; shift; local path="$1"; shift
+  curl "${CURL_COMMON[@]}" -X "$method" "$TENANT_URL$path" \
+    -H "Authorization: Bearer $TENANT_TOKEN" \
+    -H "X-Molecule-Org-Id: $ORG_ID" \
+    "$@"
+}
+
+# ─── 4. Create external workspace (no URL) ──────────────────────────────
+# This is the FIRST silent-failure path (workspace.go:333). Pre-migration
+# 046, the response would say status=awaiting_agent but the row stuck
+# on whatever the create handler set first (typically 'provisioning')
+# because the follow-up UPDATE failed the enum cast.
+log "4/8 Creating external workspace (no URL — exercises workspace.go:333)..."
+WS_CREATE_RESP=$(tenant_call POST /workspaces \
+  -d '{"name":"ext-e2e","runtime":"external","external":true}')
+
+WS_ID=$(echo "$WS_CREATE_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('id',''))")
+WS_RESP_STATUS=$(echo "$WS_CREATE_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
+WS_AUTH_TOKEN=$(echo "$WS_CREATE_RESP" | python3 -c "
+import json,sys
+try:
+    d = json.load(sys.stdin)
+    conn = d.get('connection') or {}
+    print(conn.get('auth_token','') or d.get('auth_token',''))
+except Exception:
+    print('')
+")
+[ -z "$WS_ID" ] && fail "Workspace create missing id: $WS_CREATE_RESP"
+[ "$WS_RESP_STATUS" != "awaiting_agent" ] && fail "Expected response status=awaiting_agent, got $WS_RESP_STATUS"
+ok "Workspace created (id=$WS_ID, response status=awaiting_agent)"
+
+# This GET is the proof that the row actually has the value (not just
+# the response body lying). Pre-migration-046 the UPDATE would have
+# silently failed and this would return whatever 'provisioning' the
+# initial INSERT left. Post-fix it must be 'awaiting_agent'.
+log "    Verifying DB row..."
+GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
+DB_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
+[ "$DB_STATUS" != "awaiting_agent" ] && fail "DB row status=$DB_STATUS (expected awaiting_agent — migration 046 likely not applied)"
+ok "DB row stored as awaiting_agent (proof migration 046 applied)"
+
+# ─── 5. Register the workspace (transitions to online) ──────────────────
+# Pre-fix this path was actually fine because it writes 'online', a value
+# already in the enum. We exercise it anyway because the registration
+# implicitly walks resolveDeliveryMode (registry.go:resolveDeliveryMode),
+# which DOES read runtime + apply the new poll-default introduced by
+# PR #2382.
+log "5/8 Registering workspace via /registry/register..."
+[ -z "$WS_AUTH_TOKEN" ] && fail "No workspace auth token returned — register impossible"
+# Payload contract (workspace-server/internal/models/workspace.go RegisterPayload):
+#   id            — required, the workspace UUID (NOT "workspace_id" — that's the
+#                   heartbeat payload field; mixing them yields a 400 from
+#                   ShouldBindJSON because `id` has binding:"required").
+#   agent_card    — required (binding:"required"); minimal valid card is name+skills.
+#   delivery_mode — set explicitly to "poll" so url validation is skipped
+#                   regardless of whether the deployed image has the
+#                   runtime=external→poll default from PR #2382. Observed
+#                   2026-04-30 17:18Z: a freshly-provisioned staging tenant
+#                   was running an older workspace-server :latest image
+#                   that lacked resolveDeliveryMode's external→poll branch,
+#                   so the implicit default was push and validateAgentURL
+#                   400'd on example.invalid. Asserting on the implicit
+#                   default makes the *register call* itself fragile to
+#                   image-tag drift on the fleet — verify the default
+#                   separately (step 5b assertion) without depending on it
+#                   here.
+#   url           — accepted but not dispatched-to in poll mode, so
+#                   example.invalid is a valid sentinel.
+REGISTER_BODY=$(printf '{"id":"%s","url":"https://example.invalid:443","delivery_mode":"poll","agent_card":{"name":"e2e-ext","skills":[{"id":"echo","name":"Echo"}]}}' "$WS_ID")
+# Disable --fail-with-body for this one call so a 4xx surfaces the response
+# body (the bare CURL_COMMON would `set -e`-kill before we could log it).
+REGISTER_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \
+  -H "Authorization: Bearer $WS_AUTH_TOKEN" \
+  -H "X-Molecule-Org-Id: $ORG_ID" \
+  -H "Content-Type: application/json" \
+  -d "$REGISTER_BODY") || true
+log "    register response: $(echo "$REGISTER_RESP" | head -c 300)"
+echo "$REGISTER_RESP" | grep -q "HTTP_CODE=200" || fail "register returned non-200 — see body above"
+
+GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
+ONLINE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
+[ "$ONLINE_STATUS" != "online" ] && fail "Expected online after register, got $ONLINE_STATUS"
+ok "Workspace transitioned to online"
+
+# Confirm the register handler echoed back delivery_mode=poll. We read
+# this from the register RESPONSE, not the workspace GET response, because
+# the GET handler's SELECT (workspace.go:597) doesn't fetch delivery_mode
+# — its column list pre-dates the delivery_mode column from #2339 PR 1.
+# Surfacing delivery_mode in GET is tracked separately; not gating on it
+# here keeps this test focused on the awaiting_agent transitions.
+REGISTER_BODY_JSON=$(echo "$REGISTER_RESP" | head -n 1)
+REGISTER_DELIVERY_MODE=$(echo "$REGISTER_BODY_JSON" | python3 -c "import json,sys; print(json.load(sys.stdin).get('delivery_mode',''))")
+if [ "$REGISTER_DELIVERY_MODE" = "poll" ]; then
+  ok "delivery_mode=poll (register response echoed explicit value)"
+else
+  fail "Register response delivery_mode=$REGISTER_DELIVERY_MODE (expected poll). Body: $REGISTER_BODY_JSON"
+fi
+
+# ─── 6. Stop heartbeating; wait past REMOTE_LIVENESS_STALE_AFTER ────────
+# This is the SECOND silent-failure path (registry/healthsweep.go's
+# sweepStaleRemoteWorkspaces). Pre-migration-046 the heartbeat-staleness
+# UPDATE silently failed and the workspace stuck on 'online' forever
+# even though no agent was alive. We wait the full window + a sweep
+# interval and assert the row transitions back to 'awaiting_agent'.
+log "6/8 Waiting ${STALE_WAIT_SECS}s for heartbeat-staleness sweep (no heartbeat sent)..."
+sleep "$STALE_WAIT_SECS"
+
+GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
+STALE_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
+[ "$STALE_STATUS" != "awaiting_agent" ] && \
+  fail "After ${STALE_WAIT_SECS}s with no heartbeat, expected status=awaiting_agent (sweep transition), got $STALE_STATUS — migration 046 likely not applied OR sweep not running"
+ok "Heartbeat-staleness sweep transitioned online → awaiting_agent (proof healthsweep.go fix working)"
+
+# ─── 7. Re-register and confirm we can come back online ─────────────────
+# This proves the awaiting_agent state is recoverable (re-registrable),
+# which is the whole point of using it instead of 'offline'.
+log "7/8 Re-registering after stale → confirming recovery to online..."
+# Same payload contract as step 5 (id + agent_card both required). See note
+# there for why workspace_id would 400.
+REREG_RESP=$(curl -sS --max-time 30 -w "\nHTTP_CODE=%{http_code}" -X POST "$TENANT_URL/registry/register" \
+  -H "Authorization: Bearer $WS_AUTH_TOKEN" \
+  -H "X-Molecule-Org-Id: $ORG_ID" \
+  -H "Content-Type: application/json" \
+  -d "$REGISTER_BODY") || true
+log "    re-register response: $(echo "$REREG_RESP" | head -c 300)"
+echo "$REREG_RESP" | grep -q "HTTP_CODE=200" || fail "re-register returned non-200 — see body above"
+
+GET_RESP=$(tenant_call GET "/workspaces/$WS_ID")
+RECOVERED_STATUS=$(echo "$GET_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))")
+[ "$RECOVERED_STATUS" != "online" ] && \
+  fail "Expected re-register to return workspace to online, got $RECOVERED_STATUS"
+ok "Re-register succeeded — awaiting_agent → online (operator-recoverable)"
+
+# ─── 8. Done — cleanup runs in the EXIT trap ───────────────────────────
+log "8/8 All four awaiting_agent transitions verified."
+log "═══════════════════════════════════════════════════════════════════"
+ok "External-runtime E2E PASSED on $SLUG"
+log "═══════════════════════════════════════════════════════════════════"
@@ -0,0 +1,2 @@
+# Harness ephemeral state. Re-generated by ./seed.sh on every boot.
+.seed.env
@@ -0,0 +1,156 @@
+# Production-shape local harness
+
+The harness brings up the SaaS tenant topology on localhost using the
+same `Dockerfile.tenant` image that ships to production. Tests target
+the cf-proxy on `http://localhost:8080` and pass the tenant identity
+via a `Host:` header — exactly the way production CF tunnel routes by
+Host header. The cf-proxy nginx then rewrites headers and proxies to
+the right tenant container, exercising the SAME code path a real tenant
+takes including TenantGuard middleware, the `/cp/*` reverse proxy, the
+canvas reverse proxy, and a Cloudflare-tunnel-shape header rewrite
+layer.
+
+Since Phase 2 the harness runs **two tenants in parallel** (alpha and
+beta) with their own Postgres instance and distinct
+`MOLECULE_ORG_ID`s — same shape as production, where each tenant gets
+its own EC2 + DB. This is what cross-tenant isolation replays need to
+prove TenantGuard actually 404s a misrouted request.
+
+`tests/harness/_curl.sh` is the helper sourced by every replay. Per
+tenant: `curl_alpha_anon` / `curl_alpha_admin` / `curl_beta_anon` /
+`curl_beta_admin` / `psql_exec_alpha` / `psql_exec_beta`. Plus
+deliberately-wrong cross-tenant negative-test helpers for isolation
+replays: `curl_alpha_creds_at_beta` / `curl_beta_creds_at_alpha`.
+Legacy single-tenant aliases (`curl_anon`, `curl_admin`, `psql_exec`)
+default to alpha so pre-Phase-2 replays continue to work. New replays
+should source `_curl.sh` rather than rolling their own curl.
+
+## Why this exists
+
+Local `go run ./cmd/server` skips:
+- `TenantGuard` middleware (no `MOLECULE_ORG_ID` env)
+- `/cp/*` reverse proxy mount (no `CP_UPSTREAM_URL` env)
+- `CANVAS_PROXY_URL` (canvas runs separately on `:3000`)
+- Header rewrites that production's CF tunnel + LB perform
+- Strict-auth mode (no live `ADMIN_TOKEN`)
+
+Bugs that survive `go run` and ship to production almost always live
+in one of those layers. The harness activates ALL of them.
+
+## Topology
+
+```
+                                      client
+                                        ↓
+                                     cf-proxy            nginx, mirrors CF tunnel header rewrites
+                                        ↓ (routes by Host header)
+              ┌─────────────────────────┴─────────────────────────┐
+              ↓                                                   ↓
+        tenant-alpha                                        tenant-beta
+        Host: harness-tenant-alpha.localhost                Host: harness-tenant-beta.localhost
+        MOLECULE_ORG_ID=harness-org-alpha                   MOLECULE_ORG_ID=harness-org-beta
+              ↓                                                   ↓
+        postgres-alpha                                      postgres-beta
+              ↓                                                   ↓
+              └─────────────────────────┬─────────────────────────┘
+                                        ↓
+                             cp-stub + redis (shared)
+```
+
+Each tenant runs the production `Dockerfile.tenant` image with its own
+admin token, org id, and Postgres instance — identical isolation
+boundaries to production where each tenant gets a dedicated EC2 + DB.
+cp-stub and redis are shared because they model the per-region
+multi-tenant CP and a single Redis cluster.
+
+## Quickstart
+
+```bash
+cd tests/harness
+./up.sh                 # builds + starts all services (both tenants)
+./seed.sh               # registers parent+child workspaces in BOTH tenants
+./replays/tenant-isolation.sh
+./replays/per-tenant-independence.sh
+./down.sh               # tear down + remove volumes
+```
+
+To run every replay in one shot (boot, seed, run-all, teardown):
+
+```bash
+cd tests/harness
+./run-all-replays.sh    # full lifecycle; non-zero exit if any replay fails
+KEEP_UP=1 ./run-all-replays.sh   # leave harness up for debugging
+REBUILD=1 ./run-all-replays.sh   # rebuild images before booting
+```
+
+No `/etc/hosts` edit required — replays use the cf-proxy's loopback
+port and pass the per-tenant `Host:` header (`_curl.sh` handles this
+automatically). This matches how production CF tunnel routes: the URL
+is the public CF endpoint, the Host header carries the per-tenant
+identity. Quick check:
+
+```bash
+curl -H "Host: harness-tenant-alpha.localhost" http://localhost:8080/health
+curl -H "Host: harness-tenant-beta.localhost"  http://localhost:8080/health
+```
+
+(If you have a legacy `/etc/hosts` entry from older docs, it still
+works — `BASE`, `ALPHA_HOST`, `BETA_HOST` all honor env-var overrides.
+The legacy `harness-tenant.localhost` host alias maps to alpha.)
+
+## Replay scripts
+
+Each replay script reproduces a real bug class against the harness so
+fixes can be verified locally before deploy. The bar for adding a
+replay is "this bug shipped to production despite local E2E being
+green" — the script becomes the regression gate that closes that gap.
+
+| Replay | Closes | What it proves |
+|--------|--------|----------------|
+| `peer-discovery-404.sh` | #2397 | tool_list_peers surfaces the actual reason instead of "may be isolated" |
+| `buildinfo-stale-image.sh` | #2395 | GIT_SHA reaches the binary; verify-step comparison logic works |
+| `chat-history.sh` | #2472 + #2474 + #2476 | `peer_id` filter (incl. OR over source/target) + `before_ts` paging + UUID/RFC3339 trust boundary on the activity route |
+| `channel-envelope-trust-boundary.sh` | #2471 + #2481 | published wheel scrubs malformed `peer_id` from the channel envelope and from `agent_card_url` (path-traversal + XML-attr injection) |
+| `tenant-isolation.sh` | Phase 2 | TenantGuard 404s any request whose `X-Molecule-Org-Id` doesn't match the container's `MOLECULE_ORG_ID` (covers cross-tenant routing bug + allowlist drift); per-tenant `/workspaces` listings stay partitioned |
+| `per-tenant-independence.sh` | Phase 2 | parallel A2A workflows in both tenants don't bleed into each other's `activity_logs` / `workspaces`, including under a concurrent INSERT race (catches lib/pq prepared-statement cache collision + shared-pool poisoning) |
+
+To add a new replay:
+1. Drop a script under `replays/` named after the issue.
+2. The script's purpose: reproduce the production failure mode against
+   the harness, then assert the fix is present. PASS criterion is the
+   post-fix behavior.
+3. The `run-all-replays.sh` runner picks up every `replays/*.sh` script
+   automatically — no per-replay registration needed.
+
+## Extending the cp-stub
+
+`cp-stub/main.go` serves the minimum surface for the existing replays
+plus a catch-all that returns 501 + a clear message when the tenant
+asks for a route the stub doesn't implement. To add a new CP route:
+
+1. Add a `mux.HandleFunc` in `cp-stub/main.go` for the path.
+2. Return the same wire shape the real CP returns. The contract is
+   "wire compatibility with the staging CP at the time of writing" —
+   document it with a comment pointing at the real CP handler.
+3. Add a replay script that exercises the path.
+
+## What the harness does NOT cover
+
+- Real TLS / cert handling (CF terminates TLS in production; harness is
+  HTTP-only).
+- Cloudflare API edge cases (rate limits, DNS propagation timing).
+- Real EC2 / SSM / EBS behavior (image-cache replay simulates the
+  outcome but not the AWS API surface).
+- Cross-region or multi-AZ topology.
+- Real production data scale.
+
+These are intentional Phase 1 limits. If a bug class hits one of these
+gaps, escalate to staging E2E rather than expanding the harness past
+its mandate of "exercise the tenant binary in production-shape topology."
+
+## Roadmap
+
+- **Phase 1 (shipped):** harness + cp-stub + cf-proxy + 4 replays + `run-all-replays.sh` runner. No-sudo `Host`-header path via `_curl.sh`. Per-replay psql seeding for tests that need DB-side fixtures.
+- **Phase 2 (shipped):** multi-tenant — `tenant-alpha` + `tenant-beta` with their own Postgres instances and distinct `MOLECULE_ORG_ID`s; cf-proxy nginx routes by Host header (prod CF tunnel parity); `seed.sh` registers parent+child workspaces in both tenants; `_curl.sh` exposes per-tenant + cross-tenant-negative helpers; new replays cover TenantGuard isolation (`tenant-isolation.sh`) and per-tenant independence under concurrent load (`per-tenant-independence.sh`). `harness-replays.yml` runs `run-all-replays.sh` as a required check on every PR touching `workspace-server/**`, `canvas/**`, `tests/harness/**`, or the workflow itself.
+- **Phase 3:** replace `cp-stub/` with the real `molecule-controlplane` Docker build. Add a config-coherence lint that diffs harness env list against production CP's env list and fails CI on drift. Convert `tests/e2e/test_api.sh` to target the harness instead of localhost.
+- **Phase 4 (long-term):** Miniflare in front of cf-proxy for real CF emulation (WAF, BotID, rate-limit, cf-tunnel headers). LocalStack for the EC2 provisioner. Anonymized prod-traffic recording/replay for SaaS-scale regression detection.
@@ -0,0 +1,159 @@
+# Sourceable helper for harness replays. Centralises the
+# curl-against-cf-proxy pattern so scripts don't depend on /etc/hosts.
+#
+# Production CF tunnel routes by Host header, not by DNS — the request
+# URL is to a public CF endpoint and the Host header carries the
+# per-tenant identity. We replay the same shape locally:
+#
+#   curl -H "Host: harness-tenant-alpha.localhost" http://localhost:8080/health
+#
+# This matches what cf-proxy/nginx.conf already routes (`server_name
+# *.localhost` + `map $host $tenant_upstream`) and avoids the macOS
+# /etc/hosts requirement that previously gated the harness behind a
+# sudo step.
+#
+# Multi-tenant since Phase 2: alpha and beta tenants run in parallel.
+# `curl_alpha_admin` and `curl_beta_admin` target each tenant's URL
+# with that tenant's ADMIN_TOKEN + MOLECULE_ORG_ID. The legacy
+# `curl_admin` is aliased to alpha for backwards compat with the
+# pre-Phase-2 single-tenant replays.
+#
+# Usage:
+#   HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+#   source "$HERE/../_curl.sh"     # from replays/<name>.sh
+#   curl_alpha_admin "$BASE/health"
+#   curl_beta_admin  "$BASE/health"
+
+# Bind to the cf-proxy's loopback port — the proxy front-doors every
+# tenant and routes by Host header, exactly like production's CF tunnel.
+: "${BASE:=http://localhost:8080}"
+
+# Per-tenant identity. Each pair must match the corresponding tenant
+# container's environment in compose.yml or auth/TenantGuard will fail
+# in non-obvious ways (401 vs 403 vs silent route to wrong tenant).
+: "${ALPHA_HOST:=harness-tenant-alpha.localhost}"
+: "${ALPHA_ADMIN_TOKEN:=harness-admin-token-alpha}"
+: "${ALPHA_ORG_ID:=harness-org-alpha}"
+
+: "${BETA_HOST:=harness-tenant-beta.localhost}"
+: "${BETA_ADMIN_TOKEN:=harness-admin-token-beta}"
+: "${BETA_ORG_ID:=harness-org-beta}"
+
+# Legacy single-tenant aliases — pre-Phase-2 replays use these without
+# knowing the topology grew. They map to alpha. New replays should use
+# the explicit alpha/beta variants for clarity.
+: "${TENANT_HOST:=$ALPHA_HOST}"
+: "${ADMIN_TOKEN:=$ALPHA_ADMIN_TOKEN}"
+: "${ORG_ID:=$ALPHA_ORG_ID}"
+
+# ─── Anonymous (no auth) ──────────────────────────────────────────────
+
+# Anonymous request to alpha. Use for /health, /buildinfo, etc.
+curl_alpha_anon() {
+    curl -sS -H "Host: ${ALPHA_HOST}" "$@"
+}
+
+# Anonymous request to beta.
+curl_beta_anon() {
+    curl -sS -H "Host: ${BETA_HOST}" "$@"
+}
+
+# Legacy alias for single-tenant replays.
+curl_anon() {
+    curl -sS -H "Host: ${TENANT_HOST}" "$@"
+}
+
+# ─── Admin-token requests ─────────────────────────────────────────────
+
+# Admin-token request to alpha tenant. SaaS-shape auth: bearer token,
+# tenant org header (TenantGuard activates), JSON content type.
+curl_alpha_admin() {
+    curl -sS \
+        -H "Host: ${ALPHA_HOST}" \
+        -H "Authorization: Bearer ${ALPHA_ADMIN_TOKEN}" \
+        -H "X-Molecule-Org-Id: ${ALPHA_ORG_ID}" \
+        -H "Content-Type: application/json" \
+        "$@"
+}
+
+# Admin-token request to beta tenant.
+curl_beta_admin() {
+    curl -sS \
+        -H "Host: ${BETA_HOST}" \
+        -H "Authorization: Bearer ${BETA_ADMIN_TOKEN}" \
+        -H "X-Molecule-Org-Id: ${BETA_ORG_ID}" \
+        -H "Content-Type: application/json" \
+        "$@"
+}
+
+# Legacy alias.
+curl_admin() {
+    curl_alpha_admin "$@"
+}
+
+# ─── Cross-tenant negative-test helpers ───────────────────────────────
+# These exist to MAKE WRONG calls — replays use them to assert
+# TenantGuard rejects them. Names spell out what's mismatched.
+
+# alpha bearer + alpha org, but talking to beta's URL. TenantGuard
+# should reject because the org header doesn't match beta's MOLECULE_ORG_ID.
+curl_alpha_creds_at_beta() {
+    curl -sS \
+        -H "Host: ${BETA_HOST}" \
+        -H "Authorization: Bearer ${ALPHA_ADMIN_TOKEN}" \
+        -H "X-Molecule-Org-Id: ${ALPHA_ORG_ID}" \
+        -H "Content-Type: application/json" \
+        "$@"
+}
+
+# beta bearer + beta org, but talking to alpha's URL.
+curl_beta_creds_at_alpha() {
+    curl -sS \
+        -H "Host: ${ALPHA_HOST}" \
+        -H "Authorization: Bearer ${BETA_ADMIN_TOKEN}" \
+        -H "X-Molecule-Org-Id: ${BETA_ORG_ID}" \
+        -H "Content-Type: application/json" \
+        "$@"
+}
+
+# ─── Workspace-scoped (per-workspace bearer) ──────────────────────────
+
+# Workspace-scoped request to alpha — uses a per-workspace bearer
+# minted from /admin/workspaces/:id/test-token. Caller must export
+# WORKSPACE_TOKEN.
+curl_workspace() {
+    : "${WORKSPACE_TOKEN:?WORKSPACE_TOKEN must be set — mint via /admin/workspaces/:id/test-token}"
+    curl -sS \
+        -H "Host: ${TENANT_HOST}" \
+        -H "Authorization: Bearer ${WORKSPACE_TOKEN}" \
+        -H "X-Molecule-Org-Id: ${ORG_ID}" \
+        -H "Content-Type: application/json" \
+        "$@"
+}
+
+# ─── Postgres exec (per-tenant) ───────────────────────────────────────
+
+# Direct postgres exec — for replays that need to seed activity_logs
+# rows or read DB state that has no public HTTP route.
+#
+# SECRETS_ENCRYPTION_KEY placeholder lets compose validate without
+# requiring up.sh's per-run key (exec doesn't actually use it but
+# compose validates the file).
+psql_exec_alpha() {
+    SECRETS_ENCRYPTION_KEY="${SECRETS_ENCRYPTION_KEY:-exec-placeholder}" \
+    docker compose -f "${HARNESS_COMPOSE:-$(dirname "${BASH_SOURCE[0]}")/compose.yml}" \
+        exec -T postgres-alpha \
+        psql -U harness -d molecule -At "$@"
+}
+
+psql_exec_beta() {
+    SECRETS_ENCRYPTION_KEY="${SECRETS_ENCRYPTION_KEY:-exec-placeholder}" \
+    docker compose -f "${HARNESS_COMPOSE:-$(dirname "${BASH_SOURCE[0]}")/compose.yml}" \
+        exec -T postgres-beta \
+        psql -U harness -d molecule -At "$@"
+}
+
+# Legacy alias — single-tenant replays default to alpha's DB.
+psql_exec() {
+    psql_exec_alpha "$@"
+}
@@ -0,0 +1,97 @@
+# cf-proxy — Cloudflare-tunnel-shape reverse proxy for the local harness.
+#
+# Production path: agent → CF tunnel → AWS LB → tenant container.
+# This config replays the same header rewrites the CF tunnel does so
+# the tenant sees the same Host + X-Forwarded-* it would in production.
+#
+# Multi-tenant: nginx routes by Host header to the right tenant
+# container — exactly the same way the production CF tunnel does
+# (URL is the public CF endpoint, Host carries the tenant identity).
+#
+# How tests reach it (no /etc/hosts required):
+#   curl -H 'Host: harness-tenant-alpha.localhost' http://localhost:8080/health
+#   curl -H 'Host: harness-tenant-beta.localhost'  http://localhost:8080/health
+#
+# Backwards-compat: harness-tenant.localhost (no -alpha/-beta suffix) maps
+# to alpha for legacy single-tenant replays.
+
+worker_processes 1;
+events { worker_connections 256; }
+
+http {
+    # Docker's embedded DNS at 127.0.0.11. Required because the
+    # `proxy_pass http://$tenant_upstream:8080` below uses a variable —
+    # nginx needs an explicit resolver to do per-request DNS lookups
+    # (literal hostnames are resolved once at startup, variables are
+    # resolved per-request). Without this, nginx fails closed with
+    # "no resolver defined" + 502.
+    #
+    # `valid=30s` caps cache life so a tenant container restart picks
+    # up a new IP within 30 seconds. ipv6=off skips AAAA lookups that
+    # Docker DNS doesn't always serve cleanly.
+    resolver 127.0.0.11 valid=30s ipv6=off;
+
+    # Reusable proxy block so each tenant server only carries the
+    # upstream-pointer + its identity-specific tweaks. Keeping the
+    # header rewrites + buffering settings centralised prevents drift
+    # between alpha and beta as the harness grows.
+    map $host $tenant_upstream {
+        default                            tenant-alpha;
+        harness-tenant.localhost           tenant-alpha;
+        harness-tenant-alpha.localhost     tenant-alpha;
+        harness-tenant-beta.localhost      tenant-beta;
+    }
+
+    server {
+        listen 8080 default_server;
+
+        # Reject Host headers we don't recognise — without this, an
+        # unknown Host would silently route to the default tenant and
+        # mask cross-tenant routing bugs in test output.
+        server_name harness-tenant.localhost
+                    harness-tenant-alpha.localhost
+                    harness-tenant-beta.localhost
+                    localhost;
+
+        # Cap upload at 50MB to mirror the staging tenant nginx limit;
+        # chat upload tests will fail closed if the platform handler
+        # ever silently expands its limit (catches the failure mode
+        # opposite of the chat-files lazy-heal incident).
+        client_max_body_size 50m;
+
+        location / {
+            # The map above resolves $tenant_upstream to the right
+            # container based on the Host header — production CF tunnel
+            # behavior in one line.
+            proxy_pass http://$tenant_upstream:8080;
+
+            # Header parity with CF tunnel + AWS LB. Production CF sets
+            # X-Forwarded-Proto=https; we keep http here because TLS
+            # termination in compose is unnecessary for testing the
+            # tenant logic — TLS is a CF concern, not a tenant bug
+            # surface. If TLS-specific bugs ever bite, add cert-manager
+            # + listen 8443 ssl here.
+            proxy_set_header Host              $host;
+            proxy_set_header X-Real-IP         $remote_addr;
+            proxy_set_header X-Forwarded-For   $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Host  $host;
+            proxy_set_header X-Forwarded-Proto $scheme;
+
+            # Streamable HTTP / SSE / WebSocket — the tenant exposes /ws
+            # and /events/stream + MCP /mcp/stream. Disabling buffering
+            # reproduces CF tunnel's pass-through streaming semantics
+            # (CF tunnel = no buffering by default; nginx default IS
+            # buffering, which would mask issue #2397-class streaming
+            # bugs by accumulating output until the client disconnects).
+            proxy_buffering         off;
+            proxy_request_buffering off;
+            proxy_http_version      1.1;
+            proxy_set_header        Connection "";
+
+            # Read timeout — CF tunnel default is 100s. Setting this to
+            # the same value catches "long agent run finishes after the
+            # proxy already closed the upstream" failure mode.
+            proxy_read_timeout      100s;
+        }
+    }
+}
@@ -0,0 +1,173 @@
+# Production-shape harness for local E2E. Multi-tenant.
+#
+# Reproduces the SaaS tenant topology on localhost using the SAME
+# images that ship to production:
+#
+#   client → cf-proxy (nginx, mimics CF tunnel headers, routes by Host)
+#          ├─ Host: harness-tenant-alpha.localhost → tenant-alpha
+#          │   ↓ (CP_UPSTREAM_URL=http://cp-stub:9090)
+#          │   tenant-alpha (workspace-server/Dockerfile.tenant)
+#          │   ↓
+#          │   postgres-alpha (per-tenant DB, matches prod)
+#          ├─ Host: harness-tenant-beta.localhost  → tenant-beta
+#          │   ↓
+#          │   tenant-beta + postgres-beta
+#          └─ cp-stub + redis (shared infra; CP is Railway-singleton in prod,
+#                              redis is shared cluster)
+#
+# The two-tenant topology catches:
+#   - TenantGuard cross-tenant escape (alpha-org token shouldn't see
+#     beta-tenant data even with a valid bearer)
+#   - cf-proxy Host-header routing correctness
+#   - Per-tenant DB isolation (workspaces table, activity_logs)
+#   - Concurrent multi-tenant operation (no shared mutable state)
+#
+# Quickstart (no /etc/hosts edits — see README):
+#   cd tests/harness && ./up.sh && ./seed.sh
+#   ./replays/peer-discovery-404.sh
+#   ./run-all-replays.sh
+#
+# Env config:
+#   GIT_SHA — passed to BOTH tenant builds for /buildinfo verification.
+#   CP_STUB_PEERS_MODE — peers failure mode for replay scripts.
+
+services:
+  # ─── Shared infra (matches prod: CP is Railway-singleton, redis shared) ───
+  redis:
+    image: redis:7-alpine
+    networks: [harness-net]
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 2s
+      timeout: 5s
+      retries: 10
+
+  cp-stub:
+    build:
+      context: ./cp-stub
+    environment:
+      PORT: "9090"
+      CP_STUB_PEERS_MODE: "${CP_STUB_PEERS_MODE:-}"
+    networks: [harness-net]
+    healthcheck:
+      test: ["CMD-SHELL", "wget -q -O- http://localhost:9090/healthz || exit 1"]
+      interval: 2s
+      timeout: 5s
+      retries: 10
+
+  # ─── Tenant alpha: postgres + workspace-server ────────────────────────
+  postgres-alpha:
+    image: postgres:16-alpine
+    environment:
+      POSTGRES_USER: harness
+      POSTGRES_PASSWORD: harness
+      POSTGRES_DB: molecule
+    networks: [harness-net]
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U harness"]
+      interval: 2s
+      timeout: 5s
+      retries: 10
+
+  tenant-alpha:
+    build:
+      context: ../..
+      dockerfile: workspace-server/Dockerfile.tenant
+      args:
+        GIT_SHA: "${GIT_SHA:-harness}"
+    depends_on:
+      postgres-alpha:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+      cp-stub:
+        condition: service_healthy
+    environment:
+      DATABASE_URL: "postgres://harness:harness@postgres-alpha:5432/molecule?sslmode=disable"
+      REDIS_URL: "redis://redis:6379"
+      PORT: "8080"
+      PLATFORM_URL: "http://tenant-alpha:8080"
+      MOLECULE_ENV: "production"
+      SECRETS_ENCRYPTION_KEY: "${SECRETS_ENCRYPTION_KEY:?must be set — run via tests/harness/up.sh, which generates one per run}"
+      ADMIN_TOKEN: "harness-admin-token-alpha"
+      MOLECULE_ORG_ID: "harness-org-alpha"
+      CP_UPSTREAM_URL: "http://cp-stub:9090"
+      RATE_LIMIT: "1000"
+      CANVAS_PROXY_URL: "http://localhost:3000"
+    networks: [harness-net]
+    healthcheck:
+      test: ["CMD-SHELL", "wget -q -O- http://localhost:8080/health || exit 1"]
+      interval: 5s
+      timeout: 5s
+      retries: 20
+
+  # ─── Tenant beta: postgres + workspace-server (parallel to alpha) ─────
+  postgres-beta:
+    image: postgres:16-alpine
+    environment:
+      POSTGRES_USER: harness
+      POSTGRES_PASSWORD: harness
+      POSTGRES_DB: molecule
+    networks: [harness-net]
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U harness"]
+      interval: 2s
+      timeout: 5s
+      retries: 10
+
+  tenant-beta:
+    build:
+      context: ../..
+      dockerfile: workspace-server/Dockerfile.tenant
+      args:
+        GIT_SHA: "${GIT_SHA:-harness}"
+    depends_on:
+      postgres-beta:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+      cp-stub:
+        condition: service_healthy
+    environment:
+      DATABASE_URL: "postgres://harness:harness@postgres-beta:5432/molecule?sslmode=disable"
+      REDIS_URL: "redis://redis:6379"
+      PORT: "8080"
+      PLATFORM_URL: "http://tenant-beta:8080"
+      MOLECULE_ENV: "production"
+      SECRETS_ENCRYPTION_KEY: "${SECRETS_ENCRYPTION_KEY:?must be set — run via tests/harness/up.sh, which generates one per run}"
+      # Distinct ADMIN_TOKEN — replays use this to verify TenantGuard
+      # blocks alpha-token presented at beta's URL.
+      ADMIN_TOKEN: "harness-admin-token-beta"
+      MOLECULE_ORG_ID: "harness-org-beta"
+      CP_UPSTREAM_URL: "http://cp-stub:9090"
+      RATE_LIMIT: "1000"
+      CANVAS_PROXY_URL: "http://localhost:3000"
+    networks: [harness-net]
+    healthcheck:
+      test: ["CMD-SHELL", "wget -q -O- http://localhost:8080/health || exit 1"]
+      interval: 5s
+      timeout: 5s
+      retries: 20
+
+  # ─── cf-proxy: routes by Host to the right tenant container ───────────
+  # Production shape: same single CF tunnel front-doors every tenant
+  # subdomain — the Host header carries the tenant identity, not the
+  # routing destination. Local cf-proxy mirrors this exactly.
+  cf-proxy:
+    image: nginx:1.27-alpine
+    depends_on:
+      tenant-alpha:
+        condition: service_healthy
+      tenant-beta:
+        condition: service_healthy
+    volumes:
+      - ./cf-proxy/nginx.conf:/etc/nginx/nginx.conf:ro
+    # Bind to 127.0.0.1 only — hardcoded ADMIN_TOKENs make 0.0.0.0
+    # exposure unsafe even on a local network.
+    ports:
+      - "127.0.0.1:8080:8080"
+    networks: [harness-net]
+
+networks:
+  harness-net:
+    name: molecule-harness-net
@@ -0,0 +1,14 @@
+# cp-stub — minimal CP stand-in for the local production-shape harness.
+# See main.go for the rationale. Self-contained build, no module deps.
+
+FROM golang:1.25-alpine AS builder
+WORKDIR /src
+COPY go.mod ./
+COPY main.go ./
+RUN CGO_ENABLED=0 GOOS=linux go build -ldflags="-s -w" -o /cp-stub .
+
+FROM alpine:3.20
+RUN apk add --no-cache ca-certificates
+COPY --from=builder /cp-stub /cp-stub
+EXPOSE 9090
+ENTRYPOINT ["/cp-stub"]
@@ -0,0 +1,3 @@
+module github.com/Molecule-AI/molecule-monorepo/tests/harness/cp-stub
+
+go 1.25
@@ -0,0 +1,113 @@
+// cp-stub — minimal control-plane stand-in for the local production-shape harness.
+//
+// In production, the tenant Go server reverse-proxies /cp/* to the SaaS
+// control-plane (molecule-controlplane). This stub plays that role on
+// localhost so we can exercise the SAME code path the tenant takes in
+// production — `if cpURL := os.Getenv("CP_UPSTREAM_URL"); cpURL != ""`
+// in workspace-server/internal/router/router.go fires, the proxy mount
+// activates, and tests exercise the real tenant→CP wire.
+//
+// This is NOT a CP reimplementation. It serves the minimum surface to:
+//   1. Boot the tenant image without /cp/* breaking the canvas bootstrap.
+//   2. Replay specific bug classes (e.g. /cp/* returns 404, returns 5xx,
+//      returns malformed JSON) by toggling env vars.
+//
+// Scope is bounded by what the tenant + canvas actually call. Add new
+// handlers as new replay scenarios demand them. Drift from real CP is
+// tolerated because each handler is named for the exact path it serves —
+// when the real CP changes, the failing scenario tells us where to look.
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"log"
+	"net/http"
+	"os"
+	"sync/atomic"
+)
+
+// redeployFleetCalls tracks how many times /cp/admin/tenants/redeploy-fleet
+// was invoked. Replay scripts assert > 0 to confirm the workflow's redeploy
+// step actually reached the stub (catches misrouted CP_URL configs).
+var redeployFleetCalls atomic.Int64
+
+func main() {
+	mux := http.NewServeMux()
+
+	// /cp/auth/me — canvas calls this on bootstrap; minimal user record
+	// keeps the canvas from redirecting to login during local E2E.
+	mux.HandleFunc("/cp/auth/me", func(w http.ResponseWriter, r *http.Request) {
+		writeJSON(w, 200, map[string]any{
+			"id":     "harness-user",
+			"email":  "harness@local",
+			"org_id": "harness-org",
+			"roles":  []string{"admin"},
+		})
+	})
+
+	// /cp/admin/tenants/redeploy-fleet — exercised by the
+	// redeploy-tenants-on-{staging,main} workflow's local replay. Returns
+	// the same shape the real CP returns so the verify-fleet logic in CI
+	// can be tested without spinning up a real EC2 fleet.
+	mux.HandleFunc("/cp/admin/tenants/redeploy-fleet", func(w http.ResponseWriter, r *http.Request) {
+		redeployFleetCalls.Add(1)
+		writeJSON(w, 200, map[string]any{
+			"ok": true,
+			"results": []map[string]any{
+				{
+					"slug":          "harness-tenant",
+					"phase":         "redeploy",
+					"ssm_status":    "Success",
+					"ssm_exit_code": 0,
+					"healthz_ok":    true,
+				},
+			},
+		})
+	})
+
+	// __stub/state — expose stub state (counters) so replay scripts can
+	// assert the tenant actually reached us. Read-only.
+	mux.HandleFunc("/__stub/state", func(w http.ResponseWriter, r *http.Request) {
+		writeJSON(w, 200, map[string]any{
+			"redeploy_fleet_calls": redeployFleetCalls.Load(),
+		})
+	})
+
+	// Catch-all for any /cp/* the tenant proxies. Keeps the harness from
+	// crashing the canvas when a new CP route is added — surfaces a clear
+	// "stub doesn't implement X" error instead of opaque 502 from the
+	// reverse proxy.
+	mux.HandleFunc("/cp/", func(w http.ResponseWriter, r *http.Request) {
+		writeJSON(w, 501, map[string]any{
+			"error": "cp-stub: handler not implemented for " + r.Method + " " + r.URL.Path,
+			"hint":  "add a handler in tests/harness/cp-stub/main.go for the scenario you're testing",
+		})
+	})
+
+	// /healthz — readiness probe for compose's depends_on.
+	mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
+		writeJSON(w, 200, map[string]any{"status": "ok"})
+	})
+
+	addr := ":" + envOr("PORT", "9090")
+	log.Printf("cp-stub listening on %s", addr)
+	if err := http.ListenAndServe(addr, mux); err != nil {
+		log.Fatal(err)
+	}
+}
+
+func writeJSON(w http.ResponseWriter, code int, body any) {
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(code)
+	if err := json.NewEncoder(w).Encode(body); err != nil {
+		fmt.Fprintf(os.Stderr, "cp-stub: write json: %v\n", err)
+	}
+}
+
+func envOr(k, def string) string {
+	if v := os.Getenv(k); v != "" {
+		return v
+	}
+	return def
+}
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+# Tear down the harness and wipe per-tenant volumes.
+#
+# SECRETS_ENCRYPTION_KEY placeholder: docker compose validates the entire
+# compose file even for `down -v` (a destructive read-only operation that
+# doesn't read the env). up.sh generates a per-run key into its own
+# shell — this script runs in a fresh shell that wouldn't see it. Without
+# the placeholder, `compose down` exits non-zero before removing volumes,
+# silently leaking workspaces+activity_logs into the next ./up.sh + seed.sh
+# (verified 2026-05-02: tenant-isolation.sh F1/F2 saw 3× duplicate
+# alpha-parent + alpha-child rows accumulated across three prior boots).
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$HERE"
+SECRETS_ENCRYPTION_KEY="${SECRETS_ENCRYPTION_KEY:-down-placeholder}" \
+    docker compose -f compose.yml down -v --remove-orphans
+echo "[harness] down + volumes removed."
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+# Replay for issue #2395 — local proof that the /buildinfo verify gate
+# closes the SaaS deploy-chain blindness.
+#
+# Prior behavior: redeploy-fleet returned ssm_status=Success based on
+# the SSM RPC return code alone. EC2 tenants kept serving the cached
+# :latest digest because `docker compose up -d` is a no-op when the
+# tag hasn't been invalidated. ssm_status=Success was lying.
+#
+# This replay simulates that condition locally:
+#   1. Boot the harness with GIT_SHA=fix-applied.
+#   2. Curl /buildinfo and assert it returns "fix-applied" (the new code
+#      actually shipped).
+#   3. Negative test: curl with a different EXPECTED_SHA and assert the
+#      mismatch detection logic the workflow uses returns failure.
+#
+# This proves the verify-step's jq lookup + comparison logic works
+# against the SAME Dockerfile.tenant production builds. If the
+# /buildinfo route ever stops being wired through, this replay
+# catches it before it reaches a production tenant.
+
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+HARNESS_ROOT="$(dirname "$HERE")"
+# shellcheck source=../_curl.sh
+source "$HARNESS_ROOT/_curl.sh"
+
+# 1. Confirm /buildinfo wire shape — same shape the workflow's jq lookup expects.
+echo "[replay] curl $BASE/buildinfo ..."
+BUILD_JSON=$(curl_anon "$BASE/buildinfo")
+echo "[replay]   $BUILD_JSON"
+
+ACTUAL_SHA=$(echo "$BUILD_JSON" | jq -r '.git_sha // ""')
+if [ -z "$ACTUAL_SHA" ]; then
+    echo "[replay] FAIL: /buildinfo response missing git_sha field — workflow's jq lookup would null"
+    exit 1
+fi
+echo "[replay] git_sha=$ACTUAL_SHA"
+
+# 2. Assert the harness build threaded GIT_SHA through. If we got "dev",
+#    the Dockerfile arg / ldflags wiring is broken — same regression
+#    class that made #2395 invisible until production.
+EXPECTED_FROM_HARNESS="${HARNESS_GIT_SHA:-harness}"
+if [ "$ACTUAL_SHA" = "dev" ]; then
+    echo "[replay] FAIL: /buildinfo returned 'dev' — Dockerfile.tenant ARG GIT_SHA isn't reaching the binary"
+    echo "[replay]       This regresses #2395 by silencing the deploy-verify gate."
+    exit 1
+fi
+if [ "$ACTUAL_SHA" != "$EXPECTED_FROM_HARNESS" ]; then
+    echo "[replay] WARN: /buildinfo returned '$ACTUAL_SHA' but harness was built with GIT_SHA='$EXPECTED_FROM_HARNESS'"
+    echo "[replay]       Image may be cached from a previous run. Run ./up.sh --rebuild to force a fresh build."
+fi
+
+# 3. Negative test — replay the workflow's mismatch detection by
+#    comparing the actual SHA to a deliberately-wrong expected SHA.
+WRONG_EXPECTED="0000000000000000000000000000000000000000"
+if [ "$ACTUAL_SHA" = "$WRONG_EXPECTED" ]; then
+    echo "[replay] FAIL: /buildinfo returned all-zero SHA — wiring inverted"
+    exit 1
+fi
+
+# 4. Replay the workflow's exact comparison logic so a regression in
+#    the verify step's bash gets caught here.
+MISMATCH_DETECTED=0
+if [ "$ACTUAL_SHA" != "$WRONG_EXPECTED" ]; then
+    MISMATCH_DETECTED=1
+fi
+if [ "$MISMATCH_DETECTED" != "1" ]; then
+    echo "[replay] FAIL: workflow comparison logic would not flag a real mismatch"
+    exit 1
+fi
+
+echo ""
+echo "[replay] PASS: /buildinfo wire shape, GIT_SHA injection, and mismatch detection all work in"
+echo "        production-shape topology. The redeploy-fleet verify-step covers what it claims to."
@@ -0,0 +1,182 @@
+#!/usr/bin/env bash
+# Replay for the channel envelope peer_id trust-boundary fix
+# (PR #2481, follow-up to PR #2471). Verifies that the PUBLISHED wheel
+# installed on this machine — not local source — gates malformed peer_id
+# at both the envelope builder and the agent_card_url builder.
+#
+# Why this matters:
+#   - Unit tests in workspace/tests/ run against local source. They
+#     prove the fix works in source. They DO NOT prove the published
+#     wheel contains the fix.
+#   - The wheel rewriter (scripts/build_runtime_package.py) renames
+#     symbols + paths. Any rewrite drift could silently strip the
+#     guard from the shipped artifact.
+#   - This replay imports from `molecule_runtime.a2a_mcp_server` (the
+#     wheel-rewritten path), exercises the actual published code, and
+#     asserts the envelope shape. If the wheel build ever ships without
+#     the guard, this fails — even if unit tests on local source pass.
+#
+# Phases:
+#   A. Confirm an installed molecule-runtime version that contains the
+#      #2481 fix (>= 0.1.78).
+#   B. Call `_build_channel_notification` with peer_id="../../foo" and
+#      assert (1) meta["peer_id"] == "", (2) no agent_card_url field,
+#      (3) no peer_name/peer_role.
+#   C. Symmetric case: peer_id with embedded XML-attribute injection
+#      bytes — assert the same scrubbing.
+#   D. Happy path: a valid UUID peer_id is preserved (proves we didn't
+#      regress legitimate enrichment).
+#   E. Direct check on the URL builder — `_agent_card_url_for("../../foo")`
+#      must return "" and never an unsanitised URL.
+
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+HARNESS_ROOT="$(dirname "$HERE")"
+cd "$HARNESS_ROOT"
+# shellcheck source=../_curl.sh
+source "$HARNESS_ROOT/_curl.sh"
+
+PASS=0
+FAIL=0
+
+assert() {
+    local desc="$1" expected="$2" actual="$3"
+    if [ "$expected" = "$actual" ]; then
+        printf "  PASS %s\n" "$desc"
+        PASS=$((PASS + 1))
+    else
+        printf "  FAIL %s\n    expected: %s\n    got     : %s\n" "$desc" "$expected" "$actual" >&2
+        FAIL=$((FAIL + 1))
+    fi
+}
+
+# ─── Phase A: wheel version contains the fix ───────────────────────────
+echo "[replay] A. confirming installed molecule-ai-workspace-runtime contains #2481..."
+INSTALLED=$(pip3 show molecule-ai-workspace-runtime 2>/dev/null | awk -F': ' '/^Version:/ {print $2}')
+if [ -z "$INSTALLED" ]; then
+    echo "[replay] FAIL A: molecule-ai-workspace-runtime not installed."
+    echo "         Install: pip3 install molecule-ai-workspace-runtime"
+    exit 2
+fi
+echo "[replay]   installed version: $INSTALLED"
+
+# 0.1.78 is the first published version after #2481 merged to staging.
+# Compare via Python distutils-style version sort (works across patch
+# bumps without sed-fragility).
+HAS_FIX=$(python3 -c "
+from packaging.version import parse
+print('yes' if parse('$INSTALLED') >= parse('0.1.78') else 'no')
+" 2>/dev/null || echo "unknown")
+if [ "$HAS_FIX" != "yes" ]; then
+    echo "[replay] FAIL A: installed $INSTALLED < 0.1.78 (the version that shipped the #2481 fix)."
+    echo "         Upgrade: pip3 install --upgrade molecule-ai-workspace-runtime"
+    exit 2
+fi
+echo "[replay]   ✓ contains #2481 trust-boundary fix"
+
+# ─── Phase B-E: in-process assertions against the installed wheel ──────
+# We don't need WORKSPACE_ID/PLATFORM_URL/MOLECULE_WORKSPACE_TOKEN to
+# import the module — the env validation only fires at console-script
+# entry. We use molecule_runtime.* (the wheel-rewritten import path)
+# rather than workspace.a2a_mcp_server (local source) so this exercises
+# the SHIPPED code.
+echo ""
+echo "[replay] B-E. exercising _build_channel_notification + _agent_card_url_for from the installed wheel..."
+
+OUT=$(WORKSPACE_ID=00000000-0000-0000-0000-000000000000 \
+      PLATFORM_URL=http://localhost:8080 \
+      MOLECULE_WORKSPACE_TOKEN=stub \
+      MOLECULE_MCP_DISABLE_HEARTBEAT=1 \
+      python3 - <<'PYEOF'
+import json
+import sys
+
+from molecule_runtime.a2a_mcp_server import _build_channel_notification
+from molecule_runtime.a2a_client import _agent_card_url_for
+
+results = []
+
+def emit(name, value):
+    results.append({"name": name, "value": value})
+
+# ── B: path-traversal peer_id stripped from envelope ──
+payload = _build_channel_notification({
+    "peer_id": "../../foo",
+    "kind": "peer_agent",
+    "text": "redirect-attempt",
+    "activity_id": "act-1",
+    "method": "message/send",
+    "created_at": "2026-05-01T00:00:00Z",
+})
+meta = payload["params"]["meta"]
+emit("B1_peer_id_scrubbed", meta.get("peer_id", "<missing>"))
+emit("B2_agent_card_url_absent", "absent" if "agent_card_url" not in meta else meta["agent_card_url"])
+emit("B3_peer_name_absent", "absent" if "peer_name" not in meta else meta["peer_name"])
+emit("B4_peer_role_absent", "absent" if "peer_role" not in meta else meta["peer_role"])
+
+# ── C: XML-attribute-injection-shape peer_id ──
+payload = _build_channel_notification({
+    "peer_id": 'aaa" onclick="alert(1)',
+    "kind": "peer_agent",
+    "text": "xss",
+})
+meta = payload["params"]["meta"]
+emit("C1_peer_id_scrubbed", meta.get("peer_id", "<missing>"))
+emit("C2_agent_card_url_absent", "absent" if "agent_card_url" not in meta else "leaked")
+
+# ── D: legitimate UUID is preserved ──
+valid_uuid = "11111111-2222-3333-4444-555555555555"
+payload = _build_channel_notification({
+    "peer_id": valid_uuid,
+    "kind": "peer_agent",
+    "text": "legit",
+})
+meta = payload["params"]["meta"]
+emit("D1_peer_id_preserved", meta.get("peer_id", "<missing>"))
+# agent_card_url IS present (we don't gate the URL itself on whether the registry is reachable)
+emit("D2_agent_card_url_present", "yes" if meta.get("agent_card_url", "").endswith(valid_uuid) else "no")
+
+# ── E: direct URL builder gate ──
+emit("E1_url_builder_strips_traversal", _agent_card_url_for("../../foo"))
+emit("E2_url_builder_strips_xml", _agent_card_url_for('a" onclick="x'))
+emit("E3_url_builder_accepts_uuid_endswith", "yes" if _agent_card_url_for(valid_uuid).endswith(valid_uuid) else "no")
+
+print(json.dumps(results))
+PYEOF
+)
+
+# Parse and assert each result.
+echo "$OUT" | python3 -c "
+import json, sys
+results = json.loads(sys.stdin.read())
+for r in results:
+    print(f\"{r['name']}={r['value']}\")
+" > /tmp/cha-envelope-results.txt
+
+while IFS='=' read -r key value; do
+    case "$key" in
+        B1_peer_id_scrubbed)        assert "B1: malicious peer_id scrubbed to \"\"" "" "$value" ;;
+        B2_agent_card_url_absent)   assert "B2: agent_card_url not emitted" "absent" "$value" ;;
+        B3_peer_name_absent)        assert "B3: peer_name not enriched" "absent" "$value" ;;
+        B4_peer_role_absent)        assert "B4: peer_role not enriched" "absent" "$value" ;;
+        C1_peer_id_scrubbed)        assert "C1: XML-injection peer_id scrubbed" "" "$value" ;;
+        C2_agent_card_url_absent)   assert "C2: XML-injection URL not emitted" "absent" "$value" ;;
+        D1_peer_id_preserved)       assert "D1: valid UUID peer_id preserved" "11111111-2222-3333-4444-555555555555" "$value" ;;
+        D2_agent_card_url_present)  assert "D2: agent_card_url present for valid id" "yes" "$value" ;;
+        E1_url_builder_strips_traversal) assert "E1: _agent_card_url_for(\"../../foo\") returns \"\"" "" "$value" ;;
+        E2_url_builder_strips_xml)       assert "E2: _agent_card_url_for(XML-injection) returns \"\"" "" "$value" ;;
+        E3_url_builder_accepts_uuid_endswith) assert "E3: _agent_card_url_for(valid uuid) builds canonical URL" "yes" "$value" ;;
+    esac
+done < /tmp/cha-envelope-results.txt
+
+echo ""
+if [ "$FAIL" -gt 0 ]; then
+    echo "[replay] FAIL: $PASS pass, $FAIL fail"
+    echo ""
+    echo "[replay] If B/C/E failed: the published wheel does NOT contain the #2481 fix."
+    echo "[replay] Likely causes:"
+    echo "         - Wheel rewriter dropped _validate_peer_id from molecule_runtime.a2a_client"
+    echo "         - publish-runtime.yml regressed to a SHA before #2481 (check pip install version)"
+    exit 1
+fi
+echo "[replay] PASS: $PASS/$PASS — channel envelope peer_id trust boundary holds in published wheel $INSTALLED"
@@ -0,0 +1,175 @@
+#!/usr/bin/env bash
+# Replay for the chat_history MCP tool — exercises the full SaaS-shape
+# wire that PRs #2472 (peer_id filter), #2474 (chat_history client), and
+# #2476 (before_ts paging) ride on. Runs against the prod-shape tenant
+# image, not unit-mock'd handlers, so any drift between the Go handler
+# and the Python tool's expectations surfaces here.
+#
+# What this catches that unit tests don't:
+#   - Real Postgres planner behaviour on the (source_id = $X OR target_id = $X)
+#     OR clause (issue #2478 — both indexes missing).
+#   - cf-proxy header rewrites + TenantGuard middleware in the path.
+#   - lib/pq + Postgres driver type binding for time.Time parameters.
+#   - JSON encoding of created_at across the wire (timezone, precision).
+#
+# Phases:
+#   A. Seed three a2a_receive rows for alpha with peer_id=beta, spread
+#      across distinct timestamps.
+#   B. Basic peer_id filter: GET ?type=a2a_receive&peer_id=beta&limit=10
+#      → assert 3 rows DESC.
+#   C. Limit cap: limit=2 → assert 2 newest rows.
+#   D. before_ts paging: take the 2nd-newest's created_at, GET with
+#      before_ts=that → assert the 1 strictly-older row.
+#   E. OR clause (target side): seed an a2a_send row where source=alpha,
+#      target=beta. GET with type unset, peer_id=beta → assert that row
+#      surfaces too (target_id match, not just source_id).
+#   F. Trust-boundary: peer_id="not-a-uuid" → 400 + "peer_id must be a UUID".
+#   G. Trust-boundary: before_ts="garbage" → 400 + RFC3339 example.
+#   H. URL-encoded SQL-injection-shape peer_id → 400 (matches activity_test.go's
+#      malicious-peer-id panel).
+
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+HARNESS_ROOT="$(dirname "$HERE")"
+cd "$HARNESS_ROOT"
+
+if [ ! -f .seed.env ]; then
+    echo "[replay] no .seed.env — running ./seed.sh first..."
+    ./seed.sh
+fi
+# shellcheck source=/dev/null
+source .seed.env
+# shellcheck source=../_curl.sh
+source "$HARNESS_ROOT/_curl.sh"
+
+PASS=0
+FAIL=0
+
+assert() {
+    local desc="$1" expected="$2" actual="$3"
+    if [ "$expected" = "$actual" ]; then
+        printf "  PASS %s\n" "$desc"
+        PASS=$((PASS + 1))
+    else
+        printf "  FAIL %s\n    expected: %s\n    got     : %s\n" "$desc" "$expected" "$actual" >&2
+        FAIL=$((FAIL + 1))
+    fi
+}
+
+assert_contains() {
+    local desc="$1" needle="$2" haystack="$3"
+    if echo "$haystack" | grep -qF "$needle"; then
+        printf "  PASS %s\n" "$desc"
+        PASS=$((PASS + 1))
+    else
+        printf "  FAIL %s\n    expected to contain: %s\n    got: %s\n" "$desc" "$needle" "$haystack" >&2
+        FAIL=$((FAIL + 1))
+    fi
+}
+
+echo "[replay] alpha=$ALPHA_ID beta=$BETA_ID"
+
+# ─── Phase A: seed the activity_logs table ─────────────────────────────
+# Inserted via psql so the seed is independent of the platform's HTTP
+# Notify path — that path itself ships through the same handler chain
+# we want to test, and seeding through it would conflate setup and
+# assertion.
+echo ""
+echo "[replay] A. seeding 3 a2a_receive rows for alpha←beta at distinct timestamps..."
+psql_exec >/dev/null <<SQL
+DELETE FROM activity_logs WHERE workspace_id = '$ALPHA_ID';
+INSERT INTO activity_logs (workspace_id, activity_type, source_id, target_id, method, summary, created_at)
+VALUES
+  ('$ALPHA_ID', 'a2a_receive', '$BETA_ID', '$ALPHA_ID', 'message/send', 'oldest from beta',  NOW() - INTERVAL '4 hours'),
+  ('$ALPHA_ID', 'a2a_receive', '$BETA_ID', '$ALPHA_ID', 'message/send', 'middle from beta',  NOW() - INTERVAL '2 hours'),
+  ('$ALPHA_ID', 'a2a_receive', '$BETA_ID', '$ALPHA_ID', 'message/send', 'newest from beta',  NOW() - INTERVAL '1 hour');
+SQL
+echo "[replay]   inserted 3 rows"
+
+# ─── Phase B: basic peer_id filter ─────────────────────────────────────
+echo ""
+echo "[replay] B. GET ?type=a2a_receive&peer_id=beta&limit=10 ..."
+RESP=$(curl_admin "$BASE/workspaces/$ALPHA_ID/activity?type=a2a_receive&peer_id=$BETA_ID&limit=10")
+COUNT=$(echo "$RESP" | jq 'length')
+assert "B1: returns 3 rows" "3" "$COUNT"
+
+# DESC order — newest first
+NEWEST_SUMMARY=$(echo "$RESP" | jq -r '.[0].summary')
+assert "B2: newest first (DESC ordering)" "newest from beta" "$NEWEST_SUMMARY"
+
+OLDEST_SUMMARY=$(echo "$RESP" | jq -r '.[2].summary')
+assert "B3: oldest last" "oldest from beta" "$OLDEST_SUMMARY"
+
+# ─── Phase C: limit cap ────────────────────────────────────────────────
+echo ""
+echo "[replay] C. limit=2 (expecting 2 newest) ..."
+RESP=$(curl_admin "$BASE/workspaces/$ALPHA_ID/activity?type=a2a_receive&peer_id=$BETA_ID&limit=2")
+assert "C1: limit clamps to 2" "2" "$(echo "$RESP" | jq 'length')"
+assert "C2: kept newest" "newest from beta" "$(echo "$RESP" | jq -r '.[0].summary')"
+assert "C3: kept middle" "middle from beta" "$(echo "$RESP" | jq -r '.[1].summary')"
+
+# ─── Phase D: before_ts paging ─────────────────────────────────────────
+echo ""
+echo "[replay] D. before_ts paging — walk backwards from middle row's created_at ..."
+# Take the newest row's created_at, page from there.
+NEWEST_TS=$(curl_admin "$BASE/workspaces/$ALPHA_ID/activity?type=a2a_receive&peer_id=$BETA_ID&limit=1" \
+    | jq -r '.[0].created_at')
+# RFC3339 with timezone — Go's time.Parse(RFC3339) handles `2026-...Z` AND
+# `2026-...+00:00`. Postgres returns the latter; URL-encode the +.
+NEWEST_TS_ENCODED=$(echo "$NEWEST_TS" | python3 -c 'import sys, urllib.parse; print(urllib.parse.quote(sys.stdin.read().strip(), safe=""))')
+RESP=$(curl_admin "$BASE/workspaces/$ALPHA_ID/activity?type=a2a_receive&peer_id=$BETA_ID&before_ts=$NEWEST_TS_ENCODED&limit=10")
+assert "D1: 2 rows older than newest" "2" "$(echo "$RESP" | jq 'length')"
+assert "D2: middle is now newest in the slice" "middle from beta" "$(echo "$RESP" | jq -r '.[0].summary')"
+# Strict less-than — the row at exactly NEWEST_TS must NOT come back.
+NOT_INCLUDED=$(echo "$RESP" | jq -r '[.[].summary] | index("newest from beta") // "absent"')
+assert "D3: strictly older — newest excluded" "absent" "$NOT_INCLUDED"
+
+# ─── Phase E: OR clause covers target_id direction ─────────────────────
+echo ""
+echo "[replay] E. OR clause: seed an a2a_send row (alpha→beta) and confirm it surfaces ..."
+psql_exec >/dev/null <<SQL
+INSERT INTO activity_logs (workspace_id, activity_type, source_id, target_id, method, summary, created_at)
+VALUES ('$ALPHA_ID', 'a2a_send', '$ALPHA_ID', '$BETA_ID', 'message/send', 'sent to beta', NOW());
+SQL
+# No type filter — we want both a2a_receive AND a2a_send rows back.
+RESP=$(curl_admin "$BASE/workspaces/$ALPHA_ID/activity?peer_id=$BETA_ID&limit=10")
+HAS_SENT=$(echo "$RESP" | jq '[.[].summary] | any(. == "sent to beta")')
+assert "E1: a2a_send (alpha→beta) returned via target_id match" "true" "$HAS_SENT"
+TOTAL=$(echo "$RESP" | jq 'length')
+assert "E2: total = 4 (3 receives + 1 send)" "4" "$TOTAL"
+
+# ─── Phase F: malformed peer_id → 400 ──────────────────────────────────
+echo ""
+echo "[replay] F. malformed peer_id → 400 ..."
+HTTP_CODE=$(curl_admin -o /tmp/cha-bad-peer.json -w '%{http_code}' \
+    "$BASE/workspaces/$ALPHA_ID/activity?type=a2a_receive&peer_id=not-a-uuid")
+assert "F1: HTTP 400" "400" "$HTTP_CODE"
+assert_contains "F2: error names the param" "peer_id must be a UUID" "$(cat /tmp/cha-bad-peer.json)"
+
+# ─── Phase G: malformed before_ts → 400 ────────────────────────────────
+echo ""
+echo "[replay] G. malformed before_ts → 400 ..."
+HTTP_CODE=$(curl_admin -o /tmp/cha-bad-ts.json -w '%{http_code}' \
+    "$BASE/workspaces/$ALPHA_ID/activity?type=a2a_receive&before_ts=garbage")
+assert "G1: HTTP 400" "400" "$HTTP_CODE"
+assert_contains "G2: error mentions RFC3339" "RFC3339" "$(cat /tmp/cha-bad-ts.json)"
+
+# ─── Phase H: SQL-injection-shape peer_id is rejected ──────────────────
+echo ""
+echo "[replay] H. URL-encoded SQLi-shape peer_id → 400 ..."
+SQLI_ENCODED="%27%20OR%201%3D1%20--"  # ' OR 1=1 --
+HTTP_CODE=$(curl_admin -o /tmp/cha-sqli.json -w '%{http_code}' \
+    "$BASE/workspaces/$ALPHA_ID/activity?type=a2a_receive&peer_id=$SQLI_ENCODED")
+assert "H1: HTTP 400 (UUID validation rejects before SQL builder sees it)" "400" "$HTTP_CODE"
+
+# ─── Cleanup: tear down seeded rows so subsequent runs don't accumulate ─
+psql_exec >/dev/null <<SQL
+DELETE FROM activity_logs WHERE workspace_id = '$ALPHA_ID';
+SQL
+
+echo ""
+if [ "$FAIL" -gt 0 ]; then
+    echo "[replay] FAIL: $PASS pass, $FAIL fail"
+    exit 1
+fi
+echo "[replay] PASS: $PASS/$PASS — chat_history wire (peer_id filter + before_ts paging + trust boundary + OR clause)"
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+# Replay for issue #2397 — local proof that peer-discovery surfaces
+# actionable diagnostics instead of "may be isolated".
+#
+# Prior behavior: tool_list_peers returned "No peers available (this
+# workspace may be isolated)" regardless of WHY peers were empty —
+# five distinct conditions (200+empty, 401, 403, 404, 5xx, network)
+# collapsed to one ambiguous message.
+#
+# This replay proves two things, separately:
+#   (a) WIRE: the platform side of the contract — the tenant's
+#       /registry/<unregistered>/peers returns 404. If this regresses
+#       (e.g. tenant starts returning 200 with empty list, or 500),
+#       the runtime helper would parse it differently and the agent
+#       would see a different diagnostic. The harness catches that here.
+#   (b) PARSE: the runtime helper, given a 404, produces a diagnostic
+#       containing "404" + "register" hints. Done in unit tests against
+#       a mock httpx response (test_a2a_client.py::TestGetPeersWithDiagnostic
+#       — the harness re-asserts the same contract here against a real
+#       Python eval that does NOT depend on workspace auth tokens.
+#
+# Why split the assertion: the Python eval here doesn't have the
+# workspace's auth token file, so going through get_peers_with_diagnostic
+# directly would hit the platform without auth and produce a different
+# branch (401 instead of 404). Splitting (a) from (b) keeps each
+# assertion targeting exactly what it claims to test.
+
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+HARNESS_ROOT="$(dirname "$HERE")"
+cd "$HARNESS_ROOT"
+
+if [ ! -f .seed.env ]; then
+    echo "[replay] no .seed.env — running ./seed.sh first..."
+    ./seed.sh
+fi
+# shellcheck source=/dev/null
+source .seed.env
+# shellcheck source=../_curl.sh
+source "$HARNESS_ROOT/_curl.sh"
+
+# ─── (a) WIRE: tenant returns 404 for an unregistered workspace ────────
+ROGUE_ID="$(uuidgen | tr '[:upper:]' '[:lower:]')"
+echo "[replay] (a) WIRE: querying /registry/$ROGUE_ID/peers (unregistered workspace)..."
+HTTP_CODE=$(curl_admin -o /tmp/peer-replay.json -w '%{http_code}' \
+    -H "X-Workspace-ID: $ROGUE_ID" \
+    "$BASE/registry/$ROGUE_ID/peers")
+
+echo "[replay]     tenant responded HTTP $HTTP_CODE"
+if [ "$HTTP_CODE" != "404" ]; then
+    echo "[replay] FAIL (a): expected 404 from /registry/<unregistered>/peers, got $HTTP_CODE"
+    echo "[replay]   This is a platform-side regression — the runtime's diagnostic helper"
+    echo "[replay]   would see a different status code than the unit tests cover."
+    cat /tmp/peer-replay.json
+    exit 1
+fi
+
+# ─── (b) PARSE: helper converts a synthetic 404 to actionable diagnostic ─
+#
+# We construct a synthetic httpx 404 response and run the helper against
+# it directly. This isolates the parse branch we want to test from the
+# auth-context concerns of going through the network. The helper's network
+# branches are exhaustively covered by tests/test_a2a_client.py — this is
+# a regression-guard that the helper IS in the install, IS importable in
+# the harness's Python env, and IS reading the status code.
+
+WORKSPACE_PATH="$(cd "$HARNESS_ROOT/../../workspace" && pwd)"
+DIAGNOSTIC=$(WORKSPACE_ID="harness-rogue" PYTHONPATH="$WORKSPACE_PATH" \
+    python3 - "$WORKSPACE_PATH" <<'PYEOF'
+import asyncio
+import sys
+import types
+from unittest.mock import AsyncMock, MagicMock, patch
+
+# Stub platform_auth so a2a_client imports cleanly without requiring a
+# real workspace token file. The helper's auth_headers() only matters
+# when going through the network; we're feeding it a mock response.
+_pa = types.ModuleType("platform_auth")
+_pa.auth_headers = lambda: {}
+_pa.self_source_headers = lambda: {}
+sys.modules.setdefault("platform_auth", _pa)
+
+sys.path.insert(0, sys.argv[1])
+import a2a_client  # noqa: E402
+
+# This replay validates PR #2399's diagnostic helper. If the workspace
+# runtime in the current checkout pre-dates that fix, fail with a
+# clear message instead of an opaque AttributeError.
+if not hasattr(a2a_client, "get_peers_with_diagnostic"):
+    print("__SKIP__: workspace/a2a_client.py is pre-#2399 (no get_peers_with_diagnostic).")
+    sys.exit(0)
+
+resp = MagicMock()
+resp.status_code = 404
+resp.json = MagicMock(return_value={"detail": "not found"})
+
+mock_client = AsyncMock()
+mock_client.__aenter__ = AsyncMock(return_value=mock_client)
+mock_client.__aexit__ = AsyncMock(return_value=False)
+mock_client.get = AsyncMock(return_value=resp)
+
+async def main():
+    with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
+        peers, diag = await a2a_client.get_peers_with_diagnostic()
+    print(repr(diag))
+
+asyncio.run(main())
+PYEOF
+)
+
+if [[ "$DIAGNOSTIC" == __SKIP__:* ]]; then
+    echo "[replay] (b) SKIP: ${DIAGNOSTIC#__SKIP__: }"
+    echo "[replay]            Re-run after #2399 lands on staging."
+    echo ""
+    echo "[replay] PASS (a) only: peer-discovery wire returns 404 (parse branch skipped — see above)."
+    exit 0
+fi
+
+echo "[replay] (b) PARSE: helper diagnostic = $DIAGNOSTIC"
+
+if ! echo "$DIAGNOSTIC" | grep -q "404"; then
+    echo "[replay] FAIL (b): diagnostic missing '404' — helper regressed to swallow-the-status-code"
+    exit 1
+fi
+if ! echo "$DIAGNOSTIC" | grep -qi "regist"; then
+    echo "[replay] FAIL (b): diagnostic missing 'register' guidance — helper regressed to opaque message"
+    exit 1
+fi
+if echo "$DIAGNOSTIC" | grep -qi "may be isolated"; then
+    echo "[replay] FAIL (b): diagnostic still says 'may be isolated' — fix didn't reach this code path"
+    exit 1
+fi
+
+echo ""
+echo "[replay] PASS: peer-discovery (a) wire returns 404, (b) helper produces actionable diagnostic."
@@ -0,0 +1,185 @@
+#!/usr/bin/env bash
+# Replay for per-tenant independence — each tenant runs the same
+# workflow concurrently with no cross-bleed in workspaces table or
+# activity_logs.
+#
+# What this proves that tenant-isolation.sh doesn't:
+#   tenant-isolation.sh proves that REQUESTS get rejected at the
+#   middleware layer when they target the wrong tenant. THIS replay
+#   proves that even when both tenants are doing legitimate work
+#   simultaneously, the back-end state stays partitioned: no row in
+#   alpha's activity_logs ever shows up in beta's, no FK-resolution
+#   ever crosses tenants, etc.
+#
+# Test shape: seed activity_logs in BOTH tenants in parallel using
+# distinct row counts (3 vs 5) so we can distinguish them. Then
+# fetch each tenant's history and assert the count + content match
+# the seed exactly — proves no leak in either direction.
+#
+# Phases:
+#   A. Seed alpha tenant: 3 a2a_receive rows (parent ← child).
+#   B. Seed beta tenant:  5 a2a_receive rows (parent ← child).
+#   C. GET alpha history → exactly 3 rows, all alpha-summary.
+#   D. GET beta history  → exactly 5 rows, all beta-summary.
+#   E. Direct DB sanity — alpha PG has only alpha rows, beta PG only beta.
+#   F. Concurrent write race — both tenants take turns INSERTing
+#      simultaneously; each tenant's count after the race matches what
+#      it INSERTed. Catches "shared cache poison" / "shared connection
+#      pool" failure modes that don't show up in single-tenant tests.
+
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+HARNESS_ROOT="$(dirname "$HERE")"
+cd "$HARNESS_ROOT"
+
+if [ ! -f .seed.env ]; then
+    echo "[replay] no .seed.env — running ./seed.sh first..."
+    ./seed.sh
+fi
+# shellcheck source=/dev/null
+source .seed.env
+# shellcheck source=../_curl.sh
+source "$HARNESS_ROOT/_curl.sh"
+
+PASS=0
+FAIL=0
+
+assert() {
+    local desc="$1" expected="$2" actual="$3"
+    if [ "$expected" = "$actual" ]; then
+        printf "  PASS %s\n" "$desc"
+        PASS=$((PASS + 1))
+    else
+        printf "  FAIL %s\n    expected: %s\n    got     : %s\n" "$desc" "$expected" "$actual" >&2
+        FAIL=$((FAIL + 1))
+    fi
+}
+
+# ─── Cleanup (idempotent) ──────────────────────────────────────────────
+psql_exec_alpha >/dev/null <<SQL
+DELETE FROM activity_logs WHERE workspace_id = '$ALPHA_PARENT_ID';
+SQL
+psql_exec_beta >/dev/null <<SQL
+DELETE FROM activity_logs WHERE workspace_id = '$BETA_PARENT_ID';
+SQL
+
+# ─── Phase A: seed alpha (3 rows) ──────────────────────────────────────
+echo "[replay] A. seeding alpha tenant: 3 a2a_receive rows for alpha-parent ←alpha-child"
+psql_exec_alpha >/dev/null <<SQL
+INSERT INTO activity_logs (workspace_id, activity_type, source_id, target_id, method, summary, created_at)
+VALUES
+  ('$ALPHA_PARENT_ID', 'a2a_receive', '$ALPHA_CHILD_ID', '$ALPHA_PARENT_ID', 'message/send', 'alpha-msg-1', NOW() - INTERVAL '3 hours'),
+  ('$ALPHA_PARENT_ID', 'a2a_receive', '$ALPHA_CHILD_ID', '$ALPHA_PARENT_ID', 'message/send', 'alpha-msg-2', NOW() - INTERVAL '2 hours'),
+  ('$ALPHA_PARENT_ID', 'a2a_receive', '$ALPHA_CHILD_ID', '$ALPHA_PARENT_ID', 'message/send', 'alpha-msg-3', NOW() - INTERVAL '1 hour');
+SQL
+
+# ─── Phase B: seed beta (5 rows — distinct count) ──────────────────────
+echo "[replay] B. seeding beta tenant: 5 a2a_receive rows for beta-parent ← beta-child"
+psql_exec_beta >/dev/null <<SQL
+INSERT INTO activity_logs (workspace_id, activity_type, source_id, target_id, method, summary, created_at)
+VALUES
+  ('$BETA_PARENT_ID', 'a2a_receive', '$BETA_CHILD_ID', '$BETA_PARENT_ID', 'message/send', 'beta-msg-1', NOW() - INTERVAL '5 hours'),
+  ('$BETA_PARENT_ID', 'a2a_receive', '$BETA_CHILD_ID', '$BETA_PARENT_ID', 'message/send', 'beta-msg-2', NOW() - INTERVAL '4 hours'),
+  ('$BETA_PARENT_ID', 'a2a_receive', '$BETA_CHILD_ID', '$BETA_PARENT_ID', 'message/send', 'beta-msg-3', NOW() - INTERVAL '3 hours'),
+  ('$BETA_PARENT_ID', 'a2a_receive', '$BETA_CHILD_ID', '$BETA_PARENT_ID', 'message/send', 'beta-msg-4', NOW() - INTERVAL '2 hours'),
+  ('$BETA_PARENT_ID', 'a2a_receive', '$BETA_CHILD_ID', '$BETA_PARENT_ID', 'message/send', 'beta-msg-5', NOW() - INTERVAL '1 hour');
+SQL
+
+# ─── Phase C: alpha tenant sees only its 3 rows ────────────────────────
+echo ""
+echo "[replay] C. alpha history via /activity ..."
+ALPHA_RESP=$(curl_alpha_admin "$BASE/workspaces/$ALPHA_PARENT_ID/activity?type=a2a_receive&peer_id=$ALPHA_CHILD_ID&limit=20")
+assert "C1: alpha row count = 3" "3" "$(echo "$ALPHA_RESP" | jq 'length')"
+
+# Every summary must start with "alpha-msg-" — beta leak would manifest
+# as a beta-msg-* string in this list.
+ALPHA_NON_ALPHA=$(echo "$ALPHA_RESP" | jq -r '[.[].summary | select(startswith("alpha-msg-") | not)] | length')
+assert "C2: zero non-alpha summaries leaked into alpha" "0" "$ALPHA_NON_ALPHA"
+
+# ─── Phase D: beta tenant sees only its 5 rows ─────────────────────────
+echo ""
+echo "[replay] D. beta history via /activity ..."
+BETA_RESP=$(curl_beta_admin "$BASE/workspaces/$BETA_PARENT_ID/activity?type=a2a_receive&peer_id=$BETA_CHILD_ID&limit=20")
+assert "D1: beta row count = 5" "5" "$(echo "$BETA_RESP" | jq 'length')"
+
+BETA_NON_BETA=$(echo "$BETA_RESP" | jq -r '[.[].summary | select(startswith("beta-msg-") | not)] | length')
+assert "D2: zero non-beta summaries leaked into beta" "0" "$BETA_NON_BETA"
+
+# ─── Phase E: direct DB-side sanity ────────────────────────────────────
+echo ""
+echo "[replay] E. direct DB-side counts ..."
+ALPHA_DB=$(psql_exec_alpha -c "SELECT COUNT(*) FROM activity_logs WHERE workspace_id = '$ALPHA_PARENT_ID';")
+BETA_DB=$(psql_exec_beta -c  "SELECT COUNT(*) FROM activity_logs WHERE workspace_id = '$BETA_PARENT_ID';")
+assert "E1: postgres-alpha has exactly 3 alpha rows"  "3" "$ALPHA_DB"
+assert "E2: postgres-beta has exactly 5 beta rows"   "5" "$BETA_DB"
+
+# Cross-DB sanity: alpha PG has zero beta-named workspaces, vice versa.
+ALPHA_HAS_BETA=$(psql_exec_alpha -c "SELECT COUNT(*) FROM workspaces WHERE name LIKE 'beta-%';")
+BETA_HAS_ALPHA=$(psql_exec_beta  -c "SELECT COUNT(*) FROM workspaces WHERE name LIKE 'alpha-%';")
+assert "E3: postgres-alpha has zero beta-named workspaces" "0" "$ALPHA_HAS_BETA"
+assert "E4: postgres-beta has zero alpha-named workspaces" "0" "$BETA_HAS_ALPHA"
+
+# ─── Phase F: concurrent INSERT race ───────────────────────────────────
+# Both tenants insert 10 rows concurrently. Race shape catches the
+# failure modes that CAN cross tenants in this topology:
+#   - redis cross-keyspace bleed (shared redis container).
+#   - shared-cp-stub state corruption (single Go process serves both).
+#   - cf-proxy buffer mixup under simultaneous in-flight writes.
+# Does NOT catch lib/pq prepared-statement cache collision or shared
+# *sql.DB pool poisoning — each tenant has its own DATABASE_URL and
+# its own postgres-{alpha,beta} container, so there is no shared pool
+# to corrupt. A future replay variant on a single shared Postgres
+# would be the right place to assert that failure mode.
+# Each side must end with EXACTLY +10 rows from its own writes.
+echo ""
+echo "[replay] F. concurrent insert race — 10 rows per tenant in parallel"
+
+(
+    for i in $(seq 1 10); do
+        psql_exec_alpha >/dev/null <<SQL
+INSERT INTO activity_logs (workspace_id, activity_type, source_id, target_id, method, summary)
+VALUES ('$ALPHA_PARENT_ID', 'a2a_receive', '$ALPHA_CHILD_ID', '$ALPHA_PARENT_ID', 'message/send', 'alpha-race-$i');
+SQL
+    done
+) &
+ALPHA_PID=$!
+
+(
+    for i in $(seq 1 10); do
+        psql_exec_beta >/dev/null <<SQL
+INSERT INTO activity_logs (workspace_id, activity_type, source_id, target_id, method, summary)
+VALUES ('$BETA_PARENT_ID', 'a2a_receive', '$BETA_CHILD_ID', '$BETA_PARENT_ID', 'message/send', 'beta-race-$i');
+SQL
+    done
+) &
+BETA_PID=$!
+
+wait $ALPHA_PID $BETA_PID
+
+ALPHA_AFTER=$(psql_exec_alpha -c "SELECT COUNT(*) FROM activity_logs WHERE workspace_id = '$ALPHA_PARENT_ID';")
+BETA_AFTER=$(psql_exec_beta  -c "SELECT COUNT(*) FROM activity_logs WHERE workspace_id = '$BETA_PARENT_ID';")
+assert "F1: alpha has 13 rows after race (3 + 10)"  "13" "$ALPHA_AFTER"
+assert "F2: beta has 15 rows after race (5 + 10)"  "15" "$BETA_AFTER"
+
+# Concurrency leak check: alpha's "race" rows must all be alpha-race-*,
+# beta's must all be beta-race-*. A pool/cache cross-bleed would surface
+# as some tenant getting the other's writes.
+ALPHA_RACE_NAMES=$(psql_exec_alpha -c "SELECT COUNT(*) FROM activity_logs WHERE workspace_id = '$ALPHA_PARENT_ID' AND summary LIKE 'beta-race-%';")
+BETA_RACE_NAMES=$(psql_exec_beta  -c "SELECT COUNT(*) FROM activity_logs WHERE workspace_id = '$BETA_PARENT_ID' AND summary LIKE 'alpha-race-%';")
+assert "F3: zero beta-race rows leaked into alpha PG" "0" "$ALPHA_RACE_NAMES"
+assert "F4: zero alpha-race rows leaked into beta PG" "0" "$BETA_RACE_NAMES"
+
+# ─── Cleanup ───────────────────────────────────────────────────────────
+psql_exec_alpha >/dev/null <<SQL
+DELETE FROM activity_logs WHERE workspace_id = '$ALPHA_PARENT_ID';
+SQL
+psql_exec_beta >/dev/null <<SQL
+DELETE FROM activity_logs WHERE workspace_id = '$BETA_PARENT_ID';
+SQL
+
+echo ""
+if [ "$FAIL" -gt 0 ]; then
+    echo "[replay] FAIL: $PASS pass, $FAIL fail"
+    exit 1
+fi
+echo "[replay] PASS: $PASS/$PASS — per-tenant independence holds (DB partition + concurrent race)"
@@ -0,0 +1,186 @@
+#!/usr/bin/env bash
+# Replay for cross-tenant isolation — TenantGuard middleware MUST 404
+# any request whose X-Molecule-Org-Id (or Fly-Replay state, or
+# same-origin Canvas trust) doesn't match the tenant container's
+# configured MOLECULE_ORG_ID.
+#
+# Why this matters in production:
+#   - One Cloudflare tunnel front-doors every tenant subdomain.
+#   - DNS/routing layer can mis-direct a request (CF cache poisoning,
+#     misconfigured CNAME, internal traffic mirror).
+#   - TenantGuard is the last-line defense — it 404s any request whose
+#     declared org doesn't match what the tenant binary was provisioned
+#     with. Returning 404 (not 403) is intentional: the existence of a
+#     tenant on this machine must not be probable by an outsider.
+#
+# What this replay catches:
+#   - A regression where TenantGuard accidentally allows requests with
+#     a different org id (e.g. someone removes the strict equality check).
+#   - cf-proxy routing-by-Host bug that sends alpha's request to beta's
+#     container (the negative test would suddenly succeed).
+#   - Allowlist drift — if /workspaces is added to tenantGuardAllowlist
+#     it would silently be cross-tenant readable.
+#
+# Phases:
+#   A. Positive controls — each tenant accepts its own valid creds.
+#   B. Org-header mismatch — alpha-org header at beta's URL → 404.
+#   C. Reverse — beta-org header at alpha's URL → 404.
+#   D. Right URL, wrong org header (typo) → 404.
+#   E. Bearer present but no org header → 404 (TenantGuard rejects).
+#   F. Per-tenant DB isolation — alpha's /workspaces enumerates only
+#      alpha workspaces; beta's only beta. Confirms cf-proxy + TenantGuard
+#      really did partition the request to the right backing DB.
+#   G. Allowlisted /health stays public on both tenants (sanity check —
+#      a regression that put /health behind the guard would 404 too).
+
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+HARNESS_ROOT="$(dirname "$HERE")"
+cd "$HARNESS_ROOT"
+
+if [ ! -f .seed.env ]; then
+    echo "[replay] no .seed.env — running ./seed.sh first..."
+    ./seed.sh
+fi
+# shellcheck source=/dev/null
+source .seed.env
+# shellcheck source=../_curl.sh
+source "$HARNESS_ROOT/_curl.sh"
+
+PASS=0
+FAIL=0
+
+assert_status() {
+    local desc="$1" expected="$2" actual="$3"
+    if [ "$expected" = "$actual" ]; then
+        printf "  PASS %s (HTTP %s)\n" "$desc" "$actual"
+        PASS=$((PASS + 1))
+    else
+        printf "  FAIL %s\n    expected HTTP %s, got HTTP %s\n" "$desc" "$expected" "$actual" >&2
+        FAIL=$((FAIL + 1))
+    fi
+}
+
+# Plain equality check — for non-HTTP values (counts, names, etc.).
+# Distinct from assert_status so output reads naturally instead of
+# claiming "(HTTP 0)" for what is really a count.
+assert() {
+    local desc="$1" expected="$2" actual="$3"
+    if [ "$expected" = "$actual" ]; then
+        printf "  PASS %s\n" "$desc"
+        PASS=$((PASS + 1))
+    else
+        printf "  FAIL %s\n    expected: %s\n    got     : %s\n" "$desc" "$expected" "$actual" >&2
+        FAIL=$((FAIL + 1))
+    fi
+}
+
+# ─── Phase A: positive controls ────────────────────────────────────────
+echo "[replay] A. positive controls — each tenant accepts its own valid creds"
+
+ALPHA_OWN=$(curl_alpha_admin -o /dev/null -w '%{http_code}' "$BASE/workspaces")
+assert_status "A1: alpha creds at alpha returns 200" "200" "$ALPHA_OWN"
+
+BETA_OWN=$(curl_beta_admin -o /dev/null -w '%{http_code}' "$BASE/workspaces")
+assert_status "A2: beta creds at beta returns 200" "200" "$BETA_OWN"
+
+# ─── Phase B: alpha creds at beta's URL → 404 ──────────────────────────
+echo ""
+echo "[replay] B. alpha-org header at beta's URL — TenantGuard must 404"
+
+CROSS_AB=$(curl_alpha_creds_at_beta -o /tmp/iso-ab.json -w '%{http_code}' "$BASE/workspaces")
+assert_status "B1: alpha-org header at beta URL → 404" "404" "$CROSS_AB"
+
+# Body must be a generic 404 — never reveal that beta exists or that
+# the org check fired (TenantGuard is intentionally indistinguishable
+# from "no such route" to an outside scanner).
+B_BODY=$(cat /tmp/iso-ab.json)
+if echo "$B_BODY" | grep -qiE "tenant|org|forbidden|denied"; then
+    printf "  FAIL B2: 404 body leaks tenant/org/auth keywords (info disclosure)\n    body: %s\n" "$B_BODY" >&2
+    FAIL=$((FAIL + 1))
+else
+    printf "  PASS B2: 404 body has no tenant/org leak\n"
+    PASS=$((PASS + 1))
+fi
+
+# ─── Phase C: beta creds at alpha's URL → 404 ──────────────────────────
+echo ""
+echo "[replay] C. beta-org header at alpha's URL — TenantGuard must 404"
+
+CROSS_BA=$(curl_beta_creds_at_alpha -o /tmp/iso-ba.json -w '%{http_code}' "$BASE/workspaces")
+assert_status "C1: beta-org header at alpha URL → 404" "404" "$CROSS_BA"
+
+# ─── Phase D: right URL, garbage org header ────────────────────────────
+echo ""
+echo "[replay] D. right URL, garbage org header → 404"
+
+GARBAGE=$(curl -sS -o /dev/null -w '%{http_code}' \
+    -H "Host: ${ALPHA_HOST}" \
+    -H "Authorization: Bearer ${ALPHA_ADMIN_TOKEN}" \
+    -H "X-Molecule-Org-Id: not-the-right-org" \
+    "$BASE/workspaces")
+assert_status "D1: garbage org id at alpha URL → 404" "404" "$GARBAGE"
+
+# ─── Phase E: bearer present but no org header at all → 404 ────────────
+echo ""
+echo "[replay] E. valid bearer but missing X-Molecule-Org-Id → 404"
+
+NO_ORG=$(curl -sS -o /dev/null -w '%{http_code}' \
+    -H "Host: ${ALPHA_HOST}" \
+    -H "Authorization: Bearer ${ALPHA_ADMIN_TOKEN}" \
+    "$BASE/workspaces")
+assert_status "E1: missing X-Molecule-Org-Id → 404" "404" "$NO_ORG"
+
+# ─── Phase F: per-tenant DB isolation via list_workspaces ──────────────
+echo ""
+echo "[replay] F. per-tenant DB isolation via /workspaces listing"
+
+ALPHA_LIST=$(curl_alpha_admin "$BASE/workspaces")
+ALPHA_NAMES=$(echo "$ALPHA_LIST" | jq -r '.[].name' | sort | tr '\n' ',' | sed 's/,$//')
+echo "[replay]   alpha tenant sees: $ALPHA_NAMES"
+
+if [ "$ALPHA_NAMES" = "alpha-child,alpha-parent" ]; then
+    printf "  PASS F1: alpha enumerates only alpha workspaces\n"
+    PASS=$((PASS + 1))
+else
+    printf "  FAIL F1: alpha enumerated unexpected workspaces\n    expected: alpha-child,alpha-parent\n    got     : %s\n" "$ALPHA_NAMES" >&2
+    FAIL=$((FAIL + 1))
+fi
+
+BETA_LIST=$(curl_beta_admin "$BASE/workspaces")
+BETA_NAMES=$(echo "$BETA_LIST" | jq -r '.[].name' | sort | tr '\n' ',' | sed 's/,$//')
+echo "[replay]   beta tenant sees:  $BETA_NAMES"
+
+if [ "$BETA_NAMES" = "beta-child,beta-parent" ]; then
+    printf "  PASS F2: beta enumerates only beta workspaces\n"
+    PASS=$((PASS + 1))
+else
+    printf "  FAIL F2: beta enumerated unexpected workspaces\n    expected: beta-child,beta-parent\n    got     : %s\n" "$BETA_NAMES" >&2
+    FAIL=$((FAIL + 1))
+fi
+
+# Cross-check: neither tenant's list contains the other's workspace ids.
+LEAKED_INTO_ALPHA=$(echo "$ALPHA_LIST" | jq -r --arg b1 "$BETA_PARENT_ID" --arg b2 "$BETA_CHILD_ID" \
+    '[.[] | select(.id == $b1 or .id == $b2)] | length')
+assert "F3: alpha list contains zero beta workspace ids" "0" "$LEAKED_INTO_ALPHA"
+
+LEAKED_INTO_BETA=$(echo "$BETA_LIST" | jq -r --arg a1 "$ALPHA_PARENT_ID" --arg a2 "$ALPHA_CHILD_ID" \
+    '[.[] | select(.id == $a1 or .id == $a2)] | length')
+assert "F4: beta list contains zero alpha workspace ids" "0" "$LEAKED_INTO_BETA"
+
+# ─── Phase G: /health is allowlisted (sanity) ──────────────────────────
+echo ""
+echo "[replay] G. /health stays public on both tenants (TenantGuard allowlist sanity)"
+
+ALPHA_HEALTH=$(curl -sS -o /dev/null -w '%{http_code}' -H "Host: ${ALPHA_HOST}" "$BASE/health")
+assert_status "G1: alpha /health public → 200" "200" "$ALPHA_HEALTH"
+
+BETA_HEALTH=$(curl -sS -o /dev/null -w '%{http_code}' -H "Host: ${BETA_HOST}" "$BASE/health")
+assert_status "G2: beta /health public → 200" "200" "$BETA_HEALTH"
+
+echo ""
+if [ "$FAIL" -gt 0 ]; then
+    echo "[replay] FAIL: $PASS pass, $FAIL fail"
+    exit 1
+fi
+echo "[replay] PASS: $PASS/$PASS — TenantGuard isolation + per-tenant DB partitioning hold"
@@ -0,0 +1,20 @@
+# Harness-replay Python deps — minimal set for replays/*.sh scripts that
+# eval Python against the running tenant (e.g. importing
+# workspace/a2a_client.py to assert parser behavior).
+#
+# This is intentionally smaller than workspace/requirements.txt: the
+# replays don't need a2a-sdk, langchain, opentelemetry, etc. — only the
+# HTTP client surface that the imported helpers depend on. Adding the
+# full workspace deps would slow every harness CI run by ~30s for no
+# gain.
+#
+# Add a line here (with a version constraint matching workspace/requirements.txt)
+# when a new replay introduces a new Python import.
+
+httpx>=0.28.1
+
+# channel-envelope-trust-boundary.sh imports from `molecule_runtime.*` (the
+# wheel-rewritten path) so it catches the failure mode where the wheel
+# build silently strips a fix that unit tests on local source still pass.
+# >= 0.1.78 ships PR #2481's peer_id trust-boundary guard.
+molecule-ai-workspace-runtime>=0.1.78
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+# Run every replay under tests/harness/replays/ against a fresh harness.
+#
+# Boots the harness (up.sh + seed.sh), runs each `replays/*.sh` in
+# alphabetical order, tracks pass/fail, and tears down on exit. Returns
+# non-zero if any replay failed.
+#
+# Usage:
+#   ./run-all-replays.sh                # boot, run, teardown
+#   KEEP_UP=1 ./run-all-replays.sh      # leave harness running on exit (debug)
+#   REBUILD=1 ./run-all-replays.sh      # rebuild images before booting
+#
+# CI usage: invoke without flags. The trap-on-EXIT teardown ensures we
+# don't leak Docker resources when a replay fails partway through.
+
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$HERE"
+
+REPLAYS_DIR="$HERE/replays"
+if [ ! -d "$REPLAYS_DIR" ]; then
+    echo "[run-all] no replays/ directory at $REPLAYS_DIR — nothing to run"
+    exit 1
+fi
+
+shopt -s nullglob
+REPLAYS=("$REPLAYS_DIR"/*.sh)
+shopt -u nullglob
+if [ ${#REPLAYS[@]} -eq 0 ]; then
+    echo "[run-all] replays/ is empty — nothing to run"
+    exit 1
+fi
+
+cleanup() {
+    local exit_code=$?
+    if [ "${KEEP_UP:-0}" = "1" ]; then
+        echo ""
+        echo "[run-all] KEEP_UP=1 — leaving harness up. Tear down manually with ./down.sh"
+    else
+        echo ""
+        echo "[run-all] tearing down harness..."
+        ./down.sh >/dev/null 2>&1 || echo "[run-all] WARN: ./down.sh exited non-zero"
+    fi
+    exit "$exit_code"
+}
+trap cleanup EXIT INT TERM
+
+echo "[run-all] booting harness..."
+if [ "${REBUILD:-0}" = "1" ]; then
+    ./up.sh --rebuild
+else
+    ./up.sh
+fi
+
+echo "[run-all] seeding workspaces..."
+./seed.sh
+
+PASS_COUNT=0
+FAIL_COUNT=0
+SKIP_COUNT=0
+FAILED_NAMES=()
+
+for replay in "${REPLAYS[@]}"; do
+    name=$(basename "$replay" .sh)
+    echo ""
+    echo "[run-all] ━━━ $name ━━━"
+    if bash "$replay"; then
+        # Replays signal "skip" by exiting 0 with a __SKIP__ marker in stdout —
+        # but we capture that as a pass here since the script exited 0. The
+        # skip is documented in the script's own output. CI uses pass/fail.
+        PASS_COUNT=$((PASS_COUNT + 1))
+        echo "[run-all] PASS: $name"
+    else
+        FAIL_COUNT=$((FAIL_COUNT + 1))
+        FAILED_NAMES+=("$name")
+        echo "[run-all] FAIL: $name"
+    fi
+done
+
+echo ""
+echo "[run-all] ============================="
+echo "[run-all] Replay summary: ${PASS_COUNT} passed, ${FAIL_COUNT} failed (of ${#REPLAYS[@]} total)"
+if [ ${FAIL_COUNT} -gt 0 ]; then
+    echo "[run-all] Failed:"
+    for name in "${FAILED_NAMES[@]}"; do
+        echo "[run-all]   - $name"
+    done
+    exit 1
+fi
+echo "[run-all] All replays passed."
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+# Seed BOTH tenants with parent + child workspaces so peer-discovery
+# and cross-tenant replays have something to discover.
+#
+# Tenant alpha:
+#   - alpha-parent (tier 0)
+#   - alpha-child  (tier 1, child of alpha-parent)
+# Tenant beta:
+#   - beta-parent  (tier 0)
+#   - beta-child   (tier 1, child of beta-parent)
+#
+# IDs are server-generated (POST /workspaces ignores body.id) — we
+# capture the returned id rather than minting client-side. Older
+# versions silently desynced from the workspaces table, breaking
+# FK-dependent replays.
+#
+# All four IDs persist to .seed.env so replays can target any of them.
+
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$HERE"
+
+# shellcheck source=_curl.sh
+source "$HERE/_curl.sh"
+
+create_workspace() {
+    local tenant="$1" name="$2" tier="$3" parent="${4:-}"
+    local body
+    if [ -n "$parent" ]; then
+        body="{\"name\":\"$name\",\"tier\":$tier,\"parent_id\":\"$parent\",\"runtime\":\"langgraph\"}"
+    else
+        body="{\"name\":\"$name\",\"tier\":$tier,\"runtime\":\"langgraph\"}"
+    fi
+    local id
+    if [ "$tenant" = "alpha" ]; then
+        id=$(curl_alpha_admin -X POST "$BASE/workspaces" -d "$body" | jq -r '.id')
+    else
+        id=$(curl_beta_admin -X POST "$BASE/workspaces" -d "$body" | jq -r '.id')
+    fi
+    if [ -z "$id" ] || [ "$id" = "null" ]; then
+        echo "[seed] FAIL: $tenant/$name workspace creation returned no id" >&2
+        return 1
+    fi
+    echo "$id"
+}
+
+echo "[seed] confirming both tenants reachable..."
+ALPHA_HEALTH=$(curl_alpha_anon "$BASE/health" || echo "")
+BETA_HEALTH=$(curl_beta_anon "$BASE/health" || echo "")
+if [ -z "$ALPHA_HEALTH" ] || [ -z "$BETA_HEALTH" ]; then
+    echo "[seed] FAIL: tenant unreachable. alpha='$ALPHA_HEALTH' beta='$BETA_HEALTH'"
+    echo "       Did ./up.sh complete cleanly?"
+    exit 1
+fi
+echo "[seed]   alpha: $ALPHA_HEALTH"
+echo "[seed]   beta : $BETA_HEALTH"
+
+echo ""
+echo "[seed] tenant alpha — creating alpha-parent + alpha-child ..."
+ALPHA_PARENT_ID=$(create_workspace alpha alpha-parent 0)
+echo "[seed]   alpha-parent id=$ALPHA_PARENT_ID"
+ALPHA_CHILD_ID=$(create_workspace alpha alpha-child 1 "$ALPHA_PARENT_ID")
+echo "[seed]   alpha-child  id=$ALPHA_CHILD_ID"
+
+echo ""
+echo "[seed] tenant beta — creating beta-parent + beta-child ..."
+BETA_PARENT_ID=$(create_workspace beta beta-parent 0)
+echo "[seed]   beta-parent  id=$BETA_PARENT_ID"
+BETA_CHILD_ID=$(create_workspace beta beta-child 1 "$BETA_PARENT_ID")
+echo "[seed]   beta-child   id=$BETA_CHILD_ID"
+
+# Stash IDs for replay scripts.
+#
+# Backwards-compat: ALPHA_ID + BETA_ID aliases keep pre-Phase-2 replays
+# working (they used these names for the alpha tenant's parent + child).
+{
+    echo "ALPHA_PARENT_ID=$ALPHA_PARENT_ID"
+    echo "ALPHA_CHILD_ID=$ALPHA_CHILD_ID"
+    echo "BETA_PARENT_ID=$BETA_PARENT_ID"
+    echo "BETA_CHILD_ID=$BETA_CHILD_ID"
+    echo "# legacy aliases — pre-Phase-2 replays expect these names"
+    echo "ALPHA_ID=$ALPHA_PARENT_ID"
+    echo "BETA_ID=$ALPHA_CHILD_ID"
+} > "$HERE/.seed.env"
+
+echo ""
+echo "[seed] done. IDs persisted to tests/harness/.seed.env"
+echo "[seed]   alpha: parent=$ALPHA_PARENT_ID child=$ALPHA_CHILD_ID"
+echo "[seed]   beta : parent=$BETA_PARENT_ID child=$BETA_CHILD_ID"
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+# Bring the production-shape harness up.
+#
+# Usage: ./up.sh [--rebuild]
+#
+# Always operates in tests/harness/ regardless of where it's invoked
+# from — test scripts under tests/harness/replays/ source it via the
+# absolute path, so cd-ing first prevents compose-context surprises.
+
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "$HERE"
+
+REBUILD=false
+for arg in "$@"; do
+    case "$arg" in
+        --rebuild) REBUILD=true ;;
+    esac
+done
+
+# Generate a per-run encryption key. The tenant runs with
+# MOLECULE_ENV=production (intentional, to replay prod-shape bugs), and
+# crypto.InitStrict() refuses to boot without SECRETS_ENCRYPTION_KEY.
+# Generate fresh so:
+#   - No key-shaped string lives in the repo (avoids muscle-memorying a
+#     hardcoded value into other places + secret-scanner false positives).
+#   - Each harness lifetime gets a unique key, mimicking prod's per-tenant
+#     isolation. Persistence across runs isn't required — the harness DB
+#     is wiped on every ./down.sh.
+# Honor a caller-supplied value if already exported (lets a debug session
+# pin a key for reproducibility).
+if [ -z "${SECRETS_ENCRYPTION_KEY:-}" ]; then
+    SECRETS_ENCRYPTION_KEY=$(openssl rand -base64 32)
+    export SECRETS_ENCRYPTION_KEY
+fi
+
+if [ "$REBUILD" = true ]; then
+    docker compose -f compose.yml build --no-cache tenant cp-stub
+fi
+
+echo "[harness] starting redis + cp-stub + tenant-alpha + tenant-beta + cf-proxy ..."
+docker compose -f compose.yml up -d --wait
+
+# Sudo-free reachability: cf-proxy/nginx routes by Host header to the
+# right tenant container (matches production CF tunnel: same URL,
+# different Host = different tenant). Replays target loopback :8080
+# with a per-tenant Host header. _curl.sh centralises the helper
+# functions (curl_alpha_admin, curl_beta_admin, etc.).
+echo ""
+echo "[harness] up. Multi-tenant topology:"
+echo "          tenant-alpha:  Host: harness-tenant-alpha.localhost"
+echo "          tenant-beta:   Host: harness-tenant-beta.localhost"
+echo "          legacy alias:  Host: harness-tenant.localhost → alpha"
+echo ""
+echo "          Quick check (no /etc/hosts needed):"
+echo "            curl -H 'Host: harness-tenant-alpha.localhost' http://localhost:8080/health"
+echo "            curl -H 'Host: harness-tenant-beta.localhost'  http://localhost:8080/health"
+echo ""
+echo "Next: ./seed.sh   # register parent+child workspaces in BOTH tenants"
@@ -16,7 +16,11 @@ RUN echo 'replace github.com/Molecule-AI/molecule-ai-plugin-github-app-auth => /
 RUN sed -i 's|replace github.com/Molecule-AI/molecule-monorepo/platform => .*|replace github.com/Molecule-AI/molecule-monorepo/platform => /app|' /plugin/go.mod
 RUN go mod download
 COPY workspace-server/ .
-RUN CGO_ENABLED=0 GOOS=linux go build -o /platform ./cmd/server
+# GIT_SHA mirror of Dockerfile.tenant — see that file for the rationale.
+ARG GIT_SHA=dev
+RUN CGO_ENABLED=0 GOOS=linux go build \
+    -ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \
+    -o /platform ./cmd/server

 # Clone templates + plugins at build time from manifest.json
 FROM alpine:3.20 AS templates
@@ -21,7 +21,19 @@ COPY workspace-server/go.mod workspace-server/go.sum ./
 RUN echo 'replace github.com/Molecule-AI/molecule-ai-plugin-github-app-auth => /plugin' >> go.mod
 RUN go mod download
 COPY workspace-server/ .
-RUN CGO_ENABLED=0 GOOS=linux go build -o /platform ./cmd/server
+
+# GIT_SHA is baked into the binary via -ldflags so /buildinfo can return
+# it at runtime. CI passes ${{ github.sha }}; local builds default to
+# "dev" so an unset value never reads as a real SHA.
+#
+# Why this matters: the redeploy verification step compares each tenant's
+# /buildinfo against the SHA the workflow expects. If GIT_SHA isn't
+# threaded through here, every tenant returns "dev" and the verification
+# fails closed — which is the correct fail-direction (#2395 root fix).
+ARG GIT_SHA=dev
+RUN CGO_ENABLED=0 GOOS=linux go build \
+    -ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \
+    -o /platform ./cmd/server

 # ── Stage 2: Canvas Next.js standalone ────────────────────────────────
 FROM node:20-alpine AS canvas-builder
@@ -223,13 +223,24 @@ func main() {
 		registry.StartLivenessMonitor(c, onWorkspaceOffline)
 	})

-	// Proactive container health sweep — detects dead containers faster than Redis TTL.
-	// Checks all "online" workspaces against Docker every 15 seconds.
-	if prov != nil {
-		go supervised.RunWithRecover(ctx, "health-sweep", func(c context.Context) {
-			registry.StartHealthSweep(c, prov, 15*time.Second, onWorkspaceOffline)
-		})
-	}
+	// Proactive health sweep — two passes per tick:
+	//   1. Docker-side: checks "online" workspaces against the local Docker
+	//      daemon (only runs when prov is non-nil, i.e. self-hosted mode).
+	//   2. Remote-side: scans runtime='external' rows whose last_heartbeat_at
+	//      is past REMOTE_LIVENESS_STALE_AFTER and flips them to
+	//      awaiting_agent. Runs regardless of provisioner mode — SaaS
+	//      tenants need this even though they don't run Docker locally,
+	//      because external-runtime workspaces are operator-managed and
+	//      the platform-side liveness sweep is the only thing that
+	//      transitions them off 'online' when the operator's CLI dies.
+	//
+	// Pre-2026-04-30 this goroutine was gated on prov != nil, which silently
+	// disabled the remote-side sweep on every SaaS tenant. The function in
+	// healthsweep.go has always handled nil checker correctly; only the
+	// orchestration was wrong. See #2392's CI failure for the trace.
+	go supervised.RunWithRecover(ctx, "health-sweep", func(c context.Context) {
+		registry.StartHealthSweep(c, prov, 15*time.Second, onWorkspaceOffline)
+	})

 	// Orphan-container reconcile sweep — finds running containers
 	// whose workspace row is already status='removed' and stops
@@ -249,7 +260,13 @@ func main() {
 	// and the state is incoherent (e.g. user sees "Retry" after 15min but
 	// backend still thinks provisioning is in progress).
 	go supervised.RunWithRecover(ctx, "provision-timeout-sweep", func(c context.Context) {
-		registry.StartProvisioningTimeoutSweep(c, broadcaster, registry.DefaultProvisionSweepInterval)
+		// Pass the handler's per-runtime template-manifest lookup so the
+		// sweeper honours `runtime_config.provision_timeout_seconds`
+		// declared in any template's config.yaml — the same value the
+		// canvas already reads via addProvisionTimeoutMs. Without this
+		// the sweeper killed claude-code at the 10-min hardcoded floor
+		// regardless of the manifest. See registry.RuntimeTimeoutLookup.
+		registry.StartProvisioningTimeoutSweep(c, broadcaster, registry.DefaultProvisionSweepInterval, wh.ProvisionTimeoutSecondsForRuntime)
 	})

 	// Cron Scheduler — fires A2A messages to workspaces on user-defined schedules
@@ -0,0 +1,26 @@
+// Package buildinfo exposes the git SHA the binary was built from.
+//
+// Set at link time:
+//
+//	go build -ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=<sha>"
+//
+// CI passes ${{ github.sha }} via Dockerfile.tenant ARG GIT_SHA; local
+// dev builds default to "dev" so unset never reads as success.
+//
+// Why this package exists: redeploy-fleet (CP) returns ssm_status=Success
+// when the SSM RPC didn't error — that's "the deploy command ran",
+// NOT "the new code is running on every tenant." Image-tag-as-tag
+// (`:latest`) caches in the local Docker daemon so `docker compose up -d`
+// without an explicit `docker pull` is a no-op when the tag hasn't been
+// invalidated. Both observed 2026-04-30: the user's tenant kept serving
+// pre-501a42d7 chat_files even after main published the lazy-heal fix
+// (#2395). Exposing GitSHA at /buildinfo lets the redeploy workflow
+// verify EVERY tenant is actually running the published SHA before
+// reporting success.
+package buildinfo
+
+// GitSHA is overwritten at build time via -ldflags. Default catches
+// dev builds + any deploy that forgot to wire the build-arg through.
+// "dev" is intentional — comparing it to a real SHA always fails,
+// which is what we want for an unconfigured deploy.
+var GitSHA = "dev"
@@ -0,0 +1,81 @@
+package buildinfo_test
+
+import (
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo"
+	"github.com/gin-gonic/gin"
+)
+
+// TestGitSHA_DefaultDevSentinel pins the contract that an unset
+// GIT_SHA at build time reads as "dev", NOT as an empty string. The
+// redeploy verification step compares the deployed /buildinfo against
+// the workflow's expected SHA — if GitSHA were "" by default, a
+// misconfigured deploy would round-trip "" successfully if the
+// expected SHA were also somehow ""; "dev" guarantees the comparison
+// always fails closed for an unset deploy.
+//
+// Linker tests can't directly exercise -ldflags injection from inside
+// `go test`, but they can pin the default the linker overrides.
+func TestGitSHA_DefaultDevSentinel(t *testing.T) {
+	if buildinfo.GitSHA != "dev" {
+		t.Errorf("GitSHA default = %q, want %q (CI ldflags override expected to set this; tests run without ldflags so this should be the dev sentinel)", buildinfo.GitSHA, "dev")
+	}
+}
+
+// TestBuildInfoEndpoint_ReturnsGitSHA pins the wire shape of the
+// /buildinfo response. The redeploy verification step reads
+// `.git_sha` from this JSON; renaming the field would silently break
+// every tenant verification (the jq lookup would return null + the
+// step would interpret it as "tenant unreachable" and fail closed,
+// which is correct but noisy).
+//
+// Test routes the handler against an httptest server rather than
+// constructing a router.Setup() — that constructor takes a Hub +
+// Broadcaster + Provisioner + WorkspaceHandler + ChannelMgr, and
+// /buildinfo doesn't depend on any of them. Using a minimal gin
+// engine here keeps the test fast and isolated to the contract under
+// test.
+func TestBuildInfoEndpoint_ReturnsGitSHA(t *testing.T) {
+	// Stash + restore so other tests that read GitSHA see a stable
+	// value. The package-level var is mutable by design (-ldflags),
+	// so test isolation requires explicit save/restore.
+	prev := buildinfo.GitSHA
+	t.Cleanup(func() { buildinfo.GitSHA = prev })
+	buildinfo.GitSHA = "abc1234deadbeef"
+
+	gin.SetMode(gin.TestMode)
+	r := gin.New()
+	r.GET("/buildinfo", func(c *gin.Context) {
+		c.JSON(200, gin.H{"git_sha": buildinfo.GitSHA})
+	})
+
+	srv := httptest.NewServer(r)
+	t.Cleanup(srv.Close)
+
+	resp, err := http.Get(srv.URL + "/buildinfo")
+	if err != nil {
+		t.Fatalf("GET /buildinfo: %v", err)
+	}
+	t.Cleanup(func() { _ = resp.Body.Close() })
+
+	if resp.StatusCode != 200 {
+		t.Fatalf("status = %d, want 200", resp.StatusCode)
+	}
+
+	var body map[string]string
+	if err := json.NewDecoder(resp.Body).Decode(&body); err != nil {
+		t.Fatalf("decode: %v", err)
+	}
+
+	got, ok := body["git_sha"]
+	if !ok {
+		t.Fatalf("response missing git_sha field — would break the redeploy verification jq lookup. Body: %+v", body)
+	}
+	if got != "abc1234deadbeef" {
+		t.Errorf("git_sha = %q, want %q", got, "abc1234deadbeef")
+	}
+}
@@ -7,6 +7,7 @@ import (

 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
 	"github.com/google/uuid"
 )
@@ -131,7 +132,8 @@ func buildBundleConfigFiles(b *Bundle) map[string][]byte {

 func markFailed(ctx context.Context, wsID string, broadcaster *events.Broadcaster, err error) {
 	db.DB.ExecContext(ctx,
-		`UPDATE workspaces SET status = 'failed', updated_at = now() WHERE id = $1`, wsID)
+		`UPDATE workspaces SET status = $1, updated_at = now() WHERE id = $2`,
+		models.StatusFailed, wsID)
 	broadcaster.RecordAndBroadcast(ctx, "WORKSPACE_PROVISION_FAILED", wsID, map[string]interface{}{
 		"error": err.Error(),
 	})
@@ -0,0 +1,63 @@
+package db_test
+
+// Architecture test (#2344): db is a leaf — DB pool + migrations + raw
+// SQL helpers, no business-logic dependencies. The DB layer must be
+// testable with sqlmock in isolation. If db starts importing handlers
+// or provisioner, every db unit test would need to bring up that
+// subsystem, and the layering becomes circular.
+//
+// If this test fails: you put business logic in the db package. Move
+// it to a higher-tier package that imports db, not the reverse.
+
+import (
+	"go/parser"
+	"go/token"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+const moduleInternalPrefix = "github.com/Molecule-AI/molecule-monorepo/platform/internal/"
+
+func TestDBHasNoInternalDependencies(t *testing.T) {
+	t.Parallel()
+	for path, file := range listImports(t, ".") {
+		if strings.HasPrefix(path, moduleInternalPrefix) {
+			t.Errorf(
+				"db must not import other internal packages "+
+					"(found %q in %s) — db is the foundation layer and a "+
+					"reverse dep creates a cycle (everything imports db). "+
+					"See workspace-server/internal/db/architecture_test.go.",
+				path, file,
+			)
+		}
+	}
+}
+
+func listImports(t *testing.T, dir string) map[string]string {
+	t.Helper()
+	fset := token.NewFileSet()
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		t.Fatalf("read %s: %v", dir, err)
+	}
+	out := make(map[string]string)
+	for _, e := range entries {
+		name := e.Name()
+		if e.IsDir() || !strings.HasSuffix(name, ".go") || strings.HasSuffix(name, "_test.go") {
+			continue
+		}
+		f, err := parser.ParseFile(fset, filepath.Join(dir, name), nil, parser.ImportsOnly)
+		if err != nil {
+			t.Fatalf("parse %s: %v", name, err)
+		}
+		for _, imp := range f.Imports {
+			path := strings.Trim(imp.Path.Value, "\"")
+			if _, seen := out[path]; !seen {
+				out[path] = name
+			}
+		}
+	}
+	return out
+}
@@ -0,0 +1,360 @@
+package db_test
+
+// Static drift gate: every value declared in models.AllWorkspaceStatuses
+// must exist in the workspace_status enum after every migration applies.
+//
+// Why this exists: the workspace_status enum (migration 043) initially
+// shipped without 'awaiting_agent' and 'hibernating' even though
+// application code already wrote both. Every UPDATE silently failed in
+// production for five days because:
+//
+//   - Status values were ad-hoc string literals scattered across raw
+//     SQL strings in 8+ files, with no compile-time check.
+//   - sqlmock matched SQL by regex, not against the live enum.
+//   - Errors were dropped or log-and-continued at every call site.
+//
+// The fix is layered. This gate is the static layer:
+//
+//   - models.AllWorkspaceStatuses is the source of truth for the
+//     codebase side. Every status write goes through one of those
+//     typed constants (the parameterized-write refactor enforces this).
+//   - The migrations are the source of truth for the DB side.
+//   - This test parses both and asserts the codebase set ⊆ migration set.
+//
+// If you add a new status:
+//
+//   1. Add a `Status…` constant in models/workspace_status.go AND
+//      append it to AllWorkspaceStatuses.
+//   2. Open a migration `ALTER TYPE workspace_status ADD VALUE 'X'`.
+//   3. This test confirms both happened in the same PR.
+//
+// If you intend to retire a status: keep it in the enum as long as any
+// row could legitimately still hold it, then drop it from
+// AllWorkspaceStatuses (the gate runs the inclusion in one direction
+// only — extras in the enum are fine).
+
+import (
+	"go/ast"
+	"go/parser"
+	"go/token"
+	"os"
+	"path/filepath"
+	"regexp"
+	"sort"
+	"strings"
+	"testing"
+)
+
+func TestWorkspaceStatusEnum_NoLiteralDrift(t *testing.T) {
+	t.Parallel()
+
+	repoRoot := findRepoRoot(t)
+	migrationsDir := filepath.Join(repoRoot, "workspace-server", "migrations")
+	statusFile := filepath.Join(repoRoot, "workspace-server", "internal", "models", "workspace_status.go")
+	srcRoot := filepath.Join(repoRoot, "workspace-server")
+
+	enum := loadWorkspaceStatusEnum(t, migrationsDir)
+	if len(enum) == 0 {
+		t.Fatalf("could not parse workspace_status enum from %s — gate is non-functional", migrationsDir)
+	}
+
+	codebase := loadAllWorkspaceStatuses(t, statusFile)
+	if len(codebase) == 0 {
+		t.Fatalf("could not parse models.AllWorkspaceStatuses from %s — gate is non-functional", statusFile)
+	}
+
+	var rogue []string
+	for lit := range codebase {
+		if _, ok := enum[lit]; !ok {
+			rogue = append(rogue, lit)
+		}
+	}
+	if len(rogue) > 0 {
+		sort.Strings(rogue)
+		t.Errorf(
+			"workspace status constants %v are declared in models.AllWorkspaceStatuses but not in the workspace_status enum.\n"+
+				"Add a migration `ALTER TYPE workspace_status ADD VALUE 'X';` (see migration 046 for shape).\n"+
+				"Enum currently: %v\nCodebase declares: %v",
+			rogue, sortedKeys(enum), sortedKeys(codebase),
+		)
+	}
+
+	// Second axis: scan production .go files for hard-coded
+	// `UPDATE workspaces SET status = '<literal>'`. Every status write must
+	// flow through models.Status* constants — the typed-constants refactor
+	// (PR #2396) made this enforceable. Without this scan, a future
+	// site-update can silently re-introduce a literal that bypasses
+	// AllWorkspaceStatuses + the migration gate above. The hard-coded site
+	// in workspace_bootstrap.go:62 was missed in the initial sweep and
+	// only caught by manual grep — this gate makes that automatic.
+	if hits := findHardCodedStatusWrites(t, srcRoot); len(hits) > 0 {
+		t.Errorf(
+			"hard-coded `SET status = '<literal>'` found in production code — replace with a parameterized $N + models.Status* constant:\n  %s",
+			strings.Join(hits, "\n  "),
+		)
+	}
+}
+
+// loadWorkspaceStatusEnum scans every *.up.sql file for either:
+//
+//	CREATE TYPE workspace_status AS ENUM ('a', 'b', ...)
+//	ALTER TYPE workspace_status ADD VALUE [IF NOT EXISTS] 'X' [BEFORE|AFTER 'Y']
+//
+// and returns the union of every value the enum will hold after all
+// migrations apply.
+func loadWorkspaceStatusEnum(t *testing.T, migrationsDir string) map[string]struct{} {
+	t.Helper()
+
+	out := make(map[string]struct{})
+
+	files, err := filepath.Glob(filepath.Join(migrationsDir, "*.up.sql"))
+	if err != nil {
+		t.Fatalf("glob migrations: %v", err)
+	}
+	sort.Strings(files)
+
+	createRE := regexp.MustCompile(`(?is)CREATE\s+TYPE\s+workspace_status\s+AS\s+ENUM\s*\(([^)]+)\)`)
+	addValueRE := regexp.MustCompile(`(?i)ALTER\s+TYPE\s+workspace_status\s+ADD\s+VALUE(?:\s+IF\s+NOT\s+EXISTS)?\s+'([^']+)'`)
+	literalRE := regexp.MustCompile(`'([^']+)'`)
+
+	for _, f := range files {
+		body, err := os.ReadFile(f)
+		if err != nil {
+			t.Fatalf("read %s: %v", f, err)
+		}
+		for _, m := range createRE.FindAllStringSubmatch(string(body), -1) {
+			for _, lit := range literalRE.FindAllStringSubmatch(m[1], -1) {
+				out[lit[1]] = struct{}{}
+			}
+		}
+		for _, m := range addValueRE.FindAllStringSubmatch(string(body), -1) {
+			out[m[1]] = struct{}{}
+		}
+	}
+	return out
+}
+
+// loadAllWorkspaceStatuses parses workspace_status.go and extracts:
+//
+//   - Every `Status… WorkspaceStatus = "..."` declaration in the const block.
+//   - Every entry in the AllWorkspaceStatuses slice literal.
+//
+// The gate asserts the slice's set equals (or is a subset of) the const
+// block's set, so a new status added to the const block but forgotten
+// in AllWorkspaceStatuses surfaces here. AllWorkspaceStatuses is the
+// canonical "what the codebase expects the DB to accept" list — any
+// const not in the slice is unenforced by the gate.
+func loadAllWorkspaceStatuses(t *testing.T, statusFile string) map[string]struct{} {
+	t.Helper()
+
+	fset := token.NewFileSet()
+	f, err := parser.ParseFile(fset, statusFile, nil, parser.ParseComments)
+	if err != nil {
+		t.Fatalf("parse %s: %v", statusFile, err)
+	}
+
+	consts := make(map[string]string)        // const name → string value
+	var sliceEntries []string                 // identifiers used in AllWorkspaceStatuses
+	allWorkspaceStatusesFound := false
+
+	ast.Inspect(f, func(n ast.Node) bool {
+		switch decl := n.(type) {
+		case *ast.GenDecl:
+			if decl.Tok == token.CONST {
+				for _, spec := range decl.Specs {
+					vs, ok := spec.(*ast.ValueSpec)
+					if !ok {
+						continue
+					}
+					for i, name := range vs.Names {
+						if !strings.HasPrefix(name.Name, "Status") {
+							continue
+						}
+						if i >= len(vs.Values) {
+							continue
+						}
+						lit, ok := vs.Values[i].(*ast.BasicLit)
+						if !ok || lit.Kind != token.STRING {
+							continue
+						}
+						unquoted := strings.Trim(lit.Value, `"`)
+						consts[name.Name] = unquoted
+					}
+				}
+			}
+			if decl.Tok == token.VAR {
+				for _, spec := range decl.Specs {
+					vs, ok := spec.(*ast.ValueSpec)
+					if !ok {
+						continue
+					}
+					for i, name := range vs.Names {
+						if name.Name != "AllWorkspaceStatuses" {
+							continue
+						}
+						allWorkspaceStatusesFound = true
+						if i >= len(vs.Values) {
+							continue
+						}
+						composite, ok := vs.Values[i].(*ast.CompositeLit)
+						if !ok {
+							continue
+						}
+						for _, elt := range composite.Elts {
+							ident, ok := elt.(*ast.Ident)
+							if !ok {
+								continue
+							}
+							sliceEntries = append(sliceEntries, ident.Name)
+						}
+					}
+				}
+			}
+		}
+		return true
+	})
+
+	if !allWorkspaceStatusesFound {
+		t.Fatalf("AllWorkspaceStatuses not found in %s", statusFile)
+	}
+
+	// Cross-check: every slice entry must resolve to a known const.
+	out := make(map[string]struct{})
+	for _, entry := range sliceEntries {
+		v, ok := consts[entry]
+		if !ok {
+			t.Errorf("AllWorkspaceStatuses references undefined identifier %q in %s", entry, statusFile)
+			continue
+		}
+		out[v] = struct{}{}
+	}
+
+	// Cross-check: every const must be in the slice (otherwise the
+	// gate runs against an outdated source-of-truth list).
+	sliceSet := make(map[string]struct{}, len(sliceEntries))
+	for _, e := range sliceEntries {
+		sliceSet[e] = struct{}{}
+	}
+	for name := range consts {
+		if _, ok := sliceSet[name]; !ok {
+			t.Errorf(
+				"const %q is declared but missing from AllWorkspaceStatuses in %s — "+
+					"add it to the slice or the drift gate cannot enforce migration coverage for it",
+				name, statusFile,
+			)
+		}
+	}
+
+	return out
+}
+
+// findHardCodedStatusWrites walks workspace-server/ production .go files
+// (excluding *_test.go) and returns any string literal that contains a
+// `SET status = '<literal>'` write against the workspaces table. Uses Go
+// AST so quoted snippets in comments don't false-positive.
+func findHardCodedStatusWrites(t *testing.T, srcRoot string) []string {
+	t.Helper()
+
+	// Match `SET status = '<lit>'` only in strings that also reference
+	// the workspaces table — narrows out a2a_queue / agents / approvals
+	// which have their own status enums.
+	literalRE := regexp.MustCompile(`(?is)UPDATE\s+workspaces\b[^']*?SET\s+status\s*=\s*'([^']+)'`)
+
+	var hits []string
+	walkErr := filepath.Walk(srcRoot, func(path string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+		if info.IsDir() {
+			// Skip vendor + .git + migrations (literals there are intentional).
+			base := filepath.Base(path)
+			if base == "vendor" || base == ".git" || base == "migrations" {
+				return filepath.SkipDir
+			}
+			return nil
+		}
+		if !strings.HasSuffix(path, ".go") || strings.HasSuffix(path, "_test.go") {
+			return nil
+		}
+
+		fset := token.NewFileSet()
+		f, parseErr := parser.ParseFile(fset, path, nil, parser.ParseComments)
+		if parseErr != nil {
+			return nil
+		}
+
+		ast.Inspect(f, func(n ast.Node) bool {
+			lit, ok := n.(*ast.BasicLit)
+			if !ok || lit.Kind != token.STRING {
+				return true
+			}
+			s := lit.Value
+			if !strings.Contains(s, "UPDATE workspaces") && !strings.Contains(s, "UPDATE\nworkspaces") && !strings.Contains(s, "UPDATE\n\t\t\tworkspaces") {
+				return true
+			}
+			for _, m := range literalRE.FindAllStringSubmatch(s, -1) {
+				pos := fset.Position(lit.Pos())
+				rel, _ := filepath.Rel(srcRoot, path)
+				hits = append(hits, rel+":"+itoa(pos.Line)+" → SET status = '"+m[1]+"'")
+			}
+			return true
+		})
+		return nil
+	})
+	if walkErr != nil {
+		t.Fatalf("walk %s: %v", srcRoot, walkErr)
+	}
+	sort.Strings(hits)
+	return hits
+}
+
+func itoa(n int) string {
+	if n == 0 {
+		return "0"
+	}
+	neg := n < 0
+	if neg {
+		n = -n
+	}
+	var b [20]byte
+	i := len(b)
+	for n > 0 {
+		i--
+		b[i] = byte('0' + n%10)
+		n /= 10
+	}
+	if neg {
+		i--
+		b[i] = '-'
+	}
+	return string(b[i:])
+}
+
+func findRepoRoot(t *testing.T) string {
+	t.Helper()
+	dir, err := os.Getwd()
+	if err != nil {
+		t.Fatalf("getwd: %v", err)
+	}
+	for i := 0; i < 8; i++ {
+		if _, err := os.Stat(filepath.Join(dir, "workspace-server", "migrations")); err == nil {
+			return dir
+		}
+		parent := filepath.Dir(dir)
+		if parent == dir {
+			break
+		}
+		dir = parent
+	}
+	t.Fatalf("could not locate repo root with workspace-server/migrations from %s", dir)
+	return ""
+}
+
+func sortedKeys(m map[string]struct{}) []string {
+	out := make([]string, 0, len(m))
+	for k := range m {
+		out = append(out, k)
+	}
+	sort.Strings(out)
+	return out
+}
@@ -0,0 +1,318 @@
+package handlers
+
+// a2a_corpus_test.go — backward-compat replay gate for the A2A
+// JSON-RPC protocol surface. Every PR that touches
+// normalizeA2APayload OR bumps the a-2-a-sdk version pin runs
+// every shape in testdata/a2a_corpus/ through the current code
+// and asserts:
+//
+//   valid/   — every shape MUST parse without error and produce a
+//              canonical v0.3 payload (params.message.parts list).
+//
+//   invalid/ — every shape MUST be rejected with the documented
+//              status code and error substring. Pins the
+//              rejection contract so a future PR doesn't silently
+//              start accepting malformed payloads.
+//
+// Closes the gap that allowed the 2026-04-29 v0.2 → v0.3 silent-
+// drop bug (PR #2349). That bug shipped because the SDK bump PR
+// didn't replay v0.2-shaped inputs against the new code; the
+// shape-mismatch surfaced only in production when the receiver's
+// Pydantic validator silently rejected inbound messages.
+//
+// Adding to the corpus: see testdata/a2a_corpus/README.md.
+// Removing from valid/: breaking change, requires explicit
+// approval per the README.
+
+import (
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+const (
+	corpusValidDir   = "testdata/a2a_corpus/valid"
+	corpusInvalidDir = "testdata/a2a_corpus/invalid"
+)
+
+// metadataFields are the documentation-only keys the corpus loader
+// strips before passing the payload to normalizeA2APayload. They
+// are required for every corpus entry per the README policy.
+var metadataFields = []string{
+	"_comment",
+	"_added",
+	"_source",
+	"_expect_error",
+	"_expect_status",
+}
+
+// loadCorpusEntry reads one JSON file, parses it as a generic map,
+// extracts the metadata fields (including expected error/status for
+// invalid entries), strips them from the payload, and returns the
+// stripped JSON bytes ready for normalizeA2APayload.
+func loadCorpusEntry(t *testing.T, path string) (payload []byte, expectErr string, expectStatus int) {
+	t.Helper()
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("read %s: %v", path, err)
+	}
+	var doc map[string]interface{}
+	if err := json.Unmarshal(raw, &doc); err != nil {
+		t.Fatalf("parse %s as JSON: %v", path, err)
+	}
+	// Pull metadata before strip.
+	if v, ok := doc["_expect_error"].(string); ok {
+		expectErr = v
+	}
+	if v, ok := doc["_expect_status"].(float64); ok {
+		expectStatus = int(v)
+	}
+	for _, f := range metadataFields {
+		delete(doc, f)
+	}
+	payload, err = json.Marshal(doc)
+	if err != nil {
+		t.Fatalf("re-marshal %s after strip: %v", path, err)
+	}
+	return payload, expectErr, expectStatus
+}
+
+// listCorpus enumerates every .json file under dir and returns
+// (filename → full path). Sorted for stable test ordering.
+func listCorpus(t *testing.T, dir string) map[string]string {
+	t.Helper()
+	out := map[string]string{}
+	entries, err := os.ReadDir(dir)
+	if err != nil {
+		t.Fatalf("read %s: %v", dir, err)
+	}
+	for _, e := range entries {
+		if e.IsDir() || !strings.HasSuffix(e.Name(), ".json") {
+			continue
+		}
+		out[e.Name()] = filepath.Join(dir, e.Name())
+	}
+	if len(out) == 0 {
+		t.Fatalf("corpus dir %s is empty — at least one entry is required", dir)
+	}
+	return out
+}
+
+// TestA2ACorpus_ValidShapesParse replays every entry in valid/
+// through normalizeA2APayload and asserts:
+//  1. No error returned.
+//  2. The output's params.message.parts is a non-empty list
+//     (v0.3 canonical shape — the compat shim must have converted
+//     any v0.2 content field into parts).
+//  3. The output's params.message.messageId is non-empty (the
+//     normalizer auto-fills if the sender omitted it).
+//  4. The output's method matches the input's method (the
+//     normalizer is method-agnostic).
+//
+// One subtest per corpus entry — failures point directly at the
+// offending shape file.
+func TestA2ACorpus_ValidShapesParse(t *testing.T) {
+	t.Parallel()
+	for name, path := range listCorpus(t, corpusValidDir) {
+		t.Run(name, func(t *testing.T) {
+			payload, _, _ := loadCorpusEntry(t, path)
+
+			normalized, method, perr := normalizeA2APayload(payload)
+			if perr != nil {
+				t.Fatalf("valid/%s: normalizeA2APayload returned error %d: %v",
+					name, perr.Status, perr.Response)
+			}
+
+			// Read back the normalized payload to verify shape invariants.
+			var parsed map[string]interface{}
+			if err := json.Unmarshal(normalized, &parsed); err != nil {
+				t.Fatalf("valid/%s: normalized output not valid JSON: %v", name, err)
+			}
+
+			// Method-agnostic check — input method survives normalization.
+			if input := mustGetString(t, parsed, "method"); input != method {
+				t.Errorf("valid/%s: method mismatch — got %q, want %q",
+					name, method, input)
+			}
+
+			// Canonical v0.3 shape invariants: params.message.parts is a
+			// non-empty list, messageId is non-empty.
+			params := mustGetMap(t, parsed, "params")
+			msg := mustGetMap(t, params, "message")
+
+			parts, ok := msg["parts"].([]interface{})
+			if !ok {
+				t.Errorf("valid/%s: params.message.parts is not a list (got %T)",
+					name, msg["parts"])
+				return
+			}
+			if len(parts) == 0 {
+				t.Errorf("valid/%s: params.message.parts is empty — compat shim should have converted content", name)
+			}
+
+			if id := mustGetString(t, msg, "messageId"); id == "" {
+				t.Errorf("valid/%s: params.message.messageId is empty after normalization", name)
+			}
+
+			// content must NOT survive into the output — the shim
+			// deletes it after converting to parts. If the shim left
+			// content in place, downstream pydantic v0.3 would still
+			// reject because it doesn't know that field.
+			if _, hasContent := msg["content"]; hasContent {
+				t.Errorf("valid/%s: params.message.content survived normalization (compat shim should delete it)", name)
+			}
+		})
+	}
+}
+
+// TestA2ACorpus_InvalidShapesRejected replays every entry in
+// invalid/ through normalizeA2APayload and asserts the rejection
+// matches the documented contract — same status code AND error
+// substring as recorded in the corpus entry's metadata.
+//
+// Catches the regression class "future PR adds permissive defaults
+// that silently accept what we used to reject loud."
+func TestA2ACorpus_InvalidShapesRejected(t *testing.T) {
+	t.Parallel()
+	for name, path := range listCorpus(t, corpusInvalidDir) {
+		t.Run(name, func(t *testing.T) {
+			payload, expectErr, expectStatus := loadCorpusEntry(t, path)
+
+			if expectErr == "" {
+				t.Fatalf("invalid/%s: missing _expect_error metadata", name)
+			}
+			if expectStatus == 0 {
+				t.Fatalf("invalid/%s: missing _expect_status metadata", name)
+			}
+
+			_, _, perr := normalizeA2APayload(payload)
+			if perr == nil {
+				t.Fatalf("invalid/%s: normalizeA2APayload returned no error — should have rejected", name)
+			}
+			if perr.Status != expectStatus {
+				t.Errorf("invalid/%s: status = %d, want %d", name, perr.Status, expectStatus)
+			}
+
+			body, _ := json.Marshal(perr.Response)
+			if !strings.Contains(string(body), expectErr) {
+				t.Errorf("invalid/%s: error response %q does not contain expected substring %q",
+					name, string(body), expectErr)
+			}
+		})
+	}
+}
+
+// TestA2ACorpus_MalformedJSONRejected covers the case where the
+// body isn't valid JSON at all. The corpus is JSON-only so this
+// can't be expressed as a corpus entry; pin the contract inline.
+func TestA2ACorpus_MalformedJSONRejected(t *testing.T) {
+	t.Parallel()
+	cases := []struct {
+		name    string
+		payload []byte
+	}{
+		{"truncated_object", []byte(`{"jsonrpc":"2.0","method":"message/send"`)},
+		{"not_json_at_all", []byte(`this is not json`)},
+		{"empty_body", []byte(``)},
+		{"only_whitespace", []byte(`   `)},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			_, _, perr := normalizeA2APayload(tc.payload)
+			if perr == nil {
+				t.Fatalf("expected error for %s, got none", tc.name)
+			}
+			if perr.Status != http.StatusBadRequest {
+				t.Errorf("status = %d, want %d", perr.Status, http.StatusBadRequest)
+			}
+			body, _ := json.Marshal(perr.Response)
+			if !strings.Contains(string(body), "invalid JSON") {
+				t.Errorf("expected 'invalid JSON' in response, got %q", string(body))
+			}
+		})
+	}
+}
+
+// TestA2ACorpus_HasMinimumCoverage pins the corpus's
+// representativeness. The corpus must have at least one v0.2
+// entry (string content) and at least one v0.3 entry (parts list)
+// — losing either side of the schema bridge would silently drop
+// the most important coverage.
+func TestA2ACorpus_HasMinimumCoverage(t *testing.T) {
+	t.Parallel()
+	files := listCorpus(t, corpusValidDir)
+	hasV02 := false
+	hasV03 := false
+	for name := range files {
+		if strings.Contains(name, "v0_2_") {
+			hasV02 = true
+		}
+		if strings.Contains(name, "v0_3_") {
+			hasV03 = true
+		}
+	}
+	if !hasV02 {
+		t.Error("corpus has no v0_2_*.json entries — backward-compat coverage missing")
+	}
+	if !hasV03 {
+		t.Error("corpus has no v0_3_*.json entries — forward (canonical) coverage missing")
+	}
+}
+
+// TestA2ACorpus_EveryEntryHasMetadata pins the README policy:
+// every corpus entry MUST have _comment, _added, _source. Catches
+// the bad commit shape "added entry without explanation" before
+// review.
+func TestA2ACorpus_EveryEntryHasMetadata(t *testing.T) {
+	t.Parallel()
+	for _, dir := range []string{corpusValidDir, corpusInvalidDir} {
+		for name, path := range listCorpus(t, dir) {
+			t.Run(filepath.Base(dir)+"/"+name, func(t *testing.T) {
+				raw, err := os.ReadFile(path)
+				if err != nil {
+					t.Fatalf("read %s: %v", path, err)
+				}
+				var doc map[string]interface{}
+				if err := json.Unmarshal(raw, &doc); err != nil {
+					t.Fatalf("parse %s: %v", path, err)
+				}
+				required := []string{"_comment", "_added", "_source"}
+				if dir == corpusInvalidDir {
+					required = append(required, "_expect_error", "_expect_status")
+				}
+				for _, key := range required {
+					if _, ok := doc[key]; !ok {
+						t.Errorf("missing required metadata field %q", key)
+					}
+				}
+			})
+		}
+	}
+}
+
+func mustGetMap(t *testing.T, m map[string]interface{}, key string) map[string]interface{} {
+	t.Helper()
+	v, ok := m[key].(map[string]interface{})
+	if !ok {
+		t.Fatalf("expected %q to be a map, got %T", key, m[key])
+	}
+	return v
+}
+
+func mustGetString(t *testing.T, m map[string]interface{}, key string) string {
+	t.Helper()
+	v, ok := m[key].(string)
+	if !ok {
+		t.Fatalf("expected %q to be a string, got %T", key, m[key])
+	}
+	return v
+}
+
+// _ silences the unused-import linter for fmt in case future
+// helpers don't use it. Currently used by the t.Helper-style
+// formatters above (kept inline for clarity).
+var _ = fmt.Sprintf
@@ -13,6 +13,7 @@ import (
 	"errors"
 	"io"
 	"log"
+	"net"
 	"net/http"
 	"os"
 	"strconv"
@@ -21,8 +22,10 @@ import (

 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/registry"
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth"
 	"github.com/gin-gonic/gin"
 	"github.com/google/uuid"
 )
@@ -90,13 +93,47 @@ func isSystemCaller(callerID string) bool {
 const maxProxyResponseBody = 10 << 20

 // a2aClient is a shared HTTP client for proxying A2A requests to workspace agents.
-// No client-level timeout — timeouts are enforced per-request via context
-// deadlines: canvas = 5 min (Rule 3), agent-to-agent = 30 min (DoS cap). Do NOT
-// set a Client.Timeout here: it is enforced independently of ctx deadlines and
-// would pre-empt legitimate slow cold-start flows (e.g. Claude Code first-token
-// over OAuth can take 30-60s on boot). Callers that want a safety net should
-// build a context.WithTimeout themselves.
-var a2aClient = &http.Client{}
+//
+// Timeout model — three independent budgets, none of which gets in each other's way:
+//
+//   1. Client.Timeout — DELIBERATELY UNSET. Client.Timeout is a hard wall on
+//      the entire request including streamed body reads, and would pre-empt
+//      legitimate slow cold-start flows (Claude Code first-token over OAuth
+//      can take 30-60s on boot; long-running agent synthesis can stream
+//      tokens for minutes). Total-request budget is enforced per-request
+//      via context deadline (canvas = idle-only, agent-to-agent = 30 min ceiling).
+//
+//   2. Transport.DialContext — 10s connect timeout. When a workspace's EC2
+//      black-holes TCP connects (instance terminated mid-flight, security group
+//      flipped, NACL bug), the OS default is 75s on Linux / 21s on macOS — long
+//      enough that Cloudflare's ~100s edge timeout can fire first and surface
+//      a generic 502 page to canvas. 10s is well above realistic intra-region
+//      latencies and well below CF's edge timeout.
+//
+//   3. Transport.ResponseHeaderTimeout — 60s. From request-body-end to
+//      response-headers-start. Covers cold-start first-byte (the 30-60s OAuth
+//      flow above), with margin. Body streaming after headers is governed by
+//      the per-request context deadline, NOT this timeout — so multi-minute
+//      agent responses still work fine.
+//
+// The point of (2) and (3) is to surface a *structured* 503 from
+// handleA2ADispatchError when the workspace agent is unreachable, so canvas
+// gets `{"error":"workspace agent unreachable","restarting":true}` instead
+// of Cloudflare's opaque 502 error page. Without these, dead workspaces hang
+// long enough that CF gives up first and shows its own page.
+var a2aClient = &http.Client{
+	Transport: &http.Transport{
+		DialContext: (&net.Dialer{
+			Timeout:   10 * time.Second,
+			KeepAlive: 30 * time.Second,
+		}).DialContext,
+		ResponseHeaderTimeout: 60 * time.Second,
+		TLSHandshakeTimeout:   10 * time.Second,
+		// MaxIdleConns / IdleConnTimeout: stdlib defaults are fine; agent
+		// fan-in is bounded by the platform's broadcaster fan-out, not by
+		// connection-pool sizing.
+	},
+}

 type proxyA2AError struct {
 	Status   int
@@ -144,6 +181,35 @@ func isUpstreamBusyError(err error) bool {
 		strings.Contains(msg, "connection reset")
 }

+// isUpstreamDeadStatus returns true when the upstream HTTP status indicates
+// the workspace agent is unreachable / unresponsive at the network layer
+// (vs an agent-authored 5xx with a real body). Used by the proxy to gate
+// reactive container-dead detection + auto-restart.
+//
+//   - 502 Bad Gateway, 503 Service Unavailable, 504 Gateway Timeout: standard
+//     proxy-layer "upstream is broken" codes (Cloudflare, ELB, agent tunnel).
+//   - 521 Web Server Is Down: Cloudflare can't open TCP to origin (most
+//     direct dead-EC2 signal).
+//   - 522 Connection Timed Out: Cloudflare opened TCP but no response within
+//     ~15s — typical of SG/NACL flap or agent process hung.
+//   - 523 Origin Is Unreachable: Cloudflare can't route to origin (DNS or
+//     network-path failure).
+//   - 524 A Timeout Occurred: TCP succeeded, but origin didn't return
+//     headers within ~100s — agent process alive but wedged.
+//
+// We always probe IsRunning before acting, so a transient false positive
+// from this set just costs one CP API call.
+func isUpstreamDeadStatus(status int) bool {
+	switch status {
+	case http.StatusBadGateway, // 502
+		http.StatusServiceUnavailable, // 503
+		http.StatusGatewayTimeout,     // 504
+		521, 522, 523, 524:            // CF dead-origin family
+		return true
+	}
+	return false
+}
+
 func (e *proxyA2AError) Error() string {
 	if e == nil || e.Response == nil {
 		return "proxy a2a error"
@@ -192,6 +258,27 @@ func (h *WorkspaceHandler) ProxyA2A(c *gin.Context) {

 	callerID := c.GetHeader("X-Workspace-ID")

+	// #2306: when X-Workspace-ID isn't set, derive callerID from the bearer
+	// token's owning workspace. External callers (third-party SDKs, the
+	// channel plugin, etc.) authenticate purely via bearer and frequently
+	// don't set the header — without this, activity_logs.source_id ends up
+	// NULL and downstream consumers (notification peer_id, "Agent Comms by
+	// peer" tab, analytics) can't identify the sender. The bearer is the
+	// authoritative caller identity per the wsauth contract; the header is
+	// just a display/routing hint that must agree with it.
+	//
+	// Skip when an org-level token is in play (canvas/admin path) — those
+	// tokens grant org-wide access and don't bind to a single workspace.
+	if callerID == "" {
+		if _, isOrg := c.Get("org_token_id"); !isOrg {
+			if tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization")); tok != "" {
+				if wsID, err := wsauth.WorkspaceFromToken(ctx, db.DB, tok); err == nil {
+					callerID = wsID
+				}
+			}
+		}
+	}
+
 	// #761 SECURITY: reject requests where the client-supplied X-Workspace-ID
 	// contains a system-caller prefix. isSystemCaller() bypasses both token
 	// validation and CanCommunicate. On the public /a2a endpoint, system-caller
@@ -283,17 +370,54 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
 		return 0, nil, proxyErr
 	}

-	agentURL, proxyErr := h.resolveAgentURL(ctx, workspaceID)
-	if proxyErr != nil {
-		return 0, nil, proxyErr
-	}
-
+	// Normalize the JSON-RPC envelope BEFORE the poll-mode short-circuit
+	// so the activity_logs entry carries the protocol method name (initialize,
+	// message/send, etc.) — the polling agent uses that to dispatch the
+	// request body to the right handler. Doing it here also means a
+	// malformed payload fails the same way for push and poll callers
+	// (consistent 400 instead of "queued garbage").
 	normalizedBody, a2aMethod, proxyErr := normalizeA2APayload(body)
 	if proxyErr != nil {
 		return 0, nil, proxyErr
 	}
 	body = normalizedBody

+	// #2339 PR 2 — poll-mode short-circuit. When the target workspace
+	// is registered as delivery_mode=poll (e.g. an operator's laptop
+	// running molecule-mcp-claude-channel), the platform does NOT
+	// dispatch over HTTP — the agent has no public URL. Instead we record
+	// the A2A request to activity_logs and the agent picks it up via
+	// GET /activity?since_id= (PR 3).
+	//
+	// Returning here means we skip resolveAgentURL entirely (no SSRF check
+	// needed — there's no URL to validate; no DNS lookup against potentially-
+	// changing operator-side IPs) and skip the dispatch path completely
+	// (no Do(), no maybeMarkContainerDead). The response is a synthetic
+	// {status:"queued"} envelope so the caller (canvas, another workspace)
+	// knows delivery is acknowledged but pending consumption.
+	if lookupDeliveryMode(ctx, workspaceID) == models.DeliveryModePoll {
+		if logActivity {
+			h.logA2AReceiveQueued(ctx, workspaceID, callerID, body, a2aMethod)
+		}
+		respBody, marshalErr := json.Marshal(gin.H{
+			"status":        "queued",
+			"delivery_mode": models.DeliveryModePoll,
+			"method":        a2aMethod,
+		})
+		if marshalErr != nil {
+			return 0, nil, &proxyA2AError{
+				Status:   http.StatusInternalServerError,
+				Response: gin.H{"error": "failed to marshal poll-mode response"},
+			}
+		}
+		return http.StatusOK, respBody, nil
+	}
+
+	agentURL, proxyErr := h.resolveAgentURL(ctx, workspaceID)
+	if proxyErr != nil {
+		return 0, nil, proxyErr
+	}
+
 	startTime := time.Now()
 	resp, cancelFwd, err := h.dispatchA2A(ctx, workspaceID, agentURL, body, callerID)
 	if cancelFwd != nil {
@@ -362,6 +486,43 @@ func (h *WorkspaceHandler) proxyA2ARequest(ctx context.Context, workspaceID stri
 		if errMsg == "" {
 			errMsg = http.StatusText(resp.StatusCode)
 		}
+
+		// Upstream returned 502/503/504 (gateway/proxy failure). This is
+		// the "agent process is dead but the tunnel between us and the
+		// workspace is still up" signal — handleA2ADispatchError's
+		// network-error path doesn't run because Do() succeeded at the
+		// HTTP layer. Without this branch, the dead-agent failure mode
+		// surfaces to canvas as a generic 502 (and CF in front of the
+		// platform masks it with its own error page, hiding any
+		// structured response we might write).
+		//
+		// Treatment matches handleA2ADispatchError's container-dead path:
+		//   1. Probe IsRunning via maybeMarkContainerDead. If the
+		//      container truly is dead, mark workspace offline + kick
+		//      a restart goroutine.
+		//   2. Return a structured 503 with restarting=true + Retry-After
+		//      so canvas shows a useful "agent is restarting" message
+		//      (and CF doesn't intercept the 503 the way it does 502).
+		// If IsRunning reports the container is alive, we leave the
+		// upstream status untouched — the agent legitimately returned
+		// 502/503/504 (e.g. it's returning its own Bad-Gateway from
+		// some downstream call) and we shouldn't mistakenly recycle it.
+		//
+		// Empty body is the strong signal here — a CF-tunnel "no-origin"
+		// 502 has 0 bytes; an agent-authored 502 typically has a JSON
+		// error body. We probe IsRunning regardless (it's the
+		// authoritative check) but the empty-body case is what makes
+		// this fix necessary.
+		if isUpstreamDeadStatus(resp.StatusCode) {
+			if h.maybeMarkContainerDead(ctx, workspaceID) {
+				return 0, nil, &proxyA2AError{
+					Status:   http.StatusServiceUnavailable,
+					Headers:  map[string]string{"Retry-After": "15"},
+					Response: gin.H{"error": "workspace agent unreachable — container restart triggered", "restarting": true, "retry_after": 15},
+				}
+			}
+		}
+
 		return resp.StatusCode, respBody, &proxyA2AError{
 			Status:   resp.StatusCode,
 			Response: gin.H{"error": errMsg},
@@ -464,11 +625,54 @@ func normalizeA2APayload(body []byte) ([]byte, string, *proxyA2AError) {
 	}

 	// Ensure params.message.messageId exists (required by a2a-sdk)
+	// AND v0.2→v0.3 compat (#2345): when sender supplies
+	// params.message.content (v0.2) instead of params.message.parts
+	// (v0.3), wrap the content as a single text Part so the downstream
+	// a2a-sdk's v0.3 Pydantic validator accepts the message.
+	//
+	// Pre-fix: Design Director silently dropped briefs whose sender
+	// used v0.2 shape — Pydantic rejected at parse time, the rejection
+	// went only to logs, and the sender saw a happy 200/202.
+	//
+	// Reject loud (HTTP 400) when neither content nor parts is present;
+	// previously the SDK's own rejection happened post-handler-dispatch
+	// and was invisible to the original sender.
 	if params, ok := payload["params"].(map[string]interface{}); ok {
 		if msg, ok := params["message"].(map[string]interface{}); ok {
 			if _, hasID := msg["messageId"]; !hasID {
 				msg["messageId"] = uuid.New().String()
 			}
+			_, hasParts := msg["parts"]
+			rawContent, hasContent := msg["content"]
+			if !hasParts {
+				if hasContent {
+					switch v := rawContent.(type) {
+					case string:
+						msg["parts"] = []interface{}{
+							map[string]interface{}{"kind": "text", "text": v},
+						}
+					case []interface{}:
+						msg["parts"] = v
+					default:
+						return nil, "", &proxyA2AError{
+							Status: http.StatusBadRequest,
+							Response: gin.H{
+								"error": "invalid params.message.content type",
+								"hint":  "content must be a string (v0.2 compat) or omitted in favour of parts (v0.3)",
+							},
+						}
+					}
+					delete(msg, "content")
+				} else {
+					return nil, "", &proxyA2AError{
+						Status: http.StatusBadRequest,
+						Response: gin.H{
+							"error": "params.message must contain either 'parts' (v0.3) or 'content' (v0.2 compat)",
+							"hint":  "v0.3 example: {\"parts\":[{\"kind\":\"text\",\"text\":\"...\"}]}",
+						},
+					}
+				}
+			}
 		}
 	}

@@ -5,6 +5,7 @@ package handlers

 import (
 	"context"
+	"database/sql"
 	"encoding/json"
 	"errors"
 	"log"
@@ -13,6 +14,7 @@ import (
 	"time"

 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth"
 	"github.com/gin-gonic/gin"
 )
@@ -97,8 +99,16 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
 		}

 		idempotencyKey := extractIdempotencyKey(body)
+		// Honor params.expires_in_seconds when the caller specifies one. Zero
+		// (the unset default) → expiresAt = nil → infinite TTL preserved by
+		// DequeueNext. RFC #2331 Tier 1.
+		var expiresAt *time.Time
+		if secs := extractExpiresInSeconds(body); secs > 0 {
+			t := time.Now().Add(time.Duration(secs) * time.Second)
+			expiresAt = &t
+		}
 		if qid, depth, qerr := EnqueueA2A(
-			ctx, workspaceID, callerID, PriorityTask, body, a2aMethod, idempotencyKey,
+			ctx, workspaceID, callerID, PriorityTask, body, a2aMethod, idempotencyKey, expiresAt,
 		); qerr == nil {
 			log.Printf("ProxyA2A: target %s busy — enqueued as %s (depth=%d)", workspaceID, qid, depth)
 			respBody, _ := json.Marshal(gin.H{
@@ -131,29 +141,53 @@ func (h *WorkspaceHandler) handleA2ADispatchError(ctx context.Context, workspace
 }

 // maybeMarkContainerDead runs the reactive health check after a forward error.
-// If the workspace's Docker container is no longer running (and the workspace
-// isn't external), it marks the workspace offline, clears Redis state,
-// broadcasts WORKSPACE_OFFLINE, and triggers an async restart. Returns true
-// when the container was found dead.
+// If the workspace's compute (Docker container OR EC2 instance) is no longer
+// running (and the workspace isn't external), it marks the workspace offline,
+// clears Redis state, broadcasts WORKSPACE_OFFLINE, and triggers an async
+// restart. Returns true when the compute was found dead.
+//
+// Provisioner selection (mutually exclusive in production):
+//   - h.provisioner != nil  → local Docker deployment; IsRunning does docker inspect.
+//   - h.cpProv != nil       → SaaS / EC2 deployment; IsRunning calls CP's
+//                              /cp/workspaces/:id/status to read the EC2 state.
+//
+// Pre-fix this function ONLY consulted h.provisioner — for SaaS tenants
+// (h.provisioner=nil, h.cpProv=set) it short-circuited to false on every
+// call, so a dead EC2 agent would propagate upstream 502/503/504 to canvas
+// with no auto-recovery and Cloudflare in front would mask the response with
+// its own error page. The 2026-04-30 hongmingwang.moleculesai.app
+// canvas-chat-to-dead-workspace incident traces to exactly this gap.
 func (h *WorkspaceHandler) maybeMarkContainerDead(ctx context.Context, workspaceID string) bool {
 	var wsRuntime string
 	db.DB.QueryRowContext(ctx, `SELECT COALESCE(runtime, 'langgraph') FROM workspaces WHERE id = $1`, workspaceID).Scan(&wsRuntime)
-	if h.provisioner == nil || wsRuntime == "external" {
+	if wsRuntime == "external" {
 		return false
 	}
-	running, inspectErr := h.provisioner.IsRunning(ctx, workspaceID)
+	if h.provisioner == nil && h.cpProv == nil {
+		return false
+	}
+
+	var running bool
+	var inspectErr error
+	if h.provisioner != nil {
+		running, inspectErr = h.provisioner.IsRunning(ctx, workspaceID)
+	} else {
+		// SaaS path: ask the CP about the EC2 state. Same (true, err) on
+		// transport errors contract — keeps the caller on the alive path
+		// instead of triggering a restart cascade on a flaky CP call.
+		running, inspectErr = h.cpProv.IsRunning(ctx, workspaceID)
+	}
 	if inspectErr != nil {
-		// Transient Docker-daemon error (timeout, socket EOF, etc.). Post-
-		// #386, IsRunning returns (true, err) in this case — caller stays
-		// on the alive path and does not trigger a restart cascade. Log
-		// so the defect is visible without being destructive.
+		// Transient backend error (Docker daemon EOF, CP HTTP 5xx, etc.).
+		// IsRunning's contract returns (true, err) in this case so we stay
+		// on the alive path without triggering a restart cascade.
 		log.Printf("ProxyA2A: IsRunning for %s returned transient error (assuming alive): %v", workspaceID, inspectErr)
 	}
 	if running {
 		return false
 	}
 	log.Printf("ProxyA2A: container for %s is dead — marking offline and triggering restart", workspaceID)
-	if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = 'offline', updated_at = now() WHERE id = $1 AND status NOT IN ('removed', 'provisioning')`, workspaceID); err != nil {
+	if _, err := db.DB.ExecContext(ctx, `UPDATE workspaces SET status = $1, updated_at = now() WHERE id = $2 AND status NOT IN ('removed', 'provisioning')`, models.StatusOffline, workspaceID); err != nil {
 		log.Printf("ProxyA2A: failed to mark workspace %s offline: %v", workspaceID, err)
 	}
 	db.ClearWorkspaceKeys(ctx, workspaceID)
@@ -368,6 +402,74 @@ func parseUsageFromA2AResponse(body []byte) (inputTokens, outputTokens int64) {
 	return 0, 0
 }

+// lookupDeliveryMode returns the workspace's delivery_mode. On any DB
+// error or missing row it returns DeliveryModePush — the fail-closed
+// default. "Closed" here means "fall back to today's behavior (synchronous
+// dispatch)" rather than "fall back to drop the request silently into
+// activity_logs where the agent might never see it." A poll-mode workspace
+// that briefly reads as push will get its A2A request dispatched to the
+// stored URL (or a 502 if no URL); a push-mode workspace that briefly
+// reads as poll would get its request silently queued with no dispatch.
+// The first failure is loud + recoverable; the second is silent.
+//
+// The function is intentionally lookup-only — it never mutates the row.
+// The register handler (registry.go) is the only writer for delivery_mode.
+//
+// See #2339 PR 1 for the column + register-flow side; this is the
+// proxy-side read used for the short-circuit in proxyA2ARequest.
+func lookupDeliveryMode(ctx context.Context, workspaceID string) string {
+	var mode sql.NullString
+	err := db.DB.QueryRowContext(ctx,
+		`SELECT delivery_mode FROM workspaces WHERE id = $1`, workspaceID,
+	).Scan(&mode)
+	if err != nil {
+		if !errors.Is(err, sql.ErrNoRows) {
+			log.Printf("ProxyA2A: lookupDeliveryMode(%s) failed (%v) — defaulting to push", workspaceID, err)
+		}
+		return models.DeliveryModePush
+	}
+	if !mode.Valid || mode.String == "" {
+		return models.DeliveryModePush
+	}
+	if !models.IsValidDeliveryMode(mode.String) {
+		log.Printf("ProxyA2A: workspace %s has invalid delivery_mode=%q — defaulting to push", workspaceID, mode.String)
+		return models.DeliveryModePush
+	}
+	return mode.String
+}
+
+// logA2AReceiveQueued records a poll-mode "queued" A2A receive into
+// activity_logs. Same shape as logA2ASuccess but without ResponseBody
+// (there is no response yet — the polling agent will produce one when
+// it picks the request up). status="ok" because the request was
+// successfully queued; the consume side reports its own outcome.
+//
+// The activity_logs row is what the polling agent's GET /activity?since_id=
+// reads in PR 3 — that's how a poll-mode workspace receives inbound A2A
+// without a public URL.
+func (h *WorkspaceHandler) logA2AReceiveQueued(ctx context.Context, workspaceID, callerID string, body []byte, a2aMethod string) {
+	var wsName string
+	db.DB.QueryRowContext(ctx, `SELECT name FROM workspaces WHERE id = $1`, workspaceID).Scan(&wsName)
+	if wsName == "" {
+		wsName = workspaceID
+	}
+	summary := a2aMethod + " → " + wsName + " (queued for poll)"
+	go func(parent context.Context) {
+		logCtx, cancel := context.WithTimeout(context.WithoutCancel(parent), 30*time.Second)
+		defer cancel()
+		LogActivity(logCtx, h.broadcaster, ActivityParams{
+			WorkspaceID:  workspaceID,
+			ActivityType: "a2a_receive",
+			SourceID:     nilIfEmpty(callerID),
+			TargetID:     &workspaceID,
+			Method:       &a2aMethod,
+			Summary:      &summary,
+			RequestBody:  json.RawMessage(body),
+			Status:       "ok",
+		})
+	}(ctx)
+}
+
 // readUsageMap extracts input_tokens / output_tokens from the "usage" key of m.
 // Returns (0, 0, false) when the key is absent or contains no non-zero values.
 func readUsageMap(m map[string]json.RawMessage) (inputTokens, outputTokens int64, ok bool) {
@@ -11,10 +11,13 @@ import (
 	"net/http"
 	"net/http/httptest"
 	"os"
+	"strings"
 	"testing"
 	"time"

 	"github.com/DATA-DOG/go-sqlmock"
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
 	"github.com/gin-gonic/gin"
 )

@@ -243,6 +246,117 @@ func TestProxyA2A_AgentReturnsError(t *testing.T) {
 	}
 }

+// TestProxyA2A_Upstream502_TriggersContainerDeadCheck — when the agent
+// tunnel returns 502 (the "tunnel up but no origin" failure mode that
+// surfaces a Cloudflare error page to canvas), proxyA2A must consult
+// IsRunning on cpProv. If the EC2 instance truly is dead, the response
+// becomes a structured 503 with restarting=true (not the upstream 502
+// which CF would mask), and the workspace flips to status='offline' so
+// the next reactive poll sees the right state. This is the
+// 2026-04-30 hongmingwang.moleculesai.app canvas-chat-to-dead-workspace
+// regression: upstream 502 was previously propagated as-is, CF masked
+// it, and no auto-restart fired.
+func TestProxyA2A_Upstream502_TriggersContainerDeadCheck(t *testing.T) {
+	mock := setupTestDB(t)
+	mr := setupTestRedis(t)
+	allowLoopbackForTest(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+	cp := &fakeCPProv{running: false}
+	handler.SetCPProvisioner(cp)
+
+	// Agent tunnel returns 502 with empty body — the CF "no-origin" shape.
+	agentServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusBadGateway)
+	}))
+	defer agentServer.Close()
+
+	mr.Set(fmt.Sprintf("ws:%s:url", "ws-tunnel-dead"), agentServer.URL)
+	expectBudgetCheck(mock, "ws-tunnel-dead")
+	// Activity log fires (delivery_confirmed is true on Do() success regardless
+	// of upstream status — handler's existing logA2ASuccess path runs first
+	// and logs as success because the dispatch did get a response).
+	mock.ExpectExec("INSERT INTO activity_logs").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+	// maybeMarkContainerDead's runtime lookup, then the offline-flip UPDATE.
+	mock.ExpectQuery(`SELECT COALESCE\(runtime, 'langgraph'\) FROM workspaces WHERE id =`).
+		WithArgs("ws-tunnel-dead").
+		WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
+	mock.ExpectExec(`UPDATE workspaces SET status =`).
+		WithArgs(models.StatusOffline, "ws-tunnel-dead").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-tunnel-dead"}}
+	body := `{"method":"message/send","params":{"message":{"role":"user","parts":[{"text":"hi"}]}}}`
+	c.Request = httptest.NewRequest("POST", "/workspaces/ws-tunnel-dead/a2a", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.ProxyA2A(c)
+
+	time.Sleep(80 * time.Millisecond)
+
+	// Caller sees a structured 503 (NOT the upstream 502 which CF would mask).
+	if w.Code != http.StatusServiceUnavailable {
+		t.Fatalf("upstream 502 should translate to 503 once cpProv reports dead; got %d: %s", w.Code, w.Body.String())
+	}
+	if !strings.Contains(w.Body.String(), "restarting") {
+		t.Errorf("response body should mention restart trigger; got %s", w.Body.String())
+	}
+	if w.Header().Get("Retry-After") != "15" {
+		t.Errorf("Retry-After header should be 15 to throttle canvas-side retry loop; got %q", w.Header().Get("Retry-After"))
+	}
+	if cp.calls != 1 {
+		t.Errorf("cpProv.IsRunning must be consulted exactly once; got %d calls", cp.calls)
+	}
+}
+
+// TestProxyA2A_Upstream502_AliveAgent_PropagatesAsIs — the safety check:
+// if cpProv reports the EC2 IS running, the upstream 502 is propagated
+// as-is. Don't recycle a healthy agent on a transient hiccup — the agent
+// might have legitimately returned 502 (e.g. a downstream service it
+// called returned 502 and it forwarded). Net behavior matches pre-fix
+// for the alive-agent case.
+func TestProxyA2A_Upstream502_AliveAgent_PropagatesAsIs(t *testing.T) {
+	mock := setupTestDB(t)
+	mr := setupTestRedis(t)
+	allowLoopbackForTest(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+	cp := &fakeCPProv{running: true}
+	handler.SetCPProvisioner(cp)
+
+	agentServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(http.StatusBadGateway)
+		fmt.Fprint(w, `{"error":"downstream service returned 502"}`)
+	}))
+	defer agentServer.Close()
+
+	mr.Set(fmt.Sprintf("ws:%s:url", "ws-alive-502"), agentServer.URL)
+	expectBudgetCheck(mock, "ws-alive-502")
+	mock.ExpectExec("INSERT INTO activity_logs").WillReturnResult(sqlmock.NewResult(0, 1))
+	// IsRunning runtime lookup runs but no UPDATE follows (running=true).
+	mock.ExpectQuery(`SELECT COALESCE\(runtime, 'langgraph'\) FROM workspaces WHERE id =`).
+		WithArgs("ws-alive-502").
+		WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-alive-502"}}
+	body := `{"method":"message/send","params":{"message":{"role":"user","parts":[{"text":"hi"}]}}}`
+	c.Request = httptest.NewRequest("POST", "/workspaces/ws-alive-502/a2a", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.ProxyA2A(c)
+	time.Sleep(50 * time.Millisecond)
+
+	if w.Code != http.StatusBadGateway {
+		t.Fatalf("alive agent 502 should propagate as 502; got %d: %s", w.Code, w.Body.String())
+	}
+}
+
 // ==================== ProxyA2A — messageId injection ====================

 func TestProxyA2A_MessageIDInjected(t *testing.T) {
@@ -504,6 +618,182 @@ func TestA2AProxy_SystemCallerForge_IsRejected(t *testing.T) {
 	}
 }

+// ==================== ProxyA2A — bearer-derived callerID (#2306) ====================
+
+// TestProxyA2A_CallerIDDerivedFromBearer verifies that when X-Workspace-ID
+// is absent, ProxyA2A derives the callerID from the bearer token's owning
+// workspace. Without this, third-party SDKs that authenticate purely via
+// bearer end up with activity_logs.source_id=NULL, breaking peer_id and
+// "Agent Comms by peer" downstream signals.
+func TestProxyA2A_CallerIDDerivedFromBearer(t *testing.T) {
+	mock := setupTestDB(t)
+	mr := setupTestRedis(t)
+	allowLoopbackForTest(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+
+	agentServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		fmt.Fprint(w, `{"jsonrpc":"2.0","id":"1","result":{}}`)
+	}))
+	defer agentServer.Close()
+	mr.Set(fmt.Sprintf("ws:%s:url", "ws-target"), agentServer.URL)
+
+	// 1. Bearer-derive lookup → returns ws-caller
+	mock.ExpectQuery(`SELECT t\.id, t\.workspace_id.*FROM workspace_auth_tokens t.*JOIN workspaces`).
+		WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-1", "ws-caller"))
+
+	// 2. validateCallerToken's HasAnyLiveToken / ValidateToken queries fall
+	//    through to fail-open (no expectations set) — same pattern as
+	//    TestProxyA2A_CallerIDPropagated.
+
+	// 3. CanCommunicate — siblings under same parent
+	mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id = ").
+		WithArgs("ws-caller").
+		WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-caller", "ws-parent"))
+	mock.ExpectQuery("SELECT id, parent_id FROM workspaces WHERE id = ").
+		WithArgs("ws-target").
+		WillReturnRows(sqlmock.NewRows([]string{"id", "parent_id"}).AddRow("ws-target", "ws-parent"))
+
+	expectBudgetCheck(mock, "ws-target")
+
+	// 4. activity_logs INSERT — verify source_id arg is the derived ws-caller
+	//    (column order: workspace_id, activity_type, source_id, target_id, ...)
+	mock.ExpectExec("INSERT INTO activity_logs").
+		WithArgs(
+			"ws-target",                       // $1 workspace_id
+			"a2a_receive",                     // $2 activity_type
+			sqlmock.AnyArg(),                  // $3 source_id — *string("ws-caller"), checked below
+			sqlmock.AnyArg(),                  // $4 target_id
+			sqlmock.AnyArg(),                  // $5 method
+			sqlmock.AnyArg(),                  // $6 summary
+			sqlmock.AnyArg(),                  // $7 request_body
+			sqlmock.AnyArg(),                  // $8 response_body
+			sqlmock.AnyArg(),                  // $9 tool_trace
+			sqlmock.AnyArg(),                  // $10 duration_ms
+			sqlmock.AnyArg(),                  // $11 status
+			sqlmock.AnyArg(),                  // $12 error_detail
+		).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-target"}}
+
+	body := `{"method":"message/send","params":{"message":{"role":"user","parts":[{"text":"test"}]}}}`
+	c.Request = httptest.NewRequest("POST", "/workspaces/ws-target/a2a", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+	// NOTE: no X-Workspace-ID — the bearer must be the only callerID source.
+	c.Request.Header.Set("Authorization", "Bearer some-bearer-token")
+
+	handler.ProxyA2A(c)
+	time.Sleep(50 * time.Millisecond) // allow LogActivity goroutine to flush
+
+	if w.Code != http.StatusOK {
+		t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestProxyA2A_OrgTokenSkipsBearerDerive verifies that when an org-level
+// token is in play (canvas/admin path), the bearer-derive logic is skipped
+// even if the bearer matches a workspace token. Org tokens grant org-wide
+// access and don't bind to a single workspace; treating them as a workspace
+// caller would mis-attribute activity logs.
+func TestProxyA2A_OrgTokenSkipsBearerDerive(t *testing.T) {
+	mock := setupTestDB(t)
+	mr := setupTestRedis(t)
+	allowLoopbackForTest(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+
+	agentServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		fmt.Fprint(w, `{"jsonrpc":"2.0","id":"1","result":{}}`)
+	}))
+	defer agentServer.Close()
+	mr.Set(fmt.Sprintf("ws:%s:url", "ws-target"), agentServer.URL)
+
+	// No WorkspaceFromToken expectation — the bearer-derive branch must NOT
+	// fire when org_token_id is set.
+	expectBudgetCheck(mock, "ws-target")
+
+	// Activity log INSERT with NULL source_id (canvas-class semantics).
+	mock.ExpectExec("INSERT INTO activity_logs").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-target"}}
+	c.Set("org_token_id", "org-token-123") // org-level auth
+
+	body := `{"method":"message/send","params":{"message":{"role":"user","parts":[{"text":"hi"}]}}}`
+	c.Request = httptest.NewRequest("POST", "/workspaces/ws-target/a2a", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+	c.Request.Header.Set("Authorization", "Bearer org-bearer")
+
+	handler.ProxyA2A(c)
+	time.Sleep(50 * time.Millisecond)
+
+	if w.Code != http.StatusOK {
+		t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestProxyA2A_BearerDeriveFailureFallsThrough verifies that if the bearer
+// is present but doesn't resolve (e.g. revoked, removed workspace), the
+// callerID stays empty and the request is treated as canvas-class — we
+// don't 401, we don't error; we just lose the source_id signal. Mirrors
+// the canvas-bypass shape so legacy/anonymous paths aren't broken.
+func TestProxyA2A_BearerDeriveFailureFallsThrough(t *testing.T) {
+	mock := setupTestDB(t)
+	mr := setupTestRedis(t)
+	allowLoopbackForTest(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+
+	agentServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		fmt.Fprint(w, `{"jsonrpc":"2.0","id":"1","result":{}}`)
+	}))
+	defer agentServer.Close()
+	mr.Set(fmt.Sprintf("ws:%s:url", "ws-target"), agentServer.URL)
+
+	// Bearer-derive lookup fails (no live row) — collapses to ErrInvalidToken
+	// inside WorkspaceFromToken; ProxyA2A swallows the error and proceeds with
+	// callerID="".
+	mock.ExpectQuery(`SELECT t\.id, t\.workspace_id.*FROM workspace_auth_tokens t.*JOIN workspaces`).
+		WillReturnError(sql.ErrNoRows)
+
+	expectBudgetCheck(mock, "ws-target")
+	mock.ExpectExec("INSERT INTO activity_logs").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-target"}}
+
+	body := `{"method":"message/send","params":{"message":{"role":"user","parts":[{"text":"hi"}]}}}`
+	c.Request = httptest.NewRequest("POST", "/workspaces/ws-target/a2a", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+	c.Request.Header.Set("Authorization", "Bearer revoked-or-stale")
+
+	handler.ProxyA2A(c)
+	time.Sleep(50 * time.Millisecond)
+
+	if w.Code != http.StatusOK {
+		t.Errorf("expected 200 (canvas-fallback), got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
 func TestIsSystemCaller(t *testing.T) {
 	cases := []struct {
 		caller   string
@@ -630,6 +920,46 @@ func TestIsUpstreamBusyError(t *testing.T) {
 	}
 }

+// TestIsUpstreamDeadStatus locks in the status-code matrix that gates
+// reactive container-dead detection. Order matters: the helper exists so
+// the proxy + any future caller (e.g. a sweeper) classify CF dead-origin
+// codes the same way. Drift here would re-introduce the SaaS-blind bug
+// for whichever code we forgot.
+func TestIsUpstreamDeadStatus(t *testing.T) {
+	cases := []struct {
+		name   string
+		status int
+		want   bool
+	}{
+		// Standard proxy-layer dead-upstream codes
+		{"502 BadGateway", 502, true},
+		{"503 ServiceUnavailable", 503, true},
+		{"504 GatewayTimeout", 504, true},
+		// Cloudflare dead-origin family
+		{"521 WebServerDown", 521, true},
+		{"522 ConnectionTimedOut", 522, true},
+		{"523 OriginUnreachable", 523, true},
+		{"524 OriginTimedOut", 524, true},
+		// Negative cases — must NOT trigger restart
+		{"200 OK", 200, false},
+		{"400 BadRequest (agent rejected payload)", 400, false},
+		{"401 Unauthorized", 401, false},
+		{"404 NotFound (no such session)", 404, false},
+		{"408 RequestTimeout (client-side)", 408, false},
+		{"429 TooManyRequests (rate limited, agent alive)", 429, false},
+		{"500 InternalServerError (agent crashed mid-request)", 500, false},
+		{"501 NotImplemented", 501, false},
+		{"505 HTTPVersionNotSupported", 505, false},
+		{"520 WebServerReturnedUnknown (agent returned malformed)", 520, false},
+		{"525 SSLHandshakeFailed (TLS misconfig, not dead origin)", 525, false},
+	}
+	for _, tc := range cases {
+		if got := isUpstreamDeadStatus(tc.status); got != tc.want {
+			t.Errorf("%s: isUpstreamDeadStatus(%d) = %v, want %v", tc.name, tc.status, got, tc.want)
+		}
+	}
+}
+
 // ==================== ProxyA2A — upstream timeout returns 503 busy + Retry-After ====================

 // Verifies the full error-shaping contract for the 503-busy path:
@@ -961,7 +1291,10 @@ func TestNormalizeA2APayload_PreservesExistingMessageId(t *testing.T) {
 }

 func TestNormalizeA2APayload_MissingMethodReturnsEmpty(t *testing.T) {
-	raw := []byte(`{"params":{"message":{"role":"user"}}}`)
+	// Method extraction returns empty string when method is absent,
+	// regardless of message validity. Include parts: [] so the v0.2→v0.3
+	// compat check (#2345) doesn't reject before method extraction.
+	raw := []byte(`{"params":{"message":{"role":"user","parts":[]}}}`)
 	_, method, perr := normalizeA2APayload(raw)
 	if perr != nil {
 		t.Fatalf("unexpected error: %+v", perr)
@@ -971,6 +1304,102 @@ func TestNormalizeA2APayload_MissingMethodReturnsEmpty(t *testing.T) {
 	}
 }

+// --- v0.2 → v0.3 compat shim (#2345) ---
+
+func TestNormalizeA2APayload_ConvertsV02StringContentToParts(t *testing.T) {
+	raw := []byte(`{"method":"message/send","params":{"message":{"role":"user","content":"hello world"}}}`)
+	out, _, perr := normalizeA2APayload(raw)
+	if perr != nil {
+		t.Fatalf("unexpected error: %+v", perr)
+	}
+	var parsed map[string]interface{}
+	if err := json.Unmarshal(out, &parsed); err != nil {
+		t.Fatalf("output not valid JSON: %v", err)
+	}
+	msg := parsed["params"].(map[string]interface{})["message"].(map[string]interface{})
+	if _, stillHasContent := msg["content"]; stillHasContent {
+		t.Error("v0.2 'content' field should be removed after conversion")
+	}
+	parts, ok := msg["parts"].([]interface{})
+	if !ok || len(parts) != 1 {
+		t.Fatalf("expected 1 part, got %v", msg["parts"])
+	}
+	part := parts[0].(map[string]interface{})
+	if part["kind"] != "text" || part["text"] != "hello world" {
+		t.Errorf("expected {kind:text, text:'hello world'}, got %v", part)
+	}
+}
+
+func TestNormalizeA2APayload_ConvertsV02ListContentToParts(t *testing.T) {
+	raw := []byte(`{"method":"message/send","params":{"message":{"role":"user","content":[{"kind":"text","text":"hi"}]}}}`)
+	out, _, perr := normalizeA2APayload(raw)
+	if perr != nil {
+		t.Fatalf("unexpected error: %+v", perr)
+	}
+	var parsed map[string]interface{}
+	_ = json.Unmarshal(out, &parsed)
+	msg := parsed["params"].(map[string]interface{})["message"].(map[string]interface{})
+	parts, ok := msg["parts"].([]interface{})
+	if !ok || len(parts) != 1 {
+		t.Fatalf("expected list preserved as parts, got %v", msg["parts"])
+	}
+}
+
+func TestNormalizeA2APayload_PreservesV03Parts(t *testing.T) {
+	raw := []byte(`{"method":"message/send","params":{"message":{"role":"user","parts":[{"kind":"text","text":"hi"}]}}}`)
+	out, _, perr := normalizeA2APayload(raw)
+	if perr != nil {
+		t.Fatalf("unexpected error: %+v", perr)
+	}
+	var parsed map[string]interface{}
+	_ = json.Unmarshal(out, &parsed)
+	msg := parsed["params"].(map[string]interface{})["message"].(map[string]interface{})
+	if _, hasContent := msg["content"]; hasContent {
+		t.Error("did not expect content field in v0.3-shaped payload output")
+	}
+	parts := msg["parts"].([]interface{})
+	if len(parts) != 1 {
+		t.Errorf("expected 1 part preserved, got %d", len(parts))
+	}
+}
+
+func TestNormalizeA2APayload_RejectsMessageWithNeitherContentNorParts(t *testing.T) {
+	raw := []byte(`{"method":"message/send","params":{"message":{"role":"user","metadata":{}}}}`)
+	_, _, perr := normalizeA2APayload(raw)
+	if perr == nil {
+		t.Fatal("expected error for message with neither content nor parts")
+	}
+	if perr.Status != http.StatusBadRequest {
+		t.Errorf("expected 400, got %d", perr.Status)
+	}
+	errMsg, _ := perr.Response["error"].(string)
+	if !strings.Contains(errMsg, "parts") || !strings.Contains(errMsg, "content") {
+		t.Errorf("error message should mention both 'parts' and 'content', got: %q", errMsg)
+	}
+}
+
+func TestNormalizeA2APayload_RejectsContentWithUnsupportedType(t *testing.T) {
+	raw := []byte(`{"method":"message/send","params":{"message":{"role":"user","content":42}}}`)
+	_, _, perr := normalizeA2APayload(raw)
+	if perr == nil {
+		t.Fatal("expected error for non-string non-list content")
+	}
+	if perr.Status != http.StatusBadRequest {
+		t.Errorf("expected 400, got %d", perr.Status)
+	}
+}
+
+func TestNormalizeA2APayload_NoMessageNoCheck(t *testing.T) {
+	raw := []byte(`{"method":"tasks/list","params":{}}`)
+	_, method, perr := normalizeA2APayload(raw)
+	if perr != nil {
+		t.Fatalf("unexpected error on params-message-absent payload: %+v", perr)
+	}
+	if method != "tasks/list" {
+		t.Errorf("expected method=tasks/list, got %q", method)
+	}
+}
+
 // --- resolveAgentURL direct unit tests ---

 func TestResolveAgentURL_CacheHit(t *testing.T) {
@@ -1364,6 +1793,143 @@ func TestMaybeMarkContainerDead_NilProvisioner(t *testing.T) {
 	}
 }

+// SaaS path: h.provisioner=nil but h.cpProv is wired and reports the EC2
+// instance is NOT running. maybeMarkContainerDead must consult cpProv,
+// flip the workspace to status='offline', clear keys, broadcast OFFLINE,
+// and return true so the caller surfaces the structured 503. Pre-fix
+// (#NNN) it returned false unconditionally on h.provisioner==nil, so
+// dead EC2 agents leaked upstream 502 to canvas with no recovery.
+func TestMaybeMarkContainerDead_CPOnly_NotRunning(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+	cp := &fakeCPProv{running: false}
+	handler.SetCPProvisioner(cp)
+
+	mock.ExpectQuery(`SELECT COALESCE\(runtime, 'langgraph'\) FROM workspaces WHERE id =`).
+		WithArgs("ws-saas-dead").
+		WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
+	mock.ExpectExec(`UPDATE workspaces SET status =`).
+		WithArgs(models.StatusOffline, "ws-saas-dead").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	got := handler.maybeMarkContainerDead(context.Background(), "ws-saas-dead")
+	if !got {
+		t.Fatal("expected true (cpProv reports not running) — without cpProv consultation, SaaS dead-agent recovery is impossible")
+	}
+	if cp.calls != 1 {
+		t.Errorf("expected exactly 1 IsRunning call on cpProv; got %d", cp.calls)
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// SaaS path: h.cpProv reports running=true → maybeMarkContainerDead must
+// return false (don't restart a healthy agent on a transient upstream
+// hiccup). This is the safety check that prevents over-eager recycling.
+func TestMaybeMarkContainerDead_CPOnly_Running(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+	cp := &fakeCPProv{running: true}
+	handler.SetCPProvisioner(cp)
+
+	mock.ExpectQuery(`SELECT COALESCE\(runtime, 'langgraph'\) FROM workspaces WHERE id =`).
+		WithArgs("ws-saas-alive").
+		WillReturnRows(sqlmock.NewRows([]string{"runtime"}).AddRow("hermes"))
+
+	if got := handler.maybeMarkContainerDead(context.Background(), "ws-saas-alive"); got {
+		t.Error("expected false when cpProv reports running — must not recycle a healthy agent")
+	}
+	if cp.calls != 1 {
+		t.Errorf("expected exactly 1 IsRunning call on cpProv; got %d", cp.calls)
+	}
+}
+
+// SaaS-path runRestartCycle: when h.provisioner is nil and h.cpProv is set,
+// the auto-restart cycle MUST call cpProv.Stop (not Docker provisioner.Stop).
+// Pre-fix this dispatched only to h.provisioner.Stop, NPE'd on nil, was
+// silently swallowed by coalesceRestart's recover-without-re-raise, and
+// left the workspace stuck in status='provisioning' forever — making
+// reactive auto-restart on SaaS effectively dead code. The independent
+// review of PR #2362 caught this gap.
+//
+// We drive runRestartCycle directly (not via RestartByID/coalesceRestart)
+// so we don't fight the goroutine's timing in a unit test. The full
+// restart chain (provisionWorkspaceCP) needs its own mocked DB rows that
+// would explode the surface area of this test; what we care about here
+// is the dispatch decision, which is observable on cpProv.stopCalls.
+// stopForRestart is the dispatch helper extracted from runRestartCycle so the
+// branch logic can be tested without spawning the async sendRestartContext
+// goroutine that the full cycle fires. Pre-fix runRestartCycle's Stop dispatch
+// only called the Docker path, so on SaaS (h.provisioner=nil) the cycle NPE'd
+// silently and left the workspace stuck in status='provisioning'.
+func TestStopForRestart_SaaSPath_DispatchesViaCPProv(t *testing.T) {
+	setupTestRedis(t)
+	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+	cp := &fakeCPProv{}
+	handler.SetCPProvisioner(cp)
+
+	handler.stopForRestart(context.Background(), "ws-saas-restart")
+
+	if cp.stopCalls != 1 {
+		t.Fatalf("expected cpProv.Stop to be called once on SaaS auto-restart; got %d", cp.stopCalls)
+	}
+	if cp.startCalls != 0 {
+		t.Fatalf("expected cpProv.Start NOT to be called by stopForRestart; got %d", cp.startCalls)
+	}
+}
+
+// Both nil → no-op, no panic, no DB / broadcast side effects. Guards the
+// dispatcher against being invoked on a misconfigured handler. Important
+// because runRestartCycle's surrounding flow (status='provisioning' UPDATE
+// + broadcast) MUST happen even when both provisioners are nil — but
+// stopForRestart itself is a pure dispatcher and shouldn't touch state.
+func TestStopForRestart_NoProvisioner_NoOp(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	handler := NewWorkspaceHandler(newTestBroadcaster(), nil, "http://localhost:8080", t.TempDir())
+	// no provisioner, no cpProv, no DB expectations set on mock — any
+	// unexpected query/exec will produce a sqlmock error.
+	handler.stopForRestart(context.Background(), "ws-orphan")
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("stopForRestart no-provisioner path should not touch DB: %v", err)
+	}
+}
+
+// fakeCPProv satisfies provisioner.CPProvisionerAPI for tests that exercise
+// the SaaS / EC2-backed reactive-health path.
+//
+// Methods all record calls. Start/Stop/GetConsoleOutput return nil/empty by
+// default — the maybeMarkContainerDead happy path triggers an async
+// `go h.RestartByID(...)` which calls Stop, so the previous "panic on
+// unexpected call" pattern was unsafe (the panic fires on a goroutine,
+// after the assertions ran). Tests that want to ASSERT a method is unused
+// can check `calls == 0` after a sync barrier.
+type fakeCPProv struct {
+	running    bool
+	calls      int
+	stopCalls  int
+	startCalls int
+}
+
+func (f *fakeCPProv) Start(_ context.Context, _ provisioner.WorkspaceConfig) (string, error) {
+	f.startCalls++
+	return "", nil
+}
+func (f *fakeCPProv) Stop(_ context.Context, _ string) error {
+	f.stopCalls++
+	return nil
+}
+func (f *fakeCPProv) GetConsoleOutput(_ context.Context, _ string) (string, error) {
+	return "", nil
+}
+func (f *fakeCPProv) IsRunning(_ context.Context, _ string) (bool, error) {
+	f.calls++
+	return f.running, nil
+}
+
 // external runtime → false regardless of provisioner.
 func TestMaybeMarkContainerDead_ExternalRuntime(t *testing.T) {
 	mock := setupTestDB(t)
@@ -1528,3 +2094,185 @@ func TestResolveAgentURL_HibernatedWorkspace_NullURLVariant(t *testing.T) {
 		t.Errorf("unmet DB expectations: %v", err)
 	}
 }
+
+// ==================== ProxyA2A — poll-mode short-circuit (#2339 PR 2) ====================
+
+// TestProxyA2A_PollMode_ShortCircuits_NoSSRF_NoDispatch verifies the core
+// invariant of #2339 PR 2: when delivery_mode=poll, ProxyA2A must NOT
+// hit resolveAgentURL (which would SSRF-check or 502 on a missing URL)
+// and must NOT dispatch over HTTP. It records the request to activity_logs
+// and returns 200 {status:"queued"} instead.
+//
+// Without this short-circuit, the canvas chat fails for any workspace
+// running molecule-mcp-claude-channel (operator's laptop, no public URL):
+// resolveAgentURL would 502 on the missing URL and the polling agent
+// would never see the inbound message. That's the bug PR 2 fixes.
+func TestProxyA2A_PollMode_ShortCircuits_NoSSRF_NoDispatch(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+
+	const wsID = "ws-poll-shortcircuit"
+
+	// Budget check still runs (above the short-circuit) — affirms the
+	// budget guard is mode-agnostic, which is correct: a poll-mode
+	// workspace shouldn't burn unmetered platform CPU/storage either.
+	expectBudgetCheck(mock, wsID)
+
+	// lookupDeliveryMode SELECT — returns poll, triggering the short-circuit.
+	// Note: NO ExpectQuery for `SELECT url, status FROM workspaces` (that's
+	// resolveAgentURL's query) — the short-circuit must skip resolveAgentURL.
+	mock.ExpectQuery("SELECT delivery_mode FROM workspaces WHERE id").
+		WithArgs(wsID).
+		WillReturnRows(sqlmock.NewRows([]string{"delivery_mode"}).AddRow("poll"))
+
+	// Activity log: the queued receive (logA2AReceiveQueued in helpers.go).
+	mock.ExpectExec("INSERT INTO activity_logs").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: wsID}}
+
+	body := `{"jsonrpc":"2.0","id":"poll-1","method":"message/send","params":{"message":{"role":"user","parts":[{"text":"hi"}]}}}`
+	c.Request = httptest.NewRequest("POST", "/workspaces/"+wsID+"/a2a", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.ProxyA2A(c)
+
+	time.Sleep(50 * time.Millisecond)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200 (queued), got %d: %s", w.Code, w.Body.String())
+	}
+
+	var resp map[string]interface{}
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+		t.Fatalf("response is not valid JSON: %v", err)
+	}
+	if resp["status"] != "queued" {
+		t.Errorf("response.status = %v, want %q", resp["status"], "queued")
+	}
+	if resp["delivery_mode"] != "poll" {
+		t.Errorf("response.delivery_mode = %v, want %q", resp["delivery_mode"], "poll")
+	}
+	if resp["method"] != "message/send" {
+		t.Errorf("response.method = %v, want %q (the JSON-RPC method that was queued)", resp["method"], "message/send")
+	}
+
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestProxyA2A_PushMode_NoShortCircuit verifies the symmetric contract:
+// a push-mode workspace (default) is NOT affected by the new short-circuit.
+// It still proceeds to resolveAgentURL + dispatch. Without this guard, a
+// regression in lookupDeliveryMode could silently break the entire fleet.
+func TestProxyA2A_PushMode_NoShortCircuit(t *testing.T) {
+	mock := setupTestDB(t)
+	mr := setupTestRedis(t)
+	allowLoopbackForTest(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+
+	const wsID = "ws-push-default"
+
+	dispatched := false
+	agentServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		dispatched = true
+		w.Header().Set("Content-Type", "application/json")
+		fmt.Fprint(w, `{"jsonrpc":"2.0","id":"1","result":{"status":"ok"}}`)
+	}))
+	defer agentServer.Close()
+
+	mr.Set(fmt.Sprintf("ws:%s:url", wsID), agentServer.URL)
+	expectBudgetCheck(mock, wsID)
+
+	// lookupDeliveryMode returns "push" — short-circuit must NOT fire.
+	mock.ExpectQuery("SELECT delivery_mode FROM workspaces WHERE id").
+		WithArgs(wsID).
+		WillReturnRows(sqlmock.NewRows([]string{"delivery_mode"}).AddRow("push"))
+
+	mock.ExpectExec("INSERT INTO activity_logs").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: wsID}}
+
+	body := `{"jsonrpc":"2.0","id":"push-1","method":"message/send","params":{"message":{"role":"user","parts":[{"text":"hi"}]}}}`
+	c.Request = httptest.NewRequest("POST", "/workspaces/"+wsID+"/a2a", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.ProxyA2A(c)
+
+	time.Sleep(50 * time.Millisecond)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200 (dispatched), got %d: %s", w.Code, w.Body.String())
+	}
+	if !dispatched {
+		t.Error("push-mode workspace: expected the agent server to receive the request, but it did not")
+	}
+	var resp map[string]interface{}
+	if err := json.Unmarshal(w.Body.Bytes(), &resp); err == nil {
+		if resp["status"] == "queued" {
+			t.Error("push-mode response leaked queued envelope — short-circuit fired when it shouldn't have")
+		}
+	}
+
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestProxyA2A_PollMode_FailsClosedToPush verifies the safety contract:
+// a DB error reading delivery_mode must default to push (the existing
+// behavior), NOT poll. Failing to push means a poll-mode workspace
+// briefly attempts a real dispatch — visible failure (502 / SSRF
+// rejection / restart cascade), not a silent drop into activity_logs
+// where the agent might never look. Loud > silent, recoverable > lost.
+func TestProxyA2A_PollMode_FailsClosedToPush(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t) // empty Redis — forces resolveAgentURL DB lookup
+	broadcaster := newTestBroadcaster()
+	handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+
+	const wsID = "ws-mode-db-error"
+
+	expectBudgetCheck(mock, wsID)
+
+	// lookupDeliveryMode hits a transient DB error → must default push.
+	mock.ExpectQuery("SELECT delivery_mode FROM workspaces WHERE id").
+		WithArgs(wsID).
+		WillReturnError(sql.ErrConnDone)
+
+	// Push path proceeds to resolveAgentURL — empty result → 502 path.
+	mock.ExpectQuery("SELECT url, status FROM workspaces WHERE id =").
+		WithArgs(wsID).
+		WillReturnRows(sqlmock.NewRows([]string{"url", "status"}))
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: wsID}}
+
+	body := `{"jsonrpc":"2.0","id":"x","method":"message/send","params":{}}`
+	c.Request = httptest.NewRequest("POST", "/workspaces/"+wsID+"/a2a", bytes.NewBufferString(body))
+	c.Request.Header.Set("Content-Type", "application/json")
+
+	handler.ProxyA2A(c)
+
+	if w.Code == http.StatusOK {
+		var resp map[string]interface{}
+		_ = json.Unmarshal(w.Body.Bytes(), &resp)
+		if resp["status"] == "queued" {
+			t.Errorf("DB error on delivery_mode lookup silently queued the request — must fail-closed-to-push, got body: %s", w.Body.String())
+		}
+	}
+
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
@@ -18,6 +18,7 @@ import (
 	"fmt"
 	"log"
 	"net/http"
+	"time"

 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
 )
@@ -39,6 +40,33 @@ func extractIdempotencyKey(body []byte) string {
 	return envelope.Params.Message.MessageID
 }

+// extractExpiresInSeconds pulls params.expires_in_seconds out of an A2A
+// JSON-RPC body and returns it as a positive integer. A zero return means
+// "no caller-specified TTL" — caller should leave expires_at NULL on the
+// queue row, preserving today's infinite-TTL behaviour (the
+// DropStaleQueueItems admin sweeper still drops entries past the
+// platform-default age). Negative values and parse errors collapse to 0.
+//
+// Why params-level (not metadata): expires_in_seconds is a delivery
+// directive, not a peer-to-peer message attribute. Putting it under
+// `params` keeps it adjacent to other delivery hints (priority,
+// idempotency) and out of `params.message.metadata` which the receiving
+// agent can read.
+func extractExpiresInSeconds(body []byte) int {
+	var envelope struct {
+		Params struct {
+			ExpiresInSeconds int `json:"expires_in_seconds"`
+		} `json:"params"`
+	}
+	if err := json.Unmarshal(body, &envelope); err != nil {
+		return 0
+	}
+	if envelope.Params.ExpiresInSeconds < 0 {
+		return 0
+	}
+	return envelope.Params.ExpiresInSeconds
+}
+
 const (
 	PriorityCritical = 100
 	PriorityTask     = 50
@@ -70,6 +98,7 @@ func EnqueueA2A(
 	priority int,
 	body []byte,
 	method, idempotencyKey string,
+	expiresAt *time.Time,
 ) (id string, depth int, err error) {
 	var keyArg interface{}
 	if idempotencyKey != "" {
@@ -83,6 +112,13 @@ func EnqueueA2A(
 	if method != "" {
 		methodArg = method
 	}
+	// expiresAtArg stays NULL when caller didn't specify a TTL. DequeueNext's
+	// `expires_at IS NULL OR expires_at > now()` filter then preserves today's
+	// infinite-TTL semantics for un-flagged messages.
+	var expiresAtArg interface{}
+	if expiresAt != nil {
+		expiresAtArg = *expiresAt
+	}

 	// INSERT ... ON CONFLICT DO NOTHING RETURNING id. The conflict target
 	// must reference the partial unique INDEX columns + WHERE clause directly
@@ -91,13 +127,13 @@ func EnqueueA2A(
 	// then look up the existing row's id so the caller always receives a
 	// valid queue entry reference.
 	err = db.DB.QueryRowContext(ctx, `
-		INSERT INTO a2a_queue (workspace_id, caller_id, priority, body, method, idempotency_key)
-		VALUES ($1, $2, $3, $4::jsonb, $5, $6)
+		INSERT INTO a2a_queue (workspace_id, caller_id, priority, body, method, idempotency_key, expires_at)
+		VALUES ($1, $2, $3, $4::jsonb, $5, $6, $7)
 		ON CONFLICT (workspace_id, idempotency_key)
 			WHERE idempotency_key IS NOT NULL AND status IN ('queued','dispatched')
 			DO NOTHING
 		RETURNING id
-	`, workspaceID, callerArg, priority, string(body), methodArg, keyArg).Scan(&id)
+	`, workspaceID, callerArg, priority, string(body), methodArg, keyArg, expiresAtArg).Scan(&id)

 	if errors.Is(err, sql.ErrNoRows) && idempotencyKey != "" {
 		// Conflict — look up the existing active row and use its id.
@@ -0,0 +1,231 @@
+package handlers
+
+// a2a_queue_status.go — RFC #2331 Tier 1: public per-queue-id status endpoint.
+//
+// Closes the gap surfaced in #2329 item 5: callers receive `queue_id` in
+// the 202 enqueue response but had no public lookup endpoint. The only
+// observability path was through `check_task_status` which joins via
+// `request_body->>'delegation_id'` in `activity_logs` — works only for
+// delegation-flavored A2A. Cross-workspace peer-direct A2A had no
+// observability after enqueue.
+//
+// Auth model:
+//
+//   - The caller's workspace token must match the `caller_id` recorded
+//     on the queue row at enqueue time, OR the caller's token must be
+//     for the target workspace_id (target can see what's queued for it),
+//     OR an org-level token (canvas/admin) can see anything.
+//
+//   - 404 — not 403 — when the caller has no read access. The queue_id
+//     UUID is the access token; revealing "this queue_id exists but
+//     you can't see it" leaks the existence-of-other-callers' state.
+//
+// What the response body excludes:
+//
+//   - `body` (the original JSON-RPC request body) — could contain
+//     prompts/PII the caller's authority shouldn't include in poll-loop
+//     responses. The body is only relevant to the dispatching agent.
+//   - `caller_id` — exposes the existence of other callers.
+//
+// What it includes:
+//
+//   - status, attempts, last_error, enqueued_at, dispatched_at,
+//     completed_at, expires_at, priority — the delivery state machine
+//     observables.
+//   - response_body when status == completed — so the caller can
+//     retrieve the response without polling check_task_status.
+
+import (
+	"context"
+	"database/sql"
+	"errors"
+	"log"
+	"net/http"
+
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth"
+	"github.com/gin-gonic/gin"
+)
+
+// QueueStatus is the public projection of an a2a_queue row.
+type QueueStatus struct {
+	ID           string  `json:"queue_id"`
+	WorkspaceID  string  `json:"workspace_id"`
+	Status       string  `json:"status"`
+	Priority     int     `json:"priority"`
+	Attempts     int     `json:"attempts"`
+	LastError    *string `json:"last_error,omitempty"`
+	EnqueuedAt   string  `json:"enqueued_at"`
+	DispatchedAt *string `json:"dispatched_at,omitempty"`
+	CompletedAt  *string `json:"completed_at,omitempty"`
+	ExpiresAt    *string `json:"expires_at,omitempty"`
+	ResponseBody []byte  `json:"response_body,omitempty"`
+}
+
+// QueueStatusByID looks up the queue row and projects it for the public
+// endpoint. Returns ErrNoQueueRow when the row doesn't exist OR the
+// caller has no read access — collapsing the two surfaces a single 404
+// from the handler so an attacker can't probe queue_id existence.
+//
+// Access rules — caller must satisfy at least one of:
+//
+//	(a) callerID == queue.caller_id        (sender can read own enqueue)
+//	(b) callerID == queue.workspace_id     (target can read queued-for-me)
+//	(c) isAdmin == true                    (canvas/admin token)
+//
+// Internal helper; the HTTP handler enforces the auth checks before
+// calling this — by the time we get here we already know the caller
+// is authorized, so this just runs the SELECT.
+func QueueStatusByID(ctx context.Context, queueID string) (*QueueStatus, error) {
+	var qs QueueStatus
+	var lastError, dispatchedAt, completedAt, expiresAt sql.NullString
+	var responseBody []byte
+
+	// response_body lives on activity_logs (the stitched delegation row), not
+	// on a2a_queue itself. We pull both here in one round-trip via LEFT JOIN
+	// so a completed delegation surfaces its result inline — non-delegation
+	// queue rows simply won't have a matching activity_logs row and the field
+	// stays null.
+	err := db.DB.QueryRowContext(ctx, `
+		SELECT
+			q.id,
+			q.workspace_id,
+			q.status,
+			q.priority,
+			q.attempts,
+			q.last_error,
+			q.enqueued_at::text,
+			q.dispatched_at::text,
+			q.completed_at::text,
+			q.expires_at::text,
+			al.response_body::text
+		FROM a2a_queue q
+		LEFT JOIN activity_logs al
+			ON al.method = 'delegate_result'
+			AND al.target_id = q.workspace_id
+			AND al.workspace_id = q.caller_id
+			AND al.response_body->>'delegation_id' = (q.body->'params'->'message'->'metadata'->>'delegation_id')
+		WHERE q.id = $1
+	`, queueID).Scan(
+		&qs.ID, &qs.WorkspaceID, &qs.Status, &qs.Priority, &qs.Attempts,
+		&lastError, &qs.EnqueuedAt, &dispatchedAt, &completedAt, &expiresAt,
+		&responseBody,
+	)
+	if errors.Is(err, sql.ErrNoRows) {
+		return nil, sql.ErrNoRows
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	if lastError.Valid && lastError.String != "" {
+		s := lastError.String
+		qs.LastError = &s
+	}
+	if dispatchedAt.Valid && dispatchedAt.String != "" {
+		s := dispatchedAt.String
+		qs.DispatchedAt = &s
+	}
+	if completedAt.Valid && completedAt.String != "" {
+		s := completedAt.String
+		qs.CompletedAt = &s
+	}
+	if expiresAt.Valid && expiresAt.String != "" {
+		s := expiresAt.String
+		qs.ExpiresAt = &s
+	}
+	if len(responseBody) > 0 && qs.Status == "completed" {
+		qs.ResponseBody = responseBody
+	}
+
+	return &qs, nil
+}
+
+// queueRowAuthFields returns the (caller_id, workspace_id) of the queue row
+// for access control. Separate from QueueStatusByID so the handler can do
+// the auth check without first projecting the public response.
+func queueRowAuthFields(ctx context.Context, queueID string) (callerID, workspaceID string, err error) {
+	var callerNS, workspaceNS sql.NullString
+	err = db.DB.QueryRowContext(ctx,
+		`SELECT caller_id, workspace_id FROM a2a_queue WHERE id = $1`,
+		queueID,
+	).Scan(&callerNS, &workspaceNS)
+	if err != nil {
+		return "", "", err
+	}
+	return callerNS.String, workspaceNS.String, nil
+}
+
+// GetA2AQueueStatus handles GET /workspaces/:id/a2a/queue/:queue_id.
+//
+// The :id path param is the workspace context (matches the proxy pattern
+// /workspaces/:id/a2a). :queue_id is the row's UUID returned from the
+// 202 enqueue response.
+//
+// Auth flow:
+//
+//  1. Extract caller's workspace from bearer (org tokens grant org-wide
+//     access and short-circuit the per-row check).
+//  2. Look up queue row's (caller_id, workspace_id).
+//  3. Allow when caller's workspace == queue.caller_id OR
+//     == queue.workspace_id, OR caller has org-level access.
+//  4. Otherwise 404 (not 403) — see file-header rationale.
+func (h *WorkspaceHandler) GetA2AQueueStatus(c *gin.Context) {
+	ctx := c.Request.Context()
+	queueID := c.Param("queue_id")
+	if queueID == "" {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "queue_id required"})
+		return
+	}
+
+	// Org-level token (canvas/admin)? Bypass per-row caller match.
+	_, isOrg := c.Get("org_token_id")
+
+	// Derive caller workspace from bearer when not org-token.
+	callerWorkspace := c.GetHeader("X-Workspace-ID")
+	if !isOrg && callerWorkspace == "" {
+		if tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization")); tok != "" {
+			if wsID, err := wsauth.WorkspaceFromToken(ctx, db.DB, tok); err == nil {
+				callerWorkspace = wsID
+			}
+		}
+	}
+	if !isOrg && callerWorkspace == "" {
+		// No identity — treat as not-found rather than 401, matching the
+		// file-header existence-non-inference policy. A 401 would tell
+		// an attacker that the queue_id at least might exist.
+		c.JSON(http.StatusNotFound, gin.H{"error": "queue item not found"})
+		return
+	}
+
+	rowCallerID, rowWorkspaceID, err := queueRowAuthFields(ctx, queueID)
+	if errors.Is(err, sql.ErrNoRows) {
+		c.JSON(http.StatusNotFound, gin.H{"error": "queue item not found"})
+		return
+	}
+	if err != nil {
+		log.Printf("GetA2AQueueStatus: row lookup failed for %s: %v", queueID, err)
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "lookup failed"})
+		return
+	}
+
+	// Access check.
+	if !isOrg && callerWorkspace != rowCallerID && callerWorkspace != rowWorkspaceID {
+		// Collapse to 404 — see header.
+		c.JSON(http.StatusNotFound, gin.H{"error": "queue item not found"})
+		return
+	}
+
+	status, err := QueueStatusByID(ctx, queueID)
+	if errors.Is(err, sql.ErrNoRows) {
+		c.JSON(http.StatusNotFound, gin.H{"error": "queue item not found"})
+		return
+	}
+	if err != nil {
+		log.Printf("GetA2AQueueStatus: status fetch failed for %s: %v", queueID, err)
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "status fetch failed"})
+		return
+	}
+
+	c.JSON(http.StatusOK, status)
+}
@@ -0,0 +1,60 @@
+package handlers
+
+import (
+	"testing"
+)
+
+// TestExtractExpiresInSeconds covers the JSON parser used at enqueue time
+// to honor a caller-specified TTL. Zero return = "no TTL" — caller leaves
+// expires_at NULL on the queue row.
+func TestExtractExpiresInSeconds(t *testing.T) {
+	tests := []struct {
+		name string
+		body string
+		want int
+	}{
+		{
+			name: "absent",
+			body: `{"params":{"message":{"messageId":"x"}}}`,
+			want: 0,
+		},
+		{
+			name: "positive",
+			body: `{"params":{"expires_in_seconds":300,"message":{"messageId":"x"}}}`,
+			want: 300,
+		},
+		{
+			name: "zero",
+			body: `{"params":{"expires_in_seconds":0,"message":{"messageId":"x"}}}`,
+			want: 0,
+		},
+		{
+			name: "negative coerced to zero",
+			body: `{"params":{"expires_in_seconds":-30,"message":{"messageId":"x"}}}`,
+			want: 0,
+		},
+		{
+			name: "invalid JSON returns zero",
+			body: `not json`,
+			want: 0,
+		},
+		{
+			name: "wrong type silently zero (json.Unmarshal returns err on type mismatch)",
+			body: `{"params":{"expires_in_seconds":"not-a-number"}}`,
+			want: 0,
+		},
+		{
+			name: "params absent entirely",
+			body: `{}`,
+			want: 0,
+		},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got := extractExpiresInSeconds([]byte(tc.body))
+			if got != tc.want {
+				t.Errorf("extractExpiresInSeconds(%q) = %d, want %d", tc.body, got, tc.want)
+			}
+		})
+	}
+}
@@ -2,7 +2,9 @@ package handlers

 import (
 	"context"
+	"database/sql"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"log"
 	"net/http"
@@ -13,6 +15,7 @@ import (
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
 	"github.com/Molecule-AI/molecule-monorepo/platform/internal/events"
 	"github.com/gin-gonic/gin"
+	"github.com/google/uuid"
 )

 type ActivityHandler struct {
@@ -23,12 +26,74 @@ func NewActivityHandler(b *events.Broadcaster) *ActivityHandler {
 	return &ActivityHandler{broadcaster: b}
 }

-// List handles GET /workspaces/:id/activity?type=&limit=
+// List handles GET /workspaces/:id/activity?type=&source=&limit=&since_secs=&since_id=
+//
+// since_secs filters to activity_logs.created_at >= NOW() - INTERVAL '$N seconds'.
+// Optional, additive — callers that don't pass it get today's behavior (the
+// most-recent N events regardless of time). The harness runner
+// (scripts/measure-coordinator-task-bounds-runner.sh) uses this to scope a
+// trace to a specific test window; RFC #2251 §V1.0 step 6 also depends on it.
+// Capped at 30 days (2_592_000s) — anything older has typically been paged
+// out anyway, and a defensive ceiling keeps a paranoid client from triggering
+// a full-table scan via since_secs=99999999999. Closes #2268.
+//
+// since_id is a CURSOR for poll-mode workspaces (#2339 PR 3). The agent
+// passes the id of the last activity_logs row it has consumed; the server
+// returns rows STRICTLY AFTER that cursor in chronological (ASC) order so
+// the agent processes events in the order they were recorded. Telegram
+// getUpdates / Slack RTM shape — same proven pattern.
+//
+// Cross-workspace safety: the cursor lookup is scoped by workspace_id, so a
+// caller cannot peek at another workspace's activity by guessing its UUIDs.
+//
+// Cursor-not-found: returns 410 Gone. The client should reset its cursor
+// (omit since_id) and re-fetch the recent backlog. This avoids the silent
+// loss-window where a pruned cursor silently filters everything out.
+//
+// since_id + since_secs together: both filters apply (AND). Output is ASC
+// when since_id is set (polling order), DESC otherwise (recent feed order).
 func (h *ActivityHandler) List(c *gin.Context) {
 	workspaceID := c.Param("id")
 	activityType := c.Query("type")
 	source := c.Query("source") // "canvas" = source_id IS NULL, "agent" = source_id IS NOT NULL
+	peerID := c.Query("peer_id") // optional UUID — restrict to rows where this peer is sender OR target
 	limitStr := c.DefaultQuery("limit", "100")
+	sinceSecsStr := c.Query("since_secs")
+	sinceID := c.Query("since_id")
+	beforeTSStr := c.Query("before_ts") // optional RFC3339 — return rows strictly older than this timestamp
+
+	// Validate peer_id as a UUID at the trust boundary so a malformed
+	// caller (the agent or a downstream MCP tool) can't smuggle SQL
+	// fragments into the WHERE clause via the parameter, even though
+	// args are bound. UUID-shape rejection is also the cleanest 400
+	// signal for the wheel-side chat_history MCP tool — clearer than a
+	// generic "no rows" empty list when the agent passed an obviously
+	// wrong id.
+	if peerID != "" {
+		if _, err := uuid.Parse(peerID); err != nil {
+			c.JSON(http.StatusBadRequest, gin.H{"error": "peer_id must be a UUID"})
+			return
+		}
+	}
+
+	// Parse before_ts as the wall-clock paging knob for the wheel-side
+	// `chat_history` MCP tool. The agent passes the oldest `created_at`
+	// from a previous response to walk backward through long histories.
+	// Validated as RFC3339 at the trust boundary so a typoed value
+	// surfaces as a clean 400 instead of being silently ignored.
+	var beforeTS time.Time
+	usingBeforeTS := false
+	if beforeTSStr != "" {
+		t, err := time.Parse(time.RFC3339, beforeTSStr)
+		if err != nil {
+			c.JSON(http.StatusBadRequest, gin.H{
+				"error": "before_ts must be an RFC3339 timestamp (e.g. 2026-05-01T00:00:00Z)",
+			})
+			return
+		}
+		beforeTS = t
+		usingBeforeTS = true
+	}

 	limit := 100
 	if n, err := strconv.Atoi(limitStr); err == nil && n > 0 {
@@ -38,6 +103,54 @@ func (h *ActivityHandler) List(c *gin.Context) {
 		}
 	}

+	// Parse since_secs. Reject negative or non-integer values rather than
+	// silently ignoring them — a typoed param shouldn't be lost as
+	// most-recent-100, that's exactly the bug this fixes.
+	var sinceSecs int
+	if sinceSecsStr != "" {
+		n, err := strconv.Atoi(sinceSecsStr)
+		if err != nil || n <= 0 {
+			c.JSON(http.StatusBadRequest, gin.H{"error": "since_secs must be a positive integer"})
+			return
+		}
+		const maxSinceSecs = 30 * 24 * 60 * 60 // 30 days
+		if n > maxSinceSecs {
+			n = maxSinceSecs
+		}
+		sinceSecs = n
+	}
+
+	// Resolve since_id cursor (if set) BEFORE building the main query so we
+	// can 410 cleanly when the cursor row is gone — and so the cursor's
+	// created_at is bound as a regular timestamp parameter (not a subquery)
+	// for clean sqlmock matching and to keep the planner predictable.
+	//
+	// The lookup is scoped by workspace_id: a caller cannot enumerate or
+	// peek at another workspace's events by passing a UUID belonging to a
+	// different workspace. Mismatched-workspace cursor → 410, same as
+	// "row not found" — both indicate the cursor is no longer usable for
+	// this caller, no information leak.
+	var cursorTime time.Time
+	usingCursor := false
+	if sinceID != "" {
+		err := db.DB.QueryRowContext(c.Request.Context(),
+			`SELECT created_at FROM activity_logs WHERE id = $1 AND workspace_id = $2`,
+			sinceID, workspaceID,
+		).Scan(&cursorTime)
+		if errors.Is(err, sql.ErrNoRows) {
+			c.JSON(http.StatusGone, gin.H{
+				"error": "since_id cursor not found (row may have been pruned or belongs to a different workspace); omit since_id to reset",
+			})
+			return
+		}
+		if err != nil {
+			log.Printf("Activity since_id cursor lookup error for ws=%s id=%s: %v", workspaceID, sinceID, err)
+			c.JSON(http.StatusInternalServerError, gin.H{"error": "cursor lookup failed"})
+			return
+		}
+		usingCursor = true
+	}
+
 	// Build query with optional filters
 	query := `SELECT id, workspace_id, activity_type, source_id, target_id, method,
 			   summary, request_body, response_body, tool_trace, duration_ms, status, error_detail, created_at
@@ -58,8 +171,55 @@ func (h *ActivityHandler) List(c *gin.Context) {
 		c.JSON(http.StatusBadRequest, gin.H{"error": "source must be 'canvas' or 'agent'"})
 		return
 	}
+	if peerID != "" {
+		// Restrict to rows where this peer is either the sender (source_id)
+		// or the recipient (target_id) of an A2A turn. This is the
+		// "conversation history with peer X" view the wheel-side
+		// chat_history MCP tool surfaces — agent receives a peer_agent
+		// push, wants to see the prior 20 turns with that workspace
+		// without paging through every other peer's traffic.
+		//
+		// Bound as a single arg, matched twice — keeps argIdx accurate
+		// and avoids duplicate parameter binding (some drivers reject the
+		// same arg slot reused, ours is fine but the explicit form is
+		// clearer to read and matches the rest of the builder.)
+		query += fmt.Sprintf(" AND (source_id = $%d OR target_id = $%d)", argIdx, argIdx)
+		args = append(args, peerID)
+		argIdx++
+	}
+	if usingBeforeTS {
+		// Strictly older — never replay a row with the exact same
+		// timestamp, mirrors the `created_at > cursorTime` shape
+		// `since_id` uses for forward paging.
+		query += fmt.Sprintf(" AND created_at < $%d", argIdx)
+		args = append(args, beforeTS)
+		argIdx++
+	}
+	if sinceSecs > 0 {
+		// Use a parameterized interval so the value is bound, not
+		// interpolated into the SQL string. `make_interval(secs => $N)`
+		// avoids the lib/pq quirk where INTERVAL '$N seconds' won't
+		// substitute a placeholder inside the literal.
+		query += fmt.Sprintf(" AND created_at >= NOW() - make_interval(secs => $%d)", argIdx)
+		args = append(args, sinceSecs)
+		argIdx++
+	}
+	if usingCursor {
+		// Strictly after — never replay the cursor row itself.
+		query += fmt.Sprintf(" AND created_at > $%d", argIdx)
+		args = append(args, cursorTime)
+		argIdx++
+	}

-	query += fmt.Sprintf(" ORDER BY created_at DESC LIMIT $%d", argIdx)
+	// Polling clients (since_id) need oldest-first within the new window so
+	// they process events in recorded order. The recent-feed view (no
+	// since_id) keeps DESC — that's the canvas/UI shape and changing it
+	// would surprise existing callers.
+	if usingCursor {
+		query += fmt.Sprintf(" ORDER BY created_at ASC LIMIT $%d", argIdx)
+	} else {
+		query += fmt.Sprintf(" ORDER BY created_at DESC LIMIT $%d", argIdx)
+	}
 	args = append(args, limit)

 	rows, err := db.DB.QueryContext(c.Request.Context(), query, args...)
@@ -0,0 +1,156 @@
+package handlers
+
+import (
+	"database/sql"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/DATA-DOG/go-sqlmock"
+	"github.com/gin-gonic/gin"
+)
+
+// Tests for the since_id cursor on GET /workspaces/:id/activity (#2339 PR 3).
+//
+// Cursor shape: Telegram getUpdates / Slack RTM. The polling agent passes
+// the id of the last activity_logs row it processed; the server returns
+// rows STRICTLY AFTER that cursor in ASC order. Cross-workspace lookups
+// return 410 to prevent UUID-guessing peeks at other workspaces' events.
+
+// TestActivityHandler_SinceID_ReturnsNewerASC: with a valid cursor the
+// handler does the cursor lookup, then queries with the cursor's
+// created_at as a > filter and ASC ordering — the polling shape.
+func TestActivityHandler_SinceID_ReturnsNewerASC(t *testing.T) {
+	mock := setupTestDB(t)
+
+	cursorID := "act-cursor-42"
+	cursorTime := time.Date(2026, 4, 30, 5, 0, 0, 0, time.UTC)
+
+	// Step 1: cursor lookup — must include workspace_id scope so a UUID
+	// from another workspace can't be used.
+	mock.ExpectQuery(`SELECT created_at FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
+		WithArgs(cursorID, "ws-1").
+		WillReturnRows(sqlmock.NewRows([]string{"created_at"}).AddRow(cursorTime))
+
+	// Step 2: main query with the cursor's created_at as a > filter,
+	// ASC ordering. Args: workspace_id, cursorTime, limit.
+	mock.ExpectQuery("SELECT id, workspace_id, activity_type").
+		WithArgs("ws-1", cursorTime, 100).
+		WillReturnRows(newActivityRows())
+
+	broadcaster := newTestBroadcaster()
+	handler := NewActivityHandler(broadcaster)
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
+	c.Request = httptest.NewRequest("GET", "/workspaces/ws-1/activity?since_id="+cursorID, nil)
+
+	handler.List(c)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestActivityHandler_SinceID_CursorNotFound_410: cursor row doesn't exist
+// (pruned, never existed, or wrong UUID). Server returns 410 Gone so the
+// client knows to reset its cursor — silent empty results would cause a
+// stuck-poll bug where the agent never sees new events.
+func TestActivityHandler_SinceID_CursorNotFound_410(t *testing.T) {
+	mock := setupTestDB(t)
+
+	mock.ExpectQuery(`SELECT created_at FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
+		WithArgs("act-gone", "ws-1").
+		WillReturnError(sql.ErrNoRows)
+
+	broadcaster := newTestBroadcaster()
+	handler := NewActivityHandler(broadcaster)
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
+	c.Request = httptest.NewRequest("GET", "/workspaces/ws-1/activity?since_id=act-gone", nil)
+
+	handler.List(c)
+
+	if w.Code != http.StatusGone {
+		t.Fatalf("expected 410, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestActivityHandler_SinceID_CrossWorkspaceCursor_410: a caller passes a
+// UUID that belongs to a different workspace. The cursor lookup is scoped
+// by workspace_id so the row is "not found" from this caller's perspective —
+// same 410 path as the pruned case. No information leak (caller cannot tell
+// whether the UUID belongs to nobody or to another workspace).
+func TestActivityHandler_SinceID_CrossWorkspaceCursor_410(t *testing.T) {
+	mock := setupTestDB(t)
+
+	// Cursor exists in DB but the WHERE workspace_id = $2 filter excludes
+	// it — sqlmock returns no rows, which is what Postgres would do.
+	mock.ExpectQuery(`SELECT created_at FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
+		WithArgs("act-other-ws", "ws-1").
+		WillReturnError(sql.ErrNoRows)
+
+	broadcaster := newTestBroadcaster()
+	handler := NewActivityHandler(broadcaster)
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
+	c.Request = httptest.NewRequest("GET", "/workspaces/ws-1/activity?since_id=act-other-ws", nil)
+
+	handler.List(c)
+
+	if w.Code != http.StatusGone {
+		t.Fatalf("cross-workspace cursor: expected 410, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestActivityHandler_SinceID_CombinedWithSinceSecs: both filters apply
+// together (AND). Argument order in the main query: workspace_id,
+// since_secs, cursorTime, limit. Sanity-checks the placeholder index
+// arithmetic in the query builder.
+func TestActivityHandler_SinceID_CombinedWithSinceSecs(t *testing.T) {
+	mock := setupTestDB(t)
+
+	cursorID := "act-c"
+	cursorTime := time.Date(2026, 4, 30, 4, 0, 0, 0, time.UTC)
+
+	mock.ExpectQuery(`SELECT created_at FROM activity_logs WHERE id = \$1 AND workspace_id = \$2`).
+		WithArgs(cursorID, "ws-1").
+		WillReturnRows(sqlmock.NewRows([]string{"created_at"}).AddRow(cursorTime))
+
+	mock.ExpectQuery("SELECT id, workspace_id, activity_type").
+		WithArgs("ws-1", 600, cursorTime, 100).
+		WillReturnRows(newActivityRows())
+
+	broadcaster := newTestBroadcaster()
+	handler := NewActivityHandler(broadcaster)
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
+	c.Request = httptest.NewRequest("GET",
+		"/workspaces/ws-1/activity?since_secs=600&since_id="+cursorID, nil)
+
+	handler.List(c)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
@@ -0,0 +1,163 @@
+package handlers
+
+import (
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/DATA-DOG/go-sqlmock"
+	"github.com/gin-gonic/gin"
+)
+
+// Tests for the since_secs query parameter on GET /workspaces/:id/activity.
+// Closes #2268 — the harness runner was passing this param and it was
+// silently ignored, capping the trace at most-recent-100 events. The new
+// shape: parse since_secs, add a parameterised `created_at >= NOW() -
+// make_interval(secs => $N)` clause, cap at 30 days, reject invalid input
+// with 400.
+
+const activityCols = `id, workspace_id, activity_type, source_id, target_id, method, ` +
+	`summary, request_body, response_body, tool_trace, duration_ms, status, error_detail, created_at`
+
+func newActivityRows() *sqlmock.Rows {
+	cols := []string{
+		"id", "workspace_id", "activity_type", "source_id", "target_id", "method",
+		"summary", "request_body", "response_body", "tool_trace", "duration_ms", "status", "error_detail", "created_at",
+	}
+	return sqlmock.NewRows(cols).
+		AddRow("act-1", "ws-1", "a2a_send", nil, nil, nil,
+			"sent", nil, nil, nil, nil, "ok", nil,
+			time.Date(2026, 4, 29, 10, 0, 0, 0, time.UTC))
+}
+
+// TestActivityHandler_SinceSecs_Accepted verifies that a valid since_secs
+// query param adds the make_interval clause to the SQL with the parsed
+// value as a bound parameter — exactly what the runner needs to scope a
+// trace to a test window.
+func TestActivityHandler_SinceSecs_Accepted(t *testing.T) {
+	mock := setupTestDB(t)
+
+	mock.ExpectQuery("SELECT id, workspace_id, activity_type").
+		WithArgs("ws-1", 600, 100). // workspaceID, since_secs, limit
+		WillReturnRows(newActivityRows())
+
+	broadcaster := newTestBroadcaster()
+	handler := NewActivityHandler(broadcaster)
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
+	c.Request = httptest.NewRequest("GET", "/workspaces/ws-1/activity?since_secs=600", nil)
+
+	handler.List(c)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestActivityHandler_SinceSecs_ClampedAt30Days verifies the defensive
+// ceiling so a paranoid client can't trigger a multi-month full-table
+// scan via since_secs=999999999.
+func TestActivityHandler_SinceSecs_ClampedAt30Days(t *testing.T) {
+	mock := setupTestDB(t)
+
+	const cap30Days = 30 * 24 * 60 * 60
+	mock.ExpectQuery("SELECT id, workspace_id, activity_type").
+		WithArgs("ws-1", cap30Days, 100).
+		WillReturnRows(newActivityRows())
+
+	broadcaster := newTestBroadcaster()
+	handler := NewActivityHandler(broadcaster)
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
+	c.Request = httptest.NewRequest("GET", "/workspaces/ws-1/activity?since_secs=999999999", nil)
+
+	handler.List(c)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
+
+// TestActivityHandler_SinceSecs_InvalidRejected covers the loud-fail path:
+// a typoed param (non-int, zero, negative) returns 400 instead of being
+// silently dropped — that's the bug this whole feature is fixing.
+func TestActivityHandler_SinceSecs_InvalidRejected(t *testing.T) {
+	cases := []struct {
+		name string
+		val  string
+	}{
+		{"non-integer", "abc"},
+		{"zero", "0"},
+		{"negative", "-1"},
+		{"hex-prefix", "0x10"},
+		{"float", "60.5"},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			// No DB call expected; bad input must be caught before the query.
+			setupTestDB(t)
+			broadcaster := newTestBroadcaster()
+			handler := NewActivityHandler(broadcaster)
+
+			w := httptest.NewRecorder()
+			c, _ := gin.CreateTestContext(w)
+			c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
+			c.Request = httptest.NewRequest("GET",
+				"/workspaces/ws-1/activity?since_secs="+tc.val, nil)
+
+			handler.List(c)
+
+			if w.Code != http.StatusBadRequest {
+				t.Errorf("expected 400 for %q, got %d: %s", tc.val, w.Code, w.Body.String())
+			}
+			var resp map[string]string
+			_ = json.Unmarshal(w.Body.Bytes(), &resp)
+			if resp["error"] == "" {
+				t.Errorf("expected error message in response body for %q", tc.val)
+			}
+		})
+	}
+}
+
+// TestActivityHandler_SinceSecs_Omitted verifies backward compat — callers
+// that don't pass since_secs see the original behavior (no extra WHERE
+// clause, just workspace_id + limit).
+func TestActivityHandler_SinceSecs_Omitted(t *testing.T) {
+	mock := setupTestDB(t)
+
+	// Only workspace_id + limit; the query must NOT include the
+	// make_interval clause. sqlmock's WithArgs is strict on count, so a
+	// since_secs leak would surface as "expected 2 args, got 3".
+	mock.ExpectQuery("SELECT id, workspace_id, activity_type").
+		WithArgs("ws-1", 100).
+		WillReturnRows(newActivityRows())
+
+	broadcaster := newTestBroadcaster()
+	handler := NewActivityHandler(broadcaster)
+
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
+	c.Request = httptest.NewRequest("GET", "/workspaces/ws-1/activity", nil)
+
+	handler.List(c)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("unmet sqlmock expectations: %v", err)
+	}
+}
@@ -167,6 +167,223 @@ func TestActivityList_SourceWithType(t *testing.T) {
 	}
 }

+// ---------- Activity List peer_id filter ----------
+//
+// peer_id surfaces the conversation history with one specific peer
+// for the wheel-side chat_history MCP tool. The filter joins
+// (source_id = $X OR target_id = $X) so both inbound (where this
+// peer was the sender) and outbound (where this peer was the
+// recipient) turns appear in the same view, ordered by created_at.
+
+const testPeerUUID = "11111111-2222-3333-4444-555555555555"
+
+func TestActivityList_PeerIDFilter(t *testing.T) {
+	mock := setupTestDB(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewActivityHandler(broadcaster)
+
+	// peer_id binds twice in the query (source_id OR target_id) but is
+	// added to args once — sqlmock matches positional args, so the
+	// binding shape is what matters.
+	mock.ExpectQuery(
+		`SELECT .+ FROM activity_logs WHERE workspace_id = .+ AND \(source_id = .+ OR target_id = .+\)`,
+	).
+		WithArgs("ws-1", testPeerUUID, 100).
+		WillReturnRows(sqlmock.NewRows([]string{
+			"id", "workspace_id", "activity_type", "source_id", "target_id",
+			"method", "summary", "request_body", "response_body",
+			"tool_trace", "duration_ms", "status", "error_detail", "created_at",
+		}))
+
+	gin.SetMode(gin.TestMode)
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
+	c.Request = httptest.NewRequest(
+		"GET", "/workspaces/ws-1/activity?peer_id="+testPeerUUID, nil,
+	)
+	handler.List(c)
+
+	if w.Code != http.StatusOK {
+		t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("unmet expectations: %v", err)
+	}
+}
+
+func TestActivityList_PeerIDComposesWithType(t *testing.T) {
+	// peer_id + type + source must compose into a single AND-chain so
+	// the wheel can fetch e.g. "all peer_agent inbound from peer X" in
+	// one round-trip. Pin both args + arg order so a future refactor
+	// of the builder can't silently rearrange placeholders.
+	mock := setupTestDB(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewActivityHandler(broadcaster)
+
+	mock.ExpectQuery(
+		`SELECT .+ FROM activity_logs WHERE workspace_id = .+ AND activity_type = .+ AND source_id IS NOT NULL AND \(source_id = .+ OR target_id = .+\)`,
+	).
+		WithArgs("ws-1", "a2a_receive", testPeerUUID, 100).
+		WillReturnRows(sqlmock.NewRows([]string{
+			"id", "workspace_id", "activity_type", "source_id", "target_id",
+			"method", "summary", "request_body", "response_body",
+			"tool_trace", "duration_ms", "status", "error_detail", "created_at",
+		}))
+
+	gin.SetMode(gin.TestMode)
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
+	c.Request = httptest.NewRequest(
+		"GET",
+		"/workspaces/ws-1/activity?type=a2a_receive&source=agent&peer_id="+testPeerUUID,
+		nil,
+	)
+	handler.List(c)
+
+	if w.Code != http.StatusOK {
+		t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("unmet expectations: %v", err)
+	}
+}
+
+func TestActivityList_PeerIDRejectsNonUUID(t *testing.T) {
+	// Trust-boundary check: a malformed peer_id must 400 before any
+	// query is built. Defends against caller bugs (typoed UUID,
+	// leading whitespace) and against any future code path that might
+	// otherwise interpolate the value into the URL or another query.
+	gin.SetMode(gin.TestMode)
+	broadcaster := newTestBroadcaster()
+	handler := NewActivityHandler(broadcaster)
+
+	for _, bad := range []string{
+		"not-a-uuid",
+		"%27%20OR%201%3D1%20--",                          // URL-encoded ' OR 1=1 --
+		"11111111-2222-3333-4444",                        // truncated
+		"11111111-2222-3333-4444-555555555555-extra",     // overlong
+		"11111111-2222-3333-4444-55555555555G",           // non-hex
+	} {
+		w := httptest.NewRecorder()
+		c, _ := gin.CreateTestContext(w)
+		c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
+		c.Request = httptest.NewRequest(
+			"GET", "/workspaces/ws-1/activity?peer_id="+bad, nil,
+		)
+		handler.List(c)
+
+		if w.Code != http.StatusBadRequest {
+			t.Errorf("peer_id=%q: expected 400, got %d (%s)", bad, w.Code, w.Body.String())
+		}
+	}
+}
+
+// ---------- before_ts paging knob ----------
+//
+// before_ts is the wall-clock paging companion to peer_id — the agent
+// walks backward through long histories by passing the oldest
+// `created_at` from the previous response. Validated as RFC3339 at the
+// trust boundary; mirrors the strict-inequality shape since_id uses
+// for forward paging.
+
+func TestActivityList_BeforeTSFilter(t *testing.T) {
+	mock := setupTestDB(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewActivityHandler(broadcaster)
+
+	cutoff, _ := time.Parse(time.RFC3339, "2026-05-01T00:00:00Z")
+	mock.ExpectQuery(
+		`SELECT .+ FROM activity_logs WHERE workspace_id = .+ AND created_at < .+`,
+	).
+		WithArgs("ws-1", cutoff, 100).
+		WillReturnRows(sqlmock.NewRows([]string{
+			"id", "workspace_id", "activity_type", "source_id", "target_id",
+			"method", "summary", "request_body", "response_body",
+			"tool_trace", "duration_ms", "status", "error_detail", "created_at",
+		}))
+
+	gin.SetMode(gin.TestMode)
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
+	c.Request = httptest.NewRequest(
+		"GET", "/workspaces/ws-1/activity?before_ts=2026-05-01T00%3A00%3A00Z", nil,
+	)
+	handler.List(c)
+
+	if w.Code != http.StatusOK {
+		t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("unmet expectations: %v", err)
+	}
+}
+
+func TestActivityList_BeforeTSComposesWithPeerID(t *testing.T) {
+	// peer_id + before_ts: the canonical wheel-side chat_history paging
+	// shape. Pin both args + arg order so a future builder refactor
+	// can't silently drop one filter or reorder placeholders.
+	mock := setupTestDB(t)
+	broadcaster := newTestBroadcaster()
+	handler := NewActivityHandler(broadcaster)
+
+	cutoff, _ := time.Parse(time.RFC3339, "2026-05-01T00:00:00Z")
+	mock.ExpectQuery(
+		`SELECT .+ FROM activity_logs WHERE workspace_id = .+ AND \(source_id = .+ OR target_id = .+\) AND created_at < .+`,
+	).
+		WithArgs("ws-1", testPeerUUID, cutoff, 100).
+		WillReturnRows(sqlmock.NewRows([]string{
+			"id", "workspace_id", "activity_type", "source_id", "target_id",
+			"method", "summary", "request_body", "response_body",
+			"tool_trace", "duration_ms", "status", "error_detail", "created_at",
+		}))
+
+	gin.SetMode(gin.TestMode)
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
+	c.Request = httptest.NewRequest(
+		"GET",
+		"/workspaces/ws-1/activity?peer_id="+testPeerUUID+"&before_ts=2026-05-01T00%3A00%3A00Z",
+		nil,
+	)
+	handler.List(c)
+
+	if w.Code != http.StatusOK {
+		t.Errorf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Fatalf("unmet expectations: %v", err)
+	}
+}
+
+func TestActivityList_BeforeTSRejectsInvalidFormat(t *testing.T) {
+	gin.SetMode(gin.TestMode)
+	broadcaster := newTestBroadcaster()
+	handler := NewActivityHandler(broadcaster)
+
+	for _, bad := range []string{
+		"yesterday",
+		"2026-05-01",                            // missing time component
+		"2026-05-01%2000%3A00%3A00",             // URL-encoded space instead of T
+		"%27%20OR%201%3D1%20--",                 // URL-encoded SQL injection
+	} {
+		w := httptest.NewRecorder()
+		c, _ := gin.CreateTestContext(w)
+		c.Params = gin.Params{{Key: "id", Value: "ws-1"}}
+		c.Request = httptest.NewRequest(
+			"GET", "/workspaces/ws-1/activity?before_ts="+bad, nil,
+		)
+		handler.List(c)
+
+		if w.Code != http.StatusBadRequest {
+			t.Errorf("before_ts=%q: expected 400, got %d (%s)", bad, w.Code, w.Body.String())
+		}
+	}
+}
+
 // ---------- Activity type allowlist (#125: memory_write added) ----------

 func TestActivityReport_AcceptsMemoryWriteType(t *testing.T) {
@@ -129,3 +129,97 @@ func TestAdminTestToken_HappyPath_TokenValidates(t *testing.T) {
 }

 func sqlErrNoRows() error { return sql.ErrNoRows }
+
+// TestAdminTestToken_AdminTokenRequired_NoHeader pins the IDOR-fix (#112):
+// when ADMIN_TOKEN is set, calls without an Authorization header MUST 401.
+// Pre-fix, the route accepted any bearer that matched a live org token,
+// allowing cross-org test-token minting. The current code uses
+// subtle.ConstantTimeCompare against ADMIN_TOKEN explicitly. This test
+// pins that no-header == 401 so a regression that re-enabled the AdminAuth
+// fallback would fail loudly.
+func TestAdminTestToken_AdminTokenRequired_NoHeader(t *testing.T) {
+	setupTestDB(t)
+	t.Setenv("MOLECULE_ENV", "development")
+	t.Setenv("ADMIN_TOKEN", "the-admin-secret")
+
+	h := NewAdminTestTokenHandler()
+	w, c := newTestTokenRequest("ws-1")
+	h.GetTestToken(c)
+
+	if w.Code != http.StatusUnauthorized {
+		t.Fatalf("expected 401 with ADMIN_TOKEN set + no Authorization, got %d: %s", w.Code, w.Body.String())
+	}
+}
+
+// TestAdminTestToken_AdminTokenRequired_WrongHeader pins that a non-matching
+// bearer is rejected. Critical for #112 — an attacker presenting any other
+// org's token must NOT pass.
+func TestAdminTestToken_AdminTokenRequired_WrongHeader(t *testing.T) {
+	setupTestDB(t)
+	t.Setenv("MOLECULE_ENV", "development")
+	t.Setenv("ADMIN_TOKEN", "the-admin-secret")
+
+	h := NewAdminTestTokenHandler()
+	w, c := newTestTokenRequest("ws-1")
+	c.Request.Header.Set("Authorization", "Bearer wrong-token")
+	h.GetTestToken(c)
+
+	if w.Code != http.StatusUnauthorized {
+		t.Fatalf("expected 401 with wrong Authorization, got %d: %s", w.Code, w.Body.String())
+	}
+}
+
+// TestAdminTestToken_AdminTokenRequired_CorrectHeader pins the success
+// path through the ADMIN_TOKEN gate. Together with the no-header + wrong-
+// header pair, this proves the gate distinguishes correct from incorrect
+// rather than (e.g.) erroring on every request.
+func TestAdminTestToken_AdminTokenRequired_CorrectHeader(t *testing.T) {
+	mock := setupTestDB(t)
+	t.Setenv("MOLECULE_ENV", "development")
+	t.Setenv("ADMIN_TOKEN", "the-admin-secret")
+
+	mock.ExpectQuery("SELECT id FROM workspaces WHERE id =").
+		WithArgs("ws-1").
+		WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-1"))
+	mock.ExpectExec("INSERT INTO workspace_auth_tokens").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	h := NewAdminTestTokenHandler()
+	w, c := newTestTokenRequest("ws-1")
+	c.Request.Header.Set("Authorization", "Bearer the-admin-secret")
+	h.GetTestToken(c)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200 with correct ADMIN_TOKEN, got %d: %s", w.Code, w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("sqlmock expectations not met — INSERT into workspace_auth_tokens did not run, suggesting the gate short-circuited the success path: %v", err)
+	}
+}
+
+// TestAdminTestToken_AdminTokenEmpty_GateBypassedSafely pins that when
+// ADMIN_TOKEN is unset (typical local-dev setup), the explicit gate is
+// bypassed and the route works without an Authorization header. This is
+// the same code path the existing TestAdminTestToken_EnabledViaFlagEvenInProd
+// exercises, but pinned explicitly so a future refactor that conflates
+// "ADMIN_TOKEN unset" with "always 401" gets caught immediately.
+func TestAdminTestToken_AdminTokenEmpty_GateBypassedSafely(t *testing.T) {
+	mock := setupTestDB(t)
+	t.Setenv("MOLECULE_ENV", "development")
+	t.Setenv("ADMIN_TOKEN", "")
+
+	mock.ExpectQuery("SELECT id FROM workspaces WHERE id =").
+		WithArgs("ws-1").
+		WillReturnRows(sqlmock.NewRows([]string{"id"}).AddRow("ws-1"))
+	mock.ExpectExec("INSERT INTO workspace_auth_tokens").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	h := NewAdminTestTokenHandler()
+	w, c := newTestTokenRequest("ws-1")
+	// Note: NO Authorization header — the gate is unset, so this MUST work.
+	h.GetTestToken(c)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200 with ADMIN_TOKEN empty + no Authorization, got %d: %s", w.Code, w.Body.String())
+	}
+}
@@ -1,6 +1,26 @@
 package handlers

-// chat_files.go — file upload/download for workspace chat.
+// chat_files.go — file upload + download for workspace chat,
+// both HTTP-forward (RFC #2312, fully landed).
+//
+// Architecture (v2, post-RFC-#2312):
+//
+//   - Upload (POST /workspaces/:id/uploads): the platform proxies the
+//     multipart request straight to the workspace's own
+//     /internal/chat/uploads/ingest endpoint. The workspace agent then
+//     writes to local /workspace/.molecule/chat-uploads.
+//
+//   - Download (GET /workspaces/:id/files): the platform makes an HTTP
+//     GET to the workspace's /internal/file/read?path=<abs> endpoint
+//     and streams the response body to the caller.
+//
+// Same code path on local Docker and SaaS — the v1 docker-exec /
+// docker-cp paths were structurally broken in SaaS because
+// workspace-server's local Docker client has no visibility into
+// EC2-hosted workspaces (#2308 root cause). Both surfaces now use the
+// per-workspace platform_inbound_secret minted at provision time
+// (RFC #2312 PR-F) for auth, and the workspace's HTTP server mounts
+// the corresponding receiver at workspace/main.py.
 //
 // Split from templates.go because these endpoints have a different
 // security model (no /configs write, no template fallback) and a
@@ -9,61 +29,141 @@ package handlers
 // conversation payloads.

 import (
-	"archive/tar"
-	"bytes"
 	"context"
-	"crypto/rand"
-	"encoding/hex"
 	"fmt"
 	"io"
 	"log"
-	"mime"
-	"mime/multipart"
 	"net/http"
+	"net/url"
 	"path/filepath"
-	"regexp"
 	"strings"
+	"time"

-	"github.com/docker/docker/api/types/container"
+	"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
 	"github.com/gin-gonic/gin"
 )

-// ChatFilesHandler serves file upload + download for chat. It
-// composes the existing TemplatesHandler's Docker plumbing
-// (findContainer, execInContainer, copyFilesToContainer) rather than
-// duplicating them, so a bug fix in the Docker layer propagates to
-// both endpoints.
+// ChatFilesHandler serves file upload + download for chat. Holds a
+// reference to TemplatesHandler so the (still docker-exec) Download
+// path keeps using the shared findContainer/CopyFromContainer helpers
+// without duplicating them. Upload no longer reaches into Docker.
 type ChatFilesHandler struct {
 	templates *TemplatesHandler
+
+	// httpClient is broken out so tests can swap in an httptest.Server
+	// transport. Prod uses a default with a generous Timeout to cover
+	// the 50 MB worst case on a slow EC2 link without leaving a
+	// connection hanging forever on a sick workspace.
+	httpClient *http.Client
 }

 func NewChatFilesHandler(t *TemplatesHandler) *ChatFilesHandler {
-	return &ChatFilesHandler{templates: t}
+	return &ChatFilesHandler{
+		templates: t,
+		httpClient: &http.Client{
+			// 50 MB total body cap / ~1 MB/s slow-network floor → ~60s.
+			// Doubled for headroom on the legitimate-but-slow case.
+			Timeout: 120 * time.Second,
+		},
+	}
 }

 // chatUploadMaxBytes caps the full multipart request body so a
-// malicious / runaway client can't OOM the server. 50 MB covers most
-// documents + a handful of images per message; larger artefacts
-// should go through git/S3 rather than chat.
+// malicious / runaway client can't OOM the proxy hop. 50 MB matches
+// the workspace-side limit; anything larger is rejected at the
+// network boundary before forwarding.
 const chatUploadMaxBytes = 50 * 1024 * 1024

-// chatUploadMaxFileBytes caps individual files in a multi-file upload.
-// Keeping the per-file cap below the total lets a user send, say, a
-// 5 MB PDF + 10 screenshots without tripping the batch limit on any
-// single attachment.
-const chatUploadMaxFileBytes = 25 * 1024 * 1024
-
 // chatUploadDir is the in-container path where user-uploaded chat
-// attachments land. Under /workspace so the file persists with the
-// workspace volume and is readable by the agent without any extra
-// plumbing — the agent just reads from the URI path we return.
+// attachments land. Kept here for documentation parity with the
+// workspace-side handler — the platform no longer writes files
+// directly, but the URI scheme returned in responses still uses this
+// path, so any consumer parsing those URIs has the constant to
+// reference.
 const chatUploadDir = "/workspace/.molecule/chat-uploads"

-// unsafeFilenameChars matches anything outside the conservative
-// {alnum, dot, underscore, dash} set. Filenames get rewritten
-// character-class at a time, so embedded paths, control chars,
-// newlines, quotes, and shell metachars never reach the filesystem.
-var unsafeFilenameChars = regexp.MustCompile(`[^a-zA-Z0-9._\-]`)
+// resolveWorkspaceForwardCreds resolves the workspace's URL +
+// platform_inbound_secret for an /internal/* forward, applying
+// lazy-heal on a missing inbound secret (RFC #2312 backfill — the
+// 2026-04-30 fix that closes the existing-workspace gap left by the
+// shared-mint refactor).
+//
+// On any failure path the function HAS ALREADY written the appropriate
+// status + JSON body to c (404 / 503 / 500) and returns ok=false.
+// On success returns the URL + secret + ok=true.
+//
+// op is the human-readable feature label ("upload"/"download") used
+// in log messages and the 503 RFC-#2312 detail copy so operators can
+// distinguish which feature ran.
+//
+// Centralized here (rather than inline in Upload + Download) so the
+// next forward-time condition we add — secret rotation, audit, etc. —
+// goes in ONE place. Drift between the two handlers is the same class
+// of bug as the original SaaS provision drift fixed in #2366; this
+// extraction prevents that class on the consumer side.
+func resolveWorkspaceForwardCreds(c *gin.Context, ctx context.Context, workspaceID, op string) (wsURL, secret string, ok bool) {
+	if err := db.DB.QueryRowContext(ctx,
+		`SELECT COALESCE(url, '') FROM workspaces WHERE id = $1`, workspaceID,
+	).Scan(&wsURL); err != nil {
+		log.Printf("chat_files %s: workspace lookup failed for %s: %v", op, workspaceID, err)
+		c.JSON(http.StatusNotFound, gin.H{"error": "workspace not found"})
+		return "", "", false
+	}
+	if wsURL == "" {
+		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace url not registered yet"})
+		return "", "", false
+	}
+	// Trust note: workspaces.url passes validateAgentURL at /registry/
+	// register write time, blocking SSRF-shaped URLs. We rely on that
+	// upstream gate rather than re-validating here. Tracked at #2316
+	// for follow-up: forward-time re-validation as defense-in-depth.
+
+	secret, healed, err := readOrLazyHealInboundSecret(ctx, workspaceID, "chat_files "+op)
+	if err != nil {
+		// Either a non-NoInboundSecret read error (DB hiccup) or a mint
+		// failure during lazy-heal. The chat_files contract is to surface
+		// 503 with the RFC-#2312 reprovision hint in both cases — the user
+		// can't proceed and needs ops attention.
+		c.JSON(http.StatusServiceUnavailable, gin.H{
+			"error":  "workspace not yet enrolled in v2 " + op + " (RFC #2312)",
+			"detail": "Failed to mint inbound secret. Reprovision the workspace if this persists.",
+		})
+		return "", "", false
+	}
+	if healed {
+		// The platform now has the secret but the workspace's
+		// /configs/.platform_inbound_secret is still empty until the next
+		// /registry/register response propagates it. User retries after
+		// the workspace's next heartbeat picks up the new secret (~30s).
+		c.JSON(http.StatusServiceUnavailable, gin.H{
+			"error":               "workspace re-registering — please retry in 30 seconds",
+			"detail":              "Inbound secret was just minted. Workspace will pick it up on its next heartbeat.",
+			"retry_after_seconds": 30,
+		})
+		return "", "", false
+	}
+	return wsURL, secret, true
+}
+
+// urlPathEscape percent-encodes every byte outside the RFC 3986
+// unreserved set — stricter than net/url.PathEscape (which leaves
+// "/" unescaped because it's legal in URL paths). Filenames must
+// never contain "/" anyway, so escaping it is defence-in-depth
+// against an agent that writes a path-like name.
+//
+// Used by Download's Content-Disposition header.
+func urlPathEscape(s string) string {
+	const unreserved = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"
+	var b strings.Builder
+	for _, c := range []byte(s) {
+		if strings.IndexByte(unreserved, c) >= 0 {
+			b.WriteByte(c)
+		} else {
+			fmt.Fprintf(&b, "%%%02X", c)
+		}
+	}
+	return b.String()
+}

 // contentDispositionAttachment produces a safe `attachment; filename=...`
 // header. Quotes, CR, and LF in the filename are escaped per RFC 6266 /
@@ -97,60 +197,23 @@ func contentDispositionAttachment(name string) string {
 		asciiSafe, urlPathEscape(name))
 }

-// urlPathEscape percent-encodes every byte outside the RFC 3986
-// unreserved set — stricter than net/url.PathEscape (which leaves
-// "/" unescaped because it's legal in URL paths). Filenames must
-// never contain "/" anyway, so escaping it is defence-in-depth
-// against an agent that writes a path-like name.
-func urlPathEscape(s string) string {
-	const unreserved = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"
-	var b strings.Builder
-	for _, c := range []byte(s) {
-		if strings.IndexByte(unreserved, c) >= 0 {
-			b.WriteByte(c)
-		} else {
-			fmt.Fprintf(&b, "%%%02X", c)
-		}
-	}
-	return b.String()
-}
-
-func sanitizeFilename(in string) string {
-	base := filepath.Base(in)
-	base = strings.ReplaceAll(base, " ", "_")
-	base = unsafeFilenameChars.ReplaceAllString(base, "_")
-	if len(base) > 100 {
-		ext := filepath.Ext(base)
-		if len(ext) > 16 {
-			ext = ""
-		}
-		base = base[:100-len(ext)] + ext
-	}
-	if base == "" || base == "." || base == ".." {
-		return "file"
-	}
-	return base
-}
-
-// ChatUploadedFile is the per-file response returned from POST
-// /workspaces/:id/chat/uploads. Clients include this payload (or a
-// trimmed subset) in their outgoing A2A `message/send` parts.
-type ChatUploadedFile struct {
-	// URI uses a custom "workspace:" scheme so clients can resolve it
-	// against the streaming Download endpoint regardless of where the
-	// canvas itself is hosted. The path component is always absolute
-	// within the workspace container.
-	URI      string `json:"uri"`
-	Name     string `json:"name"`
-	MimeType string `json:"mimeType,omitempty"`
-	Size     int64  `json:"size"`
-}
-
 // Upload handles POST /workspaces/:id/chat/uploads.
-// Accepts multipart/form-data with one or more `files` fields, stages
-// each under /workspace/.molecule/chat-uploads with a UUID prefix,
-// and returns the list of URIs for the caller to attach to an A2A
-// message.
+//
+// Streams the multipart body straight to the workspace's own
+// /internal/chat/uploads/ingest endpoint with the platform_inbound_secret
+// (RFC #2312, migration 044) in the Authorization header. The workspace
+// validates and writes to its local /workspace/.molecule/chat-uploads;
+// the response (containing one ChatUploadedFile per upload) is streamed
+// back unchanged.
+//
+// Why streaming, not parse-then-re-encode:
+//   - Eliminates the 50 MB intermediate buffer on the platform.
+//   - Per-file size + path-safety enforcement is the workspace's job;
+//     duplicating it here just creates two places to keep in sync.
+//   - The error responses from the workspace (413 with the offending
+//     filename, 400 on missing files field, etc.) propagate through
+//     unchanged, so the user sees the same shapes regardless of where
+//     the failure originated.
 func (h *ChatFilesHandler) Upload(c *gin.Context) {
 	workspaceID := c.Param("id")
 	if err := validateWorkspaceID(workspaceID); err != nil {
@@ -158,172 +221,62 @@ func (h *ChatFilesHandler) Upload(c *gin.Context) {
 		return
 	}

-	// Hard cap the request body BEFORE ParseMultipartForm — otherwise
-	// a client could chunk-upload past the cap before Go notices.
+	// Hard cap the request body BEFORE forwarding. http.MaxBytesReader
+	// enforces lazily as the body is read; a malicious client cannot
+	// chunk-upload past the cap, the wrapped reader returns an error
+	// when the cap is exceeded and the workspace receives a truncated
+	// stream that fails its own multipart parser.
 	c.Request.Body = http.MaxBytesReader(c.Writer, c.Request.Body, chatUploadMaxBytes)
-	if err := c.Request.ParseMultipartForm(chatUploadMaxBytes); err != nil {
-		c.JSON(http.StatusBadRequest, gin.H{"error": "failed to parse multipart form"})
-		return
-	}
-
-	form := c.Request.MultipartForm
-	var headers []*multipart.FileHeader
-	if form != nil && form.File != nil {
-		headers = form.File["files"]
-	}
-	if len(headers) == 0 {
-		c.JSON(http.StatusBadRequest, gin.H{"error": "expected at least one 'files' field"})
-		return
-	}

 	ctx := c.Request.Context()
-	containerName := h.templates.findContainer(ctx, workspaceID)
-	if containerName == "" {
-		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"})
+
+	wsURL, secret, ok := resolveWorkspaceForwardCreds(c, ctx, workspaceID, "upload")
+	if !ok {
 		return
 	}

-	// Build the archive in memory. Files are byte-preserving through
-	// Go's string<->[]byte (the tar helper takes map[string]string but
-	// the conversion is a literal copy, not a UTF-8 reinterpretation).
-	archive := map[string]string{}
-	uploaded := make([]ChatUploadedFile, 0, len(headers))
-	for _, fh := range headers {
-		if fh.Size > chatUploadMaxFileBytes {
-			c.JSON(http.StatusRequestEntityTooLarge, gin.H{
-				"error": fmt.Sprintf("%s exceeds per-file limit (%d MB)", fh.Filename, chatUploadMaxFileBytes/(1024*1024)),
-			})
-			return
-		}
-		f, err := fh.Open()
-		if err != nil {
-			c.JSON(http.StatusBadRequest, gin.H{"error": "failed to read upload"})
-			return
-		}
-		// LimitReader guards against a truthful-but-lying Size header:
-		// if the multipart stream carries more bytes than declared, we
-		// stop at the cap instead of growing the buffer.
-		data, err := io.ReadAll(io.LimitReader(f, chatUploadMaxFileBytes+1))
-		f.Close()
-		if err != nil {
-			c.JSON(http.StatusBadRequest, gin.H{"error": "failed to read upload"})
-			return
-		}
-		if int64(len(data)) > chatUploadMaxFileBytes {
-			c.JSON(http.StatusRequestEntityTooLarge, gin.H{
-				"error": fmt.Sprintf("%s exceeds per-file limit (%d MB)", fh.Filename, chatUploadMaxFileBytes/(1024*1024)),
-			})
-			return
-		}
-
-		name := sanitizeFilename(fh.Filename)
-		// 16-byte (UUID-equivalent) random prefix. Within a single
-		// batch we also check for collisions — birthday on 128 bits
-		// is astronomical, but a bad PRNG or single re-used draw
-		// would silently overwrite a sibling upload with its own
-		// content and return two URIs pointing at one file.
-		var stored string
-		for attempt := 0; attempt < 4; attempt++ {
-			idBytes := make([]byte, 16)
-			if _, err := rand.Read(idBytes); err != nil {
-				c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to allocate upload ID"})
-				return
-			}
-			candidate := hex.EncodeToString(idBytes) + "-" + name
-			if _, taken := archive[candidate]; !taken {
-				stored = candidate
-				break
-			}
-		}
-		if stored == "" {
-			c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to allocate unique upload ID"})
-			return
-		}
-		archive[stored] = string(data)
-
-		mt := fh.Header.Get("Content-Type")
-		if mt == "" {
-			mt = mime.TypeByExtension(filepath.Ext(name))
-		}
-		uploaded = append(uploaded, ChatUploadedFile{
-			URI:      "workspace:" + chatUploadDir + "/" + stored,
-			Name:     name,
-			MimeType: mt,
-			Size:     int64(len(data)),
-		})
-	}
-
-	// mkdir -p is idempotent; we fire it every upload instead of
-	// caching state here so container restarts don't surprise us.
-	_, _ = h.templates.execInContainer(ctx, containerName, []string{"mkdir", "-p", chatUploadDir})
-
-	// Defence in depth: pre-remove each target path before extracting
-	// the tar. An agent with write access to /workspace could in
-	// theory race-create a symlink at <chatUploadDir>/<stored-name>
-	// pointing at a sensitive in-container path (its own /etc/*,
-	// mounted secrets). Docker's tar extraction on some drivers
-	// follows pre-existing symlinks at the destination. `rm -f` the
-	// exact stored-name closes that window — the UUID prefix on the
-	// name makes a successful race effectively impossible, but this
-	// guard costs nothing and documents the intent.
-	rmArgs := []string{"rm", "-f", "--"}
-	for stored := range archive {
-		rmArgs = append(rmArgs, chatUploadDir+"/"+stored)
-	}
-	_, _ = h.templates.execInContainer(ctx, containerName, rmArgs)
-
-	if err := h.copyFlatToContainer(ctx, containerName, chatUploadDir, archive); err != nil {
-		log.Printf("Chat upload copy failed for %s: %v", workspaceID, err)
-		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to stage files in workspace"})
+	// Build the forward request. Body is the (capped) reader from the
+	// inbound request — Go's http.Client streams it directly to the
+	// workspace, no intermediate buffering on the platform.
+	forwardURL := strings.TrimRight(wsURL, "/") + "/internal/chat/uploads/ingest"
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, forwardURL, c.Request.Body)
+	if err != nil {
+		log.Printf("chat_files Upload: build request failed for %s: %v", workspaceID, err)
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to construct forward request"})
 		return
 	}
+	// Forward the multipart Content-Type (with boundary) verbatim;
+	// without it the workspace's parser cannot find part boundaries.
+	if ct := c.Request.Header.Get("Content-Type"); ct != "" {
+		req.Header.Set("Content-Type", ct)
+	}
+	req.Header.Set("Authorization", "Bearer "+secret)
+	// Pass through Content-Length so the workspace can short-circuit
+	// the total-body cap before parsing. ContentLength on the request
+	// struct also lets Go's transport know whether to stream or send
+	// chunked-encoded.
+	if c.Request.ContentLength > 0 {
+		req.ContentLength = c.Request.ContentLength
+	}

-	c.JSON(http.StatusOK, gin.H{"files": uploaded})
-}
-
-// copyFlatToContainer extracts one tar of flat files into destPath
-// inside the container. Unlike the shared copyFilesToContainer helper
-// (which prepends destPath into tar entry names — correct for its
-// callers whose files relative-live inside a nested tree), this
-// helper writes tar entries with ONLY the flat filename so Docker's
-// extraction at destPath lands them directly in destPath, not at
-// destPath/destPath/... as the shared helper would.
-// Filenames are validated to contain no path separator so nothing
-// can escape destPath via an embedded "../" or a leading "/".
-func (h *ChatFilesHandler) copyFlatToContainer(ctx context.Context, containerName, destPath string, files map[string]string) error {
-	if h.templates.docker == nil {
-		return fmt.Errorf("docker not available")
-	}
-	var buf bytes.Buffer
-	tw := tar.NewWriter(&buf)
-	for name, content := range files {
-		if strings.ContainsAny(name, "/\\") || name == ".." || name == "." || name == "" {
-			return fmt.Errorf("unsafe flat filename: %q", name)
-		}
-		data := []byte(content)
-		if err := tw.WriteHeader(&tar.Header{
-			Name:     name, // relative — Docker resolves against destPath
-			Mode:     0644,
-			Size:     int64(len(data)),
-			Typeflag: tar.TypeReg,
-		}); err != nil {
-			return fmt.Errorf("tar header %q: %w", name, err)
-		}
-		if _, err := tw.Write(data); err != nil {
-			return fmt.Errorf("tar write %q: %w", name, err)
-		}
-	}
-	if err := tw.Close(); err != nil {
-		return fmt.Errorf("tar close: %w", err)
-	}
-	return h.templates.docker.CopyToContainer(ctx, containerName, destPath, &buf, container.CopyToContainerOptions{})
+	h.streamWorkspaceResponse(c, "upload", workspaceID, forwardURL, req, []string{"Content-Type"})
 }

 // Download handles GET /workspaces/:id/chat/download?path=<abs path>.
-// Streams the file bytes from the container with a correct
-// Content-Type and attachment Content-Disposition. Binary-safe —
-// unlike the existing JSON ReadFile endpoint which carries content
-// as a string (lossy for non-UTF-8 bytes).
+// Forwards over HTTP to the workspace's own /internal/file/read endpoint
+// (RFC #2312 PR-D), replacing the docker-cp tar-stream extraction that
+// only worked when the platform binary had local Docker socket access.
+//
+// Same path-safety contract as the legacy version: caller-side validation
+// is duplicated on the workspace side (internal_file_read.py) so a
+// platform bug or malicious caller bypassing one layer still hits the
+// other. This is "defence in depth via two parallel checks," not "trust
+// the workspace to validate" — the workspace doesn't trust the platform
+// either.
+//
+// Body is streamed end-to-end (no buffering on the platform), preserving
+// binary safety and arbitrary file size (the 50 MB cap on Upload doesn't
+// apply to artefacts the agent produced).
 func (h *ChatFilesHandler) Download(c *gin.Context) {
 	workspaceID := c.Param("id")
 	if err := validateWorkspaceID(workspaceID); err != nil {
@@ -362,54 +315,61 @@ func (h *ChatFilesHandler) Download(c *gin.Context) {
 	}

 	ctx := c.Request.Context()
-	if h.templates.docker == nil {
-		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "docker unavailable"})
-		return
-	}
-	containerName := h.templates.findContainer(ctx, workspaceID)
-	if containerName == "" {
-		c.JSON(http.StatusServiceUnavailable, gin.H{"error": "workspace container not running"})
+
+	wsURL, secret, ok := resolveWorkspaceForwardCreds(c, ctx, workspaceID, "download")
+	if !ok {
 		return
 	}

-	// docker cp returns a tar stream containing the requested path.
-	// For a regular file that's a single tar entry; we extract and
-	// stream the body through.
-	reader, _, err := h.templates.docker.CopyFromContainer(ctx, containerName, path)
+	// Build forward URL with the validated path encoded as a query param.
+	// url.Values handles all the percent-encoding correctly — a path with
+	// special chars (spaces, &, +) round-trips through both the platform's
+	// validator and the workspace-side validator.
+	forwardURL := strings.TrimRight(wsURL, "/") + "/internal/file/read?path=" + url.QueryEscape(path)
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, forwardURL, nil)
 	if err != nil {
-		c.JSON(http.StatusNotFound, gin.H{"error": "file not found"})
+		log.Printf("chat_files Download: build request failed for %s: %v", workspaceID, err)
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to construct forward request"})
 		return
 	}
-	defer reader.Close()
+	req.Header.Set("Authorization", "Bearer "+secret)

-	tr := tar.NewReader(reader)
-	hdr, err := tr.Next()
+	h.streamWorkspaceResponse(c, "download", workspaceID, forwardURL, req,
+		[]string{"Content-Type", "Content-Length", "Content-Disposition"})
+}
+
+// streamWorkspaceResponse executes the prepared forward request and
+// streams the workspace's response back to the inbound caller.
+// Forwards the named response headers verbatim. Centralizes the
+// "do request → check err → defer close → copy headers → set status →
+// io.Copy" tail that's identical between Upload and Download.
+//
+// op is the human-readable feature label ("upload"/"download") used
+// in log messages so operators can distinguish which feature ran.
+func (h *ChatFilesHandler) streamWorkspaceResponse(
+	c *gin.Context,
+	op, workspaceID, forwardURL string,
+	req *http.Request,
+	forwardHeaders []string,
+) {
+	resp, err := h.httpClient.Do(req)
 	if err != nil {
-		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to read archive"})
-		return
-	}
-	if hdr.Typeflag != tar.TypeReg {
-		c.JSON(http.StatusBadRequest, gin.H{"error": "path is not a regular file"})
+		log.Printf("chat_files %s: forward to %s failed: %v", op, forwardURL, err)
+		c.JSON(http.StatusBadGateway, gin.H{"error": "workspace unreachable"})
 		return
 	}
+	defer resp.Body.Close()

-	name := filepath.Base(path)
-	mt := mime.TypeByExtension(filepath.Ext(name))
-	if mt == "" {
-		mt = "application/octet-stream"
+	for _, hdr := range forwardHeaders {
+		if v := resp.Header.Get(hdr); v != "" {
+			c.Header(hdr, v)
+		}
 	}
-	c.Header("Content-Type", mt)
-	c.Header("Content-Length", fmt.Sprintf("%d", hdr.Size))
-	c.Header("Content-Disposition", contentDispositionAttachment(name))
-	c.Status(http.StatusOK)
-
-	// Stream exactly hdr.Size bytes. CopyN was chosen over LimitReader
-	// because it returns an error when the source is short — that
-	// surfaces a bug in the tar extraction path immediately instead
-	// of silently truncating. Agents can legitimately produce files
-	// larger than the 50 MB upload cap (that's a per-request inbound
-	// cap, not a per-artifact one), so we cannot clamp here.
-	if _, err := io.CopyN(c.Writer, tr, hdr.Size); err != nil {
-		log.Printf("Chat download stream error for %s (%s): %v", workspaceID, path, err)
+	c.Status(resp.StatusCode)
+	if _, err := io.Copy(c.Writer, resp.Body); err != nil {
+		// Mid-stream failure — too late to write a JSON error, just
+		// log so ops can correlate with the workspace's logs.
+		log.Printf("chat_files %s: stream response back failed for %s: %v", op, workspaceID, err)
 	}
 }
+
@@ -1,67 +1,91 @@
 package handlers

-// Unit tests for chat_files.go. The Docker-touching paths (Upload
-// actually copying into a container, Download actually streaming tar)
-// are exercised via integration tests — docker-in-docker is out of
-// scope for the unit suite. These tests cover the validation + error
-// surfaces that a caller can reach without a running container.
+// Unit tests for chat_files.go.
+//
+// Upload (HTTP-forward, RFC #2312 PR-C): exercised against an httptest
+// mock workspace + sqlmock-backed db.DB. The platform-side handler is
+// now a streaming proxy; assertions focus on:
+//   * input validation (400 on bad workspace id)
+//   * resolution failures (404 missing row, 503 missing secret/url)
+//   * forward shape (Authorization, Content-Type, body)
+//   * pass-through of the workspace's status + body
+//
+// Path-safety + sanitization that lived on the platform pre-#2312 is
+// now the workspace-side handler's concern; covered in the Python
+// suite (workspace/tests/test_internal_chat_uploads.py).

 import (
 	"bytes"
+	"database/sql"
+	"io"
 	"mime/multipart"
 	"net/http"
 	"net/http/httptest"
 	"strings"
 	"testing"

+	"github.com/DATA-DOG/go-sqlmock"
 	"github.com/gin-gonic/gin"
 )

-func TestSanitizeFilename(t *testing.T) {
-	cases := []struct {
-		in, want string
-	}{
-		{"report.pdf", "report.pdf"},
-		{"my file.pdf", "my_file.pdf"},
-		{"../../etc/passwd", "passwd"},
-		{"weird;$name`.txt", "weird__name_.txt"},
-		{"", "file"},
-		{".", "file"},
-		{"..", "file"},
-	}
-	for _, tc := range cases {
-		got := sanitizeFilename(tc.in)
-		if got != tc.want {
-			t.Errorf("sanitizeFilename(%q) = %q, want %q", tc.in, got, tc.want)
-		}
-	}
+// makeUploadRequest builds a gin context for POST /workspaces/:id/chat/uploads
+// with the given multipart body. The recorder is returned so callers can
+// assert status + body after invoking h.Upload(c).
+func makeUploadRequest(t *testing.T, workspaceID string, body *bytes.Buffer, contentType string) (*gin.Context, *httptest.ResponseRecorder) {
+	t.Helper()
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: workspaceID}}
+	req := httptest.NewRequest("POST", "/workspaces/"+workspaceID+"/chat/uploads", body)
+	req.Header.Set("Content-Type", contentType)
+	c.Request = req
+	return c, w
 }

-func TestSanitizeFilename_LongNamePreservesExtension(t *testing.T) {
-	// 120-char base + .pdf — the helper should truncate the base but
-	// keep the extension intact so content-type inference still works.
-	longBase := strings.Repeat("a", 120)
-	got := sanitizeFilename(longBase + ".pdf")
-	if len(got) > 100 {
-		t.Errorf("filename not truncated: len=%d", len(got))
-	}
-	if !strings.HasSuffix(got, ".pdf") {
-		t.Errorf("extension stripped: %q", got)
+// uploadFixture builds a minimal multipart/form-data body with a single
+// `files` part. The exact bytes don't matter for proxy tests — only that
+// the workspace receives the same boundary + headers we sent.
+func uploadFixture(t *testing.T) (*bytes.Buffer, string) {
+	t.Helper()
+	var buf bytes.Buffer
+	mw := multipart.NewWriter(&buf)
+	fw, err := mw.CreateFormFile("files", "fixture.txt")
+	if err != nil {
+		t.Fatalf("CreateFormFile: %v", err)
 	}
+	_, _ = fw.Write([]byte("fixture-payload"))
+	mw.Close()
+	return &buf, mw.FormDataContentType()
+}
+
+// expectURL stubs the SELECT that resolves the workspace's url.
+func expectURL(mock sqlmock.Sqlmock, workspaceID, url string) {
+	mock.ExpectQuery(`SELECT COALESCE\(url, ''\) FROM workspaces WHERE id = \$1`).
+		WithArgs(workspaceID).
+		WillReturnRows(sqlmock.NewRows([]string{"url"}).AddRow(url))
+}
+
+// expectURLMissing stubs the SELECT to return sql.ErrNoRows.
+func expectURLMissing(mock sqlmock.Sqlmock, workspaceID string) {
+	mock.ExpectQuery(`SELECT COALESCE\(url, ''\) FROM workspaces WHERE id = \$1`).
+		WithArgs(workspaceID).
+		WillReturnError(sql.ErrNoRows)
+}
+
+// expectInboundSecret stubs the SELECT performed by ReadPlatformInboundSecret.
+func expectInboundSecret(mock sqlmock.Sqlmock, workspaceID string, secret interface{}) {
+	mock.ExpectQuery(`SELECT platform_inbound_secret FROM workspaces WHERE id = \$1`).
+		WithArgs(workspaceID).
+		WillReturnRows(sqlmock.NewRows([]string{"platform_inbound_secret"}).AddRow(secret))
 }

 func TestChatUpload_InvalidWorkspaceID(t *testing.T) {
 	setupTestDB(t)
 	setupTestRedis(t)

-	tmplh := NewTemplatesHandler(t.TempDir(), nil)
-	h := NewChatFilesHandler(tmplh)
-
-	w := httptest.NewRecorder()
-	c, _ := gin.CreateTestContext(w)
-	c.Params = gin.Params{{Key: "id", Value: "not-a-uuid"}}
-	c.Request = httptest.NewRequest("POST", "/workspaces/not-a-uuid/chat/uploads", nil)
+	h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))

+	c, w := makeUploadRequest(t, "not-a-uuid", &bytes.Buffer{}, "")
 	h.Upload(c)

 	if w.Code != http.StatusBadRequest {
@@ -69,33 +93,240 @@ func TestChatUpload_InvalidWorkspaceID(t *testing.T) {
 	}
 }

-func TestChatUpload_MissingFiles(t *testing.T) {
-	setupTestDB(t)
+func TestChatUpload_WorkspaceNotInDB(t *testing.T) {
+	mock := setupTestDB(t)
 	setupTestRedis(t)

-	tmplh := NewTemplatesHandler(t.TempDir(), nil)
-	h := NewChatFilesHandler(tmplh)
-
-	// Multipart body with no `files` field — only a text field.
-	var buf bytes.Buffer
-	mw := multipart.NewWriter(&buf)
-	_ = mw.WriteField("other", "value")
-	mw.Close()
-
-	w := httptest.NewRecorder()
-	c, _ := gin.CreateTestContext(w)
-	c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
-	req := httptest.NewRequest("POST", "/workspaces/00000000-0000-0000-0000-000000000001/chat/uploads", &buf)
-	req.Header.Set("Content-Type", mw.FormDataContentType())
-	c.Request = req
+	wsID := "00000000-0000-0000-0000-000000000099"
+	expectURLMissing(mock, wsID)

+	h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))
+	body, ct := uploadFixture(t)
+	c, w := makeUploadRequest(t, wsID, body, ct)
 	h.Upload(c)

-	if w.Code != http.StatusBadRequest {
-		t.Errorf("expected 400 when files field missing, got %d: %s", w.Code, w.Body.String())
+	// QueryRow returning sql.ErrNoRows surfaces as 404. The validate-id
+	// step already passed; this is the next layer.
+	if w.Code != http.StatusNotFound {
+		t.Errorf("expected 404 when workspace row missing, got %d: %s", w.Code, w.Body.String())
 	}
-	if !strings.Contains(w.Body.String(), "files") {
-		t.Errorf("expected error to mention files field: %s", w.Body.String())
+}
+
+// TestChatUpload_NoInboundSecret_LazyHeal pins the lazy-heal flow
+// added 2026-04-30 alongside the SaaS shared-prepare refactor:
+//
+//   1. Reading the workspace's platform_inbound_secret returns NULL
+//      (legacy row from before RFC #2312).
+//   2. Handler MUST call wsauth.IssuePlatformInboundSecret (an UPDATE
+//      on the workspaces row) to backfill the secret, so the next
+//      upload after the workspace's heartbeat picks it up succeeds
+//      without operator action.
+//   3. Response is 503 with retry_after_seconds=30 — the workspace's
+//      local /configs/.platform_inbound_secret is also empty, so the
+//      forward this request would do still fails. The user retries
+//      after the next register response delivers the new secret.
+//
+// Pre-fix (before the lazy-heal): handlers returned 503 with
+// "Reprovision the workspace" — accurate, but every legacy workspace
+// would 503 forever until ops manually triggered a reprovision.
+func TestChatUpload_NoInboundSecret_LazyHeal(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	// Legacy row: URL set but platform_inbound_secret is NULL.
+	wsID := "00000000-0000-0000-0000-000000000041"
+	expectURL(mock, wsID, "http://127.0.0.1:1")
+	expectInboundSecret(mock, wsID, nil) // NULL — triggers lazy-heal
+	// Lazy-heal mint MUST land. If this expectation isn't matched,
+	// the upload handler skipped the backfill and ops would have to
+	// manually reprovision every legacy workspace.
+	mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret = \$1 WHERE id = \$2`).
+		WithArgs(sqlmock.AnyArg(), wsID).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))
+	body, ct := uploadFixture(t)
+	c, w := makeUploadRequest(t, wsID, body, ct)
+	h.Upload(c)
+
+	if w.Code != http.StatusServiceUnavailable {
+		t.Errorf("expected 503 when platform_inbound_secret missing, got %d: %s", w.Code, w.Body.String())
+	}
+	// Lazy-heal-success body steers the user to retry; the failure
+	// body steers them to reprovision. Distinguishing them pins which
+	// branch ran.
+	if !strings.Contains(w.Body.String(), "retry") {
+		t.Errorf("expected lazy-heal success response (retry hint), got: %s", w.Body.String())
+	}
+	if !strings.Contains(w.Body.String(), "30") {
+		t.Errorf("expected retry_after_seconds=30 in body, got: %s", w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("sqlmock expectations not met — lazy-heal mint did NOT run, regression of #2312 backfill: %v", err)
+	}
+}
+
+// TestChatUpload_NoInboundSecret_LazyHealFailure pins the alternate
+// branch: the platform_inbound_secret is NULL AND the lazy-heal mint
+// itself fails (e.g. DB unreachable). Handler must surface the
+// reprovision-steering error rather than silently swallowing.
+func TestChatUpload_NoInboundSecret_LazyHealFailure(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	wsID := "00000000-0000-0000-0000-000000000042"
+	expectURL(mock, wsID, "http://127.0.0.1:1")
+	expectInboundSecret(mock, wsID, nil) // NULL — triggers lazy-heal
+	mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret = \$1 WHERE id = \$2`).
+		WithArgs(sqlmock.AnyArg(), wsID).
+		WillReturnError(sql.ErrConnDone) // mint fails
+
+	h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))
+	body, ct := uploadFixture(t)
+	c, w := makeUploadRequest(t, wsID, body, ct)
+	h.Upload(c)
+
+	if w.Code != http.StatusServiceUnavailable {
+		t.Errorf("expected 503 when lazy-heal fails, got %d: %s", w.Code, w.Body.String())
+	}
+	if !strings.Contains(w.Body.String(), "RFC #2312") {
+		t.Errorf("expected detail to reference RFC #2312 on lazy-heal failure, got: %s", w.Body.String())
+	}
+	if !strings.Contains(w.Body.String(), "Reprovision") {
+		t.Errorf("expected reprovision hint on mint failure, got: %s", w.Body.String())
+	}
+}
+
+func TestChatUpload_NoURL(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	// Workspace registered but URL hasn't been reported yet (mid-boot).
+	wsID := "00000000-0000-0000-0000-000000000042"
+	expectURL(mock, wsID, "")
+
+	h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))
+	body, ct := uploadFixture(t)
+	c, w := makeUploadRequest(t, wsID, body, ct)
+	h.Upload(c)
+
+	if w.Code != http.StatusServiceUnavailable {
+		t.Errorf("expected 503 when workspace url empty, got %d: %s", w.Code, w.Body.String())
+	}
+}
+
+// captured snapshots everything the forwarder sent to the workspace so
+// we can assert auth + body + content-type forwarded correctly.
+type captured struct {
+	authorization string
+	contentType   string
+	method        string
+	path          string
+	body          []byte
+}
+
+func newCapturingWorkspace(t *testing.T, status int, response string) (*httptest.Server, *captured) {
+	t.Helper()
+	cap := &captured{}
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		cap.authorization = r.Header.Get("Authorization")
+		cap.contentType = r.Header.Get("Content-Type")
+		cap.method = r.Method
+		cap.path = r.URL.Path
+		body, _ := io.ReadAll(r.Body)
+		cap.body = body
+
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(status)
+		_, _ = w.Write([]byte(response))
+	}))
+	t.Cleanup(srv.Close)
+	return srv, cap
+}
+
+func TestChatUpload_ForwardsToWorkspace_HappyPath(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	srv, captured := newCapturingWorkspace(t, http.StatusOK, `{"files":[{"uri":"workspace:/workspace/.molecule/chat-uploads/abc-fixture.txt","name":"fixture.txt","size":15}]}`)
+
+	wsID := "00000000-0000-0000-0000-000000000043"
+	expectURL(mock, wsID, srv.URL)
+	expectInboundSecret(mock, wsID, "super-secret-123")
+
+	h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))
+	body, ct := uploadFixture(t)
+	c, w := makeUploadRequest(t, wsID, body, ct)
+	h.Upload(c)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200 from happy forward, got %d: %s", w.Code, w.Body.String())
+	}
+	if captured.method != "POST" {
+		t.Errorf("expected POST, got %s", captured.method)
+	}
+	if captured.path != "/internal/chat/uploads/ingest" {
+		t.Errorf("expected /internal/chat/uploads/ingest, got %s", captured.path)
+	}
+	if captured.authorization != "Bearer super-secret-123" {
+		t.Errorf("expected secret in Authorization header, got %q", captured.authorization)
+	}
+	if !strings.HasPrefix(captured.contentType, "multipart/form-data") {
+		t.Errorf("expected multipart Content-Type forwarded, got %q", captured.contentType)
+	}
+	// Body shape: must contain the multipart-encoded fixture content.
+	if !bytes.Contains(captured.body, []byte("fixture-payload")) {
+		t.Errorf("expected body to contain fixture payload, got %d bytes", len(captured.body))
+	}
+	// Response body streamed back unchanged.
+	if !strings.Contains(w.Body.String(), "fixture.txt") {
+		t.Errorf("expected workspace response forwarded back, got: %s", w.Body.String())
+	}
+}
+
+func TestChatUpload_ForwardsErrorStatusUnchanged(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	// Workspace returns 413 with its standard "exceeds per-file limit"
+	// shape. Platform must propagate, NOT remap to 500.
+	srv, _ := newCapturingWorkspace(t, http.StatusRequestEntityTooLarge, `{"error":"big.bin exceeds per-file limit (25 MB)"}`)
+
+	wsID := "00000000-0000-0000-0000-000000000044"
+	expectURL(mock, wsID, srv.URL)
+	expectInboundSecret(mock, wsID, "tok")
+
+	h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))
+	body, ct := uploadFixture(t)
+	c, w := makeUploadRequest(t, wsID, body, ct)
+	h.Upload(c)
+
+	if w.Code != http.StatusRequestEntityTooLarge {
+		t.Errorf("expected 413 propagated unchanged, got %d", w.Code)
+	}
+	if !strings.Contains(w.Body.String(), "exceeds per-file limit") {
+		t.Errorf("expected workspace's 413 body verbatim, got: %s", w.Body.String())
+	}
+}
+
+func TestChatUpload_WorkspaceUnreachable(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	wsID := "00000000-0000-0000-0000-000000000045"
+	// 127.0.0.1:1 — port 1 has no listener → connect refused.
+	expectURL(mock, wsID, "http://127.0.0.1:1")
+	expectInboundSecret(mock, wsID, "tok")
+
+	h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))
+	body, ct := uploadFixture(t)
+	c, w := makeUploadRequest(t, wsID, body, ct)
+	h.Upload(c)
+
+	// Connect-refused → BadGateway. NOT 500 — the platform itself is
+	// fine; the upstream is broken.
+	if w.Code != http.StatusBadGateway {
+		t.Errorf("expected 502 on workspace unreachable, got %d: %s", w.Code, w.Body.String())
 	}
 }

@@ -103,8 +334,7 @@ func TestChatDownload_InvalidPath(t *testing.T) {
 	setupTestDB(t)
 	setupTestRedis(t)

-	tmplh := NewTemplatesHandler(t.TempDir(), nil)
-	h := NewChatFilesHandler(tmplh)
+	h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))

 	cases := []struct {
 		name, path, wantSubstr string
@@ -173,22 +403,158 @@ func TestContentDispositionAttachment_Escapes(t *testing.T) {
 	}
 }

-func TestChatDownload_DockerUnavailable(t *testing.T) {
-	setupTestDB(t)
-	setupTestRedis(t)
-
-	tmplh := NewTemplatesHandler(t.TempDir(), nil) // docker=nil
-	h := NewChatFilesHandler(tmplh)
-
+// makeDownloadRequest builds a gin context for GET /workspaces/:id/chat/download
+// with the given path query param.
+func makeDownloadRequest(t *testing.T, workspaceID, path string) (*gin.Context, *httptest.ResponseRecorder) {
+	t.Helper()
 	w := httptest.NewRecorder()
 	c, _ := gin.CreateTestContext(w)
-	c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000001"}}
-	req := httptest.NewRequest("GET", "/workspaces/xxx/chat/download?path=/workspace/report.pdf", nil)
-	c.Request = req
+	c.Params = gin.Params{{Key: "id", Value: workspaceID}}
+	c.Request = httptest.NewRequest("GET", "/workspaces/"+workspaceID+"/chat/download?path="+path, nil)
+	return c, w
+}

+func TestChatDownload_WorkspaceNotInDB(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	wsID := "00000000-0000-0000-0000-000000000099"
+	mock.ExpectQuery(`SELECT COALESCE\(url, ''\) FROM workspaces WHERE id = \$1`).
+		WithArgs(wsID).
+		WillReturnError(sql.ErrNoRows)
+
+	h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))
+	c, w := makeDownloadRequest(t, wsID, "/workspace/foo.txt")
+	h.Download(c)
+
+	if w.Code != http.StatusNotFound {
+		t.Errorf("expected 404 when workspace row missing, got %d", w.Code)
+	}
+}
+
+// TestChatDownload_NoInboundSecret_LazyHeal — same lazy-heal flow
+// as TestChatUpload_NoInboundSecret_LazyHeal but on the Download
+// handler. Pinned separately because Upload + Download have
+// independent code paths into ReadPlatformInboundSecret; a partial
+// regression that healed Upload but skipped Download is the kind of
+// drift we want to fail the test, not ship.
+func TestChatDownload_NoInboundSecret_LazyHeal(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	wsID := "00000000-0000-0000-0000-000000000051"
+	expectURL(mock, wsID, "http://127.0.0.1:1")
+	expectInboundSecret(mock, wsID, nil)
+	mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret = \$1 WHERE id = \$2`).
+		WithArgs(sqlmock.AnyArg(), wsID).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+
+	h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))
+	c, w := makeDownloadRequest(t, wsID, "/workspace/foo.txt")
 	h.Download(c)

 	if w.Code != http.StatusServiceUnavailable {
-		t.Errorf("expected 503 when docker is nil, got %d: %s", w.Code, w.Body.String())
+		t.Errorf("expected 503 when platform_inbound_secret missing, got %d: %s", w.Code, w.Body.String())
+	}
+	if !strings.Contains(w.Body.String(), "retry") {
+		t.Errorf("expected lazy-heal success response (retry hint), got: %s", w.Body.String())
+	}
+	if err := mock.ExpectationsWereMet(); err != nil {
+		t.Errorf("sqlmock expectations not met — Download lazy-heal mint did NOT run: %v", err)
+	}
+}
+
+func TestChatDownload_NoInboundSecret_LazyHealFailure(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	wsID := "00000000-0000-0000-0000-000000000052"
+	expectURL(mock, wsID, "http://127.0.0.1:1")
+	expectInboundSecret(mock, wsID, nil)
+	mock.ExpectExec(`UPDATE workspaces SET platform_inbound_secret = \$1 WHERE id = \$2`).
+		WithArgs(sqlmock.AnyArg(), wsID).
+		WillReturnError(sql.ErrConnDone)
+
+	h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))
+	c, w := makeDownloadRequest(t, wsID, "/workspace/foo.txt")
+	h.Download(c)
+
+	if w.Code != http.StatusServiceUnavailable {
+		t.Errorf("expected 503 when lazy-heal fails, got %d: %s", w.Code, w.Body.String())
+	}
+	if !strings.Contains(w.Body.String(), "RFC #2312") {
+		t.Errorf("expected detail to reference RFC #2312 on lazy-heal failure, got: %s", w.Body.String())
+	}
+}
+
+func TestChatDownload_ForwardsToWorkspace_HappyPath(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	body := []byte("file-contents-here\nmultiline\n")
+	cap := &captured{}
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		cap.authorization = r.Header.Get("Authorization")
+		cap.method = r.Method
+		cap.path = r.URL.Path
+		w.Header().Set("Content-Type", "text/plain")
+		w.Header().Set("Content-Disposition", `attachment; filename="report.txt"`)
+		w.WriteHeader(http.StatusOK)
+		_, _ = w.Write(body)
+	}))
+	t.Cleanup(srv.Close)
+
+	wsID := "00000000-0000-0000-0000-000000000052"
+	expectURL(mock, wsID, srv.URL)
+	expectInboundSecret(mock, wsID, "the-secret")
+
+	h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))
+	c, w := makeDownloadRequest(t, wsID, "/workspace/report.txt")
+	h.Download(c)
+
+	if w.Code != http.StatusOK {
+		t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
+	}
+	if cap.authorization != "Bearer the-secret" {
+		t.Errorf("expected secret in Authorization header, got %q", cap.authorization)
+	}
+	if cap.method != "GET" {
+		t.Errorf("expected GET, got %s", cap.method)
+	}
+	if cap.path != "/internal/file/read" {
+		t.Errorf("expected /internal/file/read, got %s", cap.path)
+	}
+	if got := w.Header().Get("Content-Type"); got != "text/plain" {
+		t.Errorf("Content-Type not forwarded: %q", got)
+	}
+	if got := w.Header().Get("Content-Disposition"); got != `attachment; filename="report.txt"` {
+		t.Errorf("Content-Disposition not forwarded: %q", got)
+	}
+	if got := w.Body.Bytes(); !bytes.Equal(got, body) {
+		t.Errorf("body mismatch: got %q, want %q", got, body)
+	}
+}
+
+func TestChatDownload_404FromWorkspacePropagated(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(http.StatusNotFound)
+		_, _ = w.Write([]byte(`{"error":"file not found"}`))
+	}))
+	t.Cleanup(srv.Close)
+
+	wsID := "00000000-0000-0000-0000-000000000053"
+	expectURL(mock, wsID, srv.URL)
+	expectInboundSecret(mock, wsID, "tok")
+
+	h := NewChatFilesHandler(NewTemplatesHandler(t.TempDir(), nil))
+	c, w := makeDownloadRequest(t, wsID, "/workspace/missing.txt")
+	h.Download(c)
+
+	if w.Code != http.StatusNotFound {
+		t.Errorf("expected 404 propagated, got %d", w.Code)
 	}
 }
--- a/Show More
+++ b/Show More