fix(org_helpers): correct duplicate phrase in loadWorkspaceEnv comment

The comment had the phrase "the workspace-specific .env" duplicated. Removed the redundant repetition. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
fix(workspace-server): inject /configs token files agent-owned, not root
2026-05-16 10:27:13 +00:00 · 2026-05-16 02:19:11 -07:00 · 2026-05-16 07:32:27 +00:00 · 2026-05-16 06:58:48 +00:00 · 2026-05-15 23:08:56 -07:00 · 2026-05-15 22:27:51 +00:00
8 changed files with 778 additions and 32 deletions
@@ -11,19 +11,12 @@ from __future__ import annotations
 import argparse
 import json
 import os
-import socket  # mc#1234: set default timeout to prevent indefinite hangs
 import sys
 import time
 import urllib.error
 import urllib.request
 from urllib.parse import quote

-# Prevent HTTP hangs (e.g. Gitea commit-status API going slow). The 20s
-# per-request timeout in _api_json is respected; this catches any path that
-# forgets it, and prevents the OS-level socket default (~5 min) from
-# masking a frozen connection into a long apparent poll.
-socket.setdefaulttimeout(30)
-

 TRUE_VALUES = {"1", "true", "yes", "on", "disabled", "disable"}
 PROD_CP_URL = "https://api.moleculesai.app"
@@ -32,12 +25,9 @@ DEFAULT_REQUIRED_CONTEXTS = [
    "CI / Canvas (Next.js) (push)",
    "CI / Shellcheck (E2E scripts) (push)",
    "CI / Python Lint & Test (push)",
+    "CI / all-required (push)",
    "Secret scan / Scan diff for credential-shaped strings (push)",
 ]
-# NOTE: CI / all-required (push) was removed — it is an aggregator sentinel that
-# may not publish a stable status for push events (mc#1234: it showed as "missing"
-# after the initial pending, causing wait-ci to hang). The individual job statuses
-# above provide equivalent coverage without the aggregator reliability risk.
 TERMINAL_FAILURE_STATES = {"failure", "error", "cancelled", "canceled", "skipped"}


@@ -141,7 +131,7 @@ def required_contexts(env: dict[str, str]) -> list[str]:
 def _api_json(url: str, token: str) -> dict:
    req = urllib.request.Request(url, headers={"Authorization": f"token {token}"})
    try:
-        with urllib.request.urlopen(req, timeout=60) as resp:
+        with urllib.request.urlopen(req, timeout=20) as resp:
            return json.loads(resp.read())
    except urllib.error.HTTPError as exc:
        body = exc.read().decode("utf-8", errors="replace")[:500]
@@ -151,7 +141,7 @@ def _api_json(url: str, token: str) -> dict:
 def _api_json_optional(url: str, token: str) -> tuple[int, dict | None]:
    req = urllib.request.Request(url, headers={"Authorization": f"token {token}"})
    try:
-        with urllib.request.urlopen(req, timeout=60) as resp:
+        with urllib.request.urlopen(req, timeout=20) as resp:
            return resp.status, json.loads(resp.read())
    except urllib.error.HTTPError as exc:
        if exc.code == 404:
@@ -0,0 +1,225 @@
+name: E2E Peer Visibility (literal MCP list_peers)
+
+# WHY A DEDICATED WORKFLOW (not folded into e2e-staging-saas.yml)
+# --------------------------------------------------------------
+# This is the systemic fix for a real trust failure. Hermes and OpenClaw
+# were reported "fleet-verified / cascade-complete" because the *proxy*
+# signals were green (registry registration + heartbeat for Hermes; model
+# round-trip 200 for OpenClaw). A freshly-provisioned workspace asked on
+# canvas "can you see your peers" actually FAILS:
+#   - Hermes: 401 on the molecule MCP `list_peers` call
+#   - OpenClaw: native `sessions_list` fallback, sees no platform peers
+# Tasks #142/#159 were even marked "completed" under this proxy flaw.
+#
+# A dedicated workflow (vs extending e2e-staging-saas.yml) because:
+#   - It must provision MULTIPLE distinct runtimes (hermes, openclaw,
+#     claude-code) in ONE org and assert each sees the others. The
+#     full-saas script is single-runtime-per-run (E2E_RUNTIME) and folding
+#     a multi-runtime matrix into it would conflate concerns and bloat its
+#     already-45-min run.
+#   - It needs its own concurrency group so it doesn't fight full-saas /
+#     canvas for the staging org-creation quota.
+#   - It needs an independent, non-required status-context name so it can
+#     be RED today (the in-flight Hermes-401 / OpenClaw-MCP-wiring fixes
+#     have not landed) WITHOUT wedging unrelated merges — and flipped to
+#     REQUIRED in one branch-protection edit once it goes green
+#     (flip-to-required checklist: molecule-core#1296).
+#
+# THE ASSERTION IS NOT A PROXY. The driving script
+# tests/e2e/test_peer_visibility_mcp_staging.sh issues the byte-for-byte
+# JSON-RPC `tools/call name=list_peers` envelope to `POST
+# /workspaces/:id/mcp` using each workspace's OWN bearer token, through
+# the real WorkspaceAuth + MCPRateLimiter middleware chain — the exact
+# call mcp_molecule_list_peers makes from a canvas agent. It does NOT
+# read a registry row, /health, the heartbeat table, or
+# GET /registry/:id/peers.
+#
+# HONEST GATE — NO continue-on-error. Per feedback_fix_root_not_symptom a
+# fake-green mask would defeat the entire purpose. This workflow goes red
+# on today's broken behavior and green only when the root-cause fixes
+# actually land. It is intentionally NOT in branch_protections — see PR
+# body for the required-vs-not decision + flip tracking issue.
+#
+# Gitea 1.22.6 / act_runner notes honored:
+#   - No cross-repo `uses:` (feedback_gitea_cross_repo_uses_blocked). The
+#     actions/checkout SHA is the one e2e-staging-canvas.yml already uses
+#     successfully (a mirrored SHA — see #1277/PR#1292 root-cause).
+#   - Per-SHA concurrency, not global (feedback_concurrency_group_per_sha).
+#   - Workflow-level GITHUB_SERVER_URL pinned
+#     (feedback_act_runner_github_server_url).
+#   - pr-validate posts a status under the same check name so a
+#     workflow-only PR is not silently statusless and the context is
+#     flip-to-required-ready (mirrors e2e-staging-saas.yml's proven shape;
+#     real EC2-provisioning E2E is push/dispatch/cron only — it is 30+ min
+#     and cannot run per-PR-update).
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - 'workspace-server/internal/handlers/mcp.go'
+      - 'workspace-server/internal/handlers/mcp_tools.go'
+      - 'workspace-server/internal/middleware/**'
+      - 'workspace-server/internal/handlers/registry.go'
+      - 'workspace-server/internal/handlers/workspace.go'
+      - 'workspace/a2a_mcp_server.py'
+      - 'workspace/platform_tools/registry.py'
+      - 'tests/e2e/test_peer_visibility_mcp_staging.sh'
+      - '.gitea/workflows/e2e-peer-visibility.yml'
+  pull_request:
+    branches: [main]
+    paths:
+      - 'workspace-server/internal/handlers/mcp.go'
+      - 'workspace-server/internal/handlers/mcp_tools.go'
+      - 'workspace-server/internal/middleware/**'
+      - 'workspace-server/internal/handlers/registry.go'
+      - 'workspace-server/internal/handlers/workspace.go'
+      - 'workspace/a2a_mcp_server.py'
+      - 'workspace/platform_tools/registry.py'
+      - 'tests/e2e/test_peer_visibility_mcp_staging.sh'
+      - '.gitea/workflows/e2e-peer-visibility.yml'
+  workflow_dispatch:
+  schedule:
+    # 07:30 UTC daily — catches AMI / template-hermes / template-openclaw
+    # drift even on quiet days. Offset 30m from e2e-staging-saas (07:00)
+    # so the two don't collide on the staging org-creation quota.
+    - cron: '30 7 * * *'
+
+concurrency:
+  # Per-SHA (feedback_concurrency_group_per_sha). A single global group
+  # would let a queued staging/main push behind a PR run get cancelled,
+  # leaving any gate that reads "completed run at SHA" stuck.
+  group: e2e-peer-visibility-${{ github.event.pull_request.head.sha || github.sha }}
+  cancel-in-progress: false
+
+env:
+  GITHUB_SERVER_URL: https://git.moleculesai.app
+
+jobs:
+  # PR path: post a real status under the required-ready check name so a
+  # workflow-only PR is never silently statusless. The actual EC2 E2E is
+  # push/dispatch/cron only (30+ min). This is NOT a fake-green mask of
+  # the real assertion — it validates the driving script's bash syntax
+  # (`bash -n`; the inline python snippets are opaque strings to that
+  # check) so a syntactically broken test script fails at PR time.
+  pr-validate:
+    name: E2E Peer Visibility
+    runs-on: ubuntu-latest
+    if: github.event_name == 'pull_request'
+    timeout-minutes: 5
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      - name: Validate driving script
+        run: |
+          bash -n tests/e2e/test_peer_visibility_mcp_staging.sh
+          echo "test_peer_visibility_mcp_staging.sh — bash syntax OK"
+          echo "Real fresh-provision MCP list_peers E2E runs on push to"
+          echo "main / workflow_dispatch / daily cron (30+ min EC2 boot)."
+
+  # Real gate: provisions a throwaway org + sibling-per-runtime, drives
+  # the LITERAL list_peers MCP call per runtime, asserts 200 + expected
+  # peer set, then scoped teardown. push(main)/dispatch/cron only.
+  peer-visibility:
+    name: E2E Peer Visibility
+    runs-on: ubuntu-latest
+    if: github.event_name != 'pull_request'
+    timeout-minutes: 60
+
+    env:
+      MOLECULE_CP_URL: https://staging-api.moleculesai.app
+      MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
+      # LLM provider key so each runtime can authenticate at boot.
+      # Priority MiniMax → direct-Anthropic → OpenAI matches
+      # test_staging_full_saas.sh's secrets-injection chain.
+      E2E_MINIMAX_API_KEY: ${{ secrets.MOLECULE_STAGING_MINIMAX_API_KEY }}
+      E2E_ANTHROPIC_API_KEY: ${{ secrets.MOLECULE_STAGING_ANTHROPIC_API_KEY }}
+      E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_API_KEY }}
+      E2E_RUN_ID: "${{ github.run_id }}-${{ github.run_attempt }}"
+      PV_RUNTIMES: "hermes openclaw claude-code"
+
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Verify admin token present
+        run: |
+          if [ -z "$MOLECULE_ADMIN_TOKEN" ]; then
+            echo "::error::CP_STAGING_ADMIN_API_TOKEN secret not set (Railway staging CP_ADMIN_API_TOKEN)"
+            exit 2
+          fi
+          echo "Admin token present"
+
+      - name: Verify an LLM key present
+        run: |
+          if [ -z "${E2E_MINIMAX_API_KEY:-}" ] && [ -z "${E2E_ANTHROPIC_API_KEY:-}" ] && [ -z "${E2E_OPENAI_API_KEY:-}" ]; then
+            echo "::error::No LLM provider key set — workspaces fail at boot with 'No provider API key found'. Set MOLECULE_STAGING_MINIMAX_API_KEY (or ANTHROPIC / OPENAI)."
+            exit 2
+          fi
+          echo "LLM key present"
+
+      - name: CP staging health preflight
+        run: |
+          # `|| true` keeps the step alive when curl itself fails (DNS,
+          # connect, --max-time). -w still prints "000" in that case, so
+          # the annotated ::error:: below is emitted rather than the step
+          # aborting on curl's bare exit status when the step shell runs
+          # with errexit (the Actions default for `run:`).
+          code=$(curl -sS -o /dev/null -w "%{http_code}" --max-time 10 "$MOLECULE_CP_URL/health" || true)
+          if [ "$code" != "200" ]; then
+            echo "::error::Staging CP unhealthy (HTTP $code) — infra, not a workspace bug. Failing loud per feedback_fix_root_not_symptom."
+            exit 1
+          fi
+          echo "Staging CP healthy"
+
+      - name: Run fresh-provision peer-visibility E2E (literal MCP list_peers)
+        run: bash tests/e2e/test_peer_visibility_mcp_staging.sh
+
+      # Belt-and-braces scoped teardown: the script installs an EXIT/INT/
+      # TERM trap, but if the runner itself is cancelled the trap may not
+      # fire. This always() step deletes ONLY the e2e-pv-<run_id> org this
+      # run created — never a cluster-wide sweep
+      # (feedback_never_run_cluster_cleanup_tests_on_live_platform). The
+      # admin DELETE is idempotent so double-invoking is safe;
+      # sweep-stale-e2e-orgs is the final net (slug starts with 'e2e-').
+      - name: Teardown safety net (runs on cancel/failure)
+        if: always()
+        env:
+          ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
+        run: |
+          set +e
+          orgs=$(curl -sS "$MOLECULE_CP_URL/cp/admin/orgs?limit=500" \
+            -H "Authorization: Bearer $ADMIN_TOKEN" 2>/dev/null \
+            | python3 -c "
+          import json, sys, os, datetime
+          run_id = os.environ.get('GITHUB_RUN_ID', '')
+          try:
+              d = json.load(sys.stdin)
+          except Exception:
+              print(''); sys.exit(0)
+          # ONLY sweep slugs from THIS run. e2e-pv-<YYYYMMDD>-<run_id>-...
+          # Sweep today AND yesterday's UTC date so a midnight-crossing run
+          # still matches its own slug (same bug class as the saas/canvas
+          # safety nets).
+          today = datetime.date.today()
+          yest = today - datetime.timedelta(days=1)
+          dates = (today.strftime('%Y%m%d'), yest.strftime('%Y%m%d'))
+          if run_id:
+              prefixes = tuple(f'e2e-pv-{dt}-{run_id}-' for dt in dates)
+          else:
+              prefixes = tuple(f'e2e-pv-{dt}-' for dt in dates)
+          orgs = d if isinstance(d, list) else d.get('orgs', [])
+          cands = [o['slug'] for o in orgs
+                   if any(o.get('slug','').startswith(p) for p in prefixes)
+                   and o.get('instance_status') not in ('purged',)]
+          print('\n'.join(cands))
+          " 2>/dev/null)
+          for slug in $orgs; do
+            echo "Safety-net teardown: $slug"
+            set +e
+            curl -sS -o /tmp/pv-cleanup.out -w "%{http_code}" \
+              -X DELETE "$MOLECULE_CP_URL/cp/admin/tenants/$slug" \
+              -H "Authorization: Bearer $ADMIN_TOKEN" \
+              -H "Content-Type: application/json" \
+              -d "{\"confirm\":\"$slug\"}" >/tmp/pv-cleanup.code
+            set -e
+            code=$(cat /tmp/pv-cleanup.code 2>/dev/null || echo "000")
+            if [ "$code" = "200" ] || [ "$code" = "204" ]; then
+              echo "[teardown] deleted $slug (HTTP $code)"
+            else
+              echo "::warning::pv teardown for $slug returned HTTP $code — sweep-stale-e2e-orgs will catch it within MAX_AGE_MINUTES. Body: $(head -c 300 /tmp/pv-cleanup.out 2>/dev/null)"
+            fi
+          done
+          exit 0
@@ -176,7 +176,7 @@ export function deriveProvidersFromModels(models: ModelSpec[]): string[] {
 // exactly the point of the platform adaptor. The deep `~/.hermes/
 // config.yaml` on the container is a separate runtime-internal file,
 // not this one.
-const RUNTIMES_WITH_OWN_CONFIG = new Set<string>(["external", "kimi", "kimi-cli"]);
+const RUNTIMES_WITH_OWN_CONFIG = new Set<string>(["external", "kimi", "kimi-cli", "openclaw"]);

 const FALLBACK_RUNTIME_OPTIONS: RuntimeOption[] = [
  { value: "", label: "LangGraph (default)", models: [], providers: [] },
@@ -8,14 +8,18 @@ import { getTenantSlug } from "./tenant";
 export const PLATFORM_URL =
  process.env.NEXT_PUBLIC_PLATFORM_URL ?? "http://localhost:8080";

-// 15s is long enough for slow CP queries but short enough that a
-// hung backend doesn't leave the UI spinning forever. The abort
-// propagates through AbortController so React components can observe
-// the error and render a retry affordance. Callers that know the
-// endpoint is intentionally slow (org import walks a tree of
-// workspaces with server-side pacing) can pass `timeoutMs` to
-// override.
-const DEFAULT_TIMEOUT_MS = 15_000;
+// 35s is long enough for the slowest server-side path (EIC SSH
+// tunnel for tenant EC2 file operations, bounded server-side by
+// `eicFileOpTimeout = 30 * time.Second` in
+// workspace-server/internal/handlers/template_files_eic.go) so the
+// canvas surfaces the server's real error instead of aborting first
+// with a generic timeout. Shorter values caused "Save & Restart" to
+// time out at the client before the backend returned its 5xx. The
+// abort still propagates through AbortController so React components
+// can render a retry affordance. Callers that know an endpoint is
+// intentionally slow (org import walks a tree of workspaces with
+// server-side pacing) can pass `timeoutMs` to override.
+const DEFAULT_TIMEOUT_MS = 35_000;

 export interface RequestOptions {
  timeoutMs?: number;
@@ -0,0 +1,376 @@
+#!/usr/bin/env bash
+# Staging E2E — fresh-provision peer-visibility gate via the LITERAL MCP path.
+#
+# WHY THIS EXISTS
+# ---------------
+# Hermes and OpenClaw were repeatedly reported "fleet-verified / cascade-
+# complete" because the *proxy* signals were green:
+#   - registry-registration + heartbeat (Hermes), and
+#   - model round-trip 200 (OpenClaw).
+# But a freshly-provisioned workspace, asked on canvas "can you see your
+# peers", actually FAILS:
+#   - Hermes: 401 on the molecule MCP `list_peers` call,
+#   - OpenClaw: falls back to native `sessions_list`, sees no platform peers.
+# Tasks #142/#159 were even marked "completed" under this same proxy flaw.
+#
+# This script codifies the LITERAL user-facing path so it can never silently
+# regress: it provisions a brand-new throwaway org + sibling workspaces via
+# the real control-plane provisioning path, then for each runtime that should
+# have platform peer-visibility it drives the EXACT MCP call the canvas agent
+# makes — `POST /workspaces/:id/mcp` JSON-RPC tools/call name=list_peers,
+# authenticated by that workspace's own bearer token through the real
+# WorkspaceAuth + MCPRateLimiter middleware chain. It then asserts:
+#   (1) HTTP 200,
+#   (2) JSON-RPC `result` present (NOT an `error` object — a -32000
+#       "tool call failed" or a 401 from WorkspaceAuth fails here),
+#   (3) the returned peer set CONTAINS the other provisioned sibling
+#       workspace IDs — not an empty list, not a native-sessions fallback.
+#
+# This is NOT a proxy. It does not look at a registry row, /health, the
+# heartbeat table, or `GET /registry/:id/peers`. It drives the byte-for-byte
+# JSON-RPC envelope that mcp_molecule_list_peers issues from a real agent.
+#
+# It is written to FAIL on today's broken Hermes/OpenClaw behavior and go
+# green only when the in-flight root-cause fixes (Hermes-401, OpenClaw MCP
+# wiring) actually land. That is the point: it is the objective proof gate.
+#
+# AUTH MODEL (mirrors tests/e2e/test_staging_full_saas.sh)
+# --------------------------------------------------------
+#   Single MOLECULE_ADMIN_TOKEN (= CP_ADMIN_API_TOKEN on Railway staging)
+#   drives: POST /cp/admin/orgs (provision), GET
+#   /cp/admin/orgs/:slug/admin-token (per-tenant token), DELETE
+#   /cp/admin/tenants/:slug (teardown). The per-tenant admin token drives
+#   tenant workspace creation; each workspace's OWN auth_token (returned by
+#   POST /workspaces) drives its MCP call.
+#
+# Required env:
+#   MOLECULE_ADMIN_TOKEN   CP admin bearer — Railway staging CP_ADMIN_API_TOKEN
+# Optional env:
+#   MOLECULE_CP_URL        default https://staging-api.moleculesai.app
+#   E2E_RUN_ID             slug suffix; CI passes ${GITHUB_RUN_ID}
+#   PV_RUNTIMES            space list; default "hermes openclaw claude-code"
+#   E2E_PROVISION_TIMEOUT_SECS  default 1800 (hermes/openclaw cold EC2 budget)
+#   E2E_MINIMAX_API_KEY / E2E_ANTHROPIC_API_KEY / E2E_OPENAI_API_KEY
+#                          LLM provider key injected so the runtime can boot
+#   E2E_KEEP_ORG           1 → skip teardown (local debugging only)
+#
+# Exit codes:
+#   0  every runtime saw its peers via the literal MCP call
+#   1  generic failure
+#   2  missing required env
+#   3  provisioning timed out
+#   4  teardown left orphan resources
+#   10 peer-visibility regression reproduced (the gate firing as designed)
+
+set -uo pipefail
+
+CP_URL="${MOLECULE_CP_URL:-https://staging-api.moleculesai.app}"
+ADMIN_TOKEN="${MOLECULE_ADMIN_TOKEN:?MOLECULE_ADMIN_TOKEN required — Railway staging CP_ADMIN_API_TOKEN}"
+RUN_ID_SUFFIX="${E2E_RUN_ID:-$(date +%H%M%S)-$$}"
+PV_RUNTIMES="${PV_RUNTIMES:-hermes openclaw claude-code}"
+PROVISION_TIMEOUT_SECS="${E2E_PROVISION_TIMEOUT_SECS:-1800}"
+
+# Slug MUST start with 'e2e-' so the sweep-stale-e2e-orgs safety net
+# (EPHEMERAL_PREFIXES) catches any leak this run fails to tear down.
+SLUG="e2e-pv-$(date +%Y%m%d)-${RUN_ID_SUFFIX}"
+SLUG=$(echo "$SLUG" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-' | head -c 32)
+
+ORG_ID=""
+TENANT_URL=""
+TENANT_TOKEN=""
+
+log()  { echo "[$(date +%H:%M:%S)] $*"; }
+fail() { echo "[$(date +%H:%M:%S)] ❌ $*" >&2; exit 1; }
+ok()   { echo "[$(date +%H:%M:%S)] ✅ $*"; }
+
+admin_call() {
+  local method="$1" path="$2"; shift 2
+  curl -sS -X "$method" "$CP_URL$path" \
+    -H "Authorization: Bearer $ADMIN_TOKEN" \
+    -H "Content-Type: application/json" "$@"
+}
+tenant_call() {
+  local method="$1" path="$2"; shift 2
+  curl -sS -X "$method" "$TENANT_URL$path" \
+    -H "Authorization: Bearer $TENANT_TOKEN" \
+    -H "X-Molecule-Org-Id: $ORG_ID" \
+    -H "Content-Type: application/json" "$@"
+}
+
+# ─── Scoped teardown ───────────────────────────────────────────────────
+# Deletes ONLY the org this run created (DELETE /cp/admin/tenants/$SLUG
+# with the {"confirm":$SLUG} fat-finger guard). Never a cluster-wide
+# sweep — honors feedback_cleanup_after_each_test and
+# feedback_never_run_cluster_cleanup_tests_on_live_platform. The
+# workflow's always() step + sweep-stale-e2e-orgs are the outer nets.
+teardown() {
+  local rc=$?
+  set +e
+  if [ "${E2E_KEEP_ORG:-0}" = "1" ]; then
+    echo ""
+    log "[teardown] E2E_KEEP_ORG=1 — leaving $SLUG for debugging (REMEMBER TO DELETE)"
+    exit $rc
+  fi
+  echo ""
+  log "[teardown] DELETE /cp/admin/tenants/$SLUG (scoped to this run only)"
+  admin_call DELETE "/cp/admin/tenants/$SLUG" --max-time 120 \
+    -d "{\"confirm\":\"$SLUG\"}" >/dev/null 2>&1
+  for j in $(seq 1 24); do
+    LIST=$(admin_call GET "/cp/admin/orgs?limit=500" 2>/dev/null)
+    LEAK=$(echo "$LIST" | python3 -c "
+import sys, json
+try: d = json.load(sys.stdin)
+except Exception: print(1); sys.exit(0)
+orgs = d if isinstance(d, list) else d.get('orgs', [])
+print(sum(1 for o in orgs if o.get('slug') == '$SLUG' and o.get('instance_status') not in ('purged',) and o.get('status') != 'purged'))
+" 2>/dev/null || echo 1)
+    if [ "$LEAK" = "0" ]; then
+      log "[teardown] ✓ $SLUG purged (after ${j}x5s)"
+      exit $rc
+    fi
+    sleep 5
+  done
+  echo "::warning::[teardown] $SLUG still present after 120s — sweep-stale-e2e-orgs will catch it within MAX_AGE_MINUTES" >&2
+  [ $rc -eq 0 ] && rc=4
+  exit $rc
+}
+trap teardown EXIT INT TERM
+
+# ─── 1. Provision the throwaway org ────────────────────────────────────
+log "1/6 POST /cp/admin/orgs — slug=$SLUG"
+CREATE=$(admin_call POST /cp/admin/orgs \
+  -d "{\"slug\":\"$SLUG\",\"name\":\"E2E peer-visibility $SLUG\",\"owner_user_id\":\"e2e-runner:$SLUG\"}")
+ORG_ID=$(echo "$CREATE" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null)
+[ -n "$ORG_ID" ] || fail "org creation failed: $(echo "$CREATE" | head -c 300)"
+log "    ORG_ID=$ORG_ID"
+
+# ─── 2. Wait for tenant EC2 + DNS ──────────────────────────────────────
+# Polls the admin org list every 10s until this slug reports a running
+# state, bounded by PROVISION_TIMEOUT_SECS.
+log "2/6 waiting for tenant instance_status=running (cold EC2 + cloudflared)..."
+DEADLINE=$(( $(date +%s) + PROVISION_TIMEOUT_SECS ))
+while true; do
+  if [ "$(date +%s)" -gt "$DEADLINE" ]; then
+    # Exit 3 = "provisioning timed out" per the exit-code table in this
+    # script's header (fail() would exit with the generic 1).
+    echo "[$(date +%H:%M:%S)] ❌ tenant never came up within ${PROVISION_TIMEOUT_SECS}s" >&2
+    exit 3
+  fi
+  # Prints instance_status (falling back to status, then 'unknown') for
+  # this slug; prints nothing if the listing is unparseable or the slug
+  # is not listed yet, which simply keeps the loop waiting.
+  STATUS=$(admin_call GET "/cp/admin/orgs?limit=500" 2>/dev/null | python3 -c "
+import sys, json
+try: d = json.load(sys.stdin)
+except Exception: sys.exit(0)
+orgs = d if isinstance(d, list) else d.get('orgs', [])
+for o in orgs:
+    if o.get('slug') == '$SLUG':
+        print(o.get('instance_status') or o.get('status') or 'unknown'); break
+" 2>/dev/null)
+  case "$STATUS" in running|online|ready) break ;; esac
+  sleep 10
+done
+log "    tenant status=$STATUS"
+
+# ─── 3. Per-tenant admin token + tenant URL ────────────────────────────
+log "3/6 fetching per-tenant admin token..."
+TT_RESP=$(admin_call GET "/cp/admin/orgs/$SLUG/admin-token")
+TENANT_TOKEN=$(echo "$TT_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('admin_token',''))" 2>/dev/null)
+[ -n "$TENANT_TOKEN" ] || fail "tenant token fetch failed: $(echo "$TT_RESP" | head -c 200)"
+
+CP_HOST=$(echo "$CP_URL" | sed -E 's#^https?://##; s#/.*$##')
+case "$CP_HOST" in
+  api.*)         DERIVED_DOMAIN="${CP_HOST#api.}" ;;
+  staging-api.*) DERIVED_DOMAIN="staging.${CP_HOST#staging-api.}" ;;
+  *)             DERIVED_DOMAIN="$CP_HOST" ;;
+esac
+TENANT_URL="https://${SLUG}.${DERIVED_DOMAIN}"
+log "    tenant url: $TENANT_URL"
+
+log "3b. waiting for tenant /health (TLS/DNS, up to 10min)..."
+# Probe /health every 5s, up to 120 attempts. The outcome is recorded so a
+# tenant that never answers is called out explicitly instead of the script
+# falling through silently into workspace creation.
+HEALTH_OK=0
+for i in $(seq 1 120); do
+  if curl -fsS "$TENANT_URL/health" -m 5 -k >/dev/null 2>&1; then
+    log "    /health ok (attempt $i)"
+    HEALTH_OK=1
+    break
+  fi
+  sleep 5
+done
+# Best-effort by design: keep going (as before), but leave a clear marker
+# so a later "parent create failed" can be traced back to this.
+[ "$HEALTH_OK" = "1" ] || echo "::warning::tenant $TENANT_URL/health never answered after 120 attempts — continuing; workspace creation will likely fail" >&2
+
+# ─── 4. Provision the parent + one sibling per runtime under test ──────
+# Inject the LLM provider key so each runtime can authenticate at boot.
+# Priority: MiniMax → direct-Anthropic → OpenAI (mirrors
+# test_staging_full_saas.sh's secrets-injection chain).
+SECRETS_JSON='{}'
+if [ -n "${E2E_MINIMAX_API_KEY:-}" ]; then
+  SECRETS_JSON=$(python3 -c "import json,os;k=os.environ['E2E_MINIMAX_API_KEY'];print(json.dumps({'ANTHROPIC_BASE_URL':'https://api.minimax.io/anthropic','ANTHROPIC_AUTH_TOKEN':k,'MINIMAX_API_KEY':k}))")
+elif [ -n "${E2E_ANTHROPIC_API_KEY:-}" ]; then
+  SECRETS_JSON=$(python3 -c "import json,os;k=os.environ['E2E_ANTHROPIC_API_KEY'];print(json.dumps({'ANTHROPIC_API_KEY':k}))")
+elif [ -n "${E2E_OPENAI_API_KEY:-}" ]; then
+  SECRETS_JSON=$(python3 -c "import json,os;k=os.environ['E2E_OPENAI_API_KEY'];print(json.dumps({'OPENAI_API_KEY':k,'OPENAI_BASE_URL':'https://api.openai.com/v1','MODEL_PROVIDER':'openai:gpt-4o','HERMES_INFERENCE_PROVIDER':'custom','HERMES_CUSTOM_BASE_URL':'https://api.openai.com/v1','HERMES_CUSTOM_API_KEY':k,'HERMES_CUSTOM_API_MODE':'chat_completions'}))")
+fi
+
+log "4/6 provisioning parent (claude-code) + one sibling per runtime under test..."
+P_RESP=$(tenant_call POST /workspaces \
+  -d "{\"name\":\"pv-parent\",\"runtime\":\"claude-code\",\"tier\":3,\"secrets\":$SECRETS_JSON}")
+PARENT_ID=$(echo "$P_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null)
+[ -n "$PARENT_ID" ] || fail "parent create failed: $(echo "$P_RESP" | head -c 300)"
+log "    PARENT_ID=$PARENT_ID"
+
+# WS_IDS[runtime]=id ; WS_TOKENS[runtime]=auth_token (the MCP bearer)
+declare -A WS_IDS WS_TOKENS
+ALL_WS_IDS="$PARENT_ID"
+for rt in $PV_RUNTIMES; do
+  R=$(tenant_call POST /workspaces \
+    -d "{\"name\":\"pv-$rt\",\"runtime\":\"$rt\",\"tier\":2,\"parent_id\":\"$PARENT_ID\",\"secrets\":$SECRETS_JSON}")
+  WID=$(echo "$R" | python3 -c "import sys,json; print(json.load(sys.stdin).get('id',''))" 2>/dev/null)
+  # auth_token is top-level for container runtimes; external-like nest it
+  # under connection.auth_token (verified vs staging response shape).
+  WTOK=$(echo "$R" | python3 -c "
+import sys, json
+try: d = json.load(sys.stdin)
+except Exception: print(''); sys.exit(0)
+print(d.get('auth_token') or d.get('connection', {}).get('auth_token') or '')
+" 2>/dev/null)
+  [ -n "$WID" ] || fail "$rt workspace create failed: $(echo "$R" | head -c 300)"
+  [ -n "$WTOK" ] || fail "$rt workspace did not return an auth_token — cannot drive its MCP call (resp: $(echo "$R" | head -c 300))"
+  WS_IDS[$rt]="$WID"
+  WS_TOKENS[$rt]="$WTOK"
+  ALL_WS_IDS="$ALL_WS_IDS $WID"
+  log "    $rt → $WID"
+done
+
+# ─── 5. Wait for every sibling online ──────────────────────────────────
+# One shared deadline covers all runtimes; each workspace is polled every
+# 10s and every status transition is logged once.
+log "5/6 waiting for all workspaces status=online (up to ${PROVISION_TIMEOUT_SECS}s — cold boot)..."
+WS_DEADLINE=$(( $(date +%s) + PROVISION_TIMEOUT_SECS ))
+for rt in $PV_RUNTIMES; do
+  wid="${WS_IDS[$rt]}"
+  LAST=""
+  while true; do
+    if [ "$(date +%s)" -gt "$WS_DEADLINE" ]; then
+      # Exit 3 = "provisioning timed out" per the exit-code table in this
+      # script's header (fail() would exit with the generic 1).
+      echo "[$(date +%H:%M:%S)] ❌ $rt ($wid) never reached online (last=$LAST)" >&2
+      exit 3
+    fi
+    # Accepts either a bare workspace object or one nested under
+    # 'workspace'; prints its status, or nothing on an unparseable reply.
+    S=$(tenant_call GET "/workspaces/$wid" 2>/dev/null | python3 -c "
+import sys, json
+try: d = json.load(sys.stdin)
+except Exception: sys.exit(0)
+w = d.get('workspace') if isinstance(d.get('workspace'), dict) else d
+print(w.get('status') or '')
+" 2>/dev/null)
+    [ "$S" != "$LAST" ] && { log "    $rt → $S"; LAST="$S"; }
+    case "$S" in
+      online) break ;;
+      failed) sleep 10 ;;   # transient: bootstrap-watcher 5-min deadline, heartbeat recovers
+      *)      sleep 10 ;;
+    esac
+  done
+  ok "    $rt online"
+done
+
+# ─── 6. THE GATE — literal mcp_molecule_list_peers via POST /:id/mcp ────
+# This is the byte-for-byte user-facing call. NOT GET /registry/:id/peers,
+# NOT /health, NOT the heartbeat table. JSON-RPC 2.0 tools/call,
+# name=list_peers, authenticated by the workspace's OWN bearer token
+# through WorkspaceAuth + MCPRateLimiter.
+log "6/6 driving the LITERAL list_peers MCP call per runtime..."
+echo ""
+RPC_BODY='{"jsonrpc":"2.0","id":1,"method":"tools/call","params":{"name":"list_peers","arguments":{}}}'
+REGRESSED=0
+declare -A VERDICT
+
+for rt in $PV_RUNTIMES; do
+  wid="${WS_IDS[$rt]}"
+  wtok="${WS_TOKENS[$rt]}"
+  # The expected peer set = every OTHER provisioned workspace (parent +
+  # the sibling runtimes), excluding the caller itself.
+  EXPECT_IDS=$(echo "$ALL_WS_IDS" | tr ' ' '\n' | grep -v "^${wid}$" | grep -v '^$')
+
+  # No `set +e` / `set -e` bracket here: this script never enables errexit
+  # (`set -uo pipefail` only), so the old trailing `set -e` silently turned
+  # it ON for the rest of the run and a crash in the verdict parser below
+  # would abort the script instead of being recorded as FAIL(unknown).
+  # A curl transport failure is tolerated: -w still yields a status code
+  # ("000"), which the HTTP-200 check below reports as a FAIL.
+  RESP=$(curl -sS -X POST "$TENANT_URL/workspaces/$wid/mcp" \
+    -H "Authorization: Bearer $wtok" \
+    -H "X-Molecule-Org-Id: $ORG_ID" \
+    -H "Content-Type: application/json" \
+    -d "$RPC_BODY" \
+    -o /tmp/pv_mcp_body.json -w "%{http_code}" 2>/dev/null)
+  HTTP_CODE="$RESP"
+  BODY=$(cat /tmp/pv_mcp_body.json 2>/dev/null || echo '')
+
+  echo "--- $rt (ws=$wid) ---"
+  echo "    HTTP $HTTP_CODE"
+  echo "    body: $(echo "$BODY" | head -c 600)"
+
+  # (1) HTTP 200 — a 401 (WorkspaceAuth reject, the Hermes symptom) fails here.
+  if [ "$HTTP_CODE" != "200" ]; then
+    echo "  ✗ $rt: list_peers MCP call returned HTTP $HTTP_CODE (expected 200)"
+    VERDICT[$rt]="FAIL(http=$HTTP_CODE)"
+    REGRESSED=1
+    continue
+  fi
+
+  # (2) JSON-RPC result present, not an error object.
+  PARSE=$(echo "$BODY" | python3 -c "
+import sys, json
+expect = set(filter(None, '''$EXPECT_IDS'''.split()))
+try:
+    d = json.load(sys.stdin)
+except Exception as e:
+    print('PARSE_ERROR:' + str(e)); sys.exit(0)
+if isinstance(d, dict) and d.get('error') is not None:
+    print('RPC_ERROR:' + json.dumps(d['error'])[:200]); sys.exit(0)
+res = d.get('result') if isinstance(d, dict) else None
+if res is None:
+    print('NO_RESULT'); sys.exit(0)
+# MCP tools/call result shape: {content:[{type:text,text:'<json or prose>'}]}
+text = ''
+if isinstance(res, dict):
+    for c in res.get('content', []):
+        if c.get('type') == 'text':
+            text += c.get('text', '')
+text_l = text.lower()
+# Native-sessions fallback signature (the OpenClaw symptom): the agent
+# answered from its own runtime session list, not the platform peer set.
+if 'sessions_list' in text_l or 'no platform peers' in text_l or 'native session' in text_l:
+    print('NATIVE_FALLBACK:' + text[:200]); sys.exit(0)
+# The expected sibling IDs must literally appear in the returned peer text.
+found = sorted(i for i in expect if i in text)
+missing = sorted(expect - set(found))
+if not expect:
+    print('NO_EXPECTED_PEERS_CONFIGURED'); sys.exit(0)
+if missing:
+    print('MISSING_PEERS:found=%d/%d missing=%s' % (len(found), len(expect), ','.join(m[:8] for m in missing)))
+    sys.exit(0)
+print('OK:found=%d/%d' % (len(found), len(expect)))
+" 2>/dev/null)
+
+  case "$PARSE" in
+    OK:*)
+      echo "  ✓ $rt: list_peers returned 200 and contains all expected peers ($PARSE)"
+      VERDICT[$rt]="OK"
+      ;;
+    NATIVE_FALLBACK:*)
+      echo "  ✗ $rt: list_peers fell back to NATIVE sessions — sees no platform peers ($PARSE)"
+      VERDICT[$rt]="FAIL(native-fallback)"
+      REGRESSED=1
+      ;;
+    RPC_ERROR:*|NO_RESULT|PARSE_ERROR:*)
+      echo "  ✗ $rt: list_peers MCP call did not return a usable result ($PARSE)"
+      VERDICT[$rt]="FAIL(rpc=$PARSE)"
+      REGRESSED=1
+      ;;
+    MISSING_PEERS:*)
+      echo "  ✗ $rt: list_peers returned 200 but peer set is wrong/empty ($PARSE)"
+      VERDICT[$rt]="FAIL(peers=$PARSE)"
+      REGRESSED=1
+      ;;
+    *)
+      echo "  ✗ $rt: unexpected verdict '$PARSE'"
+      VERDICT[$rt]="FAIL(unknown)"
+      REGRESSED=1
+      ;;
+  esac
+  echo ""
+done
+
+echo "=== SUMMARY — fresh-provision peer-visibility (literal MCP list_peers) ==="
+for rt in $PV_RUNTIMES; do
+  printf '  %-14s %s\n' "$rt" "${VERDICT[$rt]:-NO_RUN}"
+done
+echo ""
+
+if [ "$REGRESSED" -ne 0 ]; then
+  echo "✗ GATE FAILED — at least one runtime cannot see its peers via the"
+  echo "  literal mcp_molecule_list_peers call. This is the real user-facing"
+  echo "  failure the proxy signals (registry row / heartbeat / model 200)"
+  echo "  were hiding. Expected RED until the Hermes-401 + OpenClaw-MCP-wiring"
+  echo "  root-cause fixes land; goes green only when they actually do."
+  exit 10
+fi
+
+ok "GATE PASSED — every runtime under test sees its platform peers via the literal MCP call."
+exit 0
@@ -177,7 +177,7 @@ func expandEnvRef(key, ref, whole string, env map[string]string) string {
 }


-// loadWorkspaceEnv reads the org root .env and the workspace-specific .env .env and the workspace-specific .env
+// loadWorkspaceEnv reads the org root .env and the workspace-specific .env
 // (workspace overrides org root). Used by both secret injection and channel
 // config expansion.
 //
@@ -189,6 +189,24 @@ const containerNamePrefix = "ws-"
 // (the wiped-DB case after `docker compose down -v`).
 const LabelManaged = "molecule.platform.managed"

+// AgentUID / AgentGID are the uid/gid of the unprivileged `agent` user that
+// every workspace template creates and drops to via `gosu agent` before
+// exec'ing the runtime (the a2a_mcp_server runs under this uid). The value is
+// fixed at 1000:1000 across all templates — see:
+//   - workspace-configs-templates/claude-code-default/Dockerfile (`useradd -u 1000 ... agent`)
+//   - workspace-configs-templates/hermes/Dockerfile               (`useradd -u 1000 ... agent`)
+//   - workspace/entrypoint.sh                                     (`exec gosu agent` — "uid 1000")
+//
+// Files the platform injects into /configs AFTER the entrypoint's
+// `chown -R agent:agent /configs` (the post-start #418 re-injection and the
+// pre-start #1877 volume write) must be owned by this uid/gid, otherwise the
+// agent-uid MCP server hits EACCES reading /configs/.auth_token, sends an
+// empty bearer, and the platform 401s on /registry/{id}/peers (list_peers).
+const (
+	// AgentUID is the numeric uid stamped on platform-injected /configs files.
+	AgentUID = 1000
+	// AgentGID is the numeric gid paired with AgentUID on those same files.
+	AgentGID = 1000
+)
+
 // managedLabels is the canonical label map applied to every workspace
 // container + volume. Pulled out so a future addition (e.g. instance
 // UUID for multi-platform-shared-daemon disambiguation) is one edit.
@@ -862,8 +880,18 @@ func buildTemplateTar(templatePath string) (*bytes.Buffer, error) {
 	return &buf, nil
 }

-// WriteFilesToContainer writes in-memory files into /configs in the container.
-func (p *Provisioner) WriteFilesToContainer(ctx context.Context, containerID string, files map[string][]byte) error {
+// buildConfigFilesTar builds the tar stream that WriteFilesToContainer streams
+// into /configs via CopyToContainer. Every entry is stamped Uid/Gid = agent
+// (AgentUID/AgentGID) so the files land agent-owned after extraction. This is
+// the issue #418 post-start re-injection path: it runs AFTER the template
+// entrypoint's `chown -R agent:agent /configs`, so without explicit ownership
+// in the tar header the files extract as root:root (tar Uid/Gid default 0) and
+// the agent-uid MCP server can no longer read /configs/.auth_token (and
+// /configs/.platform_inbound_secret) → empty bearer → list_peers 401.
+//
+// Pulled out as a pure function so the ownership contract is unit-testable
+// without a live Docker daemon (mirrors buildTemplateTar).
+func buildConfigFilesTar(files map[string][]byte) (*bytes.Buffer, error) {
 	var buf bytes.Buffer
 	tw := tar.NewWriter(&buf)

@@ -876,8 +904,10 @@ func (p *Provisioner) WriteFilesToContainer(ctx context.Context, containerID str
 				Typeflag: tar.TypeDir,
 				Name:     dir + "/",
 				Mode:     0755,
+				Uid:      AgentUID,
+				Gid:      AgentGID,
 			}); err != nil {
-				return fmt.Errorf("failed to write tar dir header for %s: %w", dir, err)
+				return nil, fmt.Errorf("failed to write tar dir header for %s: %w", dir, err)
 			}
 			createdDirs[dir] = true
 		}
@@ -886,19 +916,30 @@ func (p *Provisioner) WriteFilesToContainer(ctx context.Context, containerID str
 			Name: name,
 			Mode: 0644,
 			Size: int64(len(data)),
+			Uid:  AgentUID,
+			Gid:  AgentGID,
 		}
 		if err := tw.WriteHeader(header); err != nil {
-			return fmt.Errorf("failed to write tar header for %s: %w", name, err)
+			return nil, fmt.Errorf("failed to write tar header for %s: %w", name, err)
 		}
 		if _, err := tw.Write(data); err != nil {
-			return fmt.Errorf("failed to write tar data for %s: %w", name, err)
+			return nil, fmt.Errorf("failed to write tar data for %s: %w", name, err)
 		}
 	}
 	if err := tw.Close(); err != nil {
-		return fmt.Errorf("failed to close tar writer: %w", err)
+		return nil, fmt.Errorf("failed to close tar writer: %w", err)
 	}
+	return &buf, nil
+}

-	return p.cli.CopyToContainer(ctx, containerID, "/configs", &buf, container.CopyToContainerOptions{})
+// WriteFilesToContainer writes in-memory files into /configs in the container,
+// agent-owned (see buildConfigFilesTar).
+//
+// files maps each archive entry name (extracted under /configs) to its
+// contents. It returns ErrNoBackend when the provisioner has no Docker client,
+// the tar-building error when the archive cannot be assembled, or whatever
+// error the Docker CopyToContainer call reports.
+func (p *Provisioner) WriteFilesToContainer(ctx context.Context, containerID string, files map[string][]byte) error {
+	// Same nil-backend guard as WriteAuthTokenToVolume: return the sentinel
+	// error instead of panicking on a nil receiver or nil client.
+	if p == nil || p.cli == nil {
+		return ErrNoBackend
+	}
+	buf, err := buildConfigFilesTar(files)
+	if err != nil {
+		return err
+	}
+	return p.cli.CopyToContainer(ctx, containerID, "/configs", buf, container.CopyToContainerOptions{})
 }

 // CopyToContainer exposes CopyToContainer from the Docker client for use by other packages.
@@ -988,13 +1029,28 @@ func (p *Provisioner) ReadFromVolume(ctx context.Context, volumeName, filePath s
 	return clean, nil
 }

+// writeAuthTokenVolumeCmd is the shell command the throwaway alpine container
+// runs to seed /vol/.auth_token. alpine runs it as root, so without the
+// explicit `chown 1000:1000` the file stays root:root after the template
+// entrypoint's `chown -R agent:agent /configs` has already run — the agent-uid
+// (AgentUID) MCP server then gets EACCES reading it → empty bearer →
+// list_peers 401. Pulled out as a pure function so the ownership contract is
+// unit-testable without a live Docker daemon. Issue #1877.
+//
+// The token itself is not interpolated into the command; the shell reads it
+// from the TOKEN environment variable at run time. Returns the full `sh -c`
+// script as a single string.
+func writeAuthTokenVolumeCmd() string {
+	// "$TOKEN" is double-quoted so the shell hands the value to printf as one
+	// argument: unquoted, it would undergo word splitting and glob expansion,
+	// and printf '%s' would silently drop any whitespace between the pieces.
+	return fmt.Sprintf(
+		"mkdir -p /vol && printf '%%s' \"$TOKEN\" > /vol/.auth_token && chmod 0600 /vol/.auth_token && chown %d:%d /vol/.auth_token",
+		AgentUID, AgentGID,
+	)
+}
+
 // WriteAuthTokenToVolume writes the workspace auth token into the config volume
 // BEFORE the container starts, eliminating the token-injection race window where
 // a restarted container could read a stale token from /configs/.auth_token before
 // WriteFilesToContainer writes the new one. Issue #1877.
 //
 // Uses a throwaway alpine container to write directly to the named volume,
-// bypassing the container lifecycle entirely.
+// bypassing the container lifecycle entirely. The written file is chowned to
+// the agent uid/gid (see writeAuthTokenVolumeCmd).
 func (p *Provisioner) WriteAuthTokenToVolume(ctx context.Context, workspaceID, token string) error {
 	if p == nil || p.cli == nil {
 		return ErrNoBackend
@@ -1002,7 +1058,7 @@ func (p *Provisioner) WriteAuthTokenToVolume(ctx context.Context, workspaceID, t
 	volName := ConfigVolumeName(workspaceID)
 	resp, err := p.cli.ContainerCreate(ctx, &container.Config{
 		Image: "alpine",
-		Cmd:   []string{"sh", "-c", "mkdir -p /vol && printf '%s' $TOKEN > /vol/.auth_token && chmod 0600 /vol/.auth_token"},
+		Cmd:   []string{"sh", "-c", writeAuthTokenVolumeCmd()},
 		Env:   []string{"TOKEN=" + token},
 	}, &container.HostConfig{
 		Binds: []string{volName + ":/vol"},
@@ -0,0 +1,95 @@
+package provisioner
+
+import (
+	"archive/tar"
+	"errors"
+	"io"
+	"strings"
+	"testing"
+)
+
+// These tests pin the P0 fix for the fleet-wide list_peers 401 (Hermes and
+// every other template): the workspace-server token-injection paths wrote
+// /configs/.auth_token (and /configs/.platform_inbound_secret) as root:root
+// AFTER the template entrypoint's `chown -R agent:agent /configs` ran, so the
+// agent-uid (1000) MCP server (a2a_mcp_server, running via `gosu agent`) hit
+// `[Errno 13] Permission denied` reading the bearer → empty bearer → platform
+// 401 on /registry/{id}/peers (the literal tool_list_peers path).
+//
+// The agent uid is 1000:1000, verified from the templates:
+//   - workspace-configs-templates/claude-code-default/Dockerfile: `useradd -u 1000 ... agent`
+//   - workspace-configs-templates/hermes/Dockerfile:               `useradd -u 1000 ... agent`
+//   - workspace/entrypoint.sh / claude-code-default/entrypoint.sh:  `exec gosu agent` ("uid 1000")
+//
+// Both tests assert the real artifact (the tar headers Docker's CopyToContainer
+// honours for ownership, and the literal shell command the throwaway alpine
+// container runs), not a mock that bypasses ownership. They FAIL on pre-fix
+// code (no Uid/Gid in tar headers; no chown in the alpine command → root:root)
+// and PASS post-fix (agent-owned).
+
+// TestWriteFilesToContainerTar_FilesAreAgentOwned covers the issue #418
+// post-start re-injection path (WriteFilesToContainer): the tar it streams
+// into /configs via CopyToContainer must carry Uid/Gid = agent (1000) so the
+// extracted files land agent-readable, not root:root. This is the path that
+// (re)writes BOTH .auth_token and .platform_inbound_secret on a cadence.
+//
+// Every header in the archive is checked for ownership, and every input file
+// (including the nested one) must be present in the archive.
+func TestWriteFilesToContainerTar_FilesAreAgentOwned(t *testing.T) {
+	files := map[string][]byte{
+		".auth_token":              []byte("tok-abc123"),
+		".platform_inbound_secret": []byte("inbound-secret-xyz"),
+		"nested/dir/file.txt":      []byte("data"),
+	}
+
+	buf, err := buildConfigFilesTar(files)
+	if err != nil {
+		t.Fatalf("buildConfigFilesTar: %v", err)
+	}
+
+	tr := tar.NewReader(buf)
+	seen := map[string]bool{}
+	for {
+		hdr, err := tr.Next()
+		if errors.Is(err, io.EOF) {
+			break
+		}
+		if err != nil {
+			t.Fatalf("read tar: %v", err)
+		}
+		if _, err := io.Copy(io.Discard, tr); err != nil {
+			t.Fatalf("drain %s: %v", hdr.Name, err)
+		}
+		seen[hdr.Name] = true
+		// Errorf rather than Fatalf: keep walking so a single run reports every
+		// mis-owned entry instead of only the first one encountered.
+		if hdr.Uid != AgentUID {
+			t.Errorf("tar entry %q Uid = %d, want %d (agent) — root-owned injection causes the list_peers 401",
+				hdr.Name, hdr.Uid, AgentUID)
+		}
+		if hdr.Gid != AgentGID {
+			t.Errorf("tar entry %q Gid = %d, want %d (agent)", hdr.Name, hdr.Gid, AgentGID)
+		}
+	}
+
+	// Assert presence of every input file, not just the two token files, so a
+	// nested file dropped by the directory-header path is caught as well.
+	for want := range files {
+		if !seen[want] {
+			t.Errorf("tar missing %q (seen: %v)", want, seen)
+		}
+	}
+}
+
+// TestWriteAuthTokenVolumeCmd_ChownsToAgent covers the issue #1877 pre-start
+// volume-write path (WriteAuthTokenToVolume): the throwaway alpine container
+// writes /vol/.auth_token then chmod 0600 but, pre-fix, never chowns it, so it
+// stays root:root (alpine runs the command as root). The literal command must
+// chown the file to the agent uid:gid so the agent-uid MCP server can read it.
+func TestWriteAuthTokenVolumeCmd_ChownsToAgent(t *testing.T) {
+	cmd := writeAuthTokenVolumeCmd()
+
+	if !strings.Contains(cmd, "chmod 0600 /vol/.auth_token") {
+		t.Fatalf("alpine cmd lost the 0600 chmod (regression): %q", cmd)
+	}
+
+	wantChown := "chown 1000:1000 /vol/.auth_token"
+	if !strings.Contains(cmd, wantChown) {
+		t.Fatalf("alpine cmd = %q, missing %q — without it .auth_token stays root:root "+
+			"and the agent-uid MCP server gets EACCES → empty bearer → list_peers 401",
+			cmd, wantChown)
+	}
+}