fix(ci): repair scheduled main janitors and track masks

2026-05-12 16:10:53 -07:00
parent 760e4eb806
commit 5d66ed6212
35 changed files with 110 additions and 48 deletions
@@ -37,6 +37,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -48,6 +48,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
@@ -45,6 +45,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 5
    steps:
@@ -148,6 +148,7 @@ jobs:
    # a permanent re-mask. Re-flip blocked on mc#664 fix-forward landing.
    # Other 4 #656 flips (changes, canvas-build, shellcheck, python-lint)
    # retain continue-on-error: false; only platform-build regresses.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true  # mc#664 fix-forward in flight; re-flip when mc#664 lands (PR #669 → rebase after #709)
    defaults:
      run:
@@ -186,6 +187,7 @@ jobs:
          echo "::group::pendinguploads exit=$pu_exit (last 100 lines)"
          tail -100 /tmp/test-pu.log
          echo "::endgroup::"
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true
      - if: needs.changes.outputs.platform == 'true'
        name: Run tests with race detection and coverage
@@ -372,6 +374,7 @@ jobs:
  canvas-deploy-reminder:
    name: Canvas Deploy Reminder
    runs-on: ubuntu-latest
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    needs: [changes, canvas-build]
    # Only fires on direct pushes to main (i.e. after staging→main promotion).
@@ -90,6 +90,7 @@ jobs:
    name: Synthetic E2E against staging
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    # Bumped from 12 → 20 (2026-05-04). Tenant user-data install phase
    # (apt-get update + install docker.io/jq/awscli/caddy + snap install
@@ -103,6 +103,7 @@ jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      api: ${{ steps.decide.outputs.api }}
@@ -154,6 +155,7 @@ jobs:
    name: E2E API Smoke Test
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 15
    env:
@@ -164,7 +166,6 @@ jobs:
      # we let Docker assign an ephemeral host port.
      PG_CONTAINER: pg-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
      REDIS_CONTAINER: redis-e2e-api-${{ github.run_id }}-${{ github.run_attempt }}
-      PORT: "8080"
    steps:
      - name: No-op pass (paths filter excluded this commit)
        if: needs.detect-changes.outputs.api != 'true'
@@ -268,6 +269,20 @@ jobs:
        if: needs.detect-changes.outputs.api == 'true'
        working-directory: workspace-server
        run: go build -o platform-server ./cmd/server
+      - name: Pick platform port
+        if: needs.detect-changes.outputs.api == 'true'
+        run: |
+          PLATFORM_PORT=$(python3 - <<'PY'
+          import socket
+
+          with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+              s.bind(("127.0.0.1", 0))
+              print(s.getsockname()[1])
+          PY
+          )
+          echo "PORT=${PLATFORM_PORT}" >> "$GITHUB_ENV"
+          echo "BASE=http://127.0.0.1:${PLATFORM_PORT}" >> "$GITHUB_ENV"
+          echo "Platform host port: ${PLATFORM_PORT}"
      - name: Start platform (background)
        if: needs.detect-changes.outputs.api == 'true'
        working-directory: workspace-server
@@ -280,7 +295,7 @@ jobs:
        if: needs.detect-changes.outputs.api == 'true'
        run: |
          for i in $(seq 1 30); do
-            if curl -sf http://127.0.0.1:8080/health > /dev/null; then
+            if curl -sf "$BASE/health" > /dev/null; then
              echo "Platform up after ${i}s"
              exit 0
            fi
@@ -70,6 +70,7 @@ jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      canvas: ${{ steps.decide.outputs.canvas }}
@@ -118,6 +119,7 @@ jobs:
    name: Canvas tabs E2E
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 40

@@ -84,6 +84,7 @@ jobs:
    name: E2E Staging External Runtime
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 25

@@ -88,17 +88,20 @@ jobs:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          fetch-depth: 1
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true

      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: "3.11"
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true

      - name: YAML validation (best-effort)
        run: |
          echo "e2e-staging-saas.yml — PR validation: workflow YAML is valid."
          echo "E2E step runs only when provisioning-critical files change."
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true

  # Actual E2E: runs on trunk pushes (main + staging). NOT the PR-fire-only
@@ -109,6 +112,7 @@ jobs:
    # Only runs on trunk pushes. PR paths get pr-validate instead.
    if: github.event.pull_request.base.ref == ''
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 45
    permissions:
@@ -37,6 +37,7 @@ jobs:
    name: Intentional-failure teardown sanity
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 20

@@ -46,6 +46,7 @@ env:
 jobs:
  gate-check:
    runs-on: ubuntu-latest
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true  # Never block on our own detector failing
    steps:
      - name: Check out BASE ref (never PR-head under pull_request_target)
@@ -76,25 +77,32 @@ jobs:
        if: github.event_name == 'schedule'
        env:
          GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
+          REPO: ${{ github.repository }}
        run: |
          set -euo pipefail
          # Fetch all open PRs and run gate-check on each
          # socket.setdefaulttimeout(15): defence-in-depth for missing SOP_TIER_CHECK_TOKEN.
          # gate_check.py uses timeout=15 on every urlopen call; this catches the
          # inline Python polling loop too (issue #603).
-          pr_numbers=$(python3 -c "
-            import socket, urllib.request, json, os
-            socket.setdefaulttimeout(15)
-            token = os.environ['GITEA_TOKEN']
-            req = urllib.request.Request(
-                'https://git.moleculesai.app/api/v1/repos/${{ github.repository }}/pulls?state=open&limit=100',
-                headers={'Authorization': f'token {token}', 'Accept': 'application/json'}
-            )
-            with urllib.request.urlopen(req) as r:
-                prs = json.loads(r.read())
-            for pr in prs:
-                print(pr['number'])
-          ")
+          pr_numbers=$(python3 <<'PY'
+          import json
+          import os
+          import socket
+          import urllib.request
+
+          socket.setdefaulttimeout(15)
+          token = os.environ["GITEA_TOKEN"]
+          repo = os.environ["REPO"]
+          req = urllib.request.Request(
+              f"https://git.moleculesai.app/api/v1/repos/{repo}/pulls?state=open&limit=100",
+              headers={"Authorization": f"token {token}", "Accept": "application/json"},
+          )
+          with urllib.request.urlopen(req) as r:
+              prs = json.loads(r.read())
+          for pr in prs:
+              print(pr["number"])
+          PY
+          )
          for pr in $pr_numbers; do
            echo "Checking PR #$pr..."
            python3 tools/gate-check-v3/gate_check.py \
@@ -78,7 +78,8 @@ jobs:
  detect-changes:
    name: detect-changes
    runs-on: ubuntu-latest
-    # internal#219 Phase 3 (RFC §1): surface broken workflows without blocking.
+    # mc#664 Phase 3 (RFC §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      handlers: ${{ steps.filter.outputs.handlers }}
@@ -118,7 +119,8 @@ jobs:
    name: Handlers Postgres Integration
    needs: detect-changes
    runs-on: ubuntu-latest
-    # internal#219 Phase 3 (RFC §1): surface broken workflows without blocking.
+    # mc#664 Phase 3 (RFC §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    env:
      # Unique name per run so concurrent jobs don't collide on the
@@ -63,6 +63,7 @@ jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      run: ${{ steps.decide.outputs.run }}
@@ -154,6 +155,7 @@ jobs:
    name: Harness Replays
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 30
    steps:
@@ -1,6 +1,6 @@
 name: lint-continue-on-error-tracking

-# Tier 2e hard-gate lint (per internal#350) — every
+# Tier 2e hard-gate lint (per mc#664) — every
 # `continue-on-error: true` in `.gitea/workflows/*.yml` must carry a
 # `# mc#NNNN` or `# internal#NNNN` tracker comment within 2 lines,
 # the referenced issue must be OPEN, and ≤14 days old.
@@ -45,11 +45,11 @@ name: lint-continue-on-error-tracking
 # close-and-flip, or document the deliberate keep-mask in a fresh
 # 14-day-renewable tracker. After main is clean for 3 days,
 # follow-up PR flips this workflow's continue-on-error to false.
-# Tracking: internal#350.
+# Tracking: mc#664.
 #
 # Cross-links
 # -----------
-# - internal#350 (the RFC that specs this lint)
+# - mc#664 (the RFC that specs this lint)
 # - mc#664 (the empirical masked-3-weeks case)
 # - feedback_chained_defects_in_never_tested_workflows
 # - feedback_behavior_based_ast_gates
@@ -96,8 +96,9 @@ jobs:
    # Phase 3 (RFC #219 §1): surface masked defects without blocking
    # PRs. Pre-existing continue-on-error: true directives on main
    # all violate this lint at first — intentional. Flip to false
-    # follow-up after main is clean for 3 days. internal#350.
-    continue-on-error: true  # internal#350 Phase 3 mask — 14d forced-renewal cadence
+    # follow-up after main is clean for 3 days. mc#664.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
+    continue-on-error: true  # mc#664 Phase 3 mask — 14d forced-renewal cadence
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065  # v5.6.0
@@ -45,6 +45,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -1,6 +1,6 @@
 name: lint-mask-pr-atomicity

-# Tier 2d hard-gate lint (per internal#350) — blocks PRs that touch
+# Tier 2d hard-gate lint (per mc#664) — blocks PRs that touch
 # `.gitea/workflows/ci.yml` and modify ONLY ONE of {continue-on-error,
 # all-required.sentinel.needs} without a `Paired: #NNN` reference in
 # the PR body or in a commit message.
@@ -37,11 +37,11 @@ name: lint-mask-pr-atomicity
 # This workflow lands at `continue-on-error: true` (Phase 3 — surface
 # regressions without blocking PRs while the rule beds in).
 # Follow-up PR flips to `false` once we have ≥3 days of clean runs on
-# `main` and no false-positives. Tracking issue: internal#350.
+# `main` and no false-positives. Tracking issue: mc#664.
 #
 # Cross-links
 # -----------
-# - internal#350 (the RFC that specs this lint)
+# - mc#664 (the RFC that specs this lint)
 # - PR#665 / PR#668 (the empirical split-pair)
 # - mc#664 (the main-red incident the split caused)
 # - feedback_strict_root_only_after_class_a
@@ -91,7 +91,8 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken shapes without blocking
    # PRs. Follow-up PR flips this to `false` once recent runs on main
    # are confirmed clean (eat-our-own-dogfood discipline mirrors
-    # PR#673's same-shape comment). Tracking: internal#350.
+    # PR#673's same-shape comment). Tracking: mc#664.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - name: Check out PR head with full history (need base SHA blobs)
@@ -55,6 +55,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken shapes without blocking PRs.
    # Follow-up PR flips this off after the 4 existing-on-main rule-2
    # (workflow_run) violations are migrated to a supported trigger.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
@@ -62,6 +62,7 @@ jobs:
    # See issue #576 + infra-lead pulse ~00:30Z.
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - name: Checkout
@@ -55,6 +55,7 @@ jobs:
  # The actual bump work happens on the main/staging push after merge.
  pr-validate:
    runs-on: ubuntu-latest
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true  # do not block PR merge on operational failures
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -51,6 +51,7 @@ jobs:
    name: Audit Railway env vars for drift-prone pins
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 10

@@ -86,6 +86,7 @@ jobs:
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 25
    steps:
@@ -76,6 +76,7 @@ jobs:
  redeploy:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 25
    steps:
@@ -53,6 +53,7 @@ jobs:
        # runners with internet access to package mirrors). Falls back to GitHub
        # binary download. GitHub releases may be blocked on some runner networks
        # (infra#241 follow-up).
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true
        run: |
          if apt-get update -qq && apt-get install -y -qq jq; then
@@ -67,6 +67,7 @@ jobs:
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking
    # the PR. Follow-up PR flips this off after surfaced defects are
    # triaged.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -52,6 +52,7 @@ jobs:
  detect-changes:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      wheel: ${{ steps.decide.outputs.wheel }}
@@ -96,6 +97,7 @@ jobs:
    name: PR-built wheel + import smoke
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - name: No-op pass (paths filter excluded this commit)
@@ -57,6 +57,7 @@ jobs:
    name: Detect SECRET_PATTERNS drift
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    timeout-minutes: 5
    steps:
@@ -64,7 +64,8 @@ jobs:
  tier-check:
    runs-on: ubuntu-latest
    # BURN-IN: continue-on-error prevents AND-composition from blocking
-    # PRs during the 7-day window. Remove after 2026-05-17 (internal#189).
+    # PRs during the 7-day window. Remove after 2026-05-17 (mc#664).
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    permissions:
      contents: read
@@ -89,6 +90,7 @@ jobs:
        # runners). The sop-tier-check script has its own fallback as a
        # third line of defense. continue-on-error: true ensures this step
        # failing does not block the job.
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true
        run: |
          # apt-get is the primary method — Ubuntu package mirrors are reliably
@@ -109,6 +111,7 @@ jobs:
        # continue-on-error: true at step level — job-level is ignored by Gitea
        # Actions (quirk #10, internal runbooks). Belt-and-suspenders with
        # SOP_FAIL_OPEN=1 + || true below.
+        # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
        continue-on-error: true
        env:
          GITEA_TOKEN: ${{ secrets.SOP_TIER_CHECK_TOKEN || secrets.GITHUB_TOKEN }}
@@ -85,6 +85,7 @@ jobs:
  staging-smoke:
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    outputs:
      sha: ${{ steps.compute.outputs.sha }}
@@ -205,6 +206,7 @@ jobs:
    if: ${{ needs.staging-smoke.result == 'success' && needs.staging-smoke.outputs.smoke_ran == 'true' }}
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    env:
      SHA: ${{ needs.staging-smoke.outputs.sha }}
@@ -29,15 +29,11 @@ name: Sweep stale AWS Secrets Manager secrets
 #     reconciler enumerator) is filed as a separate controlplane
 #     issue. This sweeper is the immediate cost-relief stopgap.
 #
-# AWS credentials: the confirmed Gitea secrets are AWS_ACCESS_KEY_ID /
-# AWS_SECRET_ACCESS_KEY (the molecule-cp IAM user). These are the same
-# credentials used by the rest of the platform. The dedicated
-# AWS_JANITOR_* naming (which the original GitHub workflow used) was
-# never populated in Gitea — the existing secrets are AWS_ACCESS_KEY_ID /
-# AWS_SECRET_ACCESS_KEY (per issue #425 §425 audit). These DO have
-# secretsmanager:ListSecrets (the production molecule-cp principal);
-# if ListSecrets is revoked in future, a dedicated janitor principal
-# would need to be created and the Gitea secret names updated here.
+# AWS credentials: use the dedicated Secrets Manager janitor principal.
+# Do not fall back to the molecule-cp application principal: it does
+# not need account-wide ListSecrets, and a 2026-05-12 CI failure proved
+# that using it here turns a least-privilege production credential into
+# a red scheduled janitor.
 #
 # Safety: the script's MAX_DELETE_PCT gate (default 50%, mirroring
 # sweep-cf-orphans.yml — tenant secrets are durable by design, unlike
@@ -65,6 +61,7 @@ jobs:
    name: Sweep AWS Secrets Manager
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    # 30 min cap, mirroring the other janitors. AWS DeleteSecret is
    # fast (~0.3s/call) so even a 100+ backlog drains in seconds
@@ -73,8 +70,8 @@ jobs:
    timeout-minutes: 30
    env:
      AWS_REGION: ${{ secrets.AWS_REGION || 'us-east-1' }}
-      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_SECRETS_JANITOR_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRETS_JANITOR_SECRET_ACCESS_KEY }}
      CP_ADMIN_API_TOKEN: ${{ secrets.CP_ADMIN_API_TOKEN }}
      CP_STAGING_ADMIN_API_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
      MAX_DELETE_PCT: ${{ github.event.inputs.max_delete_pct || '50' }}
@@ -71,6 +71,7 @@ jobs:
    name: Sweep CF orphans
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    # 3 min surfaces hangs (CF API stall, AWS describe-instances stuck)
    # within one cron interval instead of burning a full tick. Realistic
@@ -55,6 +55,7 @@ jobs:
    name: Sweep CF tunnels
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    # 30 min cap. Was 5 min on the theory that the only thing that
    # could take >5min is a CF-API hang — but on 2026-05-02 a backlog
@@ -46,6 +46,7 @@ jobs:
    name: Ops scripts (unittest)
    runs-on: ubuntu-latest
    # Phase 3 (RFC #219 §1): surface broken workflows without blocking.
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -31,6 +31,7 @@ jobs:
    name: Weekly Platform-Go Surface
    runs-on: ubuntu-latest
    # continue-on-error: surface only, never block
+    # mc#664: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
    continue-on-error: true
    defaults:
      run:
@@ -239,9 +239,9 @@ for s in d.get("SecretList", []):

 # --- Summarize + safety gate ----------------------------------------------

-DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
+DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
 KEEP_COUNT=$((TOTAL_SECRETS - DELETE_COUNT))
-TENANT_SECRETS=$(echo "$DECISIONS" | python3 -c "
+TENANT_SECRETS=$(printf '%s' "$DECISIONS" | python3 -c "
 import json, sys
 n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-secret')
 print(n)
@@ -256,7 +256,7 @@ log "  would keep:             $KEEP_COUNT"
 log ""

 # Per-reason breakdown of deletes + keep-categories worth seeing
-echo "$DECISIONS" | python3 -c "
+printf '%s' "$DECISIONS" | python3 -c "
 import json,sys,collections
 delete_c = collections.Counter()
 keep_c = collections.Counter()
@@ -291,7 +291,7 @@ if [ "$DRY_RUN" = "1" ]; then
  log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT secrets."
  log ""
  log "First 20 secrets that would be deleted:"
-  echo "$DECISIONS" | python3 -c "
+  printf '%s' "$DECISIONS" | python3 -c "
 import json, sys
 shown = 0
 for l in sys.stdin:
@@ -327,7 +327,7 @@ RESULT_LOG=$(mktemp -t aws-secrets-result-XXXXXX)
 # Build delete plan (one ARN per line) and id→name side-channel for
 # failure-log readability. Use ARN rather than Name on the delete
 # call because Name is mutable; ARN is the stable identifier.
-echo "$DECISIONS" | python3 -c '
+printf '%s' "$DECISIONS" | python3 -c '
 import json, sys
 plan_path = sys.argv[1]
 map_path = sys.argv[2]
@@ -195,9 +195,9 @@ for t in d.get("result", []):

 # --- Summarize + safety gate ----------------------------------------------

-DELETE_COUNT=$(echo "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
+DELETE_COUNT=$(printf '%s' "$DECISIONS" | python3 -c "import json,sys; print(sum(1 for l in sys.stdin if json.loads(l)['action']=='delete'))")
 KEEP_COUNT=$((TOTAL_TUNNELS - DELETE_COUNT))
-TENANT_TUNNELS=$(echo "$DECISIONS" | python3 -c "
+TENANT_TUNNELS=$(printf '%s' "$DECISIONS" | python3 -c "
 import json, sys
 n = sum(1 for l in sys.stdin if json.loads(l)['reason'] != 'not-a-tenant-tunnel')
 print(n)
@@ -212,7 +212,7 @@ log "  would keep:             $KEEP_COUNT"
 log ""

 # Per-reason breakdown of deletes
-echo "$DECISIONS" | python3 -c "
+printf '%s' "$DECISIONS" | python3 -c "
 import json,sys,collections
 c = collections.Counter()
 for l in sys.stdin:
@@ -242,7 +242,7 @@ if [ "$DRY_RUN" = "1" ]; then
  log "Dry run complete. Pass --execute to actually delete $DELETE_COUNT tunnels."
  log ""
  log "First 20 tunnels that would be deleted:"
-  echo "$DECISIONS" | python3 -c "
+  printf '%s' "$DECISIONS" | python3 -c "
 import json, sys
 shown = 0
 for l in sys.stdin:
@@ -283,7 +283,7 @@ RESULT_LOG=$(mktemp -t cf-tunnels-result-XXXXXX)

 # Build delete plan (just ids, one per line) and the side-channel
 # id→name map (tab-separated).
-echo "$DECISIONS" | python3 -c '
+printf '%s' "$DECISIONS" | python3 -c '
 import json, os, sys
 plan_path = sys.argv[1]
 map_path = sys.argv[2]