diff --git a/.gitea/workflows/publish-workspace-server-image.yml b/.gitea/workflows/publish-workspace-server-image.yml
index c73b9dd0..68b04e93 100644
--- a/.gitea/workflows/publish-workspace-server-image.yml
+++ b/.gitea/workflows/publish-workspace-server-image.yml
@@ -65,20 +65,28 @@ jobs:
       - name: Checkout
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
 
-      - name: Diagnose Docker daemon access
+      # Health check: verify Docker daemon is accessible before attempting any
+      # build steps. This fails loudly at step 1 when the runner's docker.sock
+      # is inaccessible rather than silently continuing where `docker build`
+      # fails deep in the process with a cryptic ECR auth error.
+      - name: Verify Docker daemon access
         run: |
           set -euo pipefail
-          echo "::group::Docker daemon diagnosis"
+          echo "::group::Docker daemon health check"
           echo "Runner: ${HOSTNAME:-unknown}"
-          echo "--- Socket info ---"
-          ls -la /var/run/docker.sock 2>/dev/null || echo "/var/run/docker.sock: not found"
-          stat /var/run/docker.sock 2>/dev/null || true
-          echo "--- User info ---"
-          id
-          echo "--- docker version ---"
-          docker version 2>&1 || true
-          echo "--- docker info (full) ---"
-          docker info 2>&1 || echo "docker info failed: exit $?"
+          # Capture the output instead of piping `docker info` into `head`:
+          # under `pipefail`, `head` closing the pipe early can SIGPIPE
+          # `docker info` (exit 141) and fail the check on a healthy daemon.
+          if ! docker_info="$(docker info 2>&1)"; then
+            printf '%s\n' "$docker_info"
+            echo "::endgroup::"
+            echo "::error::Docker daemon is not accessible at /var/run/docker.sock"
+            echo "::error::Runner: ${HOSTNAME:-unknown}"
+            echo "::error::Check: (1) daemon is running, (2) runner user is in docker group, (3) sock permissions are 660+"
+            exit 1
+          fi
+          head -n 5 <<< "$docker_info"
+          echo "Docker daemon OK"
           echo "::endgroup::"
 
       # Pre-clone manifest deps before docker build.
diff --git a/.gitea/workflows/redeploy-tenants-on-main.yml b/.gitea/workflows/redeploy-tenants-on-main.yml
index 8568b217..2e216ff4 100644
--- a/.gitea/workflows/redeploy-tenants-on-main.yml
+++ b/.gitea/workflows/redeploy-tenants-on-main.yml
@@ -65,13 +65,13 @@ permissions:
 # the explicit block makes the invariant defensible. Mirrors the
 # concurrency block on redeploy-tenants-on-staging.yml for shape parity.
 #
-# cancel-in-progress: false → aborting a half-rolled-out fleet would
-# leave tenants stuck on whatever image they happened to be on when
-# cancelled. Better to finish the in-flight rollout before starting
-# the next one.
+# NOTE: cancel-in-progress: false removed (Rule 7 fix). Gitea 1.22.6
+# cancels queued runs regardless of this setting, so it provides no
+# actual protection. Each redeploy-fleet call is idempotent (canary-first
+# + batched + health-gated) so a cancelled predecessor is recovered
+# automatically by the next run.
 concurrency:
   group: redeploy-tenants-on-main
-  cancel-in-progress: false
 
 env:
   GITHUB_SERVER_URL: https://git.moleculesai.app
@@ -89,7 +89,24 @@ jobs:
     # mc#774: pre-existing continue-on-error mask; root-fix and remove, do not renew silently.
     continue-on-error: true
     timeout-minutes: 25
+    env:
+      # Rule 9 fix: operational kill switch for auto-triggered deployments.
+      # Set repo variable or secret PROD_AUTO_DEPLOY_DISABLED=true to prevent
+      # this workflow from redeploying. Manual workflow_dispatch bypasses this.
+      PROD_AUTO_DEPLOY_DISABLED: ${{ vars.PROD_AUTO_DEPLOY_DISABLED || secrets.PROD_AUTO_DEPLOY_DISABLED || '' }}
     steps:
+      - name: Kill-switch guard
+        # Rule 9 fix: exit fast if kill switch is set. The step must FAIL
+        # (not merely log) so that every later step, which runs only on
+        # success by default, is skipped and no redeploy happens. Manual
+        # workflow_dispatch runs bypass the switch, as documented above.
+        # NOTE(review): any later step using `if: always()` would still
+        # run — confirm none of the redeploy steps do.
+        if: env.PROD_AUTO_DEPLOY_DISABLED == 'true' && github.event_name != 'workflow_dispatch'
+        run: |
+          echo "::notice::Production auto-deploy disabled (PROD_AUTO_DEPLOY_DISABLED=true). Skipping redeploy."
+          echo "To re-enable: unset the repo variable or set it to false."
+          exit 1
       - name: Note on ECR propagation
         # ECR image manifests are consistent immediately after push — no
         # CDN cache to wait for. The old GHCR-based workflow had a 30s
@@ -189,7 +206,11 @@ jobs:
           [ -z "$HTTP_CODE" ] && HTTP_CODE="000"
 
           echo "HTTP $HTTP_CODE"
-          cat "$HTTP_RESPONSE" | jq . || cat "$HTTP_RESPONSE"
+          # Rule 8 fix: redact raw CP response from CI logs. Print only
+          # safe fields: ok boolean, result count, error presence (no content).
+          # `.results // []` keeps the summary working for error bodies that
+          # carry no results array (`null | any(...)` is a jq error).
+          jq '(.results // []) as $r | {ok, result_count: ($r | length), has_errors: ($r | any(.error != null))}' "$HTTP_RESPONSE" || echo "(jq parse failed)"
 
           # Pretty-print per-tenant results in the job summary so
           # ops can see which tenants were redeployed without drilling
@@ -205,9 +226,11 @@ jobs:
             echo ""
             echo "### Per-tenant result"
             echo ""
-            echo '| Slug | Phase | SSM Status | Exit | Healthz | Error |'
+            echo '| Slug | Phase | SSM Status | Exit | Healthz | Errors |'
             echo '|------|-------|------------|------|---------|-------|'
-            jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error // "-") |"' "$HTTP_RESPONSE" || true
+            # Rule 8 fix: .error field redacted from CI logs/summary. Print only
+            # presence boolean so ops know whether to look deeper.
+            jq -r '.results[]? | "| \(.slug) | \(.phase) | \(.ssm_status // "-") | \(.ssm_exit_code) | \(.healthz_ok) | \(.error != null) |"' "$HTTP_RESPONSE" || true
           } >> "$GITHUB_STEP_SUMMARY"
 
           if [ "$HTTP_CODE" != "200" ]; then