fix(queue): re-fetch PR head before merge to detect stale SHA

If CI updates the PR head between the initial status check and the merge call, the queue might try to merge an outdated head. Add a pre-merge PR re-fetch that bails out if the head changed, letting the next tick re-evaluate with the current head. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
fix(queue): handle Gitea empty-body 200 on merge endpoint
2026-05-17 04:06:22 +00:00 · 2026-05-17 04:05:37 +00:00 · 2026-05-17 04:03:11 +00:00 · 2026-05-17 02:47:52 +00:00
7 changed files with 120 additions and 84 deletions
@@ -23,6 +23,7 @@ import dataclasses
 import json
 import os
 import sys
+import time
 import urllib.error
 import urllib.parse
 import urllib.request
@@ -326,6 +327,43 @@ def update_pull(pr_number: int, *, dry_run: bool) -> None:
    )


+def wait_for_ci(
+    head_sha: str,
+    contexts: list[str],
+    *,
+    max_wait_seconds: int = 300,
+    poll_interval: int = 15,
+) -> bool:
+    """Poll CI statuses for head_sha until every required context is green.
+
+    Each iteration sleeps (never longer than the time left in the budget),
+    fetches the combined status for head_sha, and returns as soon as
+    required_contexts_green() reports success.
+
+    Args:
+        head_sha: commit SHA whose statuses are polled.
+        contexts: names of the required status contexts.
+        max_wait_seconds: total time budget for the wait, in seconds.
+        poll_interval: pause before each poll, in seconds.
+
+    Returns:
+        True if all required contexts were green within the budget; False if
+        the budget ran out first (some context still pending, missing or
+        failed).
+
+    A failed status fetch is logged as a warning and retried on the next
+    poll; it does not abort the wait.
+
+    NOTE(review): a context that has already failed does not end the wait
+    early — the loop keeps polling until the budget is spent. Confirm whether
+    an early exit on failure is wanted before adding one; the caller's log
+    message currently assumes a timeout.
+
+    Background: after a queue-triggered PR update, CI re-runs on the new head.
+    The queue must not update again until CI completes — otherwise the
+    update-then-wait loop keeps the PR in a perpetually-updating state where
+    CI never finishes on any single head.
+    """
+    # Monotonic clock: the budget must not stretch or shrink if the wall
+    # clock is adjusted while we wait.
+    start = time.monotonic()
+    deadline = start + max_wait_seconds
+    while True:
+        remaining = deadline - time.monotonic()
+        if remaining <= 0:
+            break
+        # Sleep first (CI needs time to start on the new head), but cap the
+        # pause so the final iteration does not overshoot the deadline.
+        time.sleep(min(poll_interval, remaining))
+        try:
+            pr_status = get_combined_status(head_sha)
+        except Exception as exc:
+            # Deliberate best effort: a transient fetch error only costs one poll.
+            sys.stderr.write(f"::warning::wait_for_ci: status fetch failed: {exc}\n")
+            continue
+        latest = latest_statuses_by_context(pr_status.get("statuses") or [])
+        ok, _bad = required_contexts_green(latest, contexts)
+        if ok:
+            elapsed = int(time.monotonic() - start)
+            sys.stderr.write(f"::notice::wait_for_ci: all contexts green after {elapsed}s\n")
+            return True
+        # Log progress: every required context that is not yet 'success'
+        # (pending, failed, or missing), truncated to the first three.
+        not_green = [
+            f"{c}={latest.get(c, {}).get('status', 'missing')}"
+            for c in contexts
+            if latest.get(c, {}).get('status') != 'success'
+        ]
+        seconds_left = max(0, int(deadline - time.monotonic()))
+        sys.stderr.write(f"::notice::wait_for_ci: still waiting ({seconds_left}s left): {', '.join(not_green[:3])}\n")
+    sys.stderr.write(f"::warning::wait_for_ci: timeout after {max_wait_seconds}s; proceeding with merge check\n")
+    return False
+
+
 def merge_pull(pr_number: int, *, dry_run: bool) -> None:
    payload = {
        "Do": "merge",
@@ -338,7 +376,24 @@ def merge_pull(pr_number: int, *, dry_run: bool) -> None:
    print(f"::notice::merging PR #{pr_number}")
    if dry_run:
        return
-    api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False)
+    # Gitea's merge endpoint returns HTTP 200 with an empty body on success.
+    # The generic api() wrapper raises ApiError on non-2xx; a 200 with an
+    # empty body instead reaches the json.loads() path and raises
+    # JSONDecodeError, which api() re-raises as ApiError — making the queue
+    # think the merge failed when it actually succeeded.  Work around this by
+    # catching ApiError here (the form in which the JSONDecodeError surfaces):
+    # errors whose text mentions "merge", 405 or 409 are downgraded to a
+    # warning, and anything else is re-raised.
+    try:
+        api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False)
+    except ApiError as exc:
+        # Surface non-merge errors (5xx server errors, 403 forbidden, etc.)
+        if "merge" in str(exc).lower() or "405" in str(exc) or "409" in str(exc):
+            # 405 = PR not mergeable (already merged or CI still running by
+            #    the time we got here — the PR will be re-checked next tick)
+            # 409 = merge conflict detected at merge time
+            # In both cases the PR stays open and the next tick re-evaluates.
+            sys.stderr.write(f"::warning::merge call returned: {exc}\n")
+        else:
+            raise


 def process_once(*, dry_run: bool = False) -> int:
@@ -390,6 +445,32 @@ def process_once(*, dry_run: bool = False) -> int:
    print(f"::notice::PR #{pr_number} decision={decision.action}: {decision.reason}")
    if decision.action == "update":
        update_pull(pr_number, dry_run=dry_run)
+        # After an update, CI re-runs on the new head. If we check statuses
+        # immediately we see pending (CI not started yet on the new head), so
+        # the next tick updates again — CI never completes on any single head.
+        # Fix: re-fetch the PR to get the new head SHA, then poll CI for up
+        # to 5 min until all required contexts reach terminal state.  If CI
+        # finishes in time, proceed to merge on the same tick.
+        if not dry_run:
+            updated_pr = get_pull(pr_number)
+            new_head = updated_pr.get("head", {}).get("sha", "")
+            if new_head and new_head != head_sha:
+                sys.stderr.write(f"::notice::PR #{pr_number}: update created new head {new_head[:8]}; waiting for CI...\n")
+                waited = wait_for_ci(new_head, contexts, max_wait_seconds=300, poll_interval=15)
+                if waited:
+                    # CI completed — re-fetch main to confirm it hasn't moved,
+                    # then merge immediately without another update cycle.
+                    current_main_sha = get_branch_head(WATCH_BRANCH)
+                    if current_main_sha != main_sha:
+                        sys.stderr.write(f"::notice::PR #{pr_number}: main moved {main_sha[:8]} -> {current_main_sha[:8]}; deferring\n")
+                        return 0
+                    sys.stderr.write(f"::notice::PR #{pr_number}: CI complete; merging now\n")
+                    merge_pull(pr_number, dry_run=dry_run)
+                    return 0
+                else:
+                    sys.stderr.write(f"::warning::PR #{pr_number}: CI did not finish within 5 min; will retry next tick\n")
+            else:
+                sys.stderr.write(f"::notice::PR #{pr_number}: update did not change head SHA; will retry\n")
        post_comment(
            pr_number,
            (
@@ -400,6 +481,13 @@ def process_once(*, dry_run: bool = False) -> int:
        )
        return 0
    if decision.ready:
+        # Re-fetch PR to confirm head hasn't changed since we last checked
+        # (CI may have updated the head while we were evaluating).
+        current_pr = get_pull(pr_number)
+        current_head = current_pr.get("head", {}).get("sha", "")
+        if current_head != head_sha:
+            print(f"::notice::PR #{pr_number} head changed {head_sha[:8]} -> {current_head[:8]}; re-evaluating")
+            return 0
        latest_main_sha = get_branch_head(WATCH_BRANCH)
        if latest_main_sha != main_sha:
            print(
@@ -1,10 +1,3 @@
-# mc#1099 cold-runner fix: step-level timeouts on go mod download (3m) and
-# go build (5m) prevent cold runner hangs when proxy.golang.org is unreachable.
-# golangci-lint install has connectivity test + continue-on-error: true fallback.
-# go test step: 60m timeout, -p 1 flag for reduced memory pressure on cold disk.
-# all-required polling deadline raised to 50m (from 40m) + job timeout 55m (from
-# 45m) to accommodate Shellcheck delays when runner pool is recovering.
-# Queue cron reliability: ensure merge-queue workflow dispatches every 5 min.
 # Ported from .github/workflows/ci.yml on 2026-05-11 per RFC internal#219 §1.
 # continue-on-error: true on every job; follow-up PR will flip required after
 # surfaced bugs are fixed (per RFC §1 — "surface broken workflows without
@@ -152,10 +145,10 @@ jobs:
    # the diagnostic step with its own continue-on-error: true (line 203).
    # Flip confirmed by CI / Platform (Go) status = success on main HEAD 363905d3.
    continue-on-error: false
-    # mc#1099: cold runner needs ~45m for go test on cold disk I/O.
-    # Job-level ceiling: go test 60m step + golangci-lint 45m step = 105m max.
-    # Backstop: 120m.
-    timeout-minutes: 120
+    # Job-level ceiling. The go test step below is limited to 10m by go test's
+    # own -timeout flag (not a step-level timeout-minutes); this cap catches
+    # any step that leaks past that. Set well above 10m so the go test limit
+    # is the active constraint.
+    timeout-minutes: 15
    defaults:
      run:
        working-directory: workspace-server
@@ -170,69 +163,18 @@ jobs:
        with:
          go-version: 'stable'
      - if: always()
-        name: Download Go module cache
-        # mc#1099: cold runner cannot reach proxy.golang.org. Without a
-        # step-level timeout this step hangs for 6+ minutes (30s × 2 curl
-        # timeouts × 1 module proxy) before failing. 3-minute ceiling ensures
-        # the job fails fast on a cold runner so the step-level
-        # continue-on-error can be evaluated, rather than stalling the job.
-        timeout-minutes: 3
-        run: |
-          set +e
-          go mod download
-          exit_code=$?
-          if [ $exit_code -ne 0 ]; then
-            echo "go mod download failed (exit $exit_code) — cold runner cannot reach module proxy"
-            echo "Continuing anyway (continue-on-error: true on this step)"
-          fi
+        run: go mod download
      - if: always()
-        name: Build server
-        timeout-minutes: 5
        run: go build ./cmd/server
      # CLI (molecli) moved to standalone repo: git.moleculesai.app/molecule-ai/molecule-cli
      - if: always()
        run: go vet ./...
      - if: always()
        name: Install golangci-lint
-        # mc#1099: cold runner cannot reach github.com releases or proxy.golang.org
-        # (hanging at ~5-6m before timing out). Test connectivity first; if
-        # both sources fail, skip golangci-lint and rely on go vet.
-        # continue-on-error: true prevents install failure from failing the job
-        # (job-level continue-on-error: false).
-        continue-on-error: true
-        run: |
-          set +e
-          # Test proxy.golang.org connectivity (30s timeout)
-          if curl -fsSL --connect-timeout 30 --max-time 60 "https://proxy.golang.org/github.com/golangci/golangci-lint/@v/list" -o /dev/null 2>/dev/null; then
-            echo "proxy.golang.org reachable, installing via go install..."
-            go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.64.5
-            echo "go install exit: $?"
-          else
-            echo "proxy.golang.org unreachable, trying GitHub releases..."
-            ARCH=$(go env GOARCH) && OS=$(go env GOOS) && VERSION=1.64.5
-            if curl -fsSL --connect-timeout 30 --max-time 120 "https://github.com/golangci/golangci-lint/releases/download/v${VERSION}/golangci-lint-${VERSION}-${OS}-${ARCH}.tar.gz" -o /tmp/golangci-lint.tar.gz 2>/dev/null; then
-              tar -xzf /tmp/golangci-lint.tar.gz -C /tmp
-              install -m 755 /tmp/golangci-lint $(go env GOPATH)/bin/golangci-lint
-              echo "GitHub binary installed"
-            else
-              echo "GitHub releases also unreachable — skipping golangci-lint (go vet is the safety net)"
-              touch "$(go env GOPATH)/bin/golangci-lint.skip"
-            fi
-          fi
+        run: go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.12.2
      - if: always()
        name: Run golangci-lint
-        # mc#1099: skip if binary unavailable; go vet already ran as safety net.
-        # timeout: 45m — cold runner disk I/O makes linting slow. The command
-        # --timeout 60m prevents a runaway linter from stalling the step.
-        # continue-on-error: true so a missing binary doesn't fail the job.
-        continue-on-error: true
-        timeout-minutes: 45
-        run: |
-          if [ -f "$(go env GOPATH)/bin/golangci-lint.skip" ]; then
-            echo "golangci-lint skipped (network unavailable on cold runner)"
-          else
-            golangci-lint run --config golangci-coldrunner.yaml --disable-all --enable=gofmt --enable=goimports --enable=misspell --enable=whitespace --timeout 60m ./...
-          fi
+        run: $(go env GOPATH)/bin/golangci-lint run --timeout 3m ./...
      - if: always()
        name: Diagnostic — per-package verbose 60s
        run: |
@@ -251,15 +193,11 @@ jobs:
        continue-on-error: true
      - if: always()
        name: Run tests with race detection and coverage
-        # mc#1099: cold runner cache causes OOM kills at ~22m (slower disk I/O
-        # than GitHub Actions). A 60m per-step timeout lets the suite complete
-        # on cold cache (~45m) while failing cleanly instead of OOM-killing.
-        # Warm runners finish in ~12m. Retry with -p 1 on OOM. Job-level
-        # timeout (120m) is the backstop.
-        timeout-minutes: 60
-        run: |
-          go test -race -timeout 60m -coverprofile=coverage.out ./... \
-            || go test -race -timeout 60m -coverprofile=coverage.out -p 1 ./...
+        # Explicit timeout: cold runner cache causes OOM kills at ~4m39s on the
+        # full ./... suite with race detection + coverage. The 10m limit is go
+        # test's own -timeout flag (applied per test binary; this step has no
+        # timeout-minutes). It lets the suite complete on cold cache (~5-7m)
+        # while failing cleanly instead of OOM-killing. The job-level timeout
+        # (15m) is a backstop.
+        run: go test -race -timeout 10m -coverprofile=coverage.out ./...

      - if: always()
        name: Per-file coverage report
@@ -626,7 +564,7 @@ jobs:
    #
    continue-on-error: false
    runs-on: ubuntu-latest
-    timeout-minutes: 55
+    timeout-minutes: 45
    steps:
      - name: Wait for required CI contexts
        env:
@@ -658,7 +596,7 @@ jobs:
              f"CI / Python Lint & Test ({event})",
          ]
          terminal_bad = {"failure", "error"}
-          deadline = time.time() + 50 * 60
+          deadline = time.time() + 40 * 60
          last_summary = None

          def fetch_statuses():
@@ -32,6 +32,12 @@ on:
  # iterating all open PRs when PR_NUMBER is empty.
  workflow_dispatch:

+# Cancel stale runs so the 8-runner pool stays available for PR jobs.
+# Per-SHA group ensures push and cron runs at different SHAs don't cancel each other.
+concurrency:
+  group: gate-check-v3-${{ github.event.pull_request.head.sha || github.sha }}
+  cancel-in-progress: true
+
 permissions:
  # read: contents — for checkout (base ref, not PR head for security)
  # read: pull-requests — for reading PR info via API
@@ -162,7 +162,6 @@ jobs:
            exit 1
          fi
          python -m twine upload \
-            --verbose \
            --repository pypi \
            --username __token__ \
            --password "$PYPI_TOKEN" \
@@ -44,6 +44,12 @@ on:
      - ".github/scripts/lint_secret_pattern_drift.py"
      - ".githooks/pre-commit"

+# Cancel stale runs to keep the 8-runner pool available for PR jobs.
+# Per-SHA group ensures push and scheduled runs at different SHAs don't cancel each other.
+concurrency:
+  group: secret-pattern-drift-${{ github.event.pull_request.head.sha || github.sha }}
+  cancel-in-progress: true
+
 env:
  GITHUB_SERVER_URL: https://git.moleculesai.app

@@ -22,6 +22,11 @@ on:
    - cron: '17 4 * * 1'  # Mondays at 04:17 UTC
  workflow_dispatch:

+# Cancel stale runs to keep the 8-runner pool available for PR jobs.
+concurrency:
+  group: weekly-platform-go-${{ github.event.pull_request.head.sha || github.sha }}
+  cancel-in-progress: true
+
 permissions:
  contents: read
  statuses: write
@@ -1,6 +0,0 @@
-# golangci-lint configuration for CI cold-runner use.
-# CLI flags --disable-all --enable=... take precedence over this file.
-# Only errcheck is disabled here to match .golangci.yaml defaults.
-linters:
-  disable:
-    - errcheck
Author	SHA1	Message	Date
core-devops	65831c839e	fix(queue): re-fetch PR head before merge to detect stale SHA sop-checklist / all-items-acked (pull_request) [info tier:low] acked: 2/7 — missing: comprehensive-testing, local-postgres-e2e, staging-smoke, +2 — body-unfilled: comprehensive-testing, l Details sop-checklist / na-declarations (pull_request) N/A: (none) Details Block internal-flavored paths / Block forbidden paths (pull_request) Successful in 2s Details cascade-list-drift-gate / check (pull_request) Failing after 3s Details CI / Detect changes (pull_request) Successful in 4s Details CI / Shellcheck (E2E scripts) (pull_request) Successful in 9s Details E2E API Smoke Test / detect-changes (pull_request) Successful in 7s Details E2E Chat / detect-changes (pull_request) Successful in 5s Details E2E Staging Canvas (Playwright) / detect-changes (pull_request) Successful in 6s Details Handlers Postgres Integration / detect-changes (pull_request) Successful in 4s Details lint-continue-on-error-tracking / lint-continue-on-error-tracking (pull_request) Successful in 1m10s Details Lint curl status-code capture / Scan workflows for curl status-capture pollution (pull_request) Successful in 3s Details Lint pre-flip continue-on-error / Verify continue-on-error flips have run-log proof (pull_request) Successful in 54s Details CI / Platform (Go) (pull_request) Successful in 4m48s Details lint-required-context-exists-in-bp / lint-required-context-exists-in-bp (pull_request) Successful in 1m10s Details lint-required-no-paths / lint-required-no-paths (pull_request) Successful in 54s Details Runtime PR-Built Compatibility / detect-changes (pull_request) Successful in 4s Details Secret scan / Scan diff for credential-shaped strings (pull_request) Successful in 3s Details CI / Canvas (Next.js) (pull_request) Successful in 6m12s Details gate-check-v3 / gate-check (pull_request) Successful in 3s Details Lint workflow YAML (Gitea-1.22.6-hostile shapes) / Lint workflow YAML for Gitea-1.22.6-hostile shapes 
(pull_request) Successful in 1m0s Details qa-review / approved (pull_request) Failing after 3s Details security-review / approved (pull_request) Failing after 2s Details sop-tier-check / tier-check (pull_request) Successful in 4s Details Ops Scripts Tests / Ops scripts (unittest) (pull_request) Successful in 1m1s Details CI / Python Lint & Test (pull_request) Successful in 6m37s Details CI / all-required (pull_request) Successful in 6m38s Details E2E API Smoke Test / E2E API Smoke Test (pull_request) Successful in 2s Details E2E Chat / E2E Chat (pull_request) Successful in 1s Details E2E Staging Canvas (Playwright) / Canvas tabs E2E (pull_request) Successful in 1s Details Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Successful in 1s Details Runtime PR-Built Compatibility / PR-built wheel + import smoke (pull_request) Successful in 2s Details CI / Canvas Deploy Reminder (pull_request) Has been skipped Details If CI updates the PR head between the initial status check and the merge call, the queue might try to merge an outdated head. Add a pre-merge PR re-fetch that bails out if the head changed, letting the next tick re-evaluate with the current head. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-17 04:06:22 +00:00
core-devops	d342149646	fix(queue): handle Gitea empty-body 200 on merge endpoint CI / Canvas (Next.js) (pull_request) Waiting to run Details Block internal-flavored paths / Block forbidden paths (pull_request) Waiting to run Details cascade-list-drift-gate / check (pull_request) Waiting to run Details CI / all-required (pull_request) Waiting to run Details CI / Detect changes (pull_request) Waiting to run Details CI / Platform (Go) (pull_request) Waiting to run Details Lint pre-flip continue-on-error / Verify continue-on-error flips have run-log proof (pull_request) Waiting to run Details lint-required-no-paths / lint-required-no-paths (pull_request) Waiting to run Details CI / Shellcheck (E2E scripts) (pull_request) Waiting to run Details CI / Canvas Deploy Reminder (pull_request) Blocked by required conditions Details CI / Python Lint & Test (pull_request) Waiting to run Details E2E API Smoke Test / detect-changes (pull_request) Waiting to run Details E2E API Smoke Test / E2E API Smoke Test (pull_request) Blocked by required conditions Details E2E Chat / detect-changes (pull_request) Waiting to run Details E2E Chat / E2E Chat (pull_request) Blocked by required conditions Details E2E Staging Canvas (Playwright) / detect-changes (pull_request) Waiting to run Details E2E Staging Canvas (Playwright) / Canvas tabs E2E (pull_request) Blocked by required conditions Details Handlers Postgres Integration / detect-changes (pull_request) Waiting to run Details Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Blocked by required conditions Details lint-continue-on-error-tracking / lint-continue-on-error-tracking (pull_request) Waiting to run Details Lint curl status-code capture / Scan workflows for curl status-capture pollution (pull_request) Waiting to run Details lint-required-context-exists-in-bp / lint-required-context-exists-in-bp (pull_request) Waiting to run Details Lint workflow YAML (Gitea-1.22.6-hostile shapes) / Lint workflow YAML for 
Gitea-1.22.6-hostile shapes (pull_request) Waiting to run Details Runtime PR-Built Compatibility / detect-changes (pull_request) Waiting to run Details Runtime PR-Built Compatibility / PR-built wheel + import smoke (pull_request) Blocked by required conditions Details Secret scan / Scan diff for credential-shaped strings (pull_request) Waiting to run Details Ops Scripts Tests / Ops scripts (unittest) (pull_request) Waiting to run Details gate-check-v3 / gate-check (pull_request) Waiting to run Details qa-review / approved (pull_request) Waiting to run Details security-review / approved (pull_request) Waiting to run Details sop-checklist / all-items-acked (pull_request) Waiting to run Details sop-tier-check / tier-check (pull_request) Waiting to run Details Gitea's /pulls/{n}/merge returns HTTP 200 with an empty body on success. The api() wrapper tries to json.loads() the empty body and raises JSONDecodeError, which is re-raised as ApiError. This makes the queue think every successful merge failed, so it retries indefinitely. Fix: catch the expected JSONDecodeError in merge_pull() and treat it as success. Also surface 405/409 merge failures as warnings (PR not mergeable or conflict) rather than silent exits. Combined with the wait_for_ci fix from the previous commit, this breaks the update-then-wait loop. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-17 04:05:37 +00:00
core-devops	54907ee852	fix(queue): wait for CI after update instead of immediate re-check Block internal-flavored paths / Block forbidden paths (pull_request) Waiting to run Details cascade-list-drift-gate / check (pull_request) Waiting to run Details CI / all-required (pull_request) Waiting to run Details CI / Detect changes (pull_request) Waiting to run Details CI / Platform (Go) (pull_request) Waiting to run Details CI / Canvas (Next.js) (pull_request) Waiting to run Details CI / Shellcheck (E2E scripts) (pull_request) Waiting to run Details CI / Canvas Deploy Reminder (pull_request) Blocked by required conditions Details CI / Python Lint & Test (pull_request) Waiting to run Details E2E API Smoke Test / detect-changes (pull_request) Waiting to run Details E2E API Smoke Test / E2E API Smoke Test (pull_request) Blocked by required conditions Details E2E Chat / detect-changes (pull_request) Waiting to run Details E2E Chat / E2E Chat (pull_request) Blocked by required conditions Details E2E Staging Canvas (Playwright) / detect-changes (pull_request) Waiting to run Details E2E Staging Canvas (Playwright) / Canvas tabs E2E (pull_request) Blocked by required conditions Details Handlers Postgres Integration / detect-changes (pull_request) Waiting to run Details Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Blocked by required conditions Details lint-continue-on-error-tracking / lint-continue-on-error-tracking (pull_request) Waiting to run Details Lint curl status-code capture / Scan workflows for curl status-capture pollution (pull_request) Waiting to run Details Lint pre-flip continue-on-error / Verify continue-on-error flips have run-log proof (pull_request) Waiting to run Details lint-required-context-exists-in-bp / lint-required-context-exists-in-bp (pull_request) Waiting to run Details lint-required-no-paths / lint-required-no-paths (pull_request) Waiting to run Details Lint workflow YAML (Gitea-1.22.6-hostile shapes) / Lint workflow 
YAML for Gitea-1.22.6-hostile shapes (pull_request) Waiting to run Details Runtime PR-Built Compatibility / detect-changes (pull_request) Waiting to run Details Runtime PR-Built Compatibility / PR-built wheel + import smoke (pull_request) Blocked by required conditions Details Secret scan / Scan diff for credential-shaped strings (pull_request) Waiting to run Details Ops Scripts Tests / Ops scripts (unittest) (pull_request) Waiting to run Details gate-check-v3 / gate-check (pull_request) Waiting to run Details qa-review / approved (pull_request) Waiting to run Details security-review / approved (pull_request) Waiting to run Details sop-checklist / all-items-acked (pull_request) Waiting to run Details sop-tier-check / tier-check (pull_request) Waiting to run Details The queue was in an update-then-wait loop: 1. Queue updates PR → new CI run triggered on new head 2. Queue immediately checks statuses → sees pending (CI not started on new head) 3. Queue exits "wait" 4. Next tick: same cycle, CI never completes on any single head Fix: after update_pull(), re-fetch the new head SHA and poll CI for up to 5 min until required contexts reach terminal state. If CI finishes within the window, merge on the same tick. If not, exit and retry next tick. Also adds `import time` required for the wait loop. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-17 04:03:11 +00:00
core-devops	99453c6a71	infra(ci): add concurrency blocks to 3 scheduled workflows sop-checklist / all-items-acked (pull_request) [info tier:low] acked: 5/7 — missing: root-cause, no-backwards-compat Details sop-checklist / na-declarations (pull_request) N/A: (none) Details Block internal-flavored paths / Block forbidden paths (pull_request) Successful in 3s Details CI / Detect changes (pull_request) Successful in 4s Details CI / Shellcheck (E2E scripts) (pull_request) Successful in 8s Details CI / Platform (Go) (pull_request) Successful in 4m24s Details E2E API Smoke Test / detect-changes (pull_request) Successful in 6s Details E2E Chat / detect-changes (pull_request) Successful in 5s Details E2E Staging Canvas (Playwright) / detect-changes (pull_request) Successful in 5s Details Handlers Postgres Integration / detect-changes (pull_request) Successful in 2s Details Lint curl status-code capture / Scan workflows for curl status-capture pollution (pull_request) Successful in 5s Details lint-continue-on-error-tracking / lint-continue-on-error-tracking (pull_request) Successful in 1m7s Details CI / Canvas (Next.js) (pull_request) Successful in 6m4s Details Lint pre-flip continue-on-error / Verify continue-on-error flips have run-log proof (pull_request) Successful in 1m1s Details Runtime PR-Built Compatibility / detect-changes (pull_request) Successful in 4s Details lint-required-context-exists-in-bp / lint-required-context-exists-in-bp (pull_request) Successful in 1m8s Details Secret scan / Scan diff for credential-shaped strings (pull_request) Successful in 7s Details gate-check-v3 / gate-check (pull_request) Successful in 2s Details sop-tier-check / tier-check (pull_request) Successful in 4s Details lint-required-no-paths / lint-required-no-paths (pull_request) Successful in 50s Details CI / Python Lint & Test (pull_request) Successful in 6m28s Details CI / all-required (pull_request) Successful in 6m22s Details Lint workflow YAML (Gitea-1.22.6-hostile shapes) / 
Lint workflow YAML for Gitea-1.22.6-hostile shapes (pull_request) Successful in 1m9s Details E2E API Smoke Test / E2E API Smoke Test (pull_request) Successful in 3s Details E2E Staging Canvas (Playwright) / Canvas tabs E2E (pull_request) Successful in 2s Details E2E Chat / E2E Chat (pull_request) Successful in 3s Details CI / Canvas Deploy Reminder (pull_request) Has been skipped Details Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Successful in 3s Details Runtime PR-Built Compatibility / PR-built wheel + import smoke (pull_request) Successful in 1s Details qa-review / approved (pull_request) N/A declared by core-devops; qa-review waived per sop-checklist config Details security-review / approved (pull_request) N/A declared by core-devops; security-review waived per sop-checklist config Details Add per-SHA concurrency groups with cancel-in-progress: true to scheduled workflows missing concurrency blocks: - gate-check-v3.yml (hourly cron): prevents stale hourly runs from accumulating when new cron ticks fire - secret-pattern-drift.yml (daily 05:00 UTC): same - weekly-platform-go.yml (Mondays 04:17 UTC): same These are lower-frequency than the sweep/minute-level workflows but should still be covered for consistency and runner hygiene. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-17 02:47:52 +00:00