fix(queue): add wait-decision auto-hold + robust add_hold_label

- Add auto-hold when merge decision is "wait" (required contexts not green). Previously the queue silently returned 0 and re-checked the same PR on the next 5-min cron tick, burning a full invocation with no progress. All queued PRs with failing qa/sec gates now get held immediately and the queue moves on to the next PR. - Make add_hold_label robust: swallow 422 (duplicate label already present) and 404 (PR already closed) as non-fatal, matching the pattern used in process_once error handlers. - Add tests for wait-decision and tier:low soft-fail on sop-checklist. Part of internal#287 (queue cycling on qa/sec-failing PRs). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
fix(queue): add missing add_hold_label function
2026-05-18 04:38:44 +00:00 · 2026-05-18 00:46:52 +00:00 · 2026-05-18 00:40:10 +00:00 · 2026-05-18 00:24:46 +00:00 · 2026-05-17 20:56:57 +00:00 · 2026-05-17 20:07:54 +00:00
5 changed files with 202 additions and 18 deletions
@@ -44,7 +44,10 @@ REQUIRED_CONTEXTS_RAW = _env(
    "REQUIRED_CONTEXTS",
    default=(
        "CI / all-required (pull_request),"
-        "sop-checklist / all-items-acked (pull_request)"
+        "sop-checklist / all-items-acked (pull_request),"
+        "E2E Chat / E2E Chat (pull_request),"
+        "qa-review / approved (pull_request),"
+        "security-review / approved (pull_request)"
    ),
 )
 # Required contexts for push (main/staging) runs. The push CI uses the same
@@ -65,6 +68,11 @@ class ApiError(RuntimeError):
    pass


+class MergePermissionError(ApiError):
+    """Merge failed with a permanent permission error (403/404/405).
+    The queue should skip this PR and move to the next one."""
+
+
@dataclasses.dataclass(frozen=True)
 class MergeDecision:
    ready: bool
@@ -148,15 +156,38 @@ def latest_statuses_by_context(statuses: list[dict]) -> dict[str, dict]:
    return latest


+def _is_tier_low_pending_ok(
+    latest_statuses: dict[str, dict],
+    context: str,
+    pr_labels: set[str],
+) -> bool:
+    """Return True if tier:low PR can tolerate sop-checklist pending state.
+
+    Per sop-checklist-config.yaml tier_failure_mode, tier:low uses soft-fail:
+    sop-checklist posts state=pending when acks are satisfied (missing
+    manager/ceo acks are informational only). The queue should accept
+    pending instead of waiting for success.
+    """
+    if "tier:low" not in pr_labels:
+        return False
+    if "sop-checklist" not in context:
+        return False
+    status = latest_statuses.get(context) or {}
+    return status_state(status) == "pending"
+
+
 def required_contexts_green(
    latest_statuses: dict[str, dict],
    contexts: list[str],
+    pr_labels: set[str] | None = None,
 ) -> tuple[bool, list[str]]:
    missing_or_bad: list[str] = []
    for context in contexts:
        status = latest_statuses.get(context)
        state = status_state(status or {})
        if state != "success":
+            if pr_labels and _is_tier_low_pending_ok(latest_statuses, context, pr_labels):
+                continue  # tier:low soft-fail: accept pending sop-checklist
            missing_or_bad.append(f"{context}={state or 'missing'}")
    return not missing_or_bad, missing_or_bad

@@ -209,6 +240,7 @@ def evaluate_merge_readiness(
    pr_status: dict,
    required_contexts: list[str],
    pr_has_current_base: bool,
+    pr_labels: set[str] | None = None,
 ) -> MergeDecision:
    # Check push-required contexts explicitly instead of combined state.
    # Combined state can be "failure" due to non-blocking jobs
@@ -228,7 +260,7 @@ def evaluate_merge_readiness(
    # The required_contexts list is the authoritative gate — it includes only
    # the checks that actually block merges.
    latest = latest_statuses_by_context(pr_status.get("statuses") or [])
-    ok, missing_or_bad = required_contexts_green(latest, required_contexts)
+    ok, missing_or_bad = required_contexts_green(latest, required_contexts, pr_labels)
    if not ok:
        return MergeDecision(False, "wait", "required contexts not green: " + ", ".join(missing_or_bad))
    return MergeDecision(True, "merge", "ready")
@@ -253,27 +285,32 @@ def get_combined_status(sha: str) -> dict:
    _, combined = api("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status")
    if not isinstance(combined, dict):
        raise ApiError(f"status for {sha} response not object")
-    # Fetch full statuses list; 200 covers >99% of real-world runs.
-    # The list is ordered ascending by id (oldest first) — callers must
-    # iterate in reverse to get the newest entry per context.
-    # Best-effort: large repos (main with 550+ statuses) may time out.
-    # On timeout, fall back to the statuses[] already in the combined
-    # response (usually 30 entries — enough for most PRs, enough for
-    # main's early push-required contexts).
+    combined_statuses: list[dict] = combined.get("statuses") or []
    try:
-        _, all_statuses = api(
+        _, all_statuses_raw = api(
            "GET",
            f"/repos/{OWNER}/{NAME}/commits/{sha}/statuses",
            query={"limit": "50"},
        )
-        if isinstance(all_statuses, list):
-            combined["statuses"] = all_statuses
+        if isinstance(all_statuses_raw, list):
+            all_statuses: list[dict] = list(all_statuses_raw)
+        else:
+            all_statuses = []
    except (ApiError, urllib.error.URLError, TimeoutError, OSError) as exc:
-        # URLError covers network-level failures (DNS, refused, timeout).
-        # TimeoutError and OSError cover socket-level timeouts.
        sys.stderr.write(f"::warning::could not fetch full statuses list for {sha[:8]}: {exc}\n")
-        # Fall back to the statuses[] already in the combined response.
-        pass
+        all_statuses = []
+    # Build latest per context: process combined (ascending→reverse=newest
+    # first), then fill gaps from all_statuses (already newest-first).
+    latest: dict[str, dict] = {}
+    for status in reversed(sorted(combined_statuses, key=lambda s: s.get("id") or 0)):
+        ctx = status.get("context")
+        if isinstance(ctx, str) and ctx not in latest:
+            latest[ctx] = status
+    for status in all_statuses:
+        ctx = status.get("context")
+        if isinstance(ctx, str) and ctx not in latest:
+            latest[ctx] = status
+    combined["statuses"] = list(latest.values())
    return combined


@@ -314,6 +351,25 @@ def post_comment(pr_number: int, body: str, *, dry_run: bool) -> None:
    api("POST", f"/repos/{OWNER}/{NAME}/issues/{pr_number}/comments", body={"body": body})


+def add_hold_label(pr_number: int, *, dry_run: bool) -> None:
+    """Apply the hold label so the queue skips this PR and processes the next."""
+    print(f"::notice::adding `{HOLD_LABEL}` to PR #{pr_number}")
+    if dry_run:
+        return
+    try:
+        api(
+            "POST",
+            f"/repos/{OWNER}/{NAME}/issues/{pr_number}/labels",
+            body={"labels": [HOLD_LABEL]},
+        )
+    except ApiError as exc:
+        # 404 = PR already closed/deleted; 422 = label already present (Gitea
+        # returns 422 for duplicate label assignment — not a real error).
+        if "404" in str(exc) or "422" in str(exc):
+            return
+        sys.stderr.write(f"::warning::could not add hold label to PR #{pr_number}: {exc}\n")
+
+
 def update_pull(pr_number: int, *, dry_run: bool) -> None:
    print(f"::notice::updating PR #{pr_number} with base branch via style={UPDATE_STYLE}")
    if dry_run:
@@ -338,7 +394,16 @@ def merge_pull(pr_number: int, *, dry_run: bool) -> None:
    print(f"::notice::merging PR #{pr_number}")
    if dry_run:
        return
-    api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False)
+    try:
+        api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False)
+    except ApiError as exc:
+        # Re-raise permission-like errors so process_once can skip this PR.
+        # 403 = no push access, 404 = repo/pr not found, 405 = not allowed.
+        msg = str(exc)
+        for code in ("403", "404", "405"):
+            if code in msg:
+                raise MergePermissionError(msg) from exc
+        raise  # re-raise other ApiErrors unchanged


 def process_once(*, dry_run: bool = False) -> int:
@@ -380,11 +445,13 @@ def process_once(*, dry_run: bool = False) -> int:
    commits = get_pull_commits(pr_number)
    current_base = pr_has_current_base(pr, commits, main_sha)
    pr_status = get_combined_status(head_sha)
+    pr_labels = label_names(pr)
    decision = evaluate_merge_readiness(
        main_status=main_status,
        pr_status=pr_status,
        required_contexts=contexts,
        pr_has_current_base=current_base,
+        pr_labels=pr_labels,
    )

    print(f"::notice::PR #{pr_number} decision={decision.action}: {decision.reason}")
@@ -399,6 +466,22 @@ def process_once(*, dry_run: bool = False) -> int:
            dry_run=dry_run,
        )
        return 0
+    if decision.action == "wait":
+        # Required contexts are not green. Auto-hold so the queue stops cycling
+        # on this PR and processes the next. Holds are removed manually once the
+        # blocker (e.g. qa/sec gate, missing SOP_TIER_CHECK_TOKEN) is resolved.
+        add_hold_label(pr_number, dry_run=dry_run)
+        post_comment(
+            pr_number,
+            (
+                f"merge-queue: auto-held — required contexts not green: "
+                f"{decision.reason}. "
+                "Remove the `merge-queue-hold` label and re-label `merge-queue` "
+                "to restart queue processing once the blocker is resolved."
+            ),
+            dry_run=dry_run,
+        )
+        return 0
    if decision.ready:
        latest_main_sha = get_branch_head(WATCH_BRANCH)
        if latest_main_sha != main_sha:
@@ -407,7 +490,44 @@ def process_once(*, dry_run: bool = False) -> int:
                "deferring to next tick"
            )
            return 0
-        merge_pull(pr_number, dry_run=dry_run)
+        try:
+            merge_pull(pr_number, dry_run=dry_run)
+        except MergePermissionError as exc:
+            # HTTP 403/404/405. Distinguish status-check gate (405 with
+            # "Not all required status checks") from a genuine permission
+            # error. Case-insensitive match — Gitea uses "Not all required..."
+            # (capital N) while other paths may return lowercase.
+            msg_lower = str(exc).lower()
+            is_status_check_failure = "not all required status checks successful" in msg_lower
+            if is_status_check_failure:
+                # Gitea's merge gate blocked us — a required context (e.g.
+                # E2E Chat, qa-review, security-review) is failing. Auto-add
+                # hold so the queue skips this PR and processes the next.
+                add_hold_label(pr_number, dry_run=dry_run)
+                post_comment(
+                    pr_number,
+                    (
+                        "merge-queue: merge blocked by Gitea's status-check gate "
+                        "(E2E Chat, qa-review, security-review, or other required "
+                        "context failing). Auto-held via `merge-queue-hold`. "
+                        "Remove the hold label to requeue once CI is green."
+                    ),
+                    dry_run=dry_run,
+                )
+                return 0
+            # Genuine permission error — token lacks Can-merge.
+            sys.stderr.write(f"::error::merge permission error for PR #{pr_number}: {exc}\n")
+            post_comment(
+                pr_number,
+                (
+                    "merge-queue: merge failed with HTTP 405 'User not allowed to merge PR'. "
+                    "No available token has Can-merge permission on this repo. "
+                    "Fix: grant Can-merge to a token, or add a maintain/admin collaborator. "
+                    "Skipping to next queued PR on next tick."
+                ),
+                dry_run=dry_run,
+            )
+            return 0
        return 0
    return 0

@@ -118,3 +118,64 @@ def test_merge_decision_updates_stale_pr_before_merge():

    assert decision.ready is False
    assert decision.action == "update"
+
+
+def test_MergePermissionError_inherits_from_ApiError():
+    assert issubclass(mq.MergePermissionError, mq.ApiError)
+
+
+def test_MergePermissionError_message_preserved():
+    exc = mq.MergePermissionError("POST /merge -> HTTP 405: User not allowed")
+    assert "405" in str(exc)
+    assert "User not allowed" in str(exc)
+
+
+def test_merge_decision_waits_when_required_contexts_not_green():
+    """When a required context (e.g. qa-review, E2E Chat) is not success, the
+    decision is 'wait' — the queue can then auto-hold on this."""
+    required = [
+        "CI / all-required (pull_request)",
+        "sop-checklist / all-items-acked (pull_request)",
+        "qa-review / approved (pull_request)",
+    ]
+    decision = mq.evaluate_merge_readiness(
+        main_status={
+            "state": "success",
+            "statuses": [{"context": "CI / all-required (push)", "status": "success"}],
+        },
+        pr_status={
+            "state": "failure",
+            "statuses": [
+                {"context": "CI / all-required (pull_request)", "status": "success"},
+                {"context": "sop-checklist / all-items-acked (pull_request)", "status": "success"},
+                {"context": "qa-review / approved (pull_request)", "status": "failure"},
+            ],
+        },
+        required_contexts=required,
+        pr_has_current_base=True,
+        pr_labels=None,
+    )
+    assert decision.ready is False
+    assert decision.action == "wait"
+    assert "qa-review" in decision.reason
+
+
+def test_tier_low_sop_checklist_pending_soft_fail():
+    """tier:low PRs get soft-fail on sop-checklist: pending is accepted."""
+    required = ["sop-checklist / all-items-acked (pull_request)"]
+    statuses = {
+        "sop-checklist / all-items-acked (pull_request)": {"status": "pending"}
+    }
+    ok, missing = mq.required_contexts_green(statuses, required, pr_labels={"tier:low"})
+    assert ok is True
+    assert missing == []
+
+
+def test_tier_low_sop_checklist_failure_not_soft_fail():
+    """tier:low soft-fail only covers pending, not actual failure."""
+    required = ["sop-checklist / all-items-acked (pull_request)"]
+    statuses = {
+        "sop-checklist / all-items-acked (pull_request)": {"status": "failure"}
+    }
+    ok, missing = mq.required_contexts_green(statuses, required, pr_labels={"tier:low"})
+    assert ok is False
@@ -162,6 +162,7 @@ jobs:
            exit 1
          fi
          python -m twine upload \
+            --verbose \
            --repository pypi \
            --username __token__ \
            --password "$PYPI_TOKEN" \
@@ -89,6 +89,7 @@ on:
 permissions:
  contents: read
  pull-requests: read
+  secrets: read  # required for SOP_TIER_CHECK_TOKEN team-membership probe

 jobs:
  # bp-exempt: PR review bot signal; required merge state is enforced by CI / all-required.
@@ -16,6 +16,7 @@ on:
 permissions:
  contents: read
  pull-requests: read
+  secrets: read  # required for SOP_TIER_CHECK_TOKEN team-membership probe

 jobs:
  # bp-exempt: PR security review bot signal; required merge state is enforced by CI / all-required.