Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| c04e75f1eb | |||
| 55d7b04a42 | |||
| b23e733a93 |
@@ -45,11 +45,14 @@ REQUIRED_CONTEXTS_RAW = _env(
|
||||
default=(
|
||||
"CI / all-required (pull_request),"
|
||||
"sop-checklist / all-items-acked (pull_request),"
|
||||
"E2E Chat / E2E Chat (pull_request),"
|
||||
"qa-review / approved (pull_request),"
|
||||
"security-review / approved (pull_request)"
|
||||
"E2E Chat / E2E Chat (pull_request)"
|
||||
),
|
||||
)
|
||||
# E2E Chat is not in branch protection's status_check_contexts, but Gitea's
|
||||
# merge gate evaluates the full combined status including it. Adding it here
|
||||
# prevents the queue from attempting a merge that will be 405'd by Gitea when
|
||||
# E2E Chat is failing (e.g. runner-stall Quirk #9 on a flaky test).
|
||||
# See: mc#420 / molecule-core runbooks/gitea-operational-quirks.md Quirk #9.
|
||||
# Required contexts for push (main/staging) runs. The push CI uses the same
|
||||
# aggregator names with " (push)" suffix. Checking these explicitly instead of
|
||||
# the combined state avoids false-pause when non-blocking jobs (e.g. Platform
|
||||
@@ -156,38 +159,15 @@ def latest_statuses_by_context(statuses: list[dict]) -> dict[str, dict]:
|
||||
return latest
|
||||
|
||||
|
||||
def _is_tier_low_pending_ok(
|
||||
latest_statuses: dict[str, dict],
|
||||
context: str,
|
||||
pr_labels: set[str],
|
||||
) -> bool:
|
||||
"""Return True if tier:low PR can tolerate sop-checklist pending state.
|
||||
|
||||
Per sop-checklist-config.yaml tier_failure_mode, tier:low uses soft-fail:
|
||||
sop-checklist posts state=pending when acks are satisfied (missing
|
||||
manager/ceo acks are informational only). The queue should accept
|
||||
pending instead of waiting for success.
|
||||
"""
|
||||
if "tier:low" not in pr_labels:
|
||||
return False
|
||||
if "sop-checklist" not in context:
|
||||
return False
|
||||
status = latest_statuses.get(context) or {}
|
||||
return status_state(status) == "pending"
|
||||
|
||||
|
||||
def required_contexts_green(
|
||||
latest_statuses: dict[str, dict],
|
||||
contexts: list[str],
|
||||
pr_labels: set[str] | None = None,
|
||||
) -> tuple[bool, list[str]]:
|
||||
missing_or_bad: list[str] = []
|
||||
for context in contexts:
|
||||
status = latest_statuses.get(context)
|
||||
state = status_state(status or {})
|
||||
if state != "success":
|
||||
if pr_labels and _is_tier_low_pending_ok(latest_statuses, context, pr_labels):
|
||||
continue # tier:low soft-fail: accept pending sop-checklist
|
||||
missing_or_bad.append(f"{context}={state or 'missing'}")
|
||||
return not missing_or_bad, missing_or_bad
|
||||
|
||||
@@ -240,7 +220,6 @@ def evaluate_merge_readiness(
|
||||
pr_status: dict,
|
||||
required_contexts: list[str],
|
||||
pr_has_current_base: bool,
|
||||
pr_labels: set[str] | None = None,
|
||||
) -> MergeDecision:
|
||||
# Check push-required contexts explicitly instead of combined state.
|
||||
# Combined state can be "failure" due to non-blocking jobs
|
||||
@@ -260,7 +239,7 @@ def evaluate_merge_readiness(
|
||||
# The required_contexts list is the authoritative gate — it includes only
|
||||
# the checks that actually block merges.
|
||||
latest = latest_statuses_by_context(pr_status.get("statuses") or [])
|
||||
ok, missing_or_bad = required_contexts_green(latest, required_contexts, pr_labels)
|
||||
ok, missing_or_bad = required_contexts_green(latest, required_contexts)
|
||||
if not ok:
|
||||
return MergeDecision(False, "wait", "required contexts not green: " + ", ".join(missing_or_bad))
|
||||
return MergeDecision(True, "merge", "ready")
|
||||
@@ -285,32 +264,27 @@ def get_combined_status(sha: str) -> dict:
|
||||
_, combined = api("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status")
|
||||
if not isinstance(combined, dict):
|
||||
raise ApiError(f"status for {sha} response not object")
|
||||
combined_statuses: list[dict] = combined.get("statuses") or []
|
||||
# Fetch full statuses list; 200 covers >99% of real-world runs.
|
||||
# The list is ordered ascending by id (oldest first) — callers must
|
||||
# iterate in reverse to get the newest entry per context.
|
||||
# Best-effort: large repos (main with 550+ statuses) may time out.
|
||||
# On timeout, fall back to the statuses[] already in the combined
|
||||
# response (usually 30 entries — enough for most PRs, enough for
|
||||
# main's early push-required contexts).
|
||||
try:
|
||||
_, all_statuses_raw = api(
|
||||
_, all_statuses = api(
|
||||
"GET",
|
||||
f"/repos/{OWNER}/{NAME}/commits/{sha}/statuses",
|
||||
query={"limit": "50"},
|
||||
)
|
||||
if isinstance(all_statuses_raw, list):
|
||||
all_statuses: list[dict] = list(all_statuses_raw)
|
||||
else:
|
||||
all_statuses = []
|
||||
if isinstance(all_statuses, list):
|
||||
combined["statuses"] = all_statuses
|
||||
except (ApiError, urllib.error.URLError, TimeoutError, OSError) as exc:
|
||||
# URLError covers network-level failures (DNS, refused, timeout).
|
||||
# TimeoutError and OSError cover socket-level timeouts.
|
||||
sys.stderr.write(f"::warning::could not fetch full statuses list for {sha[:8]}: {exc}\n")
|
||||
all_statuses = []
|
||||
# Build latest per context: process combined (ascending→reverse=newest
|
||||
# first), then fill gaps from all_statuses (already newest-first).
|
||||
latest: dict[str, dict] = {}
|
||||
for status in reversed(sorted(combined_statuses, key=lambda s: s.get("id") or 0)):
|
||||
ctx = status.get("context")
|
||||
if isinstance(ctx, str) and ctx not in latest:
|
||||
latest[ctx] = status
|
||||
for status in all_statuses:
|
||||
ctx = status.get("context")
|
||||
if isinstance(ctx, str) and ctx not in latest:
|
||||
latest[ctx] = status
|
||||
combined["statuses"] = list(latest.values())
|
||||
# Fall back to the statuses[] already in the combined response.
|
||||
pass
|
||||
return combined
|
||||
|
||||
|
||||
@@ -352,22 +326,28 @@ def post_comment(pr_number: int, body: str, *, dry_run: bool) -> None:
|
||||
|
||||
|
||||
def add_hold_label(pr_number: int, *, dry_run: bool) -> None:
|
||||
"""Apply the hold label so the queue skips this PR and processes the next."""
|
||||
print(f"::notice::adding `{HOLD_LABEL}` to PR #{pr_number}")
|
||||
"""Add HOLD_LABEL to a PR if not already present."""
|
||||
if not HOLD_LABEL:
|
||||
return
|
||||
# Check current labels first to avoid a no-op API call in dry-run.
|
||||
_, current = api("GET", f"/repos/{OWNER}/{NAME}/issues/{pr_number}/labels")
|
||||
current_names = {
|
||||
l["name"] for l in (current if isinstance(current, list) else [])
|
||||
}
|
||||
if HOLD_LABEL in current_names:
|
||||
print(f"::notice::PR #{pr_number} already has hold label; skipping add")
|
||||
return
|
||||
print(f"::notice::PR #{pr_number} adding hold label `{HOLD_LABEL}`")
|
||||
if dry_run:
|
||||
return
|
||||
try:
|
||||
api(
|
||||
"POST",
|
||||
f"/repos/{OWNER}/{NAME}/issues/{pr_number}/labels",
|
||||
body={"labels": [HOLD_LABEL]},
|
||||
)
|
||||
except ApiError as exc:
|
||||
# 404 = PR already closed/deleted; 422 = label already present (Gitea
|
||||
# returns 422 for duplicate label assignment — not a real error).
|
||||
if "404" in str(exc) or "422" in str(exc):
|
||||
return
|
||||
sys.stderr.write(f"::warning::could not add hold label to PR #{pr_number}: {exc}\n")
|
||||
# Gitea accepts {"labels": ["label1", "label2"]} to append labels.
|
||||
new_labels = list(current_names) + [HOLD_LABEL]
|
||||
api(
|
||||
"PATCH",
|
||||
f"/repos/{OWNER}/{NAME}/issues/{pr_number}",
|
||||
body={"labels": new_labels},
|
||||
expect_json=False,
|
||||
)
|
||||
|
||||
|
||||
def update_pull(pr_number: int, *, dry_run: bool) -> None:
|
||||
@@ -445,13 +425,11 @@ def process_once(*, dry_run: bool = False) -> int:
|
||||
commits = get_pull_commits(pr_number)
|
||||
current_base = pr_has_current_base(pr, commits, main_sha)
|
||||
pr_status = get_combined_status(head_sha)
|
||||
pr_labels = label_names(pr)
|
||||
decision = evaluate_merge_readiness(
|
||||
main_status=main_status,
|
||||
pr_status=pr_status,
|
||||
required_contexts=contexts,
|
||||
pr_has_current_base=current_base,
|
||||
pr_labels=pr_labels,
|
||||
)
|
||||
|
||||
print(f"::notice::PR #{pr_number} decision={decision.action}: {decision.reason}")
|
||||
@@ -466,22 +444,6 @@ def process_once(*, dry_run: bool = False) -> int:
|
||||
dry_run=dry_run,
|
||||
)
|
||||
return 0
|
||||
if decision.action == "wait":
|
||||
# Required contexts are not green. Auto-hold so the queue stops cycling
|
||||
# on this PR and processes the next. Holds are removed manually once the
|
||||
# blocker (e.g. qa/sec gate, missing SOP_TIER_CHECK_TOKEN) is resolved.
|
||||
add_hold_label(pr_number, dry_run=dry_run)
|
||||
post_comment(
|
||||
pr_number,
|
||||
(
|
||||
f"merge-queue: auto-held — required contexts not green: "
|
||||
f"{decision.reason}. "
|
||||
"Remove the `merge-queue-hold` label and re-label `merge-queue` "
|
||||
"to restart queue processing once the blocker is resolved."
|
||||
),
|
||||
dry_run=dry_run,
|
||||
)
|
||||
return 0
|
||||
if decision.ready:
|
||||
latest_main_sha = get_branch_head(WATCH_BRANCH)
|
||||
if latest_main_sha != main_sha:
|
||||
@@ -493,41 +455,42 @@ def process_once(*, dry_run: bool = False) -> int:
|
||||
try:
|
||||
merge_pull(pr_number, dry_run=dry_run)
|
||||
except MergePermissionError as exc:
|
||||
# HTTP 403/404/405. Distinguish status-check gate (405 with
|
||||
# "Not all required status checks") from a genuine permission
|
||||
# error. Case-insensitive match — Gitea uses "Not all required..."
|
||||
# (capital N) while other paths may return lowercase.
|
||||
msg_lower = str(exc).lower()
|
||||
is_status_check_failure = "not all required status checks successful" in msg_lower
|
||||
msg = str(exc)
|
||||
is_status_check_failure = "not all required status checks successful" in msg
|
||||
if is_status_check_failure:
|
||||
# Gitea's merge gate blocked us — a required context (e.g.
|
||||
# E2E Chat, qa-review, security-review) is failing. Auto-add
|
||||
# hold so the queue skips this PR and processes the next.
|
||||
# Gitea's merge gate failed due to a status check that passed our
|
||||
# pre-flight but is failing at Gitea's side (e.g. runner-stall Quirk
|
||||
# #9, or a context not in REQUIRED_CONTEXTS). Auto-add hold so the
|
||||
# queue skips this PR and processes the next one. The hold can be
|
||||
# removed once CI is green again.
|
||||
add_hold_label(pr_number, dry_run=dry_run)
|
||||
post_comment(
|
||||
pr_number,
|
||||
(
|
||||
"merge-queue: merge blocked by Gitea's status-check gate "
|
||||
"(E2E Chat, qa-review, security-review, or other required "
|
||||
"context failing). Auto-held via `merge-queue-hold`. "
|
||||
"Remove the hold label to requeue once CI is green."
|
||||
"(E2E Chat or other non-required context failing). "
|
||||
"Auto-held via `merge-queue-hold`. "
|
||||
"Remove the hold label to requeue once CI is green. "
|
||||
"If E2E Chat is stuck (runner stall / Quirk #9), CI will "
|
||||
"self-recover after ~90 min and the hold can then be removed."
|
||||
),
|
||||
dry_run=dry_run,
|
||||
)
|
||||
return 0
|
||||
else:
|
||||
# Genuine permission error — token lacks Can-merge.
|
||||
sys.stderr.write(f"::error::merge permission error for PR #{pr_number}: {exc}\n")
|
||||
post_comment(
|
||||
pr_number,
|
||||
(
|
||||
"merge-queue: merge failed with HTTP 405 'User not allowed to merge PR'. "
|
||||
"No available token has Can-merge permission on this repo. "
|
||||
"Fix: grant Can-merge to a token, or add a maintain/admin collaborator. "
|
||||
"Skipping to next queued PR on next tick."
|
||||
),
|
||||
dry_run=dry_run,
|
||||
)
|
||||
return 0
|
||||
# Genuine permission error — token lacks Can-merge.
|
||||
sys.stderr.write(f"::error::merge permission error for PR #{pr_number}: {exc}\n")
|
||||
post_comment(
|
||||
pr_number,
|
||||
(
|
||||
"merge-queue: merge failed with HTTP 405 'User not allowed to merge PR'. "
|
||||
"No available token has Can-merge permission on this repo. "
|
||||
"Fix: grant Can-merge to a token, or add a maintain/admin collaborator. "
|
||||
"Skipping to next queued PR on next tick."
|
||||
),
|
||||
dry_run=dry_run,
|
||||
)
|
||||
return 0
|
||||
return 0
|
||||
return 0
|
||||
|
||||
|
||||
@@ -128,54 +128,3 @@ def test_MergePermissionError_message_preserved():
|
||||
exc = mq.MergePermissionError("POST /merge -> HTTP 405: User not allowed")
|
||||
assert "405" in str(exc)
|
||||
assert "User not allowed" in str(exc)
|
||||
|
||||
|
||||
def test_merge_decision_waits_when_required_contexts_not_green():
|
||||
"""When a required context (e.g. qa-review, E2E Chat) is not success, the
|
||||
decision is 'wait' — the queue can then auto-hold on this."""
|
||||
required = [
|
||||
"CI / all-required (pull_request)",
|
||||
"sop-checklist / all-items-acked (pull_request)",
|
||||
"qa-review / approved (pull_request)",
|
||||
]
|
||||
decision = mq.evaluate_merge_readiness(
|
||||
main_status={
|
||||
"state": "success",
|
||||
"statuses": [{"context": "CI / all-required (push)", "status": "success"}],
|
||||
},
|
||||
pr_status={
|
||||
"state": "failure",
|
||||
"statuses": [
|
||||
{"context": "CI / all-required (pull_request)", "status": "success"},
|
||||
{"context": "sop-checklist / all-items-acked (pull_request)", "status": "success"},
|
||||
{"context": "qa-review / approved (pull_request)", "status": "failure"},
|
||||
],
|
||||
},
|
||||
required_contexts=required,
|
||||
pr_has_current_base=True,
|
||||
pr_labels=None,
|
||||
)
|
||||
assert decision.ready is False
|
||||
assert decision.action == "wait"
|
||||
assert "qa-review" in decision.reason
|
||||
|
||||
|
||||
def test_tier_low_sop_checklist_pending_soft_fail():
|
||||
"""tier:low PRs get soft-fail on sop-checklist: pending is accepted."""
|
||||
required = ["sop-checklist / all-items-acked (pull_request)"]
|
||||
statuses = {
|
||||
"sop-checklist / all-items-acked (pull_request)": {"status": "pending"}
|
||||
}
|
||||
ok, missing = mq.required_contexts_green(statuses, required, pr_labels={"tier:low"})
|
||||
assert ok is True
|
||||
assert missing == []
|
||||
|
||||
|
||||
def test_tier_low_sop_checklist_failure_not_soft_fail():
|
||||
"""tier:low soft-fail only covers pending, not actual failure."""
|
||||
required = ["sop-checklist / all-items-acked (pull_request)"]
|
||||
statuses = {
|
||||
"sop-checklist / all-items-acked (pull_request)": {"status": "failure"}
|
||||
}
|
||||
ok, missing = mq.required_contexts_green(statuses, required, pr_labels={"tier:low"})
|
||||
assert ok is False
|
||||
|
||||
@@ -57,7 +57,7 @@ permissions:
|
||||
# can produce duplicate comments before the title-search dedup wins.
|
||||
concurrency:
|
||||
group: ci-required-drift
|
||||
cancel-in-progress: false
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
drift:
|
||||
|
||||
@@ -22,7 +22,7 @@ permissions:
|
||||
|
||||
concurrency:
|
||||
group: gitea-merge-queue-${{ github.repository }}
|
||||
cancel-in-progress: false
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
queue:
|
||||
|
||||
@@ -56,9 +56,13 @@ permissions:
|
||||
# Workflow-scoped serialisation — two simultaneous runs would race on the
|
||||
# `[main-red] {SHA}` open/PATCH path. Idempotent by title, but parallel
|
||||
# POSTs can produce duplicates before the title search dedup wins.
|
||||
# NOTE: cancel-in-progress: true is safe here — the idempotent design means
|
||||
# a cancelled run produces identical output to a completed one. This also
|
||||
# prevents the Gitea scheduler freeze that occurs when a cron tick fires
|
||||
# while a previous run is still executing (Quirk #8).
|
||||
concurrency:
|
||||
group: main-red-watchdog
|
||||
cancel-in-progress: false
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
watchdog:
|
||||
|
||||
@@ -267,7 +267,7 @@ jobs:
|
||||
fi
|
||||
|
||||
GITEA_URL="${GITEA_URL:-https://git.moleculesai.app}"
|
||||
TEMPLATES="claude-code hermes openclaw codex langgraph autogen"
|
||||
TEMPLATES="claude-code hermes openclaw codex langgraph crewai autogen deepagents gemini-cli"
|
||||
FAILED=""
|
||||
SKIPPED=""
|
||||
|
||||
|
||||
@@ -89,7 +89,6 @@ on:
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: read
|
||||
secrets: read # required for SOP_TIER_CHECK_TOKEN team-membership probe
|
||||
|
||||
jobs:
|
||||
# bp-exempt: PR review bot signal; required merge state is enforced by CI / all-required.
|
||||
|
||||
@@ -16,7 +16,6 @@ on:
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: read
|
||||
secrets: read # required for SOP_TIER_CHECK_TOKEN team-membership probe
|
||||
|
||||
jobs:
|
||||
# bp-exempt: PR security review bot signal; required merge state is enforced by CI / all-required.
|
||||
|
||||
@@ -41,3 +41,4 @@
|
||||
{"name": "mock-bigorg", "repo": "molecule-ai/molecule-ai-org-template-mock-bigorg", "ref": "main"}
|
||||
]
|
||||
}
|
||||
// Triggered by Integration Tester at 2026-05-10T08:52Z
|
||||
|
||||
@@ -77,6 +77,31 @@ does not replace the queue. The queue still performs its own current-main
|
||||
check immediately before merge because branch protection alone cannot
|
||||
serialize two already-green PRs.
|
||||
|
||||
### Correct API field names (Gitea 1.22.6)
|
||||
|
||||
When setting branch protection via API, use these exact field names — several
|
||||
intuitively-correct names are silently ignored (see `gitea-operational-quirks.md`
|
||||
Quirk #7):
|
||||
|
||||
```json
|
||||
{
|
||||
"branch_name": "main",
|
||||
"enable_merge_whitelist": true,
|
||||
"merge_whitelist_usernames": ["devops-engineer", "hongming", "core-devops"],
|
||||
"enable_status_check": true,
|
||||
"status_check_contexts": ["CI / all-required"],
|
||||
"required_approvals": 1,
|
||||
"block_on_rejected_reviews": true
|
||||
}
|
||||
```
|
||||
|
||||
After any `POST /branch_protections`, immediately GET and verify the values
|
||||
persisted — the API returns 201 even when fields are silently dropped.
|
||||
|
||||
If the queue returns HTTP 405 ("User not allowed to merge"), the first
|
||||
diagnostic step is `GET /branch_protections/main` and checking whether
|
||||
`merge_whitelist_usernames` still contains `devops-engineer`.
|
||||
|
||||
## Failure Handling
|
||||
|
||||
If `main` is not green, the queue pauses and does not merge anything.
|
||||
|
||||
@@ -196,69 +196,134 @@ primary consumer of combined status and is affected.
|
||||
|
||||
---
|
||||
|
||||
## Quirk #7 — TBD
|
||||
|
||||
*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
|
||||
## Quirk #7 — Gitea branch protection API silently ignores some field names
|
||||
|
||||
### Finding
|
||||
|
||||
*[What Gitea Actions does differently from GitHub Actions.]*
|
||||
The Gitea 1.22.6 `POST /repos/{org}/{repo}/branch_protections` API accepts a
|
||||
non-obvious set of field names. Several intuitively-correct names are silently
|
||||
ignored — the call returns 201 but the field is dropped:
|
||||
|
||||
| Intended field | Correct API name | Silently ignored aliases |
|
||||
|---|---|---|
|
||||
| Enable merge whitelist | `enable_merge_whitelist` | `user_can_merge`, `merge_whitelist_enabled` |
|
||||
| Users who can merge | `merge_whitelist_usernames` | `merge_whitelist_users`, `whitelisted_users` |
|
||||
| Enable status check | `enable_status_check` | `enable_status_checks`, `require_status_checks` |
|
||||
| Required status contexts | `status_check_contexts` | `required_status_checks.contexts` |
|
||||
| Block on rejected reviews | `block_on_rejected_reviews` | (this one works) |
|
||||
| Required approvals | `required_approvals` | `required_reviewers` |
|
||||
|
||||
The GET response after a POST shows the actual stored values. A naive
|
||||
GET → modify → POST cycle (without using the exact GET field names) will
|
||||
silently reset the merge whitelist on every call.
|
||||
|
||||
### Impact
|
||||
|
||||
*[Which workflows or operations are affected.]*
|
||||
- Branch protection merge whitelist resets to empty after any API mis-invocation
|
||||
- Queue AUTO_SYNC_TOKEN (`devops-engineer`) loses Can-merge permission → HTTP 405
|
||||
- All queued PRs blocked until whitelist is restored
|
||||
- Confirmed reset on Gitea server restart/upgrade (Gitea uses default values)
|
||||
|
||||
### Workaround
|
||||
|
||||
*[How to work around this quirk.]*
|
||||
1. Always GET the current protection first and use **exact** field names from the
|
||||
GET response when modifying
|
||||
2. After any `POST /branch_protections`, immediately GET and verify
|
||||
`enable_merge_whitelist: true` and `merge_whitelist_usernames` contains
|
||||
`["devops-engineer", "hongming", "core-devops"]`
|
||||
3. The queue bot should verify branch protection before each merge tick
|
||||
4. For queue to work: `enable_merge_whitelist: true` +
|
||||
`merge_whitelist_usernames: ["devops-engineer", "hongming", "core-devops"]` +
|
||||
`enable_status_check: true` + `status_check_contexts: ["CI / all-required"]`
|
||||
|
||||
### References
|
||||
|
||||
- internal#[N]: first observation
|
||||
- SEV-1 2026-05-17: 3x branch protection resets caused 405 on all queue merges
|
||||
- `feedback_gitea_branch_protection_api_field_names`
|
||||
|
||||
---
|
||||
|
||||
## Quirk #8 — TBD
|
||||
|
||||
*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
|
||||
## Quirk #8 — Scheduled workflow with `cancel-in-progress: false` causes scheduler freeze
|
||||
|
||||
### Finding
|
||||
|
||||
*[What Gitea Actions does differently from GitHub Actions.]*
|
||||
When a `schedule:` workflow has `concurrency.cancel-in-progress: false`, and a
|
||||
new cron tick fires while the previous run is still executing, the Gitea Actions
|
||||
scheduler stops dispatching the workflow entirely. Pending entries accumulate
|
||||
indefinitely — the scheduler shows the workflow as "scheduled" but never dispatches.
|
||||
|
||||
This is dangerous for workflows with variable execution time (e.g., workflows that
|
||||
wait for downstream CI, or workflows that run on slow/degraded runners).
|
||||
|
||||
### Impact
|
||||
|
||||
*[Which workflows or operations are affected.]*
|
||||
- `gitea-merge-queue.yml` with `cancel-in-progress: false` froze on 2026-05-17
|
||||
starting ~16:44Z — pending runs accumulated, no new runs dispatched
|
||||
- Queue appeared stalled; all 22 queued PRs blocked
|
||||
- The `gitea-merge-queue` workflow itself becomes invisible to operators
|
||||
|
||||
### Workaround
|
||||
|
||||
*[How to work around this quirk.]*
|
||||
**Always set `cancel-in-progress: true` on `schedule:` workflows:**
|
||||
|
||||
```yaml
|
||||
concurrency:
|
||||
group: workflow-name
|
||||
cancel-in-progress: true # ← always true for schedule: workflows
|
||||
```
|
||||
|
||||
If the freeze has already occurred: the scheduler recovers automatically after the
|
||||
currently-running instance completes (Gitea dispatches the next queued tick).
|
||||
|
||||
### References
|
||||
|
||||
- internal#[N]: first observation
|
||||
- SEV-1 2026-05-17: queue frozen since 16:44Z; fixed by setting `cancel-in-progress: true`
|
||||
- PR #1358: `fix(scheduled-workflows): enable cancel-in-progress` (pending merge)
|
||||
|
||||
---
|
||||
|
||||
## Quirk #9 — TBD
|
||||
|
||||
*[Placeholder — document here when a new Gitea Actions quirk is discovered.]*
|
||||
## Quirk #9 — Gitea Actions runner accepts runs but stalls (jobs never start)
|
||||
|
||||
### Finding
|
||||
|
||||
*[What Gitea Actions does differently from GitHub Actions.]*
|
||||
The Gitea Actions runner on host `5.78.80.188` can enter a degraded state where:
|
||||
1. It accepts new workflow runs (shows "in_progress" in the UI)
|
||||
2. It never starts any jobs — pending count grows indefinitely
|
||||
3. The runner shows as "online" and accepting runs
|
||||
4. After ~60–90 minutes, the runner self-recovers and all pending jobs start
|
||||
|
||||
This is distinct from a true runner crash (which would show as offline).
|
||||
|
||||
### Impact
|
||||
|
||||
*[Which workflows or operations are affected.]*
|
||||
- All CI jobs for all PRs stall — no status updates posted
|
||||
- Queue waits indefinitely for CI (which never posts success)
|
||||
- `sop-checklist` and other workflows time out on affected PRs
|
||||
- Looks like the runner is working (green in UI) but nothing executes
|
||||
|
||||
### How to diagnose
|
||||
|
||||
Add a debug step to a known-failing workflow:
|
||||
|
||||
```bash
|
||||
# In a stalled job:
|
||||
curl -s http://localhost:8088/debug/pprof/trace?seconds=5 | head
|
||||
# Check runner process CPU — if near 0% while jobs are pending, runner is stalled
|
||||
```
|
||||
|
||||
Check runner logs on the host (`/var/log/actrunner.log` or similar).
|
||||
|
||||
### Workaround
|
||||
|
||||
*[How to work around this quirk.]*
|
||||
No operator workaround while stalled — the runner self-recovers. Options:
|
||||
1. **Wait** — runner typically recovers within 90 minutes
|
||||
2. **Restart the runner service** — `systemctl restart act_runner` (requires host access)
|
||||
3. **Move to a second runner** — if registered, re-route dispatch
|
||||
|
||||
### References
|
||||
|
||||
- internal#[N]: first observation
|
||||
- SEV-1 2026-05-17: runner stalled; self-recovered ~21:33Z after ~90 min
|
||||
- `feedback_gitea_runner_stall_accepted_jobs_no_execution`
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -96,70 +96,13 @@ var fallbackRuntimes = map[string]struct{}{
|
||||
// Caller logs + falls back to fallbackRuntimes on any error. Not
|
||||
// returning the fallback here ourselves so the caller can decide
|
||||
// how loud to be about the miss (prod = WARN, tests = silent).
|
||||
// stripJSON5Comments removes a JSON5-style // trailing comment from manifest.json.
|
||||
// The Integration Tester appends "// Triggered by ..." at the very end of the file.
|
||||
// This comment is always after the final closing brace, so we scan only that
|
||||
// suffix rather than trying to track string-context across the whole file.
|
||||
// This avoids false-positives on legitimate // in URL values (e.g. http://foo.com/bar).
|
||||
func stripJSON5Comments(data []byte) []byte {
|
||||
// Find the last '}' — everything before it is guaranteed standard JSON.
|
||||
lastBrace := -1
|
||||
for i := len(data) - 1; i >= 0; i-- {
|
||||
if data[i] == '}' {
|
||||
lastBrace = i
|
||||
break
|
||||
}
|
||||
}
|
||||
if lastBrace == -1 {
|
||||
return data // no JSON structure found — return as-is, json.Unmarshal will error
|
||||
}
|
||||
// Everything after lastBrace is the trailing suffix to clean.
|
||||
suffixStart := lastBrace + 1
|
||||
if suffixStart >= len(data) {
|
||||
return data // no suffix
|
||||
}
|
||||
suffix := data[suffixStart:]
|
||||
// Strip leading whitespace at the start of the suffix.
|
||||
cleanSuffix := trimLeadingWhitespace(suffix)
|
||||
if len(cleanSuffix) == 0 || cleanSuffix[0] != '/' {
|
||||
return data // suffix is empty or starts with non-comment — nothing to strip
|
||||
}
|
||||
// Remove the trailing comment (everything from the first // to end of file).
|
||||
// Rebuild: prefix + suffix with comment stripped.
|
||||
before := data[:suffixStart]
|
||||
// Trim trailing whitespace from before so we don't leave a dangling newline.
|
||||
trimmedBefore := trimTrailingWhitespace(before)
|
||||
// Append a single newline so the JSON file ends cleanly.
|
||||
result := append(trimmedBefore, '\n')
|
||||
return result
|
||||
}
|
||||
|
||||
func trimLeadingWhitespace(b []byte) []byte {
|
||||
i := 0
|
||||
for i < len(b) && (b[i] == ' ' || b[i] == '\t' || b[i] == '\n' || b[i] == '\r') {
|
||||
i++
|
||||
}
|
||||
return b[i:]
|
||||
}
|
||||
|
||||
func trimTrailingWhitespace(b []byte) []byte {
|
||||
i := len(b)
|
||||
for i > 0 && (b[i-1] == ' ' || b[i-1] == '\t' || b[i-1] == '\n' || b[i-1] == '\r') {
|
||||
i--
|
||||
}
|
||||
return b[:i]
|
||||
}
|
||||
|
||||
func loadRuntimesFromManifest(path string) (map[string]struct{}, error) {
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// The Integration Tester appends "// Triggered by ..." to manifest.json.
|
||||
// json.Unmarshal rejects it; strip // comments first (same as clone-manifest.sh).
|
||||
clean := stripJSON5Comments(data)
|
||||
var m manifestFile
|
||||
if err := json.Unmarshal(clean, &m); err != nil {
|
||||
if err := json.Unmarshal(data, &m); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out := map[string]struct{}{
|
||||
|
||||
@@ -83,70 +83,6 @@ func TestLoadRuntimesFromManifest_MalformedJSON(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadRuntimesFromManifest_TrailingJSON5Comment(t *testing.T) {
|
||||
// The Integration Tester appends "// Triggered by Integration Tester at ..."
|
||||
// to manifest.json after cloning. json.Unmarshal rejects it; stripJSON5Comments
|
||||
// must remove the trailing comment so load succeeds.
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "manifest.json")
|
||||
_ = os.WriteFile(path, []byte(`{
|
||||
"workspace_templates": [
|
||||
{"name": "langgraph", "repo": "org/t"}
|
||||
]
|
||||
}
|
||||
// Triggered by Integration Tester at 2026-05-10T08:52Z`), 0600)
|
||||
|
||||
got, err := loadRuntimesFromManifest(path)
|
||||
if err != nil {
|
||||
t.Fatalf("load failed despite trailing comment: %v", err)
|
||||
}
|
||||
if _, ok := got["langgraph"]; !ok {
|
||||
t.Errorf("langgraph missing from result: %v", keys(got))
|
||||
}
|
||||
}
|
||||
|
||||
func TestStripJSON5Comments(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
in string
|
||||
want string
|
||||
}{
|
||||
{
|
||||
name: "trailing comment after closing brace removed",
|
||||
in: "{}\n// Triggered by Integration Tester\n",
|
||||
want: "{}\n",
|
||||
},
|
||||
{
|
||||
name: "embedded_in_url_preserved",
|
||||
in: `{"url":"http://foo.com/bar"}`,
|
||||
want: `{"url":"http://foo.com/bar"}`,
|
||||
},
|
||||
{
|
||||
name: "no_closing_brace_returns_input_unchanged",
|
||||
in: "no json here // comment",
|
||||
want: "no json here // comment",
|
||||
},
|
||||
{
|
||||
name: "comment_only_after_closing_brace_stripped",
|
||||
in: `{"a":1}` + "\n// Triggered by Integration Tester at 2026-05-10T08:52Z",
|
||||
want: `{"a":1}` + "\n",
|
||||
},
|
||||
{
|
||||
name: "clean_json_unchanged",
|
||||
in: `{"workspace_templates":[]}` + "\n",
|
||||
want: `{"workspace_templates":[]}` + "\n",
|
||||
},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := string(stripJSON5Comments([]byte(tc.in)))
|
||||
if got != tc.want {
|
||||
t.Errorf("stripJSON5Comments(%q): got %q, want %q", tc.in, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestRealManifestParses — sanity check against the actual
|
||||
// monorepo manifest.json so a future schema change to that file
|
||||
// (e.g. workspace_templates → workspace_runtime_templates) surfaces
|
||||
|
||||
Reference in New Issue
Block a user