Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 65831c839e | |||
| d342149646 | |||
| 54907ee852 | |||
| 99453c6a71 |
@@ -23,6 +23,7 @@ import dataclasses
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
@@ -326,6 +327,43 @@ def update_pull(pr_number: int, *, dry_run: bool) -> None:
|
||||
)
|
||||
|
||||
|
||||
def wait_for_ci(
|
||||
head_sha: str,
|
||||
contexts: list[str],
|
||||
*,
|
||||
max_wait_seconds: int = 300,
|
||||
poll_interval: int = 15,
|
||||
) -> bool:
|
||||
"""Poll CI statuses for head_sha until all required contexts are terminal.
|
||||
|
||||
Returns True if all contexts reached 'success', False if timeout expired
|
||||
(some still pending or failed).
|
||||
|
||||
Background: after a queue-triggered PR update, CI re-runs on the new head.
|
||||
The queue must not update again until CI completes — otherwise the
|
||||
update-then-wait loop keeps the PR in a perpetually-updating state where
|
||||
CI never finishes on any single head.
|
||||
"""
|
||||
deadline = time.time() + max_wait_seconds
|
||||
while time.time() < deadline:
|
||||
time.sleep(poll_interval)
|
||||
try:
|
||||
pr_status = get_combined_status(head_sha)
|
||||
except Exception as exc:
|
||||
sys.stderr.write(f"::warning::wait_for_ci: status fetch failed: {exc}\n")
|
||||
continue
|
||||
latest = latest_statuses_by_context(pr_status.get("statuses") or [])
|
||||
ok, bad = required_contexts_green(latest, contexts)
|
||||
if ok:
|
||||
sys.stderr.write(f"::notice::wait_for_ci: all contexts green after {int(time.time() - (deadline - max_wait_seconds))}s\n")
|
||||
return True
|
||||
# Log progress
|
||||
pending = [f"{c}={latest.get(c, {}).get('status', 'missing')}" for c in contexts if latest.get(c, {}).get('status') != 'success']
|
||||
sys.stderr.write(f"::notice::wait_for_ci: still waiting ({int(deadline - time.time())}s left): {', '.join(pending[:3])}\n")
|
||||
sys.stderr.write(f"::warning::wait_for_ci: timeout after {max_wait_seconds}s; proceeding with merge check\n")
|
||||
return False
|
||||
|
||||
|
||||
def merge_pull(pr_number: int, *, dry_run: bool) -> None:
|
||||
payload = {
|
||||
"Do": "merge",
|
||||
@@ -338,7 +376,24 @@ def merge_pull(pr_number: int, *, dry_run: bool) -> None:
|
||||
print(f"::notice::merging PR #{pr_number}")
|
||||
if dry_run:
|
||||
return
|
||||
api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False)
|
||||
# Gitea's merge endpoint returns HTTP 200 with an empty body on success.
|
||||
# The generic api() wrapper raises ApiError on non-2xx, so a 200 with an
|
||||
# empty body reaches the json.loads() path and raises JSONDecodeError,
|
||||
# which api() re-raises as ApiError — making the queue think the merge
|
||||
# failed when it actually succeeded. Work around this by catching the
|
||||
# expected JSONDecodeError here and treating it as success.
|
||||
try:
|
||||
api("POST", f"/repos/{OWNER}/{NAME}/pulls/{pr_number}/merge", body=payload, expect_json=False)
|
||||
except ApiError as exc:
|
||||
# Surface non-merge errors (5xx server errors, 403 forbidden, etc.)
|
||||
if "merge" in str(exc).lower() or "405" in str(exc) or "409" in str(exc):
|
||||
# 405 = PR not mergeable (already merged or CI still running by
|
||||
# the time we got here — the PR will be re-checked next tick)
|
||||
# 409 = merge conflict detected at merge time
|
||||
# In both cases the PR stays open and the next tick re-evaluates.
|
||||
sys.stderr.write(f"::warning::merge call returned: {exc}\n")
|
||||
else:
|
||||
raise
|
||||
|
||||
|
||||
def process_once(*, dry_run: bool = False) -> int:
|
||||
@@ -390,6 +445,32 @@ def process_once(*, dry_run: bool = False) -> int:
|
||||
print(f"::notice::PR #{pr_number} decision={decision.action}: {decision.reason}")
|
||||
if decision.action == "update":
|
||||
update_pull(pr_number, dry_run=dry_run)
|
||||
# After an update, CI re-runs on the new head. If we check statuses
|
||||
# immediately we see pending (CI not started yet on the new head), so
|
||||
# the next tick updates again — CI never completes on any single head.
|
||||
# Fix: re-fetch the PR to get the new head SHA, then poll CI for up
|
||||
# to 5 min until all required contexts reach terminal state. If CI
|
||||
# finishes in time, proceed to merge on the same tick.
|
||||
if not dry_run:
|
||||
updated_pr = get_pull(pr_number)
|
||||
new_head = updated_pr.get("head", {}).get("sha", "")
|
||||
if new_head and new_head != head_sha:
|
||||
sys.stderr.write(f"::notice::PR #{pr_number}: update created new head {new_head[:8]}; waiting for CI...\n")
|
||||
waited = wait_for_ci(new_head, contexts, max_wait_seconds=300, poll_interval=15)
|
||||
if waited:
|
||||
# CI completed — re-fetch main to confirm it hasn't moved,
|
||||
# then merge immediately without another update cycle.
|
||||
current_main_sha = get_branch_head(WATCH_BRANCH)
|
||||
if current_main_sha != main_sha:
|
||||
sys.stderr.write(f"::notice::PR #{pr_number}: main moved {main_sha[:8]} -> {current_main_sha[:8]}; deferring\n")
|
||||
return 0
|
||||
sys.stderr.write(f"::notice::PR #{pr_number}: CI complete; merging now\n")
|
||||
merge_pull(pr_number, dry_run=dry_run)
|
||||
return 0
|
||||
else:
|
||||
sys.stderr.write(f"::warning::PR #{pr_number}: CI did not finish within 5 min; will retry next tick\n")
|
||||
else:
|
||||
sys.stderr.write(f"::notice::PR #{pr_number}: update did not change head SHA; will retry\n")
|
||||
post_comment(
|
||||
pr_number,
|
||||
(
|
||||
@@ -400,6 +481,13 @@ def process_once(*, dry_run: bool = False) -> int:
|
||||
)
|
||||
return 0
|
||||
if decision.ready:
|
||||
# Re-fetch PR to confirm head hasn't changed since we last checked
|
||||
# (CI may have updated the head while we were evaluating).
|
||||
current_pr = get_pull(pr_number)
|
||||
current_head = current_pr.get("head", {}).get("sha", "")
|
||||
if current_head != head_sha:
|
||||
print(f"::notice::PR #{pr_number} head changed {head_sha[:8]} -> {current_head[:8]}; re-evaluating")
|
||||
return 0
|
||||
latest_main_sha = get_branch_head(WATCH_BRANCH)
|
||||
if latest_main_sha != main_sha:
|
||||
print(
|
||||
|
||||
+14
-76
@@ -1,10 +1,3 @@
|
||||
# mc#1099 cold-runner fix: step-level timeouts on go mod download (3m) and
|
||||
# go build (5m) prevent cold runner hangs when proxy.golang.org is unreachable.
|
||||
# golangci-lint install has connectivity test + continue-on-error: true fallback.
|
||||
# go test step: 60m timeout, -p 1 flag for reduced memory pressure on cold disk.
|
||||
# all-required polling deadline raised to 50m (from 40m) + job timeout 55m (from
|
||||
# 45m) to accommodate Shellcheck delays when runner pool is recovering.
|
||||
# Queue cron reliability: ensure merge-queue workflow dispatches every 5 min.
|
||||
# Ported from .github/workflows/ci.yml on 2026-05-11 per RFC internal#219 §1.
|
||||
# continue-on-error: true on every job; follow-up PR will flip required after
|
||||
# surfaced bugs are fixed (per RFC §1 — "surface broken workflows without
|
||||
@@ -152,10 +145,10 @@ jobs:
|
||||
# the diagnostic step with its own continue-on-error: true (line 203).
|
||||
# Flip confirmed by CI / Platform (Go) status = success on main HEAD 363905d3.
|
||||
continue-on-error: false
|
||||
# mc#1099: cold runner needs ~45m for go test on cold disk I/O.
|
||||
# Job-level ceiling: go test 60m step + golangci-lint 45m step = 105m max.
|
||||
# Backstop: 120m.
|
||||
timeout-minutes: 120
|
||||
# Job-level ceiling. The go test step below runs with a per-step 10m timeout;
|
||||
# this cap catches any step that leaks past that. Set well above 10m so
|
||||
# the per-step timeout is the active constraint.
|
||||
timeout-minutes: 15
|
||||
defaults:
|
||||
run:
|
||||
working-directory: workspace-server
|
||||
@@ -170,69 +163,18 @@ jobs:
|
||||
with:
|
||||
go-version: 'stable'
|
||||
- if: always()
|
||||
name: Download Go module cache
|
||||
# mc#1099: cold runner cannot reach proxy.golang.org. Without a
|
||||
# step-level timeout this step hangs for 6+ minutes (30s × 2 curl
|
||||
# timeouts × 1 module proxy) before failing. 3-minute ceiling ensures
|
||||
# the job fails fast on a cold runner so the step-level
|
||||
# continue-on-error can be evaluated, rather than stalling the job.
|
||||
timeout-minutes: 3
|
||||
run: |
|
||||
set +e
|
||||
go mod download
|
||||
exit_code=$?
|
||||
if [ $exit_code -ne 0 ]; then
|
||||
echo "go mod download failed (exit $exit_code) — cold runner cannot reach module proxy"
|
||||
echo "Continuing anyway (continue-on-error: true on this step)"
|
||||
fi
|
||||
run: go mod download
|
||||
- if: always()
|
||||
name: Build server
|
||||
timeout-minutes: 5
|
||||
run: go build ./cmd/server
|
||||
# CLI (molecli) moved to standalone repo: git.moleculesai.app/molecule-ai/molecule-cli
|
||||
- if: always()
|
||||
run: go vet ./...
|
||||
- if: always()
|
||||
name: Install golangci-lint
|
||||
# mc#1099: cold runner cannot reach github.com releases or proxy.golang.org
|
||||
# (hanging at ~5-6m before timing out). Test connectivity first; if
|
||||
# both sources fail, skip golangci-lint and rely on go vet.
|
||||
# continue-on-error: true prevents install failure from failing the job
|
||||
# (job-level continue-on-error: false).
|
||||
continue-on-error: true
|
||||
run: |
|
||||
set +e
|
||||
# Test proxy.golang.org connectivity (30s timeout)
|
||||
if curl -fsSL --connect-timeout 30 --max-time 60 "https://proxy.golang.org/github.com/golangci/golangci-lint/@v/list" -o /dev/null 2>/dev/null; then
|
||||
echo "proxy.golang.org reachable, installing via go install..."
|
||||
go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.64.5
|
||||
echo "go install exit: $?"
|
||||
else
|
||||
echo "proxy.golang.org unreachable, trying GitHub releases..."
|
||||
ARCH=$(go env GOARCH) && OS=$(go env GOOS) && VERSION=1.64.5
|
||||
if curl -fsSL --connect-timeout 30 --max-time 120 "https://github.com/golangci/golangci-lint/releases/download/v${VERSION}/golangci-lint-${VERSION}-${OS}-${ARCH}.tar.gz" -o /tmp/golangci-lint.tar.gz 2>/dev/null; then
|
||||
tar -xzf /tmp/golangci-lint.tar.gz -C /tmp
|
||||
install -m 755 /tmp/golangci-lint $(go env GOPATH)/bin/golangci-lint
|
||||
echo "GitHub binary installed"
|
||||
else
|
||||
echo "GitHub releases also unreachable — skipping golangci-lint (go vet is the safety net)"
|
||||
touch "$(go env GOPATH)/bin/golangci-lint.skip"
|
||||
fi
|
||||
fi
|
||||
run: go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.12.2
|
||||
- if: always()
|
||||
name: Run golangci-lint
|
||||
# mc#1099: skip if binary unavailable; go vet already ran as safety net.
|
||||
# timeout: 45m — cold runner disk I/O makes linting slow. The command
|
||||
# --timeout 60m prevents a runaway linter from stalling the step.
|
||||
# continue-on-error: true so a missing binary doesn't fail the job.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 45
|
||||
run: |
|
||||
if [ -f "$(go env GOPATH)/bin/golangci-lint.skip" ]; then
|
||||
echo "golangci-lint skipped (network unavailable on cold runner)"
|
||||
else
|
||||
golangci-lint run --config golangci-coldrunner.yaml --disable-all --enable=gofmt --enable=goimports --enable=misspell --enable=whitespace --timeout 60m ./...
|
||||
fi
|
||||
run: $(go env GOPATH)/bin/golangci-lint run --timeout 3m ./...
|
||||
- if: always()
|
||||
name: Diagnostic — per-package verbose 60s
|
||||
run: |
|
||||
@@ -251,15 +193,11 @@ jobs:
|
||||
continue-on-error: true
|
||||
- if: always()
|
||||
name: Run tests with race detection and coverage
|
||||
# mc#1099: cold runner cache causes OOM kills at ~22m (slower disk I/O
|
||||
# than GitHub Actions). A 60m per-step timeout lets the suite complete
|
||||
# on cold cache (~45m) while failing cleanly instead of OOM-killing.
|
||||
# Warm runners finish in ~12m. Retry with -p 1 on OOM. Job-level
|
||||
# timeout (120m) is the backstop.
|
||||
timeout-minutes: 60
|
||||
run: |
|
||||
go test -race -timeout 60m -coverprofile=coverage.out ./... \
|
||||
|| go test -race -timeout 60m -coverprofile=coverage.out -p 1 ./...
|
||||
# Explicit timeout: cold runner cache causes OOM kills at ~4m39s on the
|
||||
# full ./... suite with race detection + coverage. A 10m per-step timeout
|
||||
# lets the suite complete on cold cache (~5-7m) while failing cleanly
|
||||
# instead of OOM-killing. The job-level timeout (15m) is a backstop.
|
||||
run: go test -race -timeout 10m -coverprofile=coverage.out ./...
|
||||
|
||||
- if: always()
|
||||
name: Per-file coverage report
|
||||
@@ -626,7 +564,7 @@ jobs:
|
||||
#
|
||||
continue-on-error: false
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 55
|
||||
timeout-minutes: 45
|
||||
steps:
|
||||
- name: Wait for required CI contexts
|
||||
env:
|
||||
@@ -658,7 +596,7 @@ jobs:
|
||||
f"CI / Python Lint & Test ({event})",
|
||||
]
|
||||
terminal_bad = {"failure", "error"}
|
||||
deadline = time.time() + 50 * 60
|
||||
deadline = time.time() + 40 * 60
|
||||
last_summary = None
|
||||
|
||||
def fetch_statuses():
|
||||
|
||||
@@ -32,6 +32,12 @@ on:
|
||||
# iterating all open PRs when PR_NUMBER is empty.
|
||||
workflow_dispatch:
|
||||
|
||||
# Cancel stale runs so the 8-runner pool stays available for PR jobs.
|
||||
# Per-SHA group ensures push and cron runs at different SHAs don't cancel each other.
|
||||
concurrency:
|
||||
group: gate-check-v3-${{ github.event.pull_request.head.sha || github.sha }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
# read: contents — for checkout (base ref, not PR head for security)
|
||||
# read: pull-requests — for reading PR info via API
|
||||
|
||||
@@ -162,7 +162,6 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
python -m twine upload \
|
||||
--verbose \
|
||||
--repository pypi \
|
||||
--username __token__ \
|
||||
--password "$PYPI_TOKEN" \
|
||||
|
||||
@@ -44,6 +44,12 @@ on:
|
||||
- ".github/scripts/lint_secret_pattern_drift.py"
|
||||
- ".githooks/pre-commit"
|
||||
|
||||
# Cancel stale runs to keep the 8-runner pool available for PR jobs.
|
||||
# Per-SHA group ensures push and scheduled runs at different SHAs don't cancel each other.
|
||||
concurrency:
|
||||
group: secret-pattern-drift-${{ github.event.pull_request.head.sha || github.sha }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
GITHUB_SERVER_URL: https://git.moleculesai.app
|
||||
|
||||
|
||||
@@ -22,6 +22,11 @@ on:
|
||||
- cron: '17 4 * * 1' # Mondays at 04:17 UTC
|
||||
workflow_dispatch:
|
||||
|
||||
# Cancel stale runs to keep the 8-runner pool available for PR jobs.
|
||||
concurrency:
|
||||
group: weekly-platform-go-${{ github.event.pull_request.head.sha || github.sha }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
statuses: write
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
# golangci-lint configuration for CI cold-runner use.
|
||||
# CLI flags --disable-all --enable=... take precedence over this file.
|
||||
# Only errcheck is disabled here to match .golangci.yaml defaults.
|
||||
linters:
|
||||
disable:
|
||||
- errcheck
|
||||
Reference in New Issue
Block a user