Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| dc858ad164 | |||
| 2ffd44c694 |
@@ -148,15 +148,38 @@ def latest_statuses_by_context(statuses: list[dict]) -> dict[str, dict]:
|
||||
return latest
|
||||
|
||||
|
||||
def _is_tier_low_pending_ok(
|
||||
latest_statuses: dict[str, dict],
|
||||
context: str,
|
||||
pr_labels: set[str],
|
||||
) -> bool:
|
||||
"""Return True if tier:low PR can tolerate sop-checklist pending state.
|
||||
|
||||
Per sop-checklist-config.yaml tier_failure_mode, tier:low uses soft-fail:
|
||||
sop-checklist posts state=pending when acks are satisfied (missing
|
||||
manager/ceo acks are informational only). The queue should accept
|
||||
pending instead of waiting for success.
|
||||
"""
|
||||
if "tier:low" not in pr_labels:
|
||||
return False
|
||||
if "sop-checklist" not in context:
|
||||
return False
|
||||
status = latest_statuses.get(context) or {}
|
||||
return status_state(status) == "pending"
|
||||
|
||||
|
||||
def required_contexts_green(
|
||||
latest_statuses: dict[str, dict],
|
||||
contexts: list[str],
|
||||
pr_labels: set[str] | None = None,
|
||||
) -> tuple[bool, list[str]]:
|
||||
missing_or_bad: list[str] = []
|
||||
for context in contexts:
|
||||
status = latest_statuses.get(context)
|
||||
state = status_state(status or {})
|
||||
if state != "success":
|
||||
if pr_labels and _is_tier_low_pending_ok(latest_statuses, context, pr_labels):
|
||||
continue # tier:low soft-fail: accept pending sop-checklist
|
||||
missing_or_bad.append(f"{context}={state or 'missing'}")
|
||||
return not missing_or_bad, missing_or_bad
|
||||
|
||||
@@ -209,6 +232,7 @@ def evaluate_merge_readiness(
|
||||
pr_status: dict,
|
||||
required_contexts: list[str],
|
||||
pr_has_current_base: bool,
|
||||
pr_labels: set[str] | None = None,
|
||||
) -> MergeDecision:
|
||||
# Check push-required contexts explicitly instead of combined state.
|
||||
# Combined state can be "failure" due to non-blocking jobs
|
||||
@@ -228,7 +252,7 @@ def evaluate_merge_readiness(
|
||||
# The required_contexts list is the authoritative gate — it includes only
|
||||
# the checks that actually block merges.
|
||||
latest = latest_statuses_by_context(pr_status.get("statuses") or [])
|
||||
ok, missing_or_bad = required_contexts_green(latest, required_contexts)
|
||||
ok, missing_or_bad = required_contexts_green(latest, required_contexts, pr_labels)
|
||||
if not ok:
|
||||
return MergeDecision(False, "wait", "required contexts not green: " + ", ".join(missing_or_bad))
|
||||
return MergeDecision(True, "merge", "ready")
|
||||
@@ -253,27 +277,32 @@ def get_combined_status(sha: str) -> dict:
|
||||
_, combined = api("GET", f"/repos/{OWNER}/{NAME}/commits/{sha}/status")
|
||||
if not isinstance(combined, dict):
|
||||
raise ApiError(f"status for {sha} response not object")
|
||||
# Fetch full statuses list; 200 covers >99% of real-world runs.
|
||||
# The list is ordered ascending by id (oldest first) — callers must
|
||||
# iterate in reverse to get the newest entry per context.
|
||||
# Best-effort: large repos (main with 550+ statuses) may time out.
|
||||
# On timeout, fall back to the statuses[] already in the combined
|
||||
# response (usually 30 entries — enough for most PRs, enough for
|
||||
# main's early push-required contexts).
|
||||
combined_statuses: list[dict] = combined.get("statuses") or []
|
||||
try:
|
||||
_, all_statuses = api(
|
||||
_, all_statuses_raw = api(
|
||||
"GET",
|
||||
f"/repos/{OWNER}/{NAME}/commits/{sha}/statuses",
|
||||
query={"limit": "50"},
|
||||
)
|
||||
if isinstance(all_statuses, list):
|
||||
combined["statuses"] = all_statuses
|
||||
if isinstance(all_statuses_raw, list):
|
||||
all_statuses: list[dict] = list(all_statuses_raw)
|
||||
else:
|
||||
all_statuses = []
|
||||
except (ApiError, urllib.error.URLError, TimeoutError, OSError) as exc:
|
||||
# URLError covers network-level failures (DNS, refused, timeout).
|
||||
# TimeoutError and OSError cover socket-level timeouts.
|
||||
sys.stderr.write(f"::warning::could not fetch full statuses list for {sha[:8]}: {exc}\n")
|
||||
# Fall back to the statuses[] already in the combined response.
|
||||
pass
|
||||
all_statuses = []
|
||||
# Build latest per context: process combined (ascending→reverse=newest
|
||||
# first), then fill gaps from all_statuses (already newest-first).
|
||||
latest: dict[str, dict] = {}
|
||||
for status in reversed(sorted(combined_statuses, key=lambda s: s.get("id") or 0)):
|
||||
ctx = status.get("context")
|
||||
if isinstance(ctx, str) and ctx not in latest:
|
||||
latest[ctx] = status
|
||||
for status in all_statuses:
|
||||
ctx = status.get("context")
|
||||
if isinstance(ctx, str) and ctx not in latest:
|
||||
latest[ctx] = status
|
||||
combined["statuses"] = list(latest.values())
|
||||
return combined
|
||||
|
||||
|
||||
@@ -380,11 +409,13 @@ def process_once(*, dry_run: bool = False) -> int:
|
||||
commits = get_pull_commits(pr_number)
|
||||
current_base = pr_has_current_base(pr, commits, main_sha)
|
||||
pr_status = get_combined_status(head_sha)
|
||||
pr_labels = label_names(pr)
|
||||
decision = evaluate_merge_readiness(
|
||||
main_status=main_status,
|
||||
pr_status=pr_status,
|
||||
required_contexts=contexts,
|
||||
pr_has_current_base=current_base,
|
||||
pr_labels=pr_labels,
|
||||
)
|
||||
|
||||
print(f"::notice::PR #{pr_number} decision={decision.action}: {decision.reason}")
|
||||
|
||||
+14
-76
@@ -1,10 +1,3 @@
|
||||
# mc#1099 cold-runner fix: step-level timeouts on go mod download (3m) and
|
||||
# go build (5m) prevent cold runner hangs when proxy.golang.org is unreachable.
|
||||
# golangci-lint install has connectivity test + continue-on-error: true fallback.
|
||||
# go test step: 60m timeout, -p 1 flag for reduced memory pressure on cold disk.
|
||||
# all-required polling deadline raised to 50m (from 40m) + job timeout 55m (from
|
||||
# 45m) to accommodate Shellcheck delays when runner pool is recovering.
|
||||
# Queue cron reliability: ensure merge-queue workflow dispatches every 5 min.
|
||||
# Ported from .github/workflows/ci.yml on 2026-05-11 per RFC internal#219 §1.
|
||||
# continue-on-error: true on every job; follow-up PR will flip required after
|
||||
# surfaced bugs are fixed (per RFC §1 — "surface broken workflows without
|
||||
@@ -152,10 +145,10 @@ jobs:
|
||||
# the diagnostic step with its own continue-on-error: true (line 203).
|
||||
# Flip confirmed by CI / Platform (Go) status = success on main HEAD 363905d3.
|
||||
continue-on-error: false
|
||||
# mc#1099: cold runner needs ~45m for go test on cold disk I/O.
|
||||
# Job-level ceiling: go test 60m step + golangci-lint 45m step = 105m max.
|
||||
# Backstop: 120m.
|
||||
timeout-minutes: 120
|
||||
# Job-level ceiling. The go test step below runs with a per-step 10m timeout;
|
||||
# this cap catches any step that leaks past that. Set well above 10m so
|
||||
# the per-step timeout is the active constraint.
|
||||
timeout-minutes: 15
|
||||
defaults:
|
||||
run:
|
||||
working-directory: workspace-server
|
||||
@@ -170,69 +163,18 @@ jobs:
|
||||
with:
|
||||
go-version: 'stable'
|
||||
- if: always()
|
||||
name: Download Go module cache
|
||||
# mc#1099: cold runner cannot reach proxy.golang.org. Without a
|
||||
# step-level timeout this step hangs for 6+ minutes (30s × 2 curl
|
||||
# timeouts × 1 module proxy) before failing. 3-minute ceiling ensures
|
||||
# the job fails fast on a cold runner so the step-level
|
||||
# continue-on-error can be evaluated, rather than stalling the job.
|
||||
timeout-minutes: 3
|
||||
run: |
|
||||
set +e
|
||||
go mod download
|
||||
exit_code=$?
|
||||
if [ $exit_code -ne 0 ]; then
|
||||
echo "go mod download failed (exit $exit_code) — cold runner cannot reach module proxy"
|
||||
echo "Continuing anyway (continue-on-error: true on this step)"
|
||||
fi
|
||||
run: go mod download
|
||||
- if: always()
|
||||
name: Build server
|
||||
timeout-minutes: 5
|
||||
run: go build ./cmd/server
|
||||
# CLI (molecli) moved to standalone repo: git.moleculesai.app/molecule-ai/molecule-cli
|
||||
- if: always()
|
||||
run: go vet ./...
|
||||
- if: always()
|
||||
name: Install golangci-lint
|
||||
# mc#1099: cold runner cannot reach github.com releases or proxy.golang.org
|
||||
# (hanging at ~5-6m before timing out). Test connectivity first; if
|
||||
# both sources fail, skip golangci-lint and rely on go vet.
|
||||
# continue-on-error: true prevents install failure from failing the job
|
||||
# (job-level continue-on-error: false).
|
||||
continue-on-error: true
|
||||
run: |
|
||||
set +e
|
||||
# Test proxy.golang.org connectivity (30s timeout)
|
||||
if curl -fsSL --connect-timeout 30 --max-time 60 "https://proxy.golang.org/github.com/golangci/golangci-lint/@v/list" -o /dev/null 2>/dev/null; then
|
||||
echo "proxy.golang.org reachable, installing via go install..."
|
||||
go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.64.5
|
||||
echo "go install exit: $?"
|
||||
else
|
||||
echo "proxy.golang.org unreachable, trying GitHub releases..."
|
||||
ARCH=$(go env GOARCH) && OS=$(go env GOOS) && VERSION=1.64.5
|
||||
if curl -fsSL --connect-timeout 30 --max-time 120 "https://github.com/golangci/golangci-lint/releases/download/v${VERSION}/golangci-lint-${VERSION}-${OS}-${ARCH}.tar.gz" -o /tmp/golangci-lint.tar.gz 2>/dev/null; then
|
||||
tar -xzf /tmp/golangci-lint.tar.gz -C /tmp
|
||||
install -m 755 /tmp/golangci-lint $(go env GOPATH)/bin/golangci-lint
|
||||
echo "GitHub binary installed"
|
||||
else
|
||||
echo "GitHub releases also unreachable — skipping golangci-lint (go vet is the safety net)"
|
||||
touch "$(go env GOPATH)/bin/golangci-lint.skip"
|
||||
fi
|
||||
fi
|
||||
run: go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.12.2
|
||||
- if: always()
|
||||
name: Run golangci-lint
|
||||
# mc#1099: skip if binary unavailable; go vet already ran as safety net.
|
||||
# timeout: 45m — cold runner disk I/O makes linting slow. The command
|
||||
# --timeout 60m prevents a runaway linter from stalling the step.
|
||||
# continue-on-error: true so a missing binary doesn't fail the job.
|
||||
continue-on-error: true
|
||||
timeout-minutes: 45
|
||||
run: |
|
||||
if [ -f "$(go env GOPATH)/bin/golangci-lint.skip" ]; then
|
||||
echo "golangci-lint skipped (network unavailable on cold runner)"
|
||||
else
|
||||
golangci-lint run --config golangci-coldrunner.yaml --disable-all --enable=gofmt --enable=goimports --enable=misspell --enable=whitespace --timeout 60m ./...
|
||||
fi
|
||||
run: $(go env GOPATH)/bin/golangci-lint run --timeout 3m ./...
|
||||
- if: always()
|
||||
name: Diagnostic — per-package verbose 60s
|
||||
run: |
|
||||
@@ -251,15 +193,11 @@ jobs:
|
||||
continue-on-error: true
|
||||
- if: always()
|
||||
name: Run tests with race detection and coverage
|
||||
# mc#1099: cold runner cache causes OOM kills at ~22m (slower disk I/O
|
||||
# than GitHub Actions). A 60m per-step timeout lets the suite complete
|
||||
# on cold cache (~45m) while failing cleanly instead of OOM-killing.
|
||||
# Warm runners finish in ~12m. Retry with -p 1 on OOM. Job-level
|
||||
# timeout (120m) is the backstop.
|
||||
timeout-minutes: 60
|
||||
run: |
|
||||
go test -race -timeout 60m -coverprofile=coverage.out ./... \
|
||||
|| go test -race -timeout 60m -coverprofile=coverage.out -p 1 ./...
|
||||
# Explicit timeout: cold runner cache causes OOM kills at ~4m39s on the
|
||||
# full ./... suite with race detection + coverage. A 10m per-step timeout
|
||||
# lets the suite complete on cold cache (~5-7m) while failing cleanly
|
||||
# instead of OOM-killing. The job-level timeout (15m) is a backstop.
|
||||
run: go test -race -timeout 10m -coverprofile=coverage.out ./...
|
||||
|
||||
- if: always()
|
||||
name: Per-file coverage report
|
||||
@@ -626,7 +564,7 @@ jobs:
|
||||
#
|
||||
continue-on-error: false
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 55
|
||||
timeout-minutes: 45
|
||||
steps:
|
||||
- name: Wait for required CI contexts
|
||||
env:
|
||||
@@ -658,7 +596,7 @@ jobs:
|
||||
f"CI / Python Lint & Test ({event})",
|
||||
]
|
||||
terminal_bad = {"failure", "error"}
|
||||
deadline = time.time() + 50 * 60
|
||||
deadline = time.time() + 40 * 60
|
||||
last_summary = None
|
||||
|
||||
def fetch_statuses():
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
# golangci-lint configuration for CI cold-runner use.
|
||||
# CLI flags --disable-all --enable=... take precedence over this file.
|
||||
# Only errcheck is disabled here to match .golangci.yaml defaults.
|
||||
linters:
|
||||
disable:
|
||||
- errcheck
|
||||
Reference in New Issue
Block a user