forked from molecule-ai/molecule-core
Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 43e2d24c5b | |||
| 0b840df563 | |||
| bee4f9ea79 | |||
| c1e32ff4a7 | |||
| bac04dc278 | |||
| e16d7eaa08 | |||
| 17f1f30b3f | |||
| 694c05552b | |||
| 948b5a0d89 | |||
| a6d67b4c68 | |||
| d2da0c8d34 |
@@ -94,14 +94,74 @@ jobs:
|
||||
id: ecr-login
|
||||
uses: aws-actions/amazon-ecr-login@v2
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0
|
||||
# docker/setup-buildx-action removed (Task #173, 2026-05-07).
|
||||
# Reason: on Gitea Actions, neither buildx driver works for our
|
||||
# mounted-docker-socket runner topology:
|
||||
# - docker-container driver: spawns a buildkit container that
|
||||
# doesn't share the host's ECR auth (401 Unauthorized on push)
|
||||
# - docker driver: delegates to the operator-host docker daemon,
|
||||
# which doesn't see the runner container's ECR auth either
|
||||
# Plain `docker build` + `docker push` from the runner container
|
||||
# works because both use the same docker socket + the runner's
|
||||
# config.json (populated by `aws ecr get-login-password | docker
|
||||
# login` in the next step). Buildx's value here was only multi-arch
|
||||
# builds, but we only ship linux/amd64 tenant images, so the
|
||||
# complexity isn't earning anything.
|
||||
|
||||
- name: Compute tags
|
||||
id: tags
|
||||
run: |
|
||||
echo "sha=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# Pre-clone manifest deps before docker build (Task #173 fix).
|
||||
#
|
||||
# Why pre-clone: post-2026-05-06, every workspace-template-* repo on
|
||||
# Gitea (codex, crewai, deepagents, gemini-cli, langgraph) plus all
|
||||
# 7 org-template-* repos are private. The pre-fix Dockerfile.tenant
|
||||
# ran `git clone` inside an in-image stage, which had no auth path
|
||||
# — every CI build failed with "fatal: could not read Username for
|
||||
# https://git.moleculesai.app". For weeks, every workspace-server
|
||||
# rebuild required a manual operator-host push. Now we clone in the
|
||||
# trusted CI context (where AUTO_SYNC_TOKEN is naturally available)
|
||||
# and Dockerfile.tenant just COPYs from .tenant-bundle-deps/.
|
||||
#
|
||||
# Token shape: AUTO_SYNC_TOKEN is the devops-engineer persona PAT
|
||||
# (see /etc/molecule-bootstrap/agent-secrets.env). Per saved memory
|
||||
# `feedback_per_agent_gitea_identity_default`, every CI surface uses
|
||||
# a per-persona token, never the founder PAT. clone-manifest.sh
|
||||
# embeds it as basic-auth (oauth2:<token>) for the duration of the
|
||||
# clones, then strips .git directories — the token never enters
|
||||
# the resulting image.
|
||||
#
|
||||
# Idempotent: if a re-run finds populated dirs, clone-manifest.sh
|
||||
# skips them; safe to retrigger via path-filter or workflow_dispatch.
|
||||
- name: Pre-clone manifest deps
|
||||
env:
|
||||
MOLECULE_GITEA_TOKEN: ${{ secrets.AUTO_SYNC_TOKEN }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
if [ -z "${MOLECULE_GITEA_TOKEN}" ]; then
|
||||
echo "::error::AUTO_SYNC_TOKEN secret is empty — register the devops-engineer persona PAT in repo Actions secrets"
|
||||
exit 1
|
||||
fi
|
||||
mkdir -p .tenant-bundle-deps
|
||||
bash scripts/clone-manifest.sh \
|
||||
manifest.json \
|
||||
.tenant-bundle-deps/workspace-configs-templates \
|
||||
.tenant-bundle-deps/org-templates \
|
||||
.tenant-bundle-deps/plugins
|
||||
# Report per-category clone counts so a partial clone is visible in
|
||||
# the job log; this step only echoes them and does not assert on them.
|
||||
ws_count=$(find .tenant-bundle-deps/workspace-configs-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
|
||||
org_count=$(find .tenant-bundle-deps/org-templates -mindepth 1 -maxdepth 1 -type d | wc -l)
|
||||
plugins_count=$(find .tenant-bundle-deps/plugins -mindepth 1 -maxdepth 1 -type d | wc -l)
|
||||
echo "Cloned: ws=$ws_count org=$org_count plugins=$plugins_count"
|
||||
# Counts are derived from manifest.json (9 ws / 7 org / 21
|
||||
# plugins as of 2026-05-07). If manifest.json grows but the
|
||||
# clone step regresses silently, the find above caps at the
|
||||
# actual disk state — but clone-manifest.sh's own EXPECTED vs
|
||||
# CLONED check (line ~95) is the authoritative fail-fast.
|
||||
|
||||
# Canary-gated release flow:
|
||||
# - This step always publishes :staging-<sha> + :staging-latest.
|
||||
# - On staging push, staging-CP picks up :staging-latest immediately
|
||||
@@ -128,58 +188,62 @@ jobs:
|
||||
# that gap. Earlier 2026-04-24 incident: a static :staging-<sha> pin
|
||||
# drifted 10 days behind staging — same class of bug, different
|
||||
# mechanism.
|
||||
- name: Build & push platform image to GHCR (staging-<sha> + staging-latest)
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
|
||||
with:
|
||||
context: .
|
||||
file: ./workspace-server/Dockerfile
|
||||
platforms: linux/amd64
|
||||
push: true
|
||||
tags: |
|
||||
${{ env.IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }}
|
||||
${{ env.IMAGE_NAME }}:staging-latest
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
# GIT_SHA bakes into the Go binary via -ldflags so /buildinfo
|
||||
# returns it at runtime — see Dockerfile + buildinfo/buildinfo.go.
|
||||
# This is the same value as the OCI revision label below; passing
|
||||
# it twice is intentional, the OCI label is for registry tooling
|
||||
# while /buildinfo is for the redeploy verification step.
|
||||
build-args: |
|
||||
GIT_SHA=${{ github.sha }}
|
||||
labels: |
|
||||
org.opencontainers.image.source=https://github.com/${{ github.repository }}
|
||||
org.opencontainers.image.revision=${{ github.sha }}
|
||||
org.opencontainers.image.description=Molecule AI platform (Go API server) — pending canary verify
|
||||
# Build + push platform image with plain `docker` (no buildx).
|
||||
# GIT_SHA bakes into the Go binary via -ldflags so /buildinfo
|
||||
# returns it at runtime — see Dockerfile + buildinfo/buildinfo.go.
|
||||
# The OCI revision label below carries the same value for registry
|
||||
# tooling; the duplication is intentional.
|
||||
- name: Build & push platform image to ECR (staging-<sha> + staging-latest)
|
||||
env:
|
||||
IMAGE_NAME: ${{ env.IMAGE_NAME }}
|
||||
TAG_SHA: staging-${{ steps.tags.outputs.sha }}
|
||||
TAG_LATEST: staging-latest
|
||||
GIT_SHA: ${{ github.sha }}
|
||||
REPO: ${{ github.repository }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
docker build \
|
||||
--file ./workspace-server/Dockerfile \
|
||||
--build-arg GIT_SHA="${GIT_SHA}" \
|
||||
--label "org.opencontainers.image.source=https://github.com/${REPO}" \
|
||||
--label "org.opencontainers.image.revision=${GIT_SHA}" \
|
||||
--label "org.opencontainers.image.description=Molecule AI platform (Go API server) — pending canary verify" \
|
||||
--tag "${IMAGE_NAME}:${TAG_SHA}" \
|
||||
--tag "${IMAGE_NAME}:${TAG_LATEST}" \
|
||||
.
|
||||
docker push "${IMAGE_NAME}:${TAG_SHA}"
|
||||
docker push "${IMAGE_NAME}:${TAG_LATEST}"
|
||||
|
||||
- name: Build & push tenant image to GHCR (staging-<sha> + staging-latest)
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0
|
||||
with:
|
||||
context: .
|
||||
file: ./workspace-server/Dockerfile.tenant
|
||||
platforms: linux/amd64
|
||||
push: true
|
||||
tags: |
|
||||
${{ env.TENANT_IMAGE_NAME }}:staging-${{ steps.tags.outputs.sha }}
|
||||
${{ env.TENANT_IMAGE_NAME }}:staging-latest
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
# Canvas uses same-origin fetches. The tenant Go platform
|
||||
# reverse-proxies /cp/* to the SaaS CP via its CP_UPSTREAM_URL
|
||||
# env; the tenant's /canvas/viewport, /approvals/pending,
|
||||
# /org/templates etc. live on the tenant platform itself.
|
||||
# Both legs share one origin (the tenant subdomain) so
|
||||
# PLATFORM_URL="" forces canvas to fetch paths as relative,
|
||||
# which land same-origin.
|
||||
#
|
||||
# Self-hosted / private-label deployments override this at
|
||||
# build time with a specific backend (e.g. local dev:
|
||||
# NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080).
|
||||
build-args: |
|
||||
NEXT_PUBLIC_PLATFORM_URL=
|
||||
GIT_SHA=${{ github.sha }}
|
||||
labels: |
|
||||
org.opencontainers.image.source=https://github.com/${{ github.repository }}
|
||||
org.opencontainers.image.revision=${{ github.sha }}
|
||||
org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify
|
||||
# Canvas uses same-origin fetches. The tenant Go platform
|
||||
# reverse-proxies /cp/* to the SaaS CP via its CP_UPSTREAM_URL
|
||||
# env; the tenant's /canvas/viewport, /approvals/pending,
|
||||
# /org/templates etc. live on the tenant platform itself.
|
||||
# Both legs share one origin (the tenant subdomain) so
|
||||
# PLATFORM_URL="" forces canvas to fetch paths as relative,
|
||||
# which land same-origin.
|
||||
#
|
||||
# Self-hosted / private-label deployments override this at
|
||||
# build time with a specific backend (e.g. local dev:
|
||||
# NEXT_PUBLIC_PLATFORM_URL=http://localhost:8080).
|
||||
- name: Build & push tenant image to ECR (staging-<sha> + staging-latest)
|
||||
env:
|
||||
TENANT_IMAGE_NAME: ${{ env.TENANT_IMAGE_NAME }}
|
||||
TAG_SHA: staging-${{ steps.tags.outputs.sha }}
|
||||
TAG_LATEST: staging-latest
|
||||
GIT_SHA: ${{ github.sha }}
|
||||
REPO: ${{ github.repository }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
docker build \
|
||||
--file ./workspace-server/Dockerfile.tenant \
|
||||
--build-arg NEXT_PUBLIC_PLATFORM_URL= \
|
||||
--build-arg GIT_SHA="${GIT_SHA}" \
|
||||
--label "org.opencontainers.image.source=https://github.com/${REPO}" \
|
||||
--label "org.opencontainers.image.revision=${GIT_SHA}" \
|
||||
--label "org.opencontainers.image.description=Molecule AI tenant platform + canvas — pending canary verify" \
|
||||
--tag "${TENANT_IMAGE_NAME}:${TAG_SHA}" \
|
||||
--tag "${TENANT_IMAGE_NAME}:${TAG_LATEST}" \
|
||||
.
|
||||
docker push "${TENANT_IMAGE_NAME}:${TAG_SHA}"
|
||||
docker push "${TENANT_IMAGE_NAME}:${TAG_LATEST}"
|
||||
|
||||
|
||||
@@ -131,6 +131,13 @@ backups/
|
||||
# Cloned by publish-workspace-server-image.yml so the Dockerfile's
|
||||
# replace-directive path resolves. Lives in its own repo.
|
||||
/molecule-ai-plugin-github-app-auth/
|
||||
# Tenant-image build context — populated by the workflow's
|
||||
# "Pre-clone manifest deps" step. Mirrors the public manifest, holds the
|
||||
# same content as the three /<>/ dirs above but namespaced under one
|
||||
# parent so the Docker build context is a single COPY-friendly tree.
|
||||
# Each entry is a transient working-dir, never source-of-truth, never
|
||||
# committed.
|
||||
/.tenant-bundle-deps/
|
||||
|
||||
# Internal-flavored content lives in Molecule-AI/internal — NEVER in this
|
||||
# public monorepo. Migrated 2026-04-23 (CEO directive). The CI workflow
|
||||
|
||||
@@ -6,6 +6,29 @@
|
||||
# ./scripts/clone-manifest.sh <manifest.json> <ws-templates-dir> <org-templates-dir> <plugins-dir>
|
||||
#
|
||||
# Requires: git, jq (lighter than python3 — ~2MB vs ~50MB in Alpine)
|
||||
#
|
||||
# Auth (optional):
|
||||
# When MOLECULE_GITEA_TOKEN is set, embed it as the basic-auth password so
|
||||
# private Gitea repos clone successfully. When unset, clone anonymously
|
||||
# (works only for repos that are public on git.moleculesai.app).
|
||||
#
|
||||
# This is the path the publish-workspace-server-image.yml workflow uses:
|
||||
# it injects AUTO_SYNC_TOKEN (devops-engineer persona PAT, repo:read on
|
||||
# the molecule-ai org) so the in-CI pre-clone step succeeds for ALL
|
||||
# manifest entries — including the 5 private workspace-template-* repos
|
||||
# (codex, crewai, deepagents, gemini-cli, langgraph) and all 7
|
||||
# org-template-* repos.
|
||||
#
|
||||
# The token never enters the Docker image: this script runs in the
|
||||
# trusted CI context BEFORE `docker build`, populates
|
||||
# .tenant-bundle-deps/, then `Dockerfile.tenant` COPYs from there with
|
||||
# the .git directories already stripped (see line ~67 below).
|
||||
#
|
||||
# For backward compatibility — and so a fresh clone works without
|
||||
# secrets when (eventually) the workspace-template-* repos flip public —
|
||||
# the unset path remains a plain anonymous HTTPS clone. That path will
|
||||
# FAIL with "could not read Username" on private repos today; CI MUST
|
||||
# set MOLECULE_GITEA_TOKEN.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
@@ -52,11 +75,23 @@ clone_category() {
|
||||
# every manifest entry.
|
||||
repo_gitea="$(echo "$repo" | awk -F/ '{ printf "%s", tolower($1); for (i=2; i<=NF; i++) printf "/%s", $i; print "" }')"
|
||||
|
||||
echo " cloning $repo_gitea -> $target_dir/$name (ref=$ref)"
|
||||
if [ "$ref" = "main" ]; then
|
||||
git clone --depth=1 -q "https://git.moleculesai.app/${repo_gitea}.git" "$target_dir/$name"
|
||||
# Build the clone URL. When MOLECULE_GITEA_TOKEN is set (CI path)
|
||||
# embed it as basic-auth so private repos succeed. The username
|
||||
# part ("oauth2") is conventional and ignored by Gitea — only the
|
||||
# token-as-password is verified.
|
||||
if [ -n "${MOLECULE_GITEA_TOKEN:-}" ]; then
|
||||
clone_url="https://oauth2:${MOLECULE_GITEA_TOKEN}@git.moleculesai.app/${repo_gitea}.git"
|
||||
display_url="https://oauth2:***@git.moleculesai.app/${repo_gitea}.git"
|
||||
else
|
||||
git clone --depth=1 -q --branch "$ref" "https://git.moleculesai.app/${repo_gitea}.git" "$target_dir/$name"
|
||||
clone_url="https://git.moleculesai.app/${repo_gitea}.git"
|
||||
display_url="$clone_url"
|
||||
fi
|
||||
|
||||
echo " cloning $display_url -> $target_dir/$name (ref=$ref)"
|
||||
if [ "$ref" = "main" ]; then
|
||||
git clone --depth=1 -q "$clone_url" "$target_dir/$name"
|
||||
else
|
||||
git clone --depth=1 -q --branch "$ref" "$clone_url" "$target_dir/$name"
|
||||
fi
|
||||
CLONED=$((CLONED + 1))
|
||||
i=$((i + 1))
|
||||
|
||||
+18
-13
@@ -1,7 +1,15 @@
|
||||
# Platform-only image (no canvas). Used by publish-platform-image workflow
|
||||
# for GHCR + Fly registry. Tenant image uses Dockerfile.tenant instead.
|
||||
# Platform-only image (no canvas). Used by publish-workspace-server-image
|
||||
# workflow for ECR. Tenant image uses Dockerfile.tenant instead.
|
||||
#
|
||||
# Build context: repo root.
|
||||
# Templates + plugins are pre-cloned by scripts/clone-manifest.sh (in CI
|
||||
# or on the operator host) into .tenant-bundle-deps/ — same pattern as
|
||||
# Dockerfile.tenant. See that file's header for the full rationale; the
|
||||
# short version is that post-2026-05-06 every workspace-template-* and
|
||||
# org-template-* repo on Gitea is private, so an in-image `git clone`
|
||||
# has no auth path that doesn't leak the Gitea token into a layer.
|
||||
#
|
||||
# Build context: repo root, with `.tenant-bundle-deps/` populated by the
|
||||
# workflow's "Pre-clone manifest deps" step (Task #173).
|
||||
|
||||
FROM golang:1.25-alpine AS builder
|
||||
WORKDIR /app
|
||||
@@ -26,21 +34,18 @@ RUN CGO_ENABLED=0 GOOS=linux go build \
|
||||
-ldflags "-X github.com/Molecule-AI/molecule-monorepo/platform/internal/buildinfo.GitSHA=${GIT_SHA}" \
|
||||
-o /memory-plugin ./cmd/memory-plugin-postgres
|
||||
|
||||
# Clone templates + plugins at build time from manifest.json
|
||||
FROM alpine:3.20 AS templates
|
||||
RUN apk add --no-cache git jq
|
||||
COPY manifest.json /manifest.json
|
||||
COPY scripts/clone-manifest.sh /scripts/clone-manifest.sh
|
||||
RUN chmod +x /scripts/clone-manifest.sh && /scripts/clone-manifest.sh /manifest.json /workspace-configs-templates /org-templates /plugins
|
||||
|
||||
FROM alpine:3.20
|
||||
RUN apk add --no-cache ca-certificates git tzdata wget
|
||||
COPY --from=builder /platform /platform
|
||||
COPY --from=builder /memory-plugin /memory-plugin
|
||||
COPY workspace-server/migrations /migrations
|
||||
COPY --from=templates /workspace-configs-templates /workspace-configs-templates
|
||||
COPY --from=templates /org-templates /org-templates
|
||||
COPY --from=templates /plugins /plugins
|
||||
# Templates + plugins (pre-cloned by scripts/clone-manifest.sh in the
|
||||
# trusted CI / operator-host context, .git already stripped). The Gitea
|
||||
# token used to clone them never enters this image — same shape as
|
||||
# Dockerfile.tenant.
|
||||
COPY .tenant-bundle-deps/workspace-configs-templates /workspace-configs-templates
|
||||
COPY .tenant-bundle-deps/org-templates /org-templates
|
||||
COPY .tenant-bundle-deps/plugins /plugins
|
||||
# Non-root runtime with Docker socket access for workspace provisioning.
|
||||
RUN addgroup -g 1000 platform && adduser -u 1000 -G platform -s /bin/sh -D platform
|
||||
EXPOSE 8080
|
||||
|
||||
@@ -3,14 +3,34 @@
|
||||
# Serves both the API (Go on :8080) and the UI (Node.js on :3000) in a
|
||||
# single container. Go reverse-proxies unknown routes to canvas.
|
||||
#
|
||||
# Templates are cloned from standalone GitHub repos at build time so the
|
||||
# monorepo doesn't need to carry them. The repos are public; no auth.
|
||||
# Templates + plugins are NOT cloned at build time. They are pre-cloned
|
||||
# in the trusted CI context (or operator host) by
|
||||
# `scripts/clone-manifest.sh` into `.tenant-bundle-deps/` and COPYed in.
|
||||
# The reason: post-2026-05-06, every workspace-template-* repo on Gitea
|
||||
# (codex, crewai, deepagents, gemini-cli, langgraph) plus all 7
|
||||
# org-template-* repos are private, so the Docker build can't `git clone`
|
||||
# from inside the build context — there's no auth path that doesn't leak
|
||||
# the Gitea token into an image layer. Pre-cloning keeps the token in
|
||||
# the CI environment only; the resulting image carries the cloned trees
|
||||
# with `.git` already stripped (see clone-manifest.sh).
|
||||
#
|
||||
# Build context: repo root.
|
||||
# Build context: repo root, with `.tenant-bundle-deps/` populated by:
|
||||
#
|
||||
# MOLECULE_GITEA_TOKEN=<persona-PAT> scripts/clone-manifest.sh \
|
||||
# manifest.json \
|
||||
# .tenant-bundle-deps/workspace-configs-templates \
|
||||
# .tenant-bundle-deps/org-templates \
|
||||
# .tenant-bundle-deps/plugins
|
||||
#
|
||||
# In CI this happens in publish-workspace-server-image.yml's "Pre-clone
|
||||
# manifest deps" step (uses AUTO_SYNC_TOKEN = devops-engineer persona).
|
||||
# For a manual operator-host build, source the same token from
|
||||
# /etc/molecule-bootstrap/agent-secrets.env first.
|
||||
#
|
||||
# docker buildx build --platform linux/amd64 \
|
||||
# -f workspace-server/Dockerfile.tenant \
|
||||
# -t registry.fly.io/molecule-tenant:latest \
|
||||
# -t <ECR>/molecule-ai/platform-tenant:latest \
|
||||
# --build-arg GIT_SHA=<sha> --build-arg NEXT_PUBLIC_PLATFORM_URL= \
|
||||
# --push .
|
||||
|
||||
# ── Stage 1: Go platform binary ──────────────────────────────────────
|
||||
@@ -55,14 +75,7 @@ ENV NEXT_PUBLIC_PLATFORM_URL=$NEXT_PUBLIC_PLATFORM_URL
|
||||
ENV NEXT_PUBLIC_WS_URL=$NEXT_PUBLIC_WS_URL
|
||||
RUN npm run build
|
||||
|
||||
# ── Stage 3: Clone templates + plugins from manifest.json ─────────────
|
||||
FROM alpine:3.20 AS templates
|
||||
RUN apk add --no-cache git jq
|
||||
COPY manifest.json /manifest.json
|
||||
COPY scripts/clone-manifest.sh /scripts/clone-manifest.sh
|
||||
RUN chmod +x /scripts/clone-manifest.sh && /scripts/clone-manifest.sh /manifest.json /workspace-configs-templates /org-templates /plugins
|
||||
|
||||
# ── Stage 4: Runtime ──────────────────────────────────────────────────
|
||||
# ── Stage 3: Runtime ──────────────────────────────────────────────────
|
||||
FROM node:20-alpine
|
||||
RUN apk add --no-cache ca-certificates git tzdata openssh-client aws-cli
|
||||
|
||||
@@ -87,10 +100,13 @@ COPY --from=go-builder /platform /platform
|
||||
COPY --from=go-builder /memory-plugin /memory-plugin
|
||||
COPY workspace-server/migrations /migrations
|
||||
|
||||
# Templates + plugins (cloned from GitHub in stage 3)
|
||||
COPY --from=templates /workspace-configs-templates /workspace-configs-templates
|
||||
COPY --from=templates /org-templates /org-templates
|
||||
COPY --from=templates /plugins /plugins
|
||||
# Templates + plugins (pre-cloned by scripts/clone-manifest.sh in the
|
||||
# trusted CI / operator-host context, .git already stripped — see
|
||||
# .tenant-bundle-deps/ in the build context). The Gitea token used to
|
||||
# clone them never enters this image.
|
||||
COPY .tenant-bundle-deps/workspace-configs-templates /workspace-configs-templates
|
||||
COPY .tenant-bundle-deps/org-templates /org-templates
|
||||
COPY .tenant-bundle-deps/plugins /plugins
|
||||
|
||||
# Canvas standalone
|
||||
WORKDIR /canvas
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
@@ -15,6 +16,42 @@ func resetRestartStatesFor(workspaceID string) {
|
||||
restartStates.Delete(workspaceID)
|
||||
}
|
||||
|
||||
// drainCoalesceGoroutine spawns `coalesceRestart(wsID, cycle)` on a
|
||||
// goroutine that mirrors the real production caller shape
|
||||
// (`go h.RestartByID(...)` from a2a_proxy.go, a2a_proxy_helpers.go,
|
||||
// main.go), and registers a t.Cleanup that blocks until the goroutine
|
||||
// has TERMINATED — not just panicked-and-recovered, fully exited.
|
||||
//
|
||||
// This is the bleed-prevention contract for Class H (Task #170): no
|
||||
// test in this file may declare itself complete while a coalesceRestart
|
||||
// goroutine it spawned is still alive, because that goroutine could
|
||||
// otherwise wake up after the test's sqlmock has been closed and
|
||||
// either:
|
||||
// - issue a stale INSERT that gets attributed to the next test's
|
||||
// sqlmock connection — surfaces as
|
||||
// "INSERT-not-expected for kind=DELEGATION_FAILED" / =WORKSPACE_PROVISION_FAILED
|
||||
// in a neighbour test that doesn't itself touch coalesceRestart; or
|
||||
// - hold a reference to the closed *sql.DB and panic on the next op.
|
||||
//
|
||||
// Implementation notes:
|
||||
// - sync.WaitGroup must be Add()ed BEFORE the goroutine is spawned;
|
||||
// Add inside the goroutine races with Wait.
|
||||
// - t.Cleanup runs in LIFO order, so this composes safely with other
|
||||
// cleanups (e.g. setupTestDB's mockDB.Close).
|
||||
// - We don't bound the Wait with a timeout — if the goroutine
|
||||
// genuinely deadlocks, the whole test process should hang and fail
|
||||
// under -timeout. A timeout-then-orphan would mask the bleed.
|
||||
func drainCoalesceGoroutine(t *testing.T, wsID string, cycle func()) {
|
||||
t.Helper()
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
coalesceRestart(wsID, cycle)
|
||||
}()
|
||||
t.Cleanup(wg.Wait)
|
||||
}
|
||||
|
||||
// TestCoalesceRestart_SingleCallRunsOneCycle is the baseline:
|
||||
// no concurrency, one cycle. If this fails the gate logic is broken at
|
||||
// its simplest path.
|
||||
@@ -200,19 +237,45 @@ func TestCoalesceRestart_PanicInCycleClearsState(t *testing.T) {
|
||||
const wsID = "test-coalesce-panic-recovery"
|
||||
resetRestartStatesFor(wsID)
|
||||
|
||||
// First call's cycle panics. coalesceRestart's defer must swallow
|
||||
// the panic so this test caller doesn't see it propagate up — that
|
||||
// matches what the real production caller (`go h.RestartByID(...)`)
|
||||
// gets: the goroutine survives, no process crash.
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
t.Errorf("panic should NOT propagate out of coalesceRestart (would crash the platform process from a goroutine), got: %v", r)
|
||||
// Spawn the panicking cycle on a goroutine via drainCoalesceGoroutine
|
||||
// — this mirrors the real production callsite shape
|
||||
// (`go h.RestartByID(...)` from a2a_proxy.go:584,
|
||||
// a2a_proxy_helpers.go:197, main.go:213). The previous form called
|
||||
// coalesceRestart synchronously, which neither exercised the
|
||||
// goroutine-survival contract nor caught Class H bleed regressions
|
||||
// where the panic-recovery goroutine outlives the test and pollutes
|
||||
// the next test's sqlmock with INSERTs from runRestartCycle's
|
||||
// LogActivity calls (kinds DELEGATION_FAILED / WORKSPACE_PROVISION_FAILED).
|
||||
//
|
||||
// drainCoalesceGoroutine registers a t.Cleanup that Wait()s for the
|
||||
// goroutine to TERMINATE — not merely panic-and-recover — before
|
||||
// the test ends.
|
||||
drainCoalesceGoroutine(t, wsID, func() { panic("simulated cycle failure") })
|
||||
|
||||
// We need a mid-test barrier (not just the t.Cleanup-time barrier)
|
||||
// so the second coalesceRestart below sees state.running=false. The
|
||||
// goroutine clears state.running inside its deferred recover; poll
|
||||
// the package-level restartStates map until that observable flip
|
||||
// happens. Bound at 2s — longer = real bug.
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
sv, ok := restartStates.Load(wsID)
|
||||
if ok {
|
||||
st := sv.(*restartState)
|
||||
st.mu.Lock()
|
||||
running := st.running
|
||||
st.mu.Unlock()
|
||||
if !running {
|
||||
break
|
||||
}
|
||||
}
|
||||
}()
|
||||
coalesceRestart(wsID, func() { panic("simulated cycle failure") })
|
||||
time.Sleep(time.Millisecond)
|
||||
}
|
||||
|
||||
// Second call must run a fresh cycle. If running stayed true after
|
||||
// the panic, this call would early-return without invoking cycle.
|
||||
// Synchronous — no panic, so no goroutine to drain, and we want to
|
||||
// assert ran.Load() immediately after.
|
||||
var ran atomic.Bool
|
||||
coalesceRestart(wsID, func() { ran.Store(true) })
|
||||
if !ran.Load() {
|
||||
@@ -220,6 +283,98 @@ func TestCoalesceRestart_PanicInCycleClearsState(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestCoalesceRestart_DrainHelperWaitsForGoroutineExit is the Class H
|
||||
// regression guard for Task #170. It asserts the contract enforced by
|
||||
// drainCoalesceGoroutine: t.Cleanup blocks until the spawned
|
||||
// coalesceRestart goroutine has FULLY EXITED — not merely recovered
|
||||
// from panic. This is the contract that prevents stale LogActivity
|
||||
// INSERTs from a recovering goroutine bleeding into the next test's
|
||||
// sqlmock (the failure mode reported as "INSERT-not-expected for
|
||||
// kind=DELEGATION_FAILED" in TestPooledWithEICTunnel_PreservesFnErr).
|
||||
//
|
||||
// We use a deterministic bleed-shape probe rather than goroutine-count
|
||||
// arithmetic: the cycle sleeps for blockFor (150ms) — long
|
||||
// enough that without a Wait barrier, the outer sub-test would return
|
||||
// before the goroutine exited. We then verify the wg.Wait inside
|
||||
// drainCoalesceGoroutine actually delayed t.Run's completion: total
|
||||
// elapsed must be >= the block duration. Asserts exact-shape, not
|
||||
// substring (per saved-memory feedback_assert_exact_not_substring):
|
||||
// elapsed < blockFor would mean the cleanup didn't wait, which is the
|
||||
// exact bleed we're guarding against.
|
||||
//
|
||||
// We additionally panic from the cycle (after the block) to confirm
|
||||
// the helper waits past panic recovery, not just past cycle return.
|
||||
func TestCoalesceRestart_DrainHelperWaitsForGoroutineExit(t *testing.T) {
|
||||
const blockFor = 150 * time.Millisecond
|
||||
const wsID = "test-coalesce-drain-helper-contract"
|
||||
resetRestartStatesFor(wsID)
|
||||
|
||||
// exited is closed by a defer inside the cycle. That deferred close
|
||||
// runs after the sleep, while the cycle's panic is unwinding and
|
||||
// before coalesceRestart's deferred recover catches it, so a closed
|
||||
// channel shows the cycle goroutine ran and the cycle func exited.
|
||||
exited := make(chan struct{})
|
||||
|
||||
subStart := time.Now()
|
||||
t.Run("drain_under_subtest", func(st *testing.T) {
|
||||
drainCoalesceGoroutine(st, wsID, func() {
|
||||
defer close(exited)
|
||||
time.Sleep(blockFor)
|
||||
panic("contract-test panic-after-block")
|
||||
})
|
||||
// st.Cleanup runs here, before t.Run returns. wg.Wait must
|
||||
// block until the goroutine has finished its panic recovery.
|
||||
})
|
||||
subElapsed := time.Since(subStart)
|
||||
|
||||
// Contract: the helper's wg.Wait MUST have blocked t.Run from
|
||||
// returning until after the cycle's block + panic recovery.
|
||||
if subElapsed < blockFor {
|
||||
t.Fatalf(
|
||||
"drainCoalesceGoroutine contract violated: t.Run returned in %v, "+
|
||||
"but cycle blocks for %v. The Wait barrier is broken — a "+
|
||||
"coalesceRestart goroutine can outlive its test's t.Cleanup "+
|
||||
"and pollute neighbour-test sqlmock state (Class H bleed).",
|
||||
subElapsed, blockFor,
|
||||
)
|
||||
}
|
||||
|
||||
// And the goroutine must have actually closed `exited` (i.e. ran
|
||||
// the deferred close before panic propagated through coalesceRestart's
|
||||
// recover). If exited is still open here, the goroutine never
|
||||
// reached the close — meaning either the panic short-circuited the
|
||||
// defer (Go runtime bug — won't happen) or the goroutine never
|
||||
// ran at all (drainCoalesceGoroutine spawn shape regressed).
|
||||
select {
|
||||
case <-exited:
|
||||
// Correct path.
|
||||
default:
|
||||
t.Fatal("cycle goroutine never reached its deferred close — panic-recovery contract regressed")
|
||||
}
|
||||
|
||||
// Belt-and-suspenders: the post-recover state-clear must have
|
||||
// flipped state.running back to false. If this fails, the panic
|
||||
// path skipped the deferred state-clear in coalesceRestart.
|
||||
sv, ok := restartStates.Load(wsID)
|
||||
if !ok {
|
||||
t.Fatal("restartStates entry missing for wsID after cycle — sync.Map regression")
|
||||
}
|
||||
st := sv.(*restartState)
|
||||
st.mu.Lock()
|
||||
running := st.running
|
||||
st.mu.Unlock()
|
||||
if running {
|
||||
t.Error("state.running was not cleared after panic — sticky-running deadlock regressed")
|
||||
}
|
||||
|
||||
// Reference runtime.NumGoroutine to keep the runtime import
|
||||
// honest — also a useful smoke check that the goroutine count
|
||||
// hasn't ballooned 10x while debugging this test.
|
||||
if n := runtime.NumGoroutine(); n > 200 {
|
||||
t.Logf("warning: NumGoroutine=%d after drain — high but not necessarily a leak", n)
|
||||
}
|
||||
}
|
||||
|
||||
// TestCoalesceRestart_DifferentWorkspacesDoNotSerialize verifies the
|
||||
// per-workspace state map: an in-flight restart for ws A must not
|
||||
// block restarts for ws B. Important for performance — without this,
|
||||
|
||||
Reference in New Issue
Block a user