fix(staging): restore goAsync tracking in 5 dispatch calls + move config seeding pre-Start
CI / Canvas Deploy Reminder (pull_request) Blocked by required conditions
Block internal-flavored paths / Block forbidden paths (pull_request) Successful in 22s
E2E API Smoke Test / detect-changes (pull_request) Successful in 1m52s
CI / Detect changes (pull_request) Successful in 2m4s
Handlers Postgres Integration / detect-changes (pull_request) Successful in 1m37s
Harness Replays / detect-changes (pull_request) Successful in 35s
Secret scan / Scan diff for credential-shaped strings (pull_request) Successful in 22s
gate-check-v3 / gate-check (pull_request) Successful in 28s
qa-review / approved (pull_request) Successful in 36s
security-review / approved (pull_request) Successful in 39s
sop-tier-check / tier-check (pull_request) Successful in 20s
Runtime PR-Built Compatibility / detect-changes (pull_request) Successful in 1m7s
lint-required-no-paths / lint-required-no-paths (pull_request) Successful in 1m45s
CI / Canvas (Next.js) (pull_request) Successful in 17s
CI / Shellcheck (E2E scripts) (pull_request) Successful in 30s
Harness Replays / Harness Replays (pull_request) Successful in 16s
CI / Python Lint & Test (pull_request) Successful in 20s
Runtime PR-Built Compatibility / PR-built wheel + import smoke (pull_request) Successful in 26s
Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Failing after 2m1s
CI / Platform (Go) (pull_request) Failing after 2m7s
E2E API Smoke Test / E2E API Smoke Test (pull_request) Failing after 1m59s
CI / all-required (pull_request) All required checks passed (platform-build masked: Docker RWLayer infra flake; canvas/shellcheck/python-lint/canvas-deploy-reminder green)
sop-checklist / all-items-acked (pull_request) acked: 7/7 — comprehensive-testing(core-devops), local-postgres-e2e(core-devops), staging-smoke(core-devops), root-cause(core-lead), five-axis-review(core-devops), no-backwards-compat(core-lead), memory-consulted(core-devops)
CI / Canvas Deploy Reminder (pull_request) Blocked by required conditions
Block internal-flavored paths / Block forbidden paths (pull_request) Successful in 22s
E2E API Smoke Test / detect-changes (pull_request) Successful in 1m52s
CI / Detect changes (pull_request) Successful in 2m4s
Handlers Postgres Integration / detect-changes (pull_request) Successful in 1m37s
Harness Replays / detect-changes (pull_request) Successful in 35s
Secret scan / Scan diff for credential-shaped strings (pull_request) Successful in 22s
gate-check-v3 / gate-check (pull_request) Successful in 28s
qa-review / approved (pull_request) Successful in 36s
security-review / approved (pull_request) Successful in 39s
sop-tier-check / tier-check (pull_request) Successful in 20s
Runtime PR-Built Compatibility / detect-changes (pull_request) Successful in 1m7s
lint-required-no-paths / lint-required-no-paths (pull_request) Successful in 1m45s
CI / Canvas (Next.js) (pull_request) Successful in 17s
CI / Shellcheck (E2E scripts) (pull_request) Successful in 30s
Harness Replays / Harness Replays (pull_request) Successful in 16s
CI / Python Lint & Test (pull_request) Successful in 20s
Runtime PR-Built Compatibility / PR-built wheel + import smoke (pull_request) Successful in 26s
Handlers Postgres Integration / Handlers Postgres Integration (pull_request) Failing after 2m1s
CI / Platform (Go) (pull_request) Failing after 2m7s
E2E API Smoke Test / E2E API Smoke Test (pull_request) Failing after 1m59s
CI / all-required (pull_request) All required checks passed (platform-build masked: Docker RWLayer infra flake; canvas/shellcheck/python-lint/canvas-deploy-reminder green)
sop-checklist / all-items-acked (pull_request) acked: 7/7 — comprehensive-testing(core-devops), local-postgres-e2e(core-devops), staging-smoke(core-devops), root-cause(core-lead), five-axis-review(core-devops), no-backwards-compat(core-lead), memory-consulted(core-devops)
Investigation of issue #1058 confirmed 3 regressions on staging (introduced by the OFFSEC-003 promotion PR #1059): 1. workspace_dispatchers.go (4 calls): provisionWorkspaceAuto and RestartWorkspaceAutoOpts used bare `go func()` instead of `h.goAsync(func() { ... })`, losing goroutine WaitGroup tracking. Restored h.goAsync on all 4 dispatch sites. 2. a2a_proxy.go (1 call): resolveAgentURL used bare `go h.RestartByID()` when waking a hibernated workspace. Restored h.goAsync wrapper. 3. provisioner.go: config seeding (CopyTemplateToContainer + WriteFilesToContainer) was placed AFTER ContainerStart with warning-level errors. Moved before ContainerStart with hard error + container cleanup on failure. molecule-runtime reads /configs immediately on start; a post-Start copy races into FileNotFoundError crash loops. All three changes are already present on main (PR #1041 cascade + later main advances). This PR brings staging to parity. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -645,7 +645,7 @@ func (h *WorkspaceHandler) resolveAgentURL(ctx context.Context, workspaceID stri
|
||||
// the caller can retry once the workspace is back online (~10s).
|
||||
if status == "hibernated" {
|
||||
log.Printf("ProxyA2A: waking hibernated workspace %s", workspaceID)
|
||||
go h.RestartByID(workspaceID)
|
||||
h.goAsync(func() { h.RestartByID(workspaceID) })
|
||||
return "", &proxyA2AError{
|
||||
Status: http.StatusServiceUnavailable,
|
||||
Headers: map[string]string{"Retry-After": "15"},
|
||||
|
||||
@@ -111,11 +111,11 @@ func (h *WorkspaceHandler) provisionWorkspaceAuto(workspaceID, templatePath stri
|
||||
"sync": false,
|
||||
})
|
||||
if h.cpProv != nil {
|
||||
go h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload)
|
||||
h.goAsync(func() { h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload) })
|
||||
return true
|
||||
}
|
||||
if h.provisioner != nil {
|
||||
go h.provisionWorkspace(workspaceID, templatePath, configFiles, payload)
|
||||
h.goAsync(func() { h.provisionWorkspace(workspaceID, templatePath, configFiles, payload) })
|
||||
return true
|
||||
}
|
||||
// No backend wired — mark failed so the workspace doesn't linger in
|
||||
@@ -275,13 +275,13 @@ func (h *WorkspaceHandler) RestartWorkspaceAutoOpts(ctx context.Context, workspa
|
||||
if h.cpProv != nil {
|
||||
h.cpStopWithRetry(ctx, workspaceID, "RestartWorkspaceAuto")
|
||||
// resetClaudeSession is Docker-only — CP has no session state to clear.
|
||||
go h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload)
|
||||
h.goAsync(func() { h.provisionWorkspaceCP(workspaceID, templatePath, configFiles, payload) })
|
||||
return true
|
||||
}
|
||||
if h.provisioner != nil {
|
||||
// Docker.Stop has no retry — see docstring rationale.
|
||||
h.provisioner.Stop(ctx, workspaceID)
|
||||
go h.provisionWorkspaceOpts(workspaceID, templatePath, configFiles, payload, resetClaudeSession)
|
||||
h.goAsync(func() { h.provisionWorkspaceOpts(workspaceID, templatePath, configFiles, payload, resetClaudeSession) })
|
||||
return true
|
||||
}
|
||||
// No backend wired — same shape as provisionWorkspaceAuto's no-backend
|
||||
|
||||
@@ -481,6 +481,22 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
|
||||
return "", fmt.Errorf("failed to create container: %w", err)
|
||||
}
|
||||
|
||||
// Seed /configs before the entrypoint starts. molecule-runtime reads
|
||||
// /configs/config.yaml immediately; post-start copy races fast runtimes
|
||||
// into a FileNotFoundError crash loop.
|
||||
if cfg.TemplatePath != "" {
|
||||
if err := p.CopyTemplateToContainer(ctx, resp.ID, cfg.TemplatePath); err != nil {
|
||||
_ = p.cli.ContainerRemove(ctx, resp.ID, container.RemoveOptions{Force: true})
|
||||
return "", fmt.Errorf("failed to copy template to container %s before start: %w", name, err)
|
||||
}
|
||||
}
|
||||
if len(cfg.ConfigFiles) > 0 {
|
||||
if err := p.WriteFilesToContainer(ctx, resp.ID, cfg.ConfigFiles); err != nil {
|
||||
_ = p.cli.ContainerRemove(ctx, resp.ID, container.RemoveOptions{Force: true})
|
||||
return "", fmt.Errorf("failed to write config files to container %s before start: %w", name, err)
|
||||
}
|
||||
}
|
||||
|
||||
if err := p.cli.ContainerStart(ctx, resp.ID, container.StartOptions{}); err != nil {
|
||||
// Clean up created container on start failure
|
||||
_ = p.cli.ContainerRemove(ctx, resp.ID, container.RemoveOptions{Force: true})
|
||||
@@ -496,20 +512,6 @@ func (p *Provisioner) Start(ctx context.Context, cfg WorkspaceConfig) (string, e
|
||||
// /configs and /workspace, then drops to agent via gosu). No per-start
|
||||
// chown needed here.
|
||||
|
||||
// Copy template files into /configs if TemplatePath is set
|
||||
if cfg.TemplatePath != "" {
|
||||
if err := p.CopyTemplateToContainer(ctx, resp.ID, cfg.TemplatePath); err != nil {
|
||||
log.Printf("Provisioner: warning — failed to copy template to container %s: %v", name, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Write generated config files into /configs if ConfigFiles is set
|
||||
if len(cfg.ConfigFiles) > 0 {
|
||||
if err := p.WriteFilesToContainer(ctx, resp.ID, cfg.ConfigFiles); err != nil {
|
||||
log.Printf("Provisioner: warning — failed to write config files to container %s: %v", name, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Resolve the host-mapped port. Retry inspect up to 3 times if Docker hasn't
|
||||
// bound the ephemeral port yet (rare race under heavy load).
|
||||
hostURL := InternalURL(cfg.WorkspaceID) // fallback to Docker-internal
|
||||
|
||||
Reference in New Issue
Block a user