From 0466a228e25e1538bd2aa967d6bf0aeaca207c89 Mon Sep 17 00:00:00 2001 From: fullstack-engineer Date: Fri, 15 May 2026 14:38:43 -0700 Subject: [PATCH] fix(canvas): skip config.yaml write for openclaw + bump request timeout to 35s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Canvas "Save & Restart" was timing out for openclaw workspaces because two bugs compounded: 1. **Pointless config.yaml write.** openclaw manages its own prompt surface via its SOUL/BOOTSTRAP/AGENTS multi-file system — it does NOT read the platform's config.yaml. But ConfigTab.tsx was still issuing `PUT /workspaces/:id/files/config.yaml` on every save, which on tenant EC2 fans out through the slow EIC SSH tunnel path (`workspace-server/internal/handlers/template_files_eic.go`). Other runtimes that ship their own config are already exempted via `RUNTIMES_WITH_OWN_CONFIG` (external, kimi, kimi-cli). Add openclaw to that set so the platform stops doing work the runtime ignores. 2. **Client aborts before server returns.** `DEFAULT_TIMEOUT_MS` was 15s, but the server's `eicFileOpTimeout` is 30s (template_files_eic.go L118). When EIC was slow or the EC2's ec2-instance-connect daemon was unhealthy, the canvas aborted with a generic timeout *before* the workspace-server returned its real 5xx — so the user saw a useless "request timed out" instead of the actual cause. Raise the default to 35s (the server's 30s bound plus 5s of headroom) so the server's error surfaces. The AbortController contract is unchanged; callers can still override `timeoutMs` per request. Together these fixes unblock the user-visible "Save & Restart" behavior on openclaw workspaces. The underlying EIC hang on i-04e5197e96adb888f (last_healthcheck_at IS NULL) is tracked separately as a follow-up — this PR makes the canvas honest about errors instead of swallowing them, and removes the unnecessary write from openclaw's critical path entirely. 
Refs: internal#418 (Canvas Save & Restart timeout on openclaw) Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/src/components/tabs/ConfigTab.tsx | 2 +- canvas/src/lib/api.ts | 20 ++++++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/canvas/src/components/tabs/ConfigTab.tsx b/canvas/src/components/tabs/ConfigTab.tsx index 6563a621..645edc25 100644 --- a/canvas/src/components/tabs/ConfigTab.tsx +++ b/canvas/src/components/tabs/ConfigTab.tsx @@ -176,7 +176,7 @@ export function deriveProvidersFromModels(models: ModelSpec[]): string[] { // exactly the point of the platform adaptor. The deep `~/.hermes/ // config.yaml` on the container is a separate runtime-internal file, // not this one. -const RUNTIMES_WITH_OWN_CONFIG = new Set(["external", "kimi", "kimi-cli"]); +const RUNTIMES_WITH_OWN_CONFIG = new Set(["external", "kimi", "kimi-cli", "openclaw"]); const FALLBACK_RUNTIME_OPTIONS: RuntimeOption[] = [ { value: "", label: "LangGraph (default)", models: [], providers: [] }, diff --git a/canvas/src/lib/api.ts b/canvas/src/lib/api.ts index 3ae5f413..83c6b065 100644 --- a/canvas/src/lib/api.ts +++ b/canvas/src/lib/api.ts @@ -8,14 +8,18 @@ import { getTenantSlug } from "./tenant"; export const PLATFORM_URL = process.env.NEXT_PUBLIC_PLATFORM_URL ?? "http://localhost:8080"; -// 15s is long enough for slow CP queries but short enough that a -// hung backend doesn't leave the UI spinning forever. The abort -// propagates through AbortController so React components can observe -// the error and render a retry affordance. Callers that know the -// endpoint is intentionally slow (org import walks a tree of -// workspaces with server-side pacing) can pass `timeoutMs` to -// override. 
-const DEFAULT_TIMEOUT_MS = 15_000; +// 35s is long enough for the slowest server-side path (EIC SSH +// tunnel for tenant EC2 file operations, bounded server-side by +// `eicFileOpTimeout = 30 * time.Second` in +// workspace-server/internal/handlers/template_files_eic.go) so the +// canvas surfaces the server's real error instead of aborting first +// with a generic timeout. Shorter values caused "Save & Restart" to +// time out at the client before the backend returned its 5xx. The +// abort still propagates through AbortController so React components +// can render a retry affordance. Callers that know an endpoint is +// intentionally slow (org import walks a tree of workspaces with +// server-side pacing) can pass `timeoutMs` to override. +const DEFAULT_TIMEOUT_MS = 35_000; export interface RequestOptions { timeoutMs?: number; -- 2.52.0