From 0466a228e25e1538bd2aa967d6bf0aeaca207c89 Mon Sep 17 00:00:00 2001
From: fullstack-engineer <fullstack-engineer@agents.moleculesai.app>
Date: Fri, 15 May 2026 14:38:43 -0700
Subject: [PATCH] fix(canvas): skip config.yaml write for openclaw + bump
 request timeout to 35s
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Canvas "Save & Restart" was timing out for openclaw workspaces because
two bugs compounded:

1. **Pointless config.yaml write.** openclaw manages its own prompt
   surface via its SOUL/BOOTSTRAP/AGENTS multi-file system — it does
   NOT read the platform's config.yaml. But ConfigTab.tsx was still
   issuing `PUT /workspaces/:id/files/config.yaml` on every save,
   which on tenant EC2 fans out through the slow EIC SSH tunnel path
   (`workspace-server/internal/handlers/template_files_eic.go`).
   Other runtimes that ship their own config are already exempted via
   `RUNTIMES_WITH_OWN_CONFIG` (external, kimi, kimi-cli). Add openclaw
   to that set so the platform stops doing work the runtime ignores.

2. **Client aborts before server returns.** `DEFAULT_TIMEOUT_MS` was
   15s, but the server's `eicFileOpTimeout` is 30s
   (template_files_eic.go L118). When EIC was slow or the EC2's
   ec2-instance-connect daemon was unhealthy, the canvas aborted with
   a generic timeout *before* the workspace-server returned its real
   5xx — so the user saw a useless "request timed out" instead of
   the actual cause. Raise the default to 35s — just above the
   server's 30s bound — so the server's error surfaces. The
   AbortController contract is unchanged; callers can still override
   `timeoutMs` per-request.

Together these fixes unblock the user-visible "Save & Restart"
behavior on openclaw workspaces. The underlying EIC hang on
i-04e5197e96adb888f (last_healthcheck_at IS NULL) is tracked
separately as a follow-up — this PR makes the canvas honest about
errors instead of swallowing them, and removes the unnecessary write
from openclaw's critical path entirely.

Refs: internal#418 (Canvas Save & Restart timeout on openclaw)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 canvas/src/components/tabs/ConfigTab.tsx |  2 +-
 canvas/src/lib/api.ts                    | 20 ++++++++++++--------
 2 files changed, 13 insertions(+), 9 deletions(-)
diff --git a/canvas/src/components/tabs/ConfigTab.tsx b/canvas/src/components/tabs/ConfigTab.tsx
index 6563a621..645edc25 100644
--- a/canvas/src/components/tabs/ConfigTab.tsx
+++ b/canvas/src/components/tabs/ConfigTab.tsx
@@ -176,7 +176,7 @@ export function deriveProvidersFromModels(models: ModelSpec[]): string[] {
 // exactly the point of the platform adaptor. The deep `~/.hermes/
 // config.yaml` on the container is a separate runtime-internal file,
 // not this one.
-const RUNTIMES_WITH_OWN_CONFIG = new Set<string>(["external", "kimi", "kimi-cli"]);
+const RUNTIMES_WITH_OWN_CONFIG = new Set<string>(["external", "kimi", "kimi-cli", "openclaw"]);
 
 const FALLBACK_RUNTIME_OPTIONS: RuntimeOption[] = [
   { value: "", label: "LangGraph (default)", models: [], providers: [] },
diff --git a/canvas/src/lib/api.ts b/canvas/src/lib/api.ts
index 3ae5f413..83c6b065 100644
--- a/canvas/src/lib/api.ts
+++ b/canvas/src/lib/api.ts
@@ -8,14 +8,18 @@ import { getTenantSlug } from "./tenant";
 export const PLATFORM_URL =
   process.env.NEXT_PUBLIC_PLATFORM_URL ?? "http://localhost:8080";
 
-// 15s is long enough for slow CP queries but short enough that a
-// hung backend doesn't leave the UI spinning forever. The abort
-// propagates through AbortController so React components can observe
-// the error and render a retry affordance. Callers that know the
-// endpoint is intentionally slow (org import walks a tree of
-// workspaces with server-side pacing) can pass `timeoutMs` to
-// override.
-const DEFAULT_TIMEOUT_MS = 15_000;
+// 35s is long enough for the slowest server-side path (EIC SSH
+// tunnel for tenant EC2 file operations, bounded server-side by
+// `eicFileOpTimeout = 30 * time.Second` in
+// workspace-server/internal/handlers/template_files_eic.go) so the
+// canvas surfaces the server's real error instead of aborting first
+// with a generic timeout. Shorter values caused "Save & Restart" to
+// time out at the client before the backend returned its 5xx. The
+// abort still propagates through AbortController so React components
+// can render a retry affordance. Callers that know an endpoint is
+// intentionally slow (org import walks a tree of workspaces with
+// server-side pacing) can pass `timeoutMs` to override.
+const DEFAULT_TIMEOUT_MS = 35_000;
 
 export interface RequestOptions {
   timeoutMs?: number;
-- 
2.52.0