From d008649c3ebbf1fab5bbbe63f1fda99a440b3a03 Mon Sep 17 00:00:00 2001 From: Will Anderson Date: Mon, 22 Jun 2026 11:57:20 -0500 Subject: [PATCH] fix(reliability): engram-connection - entrypoint.sh: extend engram health-check timeout 30->60s; set EL_HTTP_TIMEOUT_MS=10000 and EL_HTTP_CONNECT_TIMEOUT_MS=3000 to bound awareness loop blocking window to 10s/call (down from 60s default) - soul.el: 3-attempt retry loop for boot-time /api/nodes+/api/edges fetch; validate non-empty JSON array before loading to prevent silent zero-node identity graph from transient post-healthcheck network hiccup - awareness.el: soft circuit-breaker in ise_post (opens after 3 failures, 30s backoff, half-open probe); /api/sync refresh skips HTTP call when breaker is open; error-JSON detection on sync response TODOs: full async dispatch, connection pooling (require EL futures/persistent curl) --- awareness.el | 36 +++++++++++++++++++++++++++++++++--- entrypoint.sh | 12 ++++++++---- soul.el | 27 ++++++++++++++++++++++----- 3 files changed, 63 insertions(+), 12 deletions(-) diff --git a/awareness.el b/awareness.el index a3a5432..9f833e2 100644 --- a/awareness.el +++ b/awareness.el @@ -40,7 +40,32 @@ fn ise_post(content: String) -> Void { let safe3: String = str_replace(safe2, "\n", "\\n") let safe4: String = str_replace(safe3, "\r", "\\r") let body: String = "{\"content\":\"" + safe4 + "\"}" - let discard: String = http_post_json(engram_url + "/api/neuron/state-events", body) + // Soft circuit-breaker: skip HTTP call when engram is known-down (30s backoff). + // Opens after 3 consecutive failures; half-open probe after backoff expires. + // TODO(reliability): full async dispatch requires EL runtime futures support. 
+ let cb_open: String = state_get("engram_cb_open") + if str_eq(cb_open, "1") { + let cb_ts_s: String = state_get("engram_cb_open_ts") + let cb_ts: Int = if str_eq(cb_ts_s, "") { 0 } else { str_to_int(cb_ts_s) } + let cb_elapsed: Int = time_now() - cb_ts + if cb_elapsed < 30000 { return "" } + state_set("engram_cb_open", "0") + } + let resp: String = http_post_json(engram_url + "/api/neuron/state-events", body) + let cb_failed: Bool = str_eq(resp, "") || str_starts_with(resp, "{\"error\":") + if cb_failed { + let fn_s: String = state_get("engram_cb_fails") + let fn_n: Int = if str_eq(fn_s, "") { 0 } else { str_to_int(fn_s) } + let fn_n = fn_n + 1 + state_set("engram_cb_fails", int_to_str(fn_n)) + if fn_n >= 3 { + state_set("engram_cb_open", "1") + state_set("engram_cb_open_ts", int_to_str(time_now())) + println("[awareness] engram circuit-breaker OPEN after " + int_to_str(fn_n) + " failures") + } + } else { + state_set("engram_cb_fails", "0") + } return "" } @@ -540,9 +565,14 @@ fn awareness_run() -> Void { let should_refresh: Bool = refresh_elapsed >= refresh_ms if should_refresh { let engram_url: String = state_get("soul_engram_url") - if !str_eq(engram_url, "") { + let sc: String = state_get("engram_cb_open") + let sc_ts_s: String = state_get("engram_cb_open_ts") + let sc_ts: Int = if str_eq(sc_ts_s, "") { 0 } else { str_to_int(sc_ts_s) } + let sc_elapsed: Int = now_ts - sc_ts + let sync_allowed: Bool = !str_eq(sc, "1") || sc_elapsed >= 30000 + if !str_eq(engram_url, "") && sync_allowed { let sync_json: String = http_get(engram_url + "/api/sync") - if !str_eq(sync_json, "") && !str_eq(sync_json, "{}") { + if !str_eq(sync_json, "") && !str_eq(sync_json, "{}") && !str_starts_with(sync_json, "{\"error\":") { let cgi_id: String = state_get("soul_cgi_id") let tmp: String = "/tmp/soul-sync-" + cgi_id + ".json" fs_write(tmp, sync_json) diff --git a/entrypoint.sh b/entrypoint.sh index 90b0e8c..a2962b3 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -24,19 +24,23 @@ 
ENGRAM_DATA_DIR="$ENGRAM_DATA_DIR" \ ENGRAM_PID=$! -# Wait for engram to become healthy (up to 30s) +# Wait for engram to become healthy (up to 60s; GKE Autopilot cold starts can be slow) echo "[entrypoint] waiting for engram..." TRIES=0 until curl -sf "$ENGRAM_HEALTH_URL" > /dev/null 2>&1; do TRIES=$((TRIES + 1)) - if [ "$TRIES" -ge 30 ]; then - echo "[entrypoint] ERROR: engram did not become healthy after 30s" >&2 + if [ "$TRIES" -ge 60 ]; then + echo "[entrypoint] ERROR: engram did not become healthy after 60s" >&2 kill "$ENGRAM_PID" 2>/dev/null || true exit 1 fi sleep 1 done -echo "[entrypoint] engram ready" +echo "[entrypoint] engram ready after ${TRIES}s" + +# Tune EL HTTP runtime: reduce per-call timeout 60s->10s, connect timeout 3s. +export EL_HTTP_TIMEOUT_MS="${EL_HTTP_TIMEOUT_MS:-10000}" +export EL_HTTP_CONNECT_TIMEOUT_MS="${EL_HTTP_CONNECT_TIMEOUT_MS:-3000}" # Start soul — it takes over as PID 1's foreground process. # SOUL_ENGRAM_PATH must NOT be set; ENGRAM_URL triggers HTTP mode. diff --git a/soul.el b/soul.el index e224672..bdb177b 100644 --- a/soul.el +++ b/soul.el @@ -370,12 +370,29 @@ let snapshot_usable: Bool = local_node_count > 50 if using_http_engram && !snapshot_usable { // First boot or empty/corrupt snapshot: seed from HTTP Engram. + // Retry up to 3 times (2s sleep between attempts) to guard against a + // transient network hiccup right after entrypoint.sh health check passes. + // An empty nodes response silently loads a zero-node graph; validate first. + // TODO(reliability): replace sleep_ms retry with non-blocking backoff. 
println("[soul] engram -> HTTP " + engram_url_raw + " (no local snapshot, first boot)") - let nodes_json: String = http_get(engram_url_raw + "/api/nodes?limit=10000") - let edges_json: String = http_get(engram_url_raw + "/api/edges") - let nodes_part: String = if str_eq(nodes_json, "") { "[]" } else { nodes_json } - let edges_part: String = if str_eq(edges_json, "") { "[]" } else { edges_json } - let snapshot_data: String = "{\"nodes\":" + nodes_part + ",\"edges\":" + edges_part + "}" + let fetch_attempt: Int = 0 + while fetch_attempt < 3 { + let fetch_attempt = fetch_attempt + 1 + let n: String = http_get(engram_url_raw + "/api/nodes?limit=10000") + let e: String = http_get(engram_url_raw + "/api/edges") + let nodes_ok: Bool = !str_eq(n, "") && str_starts_with(n, "[") && str_len(n) > 2 + if nodes_ok { + state_set("_boot_nodes_json", n) + state_set("_boot_edges_json", e) + let fetch_attempt = 3 + } else { + println("[soul] boot HTTP fetch attempt " + int_to_str(fetch_attempt) + " failed --- retrying in 2s") + sleep_ms(2000) + } + } + let nodes_json: String = if str_eq(state_get("_boot_nodes_json"), "") { "[]" } else { state_get("_boot_nodes_json") } + let edges_json: String = if str_eq(state_get("_boot_edges_json"), "") { "[]" } else { state_get("_boot_edges_json") } + let snapshot_data: String = "{\"nodes\":" + nodes_json + ",\"edges\":" + edges_json + "}" let tmp_path: String = "/tmp/soul-engram-" + soul_cgi_id + ".json" fs_write(tmp_path, snapshot_data) engram_load(tmp_path)