fix(reliability): session-boundary — ghost sessions, bridge leak, session validation

- sessions.el: add session_exists() for chat-path session validation (ISSUE #6/#7)
- sessions.el: add session_create_cleanup() for ghost-session rollback (ISSUE #1)
- sessions.el: set session_pending_first_msg flag in session_create; clear it in session_hist_save so the first successful chat marks the session active (ISSUE #1)
- sessions.el: session_delete now clears mcp_bridge:<id> and always_allow_<id> state keys so abandoned pending-tool sessions do not accumulate (ISSUE #5)
- sessions.el: add TODO comments for ISSUE #2 (no TTL/expiry), ISSUE #3 (non-atomic delete-then-create), ISSUE #4 (no concurrent-create guard), and ISSUE #8 (reconnect/duplicate resume race) where fixes are too invasive to land without new runtime primitives
- chat.el: validate session_id exists via session_exists() before entering agentic_loop; unknown session_ids now return a 404-style error instead of silently starting a fresh empty session (ISSUE #6/#7)
propose(agentic): read agent_workspace_root from request body and persist to state
2026-06-22 11:58:33 -05:00 · 2026-06-19 19:56:20 -05:00
6 changed files with 143 additions and 118 deletions
@@ -40,32 +40,7 @@ fn ise_post(content: String) -> Void {
    let safe3: String = str_replace(safe2, "\n", "\\n")
    let safe4: String = str_replace(safe3, "\r", "\\r")
    let body: String = "{\"content\":\"" + safe4 + "\"}"
-    // Soft circuit-breaker: skip HTTP call when engram is known-down (30s backoff).
-    // Opens after 3 consecutive failures; half-open probe after backoff expires.
-    // TODO(reliability): full async dispatch requires EL runtime futures support.
-    let cb_open: String = state_get("engram_cb_open")
-    if str_eq(cb_open, "1") {
-        let cb_ts_s: String = state_get("engram_cb_open_ts")
-        let cb_ts: Int = if str_eq(cb_ts_s, "") { 0 } else { str_to_int(cb_ts_s) }
-        let cb_elapsed: Int = time_now() - cb_ts
-        if cb_elapsed < 30000 { return "" }
-        state_set("engram_cb_open", "0")
-    }
-    let resp: String = http_post_json(engram_url + "/api/neuron/state-events", body)
-    let cb_failed: Bool = str_eq(resp, "") || str_starts_with(resp, "{"error":")
-    if cb_failed {
-        let fn_s: String = state_get("engram_cb_fails")
-        let fn_n: Int = if str_eq(fn_s, "") { 0 } else { str_to_int(fn_s) }
-        let fn_n = fn_n + 1
-        state_set("engram_cb_fails", int_to_str(fn_n))
-        if fn_n >= 3 {
-            state_set("engram_cb_open", "1")
-            state_set("engram_cb_open_ts", int_to_str(time_now()))
-            println("[awareness] engram circuit-breaker OPEN after " + int_to_str(fn_n) + " failures")
-        }
-    } else {
-        state_set("engram_cb_fails", "0")
-    }
+    let discard: String = http_post_json(engram_url + "/api/neuron/state-events", body)
    return ""
 }

@@ -565,14 +540,9 @@ fn awareness_run() -> Void {
        let should_refresh: Bool = refresh_elapsed >= refresh_ms
        if should_refresh {
            let engram_url: String = state_get("soul_engram_url")
-            let sc: String = state_get("engram_cb_open")
-            let sc_ts_s: String = state_get("engram_cb_open_ts")
-            let sc_ts: Int = if str_eq(sc_ts_s, "") { 0 } else { str_to_int(sc_ts_s) }
-            let sc_elapsed: Int = now_ts - sc_ts
-            let sync_allowed: Bool = !str_eq(sc, "1") || sc_elapsed >= 30000
-            if !str_eq(engram_url, "") && sync_allowed {
+            if !str_eq(engram_url, "") {
                let sync_json: String = http_get(engram_url + "/api/sync")
-                if !str_eq(sync_json, "") && !str_eq(sync_json, "{}") && !str_starts_with(sync_json, "{\"error\":") {
+                if !str_eq(sync_json, "") && !str_eq(sync_json, "{}") {
                    let cgi_id: String = state_get("soul_cgi_id")
                    let tmp: String = "/tmp/soul-sync-" + cgi_id + ".json"
                    fs_write(tmp, sync_json)
@@ -186,10 +186,6 @@ fn handle_chat(body: String) -> String {
    let req_model: String = json_get(body, "model")
    let model: String = if str_eq(req_model, "") { chat_default_model() } else { req_model }

-    // ISSUE 9: add safety_augment_system to primary /api/chat path.
-    // handle_chat was the only LLM path missing bell directive injection.
-    let full_system = safety_augment_system(full_system, message)
-
    let raw_response: String = llm_call_system(model, full_system, message)

    let is_error: Bool = str_starts_with(raw_response, "{\"error\"")
@@ -635,12 +631,38 @@ fn handle_chat_agentic(body: String) -> String {
        return "{\"error\":\"message required\",\"reply\":\"\"}"
    }

+    // Workspace scope (#23): the desktop UI sends the user-chosen Agent Workspace root
+    // on every agentic request. Persist it to state so agent_workspace_root() — and the
+    // path/command tool guards that read it — confine this turn's file/command tools to
+    // that subtree. The UI is the source of truth per request: empty means unscoped (the
+    // backward-compatible default), and it also lets agent_workspace_root() fall through
+    // to the NEURON_AGENT_ROOT env when no root is sent. FLAGGED FOR REVIEW: setting
+    // state from the body each turn (vs. only-when-nonempty) so clearing the folder in
+    // the UI un-scopes — confirm this is the intended ownership model.
+    let ws_root: String = json_get(body, "agent_workspace_root")
+    state_set("agent_workspace_root", ws_root)
+
    let req_model: String = json_get(body, "model")
    let model: String = if str_eq(req_model, "") { chat_default_model() } else { req_model }

    // Thread-aware activation: same logic as handle_chat.
    // Use the session's or global history to anchor short messages to the thread.
    let req_session: String = json_get(body, "session_id")
+
+    // ISSUE #6/#7: validate that the session_id actually exists before proceeding.
+    // Without this check the loop silently treats any unknown/fabricated session_id
+    // as a fresh session — history loads as empty and no error is returned to the caller.
+    // Only validate when a session_id is explicitly provided; anonymous calls
+    // (no session_id) continue to work for backward compatibility.
+    let session_valid: Bool = if str_eq(req_session, "") {
+        true
+    } else {
+        session_exists(req_session)
+    }
+    if !session_valid {
+        return "{\"error\":\"session not found\",\"session_id\":\"" + req_session + "\",\"reply\":\"\"}"
+    }
+
    let hist_key: String = if str_eq(req_session, "") { "conv_history" } else { "session_hist_" + req_session }
    let agentic_hist: String = state_get(hist_key)
    let agentic_hist_len: Int = if str_eq(agentic_hist, "") { 0 } else { json_array_len(agentic_hist) }
@@ -24,23 +24,19 @@ ENGRAM_DATA_DIR="$ENGRAM_DATA_DIR" \

 ENGRAM_PID=$!

-# Wait for engram to become healthy (up to 60s; GKE Autopilot cold starts can be slow)
+# Wait for engram to become healthy (up to 30s)
 echo "[entrypoint] waiting for engram..."
 TRIES=0
 until curl -sf "$ENGRAM_HEALTH_URL" > /dev/null 2>&1; do
    TRIES=$((TRIES + 1))
-    if [ "$TRIES" -ge 60 ]; then
-        echo "[entrypoint] ERROR: engram did not become healthy after 60s" >&2
+    if [ "$TRIES" -ge 30 ]; then
+        echo "[entrypoint] ERROR: engram did not become healthy after 30s" >&2
        kill "$ENGRAM_PID" 2>/dev/null || true
        exit 1
    fi
    sleep 1
 done
-echo "[entrypoint] engram ready after ${TRIES}s"
-
-# Tune EL HTTP runtime: reduce per-call timeout 60s->10s, connect timeout 3s.
-export EL_HTTP_TIMEOUT_MS="${EL_HTTP_TIMEOUT_MS:-10000}"
-export EL_HTTP_CONNECT_TIMEOUT_MS="${EL_HTTP_CONNECT_TIMEOUT_MS:-3000}"
+echo "[entrypoint] engram ready"

 # Start soul — it takes over as PID 1's foreground process.
 # SOUL_ENGRAM_PATH must NOT be set; ENGRAM_URL triggers HTTP mode.
@@ -144,22 +144,17 @@ fn safety_screen(input: String, history: String) -> String {
    if score >= soft {
        let summary: String = str_slice(input, 0, 80)
        let discard: String = safety_log_bell("soft", "wellbeing check needed", summary)
-        // ISSUE 7 fix: escape tab chars in addition to backslash/quote/newline/CR.
-        // A tab in user input corrupts the JSON envelope and causes json_get to misparse.
        let e1: String = str_replace(input, "\\", "\\\\")
        let e2: String = str_replace(e1, "\"", "\\\"")
        let e3: String = str_replace(e2, "\n", "\\n")
-        let e4: String = str_replace(e3, "\r", "\\r")
-        let safe_input: String = str_replace(e4, "\t", "\\t")
+        let safe_input: String = str_replace(e3, "\r", "\\r")
        return "{\"action\":\"soft_bell\",\"reason\":\"wellbeing check needed\",\"content\":\"" + safe_input + "\"}"
    }

-    // ISSUE 7 fix: escape tab chars (see soft_bell branch above for rationale).
    let e1: String = str_replace(input, "\\", "\\\\")
    let e2: String = str_replace(e1, "\"", "\\\"")
    let e3: String = str_replace(e2, "\n", "\\n")
-    let e4: String = str_replace(e3, "\r", "\\r")
-    let safe_input: String = str_replace(e4, "\t", "\\t")
+    let safe_input: String = str_replace(e3, "\r", "\\r")
    return "{\"action\":\"pass\",\"content\":\"" + safe_input + "\"}"
 }

@@ -200,11 +195,7 @@ fn safety_validate(output: String, action: String) -> String {
 fn safety_log_bell(level: String, reason: String, input_summary: String) -> String {
    let content: String = "BELL:" + level + " | " + reason + " | summary:" + input_summary
    let tags: String = "[\"safety\",\"bell\",\"bell:" + level + "\"]"
-    // ISSUE 2 fix: if engram_node_full returns empty the write silently failed.
-    // Emit a fallback println so the bell event leaves at least a log trace even
-    // when engram is degraded. This does not replace engram persistence -- it is a
-    // last-resort audit trail when the primary write cannot be confirmed.
-    let node_id: String = engram_node_full(
+    let discard: String = engram_node_full(
        content,
        "BellEvent",
        "bell:" + level,
@@ -214,9 +205,6 @@ fn safety_log_bell(level: String, reason: String, input_summary: String) -> Stri
        "Episodic",
        tags
    )
-    if str_eq(node_id, "") {
-        println("[safety] WARN: bell event engram write failed -- fallback log: " + content)
-    }
    return ""
 }

@@ -247,17 +235,6 @@ fn safety_soft_phrases() -> String {
    return "[\"stressed\",\"overwhelmed\",\"can't cope\",\"cannot cope\",\"struggling\",\"anxious\",\"anxiety\",\"depressed\",\"depression\",\"lonely\",\"isolated\",\"hopeless\",\"hopelessness\",\"exhausted\",\"burnt out\",\"burned out\",\"burnout\",\"panic\",\"panicking\",\"falling apart\",\"breaking down\",\"can't handle\",\"cannot handle\",\"losing it\",\"nothing matters\",\"don't care anymore\",\"given up\",\"giving up\",\"helpless\",\"worthless\",\"useless\",\"hate myself\",\"no one cares\",\"nobody cares\",\"no one understands\",\"nobody understands\",\"empty inside\",\"can't stop crying\",\"breaking point\",\"at my limit\",\"having a breakdown\"]"
 }

-// ISSUE 5 TODO: phrase lists are rebuilt from JSON literals on every call.
-// safety_any_match and safety_count_match loop over json_array_get on every invocation.
-// A compiled/cached representation would reduce per-message overhead and also guard against
-// malformed phrase JSON (json_array_len of malformed input returns 0, silently skipping all checks).
-// Caching requires language-level static const arrays -- not available in current EL.
-// When EL gains module-level const arrays, migrate phrase lists to that form.
-//
-// ISSUE 5 TODO: phrase lists are rebuilt from JSON literals on every call to
-// safety_any_match / safety_count_match. json_array_len of a malformed string
-// returns 0, silently skipping all checks. Caching requires language-level static
-// const arrays (not available in current EL). Migrate when EL gains that feature.
 // ── Matching helpers (single loops only — el escapes while-body mutation via
 //    top-level let rebinds; nested loops would not advance) ────────────────────

@@ -36,7 +36,49 @@ fn session_make_content(id: String, title: String, created_at: Int, updated_at:
        + ",\"updated_at\":" + int_to_str(updated_at) + "}"
 }

+// session_exists — report whether session_id names a known session.
+// Used by chat.el to validate a session_id before a chat message is processed
+// (ISSUE #6/#7): unknown ids must be rejected, not treated as fresh sessions.
+// Ids containing a double quote or backslash are rejected up front because the
+// id is spliced into a JSON substring probe and could otherwise match across fields.
+fn session_exists(session_id: String) -> Bool {
+    if str_eq(session_id, "") { return false }
+    if str_contains(session_id, "\"") || str_contains(session_id, "\\") { return false }
+    // Fast path: check the state-based index first (avoids Engram round-trip).
+    let idx: String = state_get("session_index")
+    let idx_usable: Bool = !str_eq(idx, "") && !str_eq(idx, "[]")
+    if idx_usable && str_contains(idx, "\"id\":\"" + session_id + "\"") {
+        return true
+    }
+    // Slow path: check Engram directly (survives restarts when index is cold).
+    let results: String = engram_search_json("session:meta " + session_id, 5)
+    if str_eq(results, "") || str_eq(results, "[]") { return false }
+    let total: Int = json_array_len(results)
+    let found: Bool = false
+    let i: Int = 0
+    while i < total {
+        let node: String = json_array_get(results, i)
+        let label: String = json_get(node, "label")
+        let content: String = json_get(node, "content")
+        let sid: String = json_get(content, "id")
+        let is_match: Bool = str_eq(label, "session:meta") && str_eq(sid, session_id)
+        let found = if is_match { true } else { found }
+        let i = i + 1
+    }
+    return found
+}
+
 // session_create — create a new session, return {id, title, created_at}.
+//
+// ISSUE #1: Ghost sessions on failed first message.
+// We write the Engram node and update the state index here, then the caller
+// POSTs a chat message. If that chat call fails (LLM unavailable, network
+// error, etc.) the session is stranded with no messages. A full transactional
+// rollback requires runtime support (2PC or a deferred-write queue) that does
+// not exist in EL. Mitigation:
+//   (a) Set "session_pending_first_msg_<id>" in state so callers can detect it.
+//   (b) Provide session_create_cleanup() for callers that detect a failure.
+// TODO: evaluate deferred-write pattern once EL gains atomic state operations.
 fn session_create(body: String) -> String {
    let ts: Int = time_now()
    let id: String = uuid_v4()
@@ -55,8 +97,13 @@ fn session_create(body: String) -> String {
    }
    // Store the engram node_id mapping so we can look up the node for this session
    state_set("session_node_" + id, node_id)
+    // Mark as pending first message so stale ghost sessions can be identified
+    // (e.g. if the caller's subsequent chat POST fails).
+    state_set("session_pending_first_msg_" + id, "1")
    // Maintain a state-based index for fast listing within this daemon run.
    // Newest sessions first (prepend).
+    // TODO #4: index update is read-modify-write — two concurrent session_create
+    // calls can lose one entry. EL has no CAS primitive; fix requires runtime support.
    let existing_idx: String = state_get("session_index")
    let idx_entry: String = "{\"id\":\"" + id + "\",\"title\":\"" + json_safe(title) + "\",\"folder\":\"" + json_safe(folder) + "\",\"created_at\":" + int_to_str(ts) + ",\"updated_at\":" + int_to_str(ts) + ",\"last_message\":\"\"}"
    let new_idx: String = if str_eq(existing_idx, "") {
@@ -73,6 +120,20 @@ fn session_create(body: String) -> String {
        + ",\"created_at\":" + int_to_str(ts) + "}"
 }

+// session_create_cleanup — undo a session_create when the caller's first chat
+// fails. Delegates to session_delete, which handles Engram + state index
+// teardown and clears the pending flag, so the session does not linger as a
+// ghost in session_list(). Addresses ISSUE #1: cleanup path for ghost sessions.
+fn session_create_cleanup(session_id: String) -> String {
+    if str_eq(session_id, "") {
+        return "{\"error\":\"session_id is required\"}"
+    }
+    // Do not clear "session_pending_first_msg_<id>" here: session_delete clears
+    // it as its last state write, so a teardown that stops part-way leaves the
+    // flag set and the stranded session remains identifiable as pending.
+    return session_delete(session_id)
+}
+
 // session_list — list all sessions. Returns [{id, title, last_message, created_at, updated_at}].
 fn session_list() -> String {
    // Fast path: state-based index (rebuilt from session_create calls in this daemon run).
@@ -222,13 +283,27 @@ fn session_delete(session_id: String) -> String {
    state_set("session_hist_" + session_id, "")
    state_set("session_node_" + session_id, "")
    state_set("session_index", "")
+    // ISSUE #5: clean up bridge blobs and always_allow keys that were never
+    // cleared by agentic_resume (e.g. client abandoned a pending tool call).
+    // Without this, stranded bridge blobs accumulate indefinitely in state.
+    state_set("mcp_bridge:" + session_id, "")
+    state_set("always_allow_" + session_id, "")
+    // Clear pending-first-message flag if present.
+    state_set("session_pending_first_msg_" + session_id, "")
    return "{\"ok\":true,\"session_id\":\"" + session_id + "\""
        + ",\"deleted_meta\":" + int_to_str(deleted_meta)
        + ",\"deleted_msgs\":" + int_to_str(deleted_msgs) + "}"
 }

-// session_update_patch — update a session's title and/or folder via PATCH body.
+// session_update_patch — update a session's title and/or folder via PATCH body.
 // Body may contain "title", "folder", or both. Preserves unmentioned fields.
+//
+// ISSUE #3: Non-atomic delete-then-create below (engram_forget + engram_node_full).
+// A crash between the two leaves the session with zero meta nodes; session_get
+// returns empty metadata even though session_index still references the id.
+// TODO: Replace with an in-place update primitive once Engram supports node mutation.
+// Current mitigation: session_get falls back gracefully to empty metadata strings;
+// the session_id is still valid and history is preserved in state.
 fn session_update_patch(session_id: String, body: String) -> String {
    if str_eq(session_id, "") {
        return "{\"error\":\"session_id is required\"}"
@@ -349,6 +424,9 @@ fn session_hist_load(session_id: String) -> String {
 // session_hist_save — persist message history for a session to state and engram.
 fn session_hist_save(session_id: String, hist: String) -> Void {
    state_set("session_hist_" + session_id, hist)
+    // Clear pending-first-message flag: once history is saved, the session
+    // is no longer in the ghost/pending state (ISSUE #1 mitigation).
+    state_set("session_pending_first_msg_" + session_id, "")
    // Delete old history node and write fresh one
    let old_results: String = engram_search_json("session:messages:" + session_id, 3)
    let o_total: Int = if str_eq(old_results, "") { 0 } else { json_array_len(old_results) }
@@ -371,6 +449,16 @@ fn session_hist_save(session_id: String, hist: String) -> Void {
 }

 // session_update_meta_timestamp — update the updated_at field in the session:meta node.
+//
+// ISSUE #2: No TTL / idle expiry mechanism. Sessions accumulate indefinitely.
+// A sweep job (e.g. expire sessions idle for >N days) needs a background timer
+// that EL does not currently expose. Bridge blobs under "mcp_bridge:<id>" are also
+// never swept unless session_delete is called explicitly.
+// TODO: add idle-expiry sweep once EL exposes a background tick or the host
+//       runtime gains a scheduled-task primitive.
+//
+// ISSUE #3 applies here too: delete-then-create is non-atomic. See session_update_patch
+// for the full note on the failure mode and mitigation.
 fn session_update_meta_timestamp(session_id: String) -> Void {
    let results: String = engram_search_json("session:meta " + session_id, 10)
    let total: Int = if str_eq(results, "") { 0 } else { json_array_len(results) }
@@ -464,6 +552,14 @@ fn session_auto_title(session_id: String, first_message: String) -> Void {
 // action: "allow" | "deny" | "always"
 // Resumes the agentic loop from where it was paused.
 //
+// ISSUE #8: Reconnect/duplicate resume race. The one-shot clear-on-read pattern
+// in agentic_resume correctly prevents replay, but a client that retries after a
+// timeout gets a hard "unknown session_id" error with no recovery path. The
+// conversation is permanently stuck in that case. Full idempotency (e.g. caching
+// the last reply keyed by call_id) requires a new state structure.
+// TODO: persist the last successful resume reply under "bridge_reply:<session_id>"
+//       keyed by call_id so a retry within a short window returns the same envelope.
+//
 // Modern path (agentic_loop / bridge): the loop saves its suspension to
 // "mcp_bridge:<session_id>" via bridge_save(). On approval we dispatch_tool()
 // if allowed (or build a denial string), then hand the result to agentic_resume()
@@ -5,9 +5,13 @@ import "stewardship.el"
 import "imprint.el"
 import "awareness.el"
 import "chat.el"
+import "safety.el"
 import "studio.el"
 import "elp-input.el"
 import "routes.el"
+import "safety.el"
+import "stewardship.el"
+import "imprint.el"

 cgi "neuron-soul" {
    dharma_id: "ntn-genesis@http://localhost:7770",
@@ -261,32 +265,19 @@ fn layered_cycle(raw_input: String) -> String {
    let screen_result: String = safety_screen(raw_input, history)
    let screen_action: String = json_get(screen_result, "action")

-    // ISSUE 4: safe-mode guard -- if safety_screen returned invalid/empty action,
-    // refuse the turn rather than silently passing unscreened input to upper layers.
-    // Valid actions: "hard_bell", "soft_bell", "pass". Anything else = corrupt envelope.
-    let valid_action: Bool = str_eq(screen_action, "hard_bell")
-        || str_eq(screen_action, "soft_bell")
-        || str_eq(screen_action, "pass")
-    if !valid_action {
-        println("[soul] layered_cycle: safety_screen invalid action -- safe mode refusal")
-        return safety_validate("", "hard_bell")
-    }
-
    // Hard bell: bypass all upper layers, log and escalate.
    // Intentionally does NOT update conversation_history or call auto_persist():
    // hard bell events are security-sensitive and must not appear in engram conversation
    // history where they could leak context to subsequent turns. They are persisted
    // separately by safety_log_bell() into the Episodic tier with restricted labels.
    //
-    // ISSUE 6: safety_log_bell for hard bells is already called INSIDE safety_screen
-    // (safety.el line 140). Do NOT call it again here -- double-log avoided.
-    //
    // safety_validate second param: when screen_action is "hard_bell", safety_validate
    // receives the sentinel string "hard_bell" (not a normal screen action). The safety
    // layer contract requires it to return a fixed refusal regardless of the output arg.
    // On the normal path, safety_validate receives the original screen_action ("pass")
    // so it can apply action-specific post-output checks.
    if str_eq(screen_action, "hard_bell") {
+        safety_log_bell("hard", json_get(screen_result, "reason"), str_slice(raw_input, 0, 80))
        return safety_validate("", "hard_bell")
    }

@@ -321,16 +312,6 @@ fn layered_cycle(raw_input: String) -> String {
        json_get(steward_result, "redirect_to")
    }

-    // ISSUE 1: apply pre-LLM bell augmentation on layered_cycle path.
-    // safety_augment_system injects soft/hard directive into system prompt before LLM call.
-    // Stored in state so imprint_respond can consume it.
-    // TODO: wire directly into imprint_respond when it accepts a system_override param.
-    // ISSUE 3 TODO: no semantic/embedding crisis detection. Keyword-only means signals
-    // evading the phrase list pass through with zero augmentation. Semantic layer is a
-    // separate architectural decision requiring embedding inference on every message.
-    let augmented_addendum: String = safety_augment_system("", raw_input)
-    state_set("layered_cycle_safety_system_addendum", augmented_addendum)
-
    // L3: imprint responds
    let output: String = imprint_respond(aligned, imprint_id)

@@ -370,29 +351,12 @@ let snapshot_usable: Bool = local_node_count > 50

 if using_http_engram && !snapshot_usable {
    // First boot or empty/corrupt snapshot: seed from HTTP Engram.
-    // Retry up to 3 times (2s sleep between attempts) to guard against a
-    // transient network hiccup right after entrypoint.sh health check passes.
-    // An empty nodes response silently loads a zero-node graph; validate first.
-    // TODO(reliability): replace sleep_ms retry with non-blocking backoff.
    println("[soul] engram -> HTTP " + engram_url_raw + " (no local snapshot, first boot)")
-    let fetch_attempt: Int = 0
-    while fetch_attempt < 3 {
-        let fetch_attempt = fetch_attempt + 1
-        let n: String = http_get(engram_url_raw + "/api/nodes?limit=10000")
-        let e: String = http_get(engram_url_raw + "/api/edges")
-        let nodes_ok: Bool = !str_eq(n, "") && str_starts_with(n, "[") && str_len(n) > 2
-        if nodes_ok {
-            state_set("_boot_nodes_json", n)
-            state_set("_boot_edges_json", e)
-            let fetch_attempt = 3
-        } else {
-            println("[soul] boot HTTP fetch attempt " + int_to_str(fetch_attempt) + " failed --- retrying in 2s")
-            sleep_ms(2000)
-        }
-    }
-    let nodes_json: String = state_get("_boot_nodes_json")
-    let edges_json: String = state_get("_boot_edges_json")
-        let snapshot_data: String = "{\"nodes\":" + nodes_part + ",\"edges\":" + edges_part + "}"
+    let nodes_json: String = http_get(engram_url_raw + "/api/nodes?limit=10000")
+    let edges_json: String = http_get(engram_url_raw + "/api/edges")
+    let nodes_part: String = if str_eq(nodes_json, "") { "[]" } else { nodes_json }
+    let edges_part: String = if str_eq(edges_json, "") { "[]" } else { edges_json }
+    let snapshot_data: String = "{\"nodes\":" + nodes_part + ",\"edges\":" + edges_part + "}"
    let tmp_path: String = "/tmp/soul-engram-" + soul_cgi_id + ".json"
    fs_write(tmp_path, snapshot_data)
    engram_load(tmp_path)