diff --git a/awareness.el b/awareness.el
index 50bf1f6..e309dd7 100644
--- a/awareness.el
+++ b/awareness.el
@@ -43,7 +43,32 @@ fn ise_post(content: String) -> Void {
     let safe3: String = str_replace(safe2, "\n", "\\n")
     let safe4: String = str_replace(safe3, "\r", "\\r")
     let body: String = "{\"content\":\"" + safe4 + "\"}"
-    let discard: String = http_post_json(engram_url + "/api/neuron/state-events", body)
+    // Soft circuit-breaker: skip HTTP call when engram is known-down (30s backoff).
+    // Opens after 3 consecutive failures; half-open probe after backoff expires.
+    // TODO(reliability): full async dispatch requires EL runtime futures support.
+    let cb_open: String = state_get("engram_cb_open")
+    if str_eq(cb_open, "1") {
+        let cb_ts_s: String = state_get("engram_cb_open_ts")
+        let cb_ts: Int = if str_eq(cb_ts_s, "") { 0 } else { str_to_int(cb_ts_s) }
+        let cb_elapsed: Int = time_now() - cb_ts
+        if cb_elapsed < 30000 { return "" }
+        state_set("engram_cb_open", "0")
+    }
+    let resp: String = http_post_json(engram_url + "/api/neuron/state-events", body)
+    let cb_failed: Bool = str_eq(resp, "") || str_starts_with(resp, "{\"error\":")
+    if cb_failed {
+        let fn_s: String = state_get("engram_cb_fails")
+        let fn_n: Int = if str_eq(fn_s, "") { 0 } else { str_to_int(fn_s) }
+        let fn_n = fn_n + 1
+        state_set("engram_cb_fails", int_to_str(fn_n))
+        if fn_n >= 3 {
+            state_set("engram_cb_open", "1")
+            state_set("engram_cb_open_ts", int_to_str(time_now()))
+            println("[awareness] engram circuit-breaker OPEN after " + int_to_str(fn_n) + " failures")
+        }
+    } else {
+        state_set("engram_cb_fails", "0")
+    }
     return ""
 }
 
@@ -543,9 +568,14 @@ fn awareness_run() -> Void {
     let should_refresh: Bool = refresh_elapsed >= refresh_ms
     if should_refresh {
         let engram_url: String = state_get("soul_engram_url")
-        if !str_eq(engram_url, "") {
+        let sc: String = state_get("engram_cb_open")
+        let sc_ts_s: String = state_get("engram_cb_open_ts")
+        let sc_ts: Int = if str_eq(sc_ts_s, "") { 0 } else { str_to_int(sc_ts_s) }
+        let sc_elapsed: Int = now_ts - sc_ts
+        let sync_allowed: Bool = !str_eq(sc, "1") || sc_elapsed >= 30000
+        if !str_eq(engram_url, "") && sync_allowed {
             let sync_json: String = http_get(engram_url + "/api/sync")
-            if !str_eq(sync_json, "") && !str_eq(sync_json, "{}") {
+            if !str_eq(sync_json, "") && !str_eq(sync_json, "{}") && !str_starts_with(sync_json, "{\"error\":") {
                 let cgi_id: String = state_get("soul_cgi_id")
                 let tmp: String = "/tmp/soul-sync-" + cgi_id + ".json"
                 fs_write(tmp, sync_json)
diff --git a/chat.el b/chat.el
index 454f414..67779af 100644
--- a/chat.el
+++ b/chat.el
@@ -571,11 +571,6 @@ fn handle_chat(body: String) -> String {
 
     // ISSUE 9: add safety_augment_system to primary /api/chat path.
     // handle_chat was the only LLM path missing bell directive injection.
-    // Safety augmentation on the main chat path. Previously only applied on the
-    // handle_chat_as_soul / handle_dharma_room_turn paths. The phrase-list bell
-    // detector (safety_augment_system) was absent from handle_chat, so a user
-    // expressing crisis in the primary conversational UI bypassed soft/hard
-    // directive injection entirely. Applying it here before every llm_call_system.
     let full_system = safety_augment_system(full_system, message)
     let raw_response: String = llm_call_system(model, full_system, message)
 
diff --git a/entrypoint.sh b/entrypoint.sh
index 90b0e8c..a2962b3 100644
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -24,19 +24,23 @@ ENGRAM_DATA_DIR="$ENGRAM_DATA_DIR" \
 
 ENGRAM_PID=$!
 
-# Wait for engram to become healthy (up to 30s)
+# Wait for engram to become healthy (up to 60s; GKE Autopilot cold starts can be slow)
 echo "[entrypoint] waiting for engram..."
 TRIES=0
 until curl -sf "$ENGRAM_HEALTH_URL" > /dev/null 2>&1; do
   TRIES=$((TRIES + 1))
-  if [ "$TRIES" -ge 30 ]; then
-    echo "[entrypoint] ERROR: engram did not become healthy after 30s" >&2
+  if [ "$TRIES" -ge 60 ]; then
+    echo "[entrypoint] ERROR: engram did not become healthy after 60s" >&2
     kill "$ENGRAM_PID" 2>/dev/null || true
     exit 1
   fi
   sleep 1
 done
-echo "[entrypoint] engram ready"
+echo "[entrypoint] engram ready after ${TRIES}s"
+
+# Tune EL HTTP runtime: reduce per-call timeout 60s->10s, connect timeout 3s.
+export EL_HTTP_TIMEOUT_MS="${EL_HTTP_TIMEOUT_MS:-10000}"
+export EL_HTTP_CONNECT_TIMEOUT_MS="${EL_HTTP_CONNECT_TIMEOUT_MS:-3000}"
 
 # Start soul — it takes over as PID 1's foreground process.
 # SOUL_ENGRAM_PATH must NOT be set; ENGRAM_URL triggers HTTP mode.
diff --git a/safety.el b/safety.el
index 6dd0fa7..4124f67 100644
--- a/safety.el
+++ b/safety.el
@@ -144,7 +144,8 @@ fn safety_screen(input: String, history: String) -> String {
     if score >= soft {
         let summary: String = str_slice(input, 0, 80)
         let discard: String = safety_log_bell("soft", "wellbeing check needed", summary)
-        // ISSUE 7: also escape tab chars to prevent JSON envelope corruption.
+        // ISSUE 7 fix: escape tab chars in addition to backslash/quote/newline/CR.
+        // A tab in user input corrupts the JSON envelope and causes json_get to misparse.
         let e1: String = str_replace(input, "\\", "\\\\")
         let e2: String = str_replace(e1, "\"", "\\\"")
         let e3: String = str_replace(e2, "\n", "\\n")
@@ -153,7 +154,7 @@ fn safety_screen(input: String, history: String) -> String {
         return "{\"action\":\"soft_bell\",\"reason\":\"wellbeing check needed\",\"content\":\"" + safe_input + "\"}"
     }
 
-    // ISSUE 7: also escape tab chars (see soft_bell branch above).
+    // ISSUE 7 fix: escape tab chars (see soft_bell branch above for rationale).
     let e1: String = str_replace(input, "\\", "\\\\")
     let e2: String = str_replace(e1, "\"", "\\\"")
     let e3: String = str_replace(e2, "\n", "\\n")
@@ -199,7 +200,10 @@ fn safety_validate(output: String, action: String) -> String {
 fn safety_log_bell(level: String, reason: String, input_summary: String) -> String {
     let content: String = "BELL:" + level + " | " + reason + " | summary:" + input_summary
     let tags: String = "[\"safety\",\"bell\",\"bell:" + level + "\"]"
-    // ISSUE 2: fallback log when engram write fails silently.
+    // ISSUE 2 fix: if engram_node_full returns empty the write silently failed.
+    // Emit a fallback println so the bell event leaves at least a log trace even
+    // when engram is degraded. This does not replace engram persistence -- it is a
+    // last-resort audit trail when the primary write cannot be confirmed.
     let node_id: String = engram_node_full(
         content,
         "BellEvent",
@@ -211,7 +215,7 @@ fn safety_log_bell(level: String, reason: String, input_summary: String) -> Stri
         tags
     )
     if str_eq(node_id, "") {
-        println("[safety] WARN: bell engram write failed -- " + content)
+        println("[safety] WARN: bell event engram write failed -- fallback log: " + content)
     }
     return ""
 }
@@ -244,9 +248,11 @@ fn safety_soft_phrases() -> String {
 }
 
 // ISSUE 5 TODO: phrase lists are rebuilt from JSON literals on every call.
-// json_array_len of malformed input returns 0, silently skipping all checks.
-// Caching requires language-level static const arrays -- not in current EL.
-// Migrate to const arrays when EL gains that feature.
+// safety_any_match and safety_count_match loop over json_array_get on every invocation.
+// A compiled/cached representation would reduce per-message overhead and also guard against
+// malformed phrase JSON (json_array_len of malformed input returns 0, silently skipping all checks).
+// Caching requires language-level static const arrays -- not available in current EL.
+// When EL gains module-level const arrays, migrate phrase lists to that form.
 
 // ── Matching helpers (single loops only — el escapes while-body mutation via
 // top-level let rebinds; nested loops would not advance) ────────────────────
diff --git a/soul.el b/soul.el
index 48cd632..4942376 100644
--- a/soul.el
+++ b/soul.el
@@ -305,8 +305,9 @@ fn layered_cycle(raw_input: String) -> String {
     let screen_result: String = safety_screen(raw_input, history)
     let screen_action: String = json_get(screen_result, "action")
 
-    // ISSUE 4: safe-mode guard. If safety_screen returned an invalid/empty action
-    // (engram failure or internal error), refuse rather than pass unscreened input.
+    // ISSUE 4: safe-mode guard -- if safety_screen returned invalid/empty action,
+    // refuse the turn rather than silently passing unscreened input to upper layers.
+    // Valid actions: "hard_bell", "soft_bell", "pass". Anything else = corrupt envelope.
     let valid_action: Bool = str_eq(screen_action, "hard_bell")
         || str_eq(screen_action, "soft_bell")
         || str_eq(screen_action, "pass")
@@ -321,8 +322,8 @@ fn layered_cycle(raw_input: String) -> String {
     // history where they could leak context to subsequent turns. They are persisted
     // separately by safety_log_bell() into the Episodic tier with restricted labels.
     //
-    // ISSUE 6: safety_log_bell already called inside safety_screen (line 140).
-    // Do NOT call it again here -- that would double-log every hard bell.
+    // ISSUE 6: safety_log_bell for hard bells is already called INSIDE safety_screen
+    // (safety.el line 140). Do NOT call it again here -- double-log avoided.
     //
     // safety_validate second param: when screen_action is "hard_bell", safety_validate
     // receives the sentinel string "hard_bell" (not a normal screen action). The safety
@@ -364,13 +365,13 @@ fn layered_cycle(raw_input: String) -> String {
         json_get(steward_result, "redirect_to")
     }
 
-    // ISSUE 1: pre-LLM bell augmentation for layered_cycle path.
-    // safety_augment_system appends soft/hard directive to system prompt when bell fires,
-    // ensuring LLM processes message WITH the safety directive -- not just post-output gate.
-    // Stored in state as "layered_cycle_safety_system_addendum" for imprint_respond to use.
-    // TODO: wire directly when imprint_respond gains system_override param (imprint.el change).
-    // ISSUE 3 TODO: no semantic crisis detection. Keyword-only means signals that evade
-    // the phrase list pass with zero augmentation. Semantic layer = separate decision.
+    // ISSUE 1: apply pre-LLM bell augmentation on layered_cycle path.
+    // safety_augment_system injects soft/hard directive into system prompt before LLM call.
+    // Stored in state so imprint_respond can consume it.
+    // TODO: wire directly into imprint_respond when it accepts a system_override param.
+    // ISSUE 3 TODO: no semantic/embedding crisis detection. Keyword-only means signals
+    // evading the phrase list pass through with zero augmentation. Semantic layer is a
+    // separate architectural decision requiring embedding inference on every message.
     let augmented_addendum: String = safety_augment_system("", raw_input)
     state_set("layered_cycle_safety_system_addendum", augmented_addendum)
 
@@ -413,12 +414,32 @@ let snapshot_usable: Bool = local_node_count > 50
 
 if using_http_engram && !snapshot_usable {
     // First boot or empty/corrupt snapshot: seed from HTTP Engram.
+    // Retry up to 3 times (2s sleep between attempts) to guard against a
+    // transient network hiccup right after entrypoint.sh health check passes.
+    // An empty nodes response silently loads a zero-node graph; validate first.
+    // TODO(reliability): replace sleep_ms retry with non-blocking backoff.
     println("[soul] engram -> HTTP " + engram_url_raw + " (no local snapshot, first boot)")
-    let nodes_json: String = http_get(engram_url_raw + "/api/nodes?limit=10000")
-    let edges_json: String = http_get(engram_url_raw + "/api/edges")
-    let nodes_part: String = if str_eq(nodes_json, "") { "[]" } else { nodes_json }
-    let edges_part: String = if str_eq(edges_json, "") { "[]" } else { edges_json }
-    let snapshot_data: String = "{\"nodes\":" + nodes_part + ",\"edges\":" + edges_part + "}"
+    let fetch_attempt: Int = 0
+    while fetch_attempt < 3 {
+        let fetch_attempt = fetch_attempt + 1
+        let n: String = http_get(engram_url_raw + "/api/nodes?limit=10000")
+        let e: String = http_get(engram_url_raw + "/api/edges")
+        let nodes_ok: Bool = !str_eq(n, "") && str_starts_with(n, "[") && str_len(n) > 2
+        if nodes_ok {
+            state_set("_boot_nodes_json", n)
+            state_set("_boot_edges_json", e)
+            let fetch_attempt = 3
+        } else {
+            println("[soul] boot HTTP fetch attempt " + int_to_str(fetch_attempt) + " failed --- retrying in 2s")
+            sleep_ms(2000)
+        }
+    }
+    let nodes_json: String = state_get("_boot_nodes_json")
+    let edges_json: String = state_get("_boot_edges_json")
+    // Fall back to empty arrays so the snapshot stays valid JSON when every attempt failed.
+    let nodes_part: String = if str_eq(nodes_json, "") { "[]" } else { nodes_json }
+    let edges_part: String = if str_eq(edges_json, "") { "[]" } else { edges_json }
+    let snapshot_data: String = "{\"nodes\":" + nodes_part + ",\"edges\":" + edges_part + "}"
     let tmp_path: String = "/tmp/soul-engram-" + soul_cgi_id + ".json"
     fs_write(tmp_path, snapshot_data)
     engram_load(tmp_path)