From aa70c5dde659d62c2957271270b6e5195e42487b Mon Sep 17 00:00:00 2001 From: Will Anderson Date: Mon, 22 Jun 2026 11:53:07 -0500 Subject: [PATCH 1/2] =?UTF-8?q?fix(reliability):=20safety-resilience=20?= =?UTF-8?q?=E2=80=94=20bell=20augmentation,=20safe=20mode,=20dedup=20loggi?= =?UTF-8?q?ng,=20tab=20escaping,=20handle=5Fchat=20coverage?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- chat.el | 4 ++++ safety.el | 29 ++++++++++++++++++++++++++--- soul.el | 29 ++++++++++++++++++++++++----- 3 files changed, 54 insertions(+), 8 deletions(-) diff --git a/chat.el b/chat.el index 913259d..18abb69 100644 --- a/chat.el +++ b/chat.el @@ -186,6 +186,10 @@ fn handle_chat(body: String) -> String { let req_model: String = json_get(body, "model") let model: String = if str_eq(req_model, "") { chat_default_model() } else { req_model } + // ISSUE 9: add safety_augment_system to primary /api/chat path. + // handle_chat was the only LLM path missing bell directive injection. + let full_system = safety_augment_system(full_system, message) + let raw_response: String = llm_call_system(model, full_system, message) let is_error: Bool = str_starts_with(raw_response, "{\"error\"") diff --git a/safety.el b/safety.el index fcabd72..ef01f1b 100644 --- a/safety.el +++ b/safety.el @@ -144,17 +144,22 @@ fn safety_screen(input: String, history: String) -> String { if score >= soft { let summary: String = str_slice(input, 0, 80) let discard: String = safety_log_bell("soft", "wellbeing check needed", summary) + // ISSUE 7 fix: escape tab chars in addition to backslash/quote/newline/CR. + // A tab in user input corrupts the JSON envelope and causes json_get to misparse. 
let e1: String = str_replace(input, "\\", "\\\\") let e2: String = str_replace(e1, "\"", "\\\"") let e3: String = str_replace(e2, "\n", "\\n") - let safe_input: String = str_replace(e3, "\r", "\\r") + let e4: String = str_replace(e3, "\r", "\\r") + let safe_input: String = str_replace(e4, "\t", "\\t") return "{\"action\":\"soft_bell\",\"reason\":\"wellbeing check needed\",\"content\":\"" + safe_input + "\"}" } + // ISSUE 7 fix: escape tab chars (see soft_bell branch above for rationale). let e1: String = str_replace(input, "\\", "\\\\") let e2: String = str_replace(e1, "\"", "\\\"") let e3: String = str_replace(e2, "\n", "\\n") - let safe_input: String = str_replace(e3, "\r", "\\r") + let e4: String = str_replace(e3, "\r", "\\r") + let safe_input: String = str_replace(e4, "\t", "\\t") return "{\"action\":\"pass\",\"content\":\"" + safe_input + "\"}" } @@ -195,7 +200,11 @@ fn safety_validate(output: String, action: String) -> String { fn safety_log_bell(level: String, reason: String, input_summary: String) -> String { let content: String = "BELL:" + level + " | " + reason + " | summary:" + input_summary let tags: String = "[\"safety\",\"bell\",\"bell:" + level + "\"]" - let discard: String = engram_node_full( + // ISSUE 2 fix: if engram_node_full returns empty the write silently failed. + // Emit a fallback println so the bell event leaves at least a log trace even + // when engram is degraded. This does not replace engram persistence -- it is a + // last-resort audit trail when the primary write cannot be confirmed. 
+ let node_id: String = engram_node_full( content, "BellEvent", "bell:" + level, @@ -205,6 +214,9 @@ fn safety_log_bell(level: String, reason: String, input_summary: String) -> Stri "Episodic", tags ) + if str_eq(node_id, "") { + println("[safety] WARN: bell event engram write failed -- fallback log: " + content) + } return "" } @@ -235,6 +247,12 @@ fn safety_soft_phrases() -> String { return "[\"stressed\",\"overwhelmed\",\"can't cope\",\"cannot cope\",\"struggling\",\"anxious\",\"anxiety\",\"depressed\",\"depression\",\"lonely\",\"isolated\",\"hopeless\",\"hopelessness\",\"exhausted\",\"burnt out\",\"burned out\",\"burnout\",\"panic\",\"panicking\",\"falling apart\",\"breaking down\",\"can't handle\",\"cannot handle\",\"losing it\",\"nothing matters\",\"don't care anymore\",\"given up\",\"giving up\",\"helpless\",\"worthless\",\"useless\",\"hate myself\",\"no one cares\",\"nobody cares\",\"no one understands\",\"nobody understands\",\"empty inside\",\"can't stop crying\",\"breaking point\",\"at my limit\",\"having a breakdown\"]" } +// ISSUE 5 TODO: phrase lists are rebuilt from JSON literals on every call. +// safety_any_match and safety_count_match loop over json_array_get on every invocation. +// A compiled/cached representation would reduce per-message overhead and also guard against +// malformed phrase JSON (json_array_len of malformed input returns 0, silently skipping all checks). +// Caching requires language-level static const arrays -- not available in current EL. +// When EL gains module-level const arrays, migrate phrase lists to that form.
// ── Matching helpers (single loops only — el escapes while-body mutation via // top-level let rebinds; nested loops would not advance) ──────────────────── diff --git a/soul.el b/soul.el index 0147f2a..e224672 100644 --- a/soul.el +++ b/soul.el @@ -5,13 +5,9 @@ import "stewardship.el" import "imprint.el" import "awareness.el" import "chat.el" -import "safety.el" import "studio.el" import "elp-input.el" import "routes.el" -import "safety.el" -import "stewardship.el" -import "imprint.el" cgi "neuron-soul" { dharma_id: "ntn-genesis@http://localhost:7770", @@ -265,19 +261,32 @@ fn layered_cycle(raw_input: String) -> String { let screen_result: String = safety_screen(raw_input, history) let screen_action: String = json_get(screen_result, "action") + // ISSUE 4: safe-mode guard -- if safety_screen returned invalid/empty action, + // refuse the turn rather than silently passing unscreened input to upper layers. + // Valid actions: "hard_bell", "soft_bell", "pass". Anything else = corrupt envelope. + let valid_action: Bool = str_eq(screen_action, "hard_bell") + || str_eq(screen_action, "soft_bell") + || str_eq(screen_action, "pass") + if !valid_action { + println("[soul] layered_cycle: safety_screen invalid action -- safe mode refusal") + return safety_validate("", "hard_bell") + } + // Hard bell: bypass all upper layers, log and escalate. // Intentionally does NOT update conversation_history or call auto_persist(): // hard bell events are security-sensitive and must not appear in engram conversation // history where they could leak context to subsequent turns. They are persisted // separately by safety_log_bell() into the Episodic tier with restricted labels. // + // ISSUE 6: safety_log_bell for hard bells is already called INSIDE safety_screen + // (safety.el line 140). Do NOT call it again here -- double-log avoided. 
+ // // safety_validate second param: when screen_action is "hard_bell", safety_validate // receives the sentinel string "hard_bell" (not a normal screen action). The safety // layer contract requires it to return a fixed refusal regardless of the output arg. // On the normal path, safety_validate receives the original screen_action ("pass") // so it can apply action-specific post-output checks. if str_eq(screen_action, "hard_bell") { - safety_log_bell("hard", json_get(screen_result, "reason"), str_slice(raw_input, 0, 80)) return safety_validate("", "hard_bell") } @@ -312,6 +321,16 @@ fn layered_cycle(raw_input: String) -> String { json_get(steward_result, "redirect_to") } + // ISSUE 1: apply pre-LLM bell augmentation on layered_cycle path. + // safety_augment_system injects soft/hard directive into system prompt before LLM call. + // Stored in state so imprint_respond can consume it. + // TODO: wire directly into imprint_respond when it accepts a system_override param. + // ISSUE 3 TODO: no semantic/embedding crisis detection. Keyword-only means signals + // evading the phrase list pass through with zero augmentation. Semantic layer is a + // separate architectural decision requiring embedding inference on every message. 
+ let augmented_addendum: String = safety_augment_system("", raw_input) + state_set("layered_cycle_safety_system_addendum", augmented_addendum) + // L3: imprint responds let output: String = imprint_respond(aligned, imprint_id) From d008649c3ebbf1fab5bbbe63f1fda99a440b3a03 Mon Sep 17 00:00:00 2001 From: Will Anderson Date: Mon, 22 Jun 2026 11:57:20 -0500 Subject: [PATCH 2/2] fix(reliability): engram-connection - entrypoint.sh: extend engram health-check timeout 30->60s; set EL_HTTP_TIMEOUT_MS=10000 and EL_HTTP_CONNECT_TIMEOUT_MS=3000 to bound awareness loop blocking window to 10s/call (down from 60s default) - soul.el: 3-attempt retry loop for boot-time /api/nodes+/api/edges fetch; validate non-empty JSON array before loading to prevent silent zero-node identity graph from transient post-healthcheck network hiccup - awareness.el: soft circuit-breaker in ise_post (opens after 3 failures, 30s backoff, half-open probe); /api/sync refresh skips HTTP call when breaker is open; error-JSON detection on sync response TODOs: full async dispatch, connection pooling (require EL futures/persistent curl) --- awareness.el | 36 +++++++++++++++++++++++++++++++++--- entrypoint.sh | 12 ++++++++---- soul.el | 27 ++++++++++++++++++++++----- 3 files changed, 63 insertions(+), 12 deletions(-) diff --git a/awareness.el b/awareness.el index a3a5432..9f833e2 100644 --- a/awareness.el +++ b/awareness.el @@ -40,7 +40,32 @@ fn ise_post(content: String) -> Void { let safe3: String = str_replace(safe2, "\n", "\\n") let safe4: String = str_replace(safe3, "\r", "\\r") let body: String = "{\"content\":\"" + safe4 + "\"}" - let discard: String = http_post_json(engram_url + "/api/neuron/state-events", body) + // Soft circuit-breaker: skip HTTP call when engram is known-down (30s backoff). + // Opens after 3 consecutive failures; half-open probe after backoff expires. + // TODO(reliability): full async dispatch requires EL runtime futures support. 
+ let cb_open: String = state_get("engram_cb_open") + if str_eq(cb_open, "1") { + let cb_ts_s: String = state_get("engram_cb_open_ts") + let cb_ts: Int = if str_eq(cb_ts_s, "") { 0 } else { str_to_int(cb_ts_s) } + let cb_elapsed: Int = time_now() - cb_ts + if cb_elapsed < 30000 { return "" } + state_set("engram_cb_open", "0") + } + let resp: String = http_post_json(engram_url + "/api/neuron/state-events", body) + let cb_failed: Bool = str_eq(resp, "") || str_starts_with(resp, "{\"error\":") + if cb_failed { + let fn_s: String = state_get("engram_cb_fails") + let fn_n: Int = if str_eq(fn_s, "") { 0 } else { str_to_int(fn_s) } + let fn_n = fn_n + 1 + state_set("engram_cb_fails", int_to_str(fn_n)) + if fn_n >= 3 { + state_set("engram_cb_open", "1") + state_set("engram_cb_open_ts", int_to_str(time_now())) + println("[awareness] engram circuit-breaker OPEN after " + int_to_str(fn_n) + " failures") + } + } else { + state_set("engram_cb_fails", "0") + } return "" } @@ -540,9 +565,14 @@ fn awareness_run() -> Void { let should_refresh: Bool = refresh_elapsed >= refresh_ms if should_refresh { let engram_url: String = state_get("soul_engram_url") - if !str_eq(engram_url, "") { + let sc: String = state_get("engram_cb_open") + let sc_ts_s: String = state_get("engram_cb_open_ts") + let sc_ts: Int = if str_eq(sc_ts_s, "") { 0 } else { str_to_int(sc_ts_s) } + let sc_elapsed: Int = now_ts - sc_ts + let sync_allowed: Bool = !str_eq(sc, "1") || sc_elapsed >= 30000 + if !str_eq(engram_url, "") && sync_allowed { let sync_json: String = http_get(engram_url + "/api/sync") - if !str_eq(sync_json, "") && !str_eq(sync_json, "{}") { + if !str_eq(sync_json, "") && !str_eq(sync_json, "{}") && !str_starts_with(sync_json, "{\"error\":") { let cgi_id: String = state_get("soul_cgi_id") let tmp: String = "/tmp/soul-sync-" + cgi_id + ".json" fs_write(tmp, sync_json) diff --git a/entrypoint.sh b/entrypoint.sh index 90b0e8c..a2962b3 100644 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -24,19 +24,23 @@
ENGRAM_DATA_DIR="$ENGRAM_DATA_DIR" \ ENGRAM_PID=$! -# Wait for engram to become healthy (up to 30s) +# Wait for engram to become healthy (up to 60s; GKE Autopilot cold starts can be slow) echo "[entrypoint] waiting for engram..." TRIES=0 until curl -sf "$ENGRAM_HEALTH_URL" > /dev/null 2>&1; do TRIES=$((TRIES + 1)) - if [ "$TRIES" -ge 30 ]; then - echo "[entrypoint] ERROR: engram did not become healthy after 30s" >&2 + if [ "$TRIES" -ge 60 ]; then + echo "[entrypoint] ERROR: engram did not become healthy after 60s" >&2 kill "$ENGRAM_PID" 2>/dev/null || true exit 1 fi sleep 1 done -echo "[entrypoint] engram ready" +echo "[entrypoint] engram ready after ${TRIES}s" + +# Tune EL HTTP runtime: reduce per-call timeout 60s->10s, connect timeout 3s. +export EL_HTTP_TIMEOUT_MS="${EL_HTTP_TIMEOUT_MS:-10000}" +export EL_HTTP_CONNECT_TIMEOUT_MS="${EL_HTTP_CONNECT_TIMEOUT_MS:-3000}" # Start soul — it takes over as PID 1's foreground process. # SOUL_ENGRAM_PATH must NOT be set; ENGRAM_URL triggers HTTP mode. diff --git a/soul.el b/soul.el index e224672..bdb177b 100644 --- a/soul.el +++ b/soul.el @@ -370,12 +370,32 @@ let snapshot_usable: Bool = local_node_count > 50 if using_http_engram && !snapshot_usable { // First boot or empty/corrupt snapshot: seed from HTTP Engram. + // Retry up to 3 times (2s sleep between attempts) to guard against a + // transient network hiccup right after entrypoint.sh health check passes. + // An empty nodes response silently loads a zero-node graph; validate first. + // TODO(reliability): replace sleep_ms retry with non-blocking backoff.
println("[soul] engram -> HTTP " + engram_url_raw + " (no local snapshot, first boot)") - let nodes_json: String = http_get(engram_url_raw + "/api/nodes?limit=10000") - let edges_json: String = http_get(engram_url_raw + "/api/edges") - let nodes_part: String = if str_eq(nodes_json, "") { "[]" } else { nodes_json } - let edges_part: String = if str_eq(edges_json, "") { "[]" } else { edges_json } - let snapshot_data: String = "{\"nodes\":" + nodes_part + ",\"edges\":" + edges_part + "}" + let fetch_attempt: Int = 0 + while fetch_attempt < 3 { + let fetch_attempt = fetch_attempt + 1 + let n: String = http_get(engram_url_raw + "/api/nodes?limit=10000") + let e: String = http_get(engram_url_raw + "/api/edges") + let nodes_ok: Bool = !str_eq(n, "") && str_starts_with(n, "[") && str_len(n) > 2 + if nodes_ok { + state_set("_boot_nodes_json", n) + state_set("_boot_edges_json", e) + let fetch_attempt = 3 + } else { + println("[soul] boot HTTP fetch attempt " + int_to_str(fetch_attempt) + " failed --- retrying in 2s") + sleep_ms(2000) + } + } + let nodes_json: String = state_get("_boot_nodes_json") + let edges_json: String = state_get("_boot_edges_json") + // Substitute "[]" for an empty fetch result so the snapshot written below stays valid JSON. + let nodes_part: String = if str_eq(nodes_json, "") { "[]" } else { nodes_json } + let edges_part: String = if str_eq(edges_json, "") { "[]" } else { edges_json } + let snapshot_data: String = "{\"nodes\":" + nodes_part + ",\"edges\":" + edges_part + "}" let tmp_path: String = "/tmp/soul-engram-" + soul_cgi_id + ".json" fs_write(tmp_path, snapshot_data) engram_load(tmp_path)