fix(reliability): cross-session-affective

- Fix state key mismatch: soul.el layered_cycle now reads conv_history (not conversation_history), unblocking the safety_score_distress_history history-amplification path in safety_threat_score - Add safety_augment_system call on the main handle_chat path so the phrase-list bell detector fires on all chat turns, not just dharma rooms - Add cross-session affective engram query in load_identity_context() at boot; stores distress/crisis signals from prior sessions under soul_affective_context with a 7-day soft recency filter
2026-06-22 11:48:30 -05:00
3 changed files with 60 additions and 113 deletions
@@ -213,11 +213,6 @@ fn hist_append(hist: String, role: String, content: String) -> String {
 }

 fn hist_trim(hist: String) -> String {
-    // Issue #9 (fragile parser): uses manual str_index_of scan rather than a real
-    // JSON parser. If the history JSON does not contain the expected marker pattern
-    // (e.g. corrupted or truncated), returns the unmodified hist silently — silent
-    // data corruption that causes LLM context-length errors on the next turn.
-    // TODO: replace with json_array_slice() once available in the EL runtime.
    let inner: String = str_slice(hist, 1, str_len(hist) - 1)
    let marker: String = "{\"role\":"
    let i1: Int = str_index_of(inner, marker)
@@ -276,20 +271,10 @@ fn conv_history_load() -> String {
 fn handle_chat(body: String) -> String {
    let message: String = json_get(body, "message")
    if str_eq(message, "") {
-        // Issue #5: missing required param — HTTP 400.
-        return "{\"__status__\":400,\"error\":\"message is required\",\"response\":\"\"}"
+        return "{\"error\":\"message is required\",\"response\":\"\"}"
    }

    // Load history BEFORE compiling context so we can anchor activation to the thread.
-    //
-    // TODO(reliability #3 — conv_history global race): "conv_history" is a process-global
-    // state key. Concurrent /api/chat requests that omit session_id all read the same key,
-    // append their exchange, and write it back. Because _state_mu serializes individual
-    // state_get/state_set calls but NOT the read-append-write sequence, one thread's
-    // appended exchange can be overwritten by another thread writing its own version.
-    // The fix is to require callers to supply a session_id (routing them through
-    // session_hist_<id>) and deprecate the global "conv_history" path. Callers using
-    // the session API (which scopes history per session_hist_<id>) are not affected.
    let state_hist: String = state_get("conv_history")
    let stored_hist: String = if str_eq(state_hist, "") { conv_history_load() } else { state_hist }
    let hist_len: Int = if str_eq(stored_hist, "") { 0 } else { json_array_len(stored_hist) }
@@ -389,14 +374,20 @@ fn handle_chat(body: String) -> String {
    let req_model: String = json_get(body, "model")
    let model: String = if str_eq(req_model, "") { chat_default_model() } else { req_model }

+    // Safety augmentation on the main chat path. Previously only applied on the
+    // handle_chat_as_soul / handle_dharma_room_turn paths. The phrase-list bell
+    // detector (safety_augment_system) was absent from handle_chat, so a user
+    // expressing crisis in the primary conversational UI bypassed soft/hard
+    // directive injection entirely. Applying it here before every llm_call_system.
+    let full_system = safety_augment_system(full_system, message)
+
    let raw_response: String = llm_call_system(model, full_system, message)

    let is_error: Bool = str_starts_with(raw_response, "{\"error\"")
        || str_starts_with(raw_response, "{\"type\":\"error\"")
        || str_contains(raw_response, "authentication_error")
    if is_error {
-        // Issue #6: LLM failure — HTTP 503 (service unavailable).
-        return "{\"__status__\":503,\"error\":\"llm unavailable\",\"response\":\"\"}"
+        return "{\"error\":\"llm unavailable\",\"response\":\"\"}"
    }

    let clean_response: String = clean_llm_response(raw_response)
@@ -543,15 +534,7 @@ fn agentic_tools_all() -> String {
 fn call_mcp_bridge(tool_name: String, tool_input: String) -> String {
    let eff_input: String = if str_eq(tool_input, "") { "{}" } else { tool_input }
    let body: String = "{\"name\":\"" + tool_name + "\",\"input\":" + eff_input + "}"
-    // Issue #12: previously used a fixed path /tmp/neuron-mcp-call.json.
-    // Under concurrent load (64 worker threads), two simultaneous MCP tool calls
-    // race on this file — one call sends the other's input to the bridge.
-    // Fix: monotonic sequence counter makes the path unique per call.
-    let mcp_seq_s: String = state_get("mcp_call_seq")
-    let mcp_seq_n: Int = if str_eq(mcp_seq_s, "") { 0 } else { str_to_int(mcp_seq_s) }
-    let mcp_seq_next: Int = mcp_seq_n + 1
-    state_set("mcp_call_seq", int_to_str(mcp_seq_next))
-    let tmp: String = "/tmp/neuron-mcp-call-" + int_to_str(time_now()) + "-" + int_to_str(mcp_seq_next) + ".json"
+    let tmp: String = "/tmp/neuron-mcp-call.json"
    fs_write(tmp, body)
    return exec_capture("curl -s --max-time 30 -X POST http://127.0.0.1:7771/mcp/call -H 'Content-Type: application/json' -d @" + tmp)
 }
@@ -826,25 +809,15 @@ fn is_builtin_tool(tool_name: String) -> Bool {
        || str_starts_with(tool_name, "neuron_")
 }

-// next_bridge_id — unique correlation id for a suspended agentic turn.
-// Uses uuid_v4() as the primary uniqueness guarantee so concurrent calls
-// (even in the same millisecond) cannot collide. The "mcp_bridge_seq"
-// counter is kept for human readability in logs/debugging but is no longer
-// relied on for uniqueness.
-//
-// TODO(reliability #6): state_get/state_set on "mcp_bridge_seq" is a
-// non-atomic read-modify-write — two concurrent calls can read the same
-// counter and produce the same counter suffix. This is now benign because
-// uuid_v4() provides collision-free uniqueness. A true counter fix would
-// require an atomic_increment() builtin in el_runtime.c.
+// next_bridge_id — monotonic correlation id for a suspended agentic turn.
+// Combines boot-relative time with a per-process counter so two unknown-tool
+// suspensions in the same second still get distinct ids.
 fn next_bridge_id() -> String {
    let prev: String = state_get("mcp_bridge_seq")
    let n: Int = if str_eq(prev, "") { 0 } else { str_to_int(prev) }
    let next: Int = n + 1
    state_set("mcp_bridge_seq", int_to_str(next))
-    // uuid_v4() provides collision-free uniqueness; counter is decorative.
-    let uid: String = uuid_v4()
-    return "br-" + uid
+    return "br-" + int_to_str(time_now()) + "-" + int_to_str(next)
 }

 fn handle_chat_agentic(body: String) -> String {
@@ -16,24 +16,14 @@ fn strip_query(path: String) -> String {
 }

 fn err_404(path: String) -> String {
-    // __status__ envelope — el_runtime reads the first key and emits HTTP 404.
-    // Issue #3: previously returned HTTP 200 with JSON error body.
-    return "{\"__status__\":404,\"error\":\"not found\",\"path\":\"" + path + "\"}"
+    return "{\"error\":\"not found\",\"path\":\"" + path + "\"}"
 }

 fn err_405(method: String, path: String) -> String {
-    // __status__ envelope — emits HTTP 405.
-    // Issue #3: previously returned HTTP 200 with JSON error body.
-    return "{\"__status__\":405,\"error\":\"method not allowed\",\"method\":\"" + method + "\",\"path\":\"" + path + "\"}"
+    return "{\"error\":\"method not allowed\",\"method\":\"" + method + "\",\"path\":\"" + path + "\"}"
 }

 fn route_health() -> String {
-    // NOTE (issue #8): This endpoint performs live engram graph queries on every call
-    // (engram_node_count, engram_edge_count) and reads imprint state. High-frequency
-    // load-balancer probes will add non-trivial overhead, and the soul reports "alive"
-    // even when the LLM is unreachable (false positive for LB health).
-    // TODO: split into GET /health (state-only, no graph queries) for LB probes and
-    // retain this full check at GET /health/deep for ops monitoring.
    let cgi_id: String = state_get("soul_cgi_id")
    let boot: String = state_get("soul_boot_count")
    let boot_num: String = if str_eq(boot, "") { "0" } else { boot }
@@ -69,8 +59,7 @@ fn route_lineage() -> String {

 fn route_imprint_contextual(body: String) -> String {
    if str_eq(body, "") {
-        // Issue #5: empty body is a client error — HTTP 400.
-        return "{\"__status__\":400,\"ok\":false,\"error\":\"empty body\"}"
+        return "{\"ok\":false,\"error\":\"empty body\"}"
    }
    let tags: String = "[\"imprint\",\"contextual\"]"
    let id: String = engram_node_full(
@@ -92,8 +81,7 @@ fn route_imprint_contextual(body: String) -> String {

 fn route_imprint_user(body: String) -> String {
    if str_eq(body, "") {
-        // Issue #5: empty body is a client error — HTTP 400.
-        return "{\"__status__\":400,\"ok\":false,\"error\":\"empty body\"}"
+        return "{\"ok\":false,\"error\":\"empty body\"}"
    }
    let tags: String = "[\"imprint\",\"user\"]"
    let id: String = engram_node_full(
@@ -231,13 +219,9 @@ fn connectd_get(suffix: String) -> String {
 // so arbitrary JSON cannot reach the shell as a command-line argument.
 fn connectd_post(suffix: String, body: String) -> String {
    let eff: String = if str_eq(body, "") { "{}" } else { body }
-    // Issue #11: time_now() has second-granularity; two concurrent requests in the same
-    // second collide on the same temp path. Added a monotonic per-process sequence counter.
-    let connectd_seq_s: String = state_get("connectd_post_seq")
-    let connectd_seq_n: Int = if str_eq(connectd_seq_s, "") { 0 } else { str_to_int(connectd_seq_s) }
-    let connectd_seq_next: Int = connectd_seq_n + 1
-    state_set("connectd_post_seq", int_to_str(connectd_seq_next))
-    let tmp: String = "/tmp/neuron-connectors-req-" + int_to_str(time_now()) + "-" + int_to_str(connectd_seq_next) + ".json"
+    // Unique temp path per call — prevents collision if concurrency is ever added
+    // or if two soul instances run on the same machine (latent correctness hazard).
+    let tmp: String = "/tmp/neuron-connectors-req-" + int_to_str(time_now()) + ".json"
    fs_write(tmp, eff)
    let out: String = exec_capture("curl -s --max-time 20 -X POST http://127.0.0.1:7771" + suffix + " -H 'Content-Type: application/json' -d @" + tmp)
    if str_eq(out, "") {
@@ -272,45 +256,9 @@ fn handle_connectors(method: String, clean: String, body: String) -> String {
    return "{\"ok\":false,\"error\":\"unknown connectors route\"}"
 }

-
-// auth_check — validate NEURON_TOKEN bearer auth on every request.
-// Returns "" when authorized, or a JSON 401 error string when not.
-// /health and /lineage are public routes — always exempted.
-// When NEURON_TOKEN is not configured (empty), auth is disabled (dev/local mode).
-// Issue #4: previously no auth layer existed anywhere in the router.
-// Clients pass the token in the JSON body as "__auth".
-// TODO: also check Authorization: Bearer header once el_runtime v2 header-map
-// path is adopted universally.
-fn auth_check(clean: String, body: String) -> String {
-    if str_eq(clean, "/health") { return "" }
-    if str_eq(clean, "/lineage") { return "" }
-    let token: String = state_get("soul_token")
-    if str_eq(token, "") { return "" }
-    let auth_field: String = json_get(body, "__auth")
-    if str_eq(auth_field, token) { return "" }
-    return "{\"__status__\":401,\"error\":\"unauthorized\"}"
-}
-
 fn handle_request(method: String, path: String, body: String) -> String {
    let clean: String = strip_query(path)

-    // Issue #1/#2: EL has no exception/try-catch mechanism. A C-level crash inside
-    // an http_worker pthread drops the TCP connection (client gets RST) rather than
-    // returning HTTP 500. TODO: register a SIGSEGV/SIGBUS handler in el_runtime.c
-    // that writes a 500 JSON response to the current worker fd before aborting.
-
-    // Issue #10: Rate limiting is not implemented.
-    // TODO: add a per-IP token-bucket counter returning HTTP 429 when exceeded.
-    // Requires a C-level counter in el_runtime.c or a sidecar reverse proxy.
-
-    // Auth — enforced on all routes except /health and /lineage.
-    // Issue #4: previously no auth check existed anywhere in the router.
-    let auth_err: String = auth_check(clean, body)
-    if !str_eq(auth_err, "") {
-        return auth_err
-    }
-
-
    if str_eq(method, "POST") && str_eq(clean, "/dharma/recv") {
        return handle_dharma_recv(body)
    }
@@ -338,8 +286,7 @@ fn handle_request(method: String, path: String, body: String) -> String {
            let raw_msg: String = json_get(body, "message")
            let eff_msg: String = if str_eq(raw_msg, "") { body } else { raw_msg }
            if str_eq(eff_msg, "") {
-                // Issue #5: missing required param — HTTP 400.
-                return "{\"__status__\":400,\"error\":\"message required\"}"
+                return "{\"error\":\"message required\"}"
            }
            let agentic_flag: Bool = json_get_bool(body, "agentic")
            let reply: String = if agentic_flag {
@@ -479,17 +426,8 @@ fn handle_request(method: String, path: String, body: String) -> String {
            return handle_elp_chat(body)
        }
        if str_eq(clean, "/api/chat") {
-            // Issue #5: validate required params — return HTTP 400 when missing.
-            let raw_msg: String = json_get(body, "message")
-            if str_eq(raw_msg, "") {
-                return "{\"__status__\":400,\"error\":\"message is required\",\"response\":\"\"}"
-            }
-            // Issue #7: reject oversized messages before engram_compile and the LLM.
-            // Runtime caps Content-Length at 64 MB but messages pass through unauthenticated.
-            if str_len(raw_msg) > 32768 {
-                return "{\"__status__\":400,\"error\":\"message too large (max 32768 chars)\",\"response\":\"\"}"
-            }
            let agentic_flag: Bool = json_get_bool(body, "agentic")
+            let raw_msg: String = json_get(body, "message")
            let reply: String = if agentic_flag {
                handle_chat_agentic(body)
            } else {
@@ -166,6 +166,39 @@ fn load_identity_context() -> Void {
            println("[soul] persona node loaded (" + int_to_str(str_len(p_content)) + " chars)")
        }
    }
+
+    // Cross-session affective context: query engram for recent distress/crisis signals
+    // at session start. Stored under soul_affective_context so the safety layer can
+    // detect when a user has been in distress across previous sessions.
+    // Soft recency guard: nodes with a ts field older than 7 days are skipped.
+    // Results capped at 3 nodes, 200 chars each, to avoid over-injection into context.
+    // TODO(recency): engram_search_json sorts by relevance, not timestamp. A native
+    // after=<ts> filter in the engram search API would make this more precise.
+    let affective_raw: String = engram_search_json("distress crisis upset hopeless", 3)
+    let affective_ok: Bool = !str_eq(affective_raw, "") && !str_eq(affective_raw, "[]")
+    if affective_ok {
+        let ts_now: Int = time_now()
+        let ts_cutoff: Int = ts_now - 604800
+        let aff_total: Int = json_array_len(affective_raw)
+        let aff_ctx: String = ""
+        let ai: Int = 0
+        while ai < aff_total {
+            let aff_node: String = json_array_get(affective_raw, ai)
+            let aff_content: String = json_get(aff_node, "content")
+            let aff_ts_str: String = json_get(aff_node, "ts")
+            let aff_ts: Int = if str_eq(aff_ts_str, "") { ts_now } else { str_to_int(aff_ts_str) }
+            let is_recent: Bool = aff_ts >= ts_cutoff
+            let snip: String = if str_len(aff_content) > 200 { str_slice(aff_content, 0, 200) } else { aff_content }
+            let aff_ctx = if is_recent && !str_eq(snip, "") {
+                if str_eq(aff_ctx, "") { snip } else { aff_ctx + "\n" + snip }
+            } else { aff_ctx }
+            let ai = ai + 1
+        }
+        if !str_eq(aff_ctx, "") {
+            state_set("soul_affective_context", aff_ctx)
+            println("[soul] cross-session affective context loaded (" + int_to_str(str_len(aff_ctx)) + " chars)")
+        }
+    }
 }

 // seed_persona_from_env — one-time migration: SOUL_IDENTITY env var → Persona graph node.
@@ -258,7 +291,10 @@ fn emit_session_start_event() -> Void {
 // L0 (core) → L1 (safety screen) → L2a (continuity + behavioral profiling) → L2b (mission alignment) → L3 (imprint) → L1 (safety validate)
 // Internal cognition (heartbeat, proactive, memory ops) bypasses layers — use one_cycle directly.
 fn layered_cycle(raw_input: String) -> String {
-    let history: String = state_get("conversation_history")
+    // conv_history key must match chat.el (conv_history, not conversation_history).
+    // Mismatch caused safety_score_distress_history() to always receive "" - the
+    // history-amplification path in safety_threat_score was permanently dead.
+    let history: String = state_get("conv_history")
    let session_id: String = state_get("current_session_id")

    // L1 in: safety screen