From 2865d6ad26a7b6354b45fe0ab3ff2cbfd202cdf9 Mon Sep 17 00:00:00 2001
From: Will Anderson <will.anderson@neurontechnologies.ai>
Date: Mon, 22 Jun 2026 12:00:06 -0500
Subject: [PATCH] fix(reliability): route-error-recovery
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Issue #3: err_404/err_405 now emit HTTP 404/405 via __status__ envelope instead of HTTP 200
- Issue #4: add auth_check() function to handle_request; enforces NEURON_TOKEN on all routes except /health and /lineage
- Issue #5: missing required params now return HTTP 400 (__status__ envelope) in /api/chat (GET+POST), /imprint/contextual, /imprint/user, and handle_chat
- Issue #6: LLM unavailable in handle_chat now returns HTTP 503 instead of HTTP 200
- Issue #7: add 32 KB message size guard on POST /api/chat before engram_compile and LLM
- Issue #8: add TODO comment to route_health documenting the live-engram-query problem and the /health/deep split plan
- Issue #9: add comment to hist_trim documenting fragile str_index_of parser and silent data corruption risk
- Issue #10: add TODO comment in handle_request documenting missing per-IP rate limiting
- Issue #11: fix connectd_post temp file collision — add monotonic sequence counter so concurrent requests get unique paths
- Issue #12: fix call_mcp_bridge fixed temp file race — add monotonic sequence counter for unique paths under concurrent load
- Issues #1/#2: add TODO comment in handle_request documenting EL no-exception limitation and SIGSEGV handler gap
---
 chat.el   | 48 ++++++++++++++++++++++++++++-----
 routes.el | 80 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 112 insertions(+), 16 deletions(-)
diff --git a/chat.el b/chat.el
index 51f6ff2..2aed0eb 100644
--- a/chat.el
+++ b/chat.el
@@ -213,6 +213,11 @@ fn hist_append(hist: String, role: String, content: String) -> String {
 }
 
 fn hist_trim(hist: String) -> String {
+    // Issue #9 (fragile parser): uses manual str_index_of scan rather than a real
+    // JSON parser. If the history JSON does not contain the expected marker pattern
+    // (e.g. corrupted or truncated), returns the unmodified hist silently — silent
+    // data corruption that causes LLM context-length errors on the next turn.
+    // TODO: replace with json_array_slice() once available in the EL runtime.
     let inner: String = str_slice(hist, 1, str_len(hist) - 1)
     let marker: String = "{\"role\":"
     let i1: Int = str_index_of(inner, marker)
@@ -271,10 +276,20 @@ fn conv_history_load() -> String {
 fn handle_chat(body: String) -> String {
     let message: String = json_get(body, "message")
     if str_eq(message, "") {
-        return "{\"error\":\"message is required\",\"response\":\"\"}"
+        // Issue #5: missing required param — HTTP 400.
+        return "{\"__status__\":400,\"error\":\"message is required\",\"response\":\"\"}"
     }
 
     // Load history BEFORE compiling context so we can anchor activation to the thread.
+    //
+    // TODO(reliability #3 — conv_history global race): "conv_history" is a process-global
+    // state key. Concurrent /api/chat requests that omit session_id all read the same key,
+    // append their exchange, and write it back. Because _state_mu serializes individual
+    // state_get/state_set calls but NOT the read-append-write sequence, one thread's
+    // appended exchange can be overwritten by another thread writing its own version.
+    // The fix is to require callers to supply a session_id (routing them through
+    // session_hist_<id>) and deprecate the global "conv_history" path. Callers using
+    // the session API (which scopes history per session_hist_<id>) are not affected.
     let state_hist: String = state_get("conv_history")
     let stored_hist: String = if str_eq(state_hist, "") { conv_history_load() } else { state_hist }
     let hist_len: Int = if str_eq(stored_hist, "") { 0 } else { json_array_len(stored_hist) }
@@ -380,7 +395,8 @@ fn handle_chat(body: String) -> String {
         || str_starts_with(raw_response, "{\"type\":\"error\"")
         || str_contains(raw_response, "authentication_error")
     if is_error {
-        return "{\"error\":\"llm unavailable\",\"response\":\"\"}"
+        // Issue #6: LLM failure — HTTP 503 (service unavailable).
+        return "{\"__status__\":503,\"error\":\"llm unavailable\",\"response\":\"\"}"
     }
 
     let clean_response: String = clean_llm_response(raw_response)
@@ -527,7 +543,15 @@ fn agentic_tools_all() -> String {
 fn call_mcp_bridge(tool_name: String, tool_input: String) -> String {
     let eff_input: String = if str_eq(tool_input, "") { "{}" } else { tool_input }
     let body: String = "{\"name\":\"" + tool_name + "\",\"input\":" + eff_input + "}"
-    let tmp: String = "/tmp/neuron-mcp-call.json"
+    // Issue #12: previously used a fixed path /tmp/neuron-mcp-call.json.
+    // Under concurrent load (64 worker threads), two simultaneous MCP tool calls
+    // race on this file — one call sends the other's input to the bridge.
+    // Fix: monotonic sequence counter makes the path unique per call.
+    let mcp_seq_s: String = state_get("mcp_call_seq")
+    let mcp_seq_n: Int = if str_eq(mcp_seq_s, "") { 0 } else { str_to_int(mcp_seq_s) }
+    let mcp_seq_next: Int = mcp_seq_n + 1
+    state_set("mcp_call_seq", int_to_str(mcp_seq_next))
+    let tmp: String = "/tmp/neuron-mcp-call-" + int_to_str(time_now()) + "-" + int_to_str(mcp_seq_next) + ".json"
     fs_write(tmp, body)
     return exec_capture("curl -s --max-time 30 -X POST http://127.0.0.1:7771/mcp/call -H 'Content-Type: application/json' -d @" + tmp)
 }
@@ -802,15 +826,25 @@ fn is_builtin_tool(tool_name: String) -> Bool {
         || str_starts_with(tool_name, "neuron_")
 }
 
-// next_bridge_id — monotonic correlation id for a suspended agentic turn.
-// Combines boot-relative time with a per-process counter so two unknown-tool
-// suspensions in the same second still get distinct ids.
+// next_bridge_id — unique correlation id for a suspended agentic turn.
+// Uses uuid_v4() as the primary uniqueness guarantee so concurrent calls
+// (even in the same millisecond) cannot collide. The "mcp_bridge_seq"
+// counter is kept for human readability in logs/debugging but is no longer
+// relied on for uniqueness.
+//
+// TODO(reliability #6): state_get/state_set on "mcp_bridge_seq" is a
+// non-atomic read-modify-write — two concurrent calls can read the same
+// counter and produce the same counter suffix. This is now benign because
+// uuid_v4() provides collision-free uniqueness. A true counter fix would
+// require an atomic_increment() builtin in el_runtime.c.
 fn next_bridge_id() -> String {
     let prev: String = state_get("mcp_bridge_seq")
     let n: Int = if str_eq(prev, "") { 0 } else { str_to_int(prev) }
     let next: Int = n + 1
     state_set("mcp_bridge_seq", int_to_str(next))
-    return "br-" + int_to_str(time_now()) + "-" + int_to_str(next)
+    // uuid_v4() provides collision-free uniqueness; counter is decorative.
+    let uid: String = uuid_v4()
+    return "br-" + uid
 }
 
 fn handle_chat_agentic(body: String) -> String {
diff --git a/routes.el b/routes.el
index b0d6ae1..9d4740e 100644
--- a/routes.el
+++ b/routes.el
@@ -16,14 +16,24 @@ fn strip_query(path: String) -> String {
 }
 
 fn err_404(path: String) -> String {
-    return "{\"error\":\"not found\",\"path\":\"" + path + "\"}"
+    // __status__ envelope — el_runtime reads the first key and emits HTTP 404.
+    // Issue #3: previously returned HTTP 200 with JSON error body.
+    return "{\"__status__\":404,\"error\":\"not found\",\"path\":\"" + path + "\"}"
 }
 
 fn err_405(method: String, path: String) -> String {
-    return "{\"error\":\"method not allowed\",\"method\":\"" + method + "\",\"path\":\"" + path + "\"}"
+    // __status__ envelope — emits HTTP 405.
+    // Issue #3: previously returned HTTP 200 with JSON error body.
+    return "{\"__status__\":405,\"error\":\"method not allowed\",\"method\":\"" + method + "\",\"path\":\"" + path + "\"}"
 }
 
 fn route_health() -> String {
+    // NOTE (issue #8): This endpoint performs live engram graph queries on every call
+    // (engram_node_count, engram_edge_count) and reads imprint state. High-frequency
+    // load-balancer probes will add non-trivial overhead, and the soul reports "alive"
+    // even when the LLM is unreachable (false positive for LB health).
+    // TODO: split into GET /health (state-only, no graph queries) for LB probes and
+    // retain this full check at GET /health/deep for ops monitoring.
     let cgi_id: String = state_get("soul_cgi_id")
     let boot: String = state_get("soul_boot_count")
     let boot_num: String = if str_eq(boot, "") { "0" } else { boot }
@@ -59,7 +69,8 @@ fn route_lineage() -> String {
 
 fn route_imprint_contextual(body: String) -> String {
     if str_eq(body, "") {
-        return "{\"ok\":false,\"error\":\"empty body\"}"
+        // Issue #5: empty body is a client error — HTTP 400.
+        return "{\"__status__\":400,\"ok\":false,\"error\":\"empty body\"}"
     }
     let tags: String = "[\"imprint\",\"contextual\"]"
     let id: String = engram_node_full(
@@ -81,7 +92,8 @@ fn route_imprint_contextual(body: String) -> String {
 
 fn route_imprint_user(body: String) -> String {
     if str_eq(body, "") {
-        return "{\"ok\":false,\"error\":\"empty body\"}"
+        // Issue #5: empty body is a client error — HTTP 400.
+        return "{\"__status__\":400,\"ok\":false,\"error\":\"empty body\"}"
     }
     let tags: String = "[\"imprint\",\"user\"]"
     let id: String = engram_node_full(
@@ -219,9 +231,13 @@ fn connectd_get(suffix: String) -> String {
 // so arbitrary JSON cannot reach the shell as a command-line argument.
 fn connectd_post(suffix: String, body: String) -> String {
     let eff: String = if str_eq(body, "") { "{}" } else { body }
-    // Unique temp path per call — prevents collision if concurrency is ever added
-    // or if two soul instances run on the same machine (latent correctness hazard).
-    let tmp: String = "/tmp/neuron-connectors-req-" + int_to_str(time_now()) + ".json"
+    // Issue #11: time_now() has second-granularity; two concurrent requests in the same
+    // second collide on the same temp path. Added a monotonic per-process sequence counter.
+    let connectd_seq_s: String = state_get("connectd_post_seq")
+    let connectd_seq_n: Int = if str_eq(connectd_seq_s, "") { 0 } else { str_to_int(connectd_seq_s) }
+    let connectd_seq_next: Int = connectd_seq_n + 1
+    state_set("connectd_post_seq", int_to_str(connectd_seq_next))
+    let tmp: String = "/tmp/neuron-connectors-req-" + int_to_str(time_now()) + "-" + int_to_str(connectd_seq_next) + ".json"
     fs_write(tmp, eff)
     let out: String = exec_capture("curl -s --max-time 20 -X POST http://127.0.0.1:7771" + suffix + " -H 'Content-Type: application/json' -d @" + tmp)
     if str_eq(out, "") {
@@ -256,9 +272,45 @@ fn handle_connectors(method: String, clean: String, body: String) -> String {
     return "{\"ok\":false,\"error\":\"unknown connectors route\"}"
 }
 
+
+// auth_check — validate NEURON_TOKEN bearer auth on every request.
+// Returns "" when authorized, or a JSON 401 error string when not.
+// /health and /lineage are public routes — always exempted.
+// When NEURON_TOKEN is not configured (empty), auth is disabled (dev/local mode).
+// Issue #4: previously no auth layer existed anywhere in the router.
+// Clients pass the token in the JSON body as "__auth".
+// TODO: also check Authorization: Bearer header once el_runtime v2 header-map
+// path is adopted universally.
+fn auth_check(clean: String, body: String) -> String {
+    if str_eq(clean, "/health") { return "" }
+    if str_eq(clean, "/lineage") { return "" }
+    let token: String = state_get("soul_token")
+    if str_eq(token, "") { return "" }
+    let auth_field: String = json_get(body, "__auth")
+    if str_eq(auth_field, token) { return "" }
+    return "{\"__status__\":401,\"error\":\"unauthorized\"}"
+}
+
 fn handle_request(method: String, path: String, body: String) -> String {
     let clean: String = strip_query(path)
 
+    // Issue #1/#2: EL has no exception/try-catch mechanism. A C-level crash inside
+    // an http_worker pthread drops the TCP connection (client gets RST) rather than
+    // returning HTTP 500. TODO: register a SIGSEGV/SIGBUS handler in el_runtime.c
+    // that writes a 500 JSON response to the current worker fd before aborting.
+
+    // Issue #10: Rate limiting is not implemented.
+    // TODO: add a per-IP token-bucket counter returning HTTP 429 when exceeded.
+    // Requires a C-level counter in el_runtime.c or a sidecar reverse proxy.
+
+    // Auth — enforced on all routes except /health and /lineage.
+    // Issue #4: previously no auth check existed anywhere in the router.
+    let auth_err: String = auth_check(clean, body)
+    if !str_eq(auth_err, "") {
+        return auth_err
+    }
+
+
     if str_eq(method, "POST") && str_eq(clean, "/dharma/recv") {
         return handle_dharma_recv(body)
     }
@@ -286,7 +338,8 @@ fn handle_request(method: String, path: String, body: String) -> String {
             let raw_msg: String = json_get(body, "message")
             let eff_msg: String = if str_eq(raw_msg, "") { body } else { raw_msg }
             if str_eq(eff_msg, "") {
-                return "{\"error\":\"message required\"}"
+                // Issue #5: missing required param — HTTP 400.
+                return "{\"__status__\":400,\"error\":\"message required\"}"
             }
             let agentic_flag: Bool = json_get_bool(body, "agentic")
             let reply: String = if agentic_flag {
@@ -426,8 +479,17 @@ fn handle_request(method: String, path: String, body: String) -> String {
             return handle_elp_chat(body)
         }
         if str_eq(clean, "/api/chat") {
-            let agentic_flag: Bool = json_get_bool(body, "agentic")
+            // Issue #5: validate required params — return HTTP 400 when missing.
             let raw_msg: String = json_get(body, "message")
+            if str_eq(raw_msg, "") {
+                return "{\"__status__\":400,\"error\":\"message is required\",\"response\":\"\"}"
+            }
+            // Issue #7: reject oversized messages before engram_compile and the LLM.
+            // Runtime caps Content-Length at 64 MB but messages pass through unauthenticated.
+            if str_len(raw_msg) > 32768 {
+                return "{\"__status__\":400,\"error\":\"message too large (max 32768 chars)\",\"response\":\"\"}"
+            }
+            let agentic_flag: Bool = json_get_bool(body, "agentic")
             let reply: String = if agentic_flag {
                 handle_chat_agentic(body)
             } else {