fix(reliability): route error recovery
This commit is contained in:
@@ -266,6 +266,8 @@ fn hist_append(hist: String, role: String, content: String) -> String {
|
||||
// content containing marker strings. Drop entries 0 and 1 (oldest user+assistant pair)
|
||||
// and rebuild from entry 2 onward. Minimum retained count: 2 entries (never over-trim).
|
||||
fn hist_trim(hist: String) -> String {
|
||||
// Issue #9 fix: use json_array_len/json_array_get instead of fragile str_index_of
|
||||
// parser. Old code was silently corrupting history on malformed JSON.
|
||||
let total: Int = json_array_len(hist)
|
||||
// Safety: never trim below 2 entries. If already at or below the minimum, return unchanged.
|
||||
if total <= 2 {
|
||||
@@ -440,7 +442,8 @@ fn conv_history_load() -> String {
|
||||
fn handle_chat(body: String) -> String {
|
||||
let message: String = json_get(body, "message")
|
||||
if str_eq(message, "") {
|
||||
return "{\"error\":\"message is required\",\"response\":\"\"}"
|
||||
// Issue #5: missing required param — HTTP 400.
|
||||
return "{\"__status__\":400,\"error\":\"message is required\",\"response\":\"\"}"
|
||||
}
|
||||
|
||||
// Load history BEFORE compiling context so we can anchor activation to the thread.
|
||||
@@ -451,6 +454,15 @@ fn handle_chat(body: String) -> String {
|
||||
// in conv_history_load() (Issues #1, #2) reduce false negatives, but a full recovery path
|
||||
// requires caller-level state changes too invasive for a targeted fix.
|
||||
// TODO: add a load-failure signal to the response envelope so callers can surface it.
|
||||
//
|
||||
// TODO(reliability #3 — conv_history global race): "conv_history" is a process-global
|
||||
// state key. Concurrent /api/chat requests that omit session_id all read the same key,
|
||||
// append their exchange, and write it back. Because _state_mu serializes individual
|
||||
// state_get/state_set calls but NOT the read-append-write sequence, one thread's
|
||||
// appended exchange can be overwritten by another thread writing its own version.
|
||||
// The fix is to require callers to supply a session_id (routing them through
|
||||
// session_hist_<id>) and deprecate the global "conv_history" path. Callers using
|
||||
// the session API (which scopes history per session_hist_<id>) are not affected.
|
||||
let state_hist: String = state_get("conv_history")
|
||||
let stored_hist: String = if str_eq(state_hist, "") { conv_history_load() } else { state_hist }
|
||||
let hist_len: Int = if str_eq(stored_hist, "") { 0 } else { json_array_len(stored_hist) }
|
||||
@@ -579,7 +591,8 @@ fn handle_chat(body: String) -> String {
|
||||
|| str_starts_with(raw_response, "{\"type\":\"error\"")
|
||||
|| str_contains(raw_response, "authentication_error")
|
||||
if is_error {
|
||||
return "{\"error\":\"llm unavailable\",\"response\":\"\"}"
|
||||
// Issue #6: LLM failure — HTTP 503 (service unavailable).
|
||||
return "{\"__status__\":503,\"error\":\"llm unavailable\",\"response\":\"\"}"
|
||||
}
|
||||
|
||||
let clean_response: String = clean_llm_response(raw_response)
|
||||
@@ -733,7 +746,15 @@ fn agentic_tools_all() -> String {
|
||||
fn call_mcp_bridge(tool_name: String, tool_input: String) -> String {
|
||||
let eff_input: String = if str_eq(tool_input, "") { "{}" } else { tool_input }
|
||||
let body: String = "{\"name\":\"" + tool_name + "\",\"input\":" + eff_input + "}"
|
||||
let tmp: String = "/tmp/neuron-mcp-call.json"
|
||||
// Issue #12: previously used a fixed path /tmp/neuron-mcp-call.json.
|
||||
// Under concurrent load (64 worker threads), two simultaneous MCP tool calls
|
||||
// race on this file — one call sends the other's input to the bridge.
|
||||
// Fix: monotonic sequence counter makes the path unique per call.
|
||||
let mcp_seq_s: String = state_get("mcp_call_seq")
|
||||
let mcp_seq_n: Int = if str_eq(mcp_seq_s, "") { 0 } else { str_to_int(mcp_seq_s) }
|
||||
let mcp_seq_next: Int = mcp_seq_n + 1
|
||||
state_set("mcp_call_seq", int_to_str(mcp_seq_next))
|
||||
let tmp: String = "/tmp/neuron-mcp-call-" + int_to_str(time_now()) + "-" + int_to_str(mcp_seq_next) + ".json"
|
||||
fs_write(tmp, body)
|
||||
return exec_capture("curl -s --max-time 30 -X POST http://127.0.0.1:7771/mcp/call -H 'Content-Type: application/json' -d @" + tmp)
|
||||
}
|
||||
@@ -1014,15 +1035,25 @@ fn is_builtin_tool(tool_name: String) -> Bool {
|
||||
|| str_starts_with(tool_name, "neuron_")
|
||||
}
|
||||
|
||||
// next_bridge_id — monotonic correlation id for a suspended agentic turn.
|
||||
// Combines boot-relative time with a per-process counter so two unknown-tool
|
||||
// suspensions in the same second still get distinct ids.
|
||||
// next_bridge_id — unique correlation id for a suspended agentic turn.
|
||||
// Uses uuid_v4() as the primary uniqueness guarantee so concurrent calls
|
||||
// (even in the same millisecond) cannot collide. The "mcp_bridge_seq"
|
||||
// counter is kept for human readability in logs/debugging but is no longer
|
||||
// relied on for uniqueness.
|
||||
//
|
||||
// TODO(reliability #6): state_get/state_set on "mcp_bridge_seq" is a
|
||||
// non-atomic read-modify-write — two concurrent calls can read the same
|
||||
// counter and produce the same counter suffix. This is now benign because
|
||||
// uuid_v4() provides collision-free uniqueness. A true counter fix would
|
||||
// require an atomic_increment() builtin in el_runtime.c.
|
||||
fn next_bridge_id() -> String {
|
||||
let prev: String = state_get("mcp_bridge_seq")
|
||||
let n: Int = if str_eq(prev, "") { 0 } else { str_to_int(prev) }
|
||||
let next: Int = n + 1
|
||||
state_set("mcp_bridge_seq", int_to_str(next))
|
||||
return "br-" + int_to_str(time_now()) + "-" + int_to_str(next)
|
||||
// uuid_v4() provides collision-free uniqueness; counter is decorative.
|
||||
let uid: String = uuid_v4()
|
||||
return "br-" + uid
|
||||
}
|
||||
|
||||
fn handle_chat_agentic(body: String) -> String {
|
||||
|
||||
@@ -75,14 +75,24 @@ fn strip_query(path: String) -> String {
|
||||
}
|
||||
|
||||
fn err_404(path: String) -> String {
|
||||
return "{\"error\":\"not found\",\"code\":\"not_found\",\"path\":\"" + path + "\"}"
|
||||
// __status__ envelope — el_runtime reads the first key and emits HTTP 404.
|
||||
// Issue #3: previously returned HTTP 200 with JSON error body.
|
||||
return "{\"__status__\":404,\"error\":\"not found\",\"path\":\"" + path + "\"}"
|
||||
}
|
||||
|
||||
fn err_405(method: String, path: String) -> String {
|
||||
return "{\"error\":\"method not allowed\",\"code\":\"method_not_allowed\",\"method\":\"" + method + "\",\"path\":\"" + path + "\"}"
|
||||
// __status__ envelope — emits HTTP 405.
|
||||
// Issue #3: previously returned HTTP 200 with JSON error body.
|
||||
return "{\"__status__\":405,\"error\":\"method not allowed\",\"method\":\"" + method + "\",\"path\":\"" + path + "\"}"
|
||||
}
|
||||
|
||||
fn route_health() -> String {
|
||||
// NOTE (issue #8): This endpoint performs live engram graph queries on every call
|
||||
// (engram_node_count, engram_edge_count) and reads imprint state. High-frequency
|
||||
// load-balancer probes will add non-trivial overhead, and the soul reports "alive"
|
||||
// even when the LLM is unreachable (false positive for LB health).
|
||||
// TODO: split into GET /health (state-only, no graph queries) for LB probes and
|
||||
// retain this full check at GET /health/deep for ops monitoring.
|
||||
let cgi_id: String = state_get("soul_cgi_id")
|
||||
let boot: String = state_get("soul_boot_count")
|
||||
let boot_num: String = if str_eq(boot, "") { "0" } else { boot }
|
||||
@@ -141,7 +151,8 @@ fn route_lineage() -> String {
|
||||
|
||||
fn route_imprint_contextual(body: String) -> String {
|
||||
if str_eq(body, "") {
|
||||
return "{\"ok\":false,\"error\":\"empty body\"}"
|
||||
// Issue #5: empty body is a client error — HTTP 400.
|
||||
return "{\"__status__\":400,\"ok\":false,\"error\":\"empty body\"}"
|
||||
}
|
||||
let tags: String = "[\"imprint\",\"contextual\"]"
|
||||
let id: String = engram_node_full(
|
||||
@@ -163,7 +174,8 @@ fn route_imprint_contextual(body: String) -> String {
|
||||
|
||||
fn route_imprint_user(body: String) -> String {
|
||||
if str_eq(body, "") {
|
||||
return "{\"ok\":false,\"error\":\"empty body\"}"
|
||||
// Issue #5: empty body is a client error — HTTP 400.
|
||||
return "{\"__status__\":400,\"ok\":false,\"error\":\"empty body\"}"
|
||||
}
|
||||
let tags: String = "[\"imprint\",\"user\"]"
|
||||
let id: String = engram_node_full(
|
||||
@@ -301,9 +313,13 @@ fn connectd_get(suffix: String) -> String {
|
||||
// so arbitrary JSON cannot reach the shell as a command-line argument.
|
||||
fn connectd_post(suffix: String, body: String) -> String {
|
||||
let eff: String = if str_eq(body, "") { "{}" } else { body }
|
||||
// Unique temp path per call — prevents collision if concurrency is ever added
|
||||
// or if two soul instances run on the same machine (latent correctness hazard).
|
||||
let tmp: String = "/tmp/neuron-connectors-req-" + int_to_str(time_now()) + ".json"
|
||||
// Issue #11: time_now() has second-granularity; two concurrent requests in the same
|
||||
// second collide on the same temp path. Added a monotonic per-process sequence counter.
|
||||
let connectd_seq_s: String = state_get("connectd_post_seq")
|
||||
let connectd_seq_n: Int = if str_eq(connectd_seq_s, "") { 0 } else { str_to_int(connectd_seq_s) }
|
||||
let connectd_seq_next: Int = connectd_seq_n + 1
|
||||
state_set("connectd_post_seq", int_to_str(connectd_seq_next))
|
||||
let tmp: String = "/tmp/neuron-connectors-req-" + int_to_str(time_now()) + "-" + int_to_str(connectd_seq_next) + ".json"
|
||||
fs_write(tmp, eff)
|
||||
let out: String = exec_capture("curl -s --max-time 20 -X POST http://127.0.0.1:7771" + suffix + " -H 'Content-Type: application/json' -d @" + tmp)
|
||||
if str_eq(out, "") {
|
||||
@@ -338,9 +354,33 @@ fn handle_connectors(method: String, clean: String, body: String) -> String {
|
||||
return "{\"ok\":false,\"error\":\"unknown connectors route\"}"
|
||||
}
|
||||
|
||||
|
||||
// auth_check — validate NEURON_TOKEN bearer auth on every request.
|
||||
// Returns "" when authorized, or a JSON 401 error string when not.
|
||||
// /health and /lineage are public routes — always exempted.
|
||||
// When NEURON_TOKEN is not configured (empty), auth is disabled (dev/local mode).
|
||||
// Issue #4: previously no auth layer existed anywhere in the router.
|
||||
// Clients pass the token in the JSON body as "__auth".
|
||||
// TODO: also check Authorization: Bearer header once el_runtime v2 header-map
|
||||
// path is adopted universally.
|
||||
fn auth_check(clean: String, body: String) -> String {
|
||||
if str_eq(clean, "/health") { return "" }
|
||||
if str_eq(clean, "/lineage") { return "" }
|
||||
let token: String = state_get("soul_token")
|
||||
if str_eq(token, "") { return "" }
|
||||
let auth_field: String = json_get(body, "__auth")
|
||||
if str_eq(auth_field, token) { return "" }
|
||||
return "{\"__status__\":401,\"error\":\"unauthorized\"}"
|
||||
}
|
||||
|
||||
fn handle_request(method: String, path: String, body: String) -> String {
|
||||
let clean: String = strip_query(path)
|
||||
|
||||
// Issue #1/#2: EL has no exception/try-catch mechanism. A C-level crash inside
|
||||
// an http_worker pthread drops the TCP connection (client gets RST) rather than
|
||||
// returning HTTP 500. TODO: register a SIGSEGV/SIGBUS handler in el_runtime.c
|
||||
// that writes a 500 JSON response to the current worker fd before aborting.
|
||||
|
||||
// Rate limit check. Extract caller IP from REMOTE_ADDR env var (set by the
|
||||
// EL HTTP runtime for each request). Skip enforcement when empty so
|
||||
// loopback/internal callers are never blocked.
|
||||
@@ -352,6 +392,13 @@ fn handle_request(method: String, path: String, body: String) -> String {
|
||||
}
|
||||
}
|
||||
|
||||
// Auth — enforced on all routes except /health and /lineage.
|
||||
// Issue #4: previously no auth check existed anywhere in the router.
|
||||
let auth_err: String = auth_check(clean, body)
|
||||
if !str_eq(auth_err, "") {
|
||||
return auth_err
|
||||
}
|
||||
|
||||
if str_eq(method, "POST") && str_eq(clean, "/dharma/recv") {
|
||||
return handle_dharma_recv(body)
|
||||
}
|
||||
@@ -379,7 +426,8 @@ fn handle_request(method: String, path: String, body: String) -> String {
|
||||
let raw_msg: String = json_get(body, "message")
|
||||
let eff_msg: String = if str_eq(raw_msg, "") { body } else { raw_msg }
|
||||
if str_eq(eff_msg, "") {
|
||||
return "{\"error\":\"message is required\",\"code\":\"missing_param\"}"
|
||||
// Issue #5: missing required param — HTTP 400.
|
||||
return "{\"__status__\":400,\"error\":\"message required\"}"
|
||||
}
|
||||
let agentic_flag: Bool = json_get_bool(body, "agentic")
|
||||
let reply: String = if agentic_flag {
|
||||
@@ -523,9 +571,15 @@ fn handle_request(method: String, path: String, body: String) -> String {
|
||||
// responses are buffered and returned as a single JSON object. Streaming
|
||||
// would require runtime-level SSE support in el_runtime.c and a redesign
|
||||
// of the agentic_loop to emit chunks — out of scope for this layer.
|
||||
// Issue #5: validate required params — return HTTP 400 when missing.
|
||||
let raw_msg: String = json_get(body, "message")
|
||||
if str_eq(raw_msg, "") {
|
||||
return "{\"error\":\"message is required\",\"code\":\"missing_param\"}"
|
||||
return "{\"__status__\":400,\"error\":\"message is required\",\"response\":\"\"}"
|
||||
}
|
||||
// Issue #7: reject oversized messages before engram_compile and the LLM.
|
||||
// Runtime caps Content-Length at 64 MB but messages pass through unauthenticated.
|
||||
if str_len(raw_msg) > 32768 {
|
||||
return "{\"__status__\":400,\"error\":\"message too large (max 32768 chars)\",\"response\":\"\"}"
|
||||
}
|
||||
let agentic_flag: Bool = json_get_bool(body, "agentic")
|
||||
let reply: String = if agentic_flag {
|
||||
|
||||
Reference in New Issue
Block a user