fix(reliability): LLM retry

2026-06-22 12:37:29 -05:00
parent e447a87a00 47d0e6f985
commit 6edf9937dd
1 changed files with 55 additions and 3 deletions
@@ -587,9 +587,13 @@ fn handle_chat(body: String) -> String {

    let raw_response: String = llm_call_system(model, full_system, message)

+    // Issue #5: also catch empty string — llm_extract_text() in el_runtime.c silently
+    // returns "" when the response content array is missing or all blocks fail to parse.
+    // Without this guard an empty reply passes through as a silent empty response.
    let is_error: Bool = str_starts_with(raw_response, "{\"error\"")
        || str_starts_with(raw_response, "{\"type\":\"error\"")
        || str_contains(raw_response, "authentication_error")
+        || str_eq(raw_response, "")
    if is_error {
        // Issue #6: LLM failure — HTTP 503 (service unavailable).
        return "{\"__status__\":503,\"error\":\"llm unavailable\",\"response\":\"\"}"
@@ -662,6 +666,42 @@ fn studio_tools_json() -> String {
    "]"
 }

+// ---------------------------------------------------------------------------
+// LLM reliability — issues that cannot be fixed in EL code. Most require C
+// runtime changes (el_runtime.c); Issue #6 needs deployment configuration only.
+// They are documented here so the symptoms are traceable back to their root
+// causes.
+//
+// Issue #1 (no retry on timeout/connection error):
+//   http_do() in el_runtime.c calls curl_easy_perform() once. On
+//   CURLE_OPERATION_TIMEDOUT / CURLE_COULDNT_CONNECT / CURLE_RECV_ERROR it
+//   returns http_error_json() with no retry. Fix: add a retry loop (max 3
+//   attempts, exponential back-off starting at 1s) inside llm_provider_request().
+//
+// Issue #2 (60s timeout applies to all HTTP calls including LLM):
+//   EL_HTTP_TIMEOUT_MS defaults to 60000ms for every http_do() call.
+//   Fix: introduce EL_LLM_TIMEOUT_MS (default 120000) used only by
+//   llm_provider_request(), and lower the EL_HTTP_TIMEOUT_MS default to 30000
+//   so general service calls no longer hold connections for 60s.
+//
+// Issue #3 (HTTP 429 causes silent provider failover, not backoff):
+//   llm_chain_call() advances to the next provider on any JSON-prefixed response
+//   including 429. Fix: parse HTTP status via curl_easy_getinfo; on 429 sleep
+//   Retry-After seconds (default 5s) then retry the same provider up to 3 times.
+//
+// Issue #4 (HTTP 500/502 crashes the request silently):
+//   Same path as #3 — 5xx responses cause immediate provider failover with no
+//   retry. Fix: retry with exponential back-off (1s, 2s, 4s) before advancing.
+//
+// Issue #6 (no secondary LLM fallback in production):
+//   Set NEURON_LLM_1_URL/KEY/FORMAT in ExternalSecret to a secondary provider
+//   (e.g. Gemini). No C code change required; llm_chain_call() already iterates.
+//
+// Issue #8 (LLM response size unbounded — limited only by available memory):
+//   HttpBuf grows via realloc() with no hard limit. Fix: add
+//   EL_HTTP_MAX_RESPONSE_BYTES (default 10MiB) cap in httpbuf_append() and
+//   return http_error_json("response too large") on overflow.
+// ---------------------------------------------------------------------------
+
 fn agentic_api_key() -> String {
    let k1: String = env("ANTHROPIC_API_KEY")
    if !str_eq(k1, "") {
@@ -713,7 +753,7 @@ fn agentic_tools_with_web() -> String {
 // Short timeout + empty-array fallback: if the bridge is down, the soul runs
 // exactly as before with only its built-in tools (graceful degradation).
 fn connector_tools_json() -> String {
-    let raw: String = exec_capture("curl -s --max-time 2 http://127.0.0.1:7771/mcp/tools")
+    let raw: String = exec_capture("curl -s --max-time 5 http://127.0.0.1:7771/mcp/tools")
    if str_eq(raw, "") {
        return "[]"
    }
@@ -766,7 +806,7 @@ fn tool_auto_approved(tool_name: String) -> Bool {
    if !str_starts_with(tool_name, "mcp__") {
        return false
    }
-    let raw: String = exec_capture("curl -s --max-time 2 http://127.0.0.1:7771/mcp/auto-approved")
+    let raw: String = exec_capture("curl -s --max-time 5 http://127.0.0.1:7771/mcp/auto-approved")
    if str_eq(raw, "") {
        return false
    }
@@ -1189,6 +1229,14 @@ fn agentic_loop(session_id: String, model: String, safe_sys: String, tools_json:
    let iteration: Int = 0
    let keep_going: Bool = true

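+    // Issue #9: agentic max_tokens configurable via NEURON_LLM_MAX_TOKENS env var.
+    // Default 4096 is marginal for long tool chains (8 iterations x 4096 tokens).
+    // Set to 8192+ for complex multi-step tasks.
+    // The value is concatenated verbatim into the request JSON below, so it must
+    // be a bare integer literal (no quotes, whitespace, or units); anything else
+    // produces a malformed request body.
+    // Note: llm_provider_request() in el_runtime.c also hardcodes 4096 for the
+    // llm_call_system() (non-agentic) path; that requires a C runtime change.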
+    let max_tokens_env: String = env("NEURON_LLM_MAX_TOKENS")
+    let max_tokens_str: String = if str_eq(max_tokens_env, "") { "4096" } else { max_tokens_env }
+
    // Suspension state — captured at top level so it escapes the while body.
    let pending: Bool = false
    let pend_tool_id: String = ""
@@ -1197,7 +1245,7 @@ fn agentic_loop(session_id: String, model: String, safe_sys: String, tools_json:

    while keep_going && iteration < 8 {
        let req_body: String = "{\"model\":\"" + model + "\""
-            + ",\"max_tokens\":4096"
+            + ",\"max_tokens\":" + max_tokens_str
            + ",\"system\":\"" + safe_sys + "\""
            + ",\"tools\":" + tools_json
            + ",\"messages\":" + messages
@@ -1477,9 +1525,11 @@ fn handle_chat_as_soul(body: String) -> String {

    let raw_response: String = llm_call_system(model, system_prompt, eff_message)

+    // Issue #5: empty string catch — same rationale as handle_chat.
    let is_error: Bool = str_starts_with(raw_response, "{\"error\"")
        || str_starts_with(raw_response, "{\"type\":\"error\"")
        || str_contains(raw_response, "authentication_error")
+        || str_eq(raw_response, "")
    if is_error {
        return "{\"error\":\"llm unavailable\",\"response\":\"\",\"speaker_slug\":\"" + speaker + "\",\"model\":\"" + model + "\"}"
    }
@@ -1526,9 +1576,11 @@ fn handle_dharma_room_turn(body: String) -> String {

    let raw_response: String = llm_call_system(model, system_prompt, transcript)

+    // Issue #5: empty string catch — same rationale as handle_chat.
    let is_error: Bool = str_starts_with(raw_response, "{\"error\"")
        || str_starts_with(raw_response, "{\"type\":\"error\"")
        || str_contains(raw_response, "authentication_error")
+        || str_eq(raw_response, "")
    if is_error {
        return "{\"error\":\"llm unavailable\",\"response\":\"\",\"cgi_id\":\"" + cgi_id + "\"}"
    }