From f0545defdb22f337ab85e7f820d4606d0cb13d9f Mon Sep 17 00:00:00 2001 From: Will Anderson Date: Mon, 22 Jun 2026 11:58:33 -0500 Subject: [PATCH] =?UTF-8?q?fix(reliability):=20session-boundary=20?= =?UTF-8?q?=E2=80=94=20ghost=20sessions,=20bridge=20leak,=20session=20vali?= =?UTF-8?q?dation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - sessions.el: add session_exists() for chat-path session validation (ISSUE #6/#7) - sessions.el: add session_create_cleanup() for ghost-session rollback (ISSUE #1) - sessions.el: set session_pending_first_msg flag in session_create; clear it in session_hist_save so the first successful chat marks the session active (ISSUE #1) - sessions.el: session_delete now clears mcp_bridge: and always_allow_ state keys so abandoned pending-tool sessions do not accumulate (ISSUE #5) - sessions.el: add TODO comments for ISSUE #2 (no TTL/expiry), ISSUE #3 (non-atomic delete-then-create), ISSUE #4 (no concurrent-create guard), and ISSUE #8 (reconnect/duplicate resume race) where fixes are too invasive to land without new runtime primitives - chat.el: validate session_id exists via session_exists() before entering agentic_loop; unknown session_ids now return a 404-style error instead of silently starting a fresh empty session (ISSUE #6/#7) --- chat.el | 15 ++++++++ sessions.el | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 112 insertions(+), 1 deletion(-) diff --git a/chat.el b/chat.el index 6813b64..0b0428d 100644 --- a/chat.el +++ b/chat.el @@ -648,6 +648,21 @@ fn handle_chat_agentic(body: String) -> String { // Thread-aware activation: same logic as handle_chat. // Use the session's or global history to anchor short messages to the thread. let req_session: String = json_get(body, "session_id") + + // ISSUE #6/#7: validate that the session_id actually exists before proceeding. + // Without this check the loop silently treats any unknown/fabricated session_id + // as a fresh session — history loads as empty and no error is returned to the caller. + // Only validate when a session_id is explicitly provided; anonymous calls + // (no session_id) continue to work for backward compatibility. + let session_valid: Bool = if str_eq(req_session, "") { + true + } else { + session_exists(req_session) + } + if !session_valid { + return "{\"error\":\"session not found\",\"session_id\":\"" + req_session + "\",\"reply\":\"\"}" + } + let hist_key: String = if str_eq(req_session, "") { "conv_history" } else { "session_hist_" + req_session } let agentic_hist: String = state_get(hist_key) let agentic_hist_len: Int = if str_eq(agentic_hist, "") { 0 } else { json_array_len(agentic_hist) } diff --git a/sessions.el b/sessions.el index fac9a79..e7282b6 100644 --- a/sessions.el +++ b/sessions.el @@ -36,7 +36,49 @@ fn session_make_content(id: String, title: String, created_at: Int, updated_at: + ",\"updated_at\":" + int_to_str(updated_at) + "}" } +// session_exists — return true if the given session_id is known in Engram or state. +// Used by chat.el to validate a session_id before processing a chat message. +// Addresses ISSUE #6/#7: chat path must validate session existence instead of +// silently treating unknown session_ids as fresh sessions. +fn session_exists(session_id: String) -> Bool { + if str_eq(session_id, "") { return false } + // Fast path: check the state-based index first (avoids Engram round-trip). + let idx: String = state_get("session_index") + if !str_eq(idx, "") && !str_eq(idx, "[]") { + if str_contains(idx, "\"id\":\"" + session_id + "\"") { + return true + } + } + // Slow path: check Engram directly (survives restarts when index is cold). + let results: String = engram_search_json("session:meta " + session_id, 5) + if str_eq(results, "") { return false } + if str_eq(results, "[]") { return false } + let total: Int = json_array_len(results) + let found: Bool = false + let i: Int = 0 + while i < total { + let node: String = json_array_get(results, i) + let label: String = json_get(node, "label") + let content: String = json_get(node, "content") + let sid: String = json_get(content, "id") + let is_match: Bool = str_eq(label, "session:meta") && str_eq(sid, session_id) + let found = if is_match { true } else { found } + let i = i + 1 + } + return found +} + // session_create — create a new session, return {id, title, created_at}. +// +// ISSUE #1: Ghost sessions on failed first message. +// We write the Engram node and update the state index here, then the caller +// POSTs a chat message. If that chat call fails (LLM unavailable, network +// error, etc.) the session is stranded with no messages. A full transactional +// rollback requires runtime support (2PC or a deferred-write queue) that does +// not exist in EL. Mitigation: +// (a) Set "session_pending_first_msg_" in state so callers can detect it. +// (b) Provide session_create_cleanup() for callers that detect a failure. +// TODO: evaluate deferred-write pattern once EL gains atomic state operations. fn session_create(body: String) -> String { let ts: Int = time_now() let id: String = uuid_v4() @@ -55,8 +97,13 @@ fn session_create(body: String) -> String { } // Store the engram node_id mapping so we can look up the node for this session state_set("session_node_" + id, node_id) + // Mark as pending first message so stale ghost sessions can be identified + // (e.g. if the caller\'s subsequent chat POST fails). + state_set("session_pending_first_msg_" + id, "1") // Maintain a state-based index for fast listing within this daemon run. // Newest sessions first (prepend). + // TODO #4: index update is read-modify-write — two concurrent session_create + // calls can lose one entry. EL has no CAS primitive; fix requires runtime support. let existing_idx: String = state_get("session_index") let idx_entry: String = "{\"id\":\"" + id + "\",\"title\":\"" + json_safe(title) + "\",\"folder\":\"" + json_safe(folder) + "\",\"created_at\":" + int_to_str(ts) + ",\"updated_at\":" + int_to_str(ts) + ",\"last_message\":\"\"}" let new_idx: String = if str_eq(existing_idx, "") { @@ -73,6 +120,20 @@ fn session_create(body: String) -> String { + ",\"created_at\":" + int_to_str(ts) + "}" } +// session_create_cleanup — undo a session_create when the caller\'s first chat +// fails. Removes the Engram node, state-index entry, and pending-flag so the +// session does not appear as a ghost in session_list(). +// Addresses ISSUE #1: cleanup path for ghost sessions. +fn session_create_cleanup(session_id: String) -> String { + if str_eq(session_id, "") { + return "{\"error\":\"session_id is required\"}" + } + // Clear pending flag first so partial cleanup is still detectable. + state_set("session_pending_first_msg_" + session_id, "") + // Delegate to session_delete which handles Engram + state index teardown. + return session_delete(session_id) +} + // session_list — list all sessions. Returns [{id, title, last_message, created_at, updated_at}]. fn session_list() -> String { // Fast path: state-based index (rebuilt from session_create calls in this daemon run). @@ -222,13 +283,27 @@ fn session_delete(session_id: String) -> String { state_set("session_hist_" + session_id, "") state_set("session_node_" + session_id, "") state_set("session_index", "") + // ISSUE #5: clean up bridge blobs and always_allow keys that were never + // cleared by agentic_resume (e.g. client abandoned a pending tool call). + // Without this, stranded bridge blobs accumulate indefinitely in state. + state_set("mcp_bridge:" + session_id, "") + state_set("always_allow_" + session_id, "") + // Clear pending-first-message flag if present. + state_set("session_pending_first_msg_" + session_id, "") return "{\"ok\":true,\"session_id\":\"" + session_id + "\"" + ",\"deleted_meta\":" + int_to_str(deleted_meta) + ",\"deleted_msgs\":" + int_to_str(deleted_msgs) + "}" } -// session_update_patch — update a session's title and/or folder via PATCH body. +// session_update_patch — update a session\'s title and/or folder via PATCH body. // Body may contain "title", "folder", or both. Preserves unmentioned fields. +// +// ISSUE #3: Non-atomic delete-then-create below (engram_forget + engram_node_full). +// A crash between the two leaves the session with zero meta nodes; session_get +// returns empty metadata even though session_index still references the id. +// TODO: Replace with an in-place update primitive once Engram supports node mutation. +// Current mitigation: session_get falls back gracefully to empty metadata strings; +// the session_id is still valid and history is preserved in state. fn session_update_patch(session_id: String, body: String) -> String { if str_eq(session_id, "") { return "{\"error\":\"session_id is required\"}" @@ -349,6 +424,9 @@ fn session_hist_load(session_id: String) -> String { // session_hist_save — persist message history for a session to state and engram. fn session_hist_save(session_id: String, hist: String) -> Void { state_set("session_hist_" + session_id, hist) + // Clear pending-first-message flag: once history is saved, the session + // is no longer in the ghost/pending state (ISSUE #1 mitigation). + state_set("session_pending_first_msg_" + session_id, "") // Delete old history node and write fresh one let old_results: String = engram_search_json("session:messages:" + session_id, 3) let o_total: Int = if str_eq(old_results, "") { 0 } else { json_array_len(old_results) } @@ -371,6 +449,16 @@ fn session_hist_save(session_id: String, hist: String) -> Void { } // session_update_meta_timestamp — update the updated_at field in the session:meta node. +// +// ISSUE #2: No TTL / idle expiry mechanism. Sessions accumulate indefinitely. +// A sweep job (e.g. expire sessions idle for >N days) needs a background timer +// that EL does not currently expose. Bridge blobs under "mcp_bridge:" are also +// never swept unless session_delete is called explicitly. +// TODO: add idle-expiry sweep once EL exposes a background tick or the host +// runtime gains a scheduled-task primitive. +// +// ISSUE #3 applies here too: delete-then-create is non-atomic. See session_update_patch +// for the full note on the failure mode and mitigation. fn session_update_meta_timestamp(session_id: String) -> Void { let results: String = engram_search_json("session:meta " + session_id, 10) let total: Int = if str_eq(results, "") { 0 } else { json_array_len(results) } @@ -464,6 +552,14 @@ fn session_auto_title(session_id: String, first_message: String) -> Void { // action: "allow" | "deny" | "always" // Resumes the agentic loop from where it was paused. // +// ISSUE #8: Reconnect/duplicate resume race. The one-shot clear-on-read pattern +// in agentic_resume correctly prevents replay, but a client that retries after a +// timeout gets a hard "unknown session_id" error with no recovery path. The +// conversation is permanently stuck in that case. Full idempotency (e.g. caching +// the last reply keyed by call_id) requires a new state structure. +// TODO: persist the last successful resume reply under "bridge_reply:" +// keyed by call_id so a retry within a short window returns the same envelope. +// // Modern path (agentic_loop / bridge): the loop saves its suspension to // "mcp_bridge:" via bridge_save(). On approval we dispatch_tool() // if allowed (or build a denial string), then hand the result to agentic_resume()