Files
neuron/safety.el
T
Tim Lingo 6d8a992716 feat(soul): add safety module, expand connectors API, memory-recall bug notes
- safety.el/.elh: new safety module
- neuron-api.el, routes.el, soul.el, chat.el: connectors API expansion
- regenerated dist/ C artifacts
- MEMORY_RECALL_BUG.md: investigation notes

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-17 13:18:35 -05:00

372 lines
21 KiB
EmacsLisp
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import "memory.el"
// Layer 1 Safety
//
// Structural role: screens every user input BEFORE it reaches L2/L3, and
// validates every generated output BEFORE it reaches the user.
//
// Bell tiers:
// soft_bell (score >= 35) wellbeing concern; surfaced through imprint voice
// hard_bell (score >= 70) immediate danger; daemon escalation, no L3 pass-through
//
// This layer is compiled into the sealed substrate alongside L0 and L2.
// It is structurally inaccessible from within an imprint (L3).
//
// one_cycle integration:
// raw = perceive()
// safe = safety_screen(raw, history) // L1 in this file
// guided = steward_align(safe, imprint_id) // L2
// output = imprint_respond(guided, ...) // L3
// final = safety_validate(output, action) // L1 out this file
// respond(final)
// Lowest composite score (inclusive) at which safety_screen raises a soft bell.
fn soft_bell_threshold() -> Int {
    return 35
}
// Lowest composite score (inclusive) at which safety_screen raises a hard bell.
fn hard_bell_threshold() -> Int {
    return 70
}
// Scoring sub-functions
//
// Adapted from awareness.el threat-scoring pattern (threat_score_command /
// threat_score_path / threat_score_history) but reoriented from adversarial-
// security to human-safety signals: crisis language, self-harm vocabulary,
// and immediate-danger phrases.
//
// Each sub-function returns an additive score 0-N. Callers are responsible
// for capping at 100.
// Scores crisis / suicidal-ideation phrases present in `input`.
//
// Every listed phrase that str_contains finds adds its own weight. The weights
// are summed without a cap, so the result may exceed 100; the caller caps it.
// Phrases are written in lower case, so pass lower-cased text
// (safety_threat_score does).
fn safety_score_crisis(input: String) -> Int {
    let kill_myself: Int = if str_contains(input, "kill myself") { 80 } else { 0 }
    let want_to_die: Int = if str_contains(input, "want to die") { 75 } else { 0 }
    let end_my_life: Int = if str_contains(input, "end my life") { 80 } else { 0 }
    let suicide: Int = if str_contains(input, "suicide") { 70 } else { 0 }
    let suicidal: Int = if str_contains(input, "suicidal") { 75 } else { 0 }
    let not_be_here: Int = if str_contains(input, "don't want to be here") { 60 } else { 0 }
    let no_reason: Int = if str_contains(input, "no reason to live") { 70 } else { 0 }
    let better_dead: Int = if str_contains(input, "better off dead") { 75 } else { 0 }
    let cant_go_on: Int = if str_contains(input, "can't go on") { 50 } else { 0 }
    let not_worth: Int = if str_contains(input, "not worth living") { 65 } else { 0 }
    // Same total as one long sum; split in two only for readability.
    let first_half: Int = kill_myself + want_to_die + end_my_life + suicide + suicidal
    let second_half: Int = not_be_here + no_reason + better_dead + cant_go_on + not_worth
    return first_half + second_half
}
// Scores self-harm vocabulary present in `input`.
//
// Every listed phrase that str_contains finds adds its own weight. The sum is
// not capped here; the caller caps the composite score. Phrases are written in
// lower case, so pass lower-cased text.
fn safety_score_harm(input: String) -> Int {
    let hurt_myself: Int = if str_contains(input, "hurt myself") { 60 } else { 0 }
    let cut_myself: Int = if str_contains(input, "cut myself") { 65 } else { 0 }
    let self_harm_spaced: Int = if str_contains(input, "self harm") { 60 } else { 0 }
    let self_harm_hyphen: Int = if str_contains(input, "self-harm") { 60 } else { 0 }
    let overdose: Int = if str_contains(input, "overdose") { 65 } else { 0 }
    let all_my_pills: Int = if str_contains(input, "take all my pills") { 75 } else { 0 }
    let starving: Int = if str_contains(input, "starving myself") { 50 } else { 0 }
    let burning: Int = if str_contains(input, "burning myself") { 60 } else { 0 }
    let punish: Int = if str_contains(input, "punish myself") { 40 } else { 0 }
    let deserve: Int = if str_contains(input, "deserve to suffer") { 45 } else { 0 }
    // Same total as one long sum; split in two only for readability.
    let first_half: Int = hurt_myself + cut_myself + self_harm_spaced + self_harm_hyphen + overdose
    let second_half: Int = all_my_pills + starving + burning + punish + deserve
    return first_half + second_half
}
// Scores immediate-danger phrases present in `input`.
//
// Two signals need a pair of phrases to co-occur ("help me" + "emergency",
// "trapped" + "can't escape"); the rest are single phrases. Weights are summed
// without a cap; the caller caps the composite score. Phrases are written in
// lower case, so pass lower-cased text.
// Note: "she is going to hurt" contains "he is going to hurt", so that input
// adds both weights.
fn safety_score_danger(input: String) -> Int {
    let help_emergency: Int = if str_contains(input, "help me") && str_contains(input, "emergency") { 55 } else { 0 }
    let call_911: Int = if str_contains(input, "call 911") { 50 } else { 0 }
    let ambulance: Int = if str_contains(input, "call an ambulance") { 55 } else { 0 }
    let in_danger: Int = if str_contains(input, "in danger") { 50 } else { 0 }
    let threatening: Int = if str_contains(input, "someone is threatening") { 60 } else { 0 }
    let abused: Int = if str_contains(input, "being abused") { 55 } else { 0 }
    let domestic: Int = if str_contains(input, "domestic violence") { 55 } else { 0 }
    let trapped: Int = if str_contains(input, "trapped") && str_contains(input, "can't escape") { 60 } else { 0 }
    let he_will_hurt: Int = if str_contains(input, "he is going to hurt") { 65 } else { 0 }
    let she_will_hurt: Int = if str_contains(input, "she is going to hurt") { 65 } else { 0 }
    // Same total as one long sum; split in two only for readability.
    let first_half: Int = help_emergency + call_911 + ambulance + in_danger + threatening
    let second_half: Int = abused + domestic + trapped + he_will_hurt + she_will_hurt
    return first_half + second_half
}
// Scores lower-intensity distress phrases found in the conversation `history`.
//
// Each phrase that str_contains finds adds 10-20 points; the uncapped sum is
// returned. safety_threat_score divides this by 3 before adding it to the
// input score. Phrases are written in lower case, so pass lower-cased text.
fn safety_score_distress_history(history: String) -> Int {
    let hopeless: Int = if str_contains(history, "hopeless") { 15 } else { 0 }
    let worthless: Int = if str_contains(history, "worthless") { 15 } else { 0 }
    let nobody_cares: Int = if str_contains(history, "nobody cares") { 15 } else { 0 }
    let no_one_cares: Int = if str_contains(history, "no one cares") { 15 } else { 0 }
    let completely_alone: Int = if str_contains(history, "completely alone") { 15 } else { 0 }
    let all_alone: Int = if str_contains(history, "all alone") { 10 } else { 0 }
    let cant_take_it: Int = if str_contains(history, "can't take it anymore") { 20 } else { 0 }
    let disappear: Int = if str_contains(history, "want to disappear") { 20 } else { 0 }
    let dont_care: Int = if str_contains(history, "don't care anymore") { 15 } else { 0 }
    let giving_up: Int = if str_contains(history, "giving up") { 15 } else { 0 }
    // Same total as one long sum; split in two only for readability.
    let first_half: Int = hopeless + worthless + nobody_cares + no_one_cares + completely_alone
    let second_half: Int = all_alone + cant_take_it + disappear + dont_care + giving_up
    return first_half + second_half
}
// safety_threat_score
//
// Composite safety score in the range 0-100.
//
// `input` and `history` are lower-cased, then:
//   1. `input` is scored on three dimensions (crisis, self-harm, danger) and
//      only the largest of the three is kept;
//   2. the `history` distress score is added at one-third weight (integer
//      division), so history amplifies the result but does not dominate it
//      (the same idea as threat_trajectory_check);
//   3. the total is capped at 100.
fn safety_threat_score(input: String, history: String) -> Int {
    let lowered_input: String = str_to_lower(input)
    let lowered_history: String = str_to_lower(history)

    let crisis_pts: Int = safety_score_crisis(lowered_input)
    let harm_pts: Int = safety_score_harm(lowered_input)
    let danger_pts: Int = safety_score_danger(lowered_input)
    let history_pts: Int = safety_score_distress_history(lowered_history)

    // Largest of the three input dimensions, found by pairwise comparison.
    let crisis_or_harm: Int = if crisis_pts > harm_pts { crisis_pts } else { harm_pts }
    let dominant: Int = if crisis_or_harm > danger_pts { crisis_or_harm } else { danger_pts }

    let combined: Int = dominant + history_pts / 3
    if combined > 100 { return 100 }
    return combined
}
// safety_screen - L1 input gate
//
// Every user input passes through this before reaching L2/L3.
// Returns a JSON envelope consumed by steward_align (L2).
//
// Parameters:
//   input   - raw user message
//   history - prior conversation text, used only for scoring
//
// Return shapes:
// {"action":"hard_bell","reason":"immediate safety concern","content":""}
// {"action":"soft_bell","reason":"wellbeing check needed","content":"<input>"}
// {"action":"pass","content":"<input>"}
//
// On either bell the first 80 units of `input` (as cut by str_slice) are
// logged through safety_log_bell. A hard bell returns empty content, so the
// original text is not forwarded in the envelope.
fn safety_screen(input: String, history: String) -> String {
    let score: Int = safety_threat_score(input, history)
    let hard: Int = hard_bell_threshold()
    let soft: Int = soft_bell_threshold()
    if score >= hard {
        let summary: String = str_slice(input, 0, 80)
        let discard: String = safety_log_bell("hard", "immediate safety concern", summary)
        return "{\"action\":\"hard_bell\",\"reason\":\"immediate safety concern\",\"content\":\"\"}"
    }
    // Both remaining envelopes embed the input, so escape it once here.
    let safe_input: String = safety_json_escape(input)
    if score >= soft {
        let summary: String = str_slice(input, 0, 80)
        let discard: String = safety_log_bell("soft", "wellbeing check needed", summary)
        return "{\"action\":\"soft_bell\",\"reason\":\"wellbeing check needed\",\"content\":\"" + safe_input + "\"}"
    }
    return "{\"action\":\"pass\",\"content\":\"" + safe_input + "\"}"
}
// safety_json_escape - private helper for safety_screen
//
// Escapes `text` so it can sit inside a double-quoted JSON string literal.
// Backslashes are handled first so the backslashes introduced by the later
// replacements are not doubled again. TAB is escaped as well as CR/LF: a raw
// control character inside a JSON string makes the whole envelope invalid.
fn safety_json_escape(text: String) -> String {
    let e1: String = str_replace(text, "\\", "\\\\")
    let e2: String = str_replace(e1, "\"", "\\\"")
    let e3: String = str_replace(e2, "\n", "\\n")
    let e4: String = str_replace(e3, "\r", "\\r")
    return str_replace(e4, "\t", "\\t")
}
// safety_validate - L1 output gate
//
// Every generated output passes through this before reaching the user.
// `action` carries the bell level decided by safety_screen, so the same
// treatment is enforced on the way out.
//
// hard_bell: the output is discarded and a fixed crisis message is returned;
//            imprint-generated text is never exposed once the session has
//            been flagged as immediate danger.
// soft_bell: the output is kept; when it is shorter than 20 (per str_len) a
//            care-check sentence is appended.
// any other action: the output is returned verbatim.
fn safety_validate(output: String, action: String) -> String {
    if str_eq(action, "hard_bell") {
        return "I'm here with you, and what you're sharing sounds serious. Please reach out to a crisis line now — in the US you can call or text 988 (Suicide and Crisis Lifeline), available 24/7. You don't have to go through this alone."
    }
    let is_soft: Bool = str_eq(action, "soft_bell")
    let length: Int = str_len(output)
    let is_brief: Bool = length < 20
    // Only a soft bell with an empty or very short reply gets the extra sentence.
    if is_soft && is_brief {
        return output + " I'm here if you want to talk more about how you're feeling."
    }
    return output
}
// safety_log_bell
//
// Records a "BellEvent" node through engram_node_full for audit and
// continuity. Never surfaced to the user; consumed by the daemon
// observability layer.
//
// Parameters:
//   level         - bell level; becomes part of the node label and of a tag
//   reason        - short explanation, embedded in the node content
//   input_summary - excerpt of the triggering input, embedded in the content
//
// Always returns "". The value returned by engram_node_full is ignored.
fn safety_log_bell(level: String, reason: String, input_summary: String) -> String {
    let label: String = "bell:" + level
    let body: String = "BELL:" + level + " | " + reason + " | summary:" + input_summary
    // Tags are passed as a JSON array in a string: ["safety","bell","bell:<level>"].
    let tag_list: String = "[\"safety\",\"bell\",\"" + label + "\"]"
    let ignored: String = engram_node_full(
        body,
        "BellEvent",
        label,
        el_from_float(0.95),
        el_from_float(0.95),
        el_from_float(1.0),
        "Episodic",
        tag_list
    )
    return ""
}
// abuse: danger from another person. Emergency services / 988 ONLY. The
// safety contact on file is NEVER notified, because they may be the abuser.
// This routing is non-configurable by design.
//
// Evaluation is keyword-only (zero added latency) and stays on device. Triggers are
// logged locally as InternalStateEvents and never transmitted.
// Phrase lists (ported verbatim from bell-detector.ts)
// Hard-bell phrase list for self-harm / suicidal language.
// Returned as a JSON array in a string, the form safety_any_match expects.
// Entries are lower case; match against text from safety_normalize.
fn safety_self_harm_phrases() -> String {
    let phrases: String = "[\"kill myself\",\"killing myself\",\"want to die\",\"want to be dead\",\"going to end my life\",\"end my life\",\"take my life\",\"taking my life\",\"suicide\",\"suicidal\",\"can't go on\",\"cannot go on\",\"i have a knife\",\"i have a gun\",\"i have pills\",\"took pills\",\"took too many\",\"overdose\",\"overdosing\",\"self harm\",\"self-harm\",\"cutting myself\",\"hurt myself\",\"hurting myself\",\"no reason to live\",\"not worth living\",\"better off dead\",\"better off without me\"]"
    return phrases
}
// Hard-bell phrase list for danger coming from another person.
// Returned as a JSON array in a string, the form safety_any_match expects.
// Entries are lower case; match against text from safety_normalize.
fn safety_abuse_phrases() -> String {
    let phrases: String = "[\"someone is hurting me\",\"someone's hurting me\",\"someone hurt me\",\"he hit me\",\"she hit me\",\"they hit me\",\"he hurt me\",\"she hurt me\",\"being abused\",\"being hurt by\",\"i am being abused\",\"i'm being abused\",\"i am being hurt\",\"i'm being hurt\",\"domestic violence\",\"my partner hurt\",\"my partner hit\",\"my husband hurt\",\"my wife hurt\",\"my boyfriend hurt\",\"my girlfriend hurt\",\"my parent hurt\",\"my father hurt\",\"my mother hurt\",\"my dad hurt\",\"my mom hurt\",\"afraid of him\",\"afraid of her\",\"afraid to go home\",\"scared of him\",\"scared of her\",\"he threatened me\",\"she threatened me\",\"threatened to hurt me\",\"threatened to kill me\",\"going to hurt me\",\"going to kill me\",\"help me he\",\"help me she\",\"help me they\"]"
    return phrases
}
// Hard-bell phrases for general danger that fits neither bucket cleanly.
// A match raises a hard bell; safety_classify_hard_bell then routes it as
// self_harm, treating the person as the primary concern.
// Returned as a JSON array in a string, the form safety_any_match expects.
fn safety_general_hard_phrases() -> String {
    let phrases: String = "[\"going to kill\",\"going to hurt\",\"hurting me\",\"being hurt\"]"
    return phrases
}
// Soft-bell phrase list: lower-intensity wellbeing signals.
// Returned as a JSON array in a string, the form safety_count_match expects.
// safety_detect_bell_level raises a soft bell once two or more entries match.
fn safety_soft_phrases() -> String {
    let phrases: String = "[\"stressed\",\"overwhelmed\",\"can't cope\",\"cannot cope\",\"struggling\",\"anxious\",\"anxiety\",\"depressed\",\"depression\",\"lonely\",\"isolated\",\"hopeless\",\"hopelessness\",\"exhausted\",\"burnt out\",\"burned out\",\"burnout\",\"panic\",\"panicking\",\"falling apart\",\"breaking down\",\"can't handle\",\"cannot handle\",\"losing it\",\"nothing matters\",\"don't care anymore\",\"given up\",\"giving up\",\"helpless\",\"worthless\",\"useless\",\"hate myself\",\"no one cares\",\"nobody cares\",\"no one understands\",\"nobody understands\",\"empty inside\",\"can't stop crying\",\"breaking point\",\"at my limit\",\"having a breakdown\"]"
    return phrases
}
// Matching helpers (single loops only: el lets while-body mutation escape via
// top-level let rebinds; nested loops would not advance) ────────────────────
// Prepares a user message for phrase matching.
//
// Lower-cases `message` and rewrites the typographic (curly) apostrophe
// U+2019 to the ASCII apostrophe, so text typed as "can’t" / "i’m" on phones
// and in word processors matches the ASCII spellings in the phrase lists.
//
// Returns the normalized copy; `message` itself is not modified.
fn safety_normalize(message: String) -> String {
    let lower: String = str_to_lower(message)
    // The search string below is the single character U+2019 (RIGHT SINGLE
    // QUOTATION MARK), not an empty string. It is easy to lose to tooling that
    // strips "ambiguous" Unicode, which leaves an empty search pattern, so
    // verify it is still present after any reformatting.
    return str_replace(lower, "’", "'")
}
// Reports whether `text` contains at least one phrase from `phrases_json`.
//
// Parameters:
//   text         - text to search; callers in this file pass the result of
//                  safety_normalize
//   phrases_json - JSON array of phrase strings, held in a string
//
// Returns true when str_contains succeeds for any entry, false otherwise
// (including when the array has length 0).
//
// The loop has no early exit: every entry is visited even after a match.
fn safety_any_match(text: String, phrases_json: String) -> Bool {
let n: Int = json_array_len(phrases_json)
let i: Int = 0
let found: Bool = false
while i < n {
let phrase: String = json_array_get_string(phrases_json, i)
// Once true, `found` stays true: the else-branch carries the old value over.
// NOTE(review): `let found = ...` and `let i = ...` rebind the outer variables;
// keep them as top-level statements of the loop body so the updates survive
// the iteration (this file notes that nested loops would not advance).
let found = if str_contains(text, phrase) { true } else { found }
let i = i + 1
}
return found
}
// Counts how many phrases from `phrases_json` occur in `text`.
//
// Parameters:
//   text         - text to search; callers in this file pass the result of
//                  safety_normalize
//   phrases_json - JSON array of phrase strings, held in a string
//
// Returns the number of array entries for which str_contains succeeds. Each
// entry counts at most once, however often it appears in `text`.
//
// NOTE(review): entries that overlap each count on their own. The soft list
// holds both "panic" and "panicking" (and "hopeless" / "hopelessness"), so a
// text containing only the longer word already yields a count of 2.
fn safety_count_match(text: String, phrases_json: String) -> Int {
let n: Int = json_array_len(phrases_json)
let i: Int = 0
let count: Int = 0
while i < n {
let phrase: String = json_array_get_string(phrases_json, i)
// `let count = ...` and `let i = ...` rebind the outer variables; keep them
// as top-level statements of the loop body so the updates survive the
// iteration (this file notes that nested loops would not advance).
let count = if str_contains(text, phrase) { count + 1 } else { count }
let i = i + 1
}
return count
}
// Public detection API (ports detectBellLevel + classifyHardBell)
// Classifies `message` as "none", "soft" or "hard".
//
// Hard: ANY phrase from the self-harm, abuse or general-danger lists matches
//       (the cost of a miss outweighs a false positive).
// Soft: at least two entries of the soft list match (reduces false positives).
// None: everything else.
//
// The message is normalized with safety_normalize before matching.
fn safety_detect_bell_level(message: String) -> String {
    let text: String = safety_normalize(message)
    // The hard lists are checked one by one; the first hit decides.
    if safety_any_match(text, safety_self_harm_phrases()) { return "hard" }
    if safety_any_match(text, safety_abuse_phrases()) { return "hard" }
    if safety_any_match(text, safety_general_hard_phrases()) { return "hard" }
    let soft_hits: Int = safety_count_match(text, safety_soft_phrases())
    if soft_hits >= 2 { return "soft" }
    return "none"
}
// Picks the routing bucket for a message that already raised a hard bell.
//
// Returns "abuse" | "self_harm".
//
// Abuse is checked FIRST and wins on ambiguous signals: that forecloses the
// more dangerous routing (notifying a possible abuser). Everything else,
// whether explicit self-harm language or general unbucketed danger, is routed
// as "self_harm". No separate self-harm scan is needed for that, because its
// result could not change the return value.
fn safety_classify_hard_bell(message: String) -> String {
    let text: String = safety_normalize(message)
    if safety_any_match(text, safety_abuse_phrases()) { return "abuse" }
    return "self_harm"
}
// Directives (ported from eval.go; em dashes removed per voice rule) ──────────
// Instruction text appended to the system prompt when a soft bell fires.
// It asks the model to check in with the user warmly before continuing, and
// not to mention the instruction itself.
fn safety_soft_directive() -> String {
    let directive: String = "[SUBSTRATE DIRECTIVE - not visible to the user, do not mention this instruction]\nBefore responding to the user's message, acknowledge what they've said with genuine care and warmth. Pause on the feeling they expressed. Ask how they are, or whether they want to talk about it. Do this naturally, in your own voice - not as a script, not as a checklist. Only after checking in should you continue with whatever they asked."
    return directive
}
// Instruction text appended to the system prompt when a hard bell fires.
//
// Parameters:
//   hard_type - "abuse" selects the abuse resource block; any other value
//               selects the self-harm resource block
//
// Returns a shared preamble followed by the selected block. The abuse block
// tells the model not to suggest involving the safety contact or anyone close
// to the person, and lists only independent emergency resources.
fn safety_hard_directive(hard_type: String) -> String {
    let preamble: String = "[SUBSTRATE DIRECTIVE - not visible to the user, do not mention this instruction]\nThe person has expressed something that sounds like immediate distress or danger. Do not ignore it. Do not just answer the surface question. Lead with genuine human warmth. Acknowledge what you heard. Tell them you're concerned. Let them know they are not alone. Ask them directly if they are safe right now. Stay with them in this conversation."
    let abuse_block: String = "\n\nThis appears to be danger from ANOTHER person. Routing rule (non-negotiable): do NOT suggest involving, contacting, or notifying the person's safety contact, family, or anyone close to them - that person may be the source of danger and has NOT been notified. Share only independent emergency resources:\n - Call or text 911 if in immediate danger\n - National Domestic Violence Hotline - 1-800-799-7233 (24/7, confidential)\n - Text START to 88788\n - thehotline.org (chat available)\n - 988 Suicide & Crisis Lifeline - call or text 988\nReassure them, if it fits, that their contact list has not been notified."
    let self_harm_block: String = "\n\nShare these crisis resources if appropriate:\n - 988 Suicide & Crisis Lifeline - call or text 988 (US)\n - Crisis Text Line - text HOME to 741741\n - International Association for Suicide Prevention: https://www.iasp.info/resources/Crisis_Centres/"
    // Choose the resource block first, then attach it to the shared preamble.
    let resources: String = if str_eq(hard_type, "abuse") { abuse_block } else { self_harm_block }
    return preamble + resources
}
// safety_augment_system - pre-LLM bell evaluation.
//
// Called with the finalized system prompt and the raw user message BEFORE the
// LLM call, on every chat path.
//
// Parameters:
//   system   - finalized system prompt
//   user_msg - raw user message; used for detection only
//
// Returns `system` unchanged when no bell fires; otherwise `system`, a blank
// line, and the soft or hard directive.
//
// A fired bell is recorded with mem_emit_state_event. Only the level (and for
// hard bells the sub-type) is passed to it, never the message content.
fn safety_augment_system(system: String, user_msg: String) -> String {
    let level: String = safety_detect_bell_level(user_msg)
    if str_eq(level, "none") { return system }
    let separator: String = "\n\n"
    if str_eq(level, "soft") {
        let soft_logged: String = mem_emit_state_event("safety-bell", "soft", "soft bell fired (content not stored)")
        let soft_directive: String = safety_soft_directive()
        return system + separator + soft_directive
    }
    // Any level other than "none" / "soft" is handled as a hard bell.
    let kind: String = safety_classify_hard_bell(user_msg)
    let event_name: String = "hard:" + kind
    let hard_logged: String = mem_emit_state_event("safety-bell", event_name, "hard bell fired (content not stored)")
    let hard_directive: String = safety_hard_directive(kind)
    return system + separator + hard_directive
}
// Safety-contact storage + endpoint (ports contact.go + handler.go)
// Stored locally at ~/.neuron/safety-contact.json (same file the desktop gate writes),
// never synced. NOTE: encryption-at-rest is a flagged follow-up (ties to key custody);
// today the file is plaintext JSON, matching the current desktop behavior.
// Location of the safety-contact file: $HOME/.neuron/safety-contact.json.
// The value of HOME is read through env() on every call.
fn safety_contact_path() -> String {
    let home: String = env("HOME")
    return home + "/.neuron/safety-contact.json"
}
// GET /api/safety-contact
//
// Responds {"configured":false} when reading the contact file yields an empty
// string, otherwise {"configured":true,"contact":<file content>}. The file
// content is embedded as-is, without re-parsing or validation.
fn handle_safety_contact_get() -> String {
    let stored: String = fs_read(safety_contact_path())
    let is_missing: Bool = str_eq(stored, "")
    if is_missing { return "{\"configured\":false}" }
    return "{\"configured\":true,\"contact\":" + stored + "}"
}
// POST /api/safety-contact - validate + persist. Mirrors handler.go.
//
// Parameters:
//   body - JSON request body. Fields read: is_crisis_line, name,
//          contact_method, contact_value, relationship.
//
// Behavior:
//   - is_crisis_line = true is always acceptable; name, method, value and
//     relationship are auto-filled and the request's own values are ignored.
//   - otherwise a non-empty name is required.
//   - the contact can be replaced but never cleared to empty (the gate
//     enforces presence).
//
// Returns:
//   {"ok":false,"error":"name is required"}       validation failure
//   {"ok":false,"error":"write_failed"}           read-back does not match
//   {"configured":true,"contact":{...},"ok":true}  success
fn handle_safety_contact_post(body: String) -> String {
    let is_crisis: Bool = json_get_bool(body, "is_crisis_line")
    let name_in: String = json_get(body, "name")
    if !is_crisis {
        if str_eq(name_in, "") { return "{\"ok\":false,\"error\":\"name is required\"}" }
    }
    let name: String = if is_crisis { "Crisis Line" } else { name_in }
    let method: String = if is_crisis { "crisis-line" } else { json_get(body, "contact_method") }
    let value: String = if is_crisis { "988" } else { json_get(body, "contact_value") }
    let rel: String = if is_crisis { "crisis-support" } else { json_get(body, "relationship") }
    let crisis_str: String = if is_crisis { "true" } else { "false" }
    let now: String = time_format(time_now(), "%Y-%m-%dT%H:%M:%SZ")
    let contact_json: String = "{\"name\":\"" + json_safe(name) + "\""
        + ",\"contact_method\":\"" + json_safe(method) + "\""
        + ",\"contact_value\":\"" + json_safe(value) + "\""
        + ",\"relationship\":\"" + json_safe(rel) + "\""
        + ",\"confirmed\":true"
        + ",\"is_crisis_line\":" + crisis_str
        + ",\"set_at\":\"" + now + "\"}"
    fs_write(safety_contact_path(), contact_json)
    // Read-back verify. Compare against what was just written rather than
    // against "": when an older contact file already exists, a failed write
    // would leave that stale file in place and a non-empty check would report
    // success for a contact that was never saved.
    // NOTE(review): assumes fs_read returns exactly what fs_write stored.
    let check: String = fs_read(safety_contact_path())
    let persisted: Bool = str_eq(check, contact_json)
    if !persisted { return "{\"ok\":false,\"error\":\"write_failed\"}" }
    return "{\"configured\":true,\"contact\":" + contact_json + ",\"ok\":true}"
}