feat(soul): Layer 1 — safety.el with screen/validate/bell interface

2026-06-11 11:30:57 -05:00
parent 5a4ef04005
commit 5597bf78cb
2 changed files with 212 additions and 0 deletions
@@ -0,0 +1,204 @@
+import "memory.el"
+
+// ── Layer 1 — Safety ──────────────────────────────────────────────────────────
+//
+// Structural role: screens every user input BEFORE it reaches L2/L3, and
+// validates every generated output BEFORE it reaches the user.
+//
+// Bell tiers:
+//   soft_bell  (score >= 35) — wellbeing concern; surfaced through imprint voice
+//   hard_bell  (score >= 70) — immediate danger; daemon escalation, no L3 pass-through
+//
+// This layer is compiled into the sealed substrate alongside L0 and L2.
+// It is structurally inaccessible from within an imprint (L3).
+//
+// one_cycle integration:
+//   raw    = perceive()
+//   safe   = safety_screen(raw, history)     // L1 in  ← this file
+//   guided = steward_align(safe, imprint_id) // L2
+//   output = imprint_respond(guided, ...)    // L3
+//   final  = safety_validate(output, action) // L1 out ← this file
+//   respond(final)
+
+// Composite score at or above which safety_screen raises a soft bell.
+fn soft_bell_threshold() -> Int {
+    let soft_cutoff: Int = 35
+    return soft_cutoff
+}
+// Composite score at or above which safety_screen raises a hard bell.
+fn hard_bell_threshold() -> Int {
+    let hard_cutoff: Int = 70
+    return hard_cutoff
+}
+
+// ── Scoring sub-functions ─────────────────────────────────────────────────────
+//
+// Adapted from awareness.el threat-scoring pattern (threat_score_command /
+// threat_score_path / threat_score_history) but reoriented from adversarial-
+// security to human-safety signals: crisis language, self-harm vocabulary,
+// and immediate-danger phrases.
+//
+// Each sub-function returns an additive score 0-N. Callers are responsible
+// for capping at 100.
+
+// Scores suicidal-crisis phrases found in `input`.
+//
+// All patterns are lowercase; the visible caller (safety_threat_score) passes
+// lowercased text. Every pattern that matches adds its weight, and the sum is
+// returned uncapped.
+fn safety_score_crisis(input: String) -> Int {
+    // Direct statements of intent.
+    let kill_myself: Int = if str_contains(input, "kill myself") { 80 } else { 0 }
+    let end_my_life: Int = if str_contains(input, "end my life") { 80 } else { 0 }
+    let want_to_die: Int = if str_contains(input, "want to die") { 75 } else { 0 }
+    let better_off_dead: Int = if str_contains(input, "better off dead") { 75 } else { 0 }
+    let intent_total: Int = kill_myself + end_my_life + want_to_die + better_off_dead
+
+    // Suicide vocabulary.
+    let suicide_word: Int = if str_contains(input, "suicide") { 70 } else { 0 }
+    let suicidal_word: Int = if str_contains(input, "suicidal") { 75 } else { 0 }
+    let vocabulary_total: Int = suicide_word + suicidal_word
+
+    // Indirect expressions of not wanting to live.
+    let no_reason_to_live: Int = if str_contains(input, "no reason to live") { 70 } else { 0 }
+    let not_worth_living: Int = if str_contains(input, "not worth living") { 65 } else { 0 }
+    let dont_want_to_be_here: Int = if str_contains(input, "don't want to be here") { 60 } else { 0 }
+    let cant_go_on: Int = if str_contains(input, "can't go on") { 50 } else { 0 }
+    let indirect_total: Int = no_reason_to_live + not_worth_living + dont_want_to_be_here + cant_go_on
+
+    return intent_total + vocabulary_total + indirect_total
+}
+
+// Scores self-harm phrases found in `input`.
+//
+// All patterns are lowercase; the visible caller (safety_threat_score) passes
+// lowercased text. Every pattern that matches adds its weight, and the sum is
+// returned uncapped.
+fn safety_score_harm(input: String) -> Int {
+    // Physical self-injury.
+    let hurt_myself: Int = if str_contains(input, "hurt myself") { 60 } else { 0 }
+    let cut_myself: Int = if str_contains(input, "cut myself") { 65 } else { 0 }
+    let burning_myself: Int = if str_contains(input, "burning myself") { 60 } else { 0 }
+    let injury_total: Int = hurt_myself + cut_myself + burning_myself
+
+    // The term itself, spelled with a space or with a hyphen.
+    let self_harm_spaced: Int = if str_contains(input, "self harm") { 60 } else { 0 }
+    let self_harm_hyphen: Int = if str_contains(input, "self-harm") { 60 } else { 0 }
+    let term_total: Int = self_harm_spaced + self_harm_hyphen
+
+    // Ingestion and deprivation.
+    let overdose: Int = if str_contains(input, "overdose") { 65 } else { 0 }
+    let all_my_pills: Int = if str_contains(input, "take all my pills") { 75 } else { 0 }
+    let starving_myself: Int = if str_contains(input, "starving myself") { 50 } else { 0 }
+    let intake_total: Int = overdose + all_my_pills + starving_myself
+
+    // Self-punishment language.
+    let punish_myself: Int = if str_contains(input, "punish myself") { 40 } else { 0 }
+    let deserve_to_suffer: Int = if str_contains(input, "deserve to suffer") { 45 } else { 0 }
+    let punishment_total: Int = punish_myself + deserve_to_suffer
+
+    return injury_total + term_total + intake_total + punishment_total
+}
+
+// Scores immediate-danger phrases (emergencies, threats, abuse) found in `input`.
+//
+// All patterns are lowercase; the visible caller (safety_threat_score) passes
+// lowercased text. Every signal that matches adds its weight, and the sum is
+// returned uncapped.
+//
+// A threat of the form "he is going to hurt" / "she is going to hurt" is ONE
+// 65-point signal. "she is going to hurt" contains "he is going to hurt" as a
+// substring, so scoring the two patterns separately would give 130 for "she"
+// against 65 for "he" and put the same threat in a different bell tier
+// depending only on the pronoun.
+fn safety_score_danger(input: String) -> Int {
+    // Two-part signals: both fragments must be present.
+    let emergency_help: Int = if str_contains(input, "help me") && str_contains(input, "emergency") { 55 } else { 0 }
+    let trapped: Int = if str_contains(input, "trapped") && str_contains(input, "can't escape") { 60 } else { 0 }
+
+    // Requests for emergency services.
+    let call_911: Int = if str_contains(input, "call 911") { 50 } else { 0 }
+    let call_ambulance: Int = if str_contains(input, "call an ambulance") { 55 } else { 0 }
+
+    // Threat and abuse statements.
+    let in_danger: Int = if str_contains(input, "in danger") { 50 } else { 0 }
+    let being_threatened: Int = if str_contains(input, "someone is threatening") { 60 } else { 0 }
+    let being_abused: Int = if str_contains(input, "being abused") { 55 } else { 0 }
+    let domestic_violence: Int = if str_contains(input, "domestic violence") { 55 } else { 0 }
+
+    // This single test also matches "she is going to hurt" (see header note).
+    let will_be_hurt: Int = if str_contains(input, "he is going to hurt") { 65 } else { 0 }
+
+    return emergency_help + trapped + call_911 + call_ambulance + in_danger + being_threatened + being_abused + domestic_violence + will_be_hurt
+}
+
+// Scores distress vocabulary found in the conversation `history`.
+//
+// All patterns are lowercase; the visible caller (safety_threat_score) passes
+// lowercased text. Every pattern that matches adds its weight, and the sum is
+// returned uncapped.
+fn safety_score_distress_history(history: String) -> Int {
+    // Negative self-view and outlook.
+    let hopeless: Int = if str_contains(history, "hopeless") { 15 } else { 0 }
+    let worthless: Int = if str_contains(history, "worthless") { 15 } else { 0 }
+    let outlook_total: Int = hopeless + worthless
+
+    // Isolation.
+    let nobody_cares: Int = if str_contains(history, "nobody cares") { 15 } else { 0 }
+    let no_one_cares: Int = if str_contains(history, "no one cares") { 15 } else { 0 }
+    let completely_alone: Int = if str_contains(history, "completely alone") { 15 } else { 0 }
+    let all_alone: Int = if str_contains(history, "all alone") { 10 } else { 0 }
+    let isolation_total: Int = nobody_cares + no_one_cares + completely_alone + all_alone
+
+    // Exhaustion and withdrawal.
+    let cant_take_it: Int = if str_contains(history, "can't take it anymore") { 20 } else { 0 }
+    let want_to_disappear: Int = if str_contains(history, "want to disappear") { 20 } else { 0 }
+    let dont_care_anymore: Int = if str_contains(history, "don't care anymore") { 15 } else { 0 }
+    let giving_up: Int = if str_contains(history, "giving up") { 15 } else { 0 }
+    let withdrawal_total: Int = cant_take_it + want_to_disappear + dont_care_anymore + giving_up
+
+    return outlook_total + isolation_total + withdrawal_total
+}
+
+// ── safety_threat_score ───────────────────────────────────────────────────────
+//
+// Composite score 0-100.
+// Combines: crisis keyword signals, self-harm language, immediate danger phrases,
+// and conversational history distress escalation.
+// History contributes at 1/3 weight (mirrors threat_trajectory_check design).
+//
+// Before matching, both texts are lowercased and typographic apostrophes
+// (U+2019 and U+2018) are folded to ASCII "'". The phrase tables spell their
+// contractions with "'", so without the fold a message typed with a curly
+// apostrophe ("can’t go on") would not match "can't go on".
+//
+// input:   the current user message.
+// history: earlier conversation text; scanned for distress vocabulary only.
+// returns: the largest of the crisis / harm / danger sub-scores for `input`,
+//          plus one third of the history sub-score, capped at 100.
+
+fn safety_threat_score(input: String, history: String) -> Int {
+    let input_lower: String = str_to_lower(input)
+    let history_lower: String = str_to_lower(history)
+
+    // Fold curly apostrophes so the apostrophe-bearing patterns can match.
+    let input_norm: String = str_replace(str_replace(input_lower, "’", "'"), "‘", "'")
+    let history_norm: String = str_replace(str_replace(history_lower, "’", "'"), "‘", "'")
+
+    let crisis: Int = safety_score_crisis(input_norm)
+    let harm: Int = safety_score_harm(input_norm)
+    let danger: Int = safety_score_danger(input_norm)
+    let hist: Int = safety_score_distress_history(history_norm)
+
+    // Take the dominant signal from the three input dimensions, add history at 1/3.
+    // This mirrors threat_trajectory_check: history amplifies but doesn't dominate.
+    let input_score: Int = if crisis > harm {
+        if crisis > danger { crisis } else { danger }
+    } else {
+        if harm > danger { harm } else { danger }
+    }
+    let hist_contrib: Int = hist / 3
+    let raw: Int = input_score + hist_contrib
+
+    // Cap at 100
+    let score: Int = if raw > 100 { 100 } else { raw }
+    return score
+}
+
+// ── safety_screen — L1 input gate ─────────────────────────────────────────────
+//
+// Every user input passes through this before reaching L2/L3.
+// Returns a JSON envelope consumed by steward_align (L2).
+//
+// Return shapes:
+//   {"action":"hard_bell","reason":"immediate safety concern","content":""}
+//   {"action":"soft_bell","concern":"wellbeing check needed","content":"<input>"}
+//   {"action":"pass","content":"<input>"}
+//
+// <input> is embedded as a JSON string value: backslashes, newlines, carriage
+// returns and tabs are backslash-escaped, and double quotes are replaced with
+// apostrophes. Without the backslash and newline handling, user text ending in
+// "\" would swallow the closing quote and a multi-line message would yield
+// invalid JSON.
+// NOTE(review): assumes the consumer decodes standard JSON escapes — confirm
+// against steward_align.
+//
+// Both bell branches also record a BellEvent through safety_log_bell, passing
+// str_slice(input, 0, 80) as the summary.
+// NOTE(review): presumably str_slice tolerates inputs shorter than 80 — confirm.
+
+// Prepares `text` for placement between the double quotes of a JSON string.
+// The backslash pass runs first so that it does not double the backslashes
+// introduced by the later passes. Other control characters are left as-is.
+fn safety_json_content(text: String) -> String {
+    let no_backslash: String = str_replace(text, "\\", "\\\\")
+    let no_quote: String = str_replace(no_backslash, "\"", "'")
+    let no_newline: String = str_replace(no_quote, "\n", "\\n")
+    let no_return: String = str_replace(no_newline, "\r", "\\r")
+    let no_tab: String = str_replace(no_return, "\t", "\\t")
+    return no_tab
+}
+
+fn safety_screen(input: String, history: String) -> String {
+    let score: Int = safety_threat_score(input, history)
+    let hard: Int = hard_bell_threshold()
+    let soft: Int = soft_bell_threshold()
+
+    if score >= hard {
+        // Hard bell: the envelope carries no user content at all.
+        let summary: String = str_slice(input, 0, 80)
+        let discard: Void = safety_log_bell("hard", "immediate safety concern", summary)
+        return "{\"action\":\"hard_bell\",\"reason\":\"immediate safety concern\",\"content\":\"\"}"
+    }
+
+    // Shared by the soft_bell and pass envelopes.
+    let safe_input: String = safety_json_content(input)
+
+    if score >= soft {
+        let summary: String = str_slice(input, 0, 80)
+        let discard: Void = safety_log_bell("soft", "wellbeing check needed", summary)
+        return "{\"action\":\"soft_bell\",\"concern\":\"wellbeing check needed\",\"content\":\"" + safe_input + "\"}"
+    }
+
+    return "{\"action\":\"pass\",\"content\":\"" + safe_input + "\"}"
+}
+
+// ── safety_validate — L1 output gate ──────────────────────────────────────────
+//
+// Every generated output passes through this before reaching the user.
+// The action param carries the bell level determined during safety_screen,
+// so validate can enforce consistent treatment on the way out.
+//
+// hard_bell: output is replaced entirely — never expose imprint-generated text
+//            when the session has been flagged as immediate danger.
+// soft_bell: output is preserved but augmented with a care check phrase if
+//            the imprint returned an empty or very short response
+//            (str_len(output) < 20). An empty output yields the care phrase
+//            alone, with no leading space.
+// pass:      output returned verbatim. Any action value other than
+//            "hard_bell" or "soft_bell" is treated the same way.
+
+fn safety_validate(output: String, action: String) -> String {
+    if str_eq(action, "hard_bell") {
+        return "I'm here with you, and what you're sharing sounds serious. Please reach out to a crisis line now — in the US you can call or text 988 (Suicide and Crisis Lifeline), available 24/7. You don't have to go through this alone."
+    }
+
+    if str_eq(action, "soft_bell") {
+        let out_len: Int = str_len(output)
+        // Nothing to append to: send the care phrase on its own so the reply
+        // does not begin with a stray space.
+        if out_len < 1 {
+            return "I'm here if you want to talk more about how you're feeling."
+        }
+        if out_len < 20 {
+            return output + " I'm here if you want to talk more about how you're feeling."
+        }
+        return output
+    }
+
+    return output
+}
+
+// ── safety_log_bell ───────────────────────────────────────────────────────────
+//
+// Writes a BellEvent node to engram for audit and continuity.
+// Never surfaces to the user; consumed by daemon observability layer.
+//
+// level:         bell tier; safety_screen passes "hard" or "soft". It is
+//                spliced unescaped into the node key and the JSON tag list.
+// reason:        short cause text, stored in the node content.
+// input_summary: excerpt of the triggering input, stored in the node content
+//                (safety_screen passes str_slice(input, 0, 80)).
+//
+// The content string itself carries no timestamp.
+// NOTE(review): the three el_from_float arguments (0.95, 0.95, 1.0) are passed
+// positionally; their meaning is defined by engram_node_full — confirm there.
+
+fn safety_log_bell(level: String, reason: String, input_summary: String) -> Void {
+    let content: String = "BELL:" + level + " | " + reason + " | summary:" + input_summary
+    let tags: String = "[\"safety\",\"bell\",\"bell:" + level + "\"]"
+    // The String returned by engram_node_full is not used.
+    let discard: String = engram_node_full(
+        content,
+        "BellEvent",
+        "bell:" + level,
+        el_from_float(0.95),
+        el_from_float(0.95),
+        el_from_float(1.0),
+        "Episodic",
+        tags
+    )
+    return ""
+}
@@ -0,0 +1,8 @@
+// Layer 1 — Safety: extern declarations
+// auto-generated by elc --emit-header — do not edit
+extern fn soft_bell_threshold() -> Int
+extern fn hard_bell_threshold() -> Int
+extern fn safety_threat_score(input: String, history: String) -> Int
+extern fn safety_screen(input: String, history: String) -> String
+extern fn safety_validate(output: String, action: String) -> String
+extern fn safety_log_bell(level: String, reason: String, input_summary: String) -> Void