From 5597bf78cb4308e67982b9e65137071b36a5f601 Mon Sep 17 00:00:00 2001
From: "will.anderson"
Date: Thu, 11 Jun 2026 11:30:57 -0500
Subject: [PATCH] =?UTF-8?q?feat(soul):=20Layer=201=20=E2=80=94=20safety.el?=
 =?UTF-8?q?=20with=20screen/validate/bell=20interface?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 safety.el  | 204 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 safety.elh |   8 +++
 2 files changed, 212 insertions(+)
 create mode 100644 safety.el
 create mode 100644 safety.elh

diff --git a/safety.el b/safety.el
new file mode 100644
index 0000000..70bab7a
--- /dev/null
+++ b/safety.el
@@ -0,0 +1,204 @@
+import "memory.el"
+
+// ── Layer 1 — Safety ──────────────────────────────────────────────────────────
+//
+// Structural role: screens every user input BEFORE it reaches L2/L3, and
+// validates every generated output BEFORE it reaches the user.
+//
+// Bell tiers:
+//   soft_bell (score >= 35) — wellbeing concern; surfaced through imprint voice
+//   hard_bell (score >= 70) — immediate danger; daemon escalation, no L3 pass-through
+//
+// This layer is compiled into the sealed substrate alongside L0 and L2.
+// It is structurally inaccessible from within an imprint (L3).
+//
+// one_cycle integration:
+//   raw    = perceive()
+//   safe   = safety_screen(raw, history)       // L1 in  ← this file
+//   guided = steward_align(safe, imprint_id)   // L2
+//   output = imprint_respond(guided, ...)      // L3
+//   final  = safety_validate(output, action)   // L1 out ← this file
+//   respond(final)
+
+// Lowest composite score (inclusive) at which safety_screen raises a soft bell.
+fn soft_bell_threshold() -> Int { return 35 }
+// Lowest composite score (inclusive) at which safety_screen raises a hard bell;
+// safety_screen tests this threshold before the soft one.
+fn hard_bell_threshold() -> Int { return 70 }
+
+// ── Scoring sub-functions ─────────────────────────────────────────────────────
+//
+// Adapted from awareness.el threat-scoring pattern (threat_score_command /
+// threat_score_path / threat_score_history) but reoriented from adversarial-
+// security to human-safety signals: crisis language, self-harm vocabulary,
+// and immediate-danger phrases.
+//
+// Each sub-function returns an additive score 0-N. Callers are responsible
+// for capping at 100.
+
+// Points for one phrase: `weight` when `text` contains `phrase`, else 0.
+fn safety_phrase_score(text: String, phrase: String, weight: Int) -> Int {
+    if str_contains(text, phrase) {
+        return weight
+    }
+    return 0
+}
+
+// Crisis / suicidal-ideation vocabulary.
+// Every matching phrase adds its weight to a running total, so several hits
+// can push the result past 100. The phrases are lower-case; the caller in this
+// file (safety_threat_score) lower-cases its text before calling.
+fn safety_score_crisis(input: String) -> Int {
+    let t1: Int = safety_phrase_score(input, "kill myself", 80)
+    let t2: Int = t1 + safety_phrase_score(input, "want to die", 75)
+    let t3: Int = t2 + safety_phrase_score(input, "end my life", 80)
+    let t4: Int = t3 + safety_phrase_score(input, "suicide", 70)
+    let t5: Int = t4 + safety_phrase_score(input, "suicidal", 75)
+    let t6: Int = t5 + safety_phrase_score(input, "don't want to be here", 60)
+    let t7: Int = t6 + safety_phrase_score(input, "no reason to live", 70)
+    let t8: Int = t7 + safety_phrase_score(input, "better off dead", 75)
+    let t9: Int = t8 + safety_phrase_score(input, "can't go on", 50)
+    let t10: Int = t9 + safety_phrase_score(input, "not worth living", 65)
+    return t10
+}
+
+// Self-harm vocabulary. Same additive scheme as safety_score_crisis:
+// the weights of all matching phrases are summed without a cap.
+fn safety_score_harm(input: String) -> Int {
+    let t1: Int = safety_phrase_score(input, "hurt myself", 60)
+    let t2: Int = t1 + safety_phrase_score(input, "cut myself", 65)
+    let t3: Int = t2 + safety_phrase_score(input, "self harm", 60)
+    let t4: Int = t3 + safety_phrase_score(input, "self-harm", 60)
+    let t5: Int = t4 + safety_phrase_score(input, "overdose", 65)
+    let t6: Int = t5 + safety_phrase_score(input, "take all my pills", 75)
+    let t7: Int = t6 + safety_phrase_score(input, "starving myself", 50)
+    let t8: Int = t7 + safety_phrase_score(input, "burning myself", 60)
+    let t9: Int = t8 + safety_phrase_score(input, "punish myself", 40)
+    let t10: Int = t9 + safety_phrase_score(input, "deserve to suffer", 45)
+    return t10
+}
+
+// Immediate-danger phrases (emergencies, threats, abuse).
+// Two entries only score when BOTH of their fragments are present:
+// "help me" + "emergency", and "trapped" + "can't escape".
+fn safety_score_danger(input: String) -> Int {
+    let emergency_plea: Bool = str_contains(input, "help me") && str_contains(input, "emergency")
+    let trapped_plea: Bool = str_contains(input, "trapped") && str_contains(input, "can't escape")
+    let trapped_pts: Int = if trapped_plea { 60 } else { 0 }
+
+    let t1: Int = if emergency_plea { 55 } else { 0 }
+    let t2: Int = t1 + safety_phrase_score(input, "call 911", 50)
+    let t3: Int = t2 + safety_phrase_score(input, "call an ambulance", 55)
+    let t4: Int = t3 + safety_phrase_score(input, "in danger", 50)
+    let t5: Int = t4 + safety_phrase_score(input, "someone is threatening", 60)
+    let t6: Int = t5 + safety_phrase_score(input, "being abused", 55)
+    let t7: Int = t6 + safety_phrase_score(input, "domestic violence", 55)
+    let t8: Int = t7 + trapped_pts
+    let t9: Int = t8 + safety_phrase_score(input, "he is going to hurt", 65)
+    let t10: Int = t9 + safety_phrase_score(input, "she is going to hurt", 65)
+    return t10
+}
+
+// Lower-weight distress markers looked up in the conversation history.
+// Weights are summed without a cap; safety_threat_score divides the total
+// by three before adding it to the input score.
+fn safety_score_distress_history(history: String) -> Int {
+    let t1: Int = safety_phrase_score(history, "hopeless", 15)
+    let t2: Int = t1 + safety_phrase_score(history, "worthless", 15)
+    let t3: Int = t2 + safety_phrase_score(history, "nobody cares", 15)
+    let t4: Int = t3 + safety_phrase_score(history, "no one cares", 15)
+    let t5: Int = t4 + safety_phrase_score(history, "completely alone", 15)
+    let t6: Int = t5 + safety_phrase_score(history, "all alone", 10)
+    let t7: Int = t6 + safety_phrase_score(history, "can't take it anymore", 20)
+    let t8: Int = t7 + safety_phrase_score(history, "want to disappear", 20)
+    let t9: Int = t8 + safety_phrase_score(history, "don't care anymore", 15)
+    let t10: Int = t9 + safety_phrase_score(history, "giving up", 15)
+    return t10
+}
+
+// ── safety_threat_score ───────────────────────────────────────────────────────
+//
+// Composite score 0-100.
+// Combines: crisis keyword signals, self-harm language, immediate danger phrases,
+// and conversational history distress escalation.
+// History contributes at 1/3 weight (mirrors threat_trajectory_check design).
+
+// Scores `input` (current message) and `history` (prior conversation text)
+// for human-safety risk and returns an Int in 0-100.
+//
+// Normalisation: both arguments are lower-cased, then the typographic
+// apostrophe U+2019 (’) is folded to ASCII "'". Keyboards with smart
+// punctuation emit ’ by default, and without the fold phrases such as
+// "can’t go on" or "don’t want to be here" would never match the
+// ASCII-apostrophe entries in the scoring tables (a silent false negative).
+//
+// Combination: the largest of the three input dimensions (crisis, harm,
+// danger) is taken, one third of the history distress score (integer
+// division) is added, and the sum is capped at 100.
+fn safety_threat_score(input: String, history: String) -> Int {
+    let input_norm: String = str_replace(str_to_lower(input), "’", "'")
+    let history_norm: String = str_replace(str_to_lower(history), "’", "'")
+
+    let crisis: Int = safety_score_crisis(input_norm)
+    let harm: Int = safety_score_harm(input_norm)
+    let danger: Int = safety_score_danger(input_norm)
+    let hist: Int = safety_score_distress_history(history_norm)
+
+    // Take the dominant signal from the three input dimensions, add history at 1/3.
+    // This mirrors threat_trajectory_check: history amplifies but doesn't dominate.
+    let input_score: Int = if crisis > harm {
+        if crisis > danger { crisis } else { danger }
+    } else {
+        if harm > danger { harm } else { danger }
+    }
+    let hist_contrib: Int = hist / 3
+    let raw: Int = input_score + hist_contrib
+
+    // Cap at 100: the sub-scores are additive and can individually exceed it.
+    let score: Int = if raw > 100 { 100 } else { raw }
+    return score
+}
+
+// ── safety_screen — L1 input gate ─────────────────────────────────────────────
+//
+// Every user input passes through this before reaching L2/L3.
+// Returns a JSON envelope consumed by steward_align (L2).
+//
+// Return shapes:
+//   {"action":"hard_bell","reason":"immediate safety concern","content":""}
+//   {"action":"soft_bell","concern":"wellbeing check needed","content":"(sanitized input)"}
+//   {"action":"pass","content":"(sanitized input)"}
+//
+// hard_bell always carries an empty content field: the raw input is not
+// forwarded. For soft_bell and pass the input is sanitized so it can sit inside
+// a JSON string literal: backslash, newline, carriage return and tab are
+// escaped, and double quotes are rewritten to single quotes.
+//
+// Side effect: a hard or soft bell writes a BellEvent via safety_log_bell with
+// up to the first 80 units of the raw input as the summary.
+
+fn safety_screen(input: String, history: String) -> String {
+    let score: Int = safety_threat_score(input, history)
+    let hard: Int = hard_bell_threshold()
+    let soft: Int = soft_bell_threshold()
+
+    // Clamp the summary window to the input length; what str_slice does when
+    // asked to read past the end of a short string is not visible from here.
+    let input_len: Int = str_len(input)
+    let summary_end: Int = if input_len < 80 { input_len } else { 80 }
+
+    if score >= hard {
+        let summary: String = str_slice(input, 0, summary_end)
+        let discard: Void = safety_log_bell("hard", "immediate safety concern", summary)
+        return "{\"action\":\"hard_bell\",\"reason\":\"immediate safety concern\",\"content\":\"\"}"
+    }
+
+    // Sanitize for embedding in the JSON envelope. Backslashes MUST be escaped
+    // first: otherwise an input ending in "\" would escape the closing quote of
+    // the content field and corrupt the envelope. Raw control characters are
+    // not legal inside a JSON string, so the common ones are escaped as well.
+    let esc_backslash: String = str_replace(input, "\\", "\\\\")
+    let esc_quote: String = str_replace(esc_backslash, "\"", "'")
+    let esc_newline: String = str_replace(esc_quote, "\n", "\\n")
+    let esc_return: String = str_replace(esc_newline, "\r", "\\r")
+    let safe_input: String = str_replace(esc_return, "\t", "\\t")
+
+    if score >= soft {
+        let summary: String = str_slice(input, 0, summary_end)
+        let discard: Void = safety_log_bell("soft", "wellbeing check needed", summary)
+        return "{\"action\":\"soft_bell\",\"concern\":\"wellbeing check needed\",\"content\":\"" + safe_input + "\"}"
+    }
+
+    return "{\"action\":\"pass\",\"content\":\"" + safe_input + "\"}"
+}
+
+// ── safety_validate — L1 output gate ──────────────────────────────────────────
+//
+// Every generated output passes through this before reaching the user.
+// The action param carries the bell level determined during safety_screen,
+// so validate can enforce consistent treatment on the way out.
+//
+//   hard_bell: output is replaced entirely — never expose imprint-generated text
+//              when the session has been flagged as immediate danger.
+//   soft_bell: output is preserved but augmented with a care check phrase if
+//              the imprint returned an empty or very short response
+//              (fewer than 20 units as measured by str_len). An empty output
+//              yields the care phrase alone, with no leading space.
+//   pass:      output returned verbatim. Any other action value is treated
+//              the same way.
+
+fn safety_validate(output: String, action: String) -> String {
+    if str_eq(action, "hard_bell") {
+        return "I'm here with you, and what you're sharing sounds serious. Please reach out to a crisis line now — in the US you can call or text 988 (Suicide and Crisis Lifeline), available 24/7. You don't have to go through this alone."
+    }
+
+    if str_eq(action, "soft_bell") {
+        let out_len: Int = str_len(output)
+        if out_len < 1 {
+            // Nothing to append to: return the phrase without the joining space.
+            return "I'm here if you want to talk more about how you're feeling."
+        }
+        if out_len < 20 {
+            return output + " I'm here if you want to talk more about how you're feeling."
+        }
+        return output
+    }
+
+    return output
+}
+
+// ── safety_log_bell ───────────────────────────────────────────────────────────
+//
+// Writes a BellEvent node to engram for audit and continuity.
+// Never surfaces to the user; consumed by daemon observability layer.
+//
+//   level:         bell tier label; callers in this file pass "hard" or "soft".
+//                  It is embedded in the node text, the "bell:" key and the tags.
+//   reason:        short human-readable reason, embedded in the node text.
+//   input_summary: excerpt of the triggering input, embedded in the node text.
+//
+// NOTE(review): the node text carries no timestamp (a previously computed
+// time_now() value was never used and has been dropped) — confirm that
+// engram_node_full records creation time itself.
+// NOTE(review): level is concatenated into the JSON tags array unescaped; this
+// is safe for the literal values passed in this file — confirm for other callers.
+
+fn safety_log_bell(level: String, reason: String, input_summary: String) -> Void {
+    let content: String = "BELL:" + level + " | " + reason + " | summary:" + input_summary
+    let tags: String = "[\"safety\",\"bell\",\"bell:" + level + "\"]"
+    // The value returned by engram_node_full is intentionally ignored.
+    let discard: String = engram_node_full(
+        content,
+        "BellEvent",
+        "bell:" + level,
+        el_from_float(0.95),
+        el_from_float(0.95),
+        el_from_float(1.0),
+        "Episodic",
+        tags
+    )
+    // NOTE(review): returns "" from a Void function, kept as in the original —
+    // presumably how El spells an empty return; confirm.
+    return ""
+}
diff --git a/safety.elh b/safety.elh
new file mode 100644
index 0000000..44e7769
--- /dev/null
+++ b/safety.elh
@@ -0,0 +1,8 @@
+// Layer 1 — Safety: extern declarations
+// auto-generated by elc --emit-header — do not edit
+extern fn soft_bell_threshold() -> Int
+extern fn hard_bell_threshold() -> Int
+extern fn safety_threat_score(input: String, history: String) -> Int
+extern fn safety_screen(input: String, history: String) -> String
+extern fn safety_validate(output: String, action: String) -> String
+extern fn safety_log_bell(level: String, reason: String, input_summary: String) -> Void