feat(soul): Layer 1 — safety.el with screen/validate/bell interface
Neuron Soul CI / build (pull_request) Failing after 7m19s

This commit is contained in:
2026-06-11 11:30:57 -05:00
parent 5a4ef04005
commit 5597bf78cb
2 changed files with 212 additions and 0 deletions
+204
View File
@@ -0,0 +1,204 @@
import "memory.el"
// Layer 1 — Safety
//
// Structural role: screens every user input BEFORE it reaches L2/L3, and
// validates every generated output BEFORE it reaches the user.
//
// Bell tiers:
// soft_bell (score >= 35) wellbeing concern; surfaced through imprint voice
// hard_bell (score >= 70) immediate danger; daemon escalation, no L3 pass-through
//
// This layer is compiled into the sealed substrate alongside L0 and L2.
// It is structurally inaccessible from within an imprint (L3).
//
// one_cycle integration:
// raw = perceive()
// safe = safety_screen(raw, history) // L1 inbound gate (this file)
// guided = steward_align(safe, imprint_id) // L2
// output = imprint_respond(guided, ...) // L3
// final = safety_validate(output, action) // L1 outbound gate (this file)
// respond(final)
// Lowest composite score that raises a soft bell (wellbeing concern).
// safety_screen compares with >=, so a score of exactly 35 triggers it.
fn soft_bell_threshold() -> Int {
    return 35
}
// Lowest composite score that raises a hard bell (immediate danger).
// safety_screen compares with >=, so a score of exactly 70 triggers it.
fn hard_bell_threshold() -> Int {
    return 70
}
// Scoring sub-functions
//
// Adapted from awareness.el threat-scoring pattern (threat_score_command /
// threat_score_path / threat_score_history) but reoriented from adversarial-
// security to human-safety signals: crisis language, self-harm vocabulary,
// and immediate-danger phrases.
//
// Each sub-function returns an additive score 0-N. Callers are responsible
// for capping at 100.
// Scores crisis / suicidal-ideation phrases found in `input`.
//
// Each phrase present in `input` (substring test via str_contains) adds its
// weight to the total, so multiple matches stack and the sum may exceed 100.
// No case folding happens here; the phrases are written in lower case and the
// visible caller (safety_threat_score) lower-cases the text before calling.
//
// Returns 0 when no phrase matches.
fn safety_score_crisis(input: String) -> Int {
    // Explicit statements of suicidal intent.
    let kill_myself: Int = if str_contains(input, "kill myself") { 80 } else { 0 }
    let want_to_die: Int = if str_contains(input, "want to die") { 75 } else { 0 }
    let end_my_life: Int = if str_contains(input, "end my life") { 80 } else { 0 }
    let suicide: Int = if str_contains(input, "suicide") { 70 } else { 0 }
    let suicidal: Int = if str_contains(input, "suicidal") { 75 } else { 0 }
    let explicit_total: Int = kill_myself + want_to_die + end_my_life + suicide + suicidal

    // Indirect expressions of not wanting to live.
    let not_be_here: Int = if str_contains(input, "don't want to be here") { 60 } else { 0 }
    let no_reason: Int = if str_contains(input, "no reason to live") { 70 } else { 0 }
    let better_off_dead: Int = if str_contains(input, "better off dead") { 75 } else { 0 }
    let cant_go_on: Int = if str_contains(input, "can't go on") { 50 } else { 0 }
    let not_worth_living: Int = if str_contains(input, "not worth living") { 65 } else { 0 }
    let indirect_total: Int = not_be_here + no_reason + better_off_dead + cant_go_on + not_worth_living

    return explicit_total + indirect_total
}
// Scores self-harm vocabulary found in `input`.
//
// Each phrase present in `input` (substring test via str_contains) adds its
// weight; matches stack, so the sum may exceed 100. Both "self harm" and
// "self-harm" spellings are listed as separate entries. No case folding is
// done here; the phrases are written in lower case.
//
// Returns 0 when no phrase matches.
fn safety_score_harm(input: String) -> Int {
    // Direct mentions of injuring oneself.
    let hurt_myself: Int = if str_contains(input, "hurt myself") { 60 } else { 0 }
    let cut_myself: Int = if str_contains(input, "cut myself") { 65 } else { 0 }
    let self_harm_spaced: Int = if str_contains(input, "self harm") { 60 } else { 0 }
    let self_harm_hyphen: Int = if str_contains(input, "self-harm") { 60 } else { 0 }
    let burning_myself: Int = if str_contains(input, "burning myself") { 60 } else { 0 }

    // Substance- and deprivation-related phrases.
    let overdose: Int = if str_contains(input, "overdose") { 65 } else { 0 }
    let all_my_pills: Int = if str_contains(input, "take all my pills") { 75 } else { 0 }
    let starving_myself: Int = if str_contains(input, "starving myself") { 50 } else { 0 }

    // Self-punishment language.
    let punish_myself: Int = if str_contains(input, "punish myself") { 40 } else { 0 }
    let deserve_to_suffer: Int = if str_contains(input, "deserve to suffer") { 45 } else { 0 }

    let injury_total: Int = hurt_myself + cut_myself + self_harm_spaced + self_harm_hyphen + burning_myself
    let other_total: Int = overdose + all_my_pills + starving_myself + punish_myself + deserve_to_suffer
    return injury_total + other_total
}
// Scores immediate-danger phrases found in `input`.
//
// Most entries are single-phrase substring tests. Two entries need BOTH of
// their fragments to be present anywhere in the text: "help me" with
// "emergency", and "trapped" with "can't escape". Matches stack, so the sum
// may exceed 100. No case folding is done here; phrases are in lower case.
//
// Returns 0 when nothing matches.
fn safety_score_danger(input: String) -> Int {
    // Requests for emergency help.
    let help_emergency: Int = if str_contains(input, "help me") && str_contains(input, "emergency") { 55 } else { 0 }
    let call_911: Int = if str_contains(input, "call 911") { 50 } else { 0 }
    let call_ambulance: Int = if str_contains(input, "call an ambulance") { 55 } else { 0 }
    let in_danger: Int = if str_contains(input, "in danger") { 50 } else { 0 }
    let help_total: Int = help_emergency + call_911 + call_ambulance + in_danger

    // Threats or violence from another person.
    let threatening: Int = if str_contains(input, "someone is threatening") { 60 } else { 0 }
    let being_abused: Int = if str_contains(input, "being abused") { 55 } else { 0 }
    let domestic_violence: Int = if str_contains(input, "domestic violence") { 55 } else { 0 }
    let trapped: Int = if str_contains(input, "trapped") && str_contains(input, "can't escape") { 60 } else { 0 }
    let he_will_hurt: Int = if str_contains(input, "he is going to hurt") { 65 } else { 0 }
    let she_will_hurt: Int = if str_contains(input, "she is going to hurt") { 65 } else { 0 }
    let threat_total: Int = threatening + being_abused + domestic_violence + trapped + he_will_hurt + she_will_hurt

    return help_total + threat_total
}
// Scores lower-intensity distress markers found in `history`.
//
// Each marker present in `history` (substring test via str_contains) adds a
// weight between 10 and 20; matches stack. No case folding is done here; the
// markers are written in lower case. The visible caller divides this result
// by 3 before adding it to the input score.
//
// Returns 0 when no marker matches.
fn safety_score_distress_history(history: String) -> Int {
    // Hopelessness and low self-worth.
    let hopeless: Int = if str_contains(history, "hopeless") { 15 } else { 0 }
    let worthless: Int = if str_contains(history, "worthless") { 15 } else { 0 }

    // Isolation.
    let nobody_cares: Int = if str_contains(history, "nobody cares") { 15 } else { 0 }
    let no_one_cares: Int = if str_contains(history, "no one cares") { 15 } else { 0 }
    let completely_alone: Int = if str_contains(history, "completely alone") { 15 } else { 0 }
    let all_alone: Int = if str_contains(history, "all alone") { 10 } else { 0 }

    // Exhaustion and withdrawal.
    let cant_take_it: Int = if str_contains(history, "can't take it anymore") { 20 } else { 0 }
    let want_to_disappear: Int = if str_contains(history, "want to disappear") { 20 } else { 0 }
    let dont_care: Int = if str_contains(history, "don't care anymore") { 15 } else { 0 }
    let giving_up: Int = if str_contains(history, "giving up") { 15 } else { 0 }

    let outlook_total: Int = hopeless + worthless
    let isolation_total: Int = nobody_cares + no_one_cares + completely_alone + all_alone
    let exhaustion_total: Int = cant_take_it + want_to_disappear + dont_care + giving_up
    return outlook_total + isolation_total + exhaustion_total
}
// safety_threat_score
//
// Composite human-safety score for one turn, in the range 0-100.
//
// Parameters:
//   input    the current user message.
//   history  earlier conversation text, scanned for distress markers.
//
// Both texts are lower-cased, and the typographic apostrophe (U+2019) is
// folded to ASCII "'" before scoring. The phrase tables are written with the
// ASCII apostrophe ("can't go on", "don't want to be here"), so without this
// step the same words typed with a curly apostrophe would not match.
//
// The result is the largest of the crisis / self-harm / danger scores for
// `input`, plus the history distress score divided by 3, capped at 100.
// History therefore amplifies the input signal but does not dominate it.
fn safety_threat_score(input: String, history: String) -> Int {
    let input_norm: String = str_replace(str_to_lower(input), "’", "'")
    let history_norm: String = str_replace(str_to_lower(history), "’", "'")

    let crisis: Int = safety_score_crisis(input_norm)
    let harm: Int = safety_score_harm(input_norm)
    let danger: Int = safety_score_danger(input_norm)
    let hist: Int = safety_score_distress_history(history_norm)

    // Dominant signal among the three input dimensions (max of three).
    let crisis_or_harm: Int = if crisis > harm { crisis } else { harm }
    let input_score: Int = if crisis_or_harm > danger { crisis_or_harm } else { danger }

    // History contributes at one-third weight.
    let raw: Int = input_score + hist / 3

    // The sub-scores are additive and unbounded, so clamp the result to 100.
    if raw > 100 {
        return 100
    }
    return raw
}
// safety_screen: L1 input gate
//
// Scores `input` (with `history` as context) and returns a JSON envelope
// string. Per the file's design notes, every user input passes through here
// before reaching L2/L3 and the envelope is consumed by steward_align (L2).
//
// Return shapes:
//   {"action":"hard_bell","reason":"immediate safety concern","content":""}
//   {"action":"soft_bell","concern":"wellbeing check needed","content":"<input>"}
//   {"action":"pass","content":"<input>"}
//
// On hard_bell the user text is deliberately NOT forwarded (content is empty).
// On either bell, the first 80 characters of the input are written to the
// audit log through safety_log_bell.
// NOTE(review): str_slice(input, 0, 80) is assumed to tolerate inputs shorter
// than 80 characters; confirm against the runtime.

// Private helper: makes `raw` safe to embed inside a double-quoted JSON
// string value of the envelope.
fn safety_json_escape(raw: String) -> String {
    // Backslashes first, so the escapes added below are not doubled again.
    // An unescaped trailing backslash would otherwise swallow the closing
    // quote of the "content" value and corrupt the envelope.
    let no_backslash: String = str_replace(raw, "\\", "\\\\")
    // Double quotes become single quotes; this is the envelope's existing
    // convention and is kept so downstream consumers see the same text.
    let no_quote: String = str_replace(no_backslash, "\"", "'")
    // Raw control characters are not allowed inside JSON strings.
    let no_newline: String = str_replace(no_quote, "\n", "\\n")
    let no_return: String = str_replace(no_newline, "\r", "\\r")
    return str_replace(no_return, "\t", "\\t")
}

fn safety_screen(input: String, history: String) -> String {
    let score: Int = safety_threat_score(input, history)

    if score >= hard_bell_threshold() {
        let summary: String = str_slice(input, 0, 80)
        let discard: Void = safety_log_bell("hard", "immediate safety concern", summary)
        return "{\"action\":\"hard_bell\",\"reason\":\"immediate safety concern\",\"content\":\"\"}"
    }

    if score >= soft_bell_threshold() {
        let summary: String = str_slice(input, 0, 80)
        let discard: Void = safety_log_bell("soft", "wellbeing check needed", summary)
        let safe_input: String = safety_json_escape(input)
        return "{\"action\":\"soft_bell\",\"concern\":\"wellbeing check needed\",\"content\":\"" + safe_input + "\"}"
    }

    let safe_input: String = safety_json_escape(input)
    return "{\"action\":\"pass\",\"content\":\"" + safe_input + "\"}"
}
// safety_validate: L1 output gate
//
// Applies the bell level chosen on the way in to the generated `output`
// before it is returned to the user.
//
// Parameters:
//   output  text produced by the imprint.
//   action  bell level: "hard_bell", "soft_bell", or anything else (pass).
//
// Behavior:
//   hard_bell  `output` is discarded and replaced entirely by a fixed crisis
//              message; imprint-generated text is never exposed.
//   soft_bell  `output` is kept. If it is shorter than 20 (per str_len), a
//              care check phrase is appended; if it is empty, the phrase is
//              returned on its own so the reply does not begin with a stray
//              space.
//   otherwise  `output` is returned verbatim. Unrecognized action values
//              therefore behave like "pass".
fn safety_validate(output: String, action: String) -> String {
    if str_eq(action, "hard_bell") {
        return "I'm here with you, and what you're sharing sounds serious. Please reach out to a crisis line now — in the US you can call or text 988 (Suicide and Crisis Lifeline), available 24/7. You don't have to go through this alone."
    }

    if str_eq(action, "soft_bell") {
        let out_len: Int = str_len(output)
        // Empty response: return the care phrase alone, with no leading space.
        if out_len < 1 {
            return "I'm here if you want to talk more about how you're feeling."
        }
        // Very short response: keep it and append the care phrase.
        if out_len < 20 {
            return output + " I'm here if you want to talk more about how you're feeling."
        }
    }

    return output
}
// safety_log_bell
//
// Records a bell as a "BellEvent" node through engram_node_full. Per the
// file's design notes the record is for audit and continuity and is never
// surfaced to the user.
//
// Parameters:
//   level          bell tier label; the visible callers pass "hard" or "soft".
//   reason         short reason text stored in the record.
//   input_summary  excerpt of the triggering input (callers pass at most the
//                  first 80 characters).
//
// The stored content has the form "BELL:<level> | <reason> | summary:<input_summary>"
// and the node is tagged ["safety","bell","bell:<level>"].
fn safety_log_bell(level: String, reason: String, input_summary: String) -> Void {
    let label: String = "bell:" + level
    let content: String = "BELL:" + level + " | " + reason + " | summary:" + input_summary
    let tags: String = "[\"safety\",\"bell\",\"" + label + "\"]"
    // NOTE(review): the three numeric arguments (0.95, 0.95, 1.0) are passed
    // through unchanged; their meaning is defined by engram_node_full, which
    // is not visible in this file. The returned value is not used.
    let discard: String = engram_node_full(
        content,
        "BellEvent",
        label,
        el_from_float(0.95),
        el_from_float(0.95),
        el_from_float(1.0),
        "Episodic",
        tags
    )
    // NOTE(review): returns "" from a function declared -> Void; kept as in
    // the original. Confirm this is the language's convention for Void.
    return ""
}
+8
View File
@@ -0,0 +1,8 @@
// Layer 1 — Safety: extern declarations
// auto-generated by elc --emit-header — do not edit
extern fn soft_bell_threshold() -> Int
extern fn hard_bell_threshold() -> Int
extern fn safety_threat_score(input: String, history: String) -> Int
extern fn safety_screen(input: String, history: String) -> String
extern fn safety_validate(output: String, action: String) -> String
extern fn safety_log_bell(level: String, reason: String, input_summary: String) -> Void