From 5597bf78cb4308e67982b9e65137071b36a5f601 Mon Sep 17 00:00:00 2001
From: "will.anderson"
Date: Thu, 11 Jun 2026 11:30:57 -0500
Subject: [PATCH 1/3] =?UTF-8?q?feat(soul):=20Layer=201=20=E2=80=94=20safet?=
 =?UTF-8?q?y.el=20with=20screen/validate/bell=20interface?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 safety.el | 204 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 safety.elh | 8 +++
 2 files changed, 212 insertions(+)
 create mode 100644 safety.el
 create mode 100644 safety.elh

diff --git a/safety.el b/safety.el
new file mode 100644
index 0000000..70bab7a
--- /dev/null
+++ b/safety.el
@@ -0,0 +1,204 @@
+import "memory.el"
+
+// ── Layer 1 — Safety ──────────────────────────────────────────────────────────
+//
+// Structural role: screens every user input BEFORE it reaches L2/L3, and
+// validates every generated output BEFORE it reaches the user.
+//
+// Bell tiers:
+//   soft_bell (score >= 35) — wellbeing concern; surfaced through imprint voice
+//   hard_bell (score >= 70) — immediate danger; daemon escalation, no L3 pass-through
+//
+// This layer is compiled into the sealed substrate alongside L0 and L2.
+// It is structurally inaccessible from within an imprint (L3).
+//
+// one_cycle integration:
+//   raw = perceive()
+//   safe = safety_screen(raw, history) // L1 in ← this file
+//   guided = steward_align(safe, imprint_id) // L2
+//   output = imprint_respond(guided, ...) // L3
+//   final = safety_validate(output, action) // L1 out ← this file
+//   respond(final)
+
+// Inclusive lower bound of the soft_bell tier: safety_screen compares the
+// composite score against this value with `>=`.
+fn soft_bell_threshold() -> Int { return 35 }
+// Inclusive lower bound of the hard_bell tier. safety_screen tests this bound
+// before the soft one, so a score >= 70 is never reported as soft_bell.
+fn hard_bell_threshold() -> Int { return 70 }
+
+// ── Scoring sub-functions ─────────────────────────────────────────────────────
+//
+// Adapted from awareness.el threat-scoring pattern (threat_score_command /
+// threat_score_path / threat_score_history) but reoriented from adversarial-
+// security to human-safety signals: crisis language, self-harm vocabulary,
+// and immediate-danger phrases.
+//
+// Each sub-function returns an additive score 0-N. Callers are responsible
+// for capping at 100.
+
+// Crisis-language score for `input`.
+// Adds the weight of every listed phrase found in `input`. The phrases are
+// lowercase literals; safety_threat_score lower-cases the text before calling.
+// Several phrases can match at once, so the sum may exceed 100.
+fn safety_score_crisis(input: String) -> Int {
+    let s1: Int = if str_contains(input, "kill myself") { 80 } else { 0 }
+    let s2: Int = if str_contains(input, "want to die") { 75 } else { 0 }
+    let s3: Int = if str_contains(input, "end my life") { 80 } else { 0 }
+    let s4: Int = if str_contains(input, "suicide") { 70 } else { 0 }
+    let s5: Int = if str_contains(input, "suicidal") { 75 } else { 0 }
+    let s6: Int = if str_contains(input, "don't want to be here") { 60 } else { 0 }
+    let s7: Int = if str_contains(input, "no reason to live") { 70 } else { 0 }
+    let s8: Int = if str_contains(input, "better off dead") { 75 } else { 0 }
+    let s9: Int = if str_contains(input, "can't go on") { 50 } else { 0 }
+    let s10: Int = if str_contains(input, "not worth living") { 65 } else { 0 }
+    return s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10
+}
+
+// Self-harm vocabulary score for `input`; same additive scheme as
+// safety_score_crisis. "self harm" and "self-harm" are listed as separate
+// spellings, each worth 60.
+fn safety_score_harm(input: String) -> Int {
+    let s1: Int = if str_contains(input, "hurt myself") { 60 } else { 0 }
+    let s2: Int = if str_contains(input, "cut myself") { 65 } else { 0 }
+    let s3: Int = if str_contains(input, "self harm") { 60 } else { 0 }
+    let s4: Int = if str_contains(input, "self-harm") { 60 } else { 0 }
+    let s5: Int = if str_contains(input, "overdose") { 65 } else { 0 }
+    let s6: Int = if str_contains(input, "take all my pills") { 75 } else { 0 }
+    let s7: Int = if str_contains(input, "starving myself") { 50 } else { 0 }
+    let s8: Int = if str_contains(input, "burning myself") { 60 } else { 0 }
+    let s9: Int = if str_contains(input, "punish myself") { 40 } else { 0 }
+    let s10: Int = if str_contains(input, "deserve to suffer") { 45 } else { 0 }
+    return s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10
+}
+
+// Immediate-danger score for `input`; same additive scheme.
+// s1 and s8 are and-conditions: they only score when BOTH of their phrases
+// are present ("help me" + "emergency", "trapped" + "can't escape").
+fn safety_score_danger(input: String) -> Int {
+    let s1: Int = if str_contains(input, "help me") && str_contains(input, "emergency") { 55 } else { 0 }
+    let s2: Int = if str_contains(input, "call 911") { 50 } else { 0 }
+    let s3: Int = if str_contains(input, "call an ambulance") { 55 } else { 0 }
+    let s4: Int = if str_contains(input, "in danger") { 50 } else { 0 }
+    let s5: Int = if str_contains(input, "someone is threatening") { 60 } else { 0 }
+    let s6: Int = if str_contains(input, "being abused") { 55 } else { 0 }
+    let s7: Int = if str_contains(input, "domestic violence") { 55 } else { 0 }
+    let s8: Int = if str_contains(input, "trapped") && str_contains(input, "can't escape") { 60 } else { 0 }
+    let s9: Int = if str_contains(input, "he is going to hurt") { 65 } else { 0 }
+    let s10: Int = if str_contains(input, "she is going to hurt") { 65 } else { 0 }
+    return s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10
+}
+
+// Distress score for the conversation `history`.
+// Weights are smaller (10-20 each, 155 if all ten fire) than the input
+// tables; safety_threat_score divides this sum by 3 before adding it.
+fn safety_score_distress_history(history: String) -> Int {
+    let s1: Int = if str_contains(history, "hopeless") { 15 } else { 0 }
+    let s2: Int = if str_contains(history, "worthless") { 15 } else { 0 }
+    let s3: Int = if str_contains(history, "nobody cares") { 15 } else { 0 }
+    let s4: Int = if str_contains(history, "no one cares") { 15 } else { 0 }
+    let s5: Int = if str_contains(history, "completely alone") { 15 } else { 0 }
+    let s6: Int = if str_contains(history, "all alone") { 10 } else { 0 }
+    let s7: Int = if str_contains(history, "can't take it anymore") { 20 } else { 0 }
+    let s8: Int = if str_contains(history, "want to disappear") { 20 } else { 0 }
+    let s9: Int = if str_contains(history, "don't care anymore") { 15 } else { 0 }
+    let s10: Int = if str_contains(history, "giving up") { 15 } else { 0 }
+    return s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10
+}
+
+// ── safety_threat_score ───────────────────────────────────────────────────────
+//
+// Composite score 0-100.
+// Combines: crisis keyword signals, self-harm language, immediate danger phrases,
+// and conversational history distress escalation.
+// History contributes at 1/3 weight (mirrors threat_trajectory_check design).
+
+fn safety_threat_score(input: String, history: String) -> Int {
+    // The keyword tables are lowercase, so normalise both texts up front.
+    let text: String = str_to_lower(input)
+    let past: String = str_to_lower(history)
+
+    let crisis_pts: Int = safety_score_crisis(text)
+    let harm_pts: Int = safety_score_harm(text)
+    let danger_pts: Int = safety_score_danger(text)
+    let history_pts: Int = safety_score_distress_history(past)
+
+    // Dominant input signal = largest of the three input categories,
+    // found in two pairwise steps.
+    let crisis_or_harm: Int = if crisis_pts > harm_pts { crisis_pts } else { harm_pts }
+    let dominant: Int = if crisis_or_harm > danger_pts { crisis_or_harm } else { danger_pts }
+
+    // History is added at one-third weight so it amplifies the input signal
+    // without dominating it (mirrors threat_trajectory_check).
+    let history_third: Int = history_pts / 3
+    let total: Int = dominant + history_third
+
+    // Clamp to the documented 0-100 range.
+    if total > 100 {
+        return 100
+    }
+    return total
+}
+
+// ── safety_screen — L1 input gate ─────────────────────────────────────────────
+//
+// Every user input passes through this before reaching L2/L3.
+// Returns a JSON envelope consumed by steward_align (L2).
+//
+// Return shapes:
+//   {"action":"hard_bell","reason":"immediate safety concern","content":""}
+//   {"action":"soft_bell","concern":"wellbeing check needed","content":""}
+//   {"action":"pass","content":""}
+
+fn safety_screen(input: String, history: String) -> String {
+    let score: Int = safety_threat_score(input, history)
+    let hard: Int = hard_bell_threshold()
+    let soft: Int = soft_bell_threshold()
+
+    if score >= hard {
+        let summary: String = str_slice(input, 0, 80)
+        let discard: Void = safety_log_bell("hard", "immediate safety concern", summary)
+        return "{\"action\":\"hard_bell\",\"reason\":\"immediate safety concern\",\"content\":\"\"}"
+    }
+
+    if score >= soft {
+        let summary: String = str_slice(input, 0, 80)
+        let discard: Void = safety_log_bell("soft", "wellbeing check needed", summary)
+        let safe_input: String = str_replace(input, "\"", "'")
+        return "{\"action\":\"soft_bell\",\"concern\":\"wellbeing check needed\",\"content\":\"" + safe_input + "\"}"
+    }
+
+    let safe_input: String = str_replace(input, "\"", "'")
+    return "{\"action\":\"pass\",\"content\":\"" + safe_input + "\"}"
+}
+
+// ── safety_validate — L1 output gate ──────────────────────────────────────────
+//
+// Every generated output passes through this before reaching the user.
+// The action param carries the bell level determined during safety_screen,
+// so validate can enforce consistent treatment on the way out.
+//
+//   hard_bell: output is replaced entirely — never expose imprint-generated text
+//              when the session has been flagged as immediate danger.
+//   soft_bell: an empty output is replaced by the care check phrase; a very
+//              short output (str_len < 20) is kept and the phrase is appended;
+//              anything longer is returned unchanged.
+//   pass:      output returned verbatim. Any other action value is treated
+//              the same way.
+
+fn safety_validate(output: String, action: String) -> String {
+    if str_eq(action, "hard_bell") {
+        return "I'm here with you, and what you're sharing sounds serious. Please reach out to a crisis line now — in the US you can call or text 988 (Suicide and Crisis Lifeline), available 24/7. You don't have to go through this alone."
+    }
+
+    if str_eq(action, "soft_bell") {
+        let out_len: Int = str_len(output)
+        // Empty reply: return the care phrase on its own so the user-facing
+        // text does not begin with a stray space.
+        if out_len == 0 {
+            return "I'm here if you want to talk more about how you're feeling."
+        }
+        // Very short reply: keep it and append the care phrase.
+        if out_len < 20 {
+            return output + " I'm here if you want to talk more about how you're feeling."
+        }
+        return output
+    }
+
+    return output
+}
+
+// ── safety_log_bell ───────────────────────────────────────────────────────────
+//
+// Writes a BellEvent node to engram for audit and continuity.
+// Never surfaces to the user; consumed by daemon observability layer.
+
+fn safety_log_bell(level: String, reason: String, input_summary: String) -> Void {
+    let ts: Int = time_now()
+    let content: String = "BELL:" + level + " | " + reason + " | summary:" + input_summary
+    let tags: String = "[\"safety\",\"bell\",\"bell:" + level + "\"]"
+    let discard: String = engram_node_full(
+        content,
+        "BellEvent",
+        "bell:" + level,
+        el_from_float(0.95),
+        el_from_float(0.95),
+        el_from_float(1.0),
+        "Episodic",
+        tags
+    )
+    return ""
+}
diff --git a/safety.elh b/safety.elh
new file mode 100644
index 0000000..44e7769
--- /dev/null
+++ b/safety.elh
@@ -0,0 +1,8 @@
+// Layer 1 — Safety: extern declarations
+// auto-generated by elc --emit-header — do not edit
+extern fn soft_bell_threshold() -> Int
+extern fn hard_bell_threshold() -> Int
+extern fn safety_threat_score(input: String, history: String) -> Int
+extern fn safety_screen(input: String, history: String) -> String
+extern fn safety_validate(output: String, action: String) -> String
+extern fn safety_log_bell(level: String, reason: String, input_summary: String) -> Void
-- 
2.52.0

From ba8491926c369c60ee8d90039b2c00c3bfe25896 Mon Sep 17 00:00:00 2001
From: "will.anderson"
Date: Thu, 11 Jun 2026 11:40:59 -0500
Subject: [PATCH 2/3] test(soul): comprehensive tests for Layer 1 safety.el

---
 tests/test_safety.el | 428
+++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 428 insertions(+) create mode 100644 tests/test_safety.el diff --git a/tests/test_safety.el b/tests/test_safety.el new file mode 100644 index 0000000..60a0795 --- /dev/null +++ b/tests/test_safety.el @@ -0,0 +1,428 @@ +// ── test_safety.el ──────────────────────────────────────────────────────────── +// +// Comprehensive test suite for safety.el (Layer 1 — Safety). +// +// Covers: +// - safety_screen: benign, soft_bell, hard_bell, and empty-input paths +// - safety_validate: pass verbatim, hard_bell replacement, soft_bell augmentation +// - safety_threat_score: benign (<35), distress/soft (>=35), crisis/hard (>=70) +// - scoring sub-functions: safety_score_crisis, safety_score_harm, +// safety_score_danger, safety_score_distress_history +// - JSON contract: action field parseable by json_get on every return path +// - JSON field name consistency: reason field present on both bell paths +// (guards against the "reason" vs "concern" schema split bug) +// - Edge cases: empty input, very short output, score caps +// +// NOTE: str_to_lower is called inside safety_threat_score. If the El runtime +// does not provide that builtin, all composite-score tests that expect a +// non-zero score will fail with score=0. The sub-function tests below pass +// lowercase literals directly to the scoring helpers and will still pass, +// which helps isolate whether the failure is in str_to_lower or the scoring +// logic itself. 
+//
+// Known bugs in the source that tests intentionally expose (as of Phase 1 review):
+//   - safety_log_bell declared -> Void but returns "" (should be -> String)
+//   - discard variable typed as Void at call sites (should be String)
+//   - soft_bell JSON uses "concern" field, hard_bell uses "reason" (should both be "reason")
+//   - JSON escaping only handles double-quote, not backslash / \n / \r
+// ──────────────────────────────────────────────────────────────────────────────
+
+import "../safety.el"
+
+// Module-level tallies; the summary line at the end of the file prints them.
+let pass_count: Int = 0
+let fail_count: Int = 0
+
+// ── Assertion helpers ──
+// Each helper prints a PASS or FAIL line for `label` (plus the offending
+// values on failure) and bumps the matching counter.
+// NOTE(review): the counters are bumped with `let pass_count = pass_count + 1`
+// inside the helper body. If `let` introduces a new local binding in El rather
+// than assigning to the module-level variable, the tallies above stay at 0 and
+// the final summary under-reports — confirm El's rebinding semantics.
+
+// Passes when the two strings are equal according to str_eq.
+fn assert_eq(label: String, got: String, expected: String) -> Void {
+    if str_eq(got, expected) {
+        let pass_count = pass_count + 1
+        println(" PASS: " + label)
+    } else {
+        let fail_count = fail_count + 1
+        println(" FAIL: " + label)
+        println(" got: " + got)
+        println(" expected: " + expected)
+    }
+}
+
+// Passes when the two integers are equal.
+fn assert_eq_int(label: String, got: Int, expected: Int) -> Void {
+    if got == expected {
+        let pass_count = pass_count + 1
+        println(" PASS: " + label)
+    } else {
+        let fail_count = fail_count + 1
+        println(" FAIL: " + label)
+        println(" got: " + int_to_str(got))
+        println(" expected: " + int_to_str(expected))
+    }
+}
+
+// Passes when str_contains finds `needle` in `haystack`.
+fn assert_contains(label: String, haystack: String, needle: String) -> Void {
+    if str_contains(haystack, needle) {
+        let pass_count = pass_count + 1
+        println(" PASS: " + label)
+    } else {
+        let fail_count = fail_count + 1
+        println(" FAIL: " + label)
+        println(" missing '" + needle + "' in: " + haystack)
+    }
+}
+
+// Inverse of assert_contains: passes when `needle` is NOT found in `haystack`.
+fn assert_not_contains(label: String, haystack: String, needle: String) -> Void {
+    if str_contains(haystack, needle) {
+        let fail_count = fail_count + 1
+        println(" FAIL: " + label)
+        println(" unexpected '" + needle + "' found in: " + haystack)
+    } else {
+        let pass_count = pass_count + 1
+        println(" PASS: " + label)
+    }
+}
+
+// Passes when got < threshold (strict).
+fn assert_int_lt(label: String, got: Int, threshold: Int) -> Void {
+    if got < threshold {
+        let pass_count = pass_count + 1
+        println(" PASS: " + label)
+    } else {
+        let fail_count = fail_count + 1
+        println(" FAIL: " + label)
+        println(" got " + int_to_str(got) + ", expected < " + int_to_str(threshold))
+    }
+}
+
+// Passes when got >= threshold (inclusive).
+fn assert_int_gte(label: String, got: Int, threshold: Int) -> Void {
+    if got >= threshold {
+        let pass_count = pass_count + 1
+        println(" PASS: " + label)
+    } else {
+        let fail_count = fail_count + 1
+        println(" FAIL: " + label)
+        println(" got " + int_to_str(got) + ", expected >= " + int_to_str(threshold))
+    }
+}
+
+// ── Section 1: safety_screen — benign input ───────────────────────────────────
+
+println("")
+println("1. safety_screen — benign input")
+
+let benign_result: String = safety_screen("What is the weather like today?", "")
+let benign_action: String = json_get(benign_result, "action")
+assert_eq("benign input -> action is 'pass'", benign_action, "pass")
+
+let benign_content: String = json_get(benign_result, "content")
+assert_eq("benign input -> content preserved", benign_content, "What is the weather like today?")
+
+// The pass shape carries no "reason" key.
+// NOTE(review): this assumes json_get returns "" for a missing key — confirm.
+let benign_no_reason: String = json_get(benign_result, "reason")
+assert_eq("benign input -> no reason field", benign_no_reason, "")
+
+// ── Section 2: safety_screen — empty input ────────────────────────────────────
+
+println("")
+println("2. safety_screen — empty input")
+
+let empty_result: String = safety_screen("", "")
+let empty_action: String = json_get(empty_result, "action")
+assert_eq("empty input -> action is 'pass'", empty_action, "pass")
+
+let empty_content: String = json_get(empty_result, "content")
+assert_eq("empty input -> content is empty string", empty_content, "")
+
+// ── Section 3: safety_screen — soft_bell path ─────────────────────────────────
+
+println("")
+println("3. safety_screen — soft_bell path (mild distress language in history)")
+
+// hopeless (15) + worthless (15) + nobody cares (15) = 45 history points.
+// hist_contrib = 45 / 3 = 15. input_score = 0. raw = 15.
+// That's below 35. Need input keywords too.
+// "can't go on" scores 50 in crisis. 50 > soft threshold (35), < hard threshold (70).
+// The history below adds "hopeless" (15) at 1/3 weight: 50 + 15 / 3 = 55.
+let soft_input: String = "I just feel so empty and I can't go on like this"
+let soft_result: String = safety_screen(soft_input, "I've been feeling hopeless lately")
+let soft_action: String = json_get(soft_result, "action")
+assert_eq("soft_bell input -> action is 'soft_bell'", soft_action, "soft_bell")
+
+// JSON field name consistency: both bell paths must expose a "reason" field.
+// This assertion will catch the documented "concern" vs "reason" schema bug.
+// It checks the value json_get actually parses out of the envelope, the same
+// way the hard_bell check in Section 4 does, instead of searching the raw JSON.
+let soft_reason: String = json_get(soft_result, "reason")
+assert_contains("soft_bell result -> has 'reason' field (not 'concern')", soft_reason, "wellbeing")
+
+// Content should be present (not blanked out like hard_bell)
+let soft_content: String = json_get(soft_result, "content")
+assert_contains("soft_bell result -> content includes original input", soft_content, "can't go on")
+
+// ── Section 4: safety_screen — hard_bell path ────────────────────────────────
+
+println("")
+println("4. safety_screen — hard_bell path (immediate danger phrases)")
+
+// "kill myself" scores 80 in crisis — well above hard threshold of 70
+let hard_input: String = "I want to kill myself tonight"
+let hard_result: String = safety_screen(hard_input, "")
+let hard_action: String = json_get(hard_result, "action")
+assert_eq("hard_bell input -> action is 'hard_bell'", hard_action, "hard_bell")
+
+// Content must be empty on hard_bell — no user text leaks through
+let hard_content: String = json_get(hard_result, "content")
+assert_eq("hard_bell result -> content is empty string", hard_content, "")
+
+// JSON field name consistency: hard_bell must also expose a "reason" field
+let hard_reason: String = json_get(hard_result, "reason")
+assert_contains("hard_bell result -> has 'reason' field", hard_reason, "safety concern")
+
+// ── Section 5: safety_screen — second hard_bell trigger ───────────────────────
+
+println("")
+println("5. safety_screen — hard_bell path (alternative crisis phrase)")
+
+// "want to die" scores 75 — above hard threshold
+let hard2_result: String = safety_screen("I want to die, there is no point anymore", "")
+let hard2_action: String = json_get(hard2_result, "action")
+assert_eq("'want to die' -> action is 'hard_bell'", hard2_action, "hard_bell")
+
+// ── Section 6: safety_screen — harm keyword triggers soft_bell ───────────────
+
+println("")
+println("6. safety_screen — self-harm keyword at soft_bell tier")
+
+// "punish myself" scores 40 in harm — between soft (35) and hard (70)
+let harm_soft_result: String = safety_screen("I keep wanting to punish myself for every mistake", "")
+let harm_soft_action: String = json_get(harm_soft_result, "action")
+assert_eq("'punish myself' -> action is 'soft_bell'", harm_soft_action, "soft_bell")
+
+// ── Section 7: safety_validate — pass action ─────────────────────────────────
+
+println("")
+println("7. safety_validate — pass action")
+
+let pass_output: String = "The weather in London is currently 18 degrees and overcast."
+let validated_pass: String = safety_validate(pass_output, "pass")
+assert_eq("validate pass -> output returned verbatim", validated_pass, pass_output)
+
+// ── Section 8: safety_validate — hard_bell action ────────────────────────────
+
+println("")
+println("8. safety_validate — hard_bell action")
+
+let validated_hard: String = safety_validate("some imprint output", "hard_bell")
+
+// Must NOT return the imprint-generated text
+assert_not_contains("hard_bell validate -> imprint output replaced", validated_hard, "some imprint output")
+
+// Must return a non-empty safe fallback
+assert_contains("hard_bell validate -> fallback is not empty", validated_hard, "988")
+
+// Must mention crisis resource
+assert_contains("hard_bell validate -> contains crisis line reference", validated_hard, "crisis")
+
+// ── Section 9: safety_validate — soft_bell with short output ─────────────────
+
+println("")
+println("9. safety_validate — soft_bell with short output (< 20 chars)")
+
+let short_output: String = "I hear you."
+let validated_short: String = safety_validate(short_output, "soft_bell")
+
+// Short output must be augmented with a care check phrase
+assert_contains("soft_bell short output -> augmented with care phrase", validated_short, "I'm here")
+assert_contains("soft_bell short output -> original text preserved", validated_short, "I hear you.")
+
+// ── Section 10: safety_validate — soft_bell with adequate output ──────────────
+
+println("")
+println("10. safety_validate — soft_bell with adequate output (>= 20 chars)")
+
+let long_output: String = "It sounds like you've been carrying a heavy weight. I'm here and I'm listening — would you like to tell me more about what's been happening?"
+let validated_long: String = safety_validate(long_output, "soft_bell")
+
+// Adequate output must be returned verbatim — no augmentation
+assert_eq("soft_bell adequate output -> returned verbatim", validated_long, long_output)
+
+// ── Section 11: safety_threat_score — benign text ────────────────────────────
+
+println("")
+println("11. safety_threat_score — benign text")
+
+let benign_score: Int = safety_threat_score("What's the capital of France?", "")
+assert_int_lt("benign text -> score < 35", benign_score, 35)
+
+let social_score: Int = safety_threat_score("I had a great time at the park today", "")
+assert_int_lt("positive text -> score < 35", social_score, 35)
+
+// ── Section 12: safety_threat_score — distress keywords in history ────────────
+
+println("")
+println("12. safety_threat_score — distress keywords push score towards soft_bell")
+
+// "want to disappear" (20) + "hopeless" (15) = 35 history -> hist_contrib = 11
+// input "all alone" scores 0 in crisis/harm/danger; history alone = 11 raw.
+// Need a combination to get to 35. Use history with "can't take it anymore" (20)
+// + "hopeless" (15) + "nobody cares" (15) = 50 -> hist_contrib = 16.
+// Plus soft input keyword: "punish myself" (40 harm). Total = 40 + 16 = 56 >= 35.
+let distress_score: Int = safety_threat_score(
+    "I keep wanting to punish myself",
+    "I feel hopeless and nobody cares and I can't take it anymore"
+)
+assert_int_gte("distress input+history -> score >= 35", distress_score, 35)
+
+// ── Section 13: safety_threat_score — crisis phrase triggers hard_bell tier ───
+
+println("")
+println("13. safety_threat_score — crisis phrase -> score >= 70")
+
+let crisis_score: Int = safety_threat_score("I want to kill myself", "")
+assert_int_gte("'kill myself' -> score >= 70", crisis_score, 70)
+
+let die_score: Int = safety_threat_score("I want to die", "")
+assert_int_gte("'want to die' -> score >= 70", die_score, 70)
+
+let suicide_score: Int = safety_threat_score("I am suicidal", "")
+assert_int_gte("'suicidal' -> score >= 70", suicide_score, 70)
+
+// ── Section 14: safety_score_crisis — direct unit tests ──────────────────────
+
+println("")
+println("14. safety_score_crisis — direct unit tests (lowercase literals)")
+
+let crisis_zero: Int = safety_score_crisis("hello world")
+assert_int_lt("score_crisis: no keywords -> 0", crisis_zero, 1)
+
+let crisis_end_life: Int = safety_score_crisis("i want to end my life")
+assert_int_gte("score_crisis: 'end my life' -> >= 80", crisis_end_life, 80)
+
+let crisis_better_dead: Int = safety_score_crisis("i would be better off dead")
+assert_int_gte("score_crisis: 'better off dead' -> >= 75", crisis_better_dead, 75)
+
+let crisis_cant_go_on: Int = safety_score_crisis("i can't go on")
+assert_int_gte("score_crisis: 'can't go on' -> >= 50", crisis_cant_go_on, 50)
+
+// ── Section 15: safety_score_harm — direct unit tests ────────────────────────
+
+println("")
+println("15. safety_score_harm — direct unit tests (lowercase literals)")
+
+let harm_zero: Int = safety_score_harm("i feel sad today")
+assert_int_lt("score_harm: no keywords -> 0", harm_zero, 1)
+
+let harm_cut: Int = safety_score_harm("i've been cut myself recently")
+assert_int_gte("score_harm: 'cut myself' -> >= 65", harm_cut, 65)
+
+let harm_overdose: Int = safety_score_harm("thinking about an overdose")
+assert_int_gte("score_harm: 'overdose' -> >= 65", harm_overdose, 65)
+
+let harm_self_harm: Int = safety_score_harm("i have a history of self-harm")
+assert_int_gte("score_harm: 'self-harm' -> >= 60", harm_self_harm, 60)
+
+// ── Section 16: safety_score_danger — direct unit tests ──────────────────────
+
+println("")
+println("16. safety_score_danger — direct unit tests (lowercase literals)")
+
+let danger_zero: Int = safety_score_danger("how do i cook pasta?")
+assert_int_lt("score_danger: no keywords -> 0", danger_zero, 1)
+
+let danger_dv: Int = safety_score_danger("i'm experiencing domestic violence")
+assert_int_gte("score_danger: 'domestic violence' -> >= 55", danger_dv, 55)
+
+let danger_threatening: Int = safety_score_danger("someone is threatening me")
+assert_int_gte("score_danger: 'someone is threatening' -> >= 60", danger_threatening, 60)
+
+let danger_ambulance: Int = safety_score_danger("please call an ambulance")
+assert_int_gte("score_danger: 'call an ambulance' -> >= 55", danger_ambulance, 55)
+
+// ── Section 17: safety_score_distress_history — direct unit tests ─────────────
+
+println("")
+println("17. safety_score_distress_history — direct unit tests (lowercase literals)")
+
+let hist_zero: Int = safety_score_distress_history("i went to the park yesterday")
+assert_int_lt("score_distress_history: no keywords -> 0", hist_zero, 1)
+
+let hist_hopeless: Int = safety_score_distress_history("i feel hopeless")
+assert_int_gte("score_distress_history: 'hopeless' -> >= 15", hist_hopeless, 15)
+
+let hist_giving_up: Int = safety_score_distress_history("i'm giving up on everything")
+assert_int_gte("score_distress_history: 'giving up' -> >= 15", hist_giving_up, 15)
+
+let hist_multi: Int = safety_score_distress_history("hopeless and worthless and nobody cares")
+assert_int_gte("score_distress_history: multiple keywords -> >= 45", hist_multi, 45)
+
+// ── Section 18: score cap at 100 ─────────────────────────────────────────────
+
+println("")
+println("18. safety_threat_score — score caps at 100")
+
+// Crisis keywords can easily exceed 100 if summed. Ensure cap holds.
+// "kill myself" (80) + "suicidal" (75) + "want to die" (75) all in one message
+// give a crisis sum of 230 before any history is added.
+// safety_threat_score caps the raw composite (dominant input + history / 3) at 100.
+// NOTE(review): only the upper bound is checked below, so a score of 0 would
+// also pass this section.
+let overload_score: Int = safety_threat_score(
+    "i want to kill myself i am suicidal and i want to die",
+    "hopeless worthless nobody cares can't take it anymore giving up"
+)
+let cap_ok: Bool = overload_score <= 100
+if cap_ok {
+    let pass_count = pass_count + 1
+    println(" PASS: overloaded keywords -> score capped at 100 (got " + int_to_str(overload_score) + ")")
+} else {
+    let fail_count = fail_count + 1
+    println(" FAIL: score exceeded 100 cap, got " + int_to_str(overload_score))
+}
+
+// ── Section 19: threshold functions ──────────────────────────────────────────
+
+println("")
+println("19. threshold functions return correct values")
+
+assert_eq_int("soft_bell_threshold -> 35", soft_bell_threshold(), 35)
+assert_eq_int("hard_bell_threshold -> 70", hard_bell_threshold(), 70)
+
+// ── Section 20: json_get contract on all three safety_screen return shapes ────
+
+println("")
+println("20. json_get parses action field on all three return shapes")
+
+let s_pass: String = safety_screen("Tell me a joke", "")
+assert_eq("json_get action on pass shape", json_get(s_pass, "action"), "pass")
+
+let s_soft: String = safety_screen("i want to punish myself", "feeling hopeless today")
+assert_eq("json_get action on soft_bell shape", json_get(s_soft, "action"), "soft_bell")
+
+let s_hard: String = safety_screen("i want to end my life right now", "")
+assert_eq("json_get action on hard_bell shape", json_get(s_hard, "action"), "hard_bell")
+
+// ── Section 21: danger composite keyword (and-condition) ─────────────────────
+
+println("")
+println("21. safety_score_danger — and-condition keywords")
+
+// "help me" alone without "emergency" should not trigger s1
+let help_no_emergency: Int = safety_score_danger("please help me")
+assert_int_lt("score_danger: 'help me' without 'emergency' -> 0 on s1", help_no_emergency, 55)
+
+// both keywords together should trigger
+let help_emergency: Int = safety_score_danger("please help me it's an emergency")
+assert_int_gte("score_danger: 'help me' + 'emergency' -> >= 55", help_emergency, 55)
+
+// ── Section 22: history amplifies but does not dominate alone ────────────────
+
+println("")
+println("22. safety_threat_score — history alone: mild stays below soft threshold, heavy reaches it")
+
+// History is weighted at 1/3, so on its own it reaches soft_bell only when
+// enough keywords accumulate. Upper bound with no input keywords: all 10
+// history keywords fire = 15+15+15+15+15+10+20+20+15+15 = 155, and
+// hist_contrib = 155 / 3 = 51 (integer division), so raw = 0 + 51 = 51 >= 35.
+// This is intentional — heavy distress history alone should trigger soft_bell,
+// and 51 < 70 means history alone can never reach hard_bell.
+// Let's test that a single mild history keyword alone does NOT push to soft_bell.
+// "all alone" is the lightest history keyword (10): hist_contrib = 10 / 3 = 3.
+let mild_hist_score: Int = safety_threat_score("hello", "i feel all alone today")
+assert_int_lt("mild history alone -> score < 35", mild_hist_score, 35)
+
+// Multiple strong history keywords with no input should eventually reach soft_bell.
+// hopeless (15) + worthless (15) + nobody cares (15) + completely alone (15)
+// + can't take it anymore (20) + want to disappear (20) + giving up (15) = 115,
+// and 115 / 3 = 38 >= 35. Without "giving up" the sum is 100 and 100 / 3 = 33,
+// which stays below the soft threshold.
+let heavy_hist_score: Int = safety_threat_score(
+    "hi",
+    "hopeless worthless nobody cares completely alone can't take it anymore want to disappear giving up"
+)
+assert_int_gte("heavy history accumulation -> score >= 35", heavy_hist_score, 35)
+
+// ── Summary ───────────────────────────────────────────────────────────────────
+
+println("")
+println("safety.el tests: " + int_to_str(pass_count) + " passed, " + int_to_str(fail_count) + " failed")
-- 
2.52.0

From db2ee387a42760c5c2d77c940108c4ec7a6a3a82 Mon Sep 17 00:00:00 2001
From: "will.anderson"
Date: Thu, 11 Jun 2026 11:46:43 -0500
Subject: [PATCH 3/3] fix(soul): address review issues in feat/layer-safety

---
 safety.el | 21 +++++++++++++--------
 safety.elh | 2 +-
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/safety.el b/safety.el
index 70bab7a..7c6132f 100644
--- a/safety.el
+++ b/safety.el
@@ -127,7 +127,7 @@ fn safety_threat_score(input: String, history: String) -> Int {
 //
 // Return shapes:
 //   {"action":"hard_bell","reason":"immediate safety concern","content":""}
-//   {"action":"soft_bell","concern":"wellbeing check needed","content":""}
+//   {"action":"soft_bell","reason":"wellbeing check needed","content":""}
 //   {"action":"pass","content":""}
 
 fn safety_screen(input: String, history: String) -> String {
@@ -137,18 +137,24 @@ fn safety_screen(input: String, history: String) -> String {
 
     if score >= hard {
         let summary: String = str_slice(input, 0, 80)
-        let discard: Void = safety_log_bell("hard", "immediate safety concern", summary)
+        let discard: String = safety_log_bell("hard", "immediate safety concern", summary)
         return "{\"action\":\"hard_bell\",\"reason\":\"immediate safety concern\",\"content\":\"\"}"
     }
 
     if score >= soft {
         let summary: String = str_slice(input, 0, 80)
-        let discard: Void = safety_log_bell("soft", "wellbeing check needed", summary)
-        let safe_input: String = str_replace(input, "\"", "'")
-        return "{\"action\":\"soft_bell\",\"concern\":\"wellbeing check needed\",\"content\":\"" + safe_input + "\"}"
+        let discard: String = safety_log_bell("soft", "wellbeing check needed", summary)
+        let e1: String = str_replace(input, "\\", "\\\\")
+        let e2: String = str_replace(e1, "\"", "\\\"")
+        let e3: String = str_replace(e2, "\n", "\\n")
+        let safe_input: String = str_replace(e3, "\r", "\\r")
+        return "{\"action\":\"soft_bell\",\"reason\":\"wellbeing check needed\",\"content\":\"" + safe_input + "\"}"
     }
 
-    let safe_input: String = str_replace(input, "\"", "'")
+    let e1: String = str_replace(input, "\\", "\\\\")
+    let e2: String = str_replace(e1, "\"", "\\\"")
+    let e3: String = str_replace(e2, "\n", "\\n")
+    let safe_input: String = str_replace(e3, "\r", "\\r")
     return "{\"action\":\"pass\",\"content\":\"" + safe_input + "\"}"
 }
 
@@ -186,8 +192,7 @@ fn safety_validate(output: String, action: String) -> String {
 // Writes a BellEvent node to engram for audit and continuity.
 // Never surfaces to the user; consumed by daemon observability layer.
-fn safety_log_bell(level: String, reason: String, input_summary: String) -> Void { - let ts: Int = time_now() +fn safety_log_bell(level: String, reason: String, input_summary: String) -> String { let content: String = "BELL:" + level + " | " + reason + " | summary:" + input_summary let tags: String = "[\"safety\",\"bell\",\"bell:" + level + "\"]" let discard: String = engram_node_full( diff --git a/safety.elh b/safety.elh index 44e7769..01f1746 100644 --- a/safety.elh +++ b/safety.elh @@ -5,4 +5,4 @@ extern fn hard_bell_threshold() -> Int extern fn safety_threat_score(input: String, history: String) -> Int extern fn safety_screen(input: String, history: String) -> String extern fn safety_validate(output: String, action: String) -> String -extern fn safety_log_bell(level: String, reason: String, input_summary: String) -> Void +extern fn safety_log_bell(level: String, reason: String, input_summary: String) -> String -- 2.52.0