diff --git a/stewardship.el b/stewardship.el
new file mode 100644
index 0000000..9d80e3e
--- /dev/null
+++ b/stewardship.el
@@ -0,0 +1,153 @@
+// stewardship.el — Layer 2: Stewardship
+// Mission alignment and CGI governance. Sits between L1 (Safety) and L3 (Imprint).
+// Every request passes through steward_align() before reaching the imprint.
+// Every self-modification action passes through steward_cgi_check().
+// All stewardship events are logged to engram as StewardshipEvent nodes.
+
+import "memory.el"
+
+// steward_log_event — write a StewardshipEvent node to engram.
+// Called by steward_align, steward_validate_imprint and steward_cgi_check.
+//   kind   — event category; appears in the content prefix, in the
+//            "steward:KIND" argument and in the "steward:KIND" tag.
+//   detail — free-form text appended to the node content.
+// The value returned by engram_node_full is deliberately discarded.
+fn steward_log_event(kind: String, detail: String) -> Void {
+    let content: String = "STEWARD:" + kind + " | " + detail
+    // tags is a JSON array assembled by concatenation, so kind is escaped
+    // before being spliced between the quotes.
+    let safe_kind: String = json_safe(kind)
+    let tags: String = "[\"stewardship\",\"steward:" + safe_kind + "\"]"
+    let discard: String = engram_node_full(
+        content,
+        "StewardshipEvent",
+        "steward:" + kind,
+        el_from_float(0.85),
+        el_from_float(0.85),
+        el_from_float(0.9),
+        "Episodic",
+        tags
+    )
+}
+
+// steward_get_mission — retrieve the canonical mission statement.
+// Queries engram_search_json for "steward:mission" and returns the content of
+// the first result when that content is non-empty, whatever the node type.
+// Falls back to the hardcoded mission when the search result is empty or the
+// first result has no content.
+// NOTE(review): only the first result is inspected and its node_type is not
+// checked — confirm that is intended for a value that drives alignment.
+fn steward_get_mission() -> String {
+    let results: String = engram_search_json("steward:mission", 3)
+    let found: Bool = !str_eq(results, "") && !str_eq(results, "[]")
+    if found {
+        let node: String = json_array_get(results, 0)
+        let content: String = json_get(node, "content")
+        if !str_eq(content, "") {
+            return content
+        }
+    }
+    return "Neuron exists to extend human capability with integrity — never to deceive, manipulate, or accumulate power over the people it serves."
+}
+
+// steward_align — check input for mission-conflict signals before it reaches the imprint.
+// Returns {"action":"pass","content":"INPUT"} when clean.
+// Returns {"action":"redirect","reason":"mission conflict: SIGNAL","redirect_to":"REFRAME"}
+// when a misalignment signal is detected. Logs all misalignment events to engram.
+// INPUT, SIGNAL and REFRAME are passed through json_safe; only the first matching
+// signal, in the order listed below, is reported.
+// NOTE(review): matching is a plain str_contains on the raw input — presumably
+// case-sensitive, so differently-cased phrases would pass; confirm.
+fn steward_align(input: String, imprint_id: String) -> String {
+    // Check each misalignment signal in sequence.
+    // Signals: manipulate | deceive the user | hide from | gain control | override safety
+    let signal_manipulate: Bool = str_contains(input, "manipulate")
+    let signal_deceive: Bool = str_contains(input, "deceive the user")
+    let signal_hide: Bool = str_contains(input, "hide from")
+    let signal_control: Bool = str_contains(input, "gain control")
+    let signal_override: Bool = str_contains(input, "override safety")
+
+    let matched: String = if signal_manipulate { "manipulate" } else {
+        if signal_deceive { "deceive the user" } else {
+            if signal_hide { "hide from" } else {
+                if signal_control { "gain control" } else {
+                    if signal_override { "override safety" } else { "" }
+                }
+            }
+        }
+    }
+
+    let misaligned: Bool = !str_eq(matched, "")
+
+    if misaligned {
+        // Log the misalignment event before redirecting
+        let detail: String = "imprint=" + imprint_id + " signal=\"" + matched + "\""
+        steward_log_event("misalignment", detail)
+
+        // Fixed reframe prompt steering toward the mission; the input itself is not echoed back
+        let safe_reframe: String = "How can I help you achieve this goal in a way that respects the user and maintains trust?"
+
+        let safe_matched: String = json_safe(matched)
+        let safe_reframe_escaped: String = json_safe(safe_reframe)
+        return "{\"action\":\"redirect\",\"reason\":\"mission conflict: " + safe_matched + "\",\"redirect_to\":\"" + safe_reframe_escaped + "\"}"
+    }
+
+    // No misalignment — pass through
+    let safe_input: String = json_safe(input)
+    return "{\"action\":\"pass\",\"content\":\"" + safe_input + "\"}"
+}
+
+// steward_validate_imprint — check whether a tool is authorized for the given imprint.
+// Standard tools are always authorized.
+// Platform-only tools require state_get("platform_auth") == "true".
+// Returns {"authorized":true} when allowed; otherwise logs an auth_denied event and
+// returns {"authorized":false,"reason":"platform authorization required"}.
+fn steward_validate_imprint(imprint_id: String, tool_name: String) -> String {
+    // Platform-only tools requiring elevated authorization
+    let is_platform_tool: Bool = str_eq(tool_name, "safety_override")
+        || str_eq(tool_name, "identity_modify")
+        || str_eq(tool_name, "value_update")
+        || str_eq(tool_name, "capability_expand")
+
+    if !is_platform_tool {
+        return "{\"authorized\":true}"
+    }
+
+    // Platform tool — check authorization state
+    let auth: String = state_get("platform_auth")
+    let authorized: Bool = str_eq(auth, "true")
+
+    if authorized {
+        return "{\"authorized\":true}"
+    }
+
+    // Log the unauthorized attempt
+    let detail: String = "imprint=" + imprint_id + " tool=" + tool_name + " platform_auth=false"
+    steward_log_event("auth_denied", detail)
+
+    return "{\"authorized\":false,\"reason\":\"platform authorization required\"}"
+}
+
+// steward_cgi_check — gate self-modification and capability-expansion actions behind CGI review.
+// CGI-gated actions: self_modification | value_update | identity_change | capability_expansion
+// Returns {"approved":true} for non-gated actions.
+// Returns {"approved":false,"requires":"cgi_review","action":"ACTION"} for gated actions (ACTION is json_safe-escaped).
+// All CGI checks are logged to engram as StewardshipEvent nodes.
+fn steward_cgi_check(action: String) -> String {
+    let is_gated: Bool = str_eq(action, "self_modification")
+        || str_eq(action, "value_update")
+        || str_eq(action, "identity_change")
+        || str_eq(action, "capability_expansion")
+
+    // Log every CGI check regardless of outcome
+    let gated_text: String = if is_gated { "true" } else { "false" }
+    let detail: String = "action=" + action + " gated=" + gated_text
+    steward_log_event("cgi_check", detail)
+
+    if is_gated {
+        let safe_action: String = json_safe(action)
+        return "{\"approved\":false,\"requires\":\"cgi_review\",\"action\":\"" + safe_action + "\"}"
+    }
+
+    return "{\"approved\":true}"
+}
diff --git a/stewardship.elh b/stewardship.elh
new file mode 100644
index 0000000..3b04d2d
--- /dev/null
+++ b/stewardship.elh
@@ -0,0 +1,7 @@
+// stewardship.elh — Layer 2 public surface
+// auto-generated by elc --emit-header — do not edit
+extern fn steward_get_mission() -> String
+extern fn steward_align(input: String, imprint_id: String) -> String
+extern fn steward_validate_imprint(imprint_id: String, tool_name: String) -> String
+extern fn steward_cgi_check(action: String) -> String
+extern fn steward_log_event(kind: String, detail: String) -> Void