Compare commits

...

3 Commits

Author SHA1 Message Date
Tim Lingo 89e45ed689 fix(engram): atomic engram_save (tmp+fsync+rename) + sparse-write anti-clobber floor
Kills the engram-clobber loop at its source. engram_save did a bare fopen("wb")
that truncates snapshot.json to 0 bytes before the 47MB write — a booting soul's
engram_load could read that empty window -> genesis -> nodes=1 -> a 63-node save
overwrote the populated file. Two guards:
 1. Atomic write: serialize to <path>.tmp, fflush+fsync, rename() over target
    (atomic on POSIX) — no reader ever sees a truncated/0-byte snapshot.
 2. Sparse-write floor: refuse to overwrite a >200KB snapshot with one < 1/16 its
    size — a partial load can never clobber a healthy graph, whatever the cause.

Validated in isolation: standalone clang harness 11/11; rebuilt the darwin soul
(scripts/build-soul-darwin.sh) and booted it on an isolated port against a golden
copy — loaded 5113 nodes and round-tripped the full 47MB snapshot, no .tmp leftover,
live ~/.neuron untouched. Adds scripts/build-soul-darwin.sh (local elb replacement).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-16 18:23:08 -05:00
Tim Lingo 8d4c5f34bf chore(runtime): snapshot the LIVE darwin soul runtime (el-sdk) into git
El SDK Release / build-and-release (pull_request) Failing after 13s
Verbatim capture of ~/el-sdk/el_runtime.{c,h} — the un-versioned source the live
macOS soul (:7770, 765760B) is actually built from. Captured so the data-integrity
fixes that exist ONLY in this file are no longer one rm away from gone with no history.

Contains (vs origin/main): the UAF fix (engram_node_full uses el_strdup_persist —
the hallucinated/lost-saves root cause; 27 sites vs 19) and the response-truncation
fix (max(strlen,_tl_fs_read_len) binary-safe reads).

DIVERGES BOTH WAYS: this snapshot LACKS main's newer engram_wm_*, engram_load_merge,
http_serve_async (diff is deletion-heavy as a result) and still lacks the atomic
engram_save (temp+fsync+rename) fix. DO NOT merge over main — reconcile by porting
the two fixes above forward onto main.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-15 19:39:30 -05:00
will.anderson 35c189759c feat(runtime): add engram_wm_*, engram_load_merge, http_serve_async — needed by soul CI
El SDK Release / build-and-release (push) Successful in 8m44s
2026-06-11 13:40:10 -05:00
2 changed files with 89 additions and 24 deletions
+68 -24
View File
@@ -1475,10 +1475,13 @@ static void http_send_response(int fd, const char* body) {
}
const char* eff_body = is_envelope ? env_body : body;
/* Use the real byte count from fs_read if available (handles binary files
* with embedded null bytes PNG, WOFF2, etc.). Fall back to strlen for
* normal text/JSON responses where _tl_fs_read_len is 0. */
size_t blen = (_tl_fs_read_len > 0) ? _tl_fs_read_len : strlen(eff_body);
/* Use max(strlen, fs_read_len). fs_read_len is the real byte count for binary
* files (strlen stops at embedded NULs — PNG, WOFF2). strlen is correct AND larger
* when a handler WRAPS fs_read output in a longer text/JSON response (e.g.
* /api/safety-contact returns {"configured":...,"contact": <file>}); using
* fs_read_len alone truncated those responses to the file's length. */
size_t _blen_s = strlen(eff_body);
size_t blen = (_tl_fs_read_len > _blen_s) ? _tl_fs_read_len : _blen_s;
_tl_fs_read_len = 0; /* consume — one-shot per response */
int head_only = _tl_http_head_only;
@@ -1552,7 +1555,8 @@ static void* http_worker(void* arg) {
/* Copy response out BEFORE arena teardown.
* For binary files, _tl_fs_read_len holds the real byte count —
* use memcpy instead of strdup so null bytes are preserved. */
size_t rlen = _tl_fs_read_len > 0 ? _tl_fs_read_len : (rs ? strlen(rs) : 0);
size_t _rlen_s = rs ? strlen(rs) : 0;
size_t rlen = (_tl_fs_read_len > _rlen_s) ? _tl_fs_read_len : _rlen_s;
response = malloc(rlen + 1);
if (response && rs) { memcpy(response, rs, rlen); response[rlen] = '\0'; }
else if (response) { response[0] = '\0'; }
@@ -1799,7 +1803,8 @@ static void* http_worker_v2(void* arg) {
el_val_t hmap = http_build_headers_map(hdr_block ? hdr_block : "");
el_val_t r = h(EL_STR(dispatch_method), EL_STR(path), hmap, EL_STR(body));
const char* rs = EL_CSTR(r);
size_t rlen = _tl_fs_read_len > 0 ? _tl_fs_read_len : (rs ? strlen(rs) : 0);
size_t _rlen_s = rs ? strlen(rs) : 0;
size_t rlen = (_tl_fs_read_len > _rlen_s) ? _tl_fs_read_len : _rlen_s;
response = malloc(rlen + 1);
if (response && rs) { memcpy(response, rs, rlen); response[rlen] = '\0'; }
else if (response) { response[0] = '\0'; }
@@ -6245,7 +6250,9 @@ static void engram_grow_edges(void) {
static char* engram_new_id(void) {
el_val_t v = uuid_new();
const char* s = EL_CSTR(v);
return el_strdup(s ? s : "");
/* Persistent: node ids live in the global store; an arena (el_strdup) id is
* freed at el_request_end(), corrupting the node after the creating request. */
return el_strdup_persist(s ? s : "");
}
/* Convert a node into an ElMap of its fields. */
@@ -6340,12 +6347,17 @@ el_val_t engram_node_full(el_val_t content, el_val_t node_type, el_val_t label,
const char* lb = EL_CSTR(label);
const char* ti = EL_CSTR(tier);
const char* tg = EL_CSTR(tags);
n->content = el_strdup(c ? c : "");
n->node_type = el_strdup(nt && *nt ? nt : "Memory");
n->label = el_strdup(lb && *lb ? lb : (c ? engram_first_n_chars(c, 60) : ""));
n->tier = el_strdup(ti && *ti ? ti : "Working");
n->tags = el_strdup(tg ? tg : "");
n->metadata = el_strdup("{}");
/* Persistent (el_strdup_persist, NOT el_strdup): these strings are owned by the
* persistent global node store. el_strdup tracks into the per-request arena, which
* el_request_end() frees when the creating HTTP request completes — leaving the
* stored node with dangling pointers (corrupted ids, "saved but never listed").
* This is the root cause of the hallucinated/lost-saves class of bugs. */
n->content = el_strdup_persist(c ? c : "");
n->node_type = el_strdup_persist(nt && *nt ? nt : "Memory");
n->label = el_strdup_persist(lb && *lb ? lb : (c ? engram_first_n_chars(c, 60) : ""));
n->tier = el_strdup_persist(ti && *ti ? ti : "Working");
n->tags = el_strdup_persist(tg ? tg : "");
n->metadata = el_strdup_persist("{}");
n->salience = engram_decode_score(salience);
n->importance = engram_decode_score(importance);
n->confidence = engram_decode_score(confidence);
@@ -7288,13 +7300,48 @@ el_val_t engram_save(el_val_t path) {
jb_putc(&b, '}');
}
jb_puts(&b, "]}");
FILE* f = fopen(p, "wb");
if (!f) { free(b.buf); return 0; }
/* --- Anti-clobber sparse-write floor (NTN engram clobber fix) ---------
* Refuse to overwrite an existing populated snapshot with a drastically
* smaller one. A bad boot that loaded only ~63 identity nodes must never
* be able to clobber a healthy 5000+ node snapshot, regardless of the
* upstream cause (genesis fallback, partial load, etc.). */
{
struct stat _st;
if (stat(p, &_st) == 0 && _st.st_size > 200000 &&
(uint64_t)b.len < (uint64_t)_st.st_size / 16) {
fprintf(stderr,
"[engram_save] REFUSED sparse write: new %zu bytes vs existing "
"%lld bytes (< 1/16) — protecting snapshot %s\n",
b.len, (long long)_st.st_size, p);
free(b.buf);
return 0;
}
}
/* --- Atomic write: tmp + fsync + rename ------------------------------
* Write to a sibling temp file, fsync it durable, then rename() over the
* target. rename() is atomic on POSIX, so a concurrent reader (a booting
* soul's engram_load) never observes a truncated or 0-byte snapshot —
* which was the root of the genesis/clobber loop. */
size_t _plen = strlen(p);
char* _tmp = (char*)malloc(_plen + 5);
if (!_tmp) { free(b.buf); return 0; }
memcpy(_tmp, p, _plen);
memcpy(_tmp + _plen, ".tmp", 5); /* includes NUL */
FILE* f = fopen(_tmp, "wb");
if (!f) { free(_tmp); free(b.buf); return 0; }
size_t w = fwrite(b.buf, 1, b.len, f);
int wok = (w == b.len);
if (wok) { fflush(f); fsync(fileno(f)); }
fclose(f);
int ok = (w == b.len);
free(b.buf);
return ok ? 1 : 0;
if (!wok) { unlink(_tmp); free(_tmp); return 0; }
if (rename(_tmp, p) != 0) { unlink(_tmp); free(_tmp); return 0; }
free(_tmp);
return 1;
}
/* Helper: extract a string field from a JSON object substring. */
@@ -8555,7 +8602,7 @@ static el_val_t llm_provider_request(const char* url, const char* key,
}
}
static el_val_t llm_chain_call(const char* model_pref, const char* system_str, const char* user_str) {
static el_val_t llm_chain_call(const char* system_str, const char* user_str) {
char url_key[64], key_key[64], fmt_key[64], model_key[64];
for (int i = 0; i < LLM_MAX_PROVIDERS; i++) {
snprintf(url_key, sizeof(url_key), "NEURON_LLM_%d_URL", i);
@@ -8568,7 +8615,6 @@ static el_val_t llm_chain_call(const char* model_pref, const char* system_str, c
const char* fmt_s = getenv(fmt_key);
int fmt = (fmt_s && strcmp(fmt_s, "anthropic") == 0) ? 1 : 0;
const char* model = getenv(model_key);
if (!model || !*model) model = model_pref; /* fall back to the caller-requested model */
fprintf(stderr, "[llm] trying provider %d (%s)\n", i, url);
el_val_t result = llm_provider_request(url, key, fmt, model, system_str, user_str);
const char* t = EL_CSTR(result);
@@ -8579,7 +8625,7 @@ static el_val_t llm_chain_call(const char* model_pref, const char* system_str, c
const char* api_key = getenv("ANTHROPIC_API_KEY");
if (!api_key || !*api_key) return http_error_json("no LLM providers configured");
fprintf(stderr, "[llm] using legacy ANTHROPIC_API_KEY fallback\n");
return llm_provider_request(LLM_API_URL, api_key, 1, model_pref, system_str, user_str);
return llm_provider_request(LLM_API_URL, api_key, 1, NULL, system_str, user_str);
}
/* Legacy llm_request — kept for backward compat with agentic loop internals */
@@ -8643,16 +8689,14 @@ static el_val_t llm_extract_text(el_val_t resp_val) {
}
el_val_t llm_call(el_val_t model, el_val_t prompt) {
const char* m = EL_CSTR(model);
const char* u = EL_CSTR(prompt); if (!u) u = "";
return llm_chain_call(m, NULL, u);
return llm_chain_call(NULL, u);
}
el_val_t llm_call_system(el_val_t model, el_val_t system_prompt, el_val_t user_prompt) {
const char* m = EL_CSTR(model);
const char* s = EL_CSTR(system_prompt); if (!s) s = "";
const char* u = EL_CSTR(user_prompt); if (!u) u = "";
return llm_chain_call(m, s, u);
return llm_chain_call(s, u);
}
/* ── Tool registry for llm_call_agentic ─────────────────────────────────── */
+21
View File
@@ -0,0 +1,21 @@
#!/bin/sh
# build-soul-darwin.sh — replicate `elb` on macOS/arm64 with clang.
# Proven 2026-06-16: produces a Mach-O arm64 soul that boots and serves :7770.
# The official builder `elb` ships Linux-only (CI); this lets us build + test the
# darwin soul locally (e.g. to validate the atomic engram_save fix in isolation).
#
# Usage: scripts/build-soul-darwin.sh <path-to-neuron/dist> [output-binary]
#   <path-to-neuron/dist>  directory holding the elc-generated *.c modules
#   [output-binary]        path of the linked executable (default: ./neuron)
# Requires clang and libcurl. Exits non-zero on the first failing step.
set -eu

DIST="${1:?usage: build-soul-darwin.sh <neuron/dist dir> [out]}"
OUT="${2:-./neuron}"
# Runtime sources are located relative to this script: <repo>/lang/el-compiler/runtime.
RT="$(cd "$(dirname "$0")/.." && pwd)/lang/el-compiler/runtime"

# Fail early with a readable message instead of a clang "no such file" error.
if [ ! -d "$DIST" ]; then
    echo "error: dist directory not found: $DIST" >&2
    exit 1
fi
if [ ! -f "$RT/el_runtime.c" ] || [ ! -f "$RT/el_runtime.h" ]; then
    echo "error: el_runtime.c / el_runtime.h not found in $RT" >&2
    exit 1
fi

# Scratch directory for the copied header and all object files. It is removed
# on every exit path (success, failure under `set -e`, or a signal) so repeated
# builds do not pile up temp directories.
B="$(mktemp -d)"
trap 'rm -rf "$B"' EXIT
trap 'exit 1' HUP INT TERM

# elc-generated dist modules use C89-style implicit cross-module declarations that
# Apple clang rejects as errors by default; resolve at link, so downgrade them.
# The flags are held in the positional parameters (POSIX sh has no arrays) so that
# each -I path stays a single argument even when it contains whitespace.
set -- \
    -Wno-implicit-function-declaration -Wno-implicit-int -Wno-int-conversion \
    "-I$B" "-I$DIST" "-I$RT"

cp "$RT/el_runtime.h" "$B/"
clang -c "$@" "$RT/el_runtime.c" -o "$B/el_runtime.o"

compiled_any=0
for c in "$DIST"/*.c; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$c" ] || continue
    clang -c "$@" "$c" -o "$B/$(basename "$c" .c).o"
    compiled_any=1
done
if [ "$compiled_any" -eq 0 ]; then
    echo "error: no .c modules found in $DIST" >&2
    exit 1
fi

# NOTE: link *.o once — do not also list el_runtime.o separately (duplicate symbols).
clang "$B"/*.o -o "$OUT" -lcurl -lm
echo "built $OUT"