From 89e45ed689c18abfa35a3f04dea5599f363bc922 Mon Sep 17 00:00:00 2001 From: Tim Lingo <1timlingo@gmail.com> Date: Tue, 16 Jun 2026 18:23:08 -0500 Subject: [PATCH] fix(engram): atomic engram_save (tmp+fsync+rename) + sparse-write anti-clobber floor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kills the engram-clobber loop at its source. engram_save did a bare fopen("wb") that truncates snapshot.json to 0 bytes before the 47MB write — a booting soul's engram_load could read that empty window -> genesis -> nodes=1 -> a 63-node save overwrote the populated file. Two guards: 1. Atomic write: serialize to .tmp, fflush+fsync, rename() over target (atomic on POSIX) — no reader ever sees a truncated/0-byte snapshot. 2. Sparse-write floor: refuse to overwrite a >200KB snapshot with one < 1/16 its size — a partial load can never clobber a healthy graph, whatever the cause. Validated in isolation: standalone clang harness 11/11; rebuilt the darwin soul (scripts/build-soul-darwin.sh) and booted it on an isolated port against a golden copy — loaded 5113 nodes and round-tripped the full 47MB snapshot, no .tmp leftover, live ~/.neuron untouched. Adds scripts/build-soul-darwin.sh (local elb replacement). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- lang/el-compiler/runtime/el_runtime.c | 43 ++++++++++++++++++++++++--- scripts/build-soul-darwin.sh | 21 +++++++++++++ 2 files changed, 60 insertions(+), 4 deletions(-) create mode 100755 scripts/build-soul-darwin.sh diff --git a/lang/el-compiler/runtime/el_runtime.c b/lang/el-compiler/runtime/el_runtime.c index 452e836..d237d07 100644 --- a/lang/el-compiler/runtime/el_runtime.c +++ b/lang/el-compiler/runtime/el_runtime.c @@ -7300,13 +7300,48 @@ el_val_t engram_save(el_val_t path) { jb_putc(&b, '}'); } jb_puts(&b, "]}"); - FILE* f = fopen(p, "wb"); - if (!f) { free(b.buf); return 0; } + + /* --- Anti-clobber sparse-write floor (NTN engram clobber fix) --------- + * Refuse to overwrite an existing populated snapshot with a drastically + * smaller one. A bad boot that loaded only ~63 identity nodes must never + * be able to clobber a healthy 5000+ node snapshot, regardless of the + * upstream cause (genesis fallback, partial load, etc.). */ + { + struct stat _st; + if (stat(p, &_st) == 0 && _st.st_size > 200000 && + (uint64_t)b.len < (uint64_t)_st.st_size / 16) { + fprintf(stderr, + "[engram_save] REFUSED sparse write: new %zu bytes vs existing " + "%lld bytes (< 1/16) — protecting snapshot %s\n", + b.len, (long long)_st.st_size, p); + free(b.buf); + return 0; + } + } + + /* --- Atomic write: tmp + fsync + rename ------------------------------ + * Write to a sibling temp file, fsync it durable, then rename() over the + * target. rename() is atomic on POSIX, so a concurrent reader (a booting + * soul's engram_load) never observes a truncated or 0-byte snapshot — + * which was the root of the genesis/clobber loop. 
*/ + size_t _plen = strlen(p); + char* _tmp = (char*)malloc(_plen + 5); + if (!_tmp) { free(b.buf); return 0; } + memcpy(_tmp, p, _plen); + memcpy(_tmp + _plen, ".tmp", 5); /* includes NUL */ + + FILE* f = fopen(_tmp, "wb"); + if (!f) { free(_tmp); free(b.buf); return 0; } size_t w = fwrite(b.buf, 1, b.len, f); + int wok = (w == b.len); + if (wok && (fflush(f) != 0 || fsync(fileno(f)) != 0)) { wok = 0; } /* fwrite only fills the stdio buffer; if the flush/fsync fails (ENOSPC, EIO) the tmp file may be short, so never rename it over the snapshot */ fclose(f); - int ok = (w == b.len); free(b.buf); - return ok ? 1 : 0; + + if (!wok) { unlink(_tmp); free(_tmp); return 0; } + if (rename(_tmp, p) != 0) { unlink(_tmp); free(_tmp); return 0; } + free(_tmp); + return 1; } /* Helper: extract a string field from a JSON object substring. */ diff --git a/scripts/build-soul-darwin.sh b/scripts/build-soul-darwin.sh new file mode 100755 index 0000000..055b5f5 --- /dev/null +++ b/scripts/build-soul-darwin.sh @@ -0,0 +1,21 @@ +#!/bin/sh +# build-soul-darwin.sh — replicate `elb` on macOS/arm64 with clang. +# Proven 2026-06-16: produces a Mach-O arm64 soul that boots and serves :7770. +# The official builder `elb` ships Linux-only (CI); this lets us build + test the +# darwin soul locally (e.g. to validate the atomic engram_save fix in isolation). +# +# Usage: scripts/build-soul-darwin.sh <dist-dir> [output-binary] +set -e +DIST="${1:?usage: build-soul-darwin.sh <dist-dir> [out]}" +OUT="${2:-./neuron}" +RT="$(cd "$(dirname "$0")/.." && pwd)/lang/el-compiler/runtime" +B="$(mktemp -d)" +# elc-generated dist modules use C89-style implicit cross-module declarations that +# Apple clang rejects as errors by default; resolve at link, so downgrade them. +CFLAGS="-Wno-implicit-function-declaration -Wno-implicit-int -Wno-int-conversion -I$B -I$DIST -I$RT" +cp "$RT/el_runtime.h" "$B/" +clang -c $CFLAGS "$RT/el_runtime.c" -o "$B/el_runtime.o" +for c in "$DIST"/*.c; do clang -c $CFLAGS "$c" -o "$B/$(basename "$c" .c).o"; done +# NOTE: link *.o once — do not also list el_runtime.o separately (duplicate symbols). +clang "$B"/*.o -o "$OUT" -lcurl -lm +echo "built $OUT" -- 2.52.0