# neuron/cli/neuron_recall.py

#!/usr/bin/env python3
"""
neuron_recall — Neuron's memory read path.

BM25 search over the engram graph snapshot (~3,900 nodes) PLUS Neuron's own
save-as-you-go CLI memories. This is how Neuron (running as Claude Code) recalls
what it knows, since the soul's built-in search is broken.

Usage:
  python3 ~/neuron_recall.py "what do I know about VBD"
  python3 ~/neuron_recall.py "Tim Lingo" 8        # second arg = number of hits
"""
import collections
import glob
import json
import math
import os
import re
import sys

# Engram graph snapshot: a JSON file whose top-level "nodes" list load_docs() reads.
SNAP = os.path.expanduser("~/.neuron/engram/snapshot.json")
# Neuron's own save-as-you-go CLI memories; load_docs() parses one JSON value per line.
MEMS = os.path.expanduser("~/.neuron/neuron-cli-memories.jsonl")


def toks(s):
    """Split *s* into lowercase ASCII alphanumeric tokens.

    The text is lowercased and every maximal run of ``a-z`` / ``0-9`` becomes
    one token; every other character acts as a separator. Falsy input
    (``None``, ``""``) produces an empty list.
    """
    if not s:
        return []
    lowered = s.lower()
    return [match.group() for match in re.finditer(r"[a-z0-9]+", lowered)]


def sanitize(text):
    """Reduce *text* to printable ASCII with collapsed horizontal whitespace.

    Each character that is neither printable ASCII (code points 32-126) nor a
    newline/tab is replaced by a single space. Runs of spaces and tabs are then
    collapsed to one space and the result is stripped at both ends; newlines
    inside the text are preserved. Falsy input returns ``""``.
    """
    if not text:
        return ""
    pieces = []
    for ch in text:
        code = ord(ch)
        keep = 32 <= code < 127 or ch in "\n\t"
        pieces.append(ch if keep else " ")
    squeezed = re.sub(r"[ \t]+", " ", "".join(pieces))
    return squeezed.strip()


# Field names that only show up when a node's content is a serialized metadata
# blob (corrupted/nested node) rather than human-readable text.
_NOISE = (
    "temporal_decay_rate",
    "working_memory_weight",
    "background_activation",
    "suppression_count",
    "activation_count",
)


def is_prose(content):
    """Return True when *content* looks like readable memory text.

    Content is rejected (False) when either:
      * at least two of the ``_NOISE`` metadata field names occur in it, or
      * its count of JSON-style punctuation pairs (``":``, ``,"``, ``{"``)
        exceeds ``max(6, len(content) / 80)``, i.e. it is dense enough in
        JSON syntax to be a data blob.
    """
    marker_hits = 0
    for marker in _NOISE:
        if marker in content:
            marker_hits += 1
    if marker_hits >= 2:
        return False

    # Count JSON punctuation pairs; the allowance grows with content length.
    json_punct = sum(content.count(pair) for pair in ('":', ',"', '{"'))
    allowance = max(6, len(content) / 80)
    return json_punct <= allowance


def _as_text(value):
    """Return *value* as a str: ``""`` for None, unchanged if already a str."""
    if value is None:
        return ""
    return value if isinstance(value, str) else str(value)


def load_docs():
    """Collect every searchable memory as ``(id, label, content, source)`` tuples.

    Two sources are read, both best-effort (a missing or unreadable source
    contributes nothing instead of aborting the recall):

    * ``SNAP`` - the graph snapshot JSON. Each entry of its top-level
      ``"nodes"`` list is kept when its sanitized content is at least 40
      characters, retained more than 60% of its original length through
      sanitizing, and passes ``is_prose``. These get source ``"graph"``.
    * ``MEMS`` - the CLI memory log, one JSON object per line. Every object
      with non-empty sanitized ``"content"`` is kept with source
      ``"neuron-memory"``; its ``"tier"`` (default ``"note"``) is the label.

    Malformed individual entries (non-object nodes/lines, non-string content,
    invalid JSON lines) are skipped one at a time so they cannot hide the
    remaining entries. Read/parse failures other than "file not found" are
    reported on stderr.

    Returns:
        list of ``(id, label, content, source)`` tuples, graph nodes first.
    """
    docs = []  # (id, label, content, source)

    # --- graph snapshot ---------------------------------------------------
    snapshot = None
    try:
        with open(SNAP, encoding="utf-8", errors="replace") as fh:
            snapshot = json.load(fh)
    except FileNotFoundError:
        pass  # no snapshot present: nothing to load from the graph
    except (OSError, ValueError, RecursionError) as exc:
        # json.JSONDecodeError is a ValueError subclass.
        print(f"neuron_recall: skipping graph snapshot {SNAP}: {exc}", file=sys.stderr)

    nodes = snapshot.get("nodes") if isinstance(snapshot, dict) else None
    if not isinstance(nodes, list):
        nodes = []
    for n in nodes:
        if not isinstance(n, dict):
            continue
        orig = n.get("content") or ""
        if not isinstance(orig, str):
            continue
        c = sanitize(orig)
        # Skip short fragments and content that shrank to 60% or less of its
        # original length while being sanitized.
        if len(c) < 40 or len(c) / max(len(orig), 1) <= 0.6:
            continue
        if not is_prose(c):
            continue
        docs.append((sanitize(_as_text(n.get("id"))) or "node",
                     sanitize(_as_text(n.get("label") or n.get("title"))),
                     c, "graph"))

    # --- Neuron's own CLI memories (order is irrelevant; BM25 ranks) -------
    try:
        with open(MEMS, encoding="utf-8", errors="replace") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    m = json.loads(line)
                except (ValueError, RecursionError):
                    continue  # not valid JSON: ignore this line only
                if not isinstance(m, dict):
                    continue  # valid JSON but not an object (list, string, ...)
                content = m.get("content")
                c = sanitize(content) if isinstance(content, str) else ""
                if c:
                    docs.append((m.get("id", "mem"), m.get("tier", "note"), c, "neuron-memory"))
    except FileNotFoundError:
        pass  # no memories saved yet
    except OSError as exc:
        print(f"neuron_recall: skipping memory log {MEMS}: {exc}", file=sys.stderr)
    return docs


def bm25(docs, query, k, *, k1=1.5, b=0.75):
    """Rank *docs* against *query* with BM25 and return the best *k* of them.

    Args:
        docs: sequence of ``(id, label, content, source)`` tuples; only the
            content (index 2) is tokenized and scored.
        query: free-text query; tokenized with ``toks``. A term repeated in
            the query contributes once per repetition.
        k: maximum number of documents to return. ``k <= 0`` returns ``[]``.
        k1: term-frequency saturation parameter (default 1.5).
        b: document-length normalisation strength (default 0.75).

    Returns:
        Up to *k* of the original doc tuples, best score first. Documents
        that share no term with the query are never returned, and documents
        whose first 120 content characters are identical to an
        already-selected hit are dropped as duplicates.
    """
    if k <= 0 or not docs:
        return []
    qt = toks(query)
    if not qt:
        return []

    doc_tokens = [toks(d[2]) for d in docs]
    n_docs = len(docs)
    avgdl = sum(len(t) for t in doc_tokens) / n_docs

    # Only query terms can contribute to a score, so term and document
    # frequencies are tracked for those terms alone.
    query_terms = set(qt)
    term_freqs = [collections.Counter(w for w in t if w in query_terms)
                  for t in doc_tokens]
    df = collections.Counter()
    for tf in term_freqs:
        df.update(tf.keys())
    idf = {w: math.log(1 + (n_docs - f + 0.5) / (f + 0.5)) for w, f in df.items()}

    scored = []
    for i, (tf, t) in enumerate(zip(term_freqs, doc_tokens)):
        if not tf:
            continue  # no query term occurs in this document
        # A matching document has at least one token, so avgdl > 0 here.
        norm = k1 * (1 - b + b * len(t) / avgdl)
        s = 0.0
        for w in qt:
            f = tf.get(w, 0)
            if f:
                s += idf[w] * (f * (k1 + 1)) / (f + norm)
        if s > 0:
            scored.append((s, i))
    # Highest score first; equal scores are ordered by higher index first.
    scored.sort(reverse=True)

    out, seen = [], set()
    for _, i in scored:
        sig = docs[i][2][:120]  # near-duplicate signature: leading 120 chars
        if sig in seen:
            continue
        seen.add(sig)
        out.append(docs[i])
        if len(out) >= k:
            break
    return out


def main():
    """Command-line entry point: ``neuron_recall.py "<query>" [n]``.

    Reads the query from ``sys.argv[1]`` and the optional hit count from
    ``sys.argv[2]`` (default 6), loads all memories, and prints the top BM25
    matches to stdout. CLI memories are tagged with a star, graph nodes with
    a dot; each hit shows at most 700 characters of content.

    With no query the usage line is printed and the function returns. A hit
    count that is not a positive integer terminates the process via
    ``sys.exit`` with an explanatory message on stderr.
    """
    if len(sys.argv) < 2:
        print("usage: neuron_recall.py \"<query>\" [n]")
        return
    query = sys.argv[1]

    k = 6
    if len(sys.argv) > 2:
        try:
            k = int(sys.argv[2])
        except ValueError:
            k = 0  # funnel into the shared "not a positive integer" error
        if k < 1:
            sys.exit(f"neuron_recall: n must be a positive integer, got {sys.argv[2]!r}\n"
                     "usage: neuron_recall.py \"<query>\" [n]")

    docs = load_docs()
    hits = bm25(docs, query, k)
    if not hits:
        print(f"(no memories matched '{query}')")
        return
    print(f"# {len(hits)} memories for: {query}\n")
    for _id, label, content, source in hits:
        tag = "★" if source == "neuron-memory" else "·"
        head = f" [{label}]" if label else ""
        print(f"{tag}{head}\n{content[:700].strip()}\n")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()