Files
neuron/tools/photo-to-memory.sh
T
will.anderson dcc0bf550a Add Ollama provider, portable memory, cultivation digest, refugee importer, GLM-OCR spike
- P0: unified soul binary with engram_node_full fix, read-back-verify, search fix
- P0: move API keys from plaintext plists to macOS Keychain
- P0: fix MCP backend URL (port 8742 → 7770)
- P1.6: memory-export/import scripts (AES-256-CBC, versioned .neuronmem format)
- P1.7: nightly cultivation digest with sharpness metric (launchd at 23:55)
- P2.10: Ollama provider in agentic loop (SOUL_LLM_PROVIDER=ollama)
- P3.12: refugee importer for ChatGPT/Screenpipe/generic formats
- P3.13: GLM-OCR spike — SHIP IT (mlx-vlm, 1.59GB, photo-to-memory.sh)
2026-06-27 11:46:30 -05:00

136 lines
5.7 KiB
Bash
Executable File

#!/usr/bin/env bash
# photo-to-memory.sh — OCR a document/photo and store the text in Neuron memory
#
# Uses GLM-OCR (0.9B, MIT) via mlx-vlm on Apple Silicon.
# Model auto-downloads ~1.59 GB to ~/.cache/huggingface/ on first run.
#
# Usage:
# ./tools/photo-to-memory.sh <image-file> [--dry-run] [--prompt "custom prompt"] [--model <model-id>]
#
# Prerequisites:
# pip install -U mlx-vlm
#
# Examples:
# ./tools/photo-to-memory.sh ~/Desktop/receipt.jpg
# ./tools/photo-to-memory.sh ~/Documents/contract.png --dry-run
# ./tools/photo-to-memory.sh scan.jpg --prompt "Extract all text from this receipt"
# Strict mode: abort on any unhandled command failure, on use of an unset
# variable, and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
# ── Config ─────────────────────────────────────────────────────────────────────
# Each setting can be overridden from the environment; an unset or empty value
# falls back to the default shown here.
: "${SOUL_URL:=http://localhost:7770}"
: "${GLM_MODEL:=mlx-community/GLM-OCR-8bit}"
: "${MAX_TOKENS:=4096}"
# Prompt sent to the OCR model when --prompt is not given.
DEFAULT_PROMPT='Extract all text from this document. Preserve structure including tables, headers, and lists. Output plain text.'
# ── Colours ────────────────────────────────────────────────────────────────────
# ANSI SGR escape sequences; $'…' quoting turns \e into a literal ESC byte so
# the values can be passed to printf as plain %s arguments.
RED=$'\e[0;31m'
GREEN=$'\e[0;32m'
YELLOW=$'\e[1;33m'
CYAN=$'\e[0;36m'
BOLD=$'\e[1m'
RESET=$'\e[0m'
# Output helpers. Each joins its arguments into one message and wraps it in a
# colour taken from the CYAN/GREEN/YELLOW/RED and RESET variables.

# log — informational message on stdout.
log() {
  local text="$*"
  printf '%s%s%s\n' "$CYAN" "$text" "$RESET"
}

# ok — success message on stdout, prefixed with a check mark.
ok() {
  local text="$*"
  printf '%s✓ %s%s\n' "$GREEN" "$text" "$RESET"
}

# warn — warning message on stdout, prefixed with a warning sign.
warn() {
  local text="$*"
  printf '%s⚠ %s%s\n' "$YELLOW" "$text" "$RESET"
}

# die — error message on stderr, then terminate the script with status 1.
die() {
  local text="$*"
  printf '%s✗ %s%s\n' "$RED" "$text" "$RESET" >&2
  exit 1
}
# ── Parse args ─────────────────────────────────────────────────────────────────
# Accepts exactly one positional image path plus optional flags, in any order.
# Sets: IMAGE_PATH, DRY_RUN (0/1), CUSTOM_PROMPT, and GLM_MODEL (when --model is
# given); then derives PROMPT, FILENAME and ABS_PATH from them.
IMAGE_PATH=""
DRY_RUN=0
CUSTOM_PROMPT=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    --prompt | --model)
      # Under `set -u` a bare "$2" aborts with "unbound variable" when the flag
      # is the last argument; report a readable error instead.
      [[ $# -ge 2 ]] || die "Option $1 requires a value"
      if [[ "$1" == "--prompt" ]]; then
        CUSTOM_PROMPT="$2"
      else
        GLM_MODEL="$2"
      fi
      shift 2
      ;;
    --help | -h)
      # Print the header comment of this script with the leading "# " removed.
      sed -n '2,15p' "$0" | sed 's/^# \{0,1\}//'
      exit 0
      ;;
    -*)
      die "Unknown option: $1"
      ;;
    *)
      if [[ -n "$IMAGE_PATH" ]]; then
        die "Only one image file at a time"
      fi
      IMAGE_PATH="$1"
      shift
      ;;
  esac
done

if [[ -z "$IMAGE_PATH" ]]; then
  die "Usage: $0 <image-file> [--dry-run] [--prompt \"...\"]"
fi
[[ -f "$IMAGE_PATH" ]] || die "File not found: $IMAGE_PATH"

# An empty --prompt value falls back to the default prompt.
PROMPT="${CUSTOM_PROMPT:-$DEFAULT_PROMPT}"
FILENAME=$(basename "$IMAGE_PATH")

# Resolve to an absolute path. `realpath` is not installed everywhere, so fall
# back to python3 (already a hard requirement of this script) when it is absent.
if command -v realpath >/dev/null 2>&1; then
  ABS_PATH=$(realpath "$IMAGE_PATH") || die "Cannot resolve path: $IMAGE_PATH"
else
  ABS_PATH=$(python3 -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "$IMAGE_PATH") \
    || die "Cannot resolve path: $IMAGE_PATH"
fi
# ── Check runtime ───────────────────────────────────────────────────────────────
# Probe for the mlx_vlm module with the interpreter that will later run the OCR,
# and install it on demand when the import fails.
if ! python3 -c "import mlx_vlm" 2>/dev/null; then
  warn "mlx-vlm not installed. Installing now..."
  # Install through `python3 -m pip` so the package lands in the interpreter
  # probed above; a bare `pip` on PATH may belong to a different Python
  # installation or may not exist at all.
  python3 -m pip install -q -U mlx-vlm \
    || die "pip install mlx-vlm failed — run manually: python3 -m pip install -U mlx-vlm"
fi
# ── Run GLM-OCR ─────────────────────────────────────────────────────────────────
log "Running GLM-OCR on: $FILENAME"
log "Model: $GLM_MODEL"
if [[ "$DRY_RUN" -eq 1 ]]; then
  warn "Dry-run mode — will not post to Neuron"
fi

# GLM-OCR output goes to stdout; capture it.
# First run downloads ~1.59 GB — this is expected and cached thereafter.
# stderr is kept out of the normal output but saved to a private temp file so
# that the real cause can be shown if the run fails, instead of being discarded.
# NOTE(review): this assumes mlx_vlm.generate writes only the recognised text to
# stdout — confirm; any banner or statistics lines it prints would be stored too.
OCR_STDERR=$(mktemp "${TMPDIR:-/tmp}/photo-to-memory-ocr.XXXXXX") \
  || die "Could not create a temporary file for OCR diagnostics"
if ! OCR_TEXT=$(python3 -m mlx_vlm.generate \
  --model "$GLM_MODEL" \
  --max-tokens "$MAX_TOKENS" \
  --temperature 0.0 \
  --prompt "$PROMPT" \
  --image "$ABS_PATH" \
  2>"$OCR_STDERR"); then
  # Show the tail of the captured diagnostics before giving up.
  tail -n 20 "$OCR_STDERR" >&2 || true
  rm -f -- "$OCR_STDERR"
  die "GLM-OCR failed. Check that mlx-vlm is installed and the image is readable."
fi
rm -f -- "$OCR_STDERR"

CHAR_COUNT=${#OCR_TEXT}
log "OCR complete — extracted ${CHAR_COUNT} characters"
if [[ "$CHAR_COUNT" -lt 5 ]]; then
  warn "Very short output — the image may be blank or unreadable"
fi
# ── Preview ─────────────────────────────────────────────────────────────────────
# Show at most the first 400 characters of the OCR text; when the text is
# longer, add a note saying how many characters were left out.
printf '\n%s--- OCR output preview (first 400 chars) ---%s\n' "$BOLD" "$RESET"
printf '%s\n' "${OCR_TEXT:0:400}"
if (( CHAR_COUNT > 400 )); then
  printf '%s... [+%d more chars]%s\n' "$YELLOW" "$(( CHAR_COUNT - 400 ))" "$RESET"
fi
printf '\n'
# ── Post to Neuron soul ─────────────────────────────────────────────────────────
# In dry-run mode report what would be sent and stop before any network I/O.
if [[ "$DRY_RUN" -eq 1 ]]; then
  ok "Dry-run complete — would POST ${CHAR_COUNT} chars to ${SOUL_URL}/api/neuron/memory"
  exit 0
fi
log "Posting to Neuron soul at ${SOUL_URL} ..."

# Build the JSON body with Python's json module so quotes, newlines and
# non-ASCII characters in the OCR text are escaped correctly.
# The embedded Python must start at column 0: it is passed verbatim to
# `python3 -c`, which rejects unexpected leading indentation.
PAYLOAD=$(python3 -c '
import json, sys
content, label = sys.argv[1], sys.argv[2]
tags = ["photo-import", "ocr", "glm-ocr"]
print(json.dumps({"content": content, "label": label, "tags": tags}))
' "$OCR_TEXT" "Photo: ${FILENAME}")

# Store the response in a private, unpredictable temp file (a fixed name under
# /tmp can be pre-created or symlinked by another user and collides between
# concurrent runs), and remove it on every exit path.
RESPONSE_FILE=$(mktemp "${TMPDIR:-/tmp}/photo-to-memory-response.XXXXXX") \
  || die "Could not create a temporary file for the server response"
trap 'rm -f -- "$RESPONSE_FILE"' EXIT

# -sS: no progress meter, but still print curl's own error message on failure.
# Without the explicit check, a transport failure (e.g. connection refused)
# would end the script silently under `set -e`.
HTTP_STATUS=$(curl -sS -o "$RESPONSE_FILE" -w "%{http_code}" \
  -X POST "${SOUL_URL}/api/neuron/memory" \
  -H "Content-Type: application/json" \
  -d "$PAYLOAD") \
  || die "Could not reach Neuron soul at ${SOUL_URL} (curl exit code $?)"

if [[ "$HTTP_STATUS" =~ ^2 ]]; then
  # Read the new node id from the response; accept either an "id" or a
  # "node_id" key and fall back to "unknown" when the body is not usable JSON.
  NODE_ID=$(python3 -c '
import json, sys
try:
    with open(sys.argv[1]) as fh:
        d = json.load(fh)
    print(d.get("id", d.get("node_id", "unknown")))
except Exception:
    print("unknown")
' "$RESPONSE_FILE")
  ok "Memory node created: ${NODE_ID}"
  ok "Label: Photo: ${FILENAME}"
  ok "Tags: photo-import, ocr, glm-ocr"
else
  BODY=$(cat "$RESPONSE_FILE" 2>/dev/null) || BODY=""
  # The temp file always exists, so also treat an empty body as "no body".
  [[ -n "$BODY" ]] || BODY="(no body)"
  die "Soul returned HTTP ${HTTP_STATUS}: ${BODY}"
fi