dcc0bf550a
- P0: unified soul binary with engram_node_full fix, read-back-verify, search fix - P0: move API keys from plaintext plists to macOS Keychain - P0: fix MCP backend URL (port 8742 → 7770) - P1.6: memory-export/import scripts (AES-256-CBC, versioned .neuronmem format) - P1.7: nightly cultivation digest with sharpness metric (launchd at 23:55) - P2.10: Ollama provider in agentic loop (SOUL_LLM_PROVIDER=ollama) - P3.12: refugee importer for ChatGPT/Screenpipe/generic formats - P3.13: GLM-OCR spike — SHIP IT (mlx-vlm, 1.59GB, photo-to-memory.sh)
136 lines
5.7 KiB
Bash
Executable File
136 lines
5.7 KiB
Bash
Executable File
#!/usr/bin/env bash
# photo-to-memory.sh — OCR a document/photo and store the text in Neuron memory
#
# Uses GLM-OCR (0.9B, MIT) via mlx-vlm on Apple Silicon.
# Model auto-downloads ~1.59 GB to ~/.cache/huggingface/ on first run.
#
# Usage:
#   ./tools/photo-to-memory.sh <image-file> [--dry-run] [--prompt "custom prompt"] [--model <model-id>]
#
# Prerequisites:
#   pip install -U mlx-vlm
#
# Examples:
#   ./tools/photo-to-memory.sh ~/Desktop/receipt.jpg
#   ./tools/photo-to-memory.sh ~/Documents/contract.png --dry-run
#   ./tools/photo-to-memory.sh scan.jpg --prompt "Extract all text from this receipt"

# Strict mode: abort on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -euo pipefail

# ── Config ─────────────────────────────────────────────────────────────────────
# Each of these three can be overridden from the environment.
SOUL_URL="${SOUL_URL:-http://localhost:7770}"               # base URL for the memory POST
GLM_MODEL="${GLM_MODEL:-mlx-community/GLM-OCR-8bit}"        # also overridable with --model
MAX_TOKENS="${MAX_TOKENS:-4096}"                            # passed to --max-tokens
# Prompt used when no --prompt is supplied; never reassigned, hence readonly.
readonly DEFAULT_PROMPT="Extract all text from this document. Preserve structure including tables, headers, and lists. Output plain text."

# ── Colours ────────────────────────────────────────────────────────────────────
# ANSI escape sequences used by the logging helpers; constants, hence readonly.
readonly RED=$'\033[0;31m' GREEN=$'\033[0;32m' YELLOW=$'\033[1;33m'
readonly CYAN=$'\033[0;36m' BOLD=$'\033[1m' RESET=$'\033[0m'

# Logging helpers. Each joins its arguments with spaces ("$*") and wraps the
# message in a colour code followed by RESET. The colour variables are read at
# call time from the globals CYAN, GREEN, YELLOW, RED and RESET.

# log: progress message in cyan on stdout.
log() { printf "%s%s%s\n" "$CYAN" "$*" "$RESET"; }

# ok: success message in green on stdout, prefixed with a check mark.
ok() { printf "%s✓ %s%s\n" "$GREEN" "$*" "$RESET"; }

# warn: warning in yellow. Written to stderr, like die, so diagnostics stay
# out of any captured or redirected stdout.
warn() { printf "%s⚠ %s%s\n" "$YELLOW" "$*" "$RESET" >&2; }

# die: error message in red on stderr, then terminate with exit status 1.
die() {
  printf "%s✗ %s%s\n" "$RED" "$*" "$RESET" >&2
  exit 1
}

# ── Parse args ─────────────────────────────────────────────────────────────────

#######################################
# Print the usage text: the run of comment lines that directly follows the
# first line of a script, with the leading "#" (and one optional space)
# removed. Stops at the first line that does not start with "#", so the whole
# header is printed regardless of how many lines it has.
# Arguments: $1 - file to read (optional; defaults to this script, $0)
# Outputs:   usage text on stdout
#######################################
print_help() {
  awk 'NR == 1 { next } !/^#/ { exit } { sub(/^# ?/, ""); print }' "${1:-$0}"
}

#######################################
# Parse the command line into the globals used by the rest of the script.
# Globals:
#   IMAGE_PATH    (written) the single positional image file, "" if none
#   DRY_RUN       (written) 1 when --dry-run was given, otherwise 0
#   CUSTOM_PROMPT (written) value of --prompt, "" if not given
#   GLM_MODEL     (written) replaced by the value of --model when given
# Arguments: the script's command-line arguments
# Exits:     status 0 after --help/-h; through die on an unknown option, an
#            option missing its value, or a second positional argument
#######################################
parse_args() {
  IMAGE_PATH=""
  DRY_RUN=0
  CUSTOM_PROMPT=""

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --dry-run)
        DRY_RUN=1
        shift
        ;;
      --prompt | --model)
        # Check the value exists before touching "$2": when nounset (set -u)
        # is active, expanding a missing "$2" aborts with a bare
        # "unbound variable" error instead of a usable message.
        [[ $# -ge 2 ]] || die "Option $1 requires a value"
        if [[ "$1" == "--prompt" ]]; then
          CUSTOM_PROMPT="$2"
        else
          GLM_MODEL="$2"
        fi
        shift 2
        ;;
      --help | -h)
        print_help
        exit 0
        ;;
      -*)
        die "Unknown option: $1"
        ;;
      *)
        [[ -n "$IMAGE_PATH" ]] && die "Only one image file at a time"
        IMAGE_PATH="$1"
        shift
        ;;
    esac
  done
}

parse_args "$@"

# ── Validate input ─────────────────────────────────────────────────────────────
# An image path is mandatory and must name an existing regular file.
if [[ -z "$IMAGE_PATH" ]]; then
  die "Usage: $0 <image-file> [--dry-run] [--prompt \"...\"]"
fi
if [[ ! -f "$IMAGE_PATH" ]]; then
  die "File not found: $IMAGE_PATH"
fi

# Use the built-in prompt when --prompt was absent or empty.
if [[ -n "${CUSTOM_PROMPT:-}" ]]; then
  PROMPT="$CUSTOM_PROMPT"
else
  PROMPT="$DEFAULT_PROMPT"
fi

# FILENAME labels the stored memory; ABS_PATH is what the OCR step is given.
FILENAME="$(basename "$IMAGE_PATH")"
ABS_PATH="$(realpath "$IMAGE_PATH")"

# ── Check runtime ───────────────────────────────────────────────────────────────
# Probe for mlx-vlm with python3, the interpreter that later runs the OCR step.
# If it is missing, install through `python3 -m pip` so the package lands in
# that same interpreter's environment; a bare `pip` found on PATH can belong
# to a different Python installation, leaving the import still failing.
if ! python3 -c "import mlx_vlm" 2>/dev/null; then
  warn "mlx-vlm not installed. Installing now..."
  python3 -m pip install -q -U mlx-vlm \
    || die "pip install mlx-vlm failed — run manually: python3 -m pip install -U mlx-vlm"
fi

# ── Run GLM-OCR ─────────────────────────────────────────────────────────────────
log "Running GLM-OCR on: $FILENAME"
log "Model: $GLM_MODEL"
if [[ "$DRY_RUN" -eq 1 ]]; then
  warn "Dry-run mode — will not post to Neuron"
fi

# stderr of the OCR command is kept off the terminal during a successful run,
# but it is saved to a private temp file so a failure can be explained instead
# of being reduced to a generic message.
OCR_ERR_LOG=$(mktemp "${TMPDIR:-/tmp}/photo-to-memory-ocr.XXXXXX") \
  || die "Could not create a temporary file for the OCR error log"

# GLM-OCR output goes to stdout; capture it.
# First run downloads ~1.59 GB — this is expected and cached thereafter.
if ! OCR_TEXT=$(python3 -m mlx_vlm.generate \
  --model "$GLM_MODEL" \
  --max-tokens "$MAX_TOKENS" \
  --temperature 0.0 \
  --prompt "$PROMPT" \
  --image "$ABS_PATH" \
  2>"$OCR_ERR_LOG"); then
  # Show the end of the saved stderr (if any) before giving up.
  if [[ -s "$OCR_ERR_LOG" ]]; then
    printf '%s\n' "--- last lines of mlx_vlm.generate stderr ---" >&2
    tail -n 20 "$OCR_ERR_LOG" >&2 || true
  fi
  rm -f -- "$OCR_ERR_LOG"
  die "GLM-OCR failed. Check that mlx-vlm is installed and the image is readable."
fi
rm -f -- "$OCR_ERR_LOG"

# Report how much text came back and flag suspiciously short results.
CHAR_COUNT="${#OCR_TEXT}"
log "OCR complete — extracted ${CHAR_COUNT} characters"

if (( CHAR_COUNT < 5 )); then
  warn "Very short output — the image may be blank or unreadable"
fi

# ── Preview ─────────────────────────────────────────────────────────────────────
# Show at most the first 400 characters, then say how many were left out.
printf "\n%s--- OCR output preview (first 400 chars) ---%s\n" "$BOLD" "$RESET"
printf "%s\n" "${OCR_TEXT:0:400}"
if (( CHAR_COUNT > 400 )); then
  printf "%s... [+%d more chars]%s\n" "$YELLOW" "$(( CHAR_COUNT - 400 ))" "$RESET"
fi
printf "\n"

# ── Post to Neuron soul ─────────────────────────────────────────────────────────
# In dry-run mode, report what would be sent and stop before any network call.
if [[ "$DRY_RUN" -eq 1 ]]; then
  ok "Dry-run complete — would POST ${CHAR_COUNT} chars to ${SOUL_URL}/api/neuron/memory"
  exit 0
fi

log "Posting to Neuron soul at ${SOUL_URL} ..."

# The response body goes to a private, unpredictably named temp file that is
# removed on every exit path. A fixed name under /tmp could be pre-created by
# another user or still hold the body of an earlier run.
RESPONSE_FILE=$(mktemp "${TMPDIR:-/tmp}/photo-to-memory-response.XXXXXX") \
  || die "Could not create a temporary file for the response"
trap 'rm -f -- "$RESPONSE_FILE"' EXIT

# Build the JSON body with Python's json module so the OCR text and the label
# are escaped correctly whatever characters they contain.
PAYLOAD=$(python3 -c '
import json, sys
content = sys.argv[1]
label = sys.argv[2]
tags = ["photo-import", "ocr", "glm-ocr"]
print(json.dumps({"content": content, "label": label, "tags": tags}))
' "$OCR_TEXT" "Photo: ${FILENAME}") || die "Could not build the JSON payload"

# curl prints only the HTTP status code (-w); the body is written to
# RESPONSE_FILE. A transport failure (e.g. connection refused) makes curl exit
# non-zero; because -s silences curl, report it explicitly rather than letting
# `set -e` end the script without a message.
HTTP_STATUS=$(curl -s -o "$RESPONSE_FILE" -w "%{http_code}" \
  -X POST "${SOUL_URL}/api/neuron/memory" \
  -H "Content-Type: application/json" \
  -d "$PAYLOAD") || die "Could not reach Neuron soul at ${SOUL_URL}"

if [[ "$HTTP_STATUS" =~ ^2 ]]; then
  # Pull the node id out of the response: "id" first, then "node_id";
  # "unknown" when neither key exists or the body cannot be read as JSON.
  NODE_ID=$(python3 -c '
import json, sys
try:
    with open(sys.argv[1]) as fh:
        d = json.load(fh)
    print(d.get("id", d.get("node_id", "unknown")))
except Exception:
    print("unknown")
' "$RESPONSE_FILE")
  ok "Memory node created: ${NODE_ID}"
  ok "Label: Photo: ${FILENAME}"
  ok "Tags: photo-import, ocr, glm-ocr"
else
  # Include the server's response body in the error when there is one.
  if [[ -s "$RESPONSE_FILE" ]]; then
    BODY=$(<"$RESPONSE_FILE")
  else
    BODY="(no body)"
  fi
  die "Soul returned HTTP ${HTTP_STATUS}: ${BODY}"
fi