neuron/tools/photo-to-memory.sh

#!/usr/bin/env bash
# photo-to-memory.sh — OCR a document/photo and store the text in Neuron memory
#
# Uses GLM-OCR (0.9B, MIT) via mlx-vlm on Apple Silicon.
# Model auto-downloads ~1.59 GB to ~/.cache/huggingface/ on first run.
#
# Usage:
#   ./tools/photo-to-memory.sh <image-file> [--dry-run] [--prompt "custom prompt"] [--model <model-id>]
#
# Prerequisites:
#   pip install -U mlx-vlm
#
# Examples:
#   ./tools/photo-to-memory.sh ~/Desktop/receipt.jpg
#   ./tools/photo-to-memory.sh ~/Documents/contract.png --dry-run
#   ./tools/photo-to-memory.sh scan.jpg --prompt "Extract all text from this receipt"

# Strict mode: stop on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# ── Config ─────────────────────────────────────────────────────────────────────
# Each setting keeps a non-empty value inherited from the environment and
# otherwise takes the default given here.
: "${SOUL_URL:=http://localhost:7770}"
: "${GLM_MODEL:=mlx-community/GLM-OCR-8bit}"
: "${MAX_TOKENS:=4096}"
# Prompt used when no custom prompt is supplied on the command line.
DEFAULT_PROMPT='Extract all text from this document. Preserve structure including tables, headers, and lists. Output plain text.'

# ── Colours ────────────────────────────────────────────────────────────────────
# ANSI escape sequences used by the message helpers; RESET restores defaults.
RED=$'\e[0;31m'
GREEN=$'\e[0;32m'
YELLOW=$'\e[1;33m'
CYAN=$'\e[0;36m'
BOLD=$'\e[1m'
RESET=$'\e[0m'

# Print an informational message in cyan on stdout.
log() {
    printf '%s\n' "${CYAN}$*${RESET}"
}

# Print a success message in green, prefixed with a check mark, on stdout.
ok() {
    printf '%s\n' "${GREEN}✓ $*${RESET}"
}

# Print a warning in yellow, prefixed with a warning sign, on stdout.
warn() {
    printf '%s\n' "${YELLOW}⚠ $*${RESET}"
}

# Print an error in red, prefixed with a cross, on stderr and exit with status 1.
die() {
    printf '%s\n' "${RED}✗ $*${RESET}" >&2
    exit 1
}

# ── Parse args ─────────────────────────────────────────────────────────────────
# Defaults; parse_args overwrites them from the command line.
IMAGE_PATH=""
DRY_RUN=0
CUSTOM_PROMPT=""

#######################################
# Parse command-line arguments into the globals declared above.
# Globals:   IMAGE_PATH, DRY_RUN, CUSTOM_PROMPT, GLM_MODEL (written)
# Arguments: the script's positional parameters
# Outputs:   --help/-h prints this script's header comment to stdout
# Returns:   exits 0 after --help; exits through die on an unknown option,
#            on an option that is missing its value, or on a second image path
#######################################
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --dry-run)
                DRY_RUN=1
                shift
                ;;
            --prompt)
                # Check before touching $2: under `set -u` a missing value would
                # otherwise abort with an opaque "unbound variable" error.
                [[ $# -ge 2 ]] || die "Option $1 requires an argument"
                CUSTOM_PROMPT="$2"
                shift 2
                ;;
            --model)
                [[ $# -ge 2 ]] || die "Option $1 requires an argument"
                GLM_MODEL="$2"
                shift 2
                ;;
            --help|-h)
                # Print the leading comment block (skipping any shebang) rather
                # than a fixed line range, so the help text never gets cut off
                # when the header grows or shifts.
                awk '/^#!/ { next }
                     /^#/  { sub(/^# ?/, ""); print; seen = 1; next }
                     seen  { exit }' "$0"
                exit 0
                ;;
            -*)
                die "Unknown option: $1"
                ;;
            *)
                if [[ -n "$IMAGE_PATH" ]]; then
                    die "Only one image file at a time"
                fi
                IMAGE_PATH="$1"
                shift
                ;;
        esac
    done
}

parse_args "$@"

# An image path is mandatory and must name an existing regular file.
[[ -n "$IMAGE_PATH" ]] || die "Usage: $0 <image-file> [--dry-run] [--prompt \"...\"]"
[[ -f "$IMAGE_PATH" ]] || die "File not found: $IMAGE_PATH"

#######################################
# Print the absolute path of a file.
# Uses realpath when it is installed; otherwise resolves the containing
# directory with `cd` + `pwd -P` and appends the file name, so the script does
# not depend on realpath being present.
# Arguments: $1 - path to an existing file
# Outputs:   absolute path on stdout
# Returns:   non-zero if the path cannot be resolved
#######################################
resolve_abs_path() {
    if command -v realpath >/dev/null 2>&1; then
        realpath "$1"
    else
        local dir
        dir=$(cd -- "$(dirname -- "$1")" && pwd -P) || return 1
        # ${dir%/} avoids a doubled slash when the directory is "/".
        printf '%s/%s\n' "${dir%/}" "$(basename -- "$1")"
    fi
}

# An empty --prompt value falls back to the default prompt.
PROMPT="${CUSTOM_PROMPT:-$DEFAULT_PROMPT}"
FILENAME=$(basename "$IMAGE_PATH")
ABS_PATH=$(resolve_abs_path "$IMAGE_PATH") || die "Cannot resolve path: $IMAGE_PATH"

# ── Check runtime ───────────────────────────────────────────────────────────────
# Fail early with a clear message if the interpreter itself is missing.
command -v python3 >/dev/null 2>&1 || die "python3 not found on PATH"

# Install mlx-vlm on demand. The install goes through `python3 -m pip` so the
# package lands in the same interpreter that the import check (and the later
# OCR run) uses; a bare `pip` may belong to a different Python installation.
if ! python3 -c "import mlx_vlm" 2>/dev/null; then
    warn "mlx-vlm not installed. Installing now..."
    python3 -m pip install -q -U mlx-vlm \
        || die "pip install mlx-vlm failed — run manually: python3 -m pip install -U mlx-vlm"
fi

# ── Run GLM-OCR ─────────────────────────────────────────────────────────────────
log "Running GLM-OCR on: $FILENAME"
log "Model: $GLM_MODEL"
if [[ "$DRY_RUN" -eq 1 ]]; then
    warn "Dry-run mode — will not post to Neuron"
fi

# The generator's stdout is captured as the OCR result. Its stderr is kept off
# the terminal on success, but saved to a scratch file so that a failure can be
# diagnosed instead of being silently discarded.
# First run downloads ~1.59 GB — this is expected and cached thereafter.
OCR_STDERR=$(mktemp "${TMPDIR:-/tmp}/photo-to-memory-ocr.XXXXXX") \
    || die "Could not create a temporary file for GLM-OCR diagnostics"

if ! OCR_TEXT=$(python3 -m mlx_vlm.generate \
        --model "$GLM_MODEL" \
        --max-tokens "$MAX_TOKENS" \
        --temperature 0.0 \
        --prompt "$PROMPT" \
        --image "$ABS_PATH" \
        2>"$OCR_STDERR"); then
    if [[ -s "$OCR_STDERR" ]]; then
        printf '%s--- GLM-OCR stderr (last 20 lines) ---%s\n' "$BOLD" "$RESET" >&2
        tail -n 20 "$OCR_STDERR" >&2 || true
    fi
    # Remove the scratch file by hand: die exits immediately.
    rm -f -- "$OCR_STDERR"
    die "GLM-OCR failed. Check that mlx-vlm is installed and the image is readable."
fi
rm -f -- "$OCR_STDERR"

CHAR_COUNT=${#OCR_TEXT}
log "OCR complete — extracted ${CHAR_COUNT} characters"

# Warn, but carry on, when the result is suspiciously short.
if [[ "$CHAR_COUNT" -lt 5 ]]; then
    warn "Very short output — the image may be blank or unreadable"
fi

# ── Preview ─────────────────────────────────────────────────────────────────────
# Show at most the first 400 characters of the OCR text, then note how many
# characters were left out of the preview.
preview_limit=400
printf '\n%s--- OCR output preview (first 400 chars) ---%s\n' "${BOLD}" "${RESET}"
printf '%s\n' "${OCR_TEXT:0:preview_limit}"
if [[ "$CHAR_COUNT" -gt "$preview_limit" ]]; then
    hidden_chars=$(( CHAR_COUNT - preview_limit ))
    printf '%s... [+%d more chars]%s\n' "${YELLOW}" "$hidden_chars" "${RESET}"
fi
printf '\n'

# ── Post to Neuron soul ─────────────────────────────────────────────────────────
# In dry-run mode report what would have been sent and stop before any request.
if [[ "$DRY_RUN" -eq 1 ]]; then
    ok "Dry-run complete — would POST ${CHAR_COUNT} chars to ${SOUL_URL}/api/neuron/memory"
    exit 0
fi

log "Posting to Neuron soul at ${SOUL_URL} ..."

# Build the JSON body with python so the OCR text is escaped correctly.
PAYLOAD=$(python3 -c "
import json, sys
content = sys.argv[1]
label   = sys.argv[2]
tags    = ['photo-import', 'ocr', 'glm-ocr']
print(json.dumps({'content': content, 'label': label, 'tags': tags}))
" "$OCR_TEXT" "Photo: ${FILENAME}") || die "Failed to build the JSON payload"

# Store the response in a private, unpredictable temp file (not a fixed /tmp
# name) so a stale or attacker-planted file is never read, and remove it on
# every exit path.
RESPONSE_FILE=$(mktemp "${TMPDIR:-/tmp}/photo-to-memory-response.XXXXXX") \
    || die "Could not create a temporary file for the response"
trap 'rm -f -- "$RESPONSE_FILE"' EXIT

# -sS: no progress meter, but still print transport errors. A curl failure
# (e.g. connection refused) is reported explicitly instead of letting `set -e`
# end the script without any message.
HTTP_STATUS=$(curl -sS -o "$RESPONSE_FILE" -w "%{http_code}" \
    -X POST "${SOUL_URL}/api/neuron/memory" \
    -H "Content-Type: application/json" \
    -d "$PAYLOAD") \
    || die "Could not reach Neuron soul at ${SOUL_URL}/api/neuron/memory"

if [[ "$HTTP_STATUS" =~ ^2 ]]; then
    # Take the node id from the response if there is one; fall back to
    # 'unknown' when the body is not JSON or has neither key.
    NODE_ID=$(python3 -c "
import json, sys
try:
    with open(sys.argv[1]) as fh:
        d = json.load(fh)
    print(d.get('id', d.get('node_id', 'unknown')))
except Exception:
    print('unknown')
" "$RESPONSE_FILE")
    ok "Memory node created: ${NODE_ID}"
    ok "Label: Photo: ${FILENAME}"
    ok "Tags: photo-import, ocr, glm-ocr"
else
    BODY=$(cat -- "$RESPONSE_FILE" 2>/dev/null || true)
    [[ -n "$BODY" ]] || BODY="(no body)"
    die "Soul returned HTTP ${HTTP_STATUS}: ${BODY}"
fi