neuron/tools/memory-import-refugee.sh

#!/usr/bin/env bash
# memory-import-refugee.sh — Import conversation/memory history from external apps into Neuron
#
# Usage:
#   ./tools/memory-import-refugee.sh --format chatgpt    conversations.json
#   ./tools/memory-import-refugee.sh --format screenpipe screenpipe-export.json
#   ./tools/memory-import-refugee.sh --format generic    data.json[l]
#
# Supported formats:
#   chatgpt    — ChatGPT conversation export (conversations.json)
#   screenpipe — Screenpipe OCR export (frames array)
#   generic    — Any JSON array or JSONL with content/text fields
#
# The script writes Memory nodes to the Neuron soul via its HTTP API.
# The soul must be running on localhost:7770.

# Strict mode: stop on a failing command, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ── Config ─────────────────────────────────────────────────────────────────────
# Pause between consecutive API calls, in milliseconds (rate limiting).
SLEEP_MS=100
# Base URL of the Neuron soul HTTP service.
SOUL_HOST="http://localhost:7770"
# Endpoint that memory nodes are POSTed to.
# Note: POST /api/neuron/memory ignores the label field (soul hardcodes
# "memory:remembered"), so the label is embedded in the content prefix to keep
# it searchable.
MEMORY_API="${SOUL_HOST}/api/neuron/memory"

# ── Dependency check ───────────────────────────────────────────────────────────
# jq parses the export files and builds request payloads; curl performs every
# HTTP call to the soul. Both are checked up front: without this, a missing
# curl would only surface later as a misleading "soul is not responding" error.
if ! command -v jq &>/dev/null; then
  echo "ERROR: jq is required but not installed." >&2
  echo "" >&2
  echo "Install it with:" >&2
  echo "  macOS:  brew install jq" >&2
  echo "  Ubuntu: sudo apt-get install jq" >&2
  echo "  Alpine: apk add jq" >&2
  exit 1
fi

if ! command -v curl &>/dev/null; then
  echo "ERROR: curl is required but not installed." >&2
  echo "" >&2
  echo "Install it with:" >&2
  echo "  macOS:  brew install curl" >&2
  echo "  Ubuntu: sudo apt-get install curl" >&2
  echo "  Alpine: apk add curl" >&2
  exit 1
fi

# ── Parse args ─────────────────────────────────────────────────────────────────
FORMAT=""
INPUT_FILE=""

# parse_args ARGS...
#
# Fills the globals FORMAT and INPUT_FILE from the command line.
#   --format VALUE | -f VALUE | --format=VALUE | -f=VALUE   select the importer
#   --help | -h                                             print usage, exit 0
#   --                                                      end of options; what
#                                                           follows is the file,
#                                                           even if it starts
#                                                           with "-"
# Exactly one positional argument (the input file) is accepted.
# Exits with status 1 on an unknown option, on a second positional argument, or
# when --format/-f is given without a value.
parse_args() {
  local usage="Usage: $0 --format <chatgpt|screenpipe|generic> <input-file>"
  local positional_only=0

  while [[ $# -gt 0 ]]; do
    if [[ "$positional_only" -eq 0 ]]; then
      case "$1" in
        --format|-f)
          # Guard explicitly: under `set -u` a bare "$2" would abort with an
          # unhelpful "unbound variable" message.
          if [[ $# -lt 2 ]]; then
            echo "ERROR: $1 requires a value." >&2
            echo "$usage" >&2
            exit 1
          fi
          FORMAT="$2"
          shift 2
          continue
          ;;
        --format=*|-f=*)
          FORMAT="${1#*=}"
          shift
          continue
          ;;
        --help|-h)
          echo "$usage"
          exit 0
          ;;
        --)
          positional_only=1
          shift
          continue
          ;;
        -*)
          echo "Unknown option: $1" >&2
          echo "$usage" >&2
          exit 1
          ;;
      esac
    fi

    # Anything that reaches this point is a positional argument.
    if [[ -z "$INPUT_FILE" ]]; then
      INPUT_FILE="$1"
    else
      echo "Unexpected argument: $1" >&2
      exit 1
    fi
    shift
  done
}

parse_args "$@"

# Validate the parsed arguments, in order: a format was given, an input file
# was given, the input file exists, and the format is one we support.
# Each failure prints its message(s) to stderr and exits with status 1.
[[ -n "$FORMAT" ]] || {
  printf '%s\n' \
    "ERROR: --format is required." \
    "Usage: $0 --format <chatgpt|screenpipe|generic> <input-file>" >&2
  exit 1
}

[[ -n "$INPUT_FILE" ]] || {
  printf '%s\n' \
    "ERROR: No input file specified." \
    "Usage: $0 --format <chatgpt|screenpipe|generic> <input-file>" >&2
  exit 1
}

[[ -f "$INPUT_FILE" ]] || {
  printf '%s\n' "ERROR: Input file not found: $INPUT_FILE" >&2
  exit 1
}

if [[ ! "$FORMAT" =~ ^(chatgpt|screenpipe|generic)$ ]]; then
  printf '%s\n' \
    "ERROR: Unknown format: $FORMAT" \
    "Supported formats: chatgpt, screenpipe, generic" >&2
  exit 1
fi

# ── Soul health check ──────────────────────────────────────────────────────────
# Any HTTP status counts as "alive"; only a transport-level failure aborts.
# curl's -w "%{http_code}" already prints 000 when no response was received, so
# the fallback is applied by assignment instead of echoing a second "000" into
# the same capture (that produced "000000", which slipped past the check below
# and let the import start against a dead service).
# The timeouts keep an unresponsive host from hanging the script.
HTTP_CODE="$(curl -s -o /dev/null --connect-timeout 5 --max-time 15 \
  -w "%{http_code}" "$MEMORY_API" 2>/dev/null)" || HTTP_CODE="000"
if [[ -z "$HTTP_CODE" || "$HTTP_CODE" == "000" ]]; then
  echo "ERROR: Neuron soul is not responding at ${SOUL_HOST}." >&2
  echo "       Start the soul service and retry." >&2
  exit 1
fi

# ── Counters ───────────────────────────────────────────────────────────────────
# Running totals, updated while importing and printed in the final report.
ERRORS=0     # posts the API did not acknowledge; non-zero makes the script exit 1
SKIPPED=0    # records dropped because their content was empty
IMPORTED=0   # posts the API acknowledged with ok == true

# ── Helper: post one memory node ───────────────────────────────────────────────
# post_memory CONTENT LABEL TAGS_JSON
#
# Arguments:
#   CONTENT   - memory text; an empty string or the literal "null" is skipped
#   LABEL     - short title; sent as `label` and also prefixed to the content
#   TAGS_JSON - JSON array literal, used verbatim as the `tags` field
# Globals:
#   reads MEMORY_API and SLEEP_MS; increments exactly one of IMPORTED, SKIPPED
#   or ERRORS per call
# Returns:
#   always 0 — a failed request is counted and logged to stderr, it never
#   aborts the surrounding import loop.
#
# Note: the soul's POST /api/neuron/memory API ignores the label field (hardcodes
# it to "memory:remembered"). We embed the label as a prefix in the content so
# the title remains searchable via recall/search.
post_memory() {
  local content="$1"
  local label="$2"
  local tags_json="$3"

  # Skip empty content
  if [[ -z "$content" || "$content" == "null" ]]; then
    SKIPPED=$((SKIPPED + 1))
    return 0
  fi

  # Embed label in content so it's searchable (the API ignores the label field)
  local full_content="[${label}] ${content}"

  local payload
  if ! payload="$(jq -n \
    --arg content "$full_content" \
    --arg label "$label" \
    --argjson tags "$tags_json" \
    '{content: $content, label: $label, tags: $tags}')"; then
    ERRORS=$((ERRORS + 1))
    echo "  [ERROR] Could not build payload for label \"${label:0:60}\"" >&2
    return 0
  fi

  # `|| ...` on both captures: under `set -e`/`pipefail` a refused connection
  # or a non-JSON reply would otherwise terminate the whole script right here,
  # before the error could be counted or the final report printed.
  local response
  response="$(curl -s -X POST "$MEMORY_API" \
    -H "Content-Type: application/json" \
    -d "$payload" 2>/dev/null)" || response=""

  local ok
  ok="$(jq -r '.ok // "false"' <<<"$response" 2>/dev/null)" || ok="false"

  if [[ "$ok" == "true" ]]; then
    IMPORTED=$((IMPORTED + 1))
  else
    ERRORS=$((ERRORS + 1))
    echo "  [ERROR] API error for label \"${label:0:60}\": $response" >&2
  fi

  # Rate limit: convert SLEEP_MS to seconds with a zero-padded millisecond part
  # (100 -> 0.100, 50 -> 0.050, 1500 -> 1.500). Plain "0.${SLEEP_MS}" is only
  # correct for three-digit values.
  local delay
  printf -v delay '%d.%03d' "$((SLEEP_MS / 1000))" "$((SLEEP_MS % 1000))"
  sleep "$delay"
}

# ── Format: ChatGPT ────────────────────────────────────────────────────────────
# import_chatgpt
#
# Imports every non-empty text part of every user-authored message found in the
# ChatGPT export at $INPUT_FILE. Each part becomes one memory, labelled with the
# conversation title (first 100 characters, "Untitled" when absent) and tagged
# "chatgpt-import" / "conversation".
#
# Globals: reads INPUT_FILE; counters are updated through post_memory.
# Exits with status 1 when the file is not a top-level JSON array.
import_chatgpt() {
  echo "Format: ChatGPT conversation export"

  # Validate: must be JSON array at top level. `|| top_type=""` routes a jq
  # parse failure to the error message below instead of a silent `set -e` exit.
  local top_type
  top_type="$(jq -r 'type' "$INPUT_FILE" 2>/dev/null)" || top_type=""
  if [[ "$top_type" != "array" ]]; then
    echo "ERROR: ChatGPT export must be a JSON array of conversations." >&2
    exit 1
  fi

  local conv_count
  conv_count="$(jq 'length' "$INPUT_FILE")"
  echo "Found ${conv_count} conversation(s) to process."
  echo ""

  # jq filter applied to ONE conversation object, yielding its user text parts.
  # ChatGPT exports use a mapping dict:
  #   { uuid: { id, message: { author: { role }, content: { parts: [...] } } } }
  # We walk the mapping values, keep role=user, and keep non-empty string parts.
  local user_text_filter='
    (.mapping // {}) |
    to_entries[] |
    .value.message |
    select(. != null) |
    select(.author.role == "user") |
    (.content.parts // []) |
    .[] |
    select(type == "string" and length > 0)
  '

  # Count total user messages for progress display
  local total_msgs
  total_msgs="$(jq "[.[] | ${user_text_filter}] | length" "$INPUT_FILE" 2>/dev/null || echo "?")"
  echo "Total user messages: ${total_msgs}"
  echo ""

  local msg_idx=0
  local conv_json title label msg_json msg_text

  # Process each conversation
  while IFS= read -r conv_json; do
    title="$(jq -r '.title // "Untitled"' <<<"$conv_json" 2>/dev/null)" || title="Untitled"

    # Truncate label to 100 chars
    label="${title:0:100}"

    # Messages are emitted as JSON-encoded strings (-c), one per line, and
    # decoded individually. Emitting raw text (-r) would break a multi-line
    # message into one memory per line.
    while IFS= read -r msg_json; do
      msg_text="$(jq -r '.' <<<"$msg_json" 2>/dev/null)" || msg_text=""
      msg_idx=$((msg_idx + 1))
      echo "  Importing ${msg_idx}/${total_msgs}..."
      post_memory "$msg_text" "$label" '["chatgpt-import","conversation"]'
    done < <(jq -c "$user_text_filter" <<<"$conv_json" 2>/dev/null)

  done < <(jq -c '.[]' "$INPUT_FILE")
}

# ── Format: Screenpipe ─────────────────────────────────────────────────────────
# import_screenpipe
#
# Imports OCR text from the Screenpipe export at $INPUT_FILE. Frames with
# non-empty text are grouped per app_name and, within an app, per 5-minute
# window; each group becomes one memory whose content is the group's texts
# joined by spaces and truncated to 2000 characters.
#
# Label: "Screenpipe: <app> at <timestamp of the last frame in the group>"
# Tags:  ["screenpipe-import","screen-capture",<app>]
#
# Globals: reads INPUT_FILE; counters are updated through post_memory.
# Exits with status 1 when the top level is not a JSON object or the frames
# cannot be processed by jq.
import_screenpipe() {
  echo "Format: Screenpipe OCR export"

  # Validate: must have frames array. `|| top_type=""` routes a jq parse
  # failure to the error message below instead of a silent `set -e` exit.
  local top_type
  top_type="$(jq -r 'type' "$INPUT_FILE" 2>/dev/null)" || top_type=""
  if [[ "$top_type" != "object" ]]; then
    echo "ERROR: Screenpipe export must be a JSON object with a 'frames' array." >&2
    exit 1
  fi

  local frame_count
  frame_count="$(jq '.frames | length' "$INPUT_FILE" 2>/dev/null || echo "0")"
  echo "Found ${frame_count} frame(s) to process."

  if [[ "$frame_count" == "0" ]]; then
    echo "No frames found. Nothing to import."
    return 0
  fi

  # Build all groups in a single jq pass, one compact JSON object per line:
  #   { app, ts, content }
  # - Timestamps are stringified first, so numeric epochs and ISO-8601 strings
  #   both work (jq's test() raises an error on a non-string input).
  # - Epoch bucket: floor(seconds / 300).
  # - ISO bucket: everything up to the hour plus floor(minute / 5). The leading
  #   zero of the minute is stripped before the numeric conversion.
  # - Frames are sorted by timestamp within an app; a new group starts whenever
  #   the bucket changes.
  # - Truncation happens inside jq so it counts characters of the whole
  #   content, not bytes of each line.
  local groups
  if ! groups="$(jq -c '
    def bucket_key:
      if test("^[0-9]+(\\.[0-9]+)?$") then
        (tonumber / 300 | floor | tostring)
      else
        (.[0:14] + ((.[14:16] | ltrimstr("0") | tonumber? // 0) / 5 | floor | tostring))
      end;

    .frames |
    map(select(.text != null and (.text | length) > 0)) |
    group_by(.app_name) |
    .[] |
    (.[0].app_name) as $app |
    sort_by(.timestamp) |
    reduce .[] as $f (
      {bucket: null, ts: null, texts: [], groups: []};
      (($f.timestamp // "") | tostring) as $ts |
      ($ts | bucket_key) as $bucket |
      if .bucket == null or .bucket == $bucket then
        {bucket: $bucket, ts: $ts, texts: (.texts + [$f.text]), groups: .groups}
      else
        {bucket: $bucket, ts: $ts, texts: [$f.text],
         groups: (.groups + [{app: $app, ts: .ts, texts: .texts}])}
      end
    ) |
    # flush last bucket
    (.groups + [{app: $app, ts: .ts, texts: .texts}]) |
    .[] |
    select(.texts | length > 0) |
    {app: .app, ts: .ts, content: (.texts | map(tostring) | join(" ") | .[0:2000])}
  ' "$INPUT_FILE")"; then
    echo "ERROR: Could not process Screenpipe frames in $INPUT_FILE." >&2
    exit 1
  fi

  local group_json
  local total_groups=0
  while IFS= read -r group_json; do
    if [[ -n "$group_json" ]]; then
      total_groups=$((total_groups + 1))
    fi
  done <<<"$groups"

  local group_idx=0
  local app_name ts_str content label tags_json
  while IFS= read -r group_json; do
    # A here-string of an empty result still yields one empty line.
    [[ -n "$group_json" ]] || continue

    group_idx=$((group_idx + 1))
    echo "  Importing ${group_idx}/${total_groups}..."

    app_name="$(jq -r '.app // "unknown"' <<<"$group_json")"
    ts_str="$(jq -r '.ts // ""' <<<"$group_json")"
    content="$(jq -r '.content' <<<"$group_json")"
    label="Screenpipe: ${app_name} at ${ts_str:0:16}"

    tags_json="$(jq -cn --arg app "$app_name" '["screenpipe-import","screen-capture",$app]')"

    post_memory "$content" "$label" "$tags_json"
  done <<<"$groups"
}

# ── Format: Generic ────────────────────────────────────────────────────────────
# import_generic
#
# Imports records from $INPUT_FILE, which may be a JSON array, a single JSON
# object (compact or pretty-printed) or JSONL.
#   content: first string-valued field among content, text, body, message
#   label:   first string-valued field among title, label, name; otherwise the
#            first 80 characters of the content (always capped at 100)
# Records without usable content, and lines that are not valid JSON, are counted
# in SKIPPED rather than aborting the import.
#
# Globals: reads INPUT_FILE; increments SKIPPED; other counters are updated
# through post_memory. Installs an EXIT trap for the temp file while it runs
# and clears it again before returning.
import_generic() {
  echo "Format: Generic JSON/JSONL"

  local records_file
  records_file="$(mktemp)"
  # EXIT rather than RETURN so the temp file is also removed when the script
  # dies mid-import. The path is expanded now because the local variable is
  # gone by the time an EXIT trap would normally run.
  trap "rm -f -- $(printf '%q' "$records_file")" EXIT

  # Normalise to one compact JSON value per line. jq reads a stream of
  # top-level values, so this one call covers arrays (exploded into their
  # elements), single objects in any layout, and well-formed JSONL.
  if ! jq -c 'if type == "array" then .[] else . end' "$INPUT_FILE" >"$records_file" 2>/dev/null; then
    # Not valid as a whole: fall back to line-by-line JSONL so that the good
    # lines are still imported and the bad ones are skipped individually.
    echo "WARN: Input is not fully valid JSON; treating it as JSONL and skipping malformed lines." >&2
    cp -- "$INPUT_FILE" "$records_file"
  fi

  # Count non-empty lines; unlike `wc -l` this includes a final line that has
  # no trailing newline. grep exits 1 on a zero count, hence `|| true`.
  local total_records
  total_records="$(grep -c . "$records_file" || true)"
  echo "Found ${total_records} record(s) to process."
  echo ""

  local idx=0
  local record_json content label
  # `|| [[ -n ... ]]` keeps a last line that lacks a trailing newline.
  while IFS= read -r record_json || [[ -n "$record_json" ]]; do
    [[ -n "$record_json" ]] || continue

    idx=$((idx + 1))
    echo "  Importing ${idx}/${total_records}..."

    # Extract content: prefer 'content', fall back to 'text', then 'body', then
    # 'message'. A jq failure (malformed line, non-object record) yields an
    # empty string, which is then counted as skipped.
    content="$(jq -r '
      [.content, .text, .body, .message] |
      map(select(type == "string")) |
      .[0] // ""
    ' <<<"$record_json" 2>/dev/null)" || content=""

    if [[ -z "$content" || "$content" == "null" ]]; then
      SKIPPED=$((SKIPPED + 1))
      continue
    fi

    # Extract label: prefer 'title', then 'label', then 'name', then first 80
    # chars of content
    label="$(jq -r '
      [.title, .label, .name] |
      map(select(type == "string")) |
      .[0] // ""
    ' <<<"$record_json" 2>/dev/null)" || label=""

    if [[ -z "$label" || "$label" == "null" ]]; then
      label="${content:0:80}"
    fi
    label="${label:0:100}"

    post_memory "$content" "$label" '["imported","generic"]'

  done <"$records_file"

  rm -f -- "$records_file"
  trap - EXIT
}

# ── Main ───────────────────────────────────────────────────────────────────────
# Banner describing the run.
printf '%s\n' \
  "Neuron Refugee Importer" \
  "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" \
  "Source:  $INPUT_FILE" \
  "Format:  $FORMAT" \
  "Soul:    $SOUL_HOST" \
  ""

# Hand over to the importer that matches the requested format.
if [[ "$FORMAT" == "chatgpt" ]]; then
  import_chatgpt
elif [[ "$FORMAT" == "screenpipe" ]]; then
  import_screenpipe
elif [[ "$FORMAT" == "generic" ]]; then
  import_generic
fi

# ── Final report ───────────────────────────────────────────────────────────────
# Summarise the counters; any recorded error turns into exit status 1.
printf '%s\n' \
  "" \
  "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" \
  "Import complete." \
  "  Imported: ${IMPORTED}" \
  "  Skipped:  ${SKIPPED}" \
  "  Errors:   ${ERRORS}" \
  "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

if (( ERRORS > 0 )); then
  exit 1
fi