infrastructure/scripts/runpod-provision.sh

#!/usr/bin/env bash
# runpod-provision.sh — On-demand RunPod inference pod provisioner
#
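# Usage:
#   ./runpod-provision.sh --gpu-type "NVIDIA GeForce RTX 4090" \
#                         --model "meta-llama/Meta-Llama-3-8B-Instruct" \
#                         --replicas 2 \
#                         [--quantization awq] \
#                         [--max-model-len 8192] \
#                         [--tensor-parallel 1] \
#                         [--gpu-count 1] \
#                         [--cloud-type SECURE] \
#                         [--served-model-name NAME] \
#                         [--volume-gb 80] \
#                         [--wait-timeout 600] \
#                         [--dry-run] \
#                         [-h|--help]
#
# Every flag is optional; omitted flags fall back to the defaults set in the
# script. Pass --quantization "" to run without a --quantization flag.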
#
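# Env vars (taken from the environment, with
# ~/Secrets/credentials/infrastructure.env as a fallback source):
#   RUNPOD_API_KEY   — required; RunPod API key (from Vault secret/ai)
#   HF_TOKEN         — optional; HuggingFace token, needed for gated model
#                      downloads (the script only warns when it is missing)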
#
# The script does NOT execute automatically; run it explicitly when you need pods.
# It outputs one endpoint URL per replica to stdout, one per line.

set -euo pipefail

# ---------------------------------------------------------------------------
# Defaults
# ---------------------------------------------------------------------------
GPU_TYPE="NVIDIA GeForce RTX 4090"
MODEL="meta-llama/Meta-Llama-3-8B-Instruct"
REPLICAS=1
QUANTIZATION="awq"
MAX_MODEL_LEN=8192
TENSOR_PARALLEL_SIZE=1
GPU_MEMORY_UTILIZATION="0.92"
CLOUD_TYPE="SECURE"
SERVED_MODEL_NAME=""
GPU_COUNT=1
VOLUME_GB=80
CONTAINER_DISK_GB=30
MIN_VCPU=8
MIN_MEMORY_GB=32
DRY_RUN=false
WAIT_TIMEOUT=600   # seconds to wait for RUNNING state

RUNPOD_API="https://api.runpod.io/graphql"

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
log()  { printf '[%s] %s\n' "$(date -u +%H:%M:%S)" "$*" >&2; }
die()  { log "ERROR: $*"; exit 1; }

require_cmd() { command -v "$1" &>/dev/null || die "'$1' not found — install it first"; }

# ---------------------------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------------------------
while [[ $# -gt 0 ]]; do
  case "$1" in
    --gpu-type)            GPU_TYPE="$2";              shift 2 ;;
    --model)               MODEL="$2";                 shift 2 ;;
    --replicas)            REPLICAS="$2";              shift 2 ;;
    --quantization)        QUANTIZATION="$2";          shift 2 ;;
    --max-model-len)       MAX_MODEL_LEN="$2";         shift 2 ;;
    --tensor-parallel)     TENSOR_PARALLEL_SIZE="$2";  shift 2 ;;
    --gpu-count)           GPU_COUNT="$2";             shift 2 ;;
    --cloud-type)          CLOUD_TYPE="$2";            shift 2 ;;
    --served-model-name)   SERVED_MODEL_NAME="$2";     shift 2 ;;
    --volume-gb)           VOLUME_GB="$2";             shift 2 ;;
    --wait-timeout)        WAIT_TIMEOUT="$2";          shift 2 ;;
    --dry-run)             DRY_RUN=true;               shift   ;;
    -h|--help)
      grep '^#' "$0" | grep -v '^#!/' | sed 's/^# \?//'
      exit 0
      ;;
    *) die "Unknown argument: $1" ;;
  esac
done

# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
require_cmd curl
require_cmd jq

# Try loading infrastructure secrets if not already set
if [[ -z "${RUNPOD_API_KEY:-}" ]]; then
  INFRA_ENV="$HOME/Secrets/credentials/infrastructure.env"
  if [[ -f "$INFRA_ENV" ]]; then
    set -a; source "$INFRA_ENV"; set +a
  fi
fi

[[ -n "${RUNPOD_API_KEY:-}" ]] || die "RUNPOD_API_KEY not set. Export it or ensure ~/Secrets/credentials/infrastructure.env is present."
[[ -n "${HF_TOKEN:-}" ]]       || log "WARNING: HF_TOKEN not set — gated models (Llama etc.) will fail to download."

# Derive served model name from model ID if not provided
if [[ -z "$SERVED_MODEL_NAME" ]]; then
  SERVED_MODEL_NAME="$(basename "$MODEL" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-')"
fi

# Build docker args (omit --quantization flag if value is empty)
DOCKER_ARGS="--host 0.0.0.0 --port 8000 --model \$MODEL_ID"
[[ -n "$QUANTIZATION" ]] && DOCKER_ARGS="$DOCKER_ARGS --quantization \$QUANTIZATION"
DOCKER_ARGS="$DOCKER_ARGS --max-model-len \$MAX_MODEL_LEN --tensor-parallel-size \$TENSOR_PARALLEL_SIZE --gpu-memory-utilization \$GPU_MEMORY_UTILIZATION --served-model-name \$SERVED_MODEL_NAME"

# ---------------------------------------------------------------------------
# GraphQL helpers
# ---------------------------------------------------------------------------
gql() {
  local query="$1"
  curl -s --fail \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer ${RUNPOD_API_KEY}" \
    -d "{\"query\": $(jq -Rn --arg q "$query" '$q')}" \
    "$RUNPOD_API"
}

create_pod() {
  local pod_name="$1"
  local mutation
  mutation=$(cat <<MUTATION
mutation {
  podFindAndDeployOnDemand(input: {
    name: "$pod_name"
    imageName: "vllm/vllm-openai:latest"
    gpuTypeId: "$GPU_TYPE"
    cloudType: $CLOUD_TYPE
    gpuCount: $GPU_COUNT
    volumeInGb: $VOLUME_GB
    containerDiskInGb: $CONTAINER_DISK_GB
    minVcpuCount: $MIN_VCPU
    minMemoryInGb: $MIN_MEMORY_GB
    ports: "8000/http"
    env: [
      { key: "MODEL_ID",               value: "$MODEL" }
      { key: "QUANTIZATION",           value: "$QUANTIZATION" }
      { key: "MAX_MODEL_LEN",          value: "$MAX_MODEL_LEN" }
      { key: "TENSOR_PARALLEL_SIZE",   value: "$TENSOR_PARALLEL_SIZE" }
      { key: "GPU_MEMORY_UTILIZATION", value: "$GPU_MEMORY_UTILIZATION" }
      { key: "SERVED_MODEL_NAME",      value: "$SERVED_MODEL_NAME" }
      { key: "HF_TOKEN",               value: "${HF_TOKEN:-}" }
    ]
    dockerArgs: "$DOCKER_ARGS"
  }) {
    id
    name
    desiredStatus
  }
}
MUTATION
)
  gql "$mutation"
}

wait_for_running() {
  local pod_id="$1"
  local deadline=$(( $(date +%s) + WAIT_TIMEOUT ))
  log "Waiting for pod $pod_id to reach RUNNING state (timeout ${WAIT_TIMEOUT}s)..."

  while true; do
    local resp
    resp=$(gql "{ pod(input: { podId: \"$pod_id\" }) { id desiredStatus runtime { ports { ip isIpPublic privatePort publicPort type } } } }")
    local status
    status=$(printf '%s' "$resp" | jq -r '.data.pod.desiredStatus // "UNKNOWN"')

    if [[ "$status" == "RUNNING" ]]; then
      local endpoint
      endpoint=$(printf '%s' "$resp" | jq -r '
        .data.pod.runtime.ports[]?
        | select(.privatePort == 8000 and .isIpPublic == true)
        | "https://\(.ip):\(.publicPort)"
      ' 2>/dev/null || true)

      # Fall back to RunPod proxy URL if no public port exposed
      if [[ -z "$endpoint" ]]; then
        endpoint="https://${pod_id}-8000.proxy.runpod.net"
      fi

      printf '%s' "$endpoint"
      return 0
    fi

    if [[ $(date +%s) -ge $deadline ]]; then
      die "Timed out waiting for pod $pod_id — last status: $status"
    fi

    log "  pod $pod_id status: $status — retrying in 15s..."
    sleep 15
  done
}

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
log "RunPod inference provisioner"
log "  GPU type  : $GPU_TYPE"
log "  Model     : $MODEL"
log "  Replicas  : $REPLICAS"
log "  Quant     : ${QUANTIZATION:-none}"
log "  TP size   : $TENSOR_PARALLEL_SIZE"
log "  Cloud     : $CLOUD_TYPE"
log "  Dry run   : $DRY_RUN"

TIMESTAMP=$(date -u +%Y%m%d%H%M%S)
declare -a POD_IDS=()

for i in $(seq 1 "$REPLICAS"); do
  POD_NAME="neuron-inf-${TIMESTAMP}-r${i}"

  if $DRY_RUN; then
    log "[dry-run] Would create pod: $POD_NAME"
    continue
  fi

  log "Creating pod $i/$REPLICAS: $POD_NAME ..."
  RESP=$(create_pod "$POD_NAME")
  ERR=$(printf '%s' "$RESP" | jq -r '.errors[0].message // empty')
  [[ -n "$ERR" ]] && die "RunPod API error: $ERR"

  POD_ID=$(printf '%s' "$RESP" | jq -r '.data.podFindAndDeployOnDemand.id')
  [[ -z "$POD_ID" || "$POD_ID" == "null" ]] && die "No pod ID returned. Full response: $RESP"

  log "  Created pod ID: $POD_ID"
  POD_IDS+=("$POD_ID")
done

if $DRY_RUN; then
  log "[dry-run] Complete — no pods were created."
  exit 0
fi

log ""
log "Waiting for all $REPLICAS pod(s) to become RUNNING..."
declare -a ENDPOINTS=()
for POD_ID in "${POD_IDS[@]}"; do
  EP=$(wait_for_running "$POD_ID")
  ENDPOINTS+=("$EP")
  log "  Pod $POD_ID ready: $EP"
done

log ""
log "=== Inference endpoints ==="
for EP in "${ENDPOINTS[@]}"; do
  printf '%s\n' "$EP"
done