Files
Will Anderson 8eb88a3116 feat(runpod): add inference pod templates, nginx LB, and provisioner script
Infrastructure readiness for RunPod inference workloads:
- runpod-inference.yaml: ConfigMap with pod creation payloads for RTX 4090,
  A40 (single+dual), and custom templates
- runpod-lb-configmap.yaml: nginx least-conn load balancer for inference
  endpoint distribution (Deployment + ClusterIP Service)
- runpod-provision.sh: bash provisioner script — reads RUNPOD_API_KEY/HF_TOKEN,
  creates pods via GraphQL, polls until RUNNING, outputs endpoint URLs.
  Does NOT spin up any pods (dry-run flag available).
2026-04-25 01:20:55 -05:00

244 lines
8.1 KiB
Bash
Executable File

#!/usr/bin/env bash
# runpod-provision.sh — On-demand RunPod inference pod provisioner
#
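# Usage:
#   ./runpod-provision.sh --gpu-type "NVIDIA GeForce RTX 4090" \
#     --model "meta-llama/Meta-Llama-3-8B-Instruct" \
#     --replicas 2 \
#     [--quantization awq] \
#     [--max-model-len 8192] \
#     [--tensor-parallel 1] \
#     [--gpu-count 1] \
#     [--cloud-type SECURE] \
#     [--served-model-name NAME] \
#     [--volume-gb 80] \
#     [--wait-timeout 600] \
#     [--dry-run]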
#
# Env vars (~/Secrets/credentials/infrastructure.env is sourced only when
# RUNPOD_API_KEY is not already set in the environment):
# RUNPOD_API_KEY — required; RunPod API key (from Vault secret/ai)
# HF_TOKEN — optional; HuggingFace token, needed for gated model downloads
#
# The script does NOT execute automatically; run it explicitly when you need pods.
# It outputs one endpoint URL per replica to stdout, one per line.
# Strict mode: abort on errors, on unset variables, and on failures anywhere
# in a pipeline.
set -o errexit -o nounset -o pipefail

# ---------------------------------------------------------------------------
# Defaults
# ---------------------------------------------------------------------------

# --- What to serve ---------------------------------------------------------
MODEL="meta-llama/Meta-Llama-3-8B-Instruct"
SERVED_MODEL_NAME=""            # empty: derived from MODEL later on
QUANTIZATION="awq"              # empty: no --quantization flag is passed
MAX_MODEL_LEN=8192
TENSOR_PARALLEL_SIZE=1
GPU_MEMORY_UTILIZATION="0.92"

# --- Where to run it -------------------------------------------------------
GPU_TYPE="NVIDIA GeForce RTX 4090"
GPU_COUNT=1
CLOUD_TYPE="SECURE"
REPLICAS=1

# --- Pod sizing ------------------------------------------------------------
VOLUME_GB=80
CONTAINER_DISK_GB=30
MIN_VCPU=8
MIN_MEMORY_GB=32

# --- Script behaviour ------------------------------------------------------
DRY_RUN=false
WAIT_TIMEOUT=600                # seconds to wait for RUNNING state
RUNPOD_API="https://api.runpod.io/graphql"
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
# Write a UTC-timestamped diagnostic line to stderr, keeping stdout free for
# the script's real output.
# Arguments: the message words (joined into a single string).
log() {
  local message="$*"
  printf '[%s] %s\n' "$(date -u +%H:%M:%S)" "$message" >&2
}
# Log an "ERROR: ..." line via log() and terminate the shell with status 1.
# Arguments: the error message words.
die() {
  log "ERROR: $*"
  exit 1
}
# Abort via die() unless the named command can be resolved by the shell.
# Arguments: $1 - command name to look up
require_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "'$1' not found — install it first"
  fi
}
# ---------------------------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------------------------
# Abort with a readable message when a value-taking flag has no value after it.
# Without this check the "$2" reference trips `set -u` and the user only sees
# an "unbound variable" error.
# Arguments: the remaining command-line words, flag first (pass "$@").
_require_value() {
  [[ $# -ge 2 ]] || die "Option $1 requires a value"
}

# Parse command-line flags into the script's global settings.
# Arguments: the script's command-line words (pass "$@").
# Globals written: GPU_TYPE, MODEL, REPLICAS, QUANTIZATION, MAX_MODEL_LEN,
#   TENSOR_PARALLEL_SIZE, GPU_COUNT, CLOUD_TYPE, SERVED_MODEL_NAME, VOLUME_GB,
#   WAIT_TIMEOUT, DRY_RUN.
# Exits 0 after printing help; exits 1 (via die) on an unknown flag or a flag
# that is missing its value. An explicitly empty value (e.g. --quantization "")
# is accepted.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --gpu-type)          _require_value "$@"; GPU_TYPE="$2"; shift 2 ;;
      --model)             _require_value "$@"; MODEL="$2"; shift 2 ;;
      --replicas)          _require_value "$@"; REPLICAS="$2"; shift 2 ;;
      --quantization)      _require_value "$@"; QUANTIZATION="$2"; shift 2 ;;
      --max-model-len)     _require_value "$@"; MAX_MODEL_LEN="$2"; shift 2 ;;
      --tensor-parallel)   _require_value "$@"; TENSOR_PARALLEL_SIZE="$2"; shift 2 ;;
      --gpu-count)         _require_value "$@"; GPU_COUNT="$2"; shift 2 ;;
      --cloud-type)        _require_value "$@"; CLOUD_TYPE="$2"; shift 2 ;;
      --served-model-name) _require_value "$@"; SERVED_MODEL_NAME="$2"; shift 2 ;;
      --volume-gb)         _require_value "$@"; VOLUME_GB="$2"; shift 2 ;;
      --wait-timeout)      _require_value "$@"; WAIT_TIMEOUT="$2"; shift 2 ;;
      --dry-run)           DRY_RUN=true; shift ;;
      -h|--help)
        # Print only the leading header comment block (minus the shebang),
        # stopping at the first non-comment line so section dividers and
        # internal comments further down are not shown as help text.
        awk '/^#!/ { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"
        exit 0
        ;;
      *) die "Unknown argument: $1" ;;
    esac
  done
}
parse_args "$@"
# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
# External tools used below: curl for the HTTP calls, jq for JSON handling.
require_cmd curl
require_cmd jq

# Fall back to the local secrets file, but only when the API key is not
# already present in the environment. allexport makes every variable the file
# assigns an exported one.
if [[ -z "${RUNPOD_API_KEY:-}" ]]; then
  INFRA_ENV="$HOME/Secrets/credentials/infrastructure.env"
  if [[ -f "$INFRA_ENV" ]]; then
    set -o allexport
    source "$INFRA_ENV"
    set +o allexport
  fi
fi

# The API key is mandatory; a missing HuggingFace token only earns a warning.
if [[ -z "${RUNPOD_API_KEY:-}" ]]; then
  die "RUNPOD_API_KEY not set. Export it or ensure ~/Secrets/credentials/infrastructure.env is present."
fi
if [[ -z "${HF_TOKEN:-}" ]]; then
  log "WARNING: HF_TOKEN not set — gated models (Llama etc.) will fail to download."
fi

# No explicit served-model name: take the last path component of the model
# ID, lower-case it, and drop every character outside a-z, 0-9 and '-'.
if [[ -z "$SERVED_MODEL_NAME" ]]; then
  SERVED_MODEL_NAME="$(
    basename "$MODEL" \
      | tr '[:upper:]' '[:lower:]' \
      | tr -cd 'a-z0-9-'
  )"
fi

# Assemble the container argument string piece by piece. The \$NAME tokens
# stay literal here and refer to the env vars set on the pod; the
# --quantization flag is left out entirely when no quantization is requested.
# NOTE(review): this relies on the pod runtime expanding \$NAME inside
# dockerArgs — confirm that RunPod / the vLLM image actually does so.
DOCKER_ARGS="--host 0.0.0.0 --port 8000 --model \$MODEL_ID"
if [[ -n "$QUANTIZATION" ]]; then
  DOCKER_ARGS+=" --quantization \$QUANTIZATION"
fi
DOCKER_ARGS+=" --max-model-len \$MAX_MODEL_LEN"
DOCKER_ARGS+=" --tensor-parallel-size \$TENSOR_PARALLEL_SIZE"
DOCKER_ARGS+=" --gpu-memory-utilization \$GPU_MEMORY_UTILIZATION"
DOCKER_ARGS+=" --served-model-name \$SERVED_MODEL_NAME"
# ---------------------------------------------------------------------------
# GraphQL helpers
# ---------------------------------------------------------------------------
# POST a GraphQL document to the RunPod API and print the raw JSON response.
# Globals:   RUNPOD_API (read), RUNPOD_API_KEY (read)
# Arguments: $1 - GraphQL query or mutation text
# Outputs:   response body on stdout; curl error text on stderr
# Returns:   non-zero when the request fails (transport error, HTTP status
#            >= 400 because of --fail, or the time limit is exceeded)
gql() {
  local query="$1"
  # Escape the key for curl's quoted config-file syntax.
  local key=${RUNPOD_API_KEY//\\/\\\\}
  key=${key//\"/\\\"}

  # - jq builds the whole JSON envelope, so the payload is always valid JSON.
  # - The Authorization header travels through a config "file" (a process
  #   substitution) rather than argv, so the key does not show up in `ps`.
  # - -S keeps curl quiet on success but prints the reason on failure.
  # - The time limits stop a stalled connection from hanging the caller.
  jq -cn --arg q "$query" '{query: $q}' \
    | curl -sS --fail \
        --connect-timeout 10 --max-time 60 \
        -H "Content-Type: application/json" \
        --config <(printf 'header = "Authorization: Bearer %s"\n' "$key") \
        --data-binary @- \
        "$RUNPOD_API"
}
# Escape a value so it can sit inside a double-quoted GraphQL string literal:
# backslash and double quote are backslash-escaped, and newline, carriage
# return and tab become \n, \r and \t.
# Arguments: $1 - raw value
# Outputs:   escaped value on stdout (no trailing newline)
_gql_escape() {
  local s="$1"
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# Submit a podFindAndDeployOnDemand mutation for one pod and print the raw
# JSON response returned by gql().
# Globals (read): GPU_TYPE, CLOUD_TYPE, GPU_COUNT, VOLUME_GB,
#   CONTAINER_DISK_GB, MIN_VCPU, MIN_MEMORY_GB, MODEL, QUANTIZATION,
#   MAX_MODEL_LEN, TENSOR_PARALLEL_SIZE, GPU_MEMORY_UTILIZATION,
#   SERVED_MODEL_NAME, HF_TOKEN (optional), DOCKER_ARGS.
# Arguments: $1 - pod name
# Returns:   gql's exit status
create_pod() {
  # Every value that lands inside a quoted GraphQL string is escaped first,
  # so a quote or backslash in user input cannot break or alter the mutation.
  # cloudType and the numeric fields are inserted verbatim (enum / int).
  local pod_name gpu_type model quant max_len tp_size gpu_util served hf_token docker_args
  pod_name=$(_gql_escape "$1")
  gpu_type=$(_gql_escape "$GPU_TYPE")
  model=$(_gql_escape "$MODEL")
  quant=$(_gql_escape "$QUANTIZATION")
  max_len=$(_gql_escape "$MAX_MODEL_LEN")
  tp_size=$(_gql_escape "$TENSOR_PARALLEL_SIZE")
  gpu_util=$(_gql_escape "$GPU_MEMORY_UTILIZATION")
  served=$(_gql_escape "$SERVED_MODEL_NAME")
  hf_token=$(_gql_escape "${HF_TOKEN:-}")
  docker_args=$(_gql_escape "$DOCKER_ARGS")

  local mutation
  mutation=$(cat <<MUTATION
mutation {
  podFindAndDeployOnDemand(input: {
    name: "$pod_name"
    imageName: "vllm/vllm-openai:latest"
    gpuTypeId: "$gpu_type"
    cloudType: $CLOUD_TYPE
    gpuCount: $GPU_COUNT
    volumeInGb: $VOLUME_GB
    containerDiskInGb: $CONTAINER_DISK_GB
    minVcpuCount: $MIN_VCPU
    minMemoryInGb: $MIN_MEMORY_GB
    ports: "8000/http"
    env: [
      { key: "MODEL_ID", value: "$model" }
      { key: "QUANTIZATION", value: "$quant" }
      { key: "MAX_MODEL_LEN", value: "$max_len" }
      { key: "TENSOR_PARALLEL_SIZE", value: "$tp_size" }
      { key: "GPU_MEMORY_UTILIZATION", value: "$gpu_util" }
      { key: "SERVED_MODEL_NAME", value: "$served" }
      { key: "HF_TOKEN", value: "$hf_token" }
    ]
    dockerArgs: "$docker_args"
  }) {
    id
    name
    desiredStatus
  }
}
MUTATION
)
  gql "$mutation"
}
# Poll a pod until its container is up, then print its endpoint URL.
#
# A pod counts as ready only when desiredStatus is RUNNING *and* the runtime
# object is populated: desiredStatus is the requested state and can read
# RUNNING before the container has started, in which case runtime is still
# null and the endpoint would not answer yet.
#
# Globals:   WAIT_TIMEOUT (read)
# Arguments: $1 - pod ID
# Outputs:   endpoint URL on stdout (no trailing newline) — the first public
#            mapping of private port 8000 if one exists, otherwise the
#            https://<pod-id>-8000.proxy.runpod.net fallback; progress on stderr
# Returns:   0 when ready; exits 1 via die() once the timeout has passed
wait_for_running() {
  local pod_id="$1"
  local deadline
  deadline=$(( $(date +%s) + WAIT_TIMEOUT ))
  local query="{ pod(input: { podId: \"$pod_id\" }) { id desiredStatus runtime { ports { ip isIpPublic privatePort publicPort type } } } }"
  local resp status started endpoint

  log "Waiting for pod $pod_id to reach RUNNING state (timeout ${WAIT_TIMEOUT}s)..."
  while true; do
    # A failed or unparseable response is treated as status UNKNOWN and
    # retried until the deadline instead of aborting the wait.
    resp=$(gql "$query") || resp=""
    status=$(printf '%s' "$resp" | jq -r '.data.pod.desiredStatus // "UNKNOWN"' 2>/dev/null) || status=""
    status=${status:-UNKNOWN}
    started=$(printf '%s' "$resp" | jq -r '.data.pod.runtime != null' 2>/dev/null) || started="false"

    if [[ "$status" == "RUNNING" && "$started" == "true" ]]; then
      # Take only the first matching port so the result is always one URL.
      endpoint=$(printf '%s' "$resp" | jq -r '
        [ .data.pod.runtime.ports[]?
          | select(.privatePort == 8000 and .isIpPublic == true)
          | "https://\(.ip):\(.publicPort)" ][0] // empty
      ' 2>/dev/null) || endpoint=""
      # Fall back to RunPod proxy URL if no public port exposed
      if [[ -z "$endpoint" ]]; then
        endpoint="https://${pod_id}-8000.proxy.runpod.net"
      fi
      printf '%s' "$endpoint"
      return 0
    fi

    if [[ $(date +%s) -ge $deadline ]]; then
      die "Timed out waiting for pod $pod_id — last status: $status"
    fi
    if [[ "$status" == "RUNNING" ]]; then
      log "  pod $pod_id status: $status (container not started yet) — retrying in 15s..."
    else
      log "  pod $pod_id status: $status — retrying in 15s..."
    fi
    sleep 15
  done
}
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
# --replicas drives the creation loop, so reject anything that is not a
# positive integer up front. Otherwise a bad value would skip the loop
# silently and the script would "wait" for pods it never created.
[[ "$REPLICAS" =~ ^[1-9][0-9]*$ ]] \
  || die "--replicas must be a positive integer (got '$REPLICAS')"

log "RunPod inference provisioner"
log " GPU type : $GPU_TYPE"
log " Model : $MODEL"
log " Replicas : $REPLICAS"
log " Quant : ${QUANTIZATION:-none}"
log " TP size : $TENSOR_PARALLEL_SIZE"
log " Cloud : $CLOUD_TYPE"
log " Dry run : $DRY_RUN"

TIMESTAMP=$(date -u +%Y%m%d%H%M%S)
declare -a POD_IDS=()

# Abort pod creation via die(), first listing any pods that were already
# created in this run so they are not left running unnoticed.
# Arguments: the error message words.
abort_create() {
  if (( ${#POD_IDS[@]} > 0 )); then
    log "WARNING: pods already created in this run are still running (terminate them manually if unwanted): ${POD_IDS[*]}"
  fi
  die "$*"
}

# Phase 1: create every replica (or only announce it in dry-run mode).
for (( i = 1; i <= REPLICAS; i++ )); do
  POD_NAME="neuron-inf-${TIMESTAMP}-r${i}"
  if [[ "$DRY_RUN" == true ]]; then
    log "[dry-run] Would create pod: $POD_NAME"
    continue
  fi
  log "Creating pod $i/$REPLICAS: $POD_NAME ..."
  # Check the request explicitly: under `set -e` a failed assignment would
  # otherwise end the script without any explanation.
  RESP=$(create_pod "$POD_NAME") \
    || abort_create "RunPod API request failed while creating $POD_NAME"
  ERR=$(printf '%s' "$RESP" | jq -r '.errors[0].message // empty') \
    || abort_create "Unparseable RunPod API response: $RESP"
  [[ -z "$ERR" ]] || abort_create "RunPod API error: $ERR"
  POD_ID=$(printf '%s' "$RESP" | jq -r '.data.podFindAndDeployOnDemand.id // empty') \
    || POD_ID=""
  [[ -n "$POD_ID" ]] || abort_create "No pod ID returned. Full response: $RESP"
  log " Created pod ID: $POD_ID"
  POD_IDS+=("$POD_ID")
done

if [[ "$DRY_RUN" == true ]]; then
  log "[dry-run] Complete — no pods were created."
  exit 0
fi

# Phase 2: wait for each pod in creation order and collect its endpoint.
log ""
log "Waiting for all $REPLICAS pod(s) to become RUNNING..."
declare -a ENDPOINTS=()
for POD_ID in "${POD_IDS[@]}"; do
  EP=$(wait_for_running "$POD_ID") \
    || die "Pod $POD_ID did not become ready; created pods: ${POD_IDS[*]}"
  ENDPOINTS+=("$EP")
  log " Pod $POD_ID ready: $EP"
done

# Phase 3: the endpoint list is the only thing written to stdout.
log ""
log "=== Inference endpoints ==="
for EP in "${ENDPOINTS[@]}"; do
  printf '%s\n' "$EP"
done