8eb88a3116
Infrastructure readiness for RunPod inference workloads: - runpod-inference.yaml: ConfigMap with pod creation payloads for RTX 4090, A40 (single+dual), and custom templates - runpod-lb-configmap.yaml: nginx least-conn load balancer for inference endpoint distribution (Deployment + ClusterIP Service) - runpod-provision.sh: bash provisioner script — reads RUNPOD_API_KEY/HF_TOKEN, creates pods via GraphQL, polls until RUNNING, outputs endpoint URLs. Does NOT spin up any pods (dry-run flag available).
244 lines
8.1 KiB
Bash
Executable File
244 lines
8.1 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# runpod-provision.sh — On-demand RunPod inference pod provisioner
|
|
#
|
|
# Usage:
|
|
# ./runpod-provision.sh --gpu-type "NVIDIA GeForce RTX 4090" \
|
|
# --model "meta-llama/Meta-Llama-3-8B-Instruct" \
|
|
# --replicas 2 \
|
|
# [--quantization awq] \
|
|
# [--max-model-len 8192] \
|
|
# [--cloud-type SECURE] \
|
|
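# [--tensor-parallel 1] \
# [--gpu-count 1] \
# [--served-model-name NAME] \
# [--volume-gb 80] \
# [--wait-timeout 600] \
# [--dry-run]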
|
|
#
|
|
# Env vars required (or sourced from ~/Secrets/credentials/infrastructure.env):
|
|
# RUNPOD_API_KEY — RunPod API key (from Vault secret/ai)
|
|
# HF_TOKEN — HuggingFace token for gated model downloads (optional: only a warning is logged when it is unset)
|
|
#
|
|
# The script does NOT execute automatically; run it explicitly when you need pods.
|
|
# It outputs one endpoint URL per replica to stdout, one per line.
|
|
|
|
# Strict mode: abort on an unhandled command failure (errexit), on expansion
# of an unset variable (nounset), and when any stage of a pipeline fails
# (pipefail).
set -o errexit -o nounset -o pipefail
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Defaults
|
|
# ---------------------------------------------------------------------------
|
|
# Default settings. Values marked with a flag name can be overridden on the
# command line; the others have no flag and must be edited here.

# --- Pod hardware / placement ----------------------------------------------
GPU_TYPE="NVIDIA GeForce RTX 4090"   # --gpu-type
GPU_COUNT=1                          # --gpu-count
CLOUD_TYPE="SECURE"                  # --cloud-type (sent as a bare GraphQL enum)
VOLUME_GB=80                         # --volume-gb
CONTAINER_DISK_GB=30                 # no flag
MIN_VCPU=8                           # no flag
MIN_MEMORY_GB=32                     # no flag

# --- Model serving -----------------------------------------------------------
MODEL="meta-llama/Meta-Llama-3-8B-Instruct"   # --model
SERVED_MODEL_NAME=""                 # --served-model-name; empty = derive from MODEL
QUANTIZATION="awq"                   # --quantization; empty = omit the flag
MAX_MODEL_LEN=8192                   # --max-model-len
TENSOR_PARALLEL_SIZE=1               # --tensor-parallel
GPU_MEMORY_UTILIZATION="0.92"        # no flag

# --- Provisioning behaviour --------------------------------------------------
REPLICAS=1                           # --replicas
DRY_RUN=false                        # --dry-run
WAIT_TIMEOUT=600  # seconds to wait for RUNNING state (--wait-timeout)

# GraphQL endpoint every API request is sent to.
RUNPOD_API="https://api.runpod.io/graphql"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
# log MESSAGE... — write one diagnostic line to stderr, prefixed with the
# current UTC time as [HH:MM:SS]. All arguments are joined into one message.
# Nothing is written to stdout.
log() {
  local message="$*"
  printf '[%s] %s\n' "$(date -u +%H:%M:%S)" "$message" >&2
}
|
|
# die MESSAGE... — report MESSAGE through log with an "ERROR: " prefix, then
# terminate the current shell with exit status 1.
die() {
  log "ERROR: $*"
  exit 1
}
|
|
|
|
# require_cmd NAME — ensure NAME resolves to a command; otherwise abort via
# die. Produces no output when the command is available.
require_cmd() {
  if ! command -v "$1" &>/dev/null; then
    die "'$1' not found — install it first"
  fi
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Argument parsing
|
|
# ---------------------------------------------------------------------------
|
|
# print_usage — print the script's leading comment block as help text.
# The shebang line is skipped, the leading "# " is stripped, and output stops
# at the first non-comment line after the block, so section banners and
# inline comments further down the file are not shown.
print_usage() {
  awk '
    /^#!/     { next }
    /^#/      { in_header = 1; sub(/^# ?/, ""); print; next }
    in_header { exit }
  ' "$0"
}

# _require_int OPTION VALUE MIN — abort via die unless VALUE is a plain
# decimal integer (no sign, no leading zeros) that is >= MIN. These values are
# later placed unquoted into the GraphQL mutation and into shell arithmetic,
# so anything else must be rejected up front.
_require_int() {
  [[ "$2" =~ ^(0|[1-9][0-9]*)$ ]] || die "Option $1 expects a whole number, got '$2'"
  (( $2 >= $3 )) || die "Option $1 must be at least $3, got '$2'"
}

# parse_args ARG... — apply command-line options to the global settings.
#
# Flags without a value: --dry-run, -h/--help (prints usage and exits 0).
# Every other recognised option takes exactly one value. A missing value,
# an unknown option, or an invalid value aborts via die.
parse_args() {
  local opt val
  while (( $# > 0 )); do
    opt="$1"
    case "$opt" in
      --dry-run)
        DRY_RUN=true
        shift
        continue
        ;;
      -h|--help)
        print_usage
        exit 0
        ;;
      --gpu-type|--model|--replicas|--quantization|--max-model-len|--tensor-parallel|--gpu-count|--cloud-type|--served-model-name|--volume-gb|--wait-timeout)
        ;;
      *)
        die "Unknown argument: $opt"
        ;;
    esac

    # Without this check a trailing option dies with a bare
    # "unbound variable" error from `set -u`.
    (( $# >= 2 )) || die "Option $opt requires a value"
    val="$2"
    shift 2

    case "$opt" in
      --gpu-type)
        [[ -n "$val" ]] || die "Option $opt must not be empty"
        GPU_TYPE="$val"
        ;;
      --model)
        [[ -n "$val" ]] || die "Option $opt must not be empty"
        MODEL="$val"
        ;;
      --replicas)
        _require_int "$opt" "$val" 1
        REPLICAS="$val"
        ;;
      --quantization)
        # An empty value is allowed: it means "no --quantization flag".
        QUANTIZATION="$val"
        ;;
      --max-model-len)
        _require_int "$opt" "$val" 1
        MAX_MODEL_LEN="$val"
        ;;
      --tensor-parallel)
        _require_int "$opt" "$val" 1
        TENSOR_PARALLEL_SIZE="$val"
        ;;
      --gpu-count)
        _require_int "$opt" "$val" 1
        GPU_COUNT="$val"
        ;;
      --cloud-type)
        # Sent as a bare GraphQL enum token, so only allow an enum-shaped name.
        [[ "$val" =~ ^[A-Z][A-Z_]*$ ]] \
          || die "Option $opt expects an enum name such as SECURE, got '$val'"
        CLOUD_TYPE="$val"
        ;;
      --served-model-name)
        SERVED_MODEL_NAME="$val"
        ;;
      --volume-gb)
        _require_int "$opt" "$val" 0
        VOLUME_GB="$val"
        ;;
      --wait-timeout)
        _require_int "$opt" "$val" 0
        WAIT_TIMEOUT="$val"
        ;;
    esac
  done
}

parse_args "$@"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Environment
|
|
# ---------------------------------------------------------------------------
|
|
# External tools used for every API call.
require_cmd curl
require_cmd jq

# Load the infrastructure secrets file when EITHER credential is missing.
# Checking only RUNPOD_API_KEY meant HF_TOKEN was never picked up from the
# file when the API key was already exported. Values already present in the
# environment take precedence over whatever the file defines.
if [[ -z "${RUNPOD_API_KEY:-}" || -z "${HF_TOKEN:-}" ]]; then
  INFRA_ENV="$HOME/Secrets/credentials/infrastructure.env"
  if [[ -f "$INFRA_ENV" ]]; then
    preset_api_key="${RUNPOD_API_KEY:-}"
    preset_hf_token="${HF_TOKEN:-}"
    # set -a exports everything the file assigns.
    set -a
    # shellcheck source=/dev/null
    source "$INFRA_ENV"
    set +a
    [[ -z "$preset_api_key" ]] || RUNPOD_API_KEY="$preset_api_key"
    [[ -z "$preset_hf_token" ]] || HF_TOKEN="$preset_hf_token"
    unset preset_api_key preset_hf_token
  fi
fi

[[ -n "${RUNPOD_API_KEY:-}" ]] || die "RUNPOD_API_KEY not set. Export it or ensure ~/Secrets/credentials/infrastructure.env is present."
[[ -n "${HF_TOKEN:-}" ]] || log "WARNING: HF_TOKEN not set — gated models (Llama etc.) will fail to download."

# Derive served model name from model ID if not provided: last path
# component, lower-cased, with everything outside [a-z0-9-] removed.
if [[ -z "$SERVED_MODEL_NAME" ]]; then
  SERVED_MODEL_NAME="$(basename -- "$MODEL" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-')"
  # The filter can strip every character (e.g. MODEL="___"); an empty name
  # would leave --served-model-name without a value.
  [[ -n "$SERVED_MODEL_NAME" ]] \
    || die "Could not derive a served model name from '$MODEL' — pass --served-model-name"
fi

# Build docker args (omit --quantization flag if value is empty).
# Concrete values are written in rather than "$VAR" placeholders, so the
# command line does not depend on a shell expanding variables in the container.
# NOTE(review): assumes dockerArgs reaches the image entrypoint without shell
# expansion — confirm against RunPod / vllm-openai image behaviour.
DOCKER_ARGS="--host 0.0.0.0 --port 8000 --model $MODEL"
if [[ -n "$QUANTIZATION" ]]; then
  DOCKER_ARGS+=" --quantization $QUANTIZATION"
fi
DOCKER_ARGS+=" --max-model-len $MAX_MODEL_LEN"
DOCKER_ARGS+=" --tensor-parallel-size $TENSOR_PARALLEL_SIZE"
DOCKER_ARGS+=" --gpu-memory-utilization $GPU_MEMORY_UTILIZATION"
DOCKER_ARGS+=" --served-model-name $SERVED_MODEL_NAME"
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# GraphQL helpers
|
|
# ---------------------------------------------------------------------------
|
|
# gql QUERY — POST a GraphQL document to $RUNPOD_API and print the raw JSON
# response on stdout.
#
# Globals:  RUNPOD_API (read), RUNPOD_API_KEY (read)
# Returns:  non-zero when jq or curl fails (connection error, timeout, or an
#           HTTP error status because of --fail).
#
# The query is JSON-encoded by jq and streamed to curl on stdin. Mutations
# embed HF_TOKEN, so keeping the document off the jq/curl command lines keeps
# it out of process listings.
gql() {
  local query="$1"
  # --show-error: with plain --silent a failed request produced no message.
  # Timeouts stop a stalled connection from hanging the provisioner.
  printf '%s' "$query" \
    | jq -Rsc '{query: .}' \
    | curl --silent --show-error --fail \
        --connect-timeout 10 --max-time 120 \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer ${RUNPOD_API_KEY}" \
        --data-binary @- \
        "$RUNPOD_API"
}
|
|
|
|
# _gql_escape TEXT — print TEXT escaped for use inside a double-quoted
# GraphQL string literal: backslash, double quote, newline, carriage return
# and tab are replaced by their backslash escapes.
_gql_escape() {
  local text="$1" escaped='' ch i
  for (( i = 0; i < ${#text}; i++ )); do
    ch="${text:i:1}"
    case "$ch" in
      '\')   escaped+='\\' ;;
      '"')   escaped+='\"' ;;
      $'\n') escaped+='\n' ;;
      $'\r') escaped+='\r' ;;
      $'\t') escaped+='\t' ;;
      *)     escaped+="$ch" ;;
    esac
  done
  printf '%s' "$escaped"
}

# create_pod POD_NAME — send a podFindAndDeployOnDemand mutation for one pod
# named POD_NAME and print the raw API response (via gql) on stdout.
#
# Globals (read): GPU_TYPE, CLOUD_TYPE, GPU_COUNT, VOLUME_GB,
#   CONTAINER_DISK_GB, MIN_VCPU, MIN_MEMORY_GB, MODEL, QUANTIZATION,
#   MAX_MODEL_LEN, TENSOR_PARALLEL_SIZE, GPU_MEMORY_UTILIZATION,
#   SERVED_MODEL_NAME, HF_TOKEN (optional), DOCKER_ARGS
# Returns: the exit status of gql.
#
# Every value placed inside a "..." literal is escaped first, so a quote or
# backslash in a model name, token or argument cannot end the literal early
# and change the mutation. Numeric fields and cloudType are bare tokens.
create_pod() {
  local pod_name="$1"
  local name_q gpu_q model_q quant_q maxlen_q tp_q memutil_q served_q token_q args_q
  name_q=$(_gql_escape "$pod_name")
  gpu_q=$(_gql_escape "$GPU_TYPE")
  model_q=$(_gql_escape "$MODEL")
  quant_q=$(_gql_escape "$QUANTIZATION")
  maxlen_q=$(_gql_escape "$MAX_MODEL_LEN")
  tp_q=$(_gql_escape "$TENSOR_PARALLEL_SIZE")
  memutil_q=$(_gql_escape "$GPU_MEMORY_UTILIZATION")
  served_q=$(_gql_escape "$SERVED_MODEL_NAME")
  token_q=$(_gql_escape "${HF_TOKEN:-}")
  args_q=$(_gql_escape "$DOCKER_ARGS")

  local mutation
  mutation=$(cat <<MUTATION
mutation {
  podFindAndDeployOnDemand(input: {
    name: "$name_q"
    imageName: "vllm/vllm-openai:latest"
    gpuTypeId: "$gpu_q"
    cloudType: $CLOUD_TYPE
    gpuCount: $GPU_COUNT
    volumeInGb: $VOLUME_GB
    containerDiskInGb: $CONTAINER_DISK_GB
    minVcpuCount: $MIN_VCPU
    minMemoryInGb: $MIN_MEMORY_GB
    ports: "8000/http"
    env: [
      { key: "MODEL_ID", value: "$model_q" }
      { key: "QUANTIZATION", value: "$quant_q" }
      { key: "MAX_MODEL_LEN", value: "$maxlen_q" }
      { key: "TENSOR_PARALLEL_SIZE", value: "$tp_q" }
      { key: "GPU_MEMORY_UTILIZATION", value: "$memutil_q" }
      { key: "SERVED_MODEL_NAME", value: "$served_q" }
      { key: "HF_TOKEN", value: "$token_q" }
    ]
    dockerArgs: "$args_q"
  }) {
    id
    name
    desiredStatus
  }
}
MUTATION
)
  gql "$mutation"
}
|
|
|
|
# wait_for_running POD_ID — poll the API until the pod is up, then print its
# inference endpoint URL on stdout (no trailing newline).
#
# Globals:  WAIT_TIMEOUT (read) — seconds before giving up.
# Outputs:  endpoint URL on stdout; progress messages via log (stderr).
# Returns:  0 once ready; calls die on timeout.
#
# Readiness needs desiredStatus == RUNNING AND a non-null runtime object.
# desiredStatus is only the requested state, so on its own it can be true
# before the container has started.
# NOTE(review): assumes the API leaves `runtime` null until the container is
# up — confirm against the RunPod GraphQL reference.
#
# Endpoint: the first public-IP mapping of container port 8000 if one exists,
# otherwise the RunPod proxy URL for port 8000.
wait_for_running() {
  local pod_id="$1"
  local poll_seconds=15
  local deadline resp status runtime_up shown endpoint
  local query="{ pod(input: { podId: \"$pod_id\" }) { id desiredStatus runtime { ports { ip isIpPublic privatePort publicPort type } } } }"

  deadline=$(( $(date +%s) + WAIT_TIMEOUT ))
  log "Waiting for pod $pod_id to reach RUNNING state (timeout ${WAIT_TIMEOUT}s)..."

  while true; do
    # A failed poll (network blip, HTTP error) is not fatal: treat it as an
    # unknown status and let the deadline decide.
    if ! resp=$(gql "$query"); then
      log " pod $pod_id: status query failed — will retry"
      resp=''
    fi

    status=$(printf '%s' "$resp" | jq -r '.data.pod.desiredStatus // "UNKNOWN"' 2>/dev/null) || status=''
    [[ -n "$status" ]] || status="UNKNOWN"
    runtime_up=$(printf '%s' "$resp" | jq -r '.data.pod.runtime != null' 2>/dev/null) || runtime_up=false

    if [[ "$status" == "RUNNING" && "$runtime_up" == "true" ]]; then
      # [ ... ][0] keeps only the first match so the result is a single URL
      # even if several port mappings qualify.
      endpoint=$(printf '%s' "$resp" | jq -r '
        [ .data.pod.runtime.ports[]?
          | select(.privatePort == 8000 and .isIpPublic == true)
          | "https://\(.ip):\(.publicPort)" ][0] // empty
      ' 2>/dev/null) || endpoint=''

      # Fall back to RunPod proxy URL if no public port exposed
      if [[ -z "$endpoint" ]]; then
        endpoint="https://${pod_id}-8000.proxy.runpod.net"
      fi

      printf '%s' "$endpoint"
      return 0
    fi

    shown="$status"
    if [[ "$status" == "RUNNING" ]]; then
      shown="RUNNING (container not started yet)"
    fi

    if [[ $(date +%s) -ge $deadline ]]; then
      die "Timed out waiting for pod $pod_id — last status: $shown"
    fi

    log " pod $pod_id status: $shown — retrying in ${poll_seconds}s..."
    sleep "$poll_seconds"
  done
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
# ---- Main flow --------------------------------------------------------------
# 1. Log the effective configuration.
# 2. Create REPLICAS pods (or only announce them in dry-run mode).
# 3. Wait for every created pod and collect its endpoint.
# 4. Print the endpoints on stdout, one per line (all logging is on stderr).
log "RunPod inference provisioner"
log " GPU type : $GPU_TYPE"
log " Model : $MODEL"
log " Replicas : $REPLICAS"
log " Quant : ${QUANTIZATION:-none}"
log " TP size : $TENSOR_PARALLEL_SIZE"
log " Cloud : $CLOUD_TYPE"
log " Dry run : $DRY_RUN"

# REPLICAS drives an arithmetic loop below, so it must be a plain decimal
# number (a leading zero would be read as octal).
[[ "$REPLICAS" =~ ^(0|[1-9][0-9]*)$ ]] \
  || die "--replicas must be a whole number, got '$REPLICAS'"

TIMESTAMP=$(date -u +%Y%m%d%H%M%S)
declare -a POD_IDS=()

# abort_provisioning MESSAGE... — list any pods already created, then die.
# This script never terminates pods, so the operator needs their IDs to clean
# up after a partial failure.
abort_provisioning() {
  if (( ${#POD_IDS[@]} > 0 )); then
    log "Pods created before the failure are still deployed: ${POD_IDS[*]}"
  fi
  die "$@"
}

# C-style loop instead of `seq 1 N`: BSD/macOS seq counts DOWN when N is 0
# (yielding 1 and 0), which would create two pods for --replicas 0.
for (( i = 1; i <= REPLICAS; i++ )); do
  POD_NAME="neuron-inf-${TIMESTAMP}-r${i}"

  if [[ "$DRY_RUN" == "true" ]]; then
    log "[dry-run] Would create pod: $POD_NAME"
    continue
  fi

  log "Creating pod $i/$REPLICAS: $POD_NAME ..."
  # Handle the failure explicitly: under `set -e` a failed request would
  # otherwise end the script with no explanation.
  if ! RESP=$(create_pod "$POD_NAME"); then
    abort_provisioning "RunPod API request failed while creating $POD_NAME"
  fi

  ERR=$(printf '%s' "$RESP" | jq -r '.errors[0].message // empty' 2>/dev/null) \
    || ERR="unparseable API response: $RESP"
  if [[ -n "$ERR" ]]; then
    abort_provisioning "RunPod API error: $ERR"
  fi

  POD_ID=$(printf '%s' "$RESP" | jq -r '.data.podFindAndDeployOnDemand.id')
  if [[ -z "$POD_ID" || "$POD_ID" == "null" ]]; then
    abort_provisioning "No pod ID returned. Full response: $RESP"
  fi

  log " Created pod ID: $POD_ID"
  POD_IDS+=("$POD_ID")
done

if [[ "$DRY_RUN" == "true" ]]; then
  log "[dry-run] Complete — no pods were created."
  exit 0
fi

log ""
log "Waiting for all $REPLICAS pod(s) to become RUNNING..."
declare -a ENDPOINTS=()
# The length guards keep "${array[@]}" from being expanded while empty, which
# is an "unbound variable" error under `set -u` on bash older than 4.4.
if (( ${#POD_IDS[@]} > 0 )); then
  for POD_ID in "${POD_IDS[@]}"; do
    # wait_for_running reports its own error before returning non-zero.
    if ! EP=$(wait_for_running "$POD_ID"); then
      abort_provisioning "Pod $POD_ID did not become ready"
    fi
    ENDPOINTS+=("$EP")
    log " Pod $POD_ID ready: $EP"
  done
fi

log ""
log "=== Inference endpoints ==="
if (( ${#ENDPOINTS[@]} > 0 )); then
  for EP in "${ENDPOINTS[@]}"; do
    printf '%s\n' "$EP"
  done
fi
|