feat(runpod): add inference pod templates, nginx LB, and provisioner script

Infrastructure readiness for RunPod inference workloads:
- runpod-inference.yaml: ConfigMap with pod creation payloads for RTX 4090,
  A40 (single+dual), and custom templates
- runpod-lb-configmap.yaml: nginx least-conn load balancer for inference
  endpoint distribution (Deployment + ClusterIP Service)
- runpod-provision.sh: bash provisioner script — reads RUNPOD_API_KEY/HF_TOKEN,
  creates pods via GraphQL, polls until RUNNING, outputs endpoint URLs.
  Does NOT spin up any pods (dry-run flag available).
This commit is contained in:
Will Anderson
2026-04-25 01:20:55 -05:00
parent c847b22014
commit 8eb88a3116
4 changed files with 566 additions and 0 deletions
+243
View File
@@ -0,0 +1,243 @@
#!/usr/bin/env bash
# runpod-provision.sh — On-demand RunPod inference pod provisioner
#
# Usage:
# ./runpod-provision.sh --gpu-type "NVIDIA GeForce RTX 4090" \
# --model "meta-llama/Meta-Llama-3-8B-Instruct" \
# --replicas 2 \
# [--quantization awq] \
# [--max-model-len 8192] \
# [--cloud-type SECURE] \
# [--dry-run]
#
# Env vars required (or sourced from ~/Secrets/credentials/infrastructure.env):
# RUNPOD_API_KEY — RunPod API key (from Vault secret/ai)
# HF_TOKEN — HuggingFace token for gated model downloads
#
# The script does NOT execute automatically; run it explicitly when you need pods.
# It outputs one endpoint URL per replica to stdout, one per line.
set -euo pipefail
# ---------------------------------------------------------------------------
# Defaults
# ---------------------------------------------------------------------------
GPU_TYPE="NVIDIA GeForce RTX 4090"
MODEL="meta-llama/Meta-Llama-3-8B-Instruct"
REPLICAS=1
QUANTIZATION="awq"
MAX_MODEL_LEN=8192
TENSOR_PARALLEL_SIZE=1
GPU_MEMORY_UTILIZATION="0.92"
CLOUD_TYPE="SECURE"
SERVED_MODEL_NAME=""
GPU_COUNT=1
VOLUME_GB=80
CONTAINER_DISK_GB=30
MIN_VCPU=8
MIN_MEMORY_GB=32
DRY_RUN=false
WAIT_TIMEOUT=600 # seconds to wait for RUNNING state
RUNPOD_API="https://api.runpod.io/graphql"
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
log() { printf '[%s] %s\n' "$(date -u +%H:%M:%S)" "$*" >&2; }
die() { log "ERROR: $*"; exit 1; }
require_cmd() { command -v "$1" &>/dev/null || die "'$1' not found — install it first"; }
# ---------------------------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------------------------
while [[ $# -gt 0 ]]; do
case "$1" in
--gpu-type) GPU_TYPE="$2"; shift 2 ;;
--model) MODEL="$2"; shift 2 ;;
--replicas) REPLICAS="$2"; shift 2 ;;
--quantization) QUANTIZATION="$2"; shift 2 ;;
--max-model-len) MAX_MODEL_LEN="$2"; shift 2 ;;
--tensor-parallel) TENSOR_PARALLEL_SIZE="$2"; shift 2 ;;
--gpu-count) GPU_COUNT="$2"; shift 2 ;;
--cloud-type) CLOUD_TYPE="$2"; shift 2 ;;
--served-model-name) SERVED_MODEL_NAME="$2"; shift 2 ;;
--volume-gb) VOLUME_GB="$2"; shift 2 ;;
--wait-timeout) WAIT_TIMEOUT="$2"; shift 2 ;;
--dry-run) DRY_RUN=true; shift ;;
-h|--help)
grep '^#' "$0" | grep -v '^#!/' | sed 's/^# \?//'
exit 0
;;
*) die "Unknown argument: $1" ;;
esac
done
# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
require_cmd curl
require_cmd jq
# Try loading infrastructure secrets if not already set
if [[ -z "${RUNPOD_API_KEY:-}" ]]; then
INFRA_ENV="$HOME/Secrets/credentials/infrastructure.env"
if [[ -f "$INFRA_ENV" ]]; then
set -a; source "$INFRA_ENV"; set +a
fi
fi
[[ -n "${RUNPOD_API_KEY:-}" ]] || die "RUNPOD_API_KEY not set. Export it or ensure ~/Secrets/credentials/infrastructure.env is present."
[[ -n "${HF_TOKEN:-}" ]] || log "WARNING: HF_TOKEN not set — gated models (Llama etc.) will fail to download."
# Derive served model name from model ID if not provided
if [[ -z "$SERVED_MODEL_NAME" ]]; then
SERVED_MODEL_NAME="$(basename "$MODEL" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-')"
fi
# Build docker args (omit --quantization flag if value is empty)
DOCKER_ARGS="--host 0.0.0.0 --port 8000 --model \$MODEL_ID"
[[ -n "$QUANTIZATION" ]] && DOCKER_ARGS="$DOCKER_ARGS --quantization \$QUANTIZATION"
DOCKER_ARGS="$DOCKER_ARGS --max-model-len \$MAX_MODEL_LEN --tensor-parallel-size \$TENSOR_PARALLEL_SIZE --gpu-memory-utilization \$GPU_MEMORY_UTILIZATION --served-model-name \$SERVED_MODEL_NAME"
# ---------------------------------------------------------------------------
# GraphQL helpers
# ---------------------------------------------------------------------------
gql() {
local query="$1"
curl -s --fail \
-H "Content-Type: application/json" \
-H "Authorization: Bearer ${RUNPOD_API_KEY}" \
-d "{\"query\": $(jq -Rn --arg q "$query" '$q')}" \
"$RUNPOD_API"
}
create_pod() {
local pod_name="$1"
local mutation
mutation=$(cat <<MUTATION
mutation {
podFindAndDeployOnDemand(input: {
name: "$pod_name"
imageName: "vllm/vllm-openai:latest"
gpuTypeId: "$GPU_TYPE"
cloudType: $CLOUD_TYPE
gpuCount: $GPU_COUNT
volumeInGb: $VOLUME_GB
containerDiskInGb: $CONTAINER_DISK_GB
minVcpuCount: $MIN_VCPU
minMemoryInGb: $MIN_MEMORY_GB
ports: "8000/http"
env: [
{ key: "MODEL_ID", value: "$MODEL" }
{ key: "QUANTIZATION", value: "$QUANTIZATION" }
{ key: "MAX_MODEL_LEN", value: "$MAX_MODEL_LEN" }
{ key: "TENSOR_PARALLEL_SIZE", value: "$TENSOR_PARALLEL_SIZE" }
{ key: "GPU_MEMORY_UTILIZATION", value: "$GPU_MEMORY_UTILIZATION" }
{ key: "SERVED_MODEL_NAME", value: "$SERVED_MODEL_NAME" }
{ key: "HF_TOKEN", value: "${HF_TOKEN:-}" }
]
dockerArgs: "$DOCKER_ARGS"
}) {
id
name
desiredStatus
}
}
MUTATION
)
gql "$mutation"
}
wait_for_running() {
local pod_id="$1"
local deadline=$(( $(date +%s) + WAIT_TIMEOUT ))
log "Waiting for pod $pod_id to reach RUNNING state (timeout ${WAIT_TIMEOUT}s)..."
while true; do
local resp
resp=$(gql "{ pod(input: { podId: \"$pod_id\" }) { id desiredStatus runtime { ports { ip isIpPublic privatePort publicPort type } } } }")
local status
status=$(printf '%s' "$resp" | jq -r '.data.pod.desiredStatus // "UNKNOWN"')
if [[ "$status" == "RUNNING" ]]; then
local endpoint
endpoint=$(printf '%s' "$resp" | jq -r '
.data.pod.runtime.ports[]?
| select(.privatePort == 8000 and .isIpPublic == true)
| "https://\(.ip):\(.publicPort)"
' 2>/dev/null || true)
# Fall back to RunPod proxy URL if no public port exposed
if [[ -z "$endpoint" ]]; then
endpoint="https://${pod_id}-8000.proxy.runpod.net"
fi
printf '%s' "$endpoint"
return 0
fi
if [[ $(date +%s) -ge $deadline ]]; then
die "Timed out waiting for pod $pod_id — last status: $status"
fi
log " pod $pod_id status: $status — retrying in 15s..."
sleep 15
done
}
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
log "RunPod inference provisioner"
log " GPU type : $GPU_TYPE"
log " Model : $MODEL"
log " Replicas : $REPLICAS"
log " Quant : ${QUANTIZATION:-none}"
log " TP size : $TENSOR_PARALLEL_SIZE"
log " Cloud : $CLOUD_TYPE"
log " Dry run : $DRY_RUN"
TIMESTAMP=$(date -u +%Y%m%d%H%M%S)
declare -a POD_IDS=()
for i in $(seq 1 "$REPLICAS"); do
POD_NAME="neuron-inf-${TIMESTAMP}-r${i}"
if $DRY_RUN; then
log "[dry-run] Would create pod: $POD_NAME"
continue
fi
log "Creating pod $i/$REPLICAS: $POD_NAME ..."
RESP=$(create_pod "$POD_NAME")
ERR=$(printf '%s' "$RESP" | jq -r '.errors[0].message // empty')
[[ -n "$ERR" ]] && die "RunPod API error: $ERR"
POD_ID=$(printf '%s' "$RESP" | jq -r '.data.podFindAndDeployOnDemand.id')
[[ -z "$POD_ID" || "$POD_ID" == "null" ]] && die "No pod ID returned. Full response: $RESP"
log " Created pod ID: $POD_ID"
POD_IDS+=("$POD_ID")
done
if $DRY_RUN; then
log "[dry-run] Complete — no pods were created."
exit 0
fi
log ""
log "Waiting for all $REPLICAS pod(s) to become RUNNING..."
declare -a ENDPOINTS=()
for POD_ID in "${POD_IDS[@]}"; do
EP=$(wait_for_running "$POD_ID")
ENDPOINTS+=("$EP")
log " Pod $POD_ID ready: $EP"
done
log ""
log "=== Inference endpoints ==="
for EP in "${ENDPOINTS[@]}"; do
printf '%s\n' "$EP"
done
@@ -12,3 +12,5 @@ resources:
- ingress.yaml
- license-ingressroute.yaml
- backup-cronjob.yaml
- runpod-inference.yaml
- runpod-lb-configmap.yaml
@@ -0,0 +1,149 @@
---
# RunPod Inference ConfigMap
# Holds pod creation payloads for on-demand vLLM inference pods.
# Used by the runpod-provision.sh script and any k8s Job that provisions RunPod pods.
#
# GPU IDs sourced from RunPod GraphQL { gpuTypes { id } } — verified 2026-04-25.
# RTX 4090: "NVIDIA GeForce RTX 4090" (24 GB VRAM, secure+community cloud)
# A40: "NVIDIA A40" (48 GB VRAM, secure cloud only)
# L40S: "NVIDIA L40S" (48 GB VRAM, secure+community cloud)
apiVersion: v1
kind: ConfigMap
metadata:
name: runpod-inference-templates
namespace: neuron-prod
labels:
app.kubernetes.io/name: runpod-inference
app.kubernetes.io/component: inference-provisioner
app.kubernetes.io/managed-by: argocd
data:
# --- Llama-3-8B on RTX 4090 (24 GB) ---
# Fits quantised (GPTQ/AWQ) or full fp16 8B models.
pod-template-rtx4090-llama3-8b.json: |
{
"name": "neuron-llama3-8b-rtx4090",
"imageName": "vllm/vllm-openai:latest",
"gpuTypeId": "NVIDIA GeForce RTX 4090",
"cloudType": "SECURE",
"gpuCount": 1,
"volumeInGb": 80,
"containerDiskInGb": 30,
"minVcpuCount": 8,
"minMemoryInGb": 32,
"ports": "8000/http",
"env": [
{ "key": "MODEL_ID", "value": "meta-llama/Meta-Llama-3-8B-Instruct" },
{ "key": "QUANTIZATION", "value": "awq" },
{ "key": "MAX_MODEL_LEN", "value": "8192" },
{ "key": "TENSOR_PARALLEL_SIZE","value": "1" },
{ "key": "GPU_MEMORY_UTILIZATION", "value": "0.92" },
{ "key": "SERVED_MODEL_NAME", "value": "llama3-8b" },
{ "key": "HF_TOKEN", "value": "REPLACE_WITH_HF_TOKEN" }
],
"dockerArgs": "--host 0.0.0.0 --port 8000 --model $MODEL_ID --quantization $QUANTIZATION --max-model-len $MAX_MODEL_LEN --tensor-parallel-size $TENSOR_PARALLEL_SIZE --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --served-model-name $SERVED_MODEL_NAME"
}
# --- Llama-3-70B on A40 (48 GB) — single GPU with AWQ quantisation ---
pod-template-a40-llama3-70b.json: |
{
"name": "neuron-llama3-70b-a40",
"imageName": "vllm/vllm-openai:latest",
"gpuTypeId": "NVIDIA A40",
"cloudType": "SECURE",
"gpuCount": 1,
"volumeInGb": 150,
"containerDiskInGb": 50,
"minVcpuCount": 16,
"minMemoryInGb": 64,
"ports": "8000/http",
"env": [
{ "key": "MODEL_ID", "value": "meta-llama/Meta-Llama-3-70B-Instruct" },
{ "key": "QUANTIZATION", "value": "awq" },
{ "key": "MAX_MODEL_LEN", "value": "4096" },
{ "key": "TENSOR_PARALLEL_SIZE","value": "1" },
{ "key": "GPU_MEMORY_UTILIZATION", "value": "0.90" },
{ "key": "SERVED_MODEL_NAME", "value": "llama3-70b" },
{ "key": "HF_TOKEN", "value": "REPLACE_WITH_HF_TOKEN" }
],
"dockerArgs": "--host 0.0.0.0 --port 8000 --model $MODEL_ID --quantization $QUANTIZATION --max-model-len $MAX_MODEL_LEN --tensor-parallel-size $TENSOR_PARALLEL_SIZE --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --served-model-name $SERVED_MODEL_NAME"
}
# --- Llama-3-70B on 2x A40 (tensor parallel) ---
pod-template-2xa40-llama3-70b-fp16.json: |
{
"name": "neuron-llama3-70b-2xa40-fp16",
"imageName": "vllm/vllm-openai:latest",
"gpuTypeId": "NVIDIA A40",
"cloudType": "SECURE",
"gpuCount": 2,
"volumeInGb": 150,
"containerDiskInGb": 50,
"minVcpuCount": 16,
"minMemoryInGb": 64,
"ports": "8000/http",
"env": [
{ "key": "MODEL_ID", "value": "meta-llama/Meta-Llama-3-70B-Instruct" },
{ "key": "QUANTIZATION", "value": "" },
{ "key": "MAX_MODEL_LEN", "value": "8192" },
{ "key": "TENSOR_PARALLEL_SIZE","value": "2" },
{ "key": "GPU_MEMORY_UTILIZATION", "value": "0.90" },
{ "key": "SERVED_MODEL_NAME", "value": "llama3-70b" },
{ "key": "HF_TOKEN", "value": "REPLACE_WITH_HF_TOKEN" }
],
"dockerArgs": "--host 0.0.0.0 --port 8000 --model $MODEL_ID --max-model-len $MAX_MODEL_LEN --tensor-parallel-size $TENSOR_PARALLEL_SIZE --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --served-model-name $SERVED_MODEL_NAME"
}
# --- Generic custom model template ---
pod-template-custom.json: |
{
"name": "neuron-inference-custom",
"imageName": "vllm/vllm-openai:latest",
"gpuTypeId": "NVIDIA GeForce RTX 4090",
"cloudType": "SECURE",
"gpuCount": 1,
"volumeInGb": 100,
"containerDiskInGb": 30,
"minVcpuCount": 8,
"minMemoryInGb": 32,
"ports": "8000/http",
"env": [
{ "key": "MODEL_ID", "value": "REPLACE_WITH_MODEL_ID" },
{ "key": "QUANTIZATION", "value": "awq" },
{ "key": "MAX_MODEL_LEN", "value": "4096" },
{ "key": "TENSOR_PARALLEL_SIZE","value": "1" },
{ "key": "GPU_MEMORY_UTILIZATION", "value": "0.90" },
{ "key": "SERVED_MODEL_NAME", "value": "model" },
{ "key": "HF_TOKEN", "value": "REPLACE_WITH_HF_TOKEN" }
],
"dockerArgs": "--host 0.0.0.0 --port 8000 --model $MODEL_ID --quantization $QUANTIZATION --max-model-len $MAX_MODEL_LEN --tensor-parallel-size $TENSOR_PARALLEL_SIZE --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --served-model-name $SERVED_MODEL_NAME"
}
# --- Nginx load balancer config (updated dynamically by runpod-provision.sh) ---
# Paste pod endpoint URLs under upstream block when pods are running.
nginx-lb.conf: |
upstream runpod_inference {
least_conn;
# Populated dynamically — add lines like:
# server <pod-id>-8000.proxy.runpod.net:443;
keepalive 32;
}
server {
listen 80;
server_name inference.neuralplatform.ai;
location /v1/ {
proxy_pass https://runpod_inference;
proxy_http_version 1.1;
proxy_set_header Connection "";
proxy_set_header Host $proxy_host;
proxy_set_header X-Real-IP $remote_addr;
proxy_read_timeout 300s;
proxy_send_timeout 60s;
}
location /health {
return 200 'ok';
add_header Content-Type text/plain;
}
}
@@ -0,0 +1,172 @@
---
# RunPod Inference Load Balancer — Nginx ConfigMap
#
# Deploys a lightweight nginx reverse proxy inside the cluster that fans out
# requests to multiple RunPod inference endpoints. Update the upstream block
# with real pod proxy URLs after running runpod-provision.sh.
#
# Reload nginx after updating endpoints:
# kubectl rollout restart deployment/runpod-lb -n neuron-prod
apiVersion: v1
kind: ConfigMap
metadata:
name: runpod-lb-nginx-config
namespace: neuron-prod
labels:
app.kubernetes.io/name: runpod-lb
app.kubernetes.io/component: load-balancer
app.kubernetes.io/managed-by: argocd
data:
nginx.conf: |
worker_processes auto;
error_log /dev/stderr warn;
pid /tmp/nginx.pid;
events {
worker_connections 1024;
use epoll;
}
http {
access_log /dev/stdout combined;
# Aggressive upstream keepalive — RunPod pods are remote HTTPS
upstream runpod_inference {
least_conn;
# ----------------------------------------------------------------
# Add one line per active RunPod pod endpoint.
# Format: server <pod-id>-8000.proxy.runpod.net:443;
# Example:
# server abc123def456-8000.proxy.runpod.net:443;
# server xyz789uvw012-8000.proxy.runpod.net:443;
# ----------------------------------------------------------------
keepalive 32;
keepalive_requests 1000;
keepalive_timeout 75s;
}
# Health-check upstream (one known-good pod for active probing)
# Replace with real pod ID.
upstream runpod_healthcheck {
server REPLACE_POD_ID-8000.proxy.runpod.net:443;
}
server {
listen 8080;
server_name _;
# Proxy timeouts generous for LLM inference
proxy_read_timeout 300s;
proxy_send_timeout 60s;
proxy_connect_timeout 10s;
# ---- OpenAI-compatible inference passthrough ----
location /v1/ {
proxy_pass https://runpod_inference;
proxy_http_version 1.1;
proxy_set_header Connection "";
proxy_set_header Host $proxy_host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
# Stream tokens back to caller without buffering
proxy_buffering off;
proxy_cache off;
chunked_transfer_encoding on;
}
# ---- Cluster-internal health probe ----
location /health {
return 200 'ok\n';
add_header Content-Type text/plain;
}
# ---- Upstream model info (routed to first healthy pod) ----
location /v1/models {
proxy_pass https://runpod_inference;
proxy_http_version 1.1;
proxy_set_header Connection "";
proxy_set_header Host $proxy_host;
}
}
}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: runpod-lb
namespace: neuron-prod
labels:
app.kubernetes.io/name: runpod-lb
app.kubernetes.io/component: load-balancer
app.kubernetes.io/managed-by: argocd
spec:
replicas: 1
selector:
matchLabels:
app: runpod-lb
template:
metadata:
labels:
app: runpod-lb
spec:
containers:
- name: nginx
image: nginx:1.27-alpine
ports:
- containerPort: 8080
name: http
volumeMounts:
- name: nginx-config
mountPath: /etc/nginx/nginx.conf
subPath: nginx.conf
readOnly: true
livenessProbe:
httpGet:
path: /health
port: 8080
initialDelaySeconds: 5
periodSeconds: 15
readinessProbe:
httpGet:
path: /health
port: 8080
initialDelaySeconds: 3
periodSeconds: 10
resources:
requests:
cpu: "50m"
memory: "64Mi"
limits:
cpu: "200m"
memory: "128Mi"
securityContext:
runAsNonRoot: true
runAsUser: 101
allowPrivilegeEscalation: false
readOnlyRootFilesystem: false
volumes:
- name: nginx-config
configMap:
name: runpod-lb-nginx-config
---
apiVersion: v1
kind: Service
metadata:
name: runpod-lb
namespace: neuron-prod
labels:
app.kubernetes.io/name: runpod-lb
app.kubernetes.io/component: load-balancer
spec:
selector:
app: runpod-lb
ports:
- name: http
port: 80
targetPort: 8080
type: ClusterIP