feat(runpod): add inference pod templates, nginx LB, and provisioner script

Infrastructure readiness for RunPod inference workloads: (1) runpod-inference.yaml — ConfigMap with pod creation payloads for RTX 4090, A40 (single and dual GPU), and a custom template; (2) runpod-lb-configmap.yaml — nginx least-conn load balancer that distributes requests across inference endpoints (ConfigMap + Deployment + ClusterIP Service); (3) runpod-provision.sh — bash provisioner script that reads RUNPOD_API_KEY/HF_TOKEN, creates pods via GraphQL, polls until RUNNING, and outputs the endpoint URLs. This commit itself does NOT spin up any pods: the script only runs when invoked explicitly, and a --dry-run flag is available.
2026-04-25 01:20:55 -05:00
parent c847b22014
commit 8eb88a3116
4 changed files with 566 additions and 0 deletions
@@ -0,0 +1,243 @@
+#!/usr/bin/env bash
+# runpod-provision.sh — On-demand RunPod inference pod provisioner
+#
+# Usage:
+#   ./runpod-provision.sh --gpu-type "NVIDIA GeForce RTX 4090" \
+#                         --model "meta-llama/Meta-Llama-3-8B-Instruct" \
+#                         --replicas 2 \
+#                         [--quantization awq] \
+#                         [--max-model-len 8192] \
+#                         [--tensor-parallel 1] \
+#                         [--gpu-count 1] \
+#                         [--cloud-type SECURE] \
+#                         [--served-model-name NAME] \
+#                         [--volume-gb 80] \
+#                         [--wait-timeout 600] \
+#                         [--dry-run]
+#
+# Env vars (exported, or sourced from ~/Secrets/credentials/infrastructure.env):
+#   RUNPOD_API_KEY   — required; RunPod API key (from Vault secret/ai)
+#   HF_TOKEN         — optional; HuggingFace token for gated model downloads
+#                      (a warning is logged when it is missing)
+#
+# The script does NOT execute automatically; run it explicitly when you need pods.
+# It outputs one endpoint URL per replica to stdout, one per line; all log
+# messages go to stderr.
+
+set -euo pipefail
+
+# ---------------------------------------------------------------------------
+# Defaults
+# ---------------------------------------------------------------------------
+GPU_TYPE="NVIDIA GeForce RTX 4090"
+MODEL="meta-llama/Meta-Llama-3-8B-Instruct"
+REPLICAS=1
+QUANTIZATION="awq"
+MAX_MODEL_LEN=8192
+TENSOR_PARALLEL_SIZE=1
+GPU_MEMORY_UTILIZATION="0.92"
+CLOUD_TYPE="SECURE"
+SERVED_MODEL_NAME=""
+GPU_COUNT=1
+VOLUME_GB=80
+CONTAINER_DISK_GB=30
+MIN_VCPU=8
+MIN_MEMORY_GB=32
+DRY_RUN=false
+WAIT_TIMEOUT=600   # seconds to wait for RUNNING state
+
+RUNPOD_API="https://api.runpod.io/graphql"
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+log()  { printf '[%s] %s\n' "$(date -u +%H:%M:%S)" "$*" >&2; }
+die()  { log "ERROR: $*"; exit 1; }
+
+require_cmd() { command -v "$1" &>/dev/null || die "'$1' not found — install it first"; }
+
+# ---------------------------------------------------------------------------
+# Argument parsing
+# ---------------------------------------------------------------------------
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --gpu-type)            GPU_TYPE="$2";              shift 2 ;;
+    --model)               MODEL="$2";                 shift 2 ;;
+    --replicas)            REPLICAS="$2";              shift 2 ;;
+    --quantization)        QUANTIZATION="$2";          shift 2 ;;
+    --max-model-len)       MAX_MODEL_LEN="$2";         shift 2 ;;
+    --tensor-parallel)     TENSOR_PARALLEL_SIZE="$2";  shift 2 ;;
+    --gpu-count)           GPU_COUNT="$2";             shift 2 ;;
+    --cloud-type)          CLOUD_TYPE="$2";            shift 2 ;;
+    --served-model-name)   SERVED_MODEL_NAME="$2";     shift 2 ;;
+    --volume-gb)           VOLUME_GB="$2";             shift 2 ;;
+    --wait-timeout)        WAIT_TIMEOUT="$2";          shift 2 ;;
+    --dry-run)             DRY_RUN=true;               shift   ;;
+    -h|--help)
+      grep '^#' "$0" | grep -v '^#!/' | sed 's/^# \?//'
+      exit 0
+      ;;
+    *) die "Unknown argument: $1" ;;
+  esac
+done
+
+# ---------------------------------------------------------------------------
+# Environment
+# ---------------------------------------------------------------------------
+require_cmd curl
+require_cmd jq
+
+# Load infrastructure secrets when EITHER credential is missing. Checking only
+# RUNPOD_API_KEY would skip the file, and with it HF_TOKEN, whenever the API
+# key had been exported on its own. Values already present in the environment
+# win over whatever the file defines for the two credentials.
+if [[ -z "${RUNPOD_API_KEY:-}" || -z "${HF_TOKEN:-}" ]]; then
+  INFRA_ENV="$HOME/Secrets/credentials/infrastructure.env"
+  if [[ -f "$INFRA_ENV" ]]; then
+    preset_api_key="${RUNPOD_API_KEY:-}"
+    preset_hf_token="${HF_TOKEN:-}"
+    # `set -a` exports every variable the file assigns.
+    # shellcheck disable=SC1090
+    set -a; source "$INFRA_ENV"; set +a
+    if [[ -n "$preset_api_key" ]]; then RUNPOD_API_KEY="$preset_api_key"; fi
+    if [[ -n "$preset_hf_token" ]]; then HF_TOKEN="$preset_hf_token"; fi
+    unset preset_api_key preset_hf_token
+  fi
+fi
+
+# The API key is mandatory; a missing HF token only produces a warning.
+[[ -n "${RUNPOD_API_KEY:-}" ]] || die "RUNPOD_API_KEY not set. Export it or ensure ~/Secrets/credentials/infrastructure.env is present."
+[[ -n "${HF_TOKEN:-}" ]]       || log "WARNING: HF_TOKEN not set — gated models (Llama etc.) will fail to download."
+
+# Derive served model name from model ID if not provided: take the last path
+# component, lower-case it, and drop everything except a-z, 0-9 and '-'.
+if [[ -z "$SERVED_MODEL_NAME" ]]; then
+  SERVED_MODEL_NAME="$(basename "$MODEL" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-')"
+fi
+
+# Build docker args (omit --quantization flag if value is empty).
+# The \$NAME tokens are kept literal on purpose: they are sent to RunPod as
+# text and refer to the env entries of the pod, not to variables of this shell.
+DOCKER_ARGS="--host 0.0.0.0 --port 8000 --model \$MODEL_ID"
+if [[ -n "$QUANTIZATION" ]]; then
+  DOCKER_ARGS="$DOCKER_ARGS --quantization \$QUANTIZATION"
+fi
+DOCKER_ARGS="$DOCKER_ARGS --max-model-len \$MAX_MODEL_LEN --tensor-parallel-size \$TENSOR_PARALLEL_SIZE --gpu-memory-utilization \$GPU_MEMORY_UTILIZATION --served-model-name \$SERVED_MODEL_NAME"
+
+# ---------------------------------------------------------------------------
+# GraphQL helpers
+# ---------------------------------------------------------------------------
+gql() {
+  local query="$1"
+  curl -s --fail \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer ${RUNPOD_API_KEY}" \
+    -d "{\"query\": $(jq -Rn --arg q "$query" '$q')}" \
+    "$RUNPOD_API"
+}
+
+create_pod() {
+  local pod_name="$1"
+  local mutation
+  mutation=$(cat <<MUTATION
+mutation {
+  podFindAndDeployOnDemand(input: {
+    name: "$pod_name"
+    imageName: "vllm/vllm-openai:latest"
+    gpuTypeId: "$GPU_TYPE"
+    cloudType: $CLOUD_TYPE
+    gpuCount: $GPU_COUNT
+    volumeInGb: $VOLUME_GB
+    containerDiskInGb: $CONTAINER_DISK_GB
+    minVcpuCount: $MIN_VCPU
+    minMemoryInGb: $MIN_MEMORY_GB
+    ports: "8000/http"
+    env: [
+      { key: "MODEL_ID",               value: "$MODEL" }
+      { key: "QUANTIZATION",           value: "$QUANTIZATION" }
+      { key: "MAX_MODEL_LEN",          value: "$MAX_MODEL_LEN" }
+      { key: "TENSOR_PARALLEL_SIZE",   value: "$TENSOR_PARALLEL_SIZE" }
+      { key: "GPU_MEMORY_UTILIZATION", value: "$GPU_MEMORY_UTILIZATION" }
+      { key: "SERVED_MODEL_NAME",      value: "$SERVED_MODEL_NAME" }
+      { key: "HF_TOKEN",               value: "${HF_TOKEN:-}" }
+    ]
+    dockerArgs: "$DOCKER_ARGS"
+  }) {
+    id
+    name
+    desiredStatus
+  }
+}
+MUTATION
+)
+  gql "$mutation"
+}
+
+wait_for_running() {
+  local pod_id="$1"
+  local deadline=$(( $(date +%s) + WAIT_TIMEOUT ))
+  log "Waiting for pod $pod_id to reach RUNNING state (timeout ${WAIT_TIMEOUT}s)..."
+
+  while true; do
+    local resp
+    resp=$(gql "{ pod(input: { podId: \"$pod_id\" }) { id desiredStatus runtime { ports { ip isIpPublic privatePort publicPort type } } } }")
+    local status
+    status=$(printf '%s' "$resp" | jq -r '.data.pod.desiredStatus // "UNKNOWN"')
+
+    if [[ "$status" == "RUNNING" ]]; then
+      local endpoint
+      endpoint=$(printf '%s' "$resp" | jq -r '
+        .data.pod.runtime.ports[]?
+        | select(.privatePort == 8000 and .isIpPublic == true)
+        | "https://\(.ip):\(.publicPort)"
+      ' 2>/dev/null || true)
+
+      # Fall back to RunPod proxy URL if no public port exposed
+      if [[ -z "$endpoint" ]]; then
+        endpoint="https://${pod_id}-8000.proxy.runpod.net"
+      fi
+
+      printf '%s' "$endpoint"
+      return 0
+    fi
+
+    if [[ $(date +%s) -ge $deadline ]]; then
+      die "Timed out waiting for pod $pod_id — last status: $status"
+    fi
+
+    log "  pod $pod_id status: $status — retrying in 15s..."
+    sleep 15
+  done
+}
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+log "RunPod inference provisioner"
+log "  GPU type  : $GPU_TYPE"
+log "  Model     : $MODEL"
+log "  Replicas  : $REPLICAS"
+log "  Quant     : ${QUANTIZATION:-none}"
+log "  TP size   : $TENSOR_PARALLEL_SIZE"
+log "  Cloud     : $CLOUD_TYPE"
+log "  Dry run   : $DRY_RUN"
+
+TIMESTAMP=$(date -u +%Y%m%d%H%M%S)
+declare -a POD_IDS=()
+
+# Abort provisioning, first listing the pods this run has already created so
+# that a failure on a later replica does not leave them running unnoticed.
+abort_provisioning() {
+  if (( ${#POD_IDS[@]} > 0 )); then
+    log "Pods already created by this run (terminate them manually if unwanted): ${POD_IDS[*]}"
+  fi
+  die "$*"
+}
+
+# Phase 1: request every replica up front so the pods start in parallel.
+for (( i = 1; i <= REPLICAS; i++ )); do
+  POD_NAME="neuron-inf-${TIMESTAMP}-r${i}"
+
+  if [[ "$DRY_RUN" == true ]]; then
+    log "[dry-run] Would create pod: $POD_NAME"
+    continue
+  fi
+
+  log "Creating pod $i/$REPLICAS: $POD_NAME ..."
+  # Check the transport result explicitly; otherwise a failed request would
+  # end the script through `set -e` without any explanation.
+  RESP=$(create_pod "$POD_NAME") \
+    || abort_provisioning "RunPod API request failed while creating $POD_NAME"
+  ERR=$(printf '%s' "$RESP" | jq -r '.errors[0].message // empty') \
+    || abort_provisioning "Could not parse RunPod API response: $RESP"
+  if [[ -n "$ERR" ]]; then
+    abort_provisioning "RunPod API error: $ERR"
+  fi
+
+  POD_ID=$(printf '%s' "$RESP" | jq -r '.data.podFindAndDeployOnDemand.id')
+  if [[ -z "$POD_ID" || "$POD_ID" == "null" ]]; then
+    abort_provisioning "No pod ID returned. Full response: $RESP"
+  fi
+
+  log "  Created pod ID: $POD_ID"
+  POD_IDS+=("$POD_ID")
+done
+
+if [[ "$DRY_RUN" == true ]]; then
+  log "[dry-run] Complete — no pods were created."
+  exit 0
+fi
+
+# Nothing was created (e.g. zero replicas requested): stop before expanding an
+# empty array, which older bash versions reject under `set -u`.
+if (( ${#POD_IDS[@]} == 0 )); then
+  log "No pods were created — nothing to wait for."
+  exit 0
+fi
+
+# Phase 2: wait for each pod in turn and collect its endpoint.
+log ""
+log "Waiting for all $REPLICAS pod(s) to become RUNNING..."
+declare -a ENDPOINTS=()
+for POD_ID in "${POD_IDS[@]}"; do
+  # wait_for_running exits non-zero on timeout, which fails this assignment
+  # and stops the script through `set -e`.
+  EP=$(wait_for_running "$POD_ID")
+  ENDPOINTS+=("$EP")
+  log "  Pod $POD_ID ready: $EP"
+done
+
+# Phase 3: the only stdout output — one endpoint URL per line.
+log ""
+log "=== Inference endpoints ==="
+for EP in "${ENDPOINTS[@]}"; do
+  printf '%s\n' "$EP"
+done
@@ -12,3 +12,5 @@ resources:
  - ingress.yaml
  - license-ingressroute.yaml
  - backup-cronjob.yaml
+  - runpod-inference.yaml
+  - runpod-lb-configmap.yaml
@@ -0,0 +1,149 @@
+---
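+# RunPod Inference ConfigMap
+# Holds reference pod creation payloads for on-demand vLLM inference pods,
+# for use by any k8s Job (or operator) that provisions RunPod pods.
+# NOTE: runpod-provision.sh builds its own GraphQL mutation from command-line
+# flags and does not read this ConfigMap; keep the two in sync by hand.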
+#
+# GPU IDs sourced from RunPod GraphQL { gpuTypes { id } } — verified 2026-04-25.
+# RTX 4090:  "NVIDIA GeForce RTX 4090"  (24 GB VRAM, secure+community cloud)
+# A40:       "NVIDIA A40"               (48 GB VRAM, secure cloud only)
+# L40S:      "NVIDIA L40S"              (48 GB VRAM, secure+community cloud)
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: runpod-inference-templates
+  namespace: neuron-prod
+  labels:
+    app.kubernetes.io/name: runpod-inference
+    app.kubernetes.io/component: inference-provisioner
+    app.kubernetes.io/managed-by: argocd
+data:
+  # --- Llama-3-8B on RTX 4090 (24 GB) ---
+  # Fits quantised (GPTQ/AWQ) or full fp16 8B models.
+  pod-template-rtx4090-llama3-8b.json: |
+    {
+      "name": "neuron-llama3-8b-rtx4090",
+      "imageName": "vllm/vllm-openai:latest",
+      "gpuTypeId": "NVIDIA GeForce RTX 4090",
+      "cloudType": "SECURE",
+      "gpuCount": 1,
+      "volumeInGb": 80,
+      "containerDiskInGb": 30,
+      "minVcpuCount": 8,
+      "minMemoryInGb": 32,
+      "ports": "8000/http",
+      "env": [
+        { "key": "MODEL_ID",            "value": "meta-llama/Meta-Llama-3-8B-Instruct" },
+        { "key": "QUANTIZATION",        "value": "awq" },
+        { "key": "MAX_MODEL_LEN",       "value": "8192" },
+        { "key": "TENSOR_PARALLEL_SIZE","value": "1" },
+        { "key": "GPU_MEMORY_UTILIZATION", "value": "0.92" },
+        { "key": "SERVED_MODEL_NAME",   "value": "llama3-8b" },
+        { "key": "HF_TOKEN",            "value": "REPLACE_WITH_HF_TOKEN" }
+      ],
+      "dockerArgs": "--host 0.0.0.0 --port 8000 --model $MODEL_ID --quantization $QUANTIZATION --max-model-len $MAX_MODEL_LEN --tensor-parallel-size $TENSOR_PARALLEL_SIZE --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --served-model-name $SERVED_MODEL_NAME"
+    }
+
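+  # --- Llama-3-70B on A40 (48 GB) — single GPU with AWQ quantisation ---
+  # NOTE(review): MODEL_ID below is the unquantised upstream repo; vLLM's
+  # --quantization awq presumably needs AWQ-quantised weights — confirm.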
+  pod-template-a40-llama3-70b.json: |
+    {
+      "name": "neuron-llama3-70b-a40",
+      "imageName": "vllm/vllm-openai:latest",
+      "gpuTypeId": "NVIDIA A40",
+      "cloudType": "SECURE",
+      "gpuCount": 1,
+      "volumeInGb": 150,
+      "containerDiskInGb": 50,
+      "minVcpuCount": 16,
+      "minMemoryInGb": 64,
+      "ports": "8000/http",
+      "env": [
+        { "key": "MODEL_ID",            "value": "meta-llama/Meta-Llama-3-70B-Instruct" },
+        { "key": "QUANTIZATION",        "value": "awq" },
+        { "key": "MAX_MODEL_LEN",       "value": "4096" },
+        { "key": "TENSOR_PARALLEL_SIZE","value": "1" },
+        { "key": "GPU_MEMORY_UTILIZATION", "value": "0.90" },
+        { "key": "SERVED_MODEL_NAME",   "value": "llama3-70b" },
+        { "key": "HF_TOKEN",            "value": "REPLACE_WITH_HF_TOKEN" }
+      ],
+      "dockerArgs": "--host 0.0.0.0 --port 8000 --model $MODEL_ID --quantization $QUANTIZATION --max-model-len $MAX_MODEL_LEN --tensor-parallel-size $TENSOR_PARALLEL_SIZE --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --served-model-name $SERVED_MODEL_NAME"
+    }
+
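+  # --- Llama-3-70B on 2x A40 (tensor parallel, unquantised fp16) ---
+  # NOTE(review): 70B parameters at fp16 is roughly 140 GB of weights, more
+  # than the 96 GB that two A40s provide — confirm this template can load.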
+  pod-template-2xa40-llama3-70b-fp16.json: |
+    {
+      "name": "neuron-llama3-70b-2xa40-fp16",
+      "imageName": "vllm/vllm-openai:latest",
+      "gpuTypeId": "NVIDIA A40",
+      "cloudType": "SECURE",
+      "gpuCount": 2,
+      "volumeInGb": 150,
+      "containerDiskInGb": 50,
+      "minVcpuCount": 16,
+      "minMemoryInGb": 64,
+      "ports": "8000/http",
+      "env": [
+        { "key": "MODEL_ID",            "value": "meta-llama/Meta-Llama-3-70B-Instruct" },
+        { "key": "QUANTIZATION",        "value": "" },
+        { "key": "MAX_MODEL_LEN",       "value": "8192" },
+        { "key": "TENSOR_PARALLEL_SIZE","value": "2" },
+        { "key": "GPU_MEMORY_UTILIZATION", "value": "0.90" },
+        { "key": "SERVED_MODEL_NAME",   "value": "llama3-70b" },
+        { "key": "HF_TOKEN",            "value": "REPLACE_WITH_HF_TOKEN" }
+      ],
+      "dockerArgs": "--host 0.0.0.0 --port 8000 --model $MODEL_ID --max-model-len $MAX_MODEL_LEN --tensor-parallel-size $TENSOR_PARALLEL_SIZE --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --served-model-name $SERVED_MODEL_NAME"
+    }
+
+  # --- Generic custom model template ---
+  pod-template-custom.json: |
+    {
+      "name": "neuron-inference-custom",
+      "imageName": "vllm/vllm-openai:latest",
+      "gpuTypeId": "NVIDIA GeForce RTX 4090",
+      "cloudType": "SECURE",
+      "gpuCount": 1,
+      "volumeInGb": 100,
+      "containerDiskInGb": 30,
+      "minVcpuCount": 8,
+      "minMemoryInGb": 32,
+      "ports": "8000/http",
+      "env": [
+        { "key": "MODEL_ID",            "value": "REPLACE_WITH_MODEL_ID" },
+        { "key": "QUANTIZATION",        "value": "awq" },
+        { "key": "MAX_MODEL_LEN",       "value": "4096" },
+        { "key": "TENSOR_PARALLEL_SIZE","value": "1" },
+        { "key": "GPU_MEMORY_UTILIZATION", "value": "0.90" },
+        { "key": "SERVED_MODEL_NAME",   "value": "model" },
+        { "key": "HF_TOKEN",            "value": "REPLACE_WITH_HF_TOKEN" }
+      ],
+      "dockerArgs": "--host 0.0.0.0 --port 8000 --model $MODEL_ID --quantization $QUANTIZATION --max-model-len $MAX_MODEL_LEN --tensor-parallel-size $TENSOR_PARALLEL_SIZE --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --served-model-name $SERVED_MODEL_NAME"
+    }
+
+  # --- Nginx load balancer config (reference copy, maintained by hand) ---
+  # runpod-provision.sh only prints endpoint URLs; it does not edit this file.
+  # Paste pod endpoint URLs under the upstream block when pods are running.
+  # The runpod-lb Deployment mounts the separate runpod-lb-nginx-config
+  # ConfigMap, not this key.
+  nginx-lb.conf: |
+    upstream runpod_inference {
+        least_conn;
+        # Populated dynamically — add lines like:
+        #   server <pod-id>-8000.proxy.runpod.net:443;
+        keepalive 32;
+    }
+
+    server {
+        listen 80;
+        server_name inference.neuralplatform.ai;
+
+        location /v1/ {
+            proxy_pass https://runpod_inference;
+            proxy_http_version 1.1;
+            proxy_set_header Connection "";
+            proxy_set_header Host $proxy_host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_read_timeout 300s;
+            proxy_send_timeout 60s;
+        }
+
+        location /health {
+            return 200 'ok';
+            add_header Content-Type text/plain;
+        }
+    }
@@ -0,0 +1,172 @@
+---
+# RunPod Inference Load Balancer — Nginx ConfigMap
+#
+# Deploys a lightweight nginx reverse proxy inside the cluster that fans out
+# requests to multiple RunPod inference endpoints.  Update the upstream block
+# with real pod proxy URLs after running runpod-provision.sh.
+#
+# Until real endpoints are added, the upstream holds a single placeholder
+# server marked `down`: nginx refuses to start with an empty upstream block,
+# and with the placeholder it starts and answers /v1/ requests with 502.
+#
+# Reload nginx after updating endpoints:
+#   kubectl rollout restart deployment/runpod-lb -n neuron-prod
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: runpod-lb-nginx-config
+  namespace: neuron-prod
+  labels:
+    app.kubernetes.io/name: runpod-lb
+    app.kubernetes.io/component: load-balancer
+    app.kubernetes.io/managed-by: argocd
+data:
+  nginx.conf: |
+    worker_processes auto;
+    error_log /dev/stderr warn;
+    pid /tmp/nginx.pid;
+
+    events {
+        worker_connections 1024;
+        use epoll;
+    }
+
+    http {
+        access_log /dev/stdout combined;
+
+        # The container runs as a non-root user, so temp files must go to a
+        # writable location instead of the default paths under /var/cache/nginx.
+        client_body_temp_path /tmp/client_temp;
+        proxy_temp_path       /tmp/proxy_temp;
+        fastcgi_temp_path     /tmp/fastcgi_temp;
+        uwsgi_temp_path       /tmp/uwsgi_temp;
+        scgi_temp_path        /tmp/scgi_temp;
+
+        # Aggressive upstream keepalive — RunPod pods are remote HTTPS
+        upstream runpod_inference {
+            least_conn;
+
+            # ----------------------------------------------------------------
+            # Add one line per active RunPod pod endpoint.
+            # Format:  server <pod-id>-8000.proxy.runpod.net:443;
+            # Example:
+            #   server abc123def456-8000.proxy.runpod.net:443;
+            #   server xyz789uvw012-8000.proxy.runpod.net:443;
+            # ----------------------------------------------------------------
+
+            # Placeholder so the config loads before any pod exists; delete it
+            # once at least one real server line has been added above.
+            server 127.0.0.1:65535 down;
+
+            keepalive 32;
+            keepalive_requests 1000;
+            keepalive_timeout 75s;
+        }
+
+        # Health-check upstream (one known-good pod for active probing).
+        # No location references it yet, so it stays commented out: nginx
+        # resolves upstream hostnames at startup and the placeholder name is
+        # not a real pod.  Uncomment with a real pod ID when it is needed.
+        # upstream runpod_healthcheck {
+        #     server REPLACE_POD_ID-8000.proxy.runpod.net:443;
+        # }
+
+        server {
+            listen 8080;
+            server_name _;
+
+            # Proxy timeouts generous for LLM inference
+            proxy_read_timeout    300s;
+            proxy_send_timeout     60s;
+            proxy_connect_timeout  10s;
+
+            # ---- OpenAI-compatible inference passthrough ----
+            # NOTE(review): with an upstream group, $proxy_host is the group
+            # name ("runpod_inference"), not the pod's proxy hostname.  The
+            # RunPod proxy presumably routes on Host/SNI, so this likely needs
+            # a per-pod Host header and proxy_ssl_server_name — confirm with a
+            # real endpoint before relying on it.
+            location /v1/ {
+                proxy_pass          https://runpod_inference;
+                proxy_http_version  1.1;
+                proxy_set_header    Connection      "";
+                proxy_set_header    Host            $proxy_host;
+                proxy_set_header    X-Real-IP       $remote_addr;
+                proxy_set_header    X-Forwarded-For $proxy_add_x_forwarded_for;
+
+                # Stream tokens back to caller without buffering
+                proxy_buffering         off;
+                proxy_cache             off;
+                chunked_transfer_encoding on;
+            }
+
+            # ---- Cluster-internal health probe ----
+            # Answers locally and never touches the upstream.  default_type
+            # sets the Content-Type of the generated response; add_header
+            # would append a second Content-Type header instead.
+            location /health {
+                default_type text/plain;
+                return 200 'ok\n';
+            }
+
+            # ---- Upstream model info ----
+            # Goes through the same least_conn upstream as /v1/, so any pod
+            # may answer; response buffering stays at the nginx default here.
+            location /v1/models {
+                proxy_pass          https://runpod_inference;
+                proxy_http_version  1.1;
+                proxy_set_header    Connection "";
+                proxy_set_header    Host       $proxy_host;
+            }
+        }
+    }
+
+---
+# Deployment running a single nginx pod that serves the load-balancer config
+# from the runpod-lb-nginx-config ConfigMap.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: runpod-lb
+  namespace: neuron-prod
+  labels:
+    app.kubernetes.io/name: runpod-lb
+    app.kubernetes.io/component: load-balancer
+    app.kubernetes.io/managed-by: argocd
+spec:
+  replicas: 1
+  selector:
+    # Pods are selected by the plain `app` label, which differs from the
+    # app.kubernetes.io/* labels carried by the Deployment object itself.
+    matchLabels:
+      app: runpod-lb
+  template:
+    metadata:
+      labels:
+        app: runpod-lb
+    spec:
+      containers:
+        - name: nginx
+          image: nginx:1.27-alpine
+          ports:
+            # Matches the `listen 8080` directive in the mounted nginx.conf.
+            - containerPort: 8080
+              name: http
+          volumeMounts:
+            # subPath replaces only the nginx.conf file and leaves the rest of
+            # /etc/nginx from the image in place.
+            - name: nginx-config
+              mountPath: /etc/nginx/nginx.conf
+              subPath: nginx.conf
+              readOnly: true
+          # Both probes hit /health, which nginx.conf answers locally with 200.
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 8080
+            initialDelaySeconds: 5
+            periodSeconds: 15
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 8080
+            initialDelaySeconds: 3
+            periodSeconds: 10
+          resources:
+            requests:
+              cpu: "50m"
+              memory: "64Mi"
+            limits:
+              cpu: "200m"
+              memory: "128Mi"
+          securityContext:
+            # Runs as UID 101 rather than root; nginx.conf accordingly listens
+            # on 8080 and writes its pid file under /tmp.
+            runAsNonRoot: true
+            runAsUser: 101
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: false
+      volumes:
+        - name: nginx-config
+          configMap:
+            name: runpod-lb-nginx-config
+
+---
+# ClusterIP Service exposing the nginx load balancer inside the cluster:
+# port 80 on the Service forwards to container port 8080.
+apiVersion: v1
+kind: Service
+metadata:
+  name: runpod-lb
+  namespace: neuron-prod
+  labels:
+    app.kubernetes.io/name: runpod-lb
+    app.kubernetes.io/component: load-balancer
+    # Same managed-by label as the ConfigMap and Deployment in this file.
+    app.kubernetes.io/managed-by: argocd
+spec:
+  # Must match the pod template labels of the runpod-lb Deployment.
+  selector:
+    app: runpod-lb
+  ports:
+    - name: http
+      port: 80
+      targetPort: 8080
+  type: ClusterIP