diff --git a/scripts/runpod-provision.sh b/scripts/runpod-provision.sh new file mode 100755 index 0000000..2a5125b --- /dev/null +++ b/scripts/runpod-provision.sh @@ -0,0 +1,243 @@ +#!/usr/bin/env bash +# runpod-provision.sh — On-demand RunPod inference pod provisioner +# +# Usage: +# ./runpod-provision.sh --gpu-type "NVIDIA GeForce RTX 4090" \ +# --model "meta-llama/Meta-Llama-3-8B-Instruct" \ +# --replicas 2 \ +# [--quantization awq] \ +# [--max-model-len 8192] \ +# [--cloud-type SECURE] \ +# [--dry-run] +# +# Env vars required (or sourced from ~/Secrets/credentials/infrastructure.env): +# RUNPOD_API_KEY — RunPod API key (from Vault secret/ai) +# HF_TOKEN — HuggingFace token for gated model downloads +# +# The script does NOT execute automatically; run it explicitly when you need pods. +# It outputs one endpoint URL per replica to stdout, one per line. + +set -euo pipefail + +# --------------------------------------------------------------------------- +# Defaults +# --------------------------------------------------------------------------- +GPU_TYPE="NVIDIA GeForce RTX 4090" +MODEL="meta-llama/Meta-Llama-3-8B-Instruct" +REPLICAS=1 +QUANTIZATION="awq" +MAX_MODEL_LEN=8192 +TENSOR_PARALLEL_SIZE=1 +GPU_MEMORY_UTILIZATION="0.92" +CLOUD_TYPE="SECURE" +SERVED_MODEL_NAME="" +GPU_COUNT=1 +VOLUME_GB=80 +CONTAINER_DISK_GB=30 +MIN_VCPU=8 +MIN_MEMORY_GB=32 +DRY_RUN=false +WAIT_TIMEOUT=600 # seconds to wait for RUNNING state + +RUNPOD_API="https://api.runpod.io/graphql" + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- +log() { printf '[%s] %s\n' "$(date -u +%H:%M:%S)" "$*" >&2; } +die() { log "ERROR: $*"; exit 1; } + +require_cmd() { command -v "$1" &>/dev/null || die "'$1' not found — install it first"; } + +# --------------------------------------------------------------------------- +# Argument parsing +# 
--------------------------------------------------------------------------- +while [[ $# -gt 0 ]]; do + case "$1" in + --gpu-type) GPU_TYPE="$2"; shift 2 ;; + --model) MODEL="$2"; shift 2 ;; + --replicas) REPLICAS="$2"; shift 2 ;; + --quantization) QUANTIZATION="$2"; shift 2 ;; + --max-model-len) MAX_MODEL_LEN="$2"; shift 2 ;; + --tensor-parallel) TENSOR_PARALLEL_SIZE="$2"; shift 2 ;; + --gpu-count) GPU_COUNT="$2"; shift 2 ;; + --cloud-type) CLOUD_TYPE="$2"; shift 2 ;; + --served-model-name) SERVED_MODEL_NAME="$2"; shift 2 ;; + --volume-gb) VOLUME_GB="$2"; shift 2 ;; + --wait-timeout) WAIT_TIMEOUT="$2"; shift 2 ;; + --dry-run) DRY_RUN=true; shift ;; + -h|--help) + grep '^#' "$0" | grep -v '^#!/' | sed 's/^# \?//' + exit 0 + ;; + *) die "Unknown argument: $1" ;; + esac +done + +# --------------------------------------------------------------------------- +# Environment +# --------------------------------------------------------------------------- +require_cmd curl +require_cmd jq + +# Try loading infrastructure secrets if not already set +if [[ -z "${RUNPOD_API_KEY:-}" ]]; then + INFRA_ENV="$HOME/Secrets/credentials/infrastructure.env" + if [[ -f "$INFRA_ENV" ]]; then + set -a; source "$INFRA_ENV"; set +a + fi +fi + +[[ -n "${RUNPOD_API_KEY:-}" ]] || die "RUNPOD_API_KEY not set. Export it or ensure ~/Secrets/credentials/infrastructure.env is present." +[[ -n "${HF_TOKEN:-}" ]] || log "WARNING: HF_TOKEN not set — gated models (Llama etc.) will fail to download." 
+ +# Derive served model name from model ID if not provided +if [[ -z "$SERVED_MODEL_NAME" ]]; then + SERVED_MODEL_NAME="$(basename "$MODEL" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-')" +fi + +# Build docker args (omit --quantization flag if value is empty) +DOCKER_ARGS="--host 0.0.0.0 --port 8000 --model \$MODEL_ID" +[[ -n "$QUANTIZATION" ]] && DOCKER_ARGS="$DOCKER_ARGS --quantization \$QUANTIZATION" +DOCKER_ARGS="$DOCKER_ARGS --max-model-len \$MAX_MODEL_LEN --tensor-parallel-size \$TENSOR_PARALLEL_SIZE --gpu-memory-utilization \$GPU_MEMORY_UTILIZATION --served-model-name \$SERVED_MODEL_NAME" + +# --------------------------------------------------------------------------- +# GraphQL helpers +# --------------------------------------------------------------------------- +gql() { + local query="$1" + curl -s --fail \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer ${RUNPOD_API_KEY}" \ + -d "{\"query\": $(jq -Rn --arg q "$query" '$q')}" \ + "$RUNPOD_API" +} + +create_pod() { + local pod_name="$1" + local mutation + mutation=$(cat </dev/null || true) + + # Fall back to RunPod proxy URL if no public port exposed + if [[ -z "$endpoint" ]]; then + endpoint="https://${pod_id}-8000.proxy.runpod.net" + fi + + printf '%s' "$endpoint" + return 0 + fi + + if [[ $(date +%s) -ge $deadline ]]; then + die "Timed out waiting for pod $pod_id — last status: $status" + fi + + log " pod $pod_id status: $status — retrying in 15s..." 
+ sleep 15 + done +} + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- +log "RunPod inference provisioner" +log " GPU type : $GPU_TYPE" +log " Model : $MODEL" +log " Replicas : $REPLICAS" +log " Quant : ${QUANTIZATION:-none}" +log " TP size : $TENSOR_PARALLEL_SIZE" +log " Cloud : $CLOUD_TYPE" +log " Dry run : $DRY_RUN" + +TIMESTAMP=$(date -u +%Y%m%d%H%M%S) +declare -a POD_IDS=() + +for i in $(seq 1 "$REPLICAS"); do + POD_NAME="neuron-inf-${TIMESTAMP}-r${i}" + + if $DRY_RUN; then + log "[dry-run] Would create pod: $POD_NAME" + continue + fi + + log "Creating pod $i/$REPLICAS: $POD_NAME ..." + RESP=$(create_pod "$POD_NAME") + ERR=$(printf '%s' "$RESP" | jq -r '.errors[0].message // empty') + [[ -n "$ERR" ]] && die "RunPod API error: $ERR" + + POD_ID=$(printf '%s' "$RESP" | jq -r '.data.podFindAndDeployOnDemand.id') + [[ -z "$POD_ID" || "$POD_ID" == "null" ]] && die "No pod ID returned. Full response: $RESP" + + log " Created pod ID: $POD_ID" + POD_IDS+=("$POD_ID") +done + +if $DRY_RUN; then + log "[dry-run] Complete — no pods were created." + exit 0 +fi + +log "" +log "Waiting for all $REPLICAS pod(s) to become RUNNING..." 
+declare -a ENDPOINTS=() +for POD_ID in "${POD_IDS[@]}"; do + EP=$(wait_for_running "$POD_ID") + ENDPOINTS+=("$EP") + log " Pod $POD_ID ready: $EP" +done + +log "" +log "=== Inference endpoints ===" +for EP in "${ENDPOINTS[@]}"; do + printf '%s\n' "$EP" +done diff --git a/servers/legion/k8s/neuron-technologies/prod/kustomization.yaml b/servers/legion/k8s/neuron-technologies/prod/kustomization.yaml index 9fa9a13..15f513d 100644 --- a/servers/legion/k8s/neuron-technologies/prod/kustomization.yaml +++ b/servers/legion/k8s/neuron-technologies/prod/kustomization.yaml @@ -12,3 +12,5 @@ resources: - ingress.yaml - license-ingressroute.yaml - backup-cronjob.yaml + - runpod-inference.yaml + - runpod-lb-configmap.yaml diff --git a/servers/legion/k8s/neuron-technologies/prod/runpod-inference.yaml b/servers/legion/k8s/neuron-technologies/prod/runpod-inference.yaml new file mode 100644 index 0000000..c9a7437 --- /dev/null +++ b/servers/legion/k8s/neuron-technologies/prod/runpod-inference.yaml @@ -0,0 +1,149 @@ +--- +# RunPod Inference ConfigMap +# Holds pod creation payloads for on-demand vLLM inference pods. +# Used by the runpod-provision.sh script and any k8s Job that provisions RunPod pods. +# +# GPU IDs sourced from RunPod GraphQL { gpuTypes { id } } — verified 2026-04-25. +# RTX 4090: "NVIDIA GeForce RTX 4090" (24 GB VRAM, secure+community cloud) +# A40: "NVIDIA A40" (48 GB VRAM, secure cloud only) +# L40S: "NVIDIA L40S" (48 GB VRAM, secure+community cloud) +apiVersion: v1 +kind: ConfigMap +metadata: + name: runpod-inference-templates + namespace: neuron-prod + labels: + app.kubernetes.io/name: runpod-inference + app.kubernetes.io/component: inference-provisioner + app.kubernetes.io/managed-by: argocd +data: + # --- Llama-3-8B on RTX 4090 (24 GB) --- + # Fits quantised (GPTQ/AWQ) or full fp16 8B models. 
+ pod-template-rtx4090-llama3-8b.json: | + { + "name": "neuron-llama3-8b-rtx4090", + "imageName": "vllm/vllm-openai:latest", + "gpuTypeId": "NVIDIA GeForce RTX 4090", + "cloudType": "SECURE", + "gpuCount": 1, + "volumeInGb": 80, + "containerDiskInGb": 30, + "minVcpuCount": 8, + "minMemoryInGb": 32, + "ports": "8000/http", + "env": [ + { "key": "MODEL_ID", "value": "meta-llama/Meta-Llama-3-8B-Instruct" }, + { "key": "QUANTIZATION", "value": "awq" }, + { "key": "MAX_MODEL_LEN", "value": "8192" }, + { "key": "TENSOR_PARALLEL_SIZE","value": "1" }, + { "key": "GPU_MEMORY_UTILIZATION", "value": "0.92" }, + { "key": "SERVED_MODEL_NAME", "value": "llama3-8b" }, + { "key": "HF_TOKEN", "value": "REPLACE_WITH_HF_TOKEN" } + ], + "dockerArgs": "--host 0.0.0.0 --port 8000 --model $MODEL_ID --quantization $QUANTIZATION --max-model-len $MAX_MODEL_LEN --tensor-parallel-size $TENSOR_PARALLEL_SIZE --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --served-model-name $SERVED_MODEL_NAME" + } + + # --- Llama-3-70B on A40 (48 GB) — single GPU with AWQ quantisation --- + pod-template-a40-llama3-70b.json: | + { + "name": "neuron-llama3-70b-a40", + "imageName": "vllm/vllm-openai:latest", + "gpuTypeId": "NVIDIA A40", + "cloudType": "SECURE", + "gpuCount": 1, + "volumeInGb": 150, + "containerDiskInGb": 50, + "minVcpuCount": 16, + "minMemoryInGb": 64, + "ports": "8000/http", + "env": [ + { "key": "MODEL_ID", "value": "meta-llama/Meta-Llama-3-70B-Instruct" }, + { "key": "QUANTIZATION", "value": "awq" }, + { "key": "MAX_MODEL_LEN", "value": "4096" }, + { "key": "TENSOR_PARALLEL_SIZE","value": "1" }, + { "key": "GPU_MEMORY_UTILIZATION", "value": "0.90" }, + { "key": "SERVED_MODEL_NAME", "value": "llama3-70b" }, + { "key": "HF_TOKEN", "value": "REPLACE_WITH_HF_TOKEN" } + ], + "dockerArgs": "--host 0.0.0.0 --port 8000 --model $MODEL_ID --quantization $QUANTIZATION --max-model-len $MAX_MODEL_LEN --tensor-parallel-size $TENSOR_PARALLEL_SIZE --gpu-memory-utilization $GPU_MEMORY_UTILIZATION 
--served-model-name $SERVED_MODEL_NAME" + } + + # --- Llama-3-70B on 2x A40 (tensor parallel) --- + pod-template-2xa40-llama3-70b-fp16.json: | + { + "name": "neuron-llama3-70b-2xa40-fp16", + "imageName": "vllm/vllm-openai:latest", + "gpuTypeId": "NVIDIA A40", + "cloudType": "SECURE", + "gpuCount": 2, + "volumeInGb": 150, + "containerDiskInGb": 50, + "minVcpuCount": 16, + "minMemoryInGb": 64, + "ports": "8000/http", + "env": [ + { "key": "MODEL_ID", "value": "meta-llama/Meta-Llama-3-70B-Instruct" }, + { "key": "QUANTIZATION", "value": "" }, + { "key": "MAX_MODEL_LEN", "value": "8192" }, + { "key": "TENSOR_PARALLEL_SIZE","value": "2" }, + { "key": "GPU_MEMORY_UTILIZATION", "value": "0.90" }, + { "key": "SERVED_MODEL_NAME", "value": "llama3-70b" }, + { "key": "HF_TOKEN", "value": "REPLACE_WITH_HF_TOKEN" } + ], + "dockerArgs": "--host 0.0.0.0 --port 8000 --model $MODEL_ID --max-model-len $MAX_MODEL_LEN --tensor-parallel-size $TENSOR_PARALLEL_SIZE --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --served-model-name $SERVED_MODEL_NAME" + } + + # --- Generic custom model template --- + pod-template-custom.json: | + { + "name": "neuron-inference-custom", + "imageName": "vllm/vllm-openai:latest", + "gpuTypeId": "NVIDIA GeForce RTX 4090", + "cloudType": "SECURE", + "gpuCount": 1, + "volumeInGb": 100, + "containerDiskInGb": 30, + "minVcpuCount": 8, + "minMemoryInGb": 32, + "ports": "8000/http", + "env": [ + { "key": "MODEL_ID", "value": "REPLACE_WITH_MODEL_ID" }, + { "key": "QUANTIZATION", "value": "awq" }, + { "key": "MAX_MODEL_LEN", "value": "4096" }, + { "key": "TENSOR_PARALLEL_SIZE","value": "1" }, + { "key": "GPU_MEMORY_UTILIZATION", "value": "0.90" }, + { "key": "SERVED_MODEL_NAME", "value": "model" }, + { "key": "HF_TOKEN", "value": "REPLACE_WITH_HF_TOKEN" } + ], + "dockerArgs": "--host 0.0.0.0 --port 8000 --model $MODEL_ID --quantization $QUANTIZATION --max-model-len $MAX_MODEL_LEN --tensor-parallel-size $TENSOR_PARALLEL_SIZE --gpu-memory-utilization 
$GPU_MEMORY_UTILIZATION --served-model-name $SERVED_MODEL_NAME" + } + + # --- Nginx load balancer config (updated dynamically by runpod-provision.sh) --- + # Paste pod endpoint URLs under upstream block when pods are running. + nginx-lb.conf: | + upstream runpod_inference { + least_conn; + # Populated dynamically — add lines like: + # server POD_ID-8000.proxy.runpod.net:443; + keepalive 32; + } + + server { + listen 80; + server_name inference.neuralplatform.ai; + + location /v1/ { + proxy_pass https://runpod_inference; + proxy_http_version 1.1; + proxy_set_header Connection ""; + proxy_set_header Host $proxy_host; + proxy_set_header X-Real-IP $remote_addr; + proxy_read_timeout 300s; + proxy_send_timeout 60s; + } + + location /health { + return 200 'ok'; + add_header Content-Type text/plain; + } + } diff --git a/servers/legion/k8s/neuron-technologies/prod/runpod-lb-configmap.yaml b/servers/legion/k8s/neuron-technologies/prod/runpod-lb-configmap.yaml new file mode 100644 index 0000000..10f1dff --- /dev/null +++ b/servers/legion/k8s/neuron-technologies/prod/runpod-lb-configmap.yaml @@ -0,0 +1,172 @@ +--- +# RunPod Inference Load Balancer — Nginx ConfigMap +# +# Deploys a lightweight nginx reverse proxy inside the cluster that fans out +# requests to multiple RunPod inference endpoints. Update the upstream block +# with real pod proxy URLs after running runpod-provision.sh. 
+# +# Reload nginx after updating endpoints: +# kubectl rollout restart deployment/runpod-lb -n neuron-prod +apiVersion: v1 +kind: ConfigMap +metadata: + name: runpod-lb-nginx-config + namespace: neuron-prod + labels: + app.kubernetes.io/name: runpod-lb + app.kubernetes.io/component: load-balancer + app.kubernetes.io/managed-by: argocd +data: + nginx.conf: | + worker_processes auto; + error_log /dev/stderr warn; + pid /tmp/nginx.pid; + + events { + worker_connections 1024; + use epoll; + } + + http { + access_log /dev/stdout combined; + + # Aggressive upstream keepalive — RunPod pods are remote HTTPS + upstream runpod_inference { + least_conn; + + # ---------------------------------------------------------------- + # Add one line per active RunPod pod endpoint. + # Format: server POD_ID-8000.proxy.runpod.net:443; + # Example: + # server abc123def456-8000.proxy.runpod.net:443; + # server xyz789uvw012-8000.proxy.runpod.net:443; + # ---------------------------------------------------------------- + + keepalive 32; + keepalive_requests 1000; + keepalive_timeout 75s; + } + + # Health-check upstream (one known-good pod for active probing) + # Replace with real pod ID. 
+ upstream runpod_healthcheck { + server REPLACE_POD_ID-8000.proxy.runpod.net:443; + } + + server { + listen 8080; + server_name _; + + # Proxy timeouts generous for LLM inference + proxy_read_timeout 300s; + proxy_send_timeout 60s; + proxy_connect_timeout 10s; + + # ---- OpenAI-compatible inference passthrough ---- + location /v1/ { + proxy_pass https://runpod_inference; + proxy_http_version 1.1; + proxy_set_header Connection ""; + proxy_set_header Host $proxy_host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + + # Stream tokens back to caller without buffering + proxy_buffering off; + proxy_cache off; + chunked_transfer_encoding on; + } + + # ---- Cluster-internal health probe ---- + location /health { + return 200 'ok\n'; + add_header Content-Type text/plain; + } + + # ---- Upstream model info (routed to first healthy pod) ---- + location /v1/models { + proxy_pass https://runpod_inference; + proxy_http_version 1.1; + proxy_set_header Connection ""; + proxy_set_header Host $proxy_host; + } + } + } + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: runpod-lb + namespace: neuron-prod + labels: + app.kubernetes.io/name: runpod-lb + app.kubernetes.io/component: load-balancer + app.kubernetes.io/managed-by: argocd +spec: + replicas: 1 + selector: + matchLabels: + app: runpod-lb + template: + metadata: + labels: + app: runpod-lb + spec: + containers: + - name: nginx + image: nginx:1.27-alpine + ports: + - containerPort: 8080 + name: http + volumeMounts: + - name: nginx-config + mountPath: /etc/nginx/nginx.conf + subPath: nginx.conf + readOnly: true + livenessProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 3 + periodSeconds: 10 + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "200m" + memory: "128Mi" + securityContext: + runAsNonRoot: true + 
runAsUser: 101 + allowPrivilegeEscalation: false + readOnlyRootFilesystem: false + volumes: + - name: nginx-config + configMap: + name: runpod-lb-nginx-config + +--- +apiVersion: v1 +kind: Service +metadata: + name: runpod-lb + namespace: neuron-prod + labels: + app.kubernetes.io/name: runpod-lb + app.kubernetes.io/component: load-balancer +spec: + selector: + app: runpod-lb + ports: + - name: http + port: 80 + targetPort: 8080 + type: ClusterIP