feat(runpod): add inference pod templates, nginx LB, and provisioner script
Infrastructure readiness for RunPod inference workloads: - runpod-inference.yaml: ConfigMap with pod creation payloads for RTX 4090, A40 (single+dual), and custom templates - runpod-lb-configmap.yaml: nginx least-conn load balancer for inference endpoint distribution (Deployment + ClusterIP Service) - runpod-provision.sh: bash provisioner script — reads RUNPOD_API_KEY/HF_TOKEN, creates pods via GraphQL, polls until RUNNING, outputs endpoint URLs. Does NOT spin up any pods (dry-run flag available).
This commit is contained in:
Executable
+243
@@ -0,0 +1,243 @@
|
||||
#!/usr/bin/env bash
|
||||
# runpod-provision.sh — On-demand RunPod inference pod provisioner
|
||||
#
|
||||
# Usage:
|
||||
# ./runpod-provision.sh --gpu-type "NVIDIA GeForce RTX 4090" \
|
||||
# --model "meta-llama/Meta-Llama-3-8B-Instruct" \
|
||||
# --replicas 2 \
|
||||
# [--quantization awq] \
|
||||
# [--max-model-len 8192] \
|
||||
# [--cloud-type SECURE] \
|
||||
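#     [--tensor-parallel 1] [--gpu-count 1] [--volume-gb 80] \
#     [--served-model-name NAME] [--wait-timeout 600] \
#     [--dry-run]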
|
||||
#
|
||||
# Env vars required (or sourced from ~/Secrets/credentials/infrastructure.env):
|
||||
# RUNPOD_API_KEY — RunPod API key (from Vault secret/ai)
|
||||
# HF_TOKEN — HuggingFace token for gated model downloads
|
||||
#
|
||||
# The script does NOT execute automatically; run it explicitly when you need pods.
|
||||
# It outputs one endpoint URL per replica to stdout, one per line.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Defaults
|
||||
# ---------------------------------------------------------------------------
|
||||
GPU_TYPE="NVIDIA GeForce RTX 4090"
|
||||
MODEL="meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
REPLICAS=1
|
||||
QUANTIZATION="awq"
|
||||
MAX_MODEL_LEN=8192
|
||||
TENSOR_PARALLEL_SIZE=1
|
||||
GPU_MEMORY_UTILIZATION="0.92"
|
||||
CLOUD_TYPE="SECURE"
|
||||
SERVED_MODEL_NAME=""
|
||||
GPU_COUNT=1
|
||||
VOLUME_GB=80
|
||||
CONTAINER_DISK_GB=30
|
||||
MIN_VCPU=8
|
||||
MIN_MEMORY_GB=32
|
||||
DRY_RUN=false
|
||||
WAIT_TIMEOUT=600 # seconds to wait for RUNNING state
|
||||
|
||||
RUNPOD_API="https://api.runpod.io/graphql"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
# Write a UTC-timestamped message to stderr, leaving stdout untouched.
# Arguments: all arguments are joined into a single message line.
log() {
  local stamp
  stamp="$(date -u +%H:%M:%S)"
  printf '[%s] %s\n' "$stamp" "$*" >&2
}
|
||||
# Report a fatal error through log() (prefixed with "ERROR: ") and exit the
# current shell with status 1.
die() {
  log "ERROR: $*"
  exit 1
}
|
||||
|
||||
# Abort via die() unless the shell can resolve the named command.
# Arguments: $1 - command name to look up with `command -v`.
require_cmd() {
  if ! command -v "$1" &>/dev/null; then
    die "'$1' not found — install it first"
  fi
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Argument parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parse command-line options into the global settings declared above.
#
# Globals written: GPU_TYPE, MODEL, REPLICAS, QUANTIZATION, MAX_MODEL_LEN,
#                  TENSOR_PARALLEL_SIZE, GPU_COUNT, CLOUD_TYPE,
#                  SERVED_MODEL_NAME, VOLUME_GB, WAIT_TIMEOUT, DRY_RUN
# Arguments:       the script's argument vector ("$@")
# Exits:           0 after printing help; 1 (via die) on an unknown option or
#                  when a value-taking option is the last argument.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --dry-run)
        DRY_RUN=true
        shift
        ;;
      -h|--help)
        # Print only the leading comment header (everything after the shebang
        # up to the first non-comment line), not every comment in the script.
        awk 'NR == 1 && /^#!/ { next }
             /^#/ { sub(/^# ?/, ""); print; next }
             { exit }' "$0"
        exit 0
        ;;
      --gpu-type|--model|--replicas|--quantization|--max-model-len|\
      --tensor-parallel|--gpu-count|--cloud-type|--served-model-name|\
      --volume-gb|--wait-timeout)
        # Without this guard a trailing flag trips `set -u` on "$2" and the
        # user only sees "unbound variable". An empty value is still allowed
        # (e.g. --quantization "" disables quantization).
        [[ $# -ge 2 ]] || die "Option $1 requires a value"
        case "$1" in
          --gpu-type)          GPU_TYPE="$2" ;;
          --model)             MODEL="$2" ;;
          --replicas)          REPLICAS="$2" ;;
          --quantization)      QUANTIZATION="$2" ;;
          --max-model-len)     MAX_MODEL_LEN="$2" ;;
          --tensor-parallel)   TENSOR_PARALLEL_SIZE="$2" ;;
          --gpu-count)         GPU_COUNT="$2" ;;
          --cloud-type)        CLOUD_TYPE="$2" ;;
          --served-model-name) SERVED_MODEL_NAME="$2" ;;
          --volume-gb)         VOLUME_GB="$2" ;;
          --wait-timeout)      WAIT_TIMEOUT="$2" ;;
        esac
        shift 2
        ;;
      *)
        die "Unknown argument: $1"
        ;;
    esac
  done
}

parse_args "$@"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Environment
|
||||
# ---------------------------------------------------------------------------
|
||||
require_cmd curl
|
||||
require_cmd jq
|
||||
|
||||
# Try loading infrastructure secrets if not already set
|
||||
# Fill in RUNPOD_API_KEY and/or HF_TOKEN from the local secrets file when
# either one is missing from the environment.
#
# The file is sourced with `set -a`, so every variable it assigns is exported.
# Values that were already present in the environment take precedence over
# the file, so an explicitly exported key is never silently replaced.
#
# Globals written: INFRA_ENV, plus whatever the secrets file assigns
# Returns:         0 (a missing secrets file is not an error here; the
#                  caller validates the variables afterwards)
load_infra_env() {
  INFRA_ENV="$HOME/Secrets/credentials/infrastructure.env"

  if [[ -n "${RUNPOD_API_KEY:-}" && -n "${HF_TOKEN:-}" ]]; then
    return 0
  fi
  [[ -f "$INFRA_ENV" ]] || return 0

  local preset_key="${RUNPOD_API_KEY:-}"
  local preset_hf="${HF_TOKEN:-}"

  set -a
  # shellcheck disable=SC1090
  source "$INFRA_ENV"
  set +a

  if [[ -n "$preset_key" ]]; then RUNPOD_API_KEY="$preset_key"; fi
  if [[ -n "$preset_hf" ]]; then HF_TOKEN="$preset_hf"; fi
  return 0
}

load_infra_env
|
||||
|
||||
[[ -n "${RUNPOD_API_KEY:-}" ]] || die "RUNPOD_API_KEY not set. Export it or ensure ~/Secrets/credentials/infrastructure.env is present."
|
||||
[[ -n "${HF_TOKEN:-}" ]] || log "WARNING: HF_TOKEN not set — gated models (Llama etc.) will fail to download."
|
||||
|
||||
# Derive served model name from model ID if not provided
|
||||
if [[ -z "$SERVED_MODEL_NAME" ]]; then
|
||||
SERVED_MODEL_NAME="$(basename "$MODEL" | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9-')"
|
||||
fi
|
||||
|
||||
# Build docker args (omit --quantization flag if value is empty)
|
||||
DOCKER_ARGS="--host 0.0.0.0 --port 8000 --model \$MODEL_ID"
|
||||
[[ -n "$QUANTIZATION" ]] && DOCKER_ARGS="$DOCKER_ARGS --quantization \$QUANTIZATION"
|
||||
DOCKER_ARGS="$DOCKER_ARGS --max-model-len \$MAX_MODEL_LEN --tensor-parallel-size \$TENSOR_PARALLEL_SIZE --gpu-memory-utilization \$GPU_MEMORY_UTILIZATION --served-model-name \$SERVED_MODEL_NAME"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# GraphQL helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
# POST a GraphQL document to the RunPod API and print the raw JSON response.
#
# Globals read: RUNPOD_API (endpoint URL), RUNPOD_API_KEY (bearer token)
# Arguments:    $1 - GraphQL query or mutation text
# Outputs:      response body on stdout; curl diagnostics on stderr
# Returns:      non-zero when the payload cannot be built, the request fails,
#               times out, or the server answers with an HTTP error status.
#
# NOTE(review): the bearer token is passed on curl's command line and is
# therefore visible in the process list while the request runs.
gql() {
  local query="$1"
  local payload

  # Let jq build the whole request object so the query text is JSON-escaped.
  payload=$(jq -cn --arg q "$query" '{query: $q}') || return

  # -S keeps curl's error message visible despite -s, so a failed request is
  # not a silent exit under `set -e`. The timeouts stop a stalled connection
  # from hanging the provisioner indefinitely.
  curl -sS --fail \
    --connect-timeout 10 \
    --max-time 60 \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer ${RUNPOD_API_KEY}" \
    -d "$payload" \
    "$RUNPOD_API"
}
|
||||
|
||||
# Escape a value so it can sit inside a double-quoted GraphQL string literal.
# Handles backslash, double quote, newline, carriage return and tab.
_gql_escape() {
  local raw="$1" escaped="" ch i
  for (( i = 0; i < ${#raw}; i++ )); do
    ch=${raw:i:1}
    case "$ch" in
      '\')   escaped+='\\' ;;
      '"')   escaped+='\"' ;;
      $'\n') escaped+='\n' ;;
      $'\r') escaped+='\r' ;;
      $'\t') escaped+='\t' ;;
      *)     escaped+="$ch" ;;
    esac
  done
  printf '%s' "$escaped"
}

# Submit a podFindAndDeployOnDemand mutation for one pod and print the raw
# API response (requesting id, name and desiredStatus).
#
# Globals read: GPU_TYPE, CLOUD_TYPE, GPU_COUNT, VOLUME_GB, CONTAINER_DISK_GB,
#               MIN_VCPU, MIN_MEMORY_GB, MODEL, QUANTIZATION, MAX_MODEL_LEN,
#               TENSOR_PARALLEL_SIZE, GPU_MEMORY_UTILIZATION,
#               SERVED_MODEL_NAME, HF_TOKEN (optional), DOCKER_ARGS
# Arguments:    $1 - pod name
# Outputs:      JSON response from gql() on stdout
# Returns:      gql()'s status; exits via die() when a numeric or enum field
#               is malformed.
create_pod() {
  local pod_name="$1"
  local field

  # These are emitted unquoted into the mutation, so reject anything that is
  # not a plain integer / enum token instead of sending a malformed document.
  for field in GPU_COUNT VOLUME_GB CONTAINER_DISK_GB MIN_VCPU MIN_MEMORY_GB; do
    [[ "${!field}" =~ ^[0-9]+$ ]] \
      || die "$field must be a non-negative integer (got '${!field}')"
  done
  [[ "$CLOUD_TYPE" =~ ^[A-Z_]+$ ]] \
    || die "Invalid cloud type '$CLOUD_TYPE' (expected an enum such as SECURE)"

  # String fields are escaped so quotes or backslashes in user-supplied values
  # cannot terminate the literal and change the mutation.
  local q_name q_gpu q_model q_quant q_len q_tp q_mem q_served q_hf q_args
  q_name=$(_gql_escape "$pod_name")
  q_gpu=$(_gql_escape "$GPU_TYPE")
  q_model=$(_gql_escape "$MODEL")
  q_quant=$(_gql_escape "$QUANTIZATION")
  q_len=$(_gql_escape "$MAX_MODEL_LEN")
  q_tp=$(_gql_escape "$TENSOR_PARALLEL_SIZE")
  q_mem=$(_gql_escape "$GPU_MEMORY_UTILIZATION")
  q_served=$(_gql_escape "$SERVED_MODEL_NAME")
  q_hf=$(_gql_escape "${HF_TOKEN:-}")
  q_args=$(_gql_escape "$DOCKER_ARGS")

  local mutation
  mutation=$(cat <<MUTATION
mutation {
  podFindAndDeployOnDemand(input: {
    name: "$q_name"
    imageName: "vllm/vllm-openai:latest"
    gpuTypeId: "$q_gpu"
    cloudType: $CLOUD_TYPE
    gpuCount: $GPU_COUNT
    volumeInGb: $VOLUME_GB
    containerDiskInGb: $CONTAINER_DISK_GB
    minVcpuCount: $MIN_VCPU
    minMemoryInGb: $MIN_MEMORY_GB
    ports: "8000/http"
    env: [
      { key: "MODEL_ID", value: "$q_model" }
      { key: "QUANTIZATION", value: "$q_quant" }
      { key: "MAX_MODEL_LEN", value: "$q_len" }
      { key: "TENSOR_PARALLEL_SIZE", value: "$q_tp" }
      { key: "GPU_MEMORY_UTILIZATION", value: "$q_mem" }
      { key: "SERVED_MODEL_NAME", value: "$q_served" }
      { key: "HF_TOKEN", value: "$q_hf" }
    ]
    dockerArgs: "$q_args"
  }) {
    id
    name
    desiredStatus
  }
}
MUTATION
  )
  gql "$mutation"
}
|
||||
|
||||
# Poll a pod until RunPod reports it as running, then print its endpoint URL.
#
# A pod counts as ready only when desiredStatus is RUNNING *and* the API
# returns a non-null `runtime` object. desiredStatus alone is the requested
# state; treating it as readiness would return before the container exists.
# NOTE(review): this still does not prove the model server inside the
# container has finished loading — callers should probe the endpoint.
#
# Globals read: WAIT_TIMEOUT (seconds)
# Arguments:    $1 - pod ID
# Outputs:      endpoint URL on stdout (no trailing newline); progress on stderr
# Returns:      0 when ready; exits 1 via die() once the timeout is exceeded.
wait_for_running() {
  local pod_id="$1"
  local deadline=$(( $(date +%s) + WAIT_TIMEOUT ))
  local resp status has_runtime endpoint
  log "Waiting for pod $pod_id to reach RUNNING state (timeout ${WAIT_TIMEOUT}s)..."

  while true; do
    # A failed or unparseable poll is treated as "not ready yet" so one
    # transient API error does not abort the whole wait.
    resp=$(gql "{ pod(input: { podId: \"$pod_id\" }) { id desiredStatus runtime { ports { ip isIpPublic privatePort publicPort type } } } }") || resp=""
    status=$(printf '%s' "$resp" | jq -r '.data.pod.desiredStatus // "UNKNOWN"' 2>/dev/null) || status=""
    [[ -n "$status" ]] || status="UNKNOWN"
    has_runtime=$(printf '%s' "$resp" | jq -r '.data.pod.runtime != null' 2>/dev/null) || has_runtime="false"

    if [[ "$status" == "RUNNING" && "$has_runtime" == "true" ]]; then
      # Take only the first matching port so the result is always one URL.
      # NOTE(review): assumes a publicly mapped port 8000 speaks TLS — confirm.
      endpoint=$(printf '%s' "$resp" | jq -r '
        [ .data.pod.runtime.ports[]?
          | select(.privatePort == 8000 and .isIpPublic == true)
          | "https://\(.ip):\(.publicPort)" ]
        | .[0] // empty
      ' 2>/dev/null || true)

      # Fall back to RunPod proxy URL if no public port exposed
      if [[ -z "$endpoint" ]]; then
        endpoint="https://${pod_id}-8000.proxy.runpod.net"
      fi

      printf '%s' "$endpoint"
      return 0
    fi

    if [[ $(date +%s) -ge $deadline ]]; then
      die "Timed out waiting for pod $pod_id — last status: $status"
    fi

    log "  pod $pod_id status: $status — retrying in 15s..."
    sleep 15
  done
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
log "RunPod inference provisioner"
|
||||
log " GPU type : $GPU_TYPE"
|
||||
log " Model : $MODEL"
|
||||
log " Replicas : $REPLICAS"
|
||||
log " Quant : ${QUANTIZATION:-none}"
|
||||
log " TP size : $TENSOR_PARALLEL_SIZE"
|
||||
log " Cloud : $CLOUD_TYPE"
|
||||
log " Dry run : $DRY_RUN"
|
||||
|
||||
TIMESTAMP=$(date -u +%Y%m%d%H%M%S)
|
||||
declare -a POD_IDS=()
|
||||
|
||||
# Create the requested number of pods (or just announce them in dry-run mode)
# and collect their IDs in POD_IDS for the wait phase that follows.
#
# Pods created before a failure keep running and keep billing, so every fatal
# error below lists the IDs created so far to make manual cleanup possible.

# Validate up front: a non-numeric or zero replica count would otherwise yield
# an empty loop and a confusing failure later on.
[[ "$REPLICAS" =~ ^[1-9][0-9]*$ ]] \
  || die "--replicas must be a positive integer (got '$REPLICAS')"

for (( i = 1; i <= REPLICAS; i++ )); do
  POD_NAME="neuron-inf-${TIMESTAMP}-r${i}"

  if [[ "$DRY_RUN" == true ]]; then
    log "[dry-run] Would create pod: $POD_NAME"
    continue
  fi

  log "Creating pod $i/$REPLICAS: $POD_NAME ..."

  # Handle a transport failure explicitly; relying on `set -e` here would
  # terminate the script without saying which pod failed.
  RESP=$(create_pod "$POD_NAME") \
    || die "RunPod API request failed for $POD_NAME (pods already created: ${POD_IDS[*]:-none})"

  ERR=$(printf '%s' "$RESP" | jq -r '.errors[0].message // empty') \
    || die "Unparseable RunPod API response for $POD_NAME: $RESP (pods already created: ${POD_IDS[*]:-none})"
  if [[ -n "$ERR" ]]; then
    die "RunPod API error: $ERR (pods already created: ${POD_IDS[*]:-none})"
  fi

  POD_ID=$(printf '%s' "$RESP" | jq -r '.data.podFindAndDeployOnDemand.id')
  if [[ -z "$POD_ID" || "$POD_ID" == "null" ]]; then
    die "No pod ID returned. Full response: $RESP (pods already created: ${POD_IDS[*]:-none})"
  fi

  log "  Created pod ID: $POD_ID"
  POD_IDS+=("$POD_ID")
done
|
||||
|
||||
if $DRY_RUN; then
|
||||
log "[dry-run] Complete — no pods were created."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
log ""
|
||||
log "Waiting for all $REPLICAS pod(s) to become RUNNING..."
|
||||
declare -a ENDPOINTS=()
|
||||
for POD_ID in "${POD_IDS[@]}"; do
|
||||
EP=$(wait_for_running "$POD_ID")
|
||||
ENDPOINTS+=("$EP")
|
||||
log " Pod $POD_ID ready: $EP"
|
||||
done
|
||||
|
||||
log ""
|
||||
log "=== Inference endpoints ==="
|
||||
for EP in "${ENDPOINTS[@]}"; do
|
||||
printf '%s\n' "$EP"
|
||||
done
|
||||
@@ -12,3 +12,5 @@ resources:
|
||||
- ingress.yaml
|
||||
- license-ingressroute.yaml
|
||||
- backup-cronjob.yaml
|
||||
- runpod-inference.yaml
|
||||
- runpod-lb-configmap.yaml
|
||||
|
||||
@@ -0,0 +1,149 @@
|
||||
---
|
||||
# RunPod Inference ConfigMap
|
||||
# Holds pod creation payloads for on-demand vLLM inference pods.
|
||||
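# Reference payloads mirroring the request that runpod-provision.sh builds from
# its own flags (the script does not read this ConfigMap); intended for any
# k8s Job that provisions RunPod pods.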
|
||||
#
|
||||
# GPU IDs sourced from RunPod GraphQL { gpuTypes { id } } — verified 2026-04-25.
|
||||
# RTX 4090: "NVIDIA GeForce RTX 4090" (24 GB VRAM, secure+community cloud)
|
||||
# A40: "NVIDIA A40" (48 GB VRAM, secure cloud only)
|
||||
# L40S: "NVIDIA L40S" (48 GB VRAM, secure+community cloud)
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: runpod-inference-templates
|
||||
namespace: neuron-prod
|
||||
labels:
|
||||
app.kubernetes.io/name: runpod-inference
|
||||
app.kubernetes.io/component: inference-provisioner
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
data:
|
||||
# --- Llama-3-8B on RTX 4090 (24 GB) ---
|
||||
# Fits quantised (GPTQ/AWQ) or full fp16 8B models.
|
||||
pod-template-rtx4090-llama3-8b.json: |
|
||||
{
|
||||
"name": "neuron-llama3-8b-rtx4090",
|
||||
"imageName": "vllm/vllm-openai:latest",
|
||||
"gpuTypeId": "NVIDIA GeForce RTX 4090",
|
||||
"cloudType": "SECURE",
|
||||
"gpuCount": 1,
|
||||
"volumeInGb": 80,
|
||||
"containerDiskInGb": 30,
|
||||
"minVcpuCount": 8,
|
||||
"minMemoryInGb": 32,
|
||||
"ports": "8000/http",
|
||||
"env": [
|
||||
{ "key": "MODEL_ID", "value": "meta-llama/Meta-Llama-3-8B-Instruct" },
|
||||
{ "key": "QUANTIZATION", "value": "awq" },
|
||||
{ "key": "MAX_MODEL_LEN", "value": "8192" },
|
||||
{ "key": "TENSOR_PARALLEL_SIZE","value": "1" },
|
||||
{ "key": "GPU_MEMORY_UTILIZATION", "value": "0.92" },
|
||||
{ "key": "SERVED_MODEL_NAME", "value": "llama3-8b" },
|
||||
{ "key": "HF_TOKEN", "value": "REPLACE_WITH_HF_TOKEN" }
|
||||
],
|
||||
"dockerArgs": "--host 0.0.0.0 --port 8000 --model $MODEL_ID --quantization $QUANTIZATION --max-model-len $MAX_MODEL_LEN --tensor-parallel-size $TENSOR_PARALLEL_SIZE --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --served-model-name $SERVED_MODEL_NAME"
|
||||
}
|
||||
|
||||
# --- Llama-3-70B on A40 (48 GB) — single GPU with AWQ quantisation ---
|
||||
pod-template-a40-llama3-70b.json: |
|
||||
{
|
||||
"name": "neuron-llama3-70b-a40",
|
||||
"imageName": "vllm/vllm-openai:latest",
|
||||
"gpuTypeId": "NVIDIA A40",
|
||||
"cloudType": "SECURE",
|
||||
"gpuCount": 1,
|
||||
"volumeInGb": 150,
|
||||
"containerDiskInGb": 50,
|
||||
"minVcpuCount": 16,
|
||||
"minMemoryInGb": 64,
|
||||
"ports": "8000/http",
|
||||
"env": [
|
||||
{ "key": "MODEL_ID", "value": "meta-llama/Meta-Llama-3-70B-Instruct" },
|
||||
{ "key": "QUANTIZATION", "value": "awq" },
|
||||
{ "key": "MAX_MODEL_LEN", "value": "4096" },
|
||||
{ "key": "TENSOR_PARALLEL_SIZE","value": "1" },
|
||||
{ "key": "GPU_MEMORY_UTILIZATION", "value": "0.90" },
|
||||
{ "key": "SERVED_MODEL_NAME", "value": "llama3-70b" },
|
||||
{ "key": "HF_TOKEN", "value": "REPLACE_WITH_HF_TOKEN" }
|
||||
],
|
||||
"dockerArgs": "--host 0.0.0.0 --port 8000 --model $MODEL_ID --quantization $QUANTIZATION --max-model-len $MAX_MODEL_LEN --tensor-parallel-size $TENSOR_PARALLEL_SIZE --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --served-model-name $SERVED_MODEL_NAME"
|
||||
}
|
||||
|
||||
# --- Llama-3-70B on 2x A40 (tensor parallel) ---
|
||||
pod-template-2xa40-llama3-70b-fp16.json: |
|
||||
{
|
||||
"name": "neuron-llama3-70b-2xa40-fp16",
|
||||
"imageName": "vllm/vllm-openai:latest",
|
||||
"gpuTypeId": "NVIDIA A40",
|
||||
"cloudType": "SECURE",
|
||||
"gpuCount": 2,
|
||||
"volumeInGb": 150,
|
||||
"containerDiskInGb": 50,
|
||||
"minVcpuCount": 16,
|
||||
"minMemoryInGb": 64,
|
||||
"ports": "8000/http",
|
||||
"env": [
|
||||
{ "key": "MODEL_ID", "value": "meta-llama/Meta-Llama-3-70B-Instruct" },
|
||||
{ "key": "QUANTIZATION", "value": "" },
|
||||
{ "key": "MAX_MODEL_LEN", "value": "8192" },
|
||||
{ "key": "TENSOR_PARALLEL_SIZE","value": "2" },
|
||||
{ "key": "GPU_MEMORY_UTILIZATION", "value": "0.90" },
|
||||
{ "key": "SERVED_MODEL_NAME", "value": "llama3-70b" },
|
||||
{ "key": "HF_TOKEN", "value": "REPLACE_WITH_HF_TOKEN" }
|
||||
],
|
||||
"dockerArgs": "--host 0.0.0.0 --port 8000 --model $MODEL_ID --max-model-len $MAX_MODEL_LEN --tensor-parallel-size $TENSOR_PARALLEL_SIZE --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --served-model-name $SERVED_MODEL_NAME"
|
||||
}
|
||||
|
||||
# --- Generic custom model template ---
|
||||
pod-template-custom.json: |
|
||||
{
|
||||
"name": "neuron-inference-custom",
|
||||
"imageName": "vllm/vllm-openai:latest",
|
||||
"gpuTypeId": "NVIDIA GeForce RTX 4090",
|
||||
"cloudType": "SECURE",
|
||||
"gpuCount": 1,
|
||||
"volumeInGb": 100,
|
||||
"containerDiskInGb": 30,
|
||||
"minVcpuCount": 8,
|
||||
"minMemoryInGb": 32,
|
||||
"ports": "8000/http",
|
||||
"env": [
|
||||
{ "key": "MODEL_ID", "value": "REPLACE_WITH_MODEL_ID" },
|
||||
{ "key": "QUANTIZATION", "value": "awq" },
|
||||
{ "key": "MAX_MODEL_LEN", "value": "4096" },
|
||||
{ "key": "TENSOR_PARALLEL_SIZE","value": "1" },
|
||||
{ "key": "GPU_MEMORY_UTILIZATION", "value": "0.90" },
|
||||
{ "key": "SERVED_MODEL_NAME", "value": "model" },
|
||||
{ "key": "HF_TOKEN", "value": "REPLACE_WITH_HF_TOKEN" }
|
||||
],
|
||||
"dockerArgs": "--host 0.0.0.0 --port 8000 --model $MODEL_ID --quantization $QUANTIZATION --max-model-len $MAX_MODEL_LEN --tensor-parallel-size $TENSOR_PARALLEL_SIZE --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --served-model-name $SERVED_MODEL_NAME"
|
||||
}
|
||||
|
||||
# --- Nginx load balancer config (edited by hand; runpod-provision.sh only prints endpoint URLs) ---
|
||||
# Paste pod endpoint URLs under upstream block when pods are running.
|
||||
nginx-lb.conf: |
|
||||
upstream runpod_inference {
|
||||
least_conn;
|
||||
# Populated dynamically — add lines like:
|
||||
# server <pod-id>-8000.proxy.runpod.net:443;
|
||||
keepalive 32;
|
||||
}
|
||||
|
||||
server {
|
||||
listen 80;
|
||||
server_name inference.neuralplatform.ai;
|
||||
|
||||
location /v1/ {
|
||||
proxy_pass https://runpod_inference;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Connection "";
|
||||
proxy_set_header Host $proxy_host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_read_timeout 300s;
|
||||
proxy_send_timeout 60s;
|
||||
}
|
||||
|
||||
location /health {
|
||||
return 200 'ok';
|
||||
add_header Content-Type text/plain;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,172 @@
|
||||
---
|
||||
# RunPod Inference Load Balancer — Nginx ConfigMap
#
# Deploys a lightweight nginx reverse proxy inside the cluster that fans out
# requests to multiple RunPod inference endpoints.
#
# Each RunPod proxy hostname must receive its own Host header and TLS SNI, and
# an nginx `upstream` group cannot vary those per server. So every pod gets a
# small loopback listener that proxies to exactly one RunPod hostname, and the
# `runpod_inference` upstream balances across those loopback listeners.
#
# After running runpod-provision.sh, for each pod:
#   1. add a per-pod `server { listen 127.0.0.1:90NN; ... }` block (template below)
#   2. add `server 127.0.0.1:90NN;` to the `runpod_inference` upstream
#   3. remove the `down` placeholder once at least one real entry exists
#
# Reload nginx after updating endpoints:
#   kubectl rollout restart deployment/runpod-lb -n neuron-prod
apiVersion: v1
kind: ConfigMap
metadata:
  name: runpod-lb-nginx-config
  namespace: neuron-prod
  labels:
    app.kubernetes.io/name: runpod-lb
    app.kubernetes.io/component: load-balancer
    app.kubernetes.io/managed-by: argocd
data:
  nginx.conf: |
    worker_processes auto;
    error_log /dev/stderr warn;
    pid /tmp/nginx.pid;

    events {
      worker_connections 1024;
      use epoll;
    }

    http {
      access_log /dev/stdout combined;

      # The container runs as a non-root user, so keep nginx's scratch
      # directories under /tmp rather than the image default location.
      client_body_temp_path /tmp/client_temp;
      proxy_temp_path       /tmp/proxy_temp;
      fastcgi_temp_path     /tmp/fastcgi_temp;
      uwsgi_temp_path       /tmp/uwsgi_temp;
      scgi_temp_path        /tmp/scgi_temp;

      upstream runpod_inference {
        least_conn;

        # nginx refuses to start with an empty upstream. This permanently
        # "down" placeholder keeps the config loadable (and /health working)
        # before any pod is registered; /v1/ answers 502 until one is added.
        server 127.0.0.1:9 down;

        # ----------------------------------------------------------------
        # Add one line per active per-pod loopback listener, e.g.:
        #   server 127.0.0.1:9001;
        #   server 127.0.0.1:9002;
        # ----------------------------------------------------------------

        keepalive 32;
        keepalive_requests 1000;
        keepalive_timeout 75s;
      }

      # ------------------------------------------------------------------
      # Per-pod listener template — copy once per RunPod pod, changing the
      # port and the pod ID. With a literal hostname in proxy_pass, the
      # default Host header and (with proxy_ssl_server_name) the TLS SNI
      # both carry the pod's own proxy hostname.
      #
      # server {
      #   listen 127.0.0.1:9001;
      #   location / {
      #     proxy_pass https://abc123def456-8000.proxy.runpod.net;
      #     proxy_ssl_server_name on;
      #     proxy_http_version 1.1;
      #     proxy_set_header Connection "";
      #     proxy_buffering off;
      #     proxy_read_timeout 300s;
      #     proxy_send_timeout 60s;
      #     proxy_connect_timeout 10s;
      #   }
      # }
      # ------------------------------------------------------------------

      # The former `runpod_healthcheck` upstream was referenced by no location
      # and pointed at a placeholder hostname, so it is left disabled:
      # upstream runpod_healthcheck {
      #   server REPLACE_POD_ID-8000.proxy.runpod.net:443;
      # }

      server {
        listen 8080;
        server_name _;

        # Proxy timeouts generous for LLM inference
        proxy_read_timeout 300s;
        proxy_send_timeout 60s;
        proxy_connect_timeout 10s;

        # ---- OpenAI-compatible inference passthrough ----
        # Also serves /v1/models; the pod is chosen by least_conn.
        location /v1/ {
          # Plain HTTP to the loopback listeners; TLS to RunPod happens there.
          proxy_pass http://runpod_inference;
          proxy_http_version 1.1;
          proxy_set_header Connection "";
          proxy_set_header X-Real-IP $remote_addr;
          proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

          # Stream tokens back to caller without buffering
          proxy_buffering off;
          proxy_cache off;
          chunked_transfer_encoding on;
        }

        # ---- Cluster-internal health probe ----
        location /health {
          # default_type sets the single Content-Type for the `return` body;
          # add_header would emit a second, duplicate Content-Type header.
          default_type text/plain;
          return 200 'ok\n';
        }
      }
    }
|
||||
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: runpod-lb
|
||||
namespace: neuron-prod
|
||||
labels:
|
||||
app.kubernetes.io/name: runpod-lb
|
||||
app.kubernetes.io/component: load-balancer
|
||||
app.kubernetes.io/managed-by: argocd
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: runpod-lb
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: runpod-lb
|
||||
spec:
|
||||
containers:
|
||||
- name: nginx
|
||||
image: nginx:1.27-alpine
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: http
|
||||
volumeMounts:
|
||||
- name: nginx-config
|
||||
mountPath: /etc/nginx/nginx.conf
|
||||
subPath: nginx.conf
|
||||
readOnly: true
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 8080
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 15
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /health
|
||||
port: 8080
|
||||
initialDelaySeconds: 3
|
||||
periodSeconds: 10
|
||||
resources:
|
||||
requests:
|
||||
cpu: "50m"
|
||||
memory: "64Mi"
|
||||
limits:
|
||||
cpu: "200m"
|
||||
memory: "128Mi"
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 101
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: false
|
||||
volumes:
|
||||
- name: nginx-config
|
||||
configMap:
|
||||
name: runpod-lb-nginx-config
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: runpod-lb
|
||||
namespace: neuron-prod
|
||||
labels:
|
||||
app.kubernetes.io/name: runpod-lb
|
||||
app.kubernetes.io/component: load-balancer
|
||||
spec:
|
||||
selector:
|
||||
app: runpod-lb
|
||||
ports:
|
||||
- name: http
|
||||
port: 80
|
||||
targetPort: 8080
|
||||
type: ClusterIP
|
||||
Reference in New Issue
Block a user