2 Commits

Author SHA1 Message Date
Will Anderson 627d448f6f fix ci-base: install Node 20 via binary tarball instead of nodesource apt
nodesource setup_20.x on Ubuntu 24.04 installs Ubuntu's nodejs 18 (no npm)
instead of Node 20. Binary tarball is reliable and includes npm.
2026-05-04 15:48:12 -05:00
Will Anderson baae9b289a fix: drop bogus letsencrypt certResolver from dharma IngressRoute
Traefik has no certificate resolver named 'letsencrypt' configured (this
cluster terminates TLS at Cloudflare and uses Traefik's default cert via
the websecure entrypoint, matching every other neuron-prod IngressRoute).

The invalid certResolver caused Traefik to refuse the router with:
  ERR Router uses a nonexistent certificate resolver
    certificateResolver=letsencrypt routerName=neuron-prod-dharma-...

so requests to dharma.neurontechnologies.ai/health surfaced as 502 from
Cloudflare even though the dharma pod was healthy on :8765.
2026-05-04 14:37:32 -05:00
10 changed files with 203 additions and 210 deletions
+165
View File
@@ -0,0 +1,165 @@
---
# Gitea CI runner — general-purpose (legion)
# Uses host Docker socket for container management and docker build/push.
apiVersion: apps/v1
kind: Deployment
metadata:
name: gitea-runner
namespace: ci
labels:
app: gitea-runner
spec:
replicas: 1
selector:
matchLabels:
app: gitea-runner
template:
metadata:
labels:
app: gitea-runner
annotations:
config-version: "2026-04-27-containerd-sock"
spec:
securityContext:
runAsNonRoot: false # act_runner needs root for container management
initContainers:
- name: register
image: registry.neuralplatform.ai/ci-base:latest
workingDir: /data
command: ["/bin/sh", "-c"]
args:
- |
act_runner register \
--instance "$GITEA_INSTANCE_URL" \
--token "$GITEA_RUNNER_REGISTRATION_TOKEN" \
--name legion \
--labels "self-hosted:docker://registry.neuralplatform.ai/ci-base:latest,ubuntu-latest:docker://registry.neuralplatform.ai/ci-base:latest,ubuntu-24.04:docker://registry.neuralplatform.ai/ci-base:latest,linux,x64" \
--no-interactive
cat > /data/config.yaml << 'EOF'
runner:
capacity: 2
timeout: 3h
container:
network: host
docker_host: "unix:///run/k3s/containerd/containerd.sock"
force_pull: false
valid_volumes: []
default_image: "registry.neuralplatform.ai/ci-base:latest"
extra_hosts:
- "gitea.git.svc.cluster.local:10.43.1.53"
EOF
envFrom:
- secretRef:
name: gitea-runner-secret
volumeMounts:
- name: data
mountPath: /data
containers:
- name: runner
image: registry.neuralplatform.ai/ci-base:latest
workingDir: /data
command: ["act_runner", "daemon", "--config", "/data/config.yaml"]
envFrom:
- secretRef:
name: gitea-runner-secret
volumeMounts:
- name: data
mountPath: /data
- name: docker-sock
mountPath: /var/run/docker.sock
resources:
requests:
memory: 512Mi
cpu: 250m
limits:
memory: 4Gi
cpu: "4"
volumes:
- name: data
emptyDir: {}
- name: docker-sock
hostPath:
path: /run/k3s/containerd/containerd.sock
type: Socket
---
# Neuron Technologies CI runner
apiVersion: apps/v1
kind: Deployment
metadata:
name: neuron-technologies-runner
namespace: ci
labels:
app: neuron-technologies-runner
spec:
replicas: 1
selector:
matchLabels:
app: neuron-technologies-runner
template:
metadata:
labels:
app: neuron-technologies-runner
annotations:
config-version: "2026-04-27-containerd-sock"
spec:
securityContext:
runAsNonRoot: false
initContainers:
- name: register
image: registry.neuralplatform.ai/ci-base:latest
workingDir: /data
command: ["/bin/sh", "-c"]
args:
- |
act_runner register \
--instance "$GITEA_INSTANCE_URL" \
--token "$GITEA_RUNNER_REGISTRATION_TOKEN" \
--name neuron-technologies \
--labels "self-hosted:docker://registry.neuralplatform.ai/ci-base:latest,ubuntu-latest:docker://registry.neuralplatform.ai/ci-base:latest,ubuntu-24.04:docker://registry.neuralplatform.ai/ci-base:latest,linux,x64" \
--no-interactive
cat > /data/config.yaml << 'EOF'
runner:
capacity: 2
timeout: 3h
container:
network: host
docker_host: "unix:///run/k3s/containerd/containerd.sock"
force_pull: false
valid_volumes: []
default_image: "registry.neuralplatform.ai/ci-base:latest"
extra_hosts:
- "gitea.git.svc.cluster.local:10.43.1.53"
EOF
envFrom:
- secretRef:
name: neuron-technologies-runner-secret
volumeMounts:
- name: data
mountPath: /data
containers:
- name: runner
image: registry.neuralplatform.ai/ci-base:latest
workingDir: /data
command: ["act_runner", "daemon", "--config", "/data/config.yaml"]
envFrom:
- secretRef:
name: neuron-technologies-runner-secret
volumeMounts:
- name: data
mountPath: /data
- name: docker-sock
mountPath: /var/run/docker.sock
resources:
requests:
memory: 512Mi
cpu: 250m
limits:
memory: 4Gi
cpu: "4"
volumes:
- name: data
emptyDir: {}
- name: docker-sock
hostPath:
path: /run/k3s/containerd/containerd.sock
type: Socket
-13
View File
@@ -23,16 +23,3 @@ resource "cloudflare_record" "np_web_stage" {
proxied = true
ttl = 1
}
# vault.neuralplatform.ai — GCE Raft HA Vault cluster via GCP Global HTTPS LB.
# DNS-only (not proxied) — GCP managed TLS cert terminates at the LB.
# Vault nodes listen on plain HTTP 8200 internally; LB does TLS.
# IP: terraform output vault_lb_ip from servers/gcp workspace = 34.54.164.21
resource "cloudflare_record" "np_vault" {
zone_id = local.zone_neuralplatform_ai
name = "vault"
type = "A"
content = "34.54.164.21"
proxied = false
ttl = 60
}
-42
View File
@@ -1,42 +0,0 @@
# Cloudflare Zero Trust Access — git.neuralplatform.ai (Gitea)
#
# The Gitea Access application itself is currently managed in the Cloudflare
# dashboard, NOT in Terraform. This file only manages the *service token* the
# Gitea Actions runners use to authenticate through CF Access while still
# keeping the human Google-OAuth gate for browser users.
#
# Why not import the application here?
# - Importing the existing dashboard app risks drifting the human-auth
# policy (Google IdP, allowed emails) which is settled and working.
# - Service tokens can be added to a dashboard-managed app without
# importing the app itself; the token resource lives at the account
# level and is referenced from a policy.
# - We pay only the cost we need to. If we later want all Access apps
# in TF we can do a focused import pass.
#
# After `terraform apply` produces the token id/secret, Will must:
# 1. Run `vault kv put secret/gitea-runner-cf-access ...` (see outputs).
# 2. In the Cloudflare dashboard, edit the existing "Gitea" Access
# application's policies and add a new policy:
# Action: Service Auth (decision = non_identity)
# Include: Service Token = "gitea-runner"
# This grants the service token bypass through CF Access on
# git.neuralplatform.ai without changing the human-auth flow.
resource "cloudflare_zero_trust_access_service_token" "gitea_runner" {
account_id = var.cloudflare_account_id
name = "gitea-runner"
# Default duration is "8760h" (1 year). Rotate via re-apply when needed.
duration = "forever"
}
output "gitea_runner_cf_access_client_id" {
description = "CF Access service token client ID for the Gitea Actions runner. Store in Vault at secret/gitea-runner-cf-access."
value = cloudflare_zero_trust_access_service_token.gitea_runner.client_id
}
output "gitea_runner_cf_access_client_secret" {
description = "CF Access service token client secret. Store in Vault at secret/gitea-runner-cf-access. Only emitted at creation time."
value = cloudflare_zero_trust_access_service_token.gitea_runner.client_secret
sensitive = true
}
@@ -85,15 +85,3 @@ RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg \
&& apt-get update \
&& apt-get install -y --no-install-recommends gh \
&& rm -rf /var/lib/apt/lists/*
# Cloudflare Access bootstrap for git clones to git.neuralplatform.ai.
# This script is sourced by bash in build containers via BASH_ENV (set by
# act_runner's container.env in deployment.yaml) so it runs before every
# step. It configures git insteadOf + CF Access extraHeaders from
# CF_ACCESS_CLIENT_ID / CF_ACCESS_CLIENT_SECRET env vars.
#
# We deliberately don't set ENTRYPOINT / CMD here — act_runner spawns
# build containers with its own entrypoint to keep them alive between
# steps, and overriding it breaks job execution.
COPY git-cf-access-init.sh /usr/local/bin/git-cf-access-init.sh
RUN chmod +x /usr/local/bin/git-cf-access-init.sh
@@ -8,7 +8,7 @@ metadata:
labels:
app: gitea-runner
annotations:
config-version: "2026-05-04-cf-access-public-url"
config-version: "2026-05-04-docker-sock-fix"
spec:
replicas: 1
selector:
@@ -19,7 +19,7 @@ spec:
labels:
app: gitea-runner
annotations:
config-version: "2026-05-04-cf-access-public-url"
config-version: "2026-05-04-docker-sock-fix"
spec:
securityContext:
runAsNonRoot: false
@@ -35,7 +35,7 @@ spec:
--name legion \
--labels "self-hosted:docker://registry.neuralplatform.ai/ci-base:latest,ubuntu-latest:docker://registry.neuralplatform.ai/ci-base:latest,ubuntu-24.04:docker://registry.neuralplatform.ai/ci-base:latest,linux,x64" \
--no-interactive
cat > /data/config.yaml << EOF
cat > /data/config.yaml << 'EOF'
runner:
capacity: 2
timeout: 3h
@@ -45,16 +45,6 @@ spec:
force_pull: false
valid_volumes: []
default_image: "registry.neuralplatform.ai/ci-base:latest"
# Build containers run with network: host. The in-cluster
# gitea name does not resolve there, so we redirect git
# operations to https://git.neuralplatform.ai using CF
# Access service-token headers. BASH_ENV makes bash source
# /usr/local/bin/git-cf-access-init.sh before every step,
# which sets up the redirect + headers.
env:
CF_ACCESS_CLIENT_ID: "${CF_ACCESS_CLIENT_ID}"
CF_ACCESS_CLIENT_SECRET: "${CF_ACCESS_CLIENT_SECRET}"
BASH_ENV: "/usr/local/bin/git-cf-access-init.sh"
extra_hosts:
- "gitea.git.svc.cluster.local:10.43.1.53"
EOF
@@ -102,7 +92,7 @@ metadata:
labels:
app: neuron-technologies-runner
annotations:
config-version: "2026-05-04-cf-access-public-url"
config-version: "2026-05-04-docker-sock-fix"
spec:
replicas: 2
selector:
@@ -113,7 +103,7 @@ spec:
labels:
app: neuron-technologies-runner
annotations:
config-version: "2026-05-04-cf-access-public-url"
config-version: "2026-05-04-docker-sock-fix"
spec:
securityContext:
runAsNonRoot: false
@@ -129,7 +119,7 @@ spec:
--name "legion-nt-$(hostname)" \
--labels "self-hosted:docker://registry.neuralplatform.ai/ci-base:latest,ubuntu-latest:docker://registry.neuralplatform.ai/ci-base:latest,ubuntu-24.04:docker://registry.neuralplatform.ai/ci-base:latest,linux,x64" \
--no-interactive
cat > /data/config.yaml << EOF
cat > /data/config.yaml << 'EOF'
runner:
capacity: 2
timeout: 3h
@@ -139,16 +129,6 @@ spec:
force_pull: false
valid_volumes: []
default_image: "registry.neuralplatform.ai/ci-base:latest"
# Build containers run with network: host. The in-cluster
# gitea name does not resolve there, so we redirect git
# operations to https://git.neuralplatform.ai using CF
# Access service-token headers. BASH_ENV makes bash source
# /usr/local/bin/git-cf-access-init.sh before every step,
# which sets up the redirect + headers.
env:
CF_ACCESS_CLIENT_ID: "${CF_ACCESS_CLIENT_ID}"
CF_ACCESS_CLIENT_SECRET: "${CF_ACCESS_CLIENT_SECRET}"
BASH_ENV: "/usr/local/bin/git-cf-access-init.sh"
extra_hosts:
- "gitea.git.svc.cluster.local:10.43.1.53"
EOF
@@ -1,20 +1,12 @@
---
# gitea-runner-secret — neural-platform org runner token
#
# GITEA_INSTANCE_URL stays as the in-cluster URL — the act_runner daemon
# polls it constantly and we don't want every poll to hit Cloudflare Access.
# Build containers, however, need the public URL because they run with
# network: host and can't resolve gitea.git.svc.cluster.local. The
# git-cf-access-init.sh entrypoint in the ci-base image rewrites the
# in-cluster URL to https://git.neuralplatform.ai with the CF Access
# headers from CF_ACCESS_CLIENT_ID / CF_ACCESS_CLIENT_SECRET below.
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
name: gitea-runner-secret
namespace: ci
annotations:
force-sync: "2026-05-04-cf-access"
force-sync: "2026-04-23"
spec:
refreshInterval: 1h
secretStoreRef:
@@ -27,21 +19,11 @@ spec:
data:
GITEA_INSTANCE_URL: "http://gitea.git.svc.cluster.local:3000"
GITEA_RUNNER_REGISTRATION_TOKEN: "{{ .runner_token }}"
CF_ACCESS_CLIENT_ID: "{{ .cf_access_client_id }}"
CF_ACCESS_CLIENT_SECRET: "{{ .cf_access_client_secret }}"
data:
- secretKey: runner_token
remoteRef:
key: secret/data/gitea
property: runner_token
- secretKey: cf_access_client_id
remoteRef:
key: secret/data/gitea-runner-cf-access
property: client_id
- secretKey: cf_access_client_secret
remoteRef:
key: secret/data/gitea-runner-cf-access
property: client_secret
---
# neuron-technologies-runner-secret — neuron-technologies org runner token
apiVersion: external-secrets.io/v1beta1
@@ -49,8 +31,6 @@ kind: ExternalSecret
metadata:
name: neuron-technologies-runner-secret
namespace: ci
annotations:
force-sync: "2026-05-04-cf-access"
spec:
refreshInterval: 1h
secretStoreRef:
@@ -63,18 +43,8 @@ spec:
data:
GITEA_INSTANCE_URL: "http://gitea.git.svc.cluster.local:3000"
GITEA_RUNNER_REGISTRATION_TOKEN: "{{ .runner_token }}"
CF_ACCESS_CLIENT_ID: "{{ .cf_access_client_id }}"
CF_ACCESS_CLIENT_SECRET: "{{ .cf_access_client_secret }}"
data:
- secretKey: runner_token
remoteRef:
key: secret/data/gitea
property: neuron_technologies_runner_token
- secretKey: cf_access_client_id
remoteRef:
key: secret/data/gitea-runner-cf-access
property: client_id
- secretKey: cf_access_client_secret
remoteRef:
key: secret/data/gitea-runner-cf-access
property: client_secret
@@ -1,50 +0,0 @@
#!/bin/sh
# git-cf-access-init.sh
#
# Configures git so any clone/fetch from Gitea ends up going to
# git.neuralplatform.ai with the runner's Cloudflare Access service-token
# headers attached.
#
# How this gets invoked:
# The forgejo-runner job execution path runs each step via a
# non-interactive bash invocation inside the build container. Setting
# BASH_ENV=/usr/local/bin/git-cf-access-init.sh in act_runner's
# container.env causes bash to source this script before any step's
# commands run. (See servers/legion/k8s/gitea-runner/deployment.yaml.)
#
# What it does:
# 1. Rewrites http://gitea.git.svc.cluster.local:3000/ → https://git.neuralplatform.ai/
# via insteadOf. The runner registered against the in-cluster URL (no
# CF Access on the daemon's polling loop), so act_runner advertises
# that URL to the build container as github.server_url. Build
# containers run with network: host and can't resolve
# *.svc.cluster.local, so we need to redirect to the public URL.
# 2. Adds the CF Access service-token headers to outbound requests to
# git.neuralplatform.ai so the clone authenticates through CF Access.
#
# Idempotent — re-runs replace any prior config keys without accumulating
# duplicate header entries.
#
# Known limitation: actions/checkout sets an Authorization extraheader
# keyed to the server URL it was given (the in-cluster URL). After
# insteadOf substitution the request goes to the public URL where git
# matches http.<public>.extraheader, and the in-cluster-keyed
# Authorization header is dropped. For public repos this is fine. For
# private repos the per-job token will not be sent — see the PR
# description for the follow-up plan if dharma-el's CI needs that token.
if [ -n "${CF_ACCESS_CLIENT_ID:-}" ] && [ -n "${CF_ACCESS_CLIENT_SECRET:-}" ]; then
git config --global --replace-all \
url."https://git.neuralplatform.ai/".insteadOf \
"http://gitea.git.svc.cluster.local:3000/" 2>/dev/null || true
# Reset extraHeader on the public URL, then add both CF Access headers.
git config --global --unset-all \
http."https://git.neuralplatform.ai/".extraHeader 2>/dev/null || true
git config --global --add \
http."https://git.neuralplatform.ai/".extraHeader \
"CF-Access-Client-Id: ${CF_ACCESS_CLIENT_ID}" 2>/dev/null || true
git config --global --add \
http."https://git.neuralplatform.ai/".extraHeader \
"CF-Access-Client-Secret: ${CF_ACCESS_CLIENT_SECRET}" 2>/dev/null || true
fi
@@ -82,12 +82,7 @@ spec:
apiVersion: apps/v1
kind: Deployment
name: neuron-marketing
# minReplicas=1 to match the file's own convention (see header comment).
# Kubernetes only allows minReplicas=0 when at least one Object or External
# metric is configured (queue depth, custom signal, etc.); with only a
# Resource (CPU) metric, scale-to-zero is rejected and the whole HPA is
# invalid — which was blocking neuron-prod's Argo CD sync.
minReplicas: 1
minReplicas: 0
maxReplicas: 8
metrics:
- type: Resource
@@ -117,32 +117,6 @@ spec:
matchLabels:
kubernetes.io/metadata.name: neuron-prod
---
# ── dharma: accept from Traefik (kube-system) and neuron-prod namespace ──────
# The dharma pod was healthy and the IngressRoute was correct, but cross-
# namespace ingress from kube-system (Traefik) was denied by default-deny-all,
# so every external request landed at Traefik and bounced back as 502. This
# allow rule mirrors `allow-mcp-ingress` and brings dharma into line with the
# other neuron-prod services.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-dharma-ingress
namespace: neuron-prod
spec:
podSelector:
matchLabels:
app: dharma
policyTypes:
- Ingress
ingress:
- from:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: kube-system
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: neuron-prod
---
# ── Egress: all prod pods may reach platform (postgres/redis), vault,
# monitoring (alloy OTLP), kube-dns, and the internet (external APIs) ─
apiVersion: networking.k8s.io/v1
+30 -4
View File
@@ -92,11 +92,37 @@ resource "cloudflare_zero_trust_tunnel_cloudflared_config" "legion" {
}
}
# vault.neuralplatform.ai — moved to GCP Global HTTPS LB (34.54.164.21)
# DNS is now a direct A record (not proxied) in dns-neuralplatform.tf
ingress_rule {
hostname = "vault.neuralplatform.ai"
service = "https://traefik.kube-system.svc:443"
origin_request {
no_tls_verify = true
}
}
# watch.nook.family, jellyfin.nook.family, bazarr.nook.family — removed
# This infrastructure is focused on Neuron; nook.family media stack retired
ingress_rule {
hostname = "watch.nook.family"
service = "https://traefik.kube-system.svc:443"
origin_request {
no_tls_verify = true
}
}
ingress_rule {
hostname = "jellyfin.nook.family"
service = "https://traefik.kube-system.svc:443"
origin_request {
no_tls_verify = true
}
}
ingress_rule {
hostname = "bazarr.nook.family"
service = "https://traefik.kube-system.svc:443"
origin_request {
no_tls_verify = true
}
}
# fornax.neuralplatform.ai — Fornax torrent coordinator (qBittorrent API proxy)