7 Commits

Author SHA1 Message Date
Will Anderson b4b05bfe40 fix(ci): remove duplicate runner Deployment from apps/
The stale apps/gitea-runner.yaml contained two Deployment manifests
that conflicted with the canonical Deployments owned by the
gitea-runner-config Argo Application (pointing at k8s/gitea-runner/).

Dual ownership caused Argo CD to fight itself — restarting runner
pods mid-job and producing the "context canceled" failures on
neuron-technologies/dharma-el CI.

Canonical Deployments (config-version 2026-05-04-cf-access-public-url,
docker.sock, CF Access env, replicas=2 for nt-runner) live in
k8s/gitea-runner/deployment.yaml and are managed by gitea-runner-config.
2026-05-04 17:55:28 -05:00
Will Anderson 48106b27ec vault: cut over to GCE Raft HA cluster, retire nook.family media stack
- dns-neuralplatform.tf: add vault.neuralplatform.ai A record → 34.54.164.21 (GCP LB)
  DNS-only (not proxied) so GCP managed TLS cert can provision correctly
- main.tf: remove vault.neuralplatform.ai from Cloudflare tunnel ingress
  (now served directly via GCP Global HTTPS LB)
- main.tf: remove watch.nook.family, jellyfin.nook.family, bazarr.nook.family
  from tunnel ingress (nook.family media stack retired; infra is Neuron-focused)

GCE Vault cluster already initialized and running (3-node Raft, active since
2026-05-04T16:05). Secrets migrated 48/48 from k3s vault. ESO ClusterSecretStore
validated against new vault. k3s vault-0 is now superseded.
2026-05-04 16:40:03 -05:00
will.anderson 0006380c27 route runner build container clones via public URL with CF Access (#7) 2026-05-04 21:37:53 +00:00
will.anderson 7ab97eb88d add CF Access service token for Gitea Actions runner (#5) 2026-05-04 21:25:20 +00:00
will.anderson 23fc64e7b7 fix(neuron-prod): set neuron-marketing-hpa minReplicas to 1 (#8) 2026-05-04 21:21:16 +00:00
will.anderson a64860064b fix(neuron-prod): add allow-dharma-ingress NetworkPolicy (#6) 2026-05-04 21:16:01 +00:00
will.anderson cbb564ccf5 revert(ci): runner public URL — CF Access blocks registration (#4) 2026-05-04 21:05:29 +00:00
10 changed files with 210 additions and 203 deletions
-165
View File
@@ -1,165 +0,0 @@
---
# Gitea CI runner — general-purpose (legion)
# Uses host Docker socket for container management and docker build/push.
apiVersion: apps/v1
kind: Deployment
metadata:
name: gitea-runner
namespace: ci
labels:
app: gitea-runner
spec:
replicas: 1
selector:
matchLabels:
app: gitea-runner
template:
metadata:
labels:
app: gitea-runner
annotations:
config-version: "2026-04-27-containerd-sock"
spec:
securityContext:
runAsNonRoot: false # act_runner needs root for container management
initContainers:
- name: register
image: registry.neuralplatform.ai/ci-base:latest
workingDir: /data
command: ["/bin/sh", "-c"]
args:
- |
act_runner register \
--instance "$GITEA_INSTANCE_URL" \
--token "$GITEA_RUNNER_REGISTRATION_TOKEN" \
--name legion \
--labels "self-hosted:docker://registry.neuralplatform.ai/ci-base:latest,ubuntu-latest:docker://registry.neuralplatform.ai/ci-base:latest,ubuntu-24.04:docker://registry.neuralplatform.ai/ci-base:latest,linux,x64" \
--no-interactive
cat > /data/config.yaml << 'EOF'
runner:
capacity: 2
timeout: 3h
container:
network: host
docker_host: "unix:///run/k3s/containerd/containerd.sock"
force_pull: false
valid_volumes: []
default_image: "registry.neuralplatform.ai/ci-base:latest"
extra_hosts:
- "gitea.git.svc.cluster.local:10.43.1.53"
EOF
envFrom:
- secretRef:
name: gitea-runner-secret
volumeMounts:
- name: data
mountPath: /data
containers:
- name: runner
image: registry.neuralplatform.ai/ci-base:latest
workingDir: /data
command: ["act_runner", "daemon", "--config", "/data/config.yaml"]
envFrom:
- secretRef:
name: gitea-runner-secret
volumeMounts:
- name: data
mountPath: /data
- name: docker-sock
mountPath: /var/run/docker.sock
resources:
requests:
memory: 512Mi
cpu: 250m
limits:
memory: 4Gi
cpu: "4"
volumes:
- name: data
emptyDir: {}
- name: docker-sock
hostPath:
path: /run/k3s/containerd/containerd.sock
type: Socket
---
# Neuron Technologies CI runner
apiVersion: apps/v1
kind: Deployment
metadata:
name: neuron-technologies-runner
namespace: ci
labels:
app: neuron-technologies-runner
spec:
replicas: 1
selector:
matchLabels:
app: neuron-technologies-runner
template:
metadata:
labels:
app: neuron-technologies-runner
annotations:
config-version: "2026-04-27-containerd-sock"
spec:
securityContext:
runAsNonRoot: false
initContainers:
- name: register
image: registry.neuralplatform.ai/ci-base:latest
workingDir: /data
command: ["/bin/sh", "-c"]
args:
- |
act_runner register \
--instance "$GITEA_INSTANCE_URL" \
--token "$GITEA_RUNNER_REGISTRATION_TOKEN" \
--name neuron-technologies \
--labels "self-hosted:docker://registry.neuralplatform.ai/ci-base:latest,ubuntu-latest:docker://registry.neuralplatform.ai/ci-base:latest,ubuntu-24.04:docker://registry.neuralplatform.ai/ci-base:latest,linux,x64" \
--no-interactive
cat > /data/config.yaml << 'EOF'
runner:
capacity: 2
timeout: 3h
container:
network: host
docker_host: "unix:///run/k3s/containerd/containerd.sock"
force_pull: false
valid_volumes: []
default_image: "registry.neuralplatform.ai/ci-base:latest"
extra_hosts:
- "gitea.git.svc.cluster.local:10.43.1.53"
EOF
envFrom:
- secretRef:
name: neuron-technologies-runner-secret
volumeMounts:
- name: data
mountPath: /data
containers:
- name: runner
image: registry.neuralplatform.ai/ci-base:latest
workingDir: /data
command: ["act_runner", "daemon", "--config", "/data/config.yaml"]
envFrom:
- secretRef:
name: neuron-technologies-runner-secret
volumeMounts:
- name: data
mountPath: /data
- name: docker-sock
mountPath: /var/run/docker.sock
resources:
requests:
memory: 512Mi
cpu: 250m
limits:
memory: 4Gi
cpu: "4"
volumes:
- name: data
emptyDir: {}
- name: docker-sock
hostPath:
path: /run/k3s/containerd/containerd.sock
type: Socket
+13
View File
@@ -23,3 +23,16 @@ resource "cloudflare_record" "np_web_stage" {
proxied = true
ttl = 1
}
# vault.neuralplatform.ai — GCE Raft HA Vault cluster via GCP Global HTTPS LB.
# DNS-only (not proxied) — GCP managed TLS cert terminates at the LB.
# Vault nodes listen on plain HTTP 8200 internally; LB does TLS.
# IP: terraform output vault_lb_ip from servers/gcp workspace = 34.54.164.21
resource "cloudflare_record" "np_vault" {
zone_id = local.zone_neuralplatform_ai
name = "vault"
type = "A"
# Hard-coded copy of the vault_lb_ip output noted above; it is not wired to
# the servers/gcp workspace, so it must be updated by hand if the LB address
# ever changes.
content = "34.54.164.21"
# Must stay DNS-only: the header comment above relies on clients reaching the
# GCP LB directly so the GCP-managed certificate can terminate TLS there.
proxied = false
# Explicit 60-second TTL, unlike the proxied record above which uses ttl = 1.
# NOTE(review): presumably kept short to make a future LB IP change cheap — confirm.
ttl = 60
}
+42
View File
@@ -0,0 +1,42 @@
# Cloudflare Zero Trust Access — git.neuralplatform.ai (Gitea)
#
# The Gitea Access application itself is currently managed in the Cloudflare
# dashboard, NOT in Terraform. This file only manages the *service token* the
# Gitea Actions runners use to authenticate through CF Access while still
# keeping the human Google-OAuth gate for browser users.
#
# Why not import the application here?
# - Importing the existing dashboard app risks drifting the human-auth
# policy (Google IdP, allowed emails) which is settled and working.
# - Service tokens can be added to a dashboard-managed app without
# importing the app itself; the token resource lives at the account
# level and is referenced from a policy.
# - We pay only the cost we need to. If we later want all Access apps
# in TF we can do a focused import pass.
#
# After `terraform apply` produces the token id/secret, Will must:
# 1. Run `vault kv put secret/gitea-runner-cf-access ...` (see outputs).
# 2. In the Cloudflare dashboard, edit the existing "Gitea" Access
# application's policies and add a new policy:
# Action: Service Auth (decision = non_identity)
# Include: Service Token = "gitea-runner"
# This grants the service token bypass through CF Access on
# git.neuralplatform.ai without changing the human-auth flow.
# Account-level CF Access service token used by the Gitea Actions runners.
resource "cloudflare_zero_trust_access_service_token" "gitea_runner" {
account_id = var.cloudflare_account_id
# This name is what the dashboard-managed "Service Auth" policy described in
# the file header selects on (Include: Service Token = "gitea-runner").
name = "gitea-runner"
# The provider default duration is "8760h" (1 year); it is overridden here
# with "forever" so the runner token does not expire.
# NOTE(review): confirm the pinned Cloudflare provider version accepts
# "forever" as a duration value.
# A plain re-apply with unchanged config is a no-op and does not rotate the
# token; to rotate, force replacement of this resource and then update the
# Vault secret and redeploy the runners.
duration = "forever"
}
# Client ID half of the service token; not marked sensitive, so it is shown
# in plain `terraform output`.
output "gitea_runner_cf_access_client_id" {
description = "CF Access service token client ID for the Gitea Actions runner. Store in Vault at secret/gitea-runner-cf-access."
value = cloudflare_zero_trust_access_service_token.gitea_runner.client_id
}
# Client secret half of the service token. Marked sensitive, so it is redacted
# in normal plan/apply output; read it explicitly with
# `terraform output -raw gitea_runner_cf_access_client_secret`.
# The value is still stored in the Terraform state, so state access must be
# treated as secret access.
output "gitea_runner_cf_access_client_secret" {
description = "CF Access service token client secret. Store in Vault at secret/gitea-runner-cf-access. Only emitted at creation time."
value = cloudflare_zero_trust_access_service_token.gitea_runner.client_secret
sensitive = true
}
@@ -85,3 +85,15 @@ RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg \
&& apt-get update \
&& apt-get install -y --no-install-recommends gh \
&& rm -rf /var/lib/apt/lists/*
# Cloudflare Access bootstrap for git clones to git.neuralplatform.ai.
# This script is sourced by bash in build containers via BASH_ENV (set by
# act_runner's container.env in deployment.yaml) so it runs before every
# step. It configures git insteadOf + CF Access extraHeaders from
# CF_ACCESS_CLIENT_ID / CF_ACCESS_CLIENT_SECRET env vars.
#
# We deliberately don't set ENTRYPOINT / CMD here — act_runner spawns
# build containers with its own entrypoint to keep them alive between
# steps, and overriding it breaks job execution.
COPY git-cf-access-init.sh /usr/local/bin/git-cf-access-init.sh
RUN chmod +x /usr/local/bin/git-cf-access-init.sh
@@ -8,7 +8,7 @@ metadata:
labels:
app: gitea-runner
annotations:
config-version: "2026-05-04-docker-sock-fix"
config-version: "2026-05-04-cf-access-public-url"
spec:
replicas: 1
selector:
@@ -19,7 +19,7 @@ spec:
labels:
app: gitea-runner
annotations:
config-version: "2026-05-04-docker-sock-fix"
config-version: "2026-05-04-cf-access-public-url"
spec:
securityContext:
runAsNonRoot: false
@@ -35,7 +35,7 @@ spec:
--name legion \
--labels "self-hosted:docker://registry.neuralplatform.ai/ci-base:latest,ubuntu-latest:docker://registry.neuralplatform.ai/ci-base:latest,ubuntu-24.04:docker://registry.neuralplatform.ai/ci-base:latest,linux,x64" \
--no-interactive
cat > /data/config.yaml << 'EOF'
cat > /data/config.yaml << EOF
runner:
capacity: 2
timeout: 3h
@@ -45,6 +45,16 @@ spec:
force_pull: false
valid_volumes: []
default_image: "registry.neuralplatform.ai/ci-base:latest"
# Build containers run with network: host. The in-cluster
# gitea name does not resolve there, so we redirect git
# operations to https://git.neuralplatform.ai using CF
# Access service-token headers. BASH_ENV makes bash source
# /usr/local/bin/git-cf-access-init.sh before every step,
# which sets up the redirect + headers.
env:
CF_ACCESS_CLIENT_ID: "${CF_ACCESS_CLIENT_ID}"
CF_ACCESS_CLIENT_SECRET: "${CF_ACCESS_CLIENT_SECRET}"
BASH_ENV: "/usr/local/bin/git-cf-access-init.sh"
extra_hosts:
- "gitea.git.svc.cluster.local:10.43.1.53"
EOF
@@ -92,7 +102,7 @@ metadata:
labels:
app: neuron-technologies-runner
annotations:
config-version: "2026-05-04-docker-sock-fix"
config-version: "2026-05-04-cf-access-public-url"
spec:
replicas: 2
selector:
@@ -103,7 +113,7 @@ spec:
labels:
app: neuron-technologies-runner
annotations:
config-version: "2026-05-04-docker-sock-fix"
config-version: "2026-05-04-cf-access-public-url"
spec:
securityContext:
runAsNonRoot: false
@@ -119,7 +129,7 @@ spec:
--name "legion-nt-$(hostname)" \
--labels "self-hosted:docker://registry.neuralplatform.ai/ci-base:latest,ubuntu-latest:docker://registry.neuralplatform.ai/ci-base:latest,ubuntu-24.04:docker://registry.neuralplatform.ai/ci-base:latest,linux,x64" \
--no-interactive
cat > /data/config.yaml << 'EOF'
cat > /data/config.yaml << EOF
runner:
capacity: 2
timeout: 3h
@@ -129,6 +139,16 @@ spec:
force_pull: false
valid_volumes: []
default_image: "registry.neuralplatform.ai/ci-base:latest"
# Build containers run with network: host. The in-cluster
# gitea name does not resolve there, so we redirect git
# operations to https://git.neuralplatform.ai using CF
# Access service-token headers. BASH_ENV makes bash source
# /usr/local/bin/git-cf-access-init.sh before every step,
# which sets up the redirect + headers.
env:
CF_ACCESS_CLIENT_ID: "${CF_ACCESS_CLIENT_ID}"
CF_ACCESS_CLIENT_SECRET: "${CF_ACCESS_CLIENT_SECRET}"
BASH_ENV: "/usr/local/bin/git-cf-access-init.sh"
extra_hosts:
- "gitea.git.svc.cluster.local:10.43.1.53"
EOF
@@ -1,12 +1,20 @@
---
# gitea-runner-secret — neural-platform org runner token
#
# GITEA_INSTANCE_URL stays as the in-cluster URL — the act_runner daemon
# polls it constantly and we don't want every poll to hit Cloudflare Access.
# Build containers, however, need the public URL because they run with
# network: host and can't resolve gitea.git.svc.cluster.local. The
# git-cf-access-init.sh script in the ci-base image (sourced by bash via
# BASH_ENV — it is not an image ENTRYPOINT) rewrites the in-cluster URL to
# https://git.neuralplatform.ai with the CF Access headers from
# CF_ACCESS_CLIENT_ID / CF_ACCESS_CLIENT_SECRET below.
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
name: gitea-runner-secret
namespace: ci
annotations:
force-sync: "2026-04-23"
force-sync: "2026-05-04-cf-access"
spec:
refreshInterval: 1h
secretStoreRef:
@@ -19,11 +27,21 @@ spec:
data:
GITEA_INSTANCE_URL: "http://gitea.git.svc.cluster.local:3000"
GITEA_RUNNER_REGISTRATION_TOKEN: "{{ .runner_token }}"
CF_ACCESS_CLIENT_ID: "{{ .cf_access_client_id }}"
CF_ACCESS_CLIENT_SECRET: "{{ .cf_access_client_secret }}"
data:
- secretKey: runner_token
remoteRef:
key: secret/data/gitea
property: runner_token
- secretKey: cf_access_client_id
remoteRef:
key: secret/data/gitea-runner-cf-access
property: client_id
- secretKey: cf_access_client_secret
remoteRef:
key: secret/data/gitea-runner-cf-access
property: client_secret
---
# neuron-technologies-runner-secret — neuron-technologies org runner token
apiVersion: external-secrets.io/v1beta1
@@ -31,6 +49,8 @@ kind: ExternalSecret
metadata:
name: neuron-technologies-runner-secret
namespace: ci
annotations:
force-sync: "2026-05-04-cf-access"
spec:
refreshInterval: 1h
secretStoreRef:
@@ -43,8 +63,18 @@ spec:
data:
GITEA_INSTANCE_URL: "http://gitea.git.svc.cluster.local:3000"
GITEA_RUNNER_REGISTRATION_TOKEN: "{{ .runner_token }}"
CF_ACCESS_CLIENT_ID: "{{ .cf_access_client_id }}"
CF_ACCESS_CLIENT_SECRET: "{{ .cf_access_client_secret }}"
data:
- secretKey: runner_token
remoteRef:
key: secret/data/gitea
property: neuron_technologies_runner_token
- secretKey: cf_access_client_id
remoteRef:
key: secret/data/gitea-runner-cf-access
property: client_id
- secretKey: cf_access_client_secret
remoteRef:
key: secret/data/gitea-runner-cf-access
property: client_secret
@@ -0,0 +1,50 @@
#!/bin/sh
# git-cf-access-init.sh
#
# Configures git so any clone/fetch from Gitea ends up going to
# git.neuralplatform.ai with the runner's Cloudflare Access service-token
# headers attached.
#
# How this gets invoked:
#   The act_runner job execution path runs each step via a
#   non-interactive bash invocation inside the build container. Setting
#   BASH_ENV=/usr/local/bin/git-cf-access-init.sh in act_runner's
#   container.env causes bash to source this script before any step's
#   commands run. (See servers/legion/k8s/gitea-runner/deployment.yaml.)
#   Because the file is sourced rather than executed, the shebang above only
#   matters if someone runs the script directly; nothing here defines
#   variables or functions, so the step's shell environment is left untouched.
#
# What it does:
#   1. Rewrites http://gitea.git.svc.cluster.local:3000/ → https://git.neuralplatform.ai/
#      via insteadOf. The runner registered against the in-cluster URL (no
#      CF Access on the daemon's polling loop), so act_runner advertises
#      that URL to the build container as github.server_url. Build
#      containers run with network: host and can't resolve
#      *.svc.cluster.local, so we need to redirect to the public URL.
#   2. Adds the CF Access service-token headers to outbound requests to
#      git.neuralplatform.ai so the clone authenticates through CF Access.
#
# Idempotent — re-runs replace any prior config keys without accumulating
# duplicate header entries.
#
# Error handling: every git call discards stderr and is followed by
# `|| true`, so a failing `git config` never aborts or pollutes the output
# of the step that triggered the sourcing. The flip side is that a broken
# setup is silent here and only shows up later as a failed clone.
#
# Known limitation: actions/checkout sets an Authorization extraheader
# keyed to the server URL it was given (the in-cluster URL). After
# insteadOf substitution the request goes to the public URL where git
# matches http.<public>.extraheader, and the in-cluster-keyed
# Authorization header is dropped. For public repos this is fine. For
# private repos the per-job token will not be sent — see the PR
# description for the follow-up plan if dharma-el's CI needs that token.
# Do nothing unless both halves of the service token are present and non-empty;
# the `:-` defaults keep the test safe under `set -u`.
if [ -n "${CF_ACCESS_CLIENT_ID:-}" ] && [ -n "${CF_ACCESS_CLIENT_SECRET:-}" ]; then
# Step 1: redirect in-cluster Gitea URLs to the public hostname.
# --replace-all keeps a single insteadOf value no matter how often this runs.
git config --global --replace-all \
url."https://git.neuralplatform.ai/".insteadOf \
"http://gitea.git.svc.cluster.local:3000/" 2>/dev/null || true
# Reset extraHeader on the public URL, then add both CF Access headers.
# The unset fails when the key does not exist yet (first run); that is
# expected and swallowed.
git config --global --unset-all \
http."https://git.neuralplatform.ai/".extraHeader 2>/dev/null || true
# Step 2: extraHeader is multi-valued, so each header is appended with --add.
git config --global --add \
http."https://git.neuralplatform.ai/".extraHeader \
"CF-Access-Client-Id: ${CF_ACCESS_CLIENT_ID}" 2>/dev/null || true
git config --global --add \
http."https://git.neuralplatform.ai/".extraHeader \
"CF-Access-Client-Secret: ${CF_ACCESS_CLIENT_SECRET}" 2>/dev/null || true
fi
@@ -82,7 +82,12 @@ spec:
apiVersion: apps/v1
kind: Deployment
name: neuron-marketing
minReplicas: 0
# minReplicas=1 to match the file's own convention (see header comment).
# Kubernetes only allows minReplicas=0 when the alpha HPAScaleToZero feature
# gate is enabled AND at least one Object or External metric is configured
# (queue depth, custom signal, etc.); with only a Resource (CPU) metric,
# scale-to-zero is rejected and the whole HPA is invalid — which was
# blocking neuron-prod's Argo CD sync.
minReplicas: 1
maxReplicas: 8
metrics:
- type: Resource
@@ -117,6 +117,32 @@ spec:
matchLabels:
kubernetes.io/metadata.name: neuron-prod
---
# ── dharma: accept from Traefik (kube-system) and neuron-prod namespace ──────
# The dharma pod was healthy and the IngressRoute was correct, but cross-
# namespace ingress from kube-system (Traefik) was denied by default-deny-all,
# so every external request landed at Traefik and bounced back as 502. This
# allow rule mirrors `allow-mcp-ingress` and brings dharma into line with the
# other neuron-prod services.
#
# Scope note: both `from` entries select whole namespaces (no podSelector) and
# the rule lists no `ports`, so every pod in kube-system and in neuron-prod —
# not only Traefik — may reach the dharma pods on any port.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-dharma-ingress
namespace: neuron-prod
spec:
# Applies to pods labelled app=dharma in the neuron-prod namespace.
podSelector:
matchLabels:
app: dharma
# Ingress only; egress for these pods is governed by other policies.
policyTypes:
- Ingress
ingress:
- from:
# Entry 1: any pod in kube-system (where Traefik runs).
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: kube-system
# Entry 2: any pod in neuron-prod itself (in-namespace callers).
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: neuron-prod
---
# ── Egress: all prod pods may reach platform (postgres/redis), vault,
# monitoring (alloy OTLP), kube-dns, and the internet (external APIs) ─
apiVersion: networking.k8s.io/v1
+4 -30
View File
@@ -92,37 +92,11 @@ resource "cloudflare_zero_trust_tunnel_cloudflared_config" "legion" {
}
}
ingress_rule {
hostname = "vault.neuralplatform.ai"
service = "https://traefik.kube-system.svc:443"
origin_request {
no_tls_verify = true
}
}
# vault.neuralplatform.ai — moved to GCP Global HTTPS LB (34.54.164.21)
# DNS is now a direct A record (not proxied) in dns-neuralplatform.tf
ingress_rule {
hostname = "watch.nook.family"
service = "https://traefik.kube-system.svc:443"
origin_request {
no_tls_verify = true
}
}
ingress_rule {
hostname = "jellyfin.nook.family"
service = "https://traefik.kube-system.svc:443"
origin_request {
no_tls_verify = true
}
}
ingress_rule {
hostname = "bazarr.nook.family"
service = "https://traefik.kube-system.svc:443"
origin_request {
no_tls_verify = true
}
}
# watch.nook.family, jellyfin.nook.family, bazarr.nook.family — removed
# This infrastructure is focused on Neuron; nook.family media stack retired
# fornax.neuralplatform.ai — Fornax torrent coordinator (qBittorrent API proxy)