Replace k3s with direct soul-demo watchdog #57

Merged
will.anderson merged 1 commit from fix/checkout-auth-reveal into dev 2026-05-11 00:47:00 +00:00
5 changed files with 40 additions and 86 deletions
+6 -5
View File
@@ -172,11 +172,12 @@ jobs:
- name: Touch HTML placeholder files
run: touch src/index.html src/about.html src/terms.html src/enterprise-terms.html
- name: Create soul-demo-image.tar placeholder
# Dockerfile.stage COPYs this file (used by k3s at runtime).
# We only need the COPY to succeed here; real tar is built by
# build-stage.sh in the deploy pipeline.
run: touch dist/soul-demo-image.tar
- name: Create soul-demo placeholder
# Dockerfile.stage COPYs dist/soul-demo. We only need the binary to exist
# for the Docker build to succeed; the real binary is compiled in stage CI.
run: |
touch dist/soul-demo
chmod +x dist/soul-demo
- name: Build Docker image (local only — no push)
run: |
+5 -25
View File
@@ -173,15 +173,15 @@ jobs:
# ── Docker build + push ───────────────────────────────────────────────
- name: Build soul-demo image tar
# Dockerfile.stage COPYs dist/soul-demo-image.tar so k3s can import
# soul-demo:local at runtime. We compile soul-demo from source on the
# host runner (ci-base has gcc), build a minimal OCI image, and save it.
- name: Build soul-demo binary
# Compile soul-demo directly on the host runner (ci-base has gcc).
# Cloud Run runs soul-demo as a direct subprocess with a watchdog loop —
# no k3s, no OCI image needed. One binary per container; Cloud Run
# handles horizontal scaling.
# Moved AFTER JS compilation to avoid Docker memory pressure killing elc.
if: steps.changetype.outputs.asset_only != 'true'
run: |
set -euo pipefail
# Compile el_runtime.o and soul-demo on the host runner
cc -O2 -DHAVE_CURL -c runtime/el_runtime.c -I runtime/ -o /tmp/el_runtime.o
cc -O2 -rdynamic -DEL_SOUL_DEMO_BUILD \
-I runtime/ \
@@ -189,26 +189,6 @@ jobs:
dist/soul-demo.c dist/vessel_stubs.c /tmp/el_runtime.o \
-lcurl -lpthread -ldl -lm -lssl -lcrypto
echo "soul-demo compiled: $(ls -lh dist/soul-demo)"
# Package as minimal OCI image for k3s import
# --no-cache: prevents reuse of corrupted overlay2 layers from prior failed runs
docker build --no-cache -f dist/Dockerfile.soul-demo -t soul-demo:local dist/
docker save soul-demo:local -o dist/soul-demo-image.tar
echo "soul-demo-image.tar: $(du -sh dist/soul-demo-image.tar | cut -f1)"
docker rmi soul-demo:local 2>/dev/null || true
- name: Download k3s binary
# Pre-download k3s on the host runner so Dockerfile.stage can COPY it
# directly. Previously k3s was downloaded inside the Docker builder stage,
# which combined with build-essential and C compilation caused RWLayer nil
# corruption on the runner's overlay2 driver. Host-runner download is safe.
if: steps.changetype.outputs.asset_only != 'true'
run: |
set -euo pipefail
curl -fL --retry 3 --retry-delay 10 \
https://github.com/k3s-io/k3s/releases/download/v1.32.4%2Bk3s1/k3s \
-o dist/k3s
chmod +x dist/k3s
echo "k3s: $(ls -lh dist/k3s)"
- name: Build and tag image
if: steps.changetype.outputs.asset_only != 'true'
+4 -24
View File
@@ -4,15 +4,13 @@
# - neuron-web on port 8080 (landing page server)
# - soul-demo on port 7772 (demo chat, localhost only)
#
# All binaries (neuron-web, soul-demo, k3s) are pre-built by CI on the host
# runner before this Dockerfile runs. This keeps the Docker build single-stage
# with no compilation and no network downloads, eliminating the multi-stage
# complexity that caused RWLayer corruption on the runner's overlay2 driver.
# All binaries (neuron-web, soul-demo) are pre-built by CI on the host runner
# before this Dockerfile runs. This keeps the Docker build single-stage with
# no compilation and no network downloads.
#
# CI pre-build steps (in stage.yaml):
# - neuron-web: built by `elb build` → dist/neuron-landing
# - soul-demo: compiled by cc on host → dist/soul-demo
# - k3s: downloaded by curl on host → dist/k3s
FROM ubuntu:24.04
@@ -27,9 +25,7 @@ RUN apt-get update \
&& groupadd -r landing && useradd -r -g landing landing \
&& mkdir -p /srv/landing/assets /srv/landing/js /srv/landing/shares \
&& mkdir -p /srv/soul/engram-demo \
&& chown -R landing:landing /srv/landing /srv/soul \
&& mkdir -p /var/lib/rancher/k3s /tmp/k3s \
&& chown -R landing:landing /var/lib/rancher /tmp/k3s
&& chown -R landing:landing /srv/landing /srv/soul
# neuron-web binary — produced by `elb build` in CI (linux/amd64)
COPY dist/neuron-landing /usr/local/bin/neuron-web
@@ -39,18 +35,6 @@ RUN chmod +x /usr/local/bin/neuron-web
COPY dist/soul-demo /usr/local/bin/soul-demo
RUN chmod +x /usr/local/bin/soul-demo
# k3s binary — downloaded from GitHub releases by CI
COPY dist/k3s /usr/local/bin/k3s
RUN chmod +x /usr/local/bin/k3s
# soul-demo OCI image tar — k3s imports this at startup (no registry needed)
RUN mkdir -p /var/lib/rancher/k3s/agent/images
COPY dist/soul-demo-image.tar /var/lib/rancher/k3s/agent/images/soul-demo.tar
# k3s manifests — auto-applied when k3s starts
RUN mkdir -p /var/lib/rancher/k3s/server/manifests
COPY dist/k3s-soul-demo.yaml /var/lib/rancher/k3s/server/manifests/soul-demo.yaml
# Engram snapshot — baked in so soul has memory from cold start
COPY dist/engram-snapshot.json /srv/soul/engram-demo/snapshot.json
@@ -73,11 +57,7 @@ ENV LANDING_ROOT=/srv/landing
ENV PORT=8080
ENV NEURON_HOME=/srv/soul/engram-demo
ENV NEURON_PORT=7772
ENV K3S_DATA_DIR=/var/lib/rancher/k3s
ENV KUBECONFIG=/var/lib/rancher/k3s/server/cred/admin.kubeconfig
# k3s requires root to create network namespaces and mount cgroups.
# Cloud Run gen2 sandbox is the security boundary here.
EXPOSE 8080
CMD ["/usr/local/bin/entrypoint.sh"]
+17 -32
View File
@@ -1,41 +1,26 @@
#!/bin/sh
set -e
# SKIP_K3S=1 — bypass soul-demo startup and go straight to neuron-web.
# Used by the dev CI smoke test. The variable name is historical: it dates
# from when soul-demo ran under k3s, which needed kernel capabilities
# (overlayfs / privileged mode) that the CI container runtime lacks.
if [ "${SKIP_K3S:-0}" = "1" ]; then
echo "[entrypoint] SKIP_K3S=1: starting neuron-web directly (no k3s/soul-demo)."
echo "[entrypoint] SKIP_K3S=1: starting neuron-web directly (no soul-demo)."
exec /usr/local/bin/neuron-web
fi
echo "[entrypoint] Starting k3s server (embedded soul-demo orchestrator)..."
# Soul-demo watchdog: start soul-demo and restart it automatically on crash.
# Cloud Run gen2 doesn't reliably provide eth0 with a unicast IP, so k3s flannel
# fails at startup. Running soul-demo directly is simpler, lighter, and fully
# self-healing. Cloud Run handles horizontal scaling — no HPA needed.
echo "[entrypoint] Starting soul-demo watchdog on :${NEURON_PORT:-7772}..."
(
  # soul-demo supervisor: runs in a background subshell and restarts
  # soul-demo every time it exits, after a fixed 3 second pause.
  while true; do
    echo "[soul-watchdog] starting soul-demo (NEURON_HOME=${NEURON_HOME})"
    # Capture the exit status instead of discarding it with `|| true`, so the
    # logs show whether soul-demo exited cleanly (0), failed, or was killed by
    # a signal (128+N). The `||` list also keeps `set -e` from terminating
    # the watchdog when soul-demo exits non-zero.
    rc=0
    /usr/local/bin/soul-demo 2>&1 || rc=$?
    echo "[soul-watchdog] soul-demo exited (status ${rc}), restarting in 3s..."
    sleep 3
  done
) &
# k3s server — single-node mode, disable unused components
# --disable traefik,servicelb: we don't need an ingress or LB
# --disable metrics-server: saves ~50MB RAM
# --write-kubeconfig-mode=644: allow non-root reads
# --data-dir: use the pre-chowned dir
# --flannel-iface=eth0: explicitly set the network interface.
# Cloud Run gen2 provides eth0 but k3s default IP detection walks the routing
# table looking for a default route, which fails in Cloud Run's network sandbox.
# Pinning to eth0 bypasses that detection and lets k3s bind correctly.
k3s server \
--disable traefik \
--disable servicelb \
--disable metrics-server \
--write-kubeconfig-mode=644 \
--data-dir /var/lib/rancher/k3s \
--node-name soul-node \
--flannel-iface=eth0 &
K3S_PID=$!
# Start neuron-web immediately — do NOT block on k3s becoming ready.
# Cloud Run's startup probe requires port 8080 to be listening within the
# startup timeout. k3s may take 30-60s to initialise; blocking here causes
# probe failures and container termination before neuron-web ever starts.
# soul-demo becomes available asynchronously once k3s is ready. neuron-web
# handles soul-demo being temporarily unavailable gracefully.
echo "[entrypoint] Starting neuron-web on port ${PORT:-8080} (k3s initialising in background)..."
# Start neuron-web immediately — do NOT block.
# Cloud Run startup probe requires port 8080 to answer within the timeout.
echo "[entrypoint] Starting neuron-web on port ${PORT:-8080}..."
exec /usr/local/bin/neuron-web
+8
View File
@@ -16,4 +16,12 @@ build {
c_source "dist/page_css.c"
c_source "dist/page_ga.c"
c_source "dist/page_schema.c"
// NOTE: neuron-web requires el_runtime.c to be compiled with -DHAVE_CURL
// so that http_get/http_post forward to libcurl instead of returning
// {"error":"not built with HAVE_CURL"}. The elb binary in ci-base:dev
// hardcodes -DHAVE_CURL in its cc invocation, but older elb versions may
// not. manifest.el does not support c_flags or link_flags directives, so
// if upgrading elb breaks HTTP, ensure ci-base:dev ships an elb built
// with HAVE_CURL enabled in its hardcoded cc command, or pre-compile
// el_runtime.o with -DHAVE_CURL on the host and pass it as a c_source.
}