diff --git a/.gitea/workflows/dev.yaml b/.gitea/workflows/dev.yaml index 83797a4..9cb894f 100644 --- a/.gitea/workflows/dev.yaml +++ b/.gitea/workflows/dev.yaml @@ -172,11 +172,12 @@ jobs: - name: Touch HTML placeholder files run: touch src/index.html src/about.html src/terms.html src/enterprise-terms.html - - name: Create soul-demo-image.tar placeholder - # Dockerfile.stage COPYs this file (used by k3s at runtime). - # We only need the COPY to succeed here; real tar is built by - # build-stage.sh in the deploy pipeline. - run: touch dist/soul-demo-image.tar + - name: Create soul-demo placeholder + # Dockerfile.stage COPYs dist/soul-demo. We only need the binary to exist + # for the Docker build to succeed; the real binary is compiled in stage CI. + run: | + touch dist/soul-demo + chmod +x dist/soul-demo - name: Build Docker image (local only — no push) run: | diff --git a/.gitea/workflows/stage.yaml b/.gitea/workflows/stage.yaml index 05b0e96..03d56ee 100644 --- a/.gitea/workflows/stage.yaml +++ b/.gitea/workflows/stage.yaml @@ -173,15 +173,15 @@ jobs: # ── Docker build + push ─────────────────────────────────────────────── - - name: Build soul-demo image tar - # Dockerfile.stage COPYs dist/soul-demo-image.tar so k3s can import - # soul-demo:local at runtime. We compile soul-demo from source on the - # host runner (ci-base has gcc), build a minimal OCI image, and save it. + - name: Build soul-demo binary + # Compile soul-demo directly on the host runner (ci-base has gcc). + # Cloud Run runs soul-demo as a direct subprocess with a watchdog loop — + # no k3s, no OCI image needed. One binary per container; Cloud Run + # handles horizontal scaling. # Moved AFTER JS compilation to avoid Docker memory pressure killing elc. if: steps.changetype.outputs.asset_only != 'true' run: | set -euo pipefail - # Compile el_runtime.o and soul-demo on the host runner cc -O2 -DHAVE_CURL -c runtime/el_runtime.c -I runtime/ -o /tmp/el_runtime.o cc -O2 -rdynamic -DEL_SOUL_DEMO_BUILD \ -I runtime/ \ @@ -189,26 +189,6 @@ jobs: dist/soul-demo.c dist/vessel_stubs.c /tmp/el_runtime.o \ -lcurl -lpthread -ldl -lm -lssl -lcrypto echo "soul-demo compiled: $(ls -lh dist/soul-demo)" - # Package as minimal OCI image for k3s import - # --no-cache: prevents reuse of corrupted overlay2 layers from prior failed runs - docker build --no-cache -f dist/Dockerfile.soul-demo -t soul-demo:local dist/ - docker save soul-demo:local -o dist/soul-demo-image.tar - echo "soul-demo-image.tar: $(du -sh dist/soul-demo-image.tar | cut -f1)" - docker rmi soul-demo:local 2>/dev/null || true - - - name: Download k3s binary - # Pre-download k3s on the host runner so Dockerfile.stage can COPY it - # directly. Previously k3s was downloaded inside the Docker builder stage, - # which combined with build-essential and C compilation caused RWLayer nil - # corruption on the runner's overlay2 driver. Host-runner download is safe. - if: steps.changetype.outputs.asset_only != 'true' - run: | - set -euo pipefail - curl -fL --retry 3 --retry-delay 10 \ - https://github.com/k3s-io/k3s/releases/download/v1.32.4%2Bk3s1/k3s \ - -o dist/k3s - chmod +x dist/k3s - echo "k3s: $(ls -lh dist/k3s)" - name: Build and tag image if: steps.changetype.outputs.asset_only != 'true' diff --git a/Dockerfile.stage b/Dockerfile.stage index 5ae0729..5723d33 100644 --- a/Dockerfile.stage +++ b/Dockerfile.stage @@ -4,15 +4,13 @@ # - neuron-web on port 8080 (landing page server) # - soul-demo on port 7772 (demo chat, localhost only) # -# All binaries (neuron-web, soul-demo, k3s) are pre-built by CI on the host -# runner before this Dockerfile runs. This keeps the Docker build single-stage -# with no compilation and no network downloads, eliminating the multi-stage -# complexity that caused RWLayer corruption on the runner's overlay2 driver. +# All binaries (neuron-web, soul-demo) are pre-built by CI on the host runner +# before this Dockerfile runs. This keeps the Docker build single-stage with +# no compilation and no network downloads. # # CI pre-build steps (in stage.yaml): # - neuron-web: built by `elb build` → dist/neuron-landing # - soul-demo: compiled by cc on host → dist/soul-demo -# - k3s: downloaded by curl on host → dist/k3s FROM ubuntu:24.04 @@ -27,9 +25,7 @@ RUN apt-get update \ && groupadd -r landing && useradd -r -g landing landing \ && mkdir -p /srv/landing/assets /srv/landing/js /srv/landing/shares \ && mkdir -p /srv/soul/engram-demo \ - && chown -R landing:landing /srv/landing /srv/soul \ - && mkdir -p /var/lib/rancher/k3s /tmp/k3s \ - && chown -R landing:landing /var/lib/rancher /tmp/k3s + && chown -R landing:landing /srv/landing /srv/soul # neuron-web binary — produced by `elb build` in CI (linux/amd64) COPY dist/neuron-landing /usr/local/bin/neuron-web @@ -39,18 +35,6 @@ RUN chmod +x /usr/local/bin/neuron-web COPY dist/soul-demo /usr/local/bin/soul-demo RUN chmod +x /usr/local/bin/soul-demo -# k3s binary — downloaded from GitHub releases by CI -COPY dist/k3s /usr/local/bin/k3s -RUN chmod +x /usr/local/bin/k3s - -# soul-demo OCI image tar — k3s imports this at startup (no registry needed) -RUN mkdir -p /var/lib/rancher/k3s/agent/images -COPY dist/soul-demo-image.tar /var/lib/rancher/k3s/agent/images/soul-demo.tar - -# k3s manifests — auto-applied when k3s starts -RUN mkdir -p /var/lib/rancher/k3s/server/manifests -COPY dist/k3s-soul-demo.yaml /var/lib/rancher/k3s/server/manifests/soul-demo.yaml - # Engram snapshot — baked in so soul has memory from cold start COPY dist/engram-snapshot.json /srv/soul/engram-demo/snapshot.json @@ -73,11 +57,7 @@ ENV LANDING_ROOT=/srv/landing ENV PORT=8080 ENV NEURON_HOME=/srv/soul/engram-demo ENV NEURON_PORT=7772 -ENV K3S_DATA_DIR=/var/lib/rancher/k3s -ENV KUBECONFIG=/var/lib/rancher/k3s/server/cred/admin.kubeconfig -# k3s requires root to create network namespaces and mount cgroups. -# Cloud Run gen2 sandbox is the security boundary here. EXPOSE 8080 CMD ["/usr/local/bin/entrypoint.sh"] diff --git a/dist/entrypoint.sh b/dist/entrypoint.sh index 2955003..ee8a70e 100644 --- a/dist/entrypoint.sh +++ b/dist/entrypoint.sh @@ -1,41 +1,26 @@ #!/bin/sh set -e -# SKIP_K3S=1 — bypass k3s/soul-demo startup and go straight to neuron-web. -# Used by the dev CI smoke test where the container runtime doesn't support -# the kernel capabilities k3s requires (overlayfs / privileged mode). if [ "${SKIP_K3S:-0}" = "1" ]; then - echo "[entrypoint] SKIP_K3S=1: starting neuron-web directly (no k3s/soul-demo)." + echo "[entrypoint] SKIP_K3S=1: starting neuron-web directly (no soul-demo)." exec /usr/local/bin/neuron-web fi -echo "[entrypoint] Starting k3s server (embedded soul-demo orchestrator)..." +# Soul-demo watchdog: start soul-demo and restart it automatically on crash. +# Cloud Run gen2 doesn't reliably provide eth0 with a unicast IP, so k3s flannel +# fails at startup. Running soul-demo directly is simpler, lighter, and fully +# self-healing. Cloud Run handles horizontal scaling — no HPA needed. +echo "[entrypoint] Starting soul-demo watchdog on :${NEURON_PORT:-7772}..." +( + while true; do + echo "[soul-watchdog] starting soul-demo (NEURON_HOME=${NEURON_HOME})" + /usr/local/bin/soul-demo 2>&1 || true + echo "[soul-watchdog] soul-demo exited, restarting in 3s..." + sleep 3 + done +) & -# k3s server — single-node mode, disable unused components -# --disable traefik,servicelb: we don't need an ingress or LB -# --disable metrics-server: saves ~50MB RAM -# --write-kubeconfig-mode=644: allow non-root reads -# --data-dir: use the pre-chowned dir -# --flannel-iface=eth0: explicitly set the network interface. -# Cloud Run gen2 provides eth0 but k3s default IP detection walks the routing -# table looking for a default route, which fails in Cloud Run's network sandbox. -# Pinning to eth0 bypasses that detection and lets k3s bind correctly. -k3s server \ - --disable traefik \ - --disable servicelb \ - --disable metrics-server \ - --write-kubeconfig-mode=644 \ - --data-dir /var/lib/rancher/k3s \ - --node-name soul-node \ - --flannel-iface=eth0 & - -K3S_PID=$! - -# Start neuron-web immediately — do NOT block on k3s becoming ready. -# Cloud Run's startup probe requires port 8080 to be listening within the -# startup timeout. k3s may take 30-60s to initialise; blocking here causes -# probe failures and container termination before neuron-web ever starts. -# soul-demo becomes available asynchronously once k3s is ready. neuron-web -# handles soul-demo being temporarily unavailable gracefully. -echo "[entrypoint] Starting neuron-web on port ${PORT:-8080} (k3s initialising in background)..." +# Start neuron-web immediately — do NOT block. +# Cloud Run startup probe requires port 8080 to answer within the timeout. +echo "[entrypoint] Starting neuron-web on port ${PORT:-8080}..." exec /usr/local/bin/neuron-web diff --git a/manifest.el b/manifest.el index 6ca3cd8..8f33f99 100644 --- a/manifest.el +++ b/manifest.el @@ -16,4 +16,12 @@ build { c_source "dist/page_css.c" c_source "dist/page_ga.c" c_source "dist/page_schema.c" + // NOTE: neuron-web requires el_runtime.c to be compiled with -DHAVE_CURL + // so that http_get/http_post forward to libcurl instead of returning + // {"error":"not built with HAVE_CURL"}. The elb binary in ci-base:dev + // hardcodes -DHAVE_CURL in its cc invocation, but older elb versions may + // not. manifest.el does not support c_flags or link_flags directives — + // if upgrading elb breaks HTTP, ensure ci-base:dev ships an elb built + // with HAVE_CURL enabled in its hardcoded cc command, or pre-compile + // el_runtime.o with -DHAVE_CURL on the host and pass it as a c_source. }