diff --git a/.gitea/workflows/ci.yaml b/.gitea/workflows/ci.yaml
index 4252fae..7b26e07 100644
--- a/.gitea/workflows/ci.yaml
+++ b/.gitea/workflows/ci.yaml
@@ -9,8 +9,10 @@ on:
       - main
   workflow_dispatch:
 
-# Same group as deploy-gke so builds and deploys queue behind each other.
-# Prevents concurrent Docker daemon exhaustion on the single GCE runner.
+# Serialize all activity on the single GCE runner.
+# With build+deploy in the same workflow, a new push queues a single
+# workflow instance — not two competing ones — so the deploy job is
+# never orphaned by a cancellation race.
 concurrency:
   group: neuron-runner
   cancel-in-progress: false
@@ -128,3 +130,232 @@ jobs:
 
           echo "Published neuron-soul@${VERSION}"
           rm -f /tmp/gcp-key.json
+
+  deploy:
+    runs-on: ubuntu-latest
+    needs: build
+    # Only deploy on push to main, not on PRs or manual workflow_dispatch without intent.
+    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+
+    env:
+      USE_GKE_GCLOUD_AUTH_PLUGIN: "True"
+
+    steps:
+      - name: Free disk space
+        run: |
+          df -h /
+          docker system prune -af --volumes 2>/dev/null || true
+          rm -rf /tmp/.act-* /tmp/act-* 2>/dev/null || true
+          df -h /
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          apt-get update -qq
+          apt-get install -y --no-install-recommends \
+            ca-certificates curl apt-transport-https
+          # NOTE(review): [trusted=yes] disables APT signature checks for this
+          # repo; consider installing Google's signing key and using signed-by.
+          echo "deb [trusted=yes] https://packages.cloud.google.com/apt cloud-sdk main" \
+            > /etc/apt/sources.list.d/google-cloud-sdk.list
+          # kubectl is requested only after the Google Cloud repo is registered,
+          # so the install does not rely on the base image already having a
+          # package source that provides it.
+          apt-get update -qq && apt-get install -y \
+            google-cloud-cli google-cloud-cli-gke-gcloud-auth-plugin kubectl
+
+      - name: Authenticate to GCP
+        env:
+          GCP_SA_KEY: ${{ secrets.GCP_SA_KEY }}
+        run: |
+          # umask 077 keeps the key file readable by this user only; printf
+          # writes the secret verbatim (some shells' echo rewrites backslashes).
+          (umask 077 && printf '%s\n' "${GCP_SA_KEY}" > /tmp/gcp-key.json)
+          gcloud auth activate-service-account --key-file=/tmp/gcp-key.json
+          gcloud config set project neuron-785695
+          gcloud auth configure-docker us-central1-docker.pkg.dev --quiet
+
+      - name: Get GKE credentials
+        run: |
+          gcloud container clusters get-credentials neuron-platform \
+            --region=us-central1 \
+            --project=neuron-785695
+
+      - name: Determine image tag and slot
+        id: vars
+        run: |
+          # GITEA_SHA is set by the Gitea runner; fall back to GITHUB_SHA for
+          # compatibility with older Forgejo/Gitea versions.
+          RAW_SHA="${GITEA_SHA:-${GITHUB_SHA:-}}"
+          SHA="${RAW_SHA:0:8}"
+          if [ -z "$SHA" ]; then
+            # Last resort: read from git directly
+            SHA=$(git rev-parse --short=8 HEAD 2>/dev/null || echo "unknown")
+          fi
+          IMAGE="us-central1-docker.pkg.dev/neuron-785695/neuron-api/neuron-soul:${SHA}"
+          # Write step outputs to whichever output file the runner exposes
+          # (same GITEA_* then GITHUB_* fallback as the SHA lookup above).
+          OUTPUT_FILE="${GITEA_OUTPUT:-${GITHUB_OUTPUT:-}}"
+          echo "sha=${SHA}" >> "$OUTPUT_FILE"
+          echo "image=${IMAGE}" >> "$OUTPUT_FILE"
+
+          # Determine which slot is currently idle (0 replicas = idle slot)
+          # If both are at 0 (fresh deploy), default to blue
+          BLUE_REPLICAS=$(kubectl get deployment/neuron-mcp-blue \
+            -n neuron-prod \
+            -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "0")
+          GREEN_REPLICAS=$(kubectl get deployment/neuron-mcp-green \
+            -n neuron-prod \
+            -o jsonpath='{.spec.replicas}' 2>/dev/null || echo "0")
+
+          echo " Blue replicas: ${BLUE_REPLICAS}"
+          echo " Green replicas: ${GREEN_REPLICAS}"
+
+          if [ "${GREEN_REPLICAS}" -eq 0 ] && [ "${BLUE_REPLICAS}" -gt 0 ]; then
+            SLOT="green"
+          elif [ "${BLUE_REPLICAS}" -eq 0 ] && [ "${GREEN_REPLICAS}" -gt 0 ]; then
+            SLOT="blue"
+          else
+            # Fresh cluster or both idle — deploy to blue first
+            SLOT="blue"
+          fi
+
+          echo "slot=${SLOT}" >> "$OUTPUT_FILE"
+          echo " Deploying to slot: ${SLOT}"
+
+      - name: Prepare build artifacts
+        run: |
+          # Pre-download soul binary and El SDK so the Dockerfile can COPY them
+          # from the build context instead of authenticating inside the build.
+          mkdir -p build-artifacts
+
+          # ── soul binary ────────────────────────────────────────────────────────
+          # The build job (same workflow run) just published this version.
+          SOUL_VER=$(gcloud artifacts versions list \
+            --repository=foundation-prod \
+            --location=us-central1 \
+            --project=neuron-785695 \
+            --package=neuron-soul \
+            --sort-by="~createTime" \
+            --limit=1 \
+            --format="value(name)" 2>/dev/null | awk -F/ '{print $NF}')
+          echo "Downloading neuron-soul@${SOUL_VER}"
+          gcloud artifacts generic download \
+            --repository=foundation-prod \
+            --location=us-central1 \
+            --project=neuron-785695 \
+            --package=neuron-soul \
+            --version="${SOUL_VER}" \
+            --destination=build-artifacts/
+          mv build-artifacts/neuron* build-artifacts/neuron 2>/dev/null || true
+          chmod +x build-artifacts/neuron
+
+          # ── El SDK (for engram source compilation inside the Docker build) ────
+          ELC_VER=$(gcloud artifacts versions list \
+            --repository=foundation-prod --location=us-central1 --project=neuron-785695 \
+            --package=el-elc --sort-by="~createTime" --limit=1 \
+            --format="value(name)" 2>/dev/null | awk -F/ '{print $NF}')
+          gcloud artifacts generic download \
+            --repository=foundation-prod --location=us-central1 --project=neuron-785695 \
+            --package=el-elc --version="${ELC_VER}" --destination=build-artifacts/
+          mv build-artifacts/elc* build-artifacts/elc 2>/dev/null || true
+          chmod +x build-artifacts/elc
+
+          RC_VER=$(gcloud artifacts versions list \
+            --repository=foundation-prod --location=us-central1 --project=neuron-785695 \
+            --package=el-runtime-c --sort-by="~createTime" --limit=1 \
+            --format="value(name)" 2>/dev/null | awk -F/ '{print $NF}')
+          gcloud artifacts generic download \
+            --repository=foundation-prod --location=us-central1 --project=neuron-785695 \
+            --package=el-runtime-c --version="${RC_VER}" --destination=build-artifacts/
+          mv build-artifacts/el_runtime.c* build-artifacts/el_runtime.c 2>/dev/null || true
+
+          RH_VER=$(gcloud artifacts versions list \
+            --repository=foundation-prod --location=us-central1 --project=neuron-785695 \
+            --package=el-runtime-h --sort-by="~createTime" --limit=1 \
+            --format="value(name)" 2>/dev/null | awk -F/ '{print $NF}')
+          gcloud artifacts generic download \
+            --repository=foundation-prod --location=us-central1 --project=neuron-785695 \
+            --package=el-runtime-h --version="${RH_VER}" --destination=build-artifacts/
+          mv build-artifacts/el_runtime.h* build-artifacts/el_runtime.h 2>/dev/null || true
+
+          echo "Build artifacts ready:"
+          ls -lh build-artifacts/
+
+      - name: Clone engram source for Docker build context
+        run: |
+          # The Dockerfile builds engram from source (no published AR package).
+          # Clone the engram repo into ./engram/ so it's available in the build context.
+          git clone http://34.31.145.131/neuron-technologies/engram.git \
+            --depth=1 --branch=main \
+            engram
+          echo "Engram source ready at ./engram/src/server.el"
+
+      - name: Build and push Docker image
+        run: |
+          IMAGE="${{ steps.vars.outputs.image }}"
+
+          echo "Building ${IMAGE}..."
+          docker build \
+            --tag "${IMAGE}" \
+            --tag "us-central1-docker.pkg.dev/neuron-785695/neuron-api/neuron-soul:latest" \
+            .
+
+          echo "Pushing ${IMAGE}..."
+          docker push "${IMAGE}"
+          docker push "us-central1-docker.pkg.dev/neuron-785695/neuron-api/neuron-soul:latest"
+
+      - name: Blue-green deploy to GKE
+        run: |
+          chmod +x scripts/blue-green-deploy.sh
+          scripts/blue-green-deploy.sh \
+            --image "${{ steps.vars.outputs.image }}" \
+            --slot "${{ steps.vars.outputs.slot }}"
+
+      - name: Update infrastructure manifests
+        if: success()
+        env:
+          INFRA_GIT_TOKEN: ${{ secrets.INFRA_GIT_TOKEN }}
+        run: |
+          SLOT="${{ steps.vars.outputs.slot }}"
+          if [ "$SLOT" = "blue" ]; then IDLE="green"; else IDLE="blue"; fi
+
+          git clone "http://${INFRA_GIT_TOKEN}@34.31.145.131/neuron-technologies/infrastructure.git" \
+            --depth=1 --branch=main /tmp/infra-update
+
+          cd /tmp/infra-update
+
+          DEPLOY_DIR="platform/k8s/neuron-mcp"
+          sed -i "s/^ replicas: .*/ replicas: 1/" "${DEPLOY_DIR}/deployment-${SLOT}.yaml"
+          sed -i "s/^ replicas: .*/ replicas: 0/" "${DEPLOY_DIR}/deployment-${IDLE}.yaml"
+          echo " deployment-${SLOT}.yaml: replicas set to 1"
+          echo " deployment-${IDLE}.yaml: replicas set to 0"
+
+          git config user.email "ci@neurontechnologies.ai"
+          git config user.name "Neuron CI"
+          git add "${DEPLOY_DIR}/deployment-blue.yaml" "${DEPLOY_DIR}/deployment-green.yaml"
+          git diff --staged --quiet && { echo "No manifest changes needed"; exit 0; }
+          git commit -m "ci: neuron-mcp replica sync after blue-green swap to ${SLOT}"
+          git push origin main
+          echo "Infrastructure manifests updated: ${SLOT}=1, ${IDLE}=0"
+
+      - name: Verify deployment
+        run: |
+          SLOT="${{ steps.vars.outputs.slot }}"
+          echo "Verifying neuron-mcp-${SLOT} is healthy..."
+          kubectl rollout status deployment/"neuron-mcp-${SLOT}" \
+            --namespace=neuron-prod \
+            --timeout=8m
+
+          echo "Active service endpoints:"
+          kubectl get endpoints neuron-mcp -n neuron-prod
+
+          echo "Pod status:"
+          kubectl get pods -n neuron-prod -l app=neuron-mcp
+
+      - name: Cleanup
+        if: always()
+        # /tmp/infra-update keeps the token-bearing clone URL in its .git/config.
+        run: rm -rf /tmp/gcp-key.json /tmp/infra-update
diff --git a/.gitea/workflows/deploy-gke.yaml b/.gitea/workflows/deploy-gke.yaml
index 269ae47..d57fb75 100644
--- a/.gitea/workflows/deploy-gke.yaml
+++ b/.gitea/workflows/deploy-gke.yaml
@@ -1,16 +1,13 @@
-name: Deploy Soul to GKE
+name: Deploy Soul to GKE (manual)
 
-# Triggers on push to main — after the soul binary is built and published
-# by ci.yaml, this workflow builds the Docker image and blue-green deploys
-# to the neuron-prod namespace on GKE.
+# MANUAL OVERRIDE ONLY — push-triggered deploys now run as the 'deploy' job
+# in ci.yaml (needs: build), which eliminates the two-workflow concurrency
+# race that was cancelling queued deploy runs.
 #
-# This workflow runs AFTER ci.yaml has published the neuron-soul generic
-# artifact to Artifact Registry. The Docker build downloads that binary.
+# Use this workflow only when you need to deploy a specific slot manually
+# (e.g. rollback, force a slot override) without triggering a full CI build.
 
 on:
-  push:
-    branches:
-      - main
   workflow_dispatch:
     inputs:
       slot:
@@ -18,8 +15,7 @@ on:
         required: false
         default: "green"
 
-# Serialize all builds on this runner — concurrent jobs exhaust the Docker daemon.
-# A queued deploy runs after the in-progress build finishes.
+# Manual deploys still share the runner serialization group.
 concurrency:
   group: neuron-runner
   cancel-in-progress: false