Files
neuron-web/scripts/extract-js.py
T
Will Anderson 640813e42e migrate stage build to native elc; chat restores from localStorage on return
Build pipeline
- build-stage.sh replaces the old in-Dockerfile bootstrap.py path. Host
  pre-compiles src/*.el into dist/main.c via the canonical native elc at
  foundation/el/dist/platform/elc and applies the stub-decl sed before
  docker buildx runs.
- Dockerfile.stage drops bootstrap.py + python3 from the builder stage
  and just runs cc on the host-supplied dist/main.c.
- Pre-rendered HTML shells under /srv/landing/ are now chowned to the
  landing user so the El page-builder's fs_write at startup can rewrite
  them — without that, post-COPY edits never reach the served HTML and
  the served page stays as the stale build-time fallback.

Chat restore
- session.verified + session.verifiedAt persist through localStorage so
  a return visit within 24h skips the Turnstile gate and lands directly
  in the restored conversation.
- restoreOrGreet() is the single source of truth for what shows up in
  the message pane after the gate clears: replays prior messages with
  skipSave, else drops the canned hello once and remembers it.
- applyVerifiedDom() hides the gate / reveals the chat row, called both
  from the verified-on-load path (DOMContentLoaded if loading, else
  immediate) and from the Turnstile callback.
- neuronDemoReset clears verified + verifiedAt so the gate returns next
  open.

Extracted JS assets (src/assets/js/*.js + manifest.json) and the
extract-js.py helper land here too — they match what the new build-stage
flow produces and remove the inline <script> blobs from the served HTML.
2026-05-02 11:15:09 -05:00

465 lines
15 KiB
Python

#!/usr/bin/env python3
"""
extract-js.py — Extract inline <script> blocks from El source files into
external, minified, obfuscated .js files served from /assets/js/.
Why
---
The El landing page embeds JavaScript inline as escaped string literals.
That bloats the HTML payload and exposes implementation. This script
extracts each substantial inline block to a hashed file under
src/assets/js/, replaces the El-side block with
`<script src="/assets/js/<hash>.js" defer></script>`, and writes a manifest
for cache-busting.
Behaviour
---------
- Skips `<script src=...>` external loaders (kept as-is).
- Skips `<script type="application/ld+json">` (data, not code).
- Skips inline blocks shorter than MIN_INLINE_BYTES (defaults to 200).
- Handles El-side runtime interpolation `'" + var + "'`. For each
interpolated identifier the extractor emits a tiny inline shim
`<script>window.NEURON_CFG=window.NEURON_CFG||{};window.NEURON_CFG.<id>="\"+id+\"";</script>`
immediately before the external script tag, and rewrites the JS body
to read from `window.NEURON_CFG.<id>` so the external file is fully
static and runtime values are still injected at render time.
- Pipeline per file: terser (compress + mangle, reserved globals
preserved) → javascript-obfuscator (string-array, base64, hex names).
Idempotency
-----------
- Running twice is a no-op: blocks already rewritten to
`<script src="/assets/js/...">` are not re-extracted.
- Filenames are content-hashed (sha1[:12]), so unchanged source produces
the same output filename and an unchanged manifest.
"""
from __future__ import annotations
import hashlib
import json
import os
import re
import subprocess
import sys
from pathlib import Path
from typing import List, Optional
# ── Paths ────────────────────────────────────────────────────────────────────
# The repository root is two directories above this file.
REPO_ROOT = Path(__file__).resolve().parents[1]
SRC_DIR = REPO_ROOT.joinpath("src")
ASSET_DIR = SRC_DIR.joinpath("assets", "js")
MANIFEST = ASSET_DIR.joinpath("manifest.json")

# Node tooling: use the binary from node_modules/.bin when it is present,
# otherwise fall back to invoking the tool through npx.
NODE_BIN = REPO_ROOT.joinpath("node_modules", ".bin")
_LOCAL_TERSER = NODE_BIN / "terser"
_LOCAL_OBFUSCATOR = NODE_BIN / "javascript-obfuscator"

if _LOCAL_TERSER.exists():
    TERSER = str(_LOCAL_TERSER)
else:
    TERSER = "npx terser"

if _LOCAL_OBFUSCATOR.exists():
    OBFUSCATOR = str(_LOCAL_OBFUSCATOR)
else:
    OBFUSCATOR = "npx javascript-obfuscator"
# ── Config ───────────────────────────────────────────────────────────────────
# Inline blocks smaller than this threshold are left inline (analytics
# shims, redirects).
MIN_INLINE_BYTES = 200

# Names that are referenced from outside the extracted scripts (HTML
# onclick=, onchange=, etc). They are handed to both terser and the
# obfuscator as reserved so they are never renamed; renaming any of them
# would break the markup that calls them.
_CHAT_WIDGET_GLOBALS = (
    "neuronDemoToggle",
    "neuronDemoSend",
    "neuronDemoReset",
)
_AUTH_FLOW_GLOBALS = (  # account + checkout
    "signInWith",
    "signInWithEmail",
    "signUpWithEmail",
    "signOut",
    "resetPassword",
    "sendResetEmail",
    "updatePassword",
    "showSignIn",
    "showSignUp",
    "hideReset",
)
_MISC_HANDLER_GLOBALS = (
    "setSort",
    "addFamilyMember",
    "removeFamilyMember",
    "copyForPlatform",
    "entHeadcountChange",
)
# Runtime config bootstrap object written by the inline shim.
_RUNTIME_CONFIG_GLOBALS = ("NEURON_CFG",)

RESERVED_GLOBALS = [
    *_CHAT_WIDGET_GLOBALS,
    *_AUTH_FLOW_GLOBALS,
    *_MISC_HANDLER_GLOBALS,
    *_RUNTIME_CONFIG_GLOBALS,
]
# Files to scan: every .el file directly under src/ (the glob is not
# recursive), in sorted order. No further filtering is applied here; the
# list is computed once, at import time.
EL_FILES = sorted(SRC_DIR.glob("*.el"))
# ── El extraction ────────────────────────────────────────────────────────────
# El uses backslash-escaped quotes inside its string literals. Inside an
# El string the JS body looks like:
# <script>\n var x = '\"hello\"';\n</script>
# i.e. quotes are written as \". We unescape on the way out, re-escape on
# the way in.
# We match a *plain* opening <script> tag followed by JS body and </script>.
# Cases we deliberately don't match:
# - <script src=...>...</script> (external loader)
# - <script async ...>...</script> (external loader, even with body)
# - <script type="application/ld+json">...</script> (structured data)
# The pattern requires a newline (after optional whitespace) directly behind
# the opening tag and another newline before the closing tag, so a tag pair
# written on a single line is never matched. Group 1 is the body between
# those two newlines; the match is lazy and DOTALL lets it span lines.
SCRIPT_BLOCK_RE = re.compile(
r"<script>\s*\n(.*?)\n\s*</script>",
re.DOTALL,
)
# An interpolation point inside a JS body: `'" + ident + "'` (single-quoted
# string in JS containing an El concat). We capture the bare identifier
# as group 1; whitespace around each `+` is optional.
INTERP_RE = re.compile(r"""'"\s*\+\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\+\s*"'""")
def is_skip_block(body: str) -> bool:
    """Report whether an inline block is too small to be worth extracting.

    Size is the only criterion: surrounding whitespace is stripped from
    *body* and the remaining length is compared with MIN_INLINE_BYTES.
    """
    return len(body.strip()) < MIN_INLINE_BYTES
def el_unescape(s: str) -> str:
    r"""Decode El string-literal escapes in *s*.

    Intended to mirror the El lexer's rules (foundation/el/bootstrap.py):
    \n -> LF, \t -> TAB, \r -> CR, and a backslash before any other
    character X (including \" and \\) collapses to X itself. Because of
    that catch-all, \' becomes a bare apostrophe, which keeps stray
    backslashes out of the JS handed to terser. A lone backslash at the
    very end of the input has nothing to pair with and is kept as-is.
    """
    control_chars = {"n": "\n", "t": "\t", "r": "\r"}

    def _decode(pair: re.Match) -> str:
        escaped = pair.group(1)
        # Anything that is not a named control escape maps to itself.
        return control_chars.get(escaped, escaped)

    # DOTALL so that a backslash followed by a real newline is consumed as
    # a pair too; matches are non-overlapping and scanned left to right.
    return re.sub(r"\\(.)", _decode, s, flags=re.DOTALL)
def el_escape_attr(s: str) -> str:
    """Escape *s* so it can sit inside an El "..." string literal.

    Every backslash is doubled and every double quote gets a backslash in
    front of it; all other characters pass through unchanged.
    """
    escapes = {ord("\\"): "\\\\", ord('"'): '\\"'}
    return s.translate(escapes)
def sha12(content: str) -> str:
    """Return the first 12 hex digits of the SHA-1 of *content* (UTF-8 encoded)."""
    digest = hashlib.sha1(content.encode("utf-8"))
    return digest.hexdigest()[:12]
def run(cmd: List[str], **kwargs) -> subprocess.CompletedProcess:
    """Run *cmd*, capturing its output as text, and fail loudly on error.

    Extra keyword arguments are forwarded to ``subprocess.run``.

    Returns the CompletedProcess when the exit status is zero. Otherwise a
    short diagnostic (first two argv items, exit status, truncated stdout
    and stderr) is written to stderr and ``subprocess.CalledProcessError``
    is raised carrying the full command and captured output.
    """
    result = subprocess.run(
        cmd, check=False, capture_output=True, text=True, **kwargs
    )
    if result.returncode == 0:
        return result

    # Only the first two argv items are echoed to keep the message short.
    command_preview = " ".join(cmd[:2])
    sys.stderr.write(
        f"\n[extract-js] command failed: {command_preview} ...\n"
        f" exit={result.returncode}\n"
        f" stdout: {result.stdout[:500]}\n"
        f" stderr: {result.stderr[:2000]}\n"
    )
    raise subprocess.CalledProcessError(
        result.returncode, cmd, result.stdout, result.stderr
    )
def minify_and_obfuscate(js: str, hash_id: str) -> str:
    """Run *js* through terser, then javascript-obfuscator.

    The source is written to a hidden scratch file in ASSET_DIR, minified
    by terser into a second scratch file, and obfuscated into the final
    ``<hash_id>.js`` asset. Names in RESERVED_GLOBALS are passed to both
    tools as reserved so they are not renamed.

    Returns the text of the final obfuscated asset.

    Raises ``subprocess.CalledProcessError`` (via ``run``) when either tool
    exits non-zero. The scratch files are removed on every exit path.
    """
    raw_path = ASSET_DIR / f".{hash_id}.raw.js"
    min_path = ASSET_DIR / f".{hash_id}.min.js"
    out_path = ASSET_DIR / f"{hash_id}.js"

    def _tool_argv(tool: str) -> List[str]:
        # A resolved local binary is one path and must stay a single argv
        # item even when the checkout path contains whitespace; only the
        # "npx <tool>" fallback is a genuinely multi-word command.
        return [tool] if Path(tool).exists() else tool.split()

    reserved_names = ",".join(RESERVED_GLOBALS)

    terser_cmd = _tool_argv(TERSER) + [
        str(raw_path),
        "--compress",
        "passes=2,drop_console=true,drop_debugger=true",
        "--mangle",
        f"reserved=[{reserved_names}]",
        "--output",
        str(min_path),
    ]
    obf_cmd = _tool_argv(OBFUSCATOR) + [
        str(min_path),
        "--output",
        str(out_path),
        "--compact",
        "true",
        "--simplify",
        "true",
        "--string-array",
        "true",
        "--string-array-encoding",
        "base64",
        "--string-array-threshold",
        "0.75",
        "--identifier-names-generator",
        "hexadecimal",
        "--rename-globals",
        "false",
        "--self-defending",
        "false",
        "--reserved-names",
        reserved_names,
    ]

    try:
        raw_path.write_text(js, encoding="utf-8")
        run(terser_cmd)
        run(obf_cmd)
    finally:
        # `finally` (rather than `except Exception`) also covers Ctrl-C and
        # other non-Exception exits. The scratch names start with a dot, so
        # the orphan pruning in main() would never remove a leaked one.
        raw_path.unlink(missing_ok=True)
        min_path.unlink(missing_ok=True)

    return out_path.read_text(encoding="utf-8")
def find_script_blocks(text: str) -> List[tuple[int, int, str]]:
    """List every plain <script>…</script> block matched by SCRIPT_BLOCK_RE.

    Each item is ``(start, end, body)``: ``start``/``end`` are offsets into
    *text* spanning the whole match (both tags included) and ``body`` is
    the captured text between them. Items are in document order.
    """
    return [
        (match.start(), match.end(), match.group(1))
        for match in SCRIPT_BLOCK_RE.finditer(text)
    ]
def process_block(raw_body_escaped: str) -> Optional[tuple[str, str, List[str]]]:
    """Extract one inline <script> body into an external asset.

    *raw_body_escaped* is the body exactly as it appears in the El source
    (still El-escaped).

    Returns ``None`` when ``is_skip_block`` says the block should stay
    inline. Otherwise returns ``(hash_id, replacement, interpolated_ids)``
    where ``replacement`` is the El-escaped HTML to splice into the El
    source in place of the original block, and ``interpolated_ids`` lists
    the interpolated identifiers in first-occurrence order.

    Side effect: writes the minified/obfuscated asset via
    ``minify_and_obfuscate``.
    """
    if is_skip_block(raw_body_escaped):
        return None

    # Turn the El-escaped text back into real JS source.
    js_with_interp = el_unescape(raw_body_escaped)

    # Unique interpolated identifiers, ordered by first appearance.
    interp_ids = list(dict.fromkeys(INTERP_RE.findall(js_with_interp)))

    # Each `'" + ident + "'` site becomes a read of window.NEURON_CFG.<ident>.
    # The original site was a JS string literal and the shim below assigns a
    # string, so call sites need no further adjustment.
    js_static = INTERP_RE.sub(
        lambda m: f"window.NEURON_CFG.{m.group(1)}", js_with_interp
    )

    # The asset name is derived from the static JS content.
    hash_id = sha12(js_static)
    minify_and_obfuscate(js_static, hash_id)

    # External tag. `defer` runs the script after parsing but before
    # DOMContentLoaded, which is fine for `onclick=` handlers since those
    # only fire on user interaction.
    script_tag = f'<script src=\\"/assets/js/{hash_id}.js\\" defer></script>'
    if not interp_ids:
        return hash_id, script_tag, interp_ids

    # Inline shim that seeds window.NEURON_CFG with the runtime values. The
    # `\"` sequences are already the right escape for the surrounding El
    # "..." literal, and `" + ident + "` is an El-side concatenation.
    assignments = "".join(
        f'window.NEURON_CFG.{ident}=\\"" + {ident} + "\\";'
        for ident in interp_ids
    )
    shim = (
        "<script>window.NEURON_CFG=window.NEURON_CFG||{};"
        + assignments
        + "</script>"
    )
    return hash_id, shim + script_tag, interp_ids
# Matches the external script tag in its El-escaped form (quotes written
# as \"); group 1 is the 12-hex-digit content hash.
EXISTING_REF_RE = re.compile(
    r'<script\s+src=\\"/assets/js/([0-9a-f]{12})\.js\\"\s+defer></script>'
)


def collect_existing_refs(text: str) -> List[str]:
    """Return the hash IDs of /assets/js/<hash>.js tags already in *text*.

    These are references left in an El file by a previous run. Hashes are
    returned in document order; repeated references are not deduplicated.
    """
    # With a single capture group, findall yields that group's text.
    return EXISTING_REF_RE.findall(text)
def process_file(path: Path) -> tuple[int, int, List[dict]]:
    """Rewrite a single .el file, replacing extractable <script> blocks.

    Returns ``(blocks_extracted, bytes_saved, manifest_entries)``:

    - ``blocks_extracted``: number of blocks replaced in this run.
    - ``bytes_saved``: how much shorter the file text became, measured
      with ``len()`` on the decoded text.
    - ``manifest_entries``: entries for references carried over from a
      prior run followed by entries for blocks extracted now, so the
      manifest stays a complete inventory across reruns.

    The file is only written back when at least one block was extracted.
    A carried-over reference whose asset file is missing gets no manifest
    entry; a warning is written to stderr so the dangling reference does
    not go unnoticed.
    """
    text = path.read_text(encoding="utf-8")
    original_len = len(text)

    # Pick up existing references first (idempotency).
    existing: List[dict] = []
    for h in collect_existing_refs(text):
        asset_path = ASSET_DIR / f"{h}.js"
        if not asset_path.exists():
            # The El source still points at this asset, so the rendered
            # page would request a file that is not there.
            sys.stderr.write(
                f"[extract-js] warning: {path.name} references "
                f"/assets/js/{h}.js but that asset is missing from "
                f"{ASSET_DIR}\n"
            )
            continue
        existing.append(
            {
                "file": path.name,
                "hash": h,
                "asset": f"/assets/js/{h}.js",
                "size": asset_path.stat().st_size,
                # Interpolated identifiers are not recoverable from the
                # reference alone, so carried entries list none.
                "interpolated": [],
                "note": "carried from prior run",
            }
        )

    blocks = find_script_blocks(text)
    if not blocks:
        return 0, 0, existing

    extracted = 0
    new_entries: List[dict] = []
    # Walk in reverse so earlier offsets remain valid as we splice.
    for start, end, body in reversed(blocks):
        result = process_block(body)
        if result is None:
            continue
        hash_id, replacement, interp_ids = result
        text = text[:start] + replacement + text[end:]
        extracted += 1
        new_entries.append(
            {
                "file": path.name,
                "hash": hash_id,
                "asset": f"/assets/js/{hash_id}.js",
                "size": (ASSET_DIR / f"{hash_id}.js").stat().st_size,
                "interpolated": interp_ids,
            }
        )

    if extracted:
        path.write_text(text, encoding="utf-8")
    bytes_saved = original_len - len(text)
    return extracted, bytes_saved, existing + new_entries
def main() -> int:
    """Process every file in EL_FILES, prune orphan assets, write the manifest.

    Prints one line per file that had blocks extracted, an optional line
    listing pruned assets, and a closing summary. Returns 0.
    """
    ASSET_DIR.mkdir(parents=True, exist_ok=True)

    total_blocks = 0
    total_saved = 0
    all_entries: List[dict] = []
    for el in EL_FILES:
        n, saved, entries = process_file(el)
        # Entries are carried forward even when nothing new was extracted,
        # so the manifest remains a complete inventory.
        all_entries.extend(entries)
        if not n:
            continue
        print(
            f" {el.name:25s} {n} block(s) extracted, "
            f"{saved:6d} bytes pulled out"
        )
        total_blocks += n
        total_saved += saved

    # Stable manifest ordering regardless of file walk order.
    all_entries.sort(key=lambda e: (e["file"], e["hash"]))

    # Prune .js files in the asset dir that no El source references any
    # more; otherwise edits to the original JS would leave stale hashed
    # files behind. Dotfiles and the manifest itself are left alone.
    keep = {"manifest.json"} | {f"{e['hash']}.js" for e in all_entries}
    orphans = [
        f
        for f in ASSET_DIR.iterdir()
        if f.is_file() and f.name not in keep and not f.name.startswith(".")
    ]
    for orphan in orphans:
        orphan.unlink()
    removed = [orphan.name for orphan in orphans]
    if removed:
        print(f" pruned {len(removed)} orphan asset(s): {', '.join(sorted(removed))}")

    manifest_doc = {
        "generated_by": "scripts/extract-js.py",
        "count": len(all_entries),
        "entries": all_entries,
    }
    MANIFEST.write_text(json.dumps(manifest_doc, indent=2) + "\n", encoding="utf-8")

    print(
        f"\n total: {total_blocks} block(s), "
        f"{total_saved} bytes removed from El sources, "
        f"{len(all_entries)} asset(s) → {ASSET_DIR.relative_to(REPO_ROOT)}"
    )
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())