246a5f0967
Gallery: remove <a> from share allowlist. Gallery cards wrap content in <a class="gal-link">; allowing <a> in sanitized answer HTML causes nested anchors that the HTML5 adoption agency algorithm resolves by restructuring the DOM, producing mismatched </div> tags that leave gallery-grid open and pull sibling elements into the grid as spurious grid columns. Account: replace email+password sign-up/sign-in with magic-link OTP. supabase.auth.signInWithOtp handles both new and existing users in one flow. Existing onAuthStateChange listener (dadeb8ddb9a8.js) retained for post-redirect dashboard display. sendMagicLink added to extract-js RESERVED_GLOBALS so the obfuscator does not mangle the onclick reference.
466 lines
15 KiB
Python
466 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
extract-js.py — Extract inline <script> blocks from El source files into
|
|
external, minified, obfuscated .js files served from /assets/js/.
|
|
|
|
Why
|
|
---
|
|
The El landing page embeds JavaScript inline as escaped string literals.
|
|
That bloats the HTML payload and exposes implementation. This script
|
|
extracts each substantial inline block to a hashed file under
|
|
src/assets/js/, replaces the El-side block with
|
|
`<script src="/assets/js/<hash>.js" defer></script>`, and writes a manifest
|
|
for cache-busting.
|
|
|
|
Behaviour
|
|
---------
|
|
- Skips `<script src=...>` external loaders (kept as-is).
|
|
- Skips `<script type="application/ld+json">` (data, not code).
|
|
- Skips inline blocks shorter than MIN_INLINE_BYTES (defaults to 200).
|
|
- Handles El-side runtime interpolation `'" + var + "'`. For each
|
|
interpolated identifier the extractor emits a tiny inline shim
|
|
`<script>window.NEURON_CFG=window.NEURON_CFG||{};window.NEURON_CFG.<id>=\\"" + id + "\\";</script>`
|
|
immediately before the external script tag, and rewrites the JS body
|
|
to read from `window.NEURON_CFG.<id>` so the external file is fully
|
|
static and runtime values are still injected at render time.
|
|
- Pipeline per file: terser (compress + mangle, reserved globals
|
|
preserved) → javascript-obfuscator (string-array, base64, hex names).
|
|
|
|
Idempotency
|
|
-----------
|
|
- Running twice is a no-op: blocks already rewritten to
|
|
`<script src="/assets/js/...">` are not re-extracted.
|
|
- Filenames are content-hashed (sha1[:12]), so unchanged source produces
|
|
the same output filename and an unchanged manifest.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import List, Optional
|
|
|
|
# ── Paths ────────────────────────────────────────────────────────────────────

# Repository root: two directory levels up from this script file.
REPO_ROOT = Path(__file__).resolve().parent.parent
SRC_DIR = REPO_ROOT / "src"
ASSET_DIR = SRC_DIR / "assets" / "js"
MANIFEST = ASSET_DIR / "manifest.json"

NODE_BIN = REPO_ROOT / "node_modules" / ".bin"


def _local_bin_or_npx(tool: str) -> str:
    """Return the locally installed binary for *tool*, else an npx command."""
    candidate = NODE_BIN / tool
    return str(candidate) if candidate.exists() else f"npx {tool}"


# Command strings for the two pipeline stages. A locally installed binary
# wins; otherwise the tool is launched through npx.
TERSER = _local_bin_or_npx("terser")
OBFUSCATOR = _local_bin_or_npx("javascript-obfuscator")

# ── Config ───────────────────────────────────────────────────────────────────

# Inline blocks whose stripped length is below this stay inline
# (analytics shims, redirects).
MIN_INLINE_BYTES = 200

# Names that are referenced from outside the extracted script (HTML
# onclick=, onchange=, ...). They are handed to both terser and the
# obfuscator as reserved names; if one were renamed, the markup that
# refers to it would stop working.
_CHAT_WIDGET_GLOBALS = [
    "neuronDemoToggle",
    "neuronDemoSend",
    "neuronDemoReset",
]
_AUTH_FLOW_GLOBALS = [  # account + checkout
    "signInWith",
    "signInWithEmail",
    "signUpWithEmail",
    "sendMagicLink",
    "signOut",
    "resetPassword",
    "sendResetEmail",
    "updatePassword",
    "showSignIn",
    "showSignUp",
    "hideReset",
]
_MISC_HANDLER_GLOBALS = [
    "setSort",
    "addFamilyMember",
    "removeFamilyMember",
    "copyForPlatform",
    "entHeadcountChange",
]
RESERVED_GLOBALS = [
    *_CHAT_WIDGET_GLOBALS,
    *_AUTH_FLOW_GLOBALS,
    *_MISC_HANDLER_GLOBALS,
    # Runtime config bootstrap object written by the inline shim.
    "NEURON_CFG",
]

# Files to scan: every .el file directly inside src/ (the glob is not
# recursive and applies no further filtering), in sorted order.
EL_FILES = sorted(SRC_DIR.glob("*.el"))
|
|
|
|
# ── El extraction ────────────────────────────────────────────────────────────

# Inside an El string literal, quotes are written backslash-escaped, so a
# JS body embedded in El source looks like:
#     <script>\n  var x = '\"hello\"';\n</script>
# Bodies are unescaped on the way out of the El source and anything
# written back is escaped again.

# Matches only a *plain* opening <script> tag, a newline, the JS body
# (group 1), a newline and the closing tag. Tags carrying attributes are
# deliberately not matched:
#   - <script src=...>...</script>        (external loader)
#   - <script async ...>...</script>      (external loader, even with body)
#   - <script type="application/ld+json">...</script>  (structured data)
_SCRIPT_BLOCK_PATTERN = r"<script>\s*\n(.*?)\n\s*</script>"
SCRIPT_BLOCK_RE = re.compile(_SCRIPT_BLOCK_PATTERN, flags=re.DOTALL)

# Matches an El interpolation point inside a JS body, `'" + ident + "'`:
# a single-quoted JS string whose content is an El concatenation. Group 1
# is the bare identifier.
_INTERP_PATTERN = r"""'"\s*\+\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*\+\s*"'"""
INTERP_RE = re.compile(_INTERP_PATTERN)
|
|
|
|
|
|
def is_skip_block(body: str) -> bool:
    """Return True when *body* is too short to be worth extracting.

    The length is measured after stripping surrounding whitespace and
    compared against MIN_INLINE_BYTES.
    """
    return len(body.strip()) < MIN_INLINE_BYTES
|
|
|
|
|
|
def el_unescape(s: str) -> str:
    r"""Decode El string escapes, following the El lexer's rules
    (foundation/el/bootstrap.py):

        \n -> LF, \t -> TAB, \r -> CR, \" -> ", \\ -> \, \<X> -> X for any X.

    Because of the catch-all, \' decodes to a bare apostrophe. Without
    it, an extracted block such as
    `onclick=\"window.location.href=\\\'/contact\\\'\"` would keep stray
    backslashes that terser rejects as bad escape sequences.

    A lone backslash at the very end of the input has nothing to escape
    and is kept as-is.
    """
    control_chars = {"n": "\n", "t": "\t", "r": "\r"}

    def decode(pair: re.Match) -> str:
        # Every escaped character other than n/t/r stands for itself;
        # that covers \" and \\ as well as the catch-all case.
        escaped = pair.group(1)
        return control_chars.get(escaped, escaped)

    # DOTALL lets a backslash escape a literal newline too, so every
    # "backslash + any character" pair is consumed left to right.
    return re.sub(r"\\(.)", decode, s, flags=re.DOTALL)
|
|
|
|
|
|
def el_escape_attr(s: str) -> str:
    """Escape *s* so it can be placed inside an El "..." literal.

    Each backslash becomes a doubled backslash and each double quote
    becomes backslash + double quote; every other character passes
    through unchanged.
    """
    escapes = {ord("\\"): "\\\\", ord('"'): '\\"'}
    return s.translate(escapes)
|
|
|
|
|
|
def sha12(content: str) -> str:
    """Return the first 12 hex digits of the SHA-1 of *content* (UTF-8)."""
    digest = hashlib.sha1(content.encode("utf-8"))
    return digest.hexdigest()[:12]
|
|
|
|
|
|
def run(cmd: List[str], **kwargs) -> subprocess.CompletedProcess:
    """Run *cmd*, capturing its output as text.

    Extra keyword arguments are forwarded to ``subprocess.run``.

    Returns the completed process when it exits with status 0.

    Raises:
        subprocess.CalledProcessError: on a non-zero exit status, after a
            short diagnostic (first two argv items, exit code, truncated
            stdout/stderr) has been written to stderr.
    """
    done = subprocess.run(cmd, check=False, capture_output=True, text=True, **kwargs)
    if done.returncode == 0:
        return done

    # Only the head of the command and of each stream is reported, to
    # keep the diagnostic readable.
    report = (
        f"\n[extract-js] command failed: {' '.join(cmd[:2])} ...\n"
        f" exit={done.returncode}\n"
        f" stdout: {done.stdout[:500]}\n"
        f" stderr: {done.stderr[:2000]}\n"
    )
    sys.stderr.write(report)
    raise subprocess.CalledProcessError(done.returncode, cmd, done.stdout, done.stderr)
|
|
|
|
|
|
def _tool_argv(tool: str) -> List[str]:
    """Turn a TERSER / OBFUSCATOR command string into an argv prefix."""
    # A resolved node_modules/.bin path is a single executable and must
    # not be split, otherwise a checkout directory containing whitespace
    # breaks the command. Only the "npx <tool>" fallback is multi-word.
    if Path(tool).exists():
        return [tool]
    return tool.split()


def minify_and_obfuscate(js: str, hash_id: str) -> str:
    """Run *js* through terser, then javascript-obfuscator.

    The source is written to a dot-prefixed scratch file in ASSET_DIR,
    terser writes a second scratch file, and the obfuscator writes the
    final ``<hash_id>.js``. Both scratch files are removed on every exit
    path, including interruption.

    Args:
        js: JavaScript source to process.
        hash_id: Basename (without extension) for the output asset.

    Returns:
        The text of the final obfuscated file.

    Raises:
        subprocess.CalledProcessError: if either tool exits non-zero
            (raised by ``run``).
    """
    raw_path = ASSET_DIR / f".{hash_id}.raw.js"
    min_path = ASSET_DIR / f".{hash_id}.min.js"
    out_path = ASSET_DIR / f"{hash_id}.js"

    # The same reserved-name list is given to both tools so names used by
    # HTML event-handler attributes survive the whole pipeline.
    reserved_arg = ",".join(RESERVED_GLOBALS)

    terser_cmd = _tool_argv(TERSER) + [
        str(raw_path),
        "--compress",
        "passes=2,drop_console=true,drop_debugger=true",
        "--mangle",
        f"reserved=[{reserved_arg}]",
        "--output",
        str(min_path),
    ]

    obf_cmd = _tool_argv(OBFUSCATOR) + [
        str(min_path),
        "--output",
        str(out_path),
        "--compact",
        "true",
        "--simplify",
        "true",
        "--string-array",
        "true",
        "--string-array-encoding",
        "base64",
        "--string-array-threshold",
        "0.75",
        "--identifier-names-generator",
        "hexadecimal",
        "--rename-globals",
        "false",
        "--self-defending",
        "false",
        "--reserved-names",
        reserved_arg,
    ]

    try:
        raw_path.write_text(js, encoding="utf-8")
        run(terser_cmd)
        run(obf_cmd)
    finally:
        # Tidy up scratch files whatever happened; keep only the final .js.
        raw_path.unlink(missing_ok=True)
        min_path.unlink(missing_ok=True)

    return out_path.read_text(encoding="utf-8")
|
|
|
|
|
|
def find_script_blocks(text: str) -> List[tuple[int, int, str]]:
    """List every plain <script>…</script> block found in *text*.

    Each item is ``(start, end, body)``: ``start``/``end`` are offsets
    into *text* spanning the whole match (tags included) and ``body`` is
    the captured script content. Items are in document order.
    """
    return [
        (hit.start(), hit.end(), hit.group(1))
        for hit in SCRIPT_BLOCK_RE.finditer(text)
    ]
|
|
|
|
|
|
def process_block(raw_body_escaped: str) -> Optional[tuple[str, str, List[str]]]:
    """Process a single <script> body taken verbatim from El source.

    Args:
        raw_body_escaped: The script body exactly as it appears inside
            the El string literal (quotes still backslash-escaped).

    Returns:
        ``(hash_id, replacement_html_el_escaped, interpolated_ids)``, or
        None if the block should remain inline. The replacement HTML is
        already El-escaped, so it can be slotted back into the El source
        string verbatim. ``interpolated_ids`` lists the interpolated El
        identifiers, de-duplicated, in order of first occurrence.

    Side effects:
        Writes ``<hash_id>.js`` via ``minify_and_obfuscate``.
    """
    if is_skip_block(raw_body_escaped):
        return None

    seen: List[str] = []

    def repl(m: re.Match) -> str:
        ident = m.group(1)
        if ident not in seen:
            seen.append(ident)
        # The original `'" + ident + "'` evaluated to a JS string, and the
        # shim below stores a string under the same name, so call sites
        # can read the config value directly.
        return f"window.NEURON_CFG.{ident}"

    # Rewrite interpolation points while the text is still El-escaped. A
    # real El concatenation uses *unescaped* double quotes; after
    # unescaping, a JS literal written as '\" + x + \"' would look the
    # same and be mistaken for one. The substituted text contains no
    # backslashes, so unescaping afterwards gives the same result for
    # genuine interpolations.
    js_static = el_unescape(INTERP_RE.sub(repl, raw_body_escaped))

    # Hash + minify the static JS.
    hash_id = sha12(js_static)
    minify_and_obfuscate(js_static, hash_id)

    # Build replacement HTML for the El source.
    parts: List[str] = []
    if seen:
        # Inline shim: bootstrap window.NEURON_CFG with runtime values.
        # Each assignment reads: window.NEURON_CFG.<id> = "<el-interp>";
        # The `\"` sequences are the escape the surrounding El "..."
        # literal needs, and the bare `"` pair breaks out of that literal
        # to concatenate the El variable.
        cfg_assigns = "".join(
            f'window.NEURON_CFG.{ident}=\\"" + {ident} + "\\";'
            for ident in seen
        )
        shim = (
            "<script>window.NEURON_CFG=window.NEURON_CFG||{};"
            + cfg_assigns
            + "</script>"
        )
        parts.append(shim)

    # External script tag, defer so it runs after parse but before
    # DOMContentLoaded — that's compatible with `onclick=` handlers
    # because they only fire on user interaction (post-load).
    parts.append(
        f'<script src=\\"/assets/js/{hash_id}.js\\" defer></script>'
    )

    return hash_id, "".join(parts), seen
|
|
|
|
|
|
# Matches a script reference in the exact form this tool writes into El
# source (quotes El-escaped as \"); group 1 is the 12-hex-digit hash.
EXISTING_REF_RE = re.compile(
    r'<script\s+src=\\"/assets/js/([0-9a-f]{12})\.js\\"\s+defer></script>'
)


def collect_existing_refs(text: str) -> List[str]:
    """Return the hash IDs of /assets/js/<hash>.js references already
    present in *text*, in document order (duplicates included)."""
    # With a single capture group, findall yields the group strings.
    return EXISTING_REF_RE.findall(text)
|
|
|
|
|
|
def process_file(path: Path) -> tuple[int, int, List[dict]]:
    """Rewrite a single .el file, replacing extractable <script> blocks.

    Args:
        path: The El source file to scan; it is rewritten in place only
            when at least one block was extracted.

    Returns:
        ``(blocks_extracted, bytes_saved, manifest_entries)``.
        ``bytes_saved`` is the change in the file's UTF-8 encoded size.
        Manifest entries always include both newly-extracted *and*
        previously-extracted references so the manifest stays a complete
        inventory across reruns.
    """
    text = path.read_text(encoding="utf-8")
    # Measure encoded size: len(str) counts characters, which undercounts
    # any non-ASCII content in a figure that is reported as bytes.
    original_size = len(text.encode("utf-8"))

    # Pick up existing references first (idempotency). A reference whose
    # asset file is missing on disk is left out of the manifest.
    existing: List[dict] = []
    for h in collect_existing_refs(text):
        asset_path = ASSET_DIR / f"{h}.js"
        if asset_path.exists():
            existing.append(
                {
                    "file": path.name,
                    "hash": h,
                    "asset": f"/assets/js/{h}.js",
                    "size": asset_path.stat().st_size,
                    # The interpolated identifiers are not recoverable
                    # from a bare reference, so none are recorded.
                    "interpolated": [],
                    "note": "carried from prior run",
                }
            )

    blocks = find_script_blocks(text)
    if not blocks:
        return 0, 0, existing

    extracted = 0
    new_entries: List[dict] = []
    # Walk in reverse so offsets remain valid as we splice.
    for start, end, body in reversed(blocks):
        result = process_block(body)
        if result is None:
            continue
        hash_id, replacement, interp_ids = result
        text = text[:start] + replacement + text[end:]
        extracted += 1
        new_entries.append(
            {
                "file": path.name,
                "hash": hash_id,
                "asset": f"/assets/js/{hash_id}.js",
                "size": (ASSET_DIR / f"{hash_id}.js").stat().st_size,
                "interpolated": interp_ids,
            }
        )

    if extracted:
        path.write_text(text, encoding="utf-8")

    bytes_saved = original_size - len(text.encode("utf-8"))
    return extracted, bytes_saved, existing + new_entries
|
|
|
|
|
|
def main() -> int:
    """Extract scripts from every file in EL_FILES, prune stale assets
    and write the manifest.

    Returns:
        0, used as the process exit status.
    """
    ASSET_DIR.mkdir(parents=True, exist_ok=True)

    total_blocks = 0
    total_saved = 0
    all_entries: List[dict] = []

    for el in EL_FILES:
        n, saved, entries = process_file(el)
        if n:
            print(
                f" {el.name:25s} {n} block(s) extracted, "
                f"{saved:6d} bytes pulled out"
            )
            total_blocks += n
            total_saved += saved
        # Always carry entries forward so the manifest is a complete
        # inventory even when this run extracted zero new blocks.
        all_entries.extend(entries)

    # Sort manifest for stable output regardless of file walk order.
    all_entries.sort(key=lambda e: (e["file"], e["hash"]))

    # Garbage-collect orphan .js files in the asset dir whose hash is no
    # longer referenced by any El source. Without this, edits to the
    # original JS leave stale hashed files behind forever.
    #
    # Only names shaped like this tool's own output (<12 hex digits>.js)
    # are candidates, so the manifest, dot-prefixed scratch files and any
    # hand-written or vendored file kept in the same directory are never
    # deleted.
    generated_name = re.compile(r"[0-9a-f]{12}\.js")
    keep = {f"{e['hash']}.js" for e in all_entries}
    removed: List[str] = []
    for f in ASSET_DIR.iterdir():
        if f.is_file() and generated_name.fullmatch(f.name) and f.name not in keep:
            f.unlink()
            removed.append(f.name)
    if removed:
        print(f" pruned {len(removed)} orphan asset(s): {', '.join(sorted(removed))}")

    MANIFEST.write_text(
        json.dumps(
            {
                "generated_by": "scripts/extract-js.py",
                "count": len(all_entries),
                "entries": all_entries,
            },
            indent=2,
        )
        + "\n",
        encoding="utf-8",
    )

    print(
        f"\n total: {total_blocks} block(s), "
        f"{total_saved} bytes removed from El sources, "
        f"{len(all_entries)} asset(s) → {ASSET_DIR.relative_to(REPO_ROOT)}"
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())
|