From 46f93fd6eb8f4fe5e0d4f312f36931de0b632ceb Mon Sep 17 00:00:00 2001 From: Will Anderson Date: Sat, 2 May 2026 12:56:33 -0500 Subject: [PATCH] security: replace denylist sanitize_share_html with allowlist el_html_sanitize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A real attacker probed /api/share earlier today with ` is + * CDATA-like and must not be re-emitted as escaped text either. + * - Comments (<!-- ... -->), doctype (<!DOCTYPE ...>), CDATA (<![CDATA[ ... ]]>), + * and processing instructions (<? ... ?>) are dropped entirely. + * - Text content outside dropped subtrees is HTML-escaped (<, >, ", '); + * `&` is passed through so pre-encoded entities survive. + * - Attribute values are unquoted/dequoted, then re-emitted with double + * quotes around the cleanly-escaped value. + * - For any `href` or `src` attribute, the URL scheme is validated: + * only http:, https:, mailto:, fragment-only `#anchor`, or relative + * paths are allowed. Anything else (javascript:, data:, vbscript:, + * about:, file:, etc.) drops the attribute. + * - Self-closing void tags (br, hr, img, etc.) emit without a close tag. + * - Malformed input (unclosed tag at EOF, bad attribute syntax) drops + * the pending tag and continues. Pre-encoded entities (&lt;, &amp;, + * etc.) are passed through verbatim — the browser will decode them + * safely on render. + * + * Allowlist format (JSON string): + * {"p":[],"a":["href","title"],"strong":[],...} + * - Key = lowercase tag name. + * - Value = JSON array of allowed attribute names (lowercase). + * - Empty array means tag allowed but no attributes survive. + * + * Output is a freshly-allocated arena-tracked el_val_t string. */ + +/* Internal byte buffer with realloc-doubling. Used during sanitization; + * the final result is copied into an arena-tracked el_strbuf so the caller + * sees standard runtime memory semantics. 
*/ +typedef struct { + char* data; + size_t len; + size_t cap; +} html_buf_t; + +static void html_buf_init(html_buf_t* b) { + b->cap = 256; + b->data = malloc(b->cap); + if (!b->data) { fputs("el_runtime: out of memory\n", stderr); exit(1); } + b->len = 0; +} + +static void html_buf_grow(html_buf_t* b, size_t need) { + if (b->len + need + 1 <= b->cap) return; + size_t nc = b->cap; + while (b->len + need + 1 > nc) nc *= 2; + char* nd = realloc(b->data, nc); + if (!nd) { fputs("el_runtime: out of memory\n", stderr); exit(1); } + b->data = nd; + b->cap = nc; +} + +static void html_buf_putc(html_buf_t* b, char c) { + html_buf_grow(b, 1); + b->data[b->len++] = c; +} + +static void html_buf_puts(html_buf_t* b, const char* s) { + if (!s) return; + size_t n = strlen(s); + html_buf_grow(b, n); + memcpy(b->data + b->len, s, n); + b->len += n; +} + +static void html_buf_free(html_buf_t* b) { + free(b->data); + b->data = NULL; + b->len = b->cap = 0; +} + +/* ASCII tolower, locale-independent. */ +static int html_tolower(int c) { + return (c >= 'A' && c <= 'Z') ? c + 32 : c; +} + +/* Case-insensitive ASCII compare of [a, a+n) against c-string `s`. + * Returns 1 iff lengths match and bytes are equal under tolower. */ +static int html_ieq_n(const char* a, size_t n, const char* s) { + if (!a || !s) return 0; + if (strlen(s) != n) return 0; + for (size_t i = 0; i < n; i++) { + if (html_tolower((unsigned char)a[i]) != html_tolower((unsigned char)s[i])) return 0; + } + return 1; +} + +/* Case-insensitive ASCII compare of two byte slices. */ +static int html_iemem(const char* a, const char* b, size_t n) { + for (size_t i = 0; i < n; i++) { + if (html_tolower((unsigned char)a[i]) != html_tolower((unsigned char)b[i])) return 0; + } + return 1; +} + +/* Walk a JSON allowlist object and find the value (an array) for a given + * tag key, comparing case-insensitively. 
On hit returns a pointer to the + * opening `[` of the array and writes the byte length of the array span + * (including the brackets) to *out_len. On miss returns NULL. + * + * The parser is intentionally tiny: it does not handle escapes inside + * keys (allowlist authors do not need them), and it relies on balanced + * brackets/quotes within the value array. */ +static const char* html_allowlist_find(const char* allow, const char* tag, + size_t tag_len, size_t* out_len) { + if (!allow) return NULL; + const char* p = allow; + while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') p++; + if (*p != '{') return NULL; + p++; + while (*p) { + while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r' || *p == ',') p++; + if (*p == '}' || *p == 0) return NULL; + if (*p != '"') return NULL; + p++; + const char* k = p; + while (*p && *p != '"') p++; + if (*p != '"') return NULL; + size_t klen = (size_t)(p - k); + p++; + while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') p++; + if (*p != ':') return NULL; + p++; + while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') p++; + if (*p != '[') return NULL; + const char* arr_start = p; + int depth = 0; + int in_str = 0; + while (*p) { + char c = *p; + if (in_str) { + if (c == '\\' && p[1]) { p += 2; continue; } + if (c == '"') in_str = 0; + } else { + if (c == '"') in_str = 1; + else if (c == '[') depth++; + else if (c == ']') { depth--; if (depth == 0) { p++; break; } } + } + p++; + } + size_t alen = (size_t)(p - arr_start); + int match = (klen == tag_len) && html_iemem(k, tag, klen); + if (match) { + if (out_len) *out_len = alen; + return arr_start; + } + } + return NULL; +} + +/* Returns 1 iff `attr` (length attr_len) appears as a string element + * in the JSON array slice [arr, arr+arr_len). Comparison is case- + * insensitive. 
*/ +static int html_attr_in_array(const char* arr, size_t arr_len, + const char* attr, size_t attr_len) { + if (!arr || arr_len < 2) return 0; + const char* p = arr + 1; + const char* end = arr + arr_len - 1; + while (p < end) { + while (p < end && (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r' || *p == ',')) p++; + if (p >= end) return 0; + if (*p != '"') return 0; + p++; + const char* s = p; + while (p < end && *p != '"') { + if (*p == '\\' && p + 1 < end) p++; + p++; + } + if (p >= end) return 0; + size_t slen = (size_t)(p - s); + p++; + if (slen == attr_len && html_iemem(s, attr, slen)) return 1; + } + return 0; +} + +/* Hard-coded set of tags whose content is ALSO dropped (entire subtree). */ +static int html_is_dangerous_container(const char* tag, size_t tag_len) { + static const char* names[] = { + "script", "style", "iframe", "object", "embed", "form", + "noscript", "noembed", "template", "svg", "math", "frame", + "frameset", "applet", "audio", "video", "source", "track", + NULL + }; + for (int i = 0; names[i]; i++) { + if (html_ieq_n(tag, tag_len, names[i])) return 1; + } + return 0; +} + +/* HTML void elements — emit without a close tag. */ +static int html_is_void(const char* tag, size_t tag_len) { + static const char* names[] = { + "area", "base", "br", "col", "embed", "hr", "img", "input", + "link", "meta", "param", "source", "track", "wbr", + NULL + }; + for (int i = 0; names[i]; i++) { + if (html_ieq_n(tag, tag_len, names[i])) return 1; + } + return 0; +} + +/* Append a single byte HTML-escaped into the output buffer. + * `<`, `>`, `"` and `'` are replaced by their character references; every + * other byte is copied unchanged. `&` has no case here, so it is copied + * as-is like any other byte. */ +static void html_escape_byte(html_buf_t* out, unsigned char c) { + switch (c) { + case '<': html_buf_puts(out, "&lt;"); break; + case '>': html_buf_puts(out, "&gt;"); break; + case '"': html_buf_puts(out, "&quot;"); break; + case '\'': html_buf_puts(out, "&#39;"); break; + default: html_buf_putc(out, (char)c); break; + } +} + +/* Validate a URL value against the allowlist of safe schemes for hrefs. + * Returns 1 iff the URL is safe to emit. 
Acceptable forms: + * - http:// or https:// (case-insensitive) + * - mailto: + * - fragment-only `#anchor` + * - relative path that does not contain a colon before the first + * slash/?/# (so `foo/bar`, `/foo`, `?x=1` are OK; `javascript:x` is + * not — its colon precedes any path/hash/query separator). + * + * URL leading whitespace and embedded ASCII control bytes (TAB, LF, CR) + * are stripped before the scheme test, mirroring how browsers normalise + * URLs (these bytes are otherwise a known XSS bypass: `java\tscript:`). + * + * A `&` that appears before the first `:`, `/`, `?` or `#` makes the URL + * unsafe. Attribute values are re-emitted with `&` intact, so a character + * reference in scheme position (`javascript&colon;x`, `&#106;avascript:x`) + * would otherwise be classified as a relative path here and then decoded + * into a live scheme by the browser. */ +static int html_url_is_safe(const char* url, size_t len) { + if (!url || len == 0) return 1; /* empty href is harmless */ + size_t i = 0; + while (i < len) { + unsigned char c = (unsigned char)url[i]; + if (c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == 0x0B || c == 0x0C) { + i++; continue; + } + break; + } + if (i >= len) return 1; /* whitespace only */ + if (url[i] == '#') return 1; /* fragment only */ + if (url[i] == '/' || url[i] == '?') return 1; /* relative */ + /* Find the first scheme-terminating character. */ + size_t scheme_end = (size_t)-1; + for (size_t j = i; j < len; j++) { + char c = url[j]; + if (c == '&') return 0; /* entity could hide the scheme or its colon */ + if (c == ':') { scheme_end = j; break; } + if (c == '/' || c == '?' || c == '#') break; + } + if (scheme_end == (size_t)-1) return 1; /* no colon → relative path */ + /* Lowercase the scheme, stripping embedded control bytes. 
*/ + char scheme[32]; + size_t sl = 0; + for (size_t j = i; j < scheme_end && sl < sizeof(scheme) - 1; j++) { + unsigned char c = (unsigned char)url[j]; + if (c == '\t' || c == '\n' || c == '\r' || c == 0x0B || c == 0x0C) continue; + scheme[sl++] = (char)html_tolower(c); + } + scheme[sl] = '\0'; + if (strcmp(scheme, "http") == 0) return 1; + if (strcmp(scheme, "https") == 0) return 1; + if (strcmp(scheme, "mailto") == 0) return 1; + return 0; +} + +/* Sanitize `input_v` against the JSON allowlist `allowlist_v` and return the + * result as a new string. A NULL input yields ""; a NULL allowlist is + * treated as "{}", so no tag is allowed. */ +el_val_t el_html_sanitize(el_val_t input_v, el_val_t allowlist_v) { + const char* input = EL_CSTR(input_v); + const char* allow = EL_CSTR(allowlist_v); + if (!input) return el_wrap_str(el_strdup("")); + if (!allow) allow = "{}"; + size_t in_len = strlen(input); + + html_buf_t out; + html_buf_init(&out); + + size_t i = 0; + while (i < in_len) { + unsigned char c = (unsigned char)input[i]; + if (c != '<') { + /* Plain text — escape and emit. We pass `&` through verbatim + * to preserve pre-encoded entities (`&lt;`, `&amp;`, `&#x...;`) + * which the browser will decode safely. */ + if (c == '&') html_buf_putc(&out, '&'); + else html_escape_byte(&out, c); + i++; + continue; + } + /* `<` — try to parse a tag. */ + if (i + 1 >= in_len) { + html_buf_puts(&out, "&lt;"); + i++; + continue; + } + /* Comments, doctype, CDATA, processing instructions — drop entirely. */ + if (input[i + 1] == '!') { + if (i + 3 < in_len && input[i + 2] == '-' && input[i + 3] == '-') { + size_t j = i + 4; + while (j + 2 < in_len && !(input[j] == '-' && input[j + 1] == '-' && input[j + 2] == '>')) j++; + if (j + 2 < in_len) i = j + 3; + else i = in_len; + continue; + } + size_t j = i + 2; + while (j < in_len && input[j] != '>') j++; + i = (j < in_len) ? j + 1 : in_len; + continue; + } + if (input[i + 1] == '?') { + size_t j = i + 2; + while (j < in_len && input[j] != '>') j++; + i = (j < in_len) ? 
j + 1 : in_len; + continue; + } + int is_close = 0; + size_t name_start = i + 1; + if (input[i + 1] == '/') { + is_close = 1; + name_start = i + 2; + } + if (name_start >= in_len) { + html_buf_puts(&out, "&lt;"); + i++; + continue; + } + unsigned char nc = (unsigned char)input[name_start]; + if (!((nc >= 'a' && nc <= 'z') || (nc >= 'A' && nc <= 'Z'))) { + /* `<` followed by non-letter — emit as escaped text. */ + html_buf_puts(&out, "&lt;"); + i++; + continue; + } + size_t name_end = name_start; + while (name_end < in_len) { + unsigned char x = (unsigned char)input[name_end]; + if ((x >= 'a' && x <= 'z') || (x >= 'A' && x <= 'Z') || + (x >= '0' && x <= '9') || x == '-' || x == '_' || x == ':') { + name_end++; + } else { + break; + } + } + const char* tag = input + name_start; + size_t tag_len = name_end - name_start; + /* Find the `>` that closes this tag, respecting quoted attrs. */ + size_t cur = name_end; + int self_close = 0; + while (cur < in_len) { + unsigned char x = (unsigned char)input[cur]; + if (x == '"' || x == '\'') { + unsigned char q = x; + cur++; + while (cur < in_len && (unsigned char)input[cur] != q) cur++; + if (cur < in_len) cur++; /* skip closing quote */ + continue; + } + if (x == '/' && cur + 1 < in_len && input[cur + 1] == '>') { + self_close = 1; + break; + } + if (x == '>') break; + cur++; + } + if (cur >= in_len) { + /* Malformed: unclosed tag at EOF. Drop the rest of the input. */ + i = in_len; + continue; + } + size_t tag_end = self_close ? cur + 2 : cur + 1; /* one past `>` */ + /* Dangerous container — drop the whole subtree. 
*/ + if (!is_close && html_is_dangerous_container(tag, tag_len)) { + if (self_close || html_is_void(tag, tag_len)) { + i = tag_end; + continue; + } + size_t scan = tag_end; + int found_close = 0; + while (scan < in_len) { + if (input[scan] != '<') { scan++; continue; } + if (scan + 1 < in_len && input[scan + 1] == '/') { + size_t cn_start = scan + 2; + size_t cn_end = cn_start; + while (cn_end < in_len) { + unsigned char x = (unsigned char)input[cn_end]; + if ((x >= 'a' && x <= 'z') || (x >= 'A' && x <= 'Z') || + (x >= '0' && x <= '9') || x == '-' || x == '_' || x == ':') { + cn_end++; + } else break; + } + if (cn_end - cn_start == tag_len && + html_iemem(input + cn_start, tag, tag_len)) { + size_t end_close = cn_end; + while (end_close < in_len && input[end_close] != '>') end_close++; + i = (end_close < in_len) ? end_close + 1 : in_len; + found_close = 1; + break; + } + } + scan++; + } + if (!found_close) { + /* No matching close — drop everything from here on. */ + i = in_len; + } + continue; + } + /* Look up the tag in the allowlist. */ + size_t arr_len = 0; + const char* arr = html_allowlist_find(allow, tag, tag_len, &arr_len); + if (!arr) { + /* Tag not allowed. Drop the open/close marker; inner text is + * processed by the outer loop and re-emitted as escaped text. */ + i = tag_end; + continue; + } + if (is_close) { + if (!html_is_void(tag, tag_len)) { + html_buf_putc(&out, '<'); + html_buf_putc(&out, '/'); + for (size_t k = 0; k < tag_len; k++) { + html_buf_putc(&out, (char)html_tolower((unsigned char)tag[k])); + } + html_buf_putc(&out, '>'); + } + i = tag_end; + continue; + } + /* Allowed open tag. Emit `<tag`, then each surviving attribute, then `>`. 
*/ + html_buf_putc(&out, '<'); + for (size_t k = 0; k < tag_len; k++) { + html_buf_putc(&out, (char)html_tolower((unsigned char)tag[k])); + } + size_t a = name_end; + while (a < cur) { + unsigned char x = (unsigned char)input[a]; + if (x == ' ' || x == '\t' || x == '\n' || x == '\r' || x == '/') { a++; continue; } + size_t an_start = a; + while (a < cur) { + unsigned char y = (unsigned char)input[a]; + if (y == '=' || y == ' ' || y == '\t' || y == '\n' || y == '\r' || y == '/' || y == '>') break; + a++; + } + size_t an_len = a - an_start; + if (an_len == 0) { a++; continue; } + size_t av_start = 0; + size_t av_len = 0; + int has_value = 0; + size_t b = a; + while (b < cur && (input[b] == ' ' || input[b] == '\t' || input[b] == '\n' || input[b] == '\r')) b++; + if (b < cur && input[b] == '=') { + has_value = 1; + b++; + while (b < cur && (input[b] == ' ' || input[b] == '\t' || input[b] == '\n' || input[b] == '\r')) b++; + if (b < cur && (input[b] == '"' || input[b] == '\'')) { + unsigned char q = (unsigned char)input[b]; + b++; + av_start = b; + while (b < cur && (unsigned char)input[b] != q) b++; + av_len = b - av_start; + if (b < cur) b++; + } else { + av_start = b; + while (b < cur) { + unsigned char y = (unsigned char)input[b]; + if (y == ' ' || y == '\t' || y == '\n' || y == '\r' || y == '>') break; + b++; + } + av_len = b - av_start; + } + a = b; + } + if (!html_attr_in_array(arr, arr_len, input + an_start, an_len)) continue; + int is_href = (an_len == 4 && html_iemem(input + an_start, "href", 4)); + int is_src = (an_len == 3 && html_iemem(input + an_start, "src", 3)); + if ((is_href || is_src) && has_value) { + if (!html_url_is_safe(input + av_start, av_len)) continue; + } + html_buf_putc(&out, ' '); + for (size_t k = 0; k < an_len; k++) { + html_buf_putc(&out, (char)html_tolower((unsigned char)input[an_start + k])); + } + if (has_value) { + html_buf_puts(&out, "=\""); + for (size_t k = 0; k < av_len; k++) { + unsigned char y = (unsigned char)input[av_start + 
k]; + /* Re-escape so the emitted attribute is well-formed + * double-quoted HTML. `&` passes through to preserve + * pre-encoded entities. */ + if (y == '"') html_buf_puts(&out, "&quot;"); + else if (y == '<') html_buf_puts(&out, "&lt;"); + else if (y == '>') html_buf_puts(&out, "&gt;"); + else html_buf_putc(&out, (char)y); + } + html_buf_putc(&out, '"'); + } + } + html_buf_putc(&out, '>'); + i = tag_end; + } + /* Copy into arena-tracked buffer so the standard runtime memory model + * applies to the returned string. */ + char* result = el_strbuf(out.len); + memcpy(result, out.data, out.len); + result[out.len] = '\0'; + html_buf_free(&out); + return el_wrap_str(result); +} + /* ── JSON ────────────────────────────────────────────────────────────────── */ -el_val_t json_get(el_val_t jsonv, el_val_t keyv) { - const char* json = EL_CSTR(jsonv); - const char* key = EL_CSTR(keyv); - if (!json || !key) return el_wrap_str(el_strdup("")); - size_t klen = strlen(key); - /* Use a stack buffer for the pattern to avoid arena double-free. - * Keys in El maps are typically short; 512 bytes is a safe upper bound. */ - char stack_pat[512]; - char* pattern; - if (klen + 5 <= sizeof(stack_pat)) { - pattern = stack_pat; - } else { - pattern = malloc(klen + 5); - if (!pattern) return el_wrap_str(el_strdup("")); +/* True iff the segment is non-empty and every byte is an ASCII digit. We treat + * such segments as numeric array indices when walking a dot-path; mixed names + * like "0a" remain object-key lookups, so a key named "0" still wins over an + * index when the surrounding container is an object. 
*/ +static int json_path_seg_is_index(const char* seg, size_t n) { + if (n == 0) return 0; + for (size_t i = 0; i < n; i++) { + if (seg[i] < '0' || seg[i] > '9') return 0; } - snprintf(pattern, klen + 5, "\"%s\":", key); - const char* p = strstr(json, pattern); - if (pattern != stack_pat) free(pattern); - if (!p) return el_wrap_str(el_strdup("")); - p += strlen(key) + 3; /* skip "key": */ - while (*p == ' ' || *p == '\t' || *p == '\n') p++; + return 1; +} + +/* Skip JSON whitespace. */ +static const char* json_skip_ws(const char* p) { + while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') p++; + return p; +} + +/* Descend one segment into the JSON cursor `p`. + * - If `p` points at an array `[...]` and the segment is all digits, + * advance to that element (zero-based). + * - Otherwise treat the segment as an object key and use json_find_key + * scoped to a one-level slice of the current container. + * Returns NULL if the descent fails (segment not found, container mismatch). + * + * `seg` is a pointer into the original path string and `seg_len` is its + * byte length — this avoids an extra alloc per segment. */ +static const char* json_path_descend(const char* p, const char* seg, size_t seg_len) { + if (!p || !seg) return NULL; + p = json_skip_ws(p); + if (*p == '[' && json_path_seg_is_index(seg, seg_len)) { + long idx = 0; + for (size_t i = 0; i < seg_len; i++) idx = idx * 10 + (seg[i] - '0'); + p++; /* step past '[' */ + p = json_skip_ws(p); + long cur = 0; + while (*p && *p != ']') { + if (cur == idx) return p; + const char* end = json_skip_value(p); + if (!end || end == p) return NULL; + p = json_skip_ws(end); + if (*p == ',') { p++; p = json_skip_ws(p); cur++; continue; } + /* No comma after this element — only acceptable at the closing ']', + * which means we ran out of elements. */ + break; + } + return NULL; + } + /* Object lookup. json_find_key walks at depth 1 of whatever container it + * receives, so we slice from `p` onwards. 
Caller already positioned us at + * the opening '{' (or at whitespace before it). */ + if (*p != '{') return NULL; + /* Build a NUL-terminated copy of the key segment for the lookup. We only + * pay this cost when the segment isn't a numeric index. */ + char stack_key[256]; + char* k = stack_key; + if (seg_len + 1 > sizeof(stack_key)) { + k = malloc(seg_len + 1); + if (!k) return NULL; + } + memcpy(k, seg, seg_len); + k[seg_len] = '\0'; + const char* found = json_find_key(p, k); + if (k != stack_key) free(k); + return found; +} + +/* Read the JSON value at `p` into a freshly-allocated, arena-owned el_val_t. + * - String -> unescaped, wrapped el_val_t string + * - Anything else -> raw JSON slice as a string (matches the historical + * json_get behaviour: numbers/bools/null come back stringified). */ +static el_val_t json_read_value(const char* p) { + p = json_skip_ws(p); if (*p == '"') { p++; - /* Unescape the JSON string value into a clean buffer. */ size_t cap = strlen(p) + 1; char* out = el_strbuf(cap); char* w = out; @@ -1940,15 +2588,70 @@ el_val_t json_get(el_val_t jsonv, el_val_t keyv) { *w = '\0'; return el_wrap_str(out); } - const char* start = p; - while (*p && *p != ',' && *p != '}' && *p != ']' && *p != '\n') p++; - size_t len = (size_t)(p - start); - char* out = el_strbuf(len); - memcpy(out, start, len); - out[len] = '\0'; + /* Object/array/number/bool/null — return the raw slice up to the value's + * end. json_skip_value tracks brace/bracket/string state so nested objects + * round-trip cleanly. */ + const char* end = json_skip_value(p); + if (!end) end = p; + size_t n = (size_t)(end - p); + /* Strip trailing whitespace from scalar values so callers don't see + * `123 ` when they parsed a pretty-printed number. 
*/ + while (n > 0 && (p[n-1] == ' ' || p[n-1] == '\t' || p[n-1] == '\n' || p[n-1] == '\r')) { + n--; + } + char* out = el_strbuf(n); + memcpy(out, p, n); + out[n] = '\0'; return el_wrap_str(out); } +el_val_t json_get(el_val_t jsonv, el_val_t keyv) { + const char* json = EL_CSTR(jsonv); + const char* key = EL_CSTR(keyv); + if (!json || !key) return el_wrap_str(el_strdup("")); + + /* Fast path: key contains no '.' — keep the historical single-segment + * substring search so existing callers retain their O(strlen) cost + * profile. The dot-path walker is only paid for when needed. */ + if (!strchr(key, '.')) { + size_t klen = strlen(key); + char stack_pat[512]; + char* pattern; + if (klen + 5 <= sizeof(stack_pat)) { + pattern = stack_pat; + } else { + pattern = malloc(klen + 5); + if (!pattern) return el_wrap_str(el_strdup("")); + } + snprintf(pattern, klen + 5, "\"%s\":", key); + const char* p = strstr(json, pattern); + if (pattern != stack_pat) free(pattern); + if (!p) return el_wrap_str(el_strdup("")); + p += strlen(key) + 3; /* skip "key": */ + return json_read_value(p); + } + + /* Dot-path traversal. Walk segments left to right; at each step, descend + * into the current container by either array index (all-digit segment on + * an array cursor) or object key. */ + const char* cursor = json_skip_ws(json); + const char* seg_start = key; + const char* k = key; + while (1) { + if (*k == '.' 
|| *k == '\0') { + size_t seg_len = (size_t)(k - seg_start); + cursor = json_path_descend(cursor, seg_start, seg_len); + if (!cursor) return el_wrap_str(el_strdup("")); + if (*k == '\0') break; + k++; + seg_start = k; + continue; + } + k++; + } + return json_read_value(cursor); +} + /* ── Float bit-cast helpers ──────────────────────────────────────────────── */ /* `el_to_float` and `el_from_float` are exposed in el_runtime.h as static * inlines so generated programs (which #include the header) can call them @@ -2644,6 +3347,283 @@ el_val_t sleep_ms(el_val_t ms) { return 0; } +/* ── Instant + Duration: first-class temporal types ────────────────────────── + * El's substrate (Neuron) is a temporal cognition system. Memory salience + * decay, the six-tier pacemaker, TTL caches, and supersession are all + * temporal. Treating time as a raw Int (now() returning ms-since-epoch and + * arithmetic done with mixed unit literals) lets bugs through the type + * system: `(now - cached_at) < 60` cannot tell ms from sec, and `sleep(30)` + * is ambiguous. This block introduces two dedicated representations. + * + * Representation: + * Instant — int64 nanoseconds since the Unix epoch + * Duration — int64 nanoseconds (signed; negative durations are legal, + * e.g. when a deadline has passed) + * + * Both share the el_val_t (int64) slot the rest of the runtime uses, so no + * boxing / arena allocation is needed. Type discipline is enforced at the + * codegen layer: `let x: Duration = ...` registers `x` in __duration_names, + * and BinOp dispatches through typed wrappers (el_duration_add, etc.) that + * make intent explicit in the generated C. Mismatched ops (Instant+Instant, + * Duration+Int) are surfaced via #error directives at codegen time so the + * downstream cc step fails with a clear El-source-level message. + * + * Nanosecond precision matches POSIX clock_gettime / nanosleep granularity. 
+ * 2^63 nanos covers ~292 years from epoch — comfortably past 2200, plenty + * for a memory-system runtime that never schedules outside a human lifespan. + */ + +/* now() — current Instant. Wraps clock_gettime(CLOCK_REALTIME) for nanosecond + * precision. Falls back to gettimeofday on systems where clock_gettime is + * unavailable (defensive — every supported platform has it). */ +el_val_t el_now_instant(void) { + struct timespec ts; + if (clock_gettime(CLOCK_REALTIME, &ts) == 0) { + int64_t ns = (int64_t)ts.tv_sec * 1000000000LL + (int64_t)ts.tv_nsec; + return (el_val_t)ns; + } + struct timeval tv; + gettimeofday(&tv, NULL); + int64_t ns = (int64_t)tv.tv_sec * 1000000000LL + + (int64_t)tv.tv_usec * 1000LL; + return (el_val_t)ns; +} + +el_val_t now(void) { + return el_now_instant(); +} + +/* unix_seconds(n) — Instant from a Unix-epoch second count. + * unix_millis(n) — Instant from a Unix-epoch millisecond count. */ +el_val_t unix_seconds(el_val_t n) { + int64_t s = (int64_t)n; + return (el_val_t)(s * 1000000000LL); +} + +el_val_t unix_millis(el_val_t n) { + int64_t m = (int64_t)n; + return (el_val_t)(m * 1000000LL); +} + +/* instant_from_iso8601 — parse a strict subset: + * YYYY-MM-DDTHH:MM:SS[.fff]Z + * Returns 0 (the Unix-epoch sentinel) on parse failure. Callers that need to + * distinguish epoch-zero from a parse error should use a wider sentinel + * representation; the current zero-on-failure choice matches existing El + * runtime conventions for parse builtins (str_to_int, parse_int). 
*/ +el_val_t instant_from_iso8601(el_val_t s) { + const char* str = EL_CSTR(s); + if (!str) return (el_val_t)0; + int Y, M, D, h, m, sec, used = 0; + /* %n records how many bytes the date-time prefix consumed so the + * optional fraction can be read by hand below. */ + if (sscanf(str, "%d-%d-%dT%d:%d:%d%n", &Y, &M, &D, &h, &m, &sec, &used) < 6) return (el_val_t)0; + /* Fractional seconds: take up to three digits and weight each by its + * position, so ".5" is 500 ms and ".05" is 50 ms. Reading the digits + * with "%3d" treated them as a plain integer and mis-scaled any + * fraction shorter than three digits. Further digits are ignored. */ + int frac = 0; + const char* f = str + used; + if (*f == '.') { + int weight = 100; + for (f++; weight > 0 && *f >= '0' && *f <= '9'; f++) { + frac += (*f - '0') * weight; + weight /= 10; + } + } + struct tm tm; + memset(&tm, 0, sizeof(tm)); + tm.tm_year = Y - 1900; + tm.tm_mon = M - 1; + tm.tm_mday = D; + tm.tm_hour = h; + tm.tm_min = m; + tm.tm_sec = sec; + /* timegm — UTC. POSIX-Y but available on macOS and glibc. */ + time_t t = timegm(&tm); + if (t == (time_t)-1) return (el_val_t)0; + int64_t ns = (int64_t)t * 1000000000LL + (int64_t)frac * 1000000LL; + return (el_val_t)ns; +} + +/* Duration constructors. The El-side postfix literals (30.seconds, 1.hour) + * are lowered by the codegen directly into a literal int64 of nanoseconds — + * these constructors are for runtime values where the count is dynamic. */ +el_val_t el_duration_from_nanos(el_val_t ns) { + return (el_val_t)(int64_t)ns; +} + +el_val_t duration_seconds(el_val_t n) { + int64_t s = (int64_t)n; + return (el_val_t)(s * 1000000000LL); +} + +el_val_t duration_millis(el_val_t n) { + int64_t m = (int64_t)n; + return (el_val_t)(m * 1000000LL); +} + +el_val_t duration_nanos(el_val_t n) { + return (el_val_t)(int64_t)n; +} + +/* Arithmetic — typed wrappers. At the C level these are no-op casts, but + * the codegen routes Instant/Duration BinOps through them so the generated + * C says `el_instant_add_dur(start, dur)` rather than `start + dur`. The + * intent is explicit, the operand order is documented, and a future change + * to the underlying representation (saturating arithmetic, overflow guards) + * has a single chokepoint. 
*/ +el_val_t el_instant_add_dur(el_val_t inst, el_val_t dur) { + return (el_val_t)((int64_t)inst + (int64_t)dur); +} + +el_val_t el_instant_sub_dur(el_val_t inst, el_val_t dur) { + return (el_val_t)((int64_t)inst - (int64_t)dur); +} + +el_val_t el_instant_diff(el_val_t a, el_val_t b) { + /* a - b — yields a Duration (negative if b is later than a). */ + return (el_val_t)((int64_t)a - (int64_t)b); +} + +el_val_t el_duration_add(el_val_t a, el_val_t b) { + return (el_val_t)((int64_t)a + (int64_t)b); +} + +el_val_t el_duration_sub(el_val_t a, el_val_t b) { + return (el_val_t)((int64_t)a - (int64_t)b); +} + +el_val_t el_duration_scale(el_val_t dur, el_val_t scalar) { + return (el_val_t)((int64_t)dur * (int64_t)scalar); +} + +el_val_t el_duration_div(el_val_t dur, el_val_t scalar) { + int64_t s = (int64_t)scalar; + if (s == 0) return (el_val_t)0; + return (el_val_t)((int64_t)dur / s); +} + +/* Comparisons. Return 1/0 in el_val_t convention. */ +el_val_t el_instant_lt(el_val_t a, el_val_t b) { return (el_val_t)((int64_t)a < (int64_t)b ? 1 : 0); } +el_val_t el_instant_le(el_val_t a, el_val_t b) { return (el_val_t)((int64_t)a <= (int64_t)b ? 1 : 0); } +el_val_t el_instant_gt(el_val_t a, el_val_t b) { return (el_val_t)((int64_t)a > (int64_t)b ? 1 : 0); } +el_val_t el_instant_ge(el_val_t a, el_val_t b) { return (el_val_t)((int64_t)a >= (int64_t)b ? 1 : 0); } +el_val_t el_instant_eq(el_val_t a, el_val_t b) { return (el_val_t)((int64_t)a == (int64_t)b ? 1 : 0); } +el_val_t el_instant_ne(el_val_t a, el_val_t b) { return (el_val_t)((int64_t)a != (int64_t)b ? 1 : 0); } +el_val_t el_duration_lt(el_val_t a, el_val_t b) { return (el_val_t)((int64_t)a < (int64_t)b ? 1 : 0); } +el_val_t el_duration_le(el_val_t a, el_val_t b) { return (el_val_t)((int64_t)a <= (int64_t)b ? 1 : 0); } +el_val_t el_duration_gt(el_val_t a, el_val_t b) { return (el_val_t)((int64_t)a > (int64_t)b ? 1 : 0); } +el_val_t el_duration_ge(el_val_t a, el_val_t b) { return (el_val_t)((int64_t)a >= (int64_t)b ? 
1 : 0); } +el_val_t el_duration_eq(el_val_t a, el_val_t b) { return (el_val_t)((int64_t)a == (int64_t)b ? 1 : 0); } +el_val_t el_duration_ne(el_val_t a, el_val_t b) { return (el_val_t)((int64_t)a != (int64_t)b ? 1 : 0); } + +/* Conversions. */ +el_val_t instant_to_unix_seconds(el_val_t i) { + return (el_val_t)((int64_t)i / 1000000000LL); +} + +el_val_t instant_to_unix_millis(el_val_t i) { + return (el_val_t)((int64_t)i / 1000000LL); +} + +el_val_t instant_to_iso8601(el_val_t i) { + int64_t ns = (int64_t)i; + time_t s = (time_t)(ns / 1000000000LL); + int msec = (int)((ns / 1000000LL) % 1000LL); + if (msec < 0) { msec += 1000; s -= 1; } + struct tm tm; + gmtime_r(&s, &tm); + char buf[64]; + snprintf(buf, sizeof(buf), "%04d-%02d-%02dT%02d:%02d:%02d.%03dZ", + tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec, msec); + return el_wrap_str(el_strdup(buf)); +} + +el_val_t duration_to_seconds(el_val_t d) { + return (el_val_t)((int64_t)d / 1000000000LL); +} + +el_val_t duration_to_millis(el_val_t d) { + return (el_val_t)((int64_t)d / 1000000LL); +} + +el_val_t duration_to_nanos(el_val_t d) { + return (el_val_t)(int64_t)d; +} + +/* sleep(Duration) — Phase 1 replacement for ambiguous sleep(Int). The runtime + * still exposes sleep_secs/sleep_ms for legacy call sites; codegen lowers + * sleep(Duration) to el_sleep_duration(d). Negative durations clamp to 0 so a + * stale deadline doesn't block forever. */ +el_val_t el_sleep_duration(el_val_t dur) { + int64_t ns = (int64_t)dur; + if (ns < 0) ns = 0; + struct timespec ts; + ts.tv_sec = (time_t)(ns / 1000000000LL); + ts.tv_nsec = (long)(ns % 1000000000LL); + nanosleep(&ts, NULL); + return (el_val_t)0; +} + +/* unix_timestamp() — back-compat. Existing El callers expect an Int seconds + * value; this stays an Int returner so the type system isn't disturbed for + * legacy code. New code should call now() and convert when needed. 
*/ +el_val_t unix_timestamp(void) { + return instant_to_unix_seconds(el_now_instant()); +} + +/* TTL cache helpers. Backed by the existing process-wide K/V (state_set/get) + * with a sibling __ttl_at:<key> entry recording the Instant of the last + * write. ttl_cache_get returns "" if the entry is missing or stale, so call + * sites can branch on `if v == "" { miss } else { hit }` — the same shape + * existing get-with-default code uses. No more (now - cached_at) < 60. */ +el_val_t ttl_cache_set(el_val_t key, el_val_t value) { + const char* k = EL_CSTR(key); + if (!k) return (el_val_t)0; + /* Store the value at the user's key. */ + state_set(key, value); + /* Stamp set_at — opaque schema, namespaced under the __ttl_at: prefix so + * ordinary user keys don't collide with stamps. A user key that itself + * starts with __ttl_at: lives in the same state_set namespace and would. + * If the allocation below fails the value is already stored but carries + * no stamp, and 0 is returned. */ + size_t klen = strlen(k); + char* stamp_key = (char*)malloc(klen + 16); + if (!stamp_key) return (el_val_t)0; + snprintf(stamp_key, klen + 16, "__ttl_at:%s", k); + int64_t now_ns = (int64_t)el_now_instant(); + char buf[32]; + snprintf(buf, sizeof(buf), "%lld", (long long)now_ns); + state_set(EL_STR(stamp_key), EL_STR(buf)); + free(stamp_key); + return (el_val_t)1; +} + +el_val_t ttl_cache_get(el_val_t key, el_val_t max_age) { /* Returns the stored value, or "" when the key is NULL, no stamp exists, the stamp is in the future, or the entry is older than max_age. */ + const char* k = EL_CSTR(key); + if (!k) return el_wrap_str(el_strdup("")); + /* Look up stamp. 
*/ + size_t klen = strlen(k); + char* stamp_key = (char*)malloc(klen + 16); + if (!stamp_key) return el_wrap_str(el_strdup("")); + snprintf(stamp_key, klen + 16, "__ttl_at:%s", k); + el_val_t stamp = state_get(EL_STR(stamp_key)); + free(stamp_key); + const char* sv = EL_CSTR(stamp); + if (!sv || !*sv) return el_wrap_str(el_strdup("")); + int64_t set_at = (int64_t)atoll(sv); + int64_t now_ns = (int64_t)el_now_instant(); + int64_t age = now_ns - set_at; + int64_t max_ns = (int64_t)max_age; + if (age < 0) return el_wrap_str(el_strdup("")); /* clock skew — treat as miss */ + if (age > max_ns) return el_wrap_str(el_strdup("")); /* expired: strictly older than max_age, so age == max_age is still a hit */ + return state_get(key); +} + +el_val_t ttl_cache_age(el_val_t key) { /* Returns now minus the stored stamp, or INT64_MAX when the key is NULL, allocation fails, or no stamp is stored. The difference is not clamped, so a stamp in the future yields a negative age. */ + const char* k = EL_CSTR(key); + if (!k) return (el_val_t)INT64_MAX; + size_t klen = strlen(k); + char* stamp_key = (char*)malloc(klen + 16); + if (!stamp_key) return (el_val_t)INT64_MAX; + snprintf(stamp_key, klen + 16, "__ttl_at:%s", k); + el_val_t stamp = state_get(EL_STR(stamp_key)); + free(stamp_key); + const char* sv = EL_CSTR(stamp); + if (!sv || !*sv) return (el_val_t)INT64_MAX; + int64_t set_at = (int64_t)atoll(sv); + int64_t now_ns = (int64_t)el_now_instant(); + return (el_val_t)(now_ns - set_at); +} + /* ── UUID v4 ─────────────────────────────────────────────────────────────── */ static int _el_uuid_seeded = 0; @@ -2909,8 +3889,8 @@ el_val_t str_pad_right(el_val_t s, el_val_t width, el_val_t pad) { return str_pad(EL_CSTR(s), (int64_t)width, EL_CSTR(pad), 0); } -el_val_t str_format(el_val_t template, el_val_t data) { - const char* tpl = EL_CSTR(template); +el_val_t str_format(el_val_t fmt, el_val_t data) { + const char* tpl = EL_CSTR(fmt); if (!tpl) return el_wrap_str(el_strdup("")); JsonBuf b; jb_init(&b); const char* p = tpl; diff --git a/runtime/el_runtime.h b/runtime/el_runtime.h index 4bead7d..6939d01 100644 --- a/runtime/el_runtime.h +++ b/runtime/el_runtime.h @@ -199,6 +199,19 @@ el_val_t http_get_to_file(el_val_t url, el_val_t headers_map, 
el_val_t output_p el_val_t url_encode(el_val_t s); /* RFC 3986 unreserved set */ el_val_t url_decode(el_val_t s); /* '+' → space, %XX → byte */ +/* ── HTML allowlist sanitizer ──────────────────────────────────────────────── + * el_html_sanitize(input_html, allowlist_json) — strict allowlist HTML + * cleaner. State-machine parser; tag/attribute names compared case- + * insensitively against the allowlist; `<a href>` / `<… src>` URL schemes + * validated (http, https, mailto, fragment-only, or relative); whole- + * subtree drop for script / style / iframe / object / embed / form; HTML- + * escapes free text outside dropped subtrees. + * + * The allowlist is JSON of the form + * {"p":[],"a":["href","title"],"strong":[],...} + * where each value is the array of attribute names allowed for that tag. */ +el_val_t el_html_sanitize(el_val_t input_html, el_val_t allowlist_json); + /* ── Filesystem ──────────────────────────────────────────────────────────── */ el_val_t fs_read(el_val_t path); @@ -246,6 +259,63 @@ el_val_t time_from_parts(el_val_t secs, el_val_t ns, el_val_t tz); el_val_t time_add(el_val_t ts, el_val_t n, el_val_t unit); el_val_t time_diff(el_val_t ts1, el_val_t ts2, el_val_t unit); +/* ── Instant + Duration: first-class temporal types ────────────────────────── + * Both types share the el_val_t (int64) slot. Instants are nanoseconds + * since the Unix epoch; Durations are signed nanoseconds. Type discipline + * is enforced at codegen-time: BinOps on names registered as Instant or + * Duration route through the typed wrappers below; mismatches like + * Instant+Instant become #error at the C compiler. + * + * Postfix literals — `30.seconds`, `1.hour`, `500.millis`, `30.nanos` — are + * recognised by the parser as DurationLit AST nodes and lowered to literal + * int64 nanoseconds at codegen time. The runtime never sees the units. 
*/ + +el_val_t el_now_instant(void); +el_val_t now(void); +el_val_t unix_seconds(el_val_t n); +el_val_t unix_millis(el_val_t n); +el_val_t instant_from_iso8601(el_val_t s); + +el_val_t el_duration_from_nanos(el_val_t ns); +el_val_t duration_seconds(el_val_t n); +el_val_t duration_millis(el_val_t n); +el_val_t duration_nanos(el_val_t n); + +el_val_t el_instant_add_dur(el_val_t inst, el_val_t dur); +el_val_t el_instant_sub_dur(el_val_t inst, el_val_t dur); +el_val_t el_instant_diff(el_val_t a, el_val_t b); +el_val_t el_duration_add(el_val_t a, el_val_t b); +el_val_t el_duration_sub(el_val_t a, el_val_t b); +el_val_t el_duration_scale(el_val_t dur, el_val_t scalar); +el_val_t el_duration_div(el_val_t dur, el_val_t scalar); + +el_val_t el_instant_lt(el_val_t a, el_val_t b); +el_val_t el_instant_le(el_val_t a, el_val_t b); +el_val_t el_instant_gt(el_val_t a, el_val_t b); +el_val_t el_instant_ge(el_val_t a, el_val_t b); +el_val_t el_instant_eq(el_val_t a, el_val_t b); +el_val_t el_instant_ne(el_val_t a, el_val_t b); +el_val_t el_duration_lt(el_val_t a, el_val_t b); +el_val_t el_duration_le(el_val_t a, el_val_t b); +el_val_t el_duration_gt(el_val_t a, el_val_t b); +el_val_t el_duration_ge(el_val_t a, el_val_t b); +el_val_t el_duration_eq(el_val_t a, el_val_t b); +el_val_t el_duration_ne(el_val_t a, el_val_t b); + +el_val_t instant_to_unix_seconds(el_val_t i); +el_val_t instant_to_unix_millis(el_val_t i); +el_val_t instant_to_iso8601(el_val_t i); +el_val_t duration_to_seconds(el_val_t d); +el_val_t duration_to_millis(el_val_t d); +el_val_t duration_to_nanos(el_val_t d); + +el_val_t el_sleep_duration(el_val_t dur); +el_val_t unix_timestamp(void); + +el_val_t ttl_cache_set(el_val_t key, el_val_t value); +el_val_t ttl_cache_get(el_val_t key, el_val_t max_age); +el_val_t ttl_cache_age(el_val_t key); + /* ── UUID ────────────────────────────────────────────────────────────────── */ el_val_t uuid_new(void); @@ -288,7 +358,7 @@ el_val_t str_char_at(el_val_t s, el_val_t i); 
el_val_t str_char_code(el_val_t s, el_val_t i); el_val_t str_pad_left(el_val_t s, el_val_t width, el_val_t pad); el_val_t str_pad_right(el_val_t s, el_val_t width, el_val_t pad); -el_val_t str_format(el_val_t template, el_val_t data); +el_val_t str_format(el_val_t fmt, el_val_t data); el_val_t str_lower(el_val_t s); el_val_t str_upper(el_val_t s); diff --git a/src/gallery.el b/src/gallery.el index 7fdc951..01528df 100644 --- a/src/gallery.el +++ b/src/gallery.el @@ -1,45 +1,13 @@ // components/gallery.el - "Things Neuron Said" gallery page. // Per-card auth-gated voting via supabase-js + /api/vote. -// gal_sanitize_html — defence-in-depth strip of dangerous HTML for the -// gallery preview. Mirrors sanitize_share_html in main.el exactly so the -// /said gallery and /share/ render the same allowlist. The DB column +// gallery_share_allowlist — same allowlist as default_share_allowlist in +// main.el. Inlined here so this component is self-contained: the build +// concat order puts gallery.el before main.el, so a top-level reference to +// main.el's binding would forward-reference at the C level. The DB column // is already sanitized at write time; this is belt-and-braces in case a // row was inserted out-of-band. 
-fn gal_sanitize_html(html: String) -> String { - let s1: String = str_replace(str_replace(html, "", "/script-->"), "", "/script-->") - let s3: String = str_replace(str_replace(s2, "", "/iframe-->"), "", "/iframe-->") - let s5: String = str_replace(str_replace(s4, "", "/style-->"), "", "/style-->") - let s7: String = str_replace(str_replace(s6, "", "/object-->"), "", "/object-->") - let s9: String = str_replace(str_replace(s8, "", "/form-->") - let s12: String = str_replace(str_replace(s11, " String { let i: Int = 0 @@ -66,7 +34,7 @@ fn gallery_page(cards_json: String, supabase_url: String, supabase_anon_key: Str let a_html: String = if !has_html { str_replace(str_replace(str_replace(ca, "&", "&amp;"), "<", "&lt;"), ">", "&gt;") } else { - let s: String = gal_sanitize_html(ca_html_raw) + let s: String = el_html_sanitize(ca_html_raw, gallery_share_allowlist) if str_len(s) > 600 { str_slice(s, 0, 600) + "..." } else { s } } let ts_raw: String = json_get(card, "created_at") diff --git a/src/main.el b/src/main.el index be92d7c..6981a14 100644 --- a/src/main.el +++ b/src/main.el @@ -46,6 +46,21 @@ from safety import { safety } from gallery import { gallery_page } from account import { account_page } +// ── Share-card HTML allowlist ───────────────────────────────────────────────── +// +// Tag-and-attribute allowlist passed to el_html_sanitize for /api/share and +// /share/<id>. Anything not on this list is dropped at the runtime level by +// the strict state-machine parser. The previous denylist sanitizer was +// retired (root-cause replacement, not a bandaid): it could be bypassed by +// a literal --> inside an attacker-supplied attribute value, and every new +// vector required a code change. +// +// Empty array means tag is allowed but no attributes survive. The sanitizer +// also validates `<a href>` schemes (only http/https/mailto/fragment/relative +// pass) and drops the entire subtree of script/style/iframe/object/embed/ +// form regardless of allowlist contents. 
+let default_share_allowlist: String = "{\"p\":[],\"br\":[],\"strong\":[],\"em\":[],\"u\":[],\"s\":[],\"code\":[],\"pre\":[],\"ul\":[],\"ol\":[],\"li\":[],\"h1\":[],\"h2\":[],\"h3\":[],\"h4\":[],\"blockquote\":[],\"a\":[\"href\",\"title\"]}" + // ── Founding counter ────────────────────────────────────────────────────────── let FOUNDING_TOTAL: Int = 1000 @@ -172,59 +187,6 @@ fn page(sold: Int, total: Int) -> String { // ── Share card page ─────────────────────────────────────────────────────────── -// sanitize_share_html — strip dangerous HTML before storing/serving a share -// card. Defence in depth: marked.js client-side already escapes most things, -// but we never trust client-rendered HTML round-tripped through a public API. -// Rules: -// - Lowercase the working copy for tag matching (then operate on the -// original to preserve case-insensitive replacements via case-folded -// dance is overkill here; instead we run the replace on both cases). -// - Strip whole tags (open + close + body) for: script, iframe, style, -// object, embed, form. We replace each opener with a comment marker so -// the closer-stripper sees a tagless region. -// - Strip on*= event-handler attributes (onclick, onload, onerror, ...). -// - Strip javascript: URIs in href/src. -fn sanitize_share_html(html: String) -> String { - // Tag stripper: replace `", "/script-->"), "", "/script-->") - let s3: String = str_replace(str_replace(s2, "", "/iframe-->"), "", "/iframe-->") - let s5: String = str_replace(str_replace(s4, "", "/style-->"), "", "/style-->") - let s7: String = str_replace(str_replace(s6, "", "/object-->"), "", "/object-->") - let s9: String = str_replace(str_replace(s8, "", "/form-->") - let s12: String = str_replace(str_replace(s11, " String { let q_html: String = str_replace(str_replace(str_replace(question, "&", "&amp;"), "<", "&lt;"), ">", "&gt;") // answer_html_in is sanitized, marked.js-rendered HTML. 
Fall back to @@ -232,7 +194,7 @@ fn share_card_page(question: String, answer_plain: String, answer_html_in: Strin let a_html: String = if str_eq(answer_html_in, "") { str_replace(str_replace(str_replace(answer_plain, "&", "&amp;"), "<", "&lt;"), ">", "&gt;") } else { - sanitize_share_html(answer_html_in) + el_html_sanitize(answer_html_in, default_share_allowlist) } // Use plaintext for og:description so social previews are readable. let answer: String = answer_plain @@ -1365,9 +1327,12 @@ fn handle_request(method: String, path: String, body: String) -> String { // - answer_plaintext, when supplied, takes precedence as the og:desc / // gallery body. Falls back to `answer`. // - // The handler sanitizes answer_html (sanitize_share_html: strip - // script/iframe/style/object/embed/form/link/meta/base + on*= attrs + - // javascript: URIs) before storing. + // The handler sanitizes answer_html via the runtime allowlist sanitizer + // (el_html_sanitize): only the tags and attributes named in + // default_share_allowlist survive; everything else is dropped. Whole + // subtrees of script/style/iframe/object/embed/form are removed + // regardless of the allowlist; URL schemes are restricted to + // http/https/mailto/fragment/relative. if str_eq(path, "/api/share") { if !str_eq(method, "POST") { return "{\"error\":\"POST required\"}" @@ -1403,7 +1368,7 @@ fn handle_request(method: String, path: String, body: String) -> String { // sanitized server-side. Storing it lets /said render the // same formatted preview as /share/<id>. Empty -> omit the // column (legacy fallback path). 
- let html_sanitized: String = if str_eq(answer_html_raw, "") { "" } else { sanitize_share_html(answer_html_raw) } + let html_sanitized: String = if str_eq(answer_html_raw, "") { "" } else { el_html_sanitize(answer_html_raw, default_share_allowlist) } let h_safe: String = str_replace(str_replace(str_replace(str_replace(html_sanitized, "\\", "\\\\"), "\"", "\\\""), "\n", "\\n"), "\r", "\\r") let html_field: String = if str_eq(html_sanitized, "") { "" } else { ",\"answer_html\":\"" + h_safe + "\"" } let card_row: String = "{\"id\":\"" + id + "\",\"question\":\"" + q_safe + "\",\"answer\":\"" + a_safe + "\"" + html_field + "}"