runtime: rename str_format param 'template' to 'fmt'
`template` is a reserved keyword in C++; although it is not reserved in C, using it as a parameter name prevents this header from ever being included from C++ code. Use `fmt` instead, matching the printf-family convention. The deeper question of whether string-template substitution is the right abstraction for our substrate is filed separately in the backlog.
This commit is contained in:
@@ -1968,6 +1968,526 @@ el_val_t url_decode(el_val_t sv) {
|
||||
return el_wrap_str(out);
|
||||
}
|
||||
|
||||
/* ── HTML allowlist sanitizer ────────────────────────────────────────────────
|
||||
* el_html_sanitize(input, allowlist_json)
|
||||
*
|
||||
* Strict allowlist HTML cleaner. Replaces the older denylist patterns
|
||||
* (str_replace cascades that wrapped dangerous tags in HTML comments and
|
||||
* renamed `on*` attributes). The denylist approach is fragile: comment-
|
||||
* wrapping can be re-broken by a literal `-->` inside an attacker-supplied
|
||||
* attribute value, and every new attack vector requires a code change.
|
||||
*
|
||||
* Design:
|
||||
* - Single-pass byte-level state machine.
|
||||
* - Tag and attribute names are matched case-insensitively against the
|
||||
* allowlist. Unknown tags are dropped entirely (the open and close
|
||||
* markers are stripped; their inner text content survives, escaped).
|
||||
* - A small set of "dangerous container" tags (script, style, iframe,
|
||||
* object, embed, form, plus a few rarer ones) drop themselves AND
|
||||
* their full subtree — text between `<script>` and `</script>` is
|
||||
* CDATA-like and must not be re-emitted as escaped text either.
|
||||
* - Comments (<!-- -->), doctype (<!DOCTYPE>), CDATA (<![CDATA[...]]>),
|
||||
* and processing instructions (<?...?>) are dropped entirely.
|
||||
* - Text content outside dropped subtrees is HTML-escaped (<, >, ", '); a literal `&` is passed through unchanged so pre-encoded entities survive.
|
||||
* - Attribute values are unquoted/dequoted, then re-emitted with double
|
||||
* quotes around the cleanly-escaped value.
|
||||
* - For `<a href>` and any `src` attribute, the URL scheme is validated:
|
||||
* only http:, https:, mailto:, fragment-only `#anchor`, or relative
|
||||
* paths are allowed. Anything else (javascript:, data:, vbscript:,
|
||||
* about:, file:, etc.) drops the attribute.
|
||||
* - Self-closing void tags (br, hr, img, etc.) emit without a close tag.
|
||||
* - Malformed input (unclosed tag at EOF, bad attribute syntax) drops
|
||||
* the pending tag and continues. Pre-encoded entities (`&lt;`, `&amp;`,
|
||||
* etc.) are passed through verbatim — the browser will decode them
|
||||
* safely on render.
|
||||
*
|
||||
* Allowlist format (JSON string):
|
||||
* {"p":[],"a":["href","title"],"strong":[],...}
|
||||
* - Key = lowercase tag name.
|
||||
* - Value = JSON array of allowed attribute names (lowercase).
|
||||
* - Empty array means tag allowed but no attributes survive.
|
||||
*
|
||||
* Output is a freshly-allocated arena-tracked el_val_t string. */
|
||||
|
||||
/* Internal byte buffer with realloc-doubling. Used during sanitization;
|
||||
* the final result is copied into an arena-tracked el_strbuf so the caller
|
||||
* sees standard runtime memory semantics. */
|
||||
/* Growable byte buffer. `data` is heap storage owned by the buffer; `len`
 * is the number of bytes stored and `cap` the number of bytes allocated.
 * The helpers below always keep room for one extra byte past `len` so the
 * caller can NUL-terminate, but they never write that terminator themselves. */
typedef struct {
    char*  data;
    size_t len;
    size_t cap;
} html_buf_t;

/* Start with a 256-byte allocation. Allocation failure is fatal. */
static void html_buf_init(html_buf_t* b) {
    b->cap = 256;
    b->data = malloc(b->cap);
    if (!b->data) { fputs("el_runtime: out of memory\n", stderr); exit(1); }
    b->len = 0;
}

/* Ensure there is room for `need` more bytes plus one spare byte.
 *
 * Capacity grows by doubling. Two corner cases are handled explicitly:
 *   - cap == 0 (a buffer that was released with html_buf_free): doubling
 *     zero never terminates, so growth restarts from 256 bytes;
 *   - size_t wrap-around: a request that cannot be represented is treated
 *     as out-of-memory instead of silently under-allocating. */
static void html_buf_grow(html_buf_t* b, size_t need) {
    const size_t max = (size_t)-1;
    if (need >= max - b->len) {
        fputs("el_runtime: out of memory\n", stderr);
        exit(1);
    }
    size_t want = b->len + need + 1;
    if (want <= b->cap) return;

    size_t nc = b->cap ? b->cap : 256;
    while (nc < want) {
        if (nc > max / 2) { nc = want; break; }   /* doubling would wrap */
        nc *= 2;
    }
    /* Keep the old pointer until realloc succeeds. */
    char* nd = realloc(b->data, nc);
    if (!nd) { fputs("el_runtime: out of memory\n", stderr); exit(1); }
    b->data = nd;
    b->cap = nc;
}

/* Append one byte. */
static void html_buf_putc(html_buf_t* b, char c) {
    html_buf_grow(b, 1);
    b->data[b->len++] = c;
}

/* Append a NUL-terminated string (without its terminator). NULL is a no-op. */
static void html_buf_puts(html_buf_t* b, const char* s) {
    if (!s) return;
    size_t n = strlen(s);
    html_buf_grow(b, n);
    memcpy(b->data + b->len, s, n);
    b->len += n;
}

/* Release the storage and reset the buffer to the empty state. */
static void html_buf_free(html_buf_t* b) {
    free(b->data);
    b->data = NULL;
    b->len = b->cap = 0;
}
|
||||
|
||||
/* ASCII tolower, locale-independent. */
|
||||
/* Map 'A'..'Z' to 'a'..'z'; every other value is returned unchanged.
 * Does not consult the C locale. */
static int html_tolower(int c) {
    if (c >= 'A' && c <= 'Z') return c - 'A' + 'a';
    return c;
}

/* Case-insensitive ASCII compare of [a, a+n) against c-string `s`.
 * Returns 1 iff strlen(s) == n and all bytes are equal under html_tolower.
 * A NULL `a` or `s` never matches. */
static int html_ieq_n(const char* a, size_t n, const char* s) {
    if (!a || !s) return 0;
    if (strlen(s) != n) return 0;
    for (size_t i = 0; i < n; i++) {
        int x = html_tolower((unsigned char)a[i]);
        int y = html_tolower((unsigned char)s[i]);
        if (x != y) return 0;
    }
    return 1;
}

/* Case-insensitive ASCII compare of two byte slices of length `n`.
 * Zero-length slices compare equal; otherwise a NULL slice never matches
 * (previously it was dereferenced). */
static int html_iemem(const char* a, const char* b, size_t n) {
    if (n == 0) return 1;
    if (!a || !b) return 0;
    for (size_t i = 0; i < n; i++) {
        int x = html_tolower((unsigned char)a[i]);
        int y = html_tolower((unsigned char)b[i]);
        if (x != y) return 0;
    }
    return 1;
}
|
||||
|
||||
/* Walk a JSON allowlist object and find the value (an array) for a given
|
||||
* tag key, comparing case-insensitively. On hit returns a pointer to the
|
||||
* opening `[` of the array and writes the byte length of the array span
|
||||
* (including the brackets) to *out_len. On miss returns NULL.
|
||||
*
|
||||
* The parser is intentionally tiny: it does not handle escapes inside
|
||||
* keys (allowlist authors do not need them), and it relies on balanced
|
||||
* brackets/quotes within the value array. */
|
||||
/* Scan the allowlist object `allow` for the key equal (ASCII case-
 * insensitively) to tag[0..tag_len). On a hit, returns a pointer to the `[`
 * that opens the key's value array and stores the array's byte length,
 * brackets included, in *out_len (when out_len is non-NULL). Returns NULL
 * when the key is absent or the JSON does not have the expected
 * {"key":[...], ...} shape.
 *
 * An array that is never closed is rejected: the returned span is promised
 * to end on `]`, and failing closed is the safer choice for a sanitizer. */
static const char* html_allowlist_find(const char* allow, const char* tag,
                                       size_t tag_len, size_t* out_len) {
    if (!allow || !tag) return NULL;

    const char* p = allow;
    while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') p++;
    if (*p != '{') return NULL;
    p++;

    for (;;) {
        /* Separators between entries. */
        while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r' || *p == ',') p++;
        /* `}` or end of input means no more entries; anything other than a
         * key's opening quote is malformed. All three give a miss. */
        if (*p != '"') return NULL;
        p++;

        /* Key: everything up to the next quote (escapes are not supported). */
        const char* key = p;
        while (*p && *p != '"') p++;
        if (*p != '"') return NULL;
        size_t key_len = (size_t)(p - key);
        p++;

        while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') p++;
        if (*p != ':') return NULL;
        p++;
        while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') p++;
        if (*p != '[') return NULL;

        /* Walk to the matching `]`, ignoring brackets inside strings and
         * stepping over backslash escapes inside strings. */
        const char* arr_start = p;
        int depth = 0;
        int in_str = 0;
        int closed = 0;
        while (*p) {
            char c = *p;
            if (in_str) {
                if (c == '\\' && p[1]) { p += 2; continue; }
                if (c == '"') in_str = 0;
            } else if (c == '"') {
                in_str = 1;
            } else if (c == '[') {
                depth++;
            } else if (c == ']') {
                depth--;
                if (depth == 0) { p++; closed = 1; break; }
            }
            p++;
        }
        if (!closed) return NULL;

        if (key_len == tag_len && html_iemem(key, tag, key_len)) {
            if (out_len) *out_len = (size_t)(p - arr_start);
            return arr_start;
        }
    }
}
|
||||
|
||||
/* Returns 1 iff `attr` (length attr_len) appears as a string element
|
||||
* in the JSON array slice [arr, arr+arr_len). Comparison is case-
|
||||
* insensitive. */
|
||||
/* Returns 1 iff attr[0..attr_len) equals, ASCII case-insensitively, one of
 * the string elements of the JSON array slice [arr, arr+arr_len). The slice
 * is expected to start with `[` and end with `]`; only the bytes between
 * them are examined. Any element that is not a quoted string, or a string
 * that is not closed inside the slice, ends the search with 0. */
static int html_attr_in_array(const char* arr, size_t arr_len,
                              const char* attr, size_t attr_len) {
    if (!arr || !attr || arr_len < 2) return 0;

    const char* p = arr + 1;                 /* first byte after `[` */
    const char* const end = arr + arr_len - 1;   /* the closing `]` */

    while (p < end) {
        while (p < end &&
               (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r' || *p == ',')) {
            p++;
        }
        if (p >= end) return 0;
        if (*p != '"') return 0;
        p++;

        const char* elem = p;
        while (p < end && *p != '"') {
            /* A backslash keeps the following byte from ending the string. */
            if (*p == '\\' && p + 1 < end) p++;
            p++;
        }
        if (p >= end) return 0;              /* unterminated string */

        size_t elem_len = (size_t)(p - elem);
        p++;                                 /* past the closing quote */
        if (elem_len == attr_len && html_iemem(elem, attr, elem_len)) return 1;
    }
    return 0;
}
|
||||
|
||||
/* Hard-coded set of tags whose content is ALSO dropped (entire subtree). */
|
||||
/* Returns 1 iff tag[0..tag_len) names, case-insensitively, one of the
 * elements in the fixed table below. */
static int html_is_dangerous_container(const char* tag, size_t tag_len) {
    static const char* const names[] = {
        "script", "style", "iframe", "object", "embed", "form",
        "noscript", "noembed", "template", "svg", "math", "frame",
        "frameset", "applet", "audio", "video", "source", "track",
        NULL
    };
    for (const char* const* n = names; *n; n++) {
        if (html_ieq_n(tag, tag_len, *n)) return 1;
    }
    return 0;
}
|
||||
|
||||
/* HTML void elements — emit without a close tag. */
|
||||
/* Returns 1 iff tag[0..tag_len) names, case-insensitively, one of the void
 * elements in the fixed table below. */
static int html_is_void(const char* tag, size_t tag_len) {
    static const char* const names[] = {
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
        NULL
    };
    for (const char* const* n = names; *n; n++) {
        if (html_ieq_n(tag, tag_len, *n)) return 1;
    }
    return 0;
}
|
||||
|
||||
/* Append a single byte HTML-escaped into the output buffer. */
|
||||
/* Append `c` to `out`, replacing the four markup-significant bytes
 * < > " ' with their character references. Every other byte, including
 * `&`, is appended unchanged. */
static void html_escape_byte(html_buf_t* out, unsigned char c) {
    switch (c) {
    case '<':  html_buf_puts(out, "&lt;");   break;
    case '>':  html_buf_puts(out, "&gt;");   break;
    case '"':  html_buf_puts(out, "&quot;"); break;
    case '\'': html_buf_puts(out, "&#39;");  break;
    default:   html_buf_putc(out, (char)c);  break;
    }
}
|
||||
|
||||
/* Validate a URL value against the allowlist of safe schemes for hrefs.
|
||||
* Returns 1 iff the URL is safe to emit. Acceptable forms:
|
||||
* - http:// or https:// (case-insensitive)
|
||||
* - mailto:
|
||||
* - fragment-only `#anchor`
|
||||
* - relative path that does not contain a colon before the first
|
||||
* slash/?/# (so `foo/bar`, `/foo`, `?x=1` are OK; `javascript:x` is
|
||||
* not — its colon precedes any path/hash/query separator).
|
||||
*
|
||||
* URL leading whitespace and embedded ASCII control bytes (TAB, LF, CR)
|
||||
* are stripped before the scheme test, mirroring how browsers normalise
|
||||
* URLs (these bytes are otherwise a known XSS bypass: `java\tscript:`). */
|
||||
/* Decide whether url[0..len) may be emitted as an href/src value.
 * Returns 1 for: empty or whitespace-only values, values whose first
 * non-blank byte is `#`, `/` or `?`, values with no `:` before the first
 * `/`, `?` or `#`, and values whose scheme is http, https or mailto
 * (compared case-insensitively after removing TAB/LF/CR/VT/FF).
 * Returns 0 for every other scheme.
 *
 * Also returns 0 when a `&` appears where a scheme could be. The sanitizer
 * emits `&` verbatim, so the browser would decode a character reference
 * there: `java&#115;cript:x` or `javascript&colon;x` would otherwise be
 * classified as a relative path here and become `javascript:x` on render.
 * The cost is that a relative URL with `&` before its first `/` or `?`
 * is refused. */
static int html_url_is_safe(const char* url, size_t len) {
    if (!url || len == 0) return 1;              /* empty href is harmless */

    /* Skip leading blanks. */
    size_t i = 0;
    while (i < len) {
        unsigned char c = (unsigned char)url[i];
        if (c != ' ' && c != '\t' && c != '\n' && c != '\r' && c != 0x0B && c != 0x0C) break;
        i++;
    }
    if (i >= len) return 1;                      /* whitespace only */

    switch (url[i]) {
    case '#':                                    /* fragment only */
    case '/':                                    /* absolute-path or //host */
    case '?':                                    /* query-only relative */
        return 1;
    default:
        break;
    }

    /* Locate the `:` that would end a scheme, stopping at the first byte
     * that proves there is no scheme. */
    size_t scheme_end = (size_t)-1;
    for (size_t j = i; j < len; j++) {
        char c = url[j];
        if (c == ':') { scheme_end = j; break; }
        if (c == '&') return 0;                  /* possible encoded scheme byte */
        if (c == '/' || c == '?' || c == '#') break;
    }
    if (scheme_end == (size_t)-1) return 1;      /* no colon: relative path */

    /* Normalise the scheme: drop embedded control bytes, lowercase the rest.
     * Over-long schemes are truncated and therefore never match below. */
    char scheme[32];
    size_t sl = 0;
    for (size_t j = i; j < scheme_end && sl < sizeof(scheme) - 1; j++) {
        unsigned char c = (unsigned char)url[j];
        if (c == '\t' || c == '\n' || c == '\r' || c == 0x0B || c == 0x0C) continue;
        scheme[sl++] = (char)html_tolower(c);
    }
    scheme[sl] = '\0';

    return strcmp(scheme, "http") == 0 ||
           strcmp(scheme, "https") == 0 ||
           strcmp(scheme, "mailto") == 0;
}
|
||||
|
||||
/* 1 iff `x` may continue a tag name: ASCII letter, digit, `-`, `_` or `:`. */
static int html_is_name_byte(unsigned char x) {
    return (x >= 'a' && x <= 'z') || (x >= 'A' && x <= 'Z') ||
           (x >= '0' && x <= '9') || x == '-' || x == '_' || x == ':';
}

/* 1 iff `x` is one of the blanks recognised between attributes. */
static int html_is_attr_space(unsigned char x) {
    return x == ' ' || x == '\t' || x == '\n' || x == '\r';
}

/* el_html_sanitize(input, allowlist_json)
 *
 * Walks `input` once and builds a cleaned copy:
 *   - text bytes are escaped with html_escape_byte, except `&`, which is
 *     copied as-is so existing character references keep working;
 *   - comments, `<!...>` declarations and `<?...>` instructions are removed;
 *   - a `<` that does not start a tag is emitted as `&lt;`;
 *   - tags reported by html_is_dangerous_container are removed together
 *     with everything up to their matching close tag (or to end of input
 *     when there is none);
 *   - tags missing from the allowlist are removed, their content kept;
 *   - allowed tags are re-emitted in lowercase with only the allowlisted
 *     attributes; href/src values must pass html_url_is_safe; values are
 *     re-quoted with double quotes.
 * A tag that is still open at end of input discards the rest of the input.
 *
 * A NULL input yields ""; a NULL allowlist is treated as "{}". The result
 * is copied into a buffer obtained from el_strbuf and wrapped with
 * el_wrap_str. */
el_val_t el_html_sanitize(el_val_t input_v, el_val_t allowlist_v) {
    const char* input = EL_CSTR(input_v);
    const char* allow = EL_CSTR(allowlist_v);
    if (!input) return el_wrap_str(el_strdup(""));
    if (!allow) allow = "{}";
    size_t in_len = strlen(input);

    html_buf_t out;
    html_buf_init(&out);

    size_t i = 0;
    while (i < in_len) {
        unsigned char c = (unsigned char)input[i];

        /* ── Plain text ─────────────────────────────────────────────── */
        if (c != '<') {
            /* `&` passes through verbatim to preserve pre-encoded entities
             * (`&lt;`, `&amp;`, `&#x...;`). */
            if (c == '&') html_buf_putc(&out, '&');
            else html_escape_byte(&out, c);
            i++;
            continue;
        }

        /* ── `<` as the very last byte ──────────────────────────────── */
        if (i + 1 >= in_len) {
            html_buf_puts(&out, "&lt;");
            i++;
            continue;
        }

        /* ── Comments, doctype, CDATA, processing instructions ──────── */
        if (input[i + 1] == '!' &&
            i + 3 < in_len && input[i + 2] == '-' && input[i + 3] == '-') {
            /* `<!-- ... -->`: skip through the terminating `-->`. */
            size_t j = i + 4;
            while (j + 2 < in_len &&
                   !(input[j] == '-' && input[j + 1] == '-' && input[j + 2] == '>')) {
                j++;
            }
            i = (j + 2 < in_len) ? j + 3 : in_len;
            continue;
        }
        if (input[i + 1] == '!' || input[i + 1] == '?') {
            /* `<!...>` or `<?...>`: skip through the next `>`. */
            size_t j = i + 2;
            while (j < in_len && input[j] != '>') j++;
            i = (j < in_len) ? j + 1 : in_len;
            continue;
        }

        /* ── Tag name ───────────────────────────────────────────────── */
        int is_close = 0;
        size_t name_start = i + 1;
        if (input[i + 1] == '/') {
            is_close = 1;
            name_start = i + 2;
        }
        if (name_start >= in_len) {
            html_buf_puts(&out, "&lt;");
            i++;
            continue;
        }
        unsigned char first = (unsigned char)input[name_start];
        if (!((first >= 'a' && first <= 'z') || (first >= 'A' && first <= 'Z'))) {
            /* `<` followed by a non-letter is ordinary text. */
            html_buf_puts(&out, "&lt;");
            i++;
            continue;
        }
        size_t name_end = name_start;
        while (name_end < in_len && html_is_name_byte((unsigned char)input[name_end])) {
            name_end++;
        }
        const char* tag = input + name_start;
        size_t tag_len = name_end - name_start;

        /* ── Find the `>` that ends the tag, skipping quoted runs ───── */
        size_t cur = name_end;
        int self_close = 0;
        while (cur < in_len) {
            unsigned char x = (unsigned char)input[cur];
            if (x == '"' || x == '\'') {
                unsigned char q = x;
                cur++;
                while (cur < in_len && (unsigned char)input[cur] != q) cur++;
                if (cur < in_len) cur++;         /* skip closing quote */
                continue;
            }
            if (x == '/' && cur + 1 < in_len && input[cur + 1] == '>') {
                self_close = 1;
                break;
            }
            if (x == '>') break;
            cur++;
        }
        if (cur >= in_len) {
            /* Unclosed tag at end of input: drop the rest. */
            i = in_len;
            continue;
        }
        size_t tag_end = self_close ? cur + 2 : cur + 1;   /* one past `>` */

        /* ── Dangerous container: drop the whole subtree ────────────── */
        if (!is_close && html_is_dangerous_container(tag, tag_len)) {
            if (self_close || html_is_void(tag, tag_len)) {
                i = tag_end;
                continue;
            }
            size_t scan = tag_end;
            int found_close = 0;
            while (scan < in_len) {
                if (input[scan] != '<') { scan++; continue; }
                if (scan + 1 < in_len && input[scan + 1] == '/') {
                    size_t cn_start = scan + 2;
                    size_t cn_end = cn_start;
                    while (cn_end < in_len &&
                           html_is_name_byte((unsigned char)input[cn_end])) {
                        cn_end++;
                    }
                    if (cn_end - cn_start == tag_len &&
                        html_iemem(input + cn_start, tag, tag_len)) {
                        size_t end_close = cn_end;
                        while (end_close < in_len && input[end_close] != '>') end_close++;
                        i = (end_close < in_len) ? end_close + 1 : in_len;
                        found_close = 1;
                        break;
                    }
                }
                scan++;
            }
            if (!found_close) {
                /* No matching close: drop everything from here on. */
                i = in_len;
            }
            continue;
        }

        /* ── Allowlist lookup ───────────────────────────────────────── */
        size_t arr_len = 0;
        const char* arr = html_allowlist_find(allow, tag, tag_len, &arr_len);
        if (!arr) {
            /* Tag not allowed: drop the marker; the content that follows is
             * handled by later iterations of this loop. */
            i = tag_end;
            continue;
        }

        /* ── Allowed close tag ──────────────────────────────────────── */
        if (is_close) {
            if (!html_is_void(tag, tag_len)) {
                html_buf_putc(&out, '<');
                html_buf_putc(&out, '/');
                for (size_t k = 0; k < tag_len; k++) {
                    html_buf_putc(&out, (char)html_tolower((unsigned char)tag[k]));
                }
                html_buf_putc(&out, '>');
            }
            i = tag_end;
            continue;
        }

        /* ── Allowed open tag: `<name`, then the surviving attributes ─ */
        html_buf_putc(&out, '<');
        for (size_t k = 0; k < tag_len; k++) {
            html_buf_putc(&out, (char)html_tolower((unsigned char)tag[k]));
        }

        size_t a = name_end;
        while (a < cur) {
            unsigned char x = (unsigned char)input[a];
            if (html_is_attr_space(x) || x == '/') { a++; continue; }

            /* Attribute name. */
            size_t an_start = a;
            while (a < cur) {
                unsigned char y = (unsigned char)input[a];
                if (y == '=' || html_is_attr_space(y) || y == '/' || y == '>') break;
                a++;
            }
            size_t an_len = a - an_start;
            if (an_len == 0) { a++; continue; }  /* stray `=` or `>` */

            /* Optional `= value`, quoted or bare. */
            size_t av_start = 0;
            size_t av_len = 0;
            int has_value = 0;
            size_t b = a;
            while (b < cur && html_is_attr_space((unsigned char)input[b])) b++;
            if (b < cur && input[b] == '=') {
                has_value = 1;
                b++;
                while (b < cur && html_is_attr_space((unsigned char)input[b])) b++;
                if (b < cur && (input[b] == '"' || input[b] == '\'')) {
                    unsigned char q = (unsigned char)input[b];
                    b++;
                    av_start = b;
                    while (b < cur && (unsigned char)input[b] != q) b++;
                    av_len = b - av_start;
                    if (b < cur) b++;            /* skip closing quote */
                } else {
                    av_start = b;
                    while (b < cur) {
                        unsigned char y = (unsigned char)input[b];
                        if (html_is_attr_space(y) || y == '>') break;
                        b++;
                    }
                    av_len = b - av_start;
                }
                a = b;
            }

            if (!html_attr_in_array(arr, arr_len, input + an_start, an_len)) continue;

            int is_href = (an_len == 4 && html_iemem(input + an_start, "href", 4));
            int is_src  = (an_len == 3 && html_iemem(input + an_start, "src", 3));
            if ((is_href || is_src) && has_value &&
                !html_url_is_safe(input + av_start, av_len)) {
                continue;
            }

            html_buf_putc(&out, ' ');
            for (size_t k = 0; k < an_len; k++) {
                html_buf_putc(&out, (char)html_tolower((unsigned char)input[an_start + k]));
            }
            if (has_value) {
                html_buf_puts(&out, "=\"");
                for (size_t k = 0; k < av_len; k++) {
                    unsigned char y = (unsigned char)input[av_start + k];
                    /* Keep the emitted value well-formed inside double
                     * quotes. `&` passes through to preserve pre-encoded
                     * entities. */
                    if (y == '"') html_buf_puts(&out, "&quot;");
                    else if (y == '<') html_buf_puts(&out, "&lt;");
                    else if (y == '>') html_buf_puts(&out, "&gt;");
                    else html_buf_putc(&out, (char)y);
                }
                html_buf_putc(&out, '"');
            }
        }
        html_buf_putc(&out, '>');
        i = tag_end;
    }

    /* Hand the bytes over to a runtime-owned buffer and release ours. */
    char* result = el_strbuf(out.len);
    memcpy(result, out.data, out.len);
    result[out.len] = '\0';
    html_buf_free(&out);
    return el_wrap_str(result);
}
|
||||
|
||||
/* ── JSON ────────────────────────────────────────────────────────────────── */
|
||||
|
||||
/* True iff the segment is non-empty and every byte is an ASCII digit. We treat
|
||||
@@ -2827,6 +3347,283 @@ el_val_t sleep_ms(el_val_t ms) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* ── Instant + Duration: first-class temporal types ──────────────────────────
|
||||
* El's substrate (Neuron) is a temporal cognition system. Memory salience
|
||||
* decay, the six-tier pacemaker, TTL caches, and supersession are all
|
||||
* temporal. Treating time as a raw Int (now() returning ms-since-epoch and
|
||||
* arithmetic done with mixed unit literals) lets bugs through the type
|
||||
* system: `(now - cached_at) < 60` cannot tell ms from sec, and `sleep(30)`
|
||||
* is ambiguous. This block introduces two dedicated representations.
|
||||
*
|
||||
* Representation:
|
||||
* Instant — int64 nanoseconds since the Unix epoch
|
||||
* Duration — int64 nanoseconds (signed; negative durations are legal,
|
||||
* e.g. when a deadline has passed)
|
||||
*
|
||||
* Both share the el_val_t (int64) slot the rest of the runtime uses, so no
|
||||
* boxing / arena allocation is needed. Type discipline is enforced at the
|
||||
* codegen layer: `let x: Duration = ...` registers `x` in __duration_names,
|
||||
* and BinOp dispatches through typed wrappers (el_duration_add, etc.) that
|
||||
* make intent explicit in the generated C. Mismatched ops (Instant+Instant,
|
||||
* Duration+Int) are surfaced via #error directives at codegen time so the
|
||||
* downstream cc step fails with a clear El-source-level message.
|
||||
*
|
||||
* Nanosecond precision matches POSIX clock_gettime / nanosleep granularity.
|
||||
* 2^63 nanos covers ~292 years from epoch — comfortably past 2200, plenty
|
||||
* for a memory-system runtime that never schedules outside a human lifespan.
|
||||
*/
|
||||
|
||||
/* now() — current Instant. Wraps clock_gettime(CLOCK_REALTIME) for nanosecond
|
||||
* precision. Falls back to gettimeofday on systems where clock_gettime is
|
||||
* unavailable (defensive — every supported platform has it). */
|
||||
el_val_t el_now_instant(void) {
|
||||
struct timespec ts;
|
||||
if (clock_gettime(CLOCK_REALTIME, &ts) == 0) {
|
||||
int64_t ns = (int64_t)ts.tv_sec * 1000000000LL + (int64_t)ts.tv_nsec;
|
||||
return (el_val_t)ns;
|
||||
}
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
int64_t ns = (int64_t)tv.tv_sec * 1000000000LL
|
||||
+ (int64_t)tv.tv_usec * 1000LL;
|
||||
return (el_val_t)ns;
|
||||
}
|
||||
|
||||
/* now() — alias for el_now_instant(). */
el_val_t now(void) {
    return el_now_instant();
}
|
||||
|
||||
/* unix_seconds(n) — Instant from a Unix-epoch second count.
|
||||
* unix_millis(n) — Instant from a Unix-epoch millisecond count. */
|
||||
/* Instant (nanoseconds) from a count of seconds since the Unix epoch.
 * Counts too large to express in int64 nanoseconds saturate to
 * INT64_MAX / INT64_MIN instead of overflowing (signed overflow is
 * undefined behaviour in C). */
el_val_t unix_seconds(el_val_t n) {
    const int64_t per = 1000000000LL;
    int64_t s = (int64_t)n;
    if (s > INT64_MAX / per) return (el_val_t)INT64_MAX;
    if (s < INT64_MIN / per) return (el_val_t)INT64_MIN;
    return (el_val_t)(s * per);
}

/* Instant (nanoseconds) from a count of milliseconds since the Unix epoch.
 * Saturates on overflow like unix_seconds. */
el_val_t unix_millis(el_val_t n) {
    const int64_t per = 1000000LL;
    int64_t m = (int64_t)n;
    if (m > INT64_MAX / per) return (el_val_t)INT64_MAX;
    if (m < INT64_MIN / per) return (el_val_t)INT64_MIN;
    return (el_val_t)(m * per);
}
|
||||
|
||||
/* instant_from_iso8601 — parse a strict subset:
|
||||
* YYYY-MM-DDTHH:MM:SS[.fff]Z
|
||||
* Returns 0 (the Unix-epoch sentinel) on parse failure. Callers that need to
|
||||
* distinguish epoch-zero from a parse error should use a wider sentinel
|
||||
* representation; the current zero-on-failure choice matches existing El
|
||||
* runtime conventions for parse builtins (str_to_int, parse_int). */
|
||||
/* Parse "YYYY-MM-DDTHH:MM:SS" with an optional ".fraction" into an Instant
 * (nanoseconds since the Unix epoch, interpreted as UTC via timegm).
 * Returns 0 when the six date/time fields cannot be read or timegm
 * reports failure. Anything after the fraction (such as the trailing `Z`)
 * is not inspected.
 *
 * The fraction is read digit by digit so that its position is honoured:
 * ".5" is 500 ms and ".05" is 50 ms. (Reading it as an integer, as before,
 * turned both into 5 ms.) Up to nine digits are significant; further
 * digits are ignored. */
el_val_t instant_from_iso8601(el_val_t s) {
    const char* str = EL_CSTR(s);
    if (!str) return (el_val_t)0;

    int Y, M, D, h, m, sec;
    int used = 0;
    /* %n is only reached, and `used` only written, after all six fields
     * converted. */
    if (sscanf(str, "%d-%d-%dT%d:%d:%d%n", &Y, &M, &D, &h, &m, &sec, &used) < 6) {
        return (el_val_t)0;
    }

    int64_t frac_ns = 0;
    const char* p = str + used;
    if (*p == '.') {
        int64_t weight = 100000000LL;            /* first digit = 100 ms */
        for (p++; *p >= '0' && *p <= '9'; p++) {
            frac_ns += (int64_t)(*p - '0') * weight;
            weight /= 10;                        /* 0 after nine digits */
        }
    }

    struct tm tm;
    memset(&tm, 0, sizeof(tm));
    tm.tm_year = Y - 1900;
    tm.tm_mon  = M - 1;
    tm.tm_mday = D;
    tm.tm_hour = h;
    tm.tm_min  = m;
    tm.tm_sec  = sec;

    /* timegm interprets the fields as UTC. It is not part of ISO C.
     * NOTE(review): -1 is also the valid result for 1969-12-31T23:59:59,
     * which is therefore reported as a parse failure. */
    time_t t = timegm(&tm);
    if (t == (time_t)-1) return (el_val_t)0;

    int64_t ns = (int64_t)t * 1000000000LL + frac_ns;
    return (el_val_t)ns;
}
|
||||
|
||||
/* Duration constructors. The El-side postfix literals (30.seconds, 1.hour)
|
||||
* are lowered by the codegen directly into a literal int64 of nanoseconds —
|
||||
* these constructors are for runtime values where the count is dynamic. */
|
||||
/* Duration from a nanosecond count (identity on the int64 value). */
el_val_t el_duration_from_nanos(el_val_t ns) {
    return (el_val_t)(int64_t)ns;
}

/* Duration from a second count. Saturates to INT64_MAX / INT64_MIN when the
 * nanosecond value would not fit, instead of overflowing. */
el_val_t duration_seconds(el_val_t n) {
    const int64_t per = 1000000000LL;
    int64_t s = (int64_t)n;
    if (s > INT64_MAX / per) return (el_val_t)INT64_MAX;
    if (s < INT64_MIN / per) return (el_val_t)INT64_MIN;
    return (el_val_t)(s * per);
}

/* Duration from a millisecond count. Saturates like duration_seconds. */
el_val_t duration_millis(el_val_t n) {
    const int64_t per = 1000000LL;
    int64_t m = (int64_t)n;
    if (m > INT64_MAX / per) return (el_val_t)INT64_MAX;
    if (m < INT64_MIN / per) return (el_val_t)INT64_MIN;
    return (el_val_t)(m * per);
}

/* Duration from a nanosecond count (identity on the int64 value). */
el_val_t duration_nanos(el_val_t n) {
    return (el_val_t)(int64_t)n;
}
|
||||
|
||||
/* Arithmetic — typed wrappers. At the C level these are no-op casts, but
|
||||
* the codegen routes Instant/Duration BinOps through them so the generated
|
||||
* C says `el_instant_add_dur(start, dur)` rather than `start + dur`. The
|
||||
* intent is explicit, the operand order is documented, and a future change
|
||||
* to the underlying representation (saturating arithmetic, overflow guards)
|
||||
* has a single chokepoint. */
|
||||
/* a + b, clamped to the int64 range. */
static int64_t el_i64_sat_add(int64_t a, int64_t b) {
    if (b > 0 && a > INT64_MAX - b) return INT64_MAX;
    if (b < 0 && a < INT64_MIN - b) return INT64_MIN;
    return a + b;
}

/* a - b, clamped to the int64 range. */
static int64_t el_i64_sat_sub(int64_t a, int64_t b) {
    if (b < 0 && a > INT64_MAX + b) return INT64_MAX;
    if (b > 0 && a < INT64_MIN + b) return INT64_MIN;
    return a - b;
}

/* a * b, clamped to the int64 range. */
static int64_t el_i64_sat_mul(int64_t a, int64_t b) {
    if (a == 0 || b == 0) return 0;
    if ((a > 0) == (b > 0)) {
        /* Result is positive: check against INT64_MAX. */
        if (a > 0 ? (a > INT64_MAX / b) : (a < INT64_MAX / b)) return INT64_MAX;
    } else {
        /* Result is negative: check against INT64_MIN. */
        if (a > 0 ? (b < INT64_MIN / a) : (a < INT64_MIN / b)) return INT64_MIN;
    }
    return a * b;
}

/* Instant + Duration -> Instant. Saturates instead of overflowing. */
el_val_t el_instant_add_dur(el_val_t inst, el_val_t dur) {
    return (el_val_t)el_i64_sat_add((int64_t)inst, (int64_t)dur);
}

/* Instant - Duration -> Instant. Saturates instead of overflowing. */
el_val_t el_instant_sub_dur(el_val_t inst, el_val_t dur) {
    return (el_val_t)el_i64_sat_sub((int64_t)inst, (int64_t)dur);
}

/* a - b -> Duration (negative if b is later than a). Saturates. */
el_val_t el_instant_diff(el_val_t a, el_val_t b) {
    return (el_val_t)el_i64_sat_sub((int64_t)a, (int64_t)b);
}

/* Duration + Duration. Saturates instead of overflowing. */
el_val_t el_duration_add(el_val_t a, el_val_t b) {
    return (el_val_t)el_i64_sat_add((int64_t)a, (int64_t)b);
}

/* Duration - Duration. Saturates instead of overflowing. */
el_val_t el_duration_sub(el_val_t a, el_val_t b) {
    return (el_val_t)el_i64_sat_sub((int64_t)a, (int64_t)b);
}

/* Duration * scalar. Saturates instead of overflowing. */
el_val_t el_duration_scale(el_val_t dur, el_val_t scalar) {
    return (el_val_t)el_i64_sat_mul((int64_t)dur, (int64_t)scalar);
}

/* Duration / scalar, truncating toward zero. Division by zero yields 0.
 * INT64_MIN / -1 is not representable and yields INT64_MAX. */
el_val_t el_duration_div(el_val_t dur, el_val_t scalar) {
    int64_t d = (int64_t)dur;
    int64_t s = (int64_t)scalar;
    if (s == 0) return (el_val_t)0;
    if (s == -1) return (el_val_t)(d == INT64_MIN ? INT64_MAX : -d);
    return (el_val_t)(d / s);
}
|
||||
|
||||
/* Comparisons. Return 1/0 in el_val_t convention. */
|
||||
/* Instant comparisons: signed 64-bit compare, result 1 or 0. */
el_val_t el_instant_lt(el_val_t a, el_val_t b) {
    return (el_val_t)((int64_t)a < (int64_t)b ? 1 : 0);
}
el_val_t el_instant_le(el_val_t a, el_val_t b) {
    return (el_val_t)((int64_t)a <= (int64_t)b ? 1 : 0);
}
el_val_t el_instant_gt(el_val_t a, el_val_t b) {
    return (el_val_t)((int64_t)a > (int64_t)b ? 1 : 0);
}
el_val_t el_instant_ge(el_val_t a, el_val_t b) {
    return (el_val_t)((int64_t)a >= (int64_t)b ? 1 : 0);
}
el_val_t el_instant_eq(el_val_t a, el_val_t b) {
    return (el_val_t)((int64_t)a == (int64_t)b ? 1 : 0);
}
el_val_t el_instant_ne(el_val_t a, el_val_t b) {
    return (el_val_t)((int64_t)a != (int64_t)b ? 1 : 0);
}

/* Duration comparisons: signed 64-bit compare, result 1 or 0. */
el_val_t el_duration_lt(el_val_t a, el_val_t b) {
    return (el_val_t)((int64_t)a < (int64_t)b ? 1 : 0);
}
el_val_t el_duration_le(el_val_t a, el_val_t b) {
    return (el_val_t)((int64_t)a <= (int64_t)b ? 1 : 0);
}
el_val_t el_duration_gt(el_val_t a, el_val_t b) {
    return (el_val_t)((int64_t)a > (int64_t)b ? 1 : 0);
}
el_val_t el_duration_ge(el_val_t a, el_val_t b) {
    return (el_val_t)((int64_t)a >= (int64_t)b ? 1 : 0);
}
el_val_t el_duration_eq(el_val_t a, el_val_t b) {
    return (el_val_t)((int64_t)a == (int64_t)b ? 1 : 0);
}
el_val_t el_duration_ne(el_val_t a, el_val_t b) {
    return (el_val_t)((int64_t)a != (int64_t)b ? 1 : 0);
}
|
||||
|
||||
/* Conversions. */
|
||||
/* Instant -> whole seconds since the Unix epoch (truncating toward zero). */
el_val_t instant_to_unix_seconds(el_val_t i) {
    int64_t ns = (int64_t)i;
    return (el_val_t)(ns / 1000000000LL);
}

/* Instant -> whole milliseconds since the Unix epoch (truncating toward zero). */
el_val_t instant_to_unix_millis(el_val_t i) {
    int64_t ns = (int64_t)i;
    return (el_val_t)(ns / 1000000LL);
}
|
||||
|
||||
/* Format an Instant as "YYYY-MM-DDTHH:MM:SS.mmmZ" (UTC, millisecond
 * precision; finer digits are dropped).
 *
 * The nanosecond count is split with floor semantics so instants before the
 * epoch land on the correct second: -1 ns is 1969-12-31T23:59:59.999Z, not
 * 1970-01-01T00:00:00.000Z. If gmtime_r cannot convert the second count,
 * an empty string is returned instead of formatting an uninitialised
 * struct tm. */
el_val_t instant_to_iso8601(el_val_t i) {
    int64_t ns = (int64_t)i;
    int64_t whole = ns / 1000000000LL;
    int64_t rem   = ns % 1000000000LL;
    if (rem < 0) {                               /* C division truncates toward zero */
        rem += 1000000000LL;
        whole -= 1;
    }
    int msec = (int)(rem / 1000000LL);

    time_t s = (time_t)whole;
    struct tm tm;
    if (!gmtime_r(&s, &tm)) return el_wrap_str(el_strdup(""));

    char buf[64];
    snprintf(buf, sizeof(buf), "%04d-%02d-%02dT%02d:%02d:%02d.%03dZ",
             tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
             tm.tm_hour, tm.tm_min, tm.tm_sec, msec);
    return el_wrap_str(el_strdup(buf));
}
|
||||
|
||||
/* Duration -> whole seconds (truncating toward zero). */
el_val_t duration_to_seconds(el_val_t d) {
    int64_t ns = (int64_t)d;
    return (el_val_t)(ns / 1000000000LL);
}

/* Duration -> whole milliseconds (truncating toward zero). */
el_val_t duration_to_millis(el_val_t d) {
    int64_t ns = (int64_t)d;
    return (el_val_t)(ns / 1000000LL);
}

/* Duration -> nanoseconds (identity on the int64 value). */
el_val_t duration_to_nanos(el_val_t d) {
    return (el_val_t)(int64_t)d;
}
|
||||
|
||||
/* sleep(Duration) — Phase 1 replacement for ambiguous sleep(Int). The runtime
|
||||
* still exposes sleep_secs/sleep_ms for legacy call sites; codegen lowers
|
||||
* sleep(Duration) to el_sleep_duration(d). Negative durations clamp to 0 so a
|
||||
* stale deadline doesn't block forever. */
|
||||
/* Sleep for `dur` nanoseconds using a single nanosleep call; negative
 * durations are treated as zero. Always returns 0. The nanosleep result is
 * not examined, so a call that returns early is not resumed. */
el_val_t el_sleep_duration(el_val_t dur) {
    int64_t ns = (int64_t)dur;
    if (ns < 0) ns = 0;

    struct timespec req;
    req.tv_sec  = (time_t)(ns / 1000000000LL);
    req.tv_nsec = (long)(ns % 1000000000LL);
    nanosleep(&req, NULL);
    return (el_val_t)0;
}
|
||||
|
||||
/* unix_timestamp() — back-compat. Existing El callers expect an Int seconds
|
||||
* value; this stays an Int returner so the type system isn't disturbed for
|
||||
* legacy code. New code should call now() and convert when needed. */
|
||||
/* Current time as whole seconds since the Unix epoch: el_now_instant()
 * converted with instant_to_unix_seconds(). */
el_val_t unix_timestamp(void) {
    el_val_t current = el_now_instant();
    return instant_to_unix_seconds(current);
}
|
||||
|
||||
/* TTL cache helpers. Backed by the existing process-wide K/V (state_set/get)
|
||||
* with a sibling __ttl_at:<key> entry recording the Instant of the last
|
||||
* write. ttl_cache_get returns "" if the entry is missing or stale, so call
|
||||
* sites can branch on `if v == "" { miss } else { hit }` — the same shape
|
||||
* existing get-with-default code uses. No more (now - cached_at) < 60. */
|
||||
/* Build the stamp key "__ttl_at:<k>" in a malloc'd buffer the caller must
 * free. Returns NULL on allocation failure. */
static char* ttl_cache_stamp_key(const char* k) {
    size_t cap = strlen(k) + 16;                 /* prefix (9) + NUL fits */
    char* stamp_key = (char*)malloc(cap);
    if (!stamp_key) return NULL;
    snprintf(stamp_key, cap, "__ttl_at:%s", k);
    return stamp_key;
}

/* Read the write-stamp recorded for key `k`. Returns 1 and stores the
 * stamp (nanoseconds) in *set_at on success; returns 0 when the stamp entry
 * is missing, empty, does not start with a number, or the key buffer cannot
 * be allocated. */
static int ttl_cache_read_stamp(const char* k, int64_t* set_at) {
    char* stamp_key = ttl_cache_stamp_key(k);
    if (!stamp_key) return 0;
    el_val_t stamp = state_get(EL_STR(stamp_key));
    free(stamp_key);

    const char* sv = EL_CSTR(stamp);
    if (!sv || !*sv) return 0;
    char* end = NULL;
    long long v = strtoll(sv, &end, 10);
    if (end == sv) return 0;                     /* no digits at all */
    *set_at = (int64_t)v;
    return 1;
}

/* Store `value` under `key` and record the current Instant under the
 * sibling "__ttl_at:<key>" entry. Returns 1 on success, 0 when the key is
 * not a string or the stamp key cannot be allocated (in the latter case the
 * value itself has already been stored). */
el_val_t ttl_cache_set(el_val_t key, el_val_t value) {
    const char* k = EL_CSTR(key);
    if (!k) return (el_val_t)0;

    /* Store the value at the user's key. */
    state_set(key, value);

    /* Stamp the write time as a decimal nanosecond count. */
    char* stamp_key = ttl_cache_stamp_key(k);
    if (!stamp_key) return (el_val_t)0;
    int64_t now_ns = (int64_t)el_now_instant();
    char buf[32];
    snprintf(buf, sizeof(buf), "%lld", (long long)now_ns);
    /* NOTE(review): stamp_key and buf are released/out of scope right after
     * this call, so state_set presumably copies both — confirm. */
    state_set(EL_STR(stamp_key), EL_STR(buf));
    free(stamp_key);
    return (el_val_t)1;
}

/* Return the value stored under `key` if its stamp is at most `max_age`
 * nanoseconds old; otherwise return "". A stamp in the future (clock skew)
 * and an unreadable stamp both count as a miss. */
el_val_t ttl_cache_get(el_val_t key, el_val_t max_age) {
    const char* k = EL_CSTR(key);
    if (!k) return el_wrap_str(el_strdup(""));

    int64_t set_at = 0;
    if (!ttl_cache_read_stamp(k, &set_at)) return el_wrap_str(el_strdup(""));

    int64_t age = (int64_t)el_now_instant() - set_at;
    int64_t max_ns = (int64_t)max_age;
    if (age < 0) return el_wrap_str(el_strdup(""));       /* clock skew — treat as miss */
    if (age > max_ns) return el_wrap_str(el_strdup(""));  /* expired */
    return state_get(key);
}

/* Return the age of `key`'s stamp in nanoseconds (now minus the stamp), or
 * INT64_MAX when there is no readable stamp. */
el_val_t ttl_cache_age(el_val_t key) {
    const char* k = EL_CSTR(key);
    if (!k) return (el_val_t)INT64_MAX;

    int64_t set_at = 0;
    if (!ttl_cache_read_stamp(k, &set_at)) return (el_val_t)INT64_MAX;

    int64_t now_ns = (int64_t)el_now_instant();
    return (el_val_t)(now_ns - set_at);
}
|
||||
|
||||
/* ── UUID v4 ─────────────────────────────────────────────────────────────── */
|
||||
|
||||
/* NOTE(review): presumably set non-zero once the UUID generator has been
 * seeded; the code that reads and writes this flag is outside this view. */
static int _el_uuid_seeded = 0;
|
||||
|
||||
@@ -199,6 +199,19 @@ el_val_t http_get_to_file(el_val_t url, el_val_t headers_map, el_val_t output_p
|
||||
el_val_t url_encode(el_val_t s); /* RFC 3986 unreserved set */
|
||||
el_val_t url_decode(el_val_t s); /* '+' → space, %XX → byte */
|
||||
|
||||
/* ── HTML allowlist sanitizer ────────────────────────────────────────────────
|
||||
* el_html_sanitize(input_html, allowlist_json) — strict allowlist HTML
|
||||
* cleaner. State-machine parser; tag/attribute names compared case-
|
||||
* insensitively against the allowlist; `<a href>` / `<… src>` URL schemes
|
||||
* validated (http, https, mailto, fragment-only, or relative); whole-
|
||||
* subtree drop for script / style / iframe / object / embed / form; HTML-
|
||||
* escapes free text outside dropped subtrees.
|
||||
*
|
||||
* The allowlist is JSON of the form
|
||||
* {"p":[],"a":["href","title"],"strong":[],...}
|
||||
* where each value is the array of attribute names allowed for that tag. */
|
||||
/* Returns the sanitized markup as a new runtime string. A NULL input
 * yields ""; a NULL allowlist is treated as "{}", i.e. no tag is allowed. */
el_val_t el_html_sanitize(el_val_t input_html, el_val_t allowlist_json);
|
||||
|
||||
/* ── Filesystem ──────────────────────────────────────────────────────────── */
|
||||
|
||||
el_val_t fs_read(el_val_t path);
|
||||
|
||||
Reference in New Issue
Block a user