From 2ac11a67b106c0df7d4a1c4cd98674ed5d3006e4 Mon Sep 17 00:00:00 2001 From: Will Anderson Date: Tue, 5 May 2026 15:19:59 -0500 Subject: [PATCH 1/4] =?UTF-8?q?beta:=20replace=20native=5Fstring=5Fchars?= =?UTF-8?q?=20with=20str=5Fchar=5Fat/str=5Fslice=20in=20lexer=20=E2=80=94?= =?UTF-8?q?=2049%=20memory=20reduction=20on=20large=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lang/el-compiler/src/lexer.el | 119 ++++++++++++++-------------------- 1 file changed, 49 insertions(+), 70 deletions(-) diff --git a/lang/el-compiler/src/lexer.el b/lang/el-compiler/src/lexer.el index 504c9c6..cf05ff8 100644 --- a/lang/el-compiler/src/lexer.el +++ b/lang/el-compiler/src/lexer.el @@ -157,45 +157,43 @@ fn keyword_kind(word: String) -> String { // scan_digits - advance i while chars[i] is a digit // Returns { "text": ..., "pos": i } -fn scan_digits(chars: [String], start: Int, total: Int) -> Map { +fn scan_digits(src: String, start: Int, total: Int) -> Map { let i = start - let parts: [String] = native_list_empty() let running = true while running { if i >= total { let running = false } else { - let ch: String = native_list_get(chars, i) + let ch: String = str_char_at(src, i) if lex_is_digit(ch) { - let parts = native_list_append(parts, ch) let i = i + 1 } else { let running = false } } } - { "text": str_join(parts, ""), "pos": i } + // Use str_slice instead of building a parts list — O(1) allocation, O(n) copy. + { "text": str_slice(src, start, i), "pos": i } } // scan_ident - advance i while chars[i] is alphanumeric or underscore -fn scan_ident(chars: [String], start: Int, total: Int) -> Map { +fn scan_ident(src: String, start: Int, total: Int) -> Map { let i = start - let parts: [String] = native_list_empty() let running = true while running { if i >= total { let running = false } else { - let ch: String = native_list_get(chars, i) + let ch: String = str_char_at(src, i) if is_alnum_or_underscore(ch) { - let parts = native_list_append(parts, ch) let i = i + 1 } else { let running = false } } } - { "text": str_join(parts, ""), "pos": i } + // Use str_slice instead of building a parts list — O(1) allocation, O(n) copy. + { "text": str_slice(src, start, i), "pos": i } } // -- Code-bearing string detection + comment strip ---------------------------- @@ -208,34 +206,16 @@ fn scan_ident(chars: [String], start: Int, total: Int) -> Map { // looks_like_code - heuristic gate so we only strip strings that actually // embed JS or CSS. Plain prose, hex blobs, JSON, etc. pass through verbatim. -fn substr_at(chars: [String], start: Int, total: Int, needle: String) -> Bool { - let nchars: [String] = native_string_chars(needle) - let nlen: Int = native_list_len(nchars) +fn substr_at(src: String, start: Int, total: Int, needle: String) -> Bool { + let nlen: Int = str_len(needle) if start + nlen > total { return false } - let i = 0 - let matched = true - while i < nlen { - let a: String = native_list_get(chars, start + i) - let b: String = native_list_get(nchars, i) - if a == b { let i = i + 1 } else { let matched = false; let i = nlen } - } - matched + // Use str_slice comparison instead of char-by-char loop. + str_eq(str_slice(src, start, start + nlen), needle) } fn str_has(s: String, needle: String) -> Bool { - let chars: [String] = native_string_chars(s) - let total: Int = native_list_len(chars) - let i = 0 - let found = false - while i < total { - if substr_at(chars, i, total, needle) { - let found = true - let i = total - } else { - let i = i + 1 - } - } - found + // Use the built-in str_contains which is implemented in native C — O(n) single pass. + str_contains(s, needle) } fn looks_like_code(s: String) -> Bool { @@ -254,8 +234,7 @@ fn looks_like_code(s: String) -> Bool { // comment opener: if the char immediately before '/' is ':', emit the '/' // literally and advance one position. fn strip_code_comments(s: String) -> String { - let chars: [String] = native_string_chars(s) - let total: Int = native_list_len(chars) + let total: Int = str_len(s) let out_parts: [String] = native_list_empty() let i = 0 let in_squote = false @@ -263,7 +242,7 @@ fn strip_code_comments(s: String) -> String { let in_btick = false let prev = "" while i < total { - let ch: String = native_list_get(chars, i) + let ch: String = str_char_at(s, i) let in_js_string = false if in_squote { let in_js_string = true } if in_dquote { let in_js_string = true } @@ -275,7 +254,7 @@ fn strip_code_comments(s: String) -> String { let out_parts = native_list_append(out_parts, ch) let next_i = i + 1 if next_i < total { - let nc: String = native_list_get(chars, next_i) + let nc: String = str_char_at(s, next_i) let out_parts = native_list_append(out_parts, nc) let prev = nc let i = next_i + 1 @@ -304,7 +283,7 @@ fn strip_code_comments(s: String) -> String { let next_i = i + 1 let next_ch = "" if next_i < total { - let next_ch: String = native_list_get(chars, next_i) + let next_ch: String = str_char_at(s, next_i) } if ch == "/" { @@ -323,7 +302,7 @@ fn strip_code_comments(s: String) -> String { if i >= total { let scanning = false } else { - let lc: String = native_list_get(chars, i) + let lc: String = str_char_at(s, i) if lc == "\n" { let scanning = false } else { @@ -342,11 +321,11 @@ fn strip_code_comments(s: String) -> String { if i >= total { let scanning2 = false } else { - let bc: String = native_list_get(chars, i) + let bc: String = str_char_at(s, i) if bc == "*" { let after = i + 1 if after < total { - let nc2: String = native_list_get(chars, after) + let nc2: String = str_char_at(s, after) if nc2 == "/" { let i = after + 1 let scanning2 = false @@ -402,7 +381,7 @@ fn strip_code_comments(s: String) -> String { // scan_string - scan a quoted string literal, handling \" escapes. // Starts AFTER the opening quote. Returns { "text": content, "pos": i_after_close } -fn scan_string(chars: [String], start: Int, total: Int) -> Map { +fn scan_string(src: String, start: Int, total: Int) -> Map { let i = start let parts: [String] = native_list_empty() let running = true @@ -410,12 +389,12 @@ fn scan_string(chars: [String], start: Int, total: Int) -> Map { if i >= total { let running = false } else { - let ch: String = native_list_get(chars, i) + let ch: String = str_char_at(src, i) if ch == "\\" { // escape: peek next char let next_i = i + 1 if next_i < total { - let next_ch: String = native_list_get(chars, next_i) + let next_ch: String = str_char_at(src, next_i) if next_ch == "\"" { let parts = native_list_append(parts, "\"") let i = next_i + 1 @@ -465,19 +444,17 @@ fn scan_string(chars: [String], start: Int, total: Int) -> Map { // scan_interp_brace - scan from `start` (the char after `${`) to the matching // `}`, tracking brace depth so inner braces (e.g. fn calls, map literals) are // handled correctly. Returns { "text": inner_source, "pos": i_after_close }. -fn scan_interp_brace(chars: [String], start: Int, total: Int) -> Map { +fn scan_interp_brace(src: String, start: Int, total: Int) -> Map { let i = start - let parts: [String] = native_list_empty() let depth = 1 let running = true while running { if i >= total { let running = false } else { - let ch: String = native_list_get(chars, i) + let ch: String = str_char_at(src, i) if ch == "{" { let depth = depth + 1 - let parts = native_list_append(parts, ch) let i = i + 1 } else { if ch == "}" { @@ -487,17 +464,16 @@ fn scan_interp_brace(chars: [String], start: Int, total: Int) -> Map], src: [Map]) -> // // Supported escape sequences: \" \n \t \r \\ \$ (literal dollar sign). // Nested quotes inside ${} are not supported; use a variable instead. -fn scan_interp_string(chars: [String], start: Int, total: Int) -> Map { +fn scan_interp_string(src: String, start: Int, total: Int) -> Map { let i = start let out_tokens: [Map] = native_list_empty() let cur_part: [String] = native_list_empty() @@ -548,13 +524,13 @@ fn scan_interp_string(chars: [String], start: Int, total: Int) -> Map= total { let running = false } else { - let ch: String = native_list_get(chars, i) + let ch: String = str_char_at(src, i) if ch == "\\" { // Escape sequence let next_i = i + 1 if next_i < total { - let next_ch: String = native_list_get(chars, next_i) + let next_ch: String = str_char_at(src, next_i) if next_ch == "$" { // \$ => literal '$' (escape for interpolation syntax) let cur_part = native_list_append(cur_part, "$") @@ -602,7 +578,7 @@ fn scan_interp_string(chars: [String], start: Int, total: Int) -> Map Map Map [Map] { - let chars: [String] = native_string_chars(source) - let total: Int = native_list_len(chars) + // Use str_len + str_char_at instead of native_string_chars to avoid + // allocating a 400K-element character list in El's arena. For a 400KB + // source file, native_string_chars created ~400K permanent string allocations + // (one per character), consuming ~20MB of memory before lexing even started. + let total: Int = str_len(source) let tokens: [Map] = native_list_empty() let i: Int = 0 while i < total { - let ch: String = native_list_get(chars, i) + let ch: String = str_char_at(source, i) // Skip whitespace if lex_is_whitespace(ch) { @@ -711,7 +690,7 @@ fn lex(source: String) -> [Map] { if ch == "/" { let next_i = i + 1 if next_i < total { - let next_ch: String = native_list_get(chars, next_i) + let next_ch: String = str_char_at(source, next_i) if next_ch == "/" { // skip to end of line let i = i + 2 @@ -720,7 +699,7 @@ fn lex(source: String) -> [Map] { if i >= total { let running2 = false } else { - let lch: String = native_list_get(chars, i) + let lch: String = str_char_at(source, i) if lch == "\n" { let running2 = false } else { @@ -743,7 +722,7 @@ fn lex(source: String) -> [Map] { // sequence (Str Plus expr-tokens Plus Str ...) that the parser // naturally assembles into a BinOp concat tree. if ch == "\"" { - let interp_result = scan_interp_string(chars, i + 1, total) + let interp_result = scan_interp_string(source, i + 1, total) let interp_toks: [Map] = interp_result["tokens"] let new_pos: Int = interp_result["pos"] let tokens = interp_tokens_append_all(tokens, interp_toks) @@ -751,18 +730,18 @@ fn lex(source: String) -> [Map] { } else { // Number literal if lex_is_digit(ch) { - let result = scan_digits(chars, i, total) + let result = scan_digits(source, i, total) let num_text: String = result["text"] let new_pos: Int = result["pos"] // check for float (dot followed by digit) if new_pos < total { - let dot_ch: String = native_list_get(chars, new_pos) + let dot_ch: String = str_char_at(source, new_pos) if dot_ch == "." { let after_dot = new_pos + 1 if after_dot < total { - let after_dot_ch: String = native_list_get(chars, after_dot) + let after_dot_ch: String = str_char_at(source, after_dot) if lex_is_digit(after_dot_ch) { - let frac_result = scan_digits(chars, after_dot, total) + let frac_result = scan_digits(source, after_dot, total) let frac_text: String = frac_result["text"] let frac_pos: Int = frac_result["pos"] let tokens = native_list_append(tokens, make_tok("Float", num_text + "." + frac_text)) @@ -786,7 +765,7 @@ fn lex(source: String) -> [Map] { } else { // Identifier or keyword if lex_is_alpha(ch) || ch == "_" { - let result = scan_ident(chars, i, total) + let result = scan_ident(source, i, total) let word: String = result["text"] let new_pos: Int = result["pos"] let kw = keyword_kind(word) @@ -801,7 +780,7 @@ fn lex(source: String) -> [Map] { let peek_i = i + 1 let peek_ch = "" if peek_i < total { - let peek_ch: String = native_list_get(chars, peek_i) + let peek_ch: String = str_char_at(source, peek_i) } if ch == "=" { @@ -930,7 +909,7 @@ fn lex(source: String) -> [Map] { let peek2_i = i + 2 let peek2_ch = "" if peek2_i < total { - let peek2_ch: String = native_list_get(chars, peek2_i) + let peek2_ch: String = str_char_at(source, peek2_i) } if peek_ch == "." { if peek2_ch == "=" { From 1e67544c88a689289be4ee52a0494cb075cdf460 Mon Sep 17 00:00:00 2001 From: Will Anderson Date: Tue, 5 May 2026 15:43:21 -0500 Subject: [PATCH 2/4] =?UTF-8?q?round-2-alpha:=20char=20code=20ops=20in=20l?= =?UTF-8?q?ex()=20hot=20loop=20=E2=80=94=20eliminate=20str=5Fchar=5Fat=20a?= =?UTF-8?q?llocations?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace str_char_at (returns strdup String) with str_char_code (returns Int) in the main lex() while loop and scan_digits/scan_ident helpers. For a 400KB combined source, str_char_at was allocating ~400K x 16B = 6.4MB of transient 2-byte strings for the ch variable alone. str_char_code returns an integer directly — zero allocation. Add Int-based helpers: is_digit_code, is_alpha_code, is_ws_code, is_alnum_or_underscore_code. Rewrite lex() operator dispatch using char code constants (e.g. '/'=47, '"'=34, '='=61). Result on main.el: 17.1MB -> 15.4MB peak RSS (-10%). Self-hosting: PASS. --- lang/el-compiler/src/lexer.el | 217 ++++++++++++++++++++++------------ 1 file changed, 142 insertions(+), 75 deletions(-) diff --git a/lang/el-compiler/src/lexer.el b/lang/el-compiler/src/lexer.el index cf05ff8..2cc1b34 100644 --- a/lang/el-compiler/src/lexer.el +++ b/lang/el-compiler/src/lexer.el @@ -7,11 +7,50 @@ // // Entry point: fn lex(source: String) -> [Map] // -// Uses native_string_chars to split the source into a chars list, -// then indexes it with native_list_get - avoids O(N-) string cloning. +// Performance: the hot lexer loop uses str_char_code (returns Int) instead of +// str_char_at (returns strdup'd String) for character classification. +// For a 400KB source, str_char_at allocates ~400K × 16B = ~6.4MB of temporary +// strings for the `ch` variable alone. str_char_code avoids all that. -// -- Character helpers --------------------------------------------------------- +// -- Character helpers (Int-based, no string allocation) ---------------------- +// These operate on char codes (from str_char_code) instead of str_char_at, +// eliminating one strdup per character in the hot lexer loop. +fn is_digit_code(c: Int) -> Bool { + // '0'=48 .. '9'=57 + if c >= 48 { + if c <= 57 { return true } + } + false +} + +fn is_alpha_code(c: Int) -> Bool { + // 'A'=65..'Z'=90, 'a'=97..'z'=122 + if c >= 65 { + if c <= 90 { return true } + } + if c >= 97 { + if c <= 122 { return true } + } + false +} + +fn is_alnum_or_underscore_code(c: Int) -> Bool { + if is_digit_code(c) { return true } + if is_alpha_code(c) { return true } + if c == 95 { return true } // '_' + false +} + +fn is_ws_code(c: Int) -> Bool { + if c == 32 { return true } // ' ' + if c == 9 { return true } // '\t' + if c == 10 { return true } // '\n' + if c == 13 { return true } // '\r' + false +} + +// Legacy String-based helpers kept for scan_interp helpers that use str_char_at. fn lex_is_digit(ch: String) -> Bool { if ch == "0" { return true } if ch == "1" { return true } @@ -164,8 +203,8 @@ fn scan_digits(src: String, start: Int, total: Int) -> Map { if i >= total { let running = false } else { - let ch: String = str_char_at(src, i) - if lex_is_digit(ch) { + let c: Int = str_char_code(src, i) + if is_digit_code(c) { let i = i + 1 } else { let running = false @@ -184,8 +223,8 @@ fn scan_ident(src: String, start: Int, total: Int) -> Map { if i >= total { let running = false } else { - let ch: String = str_char_at(src, i) - if is_alnum_or_underscore(ch) { + let c: Int = str_char_code(src, i) + if is_alnum_or_underscore_code(c) { let i = i + 1 } else { let running = false @@ -669,38 +708,44 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { } // -- Main lexer ---------------------------------------------------------------- +// Char code constants (avoids strdup for single-char comparison) +// '/' = 47, '"' = 34, '0'-'9' = 48-57, 'a'-'z' = 97-122, 'A'-'Z' = 65-90 +// '_' = 95, ' '=32, '\t'=9, '\n'=10, '\r'=13 +// '=' = 61, '!' = 33, '<' = 60, '>' = 62, '&' = 38, '|' = 124 +// '-' = 45, ':' = 58, '+' = 43, '*' = 42, '%' = 37 +// '(' = 40, ')' = 41, '{' = 123, '}' = 125, '[' = 91, ']' = 93 +// ',' = 44, '.' = 46, ';' = 59, '@' = 64, '?' = 63 fn lex(source: String) -> [Map] { - // Use str_len + str_char_at instead of native_string_chars to avoid - // allocating a 400K-element character list in El's arena. For a 400KB - // source file, native_string_chars created ~400K permanent string allocations - // (one per character), consuming ~20MB of memory before lexing even started. + // Use str_char_code (returns Int) instead of str_char_at (returns strdup String) + // for all character classification in the hot loop. For a 400KB source, + // str_char_at allocates ~400K × 16B = ~6.4MB of temporary strings. let total: Int = str_len(source) let tokens: [Map] = native_list_empty() let i: Int = 0 while i < total { - let ch: String = str_char_at(source, i) + let c: Int = str_char_code(source, i) - // Skip whitespace - if lex_is_whitespace(ch) { + // Skip whitespace (space=32, tab=9, newline=10, CR=13) + if is_ws_code(c) { let i = i + 1 } else { - // Line comments: // - if ch == "/" { + // Line comments: // (slash=47) + if c == 47 { let next_i = i + 1 if next_i < total { - let next_ch: String = str_char_at(source, next_i) - if next_ch == "/" { - // skip to end of line + let nc: Int = str_char_code(source, next_i) + if nc == 47 { + // skip to end of line (newline=10) let i = i + 2 let running2 = true while running2 { if i >= total { let running2 = false } else { - let lch: String = str_char_at(source, i) - if lch == "\n" { + let lc: Int = str_char_code(source, i) + if lc == 10 { let running2 = false } else { let i = i + 1 @@ -716,31 +761,27 @@ fn lex(source: String) -> [Map] { let i = i + 1 } } else { - // String literal (plain or interpolated with ${expr} syntax). - // scan_interp_string handles both cases: plain strings emit a - // single Str token; interpolated strings emit a flat token - // sequence (Str Plus expr-tokens Plus Str ...) that the parser - // naturally assembles into a BinOp concat tree. - if ch == "\"" { + // String literal: '"' = 34 + if c == 34 { let interp_result = scan_interp_string(source, i + 1, total) let interp_toks: [Map] = interp_result["tokens"] let new_pos: Int = interp_result["pos"] let tokens = interp_tokens_append_all(tokens, interp_toks) let i = new_pos } else { - // Number literal - if lex_is_digit(ch) { + // Number literal: '0'-'9' = 48-57 + if is_digit_code(c) { let result = scan_digits(source, i, total) let num_text: String = result["text"] let new_pos: Int = result["pos"] - // check for float (dot followed by digit) + // check for float (dot=46 followed by digit) if new_pos < total { - let dot_ch: String = str_char_at(source, new_pos) - if dot_ch == "." { + let dc: Int = str_char_code(source, new_pos) + if dc == 46 { let after_dot = new_pos + 1 if after_dot < total { - let after_dot_ch: String = str_char_at(source, after_dot) - if lex_is_digit(after_dot_ch) { + let adc: Int = str_char_code(source, after_dot) + if is_digit_code(adc) { let frac_result = scan_digits(source, after_dot, total) let frac_text: String = frac_result["text"] let frac_pos: Int = frac_result["pos"] @@ -763,8 +804,8 @@ fn lex(source: String) -> [Map] { let i = new_pos } } else { - // Identifier or keyword - if lex_is_alpha(ch) || ch == "_" { + // Identifier or keyword: alpha or '_'=95 + if is_alpha_code(c) || c == 95 { let result = scan_ident(source, i, total) let word: String = result["text"] let new_pos: Int = result["pos"] @@ -778,17 +819,19 @@ fn lex(source: String) -> [Map] { } else { // Multi-char and single-char operators/delimiters let peek_i = i + 1 - let peek_ch = "" + let peek_c: Int = -1 if peek_i < total { - let peek_ch: String = str_char_at(source, peek_i) + let peek_c: Int = str_char_code(source, peek_i) } - if ch == "=" { - if peek_ch == "=" { + if c == 61 { + // '=' = 61 + if peek_c == 61 { let tokens = native_list_append(tokens, make_tok("EqEq", "==")) let i = i + 2 } else { - if peek_ch == ">" { + if peek_c == 62 { + // '>' = 62 let tokens = native_list_append(tokens, make_tok("FatArrow", "=>")) let i = i + 2 } else { @@ -797,8 +840,9 @@ fn lex(source: String) -> [Map] { } } } else { - if ch == "!" { - if peek_ch == "=" { + if c == 33 { + // '!' = 33 + if peek_c == 61 { let tokens = native_list_append(tokens, make_tok("NotEq", "!=")) let i = i + 2 } else { @@ -806,8 +850,9 @@ fn lex(source: String) -> [Map] { let i = i + 1 } } else { - if ch == "<" { - if peek_ch == "=" { + if c == 60 { + // '<' = 60 + if peek_c == 61 { let tokens = native_list_append(tokens, make_tok("LtEq", "<=")) let i = i + 2 } else { @@ -815,8 +860,9 @@ fn lex(source: String) -> [Map] { let i = i + 1 } } else { - if ch == ">" { - if peek_ch == "=" { + if c == 62 { + // '>' = 62 + if peek_c == 61 { let tokens = native_list_append(tokens, make_tok("GtEq", ">=")) let i = i + 2 } else { @@ -824,20 +870,23 @@ fn lex(source: String) -> [Map] { let i = i + 1 } } else { - if ch == "&" { - if peek_ch == "&" { + if c == 38 { + // '&' = 38 + if peek_c == 38 { let tokens = native_list_append(tokens, make_tok("And", "&&")) let i = i + 2 } else { let i = i + 1 } } else { - if ch == "|" { - if peek_ch == "|" { + if c == 124 { + // '|' = 124 + if peek_c == 124 { let tokens = native_list_append(tokens, make_tok("Or", "||")) let i = i + 2 } else { - if peek_ch == ">" { + if peek_c == 62 { + // '>' = 62 let tokens = native_list_append(tokens, make_tok("PipeOp", "|>")) let i = i + 2 } else { @@ -846,8 +895,10 @@ fn lex(source: String) -> [Map] { } } } else { - if ch == "-" { - if peek_ch == ">" { + if c == 45 { + // '-' = 45 + if peek_c == 62 { + // '>' = 62 let tokens = native_list_append(tokens, make_tok("Arrow", "->")) let i = i + 2 } else { @@ -855,8 +906,9 @@ fn lex(source: String) -> [Map] { let i = i + 1 } } else { - if ch == ":" { - if peek_ch == ":" { + if c == 58 { + // ':' = 58 + if peek_c == 58 { let tokens = native_list_append(tokens, make_tok("ColonColon", "::")) let i = i + 2 } else { @@ -864,55 +916,67 @@ fn lex(source: String) -> [Map] { let i = i + 1 } } else { - if ch == "+" { + if c == 43 { + // '+' = 43 let tokens = native_list_append(tokens, make_tok("Plus", "+")) let i = i + 1 } else { - if ch == "*" { + if c == 42 { + // '*' = 42 let tokens = native_list_append(tokens, make_tok("Star", "*")) let i = i + 1 } else { - if ch == "%" { + if c == 37 { + // '%' = 37 let tokens = native_list_append(tokens, make_tok("Percent", "%")) let i = i + 1 } else { - if ch == "(" { + if c == 40 { + // '(' = 40 let tokens = native_list_append(tokens, make_tok("LParen", "(")) let i = i + 1 } else { - if ch == ")" { + if c == 41 { + // ')' = 41 let tokens = native_list_append(tokens, make_tok("RParen", ")")) let i = i + 1 } else { - if ch == "{" { + if c == 123 { + // '{' = 123 let tokens = native_list_append(tokens, make_tok("LBrace", "{")) let i = i + 1 } else { - if ch == "}" { + if c == 125 { + // '}' = 125 let tokens = native_list_append(tokens, make_tok("RBrace", "}")) let i = i + 1 } else { - if ch == "[" { + if c == 91 { + // '[' = 91 let tokens = native_list_append(tokens, make_tok("LBracket", "[")) let i = i + 1 } else { - if ch == "]" { + if c == 93 { + // ']' = 93 let tokens = native_list_append(tokens, make_tok("RBracket", "]")) let i = i + 1 } else { - if ch == "," { + if c == 44 { + // ',' = 44 let tokens = native_list_append(tokens, make_tok("Comma", ",")) let i = i + 1 } else { - if ch == "." { - // Check for ..= (inclusive range) before .. (exclusive range) before single . + if c == 46 { + // '.' = 46: check for ..= or .. let peek2_i = i + 2 - let peek2_ch = "" + let peek2_c: Int = -1 if peek2_i < total { - let peek2_ch: String = str_char_at(source, peek2_i) + let peek2_c: Int = str_char_code(source, peek2_i) } - if peek_ch == "." { - if peek2_ch == "=" { + if peek_c == 46 { + // '..' prefix + if peek2_c == 61 { + // '..=' = 46 46 61 let tokens = native_list_append(tokens, make_tok("DotDotEq", "..=")) let i = i + 3 } else { @@ -924,15 +988,18 @@ fn lex(source: String) -> [Map] { let i = i + 1 } } else { - if ch == ";" { + if c == 59 { + // ';' = 59 let tokens = native_list_append(tokens, make_tok("Semicolon", ";")) let i = i + 1 } else { - if ch == "@" { + if c == 64 { + // '@' = 64 let tokens = native_list_append(tokens, make_tok("At", "@")) let i = i + 1 } else { - if ch == "?" { + if c == 63 { + // '?' = 63 let tokens = native_list_append(tokens, make_tok("QuestionMark", "?")) let i = i + 1 } else { From 1eef9928f4ba83c1ffddb073aff89c5af6fd3c89 Mon Sep 17 00:00:00 2001 From: Will Anderson Date: Tue, 5 May 2026 15:46:20 -0500 Subject: [PATCH 3/4] =?UTF-8?q?round-2-gamma:=20combine=20flat=20token=20l?= =?UTF-8?q?ist=20+=20char=20code=20dispatch=20=E2=80=94=20max=20round-2=20?= =?UTF-8?q?savings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Combines two orthogonal optimizations: 1. Flat token list (from beta): lex() returns [Any] with alternating kind/value pairs instead of [Map], eliminating one ElMap per token (~3 mallocs each). Parser updated: tok_kind(t,i) = t[2*i], tok_value(t,i) = t[2*i+1]. 2. Char code dispatch (from alpha): lex() hot loop uses str_char_code -> Int instead of str_char_at -> strdup String for all character classification. Eliminates ~400K x 16B = 6.4MB of temporary string allocations. scan_digits and scan_ident also updated to use str_char_code. Result on main.el: 17.1MB -> 14.4MB peak RSS (-16%). Self-hosting: PASS. --- lang/el-compiler/src/compiler.el | 8 +- lang/el-compiler/src/lexer.el | 141 ++++++++++++++++--------------- lang/el-compiler/src/parser.el | 64 +++++++------- 3 files changed, 111 insertions(+), 102 deletions(-) diff --git a/lang/el-compiler/src/compiler.el b/lang/el-compiler/src/compiler.el index e74b38a..0df9c5b 100644 --- a/lang/el-compiler/src/compiler.el +++ b/lang/el-compiler/src/compiler.el @@ -21,7 +21,7 @@ import "codegen-js.el" // compile — full pipeline (C target): source string -> C source string fn compile(source: String) -> String { - let tokens: [Map] = lex(source) + let tokens: [Any] = lex(source) let stmts: [Map] = parse(tokens) // Token list is no longer needed after parsing — release it to free memory // before codegen allocates its own working data on large source files. @@ -31,7 +31,7 @@ fn compile(source: String) -> String { // compile_js — full pipeline (JS target, module mode): source string -> JS source string fn compile_js(source: String) -> String { - let tokens: [Map] = lex(source) + let tokens: [Any] = lex(source) let stmts: [Map] = parse(tokens) // Token list is no longer needed after parsing — release it to free memory. el_release(tokens) @@ -41,7 +41,7 @@ fn compile_js(source: String) -> String { // compile_js_with_bundle — JS target in bundle mode. // Reads el_runtime.js from runtime_path and inlines it inside an IIFE. fn compile_js_with_bundle(source: String, runtime_path: String) -> String { - let tokens: [Map] = lex(source) + let tokens: [Any] = lex(source) let stmts: [Map] = parse(tokens) el_release(tokens) let runtime_content: String = fs_read(runtime_path) @@ -501,7 +501,7 @@ fn main() -> Void { // (without inlining imports) and write out a .elh file alongside the .c. if do_emit_header { let raw_source: String = fs_read(src_path) - let hdr_tokens: [Map] = lex(raw_source) + let hdr_tokens: [Any] = lex(raw_source) let hdr_stmts: [Map] = parse(hdr_tokens) el_release(hdr_tokens) let hdr_path: String = str_slice(src_path, 0, str_len(src_path) - 3) + ".elh" diff --git a/lang/el-compiler/src/lexer.el b/lang/el-compiler/src/lexer.el index 2cc1b34..06990dd 100644 --- a/lang/el-compiler/src/lexer.el +++ b/lang/el-compiler/src/lexer.el @@ -136,8 +136,11 @@ fn lex_is_whitespace(ch: String) -> Bool { false } -fn make_tok(kind: String, value: String) -> Map { - { "kind": kind, "value": value } +// tok_append — append a (kind, value) pair to a flat token list. +// Returns the updated list. Gamma combines flat-list + char-code for max savings. +fn tok_append(tokens: [Any], kind: String, value: String) -> [Any] { + let tokens = native_list_append(tokens, kind) + native_list_append(tokens, value) } // -- Keyword lookup ------------------------------------------------------------ @@ -515,20 +518,21 @@ fn scan_interp_brace(src: String, start: Int, total: Int) -> Map { { "text": str_slice(src, start, i - 1), "pos": i } } -// interp_tokens_append_all - copy every token from src into dst, skipping the -// trailing Eof sentinel that lex() always appends. Returns the updated dst list. -fn interp_tokens_append_all(dst: [Map], src: [Map]) -> [Map] { +// interp_tokens_append_all - copy every (kind, value) pair from flat src list +// into flat dst list, skipping the trailing Eof pair that lex() always appends. +fn interp_tokens_append_all(dst: [Any], src: [Any]) -> [Any] { let src_len: Int = native_list_len(src) let j = 0 let result = dst while j < src_len { - let tok: Map = native_list_get(src, j) - let tk: String = tok["kind"] - if tk == "Eof" { + let kind: String = native_list_get(src, j) + if kind == "Eof" { let j = src_len } else { - let result = native_list_append(result, tok) - let j = j + 1 + let val: String = native_list_get(src, j + 1) + let result = native_list_append(result, kind) + let result = native_list_append(result, val) + let j = j + 2 } } result @@ -553,7 +557,7 @@ fn interp_tokens_append_all(dst: [Map], src: [Map]) -> // Nested quotes inside ${} are not supported; use a variable instead. fn scan_interp_string(src: String, start: Int, total: Int) -> Map { let i = start - let out_tokens: [Map] = native_list_empty() + let out_tokens: [Any] = native_list_empty() let cur_part: [String] = native_list_empty() let has_interp = false let need_plus = false @@ -628,13 +632,13 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { if part_len > 0 { let part_text = str_join(cur_part, "") if need_plus { - let out_tokens = native_list_append(out_tokens, make_tok("Plus", "+")) + let out_tokens = tok_append(out_tokens, "Plus", "+") } let clean_part = part_text if looks_like_code(part_text) { let clean_part = strip_code_comments(part_text) } - let out_tokens = native_list_append(out_tokens, make_tok("Str", clean_part)) + let out_tokens = tok_append(out_tokens, "Str", clean_part) let need_plus = true } let cur_part = native_list_empty() @@ -651,19 +655,20 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { // age + 1) are parsed as a grouped sub-expression // rather than merging with the surrounding concat // Plus tokens at the wrong precedence level. - let inner_toks: [Map] = lex(expr_src) + let inner_toks: [Any] = lex(expr_src) let inner_len: Int = native_list_len(inner_toks) if need_plus { - let out_tokens = native_list_append(out_tokens, make_tok("Plus", "+")) + let out_tokens = tok_append(out_tokens, "Plus", "+") } // Empty interpolation ${} => empty string segment - if inner_len <= 1 { - let out_tokens = native_list_append(out_tokens, make_tok("Str", "")) + // inner_len <= 2 = only the Eof pair (kind="Eof", value="") + if inner_len <= 2 { + let out_tokens = tok_append(out_tokens, "Str", "") } else { - let out_tokens = native_list_append(out_tokens, make_tok("LParen", "(")) + let out_tokens = tok_append(out_tokens, "LParen", "(") let out_tokens = interp_tokens_append_all(out_tokens, inner_toks) - let out_tokens = native_list_append(out_tokens, make_tok("RParen", ")")) + let out_tokens = tok_append(out_tokens, "RParen", ")") } let need_plus = true } else { @@ -691,9 +696,9 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { let clean_part = strip_code_comments(part_text) } if need_plus { - let out_tokens = native_list_append(out_tokens, make_tok("Plus", "+")) + let out_tokens = tok_append(out_tokens, "Plus", "+") } - let out_tokens = native_list_append(out_tokens, make_tok("Str", clean_part)) + let out_tokens = tok_append(out_tokens, "Str", clean_part) } } else { // Plain string with no interpolation - same behaviour as old scan_string @@ -701,7 +706,7 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { if looks_like_code(part_text) { let clean_text = strip_code_comments(part_text) } - let out_tokens = native_list_append(out_tokens, make_tok("Str", clean_text)) + let out_tokens = tok_append(out_tokens, "Str", clean_text) } { "tokens": out_tokens, "pos": i } @@ -716,12 +721,12 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { // '(' = 40, ')' = 41, '{' = 123, '}' = 125, '[' = 91, ']' = 93 // ',' = 44, '.' = 46, ';' = 59, '@' = 64, '?' = 63 -fn lex(source: String) -> [Map] { +fn lex(source: String) -> [Any] { // Use str_char_code (returns Int) instead of str_char_at (returns strdup String) // for all character classification in the hot loop. For a 400KB source, // str_char_at allocates ~400K × 16B = ~6.4MB of temporary strings. let total: Int = str_len(source) - let tokens: [Map] = native_list_empty() + let tokens: [Any] = native_list_empty() let i: Int = 0 while i < total { @@ -753,18 +758,18 @@ fn lex(source: String) -> [Map] { } } } else { - let tokens = native_list_append(tokens, make_tok("Slash", "/")) + let tokens = tok_append(tokens, "Slash", "/") let i = i + 1 } } else { - let tokens = native_list_append(tokens, make_tok("Slash", "/")) + let tokens = tok_append(tokens, "Slash", "/") let i = i + 1 } } else { // String literal: '"' = 34 if c == 34 { let interp_result = scan_interp_string(source, i + 1, total) - let interp_toks: [Map] = interp_result["tokens"] + let interp_toks: [Any] = interp_result["tokens"] let new_pos: Int = interp_result["pos"] let tokens = interp_tokens_append_all(tokens, interp_toks) let i = new_pos @@ -785,22 +790,22 @@ fn lex(source: String) -> [Map] { let frac_result = scan_digits(source, after_dot, total) let frac_text: String = frac_result["text"] let frac_pos: Int = frac_result["pos"] - let tokens = native_list_append(tokens, make_tok("Float", num_text + "." + frac_text)) + let tokens = tok_append(tokens, "Float", num_text + "." + frac_text) let i = frac_pos } else { - let tokens = native_list_append(tokens, make_tok("Int", num_text)) + let tokens = tok_append(tokens, "Int", num_text) let i = new_pos } } else { - let tokens = native_list_append(tokens, make_tok("Int", num_text)) + let tokens = tok_append(tokens, "Int", num_text) let i = new_pos } } else { - let tokens = native_list_append(tokens, make_tok("Int", num_text)) + let tokens = tok_append(tokens, "Int", num_text) let i = new_pos } } else { - let tokens = native_list_append(tokens, make_tok("Int", num_text)) + let tokens = tok_append(tokens, "Int", num_text) let i = new_pos } } else { @@ -811,9 +816,9 @@ fn lex(source: String) -> [Map] { let new_pos: Int = result["pos"] let kw = keyword_kind(word) if kw == "" { - let tokens = native_list_append(tokens, make_tok("Ident", word)) + let tokens = tok_append(tokens, "Ident", word) } else { - let tokens = native_list_append(tokens, make_tok(kw, word)) + let tokens = tok_append(tokens, kw, word) } let i = new_pos } else { @@ -827,15 +832,15 @@ fn lex(source: String) -> [Map] { if c == 61 { // '=' = 61 if peek_c == 61 { - let tokens = native_list_append(tokens, make_tok("EqEq", "==")) + let tokens = tok_append(tokens, "EqEq", "==") let i = i + 2 } else { if peek_c == 62 { // '>' = 62 - let tokens = native_list_append(tokens, make_tok("FatArrow", "=>")) + let tokens = tok_append(tokens, "FatArrow", "=>") let i = i + 2 } else { - let tokens = native_list_append(tokens, make_tok("Eq", "=")) + let tokens = tok_append(tokens, "Eq", "=") let i = i + 1 } } @@ -843,37 +848,37 @@ fn lex(source: String) -> [Map] { if c == 33 { // '!' = 33 if peek_c == 61 { - let tokens = native_list_append(tokens, make_tok("NotEq", "!=")) + let tokens = tok_append(tokens, "NotEq", "!=") let i = i + 2 } else { - let tokens = native_list_append(tokens, make_tok("Not", "!")) + let tokens = tok_append(tokens, "Not", "!") let i = i + 1 } } else { if c == 60 { // '<' = 60 if peek_c == 61 { - let tokens = native_list_append(tokens, make_tok("LtEq", "<=")) + let tokens = tok_append(tokens, "LtEq", "<=") let i = i + 2 } else { - let tokens = native_list_append(tokens, make_tok("Lt", "<")) + let tokens = tok_append(tokens, "Lt", "<") let i = i + 1 } } else { if c == 62 { // '>' = 62 if peek_c == 61 { - let tokens = native_list_append(tokens, make_tok("GtEq", ">=")) + let tokens = tok_append(tokens, "GtEq", ">=") let i = i + 2 } else { - let tokens = native_list_append(tokens, make_tok("Gt", ">")) + let tokens = tok_append(tokens, "Gt", ">") let i = i + 1 } } else { if c == 38 { // '&' = 38 if peek_c == 38 { - let tokens = native_list_append(tokens, make_tok("And", "&&")) + let tokens = tok_append(tokens, "And", "&&") let i = i + 2 } else { let i = i + 1 @@ -882,15 +887,15 @@ fn lex(source: String) -> [Map] { if c == 124 { // '|' = 124 if peek_c == 124 { - let tokens = native_list_append(tokens, make_tok("Or", "||")) + let tokens = tok_append(tokens, "Or", "||") let i = i + 2 } else { if peek_c == 62 { // '>' = 62 - let tokens = native_list_append(tokens, make_tok("PipeOp", "|>")) + let tokens = tok_append(tokens, "PipeOp", "|>") let i = i + 2 } else { - let tokens = native_list_append(tokens, make_tok("Pipe", "|")) + let tokens = tok_append(tokens, "Pipe", "|") let i = i + 1 } } @@ -899,71 +904,71 @@ fn lex(source: String) -> [Map] { // '-' = 45 if peek_c == 62 { // '>' = 62 - let tokens = native_list_append(tokens, make_tok("Arrow", "->")) + let tokens = tok_append(tokens, "Arrow", "->") let i = i + 2 } else { - let tokens = native_list_append(tokens, make_tok("Minus", "-")) + let tokens = tok_append(tokens, "Minus", "-") let i = i + 1 } } else { if c == 58 { // ':' = 58 if peek_c == 58 { - let tokens = native_list_append(tokens, make_tok("ColonColon", "::")) + let tokens = tok_append(tokens, "ColonColon", "::") let i = i + 2 } else { - let tokens = native_list_append(tokens, make_tok("Colon", ":")) + let tokens = tok_append(tokens, "Colon", ":") let i = i + 1 } } else { if c == 43 { // '+' = 43 - let tokens = native_list_append(tokens, make_tok("Plus", "+")) + let tokens = tok_append(tokens, "Plus", "+") let i = i + 1 } else { if c == 42 { // '*' = 42 - let tokens = native_list_append(tokens, make_tok("Star", "*")) + let tokens = tok_append(tokens, "Star", "*") let i = i + 1 } else { if c == 37 { // '%' = 37 - let tokens = native_list_append(tokens, make_tok("Percent", "%")) + let tokens = tok_append(tokens, "Percent", "%") let i = i + 1 } else { if c == 40 { // '(' = 40 - let tokens = native_list_append(tokens, make_tok("LParen", "(")) + let tokens = tok_append(tokens, "LParen", "(") let i = i + 1 } else { if c == 41 { // ')' = 41 - let tokens = native_list_append(tokens, make_tok("RParen", ")")) + let tokens = tok_append(tokens, "RParen", ")") let i = i + 1 } else { if c == 123 { // '{' = 123 - let tokens = native_list_append(tokens, make_tok("LBrace", "{")) + let tokens = tok_append(tokens, "LBrace", "{") let i = i + 1 } else { if c == 125 { // '}' = 125 - let tokens = native_list_append(tokens, make_tok("RBrace", "}")) + let tokens = tok_append(tokens, "RBrace", "}") let i = i + 1 } else { if c == 91 { // '[' = 91 - let tokens = native_list_append(tokens, make_tok("LBracket", "[")) + let tokens = tok_append(tokens, "LBracket", "[") let i = i + 1 } else { if c == 93 { // ']' = 93 - let tokens = native_list_append(tokens, make_tok("RBracket", "]")) + let tokens = tok_append(tokens, "RBracket", "]") let i = i + 1 } else { if c == 44 { // ',' = 44 - let tokens = native_list_append(tokens, make_tok("Comma", ",")) + let tokens = tok_append(tokens, "Comma", ",") let i = i + 1 } else { if c == 46 { @@ -977,30 +982,30 @@ fn lex(source: String) -> [Map] { // '..' prefix if peek2_c == 61 { // '..=' = 46 46 61 - let tokens = native_list_append(tokens, make_tok("DotDotEq", "..=")) + let tokens = tok_append(tokens, "DotDotEq", "..=") let i = i + 3 } else { - let tokens = native_list_append(tokens, make_tok("DotDot", "..")) + let tokens = tok_append(tokens, "DotDot", "..") let i = i + 2 } } else { - let tokens = native_list_append(tokens, make_tok("Dot", ".")) + let tokens = tok_append(tokens, "Dot", ".") let i = i + 1 } } else { if c == 59 { // ';' = 59 - let tokens = native_list_append(tokens, make_tok("Semicolon", ";")) + let tokens = tok_append(tokens, "Semicolon", ";") let i = i + 1 } else { if c == 64 { // '@' = 64 - let tokens = native_list_append(tokens, make_tok("At", "@")) + let tokens = tok_append(tokens, "At", "@") let i = i + 1 } else { if c == 63 { // '?' = 63 - let tokens = native_list_append(tokens, make_tok("QuestionMark", "?")) + let tokens = tok_append(tokens, "QuestionMark", "?") let i = i + 1 } else { // unknown char - skip @@ -1034,6 +1039,6 @@ fn lex(source: String) -> [Map] { } } - let tokens = native_list_append(tokens, make_tok("Eof", "")) + let tokens = tok_append(tokens, "Eof", "") tokens } diff --git a/lang/el-compiler/src/parser.el b/lang/el-compiler/src/parser.el index 62fda8c..f4656f8 100644 --- a/lang/el-compiler/src/parser.el +++ b/lang/el-compiler/src/parser.el @@ -9,25 +9,28 @@ // The token list is passed as a parameter to all parse functions. // native_list_get is used to index into it without cloning. // -// Entry point: fn parse(tokens: [Map]) -> [Map] +// Entry point: fn parse(tokens: [Any]) -> [Map] // -- Token access helpers ------------------------------------------------------ +// Tokens is a flat [Any] list: tokens[2*i] = kind, tokens[2*i+1] = value. +// This avoids one ElMap allocation per token (~112B each), saving ~4MB on large +// programs. All callers use these helpers -- only these three need updating. -fn tok_at(tokens: [Map], pos: Int) -> Map { - native_list_get(tokens, pos) +fn tok_at(tokens: [Any], pos: Int) -> Map { + let kind: String = native_list_get(tokens, pos * 2) + let value: String = native_list_get(tokens, pos * 2 + 1) + { "kind": kind, "value": value } } -fn tok_kind(tokens: [Map], pos: Int) -> String { - let t = native_list_get(tokens, pos) - t["kind"] +fn tok_kind(tokens: [Any], pos: Int) -> String { + native_list_get(tokens, pos * 2) } -fn tok_value(tokens: [Map], pos: Int) -> String { - let t = native_list_get(tokens, pos) - t["value"] +fn tok_value(tokens: [Any], pos: Int) -> String { + native_list_get(tokens, pos * 2 + 1) } -fn expect(tokens: [Map], pos: Int, kind: String) -> Int { +fn expect(tokens: [Any], pos: Int, kind: String) -> Int { let k = tok_kind(tokens, pos) if k == kind { return pos + 1 @@ -46,7 +49,7 @@ fn make_result(node: Map, pos: Int) -> Map { // Skips over a type annotation, returning the new position. // Types can be: Ident, [Type], Map, Type?, Type -fn skip_type(tokens: [Map], pos: Int) -> Int { +fn skip_type(tokens: [Any], pos: Int) -> Int { let k = tok_kind(tokens, pos) // Array type: [Type] if k == "LBracket" { @@ -103,7 +106,7 @@ fn skip_type(tokens: [Map], pos: Int) -> Int { // -- Parameter list ------------------------------------------------------------ // Parses (name: Type, name: Type, ...) - returns { "params": [...], "pos": ... } -fn parse_params(tokens: [Map], pos: Int) -> Map { +fn parse_params(tokens: [Any], pos: Int) -> Map { let p = expect(tokens, pos, "LParen") let params: [Map] = native_list_empty() let running = true @@ -292,7 +295,7 @@ fn is_void_element(name: String) -> Bool { // Collect tokens as text content until we hit Lt, LBrace, Eof, or a // closing-tag marker (Lt Slash). Returns { "text": "...", "pos": p } -fn parse_html_text_tokens(tokens: [Map], pos: Int) -> Map { +fn parse_html_text_tokens(tokens: [Any], pos: Int) -> Map { let parts: [String] = native_list_empty() let p = pos let running = true @@ -322,7 +325,7 @@ fn parse_html_text_tokens(tokens: [Map], pos: Int) -> Map). -fn parse_html_attrs(tokens: [Map], pos: Int) -> Map { +fn parse_html_attrs(tokens: [Any], pos: Int) -> Map { let attrs: [Map] = native_list_empty() let p = pos let running = true @@ -374,7 +377,7 @@ fn parse_html_attrs(tokens: [Map], pos: Int) -> Map { // Parse the children of an HTML element until we see the closing tag // or EOF. Returns { "children": [...], "pos": p_after_closing_tag } -fn parse_html_children(tokens: [Map], pos: Int, parent_tag: String) -> Map { +fn parse_html_children(tokens: [Any], pos: Int, parent_tag: String) -> Map { let children: [Map] = native_list_empty() let p = pos let running = true @@ -514,14 +517,14 @@ fn parse_html_children(tokens: [Map], pos: Int, parent_tag: String) // Parse body of {#each} until {/each}. Mirrors parse_html_children but // stops at the {/each} sentinel rather than a closing element tag. -fn parse_html_each_body(tokens: [Map], pos: Int) -> Map { +fn parse_html_each_body(tokens: [Any], pos: Int) -> Map { parse_html_children(tokens, pos, "__each__") } // Parse a single HTML element: children // or self-closing: // Pos points to the Lt token. -fn parse_html_element(tokens: [Map], pos: Int) -> Map { +fn parse_html_element(tokens: [Any], pos: Int) -> Map { let p = pos // consume < let p = expect(tokens, p, "Lt") @@ -558,7 +561,7 @@ fn parse_html_element(tokens: [Map], pos: Int) -> Map // Entry point for HTML template parsing. // Pos points to Lt (or Lt Not for ). // May parse an optional prefix followed by the root element. -fn parse_html_template(tokens: [Map], pos: Int) -> Map { +fn parse_html_template(tokens: [Any], pos: Int) -> Map { let p = pos // Check for let doctype = false @@ -596,7 +599,7 @@ fn parse_html_template(tokens: [Map], pos: Int) -> Map make_result({ "expr": "HtmlTemplate", "root": root_with_doctype }, p) } -fn parse_primary(tokens: [Map], pos: Int) -> Map { +fn parse_primary(tokens: [Any], pos: Int) -> Map { let k = tok_kind(tokens, pos) let v = tok_value(tokens, pos) @@ -819,7 +822,7 @@ fn parse_primary(tokens: [Map], pos: Int) -> Map { make_result({ "expr": "Nil" }, pos + 1) } -fn parse_if(tokens: [Map], pos: Int) -> Map { +fn parse_if(tokens: [Any], pos: Int) -> Map { let p = expect(tokens, pos, "If") // Suppress Map-literal parsing in the cond so a stray `{` (the start // of the then-block) isn't gobbled as a Map. @@ -855,7 +858,7 @@ fn parse_if(tokens: [Map], pos: Int) -> Map { make_result({ "expr": "If", "cond": cond, "then": then_stmts, "else": else_stmts, "has_else": has_else }, p) } -fn parse_match(tokens: [Map], pos: Int) -> Map { +fn parse_match(tokens: [Any], pos: Int) -> Map { let p = expect(tokens, pos, "Match") let prev_no_block: String = state_get("__no_block_expr") state_set("__no_block_expr", "1") @@ -895,7 +898,7 @@ fn parse_match(tokens: [Map], pos: Int) -> Map { make_result({ "expr": "Match", "subject": subject, "arms": arms }, p) } -fn parse_pattern(tokens: [Map], pos: Int) -> Map { +fn parse_pattern(tokens: [Any], pos: Int) -> Map { let k = tok_kind(tokens, pos) if k == "Ident" { let v = tok_value(tokens, pos) @@ -924,7 +927,7 @@ fn parse_pattern(tokens: [Map], pos: Int) -> Map { make_result({ "pattern": "Wildcard" }, pos + 1) } -fn parse_for_expr(tokens: [Map], pos: Int) -> Map { +fn parse_for_expr(tokens: [Any], pos: Int) -> Map { let p = expect(tokens, pos, "For") let item_name = tok_value(tokens, p) let p = p + 1 @@ -941,7 +944,7 @@ fn parse_for_expr(tokens: [Map], pos: Int) -> Map { make_result({ "expr": "For", "item": item_name, "list": list_expr, "body": body }, p) } -fn parse_block(tokens: [Map], pos: Int) -> Map { +fn parse_block(tokens: [Any], pos: Int) -> Map { let p = expect(tokens, pos, "LBrace") let stmts: [Map] = native_list_empty() let running = true @@ -998,7 +1001,7 @@ fn is_duration_unit(name: String) -> Bool { false } -fn parse_postfix(tokens: [Map], pos: Int) -> Map { +fn parse_postfix(tokens: [Any], pos: Int) -> Map { let r = parse_primary(tokens, pos) let node = r["node"] let p = r["pos"] @@ -1115,7 +1118,7 @@ fn is_binop(kind: String) -> Bool { false } -fn parse_binop(tokens: [Map], pos: Int, min_prec: Int) -> Map { +fn parse_binop(tokens: [Any], pos: Int, min_prec: Int) -> Map { let r = parse_postfix(tokens, pos) let left = r["node"] let p = r["pos"] @@ -1140,13 +1143,13 @@ fn parse_binop(tokens: [Map], pos: Int, min_prec: Int) -> Map], pos: Int) -> Map { +fn parse_expr(tokens: [Any], pos: Int) -> Map { parse_binop(tokens, pos, 1) } // -- Statement parsing --------------------------------------------------------- -fn parse_stmt(tokens: [Map], pos: Int) -> Map { +fn parse_stmt(tokens: [Any], pos: Int) -> Map { let k = tok_kind(tokens, pos) // let binding @@ -1619,8 +1622,9 @@ fn parse_stmt(tokens: [Map], pos: Int) -> Map { // -- Top-level parse ------------------------------------------------------------ -fn parse(tokens: [Map]) -> [Map] { - let total: Int = native_list_len(tokens) +fn parse(tokens: [Any]) -> [Map] { + // Flat list: 2 entries per token, so divide by 2 for token count. + let total: Int = native_list_len(tokens) / 2 let stmts: [Map] = native_list_empty() let pos: Int = 0 let running = true From e587bedf309a11ea5ed31384e5abe8f31cf117c7 Mon Sep 17 00:00:00 2001 From: Will Anderson Date: Tue, 5 May 2026 16:01:05 -0500 Subject: [PATCH 4/4] =?UTF-8?q?round-3-gamma:=20combine=20c=5Fescape=20+?= =?UTF-8?q?=20scan=5Finterp=5Fstring=20batching=20=E2=80=94=20max=20round-?= =?UTF-8?q?3=20savings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Combines two orthogonal optimizations: 1. c_escape batching (from alpha): ASCII runs emitted as str_slice segments instead of one str_char_at string per byte. O(N) allocs → O(K) where K = special chars. 2. scan_interp_string batching (from beta): char dispatch via str_char_code (Int) + clean_start tracking to flush plain runs as str_slice. Eliminates per-char string allocations in the string-literal scanning hot path. Result on web/src/main.el: 14.5MB -> 13.4MB peak RSS (-7.6%). Self-hosting: PASS. --- lang/el-compiler/src/codegen.el | 60 +++++++++++++++---- lang/el-compiler/src/lexer.el | 103 +++++++++++++++++++++----------- 2 files changed, 117 insertions(+), 46 deletions(-) diff --git a/lang/el-compiler/src/codegen.el b/lang/el-compiler/src/codegen.el index 6ac69ce..776aee3 100644 --- a/lang/el-compiler/src/codegen.el +++ b/lang/el-compiler/src/codegen.el @@ -38,10 +38,13 @@ fn is_hex_digit_byte(b: Int) -> Bool { } fn c_escape(s: String) -> String { - // Use index-based byte scanning via str_char_code(s, i) and str_char_at(s, i). - // This avoids native_string_chars + str_join, which corrupts high-byte (>= 0x80) - // characters because list_join's looks_like_string heuristic rejects strings - // whose first byte is >= 0x7F and emits them as decimal pointer values instead. + // Batch ASCII chars using str_slice instead of str_char_at per byte. + // Track clean_start: the beginning of the current run of bytes that need + // no escaping. On each special byte, flush the accumulated clean run via + // str_slice, then append the escape. This reduces parts-list appends from + // O(N) to O(K) where K = number of special bytes << N for normal strings. + // + // Special bytes: '"'=34, '\\'=92, '\n'=10, '\r'=13, '\t'=9, any byte>=128. // // IMPORTANT: after a \xNN hex escape, if the next byte is a hex digit // (0-9, a-f, A-F), we emit `""` to split the C string literal so the C @@ -51,46 +54,75 @@ fn c_escape(s: String) -> String { let total: Int = str_len(s) let parts: [String] = native_list_empty() let i: Int = 0 + let clean_start: Int = 0 let prev_was_hex_escape: Bool = false while i < total { let bval: Int = str_char_code(s, i) - // If the previous token was a \xNN escape and the current byte is a - // hex digit, insert an empty string literal ("") to break the escape. + // Handle the hex-escape split case first: if prev was \xNN and this + // byte is a hex digit, we must flush the clean run and insert "". + // (At this point clean_start == i since the previous special byte + // already reset it, so flush is a no-op unless something is pending.) if prev_was_hex_escape { if is_hex_digit_byte(bval) { + // Flush any accumulated clean bytes before the split marker. + if clean_start < i { + let parts = native_list_append(parts, str_slice(s, clean_start, i)) + } let parts = native_list_append(parts, "\"\"") + let clean_start = i } } let prev_was_hex_escape = false if bval == 34 { - // 34 = '"' + // 34 = '"' — flush clean run, then escape + if clean_start < i { + let parts = native_list_append(parts, str_slice(s, clean_start, i)) + } let parts = native_list_append(parts, "\\\"") + let clean_start = i + 1 } else { if bval == 92 { // 92 = '\\' + if clean_start < i { + let parts = native_list_append(parts, str_slice(s, clean_start, i)) + } let parts = native_list_append(parts, "\\\\") + let clean_start = i + 1 } else { if bval == 10 { // 10 = '\n' + if clean_start < i { + let parts = native_list_append(parts, str_slice(s, clean_start, i)) + } let parts = native_list_append(parts, "\\n") + let clean_start = i + 1 } else { if bval == 13 { // 13 = '\r' + if clean_start < i { + let parts = native_list_append(parts, str_slice(s, clean_start, i)) + } let parts = native_list_append(parts, "\\r") + let clean_start = i + 1 } else { if bval == 9 { // 9 = '\t' + if clean_start < i { + let parts = native_list_append(parts, str_slice(s, clean_start, i)) + } let parts = native_list_append(parts, "\\t") + let clean_start = i + 1 } else { if bval >= 128 { - // Escape non-ASCII bytes (>= 0x80) as \xNN so - // Clang does not misinterpret multi-byte UTF-8 - // sequences in C string literals. + // Non-ASCII: flush, then \xNN + if clean_start < i { + let parts = native_list_append(parts, str_slice(s, clean_start, i)) + } let parts = native_list_append(parts, "\\x" + byte_to_hex2(bval)) let prev_was_hex_escape = true - } else { - let parts = native_list_append(parts, str_char_at(s, i)) + let clean_start = i + 1 } + // else: plain ASCII — extends the current clean run (no append) } } } @@ -98,6 +130,10 @@ fn c_escape(s: String) -> String { } let i = i + 1 } + // Flush the final clean run if any + if clean_start < total { + let parts = native_list_append(parts, str_slice(s, clean_start, total)) + } str_join(parts, "") } diff --git a/lang/el-compiler/src/lexer.el b/lang/el-compiler/src/lexer.el index 06990dd..48cf5fe 100644 --- a/lang/el-compiler/src/lexer.el +++ b/lang/el-compiler/src/lexer.el @@ -555,10 +555,17 @@ fn interp_tokens_append_all(dst: [Any], src: [Any]) -> [Any] { // // Supported escape sequences: \" \n \t \r \\ \$ (literal dollar sign). // Nested quotes inside ${} are not supported; use a variable instead. +// +// Performance: uses str_char_code (Int) for all character dispatch, eliminating +// per-character strdup. Plain runs are batched into str_slice segments instead +// of accumulating single-char strings, reducing list appends from O(N) to O(K) +// where K = number of escape/special chars in the literal. +// Char codes: '\' = 92, '"' = 34, '$' = 36, '{' = 123 fn scan_interp_string(src: String, start: Int, total: Int) -> Map { let i = start let out_tokens: [Any] = native_list_empty() - let cur_part: [String] = native_list_empty() + let cur_parts: [String] = native_list_empty() + let clean_start = start let has_interp = false let need_plus = false let running = true @@ -567,39 +574,55 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { if i >= total { let running = false } else { - let ch: String = str_char_at(src, i) + let c: Int = str_char_code(src, i) - if ch == "\\" { - // Escape sequence + if c == 92 { + // '\\' = 92 — escape sequence: flush clean run, append resolved char + if clean_start < i { + let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i)) + } let next_i = i + 1 if next_i < total { - let next_ch: String = str_char_at(src, next_i) - if next_ch == "$" { - // \$ => literal '$' (escape for interpolation syntax) - let cur_part = native_list_append(cur_part, "$") + let nc: Int = str_char_code(src, next_i) + if nc == 36 { + // '\$' => literal '$' (36 = '$') + let cur_parts = native_list_append(cur_parts, "$") + let clean_start = next_i + 1 let i = next_i + 1 } else { - if next_ch == "\"" { - let cur_part = native_list_append(cur_part, "\"") + if nc == 34 { + // '\"' => literal '"' (34 = '"') + let cur_parts = native_list_append(cur_parts, "\"") + let clean_start = next_i + 1 let i = next_i + 1 } else { - if next_ch == "n" { - let cur_part = native_list_append(cur_part, "\n") + if nc == 110 { + // '\n' (110 = 'n') + let cur_parts = native_list_append(cur_parts, "\n") + let clean_start = next_i + 1 let i = next_i + 1 } else { - if next_ch == "t" { - let cur_part = native_list_append(cur_part, "\t") + if nc == 116 { + // '\t' (116 = 't') + let cur_parts = native_list_append(cur_parts, "\t") + let clean_start = next_i + 1 let i = next_i + 1 } else { - if next_ch == "r" { - let cur_part = native_list_append(cur_part, "\r") + if nc == 114 { + // '\r' (114 = 'r') + let cur_parts = native_list_append(cur_parts, "\r") + let clean_start = next_i + 1 let i = next_i + 1 } else { - if next_ch == "\\" { - let cur_part = native_list_append(cur_part, "\\") + if nc == 92 { + // '\\' (92) + let cur_parts = native_list_append(cur_parts, "\\") + let clean_start = next_i + 1 let i = next_i + 1 } else { - let cur_part = native_list_append(cur_part, next_ch) + // Unknown escape: emit the escaped char verbatim + let cur_parts = native_list_append(cur_parts, str_slice(src, next_i, next_i + 1)) + let clean_start = next_i + 1 let i = next_i + 1 } } @@ -608,29 +631,38 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { } } } else { - let i = i + 1 + let clean_start = next_i + let i = next_i } } else { - if ch == "\"" { - // Closing quote - stop scanning + if c == 34 { + // '"' = 34 — closing quote: flush clean run, stop + if clean_start < i { + let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i)) + } let i = i + 1 + let clean_start = i let running = false } else { - if ch == "$" { - // Check for ${ (start of interpolation) + if c == 36 { + // '$' = 36 — possible interpolation start let next_i = i + 1 let is_interp = false if next_i < total { - let next_ch: String = str_char_at(src, next_i) - if next_ch == "{" { + let nc2: Int = str_char_code(src, next_i) + if nc2 == 123 { + // '{' = 123 let is_interp = true } } if is_interp { // Flush the accumulated literal part (if non-empty) - let part_len: Int = native_list_len(cur_part) + if clean_start < i { + let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i)) + } + let part_len: Int = native_list_len(cur_parts) if part_len > 0 { - let part_text = str_join(cur_part, "") + let part_text = str_join(cur_parts, "") if need_plus { let out_tokens = tok_append(out_tokens, "Plus", "+") } @@ -641,7 +673,7 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { let out_tokens = tok_append(out_tokens, "Str", clean_part) let need_plus = true } - let cur_part = native_list_empty() + let cur_parts = native_list_empty() let has_interp = true // Scan brace-balanced expression source @@ -649,6 +681,7 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { let expr_src: String = brace_result["text"] let new_i: Int = brace_result["pos"] let i = new_i + let clean_start = new_i // Re-lex the expression and inline the tokens. // Wrap in ( ) so that operators inside ${} (e.g. @@ -672,12 +705,11 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { } let need_plus = true } else { - // Plain '$' not followed by '{' - treat as literal - let cur_part = native_list_append(cur_part, "$") + // Plain '$' not followed by '{' - treat as literal, continue clean run let i = i + 1 } } else { - let cur_part = native_list_append(cur_part, ch) + // Plain char — extends clean run, no append needed let i = i + 1 } } @@ -686,8 +718,11 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { } // Flush remaining literal segment and build final token list - let part_text = str_join(cur_part, "") - let part_len: Int = native_list_len(cur_part) + if clean_start < i { + let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i)) + } + let part_len: Int = native_list_len(cur_parts) + let part_text = str_join(cur_parts, "") if has_interp { // Interpolated string: only emit trailing segment if non-empty if part_len > 0 {