From e587bedf309a11ea5ed31384e5abe8f31cf117c7 Mon Sep 17 00:00:00 2001 From: Will Anderson Date: Tue, 5 May 2026 16:01:05 -0500 Subject: [PATCH] =?UTF-8?q?round-3-gamma:=20combine=20c=5Fescape=20+=20sca?= =?UTF-8?q?n=5Finterp=5Fstring=20batching=20=E2=80=94=20max=20round-3=20sa?= =?UTF-8?q?vings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Combines two orthogonal optimizations: 1. c_escape batching (from alpha): ASCII runs emitted as str_slice segments instead of one str_char_at string per byte. O(N) allocs → O(K) where K = special chars. 2. scan_interp_string batching (from beta): char dispatch via str_char_code (Int) + clean_start tracking to flush plain runs as str_slice. Eliminates per-char string allocations in the string-literal scanning hot path. Result on web/src/main.el: 14.5MB -> 13.4MB peak RSS (-7.6%). Self-hosting: PASS. --- lang/el-compiler/src/codegen.el | 60 +++++++++++++++---- lang/el-compiler/src/lexer.el | 103 +++++++++++++++++++++----------- 2 files changed, 117 insertions(+), 46 deletions(-) diff --git a/lang/el-compiler/src/codegen.el b/lang/el-compiler/src/codegen.el index 6ac69ce..776aee3 100644 --- a/lang/el-compiler/src/codegen.el +++ b/lang/el-compiler/src/codegen.el @@ -38,10 +38,13 @@ fn is_hex_digit_byte(b: Int) -> Bool { } fn c_escape(s: String) -> String { - // Use index-based byte scanning via str_char_code(s, i) and str_char_at(s, i). - // This avoids native_string_chars + str_join, which corrupts high-byte (>= 0x80) - // characters because list_join's looks_like_string heuristic rejects strings - // whose first byte is >= 0x7F and emits them as decimal pointer values instead. + // Batch ASCII chars using str_slice instead of str_char_at per byte. + // Track clean_start: the beginning of the current run of bytes that need + // no escaping. On each special byte, flush the accumulated clean run via + // str_slice, then append the escape. This reduces parts-list appends from + // O(N) to O(K) where K = number of special bytes << N for normal strings. + // + // Special bytes: '"'=34, '\\'=92, '\n'=10, '\r'=13, '\t'=9, any byte>=128. // // IMPORTANT: after a \xNN hex escape, if the next byte is a hex digit // (0-9, a-f, A-F), we emit `""` to split the C string literal so the C @@ -51,46 +54,75 @@ fn c_escape(s: String) -> String { let total: Int = str_len(s) let parts: [String] = native_list_empty() let i: Int = 0 + let clean_start: Int = 0 let prev_was_hex_escape: Bool = false while i < total { let bval: Int = str_char_code(s, i) - // If the previous token was a \xNN escape and the current byte is a - // hex digit, insert an empty string literal ("") to break the escape. + // Handle the hex-escape split case first: if prev was \xNN and this + // byte is a hex digit, we must flush the clean run and insert "". + // (At this point clean_start == i since the previous special byte + // already reset it, so flush is a no-op unless something is pending.) if prev_was_hex_escape { if is_hex_digit_byte(bval) { + // Flush any accumulated clean bytes before the split marker. + if clean_start < i { + let parts = native_list_append(parts, str_slice(s, clean_start, i)) + } let parts = native_list_append(parts, "\"\"") + let clean_start = i } } let prev_was_hex_escape = false if bval == 34 { - // 34 = '"' + // 34 = '"' — flush clean run, then escape + if clean_start < i { + let parts = native_list_append(parts, str_slice(s, clean_start, i)) + } let parts = native_list_append(parts, "\\\"") + let clean_start = i + 1 } else { if bval == 92 { // 92 = '\\' + if clean_start < i { + let parts = native_list_append(parts, str_slice(s, clean_start, i)) + } let parts = native_list_append(parts, "\\\\") + let clean_start = i + 1 } else { if bval == 10 { // 10 = '\n' + if clean_start < i { + let parts = native_list_append(parts, str_slice(s, clean_start, i)) + } let parts = native_list_append(parts, "\\n") + let clean_start = i + 1 } else { if bval == 13 { // 13 = '\r' + if clean_start < i { + let parts = native_list_append(parts, str_slice(s, clean_start, i)) + } let parts = native_list_append(parts, "\\r") + let clean_start = i + 1 } else { if bval == 9 { // 9 = '\t' + if clean_start < i { + let parts = native_list_append(parts, str_slice(s, clean_start, i)) + } let parts = native_list_append(parts, "\\t") + let clean_start = i + 1 } else { if bval >= 128 { - // Escape non-ASCII bytes (>= 0x80) as \xNN so - // Clang does not misinterpret multi-byte UTF-8 - // sequences in C string literals. + // Non-ASCII: flush, then \xNN + if clean_start < i { + let parts = native_list_append(parts, str_slice(s, clean_start, i)) + } let parts = native_list_append(parts, "\\x" + byte_to_hex2(bval)) let prev_was_hex_escape = true - } else { - let parts = native_list_append(parts, str_char_at(s, i)) + let clean_start = i + 1 } + // else: plain ASCII — extends the current clean run (no append) } } } @@ -98,6 +130,10 @@ fn c_escape(s: String) -> String { } let i = i + 1 } + // Flush the final clean run if any + if clean_start < total { + let parts = native_list_append(parts, str_slice(s, clean_start, total)) + } str_join(parts, "") } diff --git a/lang/el-compiler/src/lexer.el b/lang/el-compiler/src/lexer.el index 06990dd..48cf5fe 100644 --- a/lang/el-compiler/src/lexer.el +++ b/lang/el-compiler/src/lexer.el @@ -555,10 +555,17 @@ fn interp_tokens_append_all(dst: [Any], src: [Any]) -> [Any] { // // Supported escape sequences: \" \n \t \r \\ \$ (literal dollar sign). // Nested quotes inside ${} are not supported; use a variable instead. +// +// Performance: uses str_char_code (Int) for all character dispatch, eliminating +// per-character strdup. Plain runs are batched into str_slice segments instead +// of accumulating single-char strings, reducing list appends from O(N) to O(K) +// where K = number of escape/special chars in the literal. +// Char codes: '\' = 92, '"' = 34, '$' = 36, '{' = 123 fn scan_interp_string(src: String, start: Int, total: Int) -> Map { let i = start let out_tokens: [Any] = native_list_empty() - let cur_part: [String] = native_list_empty() + let cur_parts: [String] = native_list_empty() + let clean_start = start let has_interp = false let need_plus = false let running = true @@ -567,39 +574,55 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { if i >= total { let running = false } else { - let ch: String = str_char_at(src, i) + let c: Int = str_char_code(src, i) - if ch == "\\" { - // Escape sequence + if c == 92 { + // '\\' = 92 — escape sequence: flush clean run, append resolved char + if clean_start < i { + let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i)) + } let next_i = i + 1 if next_i < total { - let next_ch: String = str_char_at(src, next_i) - if next_ch == "$" { - // \$ => literal '$' (escape for interpolation syntax) - let cur_part = native_list_append(cur_part, "$") + let nc: Int = str_char_code(src, next_i) + if nc == 36 { + // '\$' => literal '$' (36 = '$') + let cur_parts = native_list_append(cur_parts, "$") + let clean_start = next_i + 1 let i = next_i + 1 } else { - if next_ch == "\"" { - let cur_part = native_list_append(cur_part, "\"") + if nc == 34 { + // '\"' => literal '"' (34 = '"') + let cur_parts = native_list_append(cur_parts, "\"") + let clean_start = next_i + 1 let i = next_i + 1 } else { - if next_ch == "n" { - let cur_part = native_list_append(cur_part, "\n") + if nc == 110 { + // '\n' (110 = 'n') + let cur_parts = native_list_append(cur_parts, "\n") + let clean_start = next_i + 1 let i = next_i + 1 } else { - if next_ch == "t" { - let cur_part = native_list_append(cur_part, "\t") + if nc == 116 { + // '\t' (116 = 't') + let cur_parts = native_list_append(cur_parts, "\t") + let clean_start = next_i + 1 let i = next_i + 1 } else { - if next_ch == "r" { - let cur_part = native_list_append(cur_part, "\r") + if nc == 114 { + // '\r' (114 = 'r') + let cur_parts = native_list_append(cur_parts, "\r") + let clean_start = next_i + 1 let i = next_i + 1 } else { - if next_ch == "\\" { - let cur_part = native_list_append(cur_part, "\\") + if nc == 92 { + // '\\' (92) + let cur_parts = native_list_append(cur_parts, "\\") + let clean_start = next_i + 1 let i = next_i + 1 } else { - let cur_part = native_list_append(cur_part, next_ch) + // Unknown escape: emit the escaped char verbatim + let cur_parts = native_list_append(cur_parts, str_slice(src, next_i, next_i + 1)) + let clean_start = next_i + 1 let i = next_i + 1 } } @@ -608,29 +631,38 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { } } } else { - let i = i + 1 + let clean_start = next_i + let i = next_i } } else { - if ch == "\"" { - // Closing quote - stop scanning + if c == 34 { + // '"' = 34 — closing quote: flush clean run, stop + if clean_start < i { + let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i)) + } let i = i + 1 + let clean_start = i let running = false } else { - if ch == "$" { - // Check for ${ (start of interpolation) + if c == 36 { + // '$' = 36 — possible interpolation start let next_i = i + 1 let is_interp = false if next_i < total { - let next_ch: String = str_char_at(src, next_i) - if next_ch == "{" { + let nc2: Int = str_char_code(src, next_i) + if nc2 == 123 { + // '{' = 123 let is_interp = true } } if is_interp { // Flush the accumulated literal part (if non-empty) - let part_len: Int = native_list_len(cur_part) + if clean_start < i { + let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i)) + } + let part_len: Int = native_list_len(cur_parts) if part_len > 0 { - let part_text = str_join(cur_part, "") + let part_text = str_join(cur_parts, "") if need_plus { let out_tokens = tok_append(out_tokens, "Plus", "+") } @@ -641,7 +673,7 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { let out_tokens = tok_append(out_tokens, "Str", clean_part) let need_plus = true } - let cur_part = native_list_empty() + let cur_parts = native_list_empty() let has_interp = true // Scan brace-balanced expression source @@ -649,6 +681,7 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { let expr_src: String = brace_result["text"] let new_i: Int = brace_result["pos"] let i = new_i + let clean_start = new_i // Re-lex the expression and inline the tokens. // Wrap in ( ) so that operators inside ${} (e.g. @@ -672,12 +705,11 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { } let need_plus = true } else { - // Plain '$' not followed by '{' - treat as literal - let cur_part = native_list_append(cur_part, "$") + // Plain '$' not followed by '{' - treat as literal, continue clean run let i = i + 1 } } else { - let cur_part = native_list_append(cur_part, ch) + // Plain char — extends clean run, no append needed let i = i + 1 } } @@ -686,8 +718,11 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map { } // Flush remaining literal segment and build final token list - let part_text = str_join(cur_part, "") - let part_len: Int = native_list_len(cur_part) + if clean_start < i { + let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i)) + } + let part_len: Int = native_list_len(cur_parts) + let part_text = str_join(cur_parts, "") if has_interp { // Interpolated string: only emit trailing segment if non-empty if part_len > 0 {