From e587bedf309a11ea5ed31384e5abe8f31cf117c7 Mon Sep 17 00:00:00 2001
From: Will Anderson <will@neuralplatform.ai>
Date: Tue, 5 May 2026 16:01:05 -0500
Subject: [PATCH] =?UTF-8?q?round-3-gamma:=20combine=20c=5Fescape=20+=20sca?=
 =?UTF-8?q?n=5Finterp=5Fstring=20batching=20=E2=80=94=20max=20round-3=20sa?=
 =?UTF-8?q?vings?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Combines two orthogonal optimizations:
1. c_escape batching (from alpha): runs of bytes that need no escaping are emitted
   as one str_slice segment each, instead of one str_char_at string per byte.
   O(N) allocs → O(K), where K = number of bytes that need escaping.

2. scan_interp_string batching (from beta): char dispatch via str_char_code (Int)
   + clean_start tracking to flush plain runs as str_slice. Eliminates per-char
   string allocations in the string-literal scanning hot path.

Result on web/src/main.el: 14.5MB -> 13.4MB peak RSS (-7.6%).
Self-hosting: PASS.
---
 lang/el-compiler/src/codegen.el |  60 +++++++++++++++----
 lang/el-compiler/src/lexer.el   | 103 +++++++++++++++++++++-----------
 2 files changed, 117 insertions(+), 46 deletions(-)

diff --git a/lang/el-compiler/src/codegen.el b/lang/el-compiler/src/codegen.el
index 6ac69ce..776aee3 100644
--- a/lang/el-compiler/src/codegen.el
+++ b/lang/el-compiler/src/codegen.el
@@ -38,10 +38,13 @@ fn is_hex_digit_byte(b: Int) -> Bool {
 }
 
 fn c_escape(s: String) -> String {
-    // Use index-based byte scanning via str_char_code(s, i) and str_char_at(s, i).
-    // This avoids native_string_chars + str_join, which corrupts high-byte (>= 0x80)
-    // characters because list_join's looks_like_string heuristic rejects strings
-    // whose first byte is >= 0x7F and emits them as decimal pointer values instead.
+    // Batch ASCII chars using str_slice instead of str_char_at per byte.
+    // Track clean_start: the beginning of the current run of bytes that need
+    // no escaping. On each special byte, flush the accumulated clean run via
+    // str_slice, then append the escape. This reduces parts-list appends from
+    // O(N) to O(K), where K = number of special bytes (K << N for normal strings).
+    //
+    // Special bytes: '"'=34, '\\'=92, '\n'=10, '\r'=13, '\t'=9, any byte>=128.
     //
     // IMPORTANT: after a \xNN hex escape, if the next byte is a hex digit
     // (0-9, a-f, A-F), we emit `""` to split the C string literal so the C
@@ -51,46 +54,75 @@ fn c_escape(s: String) -> String {
     let total: Int = str_len(s)
     let parts: [String] = native_list_empty()
     let i: Int = 0
+    let clean_start: Int = 0
     let prev_was_hex_escape: Bool = false
     while i < total {
         let bval: Int = str_char_code(s, i)
-        // If the previous token was a \xNN escape and the current byte is a
-        // hex digit, insert an empty string literal ("") to break the escape.
+        // Handle the hex-escape split case first: if the previous byte was
+        // emitted as \xNN and this byte is a hex digit, insert "" before it.
+        // (The \xNN branch already set clean_start to this index, so no clean
+        // run is pending here; the flush below is purely defensive.)
         if prev_was_hex_escape {
             if is_hex_digit_byte(bval) {
+                // Flush any accumulated clean bytes before the split marker.
+                if clean_start < i {
+                    let parts = native_list_append(parts, str_slice(s, clean_start, i))
+                }
                 let parts = native_list_append(parts, "\"\"")
+                let clean_start = i
             }
         }
         let prev_was_hex_escape = false
         if bval == 34 {
-            // 34 = '"'
+            // 34 = '"' — flush clean run, then escape
+            if clean_start < i {
+                let parts = native_list_append(parts, str_slice(s, clean_start, i))
+            }
             let parts = native_list_append(parts, "\\\"")
+            let clean_start = i + 1
         } else {
             if bval == 92 {
                 // 92 = '\\'
+                if clean_start < i {
+                    let parts = native_list_append(parts, str_slice(s, clean_start, i))
+                }
                 let parts = native_list_append(parts, "\\\\")
+                let clean_start = i + 1
             } else {
                 if bval == 10 {
                     // 10 = '\n'
+                    if clean_start < i {
+                        let parts = native_list_append(parts, str_slice(s, clean_start, i))
+                    }
                     let parts = native_list_append(parts, "\\n")
+                    let clean_start = i + 1
                 } else {
                     if bval == 13 {
                         // 13 = '\r'
+                        if clean_start < i {
+                            let parts = native_list_append(parts, str_slice(s, clean_start, i))
+                        }
                         let parts = native_list_append(parts, "\\r")
+                        let clean_start = i + 1
                     } else {
                         if bval == 9 {
                             // 9 = '\t'
+                            if clean_start < i {
+                                let parts = native_list_append(parts, str_slice(s, clean_start, i))
+                            }
                             let parts = native_list_append(parts, "\\t")
+                            let clean_start = i + 1
                         } else {
                             if bval >= 128 {
-                                // Escape non-ASCII bytes (>= 0x80) as \xNN so
-                                // Clang does not misinterpret multi-byte UTF-8
-                                // sequences in C string literals.
+                                // Non-ASCII: flush, then \xNN
+                                if clean_start < i {
+                                    let parts = native_list_append(parts, str_slice(s, clean_start, i))
+                                }
                                 let parts = native_list_append(parts, "\\x" + byte_to_hex2(bval))
                                 let prev_was_hex_escape = true
-                            } else {
-                                let parts = native_list_append(parts, str_char_at(s, i))
+                                let clean_start = i + 1
                             }
+                            // else: plain ASCII — extends the current clean run (no append)
                         }
                     }
                 }
@@ -98,6 +130,10 @@ fn c_escape(s: String) -> String {
         }
         let i = i + 1
     }
+    // Flush the final clean run if any
+    if clean_start < total {
+        let parts = native_list_append(parts, str_slice(s, clean_start, total))
+    }
     str_join(parts, "")
 }
 
diff --git a/lang/el-compiler/src/lexer.el b/lang/el-compiler/src/lexer.el
index 06990dd..48cf5fe 100644
--- a/lang/el-compiler/src/lexer.el
+++ b/lang/el-compiler/src/lexer.el
@@ -555,10 +555,17 @@ fn interp_tokens_append_all(dst: [Any], src: [Any]) -> [Any] {
 //
 // Supported escape sequences: \" \n \t \r \\ \$ (literal dollar sign).
 // Nested quotes inside ${} are not supported; use a variable instead.
+//
+// Performance: uses str_char_code (Int) for all character dispatch, eliminating
+// per-character strdup. Plain runs are batched into str_slice segments instead
+// of accumulating single-char strings, reducing list appends from O(N) to O(K)
+// where K = number of escape/special chars in the literal.
+// Char codes: '\' = 92, '"' = 34, '$' = 36, '{' = 123
 fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
     let i = start
     let out_tokens: [Any] = native_list_empty()
-    let cur_part: [String] = native_list_empty()
+    let cur_parts: [String] = native_list_empty()
+    let clean_start = start
     let has_interp = false
     let need_plus = false
     let running = true
@@ -567,39 +574,55 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
         if i >= total {
             let running = false
         } else {
-            let ch: String = str_char_at(src, i)
+            let c: Int = str_char_code(src, i)
 
-            if ch == "\\" {
-                // Escape sequence
+            if c == 92 {
+                // '\\' = 92 — escape sequence: flush clean run, append resolved char
+                if clean_start < i {
+                    let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i))
+                }
                 let next_i = i + 1
                 if next_i < total {
-                    let next_ch: String = str_char_at(src, next_i)
-                    if next_ch == "$" {
-                        // \$ => literal '$' (escape for interpolation syntax)
-                        let cur_part = native_list_append(cur_part, "$")
+                    let nc: Int = str_char_code(src, next_i)
+                    if nc == 36 {
+                        // '\$' => literal '$'  (36 = '$')
+                        let cur_parts = native_list_append(cur_parts, "$")
+                        let clean_start = next_i + 1
                         let i = next_i + 1
                     } else {
-                        if next_ch == "\"" {
-                            let cur_part = native_list_append(cur_part, "\"")
+                        if nc == 34 {
+                            // '\"' => literal '"'  (34 = '"')
+                            let cur_parts = native_list_append(cur_parts, "\"")
+                            let clean_start = next_i + 1
                             let i = next_i + 1
                         } else {
-                            if next_ch == "n" {
-                                let cur_part = native_list_append(cur_part, "\n")
+                            if nc == 110 {
+                                // '\n' (110 = 'n')
+                                let cur_parts = native_list_append(cur_parts, "\n")
+                                let clean_start = next_i + 1
                                 let i = next_i + 1
                             } else {
-                                if next_ch == "t" {
-                                    let cur_part = native_list_append(cur_part, "\t")
+                                if nc == 116 {
+                                    // '\t' (116 = 't')
+                                    let cur_parts = native_list_append(cur_parts, "\t")
+                                    let clean_start = next_i + 1
                                     let i = next_i + 1
                                 } else {
-                                    if next_ch == "r" {
-                                        let cur_part = native_list_append(cur_part, "\r")
+                                    if nc == 114 {
+                                        // '\r' (114 = 'r')
+                                        let cur_parts = native_list_append(cur_parts, "\r")
+                                        let clean_start = next_i + 1
                                         let i = next_i + 1
                                     } else {
-                                        if next_ch == "\\" {
-                                            let cur_part = native_list_append(cur_part, "\\")
+                                        if nc == 92 {
+                                            // '\\' (92)
+                                            let cur_parts = native_list_append(cur_parts, "\\")
+                                            let clean_start = next_i + 1
                                             let i = next_i + 1
                                         } else {
-                                            let cur_part = native_list_append(cur_part, next_ch)
+                                            // Unknown escape: emit the escaped char verbatim
+                                            let cur_parts = native_list_append(cur_parts, str_slice(src, next_i, next_i + 1))
+                                            let clean_start = next_i + 1
                                             let i = next_i + 1
                                         }
                                     }
@@ -608,29 +631,38 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
                         }
                     }
                 } else {
-                    let i = i + 1
+                    let clean_start = next_i
+                    let i = next_i
                 }
             } else {
-                if ch == "\"" {
-                    // Closing quote - stop scanning
+                if c == 34 {
+                    // '"' = 34 — closing quote: flush clean run, stop
+                    if clean_start < i {
+                        let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i))
+                    }
                     let i = i + 1
+                    let clean_start = i
                     let running = false
                 } else {
-                    if ch == "$" {
-                        // Check for ${ (start of interpolation)
+                    if c == 36 {
+                        // '$' = 36 — possible interpolation start
                         let next_i = i + 1
                         let is_interp = false
                         if next_i < total {
-                            let next_ch: String = str_char_at(src, next_i)
-                            if next_ch == "{" {
+                            let nc2: Int = str_char_code(src, next_i)
+                            if nc2 == 123 {
+                                // '{' = 123
                                 let is_interp = true
                             }
                         }
                         if is_interp {
                             // Flush the accumulated literal part (if non-empty)
-                            let part_len: Int = native_list_len(cur_part)
+                            if clean_start < i {
+                                let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i))
+                            }
+                            let part_len: Int = native_list_len(cur_parts)
                             if part_len > 0 {
-                                let part_text = str_join(cur_part, "")
+                                let part_text = str_join(cur_parts, "")
                                 if need_plus {
                                     let out_tokens = tok_append(out_tokens, "Plus", "+")
                                 }
@@ -641,7 +673,7 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
                                 let out_tokens = tok_append(out_tokens, "Str", clean_part)
                                 let need_plus = true
                             }
-                            let cur_part = native_list_empty()
+                            let cur_parts = native_list_empty()
                             let has_interp = true
 
                             // Scan brace-balanced expression source
@@ -649,6 +681,7 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
                             let expr_src: String = brace_result["text"]
                             let new_i: Int = brace_result["pos"]
                             let i = new_i
+                            let clean_start = new_i
 
                             // Re-lex the expression and inline the tokens.
                             // Wrap in ( ) so that operators inside ${} (e.g.
@@ -672,12 +705,11 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
                             }
                             let need_plus = true
                         } else {
-                            // Plain '$' not followed by '{' - treat as literal
-                            let cur_part = native_list_append(cur_part, "$")
+                            // Plain '$' not followed by '{' - treat as literal, continue clean run
                             let i = i + 1
                         }
                     } else {
-                        let cur_part = native_list_append(cur_part, ch)
+                        // Plain char — extends clean run, no append needed
                         let i = i + 1
                     }
                 }
@@ -686,8 +718,11 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
     }
 
     // Flush remaining literal segment and build final token list
-    let part_text = str_join(cur_part, "")
-    let part_len: Int = native_list_len(cur_part)
+    if clean_start < i {
+        let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i))
+    }
+    let part_len: Int = native_list_len(cur_parts)
+    let part_text = str_join(cur_parts, "")
     if has_interp {
         // Interpolated string: only emit trailing segment if non-empty
         if part_len > 0 {