round-3-gamma: combine c_escape + scan_interp_string batching — max round-3 savings

Combines two orthogonal optimizations: (1) c_escape batching (from alpha): runs of plain ASCII bytes are emitted as a single str_slice segment instead of one str_char_at string per byte, reducing allocations from O(N) to O(K), where N is the string length in bytes and K is the number of bytes that need escaping. (2) scan_interp_string batching (from beta): characters are dispatched on their str_char_code (Int) value, and clean_start tracking flushes plain runs as str_slice segments, eliminating per-character string allocations in the string-literal scanning hot path. Result on web/src/main.el: peak RSS 14.5MB -> 13.4MB (-7.6%). Self-hosting: PASS.
2026-05-05 16:01:05 -05:00
parent 1eef9928f4
commit e587bedf30
2 changed files with 117 additions and 46 deletions
@@ -38,10 +38,13 @@ fn is_hex_digit_byte(b: Int) -> Bool {
 }

 fn c_escape(s: String) -> String {
-    // Use index-based byte scanning via str_char_code(s, i) and str_char_at(s, i).
-    // This avoids native_string_chars + str_join, which corrupts high-byte (>= 0x80)
-    // characters because list_join's looks_like_string heuristic rejects strings
-    // whose first byte is >= 0x7F and emits them as decimal pointer values instead.
+    // Batch ASCII chars using str_slice instead of str_char_at per byte.
+    // Track clean_start: the beginning of the current run of bytes that need
+    // no escaping. On each special byte, flush the accumulated clean run via
+    // str_slice, then append the escape. This reduces parts-list appends from
+    // O(N) to O(K) where K = number of special bytes << N for normal strings.
+    //
+    // Special bytes: '"'=34, '\\'=92, '\n'=10, '\r'=13, '\t'=9, any byte>=128.
    //
    // IMPORTANT: after a \xNN hex escape, if the next byte is a hex digit
    // (0-9, a-f, A-F), we emit `""` to split the C string literal so the C
@@ -51,46 +54,75 @@ fn c_escape(s: String) -> String {
    let total: Int = str_len(s)
    let parts: [String] = native_list_empty()
    let i: Int = 0
+    let clean_start: Int = 0
    let prev_was_hex_escape: Bool = false
    while i < total {
        let bval: Int = str_char_code(s, i)
-        // If the previous token was a \xNN escape and the current byte is a
-        // hex digit, insert an empty string literal ("") to break the escape.
+        // Handle the hex-escape split case first: if prev was \xNN and this
+        // byte is a hex digit, we must flush the clean run and insert "".
+        // (At this point clean_start == i since the previous special byte
+        // already reset it, so flush is a no-op unless something is pending.)
        if prev_was_hex_escape {
            if is_hex_digit_byte(bval) {
+                // Flush any accumulated clean bytes before the split marker.
+                if clean_start < i {
+                    let parts = native_list_append(parts, str_slice(s, clean_start, i))
+                }
                let parts = native_list_append(parts, "\"\"")
+                let clean_start = i
            }
        }
        let prev_was_hex_escape = false
        if bval == 34 {
-            // 34 = '"'
+            // 34 = '"' — flush clean run, then escape
+            if clean_start < i {
+                let parts = native_list_append(parts, str_slice(s, clean_start, i))
+            }
            let parts = native_list_append(parts, "\\\"")
+            let clean_start = i + 1
        } else {
            if bval == 92 {
                // 92 = '\\'
+                if clean_start < i {
+                    let parts = native_list_append(parts, str_slice(s, clean_start, i))
+                }
                let parts = native_list_append(parts, "\\\\")
+                let clean_start = i + 1
            } else {
                if bval == 10 {
                    // 10 = '\n'
+                    if clean_start < i {
+                        let parts = native_list_append(parts, str_slice(s, clean_start, i))
+                    }
                    let parts = native_list_append(parts, "\\n")
+                    let clean_start = i + 1
                } else {
                    if bval == 13 {
                        // 13 = '\r'
+                        if clean_start < i {
+                            let parts = native_list_append(parts, str_slice(s, clean_start, i))
+                        }
                        let parts = native_list_append(parts, "\\r")
+                        let clean_start = i + 1
                    } else {
                        if bval == 9 {
                            // 9 = '\t'
+                            if clean_start < i {
+                                let parts = native_list_append(parts, str_slice(s, clean_start, i))
+                            }
                            let parts = native_list_append(parts, "\\t")
+                            let clean_start = i + 1
                        } else {
                            if bval >= 128 {
-                                // Escape non-ASCII bytes (>= 0x80) as \xNN so
-                                // Clang does not misinterpret multi-byte UTF-8
-                                // sequences in C string literals.
+                                // Non-ASCII: flush, then \xNN
+                                if clean_start < i {
+                                    let parts = native_list_append(parts, str_slice(s, clean_start, i))
+                                }
                                let parts = native_list_append(parts, "\\x" + byte_to_hex2(bval))
                                let prev_was_hex_escape = true
-                            } else {
-                                let parts = native_list_append(parts, str_char_at(s, i))
+                                let clean_start = i + 1
                            }
+                            // else: plain ASCII — extends the current clean run (no append)
                        }
                    }
                }
@@ -98,6 +130,10 @@ fn c_escape(s: String) -> String {
        }
        let i = i + 1
    }
+    // Flush the final clean run if any
+    if clean_start < total {
+        let parts = native_list_append(parts, str_slice(s, clean_start, total))
+    }
    str_join(parts, "")
 }

@@ -555,10 +555,17 @@ fn interp_tokens_append_all(dst: [Any], src: [Any]) -> [Any] {
 //
 // Supported escape sequences: \" \n \t \r \\ \$ (literal dollar sign).
 // Nested quotes inside ${} are not supported; use a variable instead.
+//
+// Performance: uses str_char_code (Int) for all character dispatch, eliminating
+// per-character strdup. Plain runs are batched into str_slice segments instead
+// of accumulating single-char strings, reducing list appends from O(N) to O(K)
+// where K = number of escape/special chars in the literal.
+// Char codes: '\' = 92, '"' = 34, '$' = 36, '{' = 123, 'n' = 110, 't' = 116, 'r' = 114
 fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
    let i = start
    let out_tokens: [Any] = native_list_empty()
-    let cur_part: [String] = native_list_empty()
+    let cur_parts: [String] = native_list_empty()
+    let clean_start = start
    let has_interp = false
    let need_plus = false
    let running = true
@@ -567,39 +574,55 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
        if i >= total {
            let running = false
        } else {
-            let ch: String = str_char_at(src, i)
+            let c: Int = str_char_code(src, i)

-            if ch == "\\" {
-                // Escape sequence
+            if c == 92 {
+                // '\\' = 92 — escape sequence: flush clean run, append resolved char
+                if clean_start < i {
+                    let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i))
+                }
                let next_i = i + 1
                if next_i < total {
-                    let next_ch: String = str_char_at(src, next_i)
-                    if next_ch == "$" {
-                        // \$ => literal '$' (escape for interpolation syntax)
-                        let cur_part = native_list_append(cur_part, "$")
+                    let nc: Int = str_char_code(src, next_i)
+                    if nc == 36 {
+                        // '\$' => literal '$'  (36 = '$')
+                        let cur_parts = native_list_append(cur_parts, "$")
+                        let clean_start = next_i + 1
                        let i = next_i + 1
                    } else {
-                        if next_ch == "\"" {
-                            let cur_part = native_list_append(cur_part, "\"")
+                        if nc == 34 {
+                            // '\"' => literal '"'  (34 = '"')
+                            let cur_parts = native_list_append(cur_parts, "\"")
+                            let clean_start = next_i + 1
                            let i = next_i + 1
                        } else {
-                            if next_ch == "n" {
-                                let cur_part = native_list_append(cur_part, "\n")
+                            if nc == 110 {
+                                // '\n' (110 = 'n')
+                                let cur_parts = native_list_append(cur_parts, "\n")
+                                let clean_start = next_i + 1
                                let i = next_i + 1
                            } else {
-                                if next_ch == "t" {
-                                    let cur_part = native_list_append(cur_part, "\t")
+                                if nc == 116 {
+                                    // '\t' (116 = 't')
+                                    let cur_parts = native_list_append(cur_parts, "\t")
+                                    let clean_start = next_i + 1
                                    let i = next_i + 1
                                } else {
-                                    if next_ch == "r" {
-                                        let cur_part = native_list_append(cur_part, "\r")
+                                    if nc == 114 {
+                                        // '\r' (114 = 'r')
+                                        let cur_parts = native_list_append(cur_parts, "\r")
+                                        let clean_start = next_i + 1
                                        let i = next_i + 1
                                    } else {
-                                        if next_ch == "\\" {
-                                            let cur_part = native_list_append(cur_part, "\\")
+                                        if nc == 92 {
+                                            // '\\' (92)
+                                            let cur_parts = native_list_append(cur_parts, "\\")
+                                            let clean_start = next_i + 1
                                            let i = next_i + 1
                                        } else {
-                                            let cur_part = native_list_append(cur_part, next_ch)
+                                            // Unknown escape: emit the escaped char verbatim
+                                            let cur_parts = native_list_append(cur_parts, str_slice(src, next_i, next_i + 1))
+                                            let clean_start = next_i + 1
                                            let i = next_i + 1
                                        }
                                    }
@@ -608,29 +631,38 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
                        }
                    }
                } else {
-                    let i = i + 1
+                    let clean_start = next_i
+                    let i = next_i
                }
            } else {
-                if ch == "\"" {
-                    // Closing quote - stop scanning
+                if c == 34 {
+                    // '"' = 34 — closing quote: flush clean run, stop
+                    if clean_start < i {
+                        let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i))
+                    }
                    let i = i + 1
+                    let clean_start = i
                    let running = false
                } else {
-                    if ch == "$" {
-                        // Check for ${ (start of interpolation)
+                    if c == 36 {
+                        // '$' = 36 — possible interpolation start
                        let next_i = i + 1
                        let is_interp = false
                        if next_i < total {
-                            let next_ch: String = str_char_at(src, next_i)
-                            if next_ch == "{" {
+                            let nc2: Int = str_char_code(src, next_i)
+                            if nc2 == 123 {
+                                // '{' = 123
                                let is_interp = true
                            }
                        }
                        if is_interp {
                            // Flush the accumulated literal part (if non-empty)
-                            let part_len: Int = native_list_len(cur_part)
+                            if clean_start < i {
+                                let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i))
+                            }
+                            let part_len: Int = native_list_len(cur_parts)
                            if part_len > 0 {
-                                let part_text = str_join(cur_part, "")
+                                let part_text = str_join(cur_parts, "")
                                if need_plus {
                                    let out_tokens = tok_append(out_tokens, "Plus", "+")
                                }
@@ -641,7 +673,7 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
                                let out_tokens = tok_append(out_tokens, "Str", clean_part)
                                let need_plus = true
                            }
-                            let cur_part = native_list_empty()
+                            let cur_parts = native_list_empty()
                            let has_interp = true

                            // Scan brace-balanced expression source
@@ -649,6 +681,7 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
                            let expr_src: String = brace_result["text"]
                            let new_i: Int = brace_result["pos"]
                            let i = new_i
+                            let clean_start = new_i

                            // Re-lex the expression and inline the tokens.
                            // Wrap in ( ) so that operators inside ${} (e.g.
@@ -672,12 +705,11 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
                            }
                            let need_plus = true
                        } else {
-                            // Plain '$' not followed by '{' - treat as literal
-                            let cur_part = native_list_append(cur_part, "$")
+                            // Plain '$' not followed by '{' - treat as literal, continue clean run
                            let i = i + 1
                        }
                    } else {
-                        let cur_part = native_list_append(cur_part, ch)
+                        // Plain char — extends clean run, no append needed
                        let i = i + 1
                    }
                }
@@ -686,8 +718,11 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
    }

    // Flush remaining literal segment and build final token list
-    let part_text = str_join(cur_part, "")
-    let part_len: Int = native_list_len(cur_part)
+    if clean_start < i {
+        let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i))
+    }
+    let part_len: Int = native_list_len(cur_parts)
+    let part_text = str_join(cur_parts, "")
    if has_interp {
        // Interpolated string: only emit trailing segment if non-empty
        if part_len > 0 {