round-3-gamma: combine c_escape + scan_interp_string batching — max round-3 savings

Combines two orthogonal optimizations:
1. c_escape batching (from alpha): runs of plain ASCII bytes are emitted as single
   str_slice segments instead of one str_char_at string per byte. Allocations drop
   from O(N) to O(K), where N = input length and K = number of special bytes.

2. scan_interp_string batching (from beta): char dispatch via str_char_code (Int)
   + clean_start tracking to flush plain runs as str_slice. Eliminates per-char
   string allocations in the string-literal scanning hot path.

Result on web/src/main.el: 14.5MB -> 13.4MB peak RSS (-7.6%).
Self-hosting: PASS.
This commit is contained in:
Will Anderson
2026-05-05 16:01:05 -05:00
parent 1eef9928f4
commit e587bedf30
2 changed files with 117 additions and 46 deletions
+48 -12
View File
@@ -38,10 +38,13 @@ fn is_hex_digit_byte(b: Int) -> Bool {
}
fn c_escape(s: String) -> String {
// Use index-based byte scanning via str_char_code(s, i) and str_char_at(s, i).
// This avoids native_string_chars + str_join, which corrupts high-byte (>= 0x80)
// characters because list_join's looks_like_string heuristic rejects strings
// whose first byte is >= 0x7F and emits them as decimal pointer values instead.
// Batch ASCII chars using str_slice instead of str_char_at per byte.
// Track clean_start: the beginning of the current run of bytes that need
// no escaping. On each special byte, flush the accumulated clean run via
// str_slice, then append the escape. This reduces parts-list appends from
// O(N) to O(K) where K = number of special bytes << N for normal strings.
//
// Special bytes: '"'=34, '\\'=92, '\n'=10, '\r'=13, '\t'=9, any byte>=128.
//
// IMPORTANT: after a \xNN hex escape, if the next byte is a hex digit
// (0-9, a-f, A-F), we emit `""` to split the C string literal so the C
@@ -51,46 +54,75 @@ fn c_escape(s: String) -> String {
let total: Int = str_len(s)
let parts: [String] = native_list_empty()
let i: Int = 0
let clean_start: Int = 0
let prev_was_hex_escape: Bool = false
while i < total {
let bval: Int = str_char_code(s, i)
// If the previous token was a \xNN escape and the current byte is a
// hex digit, insert an empty string literal ("") to break the escape.
// Handle the hex-escape split case first: if prev was \xNN and this
// byte is a hex digit, we must flush the clean run and insert "".
// (prev_was_hex_escape is only true on the iteration right after a \xNN
// escape, and that escape already reset clean_start, so clean_start == i
// here and the flush below is a defensive no-op.)
if prev_was_hex_escape {
if is_hex_digit_byte(bval) {
// Flush any accumulated clean bytes before the split marker.
if clean_start < i {
let parts = native_list_append(parts, str_slice(s, clean_start, i))
}
let parts = native_list_append(parts, "\"\"")
let clean_start = i
}
}
let prev_was_hex_escape = false
if bval == 34 {
// 34 = '"'
// 34 = '"' flush clean run, then escape
if clean_start < i {
let parts = native_list_append(parts, str_slice(s, clean_start, i))
}
let parts = native_list_append(parts, "\\\"")
let clean_start = i + 1
} else {
if bval == 92 {
// 92 = '\\'
if clean_start < i {
let parts = native_list_append(parts, str_slice(s, clean_start, i))
}
let parts = native_list_append(parts, "\\\\")
let clean_start = i + 1
} else {
if bval == 10 {
// 10 = '\n'
if clean_start < i {
let parts = native_list_append(parts, str_slice(s, clean_start, i))
}
let parts = native_list_append(parts, "\\n")
let clean_start = i + 1
} else {
if bval == 13 {
// 13 = '\r'
if clean_start < i {
let parts = native_list_append(parts, str_slice(s, clean_start, i))
}
let parts = native_list_append(parts, "\\r")
let clean_start = i + 1
} else {
if bval == 9 {
// 9 = '\t'
if clean_start < i {
let parts = native_list_append(parts, str_slice(s, clean_start, i))
}
let parts = native_list_append(parts, "\\t")
let clean_start = i + 1
} else {
if bval >= 128 {
// Escape non-ASCII bytes (>= 0x80) as \xNN so
// Clang does not misinterpret multi-byte UTF-8
// sequences in C string literals.
// Non-ASCII: flush, then \xNN
if clean_start < i {
let parts = native_list_append(parts, str_slice(s, clean_start, i))
}
let parts = native_list_append(parts, "\\x" + byte_to_hex2(bval))
let prev_was_hex_escape = true
} else {
let parts = native_list_append(parts, str_char_at(s, i))
let clean_start = i + 1
}
// else: plain ASCII extends the current clean run (no append)
}
}
}
@@ -98,6 +130,10 @@ fn c_escape(s: String) -> String {
}
let i = i + 1
}
// Flush the final clean run if any
if clean_start < total {
let parts = native_list_append(parts, str_slice(s, clean_start, total))
}
str_join(parts, "")
}
+69 -34
View File
@@ -555,10 +555,17 @@ fn interp_tokens_append_all(dst: [Any], src: [Any]) -> [Any] {
//
// Supported escape sequences: \" \n \t \r \\ \$ (literal dollar sign).
// Nested quotes inside ${} are not supported; use a variable instead.
//
// Performance: uses str_char_code (Int) for all character dispatch, eliminating
// per-character strdup. Plain runs are batched into str_slice segments instead
// of accumulating single-char strings, reducing list appends from O(N) to O(K)
// where K = number of escape/special chars in the literal.
// Char codes: '\' = 92, '"' = 34, '$' = 36, '{' = 123
fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
let i = start
let out_tokens: [Any] = native_list_empty()
let cur_part: [String] = native_list_empty()
let cur_parts: [String] = native_list_empty()
let clean_start = start
let has_interp = false
let need_plus = false
let running = true
@@ -567,39 +574,55 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
if i >= total {
let running = false
} else {
let ch: String = str_char_at(src, i)
let c: Int = str_char_code(src, i)
if ch == "\\" {
// Escape sequence
if c == 92 {
// '\\' = 92 escape sequence: flush clean run, append resolved char
if clean_start < i {
let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i))
}
let next_i = i + 1
if next_i < total {
let next_ch: String = str_char_at(src, next_i)
if next_ch == "$" {
// \$ => literal '$' (escape for interpolation syntax)
let cur_part = native_list_append(cur_part, "$")
let nc: Int = str_char_code(src, next_i)
if nc == 36 {
// '\$' => literal '$' (36 = '$')
let cur_parts = native_list_append(cur_parts, "$")
let clean_start = next_i + 1
let i = next_i + 1
} else {
if next_ch == "\"" {
let cur_part = native_list_append(cur_part, "\"")
if nc == 34 {
// '\"' => literal '"' (34 = '"')
let cur_parts = native_list_append(cur_parts, "\"")
let clean_start = next_i + 1
let i = next_i + 1
} else {
if next_ch == "n" {
let cur_part = native_list_append(cur_part, "\n")
if nc == 110 {
// '\n' (110 = 'n')
let cur_parts = native_list_append(cur_parts, "\n")
let clean_start = next_i + 1
let i = next_i + 1
} else {
if next_ch == "t" {
let cur_part = native_list_append(cur_part, "\t")
if nc == 116 {
// '\t' (116 = 't')
let cur_parts = native_list_append(cur_parts, "\t")
let clean_start = next_i + 1
let i = next_i + 1
} else {
if next_ch == "r" {
let cur_part = native_list_append(cur_part, "\r")
if nc == 114 {
// '\r' (114 = 'r')
let cur_parts = native_list_append(cur_parts, "\r")
let clean_start = next_i + 1
let i = next_i + 1
} else {
if next_ch == "\\" {
let cur_part = native_list_append(cur_part, "\\")
if nc == 92 {
// '\\' (92)
let cur_parts = native_list_append(cur_parts, "\\")
let clean_start = next_i + 1
let i = next_i + 1
} else {
let cur_part = native_list_append(cur_part, next_ch)
// Unknown escape: emit the escaped char verbatim
let cur_parts = native_list_append(cur_parts, str_slice(src, next_i, next_i + 1))
let clean_start = next_i + 1
let i = next_i + 1
}
}
@@ -608,29 +631,38 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
}
}
} else {
let i = i + 1
let clean_start = next_i
let i = next_i
}
} else {
if ch == "\"" {
// Closing quote - stop scanning
if c == 34 {
// '"' = 34 — closing quote: flush clean run, stop
if clean_start < i {
let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i))
}
let i = i + 1
let clean_start = i
let running = false
} else {
if ch == "$" {
// Check for ${ (start of interpolation)
if c == 36 {
// '$' = 36 possible interpolation start
let next_i = i + 1
let is_interp = false
if next_i < total {
let next_ch: String = str_char_at(src, next_i)
if next_ch == "{" {
let nc2: Int = str_char_code(src, next_i)
if nc2 == 123 {
// '{' = 123
let is_interp = true
}
}
if is_interp {
// Flush the accumulated literal part (if non-empty)
let part_len: Int = native_list_len(cur_part)
if clean_start < i {
let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i))
}
let part_len: Int = native_list_len(cur_parts)
if part_len > 0 {
let part_text = str_join(cur_part, "")
let part_text = str_join(cur_parts, "")
if need_plus {
let out_tokens = tok_append(out_tokens, "Plus", "+")
}
@@ -641,7 +673,7 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
let out_tokens = tok_append(out_tokens, "Str", clean_part)
let need_plus = true
}
let cur_part = native_list_empty()
let cur_parts = native_list_empty()
let has_interp = true
// Scan brace-balanced expression source
@@ -649,6 +681,7 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
let expr_src: String = brace_result["text"]
let new_i: Int = brace_result["pos"]
let i = new_i
let clean_start = new_i
// Re-lex the expression and inline the tokens.
// Wrap in ( ) so that operators inside ${} (e.g.
@@ -672,12 +705,11 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
}
let need_plus = true
} else {
// Plain '$' not followed by '{' - treat as literal
let cur_part = native_list_append(cur_part, "$")
// Plain '$' not followed by '{' - treat as literal, continue clean run
let i = i + 1
}
} else {
let cur_part = native_list_append(cur_part, ch)
// Plain char extends clean run, no append needed
let i = i + 1
}
}
@@ -686,8 +718,11 @@ fn scan_interp_string(src: String, start: Int, total: Int) -> Map<String, Any> {
}
// Flush remaining literal segment and build final token list
let part_text = str_join(cur_part, "")
let part_len: Int = native_list_len(cur_part)
if clean_start < i {
let cur_parts = native_list_append(cur_parts, str_slice(src, clean_start, i))
}
let part_len: Int = native_list_len(cur_parts)
let part_text = str_join(cur_parts, "")
if has_interp {
// Interpolated string: only emit trailing segment if non-empty
if part_len > 0 {