merge compiler/string-interp — string interpolation via lexer desugaring

This commit is contained in:
Will Anderson
2026-05-03 15:52:21 -05:00
+241 -13
View File
@@ -460,6 +460,238 @@ fn scan_string(chars: [String], start: Int, total: Int) -> Map<String, Any> {
{ "text": str_join(parts, ""), "pos": i }
}
// -- String interpolation ------------------------------------------------------
//
// scan_interp_brace - collect the source text of one `${ ... }` expression.
// `start` is the index of the first char after `${`. Every `{` raises the
// nesting count and every `}` lowers it, so inner braces (e.g. fn calls, map
// literals) stay part of the collected text.
// Returns { "text": inner_source, "pos": index just past the matching `}` }.
// If the input runs out before the matching `}` is found, "text" holds
// everything collected so far and "pos" is the index where scanning stopped.
fn scan_interp_brace(chars: [String], start: Int, total: Int) -> Map<String, Any> {
    let cursor = start
    let collected: [String] = native_list_empty()
    let open_count = 1
    let scanning = true
    while scanning {
        if cursor < total {
            let current: String = native_list_get(chars, cursor)
            // Every char is kept except the `}` that closes the interpolation.
            let keep = true
            if current == "{" {
                let open_count = open_count + 1
            }
            if current == "}" {
                let open_count = open_count - 1
                if open_count <= 0 {
                    // Matching close brace: consume it, but leave it out of the text.
                    let keep = false
                    let scanning = false
                }
            }
            if keep {
                let collected = native_list_append(collected, current)
            }
            let cursor = cursor + 1
        } else {
            // Ran off the end of the input without finding the closing brace.
            let scanning = false
        }
    }
    { "text": str_join(collected, ""), "pos": cursor }
}
// interp_tokens_append_all - append the tokens of `src` onto `dst`, in order,
// stopping at the first token whose "kind" is "Eof" (the sentinel the original
// comment says lex() always appends) or at the end of `src`, whichever comes
// first. The Eof token itself is never copied. Returns the extended list.
fn interp_tokens_append_all(dst: [Map<String, Any>], src: [Map<String, Any>]) -> [Map<String, Any>] {
    let count: Int = native_list_len(src)
    let merged = dst
    let idx = 0
    let copying = true
    while copying {
        if idx >= count {
            let copying = false
        } else {
            let current: Map<String, Any> = native_list_get(src, idx)
            let kind: String = current["kind"]
            if kind == "Eof" {
                // Sentinel reached: nothing after it is copied.
                let copying = false
            } else {
                let merged = native_list_append(merged, current)
                let idx = idx + 1
            }
        }
    }
    merged
}
// scan_interp_string - scan a string literal that may contain ${expr}
// interpolations. `start` is the index of the first char AFTER the opening `"`.
// Returns { "tokens": [token list to inject], "pos": i_after_close_quote }.
//
// For a plain string (no ${}) this emits a single Str token. For an
// interpolated string it emits a flat sequence of tokens equivalent to the
// string-concat expression, for example:
//
// "hello ${name}!"
// => Str("hello ") Plus LParen <tokens for name> RParen Plus Str("!")
//
// Empty literal segments between adjacent ${ } blocks are omitted; an empty
// interpolation `${}` contributes Str(""). Every literal segment is passed
// through strip_code_comments when looks_like_code reports true for it.
// The resulting token stream is meant to be consumed by the existing
// parse_binop / parse_primary path in the parser with no parser changes.
//
// Supported escape sequences: \" \n \t \r \\ \$ (literal dollar sign). Any
// other escaped char is kept without its backslash, and a backslash that is
// the last char of the input is dropped.
// If the input ends before a closing `"`, scanning stops and whatever was
// collected so far is returned.
// Nested quotes inside ${} are not supported; use a variable instead.
// NOTE(review): the emitted sequence is not wrapped in an outer LParen/RParen,
// so an adjacent operator that binds tighter than Plus would presumably attach
// to the first/last segment only - confirm against the parser's precedence.
fn scan_interp_string(chars: [String], start: Int, total: Int) -> Map<String, Any> {
let i = start
// Tokens produced so far for this literal.
let out_tokens: [Map<String, Any>] = native_list_empty()
// Chars of the literal segment currently being accumulated.
let cur_part: [String] = native_list_empty()
// Set once the first ${ is seen; selects the flush strategy after the loop.
let has_interp = false
// True once any token has been emitted, i.e. the next operand needs a Plus.
let need_plus = false
let running = true
while running {
if i >= total {
// Input exhausted without a closing quote - stop with what we have.
let running = false
} else {
let ch: String = native_list_get(chars, i)
if ch == "\\" {
// Escape sequence: every recognised form consumes two chars.
let next_i = i + 1
if next_i < total {
let next_ch: String = native_list_get(chars, next_i)
if next_ch == "$" {
// \$ => literal '$' (escape for interpolation syntax)
let cur_part = native_list_append(cur_part, "$")
let i = next_i + 1
} else {
if next_ch == "\"" {
let cur_part = native_list_append(cur_part, "\"")
let i = next_i + 1
} else {
if next_ch == "n" {
let cur_part = native_list_append(cur_part, "\n")
let i = next_i + 1
} else {
if next_ch == "t" {
let cur_part = native_list_append(cur_part, "\t")
let i = next_i + 1
} else {
if next_ch == "r" {
let cur_part = native_list_append(cur_part, "\r")
let i = next_i + 1
} else {
if next_ch == "\\" {
let cur_part = native_list_append(cur_part, "\\")
let i = next_i + 1
} else {
// Unrecognised escape: keep the char, drop the backslash.
let cur_part = native_list_append(cur_part, next_ch)
let i = next_i + 1
}
}
}
}
}
}
} else {
// Backslash is the last char of the input: skip it.
let i = i + 1
}
} else {
if ch == "\"" {
// Closing quote - consume it and stop scanning
let i = i + 1
let running = false
} else {
if ch == "$" {
// Check for ${ (start of interpolation)
let next_i = i + 1
let is_interp = false
if next_i < total {
let next_ch: String = native_list_get(chars, next_i)
if next_ch == "{" {
let is_interp = true
}
}
if is_interp {
// Flush the accumulated literal part (if non-empty)
let part_len: Int = native_list_len(cur_part)
if part_len > 0 {
let part_text = str_join(cur_part, "")
if need_plus {
let out_tokens = native_list_append(out_tokens, make_tok("Plus", "+"))
}
// Same scrub as plain strings: strip comments from code-like text.
let clean_part = part_text
if looks_like_code(part_text) {
let clean_part = strip_code_comments(part_text)
}
let out_tokens = native_list_append(out_tokens, make_tok("Str", clean_part))
let need_plus = true
}
let cur_part = native_list_empty()
let has_interp = true
// Scan brace-balanced expression source, starting just past `${`
let brace_result = scan_interp_brace(chars, next_i + 1, total)
let expr_src: String = brace_result["text"]
let new_i: Int = brace_result["pos"]
let i = new_i
// Re-lex the expression and inline the tokens.
// Wrap in ( ) so that operators inside ${} (e.g.
// age + 1) are parsed as a grouped sub-expression
// rather than merging with the surrounding concat
// Plus tokens at the wrong precedence level.
let inner_toks: [Map<String, Any>] = lex(expr_src)
let inner_len: Int = native_list_len(inner_toks)
if need_plus {
let out_tokens = native_list_append(out_tokens, make_tok("Plus", "+"))
}
// Empty interpolation ${} => empty string segment
// (a token list of length <= 1 is taken to hold only the Eof sentinel)
if inner_len <= 1 {
let out_tokens = native_list_append(out_tokens, make_tok("Str", ""))
} else {
let out_tokens = native_list_append(out_tokens, make_tok("LParen", "("))
let out_tokens = interp_tokens_append_all(out_tokens, inner_toks)
let out_tokens = native_list_append(out_tokens, make_tok("RParen", ")"))
}
let need_plus = true
} else {
// Plain '$' not followed by '{' - treat as literal
let cur_part = native_list_append(cur_part, "$")
let i = i + 1
}
} else {
// Ordinary character: part of the current literal segment.
let cur_part = native_list_append(cur_part, ch)
let i = i + 1
}
}
}
}
}
// Flush remaining literal segment and build final token list
let part_text = str_join(cur_part, "")
let part_len: Int = native_list_len(cur_part)
if has_interp {
// Interpolated string: only emit trailing segment if non-empty
if part_len > 0 {
let clean_part = part_text
if looks_like_code(part_text) {
let clean_part = strip_code_comments(part_text)
}
if need_plus {
let out_tokens = native_list_append(out_tokens, make_tok("Plus", "+"))
}
let out_tokens = native_list_append(out_tokens, make_tok("Str", clean_part))
}
} else {
// Plain string with no interpolation - always emit exactly one Str token,
// even when the literal is empty
let clean_text = part_text
if looks_like_code(part_text) {
let clean_text = strip_code_comments(part_text)
}
let out_tokens = native_list_append(out_tokens, make_tok("Str", clean_text))
}
{ "tokens": out_tokens, "pos": i }
}
// -- Main lexer ----------------------------------------------------------------
fn lex(source: String) -> [Map<String, Any>] {
@@ -505,20 +737,16 @@ fn lex(source: String) -> [Map<String, Any>] {
let i = i + 1
}
} else {
// String literal
// String literal (plain or interpolated with ${expr} syntax).
// scan_interp_string handles both cases: plain strings emit a
// single Str token; interpolated strings emit a flat token
// sequence (Str Plus expr-tokens Plus Str ...) that the parser
// naturally assembles into a BinOp concat tree.
if ch == "\"" {
let result = scan_string(chars, i + 1, total)
let str_text: String = result["text"]
let new_pos: Int = result["pos"]
// Compile-time scrub: strings that embed JS or CSS get
// their // line comments and /* block comments stripped
// before the token reaches the parser. Plain prose passes
// through untouched.
let clean_text = str_text
if looks_like_code(str_text) {
let clean_text = strip_code_comments(str_text)
}
let tokens = native_list_append(tokens, make_tok("Str", clean_text))
let interp_result = scan_interp_string(chars, i + 1, total)
let interp_toks: [Map<String, Any>] = interp_result["tokens"]
let new_pos: Int = interp_result["pos"]
let tokens = interp_tokens_append_all(tokens, interp_toks)
let i = new_pos
} else {
// Number literal