994 lines
48 KiB
EmacsLisp
994 lines
48 KiB
EmacsLisp
// lexer.el - el self-hosting lexer
|
|
//
|
|
// Tokenises an el source string into a list of token maps.
|
|
// Each token is a Map<String, Any> with keys:
|
|
// "kind" -> String (e.g. "Int", "Ident", "Plus")
|
|
// "value" -> String (the raw text of the token)
|
|
//
|
|
// Entry point: fn lex(source: String) -> [Map<String, Any>]
|
|
//
|
|
// Uses native_string_chars to split the source into a chars list,
|
|
// then indexes it with native_list_get - avoids O(N^2) string cloning.
|
|
|
|
// -- Character helpers ---------------------------------------------------------
|
|
|
|
// lex_is_digit - true when ch is exactly one of the ASCII digit strings
// "0" through "9"; any other string yields false.
fn lex_is_digit(ch: String) -> Bool {
    if ch == "0" || ch == "1" || ch == "2" || ch == "3" || ch == "4" { return true }
    if ch == "5" || ch == "6" || ch == "7" || ch == "8" || ch == "9" { return true }
    false
}
|
|
|
|
// lex_is_alpha - true when ch is exactly one ASCII letter, "a".."z" or
// "A".."Z". Digits, "_", punctuation, "" and longer strings yield false.
fn lex_is_alpha(ch: String) -> Bool {
    // Lower-case letters.
    if ch == "a" || ch == "b" || ch == "c" || ch == "d" || ch == "e" || ch == "f" || ch == "g" { return true }
    if ch == "h" || ch == "i" || ch == "j" || ch == "k" || ch == "l" || ch == "m" || ch == "n" { return true }
    if ch == "o" || ch == "p" || ch == "q" || ch == "r" || ch == "s" || ch == "t" { return true }
    if ch == "u" || ch == "v" || ch == "w" || ch == "x" || ch == "y" || ch == "z" { return true }
    // Upper-case letters.
    if ch == "A" || ch == "B" || ch == "C" || ch == "D" || ch == "E" || ch == "F" || ch == "G" { return true }
    if ch == "H" || ch == "I" || ch == "J" || ch == "K" || ch == "L" || ch == "M" || ch == "N" { return true }
    if ch == "O" || ch == "P" || ch == "Q" || ch == "R" || ch == "S" || ch == "T" { return true }
    if ch == "U" || ch == "V" || ch == "W" || ch == "X" || ch == "Y" || ch == "Z" { return true }
    false
}
|
|
|
|
// is_alnum_or_underscore - true when ch may appear inside an identifier:
// an ASCII letter, an ASCII digit, or "_".
fn is_alnum_or_underscore(ch: String) -> Bool {
    if ch == "_" { return true }
    if lex_is_alpha(ch) || lex_is_digit(ch) { return true }
    false
}
|
|
|
|
// lex_is_whitespace - true when ch is a space, tab, line feed or carriage
// return; these are the characters the main lexer loop skips.
fn lex_is_whitespace(ch: String) -> Bool {
    if ch == " " || ch == "\t" { return true }
    if ch == "\n" || ch == "\r" { return true }
    false
}
|
|
|
|
// make_tok - build one token map.
//   kind  -> stored under "kind"  (e.g. "Int", "Ident", "Plus")
//   value -> stored under "value" (the text associated with the token)
fn make_tok(kind: String, value: String) -> Map<String, Any> {
    {
        "kind": kind,
        "value": value
    }
}
|
|
|
|
// -- Keyword lookup ------------------------------------------------------------
|
|
|
|
// keyword_kind - map a scanned word to its keyword token kind.
// Returns "" when the word is not a keyword (the caller then emits an Ident).
// "true" and "false" both map to the "Bool" kind. Matching is case-sensitive.
// Entries are listed alphabetically; order does not affect the result because
// every entry compares against a distinct string.
fn keyword_kind(word: String) -> String {
    if word == "true" || word == "false" { return "Bool" }

    if word == "accessor" { return "Accessor" }
    if word == "activate" { return "Activate" }
    if word == "as" { return "As" }
    if word == "assert" { return "Assert" }
    if word == "break" { return "Break" }
    if word == "cgi" { return "Cgi" }
    if word == "continue" { return "Continue" }
    if word == "deploy" { return "Deploy" }
    if word == "else" { return "Else" }
    if word == "engine" { return "Engine" }
    if word == "enum" { return "Enum" }
    if word == "extern" { return "Extern" }
    if word == "fallback" { return "Fallback" }
    if word == "fn" { return "Fn" }
    if word == "for" { return "For" }
    if word == "from" { return "From" }
    if word == "if" { return "If" }
    if word == "impl" { return "Impl" }
    if word == "import" { return "Import" }
    if word == "in" { return "In" }
    if word == "let" { return "Let" }
    if word == "manager" { return "Manager" }
    if word == "match" { return "Match" }
    if word == "parallel" { return "Parallel" }
    if word == "protocol" { return "Protocol" }
    if word == "reason" { return "Reason" }
    if word == "requires" { return "Requires" }
    if word == "retry" { return "Retry" }
    if word == "return" { return "Return" }
    if word == "sealed" { return "Sealed" }
    if word == "seed" { return "Seed" }
    if word == "service" { return "Service" }
    if word == "target" { return "Target" }
    if word == "test" { return "Test" }
    if word == "times" { return "Times" }
    if word == "to" { return "To" }
    if word == "trace" { return "Trace" }
    if word == "type" { return "Type" }
    if word == "vessel" { return "Vessel" }
    if word == "via" { return "Via" }
    if word == "where" { return "Where" }
    if word == "while" { return "While" }
    if word == "with" { return "With" }

    ""
}
|
|
|
|
// -- Scan helpers --------------------------------------------------------------
|
|
// All scan helpers receive the chars list and total length.
|
|
|
|
// scan_digits - collect the run of digit characters beginning at `start`.
//   chars -> the source split into single-character strings
//   start -> index of the first character to examine
//   total -> length of chars; scanning never reads at or beyond it
// Returns { "text": the digits joined into one string,
//           "pos":  index of the first character that was not consumed }.
// If chars[start] is not a digit the text is "" and pos equals start.
fn scan_digits(chars: [String], start: Int, total: Int) -> Map<String, Any> {
    let pos = start
    let digits: [String] = native_list_empty()
    let scanning = true
    while scanning {
        if pos < total {
            let c: String = native_list_get(chars, pos)
            if lex_is_digit(c) {
                let digits = native_list_append(digits, c)
                let pos = pos + 1
            } else {
                // First non-digit ends the run; it is left for the caller.
                let scanning = false
            }
        } else {
            // Ran off the end of the input.
            let scanning = false
        }
    }
    { "text": str_join(digits, ""), "pos": pos }
}
|
|
|
|
// scan_ident - collect the run of identifier characters (ASCII letters,
// digits, "_") beginning at `start`.
//   chars -> the source split into single-character strings
//   start -> index of the first character to examine
//   total -> length of chars; scanning never reads at or beyond it
// Returns { "text": the collected word,
//           "pos":  index of the first character that was not consumed }.
fn scan_ident(chars: [String], start: Int, total: Int) -> Map<String, Any> {
    let pos = start
    let letters: [String] = native_list_empty()
    let scanning = true
    while scanning {
        if pos < total {
            let c: String = native_list_get(chars, pos)
            if is_alnum_or_underscore(c) {
                let letters = native_list_append(letters, c)
                let pos = pos + 1
            } else {
                // First non-identifier character ends the word.
                let scanning = false
            }
        } else {
            // Ran off the end of the input.
            let scanning = false
        }
    }
    { "text": str_join(letters, ""), "pos": pos }
}
|
|
|
|
// -- Code-bearing string detection + comment strip ----------------------------
|
|
// Inline JS/CSS literals embedded in El source (e.g. <script>...</script> blobs
|
|
// or stylesheet payloads inside string literals) carry their own line and
|
|
// block comments. Those comments leak into the served HTML and reveal build
|
|
// notes the visitor should never see. We strip them at the lexer so every
|
|
// downstream consumer (codegen-c, codegen-js, parser) gets the cleaned form.
|
|
//
|
|
// looks_like_code - heuristic gate so we only strip strings that actually
|
|
// embed JS or CSS. Plain prose, hex blobs, JSON, etc. pass through verbatim.
|
|
|
|
// substr_at - true when `needle` occurs in `chars` beginning exactly at index
// `start`.
//   chars  -> haystack split into single-character strings
//   start  -> index in chars where the comparison begins
//   total  -> length of chars
//   needle -> text to look for; it is split into characters on every call
// Returns false straight away when the needle would run past `total`.
// An empty needle matches at any start that does not exceed `total`.
fn substr_at(chars: [String], start: Int, total: Int, needle: String) -> Bool {
    let want: [String] = native_string_chars(needle)
    let want_len: Int = native_list_len(want)
    if start + want_len > total { return false }
    let k = 0
    let same = true
    while k < want_len {
        let have: String = native_list_get(chars, start + k)
        let expect: String = native_list_get(want, k)
        if have == expect {
            let k = k + 1
        } else {
            // Mismatch: record it and jump the index to the end to leave the loop.
            let same = false
            let k = want_len
        }
    }
    same
}
|
|
|
|
// str_has - true when `needle` occurs anywhere inside `s`.
//   s      -> text to search
//   needle -> text to look for
// The needle is split into characters once, before the search loop, and each
// candidate position is compared inline; re-splitting it per position would
// build a fresh chars list for every index of `s`.
// Edge cases: an empty `s` never matches (even for an empty needle); an empty
// needle matches any non-empty `s`.
fn str_has(s: String, needle: String) -> Bool {
    let chars: [String] = native_string_chars(s)
    let total: Int = native_list_len(chars)
    let nchars: [String] = native_string_chars(needle)
    let nlen: Int = native_list_len(nchars)
    let i = 0
    let found = false
    while i < total {
        if i + nlen > total {
            // The remaining tail is shorter than the needle: no later position
            // can match either, so stop searching.
            let i = total
        } else {
            // Compare the needle against chars[i .. i + nlen).
            let j = 0
            let matched = true
            while j < nlen {
                let a: String = native_list_get(chars, i + j)
                let b: String = native_list_get(nchars, j)
                if a == b { let j = j + 1 } else { let matched = false; let j = nlen }
            }
            if matched {
                let found = true
                let i = total
            } else {
                let i = i + 1
            }
        }
    }
    found
}
|
|
|
|
// looks_like_code - heuristic deciding whether a string literal embeds JS or
// CSS and should therefore have its comments stripped. It answers true when
// the text contains "<script" or "<style", or when it contains both
// "function" and ";". Everything else is treated as plain text.
fn looks_like_code(s: String) -> Bool {
    if str_has(s, "<script") || str_has(s, "<style") { return true }
    if str_has(s, "function") {
        // "function" alone is not enough; require a statement terminator too.
        if str_has(s, ";") { return true }
    }
    false
}
|
|
|
|
// strip_code_comments - remove // line comments and /* */ block comments from
// a string, walking it one character at a time.
//
// While inside a quoted region opened by ', " or ` nothing is stripped. Inside
// such a region a backslash and the character after it are copied verbatim, so
// an escaped quote does not close the region.
//
// Outside quoted regions:
//   - "//" starts a line comment unless the previously emitted character is
//     ":" (so "https://..." survives). The comment text is dropped up to, but
//     not including, the next newline.
//   - "/*" starts a block comment that is dropped through the closing "*/",
//     or to the end of the input when it is never closed.
//   - every other character is copied through unchanged.
// After a comment is removed the "previous character" is reset to "".
fn strip_code_comments(s: String) -> String {
    let src: [String] = native_string_chars(s)
    let n: Int = native_list_len(src)
    let kept: [String] = native_list_empty()
    let pos = 0
    let in_single = false
    let in_double = false
    let in_template = false
    let last = ""
    while pos < n {
        let c: String = native_list_get(src, pos)

        if in_single || in_double || in_template {
            if c == "\\" {
                // Copy the backslash, then the escaped character if there is one.
                let kept = native_list_append(kept, c)
                let last = c
                let pos = pos + 1
                if pos < n {
                    let escaped: String = native_list_get(src, pos)
                    let kept = native_list_append(kept, escaped)
                    let last = escaped
                    let pos = pos + 1
                }
            } else {
                // Only the quote kind that opened the region can close it.
                if in_single {
                    if c == "'" { let in_single = false }
                } else {
                    if in_double {
                        if c == "\"" { let in_double = false }
                    } else {
                        if c == "`" { let in_template = false }
                    }
                }
                let kept = native_list_append(kept, c)
                let last = c
                let pos = pos + 1
            }
        } else {
            // Outside any quoted region: look one character ahead for a
            // comment opener.
            let ahead = ""
            if pos + 1 < n {
                let ahead: String = native_list_get(src, pos + 1)
            }
            let is_line_comment = false
            let is_block_comment = false
            if c == "/" {
                if ahead == "/" {
                    let is_line_comment = true
                    // URL guard: "://" is not a comment opener.
                    if last == ":" { let is_line_comment = false }
                }
                if ahead == "*" { let is_block_comment = true }
            }

            if is_line_comment {
                // Drop everything up to the newline; the newline itself is
                // left in place and emitted by the next iteration.
                let pos = pos + 2
                let skipping_line = true
                while skipping_line {
                    if pos < n {
                        let lc: String = native_list_get(src, pos)
                        if lc == "\n" { let skipping_line = false } else { let pos = pos + 1 }
                    } else {
                        let skipping_line = false
                    }
                }
                let last = ""
            } else {
                if is_block_comment {
                    // Drop everything through the closing "*/".
                    let pos = pos + 2
                    let skipping_block = true
                    while skipping_block {
                        if pos < n {
                            let bc: String = native_list_get(src, pos)
                            let closes = false
                            if bc == "*" {
                                if pos + 1 < n {
                                    let after_star: String = native_list_get(src, pos + 1)
                                    if after_star == "/" { let closes = true }
                                }
                            }
                            if closes {
                                let pos = pos + 2
                                let skipping_block = false
                            } else {
                                let pos = pos + 1
                            }
                        } else {
                            // Unterminated block comment: runs to end of input.
                            let skipping_block = false
                        }
                    }
                    let last = ""
                } else {
                    // Ordinary character. A quote opens a quoted region; the
                    // quote itself is still emitted.
                    if c == "'" { let in_single = true }
                    if c == "\"" { let in_double = true }
                    if c == "`" { let in_template = true }
                    let kept = native_list_append(kept, c)
                    let last = c
                    let pos = pos + 1
                }
            }
        }
    }
    str_join(kept, "")
}
|
|
|
|
// scan_string - scan a double-quoted string literal with no interpolation.
// `start` is the index just AFTER the opening quote.
// Returns { "text": decoded content, "pos": index just after the closing quote }.
// If the input ends before a closing quote, pos is the end of the input.
//
// Escapes: \n, \t and \r decode to newline, tab and carriage return. Any other
// escaped character (including \" and \\) decodes to that character itself.
// A backslash that is the very last character of the input is dropped.
fn scan_string(chars: [String], start: Int, total: Int) -> Map<String, Any> {
    let pos = start
    let buf: [String] = native_list_empty()
    let scanning = true
    while scanning {
        if pos < total {
            let c: String = native_list_get(chars, pos)
            if c == "\"" {
                // Closing quote: consume it and stop.
                let pos = pos + 1
                let scanning = false
            } else {
                if c == "\\" {
                    let esc_at = pos + 1
                    if esc_at < total {
                        let esc: String = native_list_get(chars, esc_at)
                        // Default: the escaped character stands for itself.
                        let decoded = esc
                        if esc == "n" { let decoded = "\n" }
                        if esc == "t" { let decoded = "\t" }
                        if esc == "r" { let decoded = "\r" }
                        let buf = native_list_append(buf, decoded)
                        let pos = esc_at + 1
                    } else {
                        // Trailing backslash with nothing after it.
                        let pos = pos + 1
                    }
                } else {
                    let buf = native_list_append(buf, c)
                    let pos = pos + 1
                }
            }
        } else {
            // Unterminated literal.
            let scanning = false
        }
    }
    { "text": str_join(buf, ""), "pos": pos }
}
|
|
|
|
// -- String interpolation ------------------------------------------------------
|
|
//
|
|
// scan_interp_brace - collect the source text of one interpolation body.
// `start` is the index just after the opening "${". Brace depth begins at 1;
// every "{" raises it and every "}" lowers it, so nested braces (calls, map
// literals) stay inside the body. The "}" that brings the depth to zero ends
// the scan and is consumed but not included in the text.
// Returns { "text": inner source, "pos": index just after the closing brace }.
// If the input ends first, pos is the end of the input.
fn scan_interp_brace(chars: [String], start: Int, total: Int) -> Map<String, Any> {
    let pos = start
    let body: [String] = native_list_empty()
    let depth = 1
    let scanning = true
    while scanning {
        if pos < total {
            let c: String = native_list_get(chars, pos)
            if c == "{" { let depth = depth + 1 }
            if c == "}" { let depth = depth - 1 }
            if depth <= 0 {
                // Depth can only reach zero on the matching "}": skip it.
                let pos = pos + 1
                let scanning = false
            } else {
                let body = native_list_append(body, c)
                let pos = pos + 1
            }
        } else {
            // Unbalanced: ran out of input before the closing brace.
            let scanning = false
        }
    }
    { "text": str_join(body, ""), "pos": pos }
}
|
|
|
|
// interp_tokens_append_all - append the tokens of `src` onto `dst`, in order,
// stopping at the first token whose kind is "Eof" (that token and anything
// after it are not copied). lex() ends its output with an Eof token, so this
// drops that sentinel when a lexed token list is spliced into another.
// Returns the extended list.
fn interp_tokens_append_all(dst: [Map<String, Any>], src: [Map<String, Any>]) -> [Map<String, Any>] {
    let count: Int = native_list_len(src)
    let idx = 0
    let merged = dst
    while idx < count {
        let item: Map<String, Any> = native_list_get(src, idx)
        let item_kind: String = item["kind"]
        if item_kind == "Eof" {
            // Sentinel reached: jump the index to the end to leave the loop.
            let idx = count
        } else {
            let merged = native_list_append(merged, item)
            let idx = idx + 1
        }
    }
    merged
}
|
|
|
|
// scan_interp_string - scan a double-quoted string literal that may contain
// ${expr} interpolations. `start` is the index just AFTER the opening quote.
// Returns { "tokens": tokens to splice into the stream,
//           "pos":    index just after the closing quote }.
// If the input ends before a closing quote, pos is the end of the input.
//
// A string without interpolation yields exactly one Str token. An
// interpolated string yields a flat concat expression, for example
//
//     "hello ${name}!"
//       => Str("hello ") Plus LParen <tokens for name> RParen Plus Str("!")
//
// Each ${...} body is re-lexed with lex() and wrapped in parentheses so that
// operators inside it group before the surrounding Plus tokens. An empty or
// whitespace-only body becomes Str(""). Empty literal segments between or
// around interpolations are omitted.
//
// Escapes: \n, \t and \r decode to newline, tab and carriage return. Any other
// escaped character decodes to itself, which covers \" \\ and \$ (a literal
// dollar sign that does not start an interpolation). A "$" not followed by
// "{" is literal text.
//
// Every literal segment that looks_like_code() accepts is passed through
// strip_code_comments() before it is emitted.
//
// The body of ${...} is found by brace counting alone, so string literals
// containing braces inside an interpolation are not supported.
fn scan_interp_string(chars: [String], start: Int, total: Int) -> Map<String, Any> {
    let pos = start
    let emitted: [Map<String, Any>] = native_list_empty()
    let literal: [String] = native_list_empty()
    let saw_interp = false
    let join_next = false
    let scanning = true

    while scanning {
        if pos < total {
            let c: String = native_list_get(chars, pos)
            if c == "\"" {
                // Closing quote: consume it and stop.
                let pos = pos + 1
                let scanning = false
            } else {
                if c == "\\" {
                    let esc_at = pos + 1
                    if esc_at < total {
                        let esc: String = native_list_get(chars, esc_at)
                        // Default: the escaped character stands for itself.
                        let decoded = esc
                        if esc == "n" { let decoded = "\n" }
                        if esc == "t" { let decoded = "\t" }
                        if esc == "r" { let decoded = "\r" }
                        let literal = native_list_append(literal, decoded)
                        let pos = esc_at + 1
                    } else {
                        // Trailing backslash with nothing after it: dropped.
                        let pos = pos + 1
                    }
                } else {
                    // An interpolation starts only at "$" directly followed by "{".
                    let opens_interp = false
                    if c == "$" {
                        if pos + 1 < total {
                            let after_dollar: String = native_list_get(chars, pos + 1)
                            if after_dollar == "{" { let opens_interp = true }
                        }
                    }

                    if opens_interp {
                        // Flush the literal text gathered so far, if any.
                        let pending: Int = native_list_len(literal)
                        if pending > 0 {
                            let segment = str_join(literal, "")
                            if looks_like_code(segment) {
                                let segment = strip_code_comments(segment)
                            }
                            if join_next {
                                let emitted = native_list_append(emitted, make_tok("Plus", "+"))
                            }
                            let emitted = native_list_append(emitted, make_tok("Str", segment))
                            let join_next = true
                        }
                        let literal = native_list_empty()
                        let saw_interp = true

                        // Pull out the brace-balanced body; pos + 2 skips the "$" and "{".
                        let inner = scan_interp_brace(chars, pos + 2, total)
                        let inner_src: String = inner["text"]
                        let resume_at: Int = inner["pos"]
                        let pos = resume_at

                        // Re-lex the body and splice its tokens in.
                        let inner_toks: [Map<String, Any>] = lex(inner_src)
                        let inner_count: Int = native_list_len(inner_toks)
                        if join_next {
                            let emitted = native_list_append(emitted, make_tok("Plus", "+"))
                        }
                        if inner_count <= 1 {
                            // Only the Eof sentinel came back: empty body.
                            let emitted = native_list_append(emitted, make_tok("Str", ""))
                        } else {
                            let emitted = native_list_append(emitted, make_tok("LParen", "("))
                            let emitted = interp_tokens_append_all(emitted, inner_toks)
                            let emitted = native_list_append(emitted, make_tok("RParen", ")"))
                        }
                        let join_next = true
                    } else {
                        // Ordinary character, or a "$" that starts no interpolation.
                        let literal = native_list_append(literal, c)
                        let pos = pos + 1
                    }
                }
            }
        } else {
            // Unterminated literal.
            let scanning = false
        }
    }

    // Flush whatever literal text remains.
    let tail_len: Int = native_list_len(literal)
    let tail = str_join(literal, "")
    if looks_like_code(tail) {
        let tail = strip_code_comments(tail)
    }
    if saw_interp {
        // Interpolated string: a trailing segment is emitted only when non-empty.
        if tail_len > 0 {
            if join_next {
                let emitted = native_list_append(emitted, make_tok("Plus", "+"))
            }
            let emitted = native_list_append(emitted, make_tok("Str", tail))
        }
    } else {
        // Plain string: always exactly one Str token, even when empty.
        let emitted = native_list_append(emitted, make_tok("Str", tail))
    }

    { "tokens": emitted, "pos": pos }
}
|
|
|
|
// -- Main lexer ----------------------------------------------------------------
|
|
|
|
// lex - tokenise an el source string.
// Returns the list of token maps ({ "kind", "value" }) in source order,
// always terminated by an Eof token whose value is "".
//
// Per character, in this order:
//   1. whitespace is skipped;
//   2. "//" skips to the end of the line (the newline is left for step 1);
//      a lone "/" is a Slash token;
//   3. a double quote starts a string literal, handled by scan_interp_string,
//      which may contribute several tokens when the string is interpolated;
//   4. a digit starts an Int, or a Float when the digits are followed by "."
//      and at least one more digit (so "1..5" lexes as Int DotDot Int);
//   5. a letter or "_" starts a word, emitted under its keyword kind or as
//      Ident;
//   6. anything else is matched against the operator and delimiter table.
//      A lone "&" and any unrecognised character are skipped without
//      producing a token.
fn lex(source: String) -> [Map<String, Any>] {
    let chars: [String] = native_string_chars(source)
    let total: Int = native_list_len(chars)
    let out: [Map<String, Any>] = native_list_empty()
    let cursor: Int = 0

    while cursor < total {
        let ch: String = native_list_get(chars, cursor)

        if lex_is_whitespace(ch) {
            let cursor = cursor + 1
        } else {
            if ch == "/" {
                // Line comment or division operator.
                let starts_comment = false
                if cursor + 1 < total {
                    let after_slash: String = native_list_get(chars, cursor + 1)
                    if after_slash == "/" { let starts_comment = true }
                }
                if starts_comment {
                    let cursor = cursor + 2
                    let in_comment = true
                    while in_comment {
                        if cursor < total {
                            let cc: String = native_list_get(chars, cursor)
                            if cc == "\n" { let in_comment = false } else { let cursor = cursor + 1 }
                        } else {
                            let in_comment = false
                        }
                    }
                } else {
                    let out = native_list_append(out, make_tok("Slash", "/"))
                    let cursor = cursor + 1
                }
            } else {
                if ch == "\"" {
                    // String literal, plain or interpolated.
                    let lit = scan_interp_string(chars, cursor + 1, total)
                    let lit_toks: [Map<String, Any>] = lit["tokens"]
                    let after_lit: Int = lit["pos"]
                    let out = interp_tokens_append_all(out, lit_toks)
                    let cursor = after_lit
                } else {
                    if lex_is_digit(ch) {
                        // Number literal.
                        let int_part = scan_digits(chars, cursor, total)
                        let whole: String = int_part["text"]
                        let after_int: Int = int_part["pos"]

                        // A fraction needs both a "." and a digit right after it.
                        let has_fraction = false
                        if after_int + 1 < total {
                            let dot: String = native_list_get(chars, after_int)
                            if dot == "." {
                                let frac_first: String = native_list_get(chars, after_int + 1)
                                if lex_is_digit(frac_first) { let has_fraction = true }
                            }
                        }

                        if has_fraction {
                            let frac_part = scan_digits(chars, after_int + 1, total)
                            let frac: String = frac_part["text"]
                            let after_frac: Int = frac_part["pos"]
                            let out = native_list_append(out, make_tok("Float", whole + "." + frac))
                            let cursor = after_frac
                        } else {
                            let out = native_list_append(out, make_tok("Int", whole))
                            let cursor = after_int
                        }
                    } else {
                        if lex_is_alpha(ch) || ch == "_" {
                            // Identifier or keyword.
                            let ident = scan_ident(chars, cursor, total)
                            let word: String = ident["text"]
                            let after_word: Int = ident["pos"]
                            let kw = keyword_kind(word)
                            let word_kind = kw
                            if kw == "" { let word_kind = "Ident" }
                            let out = native_list_append(out, make_tok(word_kind, word))
                            let cursor = after_word
                        } else {
                            // Operators and delimiters. `kind` stays "" when
                            // nothing matches; `text` defaults to the character
                            // itself and `width` to 1, so single-character
                            // tokens only need to set `kind`.
                            let peek = ""
                            if cursor + 1 < total {
                                let peek: String = native_list_get(chars, cursor + 1)
                            }
                            let kind = ""
                            let text = ch
                            let width = 1

                            // Single-character tokens.
                            if ch == "+" { let kind = "Plus" }
                            if ch == "*" { let kind = "Star" }
                            if ch == "%" { let kind = "Percent" }
                            if ch == "(" { let kind = "LParen" }
                            if ch == ")" { let kind = "RParen" }
                            if ch == "{" { let kind = "LBrace" }
                            if ch == "}" { let kind = "RBrace" }
                            if ch == "[" { let kind = "LBracket" }
                            if ch == "]" { let kind = "RBracket" }
                            if ch == "," { let kind = "Comma" }
                            if ch == ";" { let kind = "Semicolon" }
                            if ch == "@" { let kind = "At" }
                            if ch == "?" { let kind = "QuestionMark" }

                            // Characters that may start a longer operator.
                            if ch == "=" {
                                let kind = "Eq"
                                if peek == "=" { let kind = "EqEq"; let text = "=="; let width = 2 }
                                if peek == ">" { let kind = "FatArrow"; let text = "=>"; let width = 2 }
                            }
                            if ch == "!" {
                                let kind = "Not"
                                if peek == "=" { let kind = "NotEq"; let text = "!="; let width = 2 }
                            }
                            if ch == "<" {
                                let kind = "Lt"
                                if peek == "=" { let kind = "LtEq"; let text = "<="; let width = 2 }
                            }
                            if ch == ">" {
                                let kind = "Gt"
                                if peek == "=" { let kind = "GtEq"; let text = ">="; let width = 2 }
                            }
                            if ch == "&" {
                                // Only "&&" is a token; a lone "&" keeps kind "".
                                if peek == "&" { let kind = "And"; let text = "&&"; let width = 2 }
                            }
                            if ch == "|" {
                                let kind = "Pipe"
                                if peek == "|" { let kind = "Or"; let text = "||"; let width = 2 }
                                if peek == ">" { let kind = "PipeOp"; let text = "|>"; let width = 2 }
                            }
                            if ch == "-" {
                                let kind = "Minus"
                                if peek == ">" { let kind = "Arrow"; let text = "->"; let width = 2 }
                            }
                            if ch == ":" {
                                let kind = "Colon"
                                if peek == ":" { let kind = "ColonColon"; let text = "::"; let width = 2 }
                            }
                            if ch == "." {
                                // "..=" (inclusive range), then ".." (exclusive
                                // range), then a single ".".
                                let kind = "Dot"
                                if peek == "." {
                                    let kind = "DotDot"
                                    let text = ".."
                                    let width = 2
                                    let peek2 = ""
                                    if cursor + 2 < total {
                                        let peek2: String = native_list_get(chars, cursor + 2)
                                    }
                                    if peek2 == "=" { let kind = "DotDotEq"; let text = "..="; let width = 3 }
                                }
                            }

                            if kind == "" {
                                // Lone "&" or an unrecognised character: skip it.
                                let cursor = cursor + 1
                            } else {
                                let out = native_list_append(out, make_tok(kind, text))
                                let cursor = cursor + width
                            }
                        }
                    }
                }
            }
        }
    }

    let out = native_list_append(out, make_tok("Eof", ""))
    out
}
|