Files
2026-05-05 01:38:51 -05:00

908 lines
28 KiB
EmacsLisp

// runtime/string.el — String operations implemented in El.
//
// All functions delegate character-level work to the seed primitives declared
// in el_seed.c. No C is written here; this is pure El source that compiles
// to C via the normal El pipeline.
//
// Seed primitives used (provided by el_seed.c):
// __str_len(s) -> Int
// __str_char_at(s, i) -> Int (char code at byte index i)
// __str_alloc(n) -> String (n-byte zero-filled mutable buffer)
// __str_set_char(s, i, c) -> String (mutate s[i]=c, return s)
// __str_cmp(a, b) -> Int (strcmp)
// __str_ncmp(a, b, n) -> Int (strncmp)
// __str_concat_raw(a, b) -> String
// __str_slice_raw(s, lo, hi) -> String (substring copy [lo, hi))
// __int_to_str(n) -> String
// __str_to_int(s) -> Int
// __float_to_str(f) -> String
// __str_to_float(s) -> Float
// __println(s)
// __print(s)
// __readline() -> String
// __url_encode(s) -> String
// __url_decode(s) -> String
// __sha256_hex(s) -> String (SHA-256 hex digest; used by hash_sha256)
// I/O
// Print `s` by forwarding it to the __println seed primitive; returns nothing.
// NOTE(review): presumably __println appends a newline and __print does not —
// confirm in el_seed.c.
fn println(s: String) -> Void {
__println(s)
}
// Print `s` by forwarding it to the __print seed primitive; returns nothing.
// NOTE(review): presumably no trailing newline is added — confirm in el_seed.c.
fn print(s: String) -> Void {
__print(s)
}
// Read one line of input through the __readline seed primitive and return it.
// NOTE(review): whether the line terminator is kept is decided by the seed — confirm.
fn readline() -> String {
    let line: String = __readline()
    return line
}
// Type conversions
// Convert an Int to its string form via the __int_to_str seed primitive.
fn int_to_str(n: Int) -> String {
    let text: String = __int_to_str(n)
    return text
}
// Parse `s` as an Int via the __str_to_int seed primitive.
// NOTE(review): behavior on non-numeric input is defined by the seed — confirm.
fn str_to_int(s: String) -> Int {
    let value: Int = __str_to_int(s)
    return value
}
// Convert a Float to its string form via the __float_to_str seed primitive.
fn float_to_str(f: Float) -> String {
    let text: String = __float_to_str(f)
    return text
}
// Parse `s` as a Float via the __str_to_float seed primitive.
// NOTE(review): behavior on non-numeric input is defined by the seed — confirm.
fn str_to_float(s: String) -> Float {
    let value: Float = __str_to_float(s)
    return value
}
// Render a Bool as the literal text "true" or "false".
fn bool_to_str(b: Bool) -> String {
    let text: String = "false"
    if b { text = "true" }
    return text
}
// URL encoding
// URL-encode `s` by delegating to the __url_encode seed primitive.
// NOTE(review): the exact set of escaped characters is defined by the seed — confirm.
fn url_encode(s: String) -> String {
    let encoded: String = __url_encode(s)
    return encoded
}
// URL-decode `s` by delegating to the __url_decode seed primitive.
fn url_decode(s: String) -> String {
    let decoded: String = __url_decode(s)
    return decoded
}
// Math
// Absolute value of `n`: non-negative inputs are returned as-is, negative
// inputs are negated by subtracting from zero.
// NOTE(review): for the most negative Int, `0 - n` presumably cannot be
// represented — confirm Int width and overflow semantics.
fn el_abs(n: Int) -> Int {
    if n >= 0 { return n }
    return 0 - n
}
// Larger of two Ints; when they are equal, `b` is returned.
fn el_max(a: Int, b: Int) -> Int {
    if b >= a { return b }
    return a
}
// Smaller of two Ints; when they are equal, `b` is returned.
fn el_min(a: Int, b: Int) -> Int {
    if a >= b { return b }
    return a
}
// Core string primitives
// Length of `s` as reported by the __str_len seed primitive.
fn str_len(s: String) -> Int {
    let length: Int = __str_len(s)
    return length
}
// True when `a` and `b` compare equal under __str_cmp (result 0).
fn str_eq(a: String, b: String) -> Bool {
    let order: Int = __str_cmp(a, b)
    return order == 0
}
// Concatenate `a` followed by `b` via the __str_concat_raw seed primitive.
fn str_concat(a: String, b: String) -> String {
    let joined: String = __str_concat_raw(a, b)
    return joined
}
// Bounds-safe substring [start, end) by byte index.
// `start` is clamped into [0, len]; `end` is clamped into [start, len], so an
// inverted or out-of-range request yields a shorter (possibly empty) slice
// instead of reaching __str_slice_raw with bad indices.
fn str_slice(s: String, start: Int, end: Int) -> String {
    let total: Int = __str_len(s)
    let begin_at: Int = start
    if begin_at < 0 { begin_at = 0 }
    if begin_at > total { begin_at = total }
    let stop_at: Int = end
    if stop_at > total { stop_at = total }
    if stop_at < begin_at { stop_at = begin_at }
    return __str_slice_raw(s, begin_at, stop_at)
}
// Whitespace helpers (internal)
//
// _is_ws: true when char code `c` is ASCII whitespace.
// Space is 32; tab, \n, \v, \f and \r are the contiguous codes 9 through 13.
fn _is_ws(c: Int) -> Bool {
    if c == 32 { return true } // space
    if c < 9 { return false }  // below tab
    if c > 13 { return false } // above \r
    return true                // 9..13: tab, \n, \v, \f, \r
}
// Walk forward from byte 0 over the first `n` bytes of `s` and return the
// index of the first byte that is not whitespace per _is_ws. Returns `n` when
// every byte examined is whitespace.
fn _find_first_non_ws(s: String, n: Int) -> Int {
    let pos: Int = 0
    while pos < n {
        let blank: Bool = _is_ws(__str_char_at(s, pos))
        if blank {
            pos = pos + 1
        } else {
            return pos
        }
    }
    return n
}
// Walk backward from byte n-1 of `s` and return the index of the last byte
// that is not whitespace per _is_ws. Returns -1 when every byte examined is
// whitespace (or when n is 0).
fn _find_last_non_ws(s: String, n: Int) -> Int {
    let pos: Int = n - 1
    while pos >= 0 {
        let blank: Bool = _is_ws(__str_char_at(s, pos))
        if blank {
            pos = pos - 1
        } else {
            return pos
        }
    }
    return -1
}
// Comparison and search
// True when `s` begins with `prefix`. A prefix longer than `s` never matches;
// otherwise the first len(prefix) bytes are compared with __str_ncmp.
fn str_starts_with(s: String, prefix: String) -> Bool {
    let prefix_len: Int = __str_len(prefix)
    if __str_len(s) < prefix_len { return false }
    let order: Int = __str_ncmp(s, prefix, prefix_len)
    return order == 0
}
// True when `s` ends with `suffix`. A suffix longer than `s` never matches;
// otherwise the trailing len(suffix) bytes are sliced out and compared.
fn str_ends_with(s: String, suffix: String) -> Bool {
    let suffix_len: Int = __str_len(suffix)
    let total: Int = __str_len(s)
    if total < suffix_len { return false }
    let order: Int = __str_cmp(__str_slice_raw(s, total - suffix_len, total), suffix)
    return order == 0
}
// True when `sub` occurs anywhere in `s`. The empty string is contained in
// every string. Each candidate start position is tested by slicing a window of
// len(sub) bytes and comparing it to `sub`.
fn str_contains(s: String, sub: String) -> Bool {
    let needle_len: Int = __str_len(sub)
    if needle_len == 0 { return true }
    // Negative when `sub` is longer than `s`, in which case the loop never runs.
    let last_start: Int = __str_len(s) - needle_len
    let pos: Int = 0
    while pos <= last_start {
        let order: Int = __str_cmp(__str_slice_raw(s, pos, pos + needle_len), sub)
        if order == 0 { return true }
        pos = pos + 1
    }
    return false
}
// Byte index of the first occurrence of `sub` in `s`, or -1 when absent.
// An empty `sub` matches at index 0.
fn str_index_of(s: String, sub: String) -> Int {
    let needle_len: Int = __str_len(sub)
    if needle_len == 0 { return 0 }
    // Negative when `sub` is longer than `s`, in which case the loop never runs.
    let last_start: Int = __str_len(s) - needle_len
    let pos: Int = 0
    while pos <= last_start {
        let order: Int = __str_cmp(__str_slice_raw(s, pos, pos + needle_len), sub)
        if order == 0 { return pos }
        pos = pos + 1
    }
    return -1
}
// Byte index of the LAST occurrence of `sub` in `s`, or -1 when absent.
// An empty `sub` matches at the end, so len(s) is returned.
//
// The scan runs backward from the right-most possible start position and
// returns on the first hit. Every start position is examined, so an occurrence
// that overlaps an earlier one is still found: "aaa" / "aa" yields 1.
fn str_last_index_of(s: String, sub: String) -> Int {
    let slen: Int = __str_len(s)
    let sublen: Int = __str_len(sub)
    if sublen == 0 { return slen }
    if sublen > slen { return -1 }
    let i: Int = slen - sublen
    while i >= 0 {
        let window: String = __str_slice_raw(s, i, i + sublen)
        if __str_cmp(window, sub) == 0 { return i }
        i = i - 1
    }
    return -1
}
// Byte indices of every NON-overlapping occurrence of `sub` in `s`, in
// ascending order. After a hit the scan resumes just past the matched bytes,
// so "aaaa" / "aa" yields [0, 2]. An empty `sub`, or one longer than `s`,
// yields an empty list.
fn str_index_of_all(s: String, sub: String) -> [Int] {
    let hits: [Int] = el_list_empty()
    let needle_len: Int = __str_len(sub)
    if needle_len == 0 { return hits }
    // Negative when `sub` is longer than `s`, in which case the loop never runs.
    let last_start: Int = __str_len(s) - needle_len
    let pos: Int = 0
    while pos <= last_start {
        let advance: Int = 1
        if __str_cmp(__str_slice_raw(s, pos, pos + needle_len), sub) == 0 {
            hits = el_list_append(hits, pos)
            advance = needle_len
        }
        pos = pos + advance
    }
    return hits
}
// Byte index of the first byte of `s` whose char code also appears somewhere
// in `any_of`; -1 when no byte of `s` is in the set or the set is empty.
fn str_find_chars(s: String, any_of: String) -> Int {
    let set_len: Int = __str_len(any_of)
    if set_len == 0 { return -1 }
    let total: Int = __str_len(s)
    let pos: Int = 0
    while pos < total {
        let code: Int = __str_char_at(s, pos)
        // Membership only: the order in which the set is probed is irrelevant.
        let k: Int = set_len - 1
        while k >= 0 {
            if __str_char_at(any_of, k) == code { return pos }
            k = k - 1
        }
        pos = pos + 1
    }
    return -1
}
// Character access
// One-byte string holding the byte at index `i` of `s`.
// A negative index, or one at or past the end, yields "".
fn str_char_at(s: String, i: Int) -> String {
    if i < 0 { return "" }
    if i < __str_len(s) {
        return __str_slice_raw(s, i, i + 1)
    }
    return ""
}
// Char code (byte value) at byte index `i` of `s`.
// A negative index, or one at or past the end, yields 0.
fn str_char_code(s: String, i: Int) -> Int {
    if i < 0 { return 0 }
    if i < __str_len(s) {
        return __str_char_at(s, i)
    }
    return 0
}
// Case conversion
// ASCII upper-casing: returns a new string in which bytes a-z (97-122) are
// shifted down by 32 to A-Z; every other byte is copied unchanged.
// The empty string maps to "".
fn str_to_upper(s: String) -> String {
    let total: Int = __str_len(s)
    if total == 0 { return "" }
    let upper: String = __str_alloc(total)
    let pos: Int = 0
    while pos < total {
        let code: Int = __str_char_at(s, pos)
        // 32 is the distance between 'a' and 'A'; 0 leaves the byte alone.
        let shift: Int = 0
        if code >= 97 {
            if code <= 122 { shift = 32 }
        }
        upper = __str_set_char(upper, pos, code - shift)
        pos = pos + 1
    }
    return upper
}
// ASCII lower-casing: returns a new string in which bytes A-Z (65-90) are
// shifted up by 32 to a-z; every other byte is copied unchanged.
// The empty string maps to "".
fn str_to_lower(s: String) -> String {
    let total: Int = __str_len(s)
    if total == 0 { return "" }
    let lower: String = __str_alloc(total)
    let pos: Int = 0
    while pos < total {
        let code: Int = __str_char_at(s, pos)
        // 32 is the distance between 'A' and 'a'; 0 leaves the byte alone.
        let shift: Int = 0
        if code >= 65 {
            if code <= 90 { shift = 32 }
        }
        lower = __str_set_char(lower, pos, code + shift)
        pos = pos + 1
    }
    return lower
}
// Aliases used in existing El codebases.
// Alias: same result as str_to_lower.
fn str_lower(s: String) -> String {
    let lowered: String = str_to_lower(s)
    return lowered
}
// Alias: same result as str_to_upper.
fn str_upper(s: String) -> String {
    let uppered: String = str_to_upper(s)
    return uppered
}
// Whitespace trimming
// Remove leading and trailing ASCII whitespace (as classified by _is_ws).
// An empty or all-whitespace input yields "".
fn str_trim(s: String) -> String {
    let total: Int = __str_len(s)
    if total == 0 { return "" }
    let first_kept: Int = _find_first_non_ws(s, total)
    if first_kept < total {
        // At least one non-whitespace byte exists, so the backward scan finds it.
        let last_kept: Int = _find_last_non_ws(s, total)
        return __str_slice_raw(s, first_kept, last_kept + 1)
    }
    return ""
}
// Remove leading ASCII whitespace (as classified by _is_ws).
// An empty or all-whitespace input yields "".
fn str_lstrip(s: String) -> String {
    let total: Int = __str_len(s)
    if total == 0 { return "" }
    let first_kept: Int = _find_first_non_ws(s, total)
    if first_kept < total {
        return __str_slice_raw(s, first_kept, total)
    }
    return ""
}
// Remove trailing ASCII whitespace (as classified by _is_ws).
// An empty or all-whitespace input yields "".
fn str_rstrip(s: String) -> String {
    let total: Int = __str_len(s)
    if total == 0 { return "" }
    let last_kept: Int = _find_last_non_ws(s, total)
    if last_kept >= 0 {
        return __str_slice_raw(s, 0, last_kept + 1)
    }
    return ""
}
// Replacement
// Replace every non-overlapping occurrence of `from` in `s` with `to`,
// scanning left to right. An empty `from` or an empty `s` returns `s` as-is.
//
// Unmatched text is copied in whole runs: `run_start` marks the first byte not
// yet emitted, and a run is flushed only when a match is found or the scan
// ends. This avoids a one-byte slice plus concat for every unmatched byte.
fn str_replace(s: String, from: String, to: String) -> String {
    let slen: Int = __str_len(s)
    let flen: Int = __str_len(from)
    if flen == 0 { return s }
    if slen == 0 { return s }
    let result: String = ""
    let run_start: Int = 0
    // Last index at which a full `from` still fits; negative if it never fits.
    let limit: Int = slen - flen
    let i: Int = 0
    while i <= limit {
        let window: String = __str_slice_raw(s, i, i + flen)
        if __str_cmp(window, from) == 0 {
            // Flush the unmatched run before the match, then the replacement.
            result = __str_concat_raw(result, __str_slice_raw(s, run_start, i))
            result = __str_concat_raw(result, to)
            i = i + flen
            run_start = i
        } else {
            i = i + 1
        }
    }
    // Whatever follows the last match (or all of `s` when nothing matched).
    return __str_concat_raw(result, __str_slice_raw(s, run_start, slen))
}
// Repetition and reversal
// `s` concatenated with itself `n` times. A non-positive `n` or an empty `s`
// yields "".
fn str_repeat(s: String, n: Int) -> String {
    if n <= 0 { return "" }
    if __str_len(s) == 0 { return "" }
    let out: String = ""
    let remaining: Int = n
    while remaining > 0 {
        out = __str_concat_raw(out, s)
        remaining = remaining - 1
    }
    return out
}
// Byte-wise reversal of `s` into a freshly allocated buffer.
// Correct for ASCII. For multi-byte UTF-8 codepoints the bytes inside each
// codepoint are reversed too; this is intentional at this tier (Phase 2 will
// add grapheme-aware reversal).
fn str_reverse(s: String) -> String {
    let total: Int = __str_len(s)
    if total == 0 { return "" }
    let out: String = __str_alloc(total)
    // `src` walks the input forward while `dst` walks the output backward.
    let src: Int = 0
    let dst: Int = total - 1
    while src < total {
        out = __str_set_char(out, dst, __str_char_at(s, src))
        src = src + 1
        dst = dst - 1
    }
    return out
}
// Prefix/suffix stripping
// Return `s` without its leading `prefix` when `s` starts with it; otherwise
// return `s` unchanged. An empty prefix, or one longer than `s`, leaves `s`
// unchanged.
fn str_strip_prefix(s: String, prefix: String) -> String {
    let prefix_len: Int = __str_len(prefix)
    if prefix_len == 0 { return s }
    let total: Int = __str_len(s)
    if prefix_len > total { return s }
    let order: Int = __str_ncmp(s, prefix, prefix_len)
    if order == 0 {
        return __str_slice_raw(s, prefix_len, total)
    }
    return s
}
// Return `s` without its trailing `suffix` when `s` ends with it; otherwise
// return `s` unchanged. An empty suffix, or one longer than `s`, leaves `s`
// unchanged.
fn str_strip_suffix(s: String, suffix: String) -> String {
    let suffix_len: Int = __str_len(suffix)
    if suffix_len == 0 { return s }
    let total: Int = __str_len(s)
    if suffix_len > total { return s }
    let keep: Int = total - suffix_len
    let order: Int = __str_cmp(__str_slice_raw(s, keep, total), suffix)
    if order == 0 {
        return __str_slice_raw(s, 0, keep)
    }
    return s
}
// Strip leading and trailing bytes of `s` whose char code appears in `chars`.
// An empty `s` yields ""; an empty `chars` returns `s` unchanged; if every
// byte of `s` is in the set the result is "".
fn str_strip_chars(s: String, chars: String) -> String {
    let total: Int = __str_len(s)
    if total == 0 { return "" }
    let set_len: Int = __str_len(chars)
    if set_len == 0 { return s }
    let first_kept: Int = _find_first_not_in_charset(s, chars, total, set_len)
    if first_kept < total {
        // A byte outside the set exists, so the backward scan finds one too.
        let last_kept: Int = _find_last_not_in_charset(s, chars, total, set_len)
        return __str_slice_raw(s, first_kept, last_kept + 1)
    }
    return ""
}
// Internal: true when char code `c` equals one of the first `clen` bytes of
// the charset string `chars`.
fn _char_in_set(c: Int, chars: String, clen: Int) -> Bool {
    // Membership only: the order in which the set is probed is irrelevant.
    let k: Int = clen - 1
    while k >= 0 {
        if __str_char_at(chars, k) == c { return true }
        k = k - 1
    }
    return false
}
// Internal: index of the first of the first `slen` bytes of `s` whose char
// code is NOT in `chars` (per _char_in_set with `clen`); `slen` when every
// byte examined is in the set.
fn _find_first_not_in_charset(s: String, chars: String, slen: Int, clen: Int) -> Int {
    let pos: Int = 0
    while pos < slen {
        let in_set: Bool = _char_in_set(__str_char_at(s, pos), chars, clen)
        if in_set {
            pos = pos + 1
        } else {
            return pos
        }
    }
    return slen
}
// Internal: scanning backward from byte slen-1, index of the last byte of `s`
// whose char code is NOT in `chars` (per _char_in_set with `clen`); -1 when
// every byte examined is in the set.
fn _find_last_not_in_charset(s: String, chars: String, slen: Int, clen: Int) -> Int {
    let pos: Int = slen - 1
    while pos >= 0 {
        let in_set: Bool = _char_in_set(__str_char_at(s, pos), chars, clen)
        if in_set {
            pos = pos - 1
        } else {
            return pos
        }
    }
    return -1
}
// Padding
// Pad `s` on the left until it is `width` bytes long, cycling through the
// bytes of `pad` from its start. `s` is returned unchanged when it is already
// at least `width` bytes long or when `pad` is empty.
fn str_pad_left(s: String, width: Int, pad: String) -> String {
    let have: Int = __str_len(s)
    if have >= width { return s }
    let pad_len: Int = __str_len(pad)
    if pad_len == 0 { return s }
    let filler: String = ""
    let remaining: Int = width - have
    // `cursor` cycles 0, 1, ..., pad_len-1, 0, ... over the pad bytes.
    let cursor: Int = 0
    while remaining > 0 {
        filler = __str_concat_raw(filler, __str_slice_raw(pad, cursor, cursor + 1))
        cursor = cursor + 1
        if cursor == pad_len { cursor = 0 }
        remaining = remaining - 1
    }
    return __str_concat_raw(filler, s)
}
// Pad `s` on the right until it is `width` bytes long, cycling through the
// bytes of `pad` from its start. `s` is returned unchanged when it is already
// at least `width` bytes long or when `pad` is empty.
fn str_pad_right(s: String, width: Int, pad: String) -> String {
    let have: Int = __str_len(s)
    if have >= width { return s }
    let pad_len: Int = __str_len(pad)
    if pad_len == 0 { return s }
    let filler: String = ""
    let remaining: Int = width - have
    // `cursor` cycles 0, 1, ..., pad_len-1, 0, ... over the pad bytes.
    let cursor: Int = 0
    while remaining > 0 {
        filler = __str_concat_raw(filler, __str_slice_raw(pad, cursor, cursor + 1))
        cursor = cursor + 1
        if cursor == pad_len { cursor = 0 }
        remaining = remaining - 1
    }
    return __str_concat_raw(s, filler)
}
// Counting
// Number of non-overlapping occurrences of `sub` in `s`, scanning left to
// right and resuming just past each match. An empty `sub`, or one longer than
// `s`, counts as 0.
fn str_count(s: String, sub: String) -> Int {
    let needle_len: Int = __str_len(sub)
    if needle_len == 0 { return 0 }
    // Negative when `sub` is longer than `s`, in which case the loop never runs.
    let last_start: Int = __str_len(s) - needle_len
    let found: Int = 0
    let pos: Int = 0
    while pos <= last_start {
        let advance: Int = 1
        if __str_cmp(__str_slice_raw(s, pos, pos + needle_len), sub) == 0 {
            found = found + 1
            advance = needle_len
        }
        pos = pos + advance
    }
    return found
}
// Byte count of `s`; same value as str_len (both read __str_len).
fn str_count_bytes(s: String) -> Int {
    let bytes: Int = __str_len(s)
    return bytes
}
// UTF-8 codepoint count: every byte that is not a continuation byte starts a
// codepoint. Continuation bytes have the bit pattern 10xxxxxx, i.e. values
// 0x80..0xBF (128..191).
// NOTE(review): assumes __str_char_at reports bytes as unsigned values — confirm.
fn str_count_chars(s: String) -> Int {
    let total: Int = __str_len(s)
    let codepoints: Int = 0
    let pos: Int = 0
    while pos < total {
        let code: Int = __str_char_at(s, pos)
        let continuation: Bool = false
        if code >= 128 {
            if code < 192 { continuation = true }
        }
        if !continuation { codepoints = codepoints + 1 }
        pos = pos + 1
    }
    return codepoints
}
// Count newline-delimited lines. Each \n ends a line; text after the last \n
// counts as one more line. A trailing newline therefore does NOT add an extra
// empty line, and the empty string has 0 lines.
fn str_count_lines(s: String) -> Int {
    let total: Int = __str_len(s)
    if total == 0 { return 0 }
    let newlines: Int = 0
    let pos: Int = 0
    while pos < total {
        if __str_char_at(s, pos) == 10 { newlines = newlines + 1 } // \n
        pos = pos + 1
    }
    // Ending on \n means the last line is already counted.
    if __str_char_at(s, total - 1) == 10 { return newlines }
    return newlines + 1
}
// Count whitespace-delimited words: maximal runs of bytes that _is_ws does not
// classify as whitespace. A word is counted at its first byte, i.e. a
// non-whitespace byte that follows whitespace or starts the string.
fn str_count_words(s: String) -> Int {
    let total: Int = __str_len(s)
    let words: Int = 0
    // Treat the position before the string as whitespace so that a leading
    // non-whitespace byte opens a word.
    let prev_blank: Bool = true
    let pos: Int = 0
    while pos < total {
        let blank: Bool = _is_ws(__str_char_at(s, pos))
        if !blank {
            if prev_blank { words = words + 1 }
        }
        prev_blank = blank
        pos = pos + 1
    }
    return words
}
// Count bytes of `s` that are ASCII letters [A-Za-z].
fn str_count_letters(s: String) -> Int {
    let total: Int = __str_len(s)
    let letters: Int = 0
    let pos: Int = 0
    while pos < total {
        let code: Int = __str_char_at(s, pos)
        let alpha: Bool = false
        if code >= 65 { if code <= 90 { alpha = true } }  // A-Z
        if code >= 97 { if code <= 122 { alpha = true } } // a-z
        if alpha { letters = letters + 1 }
        pos = pos + 1
    }
    return letters
}
// Count bytes of `s` that are ASCII decimal digits [0-9].
fn str_count_digits(s: String) -> Int {
    let total: Int = __str_len(s)
    let digits: Int = 0
    let pos: Int = 0
    while pos < total {
        let code: Int = __str_char_at(s, pos)
        let numeric: Bool = false
        if code >= 48 { if code <= 57 { numeric = true } } // '0'-'9'
        if numeric { digits = digits + 1 }
        pos = pos + 1
    }
    return digits
}
// Character classification
//
// For all predicates: empty string -> false.
// Multi-char string: ALL bytes must satisfy the predicate.
// True when `s` is non-empty and every byte is an ASCII letter [A-Za-z].
fn is_letter(s: String) -> Bool {
    let total: Int = __str_len(s)
    if total == 0 { return false }
    let pos: Int = 0
    while pos < total {
        let code: Int = __str_char_at(s, pos)
        if code < 65 { return false }  // below 'A'
        if code > 122 { return false } // above 'z'
        // The gap between 'Z' (90) and 'a' (97) holds non-letters.
        if code > 90 { if code < 97 { return false } }
        pos = pos + 1
    }
    return true
}
// True when `s` is non-empty and every byte is an ASCII digit [0-9].
fn is_digit(s: String) -> Bool {
    let total: Int = __str_len(s)
    if total == 0 { return false }
    let pos: Int = 0
    while pos < total {
        let code: Int = __str_char_at(s, pos)
        let numeric: Bool = false
        if code >= 48 { if code <= 57 { numeric = true } } // '0'-'9'
        if !numeric { return false }
        pos = pos + 1
    }
    return true
}
// True when `s` is non-empty and every byte is an ASCII letter or digit
// [0-9A-Za-z].
fn is_alphanumeric(s: String) -> Bool {
    let total: Int = __str_len(s)
    if total == 0 { return false }
    let pos: Int = 0
    while pos < total {
        let code: Int = __str_char_at(s, pos)
        if code < 48 { return false }  // below '0'
        if code > 122 { return false } // above 'z'
        // Reject the two gaps: 58..64 (between '9' and 'A') and 91..96
        // (between 'Z' and 'a').
        if code > 57 { if code < 65 { return false } }
        if code > 90 { if code < 97 { return false } }
        pos = pos + 1
    }
    return true
}
// True when `s` is non-empty and every byte is ASCII whitespace per _is_ws.
fn is_whitespace(s: String) -> Bool {
    let total: Int = __str_len(s)
    if total == 0 { return false }
    let pos: Int = 0
    while pos < total {
        let blank: Bool = _is_ws(__str_char_at(s, pos))
        if !blank { return false }
        pos = pos + 1
    }
    return true
}
// True when `s` is non-empty and every byte is ASCII punctuation:
// 33-47, 58-64, 91-96, 123-126. Equivalently: a byte in 33..126 that is
// neither a digit nor a letter.
fn is_punctuation(s: String) -> Bool {
    let total: Int = __str_len(s)
    if total == 0 { return false }
    let pos: Int = 0
    while pos < total {
        let code: Int = __str_char_at(s, pos)
        if code < 33 { return false }
        if code > 126 { return false }
        if code >= 48 { if code <= 57 { return false } }  // 0-9
        if code >= 65 { if code <= 90 { return false } }  // A-Z
        if code >= 97 { if code <= 122 { return false } } // a-z
        pos = pos + 1
    }
    return true
}
// True when `s` is non-empty and every byte is an ASCII uppercase letter [A-Z].
fn is_uppercase(s: String) -> Bool {
    let total: Int = __str_len(s)
    if total == 0 { return false }
    let pos: Int = 0
    while pos < total {
        let code: Int = __str_char_at(s, pos)
        let upper: Bool = false
        if code >= 65 { if code <= 90 { upper = true } } // 'A'-'Z'
        if !upper { return false }
        pos = pos + 1
    }
    return true
}
// True when `s` is non-empty and every byte is an ASCII lowercase letter [a-z].
fn is_lowercase(s: String) -> Bool {
    let total: Int = __str_len(s)
    if total == 0 { return false }
    let pos: Int = 0
    while pos < total {
        let code: Int = __str_char_at(s, pos)
        let lower: Bool = false
        if code >= 97 { if code <= 122 { lower = true } } // 'a'-'z'
        if !lower { return false }
        pos = pos + 1
    }
    return true
}
// Splitting
// Split `s` on every non-overlapping occurrence of `sep`, left to right.
// - An empty `sep` yields the single-element list [s].
// - Adjacent separators, or a separator at either end, produce empty-string
//   parts; the final part is always appended, even when empty.
fn str_split(s: String, sep: String) -> [String] {
    let pieces: [String] = el_list_empty()
    let sep_len: Int = __str_len(sep)
    if sep_len == 0 {
        return el_list_append(pieces, s)
    }
    let total: Int = __str_len(s)
    // Last index at which a full separator still fits; negative if it never fits.
    let last_start: Int = total - sep_len
    let piece_from: Int = 0
    let pos: Int = 0
    while pos <= last_start {
        if __str_cmp(__str_slice_raw(s, pos, pos + sep_len), sep) == 0 {
            pieces = el_list_append(pieces, __str_slice_raw(s, piece_from, pos))
            pos = pos + sep_len
            piece_from = pos
        } else {
            pos = pos + 1
        }
    }
    // Remaining tail (empty when `s` ended with `sep`).
    return el_list_append(pieces, __str_slice_raw(s, piece_from, total))
}
// Split `s` on `sep` into at most `n` parts. Once n-1 separators have been
// consumed, the rest of `s` becomes the final part verbatim, including any
// further separators.
// - n <= 0 returns [].
// - n == 1, or an empty `sep`, returns [s].
fn str_split_n(s: String, sep: String, n: Int) -> [String] {
    let pieces: [String] = el_list_empty()
    if n <= 0 { return pieces }
    if n == 1 {
        return el_list_append(pieces, s)
    }
    let sep_len: Int = __str_len(sep)
    if sep_len == 0 {
        return el_list_append(pieces, s)
    }
    let total: Int = __str_len(s)
    // Last index at which a full separator still fits; negative if it never fits.
    let last_start: Int = total - sep_len
    // Number of separators that may still be consumed.
    let cuts_left: Int = n - 1
    let piece_from: Int = 0
    let pos: Int = 0
    while pos <= last_start {
        if cuts_left <= 0 {
            // Split budget exhausted: leave the loop, the remainder is emitted below.
            pos = last_start + 1
        } else {
            if __str_cmp(__str_slice_raw(s, pos, pos + sep_len), sep) == 0 {
                pieces = el_list_append(pieces, __str_slice_raw(s, piece_from, pos))
                pos = pos + sep_len
                piece_from = pos
                cuts_left = cuts_left - 1
            } else {
                pos = pos + 1
            }
        }
    }
    // Remainder verbatim.
    return el_list_append(pieces, __str_slice_raw(s, piece_from, total))
}
// Split `s` into lines on \n. A \r immediately before the \n is dropped, so
// \r\n behaves like \n. Text after the last \n becomes a final line; when `s`
// ends with \n no trailing empty line is added, so "a\nb\n" yields ["a", "b"],
// not ["a", "b", ""]. The empty string yields [].
fn str_split_lines(s: String) -> [String] {
    let lines: [String] = el_list_empty()
    let total: Int = __str_len(s)
    if total == 0 { return lines }
    let from_idx: Int = 0
    let pos: Int = 0
    while pos < total {
        if __str_char_at(s, pos) == 10 { // \n
            let stop_idx: Int = pos
            // Only look back when the line has at least one byte of its own.
            if stop_idx > from_idx {
                if __str_char_at(s, stop_idx - 1) == 13 { stop_idx = stop_idx - 1 } // \r
            }
            lines = el_list_append(lines, __str_slice_raw(s, from_idx, stop_idx))
            from_idx = pos + 1
        }
        pos = pos + 1
    }
    // Unterminated trailing content.
    if from_idx < total {
        lines = el_list_append(lines, __str_slice_raw(s, from_idx, total))
    }
    return lines
}
// Split `s` into a list of one-byte strings, in order (byte-level, not
// codepoint-level). The empty string yields [].
fn str_split_chars(s: String) -> [String] {
    let pieces: [String] = el_list_empty()
    let total: Int = __str_len(s)
    let pos: Int = 0
    while pos < total {
        let next_pos: Int = pos + 1
        pieces = el_list_append(pieces, __str_slice_raw(s, pos, next_pos))
        pos = next_pos
    }
    return pieces
}
// Joining
// Join the strings in `parts`, inserting `sep` between consecutive elements.
// An empty list yields ""; a single-element list yields that element.
// Non-string elements should not be passed here.
fn str_join(parts: [String], sep: String) -> String {
    let count: Int = el_list_len(parts)
    if count == 0 { return "" }
    let joined: String = el_list_get(parts, 0)
    let idx: Int = 1
    while idx < count {
        // Each later element is appended as separator + element.
        let piece: String = __str_concat_raw(sep, el_list_get(parts, idx))
        joined = __str_concat_raw(joined, piece)
        idx = idx + 1
    }
    return joined
}
// DHARMA byte encoding (str_to_bytes)
//
// str_to_bytes: encode a string as the text of a JSON array of byte values.
// "hi" -> "[104,105]"; "" -> "[]".
// Used by db.el to store content in Engram JSON nodes as a byte array.
// Note: bytes_to_str (the inverse) is defined in json.el because it depends
// on json_array_get_string, which is defined there.
fn str_to_bytes(s: String) -> String {
    let total: Int = __str_len(s)
    if total == 0 { return "[]" }
    // The first byte goes in without a leading comma; every later byte is
    // preceded by one.
    let json: String = __str_concat_raw("[", __int_to_str(__str_char_at(s, 0)))
    let pos: Int = 1
    while pos < total {
        json = __str_concat_raw(json, ",")
        json = __str_concat_raw(json, __int_to_str(__str_char_at(s, pos)))
        pos = pos + 1
    }
    return __str_concat_raw(json, "]")
}
// Cryptographic hashing
// hash_sha256: return the SHA-256 hex digest of a string.
// Delegates to the __sha256_hex seed primitive.
fn hash_sha256(s: String) -> String {
    let digest: String = __sha256_hex(s)
    return digest
}