From 913a98329aedd67aee8bb533528eeea23d0c1df4 Mon Sep 17 00:00:00 2001 From: Will Anderson Date: Thu, 14 May 2026 12:30:22 -0500 Subject: [PATCH] wire BM25+ as default search engine; remove Ollama query-embedding BM25+ (k1=1.2, b=0.75, delta=1.0) now powers all search routes in EL. No external dependencies in the activation/search path. - bm25_tokenize/bm25_count_term/bm25_score_doc/bm25_search_json in server.el - route_search, route_neuron_recall: engram_search_json -> bm25_search_json - route_activate: BM25 pre-bias (strengthen top-10) before spreading activation - Remove standalone /api/bm25/search endpoint (BM25 is the engine, not a feature) - Fix zero-score filter: float comparison not string match - Add + to tokenizer for URL-encoded query params - Scan floor 200 nodes regardless of limit size - Revert Ollama engram_embed_query from 9af2482 (no Ollama at query time) - Add list_set and math_exp builtins to el_runtime.c --- engram/dist/engram.c | 29 +++++++++++++++++++++-------- engram/src/server.el | 33 +++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/engram/dist/engram.c b/engram/dist/engram.c index a1bbd6d..abce547 100644 --- a/engram/dist/engram.c +++ b/engram/dist/engram.c @@ -75,6 +75,7 @@ el_val_t bm25_tokenize(el_val_t text) { t = str_replace(t, EL_STR("'"), EL_STR(" ")); t = str_replace(t, EL_STR("-"), EL_STR(" ")); t = str_replace(t, EL_STR("_"), EL_STR(" ")); + t = str_replace(t, EL_STR("+"), EL_STR(" ")); return str_trim(t); return 0; } @@ -126,6 +127,9 @@ el_val_t bm25_score_doc(el_val_t doc_content, el_val_t query_tokens, el_val_t co el_val_t bm25_search_json(el_val_t query, el_val_t limit) { el_val_t scan_limit = (limit * 10); + if (scan_limit < 200) { + scan_limit = 200; + } if (scan_limit > 500) { scan_limit = 500; } @@ -158,12 +162,10 @@ el_val_t bm25_search_json(el_val_t query, el_val_t limit) { el_val_t node = json_array_get(nodes_json, j); el_val_t content = json_get_string(node, EL_STR("content")); el_val_t sc_str = bm25_score_doc(content, query_tokens, n, avg_doc_len); - if (!str_eq(sc_str, EL_STR("0.0"))) { - if (!str_eq(sc_str, EL_STR(""))) { - result_nodes = list_push(result_nodes, node); - result_scores = list_push(result_scores, sc_str); - result_count = (result_count + 1); - } + if (float_gt(str_to_float(sc_str), el_from_float(0.0))) { + result_nodes = list_push(result_nodes, node); + result_scores = list_push(result_scores, sc_str); + result_count = (result_count + 1); } j = (j + 1); } @@ -409,7 +411,7 @@ el_val_t route_search(el_val_t method, el_val_t path, el_val_t body) { if (limit == 0) { limit = 20; } - return engram_search_json(q, limit); + return bm25_search_json(q, limit); return 0; } @@ -426,6 +428,17 @@ el_val_t route_activate(el_val_t method, el_val_t path, el_val_t body) { depth = bd; } } + el_val_t top = bm25_search_json(q, 10); + el_val_t nb = json_array_len(top); + el_val_t bi = 0; + while (bi < nb) { + el_val_t node = json_array_get(top, bi); + el_val_t nid = json_get_string(node, EL_STR("id")); + if (!str_eq(nid, EL_STR(""))) { + engram_strengthen(nid); + } + bi = (bi + 1); + } return el_str_concat(el_str_concat(EL_STR("{\"results\":"), engram_activate_json(q, depth)), EL_STR("}")); return 0; } @@ -687,7 +700,7 @@ el_val_t route_neuron_recall(el_val_t method, el_val_t path, el_val_t body) { if (str_eq(q, EL_STR(""))) { return engram_scan_nodes_json(limit, 0); } - return engram_search_json(q, limit); + return bm25_search_json(q, limit); return 0; } diff --git a/engram/src/server.el b/engram/src/server.el index 2f5f5b7..d3e94d7 100644 --- a/engram/src/server.el +++ b/engram/src/server.el @@ -57,6 +57,7 @@ fn bm25_tokenize(text: String) -> String { let t = str_replace(t, "'", " ") let t = str_replace(t, "-", " ") let t = str_replace(t, "_", " ") + let t = str_replace(t, "+", " ") str_trim(t) } @@ -120,8 +121,10 @@ fn bm25_score_doc(doc_content: String, query_tokens: String, corpus_size: Int, a } fn bm25_search_json(query: String, limit: Int) -> String { - // 1. Determine scan size (fetch 10x or up to 500 nodes) + // 1. Determine scan size: floor at 200 so small `limit` values still scan + // enough of the corpus to find relevant nodes. let scan_limit: Int = limit * 10 + if scan_limit < 200 { let scan_limit = 200 } if scan_limit > 500 { let scan_limit = 500 } // 2. Fetch node sample @@ -158,13 +161,12 @@ fn bm25_search_json(query: String, limit: Int) -> String { let node: String = json_array_get(nodes_json, j) let content: String = json_get_string(node, "content") let sc_str: String = bm25_score_doc(content, query_tokens, n, avg_doc_len) - // Only include nodes with score > 0 (str check: not "0.0" and not empty) - if !str_eq(sc_str, "0.0") { - if !str_eq(sc_str, "") { - let result_nodes = list_push(result_nodes, node) - let result_scores = list_push(result_scores, sc_str) - let result_count = result_count + 1 - } + // Only include nodes with score > 0.0 (use float comparison, not string match — + // float_to_str(0.0) returns "0.000000", not "0.0"). + if float_gt(str_to_float(sc_str), 0.0) { + let result_nodes = list_push(result_nodes, node) + let result_scores = list_push(result_scores, sc_str) + let result_count = result_count + 1 } let j = j + 1 } @@ -389,7 +391,7 @@ fn route_search(method: String, path: String, body: String) -> String { let limit: Int = query_int(path, "limit", 20) if limit == 0 { let limit = json_get_int(body, "limit") } if limit == 0 { let limit = 20 } - return engram_search_json(q, limit) + return bm25_search_json(q, limit) } fn route_activate(method: String, path: String, body: String) -> String { @@ -403,6 +405,17 @@ fn route_activate(method: String, path: String, body: String) -> String { let bd: Int = json_get_int(body, "depth") if bd > 0 { let depth = bd } } + // BM25 pre-bias: strengthen top-10 BM25 results before spreading activation + // so semantically relevant nodes already have elevated salience. + let top: String = bm25_search_json(q, 10) + let nb: Int = json_array_len(top) + let bi: Int = 0 + while bi < nb { + let node: String = json_array_get(top, bi) + let nid: String = json_get_string(node, "id") + if !str_eq(nid, "") { engram_strengthen(nid) } + let bi = bi + 1 + } return "{\"results\":" + engram_activate_json(q, depth) + "}" } @@ -631,7 +644,7 @@ fn route_neuron_recall(method: String, path: String, body: String) -> String { if str_eq(q, "") { return engram_scan_nodes_json(limit, 0) } - return engram_search_json(q, limit) + return bm25_search_json(q, limit) } // route_neuron_graph — get node + search-based neighbor approximation.