Merge pull request #37244 from netchampfaris/sqlite-fts-ranking

refactor: make SQLite search order more reliable
This commit is contained in:
Faris Ansari 2026-02-19 14:58:46 +05:30 committed by GitHub
commit a4e2cd17f0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -53,6 +53,7 @@ class SQLiteSearchIndexMissingError(Exception):
# Search Configuration Constants
MAX_SEARCH_RESULTS = 100
MAX_RERANK_CANDIDATES = 500
SNIPPET_LENGTH = 64
MIN_WORD_LENGTH = 4
MAX_EDIT_DISTANCE = 3
@ -375,12 +376,17 @@ class SQLiteSearch(ABC):
# Process this doctype in batches
last_indexed_modified = doctype_progress.get("last_indexed_modified")
last_indexed_name = doctype_progress.get("last_indexed_name")
progress_field = "creation"
batch_count = 0
while True:
# Get batch of documents
docs = self.get_documents_paginated(
doctype, limit=batch_size, last_indexed_modified=last_indexed_modified
doctype,
limit=batch_size,
last_indexed_modified=last_indexed_modified,
last_indexed_name=last_indexed_name,
)
if not docs:
@ -398,13 +404,12 @@ class SQLiteSearch(ABC):
if documents:
self._index_documents(documents)
# Update progress with last processed document's modification time
# Use hardcoded 'modified' field since it's reliable in all Frappe doctypes
last_doc_modified = docs[-1]["modified"]
# Update progress with last processed document cursor
last_doc_modified = docs[-1].get(progress_field) or docs[-1].get("modified")
last_doc_name = docs[-1]["name"]
self._update_index_progress(doctype, last_doc_name, last_doc_modified, len(documents))
last_indexed_modified = last_doc_modified
last_indexed_name = last_doc_name
batch_count += 1
@ -614,34 +619,48 @@ class SQLiteSearch(ABC):
return records
def get_documents_paginated(self, doctype, limit=1000, last_indexed_modified=None):
def get_documents_paginated(
self, doctype, limit=1000, last_indexed_modified=None, last_indexed_name=None
):
"""Get records for a specific doctype with pagination support."""
config = self.doc_configs.get(doctype)
if not config:
return []
filters = config.get("filters", {}).copy()
sort_field = "creation"
# Ensure 'modified' field is always included for progress tracking
# Ensure cursor fields are included for progress tracking
fields = config["fields"].copy()
if sort_field not in fields:
fields.append(sort_field)
if "modified" not in fields:
fields.append("modified")
if "name" not in fields:
fields.append("name")
# Build query with proper ordering and pagination
# Order by modified field for reliable resume capability
# Order by cursor field with name as tie-breaker for stable pagination
query = frappe.qb.get_query(
doctype,
fields=fields,
filters=filters,
order_by="creation ASC, name ASC", # Secondary sort by name for consistency
order_by=f"{sort_field} ASC, name ASC",
limit=limit,
)
# If resuming from a specific timestamp, filter by modification time
# This is more reliable than name-based filtering for VARCHAR names
# If resuming from a checkpoint, continue from cursor position.
# Include name tie-breaker to avoid skipping docs with same timestamp.
if last_indexed_modified:
Table = frappe.qb.DocType(doctype)
query = query.where(Table.modified > last_indexed_modified)
sort_column = getattr(Table, sort_field)
if last_indexed_name:
query = query.where(
(sort_column > last_indexed_modified)
| ((sort_column == last_indexed_modified) & (Table.name > last_indexed_name))
)
else:
query = query.where(sort_column > last_indexed_modified)
docs = query.run(as_dict=True)
@ -854,6 +873,8 @@ class SQLiteSearch(ABC):
select_clause = ",\n ".join(select_fields)
candidate_limit = max(MAX_SEARCH_RESULTS, MAX_RERANK_CANDIDATES)
if title_only:
sql = f"""
SELECT
@ -866,12 +887,12 @@ class SQLiteSearch(ABC):
ORDER BY bm25_score
LIMIT ?
"""
return self.sql(sql, (fts_query, fts_query, *filter_params, MAX_SEARCH_RESULTS), read_only=True)
return self.sql(sql, (fts_query, fts_query, *filter_params, candidate_limit), read_only=True)
else:
params = []
if "content" in text_fields:
params.append(SNIPPET_LENGTH)
params.extend([fts_query, *filter_params, MAX_SEARCH_RESULTS])
params.extend([fts_query, *filter_params, candidate_limit])
sql = f"""
SELECT
@ -883,7 +904,6 @@ class SQLiteSearch(ABC):
ORDER BY bm25_score
LIMIT ?
"""
print(sql)
return self.sql(sql, params, read_only=True)
def _process_search_results(self, raw_results, query):
@ -923,13 +943,19 @@ class SQLiteSearch(ABC):
processed_results.append(result)
# Sort by custom score (descending - higher is better)
processed_results.sort(key=lambda x: x["score"], reverse=True)
processed_results.sort(
key=lambda x: (
-x["score"],
x["bm25_score"] if x["bm25_score"] is not None else float("inf"),
x["original_rank"],
)
)
# Add modified ranking after custom scoring
for i, result in enumerate(processed_results):
result["modified_rank"] = i + 1
return processed_results
return processed_results[:MAX_SEARCH_RESULTS]
def get_scoring_pipeline(self):
"""
@ -984,13 +1010,22 @@ class SQLiteSearch(ABC):
def _get_base_score(self, row, query):
    """Calculate the base score from BM25."""
    # NOTE(review): this span is a diff hunk — the next two lines are the
    # PRE-change implementation (removed by this commit). They normalized
    # with abs(), which inverted the meaning of negative BM25 values.
    bm25_score = abs(row["bm25_score"]) if row["bm25_score"] is not None else 0
    return 1.0 / (1.0 + bm25_score) if bm25_score > 0 else 0.5
    # POST-change implementation added by this commit:
    bm25_score = row["bm25_score"]
    # Missing score (row matched without FTS ranking) gets a neutral base.
    if bm25_score is None:
        return 0.5
    # FTS5 BM25 is better when smaller, so don't normalize with abs().
    # Clamp non-positive scores to a strong base to avoid unstable boosts.
    if bm25_score <= 0:
        return 1.0
    # Smaller (better) BM25 -> base score closer to 1.0.
    return 1.0 / (1.0 + bm25_score)
def _get_title_boost(self, row, query, query_words):
"""Calculate the title matching boost based on percentage of words matched."""
original_title = (row["original_title"] or "").lower()
query_lower = query.lower()
title_tokens = set(re.findall(r"\w+", original_title))
# Check for exact phrase match first (highest boost)
if query_lower in original_title:
@ -1002,7 +1037,7 @@ class SQLiteSearch(ABC):
matched_words = 0
for word in query_words:
if word.lower() in original_title:
if word.lower() in title_tokens:
matched_words += 1
if matched_words == 0: