Merge pull request #37244 from netchampfaris/sqlite-fts-ranking
refactor: make SQLite search order more reliable
This commit is contained in:
commit
a4e2cd17f0
1 changed file with 55 additions and 20 deletions
|
|
@ -53,6 +53,7 @@ class SQLiteSearchIndexMissingError(Exception):
|
|||
|
||||
# Search Configuration Constants
MAX_SEARCH_RESULTS = 100  # hard cap on results returned to the caller
MAX_RERANK_CANDIDATES = 500  # SQL LIMIT for the FTS candidate pool fed to re-ranking
SNIPPET_LENGTH = 64  # length passed to the FTS snippet for content previews
MIN_WORD_LENGTH = 4  # NOTE(review): usage not visible in this excerpt — confirm
MAX_EDIT_DISTANCE = 3  # NOTE(review): usage not visible in this excerpt — confirm
|
||||
|
|
@ -375,12 +376,17 @@ class SQLiteSearch(ABC):
|
|||
|
||||
# Process this doctype in batches
|
||||
last_indexed_modified = doctype_progress.get("last_indexed_modified")
|
||||
last_indexed_name = doctype_progress.get("last_indexed_name")
|
||||
progress_field = "creation"
|
||||
batch_count = 0
|
||||
|
||||
while True:
|
||||
# Get batch of documents
|
||||
docs = self.get_documents_paginated(
|
||||
doctype, limit=batch_size, last_indexed_modified=last_indexed_modified
|
||||
doctype,
|
||||
limit=batch_size,
|
||||
last_indexed_modified=last_indexed_modified,
|
||||
last_indexed_name=last_indexed_name,
|
||||
)
|
||||
|
||||
if not docs:
|
||||
|
|
@ -398,13 +404,12 @@ class SQLiteSearch(ABC):
|
|||
if documents:
|
||||
self._index_documents(documents)
|
||||
|
||||
# Update progress with last processed document's modification time
|
||||
# Use hardcoded 'modified' field since it's reliable in all Frappe doctypes
|
||||
last_doc_modified = docs[-1]["modified"]
|
||||
|
||||
# Update progress with last processed document cursor
|
||||
last_doc_modified = docs[-1].get(progress_field) or docs[-1].get("modified")
|
||||
last_doc_name = docs[-1]["name"]
|
||||
self._update_index_progress(doctype, last_doc_name, last_doc_modified, len(documents))
|
||||
last_indexed_modified = last_doc_modified
|
||||
last_indexed_name = last_doc_name
|
||||
|
||||
batch_count += 1
|
||||
|
||||
|
|
@ -614,34 +619,48 @@ class SQLiteSearch(ABC):
|
|||
|
||||
return records
|
||||
|
||||
def get_documents_paginated(
    self, doctype, limit=1000, last_indexed_modified=None, last_indexed_name=None
):
    """Get records for a specific doctype with pagination support.

    Pagination uses a (creation, name) cursor: rows are ordered by the sort
    field with ``name`` as a tie-breaker, and resuming applies the same
    compound condition so documents sharing a timestamp are not skipped.

    Args:
        doctype: doctype whose records should be fetched.
        limit: maximum number of rows per batch.
        last_indexed_modified: cursor value of the sort field from the last
            indexed document, or None to start from the beginning.
        last_indexed_name: name of the last indexed document, used as a
            tie-breaker when several rows share the cursor timestamp.

    Returns:
        list[dict]: at most ``limit`` document rows, or [] when the doctype
        has no search configuration.
    """
    config = self.doc_configs.get(doctype)
    if not config:
        return []

    filters = config.get("filters", {}).copy()
    sort_field = "creation"

    # Ensure cursor fields are included for progress tracking.
    fields = config["fields"].copy()
    if sort_field not in fields:
        fields.append(sort_field)
    if "modified" not in fields:
        fields.append("modified")
    if "name" not in fields:
        fields.append("name")

    # Build query with proper ordering and pagination.
    # Order by cursor field with name as tie-breaker for stable pagination.
    query = frappe.qb.get_query(
        doctype,
        fields=fields,
        filters=filters,
        order_by=f"{sort_field} ASC, name ASC",
        limit=limit,
    )

    # If resuming from a checkpoint, continue from the cursor position.
    # Include the name tie-breaker to avoid skipping docs with the same timestamp.
    if last_indexed_modified:
        Table = frappe.qb.DocType(doctype)
        sort_column = getattr(Table, sort_field)
        if last_indexed_name:
            query = query.where(
                (sort_column > last_indexed_modified)
                | ((sort_column == last_indexed_modified) & (Table.name > last_indexed_name))
            )
        else:
            query = query.where(sort_column > last_indexed_modified)

    docs = query.run(as_dict=True)
    # NOTE(review): the tail of this method was truncated in extraction;
    # assuming the fetched batch is returned directly — confirm against upstream.
    return docs
|
||||
|
||||
|
|
@ -854,6 +873,8 @@ class SQLiteSearch(ABC):
|
|||
|
||||
select_clause = ",\n ".join(select_fields)
|
||||
|
||||
candidate_limit = max(MAX_SEARCH_RESULTS, MAX_RERANK_CANDIDATES)
|
||||
|
||||
if title_only:
|
||||
sql = f"""
|
||||
SELECT
|
||||
|
|
@ -866,12 +887,12 @@ class SQLiteSearch(ABC):
|
|||
ORDER BY bm25_score
|
||||
LIMIT ?
|
||||
"""
|
||||
return self.sql(sql, (fts_query, fts_query, *filter_params, MAX_SEARCH_RESULTS), read_only=True)
|
||||
return self.sql(sql, (fts_query, fts_query, *filter_params, candidate_limit), read_only=True)
|
||||
else:
|
||||
params = []
|
||||
if "content" in text_fields:
|
||||
params.append(SNIPPET_LENGTH)
|
||||
params.extend([fts_query, *filter_params, MAX_SEARCH_RESULTS])
|
||||
params.extend([fts_query, *filter_params, candidate_limit])
|
||||
|
||||
sql = f"""
|
||||
SELECT
|
||||
|
|
@ -883,7 +904,6 @@ class SQLiteSearch(ABC):
|
|||
ORDER BY bm25_score
|
||||
LIMIT ?
|
||||
"""
|
||||
print(sql)
|
||||
return self.sql(sql, params, read_only=True)
|
||||
|
||||
def _process_search_results(self, raw_results, query):
|
||||
|
|
@ -923,13 +943,19 @@ class SQLiteSearch(ABC):
|
|||
processed_results.append(result)
|
||||
|
||||
# Sort by custom score (descending - higher is better)
|
||||
processed_results.sort(key=lambda x: x["score"], reverse=True)
|
||||
processed_results.sort(
|
||||
key=lambda x: (
|
||||
-x["score"],
|
||||
x["bm25_score"] if x["bm25_score"] is not None else float("inf"),
|
||||
x["original_rank"],
|
||||
)
|
||||
)
|
||||
|
||||
# Add modified ranking after custom scoring
|
||||
for i, result in enumerate(processed_results):
|
||||
result["modified_rank"] = i + 1
|
||||
|
||||
return processed_results
|
||||
return processed_results[:MAX_SEARCH_RESULTS]
|
||||
|
||||
def get_scoring_pipeline(self):
|
||||
"""
|
||||
|
|
@ -984,13 +1010,22 @@ class SQLiteSearch(ABC):
|
|||
|
||||
def _get_base_score(self, row, query):
|
||||
"""Calculate the base score from BM25."""
|
||||
bm25_score = abs(row["bm25_score"]) if row["bm25_score"] is not None else 0
|
||||
return 1.0 / (1.0 + bm25_score) if bm25_score > 0 else 0.5
|
||||
bm25_score = row["bm25_score"]
|
||||
if bm25_score is None:
|
||||
return 0.5
|
||||
|
||||
# FTS5 BM25 is better when smaller, so don't normalize with abs().
|
||||
# Clamp non-positive scores to a strong base to avoid unstable boosts.
|
||||
if bm25_score <= 0:
|
||||
return 1.0
|
||||
|
||||
return 1.0 / (1.0 + bm25_score)
|
||||
|
||||
def _get_title_boost(self, row, query, query_words):
|
||||
"""Calculate the title matching boost based on percentage of words matched."""
|
||||
original_title = (row["original_title"] or "").lower()
|
||||
query_lower = query.lower()
|
||||
title_tokens = set(re.findall(r"\w+", original_title))
|
||||
|
||||
# Check for exact phrase match first (highest boost)
|
||||
if query_lower in original_title:
|
||||
|
|
@ -1002,7 +1037,7 @@ class SQLiteSearch(ABC):
|
|||
|
||||
matched_words = 0
|
||||
for word in query_words:
|
||||
if word.lower() in original_title:
|
||||
if word.lower() in title_tokens:
|
||||
matched_words += 1
|
||||
|
||||
if matched_words == 0:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue