diff --git a/frappe/search/sqlite_search.py b/frappe/search/sqlite_search.py index 061145462c..b79d93908a 100644 --- a/frappe/search/sqlite_search.py +++ b/frappe/search/sqlite_search.py @@ -53,6 +53,7 @@ class SQLiteSearchIndexMissingError(Exception): # Search Configuration Constants MAX_SEARCH_RESULTS = 100 +MAX_RERANK_CANDIDATES = 500 SNIPPET_LENGTH = 64 MIN_WORD_LENGTH = 4 MAX_EDIT_DISTANCE = 3 @@ -375,12 +376,17 @@ class SQLiteSearch(ABC): # Process this doctype in batches last_indexed_modified = doctype_progress.get("last_indexed_modified") + last_indexed_name = doctype_progress.get("last_indexed_name") + progress_field = "creation" batch_count = 0 while True: # Get batch of documents docs = self.get_documents_paginated( - doctype, limit=batch_size, last_indexed_modified=last_indexed_modified + doctype, + limit=batch_size, + last_indexed_modified=last_indexed_modified, + last_indexed_name=last_indexed_name, ) if not docs: @@ -398,13 +404,12 @@ class SQLiteSearch(ABC): if documents: self._index_documents(documents) - # Update progress with last processed document's modification time - # Use hardcoded 'modified' field since it's reliable in all Frappe doctypes - last_doc_modified = docs[-1]["modified"] - + # Update progress with last processed document cursor + last_doc_modified = docs[-1].get(progress_field) or docs[-1].get("modified") last_doc_name = docs[-1]["name"] self._update_index_progress(doctype, last_doc_name, last_doc_modified, len(documents)) last_indexed_modified = last_doc_modified + last_indexed_name = last_doc_name batch_count += 1 @@ -614,34 +619,48 @@ class SQLiteSearch(ABC): return records - def get_documents_paginated(self, doctype, limit=1000, last_indexed_modified=None): + def get_documents_paginated( + self, doctype, limit=1000, last_indexed_modified=None, last_indexed_name=None + ): """Get records for a specific doctype with pagination support.""" config = self.doc_configs.get(doctype) if not config: return [] filters = config.get("filters", {}).copy() + sort_field = "creation" - # Ensure 'modified' field is always included for progress tracking + # Ensure cursor fields are included for progress tracking fields = config["fields"].copy() + if sort_field not in fields: + fields.append(sort_field) if "modified" not in fields: fields.append("modified") + if "name" not in fields: + fields.append("name") # Build query with proper ordering and pagination - # Order by modified field for reliable resume capability + # Order by cursor field with name as tie-breaker for stable pagination query = frappe.qb.get_query( doctype, fields=fields, filters=filters, - order_by="creation ASC, name ASC", # Secondary sort by name for consistency + order_by=f"{sort_field} ASC, name ASC", limit=limit, ) - # If resuming from a specific timestamp, filter by modification time - # This is more reliable than name-based filtering for VARCHAR names + # If resuming from a checkpoint, continue from cursor position. + # Include name tie-breaker to avoid skipping docs with same timestamp. if last_indexed_modified: Table = frappe.qb.DocType(doctype) - query = query.where(Table.modified > last_indexed_modified) + sort_column = getattr(Table, sort_field) + if last_indexed_name: + query = query.where( + (sort_column > last_indexed_modified) + | ((sort_column == last_indexed_modified) & (Table.name > last_indexed_name)) + ) + else: + query = query.where(sort_column > last_indexed_modified) docs = query.run(as_dict=True) @@ -854,6 +873,8 @@ class SQLiteSearch(ABC): select_clause = ",\n ".join(select_fields) + candidate_limit = max(MAX_SEARCH_RESULTS, MAX_RERANK_CANDIDATES) + if title_only: sql = f""" SELECT @@ -866,12 +887,12 @@ class SQLiteSearch(ABC): ORDER BY bm25_score LIMIT ? """ - return self.sql(sql, (fts_query, fts_query, *filter_params, MAX_SEARCH_RESULTS), read_only=True) + return self.sql(sql, (fts_query, fts_query, *filter_params, candidate_limit), read_only=True) else: params = [] if "content" in text_fields: params.append(SNIPPET_LENGTH) - params.extend([fts_query, *filter_params, MAX_SEARCH_RESULTS]) + params.extend([fts_query, *filter_params, candidate_limit]) sql = f""" SELECT @@ -883,7 +904,6 @@ class SQLiteSearch(ABC): ORDER BY bm25_score LIMIT ? """ - print(sql) return self.sql(sql, params, read_only=True) def _process_search_results(self, raw_results, query): @@ -923,13 +943,19 @@ class SQLiteSearch(ABC): processed_results.append(result) # Sort by custom score (descending - higher is better) - processed_results.sort(key=lambda x: x["score"], reverse=True) + processed_results.sort( + key=lambda x: ( + -x["score"], + x["bm25_score"] if x["bm25_score"] is not None else float("inf"), + x["original_rank"], + ) + ) # Add modified ranking after custom scoring for i, result in enumerate(processed_results): result["modified_rank"] = i + 1 - return processed_results + return processed_results[:MAX_SEARCH_RESULTS] def get_scoring_pipeline(self): """ @@ -984,13 +1010,22 @@ class SQLiteSearch(ABC): def _get_base_score(self, row, query): """Calculate the base score from BM25.""" - bm25_score = abs(row["bm25_score"]) if row["bm25_score"] is not None else 0 - return 1.0 / (1.0 + bm25_score) if bm25_score > 0 else 0.5 + bm25_score = row["bm25_score"] + if bm25_score is None: + return 0.5 + + # FTS5 BM25 is better when smaller, so don't normalize with abs(). + # Clamp non-positive scores to a strong base to avoid unstable boosts. + if bm25_score <= 0: + return 1.0 + + return 1.0 / (1.0 + bm25_score) def _get_title_boost(self, row, query, query_words): """Calculate the title matching boost based on percentage of words matched.""" original_title = (row["original_title"] or "").lower() query_lower = query.lower() + title_tokens = set(re.findall(r"\w+", original_title)) # Check for exact phrase match first (highest boost) if query_lower in original_title: @@ -1002,7 +1037,7 @@ class SQLiteSearch(ABC): matched_words = 0 for word in query_words: - if word.lower() in original_title: + if word.lower() in title_tokens: matched_words += 1 if matched_words == 0: