Merge pull request #37244 from netchampfaris/sqlite-fts-ranking
refactor: make SQLite search order more reliable
This commit is contained in:
commit
a4e2cd17f0
1 changed file with 55 additions and 20 deletions
|
|
@ -53,6 +53,7 @@ class SQLiteSearchIndexMissingError(Exception):
|
|||
|
||||
# Search Configuration Constants
MAX_SEARCH_RESULTS = 100  # hard cap on results returned to the caller
MAX_RERANK_CANDIDATES = 500  # SQL LIMIT for the FTS candidate pool fed to re-ranking
SNIPPET_LENGTH = 64  # length passed to the FTS snippet for content previews
MIN_WORD_LENGTH = 4  # NOTE(review): usage not visible in this excerpt — confirm
MAX_EDIT_DISTANCE = 3  # NOTE(review): usage not visible in this excerpt — confirm
|
||||
|
|
@ -375,12 +376,17 @@ class SQLiteSearch(ABC):
|
|||
|
||||
# Process this doctype in batches
|
||||
last_indexed_modified = doctype_progress.get("last_indexed_modified")
|
||||
last_indexed_name = doctype_progress.get("last_indexed_name")
|
||||
progress_field = "creation"
|
||||
batch_count = 0
|
||||
|
||||
while True:
|
||||
# Get batch of documents
|
||||
docs = self.get_documents_paginated(
|
||||
doctype, limit=batch_size, last_indexed_modified=last_indexed_modified
|
||||
doctype,
|
||||
limit=batch_size,
|
||||
last_indexed_modified=last_indexed_modified,
|
||||
last_indexed_name=last_indexed_name,
|
||||
)
|
||||
|
||||
if not docs:
|
||||
|
|
@ -398,13 +404,12 @@ class SQLiteSearch(ABC):
|
|||
if documents:
|
||||
self._index_documents(documents)
|
||||
|
||||
# Update progress with last processed document's modification time
|
||||
# Use hardcoded 'modified' field since it's reliable in all Frappe doctypes
|
||||
last_doc_modified = docs[-1]["modified"]
|
||||
|
||||
# Update progress with last processed document cursor
|
||||
last_doc_modified = docs[-1].get(progress_field) or docs[-1].get("modified")
|
||||
last_doc_name = docs[-1]["name"]
|
||||
self._update_index_progress(doctype, last_doc_name, last_doc_modified, len(documents))
|
||||
last_indexed_modified = last_doc_modified
|
||||
last_indexed_name = last_doc_name
|
||||
|
||||
batch_count += 1
|
||||
|
||||
|
|
@ -614,34 +619,48 @@ class SQLiteSearch(ABC):
|
|||
|
||||
return records
|
||||
|
||||
def get_documents_paginated(
    self, doctype, limit=1000, last_indexed_modified=None, last_indexed_name=None
):
    """Get records for a specific doctype with pagination support.

    Pagination uses a (creation, name) cursor: rows are ordered by the sort
    field with ``name`` as a tie-breaker, and resuming applies the same
    compound condition so documents sharing a timestamp are not skipped.

    Args:
        doctype: doctype whose records should be fetched.
        limit: maximum number of rows per batch.
        last_indexed_modified: cursor value of the sort field from the last
            indexed document, or None to start from the beginning.
        last_indexed_name: name of the last indexed document, used as a
            tie-breaker when several rows share the cursor timestamp.

    Returns:
        list[dict]: at most ``limit`` document rows, or [] when the doctype
        has no search configuration.
    """
    config = self.doc_configs.get(doctype)
    if not config:
        return []

    filters = config.get("filters", {}).copy()
    sort_field = "creation"

    # Ensure cursor fields are included for progress tracking.
    fields = config["fields"].copy()
    if sort_field not in fields:
        fields.append(sort_field)
    if "modified" not in fields:
        fields.append("modified")
    if "name" not in fields:
        fields.append("name")

    # Build query with proper ordering and pagination.
    # Order by cursor field with name as tie-breaker for stable pagination.
    query = frappe.qb.get_query(
        doctype,
        fields=fields,
        filters=filters,
        order_by=f"{sort_field} ASC, name ASC",
        limit=limit,
    )

    # If resuming from a checkpoint, continue from the cursor position.
    # Include the name tie-breaker to avoid skipping docs with the same timestamp.
    if last_indexed_modified:
        Table = frappe.qb.DocType(doctype)
        sort_column = getattr(Table, sort_field)
        if last_indexed_name:
            query = query.where(
                (sort_column > last_indexed_modified)
                | ((sort_column == last_indexed_modified) & (Table.name > last_indexed_name))
            )
        else:
            query = query.where(sort_column > last_indexed_modified)

    docs = query.run(as_dict=True)
    # NOTE(review): the tail of this method was truncated in extraction;
    # assuming the fetched batch is returned directly — confirm against upstream.
    return docs
|
||||
|
||||
|
|
@ -854,6 +873,8 @@ class SQLiteSearch(ABC):
|
|||
|
||||
select_clause = ",\n ".join(select_fields)
|
||||
|
||||
candidate_limit = max(MAX_SEARCH_RESULTS, MAX_RERANK_CANDIDATES)
|
||||
|
||||
if title_only:
|
||||
sql = f"""
|
||||
SELECT
|
||||
|
|
@ -866,12 +887,12 @@ class SQLiteSearch(ABC):
|
|||
ORDER BY bm25_score
|
||||
LIMIT ?
|
||||
"""
|
||||
return self.sql(sql, (fts_query, fts_query, *filter_params, MAX_SEARCH_RESULTS), read_only=True)
|
||||
return self.sql(sql, (fts_query, fts_query, *filter_params, candidate_limit), read_only=True)
|
||||
else:
|
||||
params = []
|
||||
if "content" in text_fields:
|
||||
params.append(SNIPPET_LENGTH)
|
||||
params.extend([fts_query, *filter_params, MAX_SEARCH_RESULTS])
|
||||
params.extend([fts_query, *filter_params, candidate_limit])
|
||||
|
||||
sql = f"""
|
||||
SELECT
|
||||
|
|
@ -883,7 +904,6 @@ class SQLiteSearch(ABC):
|
|||
ORDER BY bm25_score
|
||||
LIMIT ?
|
||||
"""
|
||||
print(sql)
|
||||
return self.sql(sql, params, read_only=True)
|
||||
|
||||
def _process_search_results(self, raw_results, query):
|
||||
|
|
@ -923,13 +943,19 @@ class SQLiteSearch(ABC):
|
|||
processed_results.append(result)
|
||||
|
||||
# Sort by custom score (descending - higher is better)
|
||||
processed_results.sort(key=lambda x: x["score"], reverse=True)
|
||||
processed_results.sort(
|
||||
key=lambda x: (
|
||||
-x["score"],
|
||||
x["bm25_score"] if x["bm25_score"] is not None else float("inf"),
|
||||
x["original_rank"],
|
||||
)
|
||||
)
|
||||
|
||||
# Add modified ranking after custom scoring
|
||||
for i, result in enumerate(processed_results):
|
||||
result["modified_rank"] = i + 1
|
||||
|
||||
return processed_results
|
||||
return processed_results[:MAX_SEARCH_RESULTS]
|
||||
|
||||
def get_scoring_pipeline(self):
|
||||
"""
|
||||
|
|
@ -984,13 +1010,22 @@ class SQLiteSearch(ABC):
|
|||
|
||||
def _get_base_score(self, row, query):
|
||||
"""Calculate the base score from BM25."""
|
||||
bm25_score = abs(row["bm25_score"]) if row["bm25_score"] is not None else 0
|
||||
return 1.0 / (1.0 + bm25_score) if bm25_score > 0 else 0.5
|
||||
bm25_score = row["bm25_score"]
|
||||
if bm25_score is None:
|
||||
return 0.5
|
||||
|
||||
# FTS5 BM25 is better when smaller, so don't normalize with abs().
|
||||
# Clamp non-positive scores to a strong base to avoid unstable boosts.
|
||||
if bm25_score <= 0:
|
||||
return 1.0
|
||||
|
||||
return 1.0 / (1.0 + bm25_score)
|
||||
|
||||
def _get_title_boost(self, row, query, query_words):
|
||||
"""Calculate the title matching boost based on percentage of words matched."""
|
||||
original_title = (row["original_title"] or "").lower()
|
||||
query_lower = query.lower()
|
||||
title_tokens = set(re.findall(r"\w+", original_title))
|
||||
|
||||
# Check for exact phrase match first (highest boost)
|
||||
if query_lower in original_title:
|
||||
|
|
@ -1002,7 +1037,7 @@ class SQLiteSearch(ABC):
|
|||
|
||||
matched_words = 0
|
||||
for word in query_words:
|
||||
if word.lower() in original_title:
|
||||
if word.lower() in title_tokens:
|
||||
matched_words += 1
|
||||
|
||||
if matched_words == 0:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue