seitime-frappe/frappe/search/sqlite_search.py
Faris Ansari 7c49237913
Merge pull request #33502 from netchampfaris/sqlite-search-fixes
fix: wrap doc index update in try block
2025-08-01 00:37:44 +05:30

1469 lines
45 KiB
Python

# Copyright (c) 2025, Frappe Technologies Pvt. Ltd. and Contributors
# MIT License. See license.txt
import datetime
import inspect
import os
import re
import sqlite3
import time
from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass
from enum import Enum
from typing import Any
from bs4 import BeautifulSoup
import frappe
from frappe.model.document import Document
from frappe.utils import update_progress_bar
class WarningType(Enum):
    """Categories used to classify warnings raised while indexing documents."""

    # The document as a whole was rejected
    INVALID_DOCUMENT = "invalid_document"

    # A required piece of searchable text is absent
    MISSING_TEXT_FIELDS = "missing_text_fields"
    MISSING_CONTENT_FIELD = "missing_content_field"
    MISSING_TITLE_FIELD = "missing_title_field"

    # The document cannot be identified
    MISSING_DOCTYPE = "missing_doctype"
    MISSING_NAME = "missing_name"

    # Anything that does not fit the categories above
    OTHER = "other"
@dataclass
class IndexWarning:
    """A single warning recorded during indexing, with optional context about its origin."""

    # Category of the warning and its human-readable text
    type: WarningType
    message: str

    # Optional context: which document / field the warning refers to
    doctype: str | None = None
    docname: str | None = None
    field: str | None = None
    missing_fields: list | None = None

    def __str__(self):
        """Render the warning as its message text."""
        return self.message
class SQLiteSearchIndexMissingError(Exception):
    """Raised when the search index does not exist or its database cannot be opened."""
# Search Configuration Constants
# Upper bound passed as the SQL LIMIT of a search query
MAX_SEARCH_RESULTS = 100
# Value bound to the final argument of the FTS5 snippet() call for the content column
SNIPPET_LENGTH = 64
# Words shorter than this are not spell-corrected, are left out of the vocabulary,
# and do not get a prefix wildcard in the FTS query
MIN_WORD_LENGTH = 4
# In this file it is applied as the maximum allowed difference in word *length*
# between a query word and a spelling candidate
MAX_EDIT_DISTANCE = 3
# Default minimum final score a spelling candidate needs to be suggested
MIN_SIMILARITY_THRESHOLD = 0.6
# Default number of spelling suggestions returned per word
MAX_SPELLING_SUGGESTIONS = 3
# Weights used to blend trigram (Jaccard) similarity with difflib sequence similarity
SIMILARITY_TRIGRAM_WEIGHT = 0.7
SIMILARITY_SEQUENCE_WEIGHT = 0.3
# Frequency boost is min(MAX_FREQUENCY_BOOST, 1.0 + frequency / FREQUENCY_BOOST_FACTOR)
FREQUENCY_BOOST_FACTOR = 1000
MAX_FREQUENCY_BOOST = 1.2
RECENCY_DECAY_RATE = 0.005  # Linear decay per day beyond 90 days
# Floor for the recency boost of very old documents
MIN_RECENCY_BOOST = 0.5
# Title boost: whole query found in the title / lower bound when only some words match
TITLE_EXACT_MATCH_BOOST = 5.0
TITLE_PARTIAL_MATCH_BOOST = 2.0
# NOTE(review): the two boosts below are not referenced in the visible part of this file
DISCUSSION_BOOST = 1.2
COMMENT_BOOST = 1.0
# Time-based recency categories for aggressive boosting
RECENT_HOURS_BOOST = 1.8  # Documents from last 24 hours
RECENT_WEEK_BOOST = 1.5  # Documents from last 7 days
RECENT_MONTH_BOOST = 1.2  # Documents from last 30 days
RECENT_QUARTER_BOOST = 1.1  # Documents from last 90 days
class SQLiteSearch(ABC):
"""
Abstract base class for SQLite FTS5-based full-text search for Frappe.
Provides full-text search with advanced features:
- Spelling correction using trigram similarity
- Time-based recency boost with categorical scoring
- Custom scoring with title matching and document type boosts
- Ranking tracking (original BM25 vs modified scores)
- Filtering by user-defined criteria
- Permission-aware search results via query-level filtering
"""
@staticmethod
def scoring_function(func):
"""
Decorator to mark methods as scoring functions that should be automatically
included in the scoring pipeline.
Usage:
@SQLiteSearch.scoring_function
def custom_boost(self, row, query, query_words):
return 1.5
"""
func._is_scoring_function = True
return func
def __init__(self, db_name=None):
# Use class-level INDEX_NAME if db_name not provided
if db_name is None:
db_name = getattr(self, "INDEX_NAME", "search.db")
self.db_name = db_name
self.db_path = self._get_db_path()
# Validate required class attributes early
if not hasattr(self, "INDEX_SCHEMA"):
raise ValueError("INDEX_SCHEMA must be defined as a class-level variable")
if not hasattr(self, "INDEXABLE_DOCTYPES"):
raise ValueError("INDEXABLE_DOCTYPES must be defined as a class-level variable")
self.doc_configs = self._build_doc_configs()
self.warnings: list[IndexWarning] = [] # Collect warnings during indexing
self.schema = self._get_schema()
self._validate_config()
# Helper Methods for New API
def _parse_doctype_fields(self, doctype, config):
"""Parse field definitions for a doctype to extract field names and mappings."""
if "fields" not in config:
raise ValueError(f"Missing 'fields' in configuration for doctype '{doctype}'")
parsed_fields = []
field_mappings = {}
for field_def in config["fields"]:
if isinstance(field_def, str):
parsed_fields.append(field_def)
elif isinstance(field_def, dict):
for schema_field, doctype_field in field_def.items():
parsed_fields.append(doctype_field)
field_mappings[schema_field] = doctype_field
else:
raise ValueError(f"Invalid field definition: {field_def}")
return parsed_fields, field_mappings
def _build_doc_configs(self):
"""Build document configurations from class-level INDEXABLE_DOCTYPES."""
doc_configs = {}
for doctype, config in self.INDEXABLE_DOCTYPES.items():
parsed_fields, field_mappings = self._parse_doctype_fields(doctype, config)
# Determine content field
content_field = field_mappings.get("content")
if not content_field:
if "content" in parsed_fields:
content_field = "content"
else:
raise ValueError(
f"Content field must be present in fields list or explicitly mapped for '{doctype}'"
)
# Determine title field
title_field = field_mappings.get("title")
if not title_field and "title" in parsed_fields:
title_field = "title"
doc_configs[doctype] = {
"fields": parsed_fields,
"field_mappings": field_mappings,
"content_field": content_field,
"title_field": title_field,
"modified_field": field_mappings.get("modified", "modified"),
"filters": config.get("filters", {}),
}
return doc_configs
def _get_schema(self):
"""Get the search index schema with automatic defaults."""
if not hasattr(self, "INDEX_SCHEMA"):
raise ValueError("INDEX_SCHEMA must be defined as a class-level variable")
schema = self.INDEX_SCHEMA.copy()
# Default text fields to title and content
schema.setdefault("text_fields", ["title", "content"])
# Default tokenizer
schema.setdefault("tokenizer", "unicode61 remove_diacritics 2")
# Automatically add required metadata fields
metadata_fields = schema.setdefault("metadata_fields", [])
required_fields = ["doctype", "name"]
for field in required_fields:
if field not in metadata_fields:
metadata_fields.append(field)
# Add 'modified' to metadata if it's used in the schema or any doctype config
is_modified_in_schema = "modified" in self.INDEX_SCHEMA.get("metadata_fields", [])
is_modified_in_doctypes = any(
"modified" in config.get("field_mappings", {}) or "modified" in config.get("fields", [])
for config in self.doc_configs.values()
)
if (is_modified_in_schema or is_modified_in_doctypes) and "modified" not in metadata_fields:
metadata_fields.append("modified")
schema["metadata_fields"] = metadata_fields
return schema
# Abstract Method - Must be implemented by subclasses
@abstractmethod
def get_search_filters(self):
    """
    Provide the filters that must be applied to every search.

    ``search`` merges these on top of the caller-supplied filters, so a key
    returned here overrides a user filter with the same name.

    Returns:
        dict: Permission filters in format:
            {
                "field_name": value,        # Single value: field = value
                "field_name": [val1, val2]  # List: field IN (val1, val2)
            }
    """
# Public API Methods
def search(self, query, title_only=False, filters=None):
    """
    Main search method with advanced filtering support.

    Args:
        query (str): Search query text
        title_only (bool): Whether to search only in titles
        filters (dict): Optional filters by field names

    Returns:
        dict: ``{"results": [...], "summary": {...}}``. The summary reports the
        duration, match counts, spelling corrections and the user filters applied.
        An empty result structure is returned when search is disabled or the
        query is empty.

    Raises:
        SQLiteSearchIndexMissingError: If search is enabled but no index exists.
    """
    if not self.is_search_enabled():
        return self._empty_search_result(title_only, filters)
    self.raise_if_not_indexed()
    if not query:
        return self._empty_search_result(title_only, filters)

    start_time = time.time()

    filters = filters or {}
    # A subclass may return None to mean "no restrictions"; treat it as an empty dict
    # instead of failing on the dict merge below.
    permission_filters = self.get_search_filters() or {}
    # Permission filters come last so they override user filters on the same field
    all_filters = {**filters, **permission_filters}

    # Prepare FTS5 query with spelling correction
    expanded_query, corrections = self._expand_query_with_corrections(query)
    fts_query = self._prepare_fts_query(expanded_query)

    try:
        raw_results = self._execute_search_query(fts_query, title_only, all_filters)
        total_matches = len(raw_results)
    except sqlite3.Error as e:
        # A failing query is logged and reported as "no matches" rather than raised
        frappe.log_error(f"Search query failed: {e}")
        raw_results = []
        total_matches = 0

    processed_results = self._process_search_results(raw_results, query)
    duration = time.time() - start_time

    return {
        "results": processed_results,
        "summary": {
            "duration": round(duration, 3),
            "total_matches": total_matches,
            "returned_matches": total_matches,
            "corrected_words": corrections,
            "corrected_query": expanded_query if corrections else None,
            "title_only": title_only,
            "filtered_matches": len(processed_results),
            "applied_filters": filters,
        },
    }
def build_index(self):
    """
    Build the complete search index from scratch using atomic replacement.

    The index is built in a temporary database file next to the live one and only
    swapped in once indexing and vocabulary building have succeeded. On any error
    the temporary file is removed and the existing index is left as it was.
    Does nothing when search is disabled.
    """
    if not self.is_search_enabled():
        return

    temp_db_path = self._get_db_path(is_temp=True)
    original_db_path = self.db_path

    # Remove leftovers of an earlier, interrupted build
    if os.path.exists(temp_db_path):
        os.unlink(temp_db_path)

    # All helper methods operate on self.db_path, so point it at the temp database
    self.db_path = temp_db_path
    try:
        self._update_progress("Setting up search tables", 0, 100, absolute=True)
        self._ensure_fts_table()

        self._update_progress("Fetching records", 20, 100, absolute=True)
        records = self.get_documents()

        documents = []
        self._update_progress("Preparing documents", 30, 100, absolute=True)
        total_records = len(records)
        for i, doc in enumerate(records):
            document = self.prepare_document(doc)
            if document:
                documents.append(document)
            # Update progress during document preparation
            if i % 100 == 0:
                progress = 30 + int((i / total_records) * 20)  # 30-50% range
                self._update_progress("Preparing documents", progress, 100, absolute=True)

        self._update_progress("Indexing documents", 50, 100, absolute=True)
        self._index_documents(documents)

        self._update_progress("Building spell correction vocabulary", 80, 100, absolute=True)
        self._build_vocabulary(documents)

        # os.replace overwrites the destination in a single step. Deleting the old
        # file first and renaming afterwards would leave a window in which no index
        # file exists and concurrent searches fail.
        os.replace(temp_db_path, original_db_path)

        self._update_progress("Search index build complete", 100, 100, absolute=True)
        self._print_warning_summary()
    except Exception:
        # Clean up temp file on error
        if os.path.exists(temp_db_path):
            os.unlink(temp_db_path)
        raise
    finally:
        # Restore original database path
        self.db_path = original_db_path
# Status and Validation Methods
def index_exists(self):
    """
    Tell whether the index database file exists and contains the ``search_fts`` table.

    Returns:
        bool: False when the file is missing, the table is absent, or the lookup
        raises a ``sqlite3.Error``.
    """
    if not os.path.exists(self.db_path):
        return False

    try:
        matching_tables = self.sql(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='search_fts'", read_only=True
        )
    except sqlite3.Error:
        return False
    return bool(matching_tables)
def drop_index(self):
    """
    Delete the index by removing its database file; a missing file is not an error.

    Raises:
        OSError: If the file exists but cannot be removed (the failure is logged first).
    """
    if not os.path.exists(self.db_path):
        return

    try:
        os.unlink(self.db_path)
    except OSError as e:
        frappe.log_error(f"Failed to remove search index file {self.db_path}: {e}")
        raise
def is_search_enabled(self):
    """
    Report whether search is active.

    The base implementation always returns True; override it to enable/disable search.
    """
    return True
def raise_if_not_indexed(self):
    """
    Guard for operations that need an existing index.

    Raises:
        SQLiteSearchIndexMissingError: If ``index_exists()`` reports no index.
    """
    if self.index_exists():
        return
    raise SQLiteSearchIndexMissingError("Search index does not exist. Please build the index first.")
def get_documents(self):
    """
    Fetch every record that should be indexed, across all configured doctypes.

    Each returned row is tagged with its ``doctype``. When a doctype maps a custom
    field to "modified", that field's value is copied to ``modified`` (keeping the
    existing ``modified`` value if the custom one is empty).

    Returns:
        list: Rows as returned by the query builder with ``as_dict=True``.
    """
    collected = []
    for doctype, config in self.doc_configs.items():
        query = frappe.qb.get_query(doctype, fields=config["fields"], filters=config.get("filters", {}))
        has_custom_modified = config["modified_field"] != "modified"

        for row in query.run(as_dict=True):
            row.doctype = doctype
            if has_custom_modified:
                row.modified = getattr(row, config["modified_field"], None) or row.modified
            collected.append(row)
    return collected
# Private Implementation Methods
def _execute_search_query(self, fts_query, title_only, filters):
"""Execute the FTS search query with optional filters."""
# Build filter conditions
filter_conditions = []
filter_params = []
if filters:
# Build filter conditions dynamically
for field, values in filters.items():
if not values and isinstance(values, list):
# If filter is an empty list, it should not match any documents.
filter_conditions.append("1=0")
continue
if not values: # Skip empty filters
continue
# Check if this is a LIKE filter (list with 'LIKE' operator)
if isinstance(values, list) and len(values) == 2 and values[0] == "LIKE":
# Handle LIKE filters in format ['LIKE', tag_filters]
like_values = values[1]
if isinstance(like_values, list):
# Multiple LIKE conditions (OR them together)
like_conditions = []
for like_val in like_values:
like_conditions.append(f"{field} LIKE ?")
filter_params.append(f"%{like_val}%")
filter_conditions.append(f"({' OR '.join(like_conditions)})")
else:
# Single LIKE condition
filter_conditions.append(f"{field} LIKE ?")
filter_params.append(f"%{like_values}%")
elif isinstance(values, list):
if len(values) == 1:
filter_conditions.append(f"{field} = ?")
filter_params.append(values[0])
else:
placeholders = ",".join(["?" for _ in values])
filter_conditions.append(f"{field} IN ({placeholders})")
filter_params.extend(values)
else:
filter_conditions.append(f"{field} = ?")
filter_params.append(values)
# Combine filter conditions with AND
filter_clause = ""
if filter_conditions:
filter_clause = "AND " + " AND ".join(filter_conditions)
# Get schema to build dynamic SELECT fields
text_fields = self.schema["text_fields"]
metadata_fields = self.schema["metadata_fields"]
# Build SELECT clause with all fields
select_fields = []
# Add title highlighting
title_field = "title" if "title" in text_fields else text_fields[0] if text_fields else "doc_id"
title_column_index = self._get_text_field_column_index(title_field)
if title_column_index is not None:
select_fields.append(f"highlight(search_fts, {title_column_index}, '<mark>', '</mark>') as title")
else:
select_fields.append(f"{title_field} as title")
# Add content snippet or highlighting
if not title_only and "content" in text_fields:
content_index = self._get_text_field_column_index("content")
select_fields.append(
f"snippet(search_fts, {content_index}, '<mark>', '</mark>', '...', ?) as content"
)
elif "content" in text_fields:
select_fields.append("content")
# Add all other fields
for field in metadata_fields:
if field != "doc_id": # Already handled above
select_fields.append(field)
# Add scoring fields
select_fields.extend(["bm25(search_fts) as bm25_score", f"{title_field} as original_title"])
select_clause = ",\n ".join(select_fields)
if title_only:
sql = f"""
SELECT
doc_id,
{select_clause}
FROM search_fts
WHERE search_fts MATCH ?
AND {title_field} MATCH ?
{filter_clause}
ORDER BY bm25_score
LIMIT ?
"""
return self.sql(sql, (fts_query, fts_query, *filter_params, MAX_SEARCH_RESULTS), read_only=True)
else:
params = []
if "content" in text_fields:
params.append(SNIPPET_LENGTH)
params.extend([fts_query, *filter_params, MAX_SEARCH_RESULTS])
sql = f"""
SELECT
doc_id,
{select_clause}
FROM search_fts
WHERE search_fts MATCH ?
{filter_clause}
ORDER BY bm25_score
LIMIT ?
"""
print(sql)
return self.sql(sql, params, read_only=True)
def _process_search_results(self, raw_results, query):
"""Process search results with scoring."""
processed_results = []
query_words = query.split()
# Get schema configuration
text_fields = self.schema["text_fields"]
metadata_fields = self.schema["metadata_fields"]
# 1-based ranking
for original_rank, row in enumerate(raw_results, 1):
# Apply advanced heuristics scoring
score = self._calculate_advanced_score(row, query, query_words)
# Build result dynamically based on schema
result = {
"id": row["doc_id"],
"score": score,
"original_rank": original_rank,
"bm25_score": row["bm25_score"],
}
# Add text fields
for field in text_fields:
result[field] = row[field] if field in row.keys() else ""
# Add metadata fields
for field in metadata_fields:
if field == "owner":
# Map owner to author for backward compatibility
result["author"] = row["owner"] if "owner" in row.keys() else ""
else:
result[field] = row[field] if field in row.keys() else None
processed_results.append(result)
# Sort by custom score (descending - higher is better)
processed_results.sort(key=lambda x: x["score"], reverse=True)
# Add modified ranking after custom scoring
for i, result in enumerate(processed_results):
result["modified_rank"] = i + 1
return processed_results
def get_scoring_pipeline(self):
    """
    Assemble the list of scoring callables whose results are multiplied together.

    The pipeline always starts with the base score and the title boost. The recency
    boost is included only when "modified" is a metadata field, and every attribute
    marked with ``_is_scoring_function`` is appended afterwards.

    Each entry accepts either (row, query) or (row, query, query_words) and returns
    a float. Subclasses can override this to customize the scoring logic.
    """
    steps = [self._get_base_score, self._get_title_boost]

    # Recency needs the 'modified' column to be part of the index
    if "modified" in self.schema["metadata_fields"]:
        steps.append(self._get_recency_boost)

    # Discover decorated scoring functions among all attributes of the instance
    attributes = (getattr(self, attribute_name) for attribute_name in dir(self))
    steps.extend(
        attribute
        for attribute in attributes
        if callable(attribute) and hasattr(attribute, "_is_scoring_function")
    )
    return steps
def _calculate_advanced_score(self, row, query, query_words):
"""
Calculate the final score by executing the scoring pipeline.
The final score is the product of all scores returned by the pipeline methods.
"""
pipeline = self.get_scoring_pipeline()
final_score = 1.0
for scoring_method in pipeline:
# Check method signature to determine how to call it
sig = inspect.signature(scoring_method)
params = list(sig.parameters.keys())
# Skip 'self' parameter
if params and params[0] == "self":
params = params[1:]
# Call method based on its signature
if len(params) >= 3 or "query_words" in params:
# Method accepts query_words parameter
final_score *= scoring_method(row, query, query_words)
else:
# Method only accepts row and query
final_score *= scoring_method(row, query)
return final_score
def _get_base_score(self, row, query):
"""Calculate the base score from BM25."""
bm25_score = abs(row["bm25_score"]) if row["bm25_score"] is not None else 0
return 1.0 / (1.0 + bm25_score) if bm25_score > 0 else 0.5
def _get_title_boost(self, row, query, query_words):
"""Calculate the title matching boost based on percentage of words matched."""
original_title = (row["original_title"] or "").lower()
query_lower = query.lower()
# Check for exact phrase match first (highest boost)
if query_lower in original_title:
return TITLE_EXACT_MATCH_BOOST
# Calculate percentage of query words that match in title
if not query_words:
return 1.0
matched_words = 0
for word in query_words:
if word.lower() in original_title:
matched_words += 1
if matched_words == 0:
return 1.0
# Calculate match percentage
match_percentage = matched_words / len(query_words)
# Scale the boost between TITLE_PARTIAL_MATCH_BOOST (2.0) and TITLE_EXACT_MATCH_BOOST (5.0)
# based on the percentage of words matched
min_boost = TITLE_PARTIAL_MATCH_BOOST # 2.0
max_boost = TITLE_EXACT_MATCH_BOOST # 5.0
# Linear interpolation: boost = min_boost + (max_boost - min_boost) * match_percentage
boost = min_boost + (max_boost - min_boost) * match_percentage
return boost
def _get_recency_boost(self, row, query):
"""Calculate the time-based recency boost."""
# Return neutral boost if modified is not available
if "modified" not in row or row["modified"] is None:
return 1.0
current_time = time.time()
doc_timestamp = row["modified"]
hours_old = (current_time - doc_timestamp) / 3600
days_old = hours_old / 24
if hours_old <= 24:
return RECENT_HOURS_BOOST
if days_old <= 7:
return RECENT_WEEK_BOOST
if days_old <= 30:
return RECENT_MONTH_BOOST
if days_old <= 90:
return RECENT_QUARTER_BOOST
# Older documents get linear decay
days_beyond_90 = days_old - 90
return max(MIN_RECENCY_BOOST, RECENT_QUARTER_BOOST - (days_beyond_90 * RECENCY_DECAY_RATE))
def _get_text_field_column_index(self, field_name):
"""Get the 1-based column index of a text field in the FTS table."""
try:
# FTS table columns are doc_id, then text_fields...
# So index is 1 (for doc_id) + index in text_fields list
return 1 + self.schema["text_fields"].index(field_name)
except ValueError:
return None
# Spelling Correction Methods
def _expand_query_with_corrections(self, query):
"""Expand query with spelling corrections."""
words = query.strip().split()
expanded_terms = []
corrections = {}
for word in words:
similar_words = self._find_similar_words(word)
if similar_words and similar_words[0] != word:
# Replace the misspelled word with the corrected word
corrected_word = similar_words[0]
expanded_terms.append(corrected_word)
corrections[word] = corrected_word
else:
expanded_terms.append(word)
expanded_query = " ".join(expanded_terms)
return expanded_query, corrections if corrections else None
def _find_similar_words(
    self, word, max_suggestions=MAX_SPELLING_SUGGESTIONS, min_similarity=MIN_SIMILARITY_THRESHOLD
):
    """
    Suggest vocabulary words that resemble ``word``, best match first.

    Candidates are looked up through the trigram table (words sharing at least one
    trigram and differing in length by at most MAX_EDIT_DISTANCE). Each candidate is
    scored by a weighted mix of trigram Jaccard similarity and difflib sequence
    similarity, multiplied by a capped frequency boost.

    Args:
        word (str): Word to find suggestions for; it is lower-cased first.
        max_suggestions (int): Maximum number of suggestions to return.
        min_similarity (float): Minimum final score a candidate must reach.

    Returns:
        list: Suggested words; empty for words shorter than MIN_WORD_LENGTH or when
        the lookup raises a ``sqlite3.Error``.
    """
    import difflib

    needle = word.lower()
    needle_length = len(needle)
    if needle_length < MIN_WORD_LENGTH:
        return []

    needle_trigrams = self._generate_trigrams(needle)
    placeholders = ",".join("?" * len(needle_trigrams))
    lookup_sql = (
        "\n"
        "SELECT t.word, v.frequency, v.length, COUNT(*) as shared_trigrams\n"
        "FROM search_trigrams t\n"
        "JOIN search_vocabulary v ON t.word = v.word\n"
        f"WHERE t.trigram IN ({placeholders})\n"
        "AND ABS(v.length - ?) <= ? -- Length filter for efficiency\n"
        "GROUP BY t.word, v.frequency, v.length\n"
        "HAVING shared_trigrams >= 1 -- Must share at least 1 trigram\n"
        "ORDER BY shared_trigrams DESC, v.frequency DESC\n"
    )

    try:
        candidates = self.sql(
            lookup_sql,
            (*needle_trigrams, needle_length, MAX_EDIT_DISTANCE),
            read_only=True,
        )
    except sqlite3.Error:
        return []

    needle_trigram_set = set(needle_trigrams)
    scored = []
    for candidate, frequency, candidate_length, _shared in candidates:
        # Cheap length check before any similarity computation
        if abs(candidate_length - needle_length) > MAX_EDIT_DISTANCE:
            continue

        # Jaccard similarity over the two trigram sets
        candidate_trigram_set = set(self._generate_trigrams(candidate))
        common = len(needle_trigram_set & candidate_trigram_set)
        combined = len(needle_trigram_set | candidate_trigram_set)
        trigram_similarity = common / combined if combined > 0 else 0
        if trigram_similarity < 0.3:
            continue

        # The more expensive sequence comparison only runs for promising candidates
        sequence_similarity = difflib.SequenceMatcher(None, needle, candidate).ratio()
        blended = (
            trigram_similarity * SIMILARITY_TRIGRAM_WEIGHT + sequence_similarity * SIMILARITY_SEQUENCE_WEIGHT
        )
        # Slight boost for common words
        boost = min(MAX_FREQUENCY_BOOST, 1.0 + (frequency / FREQUENCY_BOOST_FACTOR))
        final_score = blended * boost

        if final_score >= min_similarity:
            scored.append((candidate, final_score))

    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [candidate for candidate, _score in scored[:max_suggestions]]
def _build_vocabulary(self, documents):
"""Build vocabulary and trigram index from documents for spelling correction."""
import re
word_freq = defaultdict(int)
word_regex = re.compile(r"\w+") # Compile regex once for efficiency
# Extract words from all documents in batches
for i, doc in enumerate(documents):
# Show progress for large document sets
if i % 1000 == 0:
progress = 80 + int((i / len(documents)) * 15) # 80-95% range
self._update_progress(
f"Processing vocabulary ({i}/{len(documents)})", progress, 100, absolute=True
)
# Process title and content together for efficiency
combined_text = " ".join(
[(doc.get("title", "") or "").lower(), (doc.get("content", "") or "").lower()]
)
# Extract all words at once with compiled regex
words = word_regex.findall(combined_text)
for word in words:
if len(word) > MIN_WORD_LENGTH - 1 and word.isalpha(): # Filter out short words and non-alpha
word_freq[word] += 1
# Clear existing data in a single transaction
conn = self._get_connection()
try:
cursor = conn.cursor()
cursor.execute("DELETE FROM search_vocabulary")
cursor.execute("DELETE FROM search_trigrams")
conn.commit()
finally:
conn.close()
if not word_freq:
return
# Prepare batch data for vocabulary
vocab_data = []
trigram_data = []
trigram_set = set() # Use set to avoid duplicate trigrams
for word, freq in word_freq.items():
vocab_data.append((word, freq, len(word)))
# Generate trigrams for this word
trigrams = self._generate_trigrams(word)
for trigram in trigrams:
trigram_key = (trigram, word)
if trigram_key not in trigram_set:
trigram_set.add(trigram_key)
trigram_data.append(trigram_key)
# Use batch inserts with a single transaction
conn = self._get_connection()
try:
cursor = conn.cursor()
# Batch insert vocabulary
cursor.executemany(
"INSERT INTO search_vocabulary (word, frequency, length) VALUES (?, ?, ?)", vocab_data
)
# Batch insert trigrams (duplicates already removed)
cursor.executemany("INSERT INTO search_trigrams (trigram, word) VALUES (?, ?)", trigram_data)
conn.commit()
finally:
conn.close()
# Database and Infrastructure Methods
def _get_connection(self, read_only=False):
"""Get SQLite connection with FTS5 support and performance optimizations."""
try:
conn = sqlite3.connect(self.db_path)
conn.row_factory = sqlite3.Row
# Apply performance optimizations
cursor = conn.cursor()
self._set_pragmas(cursor, read_only)
# Test the connection
cursor.execute("SELECT 1")
return conn
except sqlite3.Error as e:
frappe.log_error(f"Failed to connect to search database: {e}")
raise SQLiteSearchIndexMissingError(f"Search database connection failed: {e}") from e
def _set_pragmas(self, cursor, is_read=False):
"""Set SQLite performance pragmas."""
cursor.execute("PRAGMA journal_mode = WAL;") # Write-Ahead Logging for concurrency
cursor.execute("PRAGMA synchronous = NORMAL;") # Better performance vs FULL
cursor.execute("PRAGMA cache_size = -8192;") # 8MB cache
cursor.execute("PRAGMA temp_store = MEMORY;") # Memory temp storage
if is_read:
cursor.execute("PRAGMA query_only = 1;") # Read-only optimization
def _ensure_fts_table(self):
"""Create FTS table and related tables if they don't exist."""
# Get schema from subclass
text_fields = self.schema["text_fields"]
metadata_fields = self.schema["metadata_fields"]
tokenizer = self.schema["tokenizer"]
# Use a single transaction for all table creation operations
conn = self._get_connection()
try:
cursor = conn.cursor()
# Create the FTS table with dynamic columns
cursor.execute(f"""
CREATE VIRTUAL TABLE IF NOT EXISTS search_fts USING fts5(
doc_id UNINDEXED,
{", ".join([f"{field}" for field in text_fields])},
{", ".join([f"{field} UNINDEXED" for field in metadata_fields])},
tokenize="{tokenizer}"
)
""")
# Create the vocabulary and trigram tables
cursor.execute("""
CREATE TABLE IF NOT EXISTS search_vocabulary (
word TEXT PRIMARY KEY,
frequency INTEGER DEFAULT 1,
length INTEGER
)
""")
cursor.execute("""
CREATE TABLE IF NOT EXISTS search_trigrams (
trigram TEXT,
word TEXT,
PRIMARY KEY (trigram, word)
)
""")
# Index for fast trigram lookups
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_trigram_lookup ON search_trigrams(trigram)
""")
conn.commit()
finally:
conn.close()
def _index_documents(self, documents):
"""Bulk index documents into SQLite FTS."""
if not documents:
return
# Get schema configuration to build dynamic insert SQL
text_fields = self.schema["text_fields"]
metadata_fields = self.schema["metadata_fields"]
# Always add doc_id as first field (required for FTS)
all_fields = ["doc_id", *text_fields, *metadata_fields]
placeholders = ",".join(["?" for _ in all_fields])
field_names = ",".join(all_fields)
insert_sql = f"""
INSERT INTO search_fts ({field_names})
VALUES ({placeholders})
"""
# Process documents in chunks to prevent memory issues with large datasets
chunk_size = 1000
conn = self._get_connection()
try:
cursor = conn.cursor()
for i in range(0, len(documents), chunk_size):
chunk = documents[i : i + chunk_size]
values_to_insert = []
for doc in chunk:
# Validate document has required fields
if not doc.get("doctype") or not doc.get("name"):
self._warn_invalid_document(doc, "missing doctype/name")
continue
# Validate text fields are present
missing_text_fields = []
for field in text_fields:
if field not in doc or doc[field] is None:
missing_text_fields.append(field)
if missing_text_fields:
self._warn_missing_text_fields(
doc.get("doctype", ""), doc.get("name", ""), missing_text_fields
)
continue
# Build values tuple dynamically based on schema
values = []
for field in all_fields:
# Build doc_id automatically from doctype:name
if field == "doc_id":
doc_id = doc.get("id") or f"{doc.get('doctype', '')}:{doc.get('name', '')}"
values.append(doc_id)
else:
values.append(doc.get(field, ""))
values_to_insert.append(tuple(values))
# Insert the chunk
if values_to_insert:
cursor.executemany(insert_sql, values_to_insert)
conn.commit()
finally:
conn.close()
def index_doc(self, doctype, docname):
    """
    Index a single document, replacing any entry already stored for it.

    Args:
        doctype (str): Doctype of the document to index.
        docname (str): Name of the document to index.

    Raises:
        SQLiteSearchIndexMissingError: If the search index has not been built.
    """
    doc = frappe.get_doc(doctype, docname)
    self.raise_if_not_indexed()

    document = self.prepare_document(doc)
    if not document:
        return

    # Inserting into the FTS table never overwrites, so remove the previous entry
    # first; otherwise re-indexing the same document leaves duplicate rows.
    self.sql("DELETE FROM search_fts WHERE doc_id = ?", (document["id"],), commit=True)
    self._index_documents([document])
def remove_doc(self, doctype, docname):
    """
    Delete a single document's entry from the index.

    Raises:
        SQLiteSearchIndexMissingError: If the search index has not been built.
    """
    self.raise_if_not_indexed()
    # Entries are keyed by "<doctype>:<name>"
    self.sql("DELETE FROM search_fts WHERE doc_id = ?", (f"{doctype}:{docname}",), commit=True)
# Utility Methods
def _update_progress(self, message, progress, total=100, absolute=True):
    """
    Show a progress bar update, except inside web requests and test runs.

    The update is skipped when ``frappe.local`` has a ``request`` attribute or
    ``frappe.flags.in_test`` is set.
    """
    inside_web_request = hasattr(frappe.local, "request")
    if inside_web_request or frappe.flags.in_test:
        return
    update_progress_bar(message, progress, total, absolute=absolute)
def _validate_config(self):
"""Validate document configuration at startup."""
metadata_fields = self.schema["metadata_fields"]
for doctype, config in self.doc_configs.items():
# Validate that all specified fields are present in the 'fields' list
fields_to_check = ["content_field", "title_field"]
if "modified" in metadata_fields:
fields_to_check.append("modified_field")
for field_key in fields_to_check:
field_value = config.get(field_key)
if field_value and field_value not in config["fields"]:
raise ValueError(
f"{field_key.replace('_', ' ').title()} '{field_value}' not found in 'fields' list for Doctype '{doctype}'"
)
def _empty_search_result(self, title_only=False, filters=None):
"""Return empty search result structure."""
return {
"results": [],
"summary": {
"total_matches": 0,
"filtered_matches": 0,
"duration": 0,
"returned_matches": 0,
"corrected_words": None,
"corrected_query": None,
"title_only": title_only,
"applied_filters": filters or {},
},
}
def _get_db_path(self, is_temp=False):
    """
    Get the path for the SQLite FTS database inside the site directory.

    Args:
        is_temp (bool): Return the path of the temporary build database instead,
            e.g. "search.temp.db" for "search.db".

    Returns:
        str: Path of the (temporary) database file.
    """
    db_path = os.path.join(frappe.get_site_path(), self.db_name)
    if not is_temp:
        return db_path

    # Only the file extension is touched. A plain str.replace(".db", ...) would also
    # rewrite ".db" inside directory names and would return the live path unchanged
    # for names without ".db", making the build overwrite the live index.
    root, extension = os.path.splitext(db_path)
    return f"{root}.temp{extension}"
def _prepare_fts_query(self, query):
"""Prepare query for FTS5 with proper escaping and operators."""
query = query.strip()
if not query:
return ""
# Simple query - split into terms and add wildcards for partial matching
terms = query.split()
fts_terms = []
for term in terms:
# Escape special FTS5 characters
term = term.replace('"', '""')
# Add wildcard for prefix matching
if len(term) > MIN_WORD_LENGTH - 1:
fts_terms.append(f'"{term}"*')
else:
fts_terms.append(f'"{term}"')
return " ".join(fts_terms)
def sql(self, query, params=None, read_only=False, commit=False):
    """
    Run one SQL statement on the search database using a short-lived connection.

    Args:
        query (str): SQL statement to execute.
        params: Values for the statement's placeholders.
        read_only (bool): Open the connection as query-only and return all rows.
        commit (bool): Commit after executing (write statements only).

    Returns:
        list | sqlite3.Cursor: All fetched rows when ``read_only`` is set; otherwise
        the cursor (useful for attributes like ``rowcount``). The connection is
        already closed when this method returns.
    """
    connection = self._get_connection(read_only=read_only)
    try:
        cursor = connection.cursor()
        cursor.execute(query, params or [])

        if not read_only:
            if commit:
                connection.commit()
            return cursor
        return cursor.fetchall()
    finally:
        connection.close()
def prepare_document(self, doc):
    """
    Validate a source document and convert it into the dict that gets indexed.

    Returns:
        dict | None: A dict with ``id`` ("<doctype>:<name>"), ``doctype``, ``name``
        and the schema's text and metadata fields, or None when the document
        fails validation.
    """
    is_valid, config = self._validate_document_for_indexing(doc)
    if not is_valid:
        return None

    document = dict(id=f"{doc.doctype}:{doc.name}", doctype=doc.doctype, name=doc.name)
    # Text fields are filled in first, then metadata
    self._add_text_fields_to_document(document, doc, config)
    self._add_metadata_fields_to_document(document, doc, config)
    return document
def _validate_document_for_indexing(self, doc):
"""Run all validation checks for a document before indexing."""
if not hasattr(doc, "doctype") or not doc.doctype:
self._warn_missing_doctype(doc)
return False, None
if not hasattr(doc, "name") or not doc.name:
self._warn_missing_name(doc.doctype)
return False, None
config = self.doc_configs.get(doc.doctype)
if not config:
return False, None
text_fields = self.schema["text_fields"]
# Validate title field
if "title" in text_fields:
title_field = config.get("title_field")
if title_field and (not hasattr(doc, title_field) or getattr(doc, title_field, None) is None):
self._warn_missing_title_field(doc.doctype, doc.name, title_field)
return False, None
# Validate content field
if "content" in text_fields:
content_field = config["content_field"]
if not hasattr(doc, content_field) or getattr(doc, content_field, None) is None:
self._warn_missing_content_field(doc.doctype, doc.name, content_field)
return False, None
return True, config
def _add_text_fields_to_document(self, document, doc, config):
"""Populate text fields in the document for indexing."""
text_fields = self.schema["text_fields"]
title_field = config.get("title_field")
content_field = config["content_field"]
for field in text_fields:
if field == "title":
if title_field:
raw_title = getattr(doc, title_field, "") or ""
document["title"] = self._process_content(raw_title)
else:
document["title"] = "" # No title field configured
elif field == "content":
raw_content = getattr(doc, content_field, "") or ""
document["content"] = self._process_content(raw_content)
else:
# Handle other custom text fields
raw_text = getattr(doc, field, "")
document[field] = self._process_content(raw_text)
def _add_metadata_fields_to_document(self, document, doc, config):
"""Populate metadata fields in the document for indexing."""
metadata_fields = self.schema["metadata_fields"]
for field in metadata_fields:
if field in document: # Skip already populated fields (id, doctype, name)
continue
if field == "modified":
modified_field = config["modified_field"]
modified_value = getattr(doc, modified_field, None)
if modified_value:
if not isinstance(modified_value, datetime.datetime):
modified_value = frappe.utils.get_datetime(modified_value)
document["modified"] = modified_value.timestamp()
continue
# Handle other metadata fields with potential mapping
field_mappings = config.get("field_mappings", {})
actual_field = field_mappings.get(field, field)
value = getattr(doc, actual_field, None)
# Convert Mock objects to strings to avoid database errors
if value is not None and hasattr(value, "_mock_name"):
value = str(value)
document[field] = value
def _process_content(self, content):
"""Process content to remove HTML tags, links, and images for better indexing quality."""
if not content:
return ""
# Convert to string in case it's a Mock object or other type
content = str(content)
soup = BeautifulSoup(content, "html.parser")
# Extract text content from links before removing HTML tags
for link in soup.find_all("a"):
link_text = link.get_text().strip()
if link_text:
link.replace_with(link_text)
else:
link.replace_with("[link]")
text = soup.get_text(separator=" ").strip() # remove tags
text = re.sub(r"https?://[^\s]+", "[link]", text) # replace standalone links
text = re.sub(r"\s+", " ", text).strip() # normalize whitespace
return text
def _generate_trigrams(self, word):
"""Generate trigrams for a word for fuzzy matching."""
word = f" {word.lower()} " # Add padding
return [word[i : i + 3] for i in range(len(word) - 2)]
def _print_warning_summary(self):
"""Print a summary of warnings collected during indexing."""
if not self.warnings:
return
print("\n" + "=" * 60)
print("SEARCH INDEX BUILD WARNINGS")
print("=" * 60)
# Group warnings by type
warning_groups: dict[WarningType, list[IndexWarning]] = {}
for warning in self.warnings:
warning_groups.setdefault(warning.type, []).append(warning)
# Define display names for warning types
type_display_names = {
WarningType.INVALID_DOCUMENT: "Invalid Documents",
WarningType.MISSING_TEXT_FIELDS: "Missing Text Fields",
WarningType.MISSING_CONTENT_FIELD: "Missing Content Field",
WarningType.MISSING_TITLE_FIELD: "Missing Title Field",
WarningType.MISSING_DOCTYPE: "Missing Document Type",
WarningType.MISSING_NAME: "Missing Document Name",
WarningType.OTHER: "Other Issues",
}
# Print grouped warnings
for warning_type, warnings in warning_groups.items():
display_name = type_display_names.get(warning_type, warning_type.value.title())
print(f"\n{display_name} ({len(warnings)} warnings):")
print("-" * 50)
for warning in warnings[:5]: # Show first 5 warnings of each type
print(f"{warning.message}")
if len(warnings) > 5:
print(f" ... and {len(warnings) - 5} more")
print(f"\nTotal warnings: {len(self.warnings)}")
print("=" * 60)
# Warning helper methods (utility functions)
def _add_warning(self, warning_type: WarningType, message: str, **kwargs):
    """Append a structured warning to ``self.warnings``.

    Args:
        warning_type: Category of the warning.
        message: Human-readable description.
        **kwargs: Extra ``IndexWarning`` fields, passed through unchanged.
    """
    entry = IndexWarning(type=warning_type, message=message, **kwargs)
    self.warnings.append(entry)
def _warn_invalid_document(self, doc: dict, reason: str):
    """Record an INVALID_DOCUMENT warning for ``doc``.

    Args:
        doc: The skipped document; its ``doctype`` and ``name`` keys are
            attached to the warning.
        reason: Short phrase inserted into the message after "with".
    """
    message = f"Skipping document with {reason}: {doc}"
    context = {"doctype": doc.get("doctype"), "docname": doc.get("name")}
    self._add_warning(WarningType.INVALID_DOCUMENT, message, **context)
def _warn_missing_text_fields(self, doctype: str, docname: str, missing_fields: list):
    """Record a MISSING_TEXT_FIELDS warning for the given document.

    Args:
        doctype: Doctype of the document.
        docname: Name of the document.
        missing_fields: The text fields that were not found.
    """
    message = f"Document {doctype}:{docname} missing text fields: {missing_fields}"
    context = {"doctype": doctype, "docname": docname, "missing_fields": missing_fields}
    self._add_warning(WarningType.MISSING_TEXT_FIELDS, message, **context)
def _warn_missing_content_field(self, doctype: str, docname: str, field: str):
    """Record a MISSING_CONTENT_FIELD warning for the given document.

    Args:
        doctype: Doctype of the document.
        docname: Name of the document.
        field: Name of the content field that is missing.
    """
    message = f"Document {doctype}:{docname} missing content field '{field}'"
    context = {"doctype": doctype, "docname": docname, "field": field}
    self._add_warning(WarningType.MISSING_CONTENT_FIELD, message, **context)
def _warn_missing_title_field(self, doctype: str, docname: str, field: str):
    """Record a MISSING_TITLE_FIELD warning for the given document.

    Args:
        doctype: Doctype of the document.
        docname: Name of the document.
        field: Name of the title field that is missing.
    """
    message = f"Document {doctype}:{docname} missing title field '{field}'"
    context = {"doctype": doctype, "docname": docname, "field": field}
    self._add_warning(WarningType.MISSING_TITLE_FIELD, message, **context)
def _warn_missing_doctype(self, doc: Any):
    """Record a MISSING_DOCTYPE warning for ``doc``.

    The document's ``name`` attribute, if any, is attached as ``docname``.
    """
    message = f"Document missing doctype: {doc}"
    docname = getattr(doc, "name", None)
    self._add_warning(WarningType.MISSING_DOCTYPE, message, docname=docname)
def _warn_missing_name(self, doctype: str):
    """Record a MISSING_NAME warning for a document of the given doctype."""
    message = f"Document missing name: {doctype}"
    self._add_warning(WarningType.MISSING_NAME, message, doctype=doctype)
def get_warning_statistics(self) -> dict[str, Any]:
    """Summarise the collected warnings for programmatic use.

    Returns:
        ``{"total": <count>, "by_type": {...}}`` where ``by_type`` maps each
        warning type value to its ``count`` and up to three ``examples``
        (message, doctype, docname, field, missing_fields).
    """
    if not self.warnings:
        return {"total": 0, "by_type": {}}

    by_type = {}
    for entry in self.warnings:
        bucket = by_type.setdefault(entry.type.value, {"count": 0, "examples": []})
        bucket["count"] += 1

        examples = bucket["examples"]
        # Keep only the first few warnings of each type as examples.
        if len(examples) < 3:
            examples.append(
                {
                    "message": entry.message,
                    "doctype": entry.doctype,
                    "docname": entry.docname,
                    "field": entry.field,
                    "missing_fields": entry.missing_fields,
                }
            )

    return {"total": len(self.warnings), "by_type": by_type}
# Module-level Functions for background tasks
def build_index_if_not_exists():
    """Call ``build_index`` without forcing for every registered search class."""
    for search_class in get_search_classes():
        build_index(search_class, force=False)
def build_index(
    SearchClass: type[SQLiteSearch] | None = None, search_class_path: str | None = None, force: bool = False
):
    """Build the search index for a search class.

    Args:
        SearchClass: The search class to build the index for.
        search_class_path: Dotted path to the search class, resolved with
            ``frappe.get_attr``. Takes precedence over ``SearchClass`` when given.
        force: Rebuild even when the index already exists.

    Raises:
        ValueError: If neither ``SearchClass`` nor ``search_class_path`` is provided.
    """
    if not SearchClass and not search_class_path:
        raise ValueError("Either SearchClass or search_class_path must be provided")

    if search_class_path:
        SearchClass = frappe.get_attr(search_class_path)

    search = SearchClass()
    if not search.is_search_enabled():
        return

    index_missing = not search.index_exists()
    if not (index_missing or force):
        return

    # Report what is actually happening: a forced rebuild of an existing index
    # used to be announced as "Index does not exist".
    if index_missing:
        print(f"{SearchClass.__name__}: Index does not exist, building...")
    else:
        print(f"{SearchClass.__name__}: Index exists, rebuilding (force=True)...")
    search.build_index()
def build_index_in_background():
    """Enqueue a forced index build for every enabled search class.

    One job per search class is put on the ``long`` queue. The job id is the
    class's dotted path and ``deduplicate=True`` is passed to ``frappe.enqueue``.
    Search classes whose search is disabled are skipped.
    """
    for SearchClass in get_search_classes():
        search = SearchClass()
        if not search.is_search_enabled():
            # Skip only this class. Returning here would also drop every search
            # class registered after it.
            continue

        search_class_path = f"{SearchClass.__module__}.{SearchClass.__name__}"
        print(f"Enqueuing {search_class_path}.build_index")
        frappe.enqueue(
            "frappe.search.sqlite_search.build_index",
            queue="long",
            job_id=search_class_path,
            deduplicate=True,
            # build_index args
            search_class_path=search_class_path,
            force=True,
        )
def update_doc_index(doc: Document, method=None):
    """Re-index ``doc`` in every search class that indexes its doctype.

    A search class is skipped when its search is disabled or its index does not
    exist, when it has no config (or no ``fields``) for ``doc.doctype``, or when
    none of the configured fields changed on ``doc``. Indexing failures are
    logged through ``frappe.log_error`` instead of being raised.

    Args:
        doc: The document that was saved.
        method: Unused here. Presumably the event name supplied by the doc-event
            hook that calls this function; verify against the hook registration.
    """
    for SearchClass in get_search_classes():
        search = SearchClass()
        if not (search.is_search_enabled() and search.index_exists()):
            # Skip only this class. Returning here would leave every later
            # search class with a stale index.
            continue

        doctype = doc.doctype
        config = search.doc_configs.get(doctype)
        if not config:
            continue

        fields = config.get("fields", [])
        if not fields:
            continue

        if not any(doc.has_value_changed(field) for field in fields):
            continue

        try:
            search.index_doc(doctype, doc.name)
        except Exception:
            frappe.log_error(
                title="SQLite Search Index Update Error",
                message=f"Failed to update index for {doctype}:{doc.name} in {search.__class__.__name__}",
            )
def delete_doc_index(doc: Document, method=None):
    """Remove ``doc`` from every search class that indexes its doctype.

    A search class is skipped when its search is disabled or its index does not
    exist, or when it has no config (or no ``fields``) for ``doc.doctype``.
    Removal failures are logged through ``frappe.log_error`` instead of being raised.

    Args:
        doc: The document being deleted.
        method: Unused here. Presumably the event name supplied by the doc-event
            hook that calls this function; verify against the hook registration.
    """
    for SearchClass in get_search_classes():
        search = SearchClass()
        if not (search.is_search_enabled() and search.index_exists()):
            # Skip only this class. Returning here would leave the document in
            # the index of every later search class.
            continue

        doctype = doc.doctype
        config = search.doc_configs.get(doctype)
        if not config:
            continue

        fields = config.get("fields", [])
        if not fields:
            continue

        try:
            # Remove the document from the index
            search.remove_doc(doctype, doc.name)
        except Exception:
            frappe.log_error(
                title="SQLite Search Index Delete Error",
                message=f"Failed to remove index for {doctype}:{doc.name} in {search.__class__.__name__}",
            )
def get_search_classes() -> list[type[SQLiteSearch]]:
    """Resolve the classes registered under the ``sqlite_search`` hook.

    Returns:
        The resolved classes, in hook order.

    Raises:
        TypeError: If a registered class does not extend ``SQLiteSearch``.
    """
    resolved = [frappe.get_attr(path) for path in frappe.get_hooks("sqlite_search")]

    # All paths are resolved first; only then is the first offender reported.
    offender = next((cls for cls in resolved if not issubclass(cls, SQLiteSearch)), None)
    if offender is not None:
        raise TypeError(f"Search class {offender.__name__} must extend SQLiteSearch")

    return resolved