feat: SQLite FTS5 search framework for Frappe apps (#33359)

- Abstract SQLiteSearch base class with full-text search
- Spelling correction, recency boosting, and custom scoring
- Supports search filtering and configurable document indexing
- hooks for auto-indexing
  - build index after migrate
  - build index (if not exists) every 15 mins
  - update doc index on_update
  - remove doc index on_trash
This commit is contained in:
Faris Ansari 2025-07-29 11:18:56 +05:30 committed by GitHub
parent 761751f269
commit 2676c9c2ec
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 2402 additions and 1 deletions

View file

@ -154,6 +154,7 @@ doc_events = {
"frappe.automation.doctype.assignment_rule.assignment_rule.update_due_date",
"frappe.core.doctype.user_type.user_type.apply_permissions_for_non_standard_user_type",
"frappe.core.doctype.permission_log.permission_log.make_perm_log",
"frappe.search.sqlite_search.update_doc_index",
],
"after_rename": "frappe.desk.notifications.clear_doctype_notifications",
"on_cancel": [
@ -164,6 +165,7 @@ doc_events = {
"on_trash": [
"frappe.desk.notifications.clear_doctype_notifications",
"frappe.workflow.doctype.workflow_action.workflow_action.process_workflow_actions",
"frappe.search.sqlite_search.delete_doc_index",
],
"on_update_after_submit": [
"frappe.workflow.doctype.workflow_action.workflow_action.process_workflow_actions",
@ -206,6 +208,7 @@ scheduler_events = {
"frappe.deferred_insert.save_to_db",
"frappe.automation.doctype.reminder.reminder.send_reminders",
"frappe.model.utils.link_count.update_link_count",
"frappe.search.sqlite_search.build_index_if_not_exists",
],
# 10 minutes
"0/10 * * * *": [
@ -278,7 +281,10 @@ setup_wizard_exception = [
]
before_migrate = ["frappe.core.doctype.patch_log.patch_log.before_migrate"]
after_migrate = ["frappe.website.doctype.website_theme.website_theme.after_migrate"]
after_migrate = [
"frappe.website.doctype.website_theme.website_theme.after_migrate",
"frappe.search.sqlite_search.build_index_in_background",
]
otp_methods = ["OTP App", "Email", "SMS"]

View file

@ -3,6 +3,7 @@
import frappe
from frappe.search.full_text_search import FullTextSearch
from frappe.search.sqlite_search import SQLiteSearch
from frappe.search.website_search import WebsiteSearch
from frappe.utils import cint

View file

@ -0,0 +1,470 @@
# SQLite Search Framework
SQLite Search is a full-text search framework for Frappe applications that provides advanced search capabilities using SQLite's FTS5 (Full-Text Search) engine. It offers features like spelling correction, time-based recency scoring, custom ranking, permission-aware filtering, and extensible scoring pipelines.
## Table of Contents
- [Quick Start](#quick-start)
- [How It Works](#how-it-works)
- [Configuration](#configuration)
- [Features & Customization](#features--customization)
- [API Reference](#api-reference)
## Quick Start
### 1. Create a Search Class
Create a search implementation by extending `SQLiteSearch`:
```python
# my_app/search.py
from frappe.search.sqlite_search import SQLiteSearch
class MyAppSearch(SQLiteSearch):
# Database file name
INDEX_NAME = "my_app_search.db"
# Define the search schema
INDEX_SCHEMA = {
"metadata_fields": ["project", "owner", "status"],
"tokenizer": "unicode61 remove_diacritics 2 tokenchars '-_'",
}
# Define which doctypes to index and their field mappings
INDEXABLE_DOCTYPES = {
"Task": {
"fields": ["name", {"title": "subject"}, {"content": "description"}, "modified", "project", "owner", "status"],
},
"Issue": {
"fields": ["name", "title", "description", {"modified": "last_updated"}, "project", "owner"],
"filters": {"status": ("!=", "Closed")}, # Only index non-closed issues
},
}
def get_search_filters(self):
"""Return permission filters for current user"""
# Get projects accessible to current user
accessible_projects = frappe.get_all(
"Project",
filters={"owner": frappe.session.user},
pluck="name"
)
if not accessible_projects:
return {"project": []} # No access
return {"project": accessible_projects}
```
### 2. Register the Search Class
Add your search class to hooks.py:
```python
# my_app/hooks.py
sqlite_search = ['my_app.search.MyAppSearch']
```
### 3. Create API Endpoint
Create a whitelisted method to expose search functionality:
```python
# my_app/api.py
import frappe
from my_app.search import MyAppSearch
@frappe.whitelist()
def search(query, filters=None):
search = MyAppSearch()
result = search.search(query, filters=filters)
return result
```
### 4. Build the Index
Build the search index programmatically or via console:
```python
from my_app.search import MyAppSearch
search = MyAppSearch()
search.build_index()
```
## How It Works
### 1. Indexing Process
#### Full Index Building
When you call `build_index()`, the framework performs a complete index rebuild:
1. **Database Preparation**: Creates a temporary SQLite database with FTS5 tables configured according to your schema
2. **Document Collection**: Queries all specified doctypes using the configured field mappings and filters
3. **Document Processing**: For each document:
- Extracts and maps fields according to `INDEXABLE_DOCTYPES` configuration
- Cleans HTML content using BeautifulSoup to extract plain text
- Applies custom document preparation logic if `prepare_document()` is overridden
- Validates required fields (title, content) are present
4. **Batch Insertion**: Inserts processed documents into the FTS5 index in batches for performance
5. **Vocabulary Building**: Constructs a spelling correction dictionary from all indexed text
6. **Atomic Replacement**: Replaces the existing index database with the new one atomically
#### Individual Document Indexing
For real-time updates using `index_doc()` or `remove_doc()`:
1. **Single Document Processing**: Retrieves and processes one document using the same field mapping logic
2. **Incremental Update**: Updates the existing FTS5 index by inserting, updating, or deleting the specific document
3. **Vocabulary Update**: Updates the spelling dictionary with new terms from the document
### 2. Search Process
When a user performs a search using `search()`, the framework executes these steps:
1. **Permission Filtering**: Calls `get_search_filters()` to determine what documents the current user can access
2. **Query Preprocessing**:
- Validates the search query is not empty
- Combines user-provided filters with permission filters
3. **Spelling Correction**:
- Analyzes query terms against the vocabulary dictionary
- Uses trigram similarity to suggest corrections for misspelled words
- Expands the original query with corrected terms
4. **FTS5 Query Execution**:
- Constructs an FTS5-compatible query string
- Executes the full-text search against the SQLite database
- Applies metadata filters (status, owner, project, etc.)
- Retrieves raw results with BM25 scores
5. **Results Processing**:
- **Custom Scoring**: Applies the scoring pipeline to calculate final relevance scores
- Base BM25 score processing
- Title matching boosts (exact and partial matches)
- Recency boosting based on document age
- Custom scoring functions (doctype-specific, priority-based, etc.)
- **Ranking**: Sorts results by final scores and assigns rank positions
- **Content Formatting**: Generates content snippets and highlights matching terms
## Configuration
### INDEX_SCHEMA
Defines the structure of your search index:
```python
INDEX_SCHEMA = {
# Text fields that will be searchable (defaults to ["title", "content"])
"text_fields": ["title", "content"],
# Metadata fields stored alongside text content for filtering
"metadata_fields": ["project", "owner", "status", "priority"],
# FTS5 tokenizer configuration
"tokenizer": "unicode61 remove_diacritics 2 tokenchars '-_@.'"
}
```
### INDEXABLE_DOCTYPES
Specifies which doctypes to index and how to map their fields:
```python
INDEXABLE_DOCTYPES = {
"Task": {
# Field mapping
"fields": [
"name",
{"title": "subject"}, # Maps subject field to title
{"content": "description"}, # Maps description field to content
{"modified": "creation"}, # Use creation instead of modified for recency boost
"project",
"owner"
],
# Optional filters to limit which records are indexed
"filters": {
"status": ("!=", "Cancelled"),
"docstatus": ("!=", 2)
}
}
}
```
### Field Mapping Rules
- **String fields**: Direct mapping `"field_name"`
- **Aliased fields**: Dictionary mapping `{"schema_field": "doctype_field"}`
- **Required fields**: `title` and `content` fields must be present or explicitly mapped (e.g., `{"title": "subject"}`)
- **Auto-added fields**: `doctype` and `name` are automatically included
- **Modified field**: Added automatically if used in any doctype configuration. Used for recency boosting - if you want to use a different timestamp field (like `creation` or `last_updated`), map it to `modified` using `{"modified": "creation"}`
## Features & Customization
### Permission Filtering
Implement `get_search_filters()` to control access:
```python
def get_search_filters(self):
"""Return filters based on user permissions"""
user = frappe.session.user
if user == "Administrator":
return {} # No restrictions
# Example: User can only see their own and public documents
return {
"owner": user,
"status": ["Active", "Published"]
}
```
### Custom Scoring
Create custom scoring functions to influence search relevance:
```python
class MyAppSearch(SQLiteSearch):
...
@SQLiteSearch.scoring_function
def _get_priority_boost(self, row, query, query_words):
"""Boost high-priority items"""
priority = row.get("priority", "Medium")
if priority == "High":
return 1.5
if priority == "Medium":
return 1.1
return 1.0
```
### Recency Boosting
The framework automatically provides time-based recency boosting using the `modified` field:
```python
# The modified field is used for calculating document age
# Recent documents get higher scores:
# - Last 24 hours: 1.8x boost
# - Last 7 days: 1.5x boost
# - Last 30 days: 1.2x boost
# - Last 90 days: 1.1x boost
# - Older documents: gradually decreasing boost
# If your doctype uses a different timestamp field, map it to modified:
INDEXABLE_DOCTYPES = {
"GP Discussion": {
"fields": ["name", "title", "content", {"modified": "last_post_at"}, "project"],
},
"Article": {
"fields": ["name", "title", "content", {"modified": "published_date"}, "category"],
}
}
```
### Document Preparation
Override `prepare_document()` for custom document processing:
```python
def prepare_document(self, doc):
"""Custom document preparation"""
document = super().prepare_document(doc)
if not document:
return None
# Add computed fields
if doc.doctype == "Task":
# Combine multiple fields into content
content_parts = [
doc.description or "",
doc.notes or "",
"\n".join([comment.content for comment in doc.get("comments", [])])
]
document["content"] = "\n".join(filter(None, content_parts))
# set fields that might be stored in another table
document["category"] = get_category_for_task(doc)
return document
```
### Spelling Correction
The framework includes built-in spelling correction using trigram similarity:
```python
# Spelling correction happens automatically
search_result = search.search("projetc managment") # Will find "project management"
# Access correction information
print(search_result["summary"]["corrected_words"])
# Output: {"projetc": "project", "managment": "management"}
```
### Content Processing
HTML content is automatically cleaned and processed using BeautifulSoup:
```python
# Complex HTML content like this:
html_content = """
<div class="article">
<h1>API Documentation</h1>
<p>Learn how to integrate with our <a href="/api">REST API</a>.</p>
<img src="/images/api-flow.png" alt="API workflow diagram" />
<ul>
<li><strong>Authentication:</strong> Use <code>Bearer tokens</code></li>
<li>Rate limiting: <em>1000 requests/hour</em></li>
</ul>
<blockquote>See our <a href="/examples">code examples</a> for details.</blockquote>
<table><tr><td>Method</td><td>POST</td></tr></table>
<script>analytics.track('page_view');</script>
<style>.hidden { display: none; }</style>
</div>
"""
# Is automatically converted to clean, searchable plain text:
"""
API Documentation
Learn how to integrate with our REST API.
Authentication: Use Bearer tokens
Rate limiting: 1000 requests/hour
See our code examples for details.
Method POST
"""
# The cleaning process:
# 1. Removes all HTML tags (<div>, <h1>, <strong>, <code>, etc.)
# 2. Strips out scripts, styles, and non-content elements
# 3. Extracts link text while removing href URLs
# 4. Normalizes whitespace and line breaks
```
### Title-Only Search
```python
results = search.search("project update", title_only=True)
```
### Advanced Filtering
```python
accessible_projects = ['PROJ001', 'PROJ002', ...]
filters = {
"project": accessible_projects, # Multiple values (IN clause)
"owner": current_user, # Single value (= clause)
}
results = search.search("bug fix", filters=filters)
```
### Automatic Index Handling
The framework handles index building and maintenance automatically when you register your search class:
```python
# hooks.py
sqlite_search = ['my_app.search.MyAppSearch']
```
**What the framework does automatically:**
1. **Post-Migration Index Building**: Builds the search index automatically after running `bench migrate`
2. **Periodic Index Verification**: Checks every 15 minutes that the index exists and rebuilds if missing
3. **Real-time Document Updates**: Automatically calls `index_doc()` and `remove_doc()` on document lifecycle events (insert, update, delete) for all doctypes defined in your `INDEXABLE_DOCTYPES`
## Manual Index Handling
If you prefer to have manual control over the indexing lifecycle, you can opt out of automatic index handling by simply not registering your search class in the `sqlite_search` hook.
```python
from my_app.search import MyAppSearch
def build_index_in_background():
"""Manually trigger background index building"""
search = MyAppSearch()
if search.is_search_enabled() and not search.index_exists():
frappe.enqueue("my_app.search.build_index", queue="long")
# hooks.py
scheduler_events = {
# Custom scheduler (if you want different timing)
"daily": ["my_app.search.build_index_if_not_exists"],
}
```
## API Reference
#### `search(query, title_only=False, filters=None)`
Main search method that returns formatted results.
**Parameters:**
- `query` (str): Search query text
- `title_only` (bool): Search only in title fields
- `filters` (dict): Additional filters to apply
**Returns:**
```python
{
"results": [
{
"doctype": "Task",
"name": "TASK-001",
"title": "Fix login bug",
"content": "User cannot login after password reset...",
"score": 0.85,
"original_rank": 3, # original bm25 rank
"rank": 1, # modified rank after custom scoring pipeline
# ... other metadata fields
}
],
"summary": {
"duration": 0.023,
"total_matches": 15,
"returned_matches": 15,
"corrected_words": {"loggin": "login"},
"corrected_query": "Fix login bug",
"title_only": False,
"filtered_matches": 15,
"applied_filters": {"status": ["Open"]}
}
}
```
#### `build_index()`
Build the complete search index from scratch.
#### `index_doc(doctype, docname)`
Index a single document.
#### `remove_doc(doctype, docname)`
Remove a single document from the index.
#### `is_search_enabled()`
Check if search is enabled (override to add disable logic).
#### `index_exists()`
Check if the search index exists.
#### `get_search_filters()`
**Must be implemented by subclasses.** Return filters for the current user.
**Returns:**
```python
{
"field_name": "value", # Single value
"field_name": ["val1", "val2"], # Multiple values
}
```
#### `scoring_function()`
Use the `@SQLiteSearch.scoring_function` decorator to mark a function as a scoring function.

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,505 @@
import os
import sqlite3
import time
from typing import ClassVar
from unittest.mock import patch
import frappe
from frappe.search.sqlite_search import SQLiteSearch, SQLiteSearchIndexMissingError
from frappe.tests import IntegrationTestCase
class TestSQLiteSearch(SQLiteSearch):
	"""Test implementation of SQLiteSearch for testing purposes.

	Indexes three stock doctypes (Note, ToDo, User) with representative
	field mappings: direct fields, aliased fields ({"title": "description"}),
	and a remapped recency field ({"modified": "creation"}).
	"""

	# Database file name for the test index
	INDEX_NAME = "test_search.db"

	INDEX_SCHEMA: ClassVar = {
		"text_fields": ["title", "content"],
		"metadata_fields": ["doctype", "name", "owner", "modified"],
		"tokenizer": "unicode61 remove_diacritics 2",
	}

	INDEXABLE_DOCTYPES: ClassVar = {
		"Note": {
			"fields": ["name", "title", "content", "owner", {"modified": "creation"}],
		},
		"ToDo": {
			# ToDo has no title field; reuse description for both title and content
			"fields": ["name", {"title": "description"}, {"content": "description"}, "owner", "modified"],
		},
		"User": {
			# Fix: "name" was listed twice in the original field list — listing
			# it once is sufficient (the framework auto-includes name/doctype).
			"fields": ["name", {"title": "full_name"}, {"content": "email"}, "modified"],
			# Only index enabled users
			"filters": {"enabled": 1},
		},
	}

	def get_search_filters(self):
		"""Return permission filters - for testing, allow all documents."""
		if frappe.session.user == "Administrator":
			return {}
		# Simulate user-specific filtering
		return {"owner": frappe.session.user}
class TestSQLiteSearchAPI(IntegrationTestCase):
	"""Test suite for SQLiteSearch public API functionality.

	Exercises the index lifecycle (build/drop/exists), searching with
	filters and title_only, the scoring/ranking pipeline, spelling
	correction, incremental index_doc/remove_doc, result summaries,
	configuration validation, HTML content cleaning, the disabled state,
	and the module-level background-job helpers.
	"""

	@classmethod
	def setUpClass(cls):
		super().setUpClass()
		cls.search = TestSQLiteSearch()
		# Clean up any existing test database
		cls.search.drop_index()

	@classmethod
	def tearDownClass(cls):
		super().tearDownClass()
		# Clean up test database
		cls.search.drop_index()

	def setUp(self):
		"""Set up test data for each test."""
		super().setUp()
		# Create test documents
		self.test_notes = []
		self.test_todos = []
		# Create test notes with different content
		note_data = [
			{"title": "Python Programming Guide", "content": "Learn Python basics and advanced concepts"},
			{"title": "Project Management Tips", "content": "How to manage software projects effectively"},
			{"title": "Cooking Recipe Collection", "content": "Delicious recipes for home cooking"},
			{
				"title": "Machine Learning Tutorial",
				"content": "Introduction to ML algorithms and Python implementation",
			},
		]
		for data in note_data:
			note = frappe.get_doc({"doctype": "Note", "title": data["title"], "content": data["content"]})
			note.insert()
			self.test_notes.append(note)
		# Create test todos
		todo_data = [
			{"description": "Review Python code for search functionality"},
			{"description": "Update project documentation"},
			{"description": "Plan team meeting agenda"},
		]
		for data in todo_data:
			todo = frappe.get_doc({"doctype": "ToDo", "description": data["description"], "status": "Open"})
			todo.insert()
			self.test_todos.append(todo)

	def tearDown(self):
		"""Clean up test data after each test."""
		# Delete test documents; deletion is best-effort so that a cleanup
		# failure does not mask the actual test outcome.
		for note in self.test_notes:
			try:
				note.delete()
			except Exception:
				pass
		for todo in self.test_todos:
			try:
				todo.delete()
			except Exception:
				pass
		super().tearDown()

	def test_index_lifecycle_and_status_methods(self):
		"""Test index building, existence checking, and status validation."""
		# Initially index should not exist
		self.search.drop_index()  # Ensure clean state
		self.assertFalse(self.search.index_exists())
		# Should raise error when trying to search without index
		with self.assertRaises(SQLiteSearchIndexMissingError):
			self.search.raise_if_not_indexed()
		# Build index
		self.search.build_index()
		# Now index should exist
		self.assertTrue(self.search.index_exists())
		# Should not raise error now
		try:
			self.search.raise_if_not_indexed()
		except SQLiteSearchIndexMissingError:
			self.fail("raise_if_not_indexed() raised exception when index exists")
		# Verify database file exists and has correct tables
		self.assertTrue(os.path.exists(self.search.db_path))
		conn = sqlite3.connect(self.search.db_path)
		cursor = conn.cursor()
		# Check if FTS table exists
		cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='search_fts'")
		self.assertTrue(cursor.fetchone())
		# Check if vocabulary tables exist (used for spelling correction)
		cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='search_vocabulary'")
		self.assertTrue(cursor.fetchone())
		cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='search_trigrams'")
		self.assertTrue(cursor.fetchone())
		conn.close()
		# Test drop_index method
		self.search.drop_index()
		self.assertFalse(self.search.index_exists())
		self.assertFalse(os.path.exists(self.search.db_path))
		# Dropping non-existent index should not raise error
		self.search.drop_index()  # Should not raise error

	def test_basic_search_functionality(self):
		"""Test core search functionality with various query types."""
		# Build index first
		self.search.build_index()
		# Test basic text search
		results = self.search.search("Python")
		self.assertGreater(len(results["results"]), 0)
		self.assertIn("Python", results["results"][0]["title"] + results["results"][0]["content"])
		# Verify result structure
		result = results["results"][0]
		required_fields = [
			"id",
			"title",
			"content",
			"doctype",
			"name",
			"score",
			"original_rank",
			"modified_rank",
		]
		for field in required_fields:
			self.assertIn(field, result)
		# Test case-insensitive search
		results_lower = self.search.search("python")
		results_upper = self.search.search("PYTHON")
		self.assertEqual(len(results_lower["results"]), len(results_upper["results"]))
		# Test partial word matching
		results = self.search.search("prog")  # Should match "Programming"
		self.assertGreater(len(results["results"]), 0)
		# Test multi-word search
		results = self.search.search("Python programming")
		self.assertGreater(len(results["results"]), 0)
		# Test empty query
		results = self.search.search("")
		self.assertEqual(len(results["results"]), 0)
		# Test title-only search
		results = self.search.search("Python", title_only=True)
		self.assertGreater(len(results["results"]), 0)
		for result in results["results"]:
			self.assertIn("Python", result["title"])

	def test_search_filtering_and_permissions(self):
		"""Test search filtering and permission-based result filtering."""
		self.search.build_index()
		# Test basic filtering by doctype
		results = self.search.search("", filters={"doctype": "Note"})
		for result in results["results"]:
			self.assertEqual(result["doctype"], "Note")
		# Test filtering with list values
		results = self.search.search("", filters={"doctype": ["Note", "ToDo"]})
		for result in results["results"]:
			self.assertIn(result["doctype"], ["Note", "ToDo"])
		# Test empty filter list (should return no results)
		results = self.search.search("", filters={"doctype": []})
		self.assertEqual(len(results["results"]), 0)
		# Test permission filtering by switching users
		original_user = frappe.session.user
		try:
			# Create a test user and switch to them
			test_user_email = "test_search_user@example.com"
			if not frappe.db.exists("User", test_user_email):
				test_user = frappe.get_doc(
					{
						"doctype": "User",
						"email": test_user_email,
						"first_name": "Test",
						"last_name": "User",
						"enabled": 1,
					}
				)
				test_user.insert()
			frappe.set_user(test_user_email)
			# Search should now filter by owner (based on our test implementation)
			results = self.search.search("Python")
			# Results should be limited based on permission filters
			self.assertIsInstance(results["results"], list)
		finally:
			# Always restore the original session user
			frappe.set_user(original_user)

	def test_advanced_scoring_and_ranking(self):
		"""Test scoring pipeline, ranking, and result ordering."""
		self.search.build_index()
		# Search for a term that appears in multiple documents
		results = self.search.search("Python")
		# Verify results are sorted by score (descending)
		scores = [result["score"] for result in results["results"]]
		self.assertEqual(scores, sorted(scores, reverse=True))
		# Verify both original and modified rankings are present
		for i, result in enumerate(results["results"]):
			self.assertEqual(result["modified_rank"], i + 1)
			self.assertIsInstance(result["original_rank"], int)
			self.assertGreater(result["original_rank"], 0)
		# Test title boost - documents with search term in title should rank higher
		results = self.search.search("Programming")
		title_match_found = False
		for result in results["results"]:
			if "Programming" in result["title"]:
				title_match_found = True
				# Title matches should have higher scores
				self.assertGreater(result["score"], 1.0)
				break
		self.assertTrue(title_match_found, "No title matches found for scoring test")
		# Test that BM25 score is included
		for result in results["results"]:
			self.assertIn("bm25_score", result)
			self.assertIsInstance(result["bm25_score"], (int, float))

	def test_spelling_correction_and_query_expansion(self):
		"""Test spelling correction and query expansion functionality."""
		self.search.build_index()
		# Test with a misspelled word that should be corrected
		results = self.search.search("Pythom")  # Misspelled "Python"
		# Check if corrections were applied
		summary = results["summary"]
		if summary.get("corrected_words"):
			self.assertIsInstance(summary["corrected_words"], dict)
			self.assertIsInstance(summary["corrected_query"], str)
		# Even with misspelling, we should get some results due to correction
		# (This might not always work depending on vocabulary, so we test gracefully)
		self.assertIsInstance(results["results"], list)
		# Test with a completely made-up word
		results = self.search.search("xyzabc123nonexistent")
		# Should return empty results or minimal results
		self.assertLessEqual(len(results["results"]), 1)

	def test_document_indexing_operations(self):
		"""Test individual document indexing and removal operations."""
		self.search.build_index()
		# Create a new document after index is built
		new_note = frappe.get_doc(
			{
				"doctype": "Note",
				"title": "Newly Added Document",
				"content": "This document was added after initial indexing",
			}
		)
		new_note.insert()
		try:
			# Initially, the new document shouldn't be in search results
			results = self.search.search("Newly Added Document")
			initial_count = len(results["results"])
			# Index the new document
			self.search.index_doc("Note", new_note.name)
			# Now it should be findable
			results = self.search.search("Newly Added Document")
			self.assertGreater(len(results["results"]), initial_count)
			# Verify the document is in results
			found = False
			for result in results["results"]:
				if result["name"] == new_note.name:
					found = True
					break
			self.assertTrue(found, "Newly indexed document not found in search results")
			# Remove the document from index
			self.search.remove_doc("Note", new_note.name)
			# Should not be findable anymore
			results = self.search.search("Newly Added Document")
			found = False
			for result in results["results"]:
				if result["name"] == new_note.name:
					found = True
					break
			self.assertFalse(found, "Removed document still found in search results")
		finally:
			new_note.delete()

	def test_search_result_summary_and_metadata(self):
		"""Test search result summary and metadata information."""
		self.search.build_index()
		results = self.search.search("Python")
		summary = results["summary"]
		# Verify summary structure
		required_summary_fields = [
			"total_matches",
			"filtered_matches",
			"returned_matches",
			"duration",
			"title_only",
			"applied_filters",
		]
		for field in required_summary_fields:
			self.assertIn(field, summary)
		# Verify summary values make sense
		self.assertIsInstance(summary["duration"], (int, float))
		self.assertGreater(summary["duration"], 0)
		self.assertEqual(summary["total_matches"], summary["filtered_matches"])
		self.assertEqual(summary["filtered_matches"], len(results["results"]))
		self.assertFalse(summary["title_only"])
		self.assertEqual(summary["applied_filters"], {})
		# Test with filters applied
		results = self.search.search("Python", filters={"doctype": "Note"})
		summary = results["summary"]
		self.assertEqual(summary["applied_filters"], {"doctype": "Note"})
		# Test title-only search
		results = self.search.search("Python", title_only=True)
		summary = results["summary"]
		self.assertTrue(summary["title_only"])

	def test_configuration_and_schema_validation(self):
		"""Test configuration validation and schema handling."""

		# Test invalid configuration
		class InvalidSearchClass(SQLiteSearch):
			# Missing required INDEX_SCHEMA
			INDEXABLE_DOCTYPES: ClassVar = {"Note": {"fields": ["name", "title"]}}

			def get_search_filters(self):
				return {}

		# Construction itself should fail validation
		with self.assertRaises(ValueError):
			InvalidSearchClass()

		# Test invalid doctype configuration
		class InvalidDoctypeConfig(SQLiteSearch):
			INDEX_SCHEMA: ClassVar = {"text_fields": ["title", "content"]}
			INDEXABLE_DOCTYPES: ClassVar = {
				"Note": {
					# Missing 'fields' key
					"title_field": "title"
				}
			}

			def get_search_filters(self):
				return {}

		with self.assertRaises(ValueError):
			InvalidDoctypeConfig()

	def test_content_processing_and_html_handling(self):
		"""Test content processing including HTML tag removal and text normalization."""
		self.search.build_index()
		# Create a note with HTML content
		html_note = frappe.get_doc(
			{
				"doctype": "Note",
				"title": "HTML Content Test",
				"content": "<p>This is <strong>bold</strong> text with <a href='http://example.com'>links</a> and <br> line breaks.</p>",
			}
		)
		html_note.insert()
		try:
			# Index the document
			self.search.index_doc("Note", html_note.name)
			# Search should find processed content
			results = self.search.search("bold text links")
			# Should find the document
			found = False
			for result in results["results"]:
				if result["name"] == html_note.name:
					found = True
					# Content should be processed (HTML tags removed)
					self.assertNotIn("<p>", result["content"])
					self.assertNotIn("<strong>", result["content"])
					self.assertIn("bold", result["content"])
					self.assertNotIn(
						"<a href='http://example.com'>", result["content"]
					)  # Links should be replaced
					break
			self.assertTrue(found, "HTML content document not found in search")
		finally:
			html_note.delete()

	def test_search_disabled_state(self):
		"""Test behavior when search is disabled."""

		# Create a search class with search disabled
		class DisabledSearch(TestSQLiteSearch):
			def is_search_enabled(self):
				return False

		disabled_search = DisabledSearch()
		disabled_search.drop_index()  # Ensure clean state
		# Should return empty results when disabled
		results = disabled_search.search("Python")
		self.assertEqual(len(results["results"]), 0)
		# Build index should do nothing when disabled
		disabled_search.build_index()  # Should not raise error but do nothing
		self.assertFalse(disabled_search.index_exists())

	@patch("frappe.enqueue")
	def test_background_operations(self, mock_enqueue):
		"""Test background job integration and module-level functions."""
		# Imported locally to exercise the module-level hook helpers
		from frappe.search.sqlite_search import (
			build_index_in_background,
			get_search_classes,
		)

		# Test getting search classes
		with patch("frappe.get_hooks") as mock_get_hooks:
			mock_get_hooks.return_value = ["frappe.tests.test_sqlite_search.TestSQLiteSearch"]
			classes = get_search_classes()
			self.assertEqual(len(classes), 1)
			self.assertEqual(classes[0], TestSQLiteSearch)
		# Test background index building
		with patch("frappe.get_hooks") as mock_get_hooks:
			mock_get_hooks.return_value = ["frappe.tests.test_sqlite_search.TestSQLiteSearch"]
			build_index_in_background()
			# Should have enqueued a background job
			self.assertTrue(mock_enqueue.called)