feat: SQLite FTS5 search framework for Frappe apps (#33359)
- Abstract SQLiteSearch base class with full-text search - Spelling correction, recency boosting, and custom scoring - Supports search filtering and configurable document indexing - hooks for auto-indexing - build index after migrate - build index (if not exists) every 15 mins - update doc index on_update - remove doc index on_trash
This commit is contained in:
parent
761751f269
commit
2676c9c2ec
5 changed files with 2402 additions and 1 deletions
|
|
@ -154,6 +154,7 @@ doc_events = {
|
|||
"frappe.automation.doctype.assignment_rule.assignment_rule.update_due_date",
|
||||
"frappe.core.doctype.user_type.user_type.apply_permissions_for_non_standard_user_type",
|
||||
"frappe.core.doctype.permission_log.permission_log.make_perm_log",
|
||||
"frappe.search.sqlite_search.update_doc_index",
|
||||
],
|
||||
"after_rename": "frappe.desk.notifications.clear_doctype_notifications",
|
||||
"on_cancel": [
|
||||
|
|
@ -164,6 +165,7 @@ doc_events = {
|
|||
"on_trash": [
|
||||
"frappe.desk.notifications.clear_doctype_notifications",
|
||||
"frappe.workflow.doctype.workflow_action.workflow_action.process_workflow_actions",
|
||||
"frappe.search.sqlite_search.delete_doc_index",
|
||||
],
|
||||
"on_update_after_submit": [
|
||||
"frappe.workflow.doctype.workflow_action.workflow_action.process_workflow_actions",
|
||||
|
|
@ -206,6 +208,7 @@ scheduler_events = {
|
|||
"frappe.deferred_insert.save_to_db",
|
||||
"frappe.automation.doctype.reminder.reminder.send_reminders",
|
||||
"frappe.model.utils.link_count.update_link_count",
|
||||
"frappe.search.sqlite_search.build_index_if_not_exists",
|
||||
],
|
||||
# 10 minutes
|
||||
"0/10 * * * *": [
|
||||
|
|
@ -278,7 +281,10 @@ setup_wizard_exception = [
|
|||
]
|
||||
|
||||
before_migrate = ["frappe.core.doctype.patch_log.patch_log.before_migrate"]
|
||||
after_migrate = ["frappe.website.doctype.website_theme.website_theme.after_migrate"]
|
||||
after_migrate = [
|
||||
"frappe.website.doctype.website_theme.website_theme.after_migrate",
|
||||
"frappe.search.sqlite_search.build_index_in_background",
|
||||
]
|
||||
|
||||
otp_methods = ["OTP App", "Email", "SMS"]
|
||||
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
import frappe
|
||||
from frappe.search.full_text_search import FullTextSearch
|
||||
from frappe.search.sqlite_search import SQLiteSearch
|
||||
from frappe.search.website_search import WebsiteSearch
|
||||
from frappe.utils import cint
|
||||
|
||||
|
|
|
|||
470
frappe/search/sqlite_search.md
Normal file
470
frappe/search/sqlite_search.md
Normal file
|
|
@ -0,0 +1,470 @@
|
|||
# SQLite Search Framework
|
||||
|
||||
SQLite Search is a full-text search framework for Frappe applications that provides advanced search capabilities using SQLite's FTS5 (Full-Text Search) engine. It offers features like spelling correction, time-based recency scoring, custom ranking, permission-aware filtering, and extensible scoring pipelines.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Quick Start](#quick-start)
|
||||
- [How It Works](#how-it-works)
|
||||
- [Configuration](#configuration)
|
||||
- [Features & Customization](#features--customization)
|
||||
- [API Reference](#api-reference)
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Create a Search Class
|
||||
|
||||
Create a search implementation by extending `SQLiteSearch`:
|
||||
|
||||
```python
|
||||
# my_app/search.py
|
||||
from frappe.search.sqlite_search import SQLiteSearch
|
||||
|
||||
class MyAppSearch(SQLiteSearch):
|
||||
# Database file name
|
||||
INDEX_NAME = "my_app_search.db"
|
||||
|
||||
# Define the search schema
|
||||
INDEX_SCHEMA = {
|
||||
"metadata_fields": ["project", "owner", "status"],
|
||||
"tokenizer": "unicode61 remove_diacritics 2 tokenchars '-_'",
|
||||
}
|
||||
|
||||
# Define which doctypes to index and their field mappings
|
||||
INDEXABLE_DOCTYPES = {
|
||||
"Task": {
|
||||
"fields": ["name", {"title": "subject"}, {"content": "description"}, "modified", "project", "owner", "status"],
|
||||
},
|
||||
"Issue": {
|
||||
"fields": ["name", "title", "description", {"modified": "last_updated"}, "project", "owner"],
|
||||
"filters": {"status": ("!=", "Closed")}, # Only index non-closed issues
|
||||
},
|
||||
}
|
||||
|
||||
def get_search_filters(self):
|
||||
"""Return permission filters for current user"""
|
||||
# Get projects accessible to current user
|
||||
accessible_projects = frappe.get_all(
|
||||
"Project",
|
||||
filters={"owner": frappe.session.user},
|
||||
pluck="name"
|
||||
)
|
||||
|
||||
if not accessible_projects:
|
||||
return {"project": []} # No access
|
||||
|
||||
return {"project": accessible_projects}
|
||||
```
|
||||
|
||||
### 2. Register the Search Class
|
||||
|
||||
Add your search class to hooks.py:
|
||||
|
||||
```python
|
||||
# my_app/hooks.py
|
||||
sqlite_search = ['my_app.search.MyAppSearch']
|
||||
```
|
||||
|
||||
### 3. Create API Endpoint
|
||||
|
||||
Create a whitelisted method to expose search functionality:
|
||||
|
||||
```python
|
||||
# my_app/api.py
|
||||
import frappe
|
||||
from my_app.search import MyAppSearch
|
||||
|
||||
@frappe.whitelist()
|
||||
def search(query, filters=None):
|
||||
search = MyAppSearch()
|
||||
result = search.search(query, filters=filters)
|
||||
|
||||
return result
|
||||
```
|
||||
|
||||
### 4. Build the Index
|
||||
|
||||
Build the search index programmatically or via console:
|
||||
|
||||
```python
|
||||
from my_app.search import MyAppSearch
|
||||
search = MyAppSearch()
|
||||
search.build_index()
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
### 1. Indexing Process
|
||||
|
||||
#### Full Index Building
|
||||
|
||||
When you call `build_index()`, the framework performs a complete index rebuild:
|
||||
|
||||
1. **Database Preparation**: Creates a temporary SQLite database with FTS5 tables configured according to your schema
|
||||
2. **Document Collection**: Queries all specified doctypes using the configured field mappings and filters
|
||||
3. **Document Processing**: For each document:
|
||||
- Extracts and maps fields according to `INDEXABLE_DOCTYPES` configuration
|
||||
- Cleans HTML content using BeautifulSoup to extract plain text
|
||||
- Applies custom document preparation logic if `prepare_document()` is overridden
|
||||
- Validates required fields (title, content) are present
|
||||
4. **Batch Insertion**: Inserts processed documents into the FTS5 index in batches for performance
|
||||
5. **Vocabulary Building**: Constructs a spelling correction dictionary from all indexed text
|
||||
6. **Atomic Replacement**: Replaces the existing index database with the new one atomically
|
||||
|
||||
#### Individual Document Indexing
|
||||
|
||||
For real-time updates using `index_doc()` or `remove_doc()`:
|
||||
|
||||
1. **Single Document Processing**: Retrieves and processes one document using the same field mapping logic
|
||||
2. **Incremental Update**: Updates the existing FTS5 index by inserting, updating, or deleting the specific document
|
||||
3. **Vocabulary Update**: Updates the spelling dictionary with new terms from the document
|
||||
|
||||
### 2. Search Process
|
||||
|
||||
When a user performs a search using `search()`, the framework executes these steps:
|
||||
|
||||
1. **Permission Filtering**: Calls `get_search_filters()` to determine what documents the current user can access
|
||||
2. **Query Preprocessing**:
|
||||
- Validates the search query is not empty
|
||||
- Combines user-provided filters with permission filters
|
||||
3. **Spelling Correction**:
|
||||
- Analyzes query terms against the vocabulary dictionary
|
||||
- Uses trigram similarity to suggest corrections for misspelled words
|
||||
- Expands the original query with corrected terms
|
||||
4. **FTS5 Query Execution**:
|
||||
- Constructs an FTS5-compatible query string
|
||||
- Executes the full-text search against the SQLite database
|
||||
- Applies metadata filters (status, owner, project, etc.)
|
||||
- Retrieves raw results with BM25 scores
|
||||
5. **Results Processing**:
|
||||
- **Custom Scoring**: Applies the scoring pipeline to calculate final relevance scores
|
||||
- Base BM25 score processing
|
||||
- Title matching boosts (exact and partial matches)
|
||||
- Recency boosting based on document age
|
||||
- Custom scoring functions (doctype-specific, priority-based, etc.)
|
||||
- **Ranking**: Sorts results by final scores and assigns rank positions
|
||||
- **Content Formatting**: Generates content snippets and highlights matching terms
|
||||
|
||||
## Configuration
|
||||
|
||||
### INDEX_SCHEMA
|
||||
|
||||
Defines the structure of your search index:
|
||||
|
||||
```python
|
||||
INDEX_SCHEMA = {
|
||||
# Text fields that will be searchable (defaults to ["title", "content"])
|
||||
"text_fields": ["title", "content"],
|
||||
|
||||
# Metadata fields stored alongside text content for filtering
|
||||
"metadata_fields": ["project", "owner", "status", "priority"],
|
||||
|
||||
# FTS5 tokenizer configuration
|
||||
"tokenizer": "unicode61 remove_diacritics 2 tokenchars '-_@.'"
|
||||
}
|
||||
```
|
||||
|
||||
### INDEXABLE_DOCTYPES
|
||||
|
||||
Specifies which doctypes to index and how to map their fields:
|
||||
|
||||
```python
|
||||
INDEXABLE_DOCTYPES = {
|
||||
"Task": {
|
||||
# Field mapping
|
||||
"fields": [
|
||||
"name",
|
||||
{"title": "subject"}, # Maps subject field to title
|
||||
{"content": "description"}, # Maps description field to content
|
||||
{"modified": "creation"}, # Use creation instead of modified for recency boost
|
||||
"project",
|
||||
"owner"
|
||||
],
|
||||
|
||||
# Optional filters to limit which records are indexed
|
||||
"filters": {
|
||||
"status": ("!=", "Cancelled"),
|
||||
"docstatus": ("!=", 2)
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Field Mapping Rules
|
||||
|
||||
- **String fields**: Direct mapping `"field_name"`
|
||||
- **Aliased fields**: Dictionary mapping `{"schema_field": "doctype_field"}`
|
||||
- **Required fields**: `title` and `content` fields must be present or explicitly mapped (e.g., `{"title": "subject"}`)
|
||||
- **Auto-added fields**: `doctype` and `name` are automatically included
|
||||
- **Modified field**: Added automatically if used in any doctype configuration. Used for recency boosting - if you want to use a different timestamp field (like `creation` or `last_updated`), map it to `modified` using `{"modified": "creation"}`
|
||||
|
||||
## Features & Customization
|
||||
|
||||
### Permission Filtering
|
||||
|
||||
Implement `get_search_filters()` to control access:
|
||||
|
||||
```python
|
||||
def get_search_filters(self):
|
||||
"""Return filters based on user permissions"""
|
||||
user = frappe.session.user
|
||||
|
||||
if user == "Administrator":
|
||||
return {} # No restrictions
|
||||
|
||||
# Example: User can only see their own and public documents
|
||||
return {
|
||||
"owner": user,
|
||||
"status": ["Active", "Published"]
|
||||
}
|
||||
```
|
||||
|
||||
### Custom Scoring
|
||||
|
||||
Create custom scoring functions to influence search relevance:
|
||||
|
||||
```python
|
||||
class MyAppSearch(SQLiteSearch):
|
||||
...
|
||||
|
||||
@SQLiteSearch.scoring_function
|
||||
def _get_priority_boost(self, row, query, query_words):
|
||||
"""Boost high-priority items"""
|
||||
priority = row.get("priority", "Medium")
|
||||
|
||||
if priority == "High":
|
||||
return 1.5
|
||||
if priority == "Medium":
|
||||
return 1.1
|
||||
return 1.0
|
||||
```
|
||||
|
||||
### Recency Boosting
|
||||
|
||||
The framework automatically provides time-based recency boosting using the `modified` field:
|
||||
|
||||
```python
|
||||
# The modified field is used for calculating document age
|
||||
# Recent documents get higher scores:
|
||||
# - Last 24 hours: 1.8x boost
|
||||
# - Last 7 days: 1.5x boost
|
||||
# - Last 30 days: 1.2x boost
|
||||
# - Last 90 days: 1.1x boost
|
||||
# - Older documents: gradually decreasing boost
|
||||
|
||||
# If your doctype uses a different timestamp field, map it to modified:
|
||||
INDEXABLE_DOCTYPES = {
|
||||
"GP Discussion": {
|
||||
"fields": ["name", "title", "content", {"modified": "last_post_at"}, "project"],
|
||||
},
|
||||
"Article": {
|
||||
"fields": ["name", "title", "content", {"modified": "published_date"}, "category"],
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Document Preparation
|
||||
|
||||
Override `prepare_document()` for custom document processing:
|
||||
|
||||
```python
|
||||
def prepare_document(self, doc):
|
||||
"""Custom document preparation"""
|
||||
document = super().prepare_document(doc)
|
||||
if not document:
|
||||
return None
|
||||
|
||||
# Add computed fields
|
||||
if doc.doctype == "Task":
|
||||
# Combine multiple fields into content
|
||||
content_parts = [
|
||||
doc.description or "",
|
||||
doc.notes or "",
|
||||
"\n".join([comment.content for comment in doc.get("comments", [])])
|
||||
]
|
||||
document["content"] = "\n".join(filter(None, content_parts))
|
||||
|
||||
# set fields that might be stored in another table
|
||||
document["category"] = get_category_for_task(doc)
|
||||
|
||||
return document
|
||||
```
|
||||
|
||||
### Spelling Correction
|
||||
|
||||
The framework includes built-in spelling correction using trigram similarity:
|
||||
|
||||
```python
|
||||
# Spelling correction happens automatically
|
||||
search_result = search.search("projetc managment") # Will find "project management"
|
||||
|
||||
# Access correction information
|
||||
print(search_result["summary"]["corrected_words"])
|
||||
# Output: {"projetc": "project", "managment": "management"}
|
||||
```
|
||||
|
||||
### Content Processing
|
||||
|
||||
HTML content is automatically cleaned and processed using BeautifulSoup:
|
||||
|
||||
```python
|
||||
# Complex HTML content like this:
|
||||
html_content = """
|
||||
<div class="article">
|
||||
<h1>API Documentation</h1>
|
||||
<p>Learn how to integrate with our <a href="/api">REST API</a>.</p>
|
||||
<img src="/images/api-flow.png" alt="API workflow diagram" />
|
||||
<ul>
|
||||
<li><strong>Authentication:</strong> Use <code>Bearer tokens</code></li>
|
||||
<li>Rate limiting: <em>1000 requests/hour</em></li>
|
||||
</ul>
|
||||
<blockquote>See our <a href="/examples">code examples</a> for details.</blockquote>
|
||||
<table><tr><td>Method</td><td>POST</td></tr></table>
|
||||
<script>analytics.track('page_view');</script>
|
||||
<style>.hidden { display: none; }</style>
|
||||
</div>
|
||||
"""
|
||||
|
||||
# Is automatically converted to clean, searchable plain text:
|
||||
"""
|
||||
API Documentation
|
||||
|
||||
Learn how to integrate with our REST API.
|
||||
|
||||
Authentication: Use Bearer tokens
|
||||
Rate limiting: 1000 requests/hour
|
||||
|
||||
See our code examples for details.
|
||||
|
||||
Method POST
|
||||
"""
|
||||
|
||||
# The cleaning process:
|
||||
# 1. Removes all HTML tags (<div>, <h1>, <strong>, <code>, etc.)
|
||||
# 2. Strips out scripts, styles, and non-content elements
|
||||
# 3. Extracts link text while removing href URLs
|
||||
# 4. Normalizes whitespace and line breaks
|
||||
```
|
||||
|
||||
### Title-Only Search
|
||||
|
||||
```python
|
||||
results = search.search("project update", title_only=True)
|
||||
```
|
||||
|
||||
### Advanced Filtering
|
||||
|
||||
```python
|
||||
accessible_projects = ['PROJ001', 'PROJ002', ...]
|
||||
|
||||
filters = {
|
||||
"project": accessible_projects, # Multiple values (IN clause)
|
||||
"owner": current_user, # Single value (= clause)
|
||||
}
|
||||
|
||||
results = search.search("bug fix", filters=filters)
|
||||
```
|
||||
|
||||
### Automatic Index Handling
|
||||
|
||||
The framework handles index building and maintenance automatically when you register your search class:
|
||||
|
||||
```python
|
||||
# hooks.py
|
||||
sqlite_search = ['my_app.search.MyAppSearch']
|
||||
```
|
||||
|
||||
**What the framework does automatically:**
|
||||
|
||||
1. **Post-Migration Index Building**: Builds the search index automatically after running `bench migrate`
|
||||
2. **Periodic Index Verification**: Checks every 15 minutes that the index exists and rebuilds if missing
|
||||
3. **Real-time Document Updates**: Automatically calls `index_doc()` and `remove_doc()` on document lifecycle events (insert, update, delete) for all doctypes defined in your `INDEXABLE_DOCTYPES`
|
||||
|
||||
## Manual Index Handling
|
||||
|
||||
If you prefer to have manual control over the lifecycle of indexing, you can opt out of automatic index handling by not registering your search class in the `sqlite_search` hook.
|
||||
|
||||
```python
|
||||
from my_app.search import MyAppSearch
|
||||
|
||||
def build_index_in_background():
|
||||
"""Manually trigger background index building"""
|
||||
search = MyAppSearch()
|
||||
if search.is_search_enabled() and not search.index_exists():
|
||||
frappe.enqueue("my_app.search.build_index", queue="long")
|
||||
|
||||
# hooks.py
|
||||
scheduler_events = {
|
||||
# Custom scheduler (if you want different timing)
|
||||
"daily": ["my_app.search.build_index_in_background"],
|
||||
}
|
||||
```
|
||||
|
||||
## API Reference
|
||||
|
||||
#### `search(query, title_only=False, filters=None)`
|
||||
Main search method that returns formatted results.
|
||||
|
||||
**Parameters:**
|
||||
- `query` (str): Search query text
|
||||
- `title_only` (bool): Search only in title fields
|
||||
- `filters` (dict): Additional filters to apply
|
||||
|
||||
**Returns:**
|
||||
```python
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"doctype": "Task",
|
||||
"name": "TASK-001",
|
||||
"title": "Fix login bug",
|
||||
"content": "User cannot login after password reset...",
|
||||
"score": 0.85,
|
||||
"original_rank": 3, # original bm25 rank
|
||||
"rank": 1, # modified rank after custom scoring pipeline
|
||||
# ... other metadata fields
|
||||
}
|
||||
],
|
||||
"summary": {
|
||||
"duration": 0.023,
|
||||
"total_matches": 15,
|
||||
"returned_matches": 15,
|
||||
"corrected_words": {"loggin": "login"},
|
||||
"corrected_query": "Fix login bug",
|
||||
"title_only": False,
|
||||
"filtered_matches": 15,
|
||||
"applied_filters": {"status": ["Open"]}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### `build_index()`
|
||||
Build the complete search index from scratch.
|
||||
|
||||
#### `index_doc(doctype, docname)`
|
||||
Index a single document.
|
||||
|
||||
#### `remove_doc(doctype, docname)`
|
||||
Remove a single document from the index.
|
||||
|
||||
#### `is_search_enabled()`
|
||||
Check if search is enabled (override to add disable logic).
|
||||
|
||||
#### `index_exists()`
|
||||
Check if the search index exists.
|
||||
|
||||
#### `get_search_filters()`
|
||||
**Must be implemented by subclasses.** Return filters for the current user.
|
||||
|
||||
**Returns:**
|
||||
```python
|
||||
{
|
||||
"field_name": "value", # Single value
|
||||
"field_name": ["val1", "val2"], # Multiple values
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
#### `scoring_function()`
|
||||
|
||||
Use the `@SQLiteSearch.scoring_function` decorator to mark a function as a scoring function.
|
||||
1419
frappe/search/sqlite_search.py
Normal file
1419
frappe/search/sqlite_search.py
Normal file
File diff suppressed because it is too large
Load diff
505
frappe/tests/test_sqlite_search.py
Normal file
505
frappe/tests/test_sqlite_search.py
Normal file
|
|
@ -0,0 +1,505 @@
|
|||
import os
|
||||
import sqlite3
|
||||
import time
|
||||
from typing import ClassVar
|
||||
from unittest.mock import patch
|
||||
|
||||
import frappe
|
||||
from frappe.search.sqlite_search import SQLiteSearch, SQLiteSearchIndexMissingError
|
||||
from frappe.tests import IntegrationTestCase
|
||||
|
||||
|
||||
class TestSQLiteSearch(SQLiteSearch):
    """Concrete SQLiteSearch subclass used as the fixture for the test suite."""

    # Database file name for the throwaway test index.
    INDEX_NAME = "test_search.db"

    # Search schema: two searchable text columns plus filterable metadata fields.
    INDEX_SCHEMA: ClassVar = {
        "text_fields": ["title", "content"],
        "metadata_fields": ["doctype", "name", "owner", "modified"],
        "tokenizer": "unicode61 remove_diacritics 2",
    }

    # Doctypes indexed by this fixture, with field mappings and optional filters.
    INDEXABLE_DOCTYPES: ClassVar = {
        "Note": {
            "fields": ["name", "title", "content", "owner", {"modified": "creation"}],
        },
        "ToDo": {
            "fields": ["name", {"title": "description"}, {"content": "description"}, "owner", "modified"],
        },
        "User": {
            "fields": ["name", {"title": "full_name"}, {"content": "email"}, "name", "modified"],
            "filters": {"enabled": 1},
        },
    }

    def get_search_filters(self):
        """Permission filters: Administrator sees everything, other users only their own docs."""
        user = frappe.session.user
        if user != "Administrator":
            # Simulate user-specific filtering
            return {"owner": user}
        return {}
|
||||
|
||||
|
||||
class TestSQLiteSearchAPI(IntegrationTestCase):
|
||||
"""Test suite for SQLiteSearch public API functionality."""
|
||||
|
||||
@classmethod
def setUpClass(cls):
    """Create the shared search fixture and start from a clean index state."""
    super().setUpClass()
    cls.search = TestSQLiteSearch()
    # Drop any database file left behind by a previous run.
    cls.search.drop_index()
|
||||
|
||||
@classmethod
def tearDownClass(cls):
    """Remove the test database once the whole suite has finished."""
    super().tearDownClass()
    # Leave no index file on disk after the suite.
    cls.search.drop_index()
|
||||
|
||||
def setUp(self):
    """Insert a handful of Note and ToDo fixtures before each test."""
    super().setUp()
    self.test_notes = []
    self.test_todos = []

    # Notes covering distinct topics so searches can discriminate between them.
    note_data = [
        ("Python Programming Guide", "Learn Python basics and advanced concepts"),
        ("Project Management Tips", "How to manage software projects effectively"),
        ("Cooking Recipe Collection", "Delicious recipes for home cooking"),
        ("Machine Learning Tutorial", "Introduction to ML algorithms and Python implementation"),
    ]
    for title, content in note_data:
        doc = frappe.get_doc({"doctype": "Note", "title": title, "content": content})
        doc.insert()
        self.test_notes.append(doc)

    # A few open todos; their description doubles as both title and content in the index.
    for description in (
        "Review Python code for search functionality",
        "Update project documentation",
        "Plan team meeting agenda",
    ):
        doc = frappe.get_doc({"doctype": "ToDo", "description": description, "status": "Open"})
        doc.insert()
        self.test_todos.append(doc)
|
||||
|
||||
def tearDown(self):
    """Best-effort deletion of the fixtures created in setUp."""
    # Notes first, then todos — mirrors creation order.
    for doc in self.test_notes + self.test_todos:
        try:
            doc.delete()
        except Exception:
            # Deletion failures are ignored: the doc may already be gone.
            pass
    super().tearDown()
|
||||
|
||||
def test_index_lifecycle_and_status_methods(self):
    """Test index building, existence checking, and status validation.

    Covers: missing-index error, build, existence checks, expected SQLite
    tables, and idempotent drop.
    """
    # Start from a clean slate: no index on disk.
    self.search.drop_index()
    self.assertFalse(self.search.index_exists())

    # Searching without an index must raise the dedicated error.
    with self.assertRaises(SQLiteSearchIndexMissingError):
        self.search.raise_if_not_indexed()

    # Build index
    self.search.build_index()
    self.assertTrue(self.search.index_exists())

    # With the index in place the guard must pass silently.
    try:
        self.search.raise_if_not_indexed()
    except SQLiteSearchIndexMissingError:
        self.fail("raise_if_not_indexed() raised exception when index exists")

    # Verify database file exists and has correct tables.
    self.assertTrue(os.path.exists(self.search.db_path))

    conn = sqlite3.connect(self.search.db_path)
    # Fix: close the connection even when an assertion below fails, otherwise
    # the file handle leaks and drop_index may misbehave on some platforms.
    try:
        cursor = conn.cursor()
        for query in (
            "SELECT name FROM sqlite_master WHERE type='table' AND name='search_fts'",
            "SELECT name FROM sqlite_master WHERE type='table' AND name='search_vocabulary'",
            "SELECT name FROM sqlite_master WHERE type='table' AND name='search_trigrams'",
        ):
            cursor.execute(query)
            self.assertTrue(cursor.fetchone())
    finally:
        conn.close()

    # drop_index removes both the logical index and the file on disk.
    self.search.drop_index()
    self.assertFalse(self.search.index_exists())
    self.assertFalse(os.path.exists(self.search.db_path))

    # Dropping a non-existent index must be a no-op, not an error.
    self.search.drop_index()
|
||||
|
||||
def test_basic_search_functionality(self):
    """Test core search functionality with various query types."""
    # Build index first
    self.search.build_index()

    # Plain text search should hit at least one document mentioning Python.
    results = self.search.search("Python")
    self.assertGreater(len(results["results"]), 0)
    top = results["results"][0]
    self.assertIn("Python", top["title"] + top["content"])

    # Every hit must carry the full result schema.
    for field in (
        "id",
        "title",
        "content",
        "doctype",
        "name",
        "score",
        "original_rank",
        "modified_rank",
    ):
        self.assertIn(field, top)

    # Search must be case-insensitive.
    results_lower = self.search.search("python")
    results_upper = self.search.search("PYTHON")
    self.assertEqual(len(results_lower["results"]), len(results_upper["results"]))

    # Prefix matching: "prog" should match "Programming".
    self.assertGreater(len(self.search.search("prog")["results"]), 0)

    # Multi-word queries still return hits.
    self.assertGreater(len(self.search.search("Python programming")["results"]), 0)

    # An empty query yields no results.
    self.assertEqual(len(self.search.search("")["results"]), 0)

    # title_only restricts matches to the title field.
    title_results = self.search.search("Python", title_only=True)
    self.assertGreater(len(title_results["results"]), 0)
    for result in title_results["results"]:
        self.assertIn("Python", result["title"])
|
||||
|
||||
def test_search_filtering_and_permissions(self):
    """Test search filtering and permission-based result filtering."""
    self.search.build_index()

    # Single-value doctype filter (= clause).
    for result in self.search.search("", filters={"doctype": "Note"})["results"]:
        self.assertEqual(result["doctype"], "Note")

    # List-valued filter behaves like an IN clause.
    for result in self.search.search("", filters={"doctype": ["Note", "ToDo"]})["results"]:
        self.assertIn(result["doctype"], ["Note", "ToDo"])

    # An empty filter list excludes everything.
    self.assertEqual(len(self.search.search("", filters={"doctype": []})["results"]), 0)

    # Permission filters kick in for non-Administrator users.
    original_user = frappe.session.user
    try:
        test_user_email = "test_search_user@example.com"
        if not frappe.db.exists("User", test_user_email):
            frappe.get_doc(
                {
                    "doctype": "User",
                    "email": test_user_email,
                    "first_name": "Test",
                    "last_name": "User",
                    "enabled": 1,
                }
            ).insert()

        frappe.set_user(test_user_email)

        # With owner-based filtering active, the search must still succeed
        # and return a (possibly reduced) list.
        results = self.search.search("Python")
        self.assertIsInstance(results["results"], list)
    finally:
        frappe.set_user(original_user)
|
||||
|
||||
def test_advanced_scoring_and_ranking(self):
    """Test scoring pipeline, ranking, and result ordering."""
    self.search.build_index()

    # A term present in multiple documents exercises the ranking pipeline.
    hits = self.search.search("Python")["results"]

    # Scores must come back in descending order.
    scores = [hit["score"] for hit in hits]
    self.assertEqual(scores, sorted(scores, reverse=True))

    # modified_rank is the post-pipeline position; original_rank the raw BM25 one.
    for position, hit in enumerate(hits, start=1):
        self.assertEqual(hit["modified_rank"], position)
        self.assertIsInstance(hit["original_rank"], int)
        self.assertGreater(hit["original_rank"], 0)

    # A hit whose title contains the term should carry a boosted score.
    hits = self.search.search("Programming")["results"]
    title_match_found = False
    for hit in hits:
        if "Programming" in hit["title"]:
            title_match_found = True
            self.assertGreater(hit["score"], 1.0)
            break
    self.assertTrue(title_match_found, "No title matches found for scoring test")

    # Raw BM25 scores are exposed alongside the final score.
    for hit in hits:
        self.assertIn("bm25_score", hit)
        self.assertIsInstance(hit["bm25_score"], (int, float))
|
||||
|
||||
def test_spelling_correction_and_query_expansion(self):
    """Test spelling correction and query expansion functionality."""
    self.search.build_index()

    # "Pythom" misspells "Python", which is present in the indexed vocabulary.
    results = self.search.search("Pythom")

    summary = results["summary"]
    if summary.get("corrected_words"):
        # Corrections are reported as word -> replacement plus a rewritten query.
        self.assertIsInstance(summary["corrected_words"], dict)
        self.assertIsInstance(summary["corrected_query"], str)

    # Correction is best-effort (depends on vocabulary), so only assert that
    # the call succeeded structurally.
    self.assertIsInstance(results["results"], list)

    # A nonsense token should match (at most) almost nothing.
    results = self.search.search("xyzabc123nonexistent")
    self.assertLessEqual(len(results["results"]), 1)
|
||||
|
||||
def test_document_indexing_operations(self):
    """Test individual document indexing and removal operations."""
    self.search.build_index()

    # This note is created only after the full build, so the index is unaware of it.
    new_note = frappe.get_doc(
        {
            "doctype": "Note",
            "title": "Newly Added Document",
            "content": "This document was added after initial indexing",
        }
    )
    new_note.insert()

    try:
        initial_count = len(self.search.search("Newly Added Document")["results"])

        # Incrementally index the document and verify it becomes findable.
        self.search.index_doc("Note", new_note.name)
        results = self.search.search("Newly Added Document")
        self.assertGreater(len(results["results"]), initial_count)
        found = any(hit["name"] == new_note.name for hit in results["results"])
        self.assertTrue(found, "Newly indexed document not found in search results")

        # Removing it from the index makes it unfindable again.
        self.search.remove_doc("Note", new_note.name)
        results = self.search.search("Newly Added Document")
        found = any(hit["name"] == new_note.name for hit in results["results"])
        self.assertFalse(found, "Removed document still found in search results")
    finally:
        new_note.delete()
|
||||
|
||||
def test_search_result_summary_and_metadata(self):
|
||||
"""Test search result summary and metadata information."""
|
||||
self.search.build_index()
|
||||
|
||||
results = self.search.search("Python")
|
||||
summary = results["summary"]
|
||||
|
||||
# Verify summary structure
|
||||
required_summary_fields = [
|
||||
"total_matches",
|
||||
"filtered_matches",
|
||||
"returned_matches",
|
||||
"duration",
|
||||
"title_only",
|
||||
"applied_filters",
|
||||
]
|
||||
for field in required_summary_fields:
|
||||
self.assertIn(field, summary)
|
||||
|
||||
# Verify summary values make sense
|
||||
self.assertIsInstance(summary["duration"], (int, float))
|
||||
self.assertGreater(summary["duration"], 0)
|
||||
self.assertEqual(summary["total_matches"], summary["filtered_matches"])
|
||||
self.assertEqual(summary["filtered_matches"], len(results["results"]))
|
||||
self.assertFalse(summary["title_only"])
|
||||
self.assertEqual(summary["applied_filters"], {})
|
||||
|
||||
# Test with filters applied
|
||||
results = self.search.search("Python", filters={"doctype": "Note"})
|
||||
summary = results["summary"]
|
||||
self.assertEqual(summary["applied_filters"], {"doctype": "Note"})
|
||||
|
||||
# Test title-only search
|
||||
results = self.search.search("Python", title_only=True)
|
||||
summary = results["summary"]
|
||||
self.assertTrue(summary["title_only"])
|
||||
|
||||
def test_configuration_and_schema_validation(self):
|
||||
"""Test configuration validation and schema handling."""
|
||||
|
||||
# Test invalid configuration
|
||||
class InvalidSearchClass(SQLiteSearch):
|
||||
# Missing required INDEX_SCHEMA
|
||||
INDEXABLE_DOCTYPES: ClassVar = {"Note": {"fields": ["name", "title"]}}
|
||||
|
||||
def get_search_filters(self):
|
||||
return {}
|
||||
|
||||
with self.assertRaises(ValueError):
|
||||
InvalidSearchClass()
|
||||
|
||||
# Test invalid doctype configuration
|
||||
class InvalidDoctypeConfig(SQLiteSearch):
|
||||
INDEX_SCHEMA: ClassVar = {"text_fields": ["title", "content"]}
|
||||
INDEXABLE_DOCTYPES: ClassVar = {
|
||||
"Note": {
|
||||
# Missing 'fields' key
|
||||
"title_field": "title"
|
||||
}
|
||||
}
|
||||
|
||||
def get_search_filters(self):
|
||||
return {}
|
||||
|
||||
with self.assertRaises(ValueError):
|
||||
InvalidDoctypeConfig()
|
||||
|
||||
def test_content_processing_and_html_handling(self):
|
||||
"""Test content processing including HTML tag removal and text normalization."""
|
||||
self.search.build_index()
|
||||
|
||||
# Create a note with HTML content
|
||||
html_note = frappe.get_doc(
|
||||
{
|
||||
"doctype": "Note",
|
||||
"title": "HTML Content Test",
|
||||
"content": "<p>This is <strong>bold</strong> text with <a href='http://example.com'>links</a> and <br> line breaks.</p>",
|
||||
}
|
||||
)
|
||||
html_note.insert()
|
||||
|
||||
try:
|
||||
# Index the document
|
||||
self.search.index_doc("Note", html_note.name)
|
||||
|
||||
# Search should find processed content
|
||||
results = self.search.search("bold text links")
|
||||
|
||||
# Should find the document
|
||||
found = False
|
||||
for result in results["results"]:
|
||||
if result["name"] == html_note.name:
|
||||
found = True
|
||||
# Content should be processed (HTML tags removed)
|
||||
self.assertNotIn("<p>", result["content"])
|
||||
self.assertNotIn("<strong>", result["content"])
|
||||
self.assertIn("bold", result["content"])
|
||||
self.assertNotIn(
|
||||
"<a href='http://example.com'>", result["content"]
|
||||
) # Links should be replaced
|
||||
break
|
||||
|
||||
self.assertTrue(found, "HTML content document not found in search")
|
||||
|
||||
finally:
|
||||
html_note.delete()
|
||||
|
||||
def test_search_disabled_state(self):
|
||||
"""Test behavior when search is disabled."""
|
||||
|
||||
# Create a search class with search disabled
|
||||
class DisabledSearch(TestSQLiteSearch):
|
||||
def is_search_enabled(self):
|
||||
return False
|
||||
|
||||
disabled_search = DisabledSearch()
|
||||
disabled_search.drop_index() # Ensure clean state
|
||||
|
||||
# Should return empty results when disabled
|
||||
results = disabled_search.search("Python")
|
||||
self.assertEqual(len(results["results"]), 0)
|
||||
|
||||
# Build index should do nothing when disabled
|
||||
disabled_search.build_index() # Should not raise error but do nothing
|
||||
self.assertFalse(disabled_search.index_exists())
|
||||
|
||||
@patch("frappe.enqueue")
|
||||
def test_background_operations(self, mock_enqueue):
|
||||
"""Test background job integration and module-level functions."""
|
||||
from frappe.search.sqlite_search import (
|
||||
build_index_in_background,
|
||||
get_search_classes,
|
||||
)
|
||||
|
||||
# Test getting search classes
|
||||
with patch("frappe.get_hooks") as mock_get_hooks:
|
||||
mock_get_hooks.return_value = ["frappe.tests.test_sqlite_search.TestSQLiteSearch"]
|
||||
classes = get_search_classes()
|
||||
self.assertEqual(len(classes), 1)
|
||||
self.assertEqual(classes[0], TestSQLiteSearch)
|
||||
|
||||
# Test background index building
|
||||
with patch("frappe.get_hooks") as mock_get_hooks:
|
||||
mock_get_hooks.return_value = ["frappe.tests.test_sqlite_search.TestSQLiteSearch"]
|
||||
build_index_in_background()
|
||||
|
||||
# Should have enqueued a background job
|
||||
self.assertTrue(mock_enqueue.called)