Merge pull request #35573 from netchampfaris/sqlite-search-deduplicate

This commit is contained in:
Faris Ansari 2025-12-31 22:05:47 +05:30 committed by GitHub
commit e0eb087694
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 73 additions and 0 deletions

View file

@ -961,6 +961,7 @@ class SQLiteSearch(ABC):
for i in range(0, len(documents), chunk_size):
chunk = documents[i : i + chunk_size]
doc_ids_to_delete = []
values_to_insert = []
for doc in chunk:
@ -983,6 +984,7 @@ class SQLiteSearch(ABC):
# Build values tuple dynamically based on schema
values = []
doc_id = None
for field in all_fields:
# Build doc_id automatically from doctype:name
if field == "doc_id":
@ -991,8 +993,15 @@ class SQLiteSearch(ABC):
else:
values.append(doc.get(field, ""))
doc_ids_to_delete.append(doc_id)
values_to_insert.append(tuple(values))
# Delete existing rows for these doc_ids first using a single statement
if doc_ids_to_delete:
placeholders_for_delete = ",".join(["?" for _ in doc_ids_to_delete])
delete_sql = f"DELETE FROM search_fts WHERE doc_id IN ({placeholders_for_delete})"
cursor.execute(delete_sql, doc_ids_to_delete)
# Insert the chunk
if values_to_insert:
cursor.executemany(insert_sql, values_to_insert)

View file

@ -481,6 +481,70 @@ class TestSQLiteSearchAPI(IntegrationTestCase):
disabled_search.build_index() # Should not raise error but do nothing
self.assertFalse(disabled_search.index_exists())
def test_deduplication_on_reindex(self):
    """Test that re-indexing the same document does not create duplicates.

    A fresh Note is inserted and indexed, then re-indexed several times,
    updated and re-indexed, and finally the whole index is rebuilt. After
    each step the note must show up exactly once in the search results, and
    at the end exactly once in the raw ``search_fts`` table.
    """
    self.search.build_index()

    # Create a test document
    test_note = frappe.get_doc(
        {
            "doctype": "Note",
            "title": "Deduplication Test Document",
            "content": "This document tests deduplication functionality",
        }
    )
    test_note.insert()

    def matching_results(query):
        """Return the search results for *query* that refer to the test note."""
        results = self.search.search(query)
        return [r for r in results["results"] if r["name"] == test_note.name]

    try:
        # Index the document
        self.search.index_doc("Note", test_note.name)

        # Search for the document - should find exactly one result
        initial_count = len(matching_results("Deduplication Test"))
        self.assertEqual(initial_count, 1, "Should find exactly one instance of the document")

        # Re-index the same document multiple times
        for _ in range(3):
            self.search.index_doc("Note", test_note.name)

        # Search again - should still find exactly one result
        final_count = len(matching_results("Deduplication Test"))
        self.assertEqual(final_count, 1, "Should still find exactly one instance after re-indexing")

        # Update the document content and re-index
        test_note.content = "Updated content for deduplication testing"
        test_note.save()
        self.search.index_doc("Note", test_note.name)

        # Search with updated content - should find exactly one result with new content
        updated_results = matching_results("Updated content deduplication")
        self.assertEqual(len(updated_results), 1, "Should find exactly one instance with updated content")
        # Content may contain HTML markup from search highlighting, so check for words individually
        self.assertIn("Updated", updated_results[0]["content"])
        self.assertIn("content", updated_results[0]["content"])

        # Rebuild entire index - should not create duplicates
        self.search.build_index()
        rebuild_count = len(matching_results("Deduplication Test"))
        self.assertEqual(rebuild_count, 1, "Should still find exactly one instance after full rebuild")

        # Verify at database level - check raw count in FTS table
        doc_id = f"Note:{test_note.name}"
        conn = sqlite3.connect(self.search.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT COUNT(*) FROM search_fts WHERE doc_id = ?", (doc_id,))
            db_count = cursor.fetchone()[0]
        finally:
            # Close even when the query fails so the index database file is
            # not left open for the tests that run afterwards.
            conn.close()

        self.assertEqual(db_count, 1, "Database should contain exactly one entry for the document")
    finally:
        # Always remove the fixture note, whether or not the assertions passed.
        test_note.delete()
@patch("frappe.enqueue")
def test_background_operations(self, mock_enqueue):
"""Test background job integration and module-level functions."""