From 1f9460f1c273acf8d5a5189b48ff4ce23d37cc27 Mon Sep 17 00:00:00 2001 From: Faris Ansari Date: Wed, 31 Dec 2025 18:06:44 +0530 Subject: [PATCH] fix: deduplicate documents in search index while updating --- frappe/search/sqlite_search.py | 9 +++++ frappe/tests/test_sqlite_search.py | 64 ++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/frappe/search/sqlite_search.py b/frappe/search/sqlite_search.py index 308c8b5839..2f9ffd9ee8 100644 --- a/frappe/search/sqlite_search.py +++ b/frappe/search/sqlite_search.py @@ -961,6 +961,7 @@ class SQLiteSearch(ABC): for i in range(0, len(documents), chunk_size): chunk = documents[i : i + chunk_size] + doc_ids_to_delete = [] values_to_insert = [] for doc in chunk: @@ -983,6 +984,7 @@ class SQLiteSearch(ABC): # Build values tuple dynamically based on schema values = [] + doc_id = None for field in all_fields: # Build doc_id automatically from doctype:name if field == "doc_id": @@ -991,8 +993,15 @@ class SQLiteSearch(ABC): else: values.append(doc.get(field, "")) + doc_ids_to_delete.append(doc_id) values_to_insert.append(tuple(values)) + # Delete existing rows for these doc_ids first using a single statement + if doc_ids_to_delete: + placeholders_for_delete = ",".join(["?" for _ in doc_ids_to_delete]) + delete_sql = f"DELETE FROM search_fts WHERE doc_id IN ({placeholders_for_delete})" + cursor.execute(delete_sql, doc_ids_to_delete) + # Insert the chunk if values_to_insert: cursor.executemany(insert_sql, values_to_insert) diff --git a/frappe/tests/test_sqlite_search.py b/frappe/tests/test_sqlite_search.py index be88901ebc..c4c528e294 100644 --- a/frappe/tests/test_sqlite_search.py +++ b/frappe/tests/test_sqlite_search.py @@ -481,6 +481,70 @@ class TestSQLiteSearchAPI(IntegrationTestCase): disabled_search.build_index() # Should not raise error but do nothing self.assertFalse(disabled_search.index_exists()) + def test_deduplication_on_reindex(self): + """Test that re-indexing the same document does not create duplicates.""" + self.search.build_index() + + # Create a test document + test_note = frappe.get_doc( + { + "doctype": "Note", + "title": "Deduplication Test Document", + "content": "This document tests deduplication functionality", + } + ) + test_note.insert() + + try: + # Index the document + self.search.index_doc("Note", test_note.name) + + # Search for the document - should find exactly one result + results = self.search.search("Deduplication Test") + initial_count = len([r for r in results["results"] if r["name"] == test_note.name]) + self.assertEqual(initial_count, 1, "Should find exactly one instance of the document") + + # Re-index the same document multiple times + self.search.index_doc("Note", test_note.name) + self.search.index_doc("Note", test_note.name) + self.search.index_doc("Note", test_note.name) + + # Search again - should still find exactly one result + results = self.search.search("Deduplication Test") + final_count = len([r for r in results["results"] if r["name"] == test_note.name]) + self.assertEqual(final_count, 1, "Should still find exactly one instance after re-indexing") + + # Update the document content and re-index + test_note.content = "Updated content for deduplication testing" + test_note.save() + self.search.index_doc("Note", test_note.name) + + # Search with updated content - should find exactly one result with new content + results = self.search.search("Updated content deduplication") + updated_results = [r for r in results["results"] if r["name"] == test_note.name] + self.assertEqual(len(updated_results), 1, "Should find exactly one instance with updated content") + # Content may contain HTML markup from search highlighting, so check for words individually + self.assertIn("Updated", updated_results[0]["content"]) + self.assertIn("content", updated_results[0]["content"]) + + # Rebuild entire index - should not create duplicates + self.search.build_index() + results = self.search.search("Deduplication Test") + rebuild_count = len([r for r in results["results"] if r["name"] == test_note.name]) + self.assertEqual(rebuild_count, 1, "Should still find exactly one instance after full rebuild") + + # Verify at database level - check raw count in FTS table + conn = sqlite3.connect(self.search.db_path) + cursor = conn.cursor() + doc_id = f"Note:{test_note.name}" + cursor.execute("SELECT COUNT(*) FROM search_fts WHERE doc_id = ?", (doc_id,)) + db_count = cursor.fetchone()[0] + conn.close() + self.assertEqual(db_count, 1, "Database should contain exactly one entry for the document") + + finally: + test_note.delete() + @patch("frappe.enqueue") def test_background_operations(self, mock_enqueue): """Test background job integration and module-level functions."""