Merge pull request #35573 from netchampfaris/sqlite-search-deduplicate
This commit is contained in:
commit
e0eb087694
2 changed files with 73 additions and 0 deletions
|
|
@ -961,6 +961,7 @@ class SQLiteSearch(ABC):
|
|||
|
||||
for i in range(0, len(documents), chunk_size):
|
||||
chunk = documents[i : i + chunk_size]
|
||||
doc_ids_to_delete = []
|
||||
values_to_insert = []
|
||||
|
||||
for doc in chunk:
|
||||
|
|
@ -983,6 +984,7 @@ class SQLiteSearch(ABC):
|
|||
|
||||
# Build values tuple dynamically based on schema
|
||||
values = []
|
||||
doc_id = None
|
||||
for field in all_fields:
|
||||
# Build doc_id automatically from doctype:name
|
||||
if field == "doc_id":
|
||||
|
|
@ -991,8 +993,15 @@ class SQLiteSearch(ABC):
|
|||
else:
|
||||
values.append(doc.get(field, ""))
|
||||
|
||||
doc_ids_to_delete.append(doc_id)
|
||||
values_to_insert.append(tuple(values))
|
||||
|
||||
# Delete existing rows for these doc_ids first using a single statement
|
||||
if doc_ids_to_delete:
|
||||
placeholders_for_delete = ",".join(["?" for _ in doc_ids_to_delete])
|
||||
delete_sql = f"DELETE FROM search_fts WHERE doc_id IN ({placeholders_for_delete})"
|
||||
cursor.execute(delete_sql, doc_ids_to_delete)
|
||||
|
||||
# Insert the chunk
|
||||
if values_to_insert:
|
||||
cursor.executemany(insert_sql, values_to_insert)
|
||||
|
|
|
|||
|
|
@ -481,6 +481,70 @@ class TestSQLiteSearchAPI(IntegrationTestCase):
|
|||
disabled_search.build_index() # Should not raise error but do nothing
|
||||
self.assertFalse(disabled_search.index_exists())
|
||||
|
||||
def test_deduplication_on_reindex(self):
    """Test that re-indexing the same document does not create duplicates.

    Exercises four paths that must each leave exactly one entry for the
    document: repeated ``index_doc`` calls, re-indexing after the content
    was changed, a full ``build_index`` rebuild, and a raw row count in
    the ``search_fts`` table.
    """
    self.search.build_index()

    # Create a test document
    test_note = frappe.get_doc(
        {
            "doctype": "Note",
            "title": "Deduplication Test Document",
            "content": "This document tests deduplication functionality",
        }
    )
    test_note.insert()

    def hits_for_note(query):
        """Return the search results for *query* that refer to the test note."""
        results = self.search.search(query)
        return [r for r in results["results"] if r["name"] == test_note.name]

    try:
        # Index the document
        self.search.index_doc("Note", test_note.name)

        # Search for the document - should find exactly one result
        initial_count = len(hits_for_note("Deduplication Test"))
        self.assertEqual(initial_count, 1, "Should find exactly one instance of the document")

        # Re-index the same document multiple times
        for _ in range(3):
            self.search.index_doc("Note", test_note.name)

        # Search again - should still find exactly one result
        final_count = len(hits_for_note("Deduplication Test"))
        self.assertEqual(final_count, 1, "Should still find exactly one instance after re-indexing")

        # Update the document content and re-index
        test_note.content = "Updated content for deduplication testing"
        test_note.save()
        self.search.index_doc("Note", test_note.name)

        # Search with updated content - should find exactly one result with new content
        updated_results = hits_for_note("Updated content deduplication")
        self.assertEqual(len(updated_results), 1, "Should find exactly one instance with updated content")
        # Content may contain HTML markup from search highlighting, so check for words individually
        self.assertIn("Updated", updated_results[0]["content"])
        self.assertIn("content", updated_results[0]["content"])

        # Rebuild entire index - should not create duplicates
        self.search.build_index()
        rebuild_count = len(hits_for_note("Deduplication Test"))
        self.assertEqual(rebuild_count, 1, "Should still find exactly one instance after full rebuild")

        # Verify at database level - check raw count in FTS table
        doc_id = f"Note:{test_note.name}"
        conn = sqlite3.connect(self.search.db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT COUNT(*) FROM search_fts WHERE doc_id = ?", (doc_id,))
            db_count = cursor.fetchone()[0]
        finally:
            # Close even when the query fails so the connection is not
            # left open on the index file.
            conn.close()
        self.assertEqual(db_count, 1, "Database should contain exactly one entry for the document")

    finally:
        test_note.delete()
|
||||
|
||||
@patch("frappe.enqueue")
|
||||
def test_background_operations(self, mock_enqueue):
|
||||
"""Test background job integration and module-level functions."""
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue