Merge pull request #26183 from vmatt/data_import_delimiter

feat(Data Import): custom csv delimiters, UTF-8 BOM handling
2024-05-15 16:45:31 +05:30 · 2024-05-15 16:45:31 +05:30 · 3a4dd9a671
commit 3a4dd9a671
parent 5cb9f04bca 9986ea8171
9 changed files with 130 additions and 17 deletions
--- a/frappe/core/doctype/data_import/data_import.json
+++ b/frappe/core/doctype/data_import/data_import.json
@ -16,6 +16,8 @@
  "google_sheets_url",
  "refresh_google_sheet",
  "column_break_5",
+  "custom_delimiters",
+  "delimiter_options",
  "status",
  "submit_after_import",
  "mute_emails",
@ -167,11 +169,25 @@
   "hidden": 1,
   "label": "Payload Count",
   "read_only": 1
+  },
+  {
+   "default": ",;\\t|",
+   "depends_on": "custom_delimiters",
+   "description": "If your CSV uses a different delimiter, add that character here, ensuring no spaces or additional characters are included.",
+   "fieldname": "delimiter_options",
+   "fieldtype": "Data",
+   "label": "Delimiter Options"
+  },
+  {
+   "default": "0",
+   "fieldname": "custom_delimiters",
+   "fieldtype": "Check",
+   "label": "Custom Delimiters"
  }
 ],
 "hide_toolbar": 1,
 "links": [],
- "modified": "2024-03-23 16:02:16.953820",
+ "modified": "2024-04-27 20:42:35.843158",
 "modified_by": "Administrator",
 "module": "Core",
 "name": "Data Import",
@ -195,4 +211,4 @@
 "sort_order": "DESC",
 "states": [],
 "track_changes": 1
-}
+}
--- a/frappe/core/doctype/data_import/data_import.py
+++ b/frappe/core/doctype/data_import/data_import.py
@ -27,6 +27,8 @@ class DataImport(Document):
 	if TYPE_CHECKING:
 		from frappe.types import DF

+		custom_delimiters: DF.Check
+		delimiter_options: DF.Data | None
 		google_sheets_url: DF.Data | None
 		import_file: DF.Attach | None
 		import_type: DF.Literal["", "Insert New Records", "Update Existing Records"]
@ -50,11 +52,16 @@ class DataImport(Document):
 			self.template_options = ""
 			self.template_warnings = ""

+		self.set_delimiters_flag()
 		self.validate_doctype()
 		self.validate_import_file()
 		self.validate_google_sheets_url()
 		self.set_payload_count()

+	def set_delimiters_flag(self):
+		if self.import_file:
+			frappe.flags.delimiter_options = self.delimiter_options or ","
+
 	def validate_doctype(self):
 		if self.reference_doctype in BLOCKED_DOCTYPES:
 			frappe.throw(_("Importing {0} is not allowed.").format(self.reference_doctype))
@ -79,6 +86,7 @@ class DataImport(Document):
 	def get_preview_from_template(self, import_file=None, google_sheets_url=None):
 		if import_file:
 			self.import_file = import_file
+			self.set_delimiters_flag()

 		if google_sheets_url:
 			self.google_sheets_url = google_sheets_url
--- a/frappe/core/doctype/data_import/fixtures/sample_import_file_semicolon.csv
+++ b/frappe/core/doctype/data_import/fixtures/sample_import_file_semicolon.csv
@ -0,0 +1,5 @@
+Title  ;Description        ;Number ;another_number ;ID (Table Field 1) ;Child Title (Table Field 1) ;Child Description (Table Field 1) ;Child 2 Title (Table Field 2) ;Child 2 Date (Table Field 2) ;Child 2 Number (Table Field 2) ;Child Title (Table Field 1 Again) ;Child Date (Table Field 1 Again) ;Child Number (Table Field 1 Again) ;table_field_1_again.child_another_number
+Test 5  ;test description   ;1      ;2              ;""                 ;                            ;"child description with ,comma and"                 ;child title                   ;14-08-2019                   ;4                              ;child title again                 ;22-09-2020                       ;5                                  ; 7
+       ;                   ;       ;               ;                   ;child title 2               ;child description 2               ;title child                   ;30-10-2019                   ;5                              ;                                  ;22-09-2021                       ;                                   ;
+       ;test description 2 ;1      ;2              ;                   ;child mandatory title       ;                                  ;title child man               ;                             ;                               ;child mandatory again             ;                                 ;                                   ;
+Test 4 ;test description 3 ;4      ;5              ;""                 ;child title asdf            ;child description asdf            ;child title asdf adsf         ;15-08-2019                   ;6                              ;child title again asdf            ;22-09-2022                       ;9                                  ; 71
--- a/frappe/core/doctype/data_import/importer.py
+++ b/frappe/core/doctype/data_import/importer.py
@ -1012,7 +1012,13 @@ class Column:
 				)
 		elif self.df.fieldtype in ("Date", "Time", "Datetime"):
 			# guess date/time format
+			# TODO: add possibility for user, to define the date format explicitly in the Data Import UI
+			# for example, if date column in file is in  %d-%m-%y  format -> 23-04-24.
+			# The date guesser might fail, as, this can be also parsed as %y-%m-%d, as both 23 and 24 are valid for year & for day
+			# This is an issue that cannot be handled automatically, no matter how we try, as it completely depends on the user's input.
+			# Defining an explicit value which surely recognizes
 			self.date_format = self.guess_date_format_for_column()
+
 			if not self.date_format:
 				if self.df.fieldtype == "Time":
 					self.date_format = "%H:%M:%S"
--- a/frappe/core/doctype/data_import/test_importer.py
+++ b/frappe/core/doctype/data_import/test_importer.py
@ -50,6 +50,25 @@ class TestImporter(FrappeTestCase):
 		self.assertEqual(doc3.another_number, 5)
 		self.assertEqual(format_duration(doc3.duration), "5d 5h 45m")

+	def test_data_validation_semicolon_success(self):
+		import_file = get_import_file("sample_import_file_semicolon")
+		data_import = self.get_importer(doctype_name, import_file, update=True)
+
+		doc = data_import.get_preview_from_template().get("data", [{}])
+
+		self.assertEqual(doc[0][7], "child description with ,comma and")
+		# Column count should be 14 (+1 ID)
+		self.assertEqual(len(doc[0]), 15)
+
+	def test_data_validation_semicolon_failure(self):
+		import_file = get_import_file("sample_import_file_semicolon")
+
+		data_import = self.get_importer_semicolon(doctype_name, import_file)
+		doc = data_import.get_preview_from_template().get("data", [{}])
+		# if semicolon delimiter detection fails, and falls back to comma,
+		# column number will be less than 15 -> 2 (+1 id)
+		self.assertLessEqual(len(doc[0]), 15)
+
 	def test_data_import_preview(self):
 		import_file = get_import_file("sample_import_file")
 		data_import = self.get_importer(doctype_name, import_file)
@ -138,6 +157,18 @@ class TestImporter(FrappeTestCase):

 		return data_import

+	def get_importer_semicolon(self, doctype, import_file, update=False):
+		data_import = frappe.new_doc("Data Import")
+		data_import.import_type = "Insert New Records" if not update else "Update Existing Records"
+		data_import.reference_doctype = doctype
+		data_import.import_file = import_file.file_url
+		# deliberately overwrite default delimiter options here, causing to fail when parsing `;`
+		data_import.delimiter_options = ","
+		data_import.insert()
+		frappe.db.commit()  # nosemgrep
+
+		return data_import
+

 def create_doctype_if_not_exists(doctype_name, force=False):
 	if force:
--- a/frappe/core/doctype/data_import_log/data_import_log.json
+++ b/frappe/core/doctype/data_import_log/data_import_log.json
@ -82,4 +82,4 @@
 "sort_field": "creation",
 "sort_order": "DESC",
 "states": []
-}
+}
--- a/frappe/core/doctype/file/file.py
+++ b/frappe/core/doctype/file/file.py
@ -31,6 +31,7 @@ from .utils import *
 exclude_from_linked_with = True
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 URL_PREFIXES = ("http://", "https://")
+FILE_ENCODING_OPTIONS = ("utf-8-sig", "utf-8", "windows-1250", "windows-1252")


 class File(Document):
@ -515,10 +516,11 @@ class File(Document):
 	def exists_on_disk(self):
 		return os.path.exists(self.get_full_path())

-	def get_content(self) -> bytes:
+	def get_content(self, encodings=None) -> bytes | str:
 		if self.is_folder:
 			frappe.throw(_("Cannot get file contents of a Folder"))

+		# if doc was just created, content field is already populated, return it as-is
 		if self.get("content"):
 			self._content = self.content
 			if self.decode:
@ -531,15 +533,20 @@ class File(Document):
 			self.validate_file_url()
 		file_path = self.get_full_path()

-		# read the file
+		if encodings is None:
+			encodings = FILE_ENCODING_OPTIONS
 		with open(file_path, mode="rb") as f:
 			self._content = f.read()
-			try:
-				# for plain text files
-				self._content = self._content.decode()
-			except UnicodeDecodeError:
-				# for .png, .jpg, etc
-				pass
+			# looping will not result in slowdown, as the content is usually utf-8 or utf-8-sig
+			# encoded so the first iteration will be enough most of the time
+			for encoding in encodings:
+				try:
+					# read file with proper encoding
+					self._content = self._content.decode(encoding)
+					break
+				except UnicodeDecodeError:
+					# for .png, .jpg, etc
+					continue

 		return self._content

--- a/frappe/core/doctype/file/test_file.py
+++ b/frappe/core/doctype/file/test_file.py
@ -1,7 +1,6 @@
 # Copyright (c) 2022, Frappe Technologies Pvt. Ltd. and Contributors
 # License: MIT. See LICENSE
 import base64
-import json
 import os
 import shutil
 import tempfile
@ -111,7 +110,7 @@ class TestBase64File(FrappeTestCase):
 	def setUp(self):
 		self.attached_to_doctype, self.attached_to_docname = make_test_doc()
 		self.test_content = base64.b64encode(test_content1.encode("utf-8"))
-		_file: "File" = frappe.get_doc(
+		_file: frappe.Document = frappe.get_doc(
 			{
 				"doctype": "File",
 				"file_name": "test_base64.txt",
@ -125,7 +124,7 @@ class TestBase64File(FrappeTestCase):
 		self.saved_file_url = _file.file_url

 	def test_saved_content(self):
-		_file = frappe.get_doc("File", {"file_url": self.saved_file_url})
+		_file: frappe.Document = frappe.get_doc("File", {"file_url": self.saved_file_url})
 		content = _file.get_content()
 		self.assertEqual(content, test_content1)

@ -255,6 +254,25 @@ class TestSameContent(FrappeTestCase):
 		limit_property.delete()
 		frappe.clear_cache(doctype="ToDo")

+	def test_utf8_bom_content_decoding(self):
+		utf8_bom_content = test_content1.encode("utf-8-sig")
+		_file: frappe.Document = frappe.get_doc(
+			{
+				"doctype": "File",
+				"file_name": "utf8bom.txt",
+				"attached_to_doctype": self.attached_to_doctype1,
+				"attached_to_name": self.attached_to_docname1,
+				"content": utf8_bom_content,
+				"decode": False,
+			}
+		)
+		_file.save()
+		saved_file = frappe.get_doc("File", _file.name)
+		file_content_decoded = saved_file.get_content(encodings=["utf-8"])
+		self.assertEqual(file_content_decoded[0], "\ufeff")
+		file_content_properly_decoded = saved_file.get_content(encodings=["utf-8-sig", "utf-8"])
+		self.assertEqual(file_content_properly_decoded, test_content1)
+

 class TestFile(FrappeTestCase):
 	def setUp(self):
--- a/frappe/utils/csvutils.py
+++ b/frappe/utils/csvutils.py
@ -2,12 +2,14 @@
 # License: MIT. See LICENSE
 import csv
 import json
+from csv import Sniffer
 from io import StringIO

 import requests

 import frappe
 from frappe import _, msgprint
+from frappe.core.doctype.file.file import FILE_ENCODING_OPTIONS
 from frappe.utils import cint, comma_or, cstr, flt


@ -39,7 +41,7 @@ def read_csv_content_from_attached_file(doc):
 def read_csv_content(fcontent):
 	if not isinstance(fcontent, str):
 		decoded = False
-		for encoding in ["utf-8", "windows-1250", "windows-1252"]:
+		for encoding in FILE_ENCODING_OPTIONS:
 			try:
 				fcontent = str(fcontent, encoding)
 				decoded = True
@ -49,15 +51,35 @@ def read_csv_content(fcontent):

 		if not decoded:
 			frappe.msgprint(
-				_("Unknown file encoding. Tried utf-8, windows-1250, windows-1252."), raise_exception=True
+				_("Unknown file encoding. Tried to use: {0}").format(", ".join(FILE_ENCODING_OPTIONS)),
+				raise_exception=True,
 			)

 	fcontent = fcontent.encode("utf-8")
 	content = [frappe.safe_decode(line) for line in fcontent.splitlines(True)]

+	sniffer = Sniffer()
+	# Don't need to use whole csv, if more than 20 rows, use just first 20
+	sample_content = content[:20] if len(content) > 20 else content
+	# only testing for most common delimiter types, this later can be extended
+	# init default dialect, to avoid lint errors
+	dialect = csv.get_dialect("excel")
+	try:
+		# csv by default uses excel dialect, which is not always correct
+		dialect = sniffer.sniff(sample="\n".join(sample_content), delimiters=frappe.flags.delimiter_options)
+	except csv.Error:
+		# if sniff fails, show alert on user interface. Fall back to use default dialect (excel)
+		frappe.msgprint(
+			_(
+				"Delimiter detection failed. Try to enable custom delimiters and adjust the delimiter options as per your data."
+			),
+			indicator="orange",
+			alert=True,
+		)
+
 	try:
 		rows = []
-		for row in csv.reader(content):
+		for row in csv.reader(content, dialect=dialect):
 			r = []
 			for val in row:
 				# decode everything