diff --git a/frappe/core/doctype/data_import/data_import.json b/frappe/core/doctype/data_import/data_import.json index 6325e139fd..2c33bb7b98 100644 --- a/frappe/core/doctype/data_import/data_import.json +++ b/frappe/core/doctype/data_import/data_import.json @@ -16,6 +16,8 @@ "google_sheets_url", "refresh_google_sheet", "column_break_5", + "custom_delimiters", + "delimiter_options", "status", "submit_after_import", "mute_emails", @@ -167,11 +169,25 @@ "hidden": 1, "label": "Payload Count", "read_only": 1 + }, + { + "default": ",;\\t|", + "depends_on": "custom_delimiters", + "description": "If your CSV uses a different delimiter, add that character here, ensuring no spaces or additional characters are included.", + "fieldname": "delimiter_options", + "fieldtype": "Data", + "label": "Delimiter Options" + }, + { + "default": "0", + "fieldname": "custom_delimiters", + "fieldtype": "Check", + "label": "Custom Delimiters" } ], "hide_toolbar": 1, "links": [], - "modified": "2024-03-23 16:02:16.953820", + "modified": "2024-04-27 20:42:35.843158", "modified_by": "Administrator", "module": "Core", "name": "Data Import", @@ -195,4 +211,4 @@ "sort_order": "DESC", "states": [], "track_changes": 1 -} \ No newline at end of file +} diff --git a/frappe/core/doctype/data_import/data_import.py b/frappe/core/doctype/data_import/data_import.py index 7a0397bba6..cfecb3fbda 100644 --- a/frappe/core/doctype/data_import/data_import.py +++ b/frappe/core/doctype/data_import/data_import.py @@ -27,6 +27,8 @@ class DataImport(Document): if TYPE_CHECKING: from frappe.types import DF + custom_delimiters: DF.Check + delimiter_options: DF.Data | None google_sheets_url: DF.Data | None import_file: DF.Attach | None import_type: DF.Literal["", "Insert New Records", "Update Existing Records"] @@ -50,11 +52,16 @@ class DataImport(Document): self.template_options = "" self.template_warnings = "" + self.set_delimiters_flag() self.validate_doctype() self.validate_import_file() self.validate_google_sheets_url() self.set_payload_count() + def set_delimiters_flag(self): + if self.import_file: + frappe.flags.delimiter_options = self.delimiter_options or "," + def validate_doctype(self): if self.reference_doctype in BLOCKED_DOCTYPES: frappe.throw(_("Importing {0} is not allowed.").format(self.reference_doctype)) @@ -79,6 +86,7 @@ class DataImport(Document): def get_preview_from_template(self, import_file=None, google_sheets_url=None): if import_file: self.import_file = import_file + self.set_delimiters_flag() if google_sheets_url: self.google_sheets_url = google_sheets_url diff --git a/frappe/core/doctype/data_import/fixtures/sample_import_file_semicolon.csv b/frappe/core/doctype/data_import/fixtures/sample_import_file_semicolon.csv new file mode 100644 index 0000000000..0696815278 --- /dev/null +++ b/frappe/core/doctype/data_import/fixtures/sample_import_file_semicolon.csv @@ -0,0 +1,5 @@ +Title ;Description ;Number ;another_number ;ID (Table Field 1) ;Child Title (Table Field 1) ;Child Description (Table Field 1) ;Child 2 Title (Table Field 2) ;Child 2 Date (Table Field 2) ;Child 2 Number (Table Field 2) ;Child Title (Table Field 1 Again) ;Child Date (Table Field 1 Again) ;Child Number (Table Field 1 Again) ;table_field_1_again.child_another_number +Test 5 ;test description ;1 ;2 ;"" ; ;"child description with ,comma and" ;child title ;14-08-2019 ;4 ;child title again ;22-09-2020 ;5 ; 7 + ; ; ; ; ;child title 2 ;child description 2 ;title child ;30-10-2019 ;5 ; ;22-09-2021 ; ; + ;test description 2 ;1 ;2 ; ;child mandatory title ; ;title child man ; ; ;child mandatory again ; ; ; +Test 4 ;test description 3 ;4 ;5 ;"" ;child title asdf ;child description asdf ;child title asdf adsf ;15-08-2019 ;6 ;child title again asdf ;22-09-2022 ;9 ; 71 diff --git a/frappe/core/doctype/data_import/importer.py b/frappe/core/doctype/data_import/importer.py index a177d85c66..0db3d77e7d 100644 --- a/frappe/core/doctype/data_import/importer.py +++ b/frappe/core/doctype/data_import/importer.py @@ -1012,7 +1012,13 @@ class Column: ) elif self.df.fieldtype in ("Date", "Time", "Datetime"): # guess date/time format + # TODO: add possibility for user, to define the date format explicitly in the Data Import UI + # for example, if date column in file is in %d-%m-%y format -> 23-04-24. + # The date guesser might fail, as, this can be also parsed as %y-%m-%d, as both 23 and 24 are valid for year & for day + # This is an issue that cannot be handled automatically, no matter how we try, as it completely depends on the user's input. + # Defining an explicit value which surely recognizes self.date_format = self.guess_date_format_for_column() + if not self.date_format: if self.df.fieldtype == "Time": self.date_format = "%H:%M:%S" diff --git a/frappe/core/doctype/data_import/test_importer.py b/frappe/core/doctype/data_import/test_importer.py index 965e34c3e6..8e7ae548ab 100644 --- a/frappe/core/doctype/data_import/test_importer.py +++ b/frappe/core/doctype/data_import/test_importer.py @@ -50,6 +50,25 @@ class TestImporter(FrappeTestCase): self.assertEqual(doc3.another_number, 5) self.assertEqual(format_duration(doc3.duration), "5d 5h 45m") + def test_data_validation_semicolon_success(self): + import_file = get_import_file("sample_import_file_semicolon") + data_import = self.get_importer(doctype_name, import_file, update=True) + + doc = data_import.get_preview_from_template().get("data", [{}]) + + self.assertEqual(doc[0][7], "child description with ,comma and") + # Column count should be 14 (+1 ID) + self.assertEqual(len(doc[0]), 15) + + def test_data_validation_semicolon_failure(self): + import_file = get_import_file("sample_import_file_semicolon") + + data_import = self.get_importer_semicolon(doctype_name, import_file) + doc = data_import.get_preview_from_template().get("data", [{}]) + # if semicolon delimiter detection fails, and falls back to comma, + # column number will be less than 15 -> 2 (+1 id) + self.assertLessEqual(len(doc[0]), 15) + def test_data_import_preview(self): import_file = get_import_file("sample_import_file") data_import = self.get_importer(doctype_name, import_file) @@ -138,6 +157,18 @@ class TestImporter(FrappeTestCase): return data_import + def get_importer_semicolon(self, doctype, import_file, update=False): + data_import = frappe.new_doc("Data Import") + data_import.import_type = "Insert New Records" if not update else "Update Existing Records" + data_import.reference_doctype = doctype + data_import.import_file = import_file.file_url + # deliberately overwrite default delimiter options here, causing to fail when parsing `;` + data_import.delimiter_options = "," + data_import.insert() + frappe.db.commit() # nosemgrep + + return data_import + def create_doctype_if_not_exists(doctype_name, force=False): if force: diff --git a/frappe/core/doctype/data_import_log/data_import_log.json b/frappe/core/doctype/data_import_log/data_import_log.json index c184df193d..59f1bb1983 100644 --- a/frappe/core/doctype/data_import_log/data_import_log.json +++ b/frappe/core/doctype/data_import_log/data_import_log.json @@ -82,4 +82,4 @@ "sort_field": "creation", "sort_order": "DESC", "states": [] -} \ No newline at end of file +} diff --git a/frappe/core/doctype/file/file.py b/frappe/core/doctype/file/file.py index 6e03909c1c..449fc572f4 100755 --- a/frappe/core/doctype/file/file.py +++ b/frappe/core/doctype/file/file.py @@ -31,6 +31,7 @@ from .utils import * exclude_from_linked_with = True ImageFile.LOAD_TRUNCATED_IMAGES = True URL_PREFIXES = ("http://", "https://") +FILE_ENCODING_OPTIONS = ("utf-8-sig", "utf-8", "windows-1250", "windows-1252") class File(Document): @@ -515,10 +516,11 @@ class File(Document): def exists_on_disk(self): return os.path.exists(self.get_full_path()) - def get_content(self) -> bytes: + def get_content(self, encodings=None) -> bytes | str: if self.is_folder: frappe.throw(_("Cannot get file contents of a Folder")) + # if doc was just created, content field is already populated, return it as-is if self.get("content"): self._content = self.content if self.decode: @@ -531,15 +533,20 @@ class File(Document): self.validate_file_url() file_path = self.get_full_path() - # read the file + if encodings is None: + encodings = FILE_ENCODING_OPTIONS with open(file_path, mode="rb") as f: self._content = f.read() - try: - # for plain text files - self._content = self._content.decode() - except UnicodeDecodeError: - # for .png, .jpg, etc - pass + # looping will not result in slowdown, as the content is usually utf-8 or utf-8-sig + # encoded so the first iteration will be enough most of the time + for encoding in encodings: + try: + # read file with proper encoding + self._content = self._content.decode(encoding) + break + except UnicodeDecodeError: + # for .png, .jpg, etc + continue return self._content diff --git a/frappe/core/doctype/file/test_file.py b/frappe/core/doctype/file/test_file.py index ecd431436f..6c9b9f5872 100644 --- a/frappe/core/doctype/file/test_file.py +++ b/frappe/core/doctype/file/test_file.py @@ -1,7 +1,6 @@ # Copyright (c) 2022, Frappe Technologies Pvt. Ltd. and Contributors # License: MIT. See LICENSE import base64 -import json import os import shutil import tempfile @@ -111,7 +110,7 @@ class TestBase64File(FrappeTestCase): def setUp(self): self.attached_to_doctype, self.attached_to_docname = make_test_doc() self.test_content = base64.b64encode(test_content1.encode("utf-8")) - _file: "File" = frappe.get_doc( + _file: frappe.Document = frappe.get_doc( { "doctype": "File", "file_name": "test_base64.txt", @@ -125,7 +124,7 @@ class TestBase64File(FrappeTestCase): self.saved_file_url = _file.file_url def test_saved_content(self): - _file = frappe.get_doc("File", {"file_url": self.saved_file_url}) + _file: frappe.Document = frappe.get_doc("File", {"file_url": self.saved_file_url}) content = _file.get_content() self.assertEqual(content, test_content1) @@ -255,6 +254,25 @@ class TestSameContent(FrappeTestCase): limit_property.delete() frappe.clear_cache(doctype="ToDo") + def test_utf8_bom_content_decoding(self): + utf8_bom_content = test_content1.encode("utf-8-sig") + _file: frappe.Document = frappe.get_doc( + { + "doctype": "File", + "file_name": "utf8bom.txt", + "attached_to_doctype": self.attached_to_doctype1, + "attached_to_name": self.attached_to_docname1, + "content": utf8_bom_content, + "decode": False, + } + ) + _file.save() + saved_file = frappe.get_doc("File", _file.name) + file_content_decoded = saved_file.get_content(encodings=["utf-8"]) + self.assertEqual(file_content_decoded[0], "\ufeff") + file_content_properly_decoded = saved_file.get_content(encodings=["utf-8-sig", "utf-8"]) + self.assertEqual(file_content_properly_decoded, test_content1) + class TestFile(FrappeTestCase): def setUp(self): diff --git a/frappe/utils/csvutils.py b/frappe/utils/csvutils.py index 8a31a94326..fd3a20b8ba 100644 --- a/frappe/utils/csvutils.py +++ b/frappe/utils/csvutils.py @@ -2,12 +2,14 @@ # License: MIT. See LICENSE import csv import json +from csv import Sniffer from io import StringIO import requests import frappe from frappe import _, msgprint +from frappe.core.doctype.file.file import FILE_ENCODING_OPTIONS from frappe.utils import cint, comma_or, cstr, flt @@ -39,7 +41,7 @@ def read_csv_content_from_attached_file(doc): def read_csv_content(fcontent): if not isinstance(fcontent, str): decoded = False - for encoding in ["utf-8", "windows-1250", "windows-1252"]: + for encoding in FILE_ENCODING_OPTIONS: try: fcontent = str(fcontent, encoding) decoded = True @@ -49,15 +51,35 @@ def read_csv_content(fcontent): if not decoded: frappe.msgprint( - _("Unknown file encoding. Tried utf-8, windows-1250, windows-1252."), raise_exception=True + _("Unknown file encoding. Tried to use: {0}").format(", ".join(FILE_ENCODING_OPTIONS)), + raise_exception=True, ) fcontent = fcontent.encode("utf-8") content = [frappe.safe_decode(line) for line in fcontent.splitlines(True)] + sniffer = Sniffer() + # Don't need to use whole csv, if more than 20 rows, use just first 20 + sample_content = content[:20] if len(content) > 20 else content + # only testing for most common delimiter types, this later can be extended + # init default dialect, to avoid lint errors + dialect = csv.get_dialect("excel") + try: + # csv by default uses excel dialect, which is not always correct + dialect = sniffer.sniff(sample="\n".join(sample_content), delimiters=frappe.flags.delimiter_options) + except csv.Error: + # if sniff fails, show alert on user interface. Fall back to use default dialect (excel) + frappe.msgprint( + _( + "Delimiter detection failed. Try to enable custom delimiters and adjust the delimiter options as per your data." + ), + indicator="orange", + alert=True, + ) + try: rows = [] - for row in csv.reader(content): + for row in csv.reader(content, dialect=dialect): r = [] for val in row: # decode everything