Merge pull request #26183 from vmatt/data_import_delimiter
feat(Data Import): custom csv delimiters, UTF-8 BOM handling
This commit is contained in:
commit
3a4dd9a671
9 changed files with 130 additions and 17 deletions
|
|
@ -16,6 +16,8 @@
|
|||
"google_sheets_url",
|
||||
"refresh_google_sheet",
|
||||
"column_break_5",
|
||||
"custom_delimiters",
|
||||
"delimiter_options",
|
||||
"status",
|
||||
"submit_after_import",
|
||||
"mute_emails",
|
||||
|
|
@ -167,11 +169,25 @@
|
|||
"hidden": 1,
|
||||
"label": "Payload Count",
|
||||
"read_only": 1
|
||||
},
|
||||
{
|
||||
"default": ",;\\t|",
|
||||
"depends_on": "custom_delimiters",
|
||||
"description": "If your CSV uses a different delimiter, add that character here, ensuring no spaces or additional characters are included.",
|
||||
"fieldname": "delimiter_options",
|
||||
"fieldtype": "Data",
|
||||
"label": "Delimiter Options"
|
||||
},
|
||||
{
|
||||
"default": "0",
|
||||
"fieldname": "custom_delimiters",
|
||||
"fieldtype": "Check",
|
||||
"label": "Custom Delimiters"
|
||||
}
|
||||
],
|
||||
"hide_toolbar": 1,
|
||||
"links": [],
|
||||
"modified": "2024-03-23 16:02:16.953820",
|
||||
"modified": "2024-04-27 20:42:35.843158",
|
||||
"modified_by": "Administrator",
|
||||
"module": "Core",
|
||||
"name": "Data Import",
|
||||
|
|
@ -195,4 +211,4 @@
|
|||
"sort_order": "DESC",
|
||||
"states": [],
|
||||
"track_changes": 1
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,6 +27,8 @@ class DataImport(Document):
|
|||
if TYPE_CHECKING:
|
||||
from frappe.types import DF
|
||||
|
||||
custom_delimiters: DF.Check
|
||||
delimiter_options: DF.Data | None
|
||||
google_sheets_url: DF.Data | None
|
||||
import_file: DF.Attach | None
|
||||
import_type: DF.Literal["", "Insert New Records", "Update Existing Records"]
|
||||
|
|
@ -50,11 +52,16 @@ class DataImport(Document):
|
|||
self.template_options = ""
|
||||
self.template_warnings = ""
|
||||
|
||||
self.set_delimiters_flag()
|
||||
self.validate_doctype()
|
||||
self.validate_import_file()
|
||||
self.validate_google_sheets_url()
|
||||
self.set_payload_count()
|
||||
|
||||
def set_delimiters_flag(self):
|
||||
if self.import_file:
|
||||
frappe.flags.delimiter_options = self.delimiter_options or ","
|
||||
|
||||
def validate_doctype(self):
|
||||
if self.reference_doctype in BLOCKED_DOCTYPES:
|
||||
frappe.throw(_("Importing {0} is not allowed.").format(self.reference_doctype))
|
||||
|
|
@ -79,6 +86,7 @@ class DataImport(Document):
|
|||
def get_preview_from_template(self, import_file=None, google_sheets_url=None):
|
||||
if import_file:
|
||||
self.import_file = import_file
|
||||
self.set_delimiters_flag()
|
||||
|
||||
if google_sheets_url:
|
||||
self.google_sheets_url = google_sheets_url
|
||||
|
|
|
|||
|
|
@ -0,0 +1,5 @@
|
|||
Title ;Description ;Number ;another_number ;ID (Table Field 1) ;Child Title (Table Field 1) ;Child Description (Table Field 1) ;Child 2 Title (Table Field 2) ;Child 2 Date (Table Field 2) ;Child 2 Number (Table Field 2) ;Child Title (Table Field 1 Again) ;Child Date (Table Field 1 Again) ;Child Number (Table Field 1 Again) ;table_field_1_again.child_another_number
|
||||
Test 5 ;test description ;1 ;2 ;"" ; ;"child description with ,comma and" ;child title ;14-08-2019 ;4 ;child title again ;22-09-2020 ;5 ; 7
|
||||
; ; ; ; ;child title 2 ;child description 2 ;title child ;30-10-2019 ;5 ; ;22-09-2021 ; ;
|
||||
;test description 2 ;1 ;2 ; ;child mandatory title ; ;title child man ; ; ;child mandatory again ; ; ;
|
||||
Test 4 ;test description 3 ;4 ;5 ;"" ;child title asdf ;child description asdf ;child title asdf adsf ;15-08-2019 ;6 ;child title again asdf ;22-09-2022 ;9 ; 71
|
||||
|
Can't render this file because it contains an unexpected character in line 2 and column 55.
|
|
|
@ -1012,7 +1012,13 @@ class Column:
|
|||
)
|
||||
elif self.df.fieldtype in ("Date", "Time", "Datetime"):
|
||||
# guess date/time format
|
||||
# TODO: add possibility for user, to define the date format explicitly in the Data Import UI
|
||||
# for example, if date column in file is in %d-%m-%y format -> 23-04-24.
|
||||
# The date guesser might fail, as, this can be also parsed as %y-%m-%d, as both 23 and 24 are valid for year & for day
|
||||
# This is an issue that cannot be handled automatically, no matter how we try, as it completely depends on the user's input.
|
||||
# Defining an explicit value which surely recognizes
|
||||
self.date_format = self.guess_date_format_for_column()
|
||||
|
||||
if not self.date_format:
|
||||
if self.df.fieldtype == "Time":
|
||||
self.date_format = "%H:%M:%S"
|
||||
|
|
|
|||
|
|
@ -50,6 +50,25 @@ class TestImporter(FrappeTestCase):
|
|||
self.assertEqual(doc3.another_number, 5)
|
||||
self.assertEqual(format_duration(doc3.duration), "5d 5h 45m")
|
||||
|
||||
def test_data_validation_semicolon_success(self):
|
||||
import_file = get_import_file("sample_import_file_semicolon")
|
||||
data_import = self.get_importer(doctype_name, import_file, update=True)
|
||||
|
||||
doc = data_import.get_preview_from_template().get("data", [{}])
|
||||
|
||||
self.assertEqual(doc[0][7], "child description with ,comma and")
|
||||
# Column count should be 14 (+1 ID)
|
||||
self.assertEqual(len(doc[0]), 15)
|
||||
|
||||
def test_data_validation_semicolon_failure(self):
|
||||
import_file = get_import_file("sample_import_file_semicolon")
|
||||
|
||||
data_import = self.get_importer_semicolon(doctype_name, import_file)
|
||||
doc = data_import.get_preview_from_template().get("data", [{}])
|
||||
# if semicolon delimiter detection fails, and falls back to comma,
|
||||
# column number will be less than 15 -> 2 (+1 id)
|
||||
self.assertLessEqual(len(doc[0]), 15)
|
||||
|
||||
def test_data_import_preview(self):
|
||||
import_file = get_import_file("sample_import_file")
|
||||
data_import = self.get_importer(doctype_name, import_file)
|
||||
|
|
@ -138,6 +157,18 @@ class TestImporter(FrappeTestCase):
|
|||
|
||||
return data_import
|
||||
|
||||
def get_importer_semicolon(self, doctype, import_file, update=False):
|
||||
data_import = frappe.new_doc("Data Import")
|
||||
data_import.import_type = "Insert New Records" if not update else "Update Existing Records"
|
||||
data_import.reference_doctype = doctype
|
||||
data_import.import_file = import_file.file_url
|
||||
# deliberately overwrite default delimiter options here, causing to fail when parsing `;`
|
||||
data_import.delimiter_options = ","
|
||||
data_import.insert()
|
||||
frappe.db.commit() # nosemgrep
|
||||
|
||||
return data_import
|
||||
|
||||
|
||||
def create_doctype_if_not_exists(doctype_name, force=False):
|
||||
if force:
|
||||
|
|
|
|||
|
|
@ -82,4 +82,4 @@
|
|||
"sort_field": "creation",
|
||||
"sort_order": "DESC",
|
||||
"states": []
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ from .utils import *
|
|||
exclude_from_linked_with = True
|
||||
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
||||
URL_PREFIXES = ("http://", "https://")
|
||||
FILE_ENCODING_OPTIONS = ("utf-8-sig", "utf-8", "windows-1250", "windows-1252")
|
||||
|
||||
|
||||
class File(Document):
|
||||
|
|
@ -515,10 +516,11 @@ class File(Document):
|
|||
def exists_on_disk(self):
|
||||
return os.path.exists(self.get_full_path())
|
||||
|
||||
def get_content(self) -> bytes:
|
||||
def get_content(self, encodings=None) -> bytes | str:
|
||||
if self.is_folder:
|
||||
frappe.throw(_("Cannot get file contents of a Folder"))
|
||||
|
||||
# if doc was just created, content field is already populated, return it as-is
|
||||
if self.get("content"):
|
||||
self._content = self.content
|
||||
if self.decode:
|
||||
|
|
@ -531,15 +533,20 @@ class File(Document):
|
|||
self.validate_file_url()
|
||||
file_path = self.get_full_path()
|
||||
|
||||
# read the file
|
||||
if encodings is None:
|
||||
encodings = FILE_ENCODING_OPTIONS
|
||||
with open(file_path, mode="rb") as f:
|
||||
self._content = f.read()
|
||||
try:
|
||||
# for plain text files
|
||||
self._content = self._content.decode()
|
||||
except UnicodeDecodeError:
|
||||
# for .png, .jpg, etc
|
||||
pass
|
||||
# looping will not result in slowdown, as the content is usually utf-8 or utf-8-sig
|
||||
# encoded so the first iteration will be enough most of the time
|
||||
for encoding in encodings:
|
||||
try:
|
||||
# read file with proper encoding
|
||||
self._content = self._content.decode(encoding)
|
||||
break
|
||||
except UnicodeDecodeError:
|
||||
# for .png, .jpg, etc
|
||||
continue
|
||||
|
||||
return self._content
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# Copyright (c) 2022, Frappe Technologies Pvt. Ltd. and Contributors
|
||||
# License: MIT. See LICENSE
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
|
|
@ -111,7 +110,7 @@ class TestBase64File(FrappeTestCase):
|
|||
def setUp(self):
|
||||
self.attached_to_doctype, self.attached_to_docname = make_test_doc()
|
||||
self.test_content = base64.b64encode(test_content1.encode("utf-8"))
|
||||
_file: "File" = frappe.get_doc(
|
||||
_file: frappe.Document = frappe.get_doc(
|
||||
{
|
||||
"doctype": "File",
|
||||
"file_name": "test_base64.txt",
|
||||
|
|
@ -125,7 +124,7 @@ class TestBase64File(FrappeTestCase):
|
|||
self.saved_file_url = _file.file_url
|
||||
|
||||
def test_saved_content(self):
|
||||
_file = frappe.get_doc("File", {"file_url": self.saved_file_url})
|
||||
_file: frappe.Document = frappe.get_doc("File", {"file_url": self.saved_file_url})
|
||||
content = _file.get_content()
|
||||
self.assertEqual(content, test_content1)
|
||||
|
||||
|
|
@ -255,6 +254,25 @@ class TestSameContent(FrappeTestCase):
|
|||
limit_property.delete()
|
||||
frappe.clear_cache(doctype="ToDo")
|
||||
|
||||
def test_utf8_bom_content_decoding(self):
|
||||
utf8_bom_content = test_content1.encode("utf-8-sig")
|
||||
_file: frappe.Document = frappe.get_doc(
|
||||
{
|
||||
"doctype": "File",
|
||||
"file_name": "utf8bom.txt",
|
||||
"attached_to_doctype": self.attached_to_doctype1,
|
||||
"attached_to_name": self.attached_to_docname1,
|
||||
"content": utf8_bom_content,
|
||||
"decode": False,
|
||||
}
|
||||
)
|
||||
_file.save()
|
||||
saved_file = frappe.get_doc("File", _file.name)
|
||||
file_content_decoded = saved_file.get_content(encodings=["utf-8"])
|
||||
self.assertEqual(file_content_decoded[0], "\ufeff")
|
||||
file_content_properly_decoded = saved_file.get_content(encodings=["utf-8-sig", "utf-8"])
|
||||
self.assertEqual(file_content_properly_decoded, test_content1)
|
||||
|
||||
|
||||
class TestFile(FrappeTestCase):
|
||||
def setUp(self):
|
||||
|
|
|
|||
|
|
@ -2,12 +2,14 @@
|
|||
# License: MIT. See LICENSE
|
||||
import csv
|
||||
import json
|
||||
from csv import Sniffer
|
||||
from io import StringIO
|
||||
|
||||
import requests
|
||||
|
||||
import frappe
|
||||
from frappe import _, msgprint
|
||||
from frappe.core.doctype.file.file import FILE_ENCODING_OPTIONS
|
||||
from frappe.utils import cint, comma_or, cstr, flt
|
||||
|
||||
|
||||
|
|
@ -39,7 +41,7 @@ def read_csv_content_from_attached_file(doc):
|
|||
def read_csv_content(fcontent):
|
||||
if not isinstance(fcontent, str):
|
||||
decoded = False
|
||||
for encoding in ["utf-8", "windows-1250", "windows-1252"]:
|
||||
for encoding in FILE_ENCODING_OPTIONS:
|
||||
try:
|
||||
fcontent = str(fcontent, encoding)
|
||||
decoded = True
|
||||
|
|
@ -49,15 +51,35 @@ def read_csv_content(fcontent):
|
|||
|
||||
if not decoded:
|
||||
frappe.msgprint(
|
||||
_("Unknown file encoding. Tried utf-8, windows-1250, windows-1252."), raise_exception=True
|
||||
_("Unknown file encoding. Tried to use: {0}").format(", ".join(FILE_ENCODING_OPTIONS)),
|
||||
raise_exception=True,
|
||||
)
|
||||
|
||||
fcontent = fcontent.encode("utf-8")
|
||||
content = [frappe.safe_decode(line) for line in fcontent.splitlines(True)]
|
||||
|
||||
sniffer = Sniffer()
|
||||
# Don't need to use whole csv, if more than 20 rows, use just first 20
|
||||
sample_content = content[:20] if len(content) > 20 else content
|
||||
# only testing for most common delimiter types, this later can be extended
|
||||
# init default dialect, to avoid lint errors
|
||||
dialect = csv.get_dialect("excel")
|
||||
try:
|
||||
# csv by default uses excel dialect, which is not always correct
|
||||
dialect = sniffer.sniff(sample="\n".join(sample_content), delimiters=frappe.flags.delimiter_options)
|
||||
except csv.Error:
|
||||
# if sniff fails, show alert on user interface. Fall back to use default dialect (excel)
|
||||
frappe.msgprint(
|
||||
_(
|
||||
"Delimiter detection failed. Try to enable custom delimiters and adjust the delimiter options as per your data."
|
||||
),
|
||||
indicator="orange",
|
||||
alert=True,
|
||||
)
|
||||
|
||||
try:
|
||||
rows = []
|
||||
for row in csv.reader(content):
|
||||
for row in csv.reader(content, dialect=dialect):
|
||||
r = []
|
||||
for val in row:
|
||||
# decode everything
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue