Skip to content

Commit

Permalink
feat(Data Import): custom delimiters
Browse files Browse the repository at this point in the history
  • Loading branch information
Mate Laszlo Valko committed Apr 30, 2024
1 parent 71dfffa commit 45eabd3
Show file tree
Hide file tree
Showing 9 changed files with 138 additions and 17 deletions.
18 changes: 17 additions & 1 deletion frappe/core/doctype/data_import/data_import.json
Expand Up @@ -16,6 +16,8 @@
"google_sheets_url",
"refresh_google_sheet",
"column_break_5",
"custom_delimiters",
"delimiter_options",
"status",
"submit_after_import",
"mute_emails",
Expand Down Expand Up @@ -167,11 +169,25 @@
"hidden": 1,
"label": "Payload Count",
"read_only": 1
},
{
"default": ",;\\t|",
"depends_on": "custom_delimiters",
"description": "If your CSV uses a different delimiter, add that character here, ensuring no spaces or additional characters are included.",
"fieldname": "delimiter_options",
"fieldtype": "Data",
"label": "Delimiter options"
},
{
"default": "0",
"fieldname": "custom_delimiters",
"fieldtype": "Check",
"label": "Custom delimiters"
}
],
"hide_toolbar": 1,
"links": [],
"modified": "2024-03-23 16:02:16.953820",
"modified": "2024-04-27 20:42:35.843158",
"modified_by": "Administrator",
"module": "Core",
"name": "Data Import",
Expand Down
8 changes: 8 additions & 0 deletions frappe/core/doctype/data_import/data_import.py
Expand Up @@ -27,6 +27,8 @@ class DataImport(Document):
if TYPE_CHECKING:
from frappe.types import DF

custom_delimiters: DF.Check
delimiter_options: DF.Data | None
google_sheets_url: DF.Data | None
import_file: DF.Attach | None
import_type: DF.Literal["", "Insert New Records", "Update Existing Records"]
Expand All @@ -50,11 +52,16 @@ def validate(self):
self.template_options = ""
self.template_warnings = ""

self.set_delimiters_flag()
self.validate_doctype()
self.validate_import_file()
self.validate_google_sheets_url()
self.set_payload_count()

def set_delimiters_flag(self):
    """Publish this document's delimiter options via ``frappe.flags`` so the
    CSV reader (``frappe.utils.csvutils.read_csv_content``) can pick them up
    when sniffing the file's delimiter."""
    if not self.import_file:
        return
    frappe.flags.delimiter_options = self.delimiter_options

def validate_doctype(self):
    """Abort the import when the target doctype is on the blocklist."""
    is_blocked = self.reference_doctype in BLOCKED_DOCTYPES
    if is_blocked:
        frappe.throw(_("Importing {0} is not allowed.").format(self.reference_doctype))
Expand All @@ -79,6 +86,7 @@ def set_payload_count(self):
def get_preview_from_template(self, import_file=None, google_sheets_url=None):
if import_file:
self.import_file = import_file
self.set_delimiters_flag()

if google_sheets_url:
self.google_sheets_url = google_sheets_url
Expand Down
@@ -0,0 +1,5 @@
Title ;Description ;Number ;another_number ;ID (Table Field 1) ;Child Title (Table Field 1) ;Child Description (Table Field 1) ;Child 2 Title (Table Field 2) ;Child 2 Date (Table Field 2) ;Child 2 Number (Table Field 2) ;Child Title (Table Field 1 Again) ;Child Date (Table Field 1 Again) ;Child Number (Table Field 1 Again) ;table_field_1_again.child_another_number
Test 5 ;test description ;1 ;2 ;"" ; ;"child description with ,comma and" ;child title ;14-08-2019 ;4 ;child title again ;22-09-2020 ;5 ; 7
; ; ; ; ;child title 2 ;child description 2 ;title child ;30-10-2019 ;5 ; ;22-09-2021 ; ;
;test description 2 ;1 ;2 ; ;child mandatory title ; ;title child man ; ; ;child mandatory again ; ; ;
Test 4 ;test description 3 ;4 ;5 ;"" ;child title asdf ;child description asdf ;child title asdf adsf ;15-08-2019 ;6 ;child title again asdf ;22-09-2022 ;9 ; 71
6 changes: 6 additions & 0 deletions frappe/core/doctype/data_import/importer.py
Expand Up @@ -1012,7 +1012,13 @@ def validate_values(self):
)
elif self.df.fieldtype in ("Date", "Time", "Datetime"):
# guess date/time format
# TODO: let the user define the date format explicitly in the Data Import UI.
# For example, a date column value in %d-%m-%y format such as 23-04-24 is ambiguous:
# it can equally be parsed as %y-%m-%d, since both 23 and 24 are valid as a year or a day.
# No guesser can resolve this automatically — it depends entirely on the user's input,
# so an explicitly supplied format is the only reliable fix.
self.date_format = self.guess_date_format_for_column()

if not self.date_format:
if self.df.fieldtype == "Time":
self.date_format = "%H:%M:%S"
Expand Down
31 changes: 31 additions & 0 deletions frappe/core/doctype/data_import/test_importer.py
Expand Up @@ -50,6 +50,24 @@ def test_data_import_from_file(self):
self.assertEqual(doc3.another_number, 5)
self.assertEqual(format_duration(doc3.duration), "5d 5h 45m")

def test_data_validation_semicolon_success(self):
    """A semicolon-delimited file is fully parsed when ';' is among the
    default delimiter candidates."""
    import_file = get_import_file("sample_import_file_semicolon")
    importer = self.get_importer(doctype_name, import_file, update=True)

    rows = importer.get_preview_from_template().get("data", [{}])
    first_row = rows[0]

    # A comma inside a quoted field must survive semicolon sniffing intact.
    self.assertEqual(first_row[7], "child description with ,comma and")
    # 14 data columns plus the leading ID column.
    self.assertEqual(len(first_row), 15)

def test_data_validation_semicolon_failure(self):
    """Delimiter detection must fail when ';' is excluded from the options.

    The importer built by ``get_importer_semicolon`` restricts the delimiter
    candidates to a comma only, so sniffing the semicolon-separated sample
    file fails and parsing falls back to the default comma dialect, which
    detects far fewer columns than the real 15 (14 data columns + 1 ID).
    """
    import_file = get_import_file("sample_import_file_semicolon")

    data_import = self.get_importer_semicolon(doctype_name, import_file)
    doc = data_import.get_preview_from_template().get("data", [{}])
    # Use a strict assertLess: the original assertLessEqual(..., 15) also
    # passed when parsing unexpectedly succeeded with exactly 15 columns,
    # making the failure test vacuous.
    self.assertLess(len(doc[0]), 15)

def test_data_import_preview(self):
import_file = get_import_file("sample_import_file")
data_import = self.get_importer(doctype_name, import_file)
Expand Down Expand Up @@ -94,6 +112,7 @@ def test_data_import_without_mandatory_values(self):
"Title is required",
)

#
def test_data_import_update(self):
existing_doc = frappe.get_doc(
doctype=doctype_name,
Expand Down Expand Up @@ -138,6 +157,18 @@ def get_importer(self, doctype, import_file, update=False):

return data_import

def get_importer_semicolon(self, doctype, import_file, update=False):
    """Create a Data Import doc whose delimiter options deliberately omit ';',
    forcing delimiter sniffing to fail on a semicolon-separated file."""
    importer = frappe.new_doc("Data Import")
    importer.import_type = "Update Existing Records" if update else "Insert New Records"
    importer.reference_doctype = doctype
    importer.import_file = import_file.file_url
    # Overwrite the default delimiter options with a comma only, so parsing
    # a ';'-delimited file cannot succeed.
    importer.delimiter_options = ","
    importer.insert()
    frappe.db.commit()  # nosemgrep

    return importer


def create_doctype_if_not_exists(doctype_name, force=False):
if force:
Expand Down
16 changes: 14 additions & 2 deletions frappe/core/doctype/data_import_log/data_import_log.json
Expand Up @@ -60,7 +60,7 @@
"in_create": 1,
"index_web_pages_for_search": 1,
"links": [],
"modified": "2024-03-23 16:02:17.334396",
"modified": "2024-04-16 17:32:34.363704",
"modified_by": "Administrator",
"module": "Core",
"name": "Data Import Log",
Expand All @@ -77,9 +77,21 @@
"role": "System Manager",
"share": 1,
"write": 1
},
{
"create": 1,
"delete": 1,
"email": 1,
"export": 1,
"print": 1,
"read": 1,
"report": 1,
"role": "Support Role",
"share": 1,
"write": 1
}
],
"sort_field": "creation",
"sort_order": "DESC",
"states": []
}
}
22 changes: 14 additions & 8 deletions frappe/core/doctype/file/file.py
Expand Up @@ -515,10 +515,11 @@ def unzip(self) -> list["File"]:
def exists_on_disk(self):
return os.path.exists(self.get_full_path())

def get_content(self) -> bytes:
def get_content(self, encodings=None) -> bytes | str:
if self.is_folder:
frappe.throw(_("Cannot get file contents of a Folder"))

# if doc was just created, content field is already populated, return it as-is
if self.get("content"):
self._content = self.content
if self.decode:
Expand All @@ -531,15 +532,20 @@ def get_content(self) -> bytes:
self.validate_file_url()
file_path = self.get_full_path()

# read the file
if encodings is None:
encodings = ["utf-8-sig", "utf-8", "windows-1250", "windows-1252"]
# read file with proper encoding
with open(file_path, mode="rb") as f:
self._content = f.read()
try:
# for plain text files
self._content = self._content.decode()
except UnicodeDecodeError:
# for .png, .jpg, etc
pass

for encoding in encodings:
try:
# for plain text files
self._content = self._content.decode(encoding)
break
except UnicodeDecodeError:
# for .png, .jpg, etc
continue

return self._content

Expand Down
24 changes: 21 additions & 3 deletions frappe/core/doctype/file/test_file.py
@@ -1,7 +1,6 @@
# Copyright (c) 2022, Frappe Technologies Pvt. Ltd. and Contributors
# License: MIT. See LICENSE
import base64
import json
import os
import shutil
import tempfile
Expand Down Expand Up @@ -111,7 +110,7 @@ class TestBase64File(FrappeTestCase):
def setUp(self):
self.attached_to_doctype, self.attached_to_docname = make_test_doc()
self.test_content = base64.b64encode(test_content1.encode("utf-8"))
_file: "File" = frappe.get_doc(
_file: frappe.Document = frappe.get_doc(
{
"doctype": "File",
"file_name": "test_base64.txt",
Expand All @@ -125,7 +124,7 @@ def setUp(self):
self.saved_file_url = _file.file_url

def test_saved_content(self):
_file = frappe.get_doc("File", {"file_url": self.saved_file_url})
_file: frappe.Document = frappe.get_doc("File", {"file_url": self.saved_file_url})
content = _file.get_content()
self.assertEqual(content, test_content1)

Expand Down Expand Up @@ -255,6 +254,25 @@ def test_attachment_limit(self):
limit_property.delete()
frappe.clear_cache(doctype="ToDo")

def test_utf8_bom_content_decoding(self):
    """``File.get_content`` honours its ``encodings`` argument for a file
    saved with a UTF-8 byte-order mark."""
    bom_payload = test_content1.encode("utf-8-sig")
    file_doc: frappe.Document = frappe.get_doc(
        {
            "doctype": "File",
            "file_name": "utf8bom.txt",
            "attached_to_doctype": self.attached_to_doctype1,
            "attached_to_name": self.attached_to_docname1,
            "content": bom_payload,
            "decode": False,
        }
    )
    file_doc.save()
    stored = frappe.get_doc("File", file_doc.name)
    # Decoding with plain utf-8 leaves the BOM in place as U+FEFF ...
    with_bom = stored.get_content(encodings=["utf-8"])
    self.assertEqual(with_bom[0], "\ufeff")
    # ... while utf-8-sig strips it, recovering the original text.
    without_bom = stored.get_content(encodings=["utf-8-sig", "utf-8"])
    self.assertEqual(without_bom, test_content1)


class TestFile(FrappeTestCase):
def setUp(self):
Expand Down
25 changes: 22 additions & 3 deletions frappe/utils/csvutils.py
Expand Up @@ -2,6 +2,7 @@
# License: MIT. See LICENSE
import csv
import json
from csv import Sniffer
from io import StringIO

import requests
Expand Down Expand Up @@ -39,7 +40,7 @@ def read_csv_content_from_attached_file(doc):
def read_csv_content(fcontent):
if not isinstance(fcontent, str):
decoded = False
for encoding in ["utf-8", "windows-1250", "windows-1252"]:
for encoding in ["utf-8-sig", "utf-8", "windows-1250", "windows-1252"]:
try:
fcontent = str(fcontent, encoding)
decoded = True
Expand All @@ -49,15 +50,33 @@ def read_csv_content(fcontent):

if not decoded:
frappe.msgprint(
_("Unknown file encoding. Tried utf-8, windows-1250, windows-1252."), raise_exception=True
_("Unknown file encoding. Tried utf-8-sig, utf-8, windows-1250, windows-1252."),
raise_exception=True,
)

fcontent = fcontent.encode("utf-8")
content = [frappe.safe_decode(line) for line in fcontent.splitlines(True)]

sniffer = Sniffer()
# Don't need to use whole csv, if more than 20 rows, use just first 20
sample_content = content[:20] if len(content) > 20 else content
# only testing for most common delimiter types, this later can be extended
# init default dialect, to avoid lint errors
dialect = csv.get_dialect("excel")
try:
# csv by default uses excel dialect, which is not always correct
dialect = sniffer.sniff(sample="\n".join(sample_content), delimiters=frappe.flags.delimiter_options)
except csv.Error:
# if sniff fails, show alert on user interface. Fall back to use default dialect (excel)
frappe.msgprint(
_("Delimiter detection failed. Try enable Custom delimiters and adjust Delimiter options."),
indicator="orange",
alert=True,
)

try:
rows = []
for row in csv.reader(content):
for row in csv.reader(content, dialect=dialect):
r = []
for val in row:
# decode everything
Expand Down

0 comments on commit 45eabd3

Please sign in to comment.