Skip to content

Commit

Permalink
feat(Data Import): custom delimiters
Browse files Browse the repository at this point in the history
  • Loading branch information
Mate Laszlo Valko committed Apr 30, 2024
1 parent 71dfffa commit 45eabd3
Show file tree
Hide file tree
Showing 9 changed files with 138 additions and 17 deletions.
18 changes: 17 additions & 1 deletion frappe/core/doctype/data_import/data_import.json
Expand Up @@ -16,6 +16,8 @@
"google_sheets_url",
"refresh_google_sheet",
"column_break_5",
"custom_delimiters",
"delimiter_options",
"status",
"submit_after_import",
"mute_emails",
Expand Down Expand Up @@ -167,11 +169,25 @@
"hidden": 1,
"label": "Payload Count",
"read_only": 1
},
{
"default": ",;\\t|",
"depends_on": "custom_delimiters",
"description": "If your CSV uses a different delimiter, add that character here, ensuring no spaces or additional characters are included.",
"fieldname": "delimiter_options",
"fieldtype": "Data",
"label": "Delimiter options"
},
{
"default": "0",
"fieldname": "custom_delimiters",
"fieldtype": "Check",
"label": "Custom delimiters"
}
],
"hide_toolbar": 1,
"links": [],
"modified": "2024-03-23 16:02:16.953820",
"modified": "2024-04-27 20:42:35.843158",
"modified_by": "Administrator",
"module": "Core",
"name": "Data Import",
Expand Down
8 changes: 8 additions & 0 deletions frappe/core/doctype/data_import/data_import.py
Expand Up @@ -27,6 +27,8 @@ class DataImport(Document):
if TYPE_CHECKING:
from frappe.types import DF

custom_delimiters: DF.Check
delimiter_options: DF.Data | None
google_sheets_url: DF.Data | None
import_file: DF.Attach | None
import_type: DF.Literal["", "Insert New Records", "Update Existing Records"]
Expand All @@ -50,11 +52,16 @@ def validate(self):
self.template_options = ""
self.template_warnings = ""

self.set_delimiters_flag()
self.validate_doctype()
self.validate_import_file()
self.validate_google_sheets_url()
self.set_payload_count()

def set_delimiters_flag(self):
    """Publish this document's delimiter options via ``frappe.flags`` so the
    CSV reader (``frappe.utils.csvutils.read_csv_content``) can pick them up
    when sniffing the file's delimiter."""
    if not self.import_file:
        return
    frappe.flags.delimiter_options = self.delimiter_options

def validate_doctype(self):
    """Abort the import when the target doctype is on the blocklist."""
    is_blocked = self.reference_doctype in BLOCKED_DOCTYPES
    if is_blocked:
        frappe.throw(_("Importing {0} is not allowed.").format(self.reference_doctype))
Expand All @@ -79,6 +86,7 @@ def set_payload_count(self):
def get_preview_from_template(self, import_file=None, google_sheets_url=None):
if import_file:
self.import_file = import_file
self.set_delimiters_flag()

if google_sheets_url:
self.google_sheets_url = google_sheets_url
Expand Down
@@ -0,0 +1,5 @@
Title ;Description ;Number ;another_number ;ID (Table Field 1) ;Child Title (Table Field 1) ;Child Description (Table Field 1) ;Child 2 Title (Table Field 2) ;Child 2 Date (Table Field 2) ;Child 2 Number (Table Field 2) ;Child Title (Table Field 1 Again) ;Child Date (Table Field 1 Again) ;Child Number (Table Field 1 Again) ;table_field_1_again.child_another_number
Test 5 ;test description ;1 ;2 ;"" ; ;"child description with ,comma and" ;child title ;14-08-2019 ;4 ;child title again ;22-09-2020 ;5 ; 7
; ; ; ; ;child title 2 ;child description 2 ;title child ;30-10-2019 ;5 ; ;22-09-2021 ; ;
;test description 2 ;1 ;2 ; ;child mandatory title ; ;title child man ; ; ;child mandatory again ; ; ;
Test 4 ;test description 3 ;4 ;5 ;"" ;child title asdf ;child description asdf ;child title asdf adsf ;15-08-2019 ;6 ;child title again asdf ;22-09-2022 ;9 ; 71
6 changes: 6 additions & 0 deletions frappe/core/doctype/data_import/importer.py
Expand Up @@ -1012,7 +1012,13 @@ def validate_values(self):
)
elif self.df.fieldtype in ("Date", "Time", "Datetime"):
# guess date/time format
# TODO: let the user define the date format explicitly in the Data Import UI.
# For example, a date column value in %d-%m-%y format such as 23-04-24 is ambiguous:
# it can equally be parsed as %y-%m-%d, since both 23 and 24 are valid as a year or a day.
# No guesser can resolve this automatically — it depends entirely on the user's input,
# so an explicitly supplied format is the only reliable fix.
self.date_format = self.guess_date_format_for_column()

if not self.date_format:
if self.df.fieldtype == "Time":
self.date_format = "%H:%M:%S"
Expand Down
31 changes: 31 additions & 0 deletions frappe/core/doctype/data_import/test_importer.py
Expand Up @@ -50,6 +50,24 @@ def test_data_import_from_file(self):
self.assertEqual(doc3.another_number, 5)
self.assertEqual(format_duration(doc3.duration), "5d 5h 45m")

def test_data_validation_semicolon_success(self):
    """A semicolon-delimited file is fully parsed when ';' is among the
    default delimiter candidates."""
    import_file = get_import_file("sample_import_file_semicolon")
    importer = self.get_importer(doctype_name, import_file, update=True)

    rows = importer.get_preview_from_template().get("data", [{}])
    first_row = rows[0]

    # A comma inside a quoted field must survive semicolon sniffing intact.
    self.assertEqual(first_row[7], "child description with ,comma and")
    # 14 data columns plus the leading ID column.
    self.assertEqual(len(first_row), 15)

def test_data_validation_semicolon_failure(self):
    """Delimiter detection must fail when ';' is excluded from the options.

    The importer built by ``get_importer_semicolon`` restricts the delimiter
    candidates to a comma only, so sniffing the semicolon-separated sample
    file fails and parsing falls back to the default comma dialect, which
    detects far fewer columns than the real 15 (14 data columns + 1 ID).
    """
    import_file = get_import_file("sample_import_file_semicolon")

    data_import = self.get_importer_semicolon(doctype_name, import_file)
    doc = data_import.get_preview_from_template().get("data", [{}])
    # Use a strict assertLess: the original assertLessEqual(..., 15) also
    # passed when parsing unexpectedly succeeded with exactly 15 columns,
    # making the failure test vacuous.
    self.assertLess(len(doc[0]), 15)

def test_data_import_preview(self):
import_file = get_import_file("sample_import_file")
data_import = self.get_importer(doctype_name, import_file)
Expand Down Expand Up @@ -94,6 +112,7 @@ def test_data_import_without_mandatory_values(self):
"Title is required",
)

#
def test_data_import_update(self):
existing_doc = frappe.get_doc(
doctype=doctype_name,
Expand Down Expand Up @@ -138,6 +157,18 @@ def get_importer(self, doctype, import_file, update=False):

return data_import

def get_importer_semicolon(self, doctype, import_file, update=False):
    """Create a Data Import doc whose delimiter options deliberately omit ';',
    forcing delimiter sniffing to fail on a semicolon-separated file."""
    importer = frappe.new_doc("Data Import")
    importer.import_type = "Update Existing Records" if update else "Insert New Records"
    importer.reference_doctype = doctype
    importer.import_file = import_file.file_url
    # Overwrite the default delimiter options with a comma only, so parsing
    # a ';'-delimited file cannot succeed.
    importer.delimiter_options = ","
    importer.insert()
    frappe.db.commit()  # nosemgrep

    return importer


def create_doctype_if_not_exists(doctype_name, force=False):
if force:
Expand Down
16 changes: 14 additions & 2 deletions frappe/core/doctype/data_import_log/data_import_log.json
Expand Up @@ -60,7 +60,7 @@
"in_create": 1,
"index_web_pages_for_search": 1,
"links": [],
"modified": "2024-03-23 16:02:17.334396",
"modified": "2024-04-16 17:32:34.363704",
"modified_by": "Administrator",
"module": "Core",
"name": "Data Import Log",
Expand All @@ -77,9 +77,21 @@
"role": "System Manager",
"share": 1,
"write": 1
},
{
"create": 1,
"delete": 1,
"email": 1,
"export": 1,
"print": 1,
"read": 1,
"report": 1,
"role": "Support Role",
"share": 1,
"write": 1
}
],
"sort_field": "creation",
"sort_order": "DESC",
"states": []
}
}
22 changes: 14 additions & 8 deletions frappe/core/doctype/file/file.py
Expand Up @@ -515,10 +515,11 @@ def unzip(self) -> list["File"]:
def exists_on_disk(self):
return os.path.exists(self.get_full_path())

def get_content(self) -> bytes:
def get_content(self, encodings=None) -> bytes | str:
if self.is_folder:
frappe.throw(_("Cannot get file contents of a Folder"))

# if doc was just created, content field is already populated, return it as-is
if self.get("content"):
self._content = self.content
if self.decode:
Expand All @@ -531,15 +532,20 @@ def get_content(self) -> bytes:
self.validate_file_url()
file_path = self.get_full_path()

# read the file
if encodings is None:
encodings = ["utf-8-sig", "utf-8", "windows-1250", "windows-1252"]
# read file with proper encoding
with open(file_path, mode="rb") as f:
self._content = f.read()
try:
# for plain text files
self._content = self._content.decode()
except UnicodeDecodeError:
# for .png, .jpg, etc
pass

for encoding in encodings:
try:
# for plain text files
self._content = self._content.decode(encoding)
break
except UnicodeDecodeError:
# for .png, .jpg, etc
continue

return self._content

Expand Down
24 changes: 21 additions & 3 deletions frappe/core/doctype/file/test_file.py
@@ -1,7 +1,6 @@
# Copyright (c) 2022, Frappe Technologies Pvt. Ltd. and Contributors
# License: MIT. See LICENSE
import base64
import json
import os
import shutil
import tempfile
Expand Down Expand Up @@ -111,7 +110,7 @@ class TestBase64File(FrappeTestCase):
def setUp(self):
self.attached_to_doctype, self.attached_to_docname = make_test_doc()
self.test_content = base64.b64encode(test_content1.encode("utf-8"))
_file: "File" = frappe.get_doc(
_file: frappe.Document = frappe.get_doc(
{
"doctype": "File",
"file_name": "test_base64.txt",
Expand All @@ -125,7 +124,7 @@ def setUp(self):
self.saved_file_url = _file.file_url

def test_saved_content(self):
_file = frappe.get_doc("File", {"file_url": self.saved_file_url})
_file: frappe.Document = frappe.get_doc("File", {"file_url": self.saved_file_url})
content = _file.get_content()
self.assertEqual(content, test_content1)

Expand Down Expand Up @@ -255,6 +254,25 @@ def test_attachment_limit(self):
limit_property.delete()
frappe.clear_cache(doctype="ToDo")

def test_utf8_bom_content_decoding(self):
    """``File.get_content`` honours its ``encodings`` argument for a file
    saved with a UTF-8 byte-order mark."""
    bom_payload = test_content1.encode("utf-8-sig")
    file_doc: frappe.Document = frappe.get_doc(
        {
            "doctype": "File",
            "file_name": "utf8bom.txt",
            "attached_to_doctype": self.attached_to_doctype1,
            "attached_to_name": self.attached_to_docname1,
            "content": bom_payload,
            "decode": False,
        }
    )
    file_doc.save()
    stored = frappe.get_doc("File", file_doc.name)
    # Decoding with plain utf-8 leaves the BOM in place as U+FEFF ...
    with_bom = stored.get_content(encodings=["utf-8"])
    self.assertEqual(with_bom[0], "\ufeff")
    # ... while utf-8-sig strips it, recovering the original text.
    without_bom = stored.get_content(encodings=["utf-8-sig", "utf-8"])
    self.assertEqual(without_bom, test_content1)


class TestFile(FrappeTestCase):
def setUp(self):
Expand Down
25 changes: 22 additions & 3 deletions frappe/utils/csvutils.py
Expand Up @@ -2,6 +2,7 @@
# License: MIT. See LICENSE
import csv
import json
from csv import Sniffer
from io import StringIO

import requests
Expand Down Expand Up @@ -39,7 +40,7 @@ def read_csv_content_from_attached_file(doc):
def read_csv_content(fcontent):
if not isinstance(fcontent, str):
decoded = False
for encoding in ["utf-8", "windows-1250", "windows-1252"]:
for encoding in ["utf-8-sig", "utf-8", "windows-1250", "windows-1252"]:
try:
fcontent = str(fcontent, encoding)
decoded = True
Expand All @@ -49,15 +50,33 @@ def read_csv_content(fcontent):

if not decoded:
frappe.msgprint(
_("Unknown file encoding. Tried utf-8, windows-1250, windows-1252."), raise_exception=True
_("Unknown file encoding. Tried utf-8-sig, utf-8, windows-1250, windows-1252."),
raise_exception=True,
)

fcontent = fcontent.encode("utf-8")
content = [frappe.safe_decode(line) for line in fcontent.splitlines(True)]

sniffer = Sniffer()
# Don't need to use whole csv, if more than 20 rows, use just first 20
sample_content = content[:20] if len(content) > 20 else content
# only testing for most common delimiter types, this later can be extended
# init default dialect, to avoid lint errors
dialect = csv.get_dialect("excel")
try:
# csv by default uses excel dialect, which is not always correct
dialect = sniffer.sniff(sample="\n".join(sample_content), delimiters=frappe.flags.delimiter_options)
except csv.Error:
# if sniff fails, show alert on user interface. Fall back to use default dialect (excel)
frappe.msgprint(
_("Delimiter detection failed. Try enable Custom delimiters and adjust Delimiter options."),
indicator="orange",
alert=True,
)

try:
rows = []
for row in csv.reader(content):
for row in csv.reader(content, dialect=dialect):
r = []
for val in row:
# decode everything
Expand Down

0 comments on commit 45eabd3

Please sign in to comment.