Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(Data Import): custom csv delimiters, UTF-8 BOM handling #26183

Merged
merged 8 commits into from May 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
20 changes: 18 additions & 2 deletions frappe/core/doctype/data_import/data_import.json
Expand Up @@ -16,6 +16,8 @@
"google_sheets_url",
"refresh_google_sheet",
"column_break_5",
"custom_delimiters",
"delimiter_options",
"status",
"submit_after_import",
"mute_emails",
Expand Down Expand Up @@ -167,11 +169,25 @@
"hidden": 1,
"label": "Payload Count",
"read_only": 1
},
{
"default": ",;\\t|",
"depends_on": "custom_delimiters",
"description": "If your CSV uses a different delimiter, add that character here, ensuring no spaces or additional characters are included.",
"fieldname": "delimiter_options",
"fieldtype": "Data",
"label": "Delimiter Options"
},
{
"default": "0",
"fieldname": "custom_delimiters",
"fieldtype": "Check",
"label": "Custom Delimiters"
}
],
"hide_toolbar": 1,
"links": [],
"modified": "2024-03-23 16:02:16.953820",
"modified": "2024-04-27 20:42:35.843158",
"modified_by": "Administrator",
"module": "Core",
"name": "Data Import",
Expand All @@ -195,4 +211,4 @@
"sort_order": "DESC",
"states": [],
"track_changes": 1
}
}
8 changes: 8 additions & 0 deletions frappe/core/doctype/data_import/data_import.py
Expand Up @@ -27,6 +27,8 @@ class DataImport(Document):
if TYPE_CHECKING:
from frappe.types import DF

custom_delimiters: DF.Check
delimiter_options: DF.Data | None
google_sheets_url: DF.Data | None
import_file: DF.Attach | None
import_type: DF.Literal["", "Insert New Records", "Update Existing Records"]
Expand All @@ -50,11 +52,16 @@ def validate(self):
self.template_options = ""
self.template_warnings = ""

self.set_delimiters_flag()
self.validate_doctype()
self.validate_import_file()
self.validate_google_sheets_url()
self.set_payload_count()

def set_delimiters_flag(self):
	"""Expose the configured CSV delimiter options to the reader via a global flag.

	Falls back to a plain comma when no custom delimiter options are set.
	Only relevant when an import file is attached; otherwise a no-op.
	"""
	if not self.import_file:
		return
	frappe.flags.delimiter_options = self.delimiter_options or ","

def validate_doctype(self):
	"""Abort the import when the target doctype is on the blocklist."""
	if self.reference_doctype not in BLOCKED_DOCTYPES:
		return
	frappe.throw(_("Importing {0} is not allowed.").format(self.reference_doctype))
Expand All @@ -79,6 +86,7 @@ def set_payload_count(self):
def get_preview_from_template(self, import_file=None, google_sheets_url=None):
if import_file:
self.import_file = import_file
self.set_delimiters_flag()

if google_sheets_url:
self.google_sheets_url = google_sheets_url
Expand Down
@@ -0,0 +1,5 @@
Title ;Description ;Number ;another_number ;ID (Table Field 1) ;Child Title (Table Field 1) ;Child Description (Table Field 1) ;Child 2 Title (Table Field 2) ;Child 2 Date (Table Field 2) ;Child 2 Number (Table Field 2) ;Child Title (Table Field 1 Again) ;Child Date (Table Field 1 Again) ;Child Number (Table Field 1 Again) ;table_field_1_again.child_another_number
Test 5 ;test description ;1 ;2 ;"" ; ;"child description with ,comma and" ;child title ;14-08-2019 ;4 ;child title again ;22-09-2020 ;5 ; 7
; ; ; ; ;child title 2 ;child description 2 ;title child ;30-10-2019 ;5 ; ;22-09-2021 ; ;
;test description 2 ;1 ;2 ; ;child mandatory title ; ;title child man ; ; ;child mandatory again ; ; ;
Test 4 ;test description 3 ;4 ;5 ;"" ;child title asdf ;child description asdf ;child title asdf adsf ;15-08-2019 ;6 ;child title again asdf ;22-09-2022 ;9 ; 71
6 changes: 6 additions & 0 deletions frappe/core/doctype/data_import/importer.py
Expand Up @@ -1012,7 +1012,13 @@ def validate_values(self):
)
elif self.df.fieldtype in ("Date", "Time", "Datetime"):
# guess date/time format
# TODO: add possibility for user, to define the date format explicitly in the Data Import UI
# for example, if date column in file is in %d-%m-%y format -> 23-04-24.
# The date guesser might fail, as, this can be also parsed as %y-%m-%d, as both 23 and 24 are valid for year & for day
# This is an issue that cannot be handled automatically, no matter how we try, as it completely depends on the user's input.
# Allowing the user to define an explicit date format would resolve this ambiguity reliably.
self.date_format = self.guess_date_format_for_column()

if not self.date_format:
if self.df.fieldtype == "Time":
self.date_format = "%H:%M:%S"
Expand Down
31 changes: 31 additions & 0 deletions frappe/core/doctype/data_import/test_importer.py
Expand Up @@ -50,6 +50,25 @@ def test_data_import_from_file(self):
self.assertEqual(doc3.another_number, 5)
self.assertEqual(format_duration(doc3.duration), "5d 5h 45m")

def test_data_validation_semicolon_success(self):
	"""Preview of a semicolon-delimited file parses into the full column set."""
	import_file = get_import_file("sample_import_file_semicolon")
	data_import = self.get_importer(doctype_name, import_file, update=True)

	preview_rows = data_import.get_preview_from_template().get("data", [{}])
	first_row = preview_rows[0]

	# a quoted field containing a comma must survive semicolon parsing intact
	self.assertEqual(first_row[7], "child description with ,comma and")
	# 14 data columns plus the ID column
	self.assertEqual(len(first_row), 15)

def test_data_validation_semicolon_failure(self):
	"""Delimiter sniffing falls back to comma when ';' is excluded from the options.

	The importer is configured with ',' as the only allowed delimiter, so a
	semicolon-separated file cannot be sniffed correctly and each line
	collapses into fewer columns than the real 15 (14 data columns + ID).
	"""
	import_file = get_import_file("sample_import_file_semicolon")

	data_import = self.get_importer_semicolon(doctype_name, import_file)
	doc = data_import.get_preview_from_template().get("data", [{}])
	# if semicolon delimiter detection fails, and falls back to comma,
	# column number will be less than 15 -> 2 (+1 id).
	# NOTE: a strict assertLess is required here — assertLessEqual would also
	# pass when parsing succeeds (exactly 15 columns), making the test vacuous.
	self.assertLess(len(doc[0]), 15)

def test_data_import_preview(self):
import_file = get_import_file("sample_import_file")
data_import = self.get_importer(doctype_name, import_file)
Expand Down Expand Up @@ -138,6 +157,18 @@ def get_importer(self, doctype, import_file, update=False):

return data_import

def get_importer_semicolon(self, doctype, import_file, update=False):
	"""Create and insert a Data Import doc whose delimiter options exclude ';'.

	Restricting the options to ',' forces delimiter sniffing to fail for
	semicolon-separated files, which the failure test relies on.
	"""
	import_type = "Update Existing Records" if update else "Insert New Records"

	data_import = frappe.new_doc("Data Import")
	data_import.import_type = import_type
	data_import.reference_doctype = doctype
	data_import.import_file = import_file.file_url
	# deliberately overwrite default delimiter options here, causing to fail when parsing `;`
	data_import.delimiter_options = ","
	data_import.insert()
	frappe.db.commit()  # nosemgrep

	return data_import


def create_doctype_if_not_exists(doctype_name, force=False):
if force:
Expand Down
2 changes: 1 addition & 1 deletion frappe/core/doctype/data_import_log/data_import_log.json
Expand Up @@ -82,4 +82,4 @@
"sort_field": "creation",
"sort_order": "DESC",
"states": []
}
}
23 changes: 15 additions & 8 deletions frappe/core/doctype/file/file.py
Expand Up @@ -31,6 +31,7 @@
exclude_from_linked_with = True
ImageFile.LOAD_TRUNCATED_IMAGES = True
URL_PREFIXES = ("http://", "https://")
FILE_ENCODING_OPTIONS = ("utf-8-sig", "utf-8", "windows-1250", "windows-1252")


class File(Document):
Expand Down Expand Up @@ -515,10 +516,11 @@ def unzip(self) -> list["File"]:
def exists_on_disk(self):
return os.path.exists(self.get_full_path())

def get_content(self) -> bytes:
def get_content(self, encodings=None) -> bytes | str:
if self.is_folder:
frappe.throw(_("Cannot get file contents of a Folder"))

# if doc was just created, content field is already populated, return it as-is
if self.get("content"):
self._content = self.content
if self.decode:
Expand All @@ -531,15 +533,20 @@ def get_content(self) -> bytes:
self.validate_file_url()
file_path = self.get_full_path()

# read the file
if encodings is None:
encodings = FILE_ENCODING_OPTIONS
with open(file_path, mode="rb") as f:
self._content = f.read()
try:
# for plain text files
self._content = self._content.decode()
except UnicodeDecodeError:
# for .png, .jpg, etc
pass
# looping will not result in slowdown, as the content is usually utf-8 or utf-8-sig
# encoded so the first iteration will be enough most of the time
for encoding in encodings:
try:
# read file with proper encoding
self._content = self._content.decode(encoding)
break
except UnicodeDecodeError:
# for .png, .jpg, etc
continue

return self._content

Expand Down
24 changes: 21 additions & 3 deletions frappe/core/doctype/file/test_file.py
@@ -1,7 +1,6 @@
# Copyright (c) 2022, Frappe Technologies Pvt. Ltd. and Contributors
# License: MIT. See LICENSE
import base64
import json
import os
import shutil
import tempfile
Expand Down Expand Up @@ -111,7 +110,7 @@ class TestBase64File(FrappeTestCase):
def setUp(self):
self.attached_to_doctype, self.attached_to_docname = make_test_doc()
self.test_content = base64.b64encode(test_content1.encode("utf-8"))
_file: "File" = frappe.get_doc(
_file: frappe.Document = frappe.get_doc(
{
"doctype": "File",
"file_name": "test_base64.txt",
Expand All @@ -125,7 +124,7 @@ def setUp(self):
self.saved_file_url = _file.file_url

def test_saved_content(self):
_file = frappe.get_doc("File", {"file_url": self.saved_file_url})
_file: frappe.Document = frappe.get_doc("File", {"file_url": self.saved_file_url})
content = _file.get_content()
self.assertEqual(content, test_content1)

Expand Down Expand Up @@ -255,6 +254,25 @@ def test_attachment_limit(self):
limit_property.delete()
frappe.clear_cache(doctype="ToDo")

def test_utf8_bom_content_decoding(self):
	"""get_content() honours the ``encodings`` argument for BOM-prefixed files."""
	bom_bytes = test_content1.encode("utf-8-sig")
	file_doc: frappe.Document = frappe.get_doc(
		{
			"doctype": "File",
			"file_name": "utf8bom.txt",
			"attached_to_doctype": self.attached_to_doctype1,
			"attached_to_name": self.attached_to_docname1,
			"content": bom_bytes,
			"decode": False,
		}
	)
	file_doc.save()

	reloaded = frappe.get_doc("File", file_doc.name)
	# plain utf-8 keeps the BOM as a leading U+FEFF character
	with_bom = reloaded.get_content(encodings=["utf-8"])
	self.assertEqual(with_bom[0], "\ufeff")
	# utf-8-sig strips the BOM, restoring the original text
	properly_decoded = reloaded.get_content(encodings=["utf-8-sig", "utf-8"])
	self.assertEqual(properly_decoded, test_content1)


class TestFile(FrappeTestCase):
def setUp(self):
Expand Down
28 changes: 25 additions & 3 deletions frappe/utils/csvutils.py
Expand Up @@ -2,12 +2,14 @@
# License: MIT. See LICENSE
import csv
import json
from csv import Sniffer
from io import StringIO

import requests

import frappe
from frappe import _, msgprint
from frappe.core.doctype.file.file import FILE_ENCODING_OPTIONS
from frappe.utils import cint, comma_or, cstr, flt


Expand Down Expand Up @@ -39,7 +41,7 @@ def read_csv_content_from_attached_file(doc):
def read_csv_content(fcontent):
if not isinstance(fcontent, str):
decoded = False
for encoding in ["utf-8", "windows-1250", "windows-1252"]:
for encoding in FILE_ENCODING_OPTIONS:
try:
fcontent = str(fcontent, encoding)
decoded = True
Expand All @@ -49,15 +51,35 @@ def read_csv_content(fcontent):

if not decoded:
frappe.msgprint(
_("Unknown file encoding. Tried utf-8, windows-1250, windows-1252."), raise_exception=True
_("Unknown file encoding. Tried to use: {0}").format(", ".join(FILE_ENCODING_OPTIONS)),
raise_exception=True,
)

fcontent = fcontent.encode("utf-8")
content = [frappe.safe_decode(line) for line in fcontent.splitlines(True)]

sniffer = Sniffer()
# Don't need to use whole csv, if more than 20 rows, use just first 20
sample_content = content[:20] if len(content) > 20 else content
# only testing for most common delimiter types, this later can be extended
# init default dialect, to avoid lint errors
dialect = csv.get_dialect("excel")
try:
# csv by default uses excel dialect, which is not always correct
dialect = sniffer.sniff(sample="\n".join(sample_content), delimiters=frappe.flags.delimiter_options)
except csv.Error:
# if sniff fails, show alert on user interface. Fall back to use default dialect (excel)
frappe.msgprint(
_(
"Delimiter detection failed. Try to enable custom delimiters and adjust the delimiter options as per your data."
),
indicator="orange",
alert=True,
)

try:
rows = []
for row in csv.reader(content):
for row in csv.reader(content, dialect=dialect):
r = []
for val in row:
# decode everything
Expand Down