fix(excel2xml): prevent writing empty text-prop, make text-prop valid…

…ation less restrictive (DEV-1440) #243
dasch-swiss · Oct 25, 2022 · ae777e4 · ae777e4
1 parent 9d1383c
commit ae777e4
Show file tree

Hide file tree

Showing 13 changed files with 108 additions and 33 deletions.
diff --git a/docs/dsp-tools-excel2xml.md b/docs/dsp-tools-excel2xml.md
@@ -229,6 +229,23 @@ else:
     excel2xml.make_boolean_prop(":hasBoolean", False)
 ```
 
+#### Supported text values 
+DSP's only restriction on text-properties is that the string must be longer than 0. It is, for example, possible to 
+upload the following property:
+```xml
+<text-prop name=":hasText">
+    <text encoding="utf8"> </text>
+    <text encoding="utf8">-</text>
+</text-prop>
+```
+
+`excel2xml` allows to create such a property, but text values that don't meet the requirements of 
+[`excel2xml.check_notna()`](#check-if-a-cell-contains-a-usable-value) will trigger a warning, for example:
+```python
+excel2xml.make_text_prop(":hasText", " ")   # OK, but triggers a warning
+excel2xml.make_text_prop(":hasText", "-")   # OK, but triggers a warning
+```
+
 
 ### 8. Append the resource to root
 At the end of the for-loop, it is important not to forget to append the finished resource to the root. 
@@ -245,11 +262,28 @@ usable if it is
 
  - a number (integer or float, but not numpy.nan)
  - a boolean
- - a string with at least one Unicode letter (matching the regex `\p{L}`), underscore, ?, !, or number, but not "None", 
-   "<NA>", "N/A", or "-"
+ - a string with at least one Unicode letter (matching the regex ``\\p{L}``) or number, or at least one _, !, or ?
+   (The strings "None", "<NA>", "N/A", and "-" are considered invalid.)
  - a PropertyElement whose "value" fulfills the above criteria
 
-Why not just checking a cell by its boolean value? Like:
+Examples:
+```
+check_notna(0)      == True
+check_notna(False)  == True
+check_notna("œ")    == True
+check_notna("0")    == True
+check_notna("_")    == True
+check_notna("!")    == True
+check_notna("?")    == True
+check_notna(None)   == False
+check_notna("None") == False
+check_notna(<NA>)   == False
+check_notna("<NA>") == False
+check_notna("-")    == False
+check_notna(" ")    == False
+```
+
+But why not just checking a cell by its boolean value? Like:
 ```
 if cell:
     resource.append(make_*_prop(cell))
@@ -264,7 +298,7 @@ might expect:
 | " "          | True                         | False, because an empty string is not usable for a text property |
 | numpy.nan    | True                         | False, because N/A is not a usable value                         |
 | pandas.NA    | TypeError (*)                | False, because N/A is not a usable value                         |
-| "<NA>"       | True                         | False, because this is the string representation of N/A          |
+| "&lt;NA&gt;" | True                         | False, because this is the string representation of N/A          |
 | "-"          | True                         | False, because this is a placeholder in an empty text field      |
 (*) TypeError: boolean value of NA is ambiguous
 

diff --git a/knora/dsplib/utils/shared.py b/knora/dsplib/utils/shared.py
@@ -175,15 +175,30 @@ def check_notna(value: Optional[Any]) -> bool:
     Check a value if it is usable in the context of data archiving. A value is considered usable if it is
      - a number (integer or float, but not np.nan)
      - a boolean
-     - a string with at least one Unicode letter (matching the regex ``\\p{L}``), underscore, !, ?, or number, but not
-       "None", "<NA>", "N/A", or "-"
+     - a string with at least one Unicode letter (matching the regex ``\\p{L}``) or number, or at least one _, !, or ?
+       (The strings "None", "<NA>", "N/A", and "-" are considered invalid.)
      - a PropertyElement whose "value" fulfills the above criteria
 
     Args:
         value: any object encountered when analysing data
 
     Returns:
         True if the value is usable, False if it is N/A or otherwise unusable
+
+    Examples:
+        >>> check_notna(0)      == True
+        >>> check_notna(False)  == True
+        >>> check_notna("œ")    == True
+        >>> check_notna("0")    == True
+        >>> check_notna("_")    == True
+        >>> check_notna("!")    == True
+        >>> check_notna("?")    == True
+        >>> check_notna(None)   == False
+        >>> check_notna("None") == False
+        >>> check_notna(<NA>)   == False
+        >>> check_notna("<NA>") == False
+        >>> check_notna("-")    == False
+        >>> check_notna(" ")    == False
     """
 
     if isinstance(value, PropertyElement):

diff --git a/knora/excel2xml.py b/knora/excel2xml.py
@@ -1189,9 +1189,12 @@ def make_text_prop(
 
     # check value type
     for val in values:
-        if not isinstance(val.value, str) or not check_notna(val.value):
+        if not isinstance(val.value, str) or len(val.value) < 1:
             raise BaseError(f"Failed validation in resource '{calling_resource}', property '{name}': "
                             f"'{val.value}' is not a valid string.")
+        if not check_notna(val.value):
+            warnings.warn(f"Warning for resource '{calling_resource}', property '{name}': "
+                          f"'{val.value}' is probably not a usable string.", stacklevel=2)
 
     # make xml structure of the valid values
     prop_ = etree.Element(
@@ -1743,10 +1746,6 @@ def excel2xml(datafile: str, shortcode: str, default_ontology: str) -> None:
         "text-prop": make_text_prop,
         "uri-prop": make_uri_prop
     }
-    single_value_functions = [
-        make_bitstream_prop,
-        make_boolean_prop
-    ]
     if re.search(r"\.csv$", datafile):
         # "utf_8_sig": an optional BOM at the start of the file will be skipped
         # let the "python" engine detect the separator
@@ -1757,7 +1756,7 @@ def excel2xml(datafile: str, shortcode: str, default_ontology: str) -> None:
         raise BaseError("The argument 'datafile' must have one of the extensions 'csv', 'xls', 'xlsx'")
     # replace NA-like cells by NA
     main_df = main_df.applymap(
-        lambda x: x if pd.notna(x) and regex.search(r"[\w\p{L}]", str(x), flags=regex.U) else pd.NA
+        lambda x: x if pd.notna(x) and regex.search(r"[\p{L}\d_!?\-]", str(x), flags=regex.U) else pd.NA
     )
     # remove empty columns, so that the max_prop_count can be calculated without errors
     main_df.dropna(axis="columns", how="all", inplace=True)
@@ -1847,7 +1846,7 @@ def excel2xml(datafile: str, shortcode: str, default_ontology: str) -> None:
             property_elements: list[PropertyElement] = []
             for i in range(1, max_prop_count + 1):
                 value = row[f"{i}_value"]
-                if check_notna(value):
+                if pd.notna(value):
                     kwargs_propelem = {
                         "value": value,
                         "permissions": str(row.get(f"{i}_permissions"))
@@ -1861,13 +1860,25 @@ def excel2xml(datafile: str, shortcode: str, default_ontology: str) -> None:
                         kwargs_propelem["encoding"] = str(row[f"{i}_encoding"])
 
                     property_elements.append(PropertyElement(**kwargs_propelem))
+                elif check_notna(str(row.get(f"{i}_permissions"))):
+                    raise BaseError(f"Excel row {int(str(index)) + 2} has an entry in column {i}_permissions, but not "
+                                    f"in {i}_value. Please note that cell contents that don't meet the requirements of "
+                                    r"the regex [\p{L}\d_!?\-] are considered inexistent.")
+
+            # validate property_elements
+            if len(property_elements) == 0:
+                raise BaseError(f"At least one value per property is required, but Excel row {int(str(index)) + 2}"
+                                f"doesn't contain any values.")
+            if make_prop_function == make_boolean_prop and len(property_elements) != 1:
+                raise BaseError(f"A <boolean-prop> can only have a single value, but Excel row {int(str(index)) + 2} "
+                                f"contains more than one values.")
 
             # create the property and append it to resource
             kwargs_propfunc: dict[str, Union[str, PropertyElement, list[PropertyElement]]] = {
                 "name": row["prop name"],
                 "calling_resource": resource_id
             }
-            if make_prop_function in single_value_functions and len(property_elements) == 1:
+            if make_prop_function == make_boolean_prop:
                 kwargs_propfunc["value"] = property_elements[0]
             else:
                 kwargs_propfunc["value"] = property_elements

diff --git a/test/unittests/test_excel2xml.py b/test/unittests/test_excel2xml.py
@@ -5,6 +5,7 @@
 from typing import Callable, Sequence, Union, Optional, Any
 
 import numpy as np
+import pytest
 from lxml import etree
 
 from knora import excel2xml
@@ -104,6 +105,8 @@ def run_test(
         xml_returned = method(**kwargs_to_generate_xml)
         xml_returned = etree.tostring(xml_returned, encoding="unicode")
         xml_returned = re.sub(r" xmlns(:.+?)?=\".+?\"", "", xml_returned)  # remove all xml namespace declarations
+        xml_returned = xml_returned.replace("&lt;", "<")
+        xml_returned = xml_returned.replace("&gt;", ">")
         testcase.assertEqual(xml_expected, xml_returned,
                              msg=f"Method {method.__name__} failed with kwargs {kwargs_to_generate_xml}")
 
@@ -348,11 +351,12 @@ def test_make_resptr_prop(self) -> None:
         run_test(self, prop, method, different_values, invalid_values)
 
 
+    @pytest.mark.filterwarnings("ignore")
     def test_make_text_prop(self) -> None:
         prop = "text"
         method = excel2xml.make_text_prop
-        different_values = ["text_1", "text_2", "text_3", "text_4", "text_5"]
-        invalid_values = [True, 10.0, 5]
+        different_values = ["text_1", " ", "!", "?", "-", "_", "None", "<NA>"]
+        invalid_values = [True, 10.0, 5, ""]
         run_test(self, prop, method, different_values, invalid_values)
 
         # test encoding="xml"
@@ -513,6 +517,7 @@ def test_create_json_list_mapping(self) -> None:
         self.assertDictEqual(testlist_mapping_returned, testlist_mapping_expected)
 
 
+    @pytest.mark.filterwarnings("ignore")
     def test_excel2xml(self) -> None:
         # test the valid files, 3 times identical, but in the three formats XLSX, XLS, and CSV
         with open("testdata/excel2xml-expected-output.xml") as f:
@@ -528,17 +533,20 @@ def test_excel2xml(self) -> None:
         # test the invalid files
         invalid_prefix = "testdata/invalid_testdata/excel2xml-testdata-invalid"
         invalid_cases = [
-            (f"{invalid_prefix}-id-propname-both.xlsx",             "Exactly 1 of the 2 columns 'id' and 'prop name' must have an entry"),
-            (f"{invalid_prefix}-id-propname-none.xlsx",             "Exactly 1 of the 2 columns 'id' and 'prop name' must have an entry"),
-            (f"{invalid_prefix}-missing-prop-permissions.xlsx",     "Missing permissions for value .+ of property"),
-            (f"{invalid_prefix}-missing-resource-label.xlsx",       "Missing label for resource"),
-            (f"{invalid_prefix}-missing-resource-permissions.xlsx", "Missing permissions for resource"),
-            (f"{invalid_prefix}-missing-restype.xlsx",              "Missing restype"),
-            (f"{invalid_prefix}-no-bitstream-permissions.xlsx",     "'file permissions' missing"),
-            (f"{invalid_prefix}-nonexisting-proptype.xlsx",         "Invalid prop type"),
+            (f"{invalid_prefix}-boolean-prop-two-values.xlsx",           "A <boolean-prop> can only have a single value"),
+            (f"{invalid_prefix}-empty-property.xlsx",                    "At least one value per property is required"),
+            (f"{invalid_prefix}-id-propname-both.xlsx",                  "Exactly 1 of the 2 columns 'id' and 'prop name' must have an entry"),
+            (f"{invalid_prefix}-id-propname-none.xlsx",                  "Exactly 1 of the 2 columns 'id' and 'prop name' must have an entry"),
+            (f"{invalid_prefix}-missing-prop-permissions.xlsx",          "Missing permissions for value .+ of property"),
+            (f"{invalid_prefix}-missing-resource-label.xlsx",            "Missing label for resource"),
+            (f"{invalid_prefix}-missing-resource-permissions.xlsx",      "Missing permissions for resource"),
+            (f"{invalid_prefix}-missing-restype.xlsx",                   "Missing restype"),
+            (f"{invalid_prefix}-no-bitstream-permissions.xlsx",          "'file permissions' missing"),
+            (f"{invalid_prefix}-nonexisting-proptype.xlsx",              "Invalid prop type"),
+            (f"{invalid_prefix}-single-invalid-value-for-property.xlsx", "has an entry in column \\d+_permissions, but not in \\d+_value")
         ]
-        for file, regex in invalid_cases:
-            with self.assertRaisesRegex(BaseError, regex, msg=f"Failed with file '{file}'"):
+        for file, _regex in invalid_cases:
+            with self.assertRaisesRegex(BaseError, _regex, msg=f"Failed with file '{file}'"):
                 excel2xml.excel2xml(file, "1234", f"excel2xml-invalid")
 
 

diff --git a/test/unittests/test_shared.py b/test/unittests/test_shared.py
@@ -45,12 +45,11 @@ def test_prepare_dataframe(self) -> None:
 
     def test_check_notna(self) -> None:
         na_values = [None, pd.NA, np.nan, "", "  ", "-", ",", ".", "*", " ⳰", " ῀ ", " ῾ ", " \n\t ", "N/A", "n/a",
-
-                     "<NA>", ["a", "b"], pd.array(["a", "b"]), np.array([0, 1])]
+                     "<NA>", "None", ["a", "b"], pd.array(["a", "b"]), np.array([0, 1])]
         for na_value in na_values:
             self.assertFalse(shared.check_notna(na_value), msg=f"Failed na_value: {na_value}")
 
-        notna_values = [1, 0.1, True, False, "True", "False", r" \n\t ", "0", "_", "Ὅμηρος"]
+        notna_values = [1, 0.1, True, False, "True", "False", r" \n\t ", "0", "_", "Ὅμηρος", "!", "?"]
         notna_values.extend([PropertyElement(x) for x in notna_values])
         for notna_value in notna_values:
             self.assertTrue(shared.check_notna(notna_value), msg=f"Failed notna_value: {notna_value}")

diff --git a/testdata/excel2xml-expected-output.xml b/testdata/excel2xml-expected-output.xml
@@ -28,6 +28,8 @@
         <text-prop name=":hasName">
             <text permissions="prop-default" encoding="utf8">Homer</text>
             <text permissions="prop-default" encoding="utf8">Ὅμηρος</text>
+            <text permissions="prop-default" encoding="utf8">??</text>
+            <text permissions="prop-default" encoding="utf8">-</text>
         </text-prop>
         <uri-prop name=":hasIdentifier">
             <uri permissions="prop-default">http://d-nb.info/gnd/11855333X</uri>