Skip to content

Commit

Permalink
fix(excel2xml): prevent writing empty text-prop, make text-prop valid…
Browse files Browse the repository at this point in the history
…ation less restrictive (DEV-1440) #243
  • Loading branch information
jnussbaum committed Oct 25, 2022
1 parent 9d1383c commit ae777e4
Show file tree
Hide file tree
Showing 13 changed files with 108 additions and 33 deletions.
42 changes: 38 additions & 4 deletions docs/dsp-tools-excel2xml.md
Expand Up @@ -229,6 +229,23 @@ else:
excel2xml.make_boolean_prop(":hasBoolean", False)
```

#### Supported text values
DSP's only restriction on text-properties is that the string must be longer than 0. It is, for example, possible to
upload the following property:
```xml
<text-prop name=":hasText">
<text encoding="utf8"> </text>
<text encoding="utf8">-</text>
</text-prop>
```

`excel2xml` allows to create such a property, but text values that don't meet the requirements of
[`excel2xml.check_notna()`](#check-if-a-cell-contains-a-usable-value) will trigger a warning, for example:
```python
excel2xml.make_text_prop(":hasText", " ") # OK, but triggers a warning
excel2xml.make_text_prop(":hasText", "-") # OK, but triggers a warning
```


### 8. Append the resource to root
At the end of the for-loop, it is important not to forget to append the finished resource to the root.
Expand All @@ -245,11 +262,28 @@ usable if it is

- a number (integer or float, but not numpy.nan)
- a boolean
- a string with at least one Unicode letter (matching the regex `\p{L}`), underscore, ?, !, or number, but not "None",
"<NA>", "N/A", or "-"
- a string with at least one Unicode letter (matching the regex ``\\p{L}``) or number, or at least one _, !, or ?
(The strings "None", "<NA>", "N/A", and "-" are considered invalid.)
- a PropertyElement whose "value" fulfills the above criteria

Why not just checking a cell by its boolean value? Like:
Examples:
```
check_notna(0) == True
check_notna(False) == True
check_notna("œ") == True
check_notna("0") == True
check_notna("_") == True
check_notna("!") == True
check_notna("?") == True
check_notna(None) == False
check_notna("None") == False
check_notna(<NA>) == False
check_notna("<NA>") == False
check_notna("-") == False
check_notna(" ") == False
```

But why not just checking a cell by its boolean value? Like:
```
if cell:
resource.append(make_*_prop(cell))
Expand All @@ -264,7 +298,7 @@ might expect:
| " " | True | False, because an empty string is not usable for a text property |
| numpy.nan | True | False, because N/A is not a usable value |
| pandas.NA | TypeError (*) | False, because N/A is not a usable value |
| "<NA>" | True | False, because this is the string representation of N/A |
| "&lt;NA&gt;" | True | False, because this is the string representation of N/A |
| "-" | True | False, because this is a placeholder in an empty text field |
(*) TypeError: boolean value of NA is ambiguous

Expand Down
19 changes: 17 additions & 2 deletions knora/dsplib/utils/shared.py
Expand Up @@ -175,15 +175,30 @@ def check_notna(value: Optional[Any]) -> bool:
Check a value if it is usable in the context of data archiving. A value is considered usable if it is
- a number (integer or float, but not np.nan)
- a boolean
- a string with at least one Unicode letter (matching the regex ``\\p{L}``), underscore, !, ?, or number, but not
"None", "<NA>", "N/A", or "-"
- a string with at least one Unicode letter (matching the regex ``\\p{L}``) or number, or at least one _, !, or ?
(The strings "None", "<NA>", "N/A", and "-" are considered invalid.)
- a PropertyElement whose "value" fulfills the above criteria
Args:
value: any object encountered when analysing data
Returns:
True if the value is usable, False if it is N/A or otherwise unusable
Examples:
>>> check_notna(0) == True
>>> check_notna(False) == True
>>> check_notna("œ") == True
>>> check_notna("0") == True
>>> check_notna("_") == True
>>> check_notna("!") == True
>>> check_notna("?") == True
>>> check_notna(None) == False
>>> check_notna("None") == False
>>> check_notna(<NA>) == False
>>> check_notna("<NA>") == False
>>> check_notna("-") == False
>>> check_notna(" ") == False
"""

if isinstance(value, PropertyElement):
Expand Down
27 changes: 19 additions & 8 deletions knora/excel2xml.py
Expand Up @@ -1189,9 +1189,12 @@ def make_text_prop(

# check value type
for val in values:
if not isinstance(val.value, str) or not check_notna(val.value):
if not isinstance(val.value, str) or len(val.value) < 1:
raise BaseError(f"Failed validation in resource '{calling_resource}', property '{name}': "
f"'{val.value}' is not a valid string.")
if not check_notna(val.value):
warnings.warn(f"Warning for resource '{calling_resource}', property '{name}': "
f"'{val.value}' is probably not a usable string.", stacklevel=2)

# make xml structure of the valid values
prop_ = etree.Element(
Expand Down Expand Up @@ -1743,10 +1746,6 @@ def excel2xml(datafile: str, shortcode: str, default_ontology: str) -> None:
"text-prop": make_text_prop,
"uri-prop": make_uri_prop
}
single_value_functions = [
make_bitstream_prop,
make_boolean_prop
]
if re.search(r"\.csv$", datafile):
# "utf_8_sig": an optional BOM at the start of the file will be skipped
# let the "python" engine detect the separator
Expand All @@ -1757,7 +1756,7 @@ def excel2xml(datafile: str, shortcode: str, default_ontology: str) -> None:
raise BaseError("The argument 'datafile' must have one of the extensions 'csv', 'xls', 'xlsx'")
# replace NA-like cells by NA
main_df = main_df.applymap(
lambda x: x if pd.notna(x) and regex.search(r"[\w\p{L}]", str(x), flags=regex.U) else pd.NA
lambda x: x if pd.notna(x) and regex.search(r"[\p{L}\d_!?\-]", str(x), flags=regex.U) else pd.NA
)
# remove empty columns, so that the max_prop_count can be calculated without errors
main_df.dropna(axis="columns", how="all", inplace=True)
Expand Down Expand Up @@ -1847,7 +1846,7 @@ def excel2xml(datafile: str, shortcode: str, default_ontology: str) -> None:
property_elements: list[PropertyElement] = []
for i in range(1, max_prop_count + 1):
value = row[f"{i}_value"]
if check_notna(value):
if pd.notna(value):
kwargs_propelem = {
"value": value,
"permissions": str(row.get(f"{i}_permissions"))
Expand All @@ -1861,13 +1860,25 @@ def excel2xml(datafile: str, shortcode: str, default_ontology: str) -> None:
kwargs_propelem["encoding"] = str(row[f"{i}_encoding"])

property_elements.append(PropertyElement(**kwargs_propelem))
elif check_notna(str(row.get(f"{i}_permissions"))):
raise BaseError(f"Excel row {int(str(index)) + 2} has an entry in column {i}_permissions, but not "
f"in {i}_value. Please note that cell contents that don't meet the requirements of "
r"the regex [\p{L}\d_!?\-] are considered inexistent.")

# validate property_elements
if len(property_elements) == 0:
raise BaseError(f"At least one value per property is required, but Excel row {int(str(index)) + 2}"
f"doesn't contain any values.")
if make_prop_function == make_boolean_prop and len(property_elements) != 1:
raise BaseError(f"A <boolean-prop> can only have a single value, but Excel row {int(str(index)) + 2} "
f"contains more than one values.")

# create the property and append it to resource
kwargs_propfunc: dict[str, Union[str, PropertyElement, list[PropertyElement]]] = {
"name": row["prop name"],
"calling_resource": resource_id
}
if make_prop_function in single_value_functions and len(property_elements) == 1:
if make_prop_function == make_boolean_prop:
kwargs_propfunc["value"] = property_elements[0]
else:
kwargs_propfunc["value"] = property_elements
Expand Down
32 changes: 20 additions & 12 deletions test/unittests/test_excel2xml.py
Expand Up @@ -5,6 +5,7 @@
from typing import Callable, Sequence, Union, Optional, Any

import numpy as np
import pytest
from lxml import etree

from knora import excel2xml
Expand Down Expand Up @@ -104,6 +105,8 @@ def run_test(
xml_returned = method(**kwargs_to_generate_xml)
xml_returned = etree.tostring(xml_returned, encoding="unicode")
xml_returned = re.sub(r" xmlns(:.+?)?=\".+?\"", "", xml_returned) # remove all xml namespace declarations
xml_returned = xml_returned.replace("&lt;", "<")
xml_returned = xml_returned.replace("&gt;", ">")
testcase.assertEqual(xml_expected, xml_returned,
msg=f"Method {method.__name__} failed with kwargs {kwargs_to_generate_xml}")

Expand Down Expand Up @@ -348,11 +351,12 @@ def test_make_resptr_prop(self) -> None:
run_test(self, prop, method, different_values, invalid_values)


@pytest.mark.filterwarnings("ignore")
def test_make_text_prop(self) -> None:
prop = "text"
method = excel2xml.make_text_prop
different_values = ["text_1", "text_2", "text_3", "text_4", "text_5"]
invalid_values = [True, 10.0, 5]
different_values = ["text_1", " ", "!", "?", "-", "_", "None", "<NA>"]
invalid_values = [True, 10.0, 5, ""]
run_test(self, prop, method, different_values, invalid_values)

# test encoding="xml"
Expand Down Expand Up @@ -513,6 +517,7 @@ def test_create_json_list_mapping(self) -> None:
self.assertDictEqual(testlist_mapping_returned, testlist_mapping_expected)


@pytest.mark.filterwarnings("ignore")
def test_excel2xml(self) -> None:
# test the valid files, 3 times identical, but in the three formats XLSX, XLS, and CSV
with open("testdata/excel2xml-expected-output.xml") as f:
Expand All @@ -528,17 +533,20 @@ def test_excel2xml(self) -> None:
# test the invalid files
invalid_prefix = "testdata/invalid_testdata/excel2xml-testdata-invalid"
invalid_cases = [
(f"{invalid_prefix}-id-propname-both.xlsx", "Exactly 1 of the 2 columns 'id' and 'prop name' must have an entry"),
(f"{invalid_prefix}-id-propname-none.xlsx", "Exactly 1 of the 2 columns 'id' and 'prop name' must have an entry"),
(f"{invalid_prefix}-missing-prop-permissions.xlsx", "Missing permissions for value .+ of property"),
(f"{invalid_prefix}-missing-resource-label.xlsx", "Missing label for resource"),
(f"{invalid_prefix}-missing-resource-permissions.xlsx", "Missing permissions for resource"),
(f"{invalid_prefix}-missing-restype.xlsx", "Missing restype"),
(f"{invalid_prefix}-no-bitstream-permissions.xlsx", "'file permissions' missing"),
(f"{invalid_prefix}-nonexisting-proptype.xlsx", "Invalid prop type"),
(f"{invalid_prefix}-boolean-prop-two-values.xlsx", "A <boolean-prop> can only have a single value"),
(f"{invalid_prefix}-empty-property.xlsx", "At least one value per property is required"),
(f"{invalid_prefix}-id-propname-both.xlsx", "Exactly 1 of the 2 columns 'id' and 'prop name' must have an entry"),
(f"{invalid_prefix}-id-propname-none.xlsx", "Exactly 1 of the 2 columns 'id' and 'prop name' must have an entry"),
(f"{invalid_prefix}-missing-prop-permissions.xlsx", "Missing permissions for value .+ of property"),
(f"{invalid_prefix}-missing-resource-label.xlsx", "Missing label for resource"),
(f"{invalid_prefix}-missing-resource-permissions.xlsx", "Missing permissions for resource"),
(f"{invalid_prefix}-missing-restype.xlsx", "Missing restype"),
(f"{invalid_prefix}-no-bitstream-permissions.xlsx", "'file permissions' missing"),
(f"{invalid_prefix}-nonexisting-proptype.xlsx", "Invalid prop type"),
(f"{invalid_prefix}-single-invalid-value-for-property.xlsx", "has an entry in column \\d+_permissions, but not in \\d+_value")
]
for file, regex in invalid_cases:
with self.assertRaisesRegex(BaseError, regex, msg=f"Failed with file '{file}'"):
for file, _regex in invalid_cases:
with self.assertRaisesRegex(BaseError, _regex, msg=f"Failed with file '{file}'"):
excel2xml.excel2xml(file, "1234", f"excel2xml-invalid")


Expand Down
5 changes: 2 additions & 3 deletions test/unittests/test_shared.py
Expand Up @@ -45,12 +45,11 @@ def test_prepare_dataframe(self) -> None:

def test_check_notna(self) -> None:
na_values = [None, pd.NA, np.nan, "", " ", "-", ",", ".", "*", " ⳰", " ῀ ", " ῾ ", " \n\t ", "N/A", "n/a",

"<NA>", ["a", "b"], pd.array(["a", "b"]), np.array([0, 1])]
"<NA>", "None", ["a", "b"], pd.array(["a", "b"]), np.array([0, 1])]
for na_value in na_values:
self.assertFalse(shared.check_notna(na_value), msg=f"Failed na_value: {na_value}")

notna_values = [1, 0.1, True, False, "True", "False", r" \n\t ", "0", "_", "Ὅμηρος"]
notna_values = [1, 0.1, True, False, "True", "False", r" \n\t ", "0", "_", "Ὅμηρος", "!", "?"]
notna_values.extend([PropertyElement(x) for x in notna_values])
for notna_value in notna_values:
self.assertTrue(shared.check_notna(notna_value), msg=f"Failed notna_value: {notna_value}")
Expand Down
2 changes: 2 additions & 0 deletions testdata/excel2xml-expected-output.xml
Expand Up @@ -28,6 +28,8 @@
<text-prop name=":hasName">
<text permissions="prop-default" encoding="utf8">Homer</text>
<text permissions="prop-default" encoding="utf8">Ὅμηρος</text>
<text permissions="prop-default" encoding="utf8">??</text>
<text permissions="prop-default" encoding="utf8">-</text>
</text-prop>
<uri-prop name=":hasIdentifier">
<uri permissions="prop-default">http://d-nb.info/gnd/11855333X</uri>
Expand Down

0 comments on commit ae777e4

Please sign in to comment.