Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(excel2xml): prevent writing empty text-prop, make text-prop validation less restrictive (DEV-1440) #243

Merged
merged 1 commit into from Oct 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
42 changes: 38 additions & 4 deletions docs/dsp-tools-excel2xml.md
Expand Up @@ -229,6 +229,23 @@ else:
excel2xml.make_boolean_prop(":hasBoolean", False)
```

#### Supported text values
DSP's only restriction on text-properties is that the string must be longer than 0. It is, for example, possible to
upload the following property:
```xml
<text-prop name=":hasText">
<text encoding="utf8"> </text>
<text encoding="utf8">-</text>
</text-prop>
```

`excel2xml` allows to create such a property, but text values that don't meet the requirements of
[`excel2xml.check_notna()`](#check-if-a-cell-contains-a-usable-value) will trigger a warning, for example:
```python
excel2xml.make_text_prop(":hasText", " ") # OK, but triggers a warning
excel2xml.make_text_prop(":hasText", "-") # OK, but triggers a warning
```


### 8. Append the resource to root
At the end of the for-loop, it is important not to forget to append the finished resource to the root.
Expand All @@ -245,11 +262,28 @@ usable if it is

- a number (integer or float, but not numpy.nan)
- a boolean
- a string with at least one Unicode letter (matching the regex `\p{L}`), underscore, ?, !, or number, but not "None",
"<NA>", "N/A", or "-"
- a string with at least one Unicode letter (matching the regex ``\\p{L}``) or number, or at least one _, !, or ?
(The strings "None", "<NA>", "N/A", and "-" are considered invalid.)
- a PropertyElement whose "value" fulfills the above criteria

Why not just checking a cell by its boolean value? Like:
Examples:
```
check_notna(0) == True
check_notna(False) == True
check_notna("œ") == True
check_notna("0") == True
check_notna("_") == True
check_notna("!") == True
check_notna("?") == True
check_notna(None) == False
check_notna("None") == False
check_notna(<NA>) == False
check_notna("<NA>") == False
check_notna("-") == False
check_notna(" ") == False
```

But why not just checking a cell by its boolean value? Like:
```
if cell:
resource.append(make_*_prop(cell))
Expand All @@ -264,7 +298,7 @@ might expect:
| " " | True | False, because an empty string is not usable for a text property |
| numpy.nan | True | False, because N/A is not a usable value |
| pandas.NA | TypeError (*) | False, because N/A is not a usable value |
| "<NA>" | True | False, because this is the string representation of N/A |
| "&lt;NA&gt;" | True | False, because this is the string representation of N/A |
| "-" | True | False, because this is a placeholder in an empty text field |
(*) TypeError: boolean value of NA is ambiguous

Expand Down
19 changes: 17 additions & 2 deletions knora/dsplib/utils/shared.py
Expand Up @@ -175,15 +175,30 @@ def check_notna(value: Optional[Any]) -> bool:
Check a value if it is usable in the context of data archiving. A value is considered usable if it is
- a number (integer or float, but not np.nan)
- a boolean
- a string with at least one Unicode letter (matching the regex ``\\p{L}``), underscore, !, ?, or number, but not
"None", "<NA>", "N/A", or "-"
- a string with at least one Unicode letter (matching the regex ``\\p{L}``) or number, or at least one _, !, or ?
(The strings "None", "<NA>", "N/A", and "-" are considered invalid.)
- a PropertyElement whose "value" fulfills the above criteria

Args:
value: any object encountered when analysing data

Returns:
True if the value is usable, False if it is N/A or otherwise unusable

Examples:
>>> check_notna(0) == True
>>> check_notna(False) == True
>>> check_notna("œ") == True
>>> check_notna("0") == True
>>> check_notna("_") == True
>>> check_notna("!") == True
>>> check_notna("?") == True
>>> check_notna(None) == False
>>> check_notna("None") == False
>>> check_notna(<NA>) == False
>>> check_notna("<NA>") == False
>>> check_notna("-") == False
>>> check_notna(" ") == False
"""

if isinstance(value, PropertyElement):
Expand Down
27 changes: 19 additions & 8 deletions knora/excel2xml.py
Expand Up @@ -1189,9 +1189,12 @@ def make_text_prop(

# check value type
for val in values:
if not isinstance(val.value, str) or not check_notna(val.value):
if not isinstance(val.value, str) or len(val.value) < 1:
jnussbaum marked this conversation as resolved.
Show resolved Hide resolved
raise BaseError(f"Failed validation in resource '{calling_resource}', property '{name}': "
f"'{val.value}' is not a valid string.")
if not check_notna(val.value):
warnings.warn(f"Warning for resource '{calling_resource}', property '{name}': "
f"'{val.value}' is probably not a usable string.", stacklevel=2)

# make xml structure of the valid values
prop_ = etree.Element(
Expand Down Expand Up @@ -1743,10 +1746,6 @@ def excel2xml(datafile: str, shortcode: str, default_ontology: str) -> None:
"text-prop": make_text_prop,
"uri-prop": make_uri_prop
}
single_value_functions = [
make_bitstream_prop,
make_boolean_prop
]
if re.search(r"\.csv$", datafile):
# "utf_8_sig": an optional BOM at the start of the file will be skipped
# let the "python" engine detect the separator
Expand All @@ -1757,7 +1756,7 @@ def excel2xml(datafile: str, shortcode: str, default_ontology: str) -> None:
raise BaseError("The argument 'datafile' must have one of the extensions 'csv', 'xls', 'xlsx'")
# replace NA-like cells by NA
main_df = main_df.applymap(
lambda x: x if pd.notna(x) and regex.search(r"[\w\p{L}]", str(x), flags=regex.U) else pd.NA
lambda x: x if pd.notna(x) and regex.search(r"[\p{L}\d_!?\-]", str(x), flags=regex.U) else pd.NA
)
# remove empty columns, so that the max_prop_count can be calculated without errors
main_df.dropna(axis="columns", how="all", inplace=True)
Expand Down Expand Up @@ -1847,7 +1846,7 @@ def excel2xml(datafile: str, shortcode: str, default_ontology: str) -> None:
property_elements: list[PropertyElement] = []
for i in range(1, max_prop_count + 1):
value = row[f"{i}_value"]
if check_notna(value):
if pd.notna(value):
kwargs_propelem = {
"value": value,
"permissions": str(row.get(f"{i}_permissions"))
Expand All @@ -1861,13 +1860,25 @@ def excel2xml(datafile: str, shortcode: str, default_ontology: str) -> None:
kwargs_propelem["encoding"] = str(row[f"{i}_encoding"])

property_elements.append(PropertyElement(**kwargs_propelem))
elif check_notna(str(row.get(f"{i}_permissions"))):
raise BaseError(f"Excel row {int(str(index)) + 2} has an entry in column {i}_permissions, but not "
f"in {i}_value. Please note that cell contents that don't meet the requirements of "
r"the regex [\p{L}\d_!?\-] are considered inexistent.")

# validate property_elements
if len(property_elements) == 0:
raise BaseError(f"At least one value per property is required, but Excel row {int(str(index)) + 2}"
f"doesn't contain any values.")
if make_prop_function == make_boolean_prop and len(property_elements) != 1:
raise BaseError(f"A <boolean-prop> can only have a single value, but Excel row {int(str(index)) + 2} "
f"contains more than one values.")

# create the property and append it to resource
kwargs_propfunc: dict[str, Union[str, PropertyElement, list[PropertyElement]]] = {
"name": row["prop name"],
"calling_resource": resource_id
}
if make_prop_function in single_value_functions and len(property_elements) == 1:
if make_prop_function == make_boolean_prop:
kwargs_propfunc["value"] = property_elements[0]
else:
kwargs_propfunc["value"] = property_elements
Expand Down
32 changes: 20 additions & 12 deletions test/unittests/test_excel2xml.py
Expand Up @@ -5,6 +5,7 @@
from typing import Callable, Sequence, Union, Optional, Any

import numpy as np
import pytest
from lxml import etree

from knora import excel2xml
Expand Down Expand Up @@ -104,6 +105,8 @@ def run_test(
xml_returned = method(**kwargs_to_generate_xml)
xml_returned = etree.tostring(xml_returned, encoding="unicode")
xml_returned = re.sub(r" xmlns(:.+?)?=\".+?\"", "", xml_returned) # remove all xml namespace declarations
xml_returned = xml_returned.replace("&lt;", "<")
xml_returned = xml_returned.replace("&gt;", ">")
testcase.assertEqual(xml_expected, xml_returned,
msg=f"Method {method.__name__} failed with kwargs {kwargs_to_generate_xml}")

Expand Down Expand Up @@ -348,11 +351,12 @@ def test_make_resptr_prop(self) -> None:
run_test(self, prop, method, different_values, invalid_values)


@pytest.mark.filterwarnings("ignore")
def test_make_text_prop(self) -> None:
prop = "text"
method = excel2xml.make_text_prop
different_values = ["text_1", "text_2", "text_3", "text_4", "text_5"]
invalid_values = [True, 10.0, 5]
different_values = ["text_1", " ", "!", "?", "-", "_", "None", "<NA>"]
invalid_values = [True, 10.0, 5, ""]
run_test(self, prop, method, different_values, invalid_values)

# test encoding="xml"
Expand Down Expand Up @@ -513,6 +517,7 @@ def test_create_json_list_mapping(self) -> None:
self.assertDictEqual(testlist_mapping_returned, testlist_mapping_expected)


@pytest.mark.filterwarnings("ignore")
def test_excel2xml(self) -> None:
# test the valid files, 3 times identical, but in the three formats XLSX, XLS, and CSV
with open("testdata/excel2xml-expected-output.xml") as f:
Expand All @@ -528,17 +533,20 @@ def test_excel2xml(self) -> None:
# test the invalid files
invalid_prefix = "testdata/invalid_testdata/excel2xml-testdata-invalid"
invalid_cases = [
(f"{invalid_prefix}-id-propname-both.xlsx", "Exactly 1 of the 2 columns 'id' and 'prop name' must have an entry"),
(f"{invalid_prefix}-id-propname-none.xlsx", "Exactly 1 of the 2 columns 'id' and 'prop name' must have an entry"),
(f"{invalid_prefix}-missing-prop-permissions.xlsx", "Missing permissions for value .+ of property"),
(f"{invalid_prefix}-missing-resource-label.xlsx", "Missing label for resource"),
(f"{invalid_prefix}-missing-resource-permissions.xlsx", "Missing permissions for resource"),
(f"{invalid_prefix}-missing-restype.xlsx", "Missing restype"),
(f"{invalid_prefix}-no-bitstream-permissions.xlsx", "'file permissions' missing"),
(f"{invalid_prefix}-nonexisting-proptype.xlsx", "Invalid prop type"),
(f"{invalid_prefix}-boolean-prop-two-values.xlsx", "A <boolean-prop> can only have a single value"),
(f"{invalid_prefix}-empty-property.xlsx", "At least one value per property is required"),
(f"{invalid_prefix}-id-propname-both.xlsx", "Exactly 1 of the 2 columns 'id' and 'prop name' must have an entry"),
(f"{invalid_prefix}-id-propname-none.xlsx", "Exactly 1 of the 2 columns 'id' and 'prop name' must have an entry"),
(f"{invalid_prefix}-missing-prop-permissions.xlsx", "Missing permissions for value .+ of property"),
(f"{invalid_prefix}-missing-resource-label.xlsx", "Missing label for resource"),
(f"{invalid_prefix}-missing-resource-permissions.xlsx", "Missing permissions for resource"),
(f"{invalid_prefix}-missing-restype.xlsx", "Missing restype"),
(f"{invalid_prefix}-no-bitstream-permissions.xlsx", "'file permissions' missing"),
(f"{invalid_prefix}-nonexisting-proptype.xlsx", "Invalid prop type"),
(f"{invalid_prefix}-single-invalid-value-for-property.xlsx", "has an entry in column \\d+_permissions, but not in \\d+_value")
]
for file, regex in invalid_cases:
with self.assertRaisesRegex(BaseError, regex, msg=f"Failed with file '{file}'"):
for file, _regex in invalid_cases:
with self.assertRaisesRegex(BaseError, _regex, msg=f"Failed with file '{file}'"):
excel2xml.excel2xml(file, "1234", f"excel2xml-invalid")


Expand Down
5 changes: 2 additions & 3 deletions test/unittests/test_shared.py
Expand Up @@ -45,12 +45,11 @@ def test_prepare_dataframe(self) -> None:

def test_check_notna(self) -> None:
na_values = [None, pd.NA, np.nan, "", " ", "-", ",", ".", "*", " ⳰", " ῀ ", " ῾ ", " \n\t ", "N/A", "n/a",

"<NA>", ["a", "b"], pd.array(["a", "b"]), np.array([0, 1])]
"<NA>", "None", ["a", "b"], pd.array(["a", "b"]), np.array([0, 1])]
for na_value in na_values:
self.assertFalse(shared.check_notna(na_value), msg=f"Failed na_value: {na_value}")

notna_values = [1, 0.1, True, False, "True", "False", r" \n\t ", "0", "_", "Ὅμηρος"]
notna_values = [1, 0.1, True, False, "True", "False", r" \n\t ", "0", "_", "Ὅμηρος", "!", "?"]
notna_values.extend([PropertyElement(x) for x in notna_values])
for notna_value in notna_values:
self.assertTrue(shared.check_notna(notna_value), msg=f"Failed notna_value: {notna_value}")
Expand Down
2 changes: 2 additions & 0 deletions testdata/excel2xml-expected-output.xml
Expand Up @@ -28,6 +28,8 @@
<text-prop name=":hasName">
<text permissions="prop-default" encoding="utf8">Homer</text>
<text permissions="prop-default" encoding="utf8">Ὅμηρος</text>
<text permissions="prop-default" encoding="utf8">??</text>
<text permissions="prop-default" encoding="utf8">-</text>
</text-prop>
<uri-prop name=":hasIdentifier">
<uri permissions="prop-default">http://d-nb.info/gnd/11855333X</uri>
Expand Down