Skip to content

Commit

Permalink
test: add unittest for excel to json lists (DEV-534) (#157)
Browse files Browse the repository at this point in the history
* update gitignore, add test data

* first draft

* finished unit test

* update german test file

* add jsonpath_ng to requirements.txt

* add pandas to requirements.txt

* correct bazel

* corrected test data, added some error messages

* corrected test data

* import jsonpath_ng.ext separately

* improvement: take the english label instead of the node name, because the node name can differ because of its appended counter

* implement reviewer's feedback
  • Loading branch information
jnussbaum committed Feb 22, 2022
1 parent 660d57b commit 021f05c
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 4 deletions.
6 changes: 5 additions & 1 deletion .gitignore
Expand Up @@ -45,6 +45,8 @@ venv/
ENV/
env.bak/
venv.bak/
Pipfile
Pipfile.lock

# Spyder project settings
.spyderproject
Expand All @@ -56,7 +58,7 @@ venv.bak/
# mkdocs documentation
/site

# mypy
# IDE
.mypy_cache/
.idea
.vscode
Expand All @@ -65,6 +67,8 @@ venv.bak/
lists.json
out.json
id2iri_*
**/~$*.*
testdata/tmp/lists_output.json

# bazel
/bazel-*
Expand Down
13 changes: 10 additions & 3 deletions knora/dsplib/utils/excel_to_json_lists.py
Expand Up @@ -86,8 +86,9 @@ def get_values_from_excel(
list_of_lists_of_previous_cell_values.append(new_check_list)

if contains_duplicates(list_of_lists_of_previous_cell_values):
print('There is at least one duplicate node in the list. Found duplicate: ', cell.value.strip())
quit()
print(f'There is at least one duplicate node in the list. Found duplicate in column {cell.column}, '
f'row {cell.row}:\n"{cell.value.strip()}"')
quit(1)

# create a simplified version of the cell value and use it as name of the node
nodename = simplify_name(cell.value.strip())
Expand All @@ -102,7 +103,13 @@ def get_values_from_excel(
# read label values from the other Excel files (other languages)
labels_dict: dict[str, str] = {}
for other_lang, ws_other_lang in excelfiles.items():
labels_dict[other_lang] = ws_other_lang.cell(column=col, row=row).value.strip()
cell_value = ws_other_lang.cell(column=col, row=row).value
if not(isinstance(cell_value, str) and len(cell_value) > 0):
print(f'ERROR: Malformed Excel file: The Excel file with the language code "{other_lang}" '
f'should have a value in row {row}, column {col}')
quit(1)
else:
labels_dict[other_lang] = cell_value.strip()

# create current node from extracted cell values and append it to the nodes list
currentnode = {'name': nodename, 'labels': labels_dict}
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Expand Up @@ -16,6 +16,7 @@ isodate==0.6.0
Jinja2==3.0.2
joblib==1.1.0
jsonschema==4.2.1
jsonpath_ng==1.5.3
keyring==23.2.1
livereload==2.6.3
lunr==0.5.8
Expand All @@ -32,6 +33,7 @@ mkdocstrings==0.16.2
nltk==3.6.6
openpyxl==3.0.9
packaging==21.2
pandas==1.4.1
pkginfo==1.7.1
Pygments==2.10.0
pymdown-extensions==9.0
Expand Down
11 changes: 11 additions & 0 deletions test/unittests/BUILD.bazel
Expand Up @@ -15,6 +15,17 @@ py_test(
],
)

# Unit test target for the Excel-to-JSON-lists conversion.
py_test(
name = "test_excel_to_json_lists",
srcs = ["test_excel_to_json_lists.py"],
# library under test
deps = [
"//knora/dsplib/utils:excel_to_json_lists"
],
# make the files of the //testdata target available to the test at runtime
data = [
"//testdata",
]
)

py_test(
name = "test_langstring",
srcs = ["test_langstring.py"],
Expand Down
51 changes: 51 additions & 0 deletions test/unittests/test_excel_to_json_lists.py
@@ -0,0 +1,51 @@
"""unit tests for Excel to JSON list"""
import os
import unittest
import json
import jsonpath_ng
import jsonpath_ng.ext
import pandas

from knora.dsplib.utils import excel_to_json_lists as e2l


class TestExcelToJSONList(unittest.TestCase):
    """Unit tests for the conversion of Excel list files to a JSON list file."""

    def test_excel2jsonlist(self) -> None:
        """Convert the Excel files in testdata/lists and inspect the JSON output.

        Three properties are checked:

        1. the output file is created by the conversion,
        2. the number of named nodes in the output matches the number of rows
           of the English Excel file,
        3. every row that reaches into the last column can be found exactly
           once in the output, by following the English labels from the root
           down through the nested ``nodes``.
        """
        excelfolder = "testdata/lists"
        outfile = "testdata/tmp/lists_output.json"

        # Start from a clean slate: a leftover output file from a previous run
        # would make the "file was created" check below pass vacuously, and a
        # missing output folder would make the run fail for an unrelated reason.
        os.makedirs(os.path.dirname(outfile), exist_ok=True)
        if os.path.exists(outfile):
            os.remove(outfile)

        # check that the output file was created
        e2l.list_excel2json(listname=None, excelfolder=excelfolder, outfile=outfile)
        self.assertTrue(os.path.exists(outfile), f'The outfile {outfile} was not created')

        # check that the output file has the same number of nodes as the Excel file has rows
        with open(outfile) as f:
            output_as_dict = json.load(f)
        output_nodes_matches = jsonpath_ng.parse('$..name').find(output_as_dict)
        input_df = pandas.read_excel("testdata/lists/description_en.xlsx", header=None, dtype='str')
        # NOTE(review): the "- 1" presumably discounts the name of the list itself,
        # which has no row of its own in the Excel file - confirm against the output format.
        self.assertEqual(
            len(input_df.index),
            len(output_nodes_matches) - 1,
            "The output JSON file doesn't have the same number of nodes than the Excel file has rows"
        )

        # check that the longest Excel row(s) were correctly translated to the deepest-nested node(s)
        # count() returns a Series that maps each column number to the number of entries it contains
        # index[-1] returns the number of the last column (in this test case: 3)
        last_non_empty_column_index = input_df.count().index[-1]
        # notna() returns a boolean Series with True for every non-empty cell in that column
        longest_rows_selector = input_df[last_non_empty_column_index].notna()
        for index, row in input_df.loc[longest_rows_selector].iterrows():
            jsonpath_elems = [cell.strip() for cell in row]
            # build one filter step per Excel cell, matching on the English label because
            # the node name can differ from the cell value
            parser_string = '$'
            for elem in jsonpath_elems:
                parser_string = parser_string + f'.nodes[?(@.labels.en == "{elem}")]'
            node_match = jsonpath_ng.ext.parse(parser_string).find(output_as_dict)
            self.assertEqual(
                len(node_match),
                1,
                f'The node "{jsonpath_elems[-1]}" from Excel row {index+1} was not correctly translated to the output '
                f'JSON file.'
            )


# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
Binary file modified testdata/lists/Beschreibung_de.xlsx
Binary file not shown.
Binary file modified testdata/lists/description_en.xlsx
Binary file not shown.

0 comments on commit 021f05c

Please sign in to comment.