test: add unittest for excel to json lists (DEV-534) (#157)

* update gitignore, add test data * first draft * finished unit test * update german test file * add jsonpath_ng to requirements.txt * add pandas to requirements.txt * correct bazel * corrected test data, added some error messages * corrected test data * import jsonpath_ng.ext separately * improvement: take the english label instead of the node name, because the node name can differ because of its appended counter * implement reviewer's feedback
dasch-swiss · Feb 22, 2022 · 021f05c · 021f05c
1 parent 660d57b
commit 021f05c
Show file tree

Hide file tree

Showing 7 changed files with 79 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -45,6 +45,8 @@ venv/
 ENV/
 env.bak/
 venv.bak/
+Pipfile
+Pipfile.lock
 
 # Spyder project settings
 .spyderproject
@@ -56,7 +58,7 @@ venv.bak/
 # mkdocs documentation
 /site
 
-# mypy
+# IDE
 .mypy_cache/
 .idea
 .vscode
@@ -65,6 +67,8 @@ venv.bak/
 lists.json
 out.json
 id2iri_*
+**/~$*.*
+testdata/tmp/lists_output.json
 
 # bazel
 /bazel-*

diff --git a/knora/dsplib/utils/excel_to_json_lists.py b/knora/dsplib/utils/excel_to_json_lists.py
@@ -86,8 +86,9 @@ def get_values_from_excel(
             list_of_lists_of_previous_cell_values.append(new_check_list)
 
             if contains_duplicates(list_of_lists_of_previous_cell_values):
-                print('There is at least one duplicate node in the list. Found duplicate: ', cell.value.strip())
-                quit()
+                print(f'There is at least one duplicate node in the list. Found duplicate in column {cell.column}, '
+                      f'row {cell.row}:\n"{cell.value.strip()}"')
+                quit(1)
 
             # create a simplified version of the cell value and use it as name of the node
             nodename = simplify_name(cell.value.strip())
@@ -102,7 +103,13 @@ def get_values_from_excel(
             # read label values from the other Excel files (other languages)
             labels_dict: dict[str, str] = {}
             for other_lang, ws_other_lang in excelfiles.items():
-                labels_dict[other_lang] = ws_other_lang.cell(column=col, row=row).value.strip()
+                cell_value = ws_other_lang.cell(column=col, row=row).value
+                if not(isinstance(cell_value, str) and len(cell_value) > 0):
+                    print(f'ERROR: Malformed Excel file: The Excel file with the language code "{other_lang}" '
+                          f'should have a value in row {row}, column {col}')
+                    quit(1)
+                else:
+                    labels_dict[other_lang] = cell_value.strip()
 
             # create current node from extracted cell values and append it to the nodes list
             currentnode = {'name': nodename, 'labels': labels_dict}

diff --git a/requirements.txt b/requirements.txt
@@ -16,6 +16,7 @@ isodate==0.6.0
 Jinja2==3.0.2
 joblib==1.1.0
 jsonschema==4.2.1
+jsonpath_ng==1.5.3
 keyring==23.2.1
 livereload==2.6.3
 lunr==0.5.8
@@ -32,6 +33,7 @@ mkdocstrings==0.16.2
 nltk==3.6.6
 openpyxl==3.0.9
 packaging==21.2
+pandas==1.4.1
 pkginfo==1.7.1
 Pygments==2.10.0
 pymdown-extensions==9.0

diff --git a/test/unittests/BUILD.bazel b/test/unittests/BUILD.bazel
@@ -15,6 +15,17 @@ py_test(
     ],
 )
 
+py_test(
+    name = "test_excel_to_json_lists",
+    srcs = ["test_excel_to_json_lists.py"],
+    deps = [
+        "//knora/dsplib/utils:excel_to_json_lists"
+    ],
+    data = [
+        "//testdata",
+    ]
+)
+
 py_test(
     name = "test_langstring",
     srcs = ["test_langstring.py"],

diff --git a/test/unittests/test_excel_to_json_lists.py b/test/unittests/test_excel_to_json_lists.py
@@ -0,0 +1,62 @@
+"""unit tests for Excel to JSON list"""
+import os
+import unittest
+import json
+import jsonpath_ng
+import jsonpath_ng.ext
+import pandas
+
+from knora.dsplib.utils import excel_to_json_lists as e2l
+
+
+class TestExcelToJSONList(unittest.TestCase):
+    """Tests the conversion of a folder of Excel files into a JSON list file."""
+
+    def test_excel2jsonlist(self) -> None:
+        """Convert testdata/lists to JSON and compare the result with the English Excel file."""
+        excelfolder = "testdata/lists"
+        outfile = "testdata/tmp/lists_output.json"
+
+        # make sure the existence check below cannot be satisfied by a leftover file from a previous run
+        os.makedirs(os.path.dirname(outfile), exist_ok=True)
+        if os.path.exists(outfile):
+            os.remove(outfile)
+
+        # check that the output file was created
+        e2l.list_excel2json(listname=None, excelfolder=excelfolder, outfile=outfile)
+        self.assertTrue(os.path.exists(outfile), f'The outfile {outfile} was not created')
+
+        # check that the output file has the same number of nodes as the Excel file has rows
+        with open(outfile) as f:
+            output_as_dict = json.load(f)
+        output_nodes_matches = jsonpath_ng.parse('$..name').find(output_as_dict)
+        input_df = pandas.read_excel("testdata/lists/description_en.xlsx", header=None, dtype='str')
+        # '$..name' matches every "name" key at any depth; the "- 1" presumably discounts the name of the
+        # list itself, which has no row of its own in the Excel file (TODO confirm)
+        self.assertEqual(
+            len(input_df.index),
+            len(output_nodes_matches) - 1,
+            "The output JSON file doesn't have the same number of nodes than the Excel file has rows"
+        )
+
+        # check that the longest Excel row(s) were correctly translated to the deepest-nested node(s)
+        # count() returns a Series that maps each column number to the number of entries it contains
+        # index[-1] is the label of the last column of that Series (in this test case: 3)
+        last_non_empty_column_index = input_df.count().index[-1]
+        # input_df[3].notna() returns a boolean Series with 'true' for every non-empty cell in column 3
+        longest_rows_selector = input_df[last_non_empty_column_index].notna()
+        for index, row in input_df.loc[longest_rows_selector].iterrows():
+            jsonpath_elems = [cell.strip() for cell in row]
+            # build one filter step per cell, matching on the English label of each nesting level
+            parser_string = '$' + ''.join(f'.nodes[?(@.labels.en == "{elem}")]' for elem in jsonpath_elems)
+            node_match = jsonpath_ng.ext.parse(parser_string).find(output_as_dict)
+            self.assertEqual(
+                len(node_match),
+                1,
+                f'The node "{jsonpath_elems[-1]}" from Excel row {index+1} was not correctly translated to the output '
+                f'JSON file.'
+            )
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/testdata/lists/Beschreibung_de.xlsx b/testdata/lists/Beschreibung_de.xlsx
diff --git a/testdata/lists/description_en.xlsx b/testdata/lists/description_en.xlsx