From 8a8c9d02fe3a4c47523bb70bbaa754ece5c007cd Mon Sep 17 00:00:00 2001
From: Johannes Nussbaum <39048939+jnussbaum@users.noreply.github.com>
Date: Thu, 17 Feb 2022 18:41:51 +0100
Subject: [PATCH] refactor: excel to json list (DEV-431) (#155)

* Don't open the file each time in the recursive function get_values_from_excel()

* instead of side effects, all methods should return the result they computed

* - make code more understandable by improving the logic
 - improve docstrings and annotations

* improve docstrings and annotations

* - various style/annotation improvements
 - remove side-effect of get_values_from_excel by making a copy of parentnode before returning it

* - correct mistake in logic
 - improve error message output

* - reconstruct the side-effect from before

* strip strings

* adapt 'expand_all_lists.py' to the refactorings

* elminate little bug

* introduce the side effect again

* rename variable

Co-authored-by: irinaschubert <irina.schubert@dasch.swiss>

* correct previous error

* improvements suggested by reviewer

Co-authored-by: irinaschubert <irina.schubert@dasch.swiss>

* implement reviewer's feedback

Co-authored-by: irinaschubert <irina.schubert@dasch.swiss>
---
 knora/dsplib/utils/excel_to_json_lists.py | 262 ++++++++++++----------
 knora/dsplib/utils/expand_all_lists.py    |  27 +--
 2 files changed, 160 insertions(+), 129 deletions(-)

diff --git a/knora/dsplib/utils/excel_to_json_lists.py b/knora/dsplib/utils/excel_to_json_lists.py
index 3f961b490..3f67f6f96 100644
--- a/knora/dsplib/utils/excel_to_json_lists.py
+++ b/knora/dsplib/utils/excel_to_json_lists.py
@@ -1,103 +1,117 @@
-"""This module handles all the operations which are used for the creation of JSON lists form Excel files."""
+"""This module handles all the operations which are used for the creation of JSON lists from Excel files."""
 import csv
 import glob
 import json
 import os
 import re
 import unicodedata
+from typing import Any, Union, Optional
 
 import jsonschema
-from jsonschema import validate
 from openpyxl import load_workbook
-
-list_of_lists = []
-cell_names = []
-
-
-def get_values_from_excel(excelfiles: list[str], base_file: str, parentnode: {}, row: int, col: int,
-                          preval: list[str]) -> int:
+from openpyxl.cell import Cell
+from openpyxl.worksheet.worksheet import Worksheet
+
+# Global variables used to ensure that there are no duplicate node names
+list_of_lists_of_previous_cell_values: list[list[str]] = []
+list_of_previous_node_names: list[str] = []
+
+
+def get_values_from_excel(
+    excelfiles: dict[str, Worksheet],
+    base_file: dict[str, Worksheet],
+    parentnode: dict[str, Any],
+    row: int,
+    col: int,
+    preval: list[str]
+) -> tuple[int, dict[str, Any]]:
     """
-    This function calls itself recursively to go through the Excel files. It extracts the cell values and creates the JSON list
-    file.
+    This function calls itself recursively to go through the Excel files. It extracts the cell values and creates
+    the JSON list file.
 
     Args:
-        base_file: File name of the base file
         excelfiles: List of Excel files with the values in different languages
-        parentnode: Name(s) of the parent node(s) of the actual node
-        row: The index of the actual row of the Excel sheet
-        col: The index of the actual column of the Excel sheet
+        base_file: File name of the base file
+        parentnode: Name(s) of the parent node(s) of the current node
+        row: The index of the current row of the Excel sheet
+        col: The index of the current column of the Excel sheet
         preval: List of previous values, needed to check the consistency of the list hierarchy
 
     Returns:
-        int: Row index for the next loop (actual row index minus 1)
+        int: Row index for the next loop (current row index minus 1)
+        dict: The JSON list up to the current recursion. At the last recursion, this is the final JSON list.
     """
-    nodes = []
-    currentnode = {}
-    wb = load_workbook(filename=base_file, read_only=True)
-    worksheet = wb.worksheets[0]
-    cell = worksheet.cell(column=col, row=row)
+    nodes: list[dict[str, Any]] = []
+    currentnode: dict[str, Any] = dict()
+    base_file_ws: Worksheet = list(base_file.values())[0]
+    cell: Cell = base_file_ws.cell(column=col, row=row)
+
+    for excelfile in excelfiles.values():
+        if any((not excelfile['A1'].value, excelfile['B1'].value)):
+            print(f'Inconsistency in Excel list: The first row must consist of exactly one value, in cell A1. '
+                  f'All other cells of row 1 must be empty.\nInstead, found the following:\n'
+                  f'Cell A1: "{excelfile["A1"].value}"\n'
+                  f'Cell B1: "{excelfile["B1"].value}"')
+            quit()
 
     if col > 1:
-        # append the cell value of the parent node (which is one value to the left of the actual cell) to the list of
+        # append the cell value of the parent node (which is one value to the left of the current cell) to the list of
         # previous values
-        preval.append(worksheet.cell(column=col - 1, row=row).value)
+        preval.append(base_file_ws.cell(column=col-1, row=row).value.strip())
 
     while cell.value:
         # check if all predecessors in row (values to the left) are consistent with the values in preval list
         for idx, val in enumerate(preval[:-1]):
-            if val != worksheet.cell(column=idx + 1, row=row).value:
-                print(
-                    f'Inconsistency in Excel list: {val} not equal to {worksheet.cell(column=idx + 1, row=row).value}')
+            if val != base_file_ws.cell(column=idx+1, row=row).value.strip():
+                print(f'Inconsistency in Excel list: {val} not equal to '
+                      f'{base_file_ws.cell(column=idx+1, row=row).value.strip()}')
                 quit()
 
         # loop through the row until the last (furthest right) value is found
-        if worksheet.cell(column=col + 1, row=row).value:
-            row = get_values_from_excel(excelfiles=excelfiles, base_file=base_file, parentnode=currentnode, col=col + 1,
-                                        row=row,
-                                        preval=preval)
+        if base_file_ws.cell(column=col+1, row=row).value:
+            row, _ = get_values_from_excel(
+                excelfiles=excelfiles,
+                base_file=base_file,
+                parentnode=currentnode,
+                col=col+1,
+                row=row,
+                preval=preval
+            )
 
         # if value was last in row (no further values to the right), it's a node, continue here
         else:
             # check if there are duplicate nodes (i.e. identical rows), quit the program if so
             new_check_list = preval.copy()
-            new_check_list.append(cell.value)
-
-            list_of_lists.append(new_check_list)
+            new_check_list.append(cell.value.strip())
+            list_of_lists_of_previous_cell_values.append(new_check_list)
 
-            if check_list_for_duplicates(list_of_lists):
-                print('There is at least one duplicate node in the list. Found duplicate:', cell.value)
+            if contains_duplicates(list_of_lists_of_previous_cell_values):
+                print('There is at least one duplicate node in the list. Found duplicate: ', cell.value.strip())
                 quit()
 
             # create a simplified version of the cell value and use it as name of the node
-            cellname = simplify_name(cell.value)
-            cell_names.append(cellname)
+            nodename = simplify_name(cell.value.strip())
+            list_of_previous_node_names.append(nodename)
 
             # append a number (p.ex. node-name-2) if there are list nodes with identical names
-            if check_list_for_duplicates(cell_names):
-                n = cell_names.count(cellname)
+            if contains_duplicates(list_of_previous_node_names):
+                n = list_of_previous_node_names.count(nodename)
                 if n > 1:
-                    cellname = cellname + '-' + str(n)
-
-            labels_dict = {}
+                    nodename = nodename + '-' + str(n)
 
             # read label values from the other Excel files (other languages)
-            for filename_other_lang in excelfiles:
-                wb_other_lang = load_workbook(filename=filename_other_lang, read_only=True)
-                ws_other_lang = wb_other_lang.worksheets[0]
-
-                lang = os.path.splitext(filename_other_lang)[0].split('_')[-1]
-                labels_dict[lang] = ws_other_lang.cell(column=col, row=row).value
+            labels_dict: dict[str, str] = {}
+            for other_lang, ws_other_lang in excelfiles.items():
+                labels_dict[other_lang] = ws_other_lang.cell(column=col, row=row).value.strip()
 
             # create current node from extracted cell values and append it to the nodes list
-            currentnode = {'name': cellname, 'labels': labels_dict}
-
+            currentnode = {'name': nodename, 'labels': labels_dict}
             nodes.append(currentnode)
-
-            print(f'Added list node: {cell.value} ({cellname})')
+            print(f'Added list node: {cell.value.strip()} ({nodename})')
 
         # go one row down and repeat loop if there is a value
         row += 1
-        cell = worksheet.cell(column=col, row=row)
+        cell = base_file_ws.cell(column=col, row=row)
 
     if col > 1:
         preval.pop()
@@ -105,41 +119,58 @@ def get_values_from_excel(excelfiles: list[str], base_file: str, parentnode: {},
     # add the new nodes to the parentnode
     parentnode['nodes'] = nodes
 
-    return row - 1
+    return row - 1, parentnode
 
 
-def make_json_list_from_excel(rootnode: {}, excelfiles: list[str]) -> None:
+def make_json_list_from_excel(rootnode: dict[str, Any], excelfile_names: list[str]) -> dict[str, Any]:
     """
-    Reads Excel files and makes a JSON list file from them. The JSON can then be used in an ontology that is uploaded to the
-    DaSCH Service Platform.
+    Reads Excel files and makes a JSON list file from them. The JSON can then be used in an ontology that
+    is uploaded to the DaSCH Service Platform.
 
     Args:
         rootnode: The root node of the JSON list
-        excelfiles: A list with all the Excel files to be processed
+        excelfile_names: A list with all the Excel files to be processed
 
     Returns:
-        None
-
+        The finished list as a dict
     """
     # Define starting point in Excel file
     startrow = 1
     startcol = 1
 
-    # Check if English file is available and take it as base file, take last one from list of Excel files if English
-    # is not available. The node names are later  derived from the labels of the base file.
-    base_file = ''
-
-    for filename in excelfiles:
-        base_file = filename
+    # Check if English file is available and take it as base file. Take last one from list of Excel files if English
+    # is not available. The node names are later derived from the labels of the base file.
+    base_file: dict[str, Worksheet] = dict()
+    for filename in excelfile_names:
         if '_en.xlsx' in os.path.basename(filename):
-            base_file = filename
-            break
-
-    get_values_from_excel(excelfiles=excelfiles, base_file=base_file, parentnode=rootnode, row=startrow, col=startcol,
-                          preval=[])
-
-
-def check_list_for_duplicates(list_to_check: list) -> bool:
+            lang = 'en'
+            ws = load_workbook(filename, read_only=True).worksheets[0]
+            base_file = {lang: ws}
+    if len(base_file) == 0:
+        file = excelfile_names[-1]
+        lang = os.path.splitext(file)[0].split('_')[-1]
+        ws = load_workbook(file, read_only=True).worksheets[0]
+        base_file = {lang: ws}
+
+    excelfiles: dict[str, Worksheet] = {}
+    for f in excelfile_names:
+        lang = os.path.splitext(f)[0].split('_')[-1]
+        ws = load_workbook(f, read_only=True).worksheets[0]
+        excelfiles[lang] = ws
+
+    _, finished_list = get_values_from_excel(
+        excelfiles=excelfiles,
+        base_file=base_file,
+        parentnode=rootnode,
+        row=startrow,
+        col=startcol,
+        preval=[]
+    )
+
+    return finished_list
+
+
+def contains_duplicates(list_to_check: list[Any]) -> bool:
     """
     Checks if the given list contains any duplicate items.
 
@@ -148,7 +179,6 @@ def check_list_for_duplicates(list_to_check: list) -> bool:
 
     Returns:
         True if there is a duplicate, false otherwise
-
     """
     has_duplicates = False
 
@@ -168,7 +198,6 @@ def simplify_name(value: str) -> str:
 
     Returns:
         str: The simplified value
-
     """
     simplified_value = str(value).lower()
 
@@ -194,7 +223,6 @@ def check_language_code(lang_code: str) -> bool:
 
     Returns:
         True if valid, False if not
-
     """
     current_dir = os.path.dirname(os.path.realpath(__file__))
     with open(os.path.join(current_dir, 'language-codes-3b2_csv.csv'), 'r') as language_codes_file:
@@ -205,7 +233,11 @@ def check_language_code(lang_code: str) -> bool:
     return False
 
 
-def make_root_node_from_args(excelfiles: list[str], listname_from_args: str, comments: dict[str, str]) -> dict:
+def make_root_node_from_args(
+    excelfiles: list[str],
+    listname_from_args: Optional[str],
+    comments: dict[str, str]
+) -> dict[str, Any]:
     """
     Creates the root node for the JSON list
 
@@ -216,36 +248,34 @@ def make_root_node_from_args(excelfiles: list[str], listname_from_args: str, com
 
     Returns:
         dict: The root node of the list as dictionary (JSON)
-
     """
-    rootnode_labels_dict = {}
-    listname = listname_from_args
-    listname_en = ''
+    listname_from_lang_code = {}
+    listname_en: str = ''
+    lang_specific_listname: str = ''
 
     for filename in excelfiles:
         basename = os.path.basename(filename)
-        label, lang_code = os.path.splitext(basename)[0].rsplit('_', 1)
+        lang_specific_listname, lang_code = os.path.splitext(basename)[0].rsplit('_', 1)
 
-        # check if language code is valid
         if not check_language_code(lang_code):
-            print('Invalid language code is used. Only language codes from ISO 639-1 and ISO 639-2 are accepted.')
+            print(f'Invalid language code "{lang_code}" is used. Only language codes from ISO 639-1 ',
+                  f'and ISO 639-2 are accepted.')
             quit()
 
-        rootnode_labels_dict[lang_code] = label
-        listname = label
+        listname_from_lang_code[lang_code] = lang_specific_listname
 
         if '_en.xlsx' in filename:
-            listname_en = label
-
-    # if an english list is available use its label as listname
-    if listname_en:
-        listname = listname_en
+            listname_en = lang_specific_listname
 
-    # if the user provided a listname use it
+    # the listname is taken from the following sources, with descending priority
     if listname_from_args:
         listname = listname_from_args
+    elif listname_en:
+        listname = listname_en
+    else:
+        listname = lang_specific_listname
 
-    rootnode = {'name': listname, 'labels': rootnode_labels_dict, 'comments': comments}
+    rootnode = {'name': listname, 'labels': listname_from_lang_code, 'comments': comments}
 
     return rootnode
 
@@ -259,14 +289,13 @@ def validate_list_with_schema(json_list: str) -> bool:
 
     Returns:
         True if the list passed validation, False otherwise
-
     """
     current_dir = os.path.dirname(os.path.realpath(__file__))
     with open(os.path.join(current_dir, '../schemas/lists-only.json')) as schema:
         list_schema = json.load(schema)
 
     try:
-        validate(instance=json_list, schema=list_schema)
+        jsonschema.validate(instance=json_list, schema=list_schema)
     except jsonschema.exceptions.ValidationError as err:
         print(err)
         return False
@@ -274,9 +303,13 @@ def validate_list_with_schema(json_list: str) -> bool:
     return True
 
 
-def prepare_list_creation(excelfolder: str, listname: str, comments: dict):
+def prepare_list_creation(
+    excelfolder: str,
+    listname: Optional[str],
+    comments: dict[str, Any]
+) -> tuple[dict[str, Any], list[str]]:
     """
-    Gets the excelfolder parameter and checks the validity of the files. It then makes the root node for the list.
+    Creates the list from Excel files that can be used to build a JSON list. Then, creates the root node for the JSON list.
 
     Args:
         excelfolder: path to the folder containing the Excel file(s)
@@ -284,32 +317,30 @@ def prepare_list_creation(excelfolder: str, listname: str, comments: dict):
         comments: comments for the list to be created
 
     Returns:
-        rootnode (dict): The rootnode of the list as a dictionary
-        excel_files (list[str]): list of the Excel files to process
+        rootnode: The rootnode of the list as a dictionary
+        excel_files: list of the Excel files to process
     """
     # reset the global variables before list creation starts
-    global cell_names
-    global list_of_lists
+    global list_of_previous_node_names
+    global list_of_lists_of_previous_cell_values
 
-    list_of_lists = []
-    cell_names = []
+    list_of_previous_node_names = []
+    list_of_lists_of_previous_cell_values = []
 
     # check if the given folder parameter is actually a folder
     if not os.path.isdir(excelfolder):
-        print(excelfolder, 'is not a directory.')
+        print(excelfolder, ' is not a directory.')
         exit(1)
 
     # create a list with all excel files from the path provided by the user
-    excel_files = [filename for filename in glob.iglob(f'{excelfolder}/*.xlsx') if
-                   not os.path.basename(filename).startswith('~$')]
+    excel_files = [filename for filename in glob.iglob(f'{excelfolder}/*.xlsx')
+                   if not os.path.basename(filename).startswith('~$')
+                   and os.path.isfile(filename)]
 
-    # check if all excel_files are actually files
+    # print the files that can be used
     print('Found the following files:')
     for file in excel_files:
         print(file)
-        if not os.path.isfile(file):
-            print(file, 'is not a valid file.')
-            exit(1)
 
     # create root node of list
     rootnode = make_root_node_from_args(excel_files, listname, comments)
@@ -317,7 +348,7 @@ def prepare_list_creation(excelfolder: str, listname: str, comments: dict):
     return rootnode, excel_files
 
 
-def list_excel2json(listname: str, excelfolder: str, outfile: str):
+def list_excel2json(listname: Union[str, None], excelfolder: str, outfile: str) -> None:
     """
     Takes the arguments from the command line, checks folder and files and starts the process of list creation.
 
@@ -333,13 +364,12 @@ def list_excel2json(listname: str, excelfolder: str, outfile: str):
     rootnode, excel_files = prepare_list_creation(excelfolder, listname, comments={})
 
     # create the list from the Excel files
-    make_json_list_from_excel(rootnode, excel_files)
+    finished_list = make_json_list_from_excel(rootnode, excel_files)
 
     # validate created list with schema
-    if validate_list_with_schema(json.loads(json.dumps(rootnode, indent=4))):
-        # write final list to JSON file if list passed validation
+    if validate_list_with_schema(json.loads(json.dumps(finished_list, indent=4))):
         with open(outfile, 'w', encoding='utf-8') as fp:
-            json.dump(rootnode, fp, indent=4, sort_keys=False, ensure_ascii=False)
+            json.dump(finished_list, fp, indent=4, sort_keys=False, ensure_ascii=False)
             print('List was created successfully and written to file:', outfile)
     else:
         print('List is not valid according to schema.')
diff --git a/knora/dsplib/utils/expand_all_lists.py b/knora/dsplib/utils/expand_all_lists.py
index 3e9009e1b..d3fa78d67 100644
--- a/knora/dsplib/utils/expand_all_lists.py
+++ b/knora/dsplib/utils/expand_all_lists.py
@@ -1,9 +1,11 @@
-from typing import Any
+from typing import Any, Union, Optional, cast
 
 from knora.dsplib.utils.excel_to_json_lists import make_json_list_from_excel, prepare_list_creation
 
 
-def expand_lists_from_excel(data_model: dict[str, Any]) -> list[str]:
+def expand_lists_from_excel(
+    data_model: dict[str, dict[str, list[dict[str, Union[str, dict[str, Any]]]]]]
+) -> list[dict[str, Any]]:
     """
     Gets all list definitions from a data model and expands them to JSON if they are only referenced via an Excel file
 
@@ -13,25 +15,24 @@ def expand_lists_from_excel(data_model: dict[str, Any]) -> list[str]:
     Returns:
         A list of all expanded lists. It can be added to the root node of an ontology as lists section.
     """
-    # create and add lists from Excel references to the ontology
-    lists = data_model["project"].get("lists")
 
-    if not lists:
+    if 'project' not in data_model or 'lists' not in data_model['project']:
         return []
 
+    lists = data_model['project']['lists']
     new_lists = []
+
     for rootnode in lists:
-        nodes = rootnode["nodes"]
+        nodes = rootnode.get('nodes')
+        listname = cast(Optional[str], rootnode.get('name'))
+        comments = cast(dict[str, Any], rootnode.get('comments'))
         # check if the folder parameter is used
-        if rootnode.get("nodes") and isinstance(nodes, dict) and nodes.get("folder"):
+        if isinstance(nodes, dict) and 'folder' in nodes and comments:
             # get the Excel files from the folder and create the rootnode of the list
-            excel_folder = nodes["folder"]
-            rootnode, excel_files = prepare_list_creation(excel_folder, rootnode.get("name"), rootnode.get("comments"))
-
+            prepared_rootnode, excel_files = prepare_list_creation(nodes['folder'], listname, comments)
             # create the list from the Excel files
-            make_json_list_from_excel(rootnode, excel_files)
-
-            new_lists.append(rootnode)
+            finished_list = make_json_list_from_excel(prepared_rootnode, excel_files)
+            new_lists.append(finished_list)
         else:
             new_lists.append(rootnode)