From 8a8c9d02fe3a4c47523bb70bbaa754ece5c007cd Mon Sep 17 00:00:00 2001 From: Johannes Nussbaum <39048939+jnussbaum@users.noreply.github.com> Date: Thu, 17 Feb 2022 18:41:51 +0100 Subject: [PATCH] refactor: excel to json list (DEV-431) (#155) * Don't open the file each time in the recursive function get_values_from_excel() * instead of side effects, all methods should return the result they computed * - make code more understandable by improving the logic - improve docstrings and annotations * improve docstrings and annotations * - various style/annotation improvements - remove side-effect of get_values_from_excel by making a copy of parentnode before returning it * - correct mistake in logic - improve error message output * - reconstruct the side-effect from before * strip strings * adapt 'expand_all_lists.py' to the refactorings * elminate little bug * introduce the side effect again * rename variable Co-authored-by: irinaschubert * correct previous error * improvements suggested by reviewer Co-authored-by: irinaschubert * implement reviewer's feedback Co-authored-by: irinaschubert --- knora/dsplib/utils/excel_to_json_lists.py | 262 ++++++++++++---------- knora/dsplib/utils/expand_all_lists.py | 27 +-- 2 files changed, 160 insertions(+), 129 deletions(-) diff --git a/knora/dsplib/utils/excel_to_json_lists.py b/knora/dsplib/utils/excel_to_json_lists.py index 3f961b490..3f67f6f96 100644 --- a/knora/dsplib/utils/excel_to_json_lists.py +++ b/knora/dsplib/utils/excel_to_json_lists.py @@ -1,103 +1,117 @@ -"""This module handles all the operations which are used for the creation of JSON lists form Excel files.""" +"""This module handles all the operations which are used for the creation of JSON lists from Excel files.""" import csv import glob import json import os import re import unicodedata +from typing import Any, Union, Optional import jsonschema -from jsonschema import validate from openpyxl import load_workbook - -list_of_lists = [] -cell_names = [] - - -def get_values_from_excel(excelfiles: list[str], base_file: str, parentnode: {}, row: int, col: int, - preval: list[str]) -> int: +from openpyxl.cell import Cell +from openpyxl.worksheet.worksheet import Worksheet + +# Global variables used to ensure that there are no duplicate node names +list_of_lists_of_previous_cell_values: list[list[str]] = [] +list_of_previous_node_names: list[str] = [] + + +def get_values_from_excel( + excelfiles: dict[str, Worksheet], + base_file: dict[str, Worksheet], + parentnode: dict[str, Any], + row: int, + col: int, + preval: list[str] +) -> tuple[int, dict[str, Any]]: """ - This function calls itself recursively to go through the Excel files. It extracts the cell values and creates the JSON list - file. + This function calls itself recursively to go through the Excel files. It extracts the cell values and creates + the JSON list file. Args: - base_file: File name of the base file excelfiles: List of Excel files with the values in different languages - parentnode: Name(s) of the parent node(s) of the actual node - row: The index of the actual row of the Excel sheet - col: The index of the actual column of the Excel sheet + base_file: File name of the base file + parentnode: Name(s) of the parent node(s) of the current node + row: The index of the current row of the Excel sheet + col: The index of the current column of the Excel sheet preval: List of previous values, needed to check the consistency of the list hierarchy Returns: - int: Row index for the next loop (actual row index minus 1) + int: Row index for the next loop (current row index minus 1) + dict: The JSON list up to the current recursion. At the last recursion, this is the final JSON list. """ - nodes = [] - currentnode = {} - wb = load_workbook(filename=base_file, read_only=True) - worksheet = wb.worksheets[0] - cell = worksheet.cell(column=col, row=row) + nodes: list[dict[str, Any]] = [] + currentnode: dict[str, Any] = dict() + base_file_ws: Worksheet = list(base_file.values())[0] + cell: Cell = base_file_ws.cell(column=col, row=row) + + for excelfile in excelfiles.values(): + if any((not excelfile['A1'].value, excelfile['B1'].value)): + print(f'Inconsistency in Excel list: The first row must consist of exactly one value, in cell A1. ' + f'All other cells of row 1 must be empty.\nInstead, found the following:\n' + f'Cell A1: "{excelfile["A1"].value}"\n' + f'Cell B1: "{excelfile["B1"].value}"') + quit() if col > 1: - # append the cell value of the parent node (which is one value to the left of the actual cell) to the list of + # append the cell value of the parent node (which is one value to the left of the current cell) to the list of # previous values - preval.append(worksheet.cell(column=col - 1, row=row).value) + preval.append(base_file_ws.cell(column=col-1, row=row).value.strip()) while cell.value: # check if all predecessors in row (values to the left) are consistent with the values in preval list for idx, val in enumerate(preval[:-1]): - if val != worksheet.cell(column=idx + 1, row=row).value: - print( - f'Inconsistency in Excel list: {val} not equal to {worksheet.cell(column=idx + 1, row=row).value}') + if val != base_file_ws.cell(column=idx+1, row=row).value.strip(): + print(f'Inconsistency in Excel list: {val} not equal to ' + f'{base_file_ws.cell(column=idx+1, row=row).value.strip()}') quit() # loop through the row until the last (furthest right) value is found - if worksheet.cell(column=col + 1, row=row).value: - row = get_values_from_excel(excelfiles=excelfiles, base_file=base_file, parentnode=currentnode, col=col + 1, - row=row, - preval=preval) + if base_file_ws.cell(column=col+1, row=row).value: + row, _ = get_values_from_excel( + excelfiles=excelfiles, + base_file=base_file, + parentnode=currentnode, + col=col+1, + row=row, + preval=preval + ) # if value was last in row (no further values to the right), it's a node, continue here else: # check if there are duplicate nodes (i.e. identical rows), quit the program if so new_check_list = preval.copy() - new_check_list.append(cell.value) - - list_of_lists.append(new_check_list) + new_check_list.append(cell.value.strip()) + list_of_lists_of_previous_cell_values.append(new_check_list) - if check_list_for_duplicates(list_of_lists): - print('There is at least one duplicate node in the list. Found duplicate:', cell.value) + if contains_duplicates(list_of_lists_of_previous_cell_values): + print('There is at least one duplicate node in the list. Found duplicate: ', cell.value.strip()) quit() # create a simplified version of the cell value and use it as name of the node - cellname = simplify_name(cell.value) - cell_names.append(cellname) + nodename = simplify_name(cell.value.strip()) + list_of_previous_node_names.append(nodename) # append a number (p.ex. node-name-2) if there are list nodes with identical names - if check_list_for_duplicates(cell_names): - n = cell_names.count(cellname) + if contains_duplicates(list_of_previous_node_names): + n = list_of_previous_node_names.count(nodename) if n > 1: - cellname = cellname + '-' + str(n) - - labels_dict = {} + nodename = nodename + '-' + str(n) # read label values from the other Excel files (other languages) - for filename_other_lang in excelfiles: - wb_other_lang = load_workbook(filename=filename_other_lang, read_only=True) - ws_other_lang = wb_other_lang.worksheets[0] - - lang = os.path.splitext(filename_other_lang)[0].split('_')[-1] - labels_dict[lang] = ws_other_lang.cell(column=col, row=row).value + labels_dict: dict[str, str] = {} + for other_lang, ws_other_lang in excelfiles.items(): + labels_dict[other_lang] = ws_other_lang.cell(column=col, row=row).value.strip() # create current node from extracted cell values and append it to the nodes list - currentnode = {'name': cellname, 'labels': labels_dict} - + currentnode = {'name': nodename, 'labels': labels_dict} nodes.append(currentnode) - - print(f'Added list node: {cell.value} ({cellname})') + print(f'Added list node: {cell.value.strip()} ({nodename})') # go one row down and repeat loop if there is a value row += 1 - cell = worksheet.cell(column=col, row=row) + cell = base_file_ws.cell(column=col, row=row) if col > 1: preval.pop() @@ -105,41 +119,58 @@ def get_values_from_excel(excelfiles: list[str], base_file: str, parentnode: {}, # add the new nodes to the parentnode parentnode['nodes'] = nodes - return row - 1 + return row - 1, parentnode -def make_json_list_from_excel(rootnode: {}, excelfiles: list[str]) -> None: +def make_json_list_from_excel(rootnode: dict[str, Any], excelfile_names: list[str]) -> dict[str, Any]: """ - Reads Excel files and makes a JSON list file from them. The JSON can then be used in an ontology that is uploaded to the - DaSCH Service Platform. + Reads Excel files and makes a JSON list file from them. The JSON can then be used in an ontology that + is uploaded to the DaSCH Service Platform. Args: rootnode: The root node of the JSON list - excelfiles: A list with all the Excel files to be processed + excelfile_names: A list with all the Excel files to be processed Returns: - None - + The finished list as a dict """ # Define starting point in Excel file startrow = 1 startcol = 1 - # Check if English file is available and take it as base file, take last one from list of Excel files if English - # is not available. The node names are later derived from the labels of the base file. - base_file = '' - - for filename in excelfiles: - base_file = filename + # Check if English file is available and take it as base file. Take last one from list of Excel files if English + # is not available. The node names are later derived from the labels of the base file. + base_file: dict[str, Worksheet] = dict() + for filename in excelfile_names: if '_en.xlsx' in os.path.basename(filename): - base_file = filename - break - - get_values_from_excel(excelfiles=excelfiles, base_file=base_file, parentnode=rootnode, row=startrow, col=startcol, - preval=[]) - - -def check_list_for_duplicates(list_to_check: list) -> bool: + lang = 'en' + ws = load_workbook(filename, read_only=True).worksheets[0] + base_file = {lang: ws} + if len(base_file) == 0: + file = excelfile_names[-1] + lang = os.path.splitext(file)[0].split('_')[-1] + ws = load_workbook(file, read_only=True).worksheets[0] + base_file = {lang: ws} + + excelfiles: dict[str, Worksheet] = {} + for f in excelfile_names: + lang = os.path.splitext(f)[0].split('_')[-1] + ws = load_workbook(f, read_only=True).worksheets[0] + excelfiles[lang] = ws + + _, finished_list = get_values_from_excel( + excelfiles=excelfiles, + base_file=base_file, + parentnode=rootnode, + row=startrow, + col=startcol, + preval=[] + ) + + return finished_list + + +def contains_duplicates(list_to_check: list[Any]) -> bool: """ Checks if the given list contains any duplicate items. @@ -148,7 +179,6 @@ def check_list_for_duplicates(list_to_check: list) -> bool: Returns: True if there is a duplicate, false otherwise - """ has_duplicates = False @@ -168,7 +198,6 @@ def simplify_name(value: str) -> str: Returns: str: The simplified value - """ simplified_value = str(value).lower() @@ -194,7 +223,6 @@ def check_language_code(lang_code: str) -> bool: Returns: True if valid, False if not - """ current_dir = os.path.dirname(os.path.realpath(__file__)) with open(os.path.join(current_dir, 'language-codes-3b2_csv.csv'), 'r') as language_codes_file: @@ -205,7 +233,11 @@ def check_language_code(lang_code: str) -> bool: return False -def make_root_node_from_args(excelfiles: list[str], listname_from_args: str, comments: dict[str, str]) -> dict: +def make_root_node_from_args( + excelfiles: list[str], + listname_from_args: Optional[str], + comments: dict[str, str] +) -> dict[str, Any]: """ Creates the root node for the JSON list @@ -216,36 +248,34 @@ def make_root_node_from_args(excelfiles: list[str], listname_from_args: str, com Returns: dict: The root node of the list as dictionary (JSON) - """ - rootnode_labels_dict = {} - listname = listname_from_args - listname_en = '' + listname_from_lang_code = {} + listname_en: str = '' + lang_specific_listname: str = '' for filename in excelfiles: basename = os.path.basename(filename) - label, lang_code = os.path.splitext(basename)[0].rsplit('_', 1) + lang_specific_listname, lang_code = os.path.splitext(basename)[0].rsplit('_', 1) - # check if language code is valid if not check_language_code(lang_code): - print('Invalid language code is used. Only language codes from ISO 639-1 and ISO 639-2 are accepted.') + print(f'Invalid language code "{lang_code}" is used. Only language codes from ISO 639-1 ', + f'and ISO 639-2 are accepted.') quit() - rootnode_labels_dict[lang_code] = label - listname = label + listname_from_lang_code[lang_code] = lang_specific_listname if '_en.xlsx' in filename: - listname_en = label - - # if an english list is available use its label as listname - if listname_en: - listname = listname_en + listname_en = lang_specific_listname - # if the user provided a listname use it + # the listname is taken from the following sources, with descending priority if listname_from_args: listname = listname_from_args + elif listname_en: + listname = listname_en + else: + listname = lang_specific_listname - rootnode = {'name': listname, 'labels': rootnode_labels_dict, 'comments': comments} + rootnode = {'name': listname, 'labels': listname_from_lang_code, 'comments': comments} return rootnode @@ -259,14 +289,13 @@ def validate_list_with_schema(json_list: str) -> bool: Returns: True if the list passed validation, False otherwise - """ current_dir = os.path.dirname(os.path.realpath(__file__)) with open(os.path.join(current_dir, '../schemas/lists-only.json')) as schema: list_schema = json.load(schema) try: - validate(instance=json_list, schema=list_schema) + jsonschema.validate(instance=json_list, schema=list_schema) except jsonschema.exceptions.ValidationError as err: print(err) return False @@ -274,9 +303,13 @@ def validate_list_with_schema(json_list: str) -> bool: return True -def prepare_list_creation(excelfolder: str, listname: str, comments: dict): +def prepare_list_creation( + excelfolder: str, + listname: Optional[str], + comments: dict[str, Any] +) -> tuple[dict[str, Any], list[str]]: """ - Gets the excelfolder parameter and checks the validity of the files. It then makes the root node for the list. + Creates the list from Excel files that can be used to build a JSON list. Then, creates the root node for the JSON list. Args: excelfolder: path to the folder containing the Excel file(s) @@ -284,32 +317,30 @@ def prepare_list_creation(excelfolder: str, listname: str, comments: dict): comments: comments for the list to be created Returns: - rootnode (dict): The rootnode of the list as a dictionary - excel_files (list[str]): list of the Excel files to process + rootnode: The rootnode of the list as a dictionary + excel_files: list of the Excel files to process """ # reset the global variables before list creation starts - global cell_names - global list_of_lists + global list_of_previous_node_names + global list_of_lists_of_previous_cell_values - list_of_lists = [] - cell_names = [] + list_of_previous_node_names = [] + list_of_lists_of_previous_cell_values = [] # check if the given folder parameter is actually a folder if not os.path.isdir(excelfolder): - print(excelfolder, 'is not a directory.') + print(excelfolder, ' is not a directory.') exit(1) # create a list with all excel files from the path provided by the user - excel_files = [filename for filename in glob.iglob(f'{excelfolder}/*.xlsx') if - not os.path.basename(filename).startswith('~$')] + excel_files = [filename for filename in glob.iglob(f'{excelfolder}/*.xlsx') + if not os.path.basename(filename).startswith('~$') + and os.path.isfile(filename)] - # check if all excel_files are actually files + # print the files that can be used print('Found the following files:') for file in excel_files: print(file) - if not os.path.isfile(file): - print(file, 'is not a valid file.') - exit(1) # create root node of list rootnode = make_root_node_from_args(excel_files, listname, comments) @@ -317,7 +348,7 @@ def prepare_list_creation(excelfolder: str, listname: str, comments: dict): return rootnode, excel_files -def list_excel2json(listname: str, excelfolder: str, outfile: str): +def list_excel2json(listname: Union[str, None], excelfolder: str, outfile: str) -> None: """ Takes the arguments from the command line, checks folder and files and starts the process of list creation. @@ -333,13 +364,12 @@ def list_excel2json(listname: str, excelfolder: str, outfile: str): rootnode, excel_files = prepare_list_creation(excelfolder, listname, comments={}) # create the list from the Excel files - make_json_list_from_excel(rootnode, excel_files) + finished_list = make_json_list_from_excel(rootnode, excel_files) # validate created list with schema - if validate_list_with_schema(json.loads(json.dumps(rootnode, indent=4))): - # write final list to JSON file if list passed validation + if validate_list_with_schema(json.loads(json.dumps(finished_list, indent=4))): with open(outfile, 'w', encoding='utf-8') as fp: - json.dump(rootnode, fp, indent=4, sort_keys=False, ensure_ascii=False) + json.dump(finished_list, fp, indent=4, sort_keys=False, ensure_ascii=False) print('List was created successfully and written to file:', outfile) else: print('List is not valid according to schema.') diff --git a/knora/dsplib/utils/expand_all_lists.py b/knora/dsplib/utils/expand_all_lists.py index 3e9009e1b..d3fa78d67 100644 --- a/knora/dsplib/utils/expand_all_lists.py +++ b/knora/dsplib/utils/expand_all_lists.py @@ -1,9 +1,11 @@ -from typing import Any +from typing import Any, Union, Optional, cast from knora.dsplib.utils.excel_to_json_lists import make_json_list_from_excel, prepare_list_creation -def expand_lists_from_excel(data_model: dict[str, Any]) -> list[str]: +def expand_lists_from_excel( + data_model: dict[str, dict[str, list[dict[str, Union[str, dict[str, Any]]]]]] +) -> list[dict[str, Any]]: """ Gets all list definitions from a data model and expands them to JSON if they are only referenced via an Excel file @@ -13,25 +15,24 @@ def expand_lists_from_excel(data_model: dict[str, Any]) -> list[str]: Returns: A list of all expanded lists. It can be added to the root node of an ontology as lists section. """ - # create and add lists from Excel references to the ontology - lists = data_model["project"].get("lists") - if not lists: + if 'project' not in data_model or 'lists' not in data_model['project']: return [] + lists = data_model['project']['lists'] new_lists = [] + for rootnode in lists: - nodes = rootnode["nodes"] + nodes = rootnode.get('nodes') + listname = cast(Optional[str], rootnode.get('name')) + comments = cast(dict[str, Any], rootnode.get('comments')) # check if the folder parameter is used - if rootnode.get("nodes") and isinstance(nodes, dict) and nodes.get("folder"): + if isinstance(nodes, dict) and 'folder' in nodes and comments: # get the Excel files from the folder and create the rootnode of the list - excel_folder = nodes["folder"] - rootnode, excel_files = prepare_list_creation(excel_folder, rootnode.get("name"), rootnode.get("comments")) - + prepared_rootnode, excel_files = prepare_list_creation(nodes['folder'], listname, comments) # create the list from the Excel files - make_json_list_from_excel(rootnode, excel_files) - - new_lists.append(rootnode) + finished_list = make_json_list_from_excel(prepared_rootnode, excel_files) + new_lists.append(finished_list) else: new_lists.append(rootnode)