From d2c2e085813680007a5228c1ef5bab82b4f1a6c7 Mon Sep 17 00:00:00 2001 From: Johannes Nussbaum <39048939+jnussbaum@users.noreply.github.com> Date: Mon, 19 Sep 2022 17:22:52 +0200 Subject: [PATCH] chore: tidy up excel2lists, excel2resources, excel2properties (DEV-1352) (#229) --- docs/dsp-tools-excel2xml.md | 3 +- knora/dsp_tools.py | 20 +-- knora/dsplib/models/propertyelement.py | 54 +++++++++ knora/dsplib/schemas/properties-only.json | 13 +- knora/dsplib/utils/excel_to_json_lists.py | 56 ++++----- .../dsplib/utils/excel_to_json_properties.py | 61 +++++----- knora/dsplib/utils/excel_to_json_resources.py | 89 +++++--------- knora/dsplib/utils/onto_create_lists.py | 2 +- knora/dsplib/utils/onto_create_ontology.py | 2 +- .../utils/{shared_methods.py => shared.py} | 101 +++++++++++++++- knora/dsplib/utils/xml_upload.py | 6 +- knora/excel2xml.py | 77 +----------- test/e2e/test_tools.py | 16 +-- test/unittests/test_excel2xml.py | 12 -- test/unittests/test_excel_to_json_lists.py | 97 +++++++++++++-- ...es.py => test_excel_to_json_properties.py} | 30 ++--- ...rce.py => test_excel_to_json_resources.py} | 53 +++----- test/unittests/test_shared_methods.py | 44 +++++++ testdata/{single_list => list_single}/de.xlsx | Bin testdata/{single_list => list_single}/en.xlsx | Bin .../de.xlsx | Bin .../en.xlsx | Bin .../fr.xlsx | Bin .../de.xlsx | Bin .../en.xlsx | Bin .../fr.xlsx | Bin .../de.xlsx | Bin .../de.xlsx | Bin .../en.xlsx | Bin .../fr.xlsx | Bin .../lists_multilingual_output_expected.json | 102 ++++++++++++++++ testdata/lists_section_expanded.json | 114 ++++++++++++++++++ testdata/test-project-systematic.json | 2 +- 33 files changed, 640 insertions(+), 314 deletions(-) create mode 100644 knora/dsplib/models/propertyelement.py rename knora/dsplib/utils/{shared_methods.py => shared.py} (50%) rename test/unittests/{test_excel_to_properties.py => test_excel_to_json_properties.py} (87%) rename test/unittests/{test_excel_to_resource.py => test_excel_to_json_resources.py} (67%) 
create mode 100644 test/unittests/test_shared_methods.py rename testdata/{single_list => list_single}/de.xlsx (100%) rename testdata/{single_list => list_single}/en.xlsx (100%) rename testdata/{invalid_lists_1 => lists_invalid_1}/de.xlsx (100%) rename testdata/{invalid_lists_1 => lists_invalid_1}/en.xlsx (100%) rename testdata/{invalid_lists_1 => lists_invalid_1}/fr.xlsx (100%) rename testdata/{invalid_lists_2 => lists_invalid_2}/de.xlsx (100%) rename testdata/{invalid_lists_2 => lists_invalid_2}/en.xlsx (100%) rename testdata/{invalid_lists_2 => lists_invalid_2}/fr.xlsx (100%) rename testdata/{monolingual_lists => lists_monolingual}/de.xlsx (100%) rename testdata/{multilingual_lists => lists_multilingual}/de.xlsx (100%) rename testdata/{multilingual_lists => lists_multilingual}/en.xlsx (100%) rename testdata/{multilingual_lists => lists_multilingual}/fr.xlsx (100%) create mode 100644 testdata/lists_multilingual_output_expected.json create mode 100644 testdata/lists_section_expanded.json diff --git a/docs/dsp-tools-excel2xml.md b/docs/dsp-tools-excel2xml.md index 8f86e73db..5cc21d200 100644 --- a/docs/dsp-tools-excel2xml.md +++ b/docs/dsp-tools-excel2xml.md @@ -63,7 +63,8 @@ For `make_boolean_prop(cell)`, the following formats are supported: #### Check if a cell contains a usable value The method `check_notna(cell)` checks a value if it is usable in the context of data archiving. 
A value is considered usable if it is - - a number (integer or float, but not np.nan) + + - a number (integer or float, but not numpy.nan) - a boolean - a string with at least one Unicode letter, underscore, or number, but not "None", "", "N/A", or "-" - a PropertyElement whose "value" fulfills the above criteria diff --git a/knora/dsp_tools.py b/knora/dsp_tools.py index 0dc7c5b8b..6f0f9034b 100644 --- a/knora/dsp_tools.py +++ b/knora/dsp_tools.py @@ -6,16 +6,16 @@ import sys from importlib.metadata import version -from knora.dsplib.utils.excel_to_json_lists import list_excel2json, validate_lists_section_with_schema -from knora.dsplib.utils.excel_to_json_properties import properties_excel2json -from knora.dsplib.utils.excel_to_json_resources import resources_excel2json +from knora.dsplib.utils.excel_to_json_lists import excel2lists, validate_lists_section_with_schema +from knora.dsplib.utils.excel_to_json_properties import excel2properties +from knora.dsplib.utils.excel_to_json_resources import excel2resources from knora.dsplib.utils.id_to_iri import id_to_iri from knora.dsplib.utils.onto_create_lists import create_lists from knora.dsplib.utils.onto_create_ontology import create_project from knora.dsplib.utils.onto_get import get_ontology from knora.dsplib.utils.onto_validate import validate_project from knora.dsplib.utils.xml_upload import xml_upload -from knora.dsplib.utils.shared_methods import validate_xml_against_schema +from knora.dsplib.utils.shared import validate_xml_against_schema from knora.excel2xml import excel2xml @@ -186,14 +186,14 @@ def program(user_args: list[str]) -> None: verbose=args.verbose, incremental=args.incremental) elif args.action == 'excel2lists': - list_excel2json(excelfolder=args.excelfolder, - outfile=args.outfile) + excel2lists(excelfolder=args.excelfolder, + outfile=args.outfile) elif args.action == 'excel2resources': - resources_excel2json(excelfile=args.excelfile, - outfile=args.outfile) + 
excel2resources(excelfile=args.excelfile, + outfile=args.outfile) elif args.action == 'excel2properties': - properties_excel2json(excelfile=args.excelfile, - outfile=args.outfile) + excel2properties(excelfile=args.excelfile, + outfile=args.outfile) elif args.action == 'id2iri': id_to_iri(xml_file=args.xmlfile, json_file=args.jsonfile, diff --git a/knora/dsplib/models/propertyelement.py b/knora/dsplib/models/propertyelement.py new file mode 100644 index 000000000..eb3291a42 --- /dev/null +++ b/knora/dsplib/models/propertyelement.py @@ -0,0 +1,54 @@ +from typing import Union, Optional +import pandas as pd +import regex +from dataclasses import dataclass +from knora.dsplib.models.helpers import BaseError + + +@dataclass(frozen=True) +class PropertyElement: + """ + A PropertyElement object carries more information about a property value than the value itself. + The "value" is the value that could be passed to a method as plain string/int/float/bool. Use a PropertyElement + instead to define more precisely what attributes your tag (for example) will have. + + Args: + value: This is the content that will be written between the tags (for example) + permissions: This is the permissions that your tag (for example) will have + comment: This is the comment that your tag (for example) will have + encoding: For tags only. Can be "xml" or "utf8". 
+ + Examples: + See the difference between the first and the second example: + + >>> make_text_prop(":testproperty", "first text") + + + first text + + + >>> make_text_prop(":testproperty", PropertyElement("first text", permissions="prop-restricted", encoding="xml")) + + + first text + + + """ + value: Union[str, int, float, bool] + permissions: str = "prop-default" + comment: Optional[str] = None + encoding: Optional[str] = None + + def __post_init__(self) -> None: + if not any([ + isinstance(self.value, int), + isinstance(self.value, float) and pd.notna(self.value), # necessary because isinstance(np.nan, float) + isinstance(self.value, bool), + isinstance(self.value, str) and all([ + regex.search(r"\p{L}|\d|_", self.value, flags=regex.UNICODE), + not bool(regex.search(r"^(none||-|n/a)$", self.value, flags=regex.IGNORECASE)) + ]) + ]): + raise BaseError(f"'{self.value}' is not a valid value for a PropertyElement") + if self.encoding not in ["utf8", "xml", None]: + raise BaseError(f"'{self.encoding}' is not a valid encoding for a PropertyElement") diff --git a/knora/dsplib/schemas/properties-only.json b/knora/dsplib/schemas/properties-only.json index 6720d6758..dc16fe9a0 100644 --- a/knora/dsplib/schemas/properties-only.json +++ b/knora/dsplib/schemas/properties-only.json @@ -67,17 +67,17 @@ "oneOf": [ { "enum": [ - "TextValue", + "BooleanValue", "ColorValue", "DateValue", "DecimalValue", "GeonameValue", "IntValue", - "BooleanValue", - "TimeValue", - "UriValue", "IntervalValue", "ListValue", + "TextValue", + "TimeValue", + "UriValue", "Resource", "Representation" ] @@ -96,11 +96,11 @@ "gui_element": { "type": "string", "enum": [ + "Checkbox", "Colorpicker", "Date", "Geonames", "Interval", - "TimeStamp", "List", "Radio", "Richtext", @@ -109,8 +109,7 @@ "Slider", "Spinbox", "Textarea", - "Checkbox", - "Fileupload" + "TimeStamp" ] }, "gui_attributes": { diff --git a/knora/dsplib/utils/excel_to_json_lists.py b/knora/dsplib/utils/excel_to_json_lists.py index 
998731624..f3b84ff00 100644 --- a/knora/dsplib/utils/excel_to_json_lists.py +++ b/knora/dsplib/utils/excel_to_json_lists.py @@ -3,7 +3,6 @@ import json import os import re -import unicodedata from typing import Any, Union, Optional, Tuple import jsonschema @@ -13,6 +12,7 @@ import regex from knora.dsplib.models.helpers import BaseError +from knora.dsplib.utils.shared import simplify_name list_of_lists_of_previous_cell_values: list[list[str]] = [] """Module level variable used to ensure that there are no duplicate node names""" @@ -236,30 +236,6 @@ def _make_json_lists_from_excel(excel_file_paths: list[str], verbose: bool = Fal return finished_lists -def simplify_name(value: str) -> str: - """ - Simplifies a given value in order to use it as node name - - Args: - value: The value to be simplified - - Returns: - str: The simplified value - """ - simplified_value = str(value).lower() - - # normalize characters (p.ex. ä becomes a) - simplified_value = unicodedata.normalize("NFKD", simplified_value) - - # replace forward slash and whitespace with a dash - simplified_value = re.sub("[/\\s]+", "-", simplified_value) - - # delete all characters which are not letters, numbers or dashes - simplified_value = re.sub("[^A-Za-z0-9\\-]+", "", simplified_value) - - return simplified_value - - def validate_lists_section_with_schema( path_to_json_project_file: Optional[str] = None, lists_section: Optional[list[dict[str, Any]]] = None @@ -273,7 +249,7 @@ def validate_lists_section_with_schema( lists_section: the "lists" section as Python object Returns: - True if the list passed validation. Otherwise, a BaseError with a detailed error report is raised + True if the "lists" section passed validation. 
Otherwise, a BaseError with a detailed error report is raised """ if bool(path_to_json_project_file) == bool(lists_section): raise BaseError("Validation of the 'lists' section works only if exactly one of the two arguments is given.") @@ -283,12 +259,15 @@ def validate_lists_section_with_schema( if path_to_json_project_file: with open(path_to_json_project_file) as f: project = json.load(f) - lists_section = project["project"]["lists"] + lists_section = project["project"].get("lists") + if not lists_section: + raise BaseError(f"Cannot validate \"lists\" section of {path_to_json_project_file}, because there is " + f"no \"lists\" section in this file.") try: jsonschema.validate(instance={"lists": lists_section}, schema=lists_schema) except jsonschema.exceptions.ValidationError as err: - raise BaseError(f'"Lists" section did not pass validation. The error message is: {err.message}\n' + raise BaseError(f'"lists" section did not pass validation. The error message is: {err.message}\n' f'The error occurred at {err.json_path}') return True @@ -318,23 +297,30 @@ def _extract_excel_file_paths(excelfolder: str) -> list[str]: return excel_file_paths -def list_excel2json(excelfolder: str, outfile: str) -> None: +def excel2lists(excelfolder: str, path_to_output_file: Optional[str] = None) -> list[dict[str, Any]]: """ - This method writes a JSON file with a "lists" section that can later be inserted into a JSON project file. + Converts lists described in Excel files into a "lists" section that can be inserted into a JSON project file. 
Args: excelfolder: path to the folder containing the Excel file(s) - outfile: path to the JSON file the output is written into + path_to_output_file: if provided, the output is written into this JSON file Returns: - None + the "lists" section as Python list """ + # read the data excel_file_paths = _extract_excel_file_paths(excelfolder) print("The following Excel files will be processed:") [print(f" - {filename}") for filename in excel_file_paths] + + # construct the "lists" section finished_lists = _make_json_lists_from_excel(excel_file_paths, verbose=True) validate_lists_section_with_schema(lists_section=finished_lists) - with open(outfile, "w", encoding="utf-8") as fp: - json.dump({"lists": finished_lists}, fp, indent=4, sort_keys=False, ensure_ascii=False) - print("List was created successfully and written to file:", outfile) + # write final "lists" section + if path_to_output_file: + with open(path_to_output_file, "w", encoding="utf-8") as fp: + json.dump(finished_lists, fp, indent=4, ensure_ascii=False) + print('"lists" section was created successfully and written to file:', path_to_output_file) + + return finished_lists diff --git a/knora/dsplib/utils/excel_to_json_properties.py b/knora/dsplib/utils/excel_to_json_properties.py index 822ba9584..78b800457 100644 --- a/knora/dsplib/utils/excel_to_json_properties.py +++ b/knora/dsplib/utils/excel_to_json_properties.py @@ -1,37 +1,32 @@ import json -import os import re -from typing import Any - +from typing import Any, Optional import jsonschema import pandas as pd -from knora.dsplib.utils.excel_to_json_resources import prepare_dataframe +from knora.dsplib.models.helpers import BaseError +from knora.dsplib.utils.shared import prepare_dataframe languages = ["en", "de", "fr", "it", "rm"] -def _validate_properties_with_schema(json_file: str) -> bool: +def _validate_properties_with_schema(properties_list: list[dict[str, Any]]) -> bool: """ - This function checks if the json properties are valid according to the 
schema. + This function checks if the "properties" section of a JSON project file is valid according to the schema. Args: - json_file: the json with the properties to be validated + properties_list: the "properties" section of a JSON project as a list of dicts Returns: - True if the data passed validation, False otherwise - + True if the "properties" section passed validation. Otherwise, a BaseError with a detailed error report is raised. """ - current_dir = os.path.dirname(os.path.realpath(__file__)) - with open(os.path.join(current_dir, "../schemas/properties-only.json")) as schema: + with open("knora/dsplib/schemas/properties-only.json") as schema: properties_schema = json.load(schema) - try: - jsonschema.validate(instance=json_file, schema=properties_schema) + jsonschema.validate(instance=properties_list, schema=properties_schema) except jsonschema.exceptions.ValidationError as err: - print(err) - return False - print("Properties data passed schema validation.") + raise BaseError(f'"properties" section did not pass validation. 
The error message is: {err.message}\n' + f'The error occurred at {err.json_path}') return True @@ -42,19 +37,19 @@ def _row2prop(row: pd.Series, row_count: int, excelfile: str) -> dict[str, Any]: Args: row: row from a pandas DataFrame that defines a property row_count: row number of Excel file - excelfile: name of the original excel file + excelfile: name of the original Excel file Returns: dict object of the property """ + # extract the elements that are necessary to build the property name = row["name"] supers = [s.strip() for s in row["super"].split(",")] _object = row["object"] labels = {lang: row[lang] for lang in languages if row.get(lang)} comments = {lang: row[f"comment_{lang}"] for lang in languages if row.get(f"comment_{lang}")} gui_element = row["gui_element"] - gui_attributes = dict() if row.get("hlist"): gui_attributes["hlist"] = row["hlist"] @@ -71,12 +66,13 @@ def _row2prop(row: pd.Series, row_count: int, excelfile: str) -> dict[str, Any]: val = int(val) gui_attributes[attr] = val - # build the dict structure of this property and append it to the list of properties + # build the dict structure of this property _property = { "name": name, "super": supers, "object": _object, - "labels": labels} + "labels": labels + } if comments: _property["comments"] = comments _property["gui_element"] = gui_element @@ -86,16 +82,17 @@ def _row2prop(row: pd.Series, row_count: int, excelfile: str) -> dict[str, Any]: return _property -def properties_excel2json(excelfile: str, outfile: str) -> None: +def excel2properties(excelfile: str, path_to_output_file: Optional[str] = None) -> list[dict[str, Any]]: """ - Converts properties described in an Excel file into a properties section which can be integrated into a DSP ontology + Converts properties described in an Excel file into a "properties" section which can be inserted into a JSON + project file. 
Args: excelfile: path to the Excel file containing the properties - outfile: path to the output JSON file containing the properties section for the ontology + path_to_output_file: if provided, the output is written into this JSON file Returns: - None + the "properties" section as Python list """ # load file @@ -107,12 +104,12 @@ def properties_excel2json(excelfile: str, outfile: str) -> None: # transform every row into a property props = [_row2prop(row, i, excelfile) for i, row in df.iterrows()] + _validate_properties_with_schema(props) + + # write final JSON file + if path_to_output_file: + with open(file=path_to_output_file, mode="w", encoding="utf-8") as file: + json.dump(props, file, indent=4, ensure_ascii=False) + print('"properties" section was created successfully and written to file:', path_to_output_file) - # write final list to JSON file if list passed validation - if _validate_properties_with_schema(json.loads(json.dumps(props, indent=4))): - with open(file=outfile, mode="w+", encoding="utf-8") as file: - file.write('"properties": ') - json.dump(props, file, indent=4) - print("Properties file was created successfully and written to file: ", outfile) - else: - print("Properties data is not valid according to schema.") + return props diff --git a/knora/dsplib/utils/excel_to_json_resources.py b/knora/dsplib/utils/excel_to_json_resources.py index c5d5641e5..af9615e03 100644 --- a/knora/dsplib/utils/excel_to_json_resources.py +++ b/knora/dsplib/utils/excel_to_json_resources.py @@ -1,70 +1,33 @@ import json -import os -import re -from typing import Any - +from typing import Any, Optional import jsonschema import pandas as pd +from knora.dsplib.models.helpers import BaseError +from knora.dsplib.utils.shared import prepare_dataframe languages = ["en", "de", "fr", "it", "rm"] -def _validate_resources_with_schema(json_file: str) -> bool: +def _validate_resources_with_schema(resources_list: list[dict[str, Any]]) -> bool: """ - This function checks if the json 
resources are valid according to the schema. + This function checks if the "resources" section of a JSON project file is valid according to the schema. Args: - json_file: the json with the resources to be validated + resources_list: the "resources" section of a JSON project as a list of dicts Returns: - True if the data passed validation, False otherwise + True if the "resources" section passed validation. Otherwise, a BaseError with a detailed error report is raised. """ - current_dir = os.path.dirname(os.path.realpath(__file__)) - with open(os.path.join(current_dir, "../schemas/resources-only.json")) as schema: + with open("knora/dsplib/schemas/resources-only.json") as schema: resources_schema = json.load(schema) - try: - jsonschema.validate(instance=json_file, schema=resources_schema) + jsonschema.validate(instance=resources_list, schema=resources_schema) except jsonschema.exceptions.ValidationError as err: - print(err) - return False - print("Resource data passed schema validation.") + raise BaseError(f'"resources" section did not pass validation. The error message is: {err.message}\n' + f'The error occurred at {err.json_path}') return True -def prepare_dataframe(df: pd.DataFrame, required_columns: list[str], location_of_sheet: str) -> pd.DataFrame: - """ - Takes a pandas DataFrame, strips the column headers from whitespaces and transforms them to lowercase, - strips every cell from whitespaces and inserts "" if there is no string in it, and deletes the rows that don't have - a value in one of the required cells. 
- - Args: - df: pandas DataFrame - required_columns: headers of the columns where a value is required - location_of_sheet: for better error messages, provide this information of the caller - - Returns: - prepared DataFrame - """ - - any_char_regex = r"[\wäàçëéèêïöôòüÄÀÇËÉÊÏÖÔÒÜ]" - - # strip column headers and transform to lowercase, so that the script doesn't break when the headers vary a bit - new_df = df.rename(columns=lambda x: x.strip().lower()) - required_columns = [x.strip().lower() for x in required_columns] - # strip every cell, and insert "" if there is no valid word in it - new_df = new_df.applymap(lambda x: str(x).strip() if pd.notna(x) and re.search(any_char_regex, str(x), flags=re.IGNORECASE) else "") - # delete rows that don't have the required columns - for req in required_columns: - if req not in new_df: - raise ValueError(f"{location_of_sheet} requires a column named '{req}'") - new_df = new_df[pd.notna(new_df[req])] - new_df = new_df[[bool(re.search(any_char_regex, x, flags=re.IGNORECASE)) for x in new_df[req]]] - if len(new_df) < 1: - raise ValueError(f"{location_of_sheet} requires at least one row") - return new_df - - def _row2resource(row: pd.Series, excelfile: str) -> dict[str, Any]: """ Method that takes a row from a pandas DataFrame, reads its content, and returns a dict object of the resource @@ -112,16 +75,17 @@ def _row2resource(row: pd.Series, excelfile: str) -> dict[str, Any]: return resource -def resources_excel2json(excelfile: str, outfile: str) -> None: +def excel2resources(excelfile: str, path_to_output_file: Optional[str] = None) -> list[dict[str, Any]]: """ - Converts properties described in an Excel file into a properties section which can be integrated into a DSP ontology + Converts resources described in an Excel file into a "resources" section which can be inserted into a JSON + project file. 
Args: - excelfile: path to the Excel file containing the properties - outfile: path to the output JSON file containing the properties section for the ontology + excelfile: path to the Excel file containing the resources + path_to_output_file: if provided, the output is written into this JSON file Returns: - None + the "resources" section as Python list """ # load file @@ -129,16 +93,17 @@ def resources_excel2json(excelfile: str, outfile: str) -> None: all_classes_df = prepare_dataframe( df=all_classes_df, required_columns=["name", "super"], - location_of_sheet=f"Sheet 'classes' in file '{excelfile}'") + location_of_sheet=f"Sheet 'classes' in file '{excelfile}'" + ) # transform every row into a resource resources = [_row2resource(row, excelfile) for i, row in all_classes_df.iterrows()] + _validate_resources_with_schema(resources) + + # write final "resources" section into a JSON file + if path_to_output_file: + with open(file=path_to_output_file, mode="w", encoding="utf-8") as file: + json.dump(resources, file, indent=4, ensure_ascii=False) + print('"resources" section was created successfully and written to file:', path_to_output_file) - # write final list of all resources to JSON file, if list passed validation - if _validate_resources_with_schema(json.loads(json.dumps(resources, indent=4))): - with open(file=outfile, mode="w+", encoding="utf-8") as file: - file.write('"resources": ') - json.dump(resources, file, indent=4) - print("Resource file was created successfully and written to file ", outfile) - else: - print("Resource data is not valid according to schema.") + return resources diff --git a/knora/dsplib/utils/onto_create_lists.py b/knora/dsplib/utils/onto_create_lists.py index 9a3340510..faff0fe71 100644 --- a/knora/dsplib/utils/onto_create_lists.py +++ b/knora/dsplib/utils/onto_create_lists.py @@ -7,7 +7,7 @@ from ..models.helpers import BaseError from ..models.listnode import ListNode from ..models.project import Project -from .shared_methods import 
login, try_network_action +from .shared import login, try_network_action def _create_list_node( diff --git a/knora/dsplib/utils/onto_create_ontology.py b/knora/dsplib/utils/onto_create_ontology.py index ec3bc3b19..1f3224dde 100644 --- a/knora/dsplib/utils/onto_create_ontology.py +++ b/knora/dsplib/utils/onto_create_ontology.py @@ -16,7 +16,7 @@ from knora.dsplib.utils.excel_to_json_lists import expand_lists_from_excel from knora.dsplib.utils.onto_create_lists import create_lists from knora.dsplib.utils.onto_validate import validate_project -from knora.dsplib.utils.shared_methods import login, try_network_action +from knora.dsplib.utils.shared import login, try_network_action def _create_project(con: Connection, project_definition: dict[str, Any]) -> Project: diff --git a/knora/dsplib/utils/shared_methods.py b/knora/dsplib/utils/shared.py similarity index 50% rename from knora/dsplib/utils/shared_methods.py rename to knora/dsplib/utils/shared.py index 4c34468f1..6ade52982 100644 --- a/knora/dsplib/utils/shared_methods.py +++ b/knora/dsplib/utils/shared.py @@ -1,13 +1,15 @@ -import re import time -from datetime import datetime -from typing import Union, Callable, Any, Optional - +import unicodedata +import pandas as pd +import regex from lxml import etree from requests import RequestException +from datetime import datetime +from typing import Callable, Any, Optional from knora.dsplib.models.connection import Connection from knora.dsplib.models.helpers import BaseError +from knora.dsplib.models.propertyelement import PropertyElement def login(server: str, user: str, password: str) -> Connection: @@ -64,7 +66,7 @@ def try_network_action( time.sleep(2 ** i) continue except BaseError as err: - if re.search(r'try again later', err.message) or re.search(r'status code=5\d\d', err.message): + if regex.search(r'try again later', err.message) or regex.search(r'status code=5\d\d', err.message): print(f'{datetime.now().isoformat()}: Try reconnecting to DSP server, next attempt 
in {2 ** i} seconds...') time.sleep(2 ** i) continue @@ -107,3 +109,92 @@ def validate_xml_against_schema(input_file: str, schema_file: str) -> bool: for error in xmlschema.error_log: error_msg = error_msg + f"\n Line {error.line}: {error.message}" raise BaseError(error_msg) + + +def prepare_dataframe(df: pd.DataFrame, required_columns: list[str], location_of_sheet: str) -> pd.DataFrame: + """ + Takes a pandas DataFrame, strips the column headers from whitespaces and transforms them to lowercase, + strips every cell from whitespaces and inserts "" if there is no string in it, and deletes the rows that don't have + a value in one of the required cells. + + Args: + df: pandas DataFrame + required_columns: headers of the columns where a value is required + location_of_sheet: for better error messages, provide this information of the caller + + Returns: + prepared DataFrame + """ + # strip column headers and transform to lowercase, so that the script doesn't break when the headers vary a bit + new_df = df.rename(columns=lambda x: x.strip().lower()) + required_columns = [x.strip().lower() for x in required_columns] + # strip every cell, and insert "" if there is no valid word in it + new_df = new_df.applymap( + lambda x: str(x).strip() if pd.notna(x) and regex.search(r"[\w\p{L}]", str(x), flags=regex.U) else "" + ) + # delete rows that don't have the required columns + for req in required_columns: + if req not in new_df: + raise ValueError(f"{location_of_sheet} requires a column named '{req}'") + new_df = new_df[pd.notna(new_df[req])] + new_df = new_df[[bool(regex.search(r"[\w\p{L}]", x, flags=regex.U)) for x in new_df[req]]] + if len(new_df) < 1: + raise ValueError(f"{location_of_sheet} requires at least one row") + return new_df + + +def simplify_name(value: str) -> str: + """ + Simplifies a given value in order to use it as node name + + Args: + value: The value to be simplified + + Returns: + str: The simplified value + """ + simplified_value = str(value).lower() + 
+ # normalize characters (p.ex. ä becomes a) + simplified_value = unicodedata.normalize("NFKD", simplified_value) + + # replace forward slash and whitespace with a dash + simplified_value = regex.sub("[/\\s]+", "-", simplified_value) + + # delete all characters which are not letters, numbers or dashes + simplified_value = regex.sub("[^A-Za-z0-9\\-]+", "", simplified_value) + + return simplified_value + + +def check_notna(value: Optional[Any]) -> bool: + """ + Check a value if it is usable in the context of data archiving. A value is considered usable if it is + - a number (integer or float, but not np.nan) + - a boolean + - a string with at least one Unicode letter, underscore, or number, but not "None", "", "N/A", or "-" + - a PropertyElement whose "value" fulfills the above criteria + + Args: + value: any object encountered when analysing data + + Returns: + True if the value is usable, False if it is N/A or otherwise unusable + """ + + if isinstance(value, PropertyElement): + value = value.value + + if any([ + isinstance(value, int), + isinstance(value, float) and pd.notna(value), # necessary because isinstance(np.nan, float) + isinstance(value, bool) + ]): + return True + elif isinstance(value, str): + return all([ + regex.search(r"\p{L}|\d|_", value, flags=regex.UNICODE), + not bool(regex.search(r"^(none||-|n/a)$", value, flags=regex.IGNORECASE)) + ]) + else: + return False diff --git a/knora/dsplib/utils/xml_upload.py b/knora/dsplib/utils/xml_upload.py index 9f761f427..65a0fa146 100644 --- a/knora/dsplib/utils/xml_upload.py +++ b/knora/dsplib/utils/xml_upload.py @@ -22,7 +22,7 @@ from knora.dsplib.models.xmlpermission import XmlPermission from knora.dsplib.models.xmlproperty import XMLProperty from knora.dsplib.models.xmlresource import XMLResource -from knora.dsplib.utils.shared_methods import try_network_action, validate_xml_against_schema +from knora.dsplib.utils.shared import try_network_action, validate_xml_against_schema def 
_remove_circular_references(resources: list[XMLResource], verbose: bool) -> \ @@ -382,8 +382,8 @@ def _upload_resources( print(err.message) failed_uploads.append(resource.id) continue - bitstream_size_uploaded_mb += round(next(bitstream_all_sizes_iterator), 1) - print(f"Uploaded file '{resource.bitstream.value}' ({bitstream_size_uploaded_mb} MB / {bitstream_size_total_mb} MB)") + bitstream_size_uploaded_mb += next(bitstream_all_sizes_iterator) + print(f"Uploaded file '{resource.bitstream.value}' ({bitstream_size_uploaded_mb:.1f} MB / {bitstream_size_total_mb} MB)") internal_file_name_bitstream = img['uploadedFiles'][0]['internalFilename'] resource_bitstream = resource.get_bitstream(internal_file_name_bitstream, permissions_lookup) diff --git a/knora/excel2xml.py b/knora/excel2xml.py index 0a41a70a4..af6e79a0e 100644 --- a/knora/excel2xml.py +++ b/knora/excel2xml.py @@ -15,7 +15,8 @@ import dataclasses from knora.dsplib.models.helpers import BaseError -from knora.dsplib.utils.excel_to_json_lists import simplify_name +from knora.dsplib.models.propertyelement import PropertyElement +from knora.dsplib.utils.shared import simplify_name, check_notna ############################## # global variables and classes @@ -26,47 +27,6 @@ } -@dataclasses.dataclass(frozen=True) -class PropertyElement: - """ - A PropertyElement object carries more information about a property value than the value itself. - The "value" is the value that could be passed to a method as plain string/int/float/bool. Use a PropertyElement - instead to define more precisely what attributes your tag (for example) will have. - - Args: - value: This is the content that will be written between the tags (for example) - permissions: This is the permissions that your tag (for example) will have - comment: This is the comment that your tag (for example) will have - encoding: For tags only. Can be "xml" or "utf8". 
- - Examples: - See the difference between the first and the second example: - - >>> make_text_prop(":testproperty", "first text") - - - first text - - - >>> make_text_prop(":testproperty", PropertyElement("first text", permissions="prop-restricted", encoding="xml")) - - - first text - - - """ - value: Union[str, int, float, bool] - permissions: str = "prop-default" - comment: Optional[str] = None - encoding: Optional[str] = None - - def __post_init__(self) -> None: - if not check_notna(self.value): - raise BaseError(f"'{self.value}' is not a valid value for a PropertyElement") - if self.encoding not in ["utf8", "xml", None]: - raise BaseError(f"'{self.encoding}' is not a valid encoding for a PropertyElement") - - ########### # functions ########### @@ -269,39 +229,6 @@ def find_date_in_string(string: str, calling_resource: str = "") -> Optional[str return None -def check_notna(value: Optional[Any]) -> bool: - """ - Check a value if it is usable in the context of data archiving. A value is considered usable if it is - - a number (integer or float, but not np.nan) - - a boolean - - a string with at least one Unicode letter, underscore, or number, but not "None", "", "N/A", or "-" - - a PropertyElement whose "value" fulfills the above criteria - - Args: - value: any object encountered when analysing data - - Returns: - True if the value is usable, False if it is N/A or otherwise unusable - """ - - if isinstance(value, PropertyElement): - value = value.value - - if any([ - isinstance(value, int), - isinstance(value, float) and pd.notna(value), # necessary because isinstance(np.nan, float) - isinstance(value, bool) - ]): - return True - elif isinstance(value, str): - return all([ - regex.search(r"\p{L}|\d|_", value, flags=re.UNICODE), - not bool(re.search(r"^(none||-|n/a)$", value, flags=re.IGNORECASE)) - ]) - else: - return False - - def _check_and_prepare_values( value: Optional[Union[PropertyElement, str, int, float, bool]], values: 
Optional[Iterable[Union[PropertyElement, str, int, float, bool]]], diff --git a/test/e2e/test_tools.py b/test/e2e/test_tools.py index 281687402..f20f4d991 100644 --- a/test/e2e/test_tools.py +++ b/test/e2e/test_tools.py @@ -6,8 +6,8 @@ import re from knora.dsplib.utils import excel_to_json_lists -from knora.dsplib.utils.excel_to_json_properties import properties_excel2json -from knora.dsplib.utils.excel_to_json_resources import resources_excel2json +from knora.dsplib.utils.excel_to_json_properties import excel2properties +from knora.dsplib.utils.excel_to_json_resources import excel2resources from knora.dsplib.utils.id_to_iri import id_to_iri from knora.dsplib.utils.onto_create_ontology import create_project from knora.dsplib.utils.onto_get import get_ontology @@ -157,16 +157,16 @@ def test_get(self) -> None: self.assertEqual(excel_list.get('comments'), excel_list_out.get('comments')) def test_excel_to_json_list(self) -> None: - excel_to_json_lists.list_excel2json(excelfolder='testdata/multilingual_lists', - outfile='testdata/tmp/_lists-out.json') + excel_to_json_lists.excel2lists(excelfolder='testdata/lists_multilingual', + path_to_output_file='testdata/tmp/_lists-out.json') def test_excel_to_json_resources(self) -> None: - resources_excel2json(excelfile='testdata/Resources.xlsx', - outfile='testdata/tmp/_out_resources.json') + excel2resources(excelfile='testdata/Resources.xlsx', + path_to_output_file='testdata/tmp/_out_resources.json') def test_excel_to_json_properties(self) -> None: - properties_excel2json(excelfile='testdata/Properties.xlsx', - outfile='testdata/tmp/_out_properties.json') + excel2properties(excelfile='testdata/Properties.xlsx', + path_to_output_file='testdata/tmp/_out_properties.json') def test_create_project(self) -> None: result1 = create_project( diff --git a/test/unittests/test_excel2xml.py b/test/unittests/test_excel2xml.py index 6fd104012..a897e4b01 100644 --- a/test/unittests/test_excel2xml.py +++ b/test/unittests/test_excel2xml.py @@ 
-141,18 +141,6 @@ def test_make_xsd_id_compatible(self) -> None: self.assertRaises(BaseError, excel2xml.make_xsd_id_compatible, ".") - def test_check_notna(self) -> None: - na_values = [None, pd.NA, np.nan, "", " ", "-", ",", ".", "*", "!", " ⳰", " ῀ ", " ῾ ", " \n\t ", "N/A", "n/a", - "", ["a", "b"], pd.array(["a", "b"]), np.array([0, 1])] - for na_value in na_values: - self.assertFalse(excel2xml.check_notna(na_value), msg=f"Failed na_value: {na_value}") - - notna_values = [1, 0.1, True, False, "True", "False", r" \n\t ", "0", "_", "Ὅμηρος"] - notna_values.extend([excel2xml.PropertyElement(x) for x in notna_values]) - for notna_value in notna_values: - self.assertTrue(excel2xml.check_notna(notna_value), msg=f"Failed notna_value: {notna_value}") - - def test_find_date_in_string(self) -> None: # template: 2021-01-01 | 2015_01_02 diff --git a/test/unittests/test_excel_to_json_lists.py b/test/unittests/test_excel_to_json_lists.py index 4d9714112..d1feb05a4 100644 --- a/test/unittests/test_excel_to_json_lists.py +++ b/test/unittests/test_excel_to_json_lists.py @@ -1,4 +1,5 @@ """unit tests for Excel to JSON list""" +import copy import os import unittest import json @@ -6,6 +7,7 @@ import jsonpath_ng.ext import pandas as pd import regex +from typing import Any from knora.dsplib.models.helpers import BaseError from knora.dsplib.utils import excel_to_json_lists as e2l @@ -25,20 +27,95 @@ def tearDownClass(cls) -> None: os.remove('testdata/tmp/' + file) os.rmdir('testdata/tmp') - def test_excel2jsonlist(self) -> None: + + def test_expand_lists_from_excel(self) -> None: + # take the "lists" section of the systematic test project, expand it, and check if it is equal to the expanded + # version stored in the testdata folder + with open("testdata/test-project-systematic.json") as f: + lists_with_excel_reference = json.load(f)["project"]["lists"] + lists_with_excel_reference_output, success1 = e2l.expand_lists_from_excel(lists_with_excel_reference) + with 
open("testdata/lists_section_expanded.json") as f: + lists_with_excel_reference_output_expected = json.load(f)["expanded lists section of test-project-systematic.json"] + self.assertTrue(success1) + self.assertListEqual(lists_with_excel_reference_output, lists_with_excel_reference_output_expected) + + # take the expanded version, and make sure that it is returned unchanged + lists_without_excel_reference = lists_with_excel_reference_output_expected + lists_without_excel_reference_output, success2 = e2l.expand_lists_from_excel(lists_without_excel_reference) + self.assertTrue(success2) + self.assertListEqual(lists_without_excel_reference, lists_without_excel_reference_output) + + + def test_make_json_lists_from_excel(self) -> None: + lists_multilingual = [f"testdata/lists_multilingual/{lang}.xlsx" for lang in ["de", "en", "fr"]] + lists_multilingual_output = e2l._make_json_lists_from_excel(lists_multilingual) + with open("testdata/lists_multilingual_output_expected.json") as f: + lists_multilingual_output_expected = json.load(f) + self.assertListEqual(lists_multilingual_output, lists_multilingual_output_expected) + + + def test_validate_lists_section_with_schema(self) -> None: + with open("testdata/lists_multilingual_output_expected.json") as f: + lists_section_valid = json.load(f) + + # validate the valid "lists" section in a correct way + self.assertTrue(e2l.validate_lists_section_with_schema(lists_section=lists_section_valid)) + + # remove mandatory "comments" section from root node + lists_section_without_comment_at_rootnode = copy.deepcopy(lists_section_valid) + del lists_section_without_comment_at_rootnode[0]["comments"] + with self.assertRaisesRegex( + BaseError, + "\"lists\" section did not pass validation. 
The error message is: 'comments' is a required property" + ): + e2l.validate_lists_section_with_schema(lists_section=lists_section_without_comment_at_rootnode) + + # add a comment with an invalid language code ("eng") to the root node + lists_section_with_invalid_lang = copy.deepcopy(lists_section_valid) + lists_section_with_invalid_lang[0]["comments"]["eng"] = "wrong English label" + with self.assertRaisesRegex( + BaseError, + "\"lists\" section did not pass validation. The error message is: 'eng' does not match any of the regexes" + ): + e2l.validate_lists_section_with_schema(lists_section=lists_section_with_invalid_lang) + + # wrong usage of the method + with self.assertRaisesRegex( + BaseError, + "Validation of the 'lists' section works only if exactly one of the two arguments is given."
+ ): + e2l.validate_lists_section_with_schema() + + # pass a file that doesn't have a "lists" section + with self.assertRaisesRegex(BaseError, "there is no \"lists\" section"): + e2l.validate_lists_section_with_schema(path_to_json_project_file="testdata/test-project-minimal.json") + + + def test_excel2lists(self) -> None: for mode in ["monolingual", "multilingual"]: # create output files - input_df = pd.read_excel(f"testdata/{mode}_lists/de.xlsx", header=None, dtype='str') + input_df = pd.read_excel(f"testdata/lists_{mode}/de.xlsx", header=None, dtype='str') input_df = input_df.applymap(lambda x: x if pd.notna(x) and regex.search(r"\p{L}", str(x), flags=regex.UNICODE) else pd.NA) input_df.dropna(axis="index", how="all", inplace=True) - excelfolder = f"testdata/{mode}_lists" + excelfolder = f"testdata/lists_{mode}" outfile = f"testdata/tmp/lists_output_{mode}.json" - e2l.list_excel2json(excelfolder=excelfolder, outfile=outfile) + output_from_method = e2l.excel2lists(excelfolder=excelfolder, path_to_output_file=outfile) - # check that the output file has the same number of nodes than the Excel file has rows + # check that output from file and from method are equal with open(outfile) as f: - output_as_dict = json.load(f) - output_nodes_matches = jsonpath_ng.parse('$..name').find(output_as_dict) + output_from_file: list[dict[str, Any]] = json.load(f) + self.assertListEqual(output_from_file, output_from_method) + + # check that the output file has the same number of nodes than the Excel file has rows + output_nodes_matches = jsonpath_ng.parse('$..name').find(output_from_file) self.assertTrue( len(input_df.index) == len(output_nodes_matches), f"The output JSON file doesn't have the same number of nodes than the Excel file has rows" @@ -55,7 +132,7 @@ def test_excel2jsonlist(self) -> None: parser_string = '$' for elem in jsonpath_elems: parser_string = parser_string + f'.nodes[?(@.labels.en == "{elem}")]' - node_match = 
jsonpath_ng.ext.parse(parser_string).find(output_as_dict) + node_match = jsonpath_ng.ext.parse(parser_string).find(output_from_file) self.assertTrue( len(node_match) == 1, f'The node "{jsonpath_elems[-1]}" from Excel row {index+1} was not correctly translated to the ' @@ -64,9 +141,9 @@ def test_excel2jsonlist(self) -> None: # make sure that the invalid lists raise an Error with self.assertRaisesRegex(BaseError, r"Found duplicate in column 2, row 9"): - e2l.list_excel2json(excelfolder="testdata/invalid_lists_1", outfile=outfile) + e2l.excel2lists(excelfolder="testdata/lists_invalid_1", path_to_output_file=outfile) with self.assertRaisesRegex(BaseError, r"The Excel file with the language code 'de' should have a value in row 10, column 2"): - e2l.list_excel2json(excelfolder="testdata/invalid_lists_2", outfile=outfile) + e2l.excel2lists(excelfolder="testdata/lists_invalid_2", path_to_output_file=outfile) if __name__ == '__main__': diff --git a/test/unittests/test_excel_to_properties.py b/test/unittests/test_excel_to_json_properties.py similarity index 87% rename from test/unittests/test_excel_to_properties.py rename to test/unittests/test_excel_to_json_properties.py index ebe6e7b00..d9d4c3b5e 100644 --- a/test/unittests/test_excel_to_properties.py +++ b/test/unittests/test_excel_to_json_properties.py @@ -4,6 +4,7 @@ import json import jsonpath_ng import jsonpath_ng.ext +from typing import Any from knora.dsplib.utils import excel_to_json_properties as e2j @@ -22,10 +23,10 @@ def tearDownClass(cls) -> None: os.remove('testdata/tmp/' + file) os.rmdir('testdata/tmp') - def test_excel2json(self) -> None: + def test_excel2properties(self) -> None: excelfile = "testdata/Properties.xlsx" outfile = "testdata/tmp/_out_properties.json" - e2j.properties_excel2json(excelfile, outfile) + output_from_method = e2j.excel2properties(excelfile, outfile) # define the expected values from the excel file excel_names = ["correspondsToGenericAnthroponym", "hasAnthroponym", "hasGender", 
"isDesignatedAs", "hasTitle", @@ -76,16 +77,17 @@ def test_excel2json(self) -> None: # read json file with open(outfile) as f: - json_string = f.read() - json_string = "{" + json_string + "}" - json_file = json.loads(json_string) + output_from_file: list[dict[str, Any]] = json.load(f) + + # check that output from file and from method are equal + self.assertListEqual(output_from_file, output_from_method) # extract infos from json file - json_names = [match.value for match in jsonpath_ng.parse("$.properties[*].name").find(json_file)] - json_supers = [match.value for match in jsonpath_ng.parse("$.properties[*].super").find(json_file)] - json_objects = [match.value for match in jsonpath_ng.parse("$.properties[*].object").find(json_file)] + json_names = [match.value for match in jsonpath_ng.parse("$[*].name").find(output_from_file)] + json_supers = [match.value for match in jsonpath_ng.parse("$[*].super").find(output_from_file)] + json_objects = [match.value for match in jsonpath_ng.parse("$[*].object").find(output_from_file)] - json_labels_all = [match.value for match in jsonpath_ng.parse("$.properties[*].labels").find(json_file)] + json_labels_all = [match.value for match in jsonpath_ng.parse("$[*].labels").find(output_from_file)] json_labels: dict[str, list[str]] = dict() for lang in ["de", "it"]: json_labels[lang] = [label.get(lang, "").strip() for label in json_labels_all] @@ -93,13 +95,13 @@ def test_excel2json(self) -> None: json_comments: dict[str, list[str]] = dict() for lang in ["fr", "it"]: json_comments[f"comment_{lang}"] = [resource.get("comments", {}).get(lang, "").strip() - for resource in json_file["properties"]] + for resource in output_from_file] - json_gui_elements = [match.value for match in jsonpath_ng.parse("$.properties[*].gui_element").find(json_file)] + json_gui_elements = [match.value for match in jsonpath_ng.parse("$[*].gui_element").find(output_from_file)] - json_gui_attributes_hasGender = 
jsonpath_ng.ext.parse("$.properties[?name='hasGender'].gui_attributes").find(json_file)[0].value - json_gui_attributes_hasGND = jsonpath_ng.ext.parse("$.properties[?name='hasGND'].gui_attributes").find(json_file)[0].value - json_gui_attributes_hasDecimal = jsonpath_ng.ext.parse("$.properties[?name='hasDecimal'].gui_attributes").find(json_file)[0].value + json_gui_attributes_hasGender = jsonpath_ng.ext.parse("$[?name='hasGender'].gui_attributes").find(output_from_file)[0].value + json_gui_attributes_hasGND = jsonpath_ng.ext.parse("$[?name='hasGND'].gui_attributes").find(output_from_file)[0].value + json_gui_attributes_hasDecimal = jsonpath_ng.ext.parse("$[?name='hasDecimal'].gui_attributes").find(output_from_file)[0].value # make checks self.assertListEqual(excel_names, json_names) diff --git a/test/unittests/test_excel_to_resource.py b/test/unittests/test_excel_to_json_resources.py similarity index 67% rename from test/unittests/test_excel_to_resource.py rename to test/unittests/test_excel_to_json_resources.py index 8a74d4d9a..a92add6c9 100644 --- a/test/unittests/test_excel_to_resource.py +++ b/test/unittests/test_excel_to_json_resources.py @@ -4,9 +4,7 @@ import json import jsonpath_ng import jsonpath_ng.ext -import pandas as pd -import numpy as np - +from typing import Any from knora.dsplib.utils import excel_to_json_resources as e2j @@ -24,31 +22,11 @@ def tearDownClass(cls) -> None: os.remove('testdata/tmp/' + file) os.rmdir('testdata/tmp') - def test_prepare_dataframe(self) -> None: - original_df = pd.DataFrame({ - " TitLE of Column 1 ": ["1", " 0-1 ", "1-n ", pd.NA, " ", " ", "", " 0-n ", np.nan], - " Title of Column 2 ": [None, "1", 1, "text", "text", "text", "text", "text", "text"], - "Title of Column 3": ["", pd.NA, None, "text", "text", "text", "text", np.nan, "text"] - }) - expected_df = pd.DataFrame({ - "title of column 1": [ "0-1", "1-n", "0-n"], - "title of column 2": [ "1", "1", "text"], - "title of column 3": [ "", "", ""] - }) - returned_df = 
e2j.prepare_dataframe( - df=original_df, - required_columns=[" TitLE of Column 1 ", " Title of Column 2 "], - location_of_sheet='' - ) - for expected, returned in zip(expected_df.iterrows(), returned_df.iterrows()): - _, expected_row = expected - _, returned_row = returned - self.assertListEqual(list(expected_row), list(returned_row)) - - def test_excel2json(self) -> None: + + def test_excel2resources(self) -> None: excelfile = "testdata/Resources.xlsx" outfile = "testdata/tmp/_out_resources.json" - e2j.resources_excel2json(excelfile, outfile) + output_from_method = e2j.excel2resources(excelfile, outfile) # define the expected values from the excel file excel_names = ["Owner", "Title", "GenericAnthroponym", "FamilyMember", "MentionedPerson", "Alias", "Image", @@ -80,32 +58,33 @@ def test_excel2json(self) -> None: # read json file with open(outfile) as f: - json_string = f.read() - json_string = "{" + json_string + "}" - json_file = json.loads(json_string) + output_from_file: list[dict[str, Any]] = json.load(f) + # check that output from file and from method are equal + self.assertListEqual(output_from_file, output_from_method) + # extract infos from json file - json_names = [match.value for match in jsonpath_ng.parse("$.resources[*].name").find(json_file)] - json_supers = [match.value for match in jsonpath_ng.parse("$.resources[*].super").find(json_file)] + json_names = [match.value for match in jsonpath_ng.parse("$[*].name").find(output_from_file)] + json_supers = [match.value for match in jsonpath_ng.parse("$[*].super").find(output_from_file)] - json_labels_all = [match.value for match in jsonpath_ng.parse("$.resources[*].labels").find(json_file)] + json_labels_all = [match.value for match in jsonpath_ng.parse("$[*].labels").find(output_from_file)] json_labels: dict[str, list[str]] = dict() for lang in ["en", "rm"]: json_labels[lang] = [label.get(lang, "").strip() for label in json_labels_all] - json_labels_of_image = 
jsonpath_ng.ext.parse('$.resources[?name="Image"].labels').find(json_file)[0].value + json_labels_of_image = jsonpath_ng.ext.parse('$[?name="Image"].labels').find(output_from_file)[0].value json_comments: dict[str, list[str]] = dict() for lang in ["de", "fr"]: # make sure the lists of the json comments contain a blank string even if there is no "comments" section # at all in this resource json_comments[f"comment_{lang}"] = [resource.get("comments", {}).get(lang, "").strip() - for resource in json_file["resources"]] - json_comments_of_image = jsonpath_ng.ext.parse('$.resources[?name="Image"].comments').find(json_file)[0].value + for resource in output_from_file] + json_comments_of_image = jsonpath_ng.ext.parse('$[?name="Image"].comments').find(output_from_file)[0].value json_first_class_properties = [match.value for match in - jsonpath_ng.parse("$.resources[0].cardinalities[*].propname").find(json_file)] + jsonpath_ng.parse("$[0].cardinalities[*].propname").find(output_from_file)] json_first_class_cardinalities = [match.value for match in - jsonpath_ng.parse("$.resources[0].cardinalities[*].cardinality").find(json_file)] + jsonpath_ng.parse("$[0].cardinalities[*].cardinality").find(output_from_file)] # make checks self.assertListEqual(excel_names, json_names) diff --git a/test/unittests/test_shared_methods.py b/test/unittests/test_shared_methods.py new file mode 100644 index 000000000..26ce3f220 --- /dev/null +++ b/test/unittests/test_shared_methods.py @@ -0,0 +1,44 @@ +import unittest +import pandas as pd +import numpy as np +from knora.dsplib.utils import shared +from knora.dsplib.models.propertyelement import PropertyElement + + +class TestSharedMethods(unittest.TestCase): + def test_prepare_dataframe(self) -> None: + original_df = pd.DataFrame({ + " TitLE of Column 1 ": ["1", " 0-1 ", "1-n ", pd.NA, " ", " ", "", " 0-n ", np.nan], + " Title of Column 2 ": [None, "1", 1, "text", "text", "text", "text", "text", "text"], + "Title of Column 3": ["", pd.NA, None, 
"text", "text", "text", "text", np.nan, "text"] + }) + expected_df = pd.DataFrame({ + "title of column 1": [ "0-1", "1-n", "0-n"], + "title of column 2": [ "1", "1", "text"], + "title of column 3": [ "", "", ""] + }) + returned_df = shared.prepare_dataframe( + df=original_df, + required_columns=[" TitLE of Column 1 ", " Title of Column 2 "], + location_of_sheet='' + ) + for expected, returned in zip(expected_df.iterrows(), returned_df.iterrows()): + i, expected_row = expected + _, returned_row = returned + self.assertListEqual(list(expected_row), list(returned_row), msg=f"Failed in row {i}") + + + def test_check_notna(self) -> None: + na_values = [None, pd.NA, np.nan, "", " ", "-", ",", ".", "*", "!", " ⳰", " ῀ ", " ῾ ", " \n\t ", "N/A", "n/a", + "", ["a", "b"], pd.array(["a", "b"]), np.array([0, 1])] + for na_value in na_values: + self.assertFalse(shared.check_notna(na_value), msg=f"Failed na_value: {na_value}") + + notna_values = [1, 0.1, True, False, "True", "False", r" \n\t ", "0", "_", "Ὅμηρος"] + notna_values.extend([PropertyElement(x) for x in notna_values]) + for notna_value in notna_values: + self.assertTrue(shared.check_notna(notna_value), msg=f"Failed notna_value: {notna_value}") + + +if __name__ == '__main__': + unittest.main() diff --git a/testdata/single_list/de.xlsx b/testdata/list_single/de.xlsx similarity index 100% rename from testdata/single_list/de.xlsx rename to testdata/list_single/de.xlsx diff --git a/testdata/single_list/en.xlsx b/testdata/list_single/en.xlsx similarity index 100% rename from testdata/single_list/en.xlsx rename to testdata/list_single/en.xlsx diff --git a/testdata/invalid_lists_1/de.xlsx b/testdata/lists_invalid_1/de.xlsx similarity index 100% rename from testdata/invalid_lists_1/de.xlsx rename to testdata/lists_invalid_1/de.xlsx diff --git a/testdata/invalid_lists_1/en.xlsx b/testdata/lists_invalid_1/en.xlsx similarity index 100% rename from testdata/invalid_lists_1/en.xlsx rename to testdata/lists_invalid_1/en.xlsx diff 
--git a/testdata/invalid_lists_1/fr.xlsx b/testdata/lists_invalid_1/fr.xlsx similarity index 100% rename from testdata/invalid_lists_1/fr.xlsx rename to testdata/lists_invalid_1/fr.xlsx diff --git a/testdata/invalid_lists_2/de.xlsx b/testdata/lists_invalid_2/de.xlsx similarity index 100% rename from testdata/invalid_lists_2/de.xlsx rename to testdata/lists_invalid_2/de.xlsx diff --git a/testdata/invalid_lists_2/en.xlsx b/testdata/lists_invalid_2/en.xlsx similarity index 100% rename from testdata/invalid_lists_2/en.xlsx rename to testdata/lists_invalid_2/en.xlsx diff --git a/testdata/invalid_lists_2/fr.xlsx b/testdata/lists_invalid_2/fr.xlsx similarity index 100% rename from testdata/invalid_lists_2/fr.xlsx rename to testdata/lists_invalid_2/fr.xlsx diff --git a/testdata/monolingual_lists/de.xlsx b/testdata/lists_monolingual/de.xlsx similarity index 100% rename from testdata/monolingual_lists/de.xlsx rename to testdata/lists_monolingual/de.xlsx diff --git a/testdata/multilingual_lists/de.xlsx b/testdata/lists_multilingual/de.xlsx similarity index 100% rename from testdata/multilingual_lists/de.xlsx rename to testdata/lists_multilingual/de.xlsx diff --git a/testdata/multilingual_lists/en.xlsx b/testdata/lists_multilingual/en.xlsx similarity index 100% rename from testdata/multilingual_lists/en.xlsx rename to testdata/lists_multilingual/en.xlsx diff --git a/testdata/multilingual_lists/fr.xlsx b/testdata/lists_multilingual/fr.xlsx similarity index 100% rename from testdata/multilingual_lists/fr.xlsx rename to testdata/lists_multilingual/fr.xlsx diff --git a/testdata/lists_multilingual_output_expected.json b/testdata/lists_multilingual_output_expected.json new file mode 100644 index 000000000..2a8c63e00 --- /dev/null +++ b/testdata/lists_multilingual_output_expected.json @@ -0,0 +1,102 @@ +[ + { + "name": "first-list", + "labels": { + "fr": "première liste", + "en": "first list", + "de": "erste Liste" + }, + "comments": { + "fr": "première liste", + "en": "first list", 
+ "de": "erste Liste" + }, + "nodes": [ + { + "name": "special-characters-12-0-are-embedded", + "labels": { + "fr": "caractères spéciales 1&2-%*_0 dedans", + "en": "special characters 1&2-%*_0 are embedded", + "de": "Spezialzeichen 1&2-%*_0 sind eingebettet" + }, + "nodes": [ + { + "name": "very", + "labels": { + "fr": "très", + "en": "very", + "de": "sehr" + }, + "nodes": [ + { + "name": "deeply", + "labels": { + "fr": "profondément", + "en": "deeply", + "de": "tief" + }, + "nodes": [ + { + "name": "nested", + "labels": { + "fr": "niché!", + "en": "nested!", + "de": "verschachtelt!" + } + } + ] + } + ] + } + ] + } + ] + }, + { + "name": "second-list", + "labels": { + "fr": "deuxième liste", + "en": "second list", + "de": "zweite Liste" + }, + "comments": { + "fr": "deuxième liste", + "en": "second list", + "de": "zweite Liste" + }, + "nodes": [ + { + "name": "first-node", + "labels": { + "fr": "premier noeud", + "en": "first node", + "de": "erster Knoten" + } + }, + { + "name": "duplicate-nodename", + "labels": { + "fr": "noeud doublé", + "en": "duplicate nodename", + "de": "Doppelung" + } + }, + { + "name": "duplicate-nodename-2", + "labels": { + "fr": "noeud doublé!", + "en": "duplicate nodename!", + "de": "Doppelung!" + } + }, + { + "name": "duplicate-nodename-3", + "labels": { + "fr": "noeud doublé?", + "en": "duplicate nodename?", + "de": "Doppelung?" 
+ } + } + ] + } +] diff --git a/testdata/lists_section_expanded.json b/testdata/lists_section_expanded.json new file mode 100644 index 000000000..c8dd4e063 --- /dev/null +++ b/testdata/lists_section_expanded.json @@ -0,0 +1,114 @@ +{ + "expanded lists section of test-project-systematic.json": [ + { + "name": "testlist", + "labels": { + "en": "Testlist", + "rm": "Glista test in Rumantsch" + }, + "comments": { + "en": "no comment", + "de": "kein Kommentar", + "rm": "nagin commentar in Rumantsch" + }, + "nodes": [ + { + "name": "first node of testlist", + "labels": { + "en": "First node of the Test-List", + "rm": "Rumantsch" + } + }, + { + "name": "second node of testlist", + "labels": { + "en": "Second node of the Test-List" + }, + "nodes": [ + { + "name": "first subnode", + "labels": { + "en": "First Sub-Node" + } + }, + { + "name": "second subnode", + "labels": { + "en": "Second Sub-Node" + } + } + ] + }, + { + "name": "third node of testlist", + "labels": { + "en": "Third node of the Test-List" + } + } + ] + }, + { + "name": "my-list-from-excel", + "labels": { + "en": "My list from Excel" + }, + "comments": { + "en": "a comment", + "de": "ein Kommentar", + "fr": "un commentaire" + }, + "nodes": [ + { + "name": "red", + "labels": { + "en": "red", + "de": "rot" + } + }, + { + "name": "yellow", + "labels": { + "en": "yellow", + "de": "gelb" + } + }, + { + "name": "blue", + "labels": { + "en": "blue", + "de": "blau" + } + }, + { + "name": "green", + "labels": { + "en": "green", + "de": "grün" + } + } + ] + }, + { + "name": "notUsedList", + "labels": { + "en": "Not used list" + }, + "comments": { + "en": "no comment", + "de": "kein Kommentar" + }, + "nodes": [ + { + "name": "notUsedNode_1", + "labels": { + "en": "nodeLabel_1\"'" + }, + "comments": { + "en": "Nodes can have comments, too!", + "rm": "Even in Rumantsch!" 
+ } + } + ] + } + ] +} diff --git a/testdata/test-project-systematic.json b/testdata/test-project-systematic.json index 1c4f9656c..c8e0f659b 100644 --- a/testdata/test-project-systematic.json +++ b/testdata/test-project-systematic.json @@ -77,7 +77,7 @@ "fr": "un commentaire" }, "nodes": { - "folder": "testdata/single_list" + "folder": "testdata/list_single" } }, {