Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: tidy up excel2lists, excel2resources, excel2properties (DEV-1352) #229

Merged
merged 8 commits into from Sep 19, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 9 additions & 9 deletions knora/dsp_tools.py
Expand Up @@ -6,9 +6,9 @@
import sys
from importlib.metadata import version

from knora.dsplib.utils.excel_to_json_lists import list_excel2json, validate_lists_section_with_schema
from knora.dsplib.utils.excel_to_json_properties import properties_excel2json
from knora.dsplib.utils.excel_to_json_resources import resources_excel2json
from knora.dsplib.utils.excel_to_json_lists import excel2lists, validate_lists_section_with_schema
from knora.dsplib.utils.excel_to_json_properties import excel2properties
from knora.dsplib.utils.excel_to_json_resources import excel2resources
from knora.dsplib.utils.id_to_iri import id_to_iri
from knora.dsplib.utils.onto_create_lists import create_lists
from knora.dsplib.utils.onto_create_ontology import create_project
Expand Down Expand Up @@ -186,14 +186,14 @@ def program(user_args: list[str]) -> None:
verbose=args.verbose,
incremental=args.incremental)
elif args.action == 'excel2lists':
list_excel2json(excelfolder=args.excelfolder,
outfile=args.outfile)
excel2lists(excelfolder=args.excelfolder,
outfile=args.outfile)
elif args.action == 'excel2resources':
resources_excel2json(excelfile=args.excelfile,
outfile=args.outfile)
excel2resources(excelfile=args.excelfile,
outfile=args.outfile)
elif args.action == 'excel2properties':
properties_excel2json(excelfile=args.excelfile,
outfile=args.outfile)
excel2properties(excelfile=args.excelfile,
outfile=args.outfile)
elif args.action == 'id2iri':
id_to_iri(xml_file=args.xmlfile,
json_file=args.jsonfile,
Expand Down
54 changes: 54 additions & 0 deletions knora/dsplib/models/propertyelement.py
@@ -0,0 +1,54 @@
import dataclasses
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To make excel2xml.py at least a bit shorter, I moved this class here.

from typing import Union, Optional
import pandas as pd
import regex
from knora.dsplib.models.helpers import BaseError


@dataclasses.dataclass(frozen=True)
class PropertyElement:
    """
    A PropertyElement object carries more information about a property value than the value itself.
    The "value" is the value that could be passed to a method as plain string/int/float/bool. Use a PropertyElement
    instead to define more precisely what attributes your <text> tag (for example) will have.

    Args:
        value: The content that will be written between the <text></text> tags (for example)
        permissions: The permissions that your <text> tag (for example) will have
        comment: The comment that your <text> tag (for example) will have
        encoding: For <text> tags only. Can be "xml" or "utf8".

    Raises:
        BaseError: if the value is NaN, is a string without any letter/digit/underscore, is one of the
            placeholder strings "none", "<NA>", "-", "n/a" (case-insensitive), or is of an unsupported type;
            also if the encoding is not one of "utf8", "xml", None

    Examples:
        See the difference between the first and the second example:

        >>> make_text_prop(":testproperty", "first text")
                <text-prop name=":testproperty">
                    <text encoding="utf8" permissions="prop-default">
                        first text
                    </text>
                </text-prop>
        >>> make_text_prop(":testproperty", PropertyElement("first text", permissions="prop-restricted", encoding="xml"))
                <text-prop name=":testproperty">
                    <text encoding="xml" permissions="prop-restricted">
                        first text
                    </text>
                </text-prop>
    """
    value: Union[str, int, float, bool]
    permissions: str = "prop-default"
    comment: Optional[str] = None
    encoding: Optional[str] = None

    def __post_init__(self) -> None:
        """Validate the value and the encoding right after the dataclass has been initialised."""
        value = self.value
        if isinstance(value, int):
            # bool is a subclass of int, so this single check accepts booleans as well
            value_is_valid = True
        elif isinstance(value, float):
            # the NaN check is necessary because isinstance(np.nan, float) is True
            value_is_valid = bool(pd.notna(value))
        elif isinstance(value, str):
            # a usable string contains at least one letter, digit or underscore ...
            has_content = regex.search(r"\p{L}|\d|_", value, flags=regex.UNICODE)
            # ... and is not merely a placeholder for "no value"
            is_placeholder = regex.search(r"^(none|<NA>|-|n/a)$", value, flags=regex.IGNORECASE)
            value_is_valid = bool(has_content) and not is_placeholder
        else:
            value_is_valid = False

        if not value_is_valid:
            raise BaseError(f"'{self.value}' is not a valid value for a PropertyElement")
        if self.encoding not in ["utf8", "xml", None]:
            raise BaseError(f"'{self.encoding}' is not a valid encoding for a PropertyElement")
13 changes: 6 additions & 7 deletions knora/dsplib/schemas/properties-only.json
Expand Up @@ -67,17 +67,17 @@
"oneOf": [
{
"enum": [
"TextValue",
"BooleanValue",
"ColorValue",
"DateValue",
"DecimalValue",
"GeonameValue",
"IntValue",
"BooleanValue",
"TimeValue",
"UriValue",
"IntervalValue",
"ListValue",
"TextValue",
"TimeValue",
"UriValue",
"Resource",
"Representation"
]
Expand All @@ -96,11 +96,11 @@
"gui_element": {
"type": "string",
"enum": [
"Checkbox",
"Colorpicker",
"Date",
"Geonames",
"Interval",
"TimeStamp",
"List",
"Radio",
"Richtext",
Expand All @@ -109,8 +109,7 @@
"Slider",
"Spinbox",
"Textarea",
"Checkbox",
"Fileupload"
"TimeStamp"
]
},
"gui_attributes": {
Expand Down
47 changes: 14 additions & 33 deletions knora/dsplib/utils/excel_to_json_lists.py
Expand Up @@ -3,7 +3,6 @@
import json
import os
import re
import unicodedata
from typing import Any, Union, Optional, Tuple

import jsonschema
Expand All @@ -13,6 +12,7 @@
import regex

from knora.dsplib.models.helpers import BaseError
from knora.dsplib.utils.shared_methods import simplify_name
jnussbaum marked this conversation as resolved.
Show resolved Hide resolved

list_of_lists_of_previous_cell_values: list[list[str]] = []
"""Module level variable used to ensure that there are no duplicate node names"""
Expand Down Expand Up @@ -236,30 +236,6 @@ def _make_json_lists_from_excel(excel_file_paths: list[str], verbose: bool = Fal
return finished_lists


def simplify_name(value: str) -> str:
    """
    Simplifies a given value in order to use it as node name

    The value is converted to a lowercase string, Unicode-decomposed (NFKD), runs of forward slashes and
    whitespace are collapsed into a single dash, and finally every character that is not an ASCII letter,
    a digit or a dash is removed.

    Args:
        value: The value to be simplified

    Returns:
        str: The simplified value
    """
    # NOTE(review): the PR discussion attached to this function says it was moved to shared_methods.py
    simplified_value = str(value).lower()

    # normalize characters: NFKD splits e.g. "ä" into "a" + combining diaeresis; the combining mark is
    # dropped by the last substitution below, so that "ä" effectively becomes "a"
    simplified_value = unicodedata.normalize("NFKD", simplified_value)

    # replace forward slash and whitespace with a dash
    simplified_value = re.sub(r"[/\s]+", "-", simplified_value)

    # delete all characters which are not letters, numbers or dashes
    simplified_value = re.sub(r"[^A-Za-z0-9\-]+", "", simplified_value)

    return simplified_value


def validate_lists_section_with_schema(
path_to_json_project_file: Optional[str] = None,
lists_section: Optional[list[dict[str, Any]]] = None
Expand All @@ -273,7 +249,7 @@ def validate_lists_section_with_schema(
lists_section: the "lists" section as Python object

Returns:
True if the list passed validation. Otherwise, a BaseError with a detailed error report is raised
True if the "lists" section passed validation. Otherwise, a BaseError with a detailed error report is raised
"""
if bool(path_to_json_project_file) == bool(lists_section):
raise BaseError("Validation of the 'lists' section works only if exactly one of the two arguments is given.")
Expand All @@ -283,12 +259,15 @@ def validate_lists_section_with_schema(
if path_to_json_project_file:
with open(path_to_json_project_file) as f:
project = json.load(f)
lists_section = project["project"]["lists"]
lists_section = project["project"].get("lists")
if not lists_section:
raise BaseError(f"Cannot validate \"lists\" section of {path_to_json_project_file}, because there is "
f"no \"lists\" section in this file.")

try:
jsonschema.validate(instance={"lists": lists_section}, schema=lists_schema)
except jsonschema.exceptions.ValidationError as err:
raise BaseError(f'"Lists" section did not pass validation. The error message is: {err.message}\n'
raise BaseError(f'"lists" section did not pass validation. The error message is: {err.message}\n'
f'The error occurred at {err.json_path}')
return True

Expand Down Expand Up @@ -318,16 +297,16 @@ def _extract_excel_file_paths(excelfolder: str) -> list[str]:
return excel_file_paths


def list_excel2json(excelfolder: str, outfile: str) -> None:
def excel2lists(excelfolder: str, outfile: str) -> list[dict[str, Any]]:
"""
This method writes a JSON file with a "lists" section that can later be inserted into a JSON project file.
Converts lists described in Excel files into a "lists" section that can be inserted into a JSON project file.

Args:
excelfolder: path to the folder containing the Excel file(s)
outfile: path to the JSON file the output is written into

Returns:
None
the "lists" section as Python list
"""
excel_file_paths = _extract_excel_file_paths(excelfolder)
print("The following Excel files will be processed:")
Expand All @@ -336,5 +315,7 @@ def list_excel2json(excelfolder: str, outfile: str) -> None:
validate_lists_section_with_schema(lists_section=finished_lists)

with open(outfile, "w", encoding="utf-8") as fp:
json.dump({"lists": finished_lists}, fp, indent=4, sort_keys=False, ensure_ascii=False)
print("List was created successfully and written to file:", outfile)
json.dump({"lists": finished_lists}, fp, indent=4, ensure_ascii=False)
print('"lists" section was created successfully and written to file:', outfile)

return finished_lists
56 changes: 26 additions & 30 deletions knora/dsplib/utils/excel_to_json_properties.py
@@ -1,37 +1,32 @@
import json
import os
import re
from typing import Any

import jsonschema
import pandas as pd

from knora.dsplib.utils.excel_to_json_resources import prepare_dataframe
from knora.dsplib.models.helpers import BaseError
from knora.dsplib.utils.shared_methods import prepare_dataframe

languages = ["en", "de", "fr", "it", "rm"]


def _validate_properties_with_schema(properties_list: list[dict[str, Any]]) -> bool:
    """
    This function checks if the "properties" section of a JSON project file is valid according to the schema.

    Args:
        properties_list: the "properties" section of a JSON project as a list of dicts

    Returns:
        True if the "properties" section passed validation

    Raises:
        BaseError: with a detailed error report, if the "properties" section did not pass validation
    """
    # function-scope import, so that this block does not depend on the state of the module-level imports
    import os

    # Locate the schema relative to this module instead of the current working directory,
    # otherwise the validation only works when the tool is started from the repository root.
    current_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(current_dir, "../schemas/properties-only.json"), encoding="utf-8") as schema:
        properties_schema = json.load(schema)

    try:
        jsonschema.validate(instance=properties_list, schema=properties_schema)
    except jsonschema.exceptions.ValidationError as err:
        raise BaseError(f'"properties" section did not pass validation. The error message is: {err.message}\n'
                        f'The error occurred at {err.json_path}') from err
    return True


Expand All @@ -42,19 +37,19 @@ def _row2prop(row: pd.Series, row_count: int, excelfile: str) -> dict[str, Any]:
Args:
row: row from a pandas DataFrame that defines a property
row_count: row number of Excel file
excelfile: name of the original excel file
excelfile: name of the original Excel file

Returns:
dict object of the property
"""

# extract the elements that are necessary to build the property
name = row["name"]
supers = [s.strip() for s in row["super"].split(",")]
_object = row["object"]
labels = {lang: row[lang] for lang in languages if row.get(lang)}
comments = {lang: row[f"comment_{lang}"] for lang in languages if row.get(f"comment_{lang}")}
gui_element = row["gui_element"]

gui_attributes = dict()
if row.get("hlist"):
gui_attributes["hlist"] = row["hlist"]
Expand All @@ -71,12 +66,13 @@ def _row2prop(row: pd.Series, row_count: int, excelfile: str) -> dict[str, Any]:
val = int(val)
gui_attributes[attr] = val

# build the dict structure of this property and append it to the list of properties
# build the dict structure of this property
_property = {
"name": name,
"super": supers,
"object": _object,
"labels": labels}
"labels": labels
}
if comments:
_property["comments"] = comments
_property["gui_element"] = gui_element
Expand All @@ -86,16 +82,17 @@ def _row2prop(row: pd.Series, row_count: int, excelfile: str) -> dict[str, Any]:
return _property


def properties_excel2json(excelfile: str, outfile: str) -> None:
def excel2properties(excelfile: str, outfile: str) -> list[dict[str, Any]]:
"""
Converts properties described in an Excel file into a properties section which can be integrated into a DSP ontology
Converts properties described in an Excel file into a "properties" section which can be inserted into a JSON
project file.

Args:
excelfile: path to the Excel file containing the properties
outfile: path to the output JSON file containing the properties section for the ontology
outfile: path to the JSON file the output is written into

Returns:
None
the "properties" section as Python list
"""

# load file
Expand All @@ -109,10 +106,9 @@ def properties_excel2json(excelfile: str, outfile: str) -> None:
props = [_row2prop(row, i, excelfile) for i, row in df.iterrows()]

# write final list to JSON file if list passed validation
if _validate_properties_with_schema(json.loads(json.dumps(props, indent=4))):
with open(file=outfile, mode="w+", encoding="utf-8") as file:
file.write('"properties": ')
json.dump(props, file, indent=4)
print("Properties file was created successfully and written to file: ", outfile)
else:
print("Properties data is not valid according to schema.")
_validate_properties_with_schema(props)
with open(file=outfile, mode="w", encoding="utf-8") as file:
json.dump({"properties": props}, file, indent=4, ensure_ascii=False)
print('"properties" section was created successfully and written to file:', outfile)

return props