Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
chore: tidy up excel2lists, excel2resources, excel2properties (DEV-1352
…) (#229)
  • Loading branch information
jnussbaum committed Sep 19, 2022
1 parent 21cc6bc commit d2c2e08
Show file tree
Hide file tree
Showing 33 changed files with 640 additions and 314 deletions.
3 changes: 2 additions & 1 deletion docs/dsp-tools-excel2xml.md
Expand Up @@ -63,7 +63,8 @@ For `make_boolean_prop(cell)`, the following formats are supported:
#### Check if a cell contains a usable value
The method `check_notna(cell)` checks whether a value is usable in the context of data archiving. A value is considered
usable if it is
- a number (integer or float, but not np.nan)

- a number (integer or float, but not numpy.nan)
- a boolean
- a string with at least one Unicode letter, underscore, or number, but not "None", "<NA>", "N/A", or "-"
- a PropertyElement whose "value" fulfills the above criteria
Expand Down
20 changes: 10 additions & 10 deletions knora/dsp_tools.py
Expand Up @@ -6,16 +6,16 @@
import sys
from importlib.metadata import version

from knora.dsplib.utils.excel_to_json_lists import list_excel2json, validate_lists_section_with_schema
from knora.dsplib.utils.excel_to_json_properties import properties_excel2json
from knora.dsplib.utils.excel_to_json_resources import resources_excel2json
from knora.dsplib.utils.excel_to_json_lists import excel2lists, validate_lists_section_with_schema
from knora.dsplib.utils.excel_to_json_properties import excel2properties
from knora.dsplib.utils.excel_to_json_resources import excel2resources
from knora.dsplib.utils.id_to_iri import id_to_iri
from knora.dsplib.utils.onto_create_lists import create_lists
from knora.dsplib.utils.onto_create_ontology import create_project
from knora.dsplib.utils.onto_get import get_ontology
from knora.dsplib.utils.onto_validate import validate_project
from knora.dsplib.utils.xml_upload import xml_upload
from knora.dsplib.utils.shared_methods import validate_xml_against_schema
from knora.dsplib.utils.shared import validate_xml_against_schema
from knora.excel2xml import excel2xml


Expand Down Expand Up @@ -186,14 +186,14 @@ def program(user_args: list[str]) -> None:
verbose=args.verbose,
incremental=args.incremental)
elif args.action == 'excel2lists':
list_excel2json(excelfolder=args.excelfolder,
outfile=args.outfile)
# excel2lists() names its optional output parameter "path_to_output_file"; passing "outfile" raises a TypeError
excel2lists(excelfolder=args.excelfolder,
            path_to_output_file=args.outfile)
elif args.action == 'excel2resources':
resources_excel2json(excelfile=args.excelfile,
outfile=args.outfile)
excel2resources(excelfile=args.excelfile,
outfile=args.outfile)
elif args.action == 'excel2properties':
properties_excel2json(excelfile=args.excelfile,
outfile=args.outfile)
# excel2properties() names its optional output parameter "path_to_output_file"; passing "outfile" raises a TypeError
excel2properties(excelfile=args.excelfile,
                 path_to_output_file=args.outfile)
elif args.action == 'id2iri':
id_to_iri(xml_file=args.xmlfile,
json_file=args.jsonfile,
Expand Down
54 changes: 54 additions & 0 deletions knora/dsplib/models/propertyelement.py
@@ -0,0 +1,54 @@
from typing import Union, Optional
import pandas as pd
import regex
from dataclasses import dataclass
from knora.dsplib.models.helpers import BaseError


@dataclass(frozen=True)
class PropertyElement:
    """
    Wrapper around a single property value that also carries the attributes of the tag it will be written to.

    Wherever a plain string/int/float/bool can be passed as value, a PropertyElement can be used instead, in order to
    define the permissions, the comment, and (for <text> tags) the encoding of the resulting tag.

    Args:
        value: content that will be written between the tags (for example between <text> and </text>)
        permissions: permissions of the tag (defaults to "prop-default")
        comment: comment of the tag
        encoding: for <text> tags only; must be "xml", "utf8", or None

    Raises:
        BaseError: if the value is not usable (an unsupported type, NaN, a string without any Unicode letter, digit or
            underscore, or one of the placeholders "none", "<NA>", "-", "n/a" in any capitalisation), or if the
            encoding is not one of the allowed values

    Examples:
        Compare the output for a plain string with the output for a PropertyElement:
        >>> make_text_prop(":testproperty", "first text")
                <text-prop name=":testproperty">
                    <text encoding="utf8" permissions="prop-default">
                        first text
                    </text>
                </text-prop>
        >>> make_text_prop(":testproperty", PropertyElement("first text", permissions="prop-restricted", encoding="xml"))
                <text-prop name=":testproperty">
                    <text encoding="xml" permissions="prop-restricted">
                        first text
                    </text>
                </text-prop>
    """
    value: Union[str, int, float, bool]
    permissions: str = "prop-default"
    comment: Optional[str] = None
    encoding: Optional[str] = None

    def __post_init__(self) -> None:
        candidate = self.value

        if isinstance(candidate, (int, bool)):
            usable = True
        elif isinstance(candidate, float):
            # NaN is a float as well, so it has to be excluded explicitly
            usable = bool(pd.notna(candidate))
        elif isinstance(candidate, str):
            # a usable string contains at least one letter/digit/underscore and is not a mere placeholder
            has_content = regex.search(r"\p{L}|\d|_", candidate, flags=regex.UNICODE) is not None
            is_placeholder = regex.search(r"^(none|<NA>|-|n/a)$", candidate, flags=regex.IGNORECASE) is not None
            usable = has_content and not is_placeholder
        else:
            usable = False

        if not usable:
            raise BaseError(f"'{self.value}' is not a valid value for a PropertyElement")

        if self.encoding not in ("utf8", "xml", None):
            raise BaseError(f"'{self.encoding}' is not a valid encoding for a PropertyElement")
13 changes: 6 additions & 7 deletions knora/dsplib/schemas/properties-only.json
Expand Up @@ -67,17 +67,17 @@
"oneOf": [
{
"enum": [
"TextValue",
"BooleanValue",
"ColorValue",
"DateValue",
"DecimalValue",
"GeonameValue",
"IntValue",
"BooleanValue",
"TimeValue",
"UriValue",
"IntervalValue",
"ListValue",
"TextValue",
"TimeValue",
"UriValue",
"Resource",
"Representation"
]
Expand All @@ -96,11 +96,11 @@
"gui_element": {
"type": "string",
"enum": [
"Checkbox",
"Colorpicker",
"Date",
"Geonames",
"Interval",
"TimeStamp",
"List",
"Radio",
"Richtext",
Expand All @@ -109,8 +109,7 @@
"Slider",
"Spinbox",
"Textarea",
"Checkbox",
"Fileupload"
"TimeStamp"
]
},
"gui_attributes": {
Expand Down
56 changes: 21 additions & 35 deletions knora/dsplib/utils/excel_to_json_lists.py
Expand Up @@ -3,7 +3,6 @@
import json
import os
import re
import unicodedata
from typing import Any, Union, Optional, Tuple

import jsonschema
Expand All @@ -13,6 +12,7 @@
import regex

from knora.dsplib.models.helpers import BaseError
from knora.dsplib.utils.shared import simplify_name

list_of_lists_of_previous_cell_values: list[list[str]] = []
"""Module level variable used to ensure that there are no duplicate node names"""
Expand Down Expand Up @@ -236,30 +236,6 @@ def _make_json_lists_from_excel(excel_file_paths: list[str], verbose: bool = Fal
return finished_lists


def simplify_name(value: str) -> str:
    """
    Derives a node name from an arbitrary value.

    The value is converted to a lower-case string and decomposed with Unicode NFKD normalization. Every run of
    forward slashes and/or whitespace is then collapsed into a single dash, and finally everything that is not an
    ASCII letter, a digit, or a dash is dropped (which also removes the combining marks produced by the
    decomposition, so that e.g. "ä" ends up as "a").

    Args:
        value: The value to be simplified

    Returns:
        str: The simplified value
    """
    # lower-case first, then split accented characters into base character + combining mark
    decomposed = unicodedata.normalize("NFKD", str(value).lower())

    # runs of slashes/whitespace become one dash
    dashed = re.sub("[/\\s]+", "-", decomposed)

    # strip whatever is left that is not a letter, a number, or a dash
    return re.sub("[^A-Za-z0-9\\-]+", "", dashed)


def validate_lists_section_with_schema(
path_to_json_project_file: Optional[str] = None,
lists_section: Optional[list[dict[str, Any]]] = None
Expand All @@ -273,7 +249,7 @@ def validate_lists_section_with_schema(
lists_section: the "lists" section as Python object
Returns:
True if the list passed validation. Otherwise, a BaseError with a detailed error report is raised
True if the "lists" section passed validation. Otherwise, a BaseError with a detailed error report is raised
"""
if bool(path_to_json_project_file) == bool(lists_section):
raise BaseError("Validation of the 'lists' section works only if exactly one of the two arguments is given.")
Expand All @@ -283,12 +259,15 @@ def validate_lists_section_with_schema(
if path_to_json_project_file:
with open(path_to_json_project_file) as f:
project = json.load(f)
lists_section = project["project"]["lists"]
lists_section = project["project"].get("lists")
if not lists_section:
raise BaseError(f"Cannot validate \"lists\" section of {path_to_json_project_file}, because there is "
f"no \"lists\" section in this file.")

try:
jsonschema.validate(instance={"lists": lists_section}, schema=lists_schema)
except jsonschema.exceptions.ValidationError as err:
raise BaseError(f'"Lists" section did not pass validation. The error message is: {err.message}\n'
raise BaseError(f'"lists" section did not pass validation. The error message is: {err.message}\n'
f'The error occurred at {err.json_path}')
return True

Expand Down Expand Up @@ -318,23 +297,30 @@ def _extract_excel_file_paths(excelfolder: str) -> list[str]:
return excel_file_paths


def excel2lists(excelfolder: str, path_to_output_file: Optional[str] = None) -> list[dict[str, Any]]:
    """
    Converts lists described in Excel files into a "lists" section that can be inserted into a JSON project file.

    Args:
        excelfolder: path to the folder containing the Excel file(s)
        path_to_output_file: if provided, the output is written into this JSON file

    Raises:
        BaseError: if the constructed "lists" section does not pass the schema validation

    Returns:
        the "lists" section as Python list
    """
    # read the data
    excel_file_paths = _extract_excel_file_paths(excelfolder)
    print("The following Excel files will be processed:")
    for filename in excel_file_paths:
        print(f" - {filename}")

    # construct the "lists" section, and make sure it is valid before anything is written or returned
    finished_lists = _make_json_lists_from_excel(excel_file_paths, verbose=True)
    validate_lists_section_with_schema(lists_section=finished_lists)

    # write final "lists" section (only if an output path was given)
    if path_to_output_file:
        with open(path_to_output_file, "w", encoding="utf-8") as fp:
            json.dump(finished_lists, fp, indent=4, ensure_ascii=False)
        print('"lists" section was created successfully and written to file:', path_to_output_file)

    return finished_lists
61 changes: 29 additions & 32 deletions knora/dsplib/utils/excel_to_json_properties.py
@@ -1,37 +1,32 @@
import json
import os
import re
from typing import Any

from typing import Any, Optional
import jsonschema
import pandas as pd

from knora.dsplib.utils.excel_to_json_resources import prepare_dataframe
from knora.dsplib.models.helpers import BaseError
from knora.dsplib.utils.shared import prepare_dataframe

languages = ["en", "de", "fr", "it", "rm"]


def _validate_properties_with_schema(properties_list: list[dict[str, Any]]) -> bool:
    """
    This function checks if the "properties" section of a JSON project file is valid according to the schema.

    Args:
        properties_list: the "properties" section of a JSON project as a list of dicts

    Raises:
        BaseError: if the "properties" section does not pass validation (with a detailed error report)

    Returns:
        True if the "properties" section passed validation
    """
    # imported locally so that this function does not depend on the module-level import list
    import os

    # locate the schema relative to this module instead of the current working directory,
    # so that the validation also works when the tool is invoked from outside the repository root
    current_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(current_dir, "../schemas/properties-only.json")) as schema:
        properties_schema = json.load(schema)

    try:
        jsonschema.validate(instance=properties_list, schema=properties_schema)
    except jsonschema.exceptions.ValidationError as err:
        raise BaseError(f'"properties" section did not pass validation. The error message is: {err.message}\n'
                        f'The error occurred at {err.json_path}')
    return True


Expand All @@ -42,19 +37,19 @@ def _row2prop(row: pd.Series, row_count: int, excelfile: str) -> dict[str, Any]:
Args:
row: row from a pandas DataFrame that defines a property
row_count: row number of Excel file
excelfile: name of the original excel file
excelfile: name of the original Excel file
Returns:
dict object of the property
"""

# extract the elements that are necessary to build the property
name = row["name"]
supers = [s.strip() for s in row["super"].split(",")]
_object = row["object"]
labels = {lang: row[lang] for lang in languages if row.get(lang)}
comments = {lang: row[f"comment_{lang}"] for lang in languages if row.get(f"comment_{lang}")}
gui_element = row["gui_element"]

gui_attributes = dict()
if row.get("hlist"):
gui_attributes["hlist"] = row["hlist"]
Expand All @@ -71,12 +66,13 @@ def _row2prop(row: pd.Series, row_count: int, excelfile: str) -> dict[str, Any]:
val = int(val)
gui_attributes[attr] = val

# build the dict structure of this property and append it to the list of properties
# build the dict structure of this property
_property = {
"name": name,
"super": supers,
"object": _object,
"labels": labels}
"labels": labels
}
if comments:
_property["comments"] = comments
_property["gui_element"] = gui_element
Expand All @@ -86,16 +82,17 @@ def _row2prop(row: pd.Series, row_count: int, excelfile: str) -> dict[str, Any]:
return _property


def properties_excel2json(excelfile: str, outfile: str) -> None:
def excel2properties(excelfile: str, path_to_output_file: Optional[str] = None) -> list[dict[str, Any]]:
"""
Converts properties described in an Excel file into a properties section which can be integrated into a DSP ontology
Converts properties described in an Excel file into a "properties" section which can be inserted into a JSON
project file.
Args:
excelfile: path to the Excel file containing the properties
outfile: path to the output JSON file containing the properties section for the ontology
path_to_output_file: if provided, the output is written into this JSON file
Returns:
None
the "properties" section as Python list
"""

# load file
Expand All @@ -107,12 +104,12 @@ def properties_excel2json(excelfile: str, outfile: str) -> None:

# transform every row into a property
props = [_row2prop(row, i, excelfile) for i, row in df.iterrows()]
_validate_properties_with_schema(props)

# write final JSON file
if path_to_output_file:
with open(file=path_to_output_file, mode="w", encoding="utf-8") as file:
json.dump(props, file, indent=4, ensure_ascii=False)
print('"properties" section was created successfully and written to file:', path_to_output_file)

# write final list to JSON file if list passed validation
if _validate_properties_with_schema(json.loads(json.dumps(props, indent=4))):
with open(file=outfile, mode="w+", encoding="utf-8") as file:
file.write('"properties": ')
json.dump(props, file, indent=4)
print("Properties file was created successfully and written to file: ", outfile)
else:
print("Properties data is not valid according to schema.")
return props

0 comments on commit d2c2e08

Please sign in to comment.