Skip to content

Commit

Permalink
fix: xml validation (DEV-1360) (#230)
Browse files Browse the repository at this point in the history
  • Loading branch information
jnussbaum committed Sep 23, 2022
1 parent d2c2e08 commit 0b2bd40
Show file tree
Hide file tree
Showing 12 changed files with 414 additions and 224 deletions.
43 changes: 22 additions & 21 deletions knora/dsp_tools.py
Expand Up @@ -48,30 +48,29 @@ def program(user_args: list[str]) -> None:
subparsers = parser.add_subparsers(title='Subcommands', description='Valid subcommands are', help='sub-command help')

# create
parser_create = subparsers.add_parser('create', help='Upload an ontology and/or list(s) from a JSON file to the '
'DaSCH Service Platform')
parser_create = subparsers.add_parser('create', help='Upload a project and/or list(s) from a JSON project file to '
'the DaSCH Service Platform')
parser_create.set_defaults(action='create')
parser_create.add_argument('-s', '--server', type=str, default=default_localhost, help=url_text)
parser_create.add_argument('-u', '--user', default=default_user, help=username_text)
parser_create.add_argument('-p', '--password', default=default_pw, help=password_text)
parser_create.add_argument('-V', '--validate-only', action='store_true', help='Do only validation of JSON, no '
'upload of the ontology')
parser_create.add_argument('-V', '--validate-only', action='store_true', help='Only validate the project against '
'the JSON schema, without uploading it')
parser_create.add_argument('-l', '--lists-only', action='store_true', help='Upload only the list(s)')
parser_create.add_argument('-v', '--verbose', action='store_true', help=verbose_text)
parser_create.add_argument('-d', '--dump', action='store_true', help='dump test files for DSP-API requests')
parser_create.add_argument('datamodelfile', help='path to data model file')
parser_create.add_argument('projectfile', help='path to a JSON project file')

# get
parser_get = subparsers.add_parser('get', help='Get the ontology (data model) of a project from the DaSCH Service '
'Platform.')
parser_get = subparsers.add_parser('get', help='Get a project from the DaSCH Service Platform.')
parser_get.set_defaults(action='get')
parser_get.add_argument('-u', '--user', default=default_user, help=username_text)
parser_get.add_argument('-p', '--password', default=default_pw, help=password_text)
parser_get.add_argument('-s', '--server', type=str, default=default_localhost, help=url_text)
parser_get.add_argument('-P', '--project', type=str, help='Shortcode, shortname or iri of project', required=True)
parser_get.add_argument('-v', '--verbose', action='store_true', help=verbose_text)
parser_get.add_argument('datamodelfile', help='Path to the file the ontology should be written to',
default='onto.json')
parser_get.add_argument('projectfile', help='Path to the file the project should be written to',
default='project.json')

# xmlupload
parser_upload = subparsers.add_parser('xmlupload', help='Upload data from an XML file to the DaSCH Service Platform.')
Expand Down Expand Up @@ -101,7 +100,7 @@ def program(user_args: list[str]) -> None:

# excel2resources
parser_excel_resources = subparsers.add_parser('excel2resources', help='Create a JSON file from an Excel file '
'containing resources for a DSP ontology. ')
'containing resources for a DSP ontology. ')
parser_excel_resources.set_defaults(action='excel2resources')
parser_excel_resources.add_argument('excelfile', help='Path to the Excel file containing the resources',
default='resources.xlsx')
Expand Down Expand Up @@ -147,35 +146,37 @@ def program(user_args: list[str]) -> None:
if args.action == 'create':
if args.lists_only:
if args.validate_only:
validate_lists_section_with_schema(path_to_json_project_file=args.datamodelfile)
validate_lists_section_with_schema(path_to_json_project_file=args.projectfile)
print('"Lists" section of the JSON project file is syntactically correct and passed validation.')
exit(0)
else:
create_lists(input_file=args.datamodelfile,
create_lists(input_file=args.projectfile,
server=args.server,
user=args.user,
password=args.password,
dump=args.dump)
else:
if args.validate_only and validate_project(args.datamodelfile):
print('Data model is syntactically correct and passed validation.')
if args.validate_only:
validate_project(args.projectfile)
print('JSON project file is syntactically correct and passed validation.')
exit(0)
else:
create_project(input_file=args.datamodelfile,
create_project(input_file=args.projectfile,
server=args.server,
user_mail=args.user,
password=args.password,
verbose=args.verbose,
dump=args.dump if args.dump else False)
elif args.action == 'get':
get_ontology(project_identifier=args.project,
outfile=args.datamodelfile,
outfile=args.projectfile,
server=args.server,
user=args.user,
password=args.password,
verbose=args.verbose)
elif args.action == 'xmlupload':
if args.validate:
validate_xml_against_schema(input_file=args.xmlfile,
schema_file="knora/dsplib/schemas/data.xsd")
validate_xml_against_schema(input_file=args.xmlfile)
else:
xml_upload(input_file=args.xmlfile,
server=args.server,
Expand All @@ -187,13 +188,13 @@ def program(user_args: list[str]) -> None:
incremental=args.incremental)
elif args.action == 'excel2lists':
excel2lists(excelfolder=args.excelfolder,
outfile=args.outfile)
path_to_output_file=args.outfile)
elif args.action == 'excel2resources':
excel2resources(excelfile=args.excelfile,
outfile=args.outfile)
path_to_output_file=args.outfile)
elif args.action == 'excel2properties':
excel2properties(excelfile=args.excelfile,
outfile=args.outfile)
path_to_output_file=args.outfile)
elif args.action == 'id2iri':
id_to_iri(xml_file=args.xmlfile,
json_file=args.jsonfile,
Expand Down
9 changes: 7 additions & 2 deletions knora/dsplib/utils/onto_create_lists.py
Expand Up @@ -71,10 +71,15 @@ def create_lists(
dump: bool = False
) -> Tuple[dict[str, Any], bool]:
"""
This method uploads the "lists" section of a JSON project definition file to a DSP server. If the JSON project file
is still unparsed, this method parses it, expands the Excel sheets that are referenced, and validates it.
This method uploads the "lists" section of a JSON project definition file to a DSP server. The project must already
exist on the DSP server.
If the JSON project file is passed as "input_file", this method parses it, expands the Excel sheets that are
referenced, and validates it. If it is passed as "project_definition", these preliminary steps are not necessary.
The "lists" section of the parsed project definition is then uploaded to the DSP server. If a list with the same
name is already existing in this project on the DSP server, this list is skipped.
Returns a tuple consisting of a dict and a bool. The dict contains the IRIs of the created list nodes. If there are
no lists in the project definition, an empty dictionary is returned. The bool indicates if everything went smoothly
during the process. If a warning or error occurred (e.g. one of the lists already exists, or one of the nodes could
Expand Down
83 changes: 41 additions & 42 deletions knora/dsplib/utils/onto_validate.py
@@ -1,5 +1,5 @@
import os
import re
import regex
from typing import Any, Union
import jsonschema
import json
Expand All @@ -10,7 +10,7 @@


def validate_project(
input_file_or_json: Union[dict[str, Any], os.PathLike[Any]],
input_file_or_json: Union[dict[str, Any], str],
expand_lists: bool = True
) -> bool:
"""
Expand All @@ -28,32 +28,31 @@ def validate_project(
True if the project passed validation. Otherwise, a BaseError with a detailed error report is raised.
"""

if isinstance(input_file_or_json, dict):
if isinstance(input_file_or_json, dict) and "project" in input_file_or_json:
project_definition = input_file_or_json
elif os.path.isfile(input_file_or_json):
elif isinstance(input_file_or_json, str) and os.path.isfile(input_file_or_json) and regex.search(r"\.json$", input_file_or_json):
with open(input_file_or_json) as f:
project_json_str = f.read()
project_definition = json.loads(project_json_str)
project_definition = json.load(f)
else:
raise BaseError(f"Input '{input_file_or_json}' is neither a file path nor a JSON object.")

if expand_lists:
# expand all lists referenced in the "lists" section of the project definition, and add them to the project
# definition
new_lists, _ = expand_lists_from_excel(project_definition["project"].get("lists"))
new_lists, _ = expand_lists_from_excel(project_definition["project"].get("lists", []))
if new_lists:
project_definition['project']['lists'] = new_lists
project_definition["project"]["lists"] = new_lists

# validate the project definition against the schema
current_dir = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(current_dir, '../schemas/ontology.json')) as s:
with open(os.path.join(current_dir, "../schemas/ontology.json")) as s:
schema = json.load(s)
try:
jsonschema.validate(instance=project_definition, schema=schema)
except jsonschema.exceptions.ValidationError as err:
raise BaseError(f'The JSON project file cannot be created due to the following validation error: {err.message}.\n'
f'The error occurred at {err.json_path}:\n'
f'{err.instance}')
raise BaseError(f"The JSON project file cannot be created due to the following validation error: {err.message}.\n"
f"The error occurred at {err.json_path}:\n"
f"{err.instance}")

# cardinalities check for circular references
if _check_cardinalities_of_circular_references(project_definition):
Expand Down Expand Up @@ -81,53 +80,53 @@ def _check_cardinalities_of_circular_references(project_definition: dict[Any, An
return True
else:
error_message = \
'ERROR: Your ontology contains properties derived from "hasLinkTo" that allow circular references ' \
'between resources. This is not a problem in itself, but if you try to upload data that actually ' \
'contains circular references, these "hasLinkTo" properties will be temporarily removed from the ' \
'affected resources. Therefore, it is necessary that all involved "hasLinkTo" properties have a ' \
'cardinality of 0-1 or 0-n. \n' \
'Please make sure that the following properties have a cardinality of 0-1 or 0-n:'
"ERROR: Your ontology contains properties derived from 'hasLinkTo' that allow circular references " \
"between resources. This is not a problem in itself, but if you try to upload data that actually " \
"contains circular references, these 'hasLinkTo' properties will be temporarily removed from the " \
"affected resources. Therefore, it is necessary that all involved 'hasLinkTo' properties have a " \
"cardinality of 0-1 or 0-n. \n" \
"Please make sure that the following properties have a cardinality of 0-1 or 0-n:"
for error in errors:
error_message = error_message + f'\n\t- Resource {error[0]}, property {error[1]}'
error_message = f"{error_message}\n\t- Resource {error[0]}, property {error[1]}"
raise BaseError(error_message)


def _collect_link_properties(project_definition: dict[Any, Any]) -> dict[str, list[str]]:
    """
    Map the properties derived from hasLinkTo to the resource classes they point to, for example:
    link_properties = {"rosetta:hasImage2D": ["rosetta:Image2D"], ...}

    Args:
        project_definition: parsed JSON project definition; must contain "project" -> "ontologies"

    Returns:
        a dict that maps every link property (fully qualified as "ontoname:propname") to the list
        of resource classes (fully qualified) that this property may point to
    """
    ontos = project_definition["project"]["ontologies"]
    hasLinkTo_props = {"hasLinkTo", "isPartOf", "isRegionOf", "isAnnotationOf"}
    link_properties: dict[str, list[str]] = dict()
    for index, onto in enumerate(ontos):
        hasLinkTo_matches = list()
        # look for child-properties down to 5 inheritance levels that are derived from hasLinkTo-properties
        for _ in range(5):
            for hasLinkTo_prop in hasLinkTo_props:
                hasLinkTo_matches.extend(jsonpath_ng.ext.parse(
                    f"$.project.ontologies[{index}].properties[?super[*] == {hasLinkTo_prop}]"
                ).find(project_definition))
            # make the children from this iteration to the parents of the next iteration
            hasLinkTo_props = {x.value["name"] for x in hasLinkTo_matches}
        prop_obj_pair: dict[str, list[str]] = dict()
        for match in hasLinkTo_matches:
            prop = onto["name"] + ":" + match.value["name"]
            target = match.value["object"]
            if target != "Resource":
                # make the target a fully qualified name (with the ontology's name prefixed)
                target = regex.sub(r"^:([^:]+)$", f"{onto['name']}:\\1", target)
            prop_obj_pair[prop] = [target]
        link_properties.update(prop_obj_pair)

    # in case the object of a property is "Resource", the link can point to any resource class
    all_res_names: list[str] = list()
    for index, onto in enumerate(ontos):
        matches = jsonpath_ng.ext.parse("$.resources[*].name").find(onto)
        tmp = [f"{onto['name']}:{match.value}" for match in matches]
        all_res_names.extend(tmp)
    for prop, targ in link_properties.items():
        if "Resource" in targ:
            link_properties[prop] = all_res_names

    return link_properties
Expand All @@ -138,31 +137,31 @@ def _identify_problematic_cardinalities(project_definition: dict[Any, Any], link
make an error list with all cardinalities that are part of a circle but have a cardinality of "1" or "1-n"
"""
# make 2 dicts of the following form:
# dependencies = {'rosetta:Text': {'rosetta:hasImage2D': ['rosetta:Image2D'], ...}}
# cardinalities = {'rosetta:Text': {'rosetta:hasImage2D': '0-1', ...}}
# dependencies = {"rosetta:Text": {"rosetta:hasImage2D": ["rosetta:Image2D"], ...}}
# cardinalities = {"rosetta:Text": {"rosetta:hasImage2D": "0-1", ...}}
dependencies: dict[str, dict[str, list[str]]] = dict()
cardinalities: dict[str, dict[str, str]] = dict()
for onto in project_definition['project']['ontologies']:
for resource in onto['resources']:
resname: str = onto['name'] + ':' + resource['name']
for card in resource['cardinalities']:
for onto in project_definition["project"]["ontologies"]:
for resource in onto["resources"]:
resname: str = onto["name"] + ":" + resource["name"]
for card in resource["cardinalities"]:
# make the cardinality a fully qualified name (with the ontology's name prefixed)
cardname = re.sub(r'^(:?)([^:]+)$', f'{onto["name"]}:\\2', card['propname'])
cardname = regex.sub(r"^(:?)([^:]+)$", f"{onto['name']}:\\2", card["propname"])
if cardname in link_properties:
# Look out: if `targets` is created with `targets = link_properties[cardname]`, the ex-
# pression `dependencies[resname][cardname] = targets` causes `dependencies[resname][cardname]`
# to point to `link_properties[cardname]`. Due to that, the expression
# `dependencies[resname][cardname].extend(targets)` will modify 'link_properties'!
# `dependencies[resname][cardname].extend(targets)` will modify "link_properties"!
# For this reason, `targets` must be created with `targets = list(link_properties[cardname])`
targets = list(link_properties[cardname])
if resname not in dependencies:
dependencies[resname] = dict()
dependencies[resname][cardname] = targets
cardinalities[resname] = dict()
cardinalities[resname][cardname] = card['cardinality']
cardinalities[resname][cardname] = card["cardinality"]
elif cardname not in dependencies[resname]:
dependencies[resname][cardname] = targets
cardinalities[resname][cardname] = card['cardinality']
cardinalities[resname][cardname] = card["cardinality"]
else:
dependencies[resname][cardname].extend(targets)

Expand All @@ -182,7 +181,7 @@ def _identify_problematic_cardinalities(project_definition: dict[Any, Any], link
for property, targets in dependencies[resource].items():
if target in targets:
prop = property
if cardinalities[resource][prop] not in ['0-1', '0-n']:
if cardinalities[resource][prop] not in ["0-1", "0-n"]:
errors.add((resource, prop))

return sorted(errors, key=lambda x: x[0])
6 changes: 4 additions & 2 deletions knora/dsplib/utils/shared.py
Expand Up @@ -2,6 +2,7 @@
import unicodedata
import pandas as pd
import regex
import os
from lxml import etree
from requests import RequestException
from datetime import datetime
Expand Down Expand Up @@ -87,17 +88,18 @@ def try_network_action(
raise BaseError(failure_msg)


def validate_xml_against_schema(input_file: str, schema_file: str) -> bool:
def validate_xml_against_schema(input_file: str) -> bool:
"""
Validates an XML file against an XSD schema
Args:
input_file: the XML file to be validated
schema_file: the schema against which the XML file should be validated
Returns:
True if the XML file is valid. Otherwise, a BaseError with a detailed error log is raised
"""
current_dir = os.path.dirname(os.path.realpath(__file__))
schema_file = os.path.join(current_dir, "../schemas/data.xsd")
xmlschema = etree.XMLSchema(etree.parse(schema_file))
doc = etree.parse(input_file)

Expand Down
4 changes: 1 addition & 3 deletions knora/dsplib/utils/xml_upload.py
Expand Up @@ -241,10 +241,8 @@ def xml_upload(input_file: str, server: str, user: str, password: str, imgdir: s
"""

# Validate the input XML file
current_dir = os.path.dirname(os.path.realpath(__file__))
schema_file = os.path.join(current_dir, '../schemas/data.xsd')
try:
validate_xml_against_schema(input_file, schema_file)
validate_xml_against_schema(input_file)
except BaseError as err:
print(f"=====================================\n"
f"{err.message}")
Expand Down

0 comments on commit 0b2bd40

Please sign in to comment.