# onto_validate.py
import os
import re
from typing import Any, Union
import jsonschema
import json
import jsonpath_ng, jsonpath_ng.ext
import networkx as nx
from ..utils.expand_all_lists import expand_lists_from_excel
def validate_ontology(input_file_or_json: Union[str, dict[Any, Any], 'os.PathLike[Any]']) -> bool:
    """
    Validate an ontology against the knora schema.

    The data model is first passed to `expand_lists_from_excel`, and the result is written back to
    `data_model['project']['lists']`. If a dict was passed in, it is therefore modified in place.
    Afterwards, the data model is validated against the JSON schema `../schemas/ontology.json` (relative to
    this module), and finally the cardinalities of circular references are checked.

    Args:
        input_file_or_json: the ontology to be validated, either the path to a JSON file or an already
            parsed JSON object (dict)

    Returns:
        True if ontology passed validation, False otherwise

    Raises:
        SystemExit: with exit code 1, if the input is neither a dict nor the path to an existing file
    """
    if isinstance(input_file_or_json, dict):
        data_model: dict[Any, Any] = input_file_or_json
    elif os.path.isfile(input_file_or_json):
        # JSON is UTF-8 encoded; don't depend on the platform's default encoding
        with open(input_file_or_json, encoding='utf-8') as f:
            data_model = json.load(f)
    else:
        print('Input is not valid.')
        # `exit()` is only injected by the `site` module, `SystemExit` is always available
        raise SystemExit(1)

    # expand all lists referenced in the list section of the data model
    new_lists = expand_lists_from_excel(data_model)

    # add the newly created lists from Excel to the ontology
    data_model['project']['lists'] = new_lists

    # validate the data model against the schema
    current_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(current_dir, '../schemas/ontology.json'), encoding='utf-8') as s:
        schema = json.load(s)
    try:
        jsonschema.validate(instance=data_model, schema=schema)
    except jsonschema.exceptions.ValidationError as err:
        print(f'Data model did not pass validation. The error message is: {err.message}\n'
              f'The error occurred at {err.json_path}')
        return False

    # cardinalities check for circular references (prints its own error report on failure)
    if not check_cardinalities_of_circular_references(data_model):
        return False

    print('Data model is syntactically correct and passed validation.')
    return True
def check_cardinalities_of_circular_references(data_model: dict[Any, Any]) -> bool:
    """
    Check whether link properties (properties derived from hasLinkTo) that take part in a circular reference
    between resources all have the cardinality 0-1 or 0-n. This is required because during the xmlupload
    process, these values are temporarily removed.

    If problematic cardinalities are found, an error report listing the affected resource/property pairs is
    printed.

    Args:
        data_model: dictionary with a DSP project (as defined in a JSON ontology file)

    Returns:
        True if no circle was detected, or if all elements of all circles are of cardinality "0-1" or "0-n".
        False if there is a circle with at least one element that has a cardinality of "1" or "1-n".
    """
    offenders = identify_problematic_cardinalities(data_model, collect_link_properties(data_model))
    if not offenders:
        return True

    print('ERROR: Your ontology contains properties derived from "hasLinkTo" that allow circular references '
          'between resources. This is not a problem in itself, but if you try to upload data that actually '
          'contains circular references, these "hasLinkTo" properties will be temporarily removed from the '
          'affected resources. Therefore, it is necessary that all involved "hasLinkTo" properties have a '
          'cardinality of 0-1 or 0-n. \n'
          'Please make sure that the following properties have a cardinality of 0-1 or 0-n:')
    for resource_name, property_name in offenders:
        print(f'\t- Resource {resource_name}, property {property_name}')
    return False
def collect_link_properties(data_model: dict[Any, Any]) -> dict[str, list[str]]:
    """
    Map the properties derived from hasLinkTo to the resource classes they point to, for example:
    link_properties = {'rosetta:hasImage2D': ['rosetta:Image2D'], ...}

    Within every ontology, properties whose "super" contains one of the base link properties are collected,
    then (down to 5 inheritance levels) the properties whose "super" contains the name of an already
    collected property.

    Args:
        data_model: dictionary with a DSP project (as defined in a JSON ontology file)

    Returns:
        A dict that maps the fully qualified name of every link property ("onto:prop") to the list of fully
        qualified resource class names it can point to. If the object of a property is "Resource", the list
        contains the resource classes of all ontologies.
    """
    ontos = data_model['project']['ontologies']
    base_link_props = ('hasLinkTo', 'isPartOf', 'isRegionOf', 'isAnnotationOf')
    link_properties: dict[str, list[str]] = dict()

    for index, onto in enumerate(ontos):
        # Every ontology must start again from the base properties. Carrying over the (possibly empty) set of
        # children found in the previous ontology would hide the link properties of this ontology.
        parent_props = set(base_link_props)
        hasLinkTo_matches = list()
        # look for child-properties down to 5 inheritance levels that are derived from hasLinkTo-properties
        for _ in range(5):
            for parent_prop in parent_props:
                hasLinkTo_matches.extend(jsonpath_ng.ext.parse(
                    f'$.project.ontologies[{index}].properties[?super[*] == {parent_prop}]'
                ).find(data_model))
            # make the children from this round to the parents of the next round
            parent_props = {x.value['name'] for x in hasLinkTo_matches}

        for match in hasLinkTo_matches:
            prop = onto['name'] + ':' + match.value['name']
            target = match.value['object']
            if target != 'Resource':
                # make the target a fully qualified name (with the ontology's name prefixed)
                target = re.sub(r'^:([^:]+)$', f'{onto["name"]}:\\1', target)
            link_properties[prop] = [target]

    # in case the object of a property is "Resource", the link can point to any resource class
    resource_names_path = jsonpath_ng.ext.parse('$.resources[*].name')
    all_res_names: list[str] = [
        f'{onto["name"]}:{match.value}'
        for onto in ontos
        for match in resource_names_path.find(onto)
    ]
    # only values of existing keys are replaced, so the dict may be modified while iterating over it
    for prop, targ in link_properties.items():
        if 'Resource' in targ:
            # every property gets its own copy, so that modifying one list doesn't affect the others
            link_properties[prop] = list(all_res_names)

    return link_properties
def identify_problematic_cardinalities(data_model: dict[Any, Any], link_properties: dict[str, list[str]]) -> list[tuple[str, str]]:
    """
    Make an error list with all cardinalities that are part of a circle but have "1" or "1-n".

    Args:
        data_model: dictionary with a DSP project (as defined in a JSON ontology file)
        link_properties: maps fully qualified link property names to the fully qualified names of the
            resource classes they can point to, e.g. {'rosetta:hasImage2D': ['rosetta:Image2D'], ...}

    Returns:
        Sorted list of (resource name, property name) tuples, one for every link property that is part of a
        circle and whose cardinality is neither "0-1" nor "0-n". Empty if there is no such property.
    """
    # make 2 dicts of the following form:
    # dependencies = {'rosetta:Text': {'rosetta:hasImage2D': ['rosetta:Image2D'], ...}}
    # cardinalities = {'rosetta:Text': {'rosetta:hasImage2D': '0-1', ...}}
    dependencies: dict[str, dict[str, list[str]]] = dict()
    cardinalities: dict[str, dict[str, str]] = dict()
    for onto in data_model['project']['ontologies']:
        # ontologies without resources and resources without cardinalities simply contribute nothing
        for resource in onto.get('resources', []):
            resname: str = onto['name'] + ':' + resource['name']
            for card in resource.get('cardinalities', []):
                # make the cardinality a fully qualified name (with the ontology's name prefixed)
                cardname = re.sub(r'^(:?)([^:]+)$', f'{onto["name"]}:\\2', card['propname'])
                if cardname not in link_properties:
                    continue
                # Look out: `targets` must be a copy. Otherwise `dependencies[resname][cardname]` would point
                # to `link_properties[cardname]`, and the `extend()` below would modify `link_properties`!
                targets = list(link_properties[cardname])
                res_dependencies = dependencies.setdefault(resname, dict())
                res_cardinalities = cardinalities.setdefault(resname, dict())
                if cardname in res_dependencies:
                    # the same property is listed more than once: keep the first cardinality
                    res_dependencies[cardname].extend(targets)
                else:
                    res_dependencies[cardname] = targets
                    res_cardinalities[cardname] = card['cardinality']

    # transform the dependencies into a graph structure (the property name serves as edge key)
    graph = nx.MultiDiGraph()
    for start, cards in dependencies.items():
        for edge, targets in cards.items():
            for target in targets:
                graph.add_edge(start, target, edge)

    # find elements of circles that have a cardinality of "1" or "1-n"
    errors: set[tuple[str, str]] = set()
    for circle in nx.simple_cycles(graph):
        for index, resource in enumerate(circle):
            # the successor of the last element of a circle is its first element
            target = circle[(index + 1) % len(circle)]
            # check EVERY property that links `resource` to `target`, not only the last one: a resource may
            # have several link properties pointing to the same target, each with its own cardinality
            for prop, targets in dependencies[resource].items():
                if target in targets and cardinalities[resource][prop] not in ('0-1', '0-n'):
                    errors.add((resource, prop))

    # sort by resource and property, so that the order of the result is deterministic
    return sorted(errors)