Skip to content

Commit

Permalink
refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
jnussbaum committed Jul 8, 2022
1 parent 84c07e0 commit 584cf55
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 48 deletions.
50 changes: 34 additions & 16 deletions knora/dsplib/utils/xml_upload.py
Expand Up @@ -211,6 +211,39 @@ def _convert_ark_v0_to_resource_iri(ark: str) -> str:
return "http://rdfh.ch/" + project_id + "/" + dsp_uuid


def parse_xml_file(input_file: str) -> etree.ElementTree:
    """
    Parse an XML file with DSP-conform data and normalize its element names.

    The namespace URI is removed from every element's name, and the special tags <annotation>, <region>, and <link>
    are transformed to their technically correct form <resource restype="Annotation">, <resource restype="Region">,
    and <resource restype="LinkObj">. Comments and processing instructions are left untouched. Finally, namespace
    declarations that are no longer used are removed from the tree.

    Args:
        input_file: path to the XML file

    Returns:
        the parsed etree.ElementTree
    """
    # special tags and the restype of the <resource> element they are shorthand for
    special_tags = {"annotation": "Annotation", "link": "LinkObj", "region": "Region"}

    tree = etree.parse(input_file)
    # iter() is the supported replacement for the deprecated getiterator()
    for elem in tree.iter():
        # comments and processing instructions do not have namespaces, so there is nothing to strip
        if isinstance(elem, (etree._Comment, etree._ProcessingInstruction)):
            continue

        # remove namespace URI in the element's name
        local_name = etree.QName(elem).localname
        restype = special_tags.get(local_name)
        if restype is None:
            elem.tag = local_name
        else:
            elem.attrib["restype"] = restype
            elem.tag = "resource"

    # remove unused namespace declarations
    etree.cleanup_namespaces(tree)

    return tree


def xml_upload(input_file: str, server: str, user: str, password: str, imgdir: str, sipi: str, verbose: bool,
validate_only: bool, incremental: bool) -> bool:
"""
Expand Down Expand Up @@ -249,22 +282,7 @@ def xml_upload(input_file: str, server: str, user: str, password: str, imgdir: s
proj_context = ProjectContext(con=con)
sipi_server = Sipi(sipi, con.get_token())

# parse the XML file
tree = etree.parse(input_file)
for elem in tree.getiterator():
if not (isinstance(elem, etree._Comment) or isinstance(elem, etree._ProcessingInstruction)):
elem.tag = etree.QName(elem).localname # remove namespace URI in the element's name
if elem.tag == "annotation":
elem.attrib["restype"] = "Annotation"
elem.tag = "resource"
if elem.tag == "link":
elem.attrib["restype"] = "LinkObj"
elem.tag = "resource"
if elem.tag == "region":
elem.attrib["restype"] = "Region"
elem.tag = "resource"
etree.cleanup_namespaces(tree) # remove unused namespace declarations

tree = parse_xml_file(input_file)
root = tree.getroot()
default_ontology = root.attrib['default-ontology']
shortcode = root.attrib['shortcode']
Expand Down
20 changes: 5 additions & 15 deletions test/unittests/test_id_to_iri.py
Expand Up @@ -3,8 +3,7 @@
import unittest
import os

from lxml import etree

from knora.dsplib.utils.xml_upload import parse_xml_file
from knora.dsplib.utils.id_to_iri import id_to_iri


Expand All @@ -15,7 +14,7 @@ def setUp(self) -> None:
"""Is executed before each test"""
os.makedirs('testdata/tmp', exist_ok=True)

def test_invalid_xml_file_name(self):
def test_invalid_xml_file_name(self) -> None:
with self.assertRaises(SystemExit) as cm:
id_to_iri(xml_file='test.xml',
json_file='testdata/test-id2iri-mapping.json',
Expand All @@ -24,7 +23,7 @@ def test_invalid_xml_file_name(self):

self.assertEqual(cm.exception.code, 1)

def test_invalid_json_file_name(self):
def test_invalid_json_file_name(self) -> None:
with self.assertRaises(SystemExit) as cm:
id_to_iri(xml_file='testdata/test-id2iri-data.xml',
json_file='test.json',
Expand All @@ -33,22 +32,13 @@ def test_invalid_json_file_name(self):

self.assertEqual(cm.exception.code, 1)

def test_replace_id_with_iri(self):
def test_replace_id_with_iri(self) -> None:
id_to_iri(xml_file='testdata/test-id2iri-data.xml',
json_file='testdata/test-id2iri-mapping.json',
out_file=self.out_file,
verbose=True)

tree = etree.parse(self.out_file)

for elem in tree.getiterator():
# skip comments and processing instructions as they do not have namespaces
if not (
isinstance(elem, etree._Comment)
or isinstance(elem, etree._ProcessingInstruction)
):
# remove namespace declarations
elem.tag = etree.QName(elem).localname
tree = parse_xml_file(self.out_file)

resource_elements = tree.xpath("/knora/resource/resptr-prop/resptr")
result = []
Expand Down
17 changes: 2 additions & 15 deletions test/unittests/test_xmlupload.py
Expand Up @@ -4,7 +4,7 @@
from lxml import etree

from knora.dsplib.models.helpers import BaseError
from knora.dsplib.utils.xml_upload import _convert_ark_v0_to_resource_iri, _remove_circular_references
from knora.dsplib.utils.xml_upload import _convert_ark_v0_to_resource_iri, _remove_circular_references, parse_xml_file
from knora.dsplib.models.xmlresource import XMLResource


Expand Down Expand Up @@ -34,20 +34,7 @@ def test_convert_ark_v0_to_resource_iri(self) -> None:

def test_remove_circular_references(self) -> None:
# create a list of XMLResources from the test data file
tree = etree.parse('testdata/test-data.xml')
for elem in tree.getiterator():
if not (isinstance(elem, etree._Comment) or isinstance(elem, etree._ProcessingInstruction)):
elem.tag = etree.QName(elem).localname # remove namespace URI in the element's name
if elem.tag == "annotation":
elem.attrib["restype"] = "Annotation"
elem.tag = "resource"
if elem.tag == "link":
elem.attrib["restype"] = "LinkObj"
elem.tag = "resource"
if elem.tag == "region":
elem.attrib["restype"] = "Region"
elem.tag = "resource"
etree.cleanup_namespaces(tree) # remove unused namespace declarations
tree = parse_xml_file('testdata/test-data.xml')
resources = [XMLResource(x, 'testonto') for x in tree.getroot() if x.tag == "resource"]

# get the purged resources and the stashes from the function to be tested
Expand Down
4 changes: 2 additions & 2 deletions testdata/test-id2iri-mapping.json
@@ -1,4 +1,4 @@
{
"obj_0001": "http://rdfh.ch/082E/ylRvrg7tQI6aVpcTJbVrwg",
"obj_0011": "http://rdfh.ch/082E/JK63OpYWTDWNYVOYFN7FdQ"
"test_thing_1": "http://rdfh.ch/082E/ylRvrg7tQI6aVpcTJbVrwg",
"test_thing_2": "http://rdfh.ch/082E/JK63OpYWTDWNYVOYFN7FdQ"
}

0 comments on commit 584cf55

Please sign in to comment.