Skip to content

Commit

Permalink
feat(xmlupload): use custom IRIs created from salsah ARKs for XML upl…
Browse files Browse the repository at this point in the history
…oad (DEV-179) (#147)

* add custom IRI and ARK to resource

* add creation of custom IRIs from salsah ARKs to XML upload

* use https for version 5 uuid

* validate salsah ARK before calculating UUID

* remove main methods from tests

* Update xml_upload.py

* add unittest

* refactor list from Excel documentation

* add documentation for ARK and IRI for XML upload

* improve code after review
  • Loading branch information
irinaschubert committed Jan 27, 2022
1 parent 853068d commit 873324a
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 21 deletions.
16 changes: 6 additions & 10 deletions docs/dsp-tools-create.md
Expand Up @@ -317,12 +317,8 @@ Example of a list:

#### Lists from Excel

A list can be directly imported from one or several Excel files. The Excel sheet must have the following format:

![img-list-example.png](assets/images/img-list-example.png)

If there are several languages, a separate Excel file for each language has to be provided. The folder with the Excel
file(s) can be directly referenced inside the list definition by defining it as new list node:
A list can be directly imported from one or several Excel files. The folder with the Excel file(s) can then directly
be referenced inside the list definition by defining it as new list node:

```json
{
Expand All @@ -341,12 +337,12 @@ file(s) can be directly referenced inside the list definition by defining it as
}
```

The nodes section must contain the field:
The `nodes` section has to contain the field:

- _folder_: Path to the folder where the Excel files are stored
- _folder_: Path to the folder containing the Excel files

Further details to this functionality can be found
[here](dsp-tools-excel#create-a-list-from-one-or-several-excel-files).
Further information about the expected format of the Excel lists and details on this functionality can be found
[here](./dsp-tools-excel.md#create-a-list-from-one-or-several-excel-files).

The `lists` element is optional. If not used, it should be omitted.

Expand Down
1 change: 1 addition & 0 deletions docs/dsp-tools-excel.md
Expand Up @@ -76,6 +76,7 @@ For further information about properties, see [here](./dsp-tools-create-ontologi

## Create a list from one or several Excel files


With dsp-tools a JSON list can be created from one or several Excel files. The list can then be inserted into a JSON
ontology and uploaded to a DSP server. The expected structure of the Excel files is described
[here](./dsp-tools-create.md#lists-from-excel). It is possible to create multilingual lists. In this case, a separate
Expand Down
3 changes: 3 additions & 0 deletions docs/dsp-tools-xmlupload.md
Expand Up @@ -179,6 +179,9 @@ A `<resource>` element contains all necessary information to create a resource.
- `id`: an arbitrary string that uniquely identifies the resource so that it can be referenced by other resources;
the ID is only used during the import process and is later replaced by the IRI used internally by DSP (required)
- `permissions`: a reference to a permission set; the permissions will be applied to the created resource (optional)
- `iri`: a custom IRI used when migrating existing resources (optional)
- `ark`: a version 0 ARK used when migrating existing resources from salsah.org to DSP (optional). `iri` and `ark`
should not be used in the same resource: if both are given, `ark` takes precedence and `iri` is ignored.

A complete `<resource>` element may look as follows:

Expand Down
5 changes: 4 additions & 1 deletion knora/dsplib/models/resource.py
Expand Up @@ -217,6 +217,9 @@ def fromJsonLdObj(self, con: Connection, jsonld_obj: Any) -> 'ResourceInstance':
def toJsonLdObj(self, action: Actions) -> Any:
tmp = {}
if action == Actions.Create:
# if a custom IRI is provided, use it
if self._iri:
tmp['@id'] = self._iri
tmp['@type'] = self.classname
tmp["knora-api:attachedToProject"] = {
"@id": self.project
Expand Down Expand Up @@ -335,7 +338,7 @@ def __init__(self,
con: Connection,
projident: str):
self._con = con
if re.match("^[0-9aAbBcCdDeEfF]{4}$", projident):
if re.match("^[0-9a-fA-F]{4}$", projident):
project = Project(con=self._con, shortcode=projident)
elif re.match("^[\\w-]+$", projident):
project = Project(con=self._con, shortname=projident)
Expand Down
2 changes: 2 additions & 0 deletions knora/dsplib/schemas/data.xsd
Expand Up @@ -409,7 +409,9 @@
<xs:attribute name="label" type="xs:string" use="required"/>
<xs:attribute name="restype" type="xs:string" use="required"/>
<xs:attribute name="id" type="xs:ID" use="required"/>
<xs:attribute name="iri" type="xs:string" use="optional"/>
<xs:attribute name="permissions" type="xs:NCName" use="optional"/>
<xs:attribute name="ark" type="xs:string" use="optional"/>
</xs:complexType>

<!-- data type for knora shortcode -->
Expand Down
79 changes: 69 additions & 10 deletions knora/dsplib/utils/xml_upload.py
@@ -1,8 +1,11 @@
"""
This module handles the import of XML data into the DSP platform.
"""
import base64
import json
import os
import re
import uuid
from datetime import datetime
from pathlib import Path
from typing import Optional, Union
Expand Down Expand Up @@ -229,6 +232,8 @@ class XMLResource:
"""Represents a resource in the XML used for data import"""

_id: str
_iri: Optional[str]
_ark: Optional[str]
_label: str
_restype: str
_permissions: Optional[str]
Expand All @@ -242,8 +247,13 @@ def __init__(self, node: etree.Element, default_ontology: Optional[str] = None)
Args:
node: The DOM node to be processed representing a resource (which is a child of the knora element)
default_ontology: The default ontology (given in the attribute default-ontology of the knora element)
Returns:
None
"""
self._id = node.attrib['id']
self._iri = node.attrib.get('iri')
self._ark = node.attrib.get('ark')
self._label = node.attrib['label']
# get the resource type which is in format namespace:resourcetype, p.ex. rosetta:Image
tmp_res_type = node.attrib['restype'].split(':')
Expand All @@ -255,11 +265,7 @@ def __init__(self, node: etree.Element, default_ontology: Optional[str] = None)
self._restype = default_ontology + ':' + tmp_res_type[1]
else:
self._restype = 'knora-admin:' + tmp_res_type[0]
permissions_tmp = node.attrib.get("permissions")
if permissions_tmp:
self._permissions = node.attrib['permissions']
else:
self._permissions = None
self._permissions = node.attrib.get("permissions")
self._bitstream = None
self._properties = []
for subnode in node:
Expand All @@ -277,6 +283,16 @@ def id(self) -> str:
"""The unique id of the resource"""
return self._id

@property
def iri(self) -> Optional[str]:
    """The custom IRI read from the `iri` attribute of the XML resource node (None if the attribute is absent)"""
    return self._iri

@property
def ark(self) -> Optional[str]:
    """The ARK read from the `ark` attribute of the XML resource node (None if the attribute is absent)"""
    return self._ark

@property
def label(self) -> str:
"""The label of the resource"""
Expand All @@ -288,7 +304,7 @@ def restype(self) -> str:
return self._restype

@property
def permissions(self) -> str:
def permissions(self) -> Optional[str]:
"""The reference to the permissions set for this resource"""
return self._permissions

Expand Down Expand Up @@ -323,8 +339,7 @@ def get_resptrs(self) -> list[str]:
resptrs.extend(value.resrefs)
return resptrs

def get_propvals(self, resiri_lookup: dict[str, str], permissions_lookup: dict[str, Permissions]) -> dict[
str, Permissions]:
def get_propvals(self, resiri_lookup: dict[str, str], permissions_lookup: dict[str, Permissions]) -> dict[str, Permissions]:
"""
Get a dictionary of the property names and their values belonging to a resource
Expand Down Expand Up @@ -371,8 +386,7 @@ def get_propvals(self, resiri_lookup: dict[str, str], permissions_lookup: dict[s
prop_data[prop.name] = vals if len(vals) > 1 else vals[0]
return prop_data

def get_bitstream(self, internal_file_name_bitstream: str, permissions_lookup: dict[str, Permissions]) -> Optional[
dict[str, Union[str, Permissions]]]:
def get_bitstream(self, internal_file_name_bitstream: str, permissions_lookup: dict[str, Permissions]) -> Optional[dict[str, Union[str, Permissions]]]:
"""
Get the bitstream object belonging to the resource
Expand Down Expand Up @@ -574,6 +588,46 @@ def validate_xml_against_schema(input_file: str, schema_file: str) -> bool:
return is_valid


def convert_ark_v0_to_resource_iri(ark: str) -> str:
    """
    Converts an ARK URL from salsah.org (ARK version 0) of the form ark:/72163/080c-779b9990a0c3f-6e to a DSP resource
    IRI of the form http://rdfh.ch/080C/Ef9heHjPWDS7dMR_gGax2Q

    This method is needed for the migration of projects from salsah.org to DSP. Resources need to be created with an
    existing ARK, so the IRI needs to be extracted from that ARK in order for the ARK URL to be still valid after the
    migration.

    Args:
        ark : an ARK version 0 of the form ark:/72163/080c-779b9990a0c3f-6e, '72163' being the Name Assigning Authority
            number, '080c' being the project shortcode, '779b9990a0c3f' being an ID derived from the object's Salsah ID
            and '6e' being check digits

    Returns:
        Resource IRI (str) of the form http://rdfh.ch/080C/Ef9heHjPWDS7dMR_gGax2Q

    Raises:
        BaseError: if the ARK does not consist of exactly three dash-separated parts, if there is no "/" in front of
            the project shortcode, if the project shortcode is not a 4-digit hexadecimal number or if the Salsah ID
            contains anything other than ASCII letters and digits
    """
    # create the DaSCH namespace to create version 5 UUIDs
    dasch_uuid_ns = uuid.uuid5(uuid.NAMESPACE_URL, "https://dasch.swiss")  # cace8b00-717e-50d5-bcb9-486f39d733a2

    # split the ARK into "<prefix>/<project shortcode>", "<salsah resource ID>" and "<check digits>"
    if ark.count("-") != 2:
        raise BaseError(f"while converting ARK '{ark}'. The ARK seems to be invalid")
    prefix_and_project, resource_id, _ = ark.split("-")

    # without a "/" the unpacking below would fail with a bare ValueError instead of a BaseError
    if "/" not in prefix_and_project:
        raise BaseError(f"while converting ARK '{ark}'. The ARK seems to be invalid")
    _, project_id = prefix_and_project.rsplit("/", 1)
    project_id = project_id.upper()

    # fullmatch (instead of match with "$") also rejects a trailing newline, which would otherwise end up in the IRI
    if not re.fullmatch("[0-9a-fA-F]{4}", project_id):
        raise BaseError(f"while converting ARK '{ark}'. Invalid project shortcode '{project_id}'")
    if not re.fullmatch("[0-9A-Za-z]+", resource_id):
        raise BaseError(f"while converting ARK '{ark}'. Invalid Salsah ID '{resource_id}'")

    # make a UUID v5 from the namespace created above (which is a UUID itself) and the resource ID and encode it to
    # URL-safe base64; the 16 UUID bytes always yield 22 characters followed by "==" padding, which is dropped
    resource_uuid = uuid.uuid5(dasch_uuid_ns, resource_id)
    dsp_uuid = base64.urlsafe_b64encode(resource_uuid.bytes).decode("utf-8").rstrip("=")

    # use the new UUID to create the resource IRI
    return "http://rdfh.ch/" + project_id + "/" + dsp_uuid


def xml_upload(input_file: str, server: str, user: str, password: str, imgdir: str, sipi: str, verbose: bool,
validate_only: bool, incremental: bool) -> None:
"""
Expand Down Expand Up @@ -670,6 +724,10 @@ def xml_upload(input_file: str, server: str, user: str, password: str, imgdir: s
if verbose:
resource.print()

resource_iri = resource.iri
if resource.ark:
resource_iri = convert_ark_v0_to_resource_iri(resource.ark)

resource_bitstream = None
if resource.bitstream:
img = sipi.upload_bitstream(os.path.join(imgdir, resource.bitstream.value))
Expand All @@ -682,6 +740,7 @@ def xml_upload(input_file: str, server: str, user: str, password: str, imgdir: s
# create a resource instance (ResourceInstance) from the given resource in the XML (XMLResource)
instance: ResourceInstance = res_classes[resource.restype](con=con,
label=resource.label,
iri=resource_iri,
permissions=permissions_tmp,
bitstream=resource_bitstream,
values=resource.get_propvals(res_iri_lookup,
Expand Down
9 changes: 9 additions & 0 deletions test/unittests/BUILD.bazel
Expand Up @@ -6,6 +6,15 @@ load("@knora_py_deps//:requirements.bzl", "requirement")

package(default_visibility = ["//visibility:public"])

py_test(
name = "test_convert_ark_v0_to_resource_iri",
srcs = ["test_convert_ark_v0_to_resource_iri.py"],
deps = [
"//knora/dsplib/utils:xml_upload",
"//knora/dsplib/models:helpers"
],
)

py_test(
name = "test_langstring",
srcs = ["test_langstring.py"],
Expand Down
34 changes: 34 additions & 0 deletions test/unittests/test_convert_ark_v0_to_resource_iri.py
@@ -0,0 +1,34 @@
"""Unit tests for ARK v0 conversion"""

import unittest

from knora.dsplib.models.helpers import BaseError
from knora.dsplib.utils.xml_upload import convert_ark_v0_to_resource_iri


class TestARKV02IRI(unittest.TestCase):
    """Unit tests for the conversion of salsah ARKs (version 0) to DSP resource IRIs"""

    def test_convert_ark_v0_to_resource_iri(self):
        """A valid ARK yields the expected IRI; every malformed ARK raises a BaseError with a specific message"""
        ark = "ark:/72163/080c-779b9990a0c3f-6e"
        iri = convert_ark_v0_to_resource_iri(ark)
        self.assertEqual("http://rdfh.ch/080C/Ef9heHjPWDS7dMR_gGax2Q", iri)

        # malformed ARK -> reason expected at the end of the error message
        invalid_arks = {
            "ark:/72163/080c-779b999-0a0c3f-6e": "The ARK seems to be invalid",
            "ark:/72163/080X-779b9990a0c3f-6e": "Invalid project shortcode '080X'",
            "ark:/72163/080c1-779b9990a0c3f-6e": "Invalid project shortcode '080C1'",
            "ark:/72163/080c-779b99+90a0c3f-6e": "Invalid Salsah ID '779b99+90a0c3f'",
        }
        for invalid_ark, reason in invalid_arks.items():
            # subTest reports each malformed ARK separately, so one failing case does not hide the others
            with self.subTest(ark=invalid_ark):
                with self.assertRaises(BaseError) as err:
                    convert_ark_v0_to_resource_iri(invalid_ark)
                self.assertEqual(err.exception.message, f"while converting ARK '{invalid_ark}'. {reason}")


if __name__ == '__main__':
unittest.main()

0 comments on commit 873324a

Please sign in to comment.