Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: xmlupload crashes without writing id2iri mapping (DEV-813) #194

Merged
Merged
Changes from 8 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
eaae265
catch BaseException when connection crashes, write mapping
jnussbaum May 25, 2022
923f7ce
cover more cases
jnussbaum May 25, 2022
e3f2be7
Merge branch 'main' into wip/dev-813-write-id2iri-mapping-when-xmlupl…
jnussbaum May 25, 2022
80b8564
continue
jnussbaum May 30, 2022
20eae27
Merge remote-tracking branch 'origin/wip/dev-813-write-id2iri-mapping…
jnussbaum May 30, 2022
30438aa
improve error handling
jnussbaum May 31, 2022
3ac9ee7
adapt same error handling to resource creation
jnussbaum May 31, 2022
e387588
adapt same error handling to both update functions
jnussbaum May 31, 2022
7e5dd46
Merge branch 'main' into wip/dev-813-write-id2iri-mapping-when-xmlupl…
jnussbaum Jun 1, 2022
018f456
remove salsah-link from testdata/test-id2iri-data.xml
jnussbaum Jun 1, 2022
77ca482
improve error catching
jnussbaum Jun 7, 2022
f5c453b
raise BaseError if salsah-link point to invalid internal ID, so that …
jnussbaum Jun 7, 2022
60c83bd
catch BaseError if sipi upload fails, so that this resource can be sk…
jnussbaum Jun 7, 2022
f759257
adapt same solution to both update methods
jnussbaum Jun 9, 2022
fa477c7
write both stashes into files if necessary
jnussbaum Jun 9, 2022
63792d8
fix typo
jnussbaum Jun 9, 2022
f440450
refactoring
jnussbaum Jun 9, 2022
63418bc
remove unused try-except clauses
jnussbaum Jun 9, 2022
ed13131
refactor
jnussbaum Jun 9, 2022
1ddf7a3
improve variable names
jnussbaum Jun 10, 2022
67a7382
catch BaseExceptions while updating the stashes
jnussbaum Jun 10, 2022
769b178
Merge branch 'main' into wip/dev-813-write-id2iri-mapping-when-xmlupl…
jnussbaum Jun 10, 2022
d534b0d
remove code smell
jnussbaum Jun 10, 2022
c69be05
replace dysfunctional MP4 testfile
jnussbaum Jun 10, 2022
47849d1
e2e test asserts that xmlupload ends without any unsuccessful uploads
jnussbaum Jun 10, 2022
7745c92
replace knora by DSP
jnussbaum Jun 13, 2022
61b5a64
instead of exit(1): handle_upload_error() re-raises the original error
jnussbaum Jun 13, 2022
6e84eed
rename 'update_stashed...()' to 'upload_stashed...()'
jnussbaum Jun 13, 2022
6d860cc
improve architecture of try_sipi_upload()
jnussbaum Jun 13, 2022
a16e328
correct typos
jnussbaum Jun 13, 2022
ef18641
postpone method-hiding to later
jnussbaum Jun 13, 2022
ffefa2a
correct typos in docstrings
jnussbaum Jun 13, 2022
857806c
- instead of exit(1), xml_upload() returns bool to show if everythin…
jnussbaum Jun 13, 2022
01fe081
sleep for 1,2,4,8,16 seconds instead of 5*1 second
jnussbaum Jun 13, 2022
62dba52
sleep longer
jnussbaum Jun 14, 2022
26da262
don't try to upload stash to a non-existing resource
jnussbaum Jun 14, 2022
8392c6f
refactor
jnussbaum Jun 14, 2022
e14a969
'upload' stash instead of 'update' stash
jnussbaum Jun 14, 2022
9d08042
improve terminal output and content of log files
jnussbaum Jun 14, 2022
618cfd5
refactor: put all network actions in a wrapper method
jnussbaum Jun 15, 2022
ba29da5
adapt tests
jnussbaum Jun 15, 2022
efdd4ae
apply reviewer's feedback
jnussbaum Jun 17, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
190 changes: 149 additions & 41 deletions knora/dsplib/utils/xml_upload.py
Expand Up @@ -5,12 +5,15 @@
import json
import os
import re
import time
import uuid
from datetime import datetime
from pathlib import Path
from typing import Optional, Union, cast, Tuple
from typing import Optional, Union, cast, Tuple, Any
from urllib.parse import quote_plus

import requests

from lxml import etree

from knora.dsplib.models.connection import Connection
Expand Down Expand Up @@ -767,7 +770,15 @@ def update_xml_texts(
"@context": context
}
jsondata = json.dumps(jsonobj, indent=4, separators=(',', ': '), cls=KnoraStandoffXmlEncoder)
new_value = con.put(path='/v2/values', jsondata=jsondata)
new_value = None
for _ in range(20):
try:
new_value = con.put(path='/v2/values', jsondata=jsondata)
break
except BaseException:
print(f'{datetime.now().isoformat()}: Try reconnecting to DSP server...')
time.sleep(1)
continue
jnussbaum marked this conversation as resolved.
Show resolved Hide resolved
if not new_value:
print(f'ERROR while updating the xml text of {link_prop.name} of resource {resource.id}')
elif verbose:
Expand Down Expand Up @@ -799,7 +810,15 @@ def update_resptr_props(
'@context': context
}
jsondata = json.dumps(jsonobj, indent=4, separators=(',', ': '))
new_value = con.post(path='/v2/values', jsondata=jsondata)
new_value = None
for _ in range(20):
try:
new_value = con.post(path='/v2/values', jsondata=jsondata)
break
except BaseException:
print(f'{datetime.now().isoformat()}: Try reconnecting to DSP server...')
time.sleep(1)
continue
if not new_value:
print(f'ERROR while updating the resptr prop of {link_prop.name} of resource {resource.id}')
elif verbose:
Expand Down Expand Up @@ -880,6 +899,9 @@ def xml_upload(input_file: str, server: str, user: str, password: str, imgdir: s
# temporarily remove circular references, but only if not an incremental upload
if not incremental:
resources, stashed_xml_texts, stashed_resptr_props = remove_circular_references(resources, verbose)
else:
stashed_xml_texts = dict()
stashed_resptr_props = dict()

sipi_server = Sipi(sipi, con.get_token())

Expand All @@ -897,8 +919,55 @@ def xml_upload(input_file: str, server: str, user: str, password: str, imgdir: s
resclass_name_2_type[res_class_name] = project.get_resclass_type(res_class_name)

res_iri_lookup: dict[str, str] = {}
failed_uploads: list[str] = []

try:
upload_resources(verbose, resources, imgdir, sipi_server, permissions_lookup,
resclass_name_2_type, res_iri_lookup, con, failed_uploads)
except BaseException as err:
handle_upload_error(err, input_file, res_iri_lookup, failed_uploads, stashed_xml_texts, stashed_resptr_props)
exit(1)
jnussbaum marked this conversation as resolved.
Show resolved Hide resolved

# update the resources with the stashed XML texts
if len(stashed_xml_texts) > 0:
jnussbaum marked this conversation as resolved.
Show resolved Hide resolved
update_stashed_xml_texts(verbose, res_iri_lookup, con, stashed_xml_texts)

failed_uploads = []
# update the resources with the stashed resptrs
if len(stashed_resptr_props) > 0:
update_stashed_resptr_props(verbose, res_iri_lookup, con, stashed_resptr_props)

write_id2iri_mapping(input_file, res_iri_lookup, datetime.now().strftime("%Y%m%d-%H%M%S"))
if failed_uploads:
print(f"Could not upload the following resources: {failed_uploads}")


def try_sipi_upload(sipi_server: Sipi, filepath: str) -> dict[Any, Any]:
    """
    Upload a file to SIPI, retrying when the upload raises an error.

    Up to 20 attempts are made, with a pause of one second after every failed
    attempt. If all of them fail, one last attempt is made whose exception is
    not caught, so that the caller gets to see the underlying error.

    Args:
        sipi_server: object whose upload_bitstream(filepath) performs the upload
        filepath: path of the file to upload

    Returns:
        the value returned by sipi_server.upload_bitstream(filepath)

    Raises:
        Exception: whatever the last, unguarded upload attempt raises
        KeyboardInterrupt, SystemExit: propagated immediately, never retried
    """
    for _ in range(20):
        try:
            # Return straight away: a successful upload must never be repeated,
            # even if the returned value happens to be falsy (e.g. an empty dict).
            return sipi_server.upload_bitstream(filepath)
        except Exception:
            # Only Exception is caught. KeyboardInterrupt and SystemExit derive
            # from BaseException and must abort the upload instead of being
            # swallowed by the retry loop.
            print(f'{datetime.now().isoformat()}: Try reconnecting to DSP server (SIPI)...')
            time.sleep(1)
    # Last attempt, deliberately unguarded: its error reaches the caller.
    return sipi_server.upload_bitstream(filepath)
jnussbaum marked this conversation as resolved.
Show resolved Hide resolved


def upload_resources(
verbose: bool,
resources: list[XMLResource],
imgdir: str,
sipi_server: Sipi,
permissions_lookup: dict[str, Permissions],
resclass_name_2_type: dict[str, type],
res_iri_lookup: dict[str, str],
con: Connection,
failed_uploads: list[str]
) -> None:
for resource in resources:
if verbose:
resource.print()
Expand All @@ -909,42 +978,49 @@ def xml_upload(input_file: str, server: str, user: str, password: str, imgdir: s

resource_bitstream = None
if resource.bitstream:
img = sipi_server.upload_bitstream(os.path.join(imgdir, resource.bitstream.value))
img = try_sipi_upload(sipi_server=sipi_server, filepath=os.path.join(imgdir, resource.bitstream.value))
internal_file_name_bitstream = img['uploadedFiles'][0]['internalFilename']
resource_bitstream = resource.get_bitstream(internal_file_name_bitstream, permissions_lookup)

permissions_tmp = permissions_lookup.get(resource.permissions)

try:
# create a resource instance (ResourceInstance) from the given resource in the XML (XMLResource)
resclass_type = resclass_name_2_type[resource.restype]
properties = resource.get_propvals(res_iri_lookup, permissions_lookup)
resclass_instance: ResourceInstance = resclass_type(
con=con,
label=resource.label,
iri=resource_iri,
permissions=permissions_tmp,
bitstream=resource_bitstream,
values=properties
)
resclass_instance = resclass_instance.create()
except BaseError as err:
print(f"ERROR while trying to create resource '{resource.label}' ({resource.id}). "
f"The error message was: {err.message}")
failed_uploads.append(resource.id)
continue
except Exception as exception:
print(f"EXCEPTION while trying to create resource '{resource.label}' ({resource.id}). "
f"The exception message was: {exception}")
resclass_instance = None

for _ in range(20):
try:
# create a resource instance (ResourceInstance) from the given resource in the XML (XMLResource)
resclass_type = resclass_name_2_type[resource.restype]
properties = resource.get_propvals(res_iri_lookup, permissions_lookup)
resclass_instance: ResourceInstance = resclass_type(
con=con,
label=resource.label,
iri=resource_iri,
permissions=permissions_tmp,
bitstream=resource_bitstream,
values=properties
)
resclass_instance = resclass_instance.create()
break
except BaseError:
print(f'{datetime.now().isoformat()}: Try reconnecting to DSP server...')
time.sleep(1)
continue

if not resclass_instance:
print(f"ERROR while trying to create resource '{resource.label}' ({resource.id}). ")
failed_uploads.append(resource.id)
continue
else:
res_iri_lookup[resource.id] = resclass_instance.iri
print(f"Created resource '{resclass_instance.label}' ({resource.id}) with IRI '{resclass_instance.iri}'")
jnussbaum marked this conversation as resolved.
Show resolved Hide resolved

res_iri_lookup[resource.id] = resclass_instance.iri
print(f"Created resource '{resclass_instance.label}' ({resource.id}) with IRI '{resclass_instance.iri}'")

# update the resources with the stashed XML texts
if len(stashed_xml_texts) > 0:
print('Update the stashed XML texts...')
def update_stashed_xml_texts(
verbose: bool,
res_iri_lookup: dict[str, str],
con: Connection,
stashed_xml_texts: dict[XMLResource, dict[XMLProperty, dict[str, KnoraStandoffXml]]]
) -> None:
print('Update the stashed XML texts...')
for resource, link_props in stashed_xml_texts.items():
print(f'Update XML text(s) of resource "{resource.id}"...')
res_iri = res_iri_lookup[resource.id]
Expand All @@ -964,9 +1040,14 @@ def xml_upload(input_file: str, server: str, user: str, password: str, imgdir: s
print(f'Exception while updating an XML text of resource "{resource.id}": {exception}')
continue

# update the resources with the stashed resptrs
if len(stashed_resptr_props) > 0:
print('Update the stashed resptrs...')

def update_stashed_resptr_props(
jnussbaum marked this conversation as resolved.
Show resolved Hide resolved
verbose: bool,
res_iri_lookup: dict[str, str],
con: Connection,
stashed_resptr_props: dict[XMLResource, dict[XMLProperty, list[str]]]
) -> None:
print('Update the stashed resptrs...')
for resource, prop_2_resptrs in stashed_resptr_props.items():
print(f'Update resptrs of resource "{resource.id}"...')

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be printed only if verbose=True? I'm not sure, but I feel that the output could get very long when there are lots of resptrs.

res_iri = res_iri_lookup[resource.id]
Expand All @@ -986,15 +1067,42 @@ def xml_upload(input_file: str, server: str, user: str, password: str, imgdir: s
print(f'Exception while updating an XML text of resource "{resource.id}": {exception}')
continue

# write mapping of internal IDs to IRIs to file with timestamp
timestamp_now = datetime.now()
timestamp_str = timestamp_now.strftime("%Y%m%d-%H%M%S")

def handle_upload_error(
    err: Any,
    input_file: str,
    res_iri_lookup: dict[str, str],
    failed_uploads: list[str],
    stashed_xml_texts: dict[XMLResource, dict[XMLProperty, dict[str, KnoraStandoffXml]]],
    stashed_resptr_props: dict[XMLResource, dict[XMLProperty, list[str]]]
) -> None:
    """
    Save the state of an aborted xmlupload, so that the information gathered so far is not lost.

    Prints the error, writes the id2iri mapping collected so far, and dumps the
    two stashes (if they are not empty) into timestamped text files in the
    current working directory. The files are opened in append mode. Finally,
    the resources that could not be uploaded are listed, if there are any.

    Args:
        err: the error that caused the abort (only used for the message)
        input_file: path of the XML file that was being uploaded
        res_iri_lookup: mapping from internal resource IDs to IRIs
        failed_uploads: IDs of the resources that could not be uploaded
        stashed_xml_texts: stashed XML texts, per resource and property
        stashed_resptr_props: stashed resptr values, per resource and property

    Returns:
        None
    """
    print(f'xmlupload must be aborted because of the following error: {err}')
    # One timestamp for all files, so that the files of one run belong together.
    timestamp_str = datetime.now().strftime("%Y%m%d-%H%M%S")
    write_id2iri_mapping(input_file, res_iri_lookup, timestamp_str)

    # The encoding is fixed to UTF-8: the stashed texts may contain characters
    # that the platform default encoding cannot represent, and this handler
    # must not fail with a UnicodeEncodeError while saving the state.
    if stashed_xml_texts:
        with open(f'stashed_xml_texts_{timestamp_str}.txt', 'a', encoding='utf-8') as f:
            for res, props in stashed_xml_texts.items():
                f.write(f'\n{res.id}\n---------\n')
                for prop, stashed_texts in props.items():
                    f.write(f'{prop.name}\n')
                    for name, standoff in stashed_texts.items():
                        # Every entry ends with a newline, so that consecutive
                        # entries do not run together on one line.
                        f.write(f'\t{name}: {standoff}\n')

    if stashed_resptr_props:
        with open(f'stashed_resptr_props_{timestamp_str}.txt', 'a', encoding='utf-8') as f:
            for res, props_ in stashed_resptr_props.items():
                f.write(f'\n{res.id}\n---------\n')
                for prop, stashed_props in props_.items():
                    # The trailing newline keeps the next property name on its own line.
                    f.write(f'{prop.name}\n\t{stashed_props}\n')

    if failed_uploads:
        print(f"Independently of this error, there were some resources that could not be uploaded: "
              f"{failed_uploads}")


def write_id2iri_mapping(input_file: str, res_iri_lookup: dict[str, str], timestamp_str: str) -> None:
    """
    Write the mapping of internal IDs to IRIs to a JSON file in the current working directory.

    The file is named "id2iri_<stem of input_file>_mapping_<timestamp_str>.json".
    The confirmation message is printed only after the file has been written.

    Args:
        input_file: path of the XML file; its stem becomes part of the file name
        res_iri_lookup: mapping from internal resource IDs to IRIs
        timestamp_str: timestamp that becomes part of the file name

    Returns:
        None

    Raises:
        TypeError: if res_iri_lookup cannot be serialized to JSON
        OSError: if the file cannot be written
    """
    xml_file_name = Path(input_file).stem
    res_iri_lookup_file = "id2iri_" + xml_file_name + "_mapping_" + timestamp_str + ".json"
    # Serialize before opening the file: a serialization error must neither
    # leave an empty mapping file behind nor be preceded by a success message.
    serialized_mapping = json.dumps(res_iri_lookup)
    with open(res_iri_lookup_file, "w", encoding="utf-8") as outfile:
        outfile.write(serialized_mapping)
    print(f"============\nThe mapping of internal IDs to IRIs was written to {res_iri_lookup_file}")

if failed_uploads:
print(f"Could not upload the following resources: {failed_uploads}")