diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..c3ee74dc0 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "knora/dsplib/import_scripts"] + path = knora/dsplib/import_scripts + url = https://github.com/dasch-swiss/0123-import-scripts diff --git a/docs/assets/images/img-excel2xml-raw-data-category.png b/docs/assets/images/img-excel2xml-raw-data-category.png new file mode 100644 index 000000000..d9b40d47d Binary files /dev/null and b/docs/assets/images/img-excel2xml-raw-data-category.png differ diff --git a/docs/assets/templates/excel2xml_sample_data.csv b/docs/assets/templates/excel2xml_sample_data.csv deleted file mode 100644 index c199d22d0..000000000 --- a/docs/assets/templates/excel2xml_sample_data.csv +++ /dev/null @@ -1,10 +0,0 @@ -Resource identifier,Resource name,Long text,Image,Category,Complete?,Color,Date discovered,Exact time,Weight (kg),Find location,Number of descendants,Similar to,See also -res_0,First Resource,"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.",testdata/bitstreams/test.jpg,"Mamals, Insects",yes,#00ff66,01.01.01,2019-10-23T13:45:12Z,,2761369,0,res_1,http://test.org -res_1,Second Resource,"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.",,Birds,No,#ff00ff,2015_01_01,2019-10-23T13:45:12Z,45.8,2761369,12,res_8, -res_2,Third Resource,"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.",testdata/bitstreams/test.jpg, Reptilles ,,,05.11.21,,,,3,, -res_3,Fourth Resource,"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.",,Plants,0,,1.12.1973 - 6.1.1974,,,,2,, -res_4,Fifth Resource,"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.",,Physics,TRUE,,"March 5,1908",,,,1,, -res_5,Sixth Resource,"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.",,"Humans, Animals",FALSE,,1886/7,2009-10-10T12:00:00-05:00,200.382,,3,, -res_6,Seventh Resource,"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.",,Amphibians,None,,1849/1850,,,,99,, -res_7,Eighth Resource,"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.",testdata/bitstreams/test.jpg,Physics,,#ff00ff,01.01.01,,,,6,, -res_8,Ninth Resource,"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.",,Plants,1,,26.2.-24.3.1948,,,,7,, \ No newline at end of file diff --git a/docs/assets/templates/excel2xml_sample_onto.json b/docs/assets/templates/excel2xml_sample_onto.json deleted file mode 100644 index e35bc4802..000000000 --- a/docs/assets/templates/excel2xml_sample_onto.json +++ /dev/null @@ -1,1526 +0,0 @@ -{ - "prefixes": { - "dcterms": "http://purl.org/dc/terms/" - }, - "$schema": "https://raw.githubusercontent.com/dasch-swiss/dsp-tools/main/knora/dsplib/schemas/ontology.json", - "project": { - "shortcode": "082E", - "shortname": "rosetta", - "longname": "Rosetta: DSP example project", - "descriptions": { - "de": "Rosetta ist das Beispielprojekt für die DaSCH Service Platform. Es soll einerseits die Möglichkeiten illustrieren, die die Plattform aktuell bietet, andererseits aber auch intern aufzeigen, wo noch Verbesserungsbedarf besteht.", - "fr": "Rosetta est le projet exemplaire de la DaSCH Service Platform. D'une part, il vise à illustrer les possibilités actuellement offertes par la plate-forme, mais d'autre part, il montre également en interne les domaines où il est encore possible d'apporter des améliorations.", - "en": "Rosetta is the sample project for the DaSCH Service Platform. On one hand, it is intended to illustrate the possibilities currently offered by the platform, but on the other hand, it also shows internally where there is still room for improvement." - }, - "keywords": [ - "Textquellen", - "Objekte", - "Bilder", - "Audio", - "Kyrillisch", - "Griechisch", - "Keilschrift", - "Hieroglyphen", - "Hebräisch", - "Arabisch", - "Japanisch", - "Sonderzeichen", - "XML", - "Markup", - "Annotation", - "Texteigenschaften", - "textual sources", - "objects", - "images", - "audio", - "Cyrillic", - "Greek", - "cuneiform", - "hieroglyphs", - "Hebrew", - "Arabic", - "Japanese", - "special characters", - "textual properties", - "sources", - "objets", - "Cyrillique", - "Grec", - "cunéiforme", - "hiéroglyphes", - "Hébreu", - "Arabe", - "Japonais", - "caractères spéciaux", - "propriétés de texte", - "Data and Service Center for the Humanities (DaSCH)" - ], - "lists": [ - { - "name": "flatlist", - "labels": { - "en": "Flat list" - }, - "comments": { - "en": "Flat list" - }, - "nodes": [ - { - "name": "first-node", - "labels": { - "en": "First node" - } - }, - { - "name": "second-node", - "labels": { - "en": "Second node" - } - } - ] - }, - { - "name": "category", - "labels": { - "de": "Kategorie", - "en": "Category", - "fr": "Catégorie" - }, - "comments": { - "en": "A list containing categories", - "de": "Die Liste enthält Kategorien" - }, - "nodes": [ - { - "name": "artwork", - "labels": { - "de": "Kunstwerk", - "en": "Artwork", - "fr": "Oeuvre d'art" - } - }, - { - "name": "vehicles", - "labels": { - "de": "Fahrzeuge", - "en": "Vehicles", - "fr": "Véhicules" - } - }, - { - "name": "nature", - "labels": { - "de": "Natur", - "en": "Nature", - "fr": "Nature" - }, - "nodes": [ - { - "name": "humans", - "labels": { - "de": "Menschen", - "en": "Humans", - "fr": "Humains" - } - }, - { - "name": "animals", - "labels": { - "de": "Tiere", - "en": "Animals", - "fr": "Animaux" - }, - "nodes": [ - { - "name": "mammals", - "labels": { - "de": "Säugetiere", - "en": "Mammals", - "fr": "Mammifères" - } - }, - { - "name": "insects", - "labels": { - "de": "Insekten", - "en": "Insects", - "fr": "Insectes" - } - }, - { - "name": "birds", - "labels": { - "de": "Vögel", - "en": "Birds", - "fr": "Oiseaux" - } - }, - { - "name": "amphibians", - "labels": { - "de": "Amphibien", - "en": "Amphibians", - "fr": "Amphibiens" - } - }, - { - "name": "reptiles", - "labels": { - "de": "Reptilien", - "en": "Reptiles", - "fr": "Reptiles" - } - } - ] - }, - { - "name": "plants", - "labels": { - "de": "Pflanzen", - "en": "Plants", - "fr": "Plantes" - } - }, - { - "name": "weather", - "labels": { - "de": "Wetter", - "en": "Weather", - "fr": "Météo" - } - }, - { - "name": "physics", - "labels": { - "de": "Physik", - "en": "Physics", - "fr": "Physique" - } - } - ] - } - ] - } - ], - "groups": [ - { - "name": "rosetta-editors", - "descriptions": { - "en": "Editors for the rosetta-project" - }, - "selfjoin": false, - "status": true - } - ], - "users": [ - { - "username": "rosettaedit", - "email": "rosettaedit@test.org", - "givenName": "rosetta-given", - "familyName": "rosetta-family", - "password": "rosetta1234", - "lang": "de", - "groups": [ - ":rosetta-editors" - ], - "projects": [ - ":admin" - ] - } - ], - "ontologies": [ - { - "name": "rosetta", - "label": "rosetta", - "properties": [ - { - "name": "hasTime", - "super": [ - "hasValue" - ], - "object": "TimeValue", - "labels": { - "de": "Zeit", - "en": "Time", - "fr": "Heure" - }, - "gui_element": "TimeStamp" - }, - { - "name": "hasTextMedium", - "super": [ - "hasLinkTo" - ], - "object": "StillImageRepresentation", - "labels": { - "de": "Textträger", - "en": "Text medium", - "fr": "Support de texte" - }, - "gui_element": "Searchbox", - "gui_attributes": { - "numprops": 1 - } - }, - { - "name": "hasImage", - "super": [ - "hasLinkTo" - ], - "object": ":Image2D", - "labels": { - "de": "Bild", - "en": "Image", - "fr": "Image" - }, - "gui_element": "Searchbox", - "gui_attributes": { - "numprops": 1 - } - }, - { - "name": "hasOriginalText", - "super": [ - "hasValue" - ], - "object": "TextValue", - "labels": { - "de": "Text", - "en": "Text", - "fr": "Texte" - }, - "gui_element": "Richtext" - }, - { - "name": "hasTranscription", - "super": [ - "hasValue" - ], - "object": "TextValue", - "labels": { - "de": "Transkription", - "en": "Transcription", - "fr": "Transcription" - }, - "gui_element": "Richtext" - }, - { - "name": "hasTransliteration", - "super": [ - "hasValue" - ], - "object": "TextValue", - "labels": { - "de": "Transliteration", - "en": "Transliteration", - "fr": "Translittération" - }, - "gui_element": "Richtext" - }, - { - "name": "hasTranslation", - "super": [ - "hasValue" - ], - "object": "TextValue", - "labels": { - "de": "Übersetzung", - "en": "Translation", - "fr": "Traduction" - }, - "gui_element": "Richtext" - }, - { - "name": "hasDescription", - "super": [ - "hasValue", - "dcterms:description" - ], - "object": "TextValue", - "labels": { - "de": "Beschreibung", - "en": "Description", - "fr": "Description" - }, - "gui_element": "Richtext" - }, - { - "name": "hasAuthor", - "super": [ - "hasLinkTo", - "dcterms:creator" - ], - "object": ":Person", - "labels": { - "de": "Autor", - "en": "Author", - "fr": "Auteur" - }, - "gui_element": "Searchbox", - "gui_attributes": { - "numprops": 1 - } - }, - { - "name": "hasName", - "super": [ - "hasValue" - ], - "object": "TextValue", - "labels": { - "de": "Name", - "en": "Name", - "fr": "Nom" - }, - "gui_element": "SimpleText", - "gui_attributes": { - "maxlength": 128, - "size": 32 - } - }, - { - "name": "hasTitle", - "super": [ - "hasValue", - "dcterms:title" - ], - "object": "TextValue", - "labels": { - "de": "Titel", - "en": "Title", - "fr": "Titre" - }, - "gui_element": "SimpleText", - "gui_attributes": { - "maxlength": 255, - "size": 80 - } - }, - { - "name": "hasDate", - "super": [ - "hasValue" - ], - "object": "DateValue", - "labels": { - "de": "Datierung", - "en": "Dating", - "fr": "Datation" - }, - "gui_element": "Date" - }, - { - "name": "hasFindspot", - "super": [ - "hasValue" - ], - "object": "GeonameValue", - "labels": { - "de": "Fundort", - "en": "Find spot", - "fr": "Gisement" - }, - "gui_element": "Geonames" - }, - { - "name": "hasLocation", - "super": [ - "hasValue" - ], - "object": "GeonameValue", - "labels": { - "de": "Ort", - "en": "Location", - "fr": "Lieu" - }, - "gui_element": "Geonames" - }, - { - "name": "hasBibliographicReference", - "super": [ - "hasValue" - ], - "object": "TextValue", - "labels": { - "de": "Literatur", - "en": "Literature", - "fr": "Littérature" - }, - "gui_element": "SimpleText", - "gui_attributes": { - "maxlength": 128, - "size": 128 - } - }, - { - "name": "hasExternalLink", - "super": [ - "hasValue" - ], - "object": "UriValue", - "labels": { - "de": "Externer Link", - "en": "External link", - "fr": "Lien hypertexte externe" - }, - "gui_element": "SimpleText", - "gui_attributes": { - "maxlength": 128, - "size": 128 - } - }, - { - "name": "hasIdentifier", - "super": [ - "hasValue" - ], - "object": "UriValue", - "labels": { - "de": "GND", - "en": "GND", - "fr": "GND" - }, - "gui_element": "SimpleText", - "gui_attributes": { - "maxlength": 128, - "size": 128 - } - }, - { - "name": "hasRelatedArtwork", - "super": [ - "hasLinkTo" - ], - "object": "Resource", - "labels": { - "de": "Verwandtes Werk", - "en": "Related Artwork", - "fr": "Oeuvre reliée" - }, - "gui_element": "Searchbox", - "gui_attributes": { - "numprops": 1 - } - }, - { - "name": "hasCreator", - "super": [ - "hasLinkTo", - "dcterms:creator" - ], - "object": ":Person", - "labels": { - "de": "Künstler", - "en": "Artist", - "fr": "Artiste" - }, - "gui_element": "Searchbox" - }, - { - "name": "inInstitution", - "super": [ - "hasLinkTo" - ], - "object": ":Institution", - "labels": { - "de": "Aufbewahrende Institution", - "en": "Custodian institution", - "fr": "Institution de garde" - }, - "gui_element": "Searchbox", - "gui_attributes": { - "numprops": 1 - } - }, - { - "name": "hasInventoryNumber", - "super": [ - "hasValue" - ], - "object": "TextValue", - "labels": { - "de": "Inventarnummer", - "en": "Inventory number", - "fr": "Numéro d'inventaire" - }, - "gui_element": "SimpleText", - "gui_attributes": { - "maxlength": 80, - "size": 25 - } - }, - { - "name": "hasPagenum", - "super": [ - "seqnum" - ], - "object": "IntValue", - "labels": { - "de": "Seitenzahl", - "en": "Page number", - "fr": "Numéro de page" - }, - "gui_element": "SimpleText", - "gui_attributes": { - "maxlength": 8, - "size": 8 - } - }, - { - "name": "partOf", - "super": [ - "isPartOf" - ], - "object": ":Book", - "labels": { - "de": "ist Teil von", - "en": "is part of", - "fr": "fait partie de" - }, - "gui_element": "Searchbox", - "gui_attributes": { - "numprops": 1 - } - }, - { - "name": "hasCategory", - "super": [ - "hasValue" - ], - "object": "ListValue", - "labels": { - "de": "Kategorie", - "en": "Category", - "fr": "Catégorie" - }, - "gui_element": "List", - "gui_attributes": { - "hlist": "category" - } - }, - { - "name": "hasFlatList", - "super": [ - "hasValue" - ], - "object": "ListValue", - "labels": { - "de": "Flatlist", - "en": "Flatlist", - "fr": "Flatlist" - }, - "gui_element": "Radio", - "gui_attributes": { - "hlist": "flatlist" - } - }, - { - "name": "hasColor", - "super": [ - "hasColor" - ], - "object": "ColorValue", - "labels": { - "de": "Farbe", - "en": "Colour", - "fr": "Couleur" - }, - "gui_element": "Colorpicker" - }, - { - "name": "hasCopyright", - "super": [ - "hasValue" - ], - "object": "TextValue", - "labels": { - "de": "Copyright", - "en": "Copyright", - "fr": "Droit d'auteur" - }, - "gui_element": "SimpleText", - "gui_attributes": { - "maxlength": 128, - "size": 64 - } - }, - { - "name": "isPublic", - "super": [ - "hasValue" - ], - "object": "BooleanValue", - "labels": { - "de": "Öffentlich", - "en": "Public", - "fr": "Public" - }, - "gui_element": "Checkbox" - }, - { - "name": "hasChildren", - "super": [ - "hasValue" - ], - "object": "IntValue", - "labels": { - "de": "Anzahl der Kinder", - "en": "Number of children", - "fr": "Nombre d'enfants" - }, - "gui_element": "Spinbox", - "gui_attributes": { - "max": 25.0, - "min": 0.0 - } - }, - { - "name": "hasWeight", - "super": [ - "hasValue" - ], - "object": "DecimalValue", - "labels": { - "de": "Gewicht", - "en": "Weight", - "fr": "Poids" - }, - "gui_element": "SimpleText", - "gui_attributes": { - "maxlength": 255, - "size": 80 - } - }, - { - "name": "hasFiles", - "super": [ - "hasLinkTo" - ], - "object": ":Document", - "labels": { - "de": "Modell-Dateien", - "en": "Files belonging to the model", - "fr": "Fichiers appartenants au modèle" - }, - "gui_element": "Searchbox" - }, - { - "name": "linksToRegion", - "super": [ - "hasLinkTo" - ], - "object": "Region", - "labels": { - "de": "verweist auf eine Region in einem Bild", - "en": "links to a region of an image", - "fr": "réfère à une région d'une image" - }, - "gui_element": "Searchbox" - }, - { - "name": "hasComment", - "super": [ - "hasComment" - ], - "object": "TextValue", - "labels": { - "de": "Kommentar", - "en": "Comment", - "fr": "Commentaire" - }, - "gui_element": "SimpleText" - }, - { - "name": "hasRepresentation", - "super": [ - "hasRepresentation" - ], - "object": "Representation", - "labels": { - "en": "Represented by" - }, - "gui_element": "Searchbox" - } - ], - "resources": [ - { - "name": "Text", - "labels": { - "de": "Text", - "en": "Text", - "fr": "Texte" - }, - "super": "Resource", - "cardinalities": [ - { - "propname": ":hasTextMedium", - "cardinality": "0-1", - "gui_order": 0 - }, - { - "propname": ":hasAuthor", - "cardinality": "0-1", - "gui_order": 1 - }, - { - "propname": ":hasTitle", - "cardinality": "0-1", - "gui_order": 2 - }, - { - "propname": ":hasOriginalText", - "cardinality": "0-1", - "gui_order": 3 - }, - { - "propname": ":hasTranscription", - "cardinality": "0-1", - "gui_order": 4 - }, - { - "propname": ":hasTransliteration", - "cardinality": "0-1", - "gui_order": 5 - }, - { - "propname": ":hasTranslation", - "cardinality": "0-n", - "gui_order": 6 - }, - { - "propname": ":hasFindspot", - "cardinality": "0-1", - "gui_order": 7 - }, - { - "propname": ":hasBibliographicReference", - "cardinality": "0-n", - "gui_order": 8 - }, - { - "propname": ":inInstitution", - "cardinality": "0-n", - "gui_order": 9 - }, - { - "propname": ":hasLocation", - "cardinality": "0-1", - "gui_order": 10 - }, - { - "propname": ":hasCopyright", - "cardinality": "0-1", - "gui_order": 11 - }, - { - "propname": ":hasExternalLink", - "cardinality": "0-n", - "gui_order": 12 - }, - { - "propname": ":hasRelatedArtwork", - "cardinality": "0-1", - "gui_order": 13 - } - ] - }, - { - "name": "Image2D", - "labels": { - "de": "2D-Bild", - "en": "2D image", - "fr": "image 2D" - }, - "super": "StillImageRepresentation", - "cardinalities": [ - { - "propname": ":hasTitle", - "cardinality": "1", - "gui_order": 0 - }, - { - "propname": ":hasCreator", - "cardinality": "0-1", - "gui_order": 1 - }, - { - "propname": ":hasDate", - "cardinality": "0-1", - "gui_order": 2 - }, - { - "propname": ":hasDescription", - "cardinality": "0-n", - "gui_order": 3 - }, - { - "propname": ":hasCopyright", - "cardinality": "0-1", - "gui_order": 4 - }, - { - "propname": ":inInstitution", - "cardinality": "0-n", - "gui_order": 5 - }, - { - "propname": ":hasLocation", - "cardinality": "0-1", - "gui_order": 6 - }, - { - "propname": ":hasExternalLink", - "cardinality": "0-n", - "gui_order": 7 - }, - { - "propname": ":linksToRegion", - "cardinality": "0-n", - "gui_order": 8 - } - ] - }, - { - "name": "ImagePartOfABook", - "labels": { - "de": "Bild in einem Buch", - "en": "Image in a book", - "fr": "Image dans un livre" - }, - "comments": { - "en": "Image that forms part of a book" - }, - "super": "StillImageRepresentation", - "cardinalities": [ - { - "propname": ":hasTitle", - "cardinality": "1" - }, - { - "propname": "seqnum", - "cardinality": "1" - }, - { - "propname": "isPartOf", - "cardinality": "1" - } - ] - }, - { - "name": "Image3D", - "labels": { - "de": "3D-Bild", - "en": "3D image", - "fr": "image 3D" - }, - "super": "DDDRepresentation", - "cardinalities": [ - { - "propname": ":hasTitle", - "cardinality": "1", - "gui_order": 0 - }, - { - "propname": ":hasCreator", - "cardinality": "0-1", - "gui_order": 1 - }, - { - "propname": ":hasDate", - "cardinality": "0-1", - "gui_order": 2 - }, - { - "propname": ":hasDescription", - "cardinality": "0-1", - "gui_order": 3 - }, - { - "propname": ":hasCopyright", - "cardinality": "0-1", - "gui_order": 4 - }, - { - "propname": ":hasFiles", - "cardinality": "0-1", - "gui_order": 5 - }, - { - "propname": ":inInstitution", - "cardinality": "0-n", - "gui_order": 6 - }, - { - "propname": ":hasLocation", - "cardinality": "0-1", - "gui_order": 7 - }, - { - "propname": ":hasExternalLink", - "cardinality": "0-n", - "gui_order": 8 - } - ] - }, - { - "name": "Audio", - "labels": { - "de": "Audio", - "en": "Audio", - "fr": "Audio" - }, - "super": "AudioRepresentation", - "cardinalities": [ - { - "propname": ":hasTitle", - "cardinality": "1", - "gui_order": 1 - }, - { - "propname": ":hasCreator", - "cardinality": "0-1", - "gui_order": 2 - }, - { - "propname": ":hasDate", - "cardinality": "0-1", - "gui_order": 3 - }, - { - "propname": ":hasDescription", - "cardinality": "0-1", - "gui_order": 4 - }, - { - "propname": ":hasCopyright", - "cardinality": "0-1", - "gui_order": 5 - }, - { - "propname": ":inInstitution", - "cardinality": "0-n", - "gui_order": 6 - }, - { - "propname": ":hasLocation", - "cardinality": "0-1", - "gui_order": 7 - }, - { - "propname": ":hasExternalLink", - "cardinality": "0-n", - "gui_order": 8 - } - ] - }, - { - "name": "Video", - "labels": { - "de": "Video", - "en": "Video", - "fr": "Video" - }, - "super": "MovingImageRepresentation", - "cardinalities": [ - { - "propname": ":hasTitle", - "cardinality": "1", - "gui_order": 1 - }, - { - "propname": ":hasCreator", - "cardinality": "0-1", - "gui_order": 2 - }, - { - "propname": ":hasDate", - "cardinality": "0-1", - "gui_order": 3 - }, - { - "propname": ":hasDescription", - "cardinality": "0-1", - "gui_order": 4 - }, - { - "propname": ":hasCopyright", - "cardinality": "0-1", - "gui_order": 5 - }, - { - "propname": ":inInstitution", - "cardinality": "0-n", - "gui_order": 6 - }, - { - "propname": ":hasLocation", - "cardinality": "0-1", - "gui_order": 7 - }, - { - "propname": ":hasExternalLink", - "cardinality": "0-n", - "gui_order": 8 - } - ] - }, - { - "name": "VideoSequence", - "labels": { - "de": "Sequenz einer Video-Ressource", - "en": "Sequence of a video resource" - }, - "super": "Resource", - "cardinalities": [ - { - "propname": ":hasTitle", - "cardinality": "1" - }, - { - "propname": "isSequenceOf", - "cardinality": "1" - }, - { - "propname": "hasSequenceBounds", - "cardinality": "1" - }, - { - "propname": ":hasCreator", - "cardinality": "0-1" - }, - { - "propname": ":hasDate", - "cardinality": "0-1" - }, - { - "propname": ":hasDescription", - "cardinality": "0-1" - } - ] - }, - { - "name": "AudioSequence", - "labels": { - "de": "Sequenz einer Audio-Ressource", - "en": "Sequence of an audio resource" - }, - "super": "Resource", - "cardinalities": [ - { - "propname": ":hasTitle", - "cardinality": "1" - }, - { - "propname": "isSequenceOf", - "cardinality": "1" - }, - { - "propname": "hasSequenceBounds", - "cardinality": "1" - }, - { - "propname": ":hasCreator", - "cardinality": "0-1" - }, - { - "propname": ":hasDate", - "cardinality": "0-1" - }, - { - "propname": ":hasDescription", - "cardinality": "0-1" - } - ] - }, - { - "name": "Book", - "labels": { - "de": "Buch / Manuskript", - "en": "book / manuscript", - "fr": "livre / manuscrit" - }, - "super": "Resource", - "cardinalities": [ - { - "propname": ":hasTitle", - "cardinality": "1-n", - "gui_order": 1 - }, - { - "propname": ":hasDescription", - "cardinality": "0-1", - "gui_order": 2 - }, - { - "propname": ":hasAuthor", - "cardinality": "0-n", - "gui_order": 3 - }, - { - "propname": ":hasDate", - "cardinality": "0-1", - "gui_order": 4 - }, - { - "propname": ":inInstitution", - "cardinality": "0-n", - "gui_order": 5 - }, - { - "propname": ":hasLocation", - "cardinality": "0-1", - "gui_order": 6 - }, - { - "propname": ":hasCopyright", - "cardinality": "0-1", - "gui_order": 7 - }, - { - "propname": ":hasExternalLink", - "cardinality": "0-n", - "gui_order": 8 - } - ] - }, - { - "name": "Page", - "super": "Resource", - "labels": { - "de": "Buchseite / Manuskriptseite", - "en": "Book page / Manuscript page", - "fr": "Page du livre / Page du manuscrit" - }, - "comments": { - "en": "A page is a part of a book or manuscript" - }, - "cardinalities": [ - { - "propname": ":hasTextMedium", - "cardinality": "0-1", - "gui_order": 0 - }, - { - "propname": ":hasPagenum", - "cardinality": "1", - "gui_order": 1 - }, - { - "propname": ":hasDescription", - "cardinality": "0-1", - "gui_order": 2 - }, - { - "propname": ":partOf", - "cardinality": "1", - "gui_order": 3 - }, - { - "propname": ":hasTranscription", - "cardinality": "0-1", - "gui_order": 4 - }, - { - "propname": ":hasTransliteration", - "cardinality": "0-1", - "gui_order": 5 - }, - { - "propname": ":hasTranslation", - "cardinality": "0-n", - "gui_order": 6 - }, - { - "propname": ":hasBibliographicReference", - "cardinality": "0-n", - "gui_order": 7 - } - ] - }, - { - "name": "Person", - "labels": { - "de": "Person", - "en": "Person", - "fr": "Personne" - }, - "super": "Resource", - "cardinalities": [ - { - "propname": ":hasName", - "cardinality": "1", - "gui_order": 1 - }, - { - "propname": ":hasIdentifier", - "cardinality": "0-1", - "gui_order": 2 - }, - { - "propname": ":hasChildren", - "cardinality": "0-1", - "gui_order": 3 - }, - { - "propname": ":hasExternalLink", - "cardinality": "0-n", - "gui_order": 4 - } - ] - }, - { - "name": "Institution", - "labels": { - "de": "Institution", - "en": "Institution", - "fr": "Institution" - }, - "super": "Resource", - "cardinalities": [ - { - "propname": ":hasName", - "cardinality": "1", - "gui_order": 1 - }, - { - "propname": ":hasLocation", - "cardinality": "1", - "gui_order": 2 - }, - { - "propname": ":hasIdentifier", - "cardinality": "0-1", - "gui_order": 3 - }, - { - "propname": ":hasExternalLink", - "cardinality": "0-n", - "gui_order": 4 - } - ] - }, - { - "name": "Object", - "labels": { - "de": "Objekt", - "en": "Object", - "fr": "Objet" - }, - "super": "Resource", - "cardinalities": [ - { - "propname": ":hasImage", - "cardinality": "0-n", - "gui_order": 0 - }, - { - "propname": ":hasCategory", - "cardinality": "0-n", - "gui_order": 1 - }, - { - "propname": ":hasFlatList", - "cardinality": "0-n", - "gui_order": 1 - }, - { - "propname": ":hasName", - "cardinality": "0-n", - "gui_order": 2 - }, - { - "propname": ":hasDescription", - "cardinality": "0-n", - "gui_order": 3 - }, - { - "propname": ":hasWeight", - "cardinality": "0-n", - "gui_order": 4 - }, - { - "propname": ":hasCreator", - "cardinality": "0-1", - "gui_order": 5 - }, - { - "propname": ":hasFindspot", - "cardinality": "0-n", - "gui_order": 6 - }, - { - "propname": ":hasDate", - "cardinality": "0-n", - "gui_order": 7 - }, - { - "propname": ":inInstitution", - "cardinality": "0-1", - "gui_order": 8 - }, - { - "propname": ":hasLocation", - "cardinality": "0-1", - "gui_order": 9 - }, - { - "propname": ":hasInventoryNumber", - "cardinality": "0-1", - "gui_order": 10 - }, - { - "propname": ":isPublic", - "cardinality": "0-1", - "gui_order": 11 - }, - { - "propname": ":hasColor", - "cardinality": "0-n", - "gui_order": 12 - }, - { - "propname": ":hasExternalLink", - "cardinality": "0-n", - "gui_order": 13 - }, - { - "propname": ":hasComment", - "cardinality": "0-n", - "gui_order": 14 - }, - { - "propname": ":hasTime", - "cardinality": "0-n", - "gui_order": 15 - } - ] - }, - { - "name": "Document", - "labels": { - "de": "Dokument", - "en": "Document", - "fr": "Document" - }, - "super": "DocumentRepresentation", - "cardinalities": [ - { - "propname": ":hasName", - "cardinality": "1", - "gui_order": 1 - }, - { - "propname": ":hasDescription", - "cardinality": "1", - "gui_order": 2 - }, - { - "propname": ":hasTime", - "cardinality": "0-1", - "gui_order": 3 - } - ] - }, - { - "name": "Archive", - "labels": { - "de": "Archiv", - "en": "Archive", - "fr": "Archive" - }, - "super": "ArchiveRepresentation", - "cardinalities": [ - { - "propname": ":hasName", - "cardinality": "1", - "gui_order": 1 - }, - { - "propname": ":hasDescription", - "cardinality": "1", - "gui_order": 2 - } - ] - }, - { - "name": "TextDocument", - "labels": { - "de": "Text-Dokument", - "en": "Text document", - "fr": "Document texte" - }, - "super": "TextRepresentation", - "cardinalities": [ - { - "propname": ":hasName", - "cardinality": "1", - "gui_order": 1 - }, - { - "propname": ":hasDescription", - "cardinality": "1", - "gui_order": 2 - } - ] - }, - { - "name": "ObjectWithDifferentRepresentations", - "labels": { - "de": "Objekt mit unterschiedlichen Repräsentationen", - "en": "Object with different representations" - }, - "comments": { - "en": "an object that can have representations of different types (audio, images,...)" - }, - "super": "Resource", - "cardinalities": [ - { - "propname": ":hasRepresentation", - "cardinality": "0-n", - "gui_order": 1 - } - ] - } - ] - }, - { - "name": "rosetta2", - "label": "rosetta2", - "properties": [ - { - "name": "hasLinkToOtherOnto", - "super": [ - "hasLinkTo" - ], - "object": "rosetta:Image2D", - "labels": { - "de": "Textträger", - "en": "Text medium", - "fr": "Support de texte" - }, - "gui_element": "Searchbox", - "gui_attributes": { - "numprops": 1 - } - } - ], - "resources": [ - { - "name": "TestResource", - "labels": { - "en": "TestResource" - }, - "super": "Resource", - "cardinalities": [ - { - "propname": ":hasLinkToOtherOnto", - "cardinality": "1", - "gui_order": 1 - } - ] - } - ] - } - ] - } -} diff --git a/docs/assets/templates/excel2xml_sample_script.py b/docs/assets/templates/excel2xml_sample_script.py deleted file mode 100644 index 6ea630929..000000000 --- a/docs/assets/templates/excel2xml_sample_script.py +++ /dev/null @@ -1,101 +0,0 @@ -import pandas as pd -import warnings -from knora import excel2xml - -# general preparation -# ------------------- -path_to_json = "excel2xml_sample_onto.json" -main_df = pd.read_csv("excel2xml_sample_data.csv", dtype="str", sep=",") -# main_df = pd.read_excel("path-to-your-data-source", dtype="str") -# main_df.drop_duplicates(inplace = True) -# main_df.dropna(how = "all", inplace = True) -root = excel2xml.make_root(shortcode="0123", default_ontology="onto-name") -root = excel2xml.append_permissions(root) - -# create list mappings -# -------------------- -category_dict = excel2xml.create_json_list_mapping( - path_to_json=path_to_json, - list_name="category", - language_label="en" -) -category_dict_fallback = excel2xml.create_json_excel_list_mapping( - path_to_json=path_to_json, - list_name="category", - excel_values=main_df["Category"], - sep="," -) - -# create all resources -# -------------------- -for index, row in main_df.iterrows(): - resource = excel2xml.make_resource( - label=row["Resource name"], - restype=":MyResource", - id=excel2xml.make_xsd_id_compatible(row["Resource identifier"]) - ) - if excel2xml.check_notna(row["Image"]): - resource.append(excel2xml.make_bitstream_prop(row["Image"], permissions="prop-default")) - resource.append(excel2xml.make_text_prop(":name", row["Resource name"])) - resource.append(excel2xml.make_text_prop( - ":longtext", - excel2xml.PropertyElement(value=row["Long text"], permissions="prop-restricted", comment="long text", - encoding="xml") - )) - - # to get the correct category values, first split the cell, then look up the values in "category_dict", - # and if it's not there, look in "category_dict_fallback" - category_values = [category_dict.get(x.strip(), category_dict_fallback[x.strip()]) for x in - row["Category"].split(",")] - resource.append(excel2xml.make_list_prop("category", ":hasCategory", category_values)) - if excel2xml.check_notna(row["Complete?"]): - resource.append(excel2xml.make_boolean_prop(":isComplete", row["Complete?"])) - if excel2xml.check_notna(row["Color"]): - resource.append(excel2xml.make_color_prop(":colorprop", row["Color"])) - if pd.notna(row["Date discovered"]): - potential_date = excel2xml.find_date_in_string(row["Date discovered"]) - if potential_date: - resource.append(excel2xml.make_date_prop(":date", potential_date)) - else: - warnings.warn(f"Error in row {index + 2}: The column 'Date discovered' should contain a date, " - f"but no date was detected in the string '{row['Date discovered']}'") - if excel2xml.check_notna(row["Exact time"]): - resource.append(excel2xml.make_time_prop(":timeprop", row["Exact time"])) - if excel2xml.check_notna(row["Weight (kg)"]): - resource.append(excel2xml.make_decimal_prop(":weight", row["Weight (kg)"])) - if excel2xml.check_notna(row["Find location"]): - resource.append(excel2xml.make_geoname_prop(":location", row["Find location"])) - resource.append(excel2xml.make_integer_prop(":descendantsCount", row["Number of descendants"])) - if excel2xml.check_notna(row["Similar to"]): - resource.append(excel2xml.make_resptr_prop(":similarTo", row["Similar to"])) - if excel2xml.check_notna(row["See also"]): - resource.append(excel2xml.make_uri_prop(":url", row["See also"])) - - root.append(resource) - -# Annotation, Region, Link -# ------------------------ -annotation = excel2xml.make_annotation("Annotation of Resource 0", "annotation_of_res_0") -annotation.append(excel2xml.make_text_prop("hasComment", "This is a comment")) -annotation.append(excel2xml.make_resptr_prop("isAnnotationOf", "res_0")) -root.append(annotation) - -region = excel2xml.make_region("Region of Image 0", "region_of_image_0") -region.append(excel2xml.make_text_prop("hasComment", "This is a comment")) -region.append(excel2xml.make_color_prop("hasColor", "#5d1f1e")) -region.append(excel2xml.make_resptr_prop("isRegionOf", "image_0")) -region.append(excel2xml.make_geometry_prop( - "hasGeometry", - '{"type": "rectangle", "lineColor": "#ff3333", "lineWidth": 2, "points": [{"x": 0.08, "y": 0.16}, {"x": 0.73, ' - '"y": 0.72}], "original_index": 0}' -)) -root.append(region) - -link = excel2xml.make_link("Link between Resource 0 and 1", "link_res_0_res_1") -link.append(excel2xml.make_text_prop("hasComment", "This is a comment")) -link.append(excel2xml.make_resptr_prop("hasLinkTo", ["res_0", "res_1"])) -root.append(link) - -# write file -# ---------- -excel2xml.write_xml(root, "data.xml") diff --git a/docs/dsp-tools-excel2xml.md b/docs/dsp-tools-excel2xml.md index 7b5527a00..e848f4f20 100644 --- a/docs/dsp-tools-excel2xml.md +++ b/docs/dsp-tools-excel2xml.md @@ -1,24 +1,22 @@ [![PyPI version](https://badge.fury.io/py/dsp-tools.svg)](https://badge.fury.io/py/dsp-tools) # `excel2xml`: Convert a data source to XML -dsp-tools assists you in converting a data source in CSV/XLS(X) format to an XML file. There are two use cases for a -transformation from Excel/CSV to XML: +dsp-tools assists you in converting a data source in CSV/XLS(X) format to an XML file. - - The CLI command `dsp-tools excel2xml` creates an XML file from an Excel/CSV file which is already structured - according to the DSP specifications. This is mostly used for DaSCH-interal data migration. **The CLI command is - documented [here](dsp-tools-excel.md#cli-command-excel2xml).** - - The module `excel2xml` can be imported into a custom Python script that transforms any tabular data into an XML. This - use case is more frequent, because data from research projects have a variety of formats/structures. **The - `excel2xml` module is documented on this page.** +| **Hint** | +|-------------------------------------------------------------------------------------------------------------------------------------------| +| This page is about the **module** `excel2xml`. The CLI command is documented [here](dsp-tools-excel.md#xml-data-file-from-excelcsv-file). | -
-**In the following, an example is given how to use the module `excel2xml`:** +To demonstrate the usage of the `excel2xml` module, there is a GitHub repository named `0123-import-scripts`. It +contains: -Save the following files into a directory, and run the Python script: +- a sample JSON project file +- sample data that fits the data model of the JSON project file +- a sample Python script that demonstrates how to use the module `excel2xml`. - - sample data: [excel2xml_sample_data.csv](./assets/templates/excel2xml_sample_data.csv) - - sample ontology: [excel2xml_sample_onto.json](./assets/templates/excel2xml_sample_onto.json) - - sample script: [excel2xml_sample_script.py](./assets/templates/excel2xml_sample_script.py) +Navigate to [https://github.com/dasch-swiss/0123-import-scripts](https://github.com/dasch-swiss/0123-import-scripts) and +follow the steps described there. The README will teach you some basics that will be necessary to work with `excel2xml`. +Once you are familiar with the basics, return to this page to learn how the sample Python script works. This is the simplified pattern how the Python script works: @@ -39,9 +37,9 @@ This is the simplified pattern how the Python script works: 3 append the permissions 4 if necessary: create list mappings (see below) 5 iterate through the rows of your data source: -6 create the `` tag -7 append properties to it -8 append the resource to the root tag `` +6 create the `` tag +7 append properties to it +8 append the resource to the root tag `` 9 save the finished XML file ``` @@ -74,12 +72,69 @@ here](./dsp-tools-xmlupload.md#how-to-use-the-permissions-attribute-in-resources Let's assume that your data source has a column containing list values named after the "label" of the JSON project list, instead of the "name" which is needed for the `dsp-tools xmlupload`. You need a way to get the names from the labels. If your data source uses the labels correctly, this is an easy task: The method `create_json_list_mapping()` creates a -dictionary that maps the labels to the names. +dictionary that maps the labels to the names: + +The list "category" in `0123-import-scripts/import_project.json` looks as follows: +```json +{ + "name": "category", + "labels": {"de": "Kategorie", "en": "Category"}, + "comments": {"en": "A list containing categories", "de": "Eine Liste mit Kategorien"}, + "nodes": [ + { + "name": "artwork", + "labels": {"de": "Kunstwerk", "en": "Artwork"} + }, + { + "name": "nature", + "labels": {"de": "Natur", "en": "Nature"}, + "nodes": [ + { + "name": "humans", + "labels": {"de": "Menschen", "en": "Humans"} + }, + {"...": "..."} + ] + } + ] +} +``` + +If you pass this list to `create_json_list_mapping()`, it creates the following dictionary: +```json +{ + "Kunstwerk": "artwork", + "kunstwerk": "artwork", + "Menschen": "humans", + "menschen": "humans", + "Natur": "nature", + "natur": "nature", + "...": "..." +} +``` + + If, however, your data source has spelling variants, you need the more sophisticated approach of `create_json_excel_list_mapping()`: This method creates a dictionary that maps the list values in your data source to their correct JSON project node name. This happens based on string similarity. Please carefully check the result if there are no false matches! +The column "Category" in `0123-import-scripts/data_raw.csv` has spelling mistakes: +![column category](./assets/images/img-excel2xml-raw-data-category.png) + +The dictionary that results if you call `create_json_excel_list_mapping()`: +```json +{ + "Huumans": "humans", + "huumans": "humans", + "Artwörk": "artwork", + "artwörk": "artwork" +} +``` + +The sample Python scripts features an example how to call these two methods, and how the resulting dictionaries can be +used. + ## 5. Iterate through the rows of your data source With the help of Pandas, you can then iterate through the rows of your Excel/CSV, and create resources and properties. @@ -98,9 +153,13 @@ There are four kind of resources that can be created: `` is the most frequent of them. The other three are [explained here](./dsp-tools-xmlupload.md#dsp-base-resources--base-properties-to-be-used-directly-in-the-xml-file). +#### Resource ID Special care is needed when the ID of a resource is created. Every resource must have an ID that is unique in the file, and it must meet the constraints of xsd:ID. You can simply achieve this if you use the method `make_xsd_id_compatible()`. +If later, another resource would like to set a resptr-link to the resource that you are coding now, you must store the +ID in a dict, so that you can retrieve it later. The example script contains an example of such a dict. + ### 7. Append the properties For every property, there is a helper function that explains itself when you hover over it. So you don't need to worry @@ -190,6 +249,27 @@ usable if it is "", "N/A", or "-" - a PropertyElement whose "value" fulfills the above criteria +Why not just checking a cell by its boolean value? Like: +``` +if cell: + resource.append(make_*_prop(cell)) +``` + +There are many problems that can occur with this simple approach! Often, a cell won't evaluate to the boolean that you +might expect: + +| cell content | return value of `bool(cell)` | You might have expected... | +|--------------|------------------------------|------------------------------------------------------------------| +| 0 | False | True, because 0 is a valid integer for your integer property | +| " " | True | False, because an empty string is not usable for a text property | +| numpy.nan | True | False, because N/A is not a usable value | +| pandas.NA | TypeError (*) | False, because N/A is not a usable value | +| "" | True | False, because this is the string representation of N/A | +| "-" | True | False, because this is a placeholder in an empty text field | +(*) TypeError: boolean value of NA is ambiguous + +In contrast, `check_notna(cell)` will return the expected value for all cases in the table! + ### Calendar date parsing The method `find_date_in_string(string)` tries to find a calendar date in a string. If successful, it diff --git a/knora/dsplib/import_scripts b/knora/dsplib/import_scripts new file mode 160000 index 000000000..dca61579f --- /dev/null +++ b/knora/dsplib/import_scripts @@ -0,0 +1 @@ +Subproject commit dca61579f1fac3cbc0b4e10e463ecb84df01dcad diff --git a/knora/dsplib/utils/shared.py b/knora/dsplib/utils/shared.py index 7c7f74811..6d20c0095 100644 --- a/knora/dsplib/utils/shared.py +++ b/knora/dsplib/utils/shared.py @@ -1,12 +1,13 @@ +import os import time import unicodedata +from datetime import datetime +from typing import Callable, Any, Optional + import pandas as pd import regex -import os from lxml import etree from requests import RequestException -from datetime import datetime -from typing import Callable, Any, Optional from knora.dsplib.models.connection import Connection from knora.dsplib.models.helpers import BaseError @@ -93,7 +94,7 @@ def validate_xml_against_schema(input_file: str) -> bool: Validates an XML file against an XSD schema Args: - input_file: the XML file to be validated + input_file: path to the XML file to be validated Returns: True if the XML file is valid. Otherwise, a BaseError with a detailed error log is raised diff --git a/knora/dsplib/utils/xml_upload.py b/knora/dsplib/utils/xml_upload.py index 623b170c6..4a5246b4b 100644 --- a/knora/dsplib/utils/xml_upload.py +++ b/knora/dsplib/utils/xml_upload.py @@ -10,12 +10,13 @@ from pathlib import Path from typing import Optional, cast, Tuple, Any from urllib.parse import quote_plus + from lxml import etree -from knora.dsplib.models.projectContext import ProjectContext from knora.dsplib.models.connection import Connection from knora.dsplib.models.helpers import BaseError from knora.dsplib.models.permission import Permissions +from knora.dsplib.models.projectContext import ProjectContext from knora.dsplib.models.resource import ResourceInstanceFactory, ResourceInstance, KnoraStandoffXmlEncoder from knora.dsplib.models.sipi import Sipi from knora.dsplib.models.value import KnoraStandoffXml @@ -355,7 +356,7 @@ def _upload_resources( """ # If there are multimedia files: calculate their total size - bitstream_all_sizes_mb = [os.path.getsize(res.bitstream.value) / 1000000 for res in resources if res.bitstream] + bitstream_all_sizes_mb = [os.path.getsize(os.path.join(imgdir, res.bitstream.value)) / 1000000 for res in resources if res.bitstream] if len(bitstream_all_sizes_mb) > 0: bitstream_size_total_mb = round(sum(bitstream_all_sizes_mb), 1) bitstream_all_sizes_iterator = iter(bitstream_all_sizes_mb) # for later reuse, to avoid later system calls diff --git a/knora/excel2xml.py b/knora/excel2xml.py index aa009174a..90bad1f66 100644 --- a/knora/excel2xml.py +++ b/knora/excel2xml.py @@ -1,23 +1,22 @@ +import dataclasses import datetime +import difflib import json import os import re import uuid import warnings -import difflib from operator import xor -import regex +from typing import Any, Iterable, Optional, Union import pandas as pd -from typing import Any, Iterable, Optional, Union +import regex from lxml import etree from lxml.builder import E -import dataclasses from knora.dsplib.models.helpers import BaseError from knora.dsplib.models.propertyelement import PropertyElement -from knora.dsplib.utils.shared import simplify_name, check_notna - +from knora.dsplib.utils.shared import simplify_name, check_notna, validate_xml_against_schema xml_namespace_map = { None: "https://dasch.swiss/schema", @@ -30,7 +29,7 @@ def make_xsd_id_compatible(string: str) -> str: Make a string compatible with the constraints of xsd:ID, so that it can be used as "id" attribute of a tag. An xsd:ID must not contain special characters, and it must be unique in the document. - This method replaces the illegal characters by "_" and appends a random number to the string to make it unique. + This method replaces the illegal characters by "_" and appends a random component to the string to make it unique. The string must contain at least one Unicode letter (matching the regex ``\\p{L}``), underscore, !, ?, or number, but must not be "None", "", "N/A", or "-". Otherwise, a BaseError will be raised. @@ -58,7 +57,35 @@ def make_xsd_id_compatible(string: str) -> str: return res -def find_date_in_string(string: str, calling_resource: str = "") -> Optional[str]: +def _derandomize_xsd_id(string: str, multiple_occurrences: bool = False) -> str: + """ + In some contexts, the random component of the output of make_xsd_id_compatible() is a hindrance, especially for + testing. This method removes the random part, but leaves the other modifications introduced by + make_xsd_id_compatible() in place. This method's behaviour is defined by the example in the "Examples" section. + + Args: + string: the output of make_xsd_id_compatible() + multiple_occurrences: If true, string can be an entire XML document, and all occurrences will be removed + + Returns: + the derandomized string + + Examples: + >>> id_1 = make_xsd_id_compatible("Hello!") + >>> id_2 = make_xsd_id_compatible("Hello!") + >>> assert _derandomize_xsd_id(id_1) == _derandomize_xsd_id(id_2) + """ + if not isinstance(string, str) or not check_notna(string): + raise BaseError(f"The input '{string}' cannot be derandomized.") + + uuid4_regex = r"[a-f0-9]{8}-?[a-f0-9]{4}-?4[a-f0-9]{3}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}" + if multiple_occurrences: + return re.subn(uuid4_regex, "", string, flags=re.IGNORECASE)[0] + else: + return re.sub(uuid4_regex, "", string, re.IGNORECASE) + + +def find_date_in_string(string: str) -> Optional[str]: """ Checks if a string contains a date value (single date, or date range), and returns the first found date as DSP-formatted string. Returns None if no date was found. @@ -86,7 +113,6 @@ def find_date_in_string(string: str, calling_resource: str = "") -> Optional[str Args: string: string to check - calling_resource: the name of the parent resource (for better error messages) Returns: DSP-formatted date string, or None @@ -98,6 +124,11 @@ def find_date_in_string(string: str, calling_resource: str = "") -> Optional[str See https://docs.dasch.swiss/latest/DSP-TOOLS/dsp-tools-xmlupload/#date-prop """ + # sanitize input, just in case that the method was called on an empty or N/A cell + if not check_notna(string): + return None + string = str(string) + monthes_dict = { "January": 1, "Jan": 1, @@ -669,7 +700,9 @@ def make_decimal_prop( # check value type for val in values: - if not re.search(r"^\d+\.\d+$", str(val.value).strip()): + try: + float(val.value) + except ValueError: raise BaseError(f"Failed validation in resource '{calling_resource}', property '{name}': " f"'{val.value}' is not a valid decimal number.") @@ -688,7 +721,7 @@ def make_decimal_prop( **kwargs, nsmap=xml_namespace_map ) - value_.text = str(val.value) + value_.text = str(float(val.value)) prop_.append(value_) return prop_ @@ -875,7 +908,9 @@ def make_integer_prop( # check value type for val in values: - if not re.search(r"^\d+$", str(val.value).strip()): + try: + int(val.value) + except ValueError: raise BaseError(f"Failed validation in resource '{calling_resource}', property '{name}': " f"'{val.value}' is not a valid integer.") @@ -894,7 +929,7 @@ def make_integer_prop( **kwargs, nsmap=xml_namespace_map ) - value_.text = str(val.value) + value_.text = str(int(val.value)) prop_.append(value_) return prop_ @@ -1473,7 +1508,7 @@ def create_json_excel_list_mapping( ) -> dict[str, str]: """ Often, data sources contain list values that aren't identical to the name of the node in the list of the JSON - project file (a.k.a. ontology). In order to create a correct XML for the `dsp-tools xmlupload`, a mapping is + project file (colloquially: ontology). In order to create a correct XML for the `dsp-tools xmlupload`, a mapping is necessary. This function takes a JSON list and an Excel column containing list-values, and tries to match them automatically based on similarity. The result is a dict of the form {excel_value: list_node_name}. @@ -1632,6 +1667,12 @@ def write_xml(root: etree.Element, filepath: str) -> None: xml_string = xml_string.replace(">", ">") with open(filepath, "w", encoding="utf-8") as f: f.write(xml_string) + try: + validate_xml_against_schema(filepath) + print(f"The XML file was successfully saved to {filepath}") + except BaseError as err: + warnings.warn(f"The XML file was successfully saved to {filepath}, but the following Schema validation " + f"error(s) occurred: {err.message}") def excel2xml(datafile: str, shortcode: str, default_ontology: str) -> None: diff --git a/test/e2e/test_0123_import_scripts.py b/test/e2e/test_0123_import_scripts.py new file mode 100644 index 000000000..e97b49d43 --- /dev/null +++ b/test/e2e/test_0123_import_scripts.py @@ -0,0 +1,71 @@ +import os +import unittest + +import pytest + +from knora.dsplib.utils.onto_create_ontology import create_project +from knora.dsplib.utils.xml_upload import xml_upload +from knora.excel2xml import _derandomize_xsd_id + + +class TestImportScripts(unittest.TestCase): + + def tearDown(self) -> None: + """ + Remove generated data. This method is executed after every test method. + """ + if os.path.isfile("knora/dsplib/import_scripts/data-processed.xml"): + os.remove("knora/dsplib/import_scripts/data-processed.xml") + + + @pytest.mark.filterwarnings("ignore") + def test_import_scripts(self) -> None: + """ + Execute the import script in its directory, create the project on the DSP server, and upload the created XML to + the DSP server. + """ + # pull the latest state of the git submodule + os.system("git submodule update --init --recursive") + from knora.dsplib.import_scripts import import_script + + # execute the import script in its directory + old_working_directory = os.getcwd() + os.chdir("knora/dsplib/import_scripts") + try: + import_script.main() + finally: + os.chdir(old_working_directory) + + # check the output XML (but before, remove random components from resource IDs and resptr targets) + with open("testdata/0123-data-processed-expected.xml") as f: + xml_expected = _derandomize_xsd_id(f.read(), multiple_occurrences=True) + with open("knora/dsplib/import_scripts/data-processed.xml") as f: + xml_returned = _derandomize_xsd_id(f.read(), multiple_occurrences=True) + self.assertEqual(xml_expected, xml_returned) + + # create the JSON project file, and upload the XML + success_on_creation = create_project( + input_file="knora/dsplib/import_scripts/import_project.json", + server="http://0.0.0.0:3333", + user_mail="root@example.com", + password="test", + verbose=False, + dump=False + ) + self.assertTrue(success_on_creation) + + success_on_xmlupload = xml_upload( + input_file="knora/dsplib/import_scripts/data-processed.xml", + server="http://0.0.0.0:3333", + user="root@example.com", + password="test", + imgdir="knora/dsplib/import_scripts/", + sipi="http://0.0.0.0:1024", + verbose=False, + incremental=False + ) + self.assertTrue(success_on_xmlupload) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/unittests/test_excel2xml.py b/test/unittests/test_excel2xml.py index fa23476be..d44ab2cfe 100644 --- a/test/unittests/test_excel2xml.py +++ b/test/unittests/test_excel2xml.py @@ -1,10 +1,10 @@ import os -import unittest import re +import time +import unittest from typing import Callable, Sequence, Union, Optional, Any import numpy as np -import pytest from lxml import etree from knora import excel2xml @@ -140,6 +140,23 @@ def test_make_xsd_id_compatible(self) -> None: self.assertRaises(BaseError, excel2xml.make_xsd_id_compatible, ".") + def test_derandomize_xsd_id(self) -> None: + teststring = "0aüZ/_-äöü1234567890?`^':.;+*ç%&/()=±“#Ç[]|{}≠" + id_1 = excel2xml.make_xsd_id_compatible(teststring) + time.sleep(1) + id_2 = excel2xml.make_xsd_id_compatible(teststring) + id_1_derandom = excel2xml._derandomize_xsd_id(id_1) + id_2_derandom = excel2xml._derandomize_xsd_id(id_2) + + # test single occurrence + self.assertEqual(id_1_derandom, id_2_derandom) + + # test multiple occurrence + multiple_ids = f"{id_1}----{id_2}----{id_1}----{id_2}" + multiple_ids_derandom = excel2xml._derandomize_xsd_id(multiple_ids, multiple_occurrences=True) + self.assertListEqual(multiple_ids_derandom.split("----"), [id_1_derandom] * 4) + + def test_find_date_in_string(self) -> None: # template: 2021-01-01 | 2015_01_02 @@ -275,9 +292,9 @@ def test_make_date_prop(self) -> None: def test_make_decimal_prop(self) -> None: prop = "decimal" method = excel2xml.make_decimal_prop - different_values = ["3.14159", 3.14159, .1, 100.0, "100.0"] - invalid_values = ["100", ".1", 100] - run_test(self, prop, method, different_values, invalid_values) + different_values = ["3.14159", 3.14159, "1.3e3", "100", ".1", 100] + invalid_values = ["string"] + run_test(self, prop, method, [float(x) for x in different_values], invalid_values) def test_make_geometry_prop(self) -> None: @@ -302,9 +319,9 @@ def test_make_geoname_prop(self) -> None: def test_make_integer_prop(self) -> None: prop = "integer" method = excel2xml.make_integer_prop - different_values = [1283416, "1283416", 71, "71", 0, "0"] - invalid_values = ["text", 10.0, ["text"]] - run_test(self, prop, method, different_values, invalid_values) + different_values = [1283416, "1283416", 3.14159, " 11 ", 0, "0"] + invalid_values = [" 10.3 ", "text", ["text"]] + run_test(self, prop, method, [int(x) for x in different_values], invalid_values) def test_make_interval_prop(self) -> None: @@ -506,30 +523,6 @@ def test_excel2xml(self) -> None: if os.path.isfile("excel2xml-output-data.xml"): os.remove("excel2xml-output-data.xml") - @pytest.mark.filterwarnings("ignore") - def test_excel2xml_sample_script(self) -> None: - old_working_directory = os.getcwd() - os.chdir("docs/assets/templates") - with open("excel2xml_sample_script.py") as f: - template_script = f.read() - exec(template_script, {}) - with open("../../../testdata/excel2xml-template-expected-output.xml") as f: - template_expected = f.read() - # remove the resource ids, because they contain a random component - template_expected = re.sub(r'(? + + + V + V + CR + CR + + + RV + V + CR + CR + + + V + V + CR + CR + + + RV + V + CR + CR + + + images/Anubis.jpg + + Anubis.jpg + + + + images/Basho_Horohoroto.jpg + + Basho_Horohoroto.jpg + + + + images/BM1888-0601-716.png + + BM1888-0601-716.png + + + + images/GibeonMeteorite.jpg + + GibeonMeteorite.jpg + + + + + Anubis.jpg_37fa98fd-a2e4-4031-bbf6-57da0ff82750 + + + Bengal cat + + + An example of a domesticated cat + + + mammals + humans + + + true + + + #f5f5dc + + + GREGORIAN:CE:2015-01-01:CE:2015-01-01 + + + + + + 4.8 + + + 2661604 + + + https://en.wikipedia.org/wiki/Cat + + + + + GibeonMeteorite.jpg_79eeb9f5-96db-4595-addd-524306d0e586 + + + Gibeon Meteorite + + + This is a piece of the so-called Gibeon Meteroite + + + physics + + + true + + + #808080 + + + GREGORIAN:CE:1908-03-05:CE:1908-03-05 + + + + + + 0.3 + + + 11821111 + + + https://en.wikipedia.org/wiki/Gibeon_(meteorite) + + + + + BM1888-0601-716.png_dbf7cc27-582a-4fc2-a28e-d5b66d29693c + + + Lekythos + + + Attic red-figured Lekythos BM 1888,601.716 + + + artwork + + + true + + + GREGORIAN:CE:1973-12-01:CE:1974-01-06 + + + 0.5 + + + 351274 + + + https://www.britishmuseum.org/collection/object/G_1888-0601-716 + + + + + Basho_Horohoroto.jpg_71a7e332-56cf-4e3b-a717-c34b2463e2c6 + + + Picture and Poem by Matsuo Bashō + + + ほろほろと山吹ちるかたきのおと + + + artwork + + + GREGORIAN:CE:1849:CE:1850 + + + 1.0 + + + https://en.wikipedia.org/wiki/Haiku#/media/File:Basho_Horohoroto.jpg + + + + + Date and time are invented, like for the other resources. + + + Anubis_fda7c834-692e-4e2c-b7c8-82e26a2775d9 + + + + + This is a comment + + + #5d1f1e + + + GibeonMeteorite.jpg_79eeb9f5-96db-4595-addd-524306d0e586 + + + {"type": "rectangle", "lineColor": "#ff3333", "lineWidth": 2, "points": [{"x": 0.08, "y": 0.16}, {"x": 0.73, "y": 0.72}], "original_index": 0} + + + + + This is a comment + + + BM1888-0601-716_97f7982b-eaed-4ef2-8813-cb93f8e2c480 + Horohoroto_5652b411-2c50-4086-896c-23b208ea27a8 + + + diff --git a/testdata/excel2xml-template-expected-output.xml b/testdata/excel2xml-template-expected-output.xml deleted file mode 100644 index ec03ff107..000000000 --- a/testdata/excel2xml-template-expected-output.xml +++ /dev/null @@ -1,264 +0,0 @@ - - - - V - V - CR - CR - - - RV - V - CR - CR - - - V - V - CR - CR - - - RV - V - CR - CR - - - testdata/bitstreams/test.jpg - - First Resource - - - Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. - - - mammals - insects - - - true - - - #00ff66 - - - - - - 2761369 - - - 0 - - - res_1 - - - http://test.org - - - - - Second Resource - - - Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. - - - birds - - - false - - - #ff00ff - - - GREGORIAN:CE:2015-01-01:CE:2015-01-01 - - - - - - 45.8 - - - 2761369 - - - 12 - - - res_8 - - - - testdata/bitstreams/test.jpg - - Third Resource - - - Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. - - - reptiles - - - 3 - - - - - Fourth Resource - - - Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. - - - plants - - - false - - - GREGORIAN:CE:1973-12-01:CE:1974-01-06 - - - 2 - - - - - Fifth Resource - - - Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. - - - physics - - - true - - - GREGORIAN:CE:1908-03-05:CE:1908-03-05 - - - 1 - - - - - Sixth Resource - - - Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. - - - humans - animals - - - false - - - GREGORIAN:CE:1886:CE:1886 - - - - - - 200.382 - - - 3 - - - - - Seventh Resource - - - Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. - - - amphibians - - - GREGORIAN:CE:1849:CE:1850 - - - 99 - - - - testdata/bitstreams/test.jpg - - Eighth Resource - - - Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. - - - physics - - - #ff00ff - - - 6 - - - - - Ninth Resource - - - Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. - - - plants - - - true - - - GREGORIAN:CE:1948-02-26:CE:1948-03-24 - - - 7 - - - - - This is a comment - - - res_0 - - - - - This is a comment - - - #5d1f1e - - - image_0 - - - {"type": "rectangle", "lineColor": "#ff3333", "lineWidth": 2, "points": [{"x": 0.08, "y": 0.16}, {"x": 0.73, "y": 0.72}], "original_index": 0} - - - - - This is a comment - - - res_0 - res_1 - - -