From f72e7a01e9396430e88c75c1ab3ec2743e6cf053 Mon Sep 17 00:00:00 2001 From: irinaschubert Date: Tue, 12 Apr 2022 18:02:15 +0200 Subject: [PATCH] docs(ingest): Add accepted file formats to documentation (DEV-677) (#2038) * add accepted file formats * Update data-formats.md * update sipi path * rename data-formats.md to file-formats.md * update index.md * fix footnote * reset scala setting --- .scalafmt.conf | 2 +- docs/01-introduction/data-formats.md | 22 ---------------------- docs/01-introduction/file-formats.md | 24 ++++++++++++++++++++++++ docs/01-introduction/index.md | 2 +- docs/01-introduction/what-is-knora.md | 12 ++++-------- docs/faq/index.md | 6 +++--- mkdocs.yml | 2 +- 7 files changed, 34 insertions(+), 36 deletions(-) delete mode 100644 docs/01-introduction/data-formats.md create mode 100644 docs/01-introduction/file-formats.md diff --git a/.scalafmt.conf b/.scalafmt.conf index e94dd85083..bf360eab5b 100644 --- a/.scalafmt.conf +++ b/.scalafmt.conf @@ -1,6 +1,6 @@ version = "2.7.5" maxColumn = 120 -align.preset = most +align.preset = some align.multiline = false continuationIndent.defnSite = 2 assumeStandardLibraryStripMargin = true diff --git a/docs/01-introduction/data-formats.md b/docs/01-introduction/data-formats.md deleted file mode 100644 index 16df023bf6..0000000000 --- a/docs/01-introduction/data-formats.md +++ /dev/null @@ -1,22 +0,0 @@ - - -# Data Formats in DSP-API - -As explained in [What Is DSP and DSP-API (previous Knora)?](what-is-knora.md), the DSP stores data -in a small number of formats that are suitable for long-term preservation while -facilitating data reuse. - -The following is a non-exhaustive list of data formats and how their content -can be stored and managed by DSP-API: - -| Original Format | Format in DSP | -|----------------------------------------------|------------------------------------------------------------------------------------------------------------------------------| -| Text (XML, LaTeX, Microsoft Word, etc.) 
| [Knora resources](../03-apis/api-v2/editing-resources.md) (RDF) containing [Standoff/RDF](standoff-rdf.md) | -| Tabular data, including relational databases | [Knora resources](../03-apis/api-v2/editing-resources.md) | -| Data in tree or graph structures | [Knora resources](../03-apis/api-v2/editing-resources.md) | -| Images (JPEG, PNG, etc.) | JPEG 2000 files stored by [Sipi](https://github.com/dhlab-basel/Sipi) | -| Audio and video files | Audio and video files stored by [Sipi](https://github.com/dhlab-basel/Sipi) (in archival formats to be determined) | -| PDF | Can be stored by Sipi, but data reuse is improved by extracting the text for storage as [Standoff/RDF](standoff-rdf.md) | diff --git a/docs/01-introduction/file-formats.md b/docs/01-introduction/file-formats.md new file mode 100644 index 0000000000..173b5518b7 --- /dev/null +++ b/docs/01-introduction/file-formats.md @@ -0,0 +1,24 @@ + + +# File Formats in DSP-API + +Currently, only a limited number of file formats can be uploaded to DSP. Some metadata is extracted from the files during ingest, but the file formats are not validated. Only image files are currently converted into another format. Both the converted version of the file and the original are kept. + +The following table shows the accepted file formats: + +| Category | Accepted format | Converted during ingest? | +| --------------------- | ------------------------- | -------------------------------------------------------------------------- | +| Text, XML<sup>1</sup> | TXT, XML, XSL, XSD | No | +| Tables | CSV, XLS, XLSX | No | +| 2D Images | JPEG, PNG, TIFF, JP2 | Yes, converted to JPEG 2000 by [Sipi](https://github.com/dasch-swiss/sipi) | +| Audio | MPEG (MP3), MP4, WAV | No | +| Video | MP4 | No | +| Office | PDF, DOC, DOCX, PPT, PPTX | No | +| Archives | ZIP, TAR, ISO, GZIP, 7Z | No | + + +<sup>1</sup> If your XML files represent text with markup (e.g. 
[TEI/XML](http://www.tei-c.org/)), +the recommended approach is to allow Knora to store it as [Standoff/RDF](standoff-rdf.md). diff --git a/docs/01-introduction/index.md b/docs/01-introduction/index.md index 7da6fbba7f..7f6bcb15a6 100644 --- a/docs/01-introduction/index.md +++ b/docs/01-introduction/index.md @@ -6,6 +6,6 @@ # Introduction * [What Is DSP and DSP-API (previous Knora)?](what-is-knora.md) -* [Data Formats in DSP-API](data-formats.md) +* [File Formats in DSP-API](file-formats.md) * [Standoff/RDF Text Markup](standoff-rdf.md) * [An Example Project](example-project.md) diff --git a/docs/01-introduction/what-is-knora.md b/docs/01-introduction/what-is-knora.md index 8040cd1141..2a4aac52c5 100644 --- a/docs/01-introduction/what-is-knora.md +++ b/docs/01-introduction/what-is-knora.md @@ -23,15 +23,11 @@ DSP solves this problem by keeping the data alive. You can query all the data in a DSP repository, not just the metadata. You can import thousands of databases into DSP, and run queries that search through all of them at once. -Another problem is that researchers use a multitude of different data formats, many of +Another problem is that researchers use a multitude of different file formats, many of which are proprietary and quickly become obsolete. It is not practical to maintain -all the programs that were used to create and read old data files, or even -all the operating systems that these programs ran on. - -Instead of preserving all these data formats, DSP supports -the conversion of all sorts of data to a [small number of formats](data-formats.md) -that are suitable for long-term preservation, and that maintain the data's meaning and -structure: +all the programs that were used to create and read old files, or even +all the operating systems that these programs ran on. Therefore, DSP accepts only a +limited number of [file formats](file-formats.md). 
- Non-binary data is stored as [RDF](http://www.w3.org/TR/2014/NOTE-rdf11-primer-20140624/), in a dedicated diff --git a/docs/faq/index.md b/docs/faq/index.md index b3a7f86d47..e21cd3f3b1 100644 --- a/docs/faq/index.md +++ b/docs/faq/index.md @@ -5,11 +5,11 @@ # Frequently Asked Questions -## Data Formats +## File Formats -### What data formats does Knora store? +### What file formats does Knora store? -See [Data Formats in Knora](../01-introduction/data-formats.md). +See [File Formats in Knora](../01-introduction/file-formats.md). ### Does Knora store XML files? diff --git a/mkdocs.yml b/mkdocs.yml index a5dde90d52..505486de81 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -10,7 +10,7 @@ nav: - Introduction: - Index: 01-introduction/index.md - What is DSP?: 01-introduction/what-is-knora.md - - Data Formats in DSP-API: 01-introduction/data-formats.md + - File Formats in DSP-API: 01-introduction/file-formats.md - Standoff/RDF Text Markup: 01-introduction/standoff-rdf.md - An Example Project: 01-introduction/example-project.md - DSP Ontologies: