Skip to content

Commit

Permalink
Write ASpace XLSX by default
Browse files Browse the repository at this point in the history
- Use openpyxl to write description info to ASpace import template
for Disk Image Processor by default
- To write previous description.csv instead, use -c/--csv flag
- Install python dependencies in install script
  • Loading branch information
tw4l committed Aug 5, 2023
1 parent 5c57138 commit a89e198
Show file tree
Hide file tree
Showing 9 changed files with 250 additions and 8 deletions.
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

Analyze disk images and/or create ready-to-ingest SIPs from a directory of disk images and related files.

Version: 1.1.1
Version: 1.2.0

## Usage

Expand Down Expand Up @@ -48,7 +48,12 @@ For HFS file systems, files are exported from the disk image using CLI version o

For UDF file systems, files are copied from the mounted disk image and `walk_to_dfxml.py` is used to generate DFXML.

When complete, a "description.csv" spreadsheet is created containing some pre-populated archival description:
When complete, a description spreadsheet will be created containing some pre-populated archival description.

From v1.2.0, Disk Image Processor will write this information into an ArchivesSpace description XLSX spreadsheet.

In previous versions or if the `"-c"/"--csv"` option is passed in v1.2.0+, a description.csv file will be created instead, containing the following columns:

* Date statement
* Date begin
* Date end
Expand Down
Binary file added aspace_template/aspace_import_template.xlsx
Binary file not shown.
189 changes: 187 additions & 2 deletions diskimageprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,10 @@
import datetime
import itertools
import logging
import openpyxl
import os
import shutil
import stat
import subprocess
import sys
import time
Expand Down Expand Up @@ -256,6 +258,174 @@ def create_spreadsheet(args, sips, volumes, logger):
logger.info("Description CSV created.")


def create_aspace_excel_sheet(args, sips, volumes, logger):
    """Create new copy of ASpace XLSX and append rows describing disk images.

    The bundled ArchivesSpace import template is copied to
    ``<args.destination>/description.xlsx`` and one row is appended to its
    "Data" worksheet for every SIP directory found in ``sips``.

    Args:
        args: Parsed command-line arguments. Only ``args.destination`` (output
            directory) and ``args.bagfiles`` (whether SIP contents live under
            a ``data`` subdirectory) are read here.
        sips: Path of the directory containing one subdirectory per SIP.
        volumes: Mapping of SIP directory name to a list of volume dicts, each
            of which has a ``"file_system"`` key.
        logger: Logger used for progress, warning and error messages.

    Returns:
        None. If the template cannot be copied to the destination, the error
        is logged and the function returns without writing a spreadsheet.
    """
    xlsx_path = os.path.abspath(os.path.join(args.destination, "description.xlsx"))
    template_path = os.path.abspath(
        os.path.join(THIS_DIR, "aspace_template", "aspace_import_template.xlsx")
    )

    try:
        shutil.copyfile(template_path, xlsx_path)
    except OSError as err:
        logger.error(f"Unable to copy ASpace template to destination: {err}")
        # Without a fresh copy of the template there is nothing valid to
        # append to; continuing could fail later with a confusing error or
        # write into a stale description.xlsx left over from an earlier run.
        return

    # Set ASpace file permissions (rw for user and group, read-only for others)
    try:
        os.chmod(
            xlsx_path,
            stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH,
        )
    except OSError as err:
        # Best effort: the spreadsheet is still usable with default permissions
        logger.error(f"Error setting permissions: {err}")

    workbook = openpyxl.load_workbook(filename=xlsx_path)
    worksheet = workbook["Data"]

    # One cell per template column; every row starts as all empty strings
    TEMPLATE_COLUMN_COUNT = 173

    # 0-based positions within a row of the fields to write
    INDEX_FILENAME = 6
    INDEX_LEVEL_OF_DESCRIPTION = 8
    INDEX_DATE_START = 23
    INDEX_DATE_END = 24
    INDEX_EXTENT_NUMBER = 34
    INDEX_EXTENT_TYPE = 35
    INDEX_SIZE = 36
    INDEX_SCOPE_CONTENTS = 170

    # TODO: Deduplicate with create_spreadsheet
    # Maybe create separate method that creates dict with info, and handle
    # opening/writing csv or xlsx separately
    for item in sorted(os.listdir(sips)):
        sip_path = os.path.join(sips, item)

        if not os.path.isdir(sip_path):
            continue

        disk_volumes = volumes[item]
        number_volumes = len(disk_volumes)

        # Bagged SIPs keep their payload (including metadata) under "data"
        metadata_parent = os.path.join(sip_path, "data") if args.bagfiles else sip_path
        subdoc_dir = os.path.join(
            metadata_parent, "metadata", "submissionDocumentation"
        )

        # Get and sum information from all DFXML files generated
        dfxml_files = [
            os.path.join(root, name)
            for root, _, files in os.walk(subdoc_dir)
            for name in files
            if name.startswith("dfxml")
        ]

        dfxml_files_info = []
        for dfxml_file in dfxml_files:
            dfxml_info = _parse_dfxml(dfxml_file, logger)
            if not dfxml_info:
                logger.warning(
                    f"No fileobjects in DFXML file {dfxml_file} - possibly file system fiwalk doesn't recognize"
                )
                continue
            dfxml_files_info.append(dfxml_info)

        file_count = sum(info["files"] for info in dfxml_files_info)
        total_bytes = sum(info["bytes"] for info in dfxml_files_info)

        # Deduplicate file systems while preserving first-seen order
        file_systems = list(
            dict.fromkeys(volume["file_system"] for volume in disk_volumes)
        )
        file_systems_str = ", ".join(file_systems)

        # Widest date range across all parsed DFXML files
        date_earliest = ""
        date_latest = ""
        for info in dfxml_files_info:
            if not date_earliest or info["date_earliest"] < date_earliest:
                date_earliest = info["date_earliest"]
            if not date_latest or info["date_latest"] > date_latest:
                date_latest = info["date_latest"]

        # Create list with empty string for each of template's columns
        row_to_write = [""] * TEMPLATE_COLUMN_COUNT

        # Fields that are always constant
        row_to_write[INDEX_FILENAME] = item
        row_to_write[INDEX_LEVEL_OF_DESCRIPTION] = "File"

        if file_count == 0:
            # Still record the SIP so the gap is visible in the spreadsheet
            row_to_write[
                INDEX_SCOPE_CONTENTS
            ] = "Error gathering statistics from SIP directory"

            worksheet.append(row_to_write)

            logger.error("Unable to read DFXML files for {}".format(sip_path))
            continue

        # Get file formats from Brunnhilde (first five data rows of formats.csv)
        file_formats = []
        file_format_csv = os.path.join(
            subdoc_dir, "brunnhilde", "csv_reports", "formats.csv"
        )

        try:
            # newline="" is what the csv module expects for files it reads
            with open(file_format_csv, "r", newline="") as f:
                reader = csv.reader(f)
                next(reader)  # skip header row
                for row in itertools.islice(reader, 5):
                    file_formats.append(row[0])
        except (OSError, StopIteration, IndexError, csv.Error, UnicodeDecodeError):
            # Missing/unreadable file, empty file, blank row or malformed CSV:
            # note the problem in the description rather than aborting.
            file_formats.append(
                "ERROR! No Brunnhilde formats.csv file to pull formats from."
            )

        # Empty format names are reported as "Unidentified"
        file_formats = [element or "Unidentified" for element in file_formats]
        file_formats_str = ", ".join(file_formats)

        if number_volumes > 1:
            scope_content = "Files exported from {} volumes with file systems: {}. File formats: {}".format(
                number_volumes, file_systems_str, file_formats_str
            )
        else:
            scope_content = (
                "Files exported from {} file system volume. File formats: {}".format(
                    disk_volumes[0]["file_system"], file_formats_str
                )
            )

        # Only the first four characters of each date are written
        row_to_write[INDEX_DATE_START] = str(date_earliest[:4])
        row_to_write[INDEX_DATE_END] = str(date_latest[:4])
        row_to_write[INDEX_EXTENT_NUMBER] = str(file_count)
        row_to_write[INDEX_EXTENT_TYPE] = "digital files"
        row_to_write[INDEX_SIZE] = str(human_readable_size(total_bytes))
        row_to_write[INDEX_SCOPE_CONTENTS] = scope_content

        worksheet.append(row_to_write)

        logger.info("Described %s successfully." % (sip_path))

    workbook.save(filename=xlsx_path)
    workbook.close()

    logger.info("ArchivesSpace description XLSX created.")


def _parse_dfxml(dfxml_path, logger, export_all=False):
"""Parse DFXML and return dict of information for spreadsheet."""
volume_info = {
Expand Down Expand Up @@ -423,6 +593,12 @@ def _make_parser():
help="Export AppleDouble resource forks from HFS-formatted disks",
action="store_true",
)
parser.add_argument(
"-c",
"--csv",
help="Write description CSV (old default) instead of ArchivesSpace XLSX",
action="store_true",
)
parser.add_argument("--quiet", action="store_true", help="Write only errors to log")
parser.add_argument(
"source", help="Source directory containing disk images (and related files)"
Expand Down Expand Up @@ -563,8 +739,17 @@ def main():
shell=True,
)

# write description CSV
create_spreadsheet(args, sips, volumes, logger)
# write description
if args.csv:
try:
create_spreadsheet(args, sips, volumes, logger)
except Exception as err:
logger.error(f"Error creating description csv: {err}")
else:
try:
create_aspace_excel_sheet(args, sips, volumes, logger)
except Exception as err:
logger.error(f"Error creating ArchivesSpace description xlsx: {err}")

# print unprocessed list
if unprocessed:
Expand Down
5 changes: 5 additions & 0 deletions install-bc2-ubuntu18.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ sudo cp LICENSE $dip_dir
sudo cp README.md $dip_dir
sudo cp -r disk_image_toolkit/ $dip_dir

if [ ! -d $dip_dir/aspace_template ]; then
sudo mkdir $dip_dir/aspace_template
fi
sudo cp aspace_template/aspace_import_template.xlsx $dip_dir/aspace_template

if [ ! -d $dip_dir/disk_image_toolkit/dfxml ]; then
sudo mkdir $dip_dir/disk_image_toolkit/dfxml/
fi
Expand Down
8 changes: 8 additions & 0 deletions install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

### Install script for CCA Disk Image Processor in Bitcurator 4/Ubuntu 22

sudo python3 -m pip install pyqt5
sudo python3 -m pip install -r requirements/base.txt

if [ ! -d /usr/share/ccatools ]; then
sudo mkdir /usr/share/ccatools
fi
Expand Down Expand Up @@ -30,6 +33,11 @@ sudo cp LICENSE $dip_dir
sudo cp README.md $dip_dir
sudo cp -r disk_image_toolkit/ $dip_dir

if [ ! -d $dip_dir/aspace_template ]; then
sudo mkdir $dip_dir/aspace_template
fi
sudo cp aspace_template/aspace_import_template.xlsx $dip_dir/aspace_template

if [ ! -d $dip_dir/disk_image_toolkit/dfxml ]; then
sudo mkdir $dip_dir/disk_image_toolkit/dfxml/
fi
Expand Down
2 changes: 1 addition & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def about_dialog(self):
QMessageBox.information(
self,
"About",
"Disk Image Processor v1.1.1\nCanadian Centre for Architecture\nDeveloper: Tessa Walsh\n2018-2023\nMIT License\nhttps://github.com/CCA-Public/cca-diskimageprocessor",
"Disk Image Processor v1.2.0\nCanadian Centre for Architecture\nDeveloper: Tessa Walsh\n2018-2023\nMIT License\nhttps://github.com/CCA-Public/cca-diskimageprocessor",
)

def browse_analysis_source(self):
Expand Down
1 change: 1 addition & 0 deletions requirements/base.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
bagit>=1.7.0
brunnhilde>=1.9.1
openpyxl>=3.1.2
5 changes: 5 additions & 0 deletions test-install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ sudo cp disk_image_toolkit/dfxml/dfxml.py /usr/share/ccatools/diskimageprocessor
sudo cp disk_image_toolkit/dfxml/objects.py /usr/share/ccatools/diskimageprocessor
sudo cp disk_image_toolkit/dfxml/walk_to_dfxml.py /usr/share/ccatools/diskimageprocessor

if [ ! -d /usr/share/ccatools/diskimageprocessor/aspace_template ]; then
sudo mkdir /usr/share/ccatools/diskimageprocessor/aspace_template
fi
sudo cp aspace_template/aspace_import_template.xlsx /usr/share/ccatools/diskimageprocessor/aspace_template

sudo cp disk_image_toolkit/dfxml/dfxml.py .
sudo cp disk_image_toolkit/dfxml/objects.py .
sudo cp disk_image_toolkit/dfxml/walk_to_dfxml.py .

0 comments on commit a89e198

Please sign in to comment.