diff --git a/.gitignore b/.gitignore index 7aa4ac9..f386bf2 100644 --- a/.gitignore +++ b/.gitignore @@ -88,7 +88,12 @@ ENV/ # Rope project settings .ropeproject -# CUBE and h5 files -*.cube -*.h5 +# gedit temp/bak files +*.rst~ +*.py~ +.gitignore~ + +# Misc .bak files +*.bak + diff --git a/CHANGELOG.txt b/CHANGELOG.txt new file mode 100644 index 0000000..87dd704 --- /dev/null +++ b/CHANGELOG.txt @@ -0,0 +1,34 @@ +Changelog for h5cube + + +v0.1 +================================================================================ + +Initial beta release, without documentation or a test suite. The functionality +below is believed to be working and substantially bug-free. + +File extensions are fixed: + + - .h5cube files (case insensitive) are decompressed + - .cube and .cub files (case insensitive) are compressed + +An existing output file is always overwritten. No options are available for +selecting/changing the name of the output file. + +* General options + --delete, optionally delete the source file after (de)compression + +* Compression options + --compress, gzip compression level within the HDF5 file + --truncate, truncated precision of the log-10 mantissa of each data value + * Thresholding options + --absolute / --signed, whether indicated threshold values are applied to + the signed value or the absolute magnitude + --minmax / --isofactor, whether the threshold values are specified by + explicit min/max values, or a central isovalue + and a multiplicative factor + +* Decompression options + --precision, the number of digits past the decimal point output for + each data point + diff --git a/LICENSE b/LICENSE.txt similarity index 100% rename from LICENSE rename to LICENSE.txt diff --git a/README.md b/README.md deleted file mode 100644 index d5135dd..0000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# h5cube -Gaussian CUBE file compression via h5py binary storage diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..38ad5ea --- /dev/null +++ 
b/README.rst @@ -0,0 +1,19 @@ +Gaussian CUBE Compression via h5py +================================== + +Compression/decompression command-line tool and Python package for +Gaussian CUBE files, exploiting the capabilities of the +`HDF5 <https://www.hdfgroup.org/solutions/hdf5/>`__ binary format via ``h5py``. + +Available on `PyPI <https://pypi.org/project/h5cube/>`__ +(``pip install h5cube``). + +Source on `GitHub <https://www.github.com/bskinn/h5cube>`__. + +Documentation at Read the Docs: + +.. image:: https://readthedocs.org/projects/h5cube/badge/?version=latest + :target: http://h5cube.readthedocs.io/en/latest/?badge=latest + :alt: Documentation Status + + diff --git a/doc/source/_static/Bourke_Gaussian_Cube_Files.pdf b/doc/source/_static/Bourke_Gaussian_Cube_Files.pdf new file mode 100644 index 0000000..fc93157 Binary files /dev/null and b/doc/source/_static/Bourke_Gaussian_Cube_Files.pdf differ diff --git a/doc/source/conf.py b/doc/source/conf.py index 5869742..24c35f6 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -25,7 +25,7 @@ # If your documentation needs a minimal Sphinx version, state it here. # -# needs_sphinx = '1.0' +needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom @@ -65,9 +65,9 @@ # built documents. # # The short X.Y version. -version = '0.0' +version = '0.1' # The full version, including alpha/beta/rc tags. -release = '0.0' +release = '0.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -127,7 +127,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'alabaster' +html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. 
For a list of options available for each theme, see the @@ -325,7 +325,7 @@ # dir menu entry, description, category) texinfo_documents = [ (master_doc, 'h5cube', 'h5cube Documentation', - author, 'h5cube', 'One line description of project.', + author, 'h5cube', 'Gaussian CUBE file compression via h5py', 'Miscellaneous'), ] @@ -347,4 +347,5 @@ # Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'https://docs.python.org/': None} +intersphinx_mapping = {'python': ('https://docs.python.org/3.5', None)} + diff --git a/doc/source/index.rst b/doc/source/index.rst index 51e28ff..233c9d1 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -6,6 +6,8 @@ Welcome to h5cube's documentation! ================================== +*Pending...* + Contents: .. toctree:: diff --git a/h5cube/__init__.py b/h5cube/__init__.py new file mode 100644 index 0000000..dab2d7c --- /dev/null +++ b/h5cube/__init__.py @@ -0,0 +1,21 @@ +# ------------------------------------------------------------------------------ +# Name: init +# Purpose: Package information for h5cube +# +# Author: Brian Skinn +# bskinn@alum.mit.edu +# +# Created: 22 Aug 2016 +# Copyright: (c) Brian Skinn 2016 +# License: The MIT License; see "license.txt" for full license terms +# and contributor agreement. 
+# +# http://www.github.com/bskinn/h5cube +# +# ------------------------------------------------------------------------------ + +from __future__ import absolute_import + +from .h5cube import cube_to_h5, h5_to_cube, H5, EXIT + +__version__ = '0.1' diff --git a/h5cube/h5cube.py b/h5cube/h5cube.py index 370c7bc..e45d352 100644 --- a/h5cube/h5cube.py +++ b/h5cube/h5cube.py @@ -1,50 +1,121 @@ -import os -import sys +# ------------------------------------------------------------------------------ +# Name: h5cube +# Purpose: Script for h5py (de)compression of Gaussian CUBE files +# +# Author: Brian Skinn +# bskinn@alum.mit.edu +# +# Created: 20 Aug 2016 +# Copyright: (c) Brian Skinn 2016 +# License: The MIT License; see "license.txt" for full license terms +# and contributor agreement. +# +# http://www.github.com/bskinn/h5cube +# +# ------------------------------------------------------------------------------ -COMMENT1 = 'COMMENT1' -COMMENT2 = 'COMMENT2' -NATOMS = 'NATOMS' -ORIGIN = 'ORIGIN' -XAXIS = 'XAXIS' -YAXIS = 'YAXIS' -ZAXIS = 'ZAXIS' -GEOM = 'GEOM' -SIGNS = 'SIGNS' -LOGDATA = 'LOGDATA' +# Argparse constants +class AP(object): + PATH = 'path' + DELETE = 'delete' + COMPRESS = 'compress' + TRUNC = 'truncate' + PREC = 'precision' + ABSMODE = 'absolute' + SIGNMODE = 'signed' + NOTHRESH = 'nothresh' + MINMAX = 'minmax' + ISOFACTOR = 'isofactor' -def cube_to_h5(cubepath): +# h5py constants +class H5(object): + COMMENT1 = 'COMMENT1' + COMMENT2 = 'COMMENT2' + NATOMS = 'NATOMS' + ORIGIN = 'ORIGIN' + XAXIS = 'XAXIS' + YAXIS = 'YAXIS' + ZAXIS = 'ZAXIS' + GEOM = 'GEOM' + SIGNS = 'SIGNS' + LOGDATA = 'LOGDATA' + +# Default values +class DEF(object): + TRUNC = 5 + PREC = 5 + COMP = 9 + DEL = False + THRESH = False + +# Exit codes +class EXIT(object): + GENERIC = 1 + CMDLINE = 2 + FILEREAD = 4 + FILEWRITE = 8 + +def _exp_format(val, prec): + """ [Docstring] + + """ + + # Convert val using string formatting: Always a leading space; + # positive values with another leading space; 
negatives with the negative + # sign; one digit in front of the decimal, 'dec' digits after. + # Capital 'E' for the exponent. + out = " {{: #1.{0}E}}".format(prec).format(val) + + # Return the results + return out + + +def cube_to_h5(cubepath, *, delsrc=DEF.DEL, comp=DEF.COMP, trunc=DEF.TRUNC, + thresh=DEF.THRESH, signed=None, minmax=None, isofactor=None): + """ [Docstring] + + """ import h5py as h5 import itertools as itt import numpy as np + import os import re + # Default compression and truncations, if no value(s) passed on commandline + if comp is None: + comp = DEF.COMP + if trunc is None: + trunc = DEF.TRUNC + + # Pull the file contents and make an iterator with open(cubepath) as f: filedata = f.read() - datalines = iter(filedata.splitlines()) - h5path = "{0}.h5".format(cubepath) + # Construct the .h5cube path + h5path = os.path.splitext(cubepath)[0] + '.h5cube' + + # Clobber to new file if os.path.isfile(h5path): os.remove(h5path) - hf = h5.File(h5path) # Comment lines - hf.create_dataset(COMMENT1, data=next(datalines)) - hf.create_dataset(COMMENT2, data=next(datalines)) + hf.create_dataset(H5.COMMENT1, data=next(datalines)) + hf.create_dataset(H5.COMMENT2, data=next(datalines)) # Number of atoms and origin elements = iter(next(datalines).split()) natoms = abs(int(next(elements))) - hf.create_dataset(NATOMS, data=natoms) - hf.create_dataset(ORIGIN, data=np.array([float(next(elements)) + hf.create_dataset(H5.NATOMS, data=natoms) + hf.create_dataset(H5.ORIGIN, data=np.array([float(next(elements)) for i in range(3)])) # Dimensions and vectors dims = [] - for dsname in [XAXIS, YAXIS, ZAXIS]: + for dsname in [H5.XAXIS, H5.YAXIS, H5.ZAXIS]: elements = iter(next(datalines).split()) hf.create_dataset(dsname, data=np.array([float(next(elements)) for i in range(4)])) @@ -58,7 +129,7 @@ def cube_to_h5(cubepath): for j in range(5): geom[i, j] = elements[j] - hf.create_dataset(GEOM, data=geom) + hf.create_dataset(H5.GEOM, data=geom) # Volumetric field data # Create one 
big iterator over a scientific notation regular @@ -69,19 +140,29 @@ def cube_to_h5(cubepath): -? # Optional leading negative sign \\d # Single leading digit \\. # Decimal point - \\d+ # Multiple digits + \\d* # Digits (could be none, zero precision) [de] # Accept either 0.000d00 or 0.000e00 [+-] # Sign of the exponent \\d+ # Digits of the exponent """, re.X | re.I) # Agglomerated iterator - dataiter = itt.chain(*(p_scinot.finditer(l) for l in datalines)) + dataiter = itt.chain.from_iterable([p_scinot.finditer(l) + for l in datalines]) # Initialize the numpy objects logdataarr = np.zeros(dims) signsarr = np.zeros(dims) + # Preassign the calculated minmax values if isofactored thresh + # is enabled + if thresh and isofactor is not None: + # Populate minmax with the isovalue/factor based + # threshold values + minmax = np.zeros((2,)) + minmax[0] = isofactor[0] / isofactor[1] + minmax[1] = isofactor[0] * isofactor[1] + # Loop over the respective dimensions for x in range(dims[0]): for y in range(dims[1]): @@ -91,6 +172,19 @@ def cube_to_h5(cubepath): except StopIteration as e: raise ValueError("Insufficient data in CUBE file") from e + # Threshold, if indicated + if thresh: + if signed: + if val < minmax[0]: + val = minmax[0] + elif val > minmax[1]: + val = minmax[1] + else: + if np.abs(val) < minmax[0]: + val = np.sign(val) * minmax[0] + elif np.abs(val) > minmax[1]: + val = np.sign(val) * minmax[1] + signsarr[x, y, z] = np.sign(val) logdataarr[x, y, z] = np.log10(np.abs(val)) @@ -103,26 +197,343 @@ def cube_to_h5(cubepath): raise ValueError("CUBE file dataset not exhausted") # Store the arrays, compressed - hf.create_dataset(LOGDATA, data=logdataarr, compression="gzip", - compression_opts=9, shuffle=True, scaleoffset=5) - hf.create_dataset(SIGNS, data=signsarr, compression="gzip", - compression_opts=9, shuffle=True) + hf.create_dataset(H5.LOGDATA, data=logdataarr, compression="gzip", + compression_opts=comp, shuffle=True, scaleoffset=trunc) + 
hf.create_dataset(H5.SIGNS, data=signsarr, compression="gzip", + compression_opts=comp, shuffle=True) # Close the h5 file hf.close() + # If indicated, delete the source file + if delsrc: + os.remove(cubepath) -def h5_to_cube(path): - pass +def h5_to_cube(h5path, *, delsrc=DEF.DEL, prec=DEF.PREC): + """ [Docstring] -if __name__ == '__main__': + Less error/syntax checking here since presumably the data was + parsed for validity when the .h5cube file was created. + """ + + import h5py as h5 + import os + + # Default precision value, if no value passed on commandline + if prec is None: + prec = DEF.PREC + + # Define the header block substitution strings + hdr_3val = "{:5d} {: 1.6f} {: 1.6f} {: 1.6f}" + hdr_4val = "{:5d} {: 1.6f} {: 1.6f} {: 1.6f} {: 1.6f}" + + # Define the uncompressed filename + cubepath = os.path.splitext(h5path)[0] + '.cube' + + # Open the source file + hf = h5.File(h5path) + + # Delete any existing output file + if os.path.isfile(cubepath): + os.remove(cubepath) + + # Open the output file for writing as a context manager + with open(cubepath, 'w') as f: + # Write the two comment lines + f.write(hf[H5.COMMENT1].value + '\n') + f.write(hf[H5.COMMENT2].value + '\n') + + # Write the number-of-atoms and system origin line + natoms = hf[H5.NATOMS].value + f.write(hdr_3val.format(natoms, *(hf[H5.ORIGIN].value)) + '\n') + + # Write the three axes lines + dims = [] + for dsname in [H5.XAXIS, H5.YAXIS, H5.ZAXIS]: + ds = hf[dsname].value + f.write(hdr_3val.format(int(ds[0]), *ds[1:]) + '\n') + dims.append(int(ds[0])) + + # Write the geometry + geom = hf[H5.GEOM].value + for i in range(natoms): + f.write(hdr_4val.format(int(geom[i,0]), *geom[i,1:]) + '\n') + + # Write the data blocks + signs = hf[H5.SIGNS].value + logvals = hf[H5.LOGDATA].value + for x in range(dims[0]): + for y in range(dims[1]): + for z in range(dims[2]): + f.write(_exp_format(signs[x, y, z] * + 10.**logvals[x, y, z], prec)) + if z % 6 == 5: + f.write('\n') + + f.write('\n') + + # Close the 
h5 file + hf.close() + + # If indicated, delete the source file + if delsrc: + os.remove(h5path) + +def _validate_minmax(minmax, signed): + """ [Docstring] + + """ + + import argparse as ap + import sys + + if minmax[0] >= minmax[1]: + print("Error: 'max' is not greater than 'min'") + sys.exit(EXIT.CMDLINE) + + if not signed and minmax[0] < 0: + print("Error: Negative 'min' in absolute thresholding mode") + sys.exit(EXIT.CMDLINE) + + +def _validate_isofactor(isofactor, signed): + """ [Docstring] + + """ + + import argparse as ap + import sys + + if isofactor[0] == 0.0: + print("Error: 'isovalue' cannot be zero") + sys.exit(EXIT.CMDLINE) + + if isofactor[1] <= 1.0: + print("Error: 'factor' must be greater than one") + sys.exit(EXIT.CMDLINE) - path = sys.argv[1] + if not signed and isofactor[0] < 0: + print("Error: Negative 'isovalue' in absolute thresholding mode") + sys.exit(EXIT.CMDLINE) + + +def _get_parser(): + """ [Docstring] + + """ + + import argparse as ap + + # Core parser + prs = ap.ArgumentParser(description="Gaussian CUBE (de)compression " + "via h5py") + + # Compression group + gp_comp = prs.add_argument_group(title="compression options") + + # Thresholding "subgroups" within compression + gp_threshmode = prs.add_argument_group(title="compression thresholding " + "mode (mutually exclusive)") + gp_threshvals = prs.add_argument_group(title="compression thresholding " + "values (mutually exclusive)") + + # Decompression group + gp_decomp = prs.add_argument_group(title="decompression options") + + + + # Mutually exclusive subgroups for the compression operation + meg_threshmode = gp_threshmode.add_mutually_exclusive_group() + meg_threshvals = gp_threshvals.add_mutually_exclusive_group() + + # Argument for the filename (core parser) + prs.add_argument(AP.PATH, action='store', + help="path to .(h5)cube file to be (de)compressed") + + # Argument to delete the source file; default is to keep (core) + prs.add_argument('-{0}'.format(AP.DELETE[0]), 
'--{0}'.format(AP.DELETE), + action='store_true', + help="delete the source file after (de)compression") + + # gzip compression level (compress) + gp_comp.add_argument('-{0}'.format(AP.COMPRESS[0]), + '--{0}'.format(AP.COMPRESS), + action='store', default=None, type=int, + choices=list(range(10)), + metavar='#', + help="gzip compression level for volumetric " + "data (0-9, default {0})".format(DEF.COMP)) + + # gzip truncation level (compress) + gp_comp.add_argument('-{0}'.format(AP.TRUNC[0]), + '--{0}'.format(AP.TRUNC), + action='store', default=None, type=int, + choices=list(range(1,16)), + metavar='#', + help="gzip truncation width for volumetric " + "data (1-15, default {0})".format(DEF.TRUNC)) + + # Absolute thresholding mode (compress -- threshold mode) + meg_threshmode.add_argument('-{0}'.format(AP.ABSMODE[0]), + '--{0}'.format(AP.ABSMODE), + action='store_true', + help="absolute-value thresholding " + "mode (default if -{0} or -{1} " + "specified".format(AP.MINMAX[0], + AP.ISOFACTOR[0])) + + # Signed thresholding mode (compress -- threshold mode) + meg_threshmode.add_argument('-{0}'.format(AP.SIGNMODE[0]), + '--{0}'.format(AP.SIGNMODE), + action='store_true', + help="signed-value thresholding " + "mode") + + # Thresholding mode disabled (compress -- threshold mode) + meg_threshmode.add_argument('-{0}'.format(AP.NOTHRESH[0]), + '--{0}'.format(AP.NOTHRESH), + action='store_true', + help="thresholding disabled (default)") + + + # Min/max threshold specification (compress -- threshold values) + meg_threshvals.add_argument('-{0}'.format(AP.MINMAX[0]), + '--{0}'.format(AP.MINMAX), + action='store', + default=None, + nargs=2, + metavar='#', + help="min and max values for " + "threshold specification") + + # Isovalue/factor threshold specification (compress -- threshold values) + meg_threshvals.add_argument('-{0}'.format(AP.ISOFACTOR[0]), + '--{0}'.format(AP.ISOFACTOR), + action='store', + default=None, + nargs=2, + metavar='#', + help="Isovalue and multiplicative " 
+ "factor values for " + "threshold specification") + + # Data block output precision (decompress) + gp_decomp.add_argument('-{0}'.format(AP.PREC[0]), + '--{0}'.format(AP.PREC), + action='store', default=None, type=int, + choices=list(range(16)), + metavar='#', + help="volumetric data block output " + "precision (0-15, " + "default {0})".format(DEF.PREC)) + return prs + + +def main(): + + import argparse as ap + import numpy as np + import os + import sys + + # Retrieve the argument parser + prs = _get_parser() + + # Parse known args, convert to dict, and leave unknown args in sys.argv + ns, args_left = prs.parse_known_args() + params = vars(ns) + sys.argv = sys.argv[:1] + args_left + + # Retrieve path and file extension + path = params[AP.PATH] ext = os.path.splitext(path)[1] - if ext == '.h5': - h5_to_cube(path) - elif ext == '.cube': - cube_to_h5(path) + # Check for existence + if not os.path.isfile(path): + print("File not found. Exiting...") + sys.exit(EXIT.FILEREAD) + + # Retrieve other parameters + delsrc = params[AP.DELETE] + comp = params[AP.COMPRESS] + trunc = params[AP.TRUNC] + prec = params[AP.PREC] + absolute = params[AP.ABSMODE] + signed = params[AP.SIGNMODE] + nothresh = params[AP.NOTHRESH] + minmax = params[AP.MINMAX] + isofactor = params[AP.ISOFACTOR] + + # Composite indicators for which types of arguments passed + def notNoneFalse(x): + return x is not None and x is not False + + compargs = any(map(notNoneFalse, [comp, trunc, absolute, + signed, nothresh, + minmax, isofactor])) + + decompargs = any(map(notNoneFalse, [prec])) + + # Complain if nothresh specified but minmax or isofactor provided + if nothresh and not (minmax is None and isofactor is None): + print("Error: Thresholding parameter specified with --nothresh") + sys.exit(EXIT.CMDLINE) + + # Complain if compression and decompression arguments mixed + if compargs and decompargs: + print("Error: Both compression and decompression options specified") + sys.exit(EXIT.CMDLINE) + + # Convert 
and validate the thresholding inputs + if minmax is not None: + minmax = np.float_(minmax) + _validate_minmax(minmax, signed) + if isofactor is not None: + isofactor = np.float_(isofactor) + _validate_isofactor(isofactor, signed) + + # Complain if a thresholding mode is indicated but no + # threshold values are provided + if (absolute or signed) and (minmax is None and isofactor is None): + print("Error: Thresholding mode specified but no values provided") + sys.exit(EXIT.CMDLINE) + + # Check file extension as indication of execution mode + if ext == '.h5cube': + # Decompression mode + if compargs: + print("Error: compression arguments passed to " + "decompression operation") + sys.exit(EXIT.CMDLINE) + + h5_to_cube(path, delsrc=delsrc, prec=prec) + + elif ext in ['.cube', '.cub']: + # Compression mode + if decompargs: + print("Error: decompression arguments passed to " + "compression operation") + sys.exit(EXIT.CMDLINE) + + if minmax is not None: + # Min/max thresholding + cube_to_h5(path, delsrc=delsrc, comp=comp, trunc=trunc, + thresh=True, signed=signed, minmax=minmax) + elif isofactor is not None: + # Isovalue thresholding + cube_to_h5(path, delsrc=delsrc, comp=comp, trunc=trunc, + thresh=True, signed=signed, isofactor=isofactor) + else: + # No thresholding + cube_to_h5(path, thresh=False, delsrc=delsrc, comp=comp, + trunc=trunc) + + else: + print("File extension not recognized. 
Exiting...") + sys.exit(EXIT.CMDLINE) + + +if __name__ == '__main__': + main() diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..3ead813 --- /dev/null +++ b/setup.py @@ -0,0 +1,32 @@ +from setuptools import setup + +setup( + name='h5cube', + version='0.1', + requires='h5py (>=2.4)', + packages=['h5cube'], + url='https://www.github.com/bskinn/h5cube', + license='MIT License', + author='Brian Skinn', + author_email='bskinn@alum.mit.edu', + description='Gaussian CUBE File Compression Utility', + classifiers=['License :: OSI Approved :: MIT License', + 'Natural Language :: English', + 'Environment :: Console', + 'Intended Audience :: Science/Research', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3 :: Only', +# 'Programming Language :: Python :: 3.2', +# 'Programming Language :: Python :: 3.3', +# 'Programming Language :: Python :: 3.4', +# 'Programming Language :: Python :: 3.5', + 'Topic :: Scientific/Engineering', + 'Topic :: System :: Archiving :: Compression', + 'Topic :: Utilities', + 'Development Status :: 4 - Beta'], + entry_points={ + 'console_scripts': [ + 'h5cube = h5cube.h5cube:main' + ] + } +)