diff --git a/CHANGES.md b/CHANGES.md
index bfc8c08b..78ead5a9 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,6 +2,14 @@
 
 ## Upcoming release...
 
+## [v0.8.9](https://github.com/mirnylab/cooler/compare/v0.8.8...v0.8.9)
+
+Date : 2020-07-17
+
+### Enhancements
+* Added single-cell cooler file flavor (.scool) (#201)
+
+
 ## [v0.8.8](https://github.com/mirnylab/cooler/compare/v0.8.7...v0.8.8)
 
 Date : 2020-06-23
diff --git a/cooler/__init__.py b/cooler/__init__.py
index d9e636e9..65bcf170 100644
--- a/cooler/__init__.py
+++ b/cooler/__init__.py
@@ -12,7 +12,7 @@
 """
 from ._version import __version__, __format_version__
 from .api import Cooler, annotate
-from .create import create_cooler, rename_chroms
+from .create import create_cooler, rename_chroms, create_scool
 from .reduce import merge_coolers, coarsen_cooler, zoomify_cooler
 from .balance import balance_cooler
 from .util import binnify, read_chromsizes, fetch_chromsizes
diff --git a/cooler/_version.py b/cooler/_version.py
index 7cad5675..d718d4c3 100644
--- a/cooler/_version.py
+++ b/cooler/_version.py
@@ -1,2 +1,4 @@
-__version__ = "0.8.8"
+__version__ = "0.8.9"
 __format_version__ = 3
+__format_version_mcool__ = 2
+__format_version_scool__ = 1
diff --git a/cooler/create/__init__.py b/cooler/create/__init__.py
index 9a37d806..531c29a5 100644
--- a/cooler/create/__init__.py
+++ b/cooler/create/__init__.py
@@ -2,6 +2,9 @@
 import numpy as np
 
 MAGIC = u"HDF5::Cooler"
+MAGIC_SCOOL = u"HDF5::SCOOL"
+MAGIC_MCOOL = u"HDF5::MCOOL"
+
 URL = u"https://github.com/mirnylab/cooler"
 CHROM_DTYPE = np.dtype("S")
 CHROMID_DTYPE = np.int32
@@ -28,4 +31,4 @@
     ContactBinner,
 )
 
-from ._create import create_cooler, create, create_from_unordered, append, rename_chroms
+from ._create import create_cooler, create, create_from_unordered, append, rename_chroms, create_scool
diff --git a/cooler/create/_create.py b/cooler/create/_create.py
index b3e3e2c8..917fef48 100644
--- a/cooler/create/_create.py
+++ b/cooler/create/_create.py
@@ -13,7 +13,7 @@
 import simplejson as json
 import six
 
-from .._version import __version__, __format_version__
+from .._version import __version__, __format_version__, __format_version_scool__
 from .._logging import get_logger
 from ..core import put, get
 from ..util import (
@@ -27,6 +27,7 @@
 from ._ingest import validate_pixels
 from . import (
     MAGIC,
+    MAGIC_SCOOL,
     URL,
     CHROM_DTYPE,
     CHROMID_DTYPE,
@@ -296,7 +297,7 @@ def write_indexes(grp, chrom_offset, bin1_offset, h5opts):
     )
 
 
-def write_info(grp, info):
+def write_info(grp, info, scool=False):
     """
     Write the file description and metadata attributes.
 
@@ -317,13 +318,19 @@
 
     """
     assert "nbins" in info
-    assert "nnz" in info
+    if not scool:
+        assert "nnz" in info
     info.setdefault("genome-assembly", "unknown")
     info["metadata"] = json.dumps(info.get("metadata", {}))
     info["creation-date"] = datetime.now().isoformat()
     info["generated-by"] = six.text_type("cooler-" + __version__)
-    info["format"] = MAGIC
-    info["format-version"] = six.text_type(__format_version__)
+    if scool:
+        info["format"] = MAGIC_SCOOL
+        info["format-version"] = six.text_type(__format_version_scool__)
+    else:
+        info["format"] = MAGIC
+        info["format-version"] = six.text_type(__format_version__)
     info["format-url"] = URL
     grp.attrs.update(info)
 
@@ -439,6 +446,8 @@
     ensure_sorted=False,
     lock=None,
     append=False,
+    append_scool=False,
+    scool_root_uri=None,
     **kwargs
 ):
     """
@@ -470,7 +479,10 @@
             "Note that the `chromsizes` argument is now deprecated: "
            "see documentation for `create`."
         )
-
+    if append_scool and scool_root_uri is None:
+        raise ValueError(
+            "If the parameter `append_scool` is set, the parameter `scool_root_uri` must be defined."
+        )
     dtypes = _get_dtypes_arg(dtypes, kwargs)
 
     for col in ["chrom", "start", "end"]:
@@ -572,23 +584,52 @@ def create(
                 f.create_group(group_path)
 
     # Write chroms, bins and pixels
-    with h5py.File(file_path, "r+") as f:
-        h5 = f[group_path]
+    if append_scool:
+        src_path, src_group = parse_cooler_uri(scool_root_uri)
+        dst_path, dst_group = parse_cooler_uri(cool_uri)
+
+        with h5py.File(src_path, "r+") as src, h5py.File(dst_path, "r+") as dst:
+
+            dst[dst_group]["chroms"] = src["chroms"]
+
+            # Hard link to the root bins table, but only the three main datasets
+            dst[dst_group]["bins/chrom"] = src["bins/chrom"]
+            dst[dst_group]["bins/start"] = src["bins/start"]
+            dst[dst_group]["bins/end"] = src["bins/end"]
+
+            # Create the additional bin columns, e.g. 'weight';
+            # these columns are individual to each cell
+            columns = list(bins.keys())
+            for col in ["chrom", "start", "end"]:
+                columns.remove(col)
+            if columns:
+                put(dst[dst_group]['bins'], bins[columns])
+        with h5py.File(file_path, "r+") as f:
+            h5 = f[group_path]
+            grp = h5.create_group("pixels")
+            if symmetric_upper:
+                max_size = n_bins * (n_bins - 1) // 2 + n_bins
+            else:
+                max_size = n_bins * n_bins
+            prepare_pixels(grp, n_bins, max_size, meta.columns, dict(meta.dtypes), h5opts)
+    else:
+        with h5py.File(file_path, "r+") as f:
+            h5 = f[group_path]
 
-        logger.info("Writing chroms")
-        grp = h5.create_group("chroms")
-        write_chroms(grp, chroms, h5opts)
+            logger.info("Writing chroms")
+            grp = h5.create_group("chroms")
+            write_chroms(grp, chroms, h5opts)
 
-        logger.info("Writing bins")
-        grp = h5.create_group("bins")
-        write_bins(grp, bins, chroms["name"], h5opts)
+            logger.info("Writing bins")
+            grp = h5.create_group("bins")
+            write_bins(grp, bins, chroms["name"], h5opts)
 
-        grp = h5.create_group("pixels")
-        if symmetric_upper:
-            max_size = n_bins * (n_bins - 1) // 2 + n_bins
-        else:
-            max_size = n_bins * n_bins
-        prepare_pixels(grp, n_bins, max_size, meta.columns, dict(meta.dtypes), h5opts)
+            grp = h5.create_group("pixels")
+            if symmetric_upper:
+                max_size = n_bins * (n_bins - 1) // 2 + n_bins
+            else:
+                max_size = n_bins * n_bins
+            prepare_pixels(grp, n_bins, max_size, meta.columns, dict(meta.dtypes), h5opts)
 
     # Multiprocess HDF5 reading is supported only if the same HDF5 file is not
     # open in write mode anywhere. To read and write to the same file, pass a
@@ -629,8 +670,6 @@ def create(
         info["metadata"] = metadata
         write_info(h5, info)
 
-    logger.info("Done")
-
 
 def create_from_unordered(
     cool_uri,
@@ -822,55 +861,7 @@ def append(cool_uri, table, data, chunked=False, force=False, h5opts=None, lock=
                 lock.release()
 
 
-def create_cooler(
-    cool_uri,
-    bins,
-    pixels,
-    columns=None,
-    dtypes=None,
-    metadata=None,
-    assembly=None,
-    ordered=False,
-    symmetric_upper=True,
-    mode=None,
-    mergebuf=int(20e6),
-    delete_temp=True,
-    temp_dir=None,
-    max_merge=200,
-    boundscheck=True,
-    dupcheck=True,
-    triucheck=True,
-    ensure_sorted=False,
-    h5opts=None,
-    lock=None,
-):
-    """
-    Create a cooler from bins and pixels at the specified URI.
-
-    Because the number of pixels is often very large, the input pixels are
-    normally provided as an iterable (e.g., an iterator or generator) of
-    DataFrame **chunks** that fit in memory.
-
-    .. versionadded:: 0.8.0
-
-    Parameters
-    ----------
-    cool_uri : str
-        Path to cooler file or URI string. If the file does not exist,
-        it will be created.
-    bins : pandas.DataFrame
-        Segmentation of the chromosomes into genomic bins as a BED-like
-        DataFrame with columns ``chrom``, ``start`` and ``end``. May contain
-        additional columns.
-    pixels : DataFrame, dictionary, or iterable of either
-        A table, given as a dataframe or a column-oriented dict, containing
-        columns labeled ``bin1_id``, ``bin2_id`` and ``count``, sorted by
-        (``bin1_id``, ``bin2_id``). If additional columns are included in the
-        pixel table, their names and dtypes must be specified using the
-        ``columns`` and ``dtypes`` arguments. For larger input data, an
-        **iterable** can be provided that yields the pixel data as a sequence
-        of chunks. If the input is a dask DataFrame, it will also be processed
-        one chunk at a time.
+_DOC_OTHER_PARAMS = """
     columns : sequence of str, optional
         Customize which value columns from the input pixels to store in the
         cooler. Non-standard value columns will be given dtype ``float64``
@@ -930,12 +921,9 @@ def create_cooler(
     triucheck : bool, optional
         Input validation: Check that ``bin1_id`` <= ``bin2_id`` when creating
        coolers in symmetric-upper mode.
+""".strip()
 
-    See also
-    --------
-    cooler.create.sanitize_records
-    cooler.create.sanitize_pixels
-
+_DOC_NOTES = """
     Notes
     -----
     If the pixel chunks are provided in the correct order required for the
@@ -953,6 +941,75 @@
     Each chunk of pixels will go through a validation pipeline, which can be
     customized with the following options: ``boundscheck``, ``triucheck``,
     ``dupcheck``, ``ensure_sorted``.
+""".strip()
+
+
+def _format_docstring(**kwargs):
+    def decorate(func):
+        func.__doc__ = func.__doc__.format(**kwargs)
+        return func
+    return decorate
+
+
+@_format_docstring(other_parameters=_DOC_OTHER_PARAMS, notes=_DOC_NOTES)
+def create_cooler(
+    cool_uri,
+    bins,
+    pixels,
+    columns=None,
+    dtypes=None,
+    metadata=None,
+    assembly=None,
+    ordered=False,
+    symmetric_upper=True,
+    mode="w",
+    mergebuf=int(20e6),
+    delete_temp=True,
+    temp_dir=None,
+    max_merge=200,
+    boundscheck=True,
+    dupcheck=True,
+    triucheck=True,
+    ensure_sorted=False,
+    h5opts=None,
+    lock=None,
+):
+    r"""
+    Create a cooler from bins and pixels at the specified URI.
+
+    Because the number of pixels is often very large, the input pixels are
+    normally provided as an iterable (e.g., an iterator or generator) of
+    DataFrame **chunks** that fit in memory.
+
+    .. versionadded:: 0.8.0
+
+    Parameters
+    ----------
+    cool_uri : str
+        Path to cooler file or URI string. If the file does not exist,
+        it will be created.
+    bins : pandas.DataFrame
+        Segmentation of the chromosomes into genomic bins as a BED-like
+        DataFrame with columns ``chrom``, ``start`` and ``end``. May contain
+        additional columns.
+    pixels : DataFrame, dictionary, or iterable of either
+        A table, given as a dataframe or a column-oriented dict, containing
+        columns labeled ``bin1_id``, ``bin2_id`` and ``count``, sorted by
+        (``bin1_id``, ``bin2_id``). If additional columns are included in the
+        pixel table, their names and dtypes must be specified using the
+        ``columns`` and ``dtypes`` arguments. For larger input data, an
+        **iterable** can be provided that yields the pixel data as a sequence
+        of chunks. If the input is a dask DataFrame, it will also be processed
+        one chunk at a time.
+    {other_parameters}
+
+    See also
+    --------
+    cooler.create_scool
+    cooler.create.sanitize_records
+    cooler.create.sanitize_pixels
+
+    {notes}
     """
     # dispatch to the approprate creation method
@@ -1000,3 +1057,200 @@ def create_cooler(
         temp_dir=temp_dir,
         max_merge=max_merge,
     )
+
+
+@_format_docstring(other_parameters=_DOC_OTHER_PARAMS, notes=_DOC_NOTES)
+def create_scool(
+    cool_uri,
+    bins,
+    cell_name_pixels_dict,
+    columns=None,
+    dtypes=None,
+    metadata=None,
+    assembly=None,
+    ordered=False,
+    symmetric_upper=True,
+    mode="w",
+    mergebuf=int(20e6),
+    delete_temp=True,
+    temp_dir=None,
+    max_merge=200,
+    boundscheck=True,
+    dupcheck=True,
+    triucheck=True,
+    ensure_sorted=False,
+    h5opts=None,
+    lock=None,
+    **kwargs):
+    r"""
+    Create a single-cell (scool) file.
+
+    For each cell, a cooler matrix is stored under the **/cells** group, and
+    all matrices must have the same dimensions.
+
+    Each cell is a regular cooler data collection, so the input must provide
+    a bin table and a pixel table for each cell. The pixel tables are given
+    as a dictionary keyed by unique cell names. The bin tables can be given
+    as a dict with the same keys, or a single common bin table can be shared
+    by all cells.
+
+    .. versionadded:: 0.8.9
+
+    Parameters
+    ----------
+    cool_uri : str
+        Path to scool file or URI string. If the file does not exist,
+        it will be created.
+    bins : :class:`pandas.DataFrame` or Dict[str, DataFrame]
+        A single bin table or a dictionary of cell names to bin tables. A bin
+        table is a dataframe with columns ``chrom``, ``start`` and ``end``.
+        May contain additional columns.
+    cell_name_pixels_dict : Dict[str, DataFrame]
+        Cell name as key and pixel table DataFrame as value.
+        A table, given as a dataframe or a column-oriented dict, containing
+        columns labeled ``bin1_id``, ``bin2_id`` and ``count``, sorted by
+        (``bin1_id``, ``bin2_id``). If additional columns are included in the
+        pixel table, their names and dtypes must be specified using the
+        ``columns`` and ``dtypes`` arguments. For larger input data, an
+        **iterable** can be provided that yields the pixel data as a sequence
+        of chunks. If the input is a dask DataFrame, it will also be processed
+        one chunk at a time.
+    {other_parameters}
+
+    See also
+    --------
+    cooler.create_cooler
+    cooler.zoomify_cooler
+
+    {notes}
+
+    """
+    file_path, group_path = parse_cooler_uri(cool_uri)
+    h5opts = _set_h5opts(h5opts)
+
+    if isinstance(bins, pd.DataFrame):
+        bins_dict = {cell_name: bins for cell_name in cell_name_pixels_dict}
+        cell_names = sorted(cell_name_pixels_dict)
+    else:
+        # Assume bins is a dict of cell name -> dataframe
+        bins_dict = bins
+        if len(bins_dict) == 0:
+            raise ValueError("At least one bin table must be given.")
+        else:
+            bins = bins_dict[next(iter(bins_dict))][["chrom", "start", "end"]]
+
+        # Sort the keys of both dicts and require them to match exactly
+        bins_keys = sorted(bins_dict)
+        cell_names = sorted(cell_name_pixels_dict)
+        if bins_keys != cell_names:
+            raise ValueError('Bins and pixel dicts do not have matching keys')
+
+    dtypes = _get_dtypes_arg(dtypes, kwargs)
+
+    for col in ["chrom", "start", "end"]:
+        if col not in bins.columns:
+            raise ValueError("Missing column from bin table: '{}'.".format(col))
+
+    # Populate dtypes for expected pixel columns, and apply user overrides.
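+    # dict(PIXEL_DTYPES) supplies the default dtypes for the standard
+    # bin1_id, bin2_id and count columns; user-supplied entries take
+    # precedence via the update() call below.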
+    if dtypes is None:
+        dtypes = dict(PIXEL_DTYPES)
+    else:
+        dtypes_ = dict(dtypes)
+        dtypes = dict(PIXEL_DTYPES)
+        dtypes.update(dtypes_)
+
+    # Prepare chroms and bins
+    bins = bins.copy()
+    bins["chrom"] = bins["chrom"].astype(object)
+    chromsizes = get_chromsizes(bins)
+    try:
+        chromsizes = six.iteritems(chromsizes)
+    except AttributeError:
+        pass
+    chromnames, lengths = zip(*chromsizes)
+    chroms = pd.DataFrame(
+        {"name": chromnames, "length": lengths}, columns=["name", "length"]
+    )
+    binsize = get_binsize(bins)
+    n_chroms = len(chroms)
+    n_bins = len(bins)
+
+    # Create root group
+    with h5py.File(file_path, mode) as f:
+        logger.info('Creating cooler at "{}::{}"'.format(file_path, group_path))
+        if group_path == "/":
+            for name in ["chroms", "bins"]:
+                if name in f:
+                    del f[name]
+        else:
+            try:
+                f.create_group(group_path)
+            except ValueError:
+                del f[group_path]
+                f.create_group(group_path)
+
+    with h5py.File(file_path, "r+") as f:
+        h5 = f[group_path]
+
+        logger.info("Writing chroms")
+        grp = h5.create_group("chroms")
+        write_chroms(grp, chroms, h5opts)
+
+        logger.info("Writing bins")
+        grp = h5.create_group("bins")
+        write_bins(grp, bins, chroms["name"], h5opts)
+
+    with h5py.File(file_path, "r+") as f:
+        h5 = f[group_path]
+
+        logger.info("Writing info")
+        info = {}
+        info["bin-type"] = u"fixed" if binsize is not None else u"variable"
+        info["bin-size"] = binsize if binsize is not None else u"null"
+        info["nchroms"] = n_chroms
+        info["ncells"] = len(cell_name_pixels_dict)
+        info["nbins"] = n_bins
+        if assembly is not None:
+            info["genome-assembly"] = assembly
+        if metadata is not None:
+            info["metadata"] = metadata
+        write_info(h5, info, scool=True)
+
+    # Append single cells
+    for key in cell_names:
+        if '/' in key:
+            cell_name = key.split('/')[-1]
+        else:
+            cell_name = key
+
+        create(
+            cool_uri + '::/cells/' + cell_name,
+            bins_dict[key],
+            cell_name_pixels_dict[key],
+            columns=columns,
+            dtypes=dtypes,
+            metadata=metadata,
+            assembly=assembly,
+            ordered=ordered,
+            symmetric_upper=symmetric_upper,
+            mode='a',
+            boundscheck=boundscheck,
+            dupcheck=dupcheck,
+            triucheck=triucheck,
+            ensure_sorted=ensure_sorted,
+            h5opts=h5opts,
+            lock=lock,
+            mergebuf=mergebuf,
+            delete_temp=delete_temp,
+            temp_dir=temp_dir,
+            max_merge=max_merge,
+            append_scool=True,
+            scool_root_uri=cool_uri
+        )
diff --git a/cooler/fileops.py b/cooler/fileops.py
index 89087bfa..facf2e85 100644
--- a/cooler/fileops.py
+++ b/cooler/fileops.py
@@ -19,7 +19,7 @@
 import h5py
 
 from .util import parse_cooler_uri, natsorted
-from .create import MAGIC, URL
+from .create import MAGIC, URL, MAGIC_SCOOL
 
 __all__ = ["is_cooler", "is_multires_file", "list_coolers", "cp", "mv", "ln"]
 
@@ -104,8 +104,7 @@ def _visititems(node, func, result=None):
 
 def _is_cooler(grp):
     fmt = grp.attrs.get("format", None)
-    url = grp.attrs.get("format-url", None)
-    if fmt == MAGIC or url == URL:
+    if fmt == MAGIC:
         keys = ("chroms", "bins", "pixels", "indexes")
         if not all(name in grp.keys() for name in keys):
             warnings.warn("Cooler path {} appears to be corrupt".format(grp.name))
@@ -117,7 +116,7 @@ def is_cooler(uri):
     """
     Determine if a URI string references a cooler data collection.
    Returns False if the file or group path doesn't exist.
- + """ filepath, grouppath = parse_cooler_uri(uri) if not h5py.is_hdf5(filepath): @@ -147,6 +146,31 @@ def is_multires_file(filepath, min_version=1): return False +def is_scool_file(filepath): + """ + Determine if a file is a single-cell cooler file. + Returns False if the file doesn't exist. + + """ + if not h5py.is_hdf5(filepath): + raise OSError("'{}' is not an HDF5 file.".format(filepath)) + return False + + with h5py.File(filepath) as f: + fmt = f.attrs.get("format", None) + if fmt == MAGIC_SCOOL: + keys = ("chroms", "bins", "cells") + if not all(name in f.keys() for name in keys): + warnings.warn("Scooler path {} appears to be corrupt".format(grp.name)) + return False + if "cells" in f.keys() and len(f["cells"].keys()) > 0: + for cells in f["cells"].keys(): + if not _is_cooler(f["cells"][cells]): + return False + return True + return False + + def list_coolers(filepath): """ List group paths to all cooler data collections in a file. @@ -177,6 +201,37 @@ def _check_cooler(pth, grp): return natsorted(listing) +def list_scool_cells(filepath): + """ + List the paths to all single-cell cool matrices in a file scool file. + + Parameters + ---------- + filepath : str + + Returns + ------- + list + Cooler group paths of all cells in the file. + + """ + if is_scool_file(filepath): + + listing = [] + def _check_cooler(pth, grp): + if _is_cooler(grp): + listing.append("/" + pth if not pth.startswith("/") else pth) + with h5py.File(filepath, "r") as f: + _check_cooler("/", f) + visititems(f, _check_cooler) + if '/' in listing: + listing.remove('/') + return natsorted(listing) + else: + raise OSError("'{}' is not a scool file.".format(filepath)) + return False + + def ls(uri): """ Get all groups and datasets in an HDF5 file. diff --git a/cooler/reduce.py b/cooler/reduce.py index f32e3528..a1671b08 100644 --- a/cooler/reduce.py +++ b/cooler/reduce.py @@ -12,6 +12,7 @@ import numpy as np import h5py +from ._version import __format_version_mcool__ from ._logging import get_logger from .create import ContactBinner, create from .util import parse_cooler_uri, GenomeSegmentation @@ -851,7 +852,10 @@ def zoomify_cooler( ) with h5py.File(outfile, "r+") as fw: - fw.attrs.update({"format": u"HDF5::MCOOL", "format-version": 2}) + fw.attrs.update({ + "format": u"HDF5::MCOOL", + "format-version": __format_version_mcool__ + }) def legacy_zoomify(input_uri, outfile, nproc, chunksize, lock=None): diff --git a/docs/api.rst b/docs/api.rst index 39768066..91bf884b 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -32,6 +32,7 @@ Creation/reduction cooler.merge_coolers cooler.coarsen_cooler cooler.zoomify_cooler + cooler.create_scool Manipulation ~~~~~~~~~~~~ @@ -70,6 +71,8 @@ cooler .. autofunction:: cooler.zoomify_cooler .. autofunction:: cooler.balance_cooler .. autofunction:: cooler.rename_chroms +.. autofunction:: cooler.create_scool + ---- diff --git a/docs/concepts.rst b/docs/concepts.rst index 9941c38a..84fc1931 100644 --- a/docs/concepts.rst +++ b/docs/concepts.rst @@ -213,3 +213,25 @@ The experimental ``read_table`` function can be used to generate a dask datafram Learn more about the `Dask `_ project. +Create a scool file +------------------- + +The creation of a single-cell cooler file is similar to a regular cooler file. Each cell needs to have a name, bin table and a pixel table. +All cells must have the same dimensions, and the bins and pixels needs to be provided as two dicts with the cell names as keys. + + +.. 
+.. code-block:: python
+
+    >>> name_pixel_dict = {'cell1': pixels_cell1, 'cell2': pixels_cell2, 'cell3': pixels_cell3}
+    >>> name_bins_dict = {'cell1': bins_cell1, 'cell2': bins_cell2, 'cell3': bins_cell3}
+    >>> cooler.create_scool('single_cell_cool.scool', name_bins_dict, name_pixel_dict)
+
+To read the content, each individual cell is accessed like a regular cool file.
+
+.. code-block:: python
+
+    >>> content_of_scool = cooler.fileops.list_coolers('single_cell_cool.scool')
+    ['/', '/cells/cell1', '/cells/cell2', '/cells/cell3']
+    >>> c1 = cooler.Cooler('single_cell_cool.scool::cells/cell1')
+    >>> c2 = cooler.Cooler('single_cell_cool.scool::cells/cell2')
+    >>> c3 = cooler.Cooler('single_cell_cool.scool::cells/cell3')
diff --git a/docs/conf.py b/docs/conf.py
index 1721848f..50951eed 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -18,11 +18,14 @@
 import re
 import shlex
 
+
+# -- Path setup --------------------------------------------------------------
+
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.insert(0, os.path.abspath('.'))
-sys.path.insert(0, os.path.abspath('..'))
+# sys.path.insert(0, os.path.abspath('.'))
+sys.path.insert(0, os.path.abspath(".."))
 
 # autodoc_mock_imports = [
 #     'numpy',
@@ -33,293 +36,259 @@
 #     'cytoolz',
 # ]
 import mock
+
 MOCK_MODULES = [
-    'numpy',
-    'scipy',
-    'scipy.sparse',
-    'pandas',
-    'pandas.algos',
-    'pandas.api',
-    'pandas.api.types',
-    'h5py',
-    'dask',
-    'dask.base',
-    'dask.array',
-    'dask.dataframe',
-    'dask.dataframe.core',
-    'dask.dataframe.utils',
-    'simplejson',
+    "numpy",
+    "scipy",
+    "scipy.sparse",
+    "pandas",
+    "pandas.algos",
+    "pandas.api",
+    "pandas.api.types",
+    "h5py",
+    "dask",
+    "dask.base",
+    "dask.array",
+    "dask.dataframe",
+    "dask.dataframe.core",
+    "dask.dataframe.utils",
+    "simplejson",
 ]
 for mod_name in MOCK_MODULES:
     sys.modules[mod_name] = mock.Mock()
 
-# -- General configuration ------------------------------------------------
-
-# If your documentation needs a minimal Sphinx version, state it here.
-#needs_sphinx = '1.0'
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
-    'sphinx.ext.autodoc',
-    'sphinx.ext.doctest',
-    'sphinx.ext.todo',
-    'sphinx.ext.coverage',
-    'sphinx.ext.mathjax',
-    'sphinx.ext.ifconfig',
-    'sphinx.ext.viewcode',
-    'sphinx.ext.autosummary',
-    'sphinx.ext.napoleon',  # 'numpydoc'
-]
-
-numpydoc_show_class_members = False
-napoleon_use_rtype = False
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# The suffix(es) of source filenames.
-# You can specify multiple suffix as a list of string:
-source_suffix = {
-    '.rst': 'restructuredtext',
-    '.md': 'markdown',
-}
-
-# The encoding of source files.
-#source_encoding = 'utf-8-sig'
-
-source_parsers = {
-    '.md': 'recommonmark.parser.CommonMarkParser'
-}
-
-# The master toctree document.
-master_doc = 'index'
+# -- Project information -----------------------------------------------------
 
 # General information about the project.
-project = 'cooler'
-copyright = '2016-2019, Nezar Abdennur'
-author = 'Nezar Abdennur'
+project = "cooler"
+copyright = "2016-2019, Nezar Abdennur"
+author = "Nezar Abdennur"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
 # built documents.
 #
+
 def _get_version():
-    init = os.path.join('..', 'cooler', '_version.py')
+    init = os.path.join("..", "cooler", "_version.py")
     with open(init) as fh:
         text = fh.read()
     version = re.search(
-        r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
-        text,
-        re.MULTILINE).group(1)
+        r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', text, re.MULTILINE
+    ).group(1)
     return version
+
+
 # The full version, including alpha/beta/rc tags.
 release = _get_version()
+
 # The short X.Y version.
-version = release.rsplit('.', maxsplit=1)[0]
+version = release.rsplit(".", maxsplit=1)[0]
 
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = None
 
-# There are two options for replacing |today|: either, you set today to some
-# non-false value, then it is used:
-#today = ''
-# Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
+# -- General configuration ------------------------------------------------
 
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+# If your documentation needs a minimal Sphinx version, state it here.
+# needs_sphinx = '1.0'
 
-# The reST default role (used for this markup: `text`) to use for all
-# documents.
-#default_role = None
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    "sphinx.ext.autodoc",
+    "sphinx.ext.doctest",
+    "sphinx.ext.todo",
+    "sphinx.ext.coverage",
+    "sphinx.ext.mathjax",
+    "sphinx.ext.ifconfig",
+    "sphinx.ext.viewcode",
+    "sphinx.ext.autosummary",
+    "sphinx.ext.napoleon",  # 'numpydoc'
+    "recommonmark",
+]
 
-# If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
+numpydoc_show_class_members = False
+napoleon_use_rtype = False
 
-# If true, the current module name will be prepended to all description
-# unit titles (such as .. function::).
-#add_module_names = True
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
 
-# If true, sectionauthor and moduleauthor directives will be shown in the
-# output. They are ignored by default.
-#show_authors = False
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+source_suffix = {
+    ".rst": "restructuredtext",
+    ".md": "markdown",
+}
+
+# source_parsers = {".md": "recommonmark.parser.CommonMarkParser"}
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
 
 # A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
+# modindex_common_prefix = []
 
 # If true, keep warnings as "system message" paragraphs in the built documents.
-#keep_warnings = False
+# keep_warnings = False
 
 # If true, `todo` and `todoList` produce output, else they produce nothing.
 todo_include_todos = False
 
+master_doc = "index"
 
 # -- Options for HTML output ----------------------------------------------
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-html_theme = 'sphinx_rtd_theme'
+html_theme = "sphinx_rtd_theme"
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
 # documentation.
-#html_theme_options = {}
+# html_theme_options = {}
 
 # Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
+# html_theme_path = []
 
 # The name for this set of Sphinx documents.  If None, it defaults to
 # "<project> v<release> documentation".
-#html_title = None
+# html_title = None
 
 # A shorter title for the navigation bar.  Default is the same as html_title.
-#html_short_title = None
+# html_short_title = None
 
 # The name of an image file (relative to this directory) to place at the top
 # of the sidebar.
-#html_logo = None
+# html_logo = None
 
 # The name of an image file (within the static path) to use as favicon of the
 # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
 # pixels large.
-#html_favicon = None
+# html_favicon = None
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
 
 # Add any extra paths that contain custom files (such as robots.txt or
 # .htaccess) here, relative to this directory. These files are copied
 # directly to the root of the documentation.
-#html_extra_path = []
+# html_extra_path = []
 
 # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
 # using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
+# html_last_updated_fmt = '%b %d, %Y'
 
 # If true, SmartyPants will be used to convert quotes and dashes to
 # typographically correct entities.
-#html_use_smartypants = True
+# html_use_smartypants = True
 
 # Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+# html_sidebars = {}
 
 # Additional templates that should be rendered to pages, maps page names to
 # template names.
-#html_additional_pages = {}
+# html_additional_pages = {}
 
 # If false, no module index is generated.
-#html_domain_indices = True
+# html_domain_indices = True
 
 # If false, no index is generated.
-#html_use_index = True
+# html_use_index = True
 
 # If true, the index is split into individual pages for each letter.
-#html_split_index = False
+# html_split_index = False
 
 # If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
+# html_show_sourcelink = True
 
 # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
+# html_show_sphinx = True
 
 # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
-#html_show_copyright = True
+# html_show_copyright = True
 
 # If true, an OpenSearch description file will be output, and all pages will
 # contain a <link> tag referring to it.  The value of this option must be the
 # base URL from which the finished HTML is served.
-#html_use_opensearch = ''
+# html_use_opensearch = ''
 
 # This is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = None
+# html_file_suffix = None
 
 # Language to be used for generating the HTML full-text search index.
 # Sphinx supports the following languages:
 #   'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja'
 #   'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr'
-#html_search_language = 'en'
+# html_search_language = 'en'
 
 # A dictionary with options for the search language support, empty by default.
 # Now only 'ja' uses this config value
-#html_search_options = {'type': 'default'}
+# html_search_options = {'type': 'default'}
 
 # The name of a javascript file (relative to the configuration directory) that
 # implements a search results scorer. If empty, the default will be used.
-#html_search_scorer = 'scorer.js'
+# html_search_scorer = 'scorer.js'
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'coolerdoc'
+htmlhelp_basename = "coolerdoc"
 
 
 # -- Options for LaTeX output ---------------------------------------------
 
 latex_elements = {
-# The paper size ('letterpaper' or 'a4paper').
-#'papersize': 'letterpaper',
-
-# The font size ('10pt', '11pt' or '12pt').
-#'pointsize': '10pt',
-
-# Additional stuff for the LaTeX preamble.
-#'preamble': '',
-
-# Latex figure (float) alignment
-#'figure_align': 'htbp',
+    # The paper size ('letterpaper' or 'a4paper').
+    #'papersize': 'letterpaper',
+    # The font size ('10pt', '11pt' or '12pt').
+    #'pointsize': '10pt',
+    # Additional stuff for the LaTeX preamble.
+    #'preamble': '',
+    # Latex figure (float) alignment
+    #'figure_align': 'htbp',
 }
 
 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, 'cooler.tex', 'cooler Documentation',
-     'Nezar Abdennur', 'manual'),
+    (master_doc, "cooler.tex", "cooler Documentation", "Nezar Abdennur", "manual"),
 ]
 
 # The name of an image file (relative to this directory) to place at the top of
 # the title page.
-#latex_logo = None
+# latex_logo = None
 
 # For "manual" documents, if this is true, then toplevel headings are parts,
 # not chapters.
-#latex_use_parts = False
+# latex_use_parts = False
 
 # If true, show page references after internal links.
-#latex_show_pagerefs = False
+# latex_show_pagerefs = False
 
 # If true, show URL addresses after external links.
-#latex_show_urls = False
+# latex_show_urls = False
 
 # Documents to append as an appendix to all manuals.
-#latex_appendices = []
+# latex_appendices = []
 
 # If false, no module index is generated.
-#latex_domain_indices = True
+# latex_domain_indices = True
 
 
 # -- Options for manual page output ---------------------------------------
 
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
-man_pages = [
-    (master_doc, 'cooler', 'cooler Documentation',
-     [author], 1)
-]
+man_pages = [(master_doc, "cooler", "cooler Documentation", [author], 1)]
 
 # If true, show URL addresses after external links.
-#man_show_urls = False
+# man_show_urls = False
 
 
 # -- Options for Texinfo output -------------------------------------------
 
@@ -328,19 +297,25 @@ def _get_version():
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    (master_doc, 'cooler', 'cooler Documentation',
-     author, 'cooler', 'One line description of project.',
-     'Miscellaneous'),
+    (
+        master_doc,
+        "cooler",
+        "cooler Documentation",
+        author,
+        "cooler",
+        "One line description of project.",
+        "Miscellaneous",
+    ),
 ]
 
 # Documents to append as an appendix to all manuals.
-#texinfo_appendices = []
+# texinfo_appendices = []
 
 # If false, no module index is generated.
-#texinfo_domain_indices = True
+# texinfo_domain_indices = True
 
 # How to display URL addresses: 'footnote', 'no', or 'inline'.
-#texinfo_show_urls = 'footnote'
+# texinfo_show_urls = 'footnote'
 
 # If true, do not generate a @detailmenu in the "Top" node's menu.
-#texinfo_no_detailmenu = False
+# texinfo_no_detailmenu = False
diff --git a/docs/schema_v3.rst b/docs/schema_v3.rst
index 9cf2ae07..f6d78912 100644
--- a/docs/schema_v3.rst
+++ b/docs/schema_v3.rst
@@ -279,3 +279,73 @@ In addition, a multi-resolution cooler file may indicate to clients that it is u
 
 .. versionchanged:: 0.8
     Both the legacy layout and the new mcool layout are supported by `HiGlass <https://higlass.io>`_. Prior to cooler 0.8, the new layout was produced only when requesting a specific list of resolutions. As of cooler 0.8, the new layout is always produced by the :command:`cooler zoomify` command unless the ``--legacy`` option is given. Files produced by :py:func:`cooler.zoomify_cooler`, `hic2cool <https://github.com/4dn-dcic/hic2cool>`_, and the mcools from the `4DN data portal <https://data.4dnucleome.org>`_ also follow the new layout.
+
+
+
+Single-cell (single-resolution)
+-------------------------------
+
+A single-cell cooler file contains all the matrices of a single-cell Hi-C data set. All cells are stored under a group called ``/cells``, and all cells share the primary bin table columns,
+i.e. ``bins['chrom']``, ``bins['start']`` and ``bins['end']``, which are `hard-linked <https://en.wikipedia.org/wiki/Hard_link>`_ to the root-level bin table. Any individual cell can be accessed using the regular :class:`cooler.Cooler` interface.
+Conventional file extension: ``.scool``.
+
+::
+
+    XYZ.scool
+     /
+     ├── bins
+     ├── chroms
+     └── cells
+         ├── cell_id1
+         │   ├── bins
+         │   ├── chroms
+         │   ├── pixels
+         │   └── indexes
+         ├── cell_id2
+         │   ├── bins
+         │   ├── chroms
+         │   ├── pixels
+         │   └── indexes
+         ├── cell_id3
+         │   ├── bins
+         │   ├── chroms
+         │   ├── pixels
+         │   └── indexes
+         ├── cell_id4
+         │   ├── bins
+         │   ├── chroms
+         │   ├── pixels
+         │   └── indexes
+         .
+         .
+         .
+
+In addition, a single-cell single-resolution cooler file may indicate to clients that it is using this layout with the following ``/``-level attributes:
+
+.. describe:: format : string (constant)
+
+    "HDF5::SCOOL"
+
+.. describe:: format-version : int
+
+    1
+
+.. describe:: bin-type : { "fixed", "variable" }
+
+    Indicates whether the resolution is constant along both axes.
+
+.. describe:: bin-size : int
+
+    The bin resolution
+
+.. describe:: nbins : int
+
+    The number of bins
+
+.. describe:: nchroms : int
+
+    The number of chromosomes
+
+.. describe:: ncells : int
+
+    The number of stored cells
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 46562ff0..a53c0044 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -2,3 +2,5 @@
 dask[array,dataframe]
 pytest
 mock
+pytest-flake8
+pytest-cov
\ No newline at end of file
diff --git a/tests/data/scool_test_file.scool b/tests/data/scool_test_file.scool
new file mode 100644
index 00000000..4f047631
Binary files /dev/null and b/tests/data/scool_test_file.scool differ
diff --git a/tests/test_create.py b/tests/test_create.py
index 60007c99..d569edc5 100644
--- a/tests/test_create.py
+++ b/tests/test_create.py
@@ -432,3 +432,42 @@ def test_create_cooler_from_dask():
     #         pixels,
     #         ordered=False
     #     )
+
+
+@pytest.mark.parametrize(
+    "fp", [op.join(datadir, "hg19.GM12878-MboI.matrix.2000kb.cool")]
+)
+def test_create_scool(fp):
+    c = cooler.Cooler(fp)
+    # chromsizes = c.chromsizes
+    bins = c.bins()[:]
+    pixels = c.pixels()[:]
+
+    # Use different content per cell to prove that only chrom, start and end
+    # are hard-linked, while the remaining bin columns are independent per cell
+    from copy import deepcopy
+    bins_cell1 = deepcopy(bins)
+    bins_cell2 = deepcopy(bins)
+    bins_cell3 = deepcopy(bins)
+    bins_cell1['weight'] = np.array([0] * len(bins_cell1["start"]))
+    bins_cell2['weight'] = np.array([1] * len(bins_cell1["start"]))
+    bins_cell3['weight'] = np.array([2] * len(bins_cell1["start"]))
+
+    bins_cell1['KR'] = np.array([3] * len(bins_cell1["start"]))
+    bins_cell2['KR'] = np.array([4] * len(bins_cell1["start"]))
+    bins_cell3['KR'] = np.array([5] * len(bins_cell1["start"]))
+
+    name_pixel_dict = {'cell1': pixels, 'cell2': pixels, 'cell3': pixels}
+    name_bins_dict = {'cell1': bins_cell1, 'cell2': bins_cell2, 'cell3': bins_cell3}
+
+    with isolated_filesystem():
+        cooler.create_scool('outfile_test.scool', name_bins_dict, name_pixel_dict)
+        content_of_scool = cooler.fileops.list_scool_cells('outfile_test.scool')
+        content_expected = ['/cells/cell1', '/cells/cell2', '/cells/cell3']
+        for content in content_expected:
+            assert content in content_of_scool
+
+        cooler.create_scool('outfile_test.scool', bins, name_pixel_dict)
+        content_of_scool = cooler.fileops.list_scool_cells('outfile_test.scool')
+        content_expected = ['/cells/cell1', '/cells/cell2', '/cells/cell3']
+        for content in content_expected:
+            assert content in content_of_scool
diff --git a/tests/test_fileops.py b/tests/test_fileops.py
index eb6b3564..8a1d4670 100644
--- a/tests/test_fileops.py
+++ b/tests/test_fileops.py
@@ -150,3 +150,20 @@ def test_print_trees():
     with h5py.File(src_file) as f:
         t = fileops.TreeViewer(f)
         t._ipython_display_()
+
+
+def test_is_scool_file():
+    src_file = op.join(testdir, "data", 'scool_test_file.scool')
+    assert fileops.is_scool_file(src_file)
+
+
+def test_list_scool_cells():
+    src_file = op.join(testdir, "data", 'scool_test_file.scool')
+    paths = ['/cells/GSM2687248_41669_ACAGTG-R1-DpnII.100000.cool',
+             '/cells/GSM2687249_41670_GGCTAC-R1-DpnII.100000.cool',
+             '/cells/GSM2687250_41671_TTAGGC-R1-DpnII.100000.cool',
+             '/cells/GSM2687251_41672_AGTTCC-R1-DpnII.100000.cool',
+             '/cells/GSM2687252_41673_CCGTCC-R1-DpnII.100000.cool']
+    cell_paths = fileops.list_scool_cells(src_file)
+    assert len(cell_paths) == 5
+    for cell in paths:
+        assert cell in cell_paths
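
A minimal end-to-end sketch of the workflow this patch introduces, using only the public API added above (``cooler.create_scool``, ``cooler.fileops.list_scool_cells`` and ``cooler.Cooler``); the chromosome name, bin edges and counts are toy values chosen purely for illustration.

.. code-block:: python

    import pandas as pd
    import cooler

    # A 3-bin segmentation of a toy chromosome, shared by all cells
    bins = pd.DataFrame({
        "chrom": ["chr1", "chr1", "chr1"],
        "start": [0, 1000, 2000],
        "end": [1000, 2000, 3000],
    })

    # One upper-triangular pixel table per cell, keyed by a unique cell name
    cell_name_pixels_dict = {
        "cell1": pd.DataFrame({"bin1_id": [0, 0], "bin2_id": [0, 2], "count": [4, 1]}),
        "cell2": pd.DataFrame({"bin1_id": [1, 2], "bin2_id": [1, 2], "count": [2, 5]}),
    }

    # A single shared bin table may be passed instead of a per-cell dict
    cooler.create_scool("toy.scool", bins, cell_name_pixels_dict)

    # Each cell is itself a regular cooler data collection
    print(cooler.fileops.list_scool_cells("toy.scool"))
    # ['/cells/cell1', '/cells/cell2']
    c1 = cooler.Cooler("toy.scool::/cells/cell1")
    print(c1.pixels()[:])

Because the bin table is shared here, the ``chrom``, ``start`` and ``end`` datasets are written once at the file root and hard-linked into every cell, exactly as the ``append_scool`` branch of ``create`` does above.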