diff --git a/src/cooler/cli/cload.py b/src/cooler/cli/cload.py
index 655399c..b2fa749 100644
--- a/src/cooler/cli/cload.py
+++ b/src/cooler/cli/cload.py
@@ -292,8 +292,24 @@ def tabix(
     default=2,
     show_default=True,
 )
+@click.option(
+    "--block-char",
+    help="Character separating contig names in the block names of the pairix "
+    "index.",
+    type=str,
+    default="|",
+    show_default=True,
+)
 def pairix(
-    bins, pairs_path, cool_path, metadata, assembly, nproc, zero_based, max_split
+    bins,
+    pairs_path,
+    cool_path,
+    metadata,
+    assembly,
+    nproc,
+    zero_based,
+    max_split,
+    block_char,
 ):
     """
     Bin a pairix-indexed contact list file.
@@ -326,6 +342,7 @@ def pairix(
         map=map_func,
         is_one_based=(not zero_based),
         n_chunks=max_split,
+        block_char=block_char,
     )
 
     create_cooler(
diff --git a/src/cooler/create/_ingest.py b/src/cooler/create/_ingest.py
index 94095d2..3362a95 100644
--- a/src/cooler/create/_ingest.py
+++ b/src/cooler/create/_ingest.py
@@ -13,7 +13,7 @@
 from bisect import bisect_left
 from collections import Counter, OrderedDict
 from functools import partial
-from typing import Any, Callable
+from typing import Any, Callable, Iterator
 
 import h5py
 import numpy as np
@@ -514,7 +514,7 @@ def __getstate__(self) -> dict:
         d.pop("_map", None)
         return d
 
-    def __iter__(self) -> dict[str, np.ndarray]:
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         """Iterator over chunks of binned contacts (i.e., nonzero pixels)
 
         Chunks are expected to have the following format:
@@ -633,7 +633,7 @@ def aggregate(self, chrom: str) -> pd.DataFrame:  # pragma: no cover
     def size(self) -> int:
         return len(self.h5["chrms1"])
 
-    def __iter__(self) -> dict[str, np.ndarray]:
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         for chrom in self.gs.contigs:
             for df in self.aggregate(chrom):
                 yield {k: v.values for k, v in df.items()}
@@ -700,7 +700,9 @@ def __init__(
                 "ordering of read ends in the contact list file."
             )
 
-    def aggregate(self, grange):  # pragma: no cover
+    def aggregate(
+        self, grange: tuple[str, int, int]
+    ) -> pd.DataFrame | None:  # pragma: no cover
         chrom1, start, end = grange
         import pysam
 
@@ -771,7 +773,7 @@ def aggregate(self, grange):  # pragma: no cover
 
         return pd.concat(rows, axis=0) if len(rows) else None
 
-    def __iter__(self):
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         granges = balanced_partition(self.gs, self.n_chunks, self.file_contigs)
         for df in self._map(self.aggregate, granges):
             if df is not None:
@@ -793,6 +795,7 @@ def __init__(
         map: MapFunctor = map,
         n_chunks: int = 1,
         is_one_based: bool = False,
+        block_char: str = "|",
         **kwargs,
     ):
         try:
@@ -811,13 +814,24 @@ def __init__(
         self._map = map
         self.n_chunks = n_chunks
         self.is_one_based = bool(is_one_based)
+        self.block_char = block_char
+
         f = pypairix.open(filepath, "r")
         self.C1 = f.get_chr1_col()
         self.C2 = f.get_chr2_col()
         self.P1 = f.get_startpos1_col()
         self.P2 = f.get_startpos2_col()
+        blocknames = f.get_blocknames()
+        if block_char not in blocknames[0]:
+            raise ValueError(
+                f"The contig separator character `{block_char}` does not "
+                f"appear in the first block name `{blocknames[0]}`. Please "
+                "specify the correct character as `block_char`."
+            )
         self.file_contigs = set(
-            itertools.chain.from_iterable([b.split("|") for b in f.get_blocknames()])
+            itertools.chain.from_iterable([
+                blockname.split(block_char) for blockname in blocknames
+            ])
         )
 
         if not len(self.file_contigs):
@@ -826,7 +840,7 @@ def __init__(
             if f.exists2(c1, c2) and f.exists2(c2, c1):
                 raise RuntimeError(
                     "Pairs are not triangular: found blocks "
-                    + f"'{c1}|{c2}'' and '{c2}|{c1}'"
+                    + f"'{c1}{block_char}{c2}' and '{c2}{block_char}{c1}'"
                 )
 
         # dumb heuristic to prevent excessively large chunks on one worker
@@ -942,7 +956,7 @@ def aggregate(
 
         return pd.concat(rows, axis=0) if len(rows) else None
 
-    def __iter__(self) -> dict[str, np.ndarray]:
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         granges = balanced_partition(self.gs, self.n_chunks, self.file_contigs)
         for df in self._map(self.aggregate, granges):
             if df is not None:
@@ -981,7 +995,7 @@ def select_block(self, chrom1, chrom2):
                 raise
         return block
 
-    def __iter__(self) -> dict[str, np.ndarray]:
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         # n_bins = len(self.bins)
         import scipy.sparse
 
@@ -1036,7 +1050,7 @@ def __init__(
         self.array = array
         self.chunksize = chunksize
 
-    def __iter__(self) -> dict[str, np.ndarray]:
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         n_bins = self.array.shape[0]
         spans = partition(0, n_bins, self.chunksize)
 
diff --git a/tests/test_create_ingest.py b/tests/test_create_ingest.py
index 1bcc0e8..a0e5752 100644
--- a/tests/test_create_ingest.py
+++ b/tests/test_create_ingest.py
@@ -17,7 +17,7 @@
 from cooler.cli.load import load
 
 pysam_missing = importlib.util.find_spec("pysam") is None
-pairix_missing = importlib.util.find_spec("pairix") is None
+pairix_missing = importlib.util.find_spec("pypairix") is None
 _pandas_major_version = int(pd.__version__.split(".")[0])
 tmp = tempfile.gettempdir()
 
@@ -246,6 +246,7 @@ def test_cload_pairix(bins_path, pairs_path, ref_path, nproc):
             nproc=nproc,
             zero_based=False,
             max_split=2,
+            block_char="|",
         )
         with h5py.File(testcool_path, "r") as f1, h5py.File(ref_path, "r") as f2:
             assert np.all(f1["pixels/bin1_id"][:] == f2["pixels/bin1_id"][:])
@@ -257,6 +258,22 @@ def test_cload_pairix(bins_path, pairs_path, ref_path, nproc):
         pass
 
 
+@pytest.mark.skipif(pairix_missing, reason="pairix not installed")
+def test_cload_pairix_wrong_block_char():
+    with pytest.raises(ValueError):
+        cload_pairix.callback(
+            op.join(testdir, "data", "hg19.bins.2000kb.bed.gz"),
+            op.join(testdir, "data", "hg19.GM12878-MboI.pairs.subsample.blksrt.txt.gz"),
+            testcool_path,
+            metadata=None,
+            assembly="hg19",
+            nproc=1,
+            zero_based=False,
+            max_split=2,
+            block_char="?",
+        )
+
+
 @pytest.mark.skipif(
     _pandas_major_version < 1, reason="hash fix only works with pandas >= 1.0"
 )
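
Usage sketch, not part of the diff: the snippet below mirrors what `cooler cload pairix` does internally once this change lands, binning a pairix-indexed pairs file whose 2D block names use a non-default contig separator. It assumes the aggregator modified in `_ingest.py` above is `PairixAggregator`, exported from `cooler.create` and taking `(filepath, chromsizes, bins)` positionally as at its call site in `cload.py`; `pairs.caretsep.txt.gz` is a hypothetical input whose blocks are named like `chr1^chr2`.

```python
# Sketch under the assumptions stated above; the input file name and the
# "^" separator are hypothetical.
from cooler.create import PairixAggregator, create_cooler
from cooler.util import binnify, read_chromsizes

chromsizes = read_chromsizes("hg19.chrom.sizes")
bins = binnify(chromsizes, 2_000_000)  # 2 Mb bin table

iterator = PairixAggregator(
    "pairs.caretsep.txt.gz",  # pairix-indexed pairs file (hypothetical)
    chromsizes,
    bins,
    map=map,             # serial map; a pool's imap can be swapped in
    n_chunks=2,
    is_one_based=False,
    block_char="^",      # new keyword introduced by this diff (default "|")
)
create_cooler("out.2000kb.cool", bins, iterator, ordered=True)
```

The CLI equivalent would be `cooler cload pairix --block-char '^' hg19.chrom.sizes:2000000 pairs.caretsep.txt.gz out.2000kb.cool`. With the default `|` separator, behavior is unchanged; a wrong separator now fails fast with the `ValueError` added above instead of producing a bogus contig set.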