feat: Support custom contig sep characters in pairix index (#398)
* feat: Support custom contig sep characters in pairix index

* Fix typo

* Skip new pairix test when pypairix not available
nvictus committed Mar 8, 2024
1 parent fc02676 commit b9d2e8a
Showing 3 changed files with 60 additions and 12 deletions.
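Background for the change (an illustrative sketch, not part of the commit): a pairix 2D index names each block by joining the two contig names with a separator character, "|" by default. cooler previously hard-coded that character when recovering the contig set from block names, so files indexed with a different separator could not be ingested. The snippet below mirrors the pypairix calls used in the diff; the file path and the ":" separator are hypothetical.

    import itertools

    import pypairix

    # Hypothetical pairix-indexed pairs file whose block names use ":".
    f = pypairix.open("pairs.blksrt.txt.gz", "r")
    blocknames = f.get_blocknames()  # e.g. ["chr1:chr1", "chr1:chr2", ...]
    # Recover the set of contigs by splitting on the file's actual separator.
    contigs = set(
        itertools.chain.from_iterable(b.split(":") for b in blocknames)
    )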
19 changes: 18 additions & 1 deletion src/cooler/cli/cload.py
@@ -292,8 +292,24 @@ def tabix(
     default=2,
     show_default=True,
 )
+@click.option(
+    "--block-char",
+    help="Character separating contig names in the block names of the pairix "
+    "index.",
+    type=str,
+    default="|",
+    show_default=True,
+)
 def pairix(
-    bins, pairs_path, cool_path, metadata, assembly, nproc, zero_based, max_split
+    bins,
+    pairs_path,
+    cool_path,
+    metadata,
+    assembly,
+    nproc,
+    zero_based,
+    max_split,
+    block_char,
 ):
     """
     Bin a pairix-indexed contact list file.
@@ -326,6 +342,7 @@ def pairix(
         map=map_func,
         is_one_based=(not zero_based),
         n_chunks=max_split,
+        block_char=block_char,
     )

     create_cooler(
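Usage note (not part of the commit): a pairs file indexed with a non-default separator can now be ingested with, e.g., `cooler cload pairix --block-char ':' ...`. Programmatically, the command's click callback takes the matching keyword argument, as the updated test below does; the input paths here are hypothetical placeholders.

    from cooler.cli.cload import pairix as cload_pairix

    cload_pairix.callback(
        "hg19.bins.2000kb.bed.gz",  # bins (hypothetical path)
        "pairs.blksrt.txt.gz",      # pairix-indexed pairs (hypothetical path)
        "out.cool",                 # output cooler file
        metadata=None,
        assembly="hg19",
        nproc=1,
        zero_based=False,
        max_split=2,
        block_char=":",             # separator used by this file's index
    )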
34 changes: 24 additions & 10 deletions src/cooler/create/_ingest.py
@@ -13,7 +13,7 @@
 from bisect import bisect_left
 from collections import Counter, OrderedDict
 from functools import partial
-from typing import Any, Callable
+from typing import Any, Callable, Iterator

 import h5py
 import numpy as np
@@ -514,7 +514,7 @@ def __getstate__(self) -> dict:
         d.pop("_map", None)
         return d

-    def __iter__(self) -> dict[str, np.ndarray]:
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         """Iterator over chunks of binned contacts (i.e., nonzero pixels)

         Chunks are expected to have the following format:
@@ -633,7 +633,7 @@ def aggregate(self, chrom: str) -> pd.DataFrame:  # pragma: no cover
     def size(self) -> int:
         return len(self.h5["chrms1"])

-    def __iter__(self) -> dict[str, np.ndarray]:
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         for chrom in self.gs.contigs:
             for df in self.aggregate(chrom):
                 yield {k: v.values for k, v in df.items()}
@@ -700,7 +700,9 @@ def __init__(
                 "ordering of read ends in the contact list file."
             )

-    def aggregate(self, grange):  # pragma: no cover
+    def aggregate(
+        self, grange: tuple[str, int, int]
+    ) -> pd.DataFrame | None:  # pragma: no cover
         chrom1, start, end = grange
         import pysam

@@ -771,7 +773,7 @@ def aggregate(self, grange):  # pragma: no cover

         return pd.concat(rows, axis=0) if len(rows) else None

-    def __iter__(self):
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         granges = balanced_partition(self.gs, self.n_chunks, self.file_contigs)
         for df in self._map(self.aggregate, granges):
             if df is not None:
@@ -793,6 +795,7 @@ def __init__(
         map: MapFunctor = map,
         n_chunks: int = 1,
         is_one_based: bool = False,
+        block_char: str = "|",
         **kwargs,
     ):
         try:
@@ -811,13 +814,24 @@ def __init__(
         self._map = map
         self.n_chunks = n_chunks
         self.is_one_based = bool(is_one_based)
+        self.block_char = block_char

         f = pypairix.open(filepath, "r")
         self.C1 = f.get_chr1_col()
         self.C2 = f.get_chr2_col()
         self.P1 = f.get_startpos1_col()
         self.P2 = f.get_startpos2_col()
+        blocknames = f.get_blocknames()
+        if block_char not in blocknames[0]:
+            raise ValueError(
+                f"The contig separator character `{block_char}` does not "
+                f"appear in the first block name `{blocknames[0]}`. Please "
+                "specify the correct character as `block_char`."
+            )
         self.file_contigs = set(
-            itertools.chain.from_iterable([b.split("|") for b in f.get_blocknames()])
+            itertools.chain.from_iterable([
+                blockname.split(block_char) for blockname in blocknames
+            ])
         )

         if not len(self.file_contigs):
@@ -826,7 +840,7 @@ def __init__(
             if f.exists2(c1, c2) and f.exists2(c2, c1):
                 raise RuntimeError(
                     "Pairs are not triangular: found blocks "
-                    + f"'{c1}|{c2}'' and '{c2}|{c1}'"
+                    + f"'{c1}{block_char}{c2}' and '{c2}{block_char}{c1}'"
                 )

         # dumb heuristic to prevent excessively large chunks on one worker
@@ -942,7 +956,7 @@ def aggregate(

         return pd.concat(rows, axis=0) if len(rows) else None

-    def __iter__(self) -> dict[str, np.ndarray]:
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         granges = balanced_partition(self.gs, self.n_chunks, self.file_contigs)
         for df in self._map(self.aggregate, granges):
             if df is not None:
@@ -981,7 +995,7 @@ def select_block(self, chrom1, chrom2):
             raise
         return block

-    def __iter__(self) -> dict[str, np.ndarray]:
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         # n_bins = len(self.bins)

         import scipy.sparse
@@ -1036,7 +1050,7 @@ def __init__(
         self.array = array
         self.chunksize = chunksize

-    def __iter__(self) -> dict[str, np.ndarray]:
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         n_bins = self.array.shape[0]
         spans = partition(0, n_bins, self.chunksize)

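Why the new guard raises up front (illustration, not from the commit): `str.split` does not fail on a missing separator, it returns the whole string unchanged, so a wrong `block_char` would otherwise silently yield bogus one-piece "contigs" instead of an error.

    # Wrong separator: no exception, just one bogus "contig".
    assert "chr1:chr2".split("|") == ["chr1:chr2"]
    # Correct separator recovers both contig names.
    assert "chr1:chr2".split(":") == ["chr1", "chr2"]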
19 changes: 18 additions & 1 deletion tests/test_create_ingest.py
@@ -17,7 +17,7 @@
 from cooler.cli.load import load

 pysam_missing = importlib.util.find_spec("pysam") is None
-pairix_missing = importlib.util.find_spec("pairix") is None
+pairix_missing = importlib.util.find_spec("pypairix") is None
 _pandas_major_version = int(pd.__version__.split(".")[0])

 tmp = tempfile.gettempdir()
@@ -246,6 +246,7 @@ def test_cload_pairix(bins_path, pairs_path, ref_path, nproc):
             nproc=nproc,
             zero_based=False,
             max_split=2,
+            block_char="|",
         )
         with h5py.File(testcool_path, "r") as f1, h5py.File(ref_path, "r") as f2:
             assert np.all(f1["pixels/bin1_id"][:] == f2["pixels/bin1_id"][:])
@@ -257,6 +258,22 @@ def test_cload_pairix(bins_path, pairs_path, ref_path, nproc):
         pass


+@pytest.mark.skipif(pairix_missing, reason="pairix not installed")
+def test_cload_pairix_wrong_block_char():
+    with pytest.raises(ValueError):
+        cload_pairix.callback(
+            op.join(testdir, "data", "hg19.bins.2000kb.bed.gz"),
+            op.join(testdir, "data", "hg19.GM12878-MboI.pairs.subsample.blksrt.txt.gz"),
+            testcool_path,
+            metadata=None,
+            assembly="hg19",
+            nproc=1,
+            zero_based=False,
+            max_split=2,
+            block_char="?",
+        )
+
+
 @pytest.mark.skipif(
     _pandas_major_version < 1, reason="hash fix only works with pandas >= 1.0"
 )