feat: Support custom contig sep characters in pairix index (#398)
* feat: Support custom contig sep characters in pairix index

* Fix typo

* Skip new pairix test when pypairix not available
nvictus committed Mar 8, 2024
1 parent fc02676 commit b9d2e8a
Showing 3 changed files with 60 additions and 12 deletions.
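Background for the change (an illustrative sketch, not part of the commit): a pairix 2D index names each block by joining the two contig names with a separator character, "|" by default. cooler previously hard-coded that character when recovering the contig set from block names, so files indexed with a different separator could not be ingested. The snippet below mirrors the pypairix calls used in the diff; the file path and the ":" separator are hypothetical.

    import itertools

    import pypairix

    # Hypothetical pairix-indexed pairs file whose block names use ":".
    f = pypairix.open("pairs.blksrt.txt.gz", "r")
    blocknames = f.get_blocknames()  # e.g. ["chr1:chr1", "chr1:chr2", ...]
    # Recover the set of contigs by splitting on the file's actual separator.
    contigs = set(
        itertools.chain.from_iterable(b.split(":") for b in blocknames)
    )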
19 changes: 18 additions & 1 deletion src/cooler/cli/cload.py
@@ -292,8 +292,24 @@ def tabix(
     default=2,
     show_default=True,
 )
+@click.option(
+    "--block-char",
+    help="Character separating contig names in the block names of the pairix "
+    "index.",
+    type=str,
+    default="|",
+    show_default=True,
+)
 def pairix(
-    bins, pairs_path, cool_path, metadata, assembly, nproc, zero_based, max_split
+    bins,
+    pairs_path,
+    cool_path,
+    metadata,
+    assembly,
+    nproc,
+    zero_based,
+    max_split,
+    block_char,
 ):
     """
     Bin a pairix-indexed contact list file.
@@ -326,6 +342,7 @@ def pairix(
         map=map_func,
         is_one_based=(not zero_based),
         n_chunks=max_split,
+        block_char=block_char,
     )

     create_cooler(
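Usage note (not part of the commit): a pairs file indexed with a non-default separator can now be ingested with, e.g., `cooler cload pairix --block-char ':' ...`. Programmatically, the command's click callback takes the matching keyword argument, as the updated test below does; the input paths here are hypothetical placeholders.

    from cooler.cli.cload import pairix as cload_pairix

    cload_pairix.callback(
        "hg19.bins.2000kb.bed.gz",  # bins (hypothetical path)
        "pairs.blksrt.txt.gz",      # pairix-indexed pairs (hypothetical path)
        "out.cool",                 # output cooler file
        metadata=None,
        assembly="hg19",
        nproc=1,
        zero_based=False,
        max_split=2,
        block_char=":",             # separator used by this file's index
    )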
34 changes: 24 additions & 10 deletions src/cooler/create/_ingest.py
@@ -13,7 +13,7 @@
 from bisect import bisect_left
 from collections import Counter, OrderedDict
 from functools import partial
-from typing import Any, Callable
+from typing import Any, Callable, Iterator

 import h5py
 import numpy as np
@@ -514,7 +514,7 @@ def __getstate__(self) -> dict:
         d.pop("_map", None)
         return d

-    def __iter__(self) -> dict[str, np.ndarray]:
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         """Iterator over chunks of binned contacts (i.e., nonzero pixels)

         Chunks are expected to have the following format:
@@ -633,7 +633,7 @@ def aggregate(self, chrom: str) -> pd.DataFrame:  # pragma: no cover
     def size(self) -> int:
         return len(self.h5["chrms1"])

-    def __iter__(self) -> dict[str, np.ndarray]:
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         for chrom in self.gs.contigs:
             for df in self.aggregate(chrom):
                 yield {k: v.values for k, v in df.items()}
@@ -700,7 +700,9 @@ def __init__(
                 "ordering of read ends in the contact list file."
             )

-    def aggregate(self, grange):  # pragma: no cover
+    def aggregate(
+        self, grange: tuple[str, int, int]
+    ) -> pd.DataFrame | None:  # pragma: no cover
         chrom1, start, end = grange
         import pysam

@@ -771,7 +773,7 @@ def aggregate(self, grange):  # pragma: no cover

         return pd.concat(rows, axis=0) if len(rows) else None

-    def __iter__(self):
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         granges = balanced_partition(self.gs, self.n_chunks, self.file_contigs)
         for df in self._map(self.aggregate, granges):
             if df is not None:
@@ -793,6 +795,7 @@ def __init__(
         map: MapFunctor = map,
         n_chunks: int = 1,
         is_one_based: bool = False,
+        block_char: str = "|",
         **kwargs,
     ):
         try:
@@ -811,13 +814,24 @@ def __init__(
         self._map = map
         self.n_chunks = n_chunks
         self.is_one_based = bool(is_one_based)
+        self.block_char = block_char

         f = pypairix.open(filepath, "r")
         self.C1 = f.get_chr1_col()
         self.C2 = f.get_chr2_col()
         self.P1 = f.get_startpos1_col()
         self.P2 = f.get_startpos2_col()
+        blocknames = f.get_blocknames()
+        if block_char not in blocknames[0]:
+            raise ValueError(
+                f"The contig separator character `{block_char}` does not "
+                f"appear in the first block name `{blocknames[0]}`. Please "
+                "specify the correct character as `block_char`."
+            )
         self.file_contigs = set(
-            itertools.chain.from_iterable([b.split("|") for b in f.get_blocknames()])
+            itertools.chain.from_iterable([
+                blockname.split(block_char) for blockname in blocknames
+            ])
         )

         if not len(self.file_contigs):
@@ -826,7 +840,7 @@ def __init__(
             if f.exists2(c1, c2) and f.exists2(c2, c1):
                 raise RuntimeError(
                     "Pairs are not triangular: found blocks "
-                    + f"'{c1}|{c2}'' and '{c2}|{c1}'"
+                    + f"'{c1}{block_char}{c2}' and '{c2}{block_char}{c1}'"
                 )

         # dumb heuristic to prevent excessively large chunks on one worker
@@ -942,7 +956,7 @@ def aggregate(

         return pd.concat(rows, axis=0) if len(rows) else None

-    def __iter__(self) -> dict[str, np.ndarray]:
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         granges = balanced_partition(self.gs, self.n_chunks, self.file_contigs)
         for df in self._map(self.aggregate, granges):
             if df is not None:
@@ -981,7 +995,7 @@ def select_block(self, chrom1, chrom2):
             raise
         return block

-    def __iter__(self) -> dict[str, np.ndarray]:
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         # n_bins = len(self.bins)

         import scipy.sparse
@@ -1036,7 +1050,7 @@ def __init__(
         self.array = array
         self.chunksize = chunksize

-    def __iter__(self) -> dict[str, np.ndarray]:
+    def __iter__(self) -> Iterator[dict[str, np.ndarray]]:
         n_bins = self.array.shape[0]
         spans = partition(0, n_bins, self.chunksize)

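Why the new guard raises up front (illustration, not from the commit): `str.split` does not fail on a missing separator, it returns the whole string unchanged, so a wrong `block_char` would otherwise silently yield bogus one-piece "contigs" instead of an error.

    # Wrong separator: no exception, just one bogus "contig".
    assert "chr1:chr2".split("|") == ["chr1:chr2"]
    # Correct separator recovers both contig names.
    assert "chr1:chr2".split(":") == ["chr1", "chr2"]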
19 changes: 18 additions & 1 deletion tests/test_create_ingest.py
@@ -17,7 +17,7 @@
 from cooler.cli.load import load

 pysam_missing = importlib.util.find_spec("pysam") is None
-pairix_missing = importlib.util.find_spec("pairix") is None
+pairix_missing = importlib.util.find_spec("pypairix") is None
 _pandas_major_version = int(pd.__version__.split(".")[0])

 tmp = tempfile.gettempdir()
@@ -246,6 +246,7 @@ def test_cload_pairix(bins_path, pairs_path, ref_path, nproc):
             nproc=nproc,
             zero_based=False,
             max_split=2,
+            block_char="|",
         )
         with h5py.File(testcool_path, "r") as f1, h5py.File(ref_path, "r") as f2:
             assert np.all(f1["pixels/bin1_id"][:] == f2["pixels/bin1_id"][:])
@@ -257,6 +258,22 @@ def test_cload_pairix(bins_path, pairs_path, ref_path, nproc):
         pass


+@pytest.mark.skipif(pairix_missing, reason="pairix not installed")
+def test_cload_pairix_wrong_block_char():
+    with pytest.raises(ValueError):
+        cload_pairix.callback(
+            op.join(testdir, "data", "hg19.bins.2000kb.bed.gz"),
+            op.join(testdir, "data", "hg19.GM12878-MboI.pairs.subsample.blksrt.txt.gz"),
+            testcool_path,
+            metadata=None,
+            assembly="hg19",
+            nproc=1,
+            zero_based=False,
+            max_split=2,
+            block_char="?",
+        )
+
+
 @pytest.mark.skipif(
     _pandas_major_version < 1, reason="hash fix only works with pandas >= 1.0"
 )