Skip to content

Commit

Permalink
Make annotate faster by using indexing instead of merge (#353)
Browse files Browse the repository at this point in the history
* Make annotate faster by using indexing instead of merge

* copy bins, so they aren't modified twice... is it ok for memory?

* fix bug

* black

* Refactor a little

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Move int conversion

* Add initial value to allow min/max on empty array

* Fix regression

* Handle the empty case first

* Add safe casting and no array copying if not necessary

* Remove unnecessary nan handling

---------

Co-authored-by: Nezar Abdennur <nabdennur@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people committed Oct 25, 2023
1 parent a72327e commit cb529bc
Showing 1 changed file with 63 additions and 59 deletions.
122 changes: 63 additions & 59 deletions src/cooler/api.py
Expand Up @@ -116,7 +116,7 @@ def _load_attrs(self, path):
return dict(grp[path].attrs)

def open(self, mode="r", **kwargs):
""" Open the HDF5 group containing the Cooler with :py:mod:`h5py`
"""Open the HDF5 group containing the Cooler with :py:mod:`h5py`
Functions as a context manager. Any ``open_kws`` passed during
construction are ignored.
Expand Down Expand Up @@ -145,21 +145,21 @@ def storage_mode(self):

@property
def binsize(self):
""" Resolution in base pairs if uniform else None """
"""Resolution in base pairs if uniform else None"""
return self._info["bin-size"]

@property
def chromsizes(self):
""" Ordered mapping of reference sequences to their lengths in bp """
"""Ordered mapping of reference sequences to their lengths in bp"""
return self._chromsizes

@property
def chromnames(self):
""" List of reference sequence names """
"""List of reference sequence names"""
return list(self._chromsizes.index)

def offset(self, region):
""" Bin ID containing the left end of a genomic region
"""Bin ID containing the left end of a genomic region
Parameters
----------
Expand All @@ -182,11 +182,11 @@ def offset(self, region):
grp,
self._chromids,
parse_region(region, self._chromsizes),
self.binsize
self.binsize,
)

def extent(self, region):
""" Bin IDs containing the left and right ends of a genomic region
"""Bin IDs containing the left and right ends of a genomic region
Parameters
----------
Expand All @@ -209,12 +209,12 @@ def extent(self, region):
grp,
self._chromids,
parse_region(region, self._chromsizes),
self.binsize
self.binsize,
)

@property
def info(self):
""" File information and metadata
"""File information and metadata
Returns
-------
Expand All @@ -230,7 +230,7 @@ def shape(self):
return (self._info["nbins"],) * 2

def chroms(self, **kwargs):
""" Chromosome table selector
"""Chromosome table selector
Returns
-------
Expand All @@ -246,7 +246,7 @@ def _slice(fields, lo, hi):
return RangeSelector1D(None, _slice, None, self._info["nchroms"])

def bins(self, **kwargs):
""" Bin table selector
"""Bin table selector
Returns
-------
Expand All @@ -272,7 +272,7 @@ def _fetch(region):
return RangeSelector1D(None, _slice, _fetch, self._info["nbins"])

def pixels(self, join=False, **kwargs):
""" Pixel table selector
"""Pixel table selector
Parameters
----------
Expand Down Expand Up @@ -317,7 +317,7 @@ def matrix(
divisive_weights=None,
chunksize=10000000,
):
""" Contact matrix selector
"""Contact matrix selector
Parameters
----------
Expand Down Expand Up @@ -391,12 +391,8 @@ def _fetch(region, region2=None):
region2 = region
region1 = parse_region(region, self._chromsizes)
region2 = parse_region(region2, self._chromsizes)
i0, i1 = region_to_extent(
grp, self._chromids, region1, self.binsize
)
j0, j1 = region_to_extent(
grp, self._chromids, region2, self.binsize
)
i0, i1 = region_to_extent(grp, self._chromids, region1, self.binsize)
j0, j1 = region_to_extent(grp, self._chromids, region2, self.binsize)
return i0, i1, j0, j1

return RangeSelector2D(field, _slice, _fetch, (self._info["nbins"],) * 2)
Expand Down Expand Up @@ -578,59 +574,67 @@ def annotate(pixels, bins, replace=False):
Returns
-------
:py:class:`DataFrame`
"""
columns = pixels.columns
ncols = len(columns)
is_selector = isinstance(bins, RangeSelector1D)

# End-inclusive slicer for bins
if isinstance(bins, RangeSelector1D):
def _loc_slice(sel, beg, end):
# slicing a range selector is end-exclusive like iloc
return sel[beg : end + 1 if end is not None else None]
else:
def _loc_slice(df, beg, end):
# loc slicing a dataframe is end-inclusive
return df.loc[beg:end]

# Extract the required bin ranges from the bin table.
# NOTE: Bin IDs in the pixel table may be uint. Avoid using these for
# indexing - they can easily get cast to float and cause problems.
anns = []

# Select bin annotations that correspond to the bin1 IDs in the pixels df
if "bin1_id" in columns:
if len(bins) > len(pixels):
bin1 = pixels["bin1_id"]
lo = bin1.min()
hi = bin1.max()
lo = 0 if np.isnan(lo) else lo
hi = 0 if np.isnan(hi) else hi
if is_selector:
right = bins[lo:hi + bin1.dtype.type(1)] # slicing works like iloc
else:
right = bins.loc[lo:hi]
elif is_selector:
right = bins[:]
bin1 = pixels["bin1_id"].to_numpy().astype(int, copy=False, casting="safe")
if len(bin1) == 0:
bmin = bmax = 0
elif len(bins) > len(pixels):
bmin, bmax = bin1.min(), bin1.max()
else:
right = bins

pixels = pixels.merge(right, how="left", left_on="bin1_id", right_index=True)
bmin, bmax = 0, None
ann1 = _loc_slice(bins, bmin, bmax)
anns.append(
ann1
.iloc[bin1 - bmin]
.rename(columns=lambda x: x + "1")
.reset_index(drop=True)
)

# Select bin annotations that correspond to the bin2 IDs in the pixels df
if "bin2_id" in columns:
if len(bins) > len(pixels):
bin2 = pixels["bin2_id"]
lo = bin2.min()
hi = bin2.max()
lo = 0 if np.isnan(lo) else lo
hi = 0 if np.isnan(hi) else hi
if is_selector:
right = bins[lo:hi + bin2.dtype.type(1)] # slicing works like iloc
else:
right = bins.loc[lo:hi]
elif is_selector:
right = bins[:]
bin2 = pixels["bin2_id"].to_numpy().astype(int, copy=False, casting="safe")
if len(bin2) == 0:
bmin = bmax = 0
elif len(bins) > len(pixels):
bmin, bmax = bin2.min(), bin2.max()
else:
right = bins

pixels = pixels.merge(
right, how="left", left_on="bin2_id", right_index=True, suffixes=("1", "2")
bmin, bmax = 0, None
ann2 = _loc_slice(bins, bmin, bmax)
anns.append(
ann2
.iloc[bin2 - bmin]
.rename(columns=lambda x: x + "2")
.reset_index(drop=True)
)

# rearrange columns
pixels = pixels[list(pixels.columns[ncols:]) + list(pixels.columns[:ncols])]

# drop bin IDs
# Drop original bin IDs if not wanted
if replace:
cols_to_drop = [col for col in ("bin1_id", "bin2_id") if col in columns]
pixels = pixels.drop(cols_to_drop, axis=1)

return pixels
# Concatenate bin annotations with pixels
out = pd.concat([*anns, pixels.reset_index(drop=True)], axis=1)
out.index = pixels.index
return out


def matrix(
Expand Down Expand Up @@ -710,7 +714,7 @@ def matrix(
+ "calculate balancing weights or set balance=False."
)

reader = CSRReader(h5['pixels'], h5['indexes/bin1_offset'][:])
reader = CSRReader(h5["pixels"], h5["indexes/bin1_offset"][:])

if as_pixels:
# The historical behavior for as_pixels is to return only explicitly stored
Expand Down

0 comments on commit cb529bc

Please sign in to comment.