Make annotate faster by using indexing instead of merge (#353)

* Make annotate faster by using indexing instead of merge * copy bins, so they aren't modified twice... is it ok for memory? * fix bug * black * Refactor a little * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Move int conversion * Add initial value to allow min/max on empty array * Fix regression * Handle the empty case first * Add safe casting and no array copying if not necessary * Remove unnecessary nan handling --------- Co-authored-by: Nezar Abdennur <nabdennur@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
open2c · Oct 25, 2023 · cb529bc · cb529bc
1 parent a72327e
commit cb529bc
Showing 1 changed file with 63 additions and 59 deletions.
diff --git a/src/cooler/api.py b/src/cooler/api.py
@@ -116,7 +116,7 @@ def _load_attrs(self, path):
             return dict(grp[path].attrs)
 
     def open(self, mode="r", **kwargs):
-        """ Open the HDF5 group containing the Cooler with :py:mod:`h5py`
+        """Open the HDF5 group containing the Cooler with :py:mod:`h5py`
 
         Functions as a context manager. Any ``open_kws`` passed during
         construction are ignored.
@@ -145,21 +145,21 @@ def storage_mode(self):
 
     @property
     def binsize(self):
-        """ Resolution in base pairs if uniform else None """
+        """Resolution in base pairs if uniform else None"""
         return self._info["bin-size"]
 
     @property
     def chromsizes(self):
-        """ Ordered mapping of reference sequences to their lengths in bp """
+        """Ordered mapping of reference sequences to their lengths in bp"""
         return self._chromsizes
 
     @property
     def chromnames(self):
-        """ List of reference sequence names """
+        """List of reference sequence names"""
         return list(self._chromsizes.index)
 
     def offset(self, region):
-        """ Bin ID containing the left end of a genomic region
+        """Bin ID containing the left end of a genomic region
 
         Parameters
         ----------
@@ -182,11 +182,11 @@ def offset(self, region):
                 grp,
                 self._chromids,
                 parse_region(region, self._chromsizes),
-                self.binsize
+                self.binsize,
             )
 
     def extent(self, region):
-        """ Bin IDs containing the left and right ends of a genomic region
+        """Bin IDs containing the left and right ends of a genomic region
 
         Parameters
         ----------
@@ -209,12 +209,12 @@ def extent(self, region):
                 grp,
                 self._chromids,
                 parse_region(region, self._chromsizes),
-                self.binsize
+                self.binsize,
             )
 
     @property
     def info(self):
-        """ File information and metadata
+        """File information and metadata
 
         Returns
         -------
@@ -230,7 +230,7 @@ def shape(self):
         return (self._info["nbins"],) * 2
 
     def chroms(self, **kwargs):
-        """ Chromosome table selector
+        """Chromosome table selector
 
         Returns
         -------
@@ -246,7 +246,7 @@ def _slice(fields, lo, hi):
         return RangeSelector1D(None, _slice, None, self._info["nchroms"])
 
     def bins(self, **kwargs):
-        """ Bin table selector
+        """Bin table selector
 
         Returns
         -------
@@ -272,7 +272,7 @@ def _fetch(region):
         return RangeSelector1D(None, _slice, _fetch, self._info["nbins"])
 
     def pixels(self, join=False, **kwargs):
-        """ Pixel table selector
+        """Pixel table selector
 
         Parameters
         ----------
@@ -317,7 +317,7 @@ def matrix(
         divisive_weights=None,
         chunksize=10000000,
     ):
-        """ Contact matrix selector
+        """Contact matrix selector
 
         Parameters
         ----------
@@ -391,12 +391,8 @@ def _fetch(region, region2=None):
                     region2 = region
                 region1 = parse_region(region, self._chromsizes)
                 region2 = parse_region(region2, self._chromsizes)
-                i0, i1 = region_to_extent(
-                    grp, self._chromids, region1, self.binsize
-                )
-                j0, j1 = region_to_extent(
-                    grp, self._chromids, region2, self.binsize
-                )
+                i0, i1 = region_to_extent(grp, self._chromids, region1, self.binsize)
+                j0, j1 = region_to_extent(grp, self._chromids, region2, self.binsize)
                 return i0, i1, j0, j1
 
         return RangeSelector2D(field, _slice, _fetch, (self._info["nbins"],) * 2)
@@ -578,59 +574,67 @@ def annotate(pixels, bins, replace=False):
     Returns
     -------
     :py:class:`DataFrame`
-
     """
     columns = pixels.columns
-    ncols = len(columns)
-    is_selector = isinstance(bins, RangeSelector1D)
 
+    # End-inclusive slicer for bins
+    if isinstance(bins, RangeSelector1D):
+        def _loc_slice(sel, beg, end):
+            # slicing a range selector is end-exclusive like iloc
+            return sel[beg : end + 1 if end is not None else None]
+    else:
+        def _loc_slice(df, beg, end):
+            # loc slicing a dataframe is end-inclusive
+            return df.loc[beg:end]
+
+    # Extract the required bin ranges from the bin table.
+    # NOTE: Bin IDs in the pixel table may be uint. Avoid using these for
+    # indexing - they can easily get cast to float and cause problems.
+    anns = []
+
+    # Select bin annotations that correspond to the bin1 IDs in the pixels df
     if "bin1_id" in columns:
-        if len(bins) > len(pixels):
-            bin1 = pixels["bin1_id"]
-            lo = bin1.min()
-            hi = bin1.max()
-            lo = 0 if np.isnan(lo) else lo
-            hi = 0 if np.isnan(hi) else hi
-            if is_selector:
-                right = bins[lo:hi + bin1.dtype.type(1)]  # slicing works like iloc
-            else:
-                right = bins.loc[lo:hi]
-        elif is_selector:
-            right = bins[:]
+        bin1 = pixels["bin1_id"].to_numpy().astype(int, copy=False, casting="safe")
+        if len(bin1) == 0:
+            bmin = bmax = 0
+        elif len(bins) > len(pixels):
+            bmin, bmax = bin1.min(), bin1.max()
         else:
-            right = bins
-
-        pixels = pixels.merge(right, how="left", left_on="bin1_id", right_index=True)
+            bmin, bmax = 0, None
+        ann1 = _loc_slice(bins, bmin, bmax)
+        anns.append(
+            ann1
+            .iloc[bin1 - bmin]
+            .rename(columns=lambda x: x + "1")
+            .reset_index(drop=True)
+        )
 
+    # Select bin annotations that correspond to the bin2 IDs in the pixels df
     if "bin2_id" in columns:
-        if len(bins) > len(pixels):
-            bin2 = pixels["bin2_id"]
-            lo = bin2.min()
-            hi = bin2.max()
-            lo = 0 if np.isnan(lo) else lo
-            hi = 0 if np.isnan(hi) else hi
-            if is_selector:
-                right = bins[lo:hi + bin2.dtype.type(1)]  # slicing works like iloc
-            else:
-                right = bins.loc[lo:hi]
-        elif is_selector:
-            right = bins[:]
+        bin2 = pixels["bin2_id"].to_numpy().astype(int, copy=False, casting="safe")
+        if len(bin2) == 0:
+            bmin = bmax = 0
+        elif len(bins) > len(pixels):
+            bmin, bmax = bin2.min(), bin2.max()
         else:
-            right = bins
-
-        pixels = pixels.merge(
-            right, how="left", left_on="bin2_id", right_index=True, suffixes=("1", "2")
+            bmin, bmax = 0, None
+        ann2 = _loc_slice(bins, bmin, bmax)
+        anns.append(
+            ann2
+            .iloc[bin2 - bmin]
+            .rename(columns=lambda x: x + "2")
+            .reset_index(drop=True)
         )
 
-    # rearrange columns
-    pixels = pixels[list(pixels.columns[ncols:]) + list(pixels.columns[:ncols])]
-
-    # drop bin IDs
+    # Drop original bin IDs if not wanted
     if replace:
         cols_to_drop = [col for col in ("bin1_id", "bin2_id") if col in columns]
         pixels = pixels.drop(cols_to_drop, axis=1)
 
-    return pixels
+    # Concatenate bin annotations with pixels
+    out = pd.concat([*anns, pixels.reset_index(drop=True)], axis=1)
+    out.index = pixels.index
+    return out
 
 
 def matrix(
@@ -710,7 +714,7 @@ def matrix(
             + "calculate balancing weights or set balance=False."
         )
 
-    reader = CSRReader(h5['pixels'], h5['indexes/bin1_offset'][:])
+    reader = CSRReader(h5["pixels"], h5["indexes/bin1_offset"][:])
 
     if as_pixels:
         # The historical behavior for as_pixels is to return only explicitly stored