open2c · emdann · May 7, 2023 · May 7, 2023 · Apr 7, 2024 · Apr 7, 2024
diff --git a/bioframe/__init__.py b/bioframe/__init__.py
@@ -68,6 +68,7 @@
     "select_labels",
     "select_mask",
     "setdiff",
+    "shift",
     "sort_bedframe",
     "subtract",
     "trim",
@@ -141,6 +142,7 @@
     select_labels,
     select_mask,
     setdiff,
+    shift,
     sort_bedframe,
     subtract,
     trim,

diff --git a/bioframe/ops.py b/bioframe/ops.py
@@ -1,3 +1,5 @@
+import warnings
+
 import numpy as np
 import pandas as pd
 
@@ -18,6 +20,7 @@
     "closest",
     "subtract",
     "setdiff",
+    "shift",
     "count_overlaps",
     "trim",
     "complement",
@@ -147,7 +150,113 @@ def select(df, region, cols=None):
     return df.loc[select_mask(df, region, cols)]
 
 
+def shift(df, amount, along=None, drop_invalid=False, cols=None):
+    """
+    Translate the bounds of each genomic interval.
+
+    Different shift amounts can be applied to leading and trailing bounds, and
+    can be applied in a strand-aware manner. Negative values indicate a shift
+    leftwards or upstream.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+
+    amount : int, array-like, or pair of int or array-like, optional
+        The amount(s) by which the bounds are linearly shifted. If a pair
+        ``(x, y)``, shift the leading bound by ``x`` and the trailing bound by
+        ``y``. Negative and positive values shift in the upstream and
+        downstream directions, respectively. Features are taken to assume the
+        reference orientation unless ``along`` is specified.
+
+    along: str, array-like, or None
+        Name of column that will set up/downstream orientation for each
+        feature. The column should contain compliant strand values
+        ("+", "-", "."). Unstranded features will be ignored.
+
+    drop_invalid: bool, optional [default: False]
+        Remove any intervals having negative length after shifting bounds.
+        By default, they will not be removed but a warning will be raised.
+
+    cols : (str, str, str) or None
+        The names of columns containing the chromosome, start and end of the
+        genomic intervals. Default values are 'chrom', 'start', 'end'.
+
+    Returns
+    -------
+    pandas.DataFrame
+
+    Notes
+    -----
+    See :func:`bioframe.trim` for trimming interals after expansion or shift.
+    """
+    ck, sk, ek = _get_default_colnames() if cols is None else cols
+    checks.is_bedframe(df, raise_errors=True, cols=[ck, sk, ek])
+
+    if along is not None:
+        if isinstance(along, str):
+            if along not in df.columns:
+                raise ValueError(
+                    f'Cannot do strand-aware operation: {along} column is missing.'
+                )
+            strands = df[along]
+        else:
+            strands = along
+
+        if not strands.isin(['+', '-', '.']).all():
+            missing_strand = (~strands.isin(['+', '-', '.'])).sum()
+            raise ValueError(
+                'Cannot do strand-aware operation: strand information missing '
+                f'for {missing_strand}/{df.shape[0]} ranges.'
+            )
+
+    if not isinstance(amount, (list, tuple)):
+        amount = (amount, amount)
+    elif len(amount) != 2:
+        raise ValueError(
+            "`amount` should be a single object or a sequence of length 2; "
+            f"got length {len(amount)}."
+        )
+
+    out = df.copy()
+    if along is None:
+        out[sk] = df[sk] + amount[0]
+        out[ek] = df[ek] + amount[1]
+    else:
+        out[sk] = np.where(
+            strands == '+',
+            df[sk] + amount[0],
+            np.where(
+                strands == '-',
+                df[sk] - amount[1],
+                df[sk]
+            )
+        )
+        out[ek] = np.where(
+            strands == '+',
+            df[ek] + amount[1],
+            np.where(
+                strands == '-',
+                df[ek] - amount[0],
+                df[ek]
+            )
+        )
+
+    is_neglen = (out[ek] - out[sk]) < 0
+    if is_neglen.any():
+        if drop_invalid:
+            out = out.loc[~is_neglen]
+        else:
+            warnings.warn(
+                f"Operation produced {is_neglen.sum()}/{out.shape[0]} "
+                "intervals with negative length."
+            )
+
+    return out
+
+
 def expand(df, pad=None, scale=None, side="both", cols=None):
+
     """
     Expand each interval by an amount specified with `pad`.
 
@@ -185,9 +294,7 @@ def expand(df, pad=None, scale=None, side="both", cols=None):
     Notes
     -----
     See :func:`bioframe.trim` for trimming interals after expansion.
-
     """
-
     ck, sk, ek = _get_default_colnames() if cols is None else cols
     checks.is_bedframe(df, raise_errors=True, cols=[ck, sk, ek])
 

diff --git a/tests/test_ops.py b/tests/test_ops.py
@@ -214,6 +214,170 @@ def test_trim():
     )
 
 
+def test_shift():
+    df = pd.DataFrame(
+        [
+            ["chr1", 1000, 1200, "+"],
+            ["chr1", 800, 1200, "-"],
+            ["chrX", 1000, 1500, "+"],
+        ],
+        columns=["chrom", "start", "end", "strand"],
+    )
+
+    pd.testing.assert_frame_equal(
+        bioframe.shift(df, 10),
+        pd.DataFrame(
+            [
+                ["chr1", 1000 + 10, 1200 + 10, "+"],
+                ["chr1", 800 + 10, 1200 + 10, "-"],
+                ["chrX", 1000 + 10, 1500 + 10, "+"],
+            ],
+            columns=["chrom", "start", "end", "strand"],
+        )
+    )
+
+    pd.testing.assert_frame_equal(
+        bioframe.shift(df, -10),
+        pd.DataFrame(
+            [
+                ["chr1", 1000 - 10, 1200 - 10, "+"],
+                ["chr1", 800 - 10, 1200 - 10, "-"],
+                ["chrX", 1000 - 10, 1500 - 10, "+"],
+            ],
+            columns=["chrom", "start", "end", "strand"],
+        )
+    )
+
+    pd.testing.assert_frame_equal(
+        bioframe.shift(df, (-10, 20)),
+        pd.DataFrame(
+            [
+                ["chr1", 1000 - 10, 1200 + 20, "+"],
+                ["chr1", 800 - 10, 1200 + 20, "-"],
+                ["chrX", 1000 - 10, 1500 + 20, "+"],
+            ],
+            columns=["chrom", "start", "end", "strand"],
+        )
+    )
+
+    pd.testing.assert_frame_equal(
+        bioframe.shift(df, (10, -20)),
+        pd.DataFrame(
+            [
+                ["chr1", 1000 + 10, 1200 - 20, "+"],
+                ["chr1", 800 + 10, 1200 - 20, "-"],
+                ["chrX", 1000 + 10, 1500 - 20, "+"],
+            ],
+            columns=["chrom", "start", "end", "strand"],
+        )
+    )
+
+    pd.testing.assert_frame_equal(
+        bioframe.shift(df, (10, -200), drop_invalid=True),
+        pd.DataFrame(
+            [
+                ["chr1", 800 + 10, 1200 - 200, "-"],
+                ["chrX", 1000 + 10, 1500 - 200, "+"],
+            ],
+            columns=["chrom", "start", "end", "strand"],
+            index=[1, 2],
+        )
+    )
+
+
+def test_shift_strandaware():
+    df = pd.DataFrame(
+        [
+            ["chr1", 1000, 1200, "+"],
+            ["chr1", 800, 1200, "-"],
+            ["chrX", 1000, 1500, "+"],
+        ],
+        columns=["chrom", "start", "end", "strand"],
+    )
+
+    pd.testing.assert_frame_equal(
+        bioframe.shift(df, 10, along="strand"),
+        pd.DataFrame(
+            [
+                ["chr1", 1000 + 10, 1200 + 10, "+"],
+                ["chr1", 800 - 10, 1200 - 10, "-"],
+                ["chrX", 1000 + 10, 1500 + 10, "+"],
+            ],
+            columns=["chrom", "start", "end", "strand"],
+        )
+    )
+
+    pd.testing.assert_frame_equal(
+        bioframe.shift(df, -10, along="strand"),
+        pd.DataFrame(
+            [
+                ["chr1", 1000 - 10, 1200 - 10, "+"],
+                ["chr1", 800 + 10, 1200 + 10, "-"],
+                ["chrX", 1000 - 10, 1500 - 10, "+"],
+            ],
+            columns=["chrom", "start", "end", "strand"],
+        )
+    )
+
+    pd.testing.assert_frame_equal(
+        bioframe.shift(df, (-10, 20), along="strand"),
+        pd.DataFrame(
+            [
+                ["chr1", 1000 - 10, 1200 + 20, "+"],
+                ["chr1", 800 - 20, 1200 + 10, "-"],
+                ["chrX", 1000 - 10, 1500 + 20, "+"],
+            ],
+            columns=["chrom", "start", "end", "strand"],
+        )
+    )
+
+    pd.testing.assert_frame_equal(
+        bioframe.shift(df, (10, -20), along="strand"),
+        pd.DataFrame(
+            [
+                ["chr1", 1000 + 10, 1200 - 20, "+"],
+                ["chr1", 800 + 20, 1200 - 10, "-"],
+                ["chrX", 1000 + 10, 1500 - 20, "+"],
+            ],
+            columns=["chrom", "start", "end", "strand"],
+        )
+    )
+
+    pd.testing.assert_frame_equal(
+        bioframe.shift(df, (10, -200), along="strand", drop_invalid=True),
+        pd.DataFrame(
+            [
+                ["chr1", 800 + 200, 1200 - 10, "-"],
+                ["chrX", 1000 + 10, 1500 - 200, "+"],
+            ],
+            columns=["chrom", "start", "end", "strand"],
+            index=[1, 2],
+        )
+    )
+
+
+def test_shift_strandaware_unstranded():
+    df = pd.DataFrame(
+        [
+            ["chr1", 1000, 1200, "+"],
+            ["chr1", 800, 1200, "."],
+            ["chrX", 1000, 1500, "+"],
+        ],
+        columns=["chrom", "start", "end", "strand"],
+    )
+    pd.testing.assert_frame_equal(
+        bioframe.shift(df, (10, -20), along="strand"),
+        pd.DataFrame(
+            [
+                ["chr1", 1000 + 10, 1200 - 20, "+"],
+                ["chr1", 800, 1200, "."],
+                ["chrX", 1000 + 10, 1500 - 20, "+"],
+            ],
+            columns=["chrom", "start", "end", "strand"],
+        )
+    )
+
+
 def test_expand():
     d = """chrom  start  end
          0  chr1      1    5