Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Option for strand-aware expand #152

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions bioframe/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
"select_labels",
"select_mask",
"setdiff",
"shift",
"sort_bedframe",
"subtract",
"trim",
Expand Down Expand Up @@ -141,6 +142,7 @@
select_labels,
select_mask,
setdiff,
shift,
sort_bedframe,
subtract,
trim,
Expand Down
111 changes: 109 additions & 2 deletions bioframe/ops.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import warnings

import numpy as np
import pandas as pd

Expand All @@ -18,6 +20,7 @@
"closest",
"subtract",
"setdiff",
"shift",
"count_overlaps",
"trim",
"complement",
Expand Down Expand Up @@ -147,7 +150,113 @@ def select(df, region, cols=None):
return df.loc[select_mask(df, region, cols)]


def shift(df, amount, along=None, drop_invalid=False, cols=None):
"""
Translate the bounds of each genomic interval.

Different shift amounts can be applied to leading and trailing bounds, and
can be applied in a strand-aware manner. Negative values indicate a shift
leftwards or upstream.

Parameters
----------
df : pandas.DataFrame

amount : int, array-like, or pair of int or array-like, optional
The amount(s) by which the bounds are linearly shifted. If a pair
``(x, y)``, shift the leading bound by ``x`` and the trailing bound by
``y``. Negative and positive values shift in the upstream and
downstream directions, respectively. Features are taken to assume the
reference orientation unless ``along`` is specified.

along: str, array-like, or None
Name of column that will set up/downstream orientation for each
feature. The column should contain compliant strand values
("+", "-", "."). Unstranded features will be ignored.

drop_invalid: bool, optional [default: False]
Remove any intervals having negative length after shifting bounds.
By default, they will not be removed but a warning will be raised.

cols : (str, str, str) or None
The names of columns containing the chromosome, start and end of the
genomic intervals. Default values are 'chrom', 'start', 'end'.

Returns
-------
pandas.DataFrame

Notes
-----
See :func:`bioframe.trim` for trimming interals after expansion or shift.
"""
ck, sk, ek = _get_default_colnames() if cols is None else cols
checks.is_bedframe(df, raise_errors=True, cols=[ck, sk, ek])

if along is not None:
if isinstance(along, str):
if along not in df.columns:
raise ValueError(
f'Cannot do strand-aware operation: {along} column is missing.'
)
strands = df[along]
else:
strands = along

if not strands.isin(['+', '-', '.']).all():
missing_strand = (~strands.isin(['+', '-', '.'])).sum()
raise ValueError(
'Cannot do strand-aware operation: strand information missing '
f'for {missing_strand}/{df.shape[0]} ranges.'
)

if not isinstance(amount, (list, tuple)):
amount = (amount, amount)
elif len(amount) != 2:
raise ValueError(
"`amount` should be a single object or a sequence of length 2; "
f"got length {len(amount)}."
)

out = df.copy()
if along is None:
out[sk] = df[sk] + amount[0]
out[ek] = df[ek] + amount[1]
else:
out[sk] = np.where(
strands == '+',
df[sk] + amount[0],
np.where(
strands == '-',
df[sk] - amount[1],
df[sk]
)
)
out[ek] = np.where(
strands == '+',
df[ek] + amount[1],
np.where(
strands == '-',
df[ek] - amount[0],
df[ek]
)
)

is_neglen = (out[ek] - out[sk]) < 0
if is_neglen.any():
if drop_invalid:
out = out.loc[~is_neglen]
else:
warnings.warn(
f"Operation produced {is_neglen.sum()}/{out.shape[0]} "
"intervals with negative length."
)

return out


def expand(df, pad=None, scale=None, side="both", cols=None):

"""
Expand each interval by an amount specified with `pad`.

Expand Down Expand Up @@ -185,9 +294,7 @@ def expand(df, pad=None, scale=None, side="both", cols=None):
Notes
-----
See :func:`bioframe.trim` for trimming interals after expansion.

"""

ck, sk, ek = _get_default_colnames() if cols is None else cols
checks.is_bedframe(df, raise_errors=True, cols=[ck, sk, ek])

Expand Down
164 changes: 164 additions & 0 deletions tests/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,170 @@ def test_trim():
)


def test_shift():
df = pd.DataFrame(
[
["chr1", 1000, 1200, "+"],
["chr1", 800, 1200, "-"],
["chrX", 1000, 1500, "+"],
],
columns=["chrom", "start", "end", "strand"],
)

pd.testing.assert_frame_equal(
bioframe.shift(df, 10),
pd.DataFrame(
[
["chr1", 1000 + 10, 1200 + 10, "+"],
["chr1", 800 + 10, 1200 + 10, "-"],
["chrX", 1000 + 10, 1500 + 10, "+"],
],
columns=["chrom", "start", "end", "strand"],
)
)

pd.testing.assert_frame_equal(
bioframe.shift(df, -10),
pd.DataFrame(
[
["chr1", 1000 - 10, 1200 - 10, "+"],
["chr1", 800 - 10, 1200 - 10, "-"],
["chrX", 1000 - 10, 1500 - 10, "+"],
],
columns=["chrom", "start", "end", "strand"],
)
)

pd.testing.assert_frame_equal(
bioframe.shift(df, (-10, 20)),
pd.DataFrame(
[
["chr1", 1000 - 10, 1200 + 20, "+"],
["chr1", 800 - 10, 1200 + 20, "-"],
["chrX", 1000 - 10, 1500 + 20, "+"],
],
columns=["chrom", "start", "end", "strand"],
)
)

pd.testing.assert_frame_equal(
bioframe.shift(df, (10, -20)),
pd.DataFrame(
[
["chr1", 1000 + 10, 1200 - 20, "+"],
["chr1", 800 + 10, 1200 - 20, "-"],
["chrX", 1000 + 10, 1500 - 20, "+"],
],
columns=["chrom", "start", "end", "strand"],
)
)

pd.testing.assert_frame_equal(
bioframe.shift(df, (10, -200), drop_invalid=True),
pd.DataFrame(
[
["chr1", 800 + 10, 1200 - 200, "-"],
["chrX", 1000 + 10, 1500 - 200, "+"],
],
columns=["chrom", "start", "end", "strand"],
index=[1, 2],
)
)


def test_shift_strandaware():
df = pd.DataFrame(
[
["chr1", 1000, 1200, "+"],
["chr1", 800, 1200, "-"],
["chrX", 1000, 1500, "+"],
],
columns=["chrom", "start", "end", "strand"],
)

pd.testing.assert_frame_equal(
bioframe.shift(df, 10, along="strand"),
pd.DataFrame(
[
["chr1", 1000 + 10, 1200 + 10, "+"],
["chr1", 800 - 10, 1200 - 10, "-"],
["chrX", 1000 + 10, 1500 + 10, "+"],
],
columns=["chrom", "start", "end", "strand"],
)
)

pd.testing.assert_frame_equal(
bioframe.shift(df, -10, along="strand"),
pd.DataFrame(
[
["chr1", 1000 - 10, 1200 - 10, "+"],
["chr1", 800 + 10, 1200 + 10, "-"],
["chrX", 1000 - 10, 1500 - 10, "+"],
],
columns=["chrom", "start", "end", "strand"],
)
)

pd.testing.assert_frame_equal(
bioframe.shift(df, (-10, 20), along="strand"),
pd.DataFrame(
[
["chr1", 1000 - 10, 1200 + 20, "+"],
["chr1", 800 - 20, 1200 + 10, "-"],
["chrX", 1000 - 10, 1500 + 20, "+"],
],
columns=["chrom", "start", "end", "strand"],
)
)

pd.testing.assert_frame_equal(
bioframe.shift(df, (10, -20), along="strand"),
pd.DataFrame(
[
["chr1", 1000 + 10, 1200 - 20, "+"],
["chr1", 800 + 20, 1200 - 10, "-"],
["chrX", 1000 + 10, 1500 - 20, "+"],
],
columns=["chrom", "start", "end", "strand"],
)
)

pd.testing.assert_frame_equal(
bioframe.shift(df, (10, -200), along="strand", drop_invalid=True),
pd.DataFrame(
[
["chr1", 800 + 200, 1200 - 10, "-"],
["chrX", 1000 + 10, 1500 - 200, "+"],
],
columns=["chrom", "start", "end", "strand"],
index=[1, 2],
)
)


def test_shift_strandaware_unstranded():
df = pd.DataFrame(
[
["chr1", 1000, 1200, "+"],
["chr1", 800, 1200, "."],
["chrX", 1000, 1500, "+"],
],
columns=["chrom", "start", "end", "strand"],
)
pd.testing.assert_frame_equal(
bioframe.shift(df, (10, -20), along="strand"),
pd.DataFrame(
[
["chr1", 1000 + 10, 1200 - 20, "+"],
["chr1", 800, 1200, "."],
["chrX", 1000 + 10, 1500 - 20, "+"],
],
columns=["chrom", "start", "end", "strand"],
)
)


def test_expand():
d = """chrom start end
0 chr1 1 5
Expand Down