Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] FIx smoothing and aggregation for unbalanced expected #466

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
38 changes: 29 additions & 9 deletions cooltools/api/expected.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,9 @@ def make_diag_tables(clr, regions, regions2=None, clr_weight_name="weight"):
regions2 = bioframe.make_viewframe(
regions2, check_bounds=clr.chromsizes
).to_numpy()
except ValueError: # If there are non-unique entries in regions1/2, possible only for asymmetric expected:
except (
ValueError
): # If there are non-unique entries in regions1/2, possible only for asymmetric expected:
regions = pd.concat(
[
bioframe.make_viewframe([region], check_bounds=clr.chromsizes)
Expand Down Expand Up @@ -1017,25 +1019,43 @@ def expected_cis(

# additional smoothing and aggregating options would add columns only, not replace them
if smooth:
if clr_weight_name is None:
# result["count.avg"] = result["count.sum"] / result["n_valid"]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is this commented out line?

cols = {
"dist": "dist",
"n_pixels": _NUM_VALID,
"n_contacts": "count.sum",
"contact_freq": "count.avg",
"smooth_suffix": ".smoothed",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider renaming "suffix", "smooth suffix seems rather confusing and hard to generalize in future.

}
else:
cols = expected_smoothing.DEFAULT_CVD_COLS

result_smooth = expected_smoothing.agg_smooth_cvd(
result,
sigma_log10=smooth_sigma,
result, sigma_log10=smooth_sigma, cols=cols
)
# add smoothed columns to the result (only balanced for now)
result = result.merge(
result_smooth[["balanced.avg.smoothed", _DIST]],
result_smooth[[cols["contact_freq"] + cols["smooth_suffix"], _DIST]],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find this notation very hard to read. Is it possible to retain something like this?
"count.sum" + suffix / without dictionary, having the regular notation

I endorse both readability and flexibility of the code, and here it looks like the balance is shifted towards flexibility.

on=[_REGION1, _REGION2, _DIST],
how="left",
)
if aggregate_smoothed:
result_smooth_agg = expected_smoothing.agg_smooth_cvd(
result,
groupby=None,
sigma_log10=smooth_sigma,
).rename(columns={"balanced.avg.smoothed": "balanced.avg.smoothed.agg"})
result, groupby=None, sigma_log10=smooth_sigma, cols=cols
).rename(
columns={
cols["contact_freq"]
+ cols["smooth_suffix"]: cols["contact_freq"]
+ cols["smooth_suffix"]
+ ".agg"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

".agg" is not in the cols dict anymore, although it's another type of suffix!
Consistency would be great.

}
)
# add smoothed columns to the result
result = result.merge(
result_smooth_agg[["balanced.avg.smoothed.agg", _DIST]],
result_smooth_agg[
[cols["contact_freq"] + cols["smooth_suffix"] + ".agg", _DIST]
],
on=[
_DIST,
],
Expand Down
48 changes: 48 additions & 0 deletions tests/test_expected.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,54 @@ def test_expected_smooth_cli(request, tmpdir):
assert _delta_smooth_agg < 0.02


def test_expected_smooth_nobalance_cli(request, tmpdir):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this test

# CLI compute-expected for chrom-wide cis-data
in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool")
out_cis_expected = op.join(tmpdir, "cis_unb.exp.tsv")
runner = CliRunner()
result = runner.invoke(
cli,
[
"expected-cis",
"--smooth",
"--aggregate-smoothed",
"--clr-weight-name",
"",
"-o",
out_cis_expected,
in_cool,
],
)
assert result.exit_code == 0
clr = cooler.Cooler(in_cool)
cis_expected = pd.read_table(out_cis_expected, sep="\t")
grouped = cis_expected.groupby(["region1", "region2"])
# full chromosomes in this example:
for (chrom1, chrom2), group in grouped:
assert chrom1 == chrom2
# work only on "large" crhomosomes, skip chrM and such
if chrom1 not in ["chrM", "chrY", "chrX"]:
# extract dense matrix and get desired expected:
matrix = clr.matrix(balance=False).fetch(chrom1)
desired_expected = np.where(
group["dist"] < ignore_diags,
np.nan, # fill nan for ignored diags
_diagsum_symm_dense(matrix),
)
# do overall tolerance instead of element by element comparison
# because of non-matching NaNs and "noiseness" of the non-smoothed
# expected
_delta_smooth = np.nanmax(
np.abs(group["count.avg.smoothed"].to_numpy() - desired_expected)
)
_delta_smooth_agg = np.nanmax(
np.abs(group["count.avg.smoothed.agg"].to_numpy() - desired_expected)
)
# some made up tolerances, that work for this example
assert _delta_smooth < 2000
assert _delta_smooth_agg < 4000


def test_trans_expected_view_cli(request, tmpdir):
# CLI compute expected for cis-data with arbitrary view
# which cannot overlap. But it is symmetrical cis-case.
Expand Down