Commit

bugfixes in similarity calculation
fast prefilter similarity calculation now functional
fungs committed Apr 17, 2018
1 parent 6f4ad36 commit 6cf0f49
Showing 2 changed files with 40 additions and 66 deletions.
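In short, as reconstructed from the diff below: each bin (column) gets a frozenset of its dominant sequence indices, built from per-sequence posterior ratios, and a cheap set-overlap test gates the expensive kbl_similarity call. A minimal sketch of that idea; the helper names and the overlap coefficient are assumptions, since the actual set-similarity code is elided from this diff:

import numpy as np

def dominant_sets(log_mat_posterior, prefilter_set_threshold=0.01):
    # One frozenset per bin (column): indices of sequences whose
    # log posterior ratio exceeds log(prefilter_set_threshold).
    cutoff = np.log(prefilter_set_threshold)  # compare in log space
    return [frozenset(np.where(col > cutoff)[0]) for col in log_mat_posterior.T]

def overlap(a, b):
    # Assumed overlap coefficient: |a & b| / min(|a|, |b|).
    if not a or not b:
        return 0.0
    return len(a & b) / min(len(a), len(b))

Only pairs whose dominant sets overlap at least prefilter_threshold (default 0.5) reach the expensive kbl_similarity computation.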
46 changes: 20 additions & 26 deletions mglex/cli/bincompare.py
@@ -6,20 +6,19 @@
Usage:
bincompare (--help | --version)
bincompare [--weight <file> --data <file> --responsibility <file> --subset-1 <file> --subset-2 <file> --beta <float> --posterior-ratio --prefilter-threshold <float> --edge-list <float>]
bincompare [--weight <file> --data <file> --subset-1 <file> --subset-2 <file> --beta <float> --posterior-ratio]
[--prefilter-thresh <float> --edge-thresh <float>]
-h, --help Show this screen
-v, --version Show version
-q, --posterior-ratio Weigh sequences by (subset) bin posterior [default: False]
-d <file>, --data <file> Likelihood matrix [standard input]
-r <file>, --responsibility <file> Responsibility (weight) matrix file; when not given data is used [None]
-w <file>, --weight <file> Optional weights (sequence length) file [None]
-s <file>, --subset-1 <file> Use subset of column indices (1-based) [None]
-S <file>, --subset-2 <file> Use subset of column indices (1-based) [None]
-b <float>, --beta <float> Beta correction factor (e.g. determined via MSE evaluation) [default: 1.0]
-p <float>, --prefilter-threshold Contig overlap similarity used to avoid likelihood calculations [default: 0.5]
-e <float>, --edge-list <float> If given, output distances as edge list;
only distances <= threshold are reported; use "inf" to show all [inf]
-h, --help Show this screen
-v, --version Show version
-q, --posterior-ratio Weigh sequences by (subset) bin posterior [default: False]
-d <file>, --data <file> Likelihood matrix [standard input]
-w <file>, --weight <file> Optional weights (sequence length) file [None]
-s <file>, --subset-1 <file> Use subset of column indices (1-based) [None]
-S <file>, --subset-2 <file> Use subset of column indices (1-based) [None]
-b <float>, --beta <float> Beta correction factor (e.g. determined via MSE evaluation) [default: 1.0]
-p <float>, --prefilter-thresh <float> Contig overlap similarity used to avoid likelihood calculations [default: 0.5]
-e <float>, --edge-thresh <float> Only distances <= threshold are reported; use "inf" to show all [default: 30]
"""

import sys
@@ -58,11 +58,6 @@ def main(argv):
with np.errstate(over='ignore'):
likelihood *= beta

# load responsibility matrix
log_responsibility = argument["--responsibility"]
if log_responsibility is not None:
log_responsibility = common.load_probmatrix_file(log_responsibility)

# load weights
log_weight = argument["--weight"]
if log_weight is not None:
@@ -84,18 +78,18 @@
if argument["--posterior-ratio"]:
with np.errstate(invalid='ignore'):
likelihood -= np.max(likelihood, axis=1, keepdims=True)
posterior_scale = True
else:
posterior_scale = False

if not np.all(np.isfinite(np.sum(likelihood, axis=1))):
warnings.warn("Warning: some sequences have all zero likelihood and are ignored in distance calculations", UserWarning)

if argument["--edge-list"]:
threshold = float(argument["--edge-list"])
for i, j, dist, ssim in evaluation.binsimilarity_iter(likelihood, log_weight=log_weight, indices=(subset1,subset2), log_responsibility=log_responsibility, prefilter_threshold=float(argument["--prefilter-threshold"]), log_p_threshold=threshold):
sys.stdout.write("%i\t%i\t%.2f\t%.2f\n" % (i+1, j+1, np.abs(dist), ssim)) # TODO: make precision
# configurable
else:
distmat = evaluation.binsimilarity_matrix(likelihood, log_weight=log_weight, log_responsibility=log_responsibility) # TODO: remove, gets too large anyways
common.write_probmatrix(distmat)
for i, j, dist, ssim in evaluation.binsimilarity_iter(likelihood, log_weight=log_weight, indices=(subset1, subset2),
prefilter_threshold=float(argument["--prefilter-thresh"]),
log_p_threshold=-float(argument["--edge-thresh"]),
posterior_scale=posterior_scale):
sys.stdout.write("%i\t%i\t%.2f\t%.2f\n" % (i+1, j+1, np.abs(dist), ssim)) # TODO: make precision configurable

if __name__ == "__main__":
main(sys.argv[1:])
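For reference, the --posterior-ratio scaling in the hunk above shifts each row of the log-likelihood matrix by its row maximum; binsimilarity_iter applies the same shift internally when posterior_scale is False. A standalone sketch (function name assumed):

import numpy as np

def to_log_posterior_ratio(log_likelihood):
    # Shift each row so its maximum is 0, i.e. divide each sequence's
    # likelihoods by its per-sequence maximum in linear space.
    # An all -inf row yields NaN here (hence errstate), which the
    # all-zero-likelihood warning in main() then flags.
    with np.errstate(invalid='ignore'):
        return log_likelihood - np.max(log_likelihood, axis=1, keepdims=True)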
60 changes: 20 additions & 40 deletions mglex/evaluation.py
@@ -290,7 +290,8 @@ def kbl_similarity(log_col1, log_col2, log_weight=None, truncate=0.05):
tmp_pair[:n] = tmp_pair[mask]
tmp_pair.resize((n, 2)) # resize
factor[:n] = factor[mask]
factor.resize(n) # resize and change dimension

factor.resize(n) # resize/change dimension

# calculate likelihood ratios for all data points
log_col1 = tmp_pair[:, 0] # first column view
@@ -331,6 +332,7 @@ def kbl_similarity(log_col1, log_col2, log_weight=None, truncate=0.05):
mix = np.divide(p1 + p2*r2, r2 + 1.)

#common.print_probvector(mix, file=sys.stderr)
#sys.stderr.write("%s, %s\n" % (factor.shape, mix.shape))
factor *= mix #np.squeeze(factor, axis=1) # TODO: might give zero factors: remove or use log-space

# compress: remove all NaN values
@@ -418,35 +420,10 @@ def combine_weight(log_weight, log_responsibility, i, j):
return extra


def binsimilarity_matrix(log_mat, log_weight=None, log_responsibility=None): # TODO: remove, gets too large anyways
"""Calculate n*(n-1)/2 bin similarities by formula (2*L_b*L_b)/(L_a^2+L_b^2) and output as full matrix"""

n, d = log_mat.shape
smat = np.zeros(shape=(d, d), dtype=types.logprob_type) # TODO: use numpy triangle matrix object?

if log_weight is not None:
assert log_weight.shape[0] == n
assert np.any(np.isfinite(log_weight)) # TODO: allow also zero weights

for i in range(d):
for j in range(i+1, d):
# print("\n\ncol %i vs. %i:" % (i,j), file=sys.stderr)
lw = combine_weight(log_weight, log_responsibility, i, j)
#if lw is not None:
# sys.stderr.write("Nonzero elements in weights: %i\n" % sum(np.isfinite(lw)))
log_p = kbl_similarity(log_mat[:, i], log_mat[:, j], lw)

if log_p >= .0:
smat[i, j] = smat[j, i] = 0.0
if log_p > .0:
warnings.warn("Similarity larger than 1.0", UserWarning)
else:
smat[i, j] = smat[j, i] = log_p
return smat


def binsimilarity_iter(log_mat, log_weight=None, log_responsibility=None, indices=(None,None), prefilter_set_threshold=0.01, prefilter_threshold=0.5, log_p_threshold=0.0):
"""Calculate n*(n-1)/2 bin similarities by formula (2*L_b*L_b)/(L_a^2+L_b^2) and output pairwise"""
def binsimilarity_iter(log_mat, log_weight=None, indices=(None,None), prefilter_set_threshold=0.01,
prefilter_threshold=0.5, log_p_threshold=0.0, posterior_scale=False):
"""Calculate n*(n-1)/2 bin similarities by formula (w*((L_a^2+L_b^2)/(L_a+L_b))/Z)*log((2*L_a*L_b)/(
L_a^2+L_b^2)), Z is a normalization constant, and output pairwise"""

n, d = log_mat.shape
d1 = list(range(d)) if indices[0] is None else indices[0]
@@ -455,14 +432,17 @@ def binsimilarity_iter(log_mat, log_weight=None, log_responsibility=None, indice
if log_weight is not None:
assert log_weight.shape[0] == n
assert np.any(np.isfinite(log_weight)) # TODO: allow also zero weights


# prefiltering depends on top posterior overlap estimation
prefilter_threshold_setconstruct = np.log(prefilter_set_threshold) # transform into log space
if log_responsibility is not None: # create sets to check fast if two bins are similar
dominant = [frozenset(np.where(col>prefilter_threshold_setconstruct)[0]) for col in
log_responsibility.T] # TODO: use seqlen-weighted version
else: # we need the normalized weights for fast filtering!
dominant = [frozenset(np.where(col > prefilter_threshold_setconstruct)[0]) for col in
log_mat.T] # TODO: use seqlen-weighted version
if posterior_scale:
log_mat_posterior = log_mat
else:
log_mat_posterior = log_mat - np.max(log_mat, axis=1, keepdims=True)

# create sets to check fast if two bins are similar
dominant = [frozenset(np.where(col>prefilter_threshold_setconstruct)[0]) for col in
log_mat_posterior.T] # TODO: use seqlen-weighted version

memory=set() # remember edges (kbl divergence is symmetric)
for i, j in itertools.product(d1, d2):
Expand All @@ -478,10 +458,10 @@ def binsimilarity_iter(log_mat, log_weight=None, log_responsibility=None, indice
if setsim < prefilter_threshold:
continue

lw = combine_weight(log_weight, log_responsibility, i, j) # TODO: check implementation and internal subsetting
log_p = kbl_similarity(log_mat[:, i], log_mat[:, j], lw)
#lw = combine_weight(log_weight, log_responsibility, i, j) # TODO: check implementation and internal subsetting
log_p = kbl_similarity(log_mat[:, i], log_mat[:, j], log_weight)

if log_p > log_p_threshold:
if log_p < log_p_threshold:
continue

if log_p > .0:

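For reference, the similarity formula in the new binsimilarity_iter docstring, written out (a reconstruction from the docstring's shorthand; the per-sequence sum over i is assumed, and Z is the unspecified normalization constant):

S(a, b) = \frac{1}{Z} \sum_i w_i \, \frac{L_{ia}^2 + L_{ib}^2}{L_{ia} + L_{ib}} \log\!\left( \frac{2\, L_{ia} L_{ib}}{L_{ia}^2 + L_{ib}^2} \right)

Since 2*L_a*L_b <= L_a^2 + L_b^2, the log factor is <= 0 with equality only when L_a = L_b, so log_p <= 0 and any positive value triggers the "Similarity larger than 1.0" warning. This also explains the corrected comparison above: pairs are kept when log_p >= log_p_threshold, and the CLI passes log_p_threshold = -(--edge-thresh), i.e. only distances |log_p| <= threshold are reported.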