Skip to content

Commit

Permalink
Merge pull request #97 from kundajelab/dev
Browse files Browse the repository at this point in the history
Bringing down memory footprint of agkm embeddings, pynnd=True option for coarse-grained affmat
  • Loading branch information
AvantiShri committed Nov 29, 2021
2 parents 7de50c1 + 96ae789 commit b136c20
Show file tree
Hide file tree
Showing 9 changed files with 5,054 additions and 444 deletions.
958 changes: 588 additions & 370 deletions examples/simulated_TAL_GATA_deeplearning/TF_MoDISco_TAL_GATA.ipynb

Large diffs are not rendered by default.

Large diffs are not rendered by default.

106 changes: 106 additions & 0 deletions modisco/affinitymat/core.py
Expand Up @@ -12,6 +12,7 @@
import sklearn
from joblib import Parallel, delayed
from tqdm import tqdm
from datetime import datetime


def print_memory_use():
Expand Down Expand Up @@ -177,6 +178,111 @@ def top_k_fwdandrev_dot_prod(fwd_vecs2, fwd_vecs, rev_vecs,
return (sorted_topk_indices, sorted_topk_sims)


class PynndSparseNumpyCosineSimFromFwdAndRevOneDVecs(
        AbstractSparseAffmatFromFwdAndRevOneDVecs):
    """Approximate nearest-neighbor cosine similarities using pynndescent.

    An NNDescent index (metric="cosine") is built over the reference
    vectors and then queried with the forward vectors and, when supplied,
    the reverse vectors; the two result sets are merged so that each
    neighbor appears at most once per query row.
    """

    def __init__(self, n_neighbors, n_jobs, verbose):
        """Store the configuration.

        n_neighbors: number of neighbors to return for each query row
        n_jobs: stored on the instance. NOTE(review): it is not currently
            forwarded to pynndescent anywhere in this class - confirm
            whether it should be.
        verbose: if truthy, print timestamped progress messages
        """
        self.n_neighbors = n_neighbors
        self.n_jobs = n_jobs
        self.verbose = verbose
        if (self.verbose):
            print("Using pynnd for nearest neighbor cosine sims")

    def _log(self, message):
        #Print a timestamped progress message (only when verbose)
        if (self.verbose):
            print(datetime.now(), message)
            sys.stdout.flush()

    def __call__(self, fwd_vecs, rev_vecs, initclusters, fwd_vecs2=None):
        """Return (sims, neighbors) for each row of fwd_vecs.

        fwd_vecs: query vectors (passed through magnitude_norm_sparsemat)
        rev_vecs: reverse counterparts of the query vectors, or None to
            skip the reverse search
        initclusters: must be None; not supported by this implementation
        fwd_vecs2: vectors to build the index over, used when you don't
            just want to compute self-similarities. Defaults to fwd_vecs.

        When rev_vecs is None, sims and neighbors are the arrays derived
        directly from the pynndescent query. Otherwise they are lists
        (one entry per query row) of numpy arrays of length n_neighbors,
        ordered from most to least similar.
        """

        from pynndescent import NNDescent

        assert initclusters is None, ("Currently I haven't built support"
            +" for initclusters; use SparseNumpyCosineSimFromFwdAndRevOneDVecs"
            +" instead")

        #normalize the vectors
        fwd_vecs = magnitude_norm_sparsemat(sparse_mat=fwd_vecs)
        if (rev_vecs is not None):
            rev_vecs = magnitude_norm_sparsemat(sparse_mat=rev_vecs)

        if (fwd_vecs2 is None):
            fwd_vecs2 = fwd_vecs
        else:
            fwd_vecs2 = magnitude_norm_sparsemat(sparse_mat=fwd_vecs2)

        #build the index
        self._log("Building the index")
        index = NNDescent(fwd_vecs2, metric="cosine")

        self._log("Preparing the index")
        index.prepare()
        self._log("Index ready")

        self._log("Querying neighbors for fwd")
        fwd_neighbs, fwd_dists = index.query(fwd_vecs, k=self.n_neighbors)

        if (rev_vecs is None):
            #Need to subtract from 1 because pynndescent returns 1 - cosinesim
            sims = 1.0 - fwd_dists
            neighbors = fwd_neighbs
            return sims, neighbors

        self._log("Querying neighbors for rev")
        #The reverse search must be queried with rev_vecs; querying with
        # fwd_vecs again would just duplicate the forward results and the
        # reverse similarities would never be taken into account
        rev_neighbs, rev_dists = index.query(rev_vecs, k=self.n_neighbors)

        self._log("Unifying fwd and rev")
        fwdrev_neighbs = np.concatenate([fwd_neighbs, rev_neighbs], axis=1)
        fwdrev_dists = np.concatenate([fwd_dists, rev_dists], axis=1)
        fwdrev_dists_argsort = np.argsort(fwdrev_dists, axis=1)

        #need to remove redundancy
        sims = []
        neighbors = []
        for i, sorted_cols in enumerate(fwdrev_dists_argsort):
            sims_this_ex = []
            neighbors_this_ex = []
            neighbors_seen = set()
            #iterate in order of similarities in the fwd/rev sim search
            for j in sorted_cols:
                #get the neighbor
                neighbor = fwdrev_neighbs[i][j]
                #make sure it hasn't appeared before (this can happen if
                # a point is a neighbor according to both the fwd and
                # the rev search)
                if neighbor in neighbors_seen:
                    continue
                neighbors_seen.add(neighbor)
                neighbors_this_ex.append(neighbor)
                #Need to subtract from 1 because pynndescent returns
                # 1 - cosinesim
                sims_this_ex.append(1 - fwdrev_dists[i][j])
                #leave once we have n_neighbors neighbors; since we
                # iterated over the distances in ascending order, these
                # should be the nearest neighbors
                if (len(sims_this_ex)==self.n_neighbors):
                    break
            assert len(neighbors_seen)==self.n_neighbors
            sims.append(np.array(sims_this_ex))
            #neighbors need to be converted to integers as they'll
            # be used later for indexing
            neighbors.append(np.array(neighbors_this_ex).astype("int"))

        return sims, neighbors


class SparseNumpyCosineSimFromFwdAndRevOneDVecs(
AbstractSparseAffmatFromFwdAndRevOneDVecs):

Expand Down
44 changes: 41 additions & 3 deletions modisco/hit_scoring/densityadapted_hitscoring.py
Expand Up @@ -319,8 +319,10 @@ def __init__(self, patterns,
self.seqlet_batch_size = seqlet_batch_size
self.build()

#fine_affmat_nn and seqlet_neighbors are lists of lists, indicating which
# seqlets were the closest ones
def get_classwise_fine_affmat_nn_sumavg(self,
fine_affmat_nn, seqlet_neighbors):
fine_affmat_nn, seqlet_neighbors, exclude_self=False):
num_classes = max(self.motifmemberships)+1
#(not used in the density-adapted scoring) for each class, compute
# the total fine-grained similarity for each class in the topk
Expand All @@ -330,16 +332,37 @@ def get_classwise_fine_affmat_nn_sumavg(self,
(len(fine_affmat_nn), num_classes))
fine_affmat_nn_perclassavg = np.zeros(
(len(fine_affmat_nn), num_classes))

if (exclude_self):
self_not_in_nn = 0 #keep a count for sanity-check purposes

for i in range(len(fine_affmat_nn)):
if (exclude_self):
#exclude_self means exclude the self-similarity
# (which would be 1.0 assuming the alignment works out),
# for the case where we are just sanity-checking
# how this score
# works on the original motif seqlets themselves.
if (i not in seqlet_neighbors[i]):
self_not_in_nn += 1
for classidx in range(num_classes):
class_entries = [fine_affmat_nn[i][j] for
j in range(len(fine_affmat_nn[i]))
if self.motifmemberships[seqlet_neighbors[i][j]]==classidx]
if ((self.motifmemberships[
seqlet_neighbors[i][j]]==classidx)
and (exclude_self==False
or seqlet_neighbors[i][j] != i) )]
if (len(class_entries) > 0):
fine_affmat_nn_perclassum[i][classidx] =\
np.sum(class_entries)
fine_affmat_nn_perclassavg[i][classidx] =\
np.mean(class_entries)

if (exclude_self):
print(self_not_in_nn,"seqlets out of",len(fine_affmat_nn),
"did not have themselves in their nearest neighbs, likely"
"due to alignment issues")

return (fine_affmat_nn_perclassum, fine_affmat_nn_perclassavg)

def pad_seqletdata_to_align(self, fwdseqletdata, revseqletdata,
Expand Down Expand Up @@ -614,13 +637,28 @@ def build(self):
(fann_perclassum, fann_perclassavg) = (
self.get_classwise_fine_affmat_nn_sumavg(
fine_affmat_nn=fine_affmat_nn,
seqlet_neighbors=seqlet_neighbors))
seqlet_neighbors=seqlet_neighbors,
exclude_self=True))
if (self.verbose):
print("Insantiating a precision scorer based on fann_perclasssum")
self.fann_perclasssum_precscorer = util.ClasswisePrecisionScorer(
true_classes=motifmemberships,
class_membership_scores=fann_perclassum)
if (self.verbose):
print("Insantiating a precision scorer based on fann_perclassavg")
self.fann_perclassavg_precscorer = util.ClasswisePrecisionScorer(
true_classes=motifmemberships,
class_membership_scores=fann_perclassavg)

#As a baseline, compare to a scorer that uses aggregate similarity
classpattern_simsandalnmnts = self.get_similarities_to_classpatterns(
seqlets=motifseqlets,
trim_to_central=0)
if (self.verbose):
print("Insantiating a precision scorer based on aggregate sim")
self.aggsim_precscorer = util.ClasswisePrecisionScorer(
true_classes=motifmemberships,
class_membership_scores=classpattern_simsandalnmnts[:,:,0])

if (self.verbose):
print("Mapping affinity to distmat")
Expand Down

0 comments on commit b136c20

Please sign in to comment.