Merge pull request #97 from kundajelab/dev

Bringing down memory footprint of agkm embeddings, pynnd=True option for coarse-grained affmat
kundajelab · Nov 29, 2021 · b136c20 · b136c20
2 parents 7de50c1 + 96ae789
commit b136c20
Show file tree

Hide file tree

Showing 9 changed files with 5,054 additions and 444 deletions.
diff --git a/examples/simulated_TAL_GATA_deeplearning/TF_MoDISco_TAL_GATA.ipynb b/examples/simulated_TAL_GATA_deeplearning/TF_MoDISco_TAL_GATA.ipynb
diff --git a/.../InteractiveViz_TF_MoDISco_TAL_GATA.ipynb → .../InteractiveViz_TF_MoDISco_TAL_GATA.ipynb b/.../InteractiveViz_TF_MoDISco_TAL_GATA.ipynb → .../InteractiveViz_TF_MoDISco_TAL_GATA.ipynb
diff --git a/...Sco_TAL_GATA_With_Filter_Embeddings.ipynb → ...Sco_TAL_GATA_With_Filter_Embeddings.ipynb b/...Sco_TAL_GATA_With_Filter_Embeddings.ipynb → ...Sco_TAL_GATA_With_Filter_Embeddings.ipynb
diff --git a/examples/simulated_TAL_GATA_deeplearning/other/TF_MoDISco_TAL_GATA_withpynndescent.ipynb b/examples/simulated_TAL_GATA_deeplearning/other/TF_MoDISco_TAL_GATA_withpynndescent.ipynb
diff --git a/modisco/affinitymat/core.py b/modisco/affinitymat/core.py
@@ -12,6 +12,7 @@
 import sklearn
 from joblib import Parallel, delayed
 from tqdm import tqdm
+from datetime import datetime
 
 
 def print_memory_use():
@@ -177,6 +178,111 @@ def top_k_fwdandrev_dot_prod(fwd_vecs2, fwd_vecs, rev_vecs,
     return (sorted_topk_indices, sorted_topk_sims)
 
 
+class PynndSparseNumpyCosineSimFromFwdAndRevOneDVecs(
+        AbstractSparseAffmatFromFwdAndRevOneDVecs):
+    """Approximate nearest-neighbor cosine similarities via pynndescent.
+
+    An NNDescent index (metric="cosine") is built over fwd_vecs2, or over
+    fwd_vecs when fwd_vecs2 is None, and is queried with fwd_vecs and,
+    when provided, with rev_vecs. For each query row the n_neighbors
+    most similar distinct hits across both searches are returned.
+    """
+
+    def __init__(self, n_neighbors, n_jobs, verbose):
+        #n_neighbors: number of neighbors returned per query row
+        self.n_neighbors = n_neighbors
+        #NOTE(review): n_jobs is stored but not read anywhere in this class
+        self.n_jobs = n_jobs
+        #verbose: if truthy, print timestamped progress messages
+        self.verbose = verbose
+        if (self.verbose):
+            print("Using pynnd for nearest neighbor cosine sims")
+
+    def _log_progress(self, msg):
+        #print a timestamped status line, but only in verbose mode
+        if (self.verbose):
+            print(datetime.now(), msg)
+            sys.stdout.flush()
+
+    def __call__(self, fwd_vecs, rev_vecs, initclusters, fwd_vecs2=None):
+        """Find the top n_neighbors cosine similarities for each query row.
+
+        fwd_vecs: vectors to query with (magnitude-normalized here).
+        rev_vecs: optional second set of query vectors, row-aligned with
+            fwd_vecs; pass None to search with fwd_vecs only.
+        initclusters: must be None (not supported by this class).
+        fwd_vecs2: optional vectors to index instead of fwd_vecs; used
+            when you don't just want to compute self-similarities.
+
+        Returns (sims, neighbors), where sims[i][j] is the cosine
+        similarity between query row i and indexed row neighbors[i][j].
+        With rev_vecs these are lists of per-row arrays; otherwise they
+        are the arrays derived directly from the index query.
+        """
+        #imported lazily so pynndescent is only needed if this class is used
+        from pynndescent import NNDescent
+
+        assert initclusters is None, ("Currently I haven't built support"
+          +" for initclusters; use SparseNumpyCosineSimFromFwdAndRevOneDVecs"
+          +" instead")
+
+        #normalize the vectors
+        fwd_vecs = magnitude_norm_sparsemat(sparse_mat=fwd_vecs)
+        if (rev_vecs is not None):
+            rev_vecs = magnitude_norm_sparsemat(sparse_mat=rev_vecs)
+        if (fwd_vecs2 is None):
+            fwd_vecs2 = fwd_vecs
+        else:
+            fwd_vecs2 = magnitude_norm_sparsemat(sparse_mat=fwd_vecs2)
+
+        self._log_progress("Building the index")
+        index = NNDescent(fwd_vecs2, metric="cosine")
+        self._log_progress("Preparing the index")
+        index.prepare()
+        self._log_progress("Index ready")
+
+        self._log_progress("Querying neighbors for fwd")
+        fwd_neighbs, fwd_dists = index.query(fwd_vecs, k=self.n_neighbors)
+
+        if (rev_vecs is None):
+            #Need to subtract from 1 because pynndescent returns 1 - cosinesim
+            return 1.0 - fwd_dists, fwd_neighbs
+
+        self._log_progress("Querying neighbors for rev")
+        #query with rev_vecs (not fwd_vecs again) so that the rev search
+        # can actually contribute neighbors the fwd search didn't find
+        rev_neighbs, rev_dists = index.query(rev_vecs, k=self.n_neighbors)
+        self._log_progress("Unifying fwd and rev")
+
+        fwdrev_neighbs = np.concatenate([fwd_neighbs, rev_neighbs], axis=1)
+        fwdrev_dists = np.concatenate([fwd_dists, rev_dists], axis=1)
+        fwdrev_dists_argsort = np.argsort(fwdrev_dists, axis=1)
+
+        #need to remove redundancy
+        sims = []
+        neighbors = []
+        for row_neighbs, row_dists, row_order in zip(
+                fwdrev_neighbs, fwdrev_dists, fwdrev_dists_argsort):
+            sims_this_ex = []
+            neighbors_this_ex = []
+            neighbors_seen = set()
+            #iterate in order of similarities in the fwd/rev sim search
+            for j in row_order:
+                neighbor = row_neighbs[j]
+                #make sure it hasn't appeared before (this can happen if
+                # a point is a neighbor according to both the fwd and
+                # the rev search)
+                if neighbor not in neighbors_seen:
+                    neighbors_seen.add(neighbor)
+                    neighbors_this_ex.append(neighbor)
+                    #Need to subtract from 1 because pynndescent returns
+                    # 1 - cosinesim
+                    sims_this_ex.append(1 - row_dists[j])
+                #leave once we have n_neighbors neighbors; since we
+                # iterated over the distances in ascending order, these
+                # should be the nearest neighbors
+                if (len(sims_this_ex)==self.n_neighbors):
+                    break
+            assert len(neighbors_seen)==self.n_neighbors
+            sims.append(np.array(sims_this_ex))
+            #neighbors need to be converted to integers as they'll
+            # be used later for indexing
+            neighbors.append(np.array(neighbors_this_ex).astype("int"))
+
+        return sims, neighbors
+
+
 class SparseNumpyCosineSimFromFwdAndRevOneDVecs(
         AbstractSparseAffmatFromFwdAndRevOneDVecs):
 

diff --git a/modisco/hit_scoring/densityadapted_hitscoring.py b/modisco/hit_scoring/densityadapted_hitscoring.py
@@ -319,8 +319,10 @@ def __init__(self, patterns,
         self.seqlet_batch_size = seqlet_batch_size
         self.build()
 
+    #fine_affmat_nn and seqlet_neighbors are lists of lists, indicating which
+    # seqlets were the closest ones
     def get_classwise_fine_affmat_nn_sumavg(self,
-            fine_affmat_nn, seqlet_neighbors):
+            fine_affmat_nn, seqlet_neighbors, exclude_self=False):
         num_classes = max(self.motifmemberships)+1
         #(not used in the density-adapted scoring) for each class, compute
         # the total fine-grained similarity for each class in the topk
@@ -330,16 +332,37 @@ def get_classwise_fine_affmat_nn_sumavg(self,
             (len(fine_affmat_nn), num_classes))
         fine_affmat_nn_perclassavg = np.zeros(
             (len(fine_affmat_nn), num_classes))
+
+        if (exclude_self):
+            self_not_in_nn = 0 #keep a count for sanity-check purposes
+
         for i in range(len(fine_affmat_nn)):
+            if (exclude_self): 
+                #exclude_self means exclude the self-similarity
+                # (which would be 1.0 assuming the alignment works out),
+                # for the case where we are just sanity-checking
+                # how this score
+                # works on the original motif seqlets themselves.
+                if (i not in seqlet_neighbors[i]):
+                    self_not_in_nn += 1
             for classidx in range(num_classes):
                 class_entries = [fine_affmat_nn[i][j] for
                    j in range(len(fine_affmat_nn[i]))
-                   if self.motifmemberships[seqlet_neighbors[i][j]]==classidx]
+                   if ((self.motifmemberships[
+                              seqlet_neighbors[i][j]]==classidx)
+                       and (exclude_self==False
+                            or seqlet_neighbors[i][j] != i) )]
                 if (len(class_entries) > 0):
                     fine_affmat_nn_perclassum[i][classidx] =\
                         np.sum(class_entries)
                     fine_affmat_nn_perclassavg[i][classidx] =\
                         np.mean(class_entries)
+
+        if (exclude_self):
+            print(self_not_in_nn,"seqlets out of",len(fine_affmat_nn),
+                  "did not have themselves in their nearest neighbs, likely"
+                  "due to alignment issues") 
+
         return (fine_affmat_nn_perclassum, fine_affmat_nn_perclassavg)
 
     def pad_seqletdata_to_align(self, fwdseqletdata, revseqletdata,
@@ -614,13 +637,28 @@ def build(self):
         (fann_perclassum, fann_perclassavg) = (
             self.get_classwise_fine_affmat_nn_sumavg(
                 fine_affmat_nn=fine_affmat_nn,
-                seqlet_neighbors=seqlet_neighbors))
+                seqlet_neighbors=seqlet_neighbors,
+                exclude_self=True))
+        if (self.verbose):
+            print("Insantiating a precision scorer based on fann_perclasssum")
         self.fann_perclasssum_precscorer = util.ClasswisePrecisionScorer(
             true_classes=motifmemberships,
             class_membership_scores=fann_perclassum) 
+        if (self.verbose):
+            print("Insantiating a precision scorer based on fann_perclassavg")
         self.fann_perclassavg_precscorer = util.ClasswisePrecisionScorer(
             true_classes=motifmemberships,
             class_membership_scores=fann_perclassavg) 
+
+        #As a baseline, compare to a scorer that uses aggregate similarity
+        classpattern_simsandalnmnts = self.get_similarities_to_classpatterns(
+                                seqlets=motifseqlets,
+                                trim_to_central=0)
+        if (self.verbose):
+            print("Insantiating a precision scorer based on aggregate sim")
+        self.aggsim_precscorer = util.ClasswisePrecisionScorer(
+            true_classes=motifmemberships,
+            class_membership_scores=classpattern_simsandalnmnts[:,:,0]) 
 
         if (self.verbose):
             print("Mapping affinity to distmat")