Merge pull request #99 from kundajelab/dev2
Trying to bring down Leiden memory use - 1
AvantiShri committed Jan 27, 2022
2 parents cb2ec8e + 587ccd5 commit c1cbf7c
Showing 6 changed files with 43 additions and 12 deletions.
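
Note on the pattern behind these changes: rather than keeping extra in-memory copies of the affinity matrix's edge list (and an igraph graph in the parent process), the edge arrays are saved once as .npy files, freed, and re-loaded by each parallel Leiden worker, with timestamped memory logging added around each step. A minimal sketch of that save-and-free pattern, assuming a scipy sparse affinity matrix (the helper name dump_edges is illustrative, not part of the repo):

    import uuid
    import numpy as np

    def dump_edges(affinity_mat):
        # Persist the sparse matrix's edges so each worker process can
        # load them from disk instead of inheriting large in-memory copies.
        uid = uuid.uuid1().hex
        sources, targets = affinity_mat.nonzero()
        weights = affinity_mat[sources, targets]
        np.save(uid+"_sources.npy", sources)
        np.save(uid+"_targets.npy", targets)
        np.save(uid+"_weights.npy", np.asarray(weights).ravel())
        del sources, targets, weights  # drop the parent-process copies
        return uid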
28 changes: 25 additions & 3 deletions modisco/cluster/core.py
@@ -11,6 +11,7 @@
 import os, re
 import subprocess
 from joblib import Parallel, delayed
+from ..util import print_memory_use
 
 
 class ClusterResults(object):
@@ -105,6 +106,7 @@ def __call__(self, orig_affinity_mat, initclusters):

         if (self.verbose):
             print("Beginning preprocessing + Leiden")
+            print_memory_use()
             sys.stdout.flush()
         all_start = time.time()
         if (self.affmat_transformer is not None):
@@ -227,14 +229,20 @@ def __call__(self, orig_affinity_mat, initclusters):

         if (self.verbose):
             print("Beginning preprocessing + Leiden")
+            print_memory_use()
             sys.stdout.flush()
+
         all_start = time.time()
+
         if (self.affmat_transformer is not None):
             affinity_mat = self.affmat_transformer(orig_affinity_mat)
+            if (self.verbose):
+                print("Affmat transformed")
+                print_memory_use()
+                sys.stdout.flush()
         else:
             affinity_mat = orig_affinity_mat
 
-        the_graph = get_igraph_from_adjacency(adjacency=affinity_mat)
         best_clustering = None
         best_quality = None

@@ -250,24 +258,33 @@ def __call__(self, orig_affinity_mat, initclusters):
         if (initclusters is not None):
             initclusters_to_try_list.append(True)
 
         #write out the contents of affinity_mat and initclusters if applicable
         uid = uuid.uuid1().hex
 
         sources, targets = affinity_mat.nonzero()
         weights = affinity_mat[sources, targets]
 
+        if (self.verbose):
+            print("sources, targets, weights extracted")
+            print_memory_use()
+            sys.stdout.flush()
+
         np.save(uid+"_sources.npy", sources)
         np.save(uid+"_targets.npy", targets)
         np.save(uid+"_weights.npy", weights.A1) #A1 is the same as ravel()
+
+        del sources, targets, weights
 
         if (initclusters is not None):
             np.save(uid+"_initclusters.npy", initclusters)
             print("initclusters length:",len(initclusters))
 
         for use_initclusters in initclusters_to_try_list:
 
             print("Affmat shape:",affinity_mat.shape[0])
+            if (self.verbose):
+                print("About to launch parallel Leiden runs")
+                print_memory_use()
+                sys.stdout.flush()
 
             parallel_leiden_results = (
                 Parallel(n_jobs=self.n_jobs,
@@ -279,6 +296,11 @@ def __call__(self, orig_affinity_mat, initclusters):
                         seed*100, self.refine)
                     for seed in toiterover))
 
+            if (self.verbose):
+                print("Parallel Leiden runs finished")
+                print_memory_use()
+                sys.stdout.flush()
+
             for quality,membership in parallel_leiden_results:
                 if ((best_quality is None) or (quality > best_quality)):
                     best_quality = quality
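
Aside on the weights.A1 line above: fancy-indexing a scipy sparse matrix returns a 1 x nnz numpy matrix, and .A1 flattens it to a plain 1-D ndarray before saving. A small self-contained illustration (assumes scipy):

    import numpy as np
    from scipy.sparse import csr_matrix

    mat = csr_matrix(np.array([[0.0, 0.5],
                               [0.25, 0.0]]))
    rows, cols = mat.nonzero()
    weights = mat[rows, cols]   # numpy matrix of shape (1, 2)
    flat = weights.A1           # array([0.5, 0.25]); same result as ravel()
    assert np.allclose(flat, np.asarray(weights).ravel())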
4 changes: 2 additions & 2 deletions modisco/cluster/run_leiden
@@ -14,8 +14,8 @@ def get_igraph(sources_idxs_file, targets_idxs_file, weights_file, n_vertices):
     weights = np.load(weights_file)
     g = ig.Graph(directed=None)
     g.add_vertices(n_vertices) # this adds adjacency.shape[0] vertices
-    g.add_edges(list(zip(sources.tolist(), targets.tolist())))
-    g.es['weight'] = weights.tolist()
+    g.add_edges(list(zip(sources, targets)))
+    g.es['weight'] = weights
     if g.vcount() != n_vertices:
         print('WARNING: The constructed graph has only '
               +str(g.vcount())+' nodes. '
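
The run_leiden change drops the .tolist() calls: python-igraph accepts numpy integers and arrays here, so converting the edge arrays to Python lists only added two full-size temporary lists. A hedged sketch with toy edge arrays, mirroring the script's setup:

    import numpy as np
    import igraph as ig

    sources = np.array([0, 1, 2])
    targets = np.array([1, 2, 0])
    weights = np.array([0.5, 0.25, 0.75])

    g = ig.Graph(directed=None)
    g.add_vertices(3)
    g.add_edges(list(zip(sources, targets)))  # numpy ints, no .tolist() copies
    g.es['weight'] = weights                  # ndarray assigned directly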
15 changes: 11 additions & 4 deletions modisco/tfmodisco_workflow/seqlets_to_patterns.py
@@ -79,6 +79,7 @@ class TfModiscoSeqletsToPatternsFactory(object):

     @legacy_tfmodiscoseqletstopatternsfactory
     def __init__(self, n_cores=4,
+                 n_cores_mainclustering=None,
                  min_overlap_while_sliding=0.7,
 
                  #init clusterer factory
@@ -138,7 +139,10 @@ def __init__(self, n_cores=4,
+" set use_louvain to False")

#affinity_mat calculation
if (n_cores_mainclustering is None):
n_cores_mainclustering = n_cores
self.n_cores = n_cores
self.n_cores_mainclustering = n_cores_mainclustering
self.min_overlap_while_sliding = min_overlap_while_sliding

self.embedder_factory = embedder_factory
@@ -202,6 +206,7 @@ def get_jsonable_config(self):
         to_return = OrderedDict([
             ('class_name', type(self).__name__),
             ('n_cores', self.n_cores),
+            ('n_cores_mainclustering', self.n_cores_mainclustering),
             ('initclusterer_factory',
              self.initclusterer_factory.get_jsonable_config()),
             ('min_overlap_while_sliding', self.min_overlap_while_sliding),
@@ -341,15 +346,16 @@ def __call__(self, track_set, onehot_track_name,
                 affinitymat.transformers.LouvainMembershipAverage(
                     n_runs=n_runs,
                     level_to_return=level_to_return,
-                    parallel_threads=self.n_cores, seed=self.seed))
+                    parallel_threads=self.n_cores_mainclustering,
+                    seed=self.seed))
             clusterer_r1 = cluster.core.LouvainCluster(
                 level_to_return=self.final_louvain_level_to_return,
                 affmat_transformer=affmat_transformer_r1,
                 contin_runs=self.contin_runs_r1,
                 verbose=self.verbose, seed=self.seed)
         else:
             clusterer_r1 = cluster.core.LeidenClusterParallel(
-                n_jobs=self.n_cores,
+                n_jobs=self.n_cores_mainclustering,
                 affmat_transformer=affmat_transformer_r1,
                 numseedstotry=self.contin_runs_r1,
                 n_leiden_iterations=self.n_leiden_iterations_r1,
@@ -367,7 +373,8 @@ def __call__(self, track_set, onehot_track_name,
                 affinitymat.transformers.LouvainMembershipAverage(
                     n_runs=n_runs,
                     level_to_return=level_to_return,
-                    parallel_threads=self.n_cores, seed=self.seed))
+                    parallel_threads=self.n_cores_mainclustering,
+                    seed=self.seed))
             clusterer_r2 = cluster.core.LouvainCluster(
                 level_to_return=self.final_louvain_level_to_return,
                 affmat_transformer=affmat_transformer_r2,
@@ … @@
                 initclusters_weight=self.louvain_initclusters_weight)
         else:
             clusterer_r2 = cluster.core.LeidenClusterParallel(
-                n_jobs=self.n_cores,
+                n_jobs=self.n_cores_mainclustering,
                 affmat_transformer=affmat_transformer_r2,
                 numseedstotry=self.contin_runs_r2,
                 n_leiden_iterations=self.n_leiden_iterations_r2,
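
The new n_cores_mainclustering argument defaults to n_cores, but lets the Leiden/Louvain clustering stage run with fewer parallel workers than the rest of the workflow, since each concurrent Leiden run holds its own copy of the edge arrays. A hypothetical construction showing the two knobs (argument values are illustrative):

    from modisco.tfmodisco_workflow.seqlets_to_patterns import (
        TfModiscoSeqletsToPatternsFactory)

    # 10 workers for the general pipeline, but cap the memory-hungry
    # main clustering stage at 4 concurrent Leiden runs.
    factory = TfModiscoSeqletsToPatternsFactory(
        n_cores=10,
        n_cores_mainclustering=4)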
3 changes: 2 additions & 1 deletion modisco/util.py
@@ -9,13 +9,14 @@
 from sklearn.metrics import average_precision_score, precision_recall_curve
 from sklearn.isotonic import IsotonicRegression
 from joblib import Parallel, delayed
+from datetime import datetime
 
 
 def print_memory_use():
     import os
     import psutil
     process = psutil.Process(os.getpid())
-    print("MEMORY",process.memory_info().rss/1000000000)
+    print(datetime.now(),"MEMORY",process.memory_info().rss/1000000000)
 
 
 def load_patterns(grp, track_set):
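
With the datetime import, print_memory_use now timestamps each reading (RSS is reported in GB). Expected usage, with approximate output:

    from modisco.util import print_memory_use

    print_memory_use()
    # e.g. 2022-01-27 10:15:30.123456 MEMORY 1.234567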
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@
     description='TF MOtif Discovery from Importance SCOres',
     long_description="""Algorithm for discovering consolidated patterns from base-pair-level importance scores""",
     url='https://github.com/kundajelab/tfmodisco',
-    version='0.5.16.1',
+    version='0.5.16.2',
     packages=find_packages(),
     package_data={
         '': ['cluster/phenograph/louvain/*convert*', 'cluster/phenograph/louvain/*community*', 'cluster/phenograph/louvain/*hierarchy*']
3 changes: 2 additions & 1 deletion test/test_tfmodisco_workflow.py
@@ -120,7 +120,8 @@ def test_base_workflow(self):
                 initial_flank_to_add=5,
                 kmer_len=5, num_gaps=1,
                 num_mismatches=0,
-                final_min_cluster_size=60)
+                final_min_cluster_size=60,
+                n_cores=4, n_cores_mainclustering=4)
             )(
                 task_names=["task0", "task1", "task2"],
                 contrib_scores=task_to_scores,
