From df2320e73c0d883fdb36704f221496b098175999 Mon Sep 17 00:00:00 2001 From: Hanna Mendes Levitin Date: Fri, 6 Dec 2019 18:08:21 -0500 Subject: [PATCH 1/7] timestr on diffex outdir if already exists --- scripts/cluster_diffex.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/scripts/cluster_diffex.py b/scripts/cluster_diffex.py index 29003d8..b54bd36 100644 --- a/scripts/cluster_diffex.py +++ b/scripts/cluster_diffex.py @@ -1,6 +1,7 @@ -import os import argparse import json +import os +import time import numpy as np import pandas as pd @@ -123,6 +124,8 @@ def _get_distance_label(metric): args = parser.parse_args() args = _parseargs_post(args) + timestr = time.strftime("%Y%m%d-%H%M%S") + # load the count matrix print('Loading UMI count matrix') cellinfo = None @@ -140,7 +143,8 @@ def _get_distance_label(metric): running_prefix = [args.prefix] # save the arguments - arg_file = '{}/{}.commandline_ags.json'.format(args.outdir, args.prefix) + arg_file = '{}/{}.commandline_args.json'.format(args.outdir, args.prefix) + args.starttime = timestr print('Writing args to {}'.format(arg_file)) with open(arg_file, 'w') as f: json.dump(args.__dict__, f, indent=2) @@ -232,6 +236,9 @@ def _get_distance_label(metric): verbose=True) diffex_outdir = '{}/{}.diffex_binom/'.format(args.outdir, '.'.join(running_prefix)) + if os.path.exists(diffex_outdir): + diffex_outdir.replace('_binom', '_binom-' + timestr) + write_diffex_by_cluster(up, down, diffex_outdir, cluster_info) diffex_heatmap(counts, genes, communities, up, 10, diffex_outdir, '.'.join(running_prefix)) From b8ec262704b35e2375a447f4d4be0dc8c57c8f04 Mon Sep 17 00:00:00 2001 From: Hanna Mendes Levitin Date: Fri, 6 Dec 2019 19:17:18 -0500 Subject: [PATCH 2/7] ix to loc --- clusterdiffex/diffex.py | 12 ++++++------ setup.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/clusterdiffex/diffex.py b/clusterdiffex/diffex.py index c8bd571..a044590 100644 --- a/clusterdiffex/diffex.py +++ b/clusterdiffex/diffex.py @@ -75,8 +75,8 @@ def sort(self, index): index : pandas index or series """ assert(index.sort_values().equals(self.n_cells_exp.index.sort_values())) - self.n_cells_exp = self.n_cells_exp.ix[index] - self.n_mol = self.n_mol.ix[index] + self.n_cells_exp = self.n_cells_exp.loc[index] + self.n_mol = self.n_mol.loc[index] if self.med_mol is not None: self.med_mol = self.med_mol.loc[index] @@ -319,13 +319,13 @@ def binomial_test_cluster_vs_rest(expression, genes, clusters, min_effectsize=min_effectsize, FDR=FDR, min_proportion=min_proportion) up_c['cluster'] = cluster.id - up_c['gene'] = genes.ix[up_c.index].gene - up_c['ens'] = genes.ix[up_c.index].ens + up_c['gene'] = genes.loc[up_c.index].gene + up_c['ens'] = genes.loc[up_c.index].ens up.append(up_c) down_c['cluster'] = cluster.id - down_c['gene'] = genes.ix[down_c.index].gene - down_c['ens'] = genes.ix[down_c.index].ens + down_c['gene'] = genes.loc[down_c.index].gene + down_c['ens'] = genes.loc[down_c.index].ens down.append(down_c) if garbage_collect: diff --git a/setup.py b/setup.py index 263414b..56ebe20 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import find_packages, setup -__version__ = '0.0.1' +__version__ = '0.0.2' requires = ['scipy >= 1.1', 'numpy', From 98a50732d0e8bc0fc18a5a46223cc3ae73a3afde Mon Sep 17 00:00:00 2001 From: Hanna Mendes Levitin Date: Mon, 9 Dec 2019 09:24:11 -0500 Subject: [PATCH 3/7] clearer json file name, (less conflict probability) --- scripts/cluster_diffex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cluster_diffex.py b/scripts/cluster_diffex.py index b54bd36..3b008e8 100644 --- a/scripts/cluster_diffex.py +++ b/scripts/cluster_diffex.py @@ -143,7 +143,7 @@ def _get_distance_label(metric): running_prefix = [args.prefix] # save the arguments - arg_file = '{}/{}.commandline_args.json'.format(args.outdir, args.prefix) + arg_file = '{}/{}.cluster_diffex_args.json'.format(args.outdir, args.prefix) args.starttime = timestr print('Writing args to {}'.format(arg_file)) with open(arg_file, 'w') as f: From d3d3c85de986e6a4359b82429af01d55b17bcbbc Mon Sep 17 00:00:00 2001 From: Hanna Mendes Levitin Date: Tue, 21 Jan 2020 20:33:13 -0500 Subject: [PATCH 4/7] fill nans for umap --- scripts/cluster_diffex.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/cluster_diffex.py b/scripts/cluster_diffex.py index 3b008e8..5cf40f2 100644 --- a/scripts/cluster_diffex.py +++ b/scripts/cluster_diffex.py @@ -131,6 +131,7 @@ def _get_distance_label(metric): cellinfo = None if args.count_matrix.endswith('.loom'): counts, genes, cellinfo = load_loom(args.count_matrix) + genes = genes[['Accession', 'Gene']] else: counts, genes = load_txt(args.count_matrix) counts = pd.DataFrame(counts.T.A) @@ -204,8 +205,8 @@ def _get_distance_label(metric): print('{} clusters identified by Phenograph'.format(nclusters)) # visualize - umap = run_umap(distance, prefix='.'.join(running_prefix), - outdir=args.outdir) + umap = run_umap(np.nan_to_num(distance).astype(np.float32), + prefix='.'.join(running_prefix), outdir=args.outdir) if args.dmap: try: dca = run_dca(distance, prefix='.'.join(running_prefix), From 1936aae4b497a61ba7f4e7ec084be8dff680aa5e Mon Sep 17 00:00:00 2001 From: Hanna Mendes Levitin Date: Wed, 1 Jul 2020 17:32:29 -0400 Subject: [PATCH 5/7] compatability with tcell2020 --- scripts/cluster_diffex.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/scripts/cluster_diffex.py b/scripts/cluster_diffex.py index 5cf40f2..01b9a7a 100644 --- a/scripts/cluster_diffex.py +++ b/scripts/cluster_diffex.py @@ -65,6 +65,8 @@ def _parser(): parser.add_argument('--unscaled-score', action='store_true', default=False, help='Use an unscaled score with fixed bins for marker selection' ' rather than the default scaled score with fixed bin.') + parser.add_argument('--markers-only', action='store_true', default=False, + help='Only do marker things then stop') # visualization parser.add_argument('--tsne', action='store_true', default=False) @@ -119,7 +121,7 @@ def _get_distance_label(metric): return metric -if __name__=='__main__': +def main(): parser = _parser() args = parser.parse_args() args = _parseargs_post(args) @@ -177,19 +179,25 @@ def _get_distance_label(metric): msg += 'in count matrix.' print(msg.format(len(markers.ens.unique()), args.marker_file, len(marker_ix))) + running_prefix.append('fmarkers') elif args.unscaled_score: marker_ix = select_markers_static_bins_unscaled(counts.values, outdir=args.outdir, prefix=args.prefix, gene_names=genes, t=args.absolute_threshold) + running_prefix.append('markers') else: # pick our own markers using the dropout curve marker_ix = select_markers(counts.values, outdir=args.outdir, prefix=args.prefix, gene_names=genes, window=args.window_size, nstd=args.nstd, t=args.absolute_threshold) - running_prefix.append('markers') + running_prefix.append('markers') redux = norm.iloc[marker_ix] + if args.markers_only: + print('Exiting because received flag --markers-only') + return + # get distance metric_label = _get_distance_label(args.distance) running_prefix.append(metric_label) @@ -250,4 +258,5 @@ def _get_distance_label(metric): cellinfo.to_csv(cellinfo_file, sep='\t', index=False) - +if __name__=='__main__': + main() From 4f5de809c7e376aabc3a17526598e5a06b405fc3 Mon Sep 17 00:00:00 2001 From: Hanna Mendes Levitin Date: Wed, 15 Jul 2020 19:12:30 +0000 Subject: [PATCH 6/7] approximate knn --- clusterdiffex/cluster.py | 78 ++++++++++++++++++++++++++++++++++++-- clusterdiffex/distance.py | 56 ++++++++++++++++++++++++++- clusterdiffex/visualize.py | 23 +++++++---- scripts/cluster_diffex.py | 62 ++++++++++++++++++++---------- 4 files changed, 188 insertions(+), 31 deletions(-) diff --git a/clusterdiffex/cluster.py b/clusterdiffex/cluster.py index 54d8950..9fa8355 100644 --- a/clusterdiffex/cluster.py +++ b/clusterdiffex/cluster.py @@ -1,7 +1,11 @@ -#!/usr/bin/python +#!/usr/bin/env python +import os import numpy as np from scipy.sparse import coo_matrix +from scipy.io import mmread, mmwrite +from clusterdiffex.distance import spearmanr +import umap import phenograph @@ -33,7 +37,75 @@ def get_knn(distance, k=20): return coo_matrix((np.hstack(values), indices),shape=distance.shape) -def run_phenograph(distance, k=20, outdir='', prefix='', **kwargs): +def get_approx_knn(X, k=20, metric=spearmanr, metric_kw={}, angular=True, + random_state=0): + """ + Parameters + ----------- + X: ndarray + cell x gene + k: int + + """ + from sklearn.utils import check_random_state + cols, vals, _ = umap.umap_.nearest_neighbors(X, k, metric, metric_kw, + angular, check_random_state(random_state)) + cols_flat = np.hstack(cols) + vals_flat = np.hstack(vals) + rows = np.ravel(np.column_stack( [np.arange(cols.shape[0])]*k )) + assert len(cols_flat) == len(vals_flat) + assert len(cols_flat) == len(rows) + return coo_matrix((vals_flat, (rows, cols_flat)), shape=(X.shape[0], + X.shape[0])) + + +def run_phenograph_approx_knn(X, k=20, outdir='', prefix='', **kwargs): + """ + Runs Phenograph on an expression- or PCA-based distance matrix. + + Parameters + ---------- + X: ndarray + cell x feature data matrix + k: int (default 20) + number of nearest neighbors to use + outdir: str (default '') + prefix: str (default '') + label: str (default '') + + Returns + ------- + communities + graph + Q : float + + """ + assert X.shape[0] > X.shape[1] + fileprefix = '{}/{}'.format(outdir, prefix) + knn_file = f'{fileprefix}.knn{k}_approx.mtx' + if os.path.exists(knn_file): + knn = mmread(knn_file) + else: + knn = get_approx_knn(X, k) #.tolil() + mmwrite(knn_file, knn) + + print(83, knn.shape) + communities, graph, Q = phenograph.cluster(knn, **kwargs) + + if outdir is not None and len(outdir)>0: + fileprefix = '{}/{}'.format(outdir, prefix) + clusterfile = fileprefix + '.pg.txt' + np.savetxt(clusterfile, communities, fmt='%i') + + logfile = fileprefix + '.pg.info.txt' + with open(logfile, 'w') as f: + f.write('k:{}\nQ:{}'.format(k, Q)) + + return communities, graph, Q + + +def run_phenograph(distance, k=20, outdir='', prefix='', approx_knn=False, + **kwargs): """ Runs Phenograph on an expression- or PCA-based distance matrix. @@ -54,7 +126,7 @@ def run_phenograph(distance, k=20, outdir='', prefix='', **kwargs): Q : float """ - knn = get_knn(distance, k) + knn = get_knn(distance, k).tolil() communities, graph, Q = phenograph.cluster(knn, **kwargs) if outdir is not None and len(outdir)>0: diff --git a/clusterdiffex/distance.py b/clusterdiffex/distance.py index dd2d106..e5ff8a2 100644 --- a/clusterdiffex/distance.py +++ b/clusterdiffex/distance.py @@ -6,8 +6,9 @@ """ import numpy as np -from scipy.stats.stats import spearmanr +from scipy import stats from scipy.spatial.distance import pdist, squareform +import numba try: from scipy.stats import energy_distance, wasserstein_distance @@ -44,7 +45,7 @@ def get_distance(matrix, outdir='', prefix='', metric='spearman'): print('Computing {} distance matrix...'.format(metric)) if metric=='spearman': - distance = 1 - spearmanr(matrix)[0] + distance = 1 - stats.spearmanr(matrix)[0] elif metric == 'pearson': distance = 1 - np.corrcoef(matrix.T) elif metric in ['jaccard', 'hamming']: @@ -352,3 +353,54 @@ def _rolling_window(a, window): shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) strides = a.strides + (a.strides[-1],) return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) + + +# taken from pynndescent +@numba.njit() +def rankdata(a, method="average"): + arr = np.ravel(np.asarray(a)) + if method == "ordinal": + sorter = arr.argsort(kind="mergesort") + else: + sorter = arr.argsort(kind="quicksort") + + inv = np.empty(sorter.size, dtype=np.intp) + inv[sorter] = np.arange(sorter.size) + + if method == "ordinal": + return (inv + 1).astype(np.float64) + + arr = arr[sorter] + obs = np.ones(arr.size, np.bool_) + obs[1:] = arr[1:] != arr[:-1] + dense = obs.cumsum()[inv] + + if method == "dense": + return dense.astype(np.float64) + + # cumulative counts of each unique value + nonzero = np.nonzero(obs)[0] + count = np.concatenate((nonzero, np.array([len(obs)], nonzero.dtype))) + + if method == "max": + return count[dense].astype(np.float64) + + if method == "min": + return (count[dense - 1] + 1).astype(np.float64) + + # average method + return 0.5 * (count[dense] + count[dense - 1] + 1) + + +# from pynndescent, except returns a distance +@numba.njit(fastmath=True) +def spearmanr(x, y): + a = np.column_stack((x, y)) + + n_vars = a.shape[1] + + for i in range(n_vars): + a[:, i] = rankdata(a[:, i]) + rs = np.corrcoef(a, rowvar=0) + + return 1 - rs[1, 0] diff --git a/clusterdiffex/visualize.py b/clusterdiffex/visualize.py index a214170..46de8f8 100644 --- a/clusterdiffex/visualize.py +++ b/clusterdiffex/visualize.py @@ -52,13 +52,13 @@ def get_cluster_cmap(N, mpl): return colors[:N] -def run_umap(distance, outdir='', prefix=''): +def run_umap(data, outdir='', prefix='', metric='precomputed'): """ Compute and plot a 2D umap projection from a distance matrix. Parameters ---------- - distance : ndarray - cell x cell distance matrix + data : ndarray + cell x cell distance matrix or cell x feature matrix outdir : str, optional (Default: '') output directory for saving coordinates and plot prefix : str, optional (Default: '') @@ -75,8 +75,8 @@ def run_umap(distance, outdir='', prefix=''): print('Running umap...') with warnings.catch_warnings(): warnings.simplefilter("ignore") - umap_model = UMAP(metric='precomputed', ) - embedding = umap_model.fit_transform(distance) + umap_model = UMAP(metric=metric) + embedding = umap_model.fit_transform(data) if outdir is not None and len(outdir): _, plt, _ = _import_plotlibs() @@ -206,7 +206,8 @@ def run_tsne(distance, outdir='', prefix=''): return embedding -def plot_clusters(clusters, coordinates, outdir, prefix, label_name='UMAP'): +def plot_clusters(clusters, coordinates, outdir, prefix, label_name='UMAP', + outfile=None): """ Plot 2D embedding colored by cluster and save to file Note : currently only works for saving to file, not in notebooks @@ -219,13 +220,21 @@ def plot_clusters(clusters, coordinates, outdir, prefix, label_name='UMAP'): (ncells, 2) array of coordinates outdir : str the output directory + prefix : str + prefix for file + label_name: str, optional (Default: UMAP) + Label name for axes + outfile : str, optional (Default: None) + If not none, ignore outdir and prefix and use for output + instead """ assert len(clusters) == len(coordinates) # get appropriate libraries in a context-specific manner mpl, plt, _ = _import_plotlibs() from matplotlib.backends.backend_pdf import PdfPages - outfile = '{}/{}.pg.pdf'.format(outdir, prefix) + if outfile is None: + outfile = '{}/{}.pg.pdf'.format(outdir, prefix) N = len(np.unique(clusters)) colors = get_cluster_cmap(N, mpl) diff --git a/scripts/cluster_diffex.py b/scripts/cluster_diffex.py index 01b9a7a..daf9f10 100644 --- a/scripts/cluster_diffex.py +++ b/scripts/cluster_diffex.py @@ -1,3 +1,5 @@ +#/usr/bin/env python + import argparse import json import os @@ -9,10 +11,11 @@ from clusterdiffex.util import load_txt, load_loom from clusterdiffex.distance import select_markers, get_distance, \ select_markers_static_bins_unscaled -from clusterdiffex.cluster import run_phenograph +from clusterdiffex.cluster import run_phenograph, run_phenograph_approx_knn from clusterdiffex.visualize import run_umap, run_dca, run_tsne, plot_clusters from clusterdiffex.diffex import binomial_test_cluster_vs_rest, \ write_diffex_by_cluster, diffex_heatmap +from clusterdiffex import distance as dist def _parser(): @@ -81,6 +84,8 @@ def _parser(): parser.add_argument('-mcs', '--min-cluster-size', default=10, type=int, help='Minimum cluster size for phenograph.') + parser.add_argument('-aknn', '--approx-knn', action='store_true') + return parser @@ -201,31 +206,50 @@ def main(): # get distance metric_label = _get_distance_label(args.distance) running_prefix.append(metric_label) - distance = get_distance(redux, metric=args.distance, - outdir=args.outdir if args.save_distance else '', - prefix='.'.join(running_prefix)) # cluster - communities, graph, Q = run_phenograph(distance, k=args.k, - prefix='.'.join(running_prefix), outdir=args.outdir, - min_cluster_size=args.min_cluster_size) + if args.approx_knn: + if args.distance != 'spearman': + raise RuntimeError('approx-knn only implemented for spearman') + communities, graph, Q = run_phenograph_approx_knn(redux.T.values, + k=args.k, prefix='.'.join(running_prefix), outdir=args.outdir, + min_cluster_size=args.min_cluster_size) + + else: + distance = get_distance(redux, metric=args.distance, + outdir=args.outdir if args.save_distance else '', + prefix='.'.join(running_prefix)) + communities, graph, Q = run_phenograph(distance, k=args.k, + prefix='.'.join(running_prefix), outdir=args.outdir, + min_cluster_size=args.min_cluster_size) + nclusters = len(np.unique(communities[communities > -1])) print('{} clusters identified by Phenograph'.format(nclusters)) # visualize - umap = run_umap(np.nan_to_num(distance).astype(np.float32), - prefix='.'.join(running_prefix), outdir=args.outdir) - if args.dmap: - try: - dca = run_dca(distance, prefix='.'.join(running_prefix), - outdir=args.outdir) - except RuntimeError as e: - print('DCA error: {}'.format(e)) - dca = None + if args.approx_knn: + umap = run_umap(redux.T, + prefix='.'.join(running_prefix), outdir=args.outdir, + metric=dist.spearmanr) + if args.dmap: + raise RuntimeError('dmap not implemented for approx-knn') + if args.tsne: + raise RuntimeError('tsne not implemented for approx-knn') - if args.tsne: - tsne = run_tsne(distance, prefix='.'.join(running_prefix), - outdir=args.outdir) + else: + umap = run_umap(np.nan_to_num(distance).astype(np.float32), + prefix='.'.join(running_prefix), outdir=args.outdir) + if args.dmap: + try: + dca = run_dca(distance, prefix='.'.join(running_prefix), + outdir=args.outdir) + except RuntimeError as e: + print('DCA error: {}'.format(e)) + dca = None + + if args.tsne: + tsne = run_tsne(distance, prefix='.'.join(running_prefix), + outdir=args.outdir) # visualize communities plot_clusters(communities, umap, outdir=args.outdir, From 733f70e2f19a99579d2e9a0cc655caaa83e3c22f Mon Sep 17 00:00:00 2001 From: Hanna Mendes Levitin Date: Wed, 15 Jul 2020 15:18:56 -0400 Subject: [PATCH 7/7] handle case where n_neighbors K > n_cells --- clusterdiffex/cluster.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clusterdiffex/cluster.py b/clusterdiffex/cluster.py index 54d8950..ec97050 100644 --- a/clusterdiffex/cluster.py +++ b/clusterdiffex/cluster.py @@ -22,6 +22,8 @@ def get_knn(distance, k=20): values are the distance between the cell for the row and its k nearest neighbors """ + if k > distance.shape[0]: + raise ValueError(f'k {k} < number of cells {distance.shape[0]}') topk = np.argsort(distance)[:, 1:k+1] values, row, col, kstart = [], [], [], -(k+1) for cell in np.arange(topk.shape[0]):