Merge pull request #5 from simslab/dev

Dev
simslab · Nov 13, 2020 · 63bdcc6 · 63bdcc6
2 parents 875782d + 137aa42
commit 63bdcc6
Show file tree

Hide file tree

Showing 6 changed files with 220 additions and 43 deletions.
diff --git a/clusterdiffex/cluster.py b/clusterdiffex/cluster.py
@@ -1,7 +1,11 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 
+import os
 import numpy as np
 from scipy.sparse import coo_matrix
+from scipy.io import mmread, mmwrite
+from clusterdiffex.distance import spearmanr
+import umap
 
 import phenograph
 
@@ -22,6 +26,8 @@ def get_knn(distance, k=20):
         values are the distance between the cell for the row and its k nearest
         neighbors
     """
+    if k > distance.shape[0]:
+        raise ValueError(f'k {k} < number of cells {distance.shape[0]}')
     topk = np.argsort(distance)[:, 1:k+1]
     values, row, col, kstart = [], [], [], -(k+1)
     for cell in np.arange(topk.shape[0]):
@@ -33,7 +39,75 @@ def get_knn(distance, k=20):
     return coo_matrix((np.hstack(values), indices),shape=distance.shape)
 
 
-def run_phenograph(distance, k=20, outdir='', prefix='', **kwargs):
+def get_approx_knn(X, k=20, metric=spearmanr, metric_kw={}, angular=True,
+        random_state=0):
+    """
+    Parameters
+    -----------
+    X: ndarray
+        cell x gene
+    k: int
+
+    """
+    from sklearn.utils import check_random_state
+    cols, vals, _ = umap.umap_.nearest_neighbors(X, k, metric, metric_kw,
+            angular, check_random_state(random_state))
+    cols_flat = np.hstack(cols)
+    vals_flat = np.hstack(vals)
+    rows = np.ravel(np.column_stack( [np.arange(cols.shape[0])]*k ))
+    assert len(cols_flat) == len(vals_flat)
+    assert len(cols_flat) == len(rows)
+    return coo_matrix((vals_flat, (rows, cols_flat)), shape=(X.shape[0],
+        X.shape[0]))
+
+
+def run_phenograph_approx_knn(X, k=20, outdir='', prefix='', **kwargs):
+    """
+    Runs Phenograph on an expression- or PCA-based distance matrix.
+
+    Parameters
+    ----------
+    X: ndarray
+        cell x feature data matrix
+    k: int (default 20)
+        number of nearest neighbors to use
+    outdir: str (default '')
+    prefix: str (default '')
+    label: str (default '')
+
+    Returns
+    -------
+    communities
+    graph
+    Q : float
+
+    """
+    assert X.shape[0] > X.shape[1]
+    fileprefix = '{}/{}'.format(outdir, prefix)
+    knn_file = f'{fileprefix}.knn{k}_approx.mtx'
+    if os.path.exists(knn_file):
+        knn = mmread(knn_file)
+    else:
+        knn = get_approx_knn(X, k) #.tolil()
+        mmwrite(knn_file, knn)
+
+    print(83, knn.shape)
+    communities, graph, Q = phenograph.cluster(knn, **kwargs)
+
+    if outdir is not None and len(outdir)>0:
+        fileprefix = '{}/{}'.format(outdir, prefix)
+        clusterfile = fileprefix + '.pg.txt'
+        np.savetxt(clusterfile, communities, fmt='%i')
+
+        logfile = fileprefix + '.pg.info.txt'
+        with open(logfile, 'w') as f:
+            f.write('k:{}\nQ:{}'.format(k, Q))
+
+    return communities, graph, Q
+
+
+def run_phenograph(distance, k=20, outdir='', prefix='', approx_knn=False,
+        **kwargs):
     """
     Runs Phenograph on an expression- or PCA-based distance matrix.
 
@@ -54,7 +128,7 @@ def run_phenograph(distance, k=20, outdir='', prefix='', **kwargs):
     Q : float
 
     """
-    knn = get_knn(distance, k)
+    knn = get_knn(distance, k).tolil()
     communities, graph, Q = phenograph.cluster(knn, **kwargs)
 
     if outdir is not None and len(outdir)>0:

diff --git a/clusterdiffex/diffex.py b/clusterdiffex/diffex.py
@@ -75,8 +75,8 @@ def sort(self, index):
         index : pandas index or series
         """
         assert(index.sort_values().equals(self.n_cells_exp.index.sort_values()))
-        self.n_cells_exp = self.n_cells_exp.ix[index]
-        self.n_mol = self.n_mol.ix[index]
+        self.n_cells_exp = self.n_cells_exp.loc[index]
+        self.n_mol = self.n_mol.loc[index]
         if self.med_mol is not None:
             self.med_mol = self.med_mol.loc[index]
 
@@ -319,13 +319,13 @@ def binomial_test_cluster_vs_rest(expression, genes, clusters,
                 min_effectsize=min_effectsize, FDR=FDR,
                 min_proportion=min_proportion)
         up_c['cluster'] = cluster.id
-        up_c['gene'] = genes.ix[up_c.index].gene
-        up_c['ens'] = genes.ix[up_c.index].ens
+        up_c['gene'] = genes.loc[up_c.index].gene
+        up_c['ens'] = genes.loc[up_c.index].ens
         up.append(up_c)
 
         down_c['cluster'] = cluster.id
-        down_c['gene'] = genes.ix[down_c.index].gene
-        down_c['ens'] = genes.ix[down_c.index].ens
+        down_c['gene'] = genes.loc[down_c.index].gene
+        down_c['ens'] = genes.loc[down_c.index].ens
         down.append(down_c)
 
         if garbage_collect:

diff --git a/clusterdiffex/distance.py b/clusterdiffex/distance.py
@@ -6,8 +6,9 @@
 """
 
 import numpy as np
-from scipy.stats.stats import spearmanr
+from scipy import stats
 from scipy.spatial.distance import pdist, squareform
+import numba
 
 try:
     from scipy.stats import energy_distance, wasserstein_distance
@@ -44,7 +45,7 @@ def get_distance(matrix, outdir='', prefix='', metric='spearman'):
     print('Computing {} distance matrix...'.format(metric))
 
     if metric=='spearman':
-        distance = 1 - spearmanr(matrix)[0]
+        distance = 1 - stats.spearmanr(matrix)[0]
     elif metric == 'pearson':
         distance = 1 - np.corrcoef(matrix.T)
     elif metric in ['jaccard', 'hamming']:
@@ -352,3 +353,54 @@ def _rolling_window(a, window):
     shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
     strides = a.strides + (a.strides[-1],)
     return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
+
+
+# taken from pynndescent
+@numba.njit()
+def rankdata(a, method="average"):
+    arr = np.ravel(np.asarray(a))
+    if method == "ordinal":
+        sorter = arr.argsort(kind="mergesort")
+    else:
+        sorter = arr.argsort(kind="quicksort")
+
+    inv = np.empty(sorter.size, dtype=np.intp)
+    inv[sorter] = np.arange(sorter.size)
+
+    if method == "ordinal":
+        return (inv + 1).astype(np.float64)
+
+    arr = arr[sorter]
+    obs = np.ones(arr.size, np.bool_)
+    obs[1:] = arr[1:] != arr[:-1]
+    dense = obs.cumsum()[inv]
+
+    if method == "dense":
+        return dense.astype(np.float64)
+
+    # cumulative counts of each unique value
+    nonzero = np.nonzero(obs)[0]
+    count = np.concatenate((nonzero, np.array([len(obs)], nonzero.dtype)))
+
+    if method == "max":
+        return count[dense].astype(np.float64)
+
+    if method == "min":
+        return (count[dense - 1] + 1).astype(np.float64)
+
+    # average method
+    return 0.5 * (count[dense] + count[dense - 1] + 1)
+
+
+# from pynndescent, except returns a distance
+@numba.njit(fastmath=True)
+def spearmanr(x, y):
+    a = np.column_stack((x, y))
+
+    n_vars = a.shape[1]
+
+    for i in range(n_vars):
+        a[:, i] = rankdata(a[:, i])
+    rs = np.corrcoef(a, rowvar=0)
+
+    return 1 - rs[1, 0]
diff --git a/clusterdiffex/visualize.py b/clusterdiffex/visualize.py
@@ -52,13 +52,13 @@ def get_cluster_cmap(N, mpl):
     return colors[:N]
 
 
-def run_umap(distance, outdir='', prefix=''):
+def run_umap(data, outdir='', prefix='', metric='precomputed'):
     """ Compute and plot a 2D umap projection from a distance matrix.
 
     Parameters
     ----------
-    distance : ndarray
-        cell x cell distance matrix
+    data : ndarray
+        cell x cell distance matrix or cell x feature matrix
     outdir : str, optional (Default: '')
         output directory for saving coordinates and plot
     prefix : str, optional (Default: '')
@@ -75,8 +75,8 @@ def run_umap(distance, outdir='', prefix=''):
     print('Running umap...')
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
-        umap_model = UMAP(metric='precomputed', )
-        embedding = umap_model.fit_transform(distance)
+        umap_model = UMAP(metric=metric)
+        embedding = umap_model.fit_transform(data)
 
     if outdir is not None and len(outdir):
         _, plt, _ = _import_plotlibs()
@@ -206,7 +206,8 @@ def run_tsne(distance, outdir='', prefix=''):
     return embedding
 
 
-def plot_clusters(clusters, coordinates, outdir, prefix, label_name='UMAP'):
+def plot_clusters(clusters, coordinates, outdir, prefix, label_name='UMAP',
+        outfile=None):
     """ Plot 2D embedding colored by cluster and save to file
 
     Note : currently only works for saving to file, not in notebooks
@@ -219,13 +220,21 @@ def plot_clusters(clusters, coordinates, outdir, prefix, label_name='UMAP'):
         (ncells, 2) array of coordinates
     outdir  : str
         the output directory
+    prefix : str
+        prefix for file
+    label_name: str, optional (Default: UMAP)
+        Label name for axes
+    outfile : str, optional (Default: None)
+        If not none, ignore outdir  and prefix and use  for output
+        instead
     """
     assert len(clusters) == len(coordinates)
     # get appropriate libraries in a context-specific manner
     mpl, plt, _ = _import_plotlibs()
     from matplotlib.backends.backend_pdf import PdfPages
 
-    outfile = '{}/{}.pg.pdf'.format(outdir, prefix)
+    if outfile is None:
+        outfile = '{}/{}.pg.pdf'.format(outdir, prefix)
 
     N = len(np.unique(clusters))
     colors = get_cluster_cmap(N, mpl)