Skip to content

Commit

Permalink
Merge pull request #5 from simslab/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
hannaml committed Nov 13, 2020
2 parents 875782d + 137aa42 commit 63bdcc6
Show file tree
Hide file tree
Showing 6 changed files with 220 additions and 43 deletions.
80 changes: 77 additions & 3 deletions clusterdiffex/cluster.py
@@ -1,7 +1,11 @@
#!/usr/bin/python
#!/usr/bin/env python

import os
import numpy as np
from scipy.sparse import coo_matrix
from scipy.io import mmread, mmwrite
from clusterdiffex.distance import spearmanr
import umap

import phenograph

Expand All @@ -22,6 +26,8 @@ def get_knn(distance, k=20):
values are the distance between the cell for the row and its k nearest
neighbors
"""
if k > distance.shape[0]:
raise ValueError(f'k {k} < number of cells {distance.shape[0]}')
topk = np.argsort(distance)[:, 1:k+1]
values, row, col, kstart = [], [], [], -(k+1)
for cell in np.arange(topk.shape[0]):
Expand All @@ -33,7 +39,75 @@ def get_knn(distance, k=20):
return coo_matrix((np.hstack(values), indices),shape=distance.shape)


def run_phenograph(distance, k=20, outdir='', prefix='', **kwargs):
def get_approx_knn(X, k=20, metric=spearmanr, metric_kw={}, angular=True,
random_state=0):
"""
Parameters
-----------
X: ndarray
cell x gene
k: int
"""
from sklearn.utils import check_random_state
cols, vals, _ = umap.umap_.nearest_neighbors(X, k, metric, metric_kw,
angular, check_random_state(random_state))
cols_flat = np.hstack(cols)
vals_flat = np.hstack(vals)
rows = np.ravel(np.column_stack( [np.arange(cols.shape[0])]*k ))
assert len(cols_flat) == len(vals_flat)
assert len(cols_flat) == len(rows)
return coo_matrix((vals_flat, (rows, cols_flat)), shape=(X.shape[0],
X.shape[0]))


def run_phenograph_approx_knn(X, k=20, outdir='', prefix='', **kwargs):
"""
Runs Phenograph on an expression- or PCA-based distance matrix.
Parameters
----------
X: ndarray
cell x feature data matrix
k: int (default 20)
number of nearest neighbors to use
outdir: str (default '')
prefix: str (default '')
label: str (default '')
Returns
-------
communities
graph
Q : float
"""
assert X.shape[0] > X.shape[1]
fileprefix = '{}/{}'.format(outdir, prefix)
knn_file = f'{fileprefix}.knn{k}_approx.mtx'
if os.path.exists(knn_file):
knn = mmread(knn_file)
else:
knn = get_approx_knn(X, k) #.tolil()
mmwrite(knn_file, knn)

print(83, knn.shape)
communities, graph, Q = phenograph.cluster(knn, **kwargs)

if outdir is not None and len(outdir)>0:
fileprefix = '{}/{}'.format(outdir, prefix)
clusterfile = fileprefix + '.pg.txt'
np.savetxt(clusterfile, communities, fmt='%i')

logfile = fileprefix + '.pg.info.txt'
with open(logfile, 'w') as f:
f.write('k:{}\nQ:{}'.format(k, Q))

return communities, graph, Q


def run_phenograph(distance, k=20, outdir='', prefix='', approx_knn=False,
**kwargs):
"""
Runs Phenograph on an expression- or PCA-based distance matrix.
Expand All @@ -54,7 +128,7 @@ def run_phenograph(distance, k=20, outdir='', prefix='', **kwargs):
Q : float
"""
knn = get_knn(distance, k)
knn = get_knn(distance, k).tolil()
communities, graph, Q = phenograph.cluster(knn, **kwargs)

if outdir is not None and len(outdir)>0:
Expand Down
12 changes: 6 additions & 6 deletions clusterdiffex/diffex.py
Expand Up @@ -75,8 +75,8 @@ def sort(self, index):
index : pandas index or series
"""
assert(index.sort_values().equals(self.n_cells_exp.index.sort_values()))
self.n_cells_exp = self.n_cells_exp.ix[index]
self.n_mol = self.n_mol.ix[index]
self.n_cells_exp = self.n_cells_exp.loc[index]
self.n_mol = self.n_mol.loc[index]
if self.med_mol is not None:
self.med_mol = self.med_mol.loc[index]

Expand Down Expand Up @@ -319,13 +319,13 @@ def binomial_test_cluster_vs_rest(expression, genes, clusters,
min_effectsize=min_effectsize, FDR=FDR,
min_proportion=min_proportion)
up_c['cluster'] = cluster.id
up_c['gene'] = genes.ix[up_c.index].gene
up_c['ens'] = genes.ix[up_c.index].ens
up_c['gene'] = genes.loc[up_c.index].gene
up_c['ens'] = genes.loc[up_c.index].ens
up.append(up_c)

down_c['cluster'] = cluster.id
down_c['gene'] = genes.ix[down_c.index].gene
down_c['ens'] = genes.ix[down_c.index].ens
down_c['gene'] = genes.loc[down_c.index].gene
down_c['ens'] = genes.loc[down_c.index].ens
down.append(down_c)

if garbage_collect:
Expand Down
56 changes: 54 additions & 2 deletions clusterdiffex/distance.py
Expand Up @@ -6,8 +6,9 @@
"""

import numpy as np
from scipy.stats.stats import spearmanr
from scipy import stats
from scipy.spatial.distance import pdist, squareform
import numba

try:
from scipy.stats import energy_distance, wasserstein_distance
Expand Down Expand Up @@ -44,7 +45,7 @@ def get_distance(matrix, outdir='', prefix='', metric='spearman'):
print('Computing {} distance matrix...'.format(metric))

if metric=='spearman':
distance = 1 - spearmanr(matrix)[0]
distance = 1 - stats.spearmanr(matrix)[0]
elif metric == 'pearson':
distance = 1 - np.corrcoef(matrix.T)
elif metric in ['jaccard', 'hamming']:
Expand Down Expand Up @@ -352,3 +353,54 @@ def _rolling_window(a, window):
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)


# taken from pynndescent
@numba.njit()
def rankdata(a, method="average"):
arr = np.ravel(np.asarray(a))
if method == "ordinal":
sorter = arr.argsort(kind="mergesort")
else:
sorter = arr.argsort(kind="quicksort")

inv = np.empty(sorter.size, dtype=np.intp)
inv[sorter] = np.arange(sorter.size)

if method == "ordinal":
return (inv + 1).astype(np.float64)

arr = arr[sorter]
obs = np.ones(arr.size, np.bool_)
obs[1:] = arr[1:] != arr[:-1]
dense = obs.cumsum()[inv]

if method == "dense":
return dense.astype(np.float64)

# cumulative counts of each unique value
nonzero = np.nonzero(obs)[0]
count = np.concatenate((nonzero, np.array([len(obs)], nonzero.dtype)))

if method == "max":
return count[dense].astype(np.float64)

if method == "min":
return (count[dense - 1] + 1).astype(np.float64)

# average method
return 0.5 * (count[dense] + count[dense - 1] + 1)


# from pynndescent, except returns a distance
@numba.njit(fastmath=True)
def spearmanr(x, y):
a = np.column_stack((x, y))

n_vars = a.shape[1]

for i in range(n_vars):
a[:, i] = rankdata(a[:, i])
rs = np.corrcoef(a, rowvar=0)

return 1 - rs[1, 0]
23 changes: 16 additions & 7 deletions clusterdiffex/visualize.py
Expand Up @@ -52,13 +52,13 @@ def get_cluster_cmap(N, mpl):
return colors[:N]


def run_umap(distance, outdir='', prefix=''):
def run_umap(data, outdir='', prefix='', metric='precomputed'):
""" Compute and plot a 2D umap projection from a distance matrix.
Parameters
----------
distance : ndarray
cell x cell distance matrix
data : ndarray
cell x cell distance matrix or cell x feature matrix
outdir : str, optional (Default: '')
output directory for saving coordinates and plot
prefix : str, optional (Default: '')
Expand All @@ -75,8 +75,8 @@ def run_umap(distance, outdir='', prefix=''):
print('Running umap...')
with warnings.catch_warnings():
warnings.simplefilter("ignore")
umap_model = UMAP(metric='precomputed', )
embedding = umap_model.fit_transform(distance)
umap_model = UMAP(metric=metric)
embedding = umap_model.fit_transform(data)

if outdir is not None and len(outdir):
_, plt, _ = _import_plotlibs()
Expand Down Expand Up @@ -206,7 +206,8 @@ def run_tsne(distance, outdir='', prefix=''):
return embedding


def plot_clusters(clusters, coordinates, outdir, prefix, label_name='UMAP'):
def plot_clusters(clusters, coordinates, outdir, prefix, label_name='UMAP',
outfile=None):
""" Plot 2D embedding colored by cluster and save to file
Note : currently only works for saving to file, not in notebooks
Expand All @@ -219,13 +220,21 @@ def plot_clusters(clusters, coordinates, outdir, prefix, label_name='UMAP'):
(ncells, 2) array of coordinates
outdir : str
the output directory
prefix : str
prefix for file
label_name: str, optional (Default: UMAP)
Label name for axes
outfile : str, optional (Default: None)
If not none, ignore outdir and prefix and use for output
instead
"""
assert len(clusters) == len(coordinates)
# get appropriate libraries in a context-specific manner
mpl, plt, _ = _import_plotlibs()
from matplotlib.backends.backend_pdf import PdfPages

outfile = '{}/{}.pg.pdf'.format(outdir, prefix)
if outfile is None:
outfile = '{}/{}.pg.pdf'.format(outdir, prefix)

N = len(np.unique(clusters))
colors = get_cluster_cmap(N, mpl)
Expand Down

0 comments on commit 63bdcc6

Please sign in to comment.