Skip to content

Commit

Permalink
Merge pull request #147 from zktuong/devel
Browse files Browse the repository at this point in the history
v0.2.1
  • Loading branch information
zktuong committed May 19, 2022
2 parents 980e64d + 9b42809 commit 85d4fa0
Show file tree
Hide file tree
Showing 32 changed files with 1,965 additions and 1,370 deletions.
20 changes: 12 additions & 8 deletions .github/workflows/tests.yml
Expand Up @@ -22,12 +22,14 @@ jobs:
max-parallel: 6
matrix:
config:
- { python-version: 3.7, os: ubuntu-latest }
- { python-version: 3.8, os: ubuntu-latest }
- { python-version: 3.9, os: ubuntu-latest }
- { python-version: 3.7, os: macos-latest }
- { python-version: 3.8, os: macos-latest }
- { python-version: 3.9, os: macos-latest } # segmentation fault due to missing py3.9 macOSX wheel https://github.com/rpy2/rpy2/issues/846
- { python-version: 3.7, os: ubuntu-latest}
- { python-version: 3.8, os: ubuntu-latest}
- { python-version: 3.9, os: ubuntu-latest}
- { python-version: "3.10", os: ubuntu-latest}
- { python-version: 3.7, os: macos-latest}
- { python-version: 3.8, os: macos-latest}
- { python-version: 3.9, os: macos-latest}
- { python-version: "3.10", os: macos-latest}
# disabled until it's fixed.
runs-on: ${{ matrix.config.os }}
env:
Expand All @@ -45,7 +47,7 @@ jobs:
if: matrix.config.os == 'ubuntu-latest'
run: |
sudo apt-get install libcurl4-openssl-dev
sudo apt-get update -y && sudo apt-get install -y zlib1g-dev libglpk-dev libgmp3-dev libxml2-dev libicu-dev libhdf5-serial-dev
sudo apt-get update -y && sudo apt-get install -y zlib1g-dev libglpk-dev libgmp3-dev libxml2-dev libicu-dev libhdf5-serial-dev libcurl4-gnutls-dev
- name: Install macOS system dependencies
if: matrix.config.os == 'macos-latest'
Expand All @@ -64,7 +66,7 @@ jobs:
hashFiles('environment.yml') }}

- name: Setup Miniconda
uses: conda-incubator/setup-miniconda@v2
uses: conda-incubator/setup-miniconda@v2.1.1
with:
auto-activate-base: true
auto-update-conda : true
Expand Down Expand Up @@ -141,6 +143,7 @@ jobs:
install.packages(c('RCurl','XML'))
remotes::install_cran("BiocManager")
BiocManager::install(version = ${{ steps.R.outputs.biocversion}}, ask = FALSE)
BiocManager::install(c('GenomeInfoDb', 'Rsamtools'))
BiocManager::install(c('Biostrings', 'GenomicAlignments', 'IRanges'))
install.packages(c('shazam', 'alakazam', 'tigger', 'airr', 'optparse'))
shell: Rscript {0}
Expand All @@ -152,6 +155,7 @@ jobs:
install.packages(c('RCurl','XML'))
remotes::install_cran("BiocManager")
BiocManager::install(version = ${{ steps.R.outputs.biocversion}}, ask = FALSE)
BiocManager::install(c('GenomeInfoDb', 'Rsamtools'))
BiocManager::install(c('Biostrings', 'GenomicAlignments', 'IRanges'))
install.packages('matrixStats')
install.packages(c('shazam', 'alakazam', 'tigger', 'airr', 'optparse'))
Expand Down
2 changes: 0 additions & 2 deletions README.md
Expand Up @@ -136,7 +136,6 @@ python>=3.7,<=3.8 (conda-forge)
numpy>=1.18.4 (conda-forge)
pandas>=1.0.3 (conda-forge)
distance>=0.1.3 (conda-forge)
joblib>=0.14.1 (conda-forge)
jupyter (conda-forge) # if running via a notebook
scikit-learn>=0.23.0 (conda-forge)
numba>=0.48.0 (conda-forge)
Expand All @@ -153,7 +152,6 @@ igblast>=1.15.0 (bioconda)
anndata>=0.7.1
scanpy>=1.4.6
scrublet>=0.2.1
scikit-bio>=0.5.6
changeo>=1.0.0
presto>=0.6.0
polyleven>=0.5
Expand Down
4 changes: 2 additions & 2 deletions dandelion/logging/_logging.py
Expand Up @@ -2,11 +2,11 @@
# @Author: Kelvin
# @Date: 2021-02-06 13:18:58
# @Last Modified by: Kelvin
# @Last Modified time: 2021-02-20 09:07:55
# @Last Modified time: 2022-05-18 16:18:14
from typing import Union, Sequence, Tuple

modules = ['dandelion', 'pandas', 'numpy', 'matplotlib',
'networkx', 'scipy', 'skbio', 'distance', 'polyleven']
'networkx', 'scipy', 'distance', 'polyleven']


# borrowed from scanpy's logging module
Expand Down
165 changes: 120 additions & 45 deletions dandelion/plotting/_plotting.py
Expand Up @@ -2,7 +2,7 @@
# @Author: Kelvin
# @Date: 2020-05-18 00:15:00
# @Last Modified by: Kelvin
# @Last Modified time: 2021-07-31 19:08:07
# @Last Modified time: 2022-05-18 14:09:49

import seaborn as sns
import pandas as pd
Expand Down Expand Up @@ -365,7 +365,7 @@ def stackedbarplot(self: Union[AnnData, Dandelion],

dat_ = pd.DataFrame(data_.groupby(color)[groupby].value_counts(
normalize=normalize).unstack(fill_value=0).stack(),
columns=['value'])
columns=['value'])
dat_.reset_index(drop=False, inplace=True)
dat_order = pd.DataFrame(data[color].value_counts(normalize=normalize))
dat_ = dat_.pivot(index=color, columns=groupby, values='value')
Expand Down Expand Up @@ -551,7 +551,7 @@ def spectratype(self: Union[AnnData, Dandelion],
data[groupby] = [str(l) for l in data[groupby]]
dat_ = pd.DataFrame(data.groupby(color)[groupby].value_counts(
normalize=False).unstack(fill_value=0).stack(),
columns=['value'])
columns=['value'])
dat_.reset_index(drop=False, inplace=True)
dat_[color] = pd.to_numeric(dat_[color], errors='coerce')
dat_.sort_values(by=color)
Expand Down Expand Up @@ -654,6 +654,7 @@ def clone_overlap(self: Union[AnnData, Dandelion],
groupby: str,
colorby: str,
min_clone_size: Optional[int] = None,
weighted_overlap: bool = False,
clone_key: Optional[str] = None,
color_mapping: Optional[Union[Sequence, Dict]] = None,
node_labels: bool = True,
Expand All @@ -666,10 +667,14 @@ def clone_overlap(self: Union[AnnData, Dandelion],
float]] = (8, 8),
return_graph: bool = False,
save: Optional[str] = None,
show_plot: bool = True,
**kwargs):
"""
A plot function to visualise clonal overlap as a circos-style plot. Requires nxviz.
Written with nxviz < 0.7.3. Will need to revisit for newer nxviz versions, or change how it's called?
TODO: work out how to modify edge thickness with both old and new versions.
Parameters
----------
self : Dandelion, AnnData
Expand All @@ -680,10 +685,15 @@ def clone_overlap(self: Union[AnnData, Dandelion],
column name in obs/metadata for grouping and color of nodes in circos plot.
min_clone_size : int, Optional
minimum size of clone for plotting connections. Defaults to 2 if left as None.
weighted_overlap : bool
if True, instead of collapsing the overlap to binary, edge thickness will reflect the number of
cells found in the overlap. In the future, there will be the option to use something like a Jaccard
index instead.
clone_key : str, Optional
column name for clones. None defaults to 'clone_id'.
color_mapping : Dict, Sequence, Optional
custom color mapping provided as a sequence (correpsonding to order of categories or alpha-numeric order if dtype is not category), or dictionary containing custom {category:color} mapping.
custom color mapping provided as a sequence (corresponding to order of categories or
alpha-numeric order if dtype is not category), or dictionary containing custom {category:color} mapping.
node_labels : bool, Optional
whether to use node objects as labels or not
node_label_layout : bool, Optional
Expand All @@ -696,6 +706,10 @@ def clone_overlap(self: Union[AnnData, Dandelion],
figure size. Default is (8, 8).
return_graph : bool
whether or not to return the graph for fine tuning. Default is False.
save : str, Optional
file path for saving plot. The plot is not saved if left as None.
show_plot : bool
whether or not to show the plot.
**kwargs
passed to `matplotlib.pyplot.savefig`.
Expand Down Expand Up @@ -740,15 +754,16 @@ def clone_overlap(self: Union[AnnData, Dandelion],
dictg_ = dict(data[groupby])
datc_[groupby] = [dictg_[l] for l in datc_['cell_id']]

overlap = pd.crosstab(data[clone_], data[groupby])
overlap = pd.crosstab(datc_[clone_], datc_[groupby])

if min_size == 0:
raise ValueError('min_size must be greater than 0.')
elif min_size > 2:
overlap[overlap < min_size] = 0
overlap[overlap >= min_size] = 1
elif min_size == 2:
overlap[overlap >= min_size] = 1
if not weighted_overlap:
if min_size > 2:
overlap[overlap < min_size] = 0
overlap[overlap >= min_size] = 1
elif min_size == 2:
overlap[overlap >= min_size] = 1

overlap.index.name = None
overlap.columns.name = None
Expand All @@ -767,30 +782,58 @@ def clone_overlap(self: Union[AnnData, Dandelion],
dictg_ = dict(data[groupby])
datc_[groupby] = [dictg_[l] for l in datc_['cell_id']]

overlap = pd.crosstab(data[clone_], data[groupby])

overlap = pd.crosstab(datc_[clone_], datc_[groupby])
if min_size == 0:
raise ValueError('min_size must be greater than 0.')
elif min_size > 2:
overlap[overlap < min_size] = 0
overlap[overlap >= min_size] = 1
elif min_size == 2:
overlap[overlap >= min_size] = 1
if not weighted_overlap:
if min_size > 2:
overlap[overlap < min_size] = 0
overlap[overlap >= min_size] = 1
elif min_size == 2:
overlap[overlap >= min_size] = 1

overlap.index.name = None
overlap.columns.name = None

edges = {}
for x in overlap.index:
if overlap.loc[x].sum() > 1:
edges[x] = [
y + ({
str(clone_): x
}, ) for y in list(
combinations(
[i for i in overlap.loc[x][overlap.loc[x] == 1].index],
2))
]
if not weighted_overlap:
for x in overlap.index:
if overlap.loc[x].sum() > 1:
edges[x] = [
y + ({
str(clone_): x
}, ) for y in list(
combinations([
i for i in overlap.loc[x][overlap.loc[x] > 0].index
], 2))
]
else:
tmp_overlap = overlap.astype(bool).sum(axis=1)
combis = {
x: list(
combinations(
[i for i in overlap.loc[x][overlap.loc[x] > 0].index], 2))
for x in tmp_overlap.index if tmp_overlap.loc[x] > 1
}

tmp_edge_weight_dict = defaultdict(list)
for k_clone, val_pair in combis.items():
for pair in val_pair:
tmp_edge_weight_dict[pair].append(
overlap.loc[k_clone, list(pair)].sum())
for combix in tmp_edge_weight_dict:
tmp_edge_weight_dict[combix] = sum(tmp_edge_weight_dict[combix])
for x in overlap.index:
if overlap.loc[x].sum() > 1:
edges[x] = [
y + ({
str(clone_): x,
'weight': tmp_edge_weight_dict[y],
}, ) for y in list(
combinations([
i for i in overlap.loc[x][overlap.loc[x] > 0].index
], 2))
]

# create graph
G = nx.Graph()
Expand All @@ -802,6 +845,12 @@ def clone_overlap(self: Union[AnnData, Dandelion],
# unpack the edgelist and add to the graph
for edge in edges:
G.add_edges_from(edges[edge])

if not weighted_overlap:
weighted_attr = None
else:
weighted_attr = 'weight'

groupby_dict = dict(zip(data[groupby], data[colorby]))
if color_mapping is None:
if self.__class__ == AnnData:
Expand Down Expand Up @@ -832,21 +881,47 @@ def clone_overlap(self: Union[AnnData, Dandelion],
df = df.sort_values(colorby).drop_duplicates(
subset=groupby, keep="first").reset_index(drop=True)

c = nxv.CircosPlot(G,
node_color=colorby,
node_grouping=colorby,
node_labels=node_labels,
node_label_layout=node_label_layout,
group_label_position=group_label_position,
group_label_offset=group_label_offset,
figsize=figsize)
c.nodes = list(df[groupby])
if 'colorby_dict' in locals():
c.node_colors = [colorby_dict[groupby_dict[c]] for c in c.nodes]
c.compute_group_label_positions()
c.compute_group_colors()
c.draw()
if save is not None:
plt.savefig(save, bbox_inches='tight', **kwargs)
if return_graph:
return (c)
try:
from importlib.metadata import version
NXVIZVERSION = version("nxviz")
except:
from pkg_resources import get_distribution
NXVIZVERSION = get_distribution("nxviz").version
if NXVIZVERSION < '0.7.3':
c = nxv.CircosPlot(G,
node_color=colorby,
node_grouping=colorby,
node_labels=node_labels,
node_label_layout=node_label_layout,
group_label_position=group_label_position,
group_label_offset=group_label_offset,
edge_width=weighted_attr,
figsize=figsize)
c.nodes = list(df[groupby])
if 'colorby_dict' in locals():
c.node_colors = [colorby_dict[groupby_dict[c]] for c in c.nodes]
c.compute_group_label_positions()
c.compute_group_colors()
if show_plot:
c.draw()
if save is not None:
plt.savefig(save, bbox_inches='tight', **kwargs)
if return_graph:
return (c)
else:
# some limited support for future nxviz plotting api
from nxviz import annotate
c = nxv.circos(
G,
group_by=colorby,
node_color_by=colorby,
edge_lw_by=weighted_attr,
) # group_by
annotate.circos_group(G, group_by=colorby)
annotate.node_colormapping(G, color_by=colorby)
if show_plot:
plt.show()
if save is not None:
plt.savefig(save, bbox_inches='tight', **kwargs)
if return_graph:
return (c.fig, c.ax)
10 changes: 6 additions & 4 deletions dandelion/preprocessing/_preprocessing.py
Expand Up @@ -2,13 +2,12 @@
# @Author: kt16
# @Date: 2020-05-12 17:56:02
# @Last Modified by: Kelvin
# @Last Modified time: 2022-04-08 11:40:40
# @Last Modified time: 2022-05-19 08:24:22

import os
import pandas as pd
from subprocess import run
from tqdm import tqdm
from joblib import Parallel, delayed
from collections import OrderedDict
from time import sleep
from ..utilities._utilities import *
Expand Down Expand Up @@ -2374,6 +2373,7 @@ def calculate_threshold(self: Union[Dandelion, pd.DataFrame, str],
plot_group: Optional[str] = None,
figsize: Tuple[Union[int, float],
Union[int, float]] = (4.5, 2.5),
ncpu: int = 1,
**kwargs) -> Dandelion:
"""
Calculating nearest neighbor distances for tuning clonal assignment with `shazam`.
Expand Down Expand Up @@ -2443,6 +2443,8 @@ def calculate_threshold(self: Union[Dandelion, pd.DataFrame, str],
determines the fill color and facets.
figsize : Tuple[Union[int,float], Union[int,float]]
size of plot. Default is (4.5, 2.5).
ncpu : int
number of CPUs used to run `distToNearest`. Defaults to 1.
**kwargs
passed to shazam's `distToNearest <https://shazam.readthedocs.io/en/stable/topics/distToNearest/>`__.
Expand Down Expand Up @@ -2519,7 +2521,7 @@ def calculate_threshold(self: Union[Dandelion, pd.DataFrame, str],
onlyHeavy=onlyHeavy,
normalize=norm_,
model=model_,
nproc=ncpu_,
nproc=ncpu,
**kwargs)
except:
print(
Expand All @@ -2536,7 +2538,7 @@ def calculate_threshold(self: Union[Dandelion, pd.DataFrame, str],
vCallColumn=v_call,
model=model_,
normalize=norm_,
nproc=ncpu_,
nproc=ncpu,
**kwargs)
# Find threshold using density method
dist = np.array(dist_ham['dist_nearest'])
Expand Down

0 comments on commit 85d4fa0

Please sign in to comment.