diff --git a/DESCRIPTION b/DESCRIPTION index 73fc6dd0c..a4214b629 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Seurat -Version: 5.0.0 -Date: 2023-10-23 +Version: 5.0.1 +Date: 2023-11-16 Title: Tools for Single Cell Genomics Description: A toolkit for quality control, analysis, and exploration of single cell RNA sequencing data. 'Seurat' aims to enable users to identify and interpret sources of heterogeneity from single cell transcriptomic measurements, and to integrate diverse types of single cell data. See Satija R, Farrell J, Gennert D, et al (2015) , Macosko E, Basu A, Satija R, et al (2015) , Stuart T, Butler A, et al (2019) , and Hao, Hao, et al (2020) for more details. Authors@R: c( diff --git a/NEWS.md b/NEWS.md index 28449acdc..1826436d9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,16 @@ +# Seurat 5.0.1 (2023-11-16) + +## Changes + +- Fixed `SCTransform.StdAssay` to pass extra arguments to `sctransform::vst()`. Fixes [#875](https://github.com/satijalab/seurat/issues/7998) +- Fixed PercentageFeatureSet Layer calling [(#8009)](https://github.com/satijalab/seurat/issues/8009) +- Fixed cell highlighting [(#7914)](https://github.com/satijalab/seurat/pull/7914) +- Updated marker sorting to be by p-value with ties broken by absolute difference in percent expression +- Fixed issue with replicated barcodes in MappingScore [(#7922)](https://github.com/satijalab/seurat/issues/7922) +- Improved `PseudobulkExpression` by adding 'g' to cell names that started with numeric values +- Improved `PseudobulkExpression` by adding each variable specified in `group.by` as columns in the object metadata when `return.seurat=TRUE` +- Fixed `DimPlot` and `FeatureScatter` which were breaking when using the `split.by` argument with a variable that contained NAs + # Seurat 5.0.0 (2023-10-25) ## Added diff --git a/R/differential_expression.R b/R/differential_expression.R index c1b1fd4bd..b210eb9fe 100644 --- a/R/differential_expression.R +++ b/R/differential_expression.R @@ -164,7 +164,7 @@ FindAllMarkers <- function( subset = (myAUC > return.thresh | myAUC < (1 - return.thresh)) ) } else if (is.null(x = node) || test.use %in% c('bimod', 't')) { - gde <- gde[order(gde$p_val, -gde[, 2]), ] + gde <- gde[order(gde$p_val, -abs(gde$pct.1-gde$pct.2)), ] gde <- subset(x = gde, subset = p_val < return.thresh) } if (nrow(x = gde) > 0) { @@ -612,7 +612,7 @@ FindMarkers.default <- function( if (test.use %in% DEmethods_nocorrect()) { de.results <- de.results[order(-de.results$power, -de.results[, 1]), ] } else { - de.results <- de.results[order(de.results$p_val, -abs(de.results[,colnames(fc.results)[1]])), ] + de.results <- de.results[order(de.results$p_val, -abs(de.results$pct.1-de.results$pct.2)), ] de.results$p_val_adj = p.adjust( p = de.results$p_val, method = "bonferroni", diff --git a/R/integration.R b/R/integration.R index 16d759c69..05a4fb10c 100644 --- a/R/integration.R +++ b/R/integration.R @@ -2621,19 +2621,19 @@ MappingScore.AnchorSet <- function( combined.object <- slot(object = anchors, name = "object.list")[[1]] combined.object <- RenameCells( object = combined.object, - new.names = unname(obj = sapply( + new.names = unname(obj = make.unique(sapply( X = Cells(x = combined.object), FUN = RemoveLastField - )) + ))) ) - query.cells <- sapply( + query.cells <- make.unique(sapply( X = slot(object = anchors, name = "query.cells"), FUN = RemoveLastField - ) - ref.cells <- sapply( + )) + ref.cells <- make.unique(sapply( X = slot(object = anchors, name = "reference.cells"), FUN = RemoveLastField - ) + )) query.embeddings <- Embeddings(object = subset( x = combined.object[["pcaproject.l2"]], cells = query.cells diff --git a/R/preprocessing5.R b/R/preprocessing5.R index e9c7920c5..e7e3f7cb6 100644 --- a/R/preprocessing5.R +++ b/R/preprocessing5.R @@ -1260,7 +1260,8 @@ SCTransform.StdAssay <- function( conserve.memory = conserve.memory, return.only.var.genes = return.only.var.genes, seed.use = seed.use, - verbose = verbose) + verbose = verbose, + ...) min_var <- vst.out$arguments$min_variance residual.type <- vst.out[['residual_type']] %||% 'pearson' assay.out <- CreateSCTAssay(vst.out = vst.out, do.correct.umi = do.correct.umi, residual.type = residual.type, diff --git a/R/utilities.R b/R/utilities.R index 83c78cda9..912684da4 100644 --- a/R/utilities.R +++ b/R/utilities.R @@ -1170,7 +1170,7 @@ PercentageFeatureSet <- function( warn(message = "Both pattern and features provided. Pattern is being ignored.") } percent.featureset <- list() - layers <- Layers(object = object, pattern = "counts") + layers <- Layers(object = object, search = "counts") for (i in seq_along(along.with = layers)) { layer <- layers[i] features.layer <- features %||% grep( @@ -1425,6 +1425,8 @@ PseudobulkExpression.Seurat <- function( stop("Number of layers provided does not match number of assays") } data <- FetchData(object = object, vars = rev(x = group.by)) + #only keep meta-data columns that are in object + group.by <- intersect(group.by, colnames(data)) data <- data[which(rowSums(x = is.na(x = data)) == 0), , drop = F] if (nrow(x = data) < ncol(x = object)) { inform("Removing cells with NA for 1 or more grouping variables") @@ -1450,6 +1452,22 @@ PseudobulkExpression.Seurat <- function( data <- data[, which(num.levels > 1), drop = F] } category.matrix <- CreateCategoryMatrix(labels = data, method = method) + #check if column names are numeric + col.names <- colnames(category.matrix) + if (any(!(grepl("^[a-zA-Z]|^\\.[^0-9]", col.names)))) { + col.names <- ifelse( + !(grepl("^[a-zA-Z]|^\\.[^0-9]", col.names)), + paste0("g", col.names), + col.names + ) + colnames(category.matrix) <- col.names + inform( + message = paste0("First group.by variable `", group.by[1], + "` starts with a number, appending `g` to ensure valid variable names"), + .frequency = "regularly", + .frequency_id = "PseudobulkExpression" + ) + } data.return <- list() for (i in 1:length(x = assays)) { if (inherits(x = features, what = "list")) { @@ -1568,16 +1586,40 @@ PseudobulkExpression.Seurat <- function( toRet <- ScaleData(object = toRet, verbose = verbose) } } - if ('ident' %in% group.by) { - first.cells <- sapply( - X = 1:ncol(x = category.matrix), - FUN = function(x) { - return(category.matrix[,x, drop = FALSE ]@i[1] + 1) - } + #add meta-data based on group.by variables + cells <- Cells(toRet) + for (i in 1:length(group.by)) { + if (group.by[i] != "ident") { + v <- sapply( + strsplit(cells, "_"), + function(x) {return(x[i])} + ) + names(v) <- cells + toRet <- AddMetaData(toRet, + metadata = v, + col.name = group.by[i] + ) + } + } + #set idents to pseudobulk variables + Idents(toRet) <- cells + + #make orig.ident variable + #orig.ident = ident if group.by includes `ident` + #if not, orig.ident is equal to pseudobulk cell names + if(any(group.by == "ident")) { + i = which(group.by == "ident") + v <- sapply( + strsplit(cells, "_"), + function(x) {return(x[i])} + ) + names(v) <- cells + toRet <- AddMetaData(toRet, + metadata = v, + col.name = "orig.ident" ) - Idents(object = toRet, - cells = colnames(x = toRet) - ) <- Idents(object = object)[first.cells] + } else { + toRet$orig.ident <- cells } return(toRet) } else { diff --git a/R/visualization.R b/R/visualization.R index 76605a25d..04937db62 100644 --- a/R/visualization.R +++ b/R/visualization.R @@ -921,7 +921,9 @@ DimPlot <- function( data[, shape.by] <- object[[shape.by, drop = TRUE]] } if (!is.null(x = split.by)) { - data[, split.by] <- FetchData(object = object, vars = split.by)[split.by] + split <- FetchData(object = object, vars = split.by, clean=TRUE)[split.by] + data <- data[rownames(split),] + data[, split.by] <- split } if (isTRUE(x = shuffle)) { set.seed(seed = seed) @@ -2045,7 +2047,9 @@ FeatureScatter <- function( } } if (!is.null(x = split.by)) { - data[, split.by] <- FetchData(object = object, vars = split.by)[split.by] + split <- FetchData(object = object, vars = split.by, clean=TRUE)[split.by] + data <- data[rownames(split),] + data[, split.by] <- split } plots <- lapply( X = group.by, @@ -7895,7 +7899,7 @@ SetHighlight <- function( # Check for raster if (isTRUE(x = raster)) { - size <- size[1] + size <- sizes.highlight[1] } plot.order <- sort(x = unique(x = highlight), na.last = TRUE) @@ -8200,7 +8204,7 @@ SingleDimPlot <- function( raster <- raster %||% (nrow(x = data) > 1e5) pt.size <- pt.size %||% AutoPointSize(data = data, raster = raster) - if (!is.null(x = cells.highlight) && pt.size == AutoPointSize(data = data, raster = raster) && sizes.highlight != pt.size && isTRUE(x = raster)) { + if (!is.null(x = cells.highlight) && pt.size != AutoPointSize(data = data, raster = raster) && sizes.highlight != pt.size && isTRUE(x = raster)) { warning("When `raster = TRUE` highlighted and non-highlighted cells must be the same size. Plot will use the value provided to 'sizes.highlight'.") } diff --git a/_pkgdown.yaml b/_pkgdown.yaml index 43030a5e0..c3679bf2d 100644 --- a/_pkgdown.yaml +++ b/_pkgdown.yaml @@ -14,70 +14,72 @@ navbar: title: "Seurat" left: - text: "Install" - href: articles/install.html - - text: "Seurat v5" - href: articles/get_started_v5.html + href: articles/install_v5.html - text: "Get started" - href: articles/get_started.html + href: articles/get_started_v5_new.html - text: "Vignettes" menu: - text: Introductory Vignettes - text: "PBMC 3K guided tutorial" href: articles/pbmc3k_tutorial.html + - text: "Data visualization vignette" + href: articles/visualization_vignette.html + - text: "SCTransform, v2 regularization" + href: articles/sctransform_vignette.html - text: "Using Seurat with multi-modal data" href: articles/multimodal_vignette.html + - text: "Seurat v5 Command Cheat Sheet" + href: articles/essential_commands.html - text: ------- - text: Data Integration - text: "Introduction to scRNA-seq integration" href: articles/integration_introduction.html + - text: "Integrative analysis in Seurat v5" + href: articles/seurat5_integration.html - text: "Mapping and annotating query datasets" href: articles/integration_mapping.html - - text: "Fast integration using reciprocal PCA (RPCA)" - href: articles/integration_rpca.html - - text: "Tips for integrating large datasets" - href: articles/integration_large_datasets.html - - text: "Integrating scRNA-seq and scATAC-seq data" - href: articles/atacseq_integration_vignette.html - - text: "Multimodal reference mapping" - href: articles/multimodal_reference_mapping.html - text: ------- - - text: New Statistical Methods + - text: Multi-assay data + - text: "Dictionary Learning for cross-modality integration" + href: articles/seurat5_integration_bridge.html - text: "Weighted Nearest Neighbor Analysis" href: articles/weighted_nearest_neighbor_analysis.html + - text: "Integrating scRNA-seq and scATAC-seq data" + href: articles/seurat5_atacseq_integration_vignette.html + - text: "Multimodal reference mapping" + href: articles/multimodal_reference_mapping.html - text: "Mixscape Vignette" href: articles/mixscape_vignette.html - - text: "Using sctransform in Seurat" - href: articles/sctransform_vignette.html - - text: "SCTransform, v2 regularization" - href: articles/sctransform_v2_vignette.html + - text: ------- + - text: Massively scalable analysis + - text: "Sketch-based analysis in Seurat v5" + href: articles/seurat5_sketch_analysis.html + - text: "Sketch integration using a 1 million cell dataset from Parse Biosciences" + href: articles/ParseBio_sketch_integration.html + - text: "Map COVID PBMC datasets to a healthy reference" + href: articles/COVID_SCTMapping.html + - text: "BPCells Interaction" + href: articles/seurat5_bpcells_interaction_vignette.html + - text: ------- + - text: Spatial analysis + - text: "Analysis of spatial datasets (Imaging-based)" + href: articles/seurat5_spatial_vignette_2.html + - text: "Analysis of spatial datasets (Sequencing-based)" + href: articles/spatial_vignette.html - text: ------- - text: Other - - text: "Data visualization vignette" - href: articles/visualization_vignette.html - text: "Cell-cycle scoring and regression" href: articles/cell_cycle_vignette.html - text: "Differential expression testing" href: articles/de_vignette.html - text: "Demultiplexing with hashtag oligos (HTOs)" href: articles/hashing_vignette.html - - text: "Interoperability between single-cell object formats" - href: articles/conversion_vignette.html - - text: "Parallelization in Seurat with future" - href: articles/future_vignette.html - - text: "Dimensional reduction vignette" - href: articles/dim_reduction_vignette.html - - text: "Seurat essential commands list" - href: articles/essential_commands.html - - text: "Seurat interaction tips" - href: articles/interaction_vignette.html - - text: "Merging Seurat objects" - href: articles/merge_vignette.html - text: Extensions href: articles/extensions.html - text: FAQ href: "https://github.com/satijalab/seurat/discussions" - text: "News" - href: news/index.html + href: articles/announcements.html - text: "Reference" href: reference/index.html - text: "Archive" @@ -157,6 +159,16 @@ reference: - contents: - has_concept("convenience") +- title: "Multimodal" + desc: "Functions for multimodal analysis" +- contents: + - has_concept("multimodal") + +- title: "Re-exports" + desc: "Functions for flexible analysis of massively scalable datasets" +- contents: + - has_concept("sketching") + - title: "Re-exports" desc: "Functions re-exported from other packages" - contents: diff --git a/index.md b/index.md index ad2694402..33f874f55 100644 --- a/index.md +++ b/index.md @@ -1,14 +1,9 @@ ![](articles/assets/seurat_banner.jpg) -## **Beta release of Seurat v5** +## **Seurat v5** -We are excited to release an initial beta version of Seurat v5! This update brings the following new features and functionality: +We are excited to release Seurat v5! To install, please follow the instructions in our [install page](install.html). This update brings the following new features and functionality: -* **Analysis of sequencing and imaging-based spatial datasets:** Spatially resolved datasets are redefining our understanding of cellular interactions and the organization of human tissues. Both sequencing-based(i.e. Visium, SLIDE-seq, etc.), and imaging-based (MERFISH/Vizgen, Xenium, CosMX, etc.) technologies have unique advantages, and require tailored analytical methods and software infrastructure. In Seurat v5, we introduce flexible and diverse support for a wide variety of spatially resolved data types, and support for analytical techniqiues for scRNA-seq integration, deconvolution, and niche identification. - - - Vignette: [Analysis of spatial datasets (Sequencing-based)](articles/seurat5_spatial_vignette.html) - - Vignette: [Analysis of spatial datasets (Imaging-based)](articles/seurat5_spatial_vignette_2.html)\ -\ * **Integrative multimodal analysis:** The cellular transcriptome is just one aspect of cellular identity, and recent technologies enable routine profiling of chromatin accessibility, histone modifications, and protein levels from single cells. In Seurat v5, we introduce 'bridge integration', a statistical method to integrate experiments measuring different modalities (i.e. separate scRNA-seq and scATAC-seq datasets), using a separate multiomic dataset as a molecular 'bridge'. For example, we demonstrate how to map scATAC-seq datasets onto scRNA-seq datasets, to assist users in interpreting and annotating data from new modalities.\ \ We recognize that while the goal of matching shared cell types across datasets may be important for many problems, users may also be concerned about which method to use, or that integration could result in a loss of biological resolution. In Seurat v5, we also introduce flexible and streamlined workflows for the integration of multiple scRNA-seq datasets. This makes it easier to explore the results of different integration methods, and to compare these results to a workflow that excludes integration steps. @@ -28,7 +23,16 @@ We enable high-performance via the BPCells package, developed by Ben Parks in th - Vignette: [Interacting with BPCell matrices in Seurat v5](articles/seurat5_bpcells_interaction_vignette.html) - BPCells R Package: [Scaling Single Cell Analysis to Millions of Cells](https://bnprks.github.io/BPCells/)\ \ -* **Backwards compatibility:** While Seurat v5 introduces new functionality, we have ensured that the software is backwards-compatible with previous versions, so that users will continue to be able to re-run existing workflows. As v5 is still in beta, the CRAN installation (`install.packages("Seurat")`) will continue to install Seurat v4, but users can opt-in to test Seurat v5 by following the instructions in our [install page](install.html).\ +* **Analysis of sequencing and imaging-based spatial datasets:** Spatially resolved datasets are redefining our understanding of cellular interactions and the organization of human tissues. Both sequencing-based(i.e. Visium, SLIDE-seq, etc.), and imaging-based (MERFISH/Vizgen, Xenium, CosMX, etc.) technologies have unique advantages, and require tailored analytical methods and software infrastructure. In Seurat v5, we introduce flexible and diverse support for a wide variety of spatially resolved data types, and support for analytical techniqiues for scRNA-seq integration, deconvolution, and niche identification. + + - Vignette: [Analysis of spatial datasets (Sequencing-based)](articles/seurat5_spatial_vignette.html) + - Vignette: [Analysis of spatial datasets (Imaging-based)](articles/seurat5_spatial_vignette_2.html)\ +\ +* **Backwards compatibility:** While Seurat v5 introduces new functionality, we have ensured that the software is backwards-compatible with previous versions, so that users will continue to be able to re-run existing workflows. Previous versions of Seurat, such as Seurat v4, can also be installed following the instructions in our [install page](install.html).\ + +## **Changes between v4 and v5** + +We have documented major changes between Seurat v4 and v5 in our [News page](announcements.html) for reference. ## **About Seurat** @@ -36,7 +40,7 @@ Seurat is an R package designed for QC, analysis, and exploration of single-cell If you use Seurat in your research, please considering citing: -* [Hao, et al., bioRxiv 2022](https://doi.org/10.1101/2022.02.24.481684) [Seurat v5] +* [Hao, et al., Nature Biotechnology 2023](https://www.nature.com/articles/s41587-023-01767-y) [Seurat v5] * [Hao\*, Hao\*, et al., Cell 2021](https://doi.org/10.1016/j.cell.2021.04.048) [Seurat v4] * [Stuart\*, Butler\*, et al., Cell 2019](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8) [Seurat v3] * [Butler, et al., Nat Biotechnol 2018](https://doi.org/10.1038/nbt.4096) [Seurat v2] diff --git a/man/reexports.Rd b/man/reexports.Rd index 08ced67a3..fa7258d42 100644 --- a/man/reexports.Rd +++ b/man/reexports.Rd @@ -71,8 +71,8 @@ These objects are imported from other packages. Follow the links below to see their documentation. \describe{ - \item{SeuratObject}{\code{\link[SeuratObject:set-if-null]{\%iff\%}}, \code{\link[SeuratObject:set-if-null]{\%||\%}}, \code{\link[SeuratObject]{AddMetaData}}, \code{\link[SeuratObject:ObjectAccess]{Assays}}, \code{\link[SeuratObject]{Cells}}, \code{\link[SeuratObject]{CellsByIdentities}}, \code{\link[SeuratObject]{Command}}, \code{\link[SeuratObject]{CreateAssayObject}}, \code{\link[SeuratObject]{CreateDimReducObject}}, \code{\link[SeuratObject]{CreateSeuratObject}}, \code{\link[SeuratObject]{DefaultAssay}}, \code{\link[SeuratObject:DefaultAssay]{DefaultAssay<-}}, \code{\link[SeuratObject]{Distances}}, \code{\link[SeuratObject]{Embeddings}}, \code{\link[SeuratObject]{FetchData}}, \code{\link[SeuratObject:AssayData]{GetAssayData}}, \code{\link[SeuratObject]{GetImage}}, \code{\link[SeuratObject]{GetTissueCoordinates}}, \code{\link[SeuratObject:VariableFeatures]{HVFInfo}}, \code{\link[SeuratObject]{Idents}}, \code{\link[SeuratObject:Idents]{Idents<-}}, \code{\link[SeuratObject]{Images}}, \code{\link[SeuratObject:NNIndex]{Index}}, \code{\link[SeuratObject:NNIndex]{Index<-}}, \code{\link[SeuratObject]{Indices}}, \code{\link[SeuratObject]{IsGlobal}}, \code{\link[SeuratObject]{JS}}, \code{\link[SeuratObject:JS]{JS<-}}, \code{\link[SeuratObject]{Key}}, \code{\link[SeuratObject:Key]{Key<-}}, \code{\link[SeuratObject]{Loadings}}, \code{\link[SeuratObject:Loadings]{Loadings<-}}, \code{\link[SeuratObject]{LogSeuratCommand}}, \code{\link[SeuratObject]{Misc}}, \code{\link[SeuratObject:Misc]{Misc<-}}, \code{\link[SeuratObject:ObjectAccess]{Neighbors}}, \code{\link[SeuratObject]{Project}}, \code{\link[SeuratObject:Project]{Project<-}}, \code{\link[SeuratObject]{Radius}}, \code{\link[SeuratObject:ObjectAccess]{Reductions}}, \code{\link[SeuratObject]{RenameCells}}, \code{\link[SeuratObject:Idents]{RenameIdents}}, \code{\link[SeuratObject:Idents]{ReorderIdent}}, \code{\link[SeuratObject]{RowMergeSparseMatrices}}, \code{\link[SeuratObject:VariableFeatures]{SVFInfo}}, \code{\link[SeuratObject:AssayData]{SetAssayData}}, \code{\link[SeuratObject:Idents]{SetIdent}}, \code{\link[SeuratObject:VariableFeatures]{SpatiallyVariableFeatures}}, \code{\link[SeuratObject:Idents]{StashIdent}}, \code{\link[SeuratObject]{Stdev}}, \code{\link[SeuratObject]{Tool}}, \code{\link[SeuratObject:Tool]{Tool<-}}, \code{\link[SeuratObject]{UpdateSeuratObject}}, \code{\link[SeuratObject]{VariableFeatures}}, \code{\link[SeuratObject:VariableFeatures]{VariableFeatures<-}}, \code{\link[SeuratObject]{WhichCells}}, \code{\link[SeuratObject]{as.Graph}}, \code{\link[SeuratObject]{as.Neighbor}}, \code{\link[SeuratObject]{as.Seurat}}, \code{\link[SeuratObject]{as.sparse}}} - \item{generics}{\code{\link[generics]{components}}} + + \item{SeuratObject}{\code{\link[SeuratObject:set-if-null]{\%||\%}}, \code{\link[SeuratObject:set-if-null]{\%iff\%}}, \code{\link[SeuratObject]{AddMetaData}}, \code{\link[SeuratObject]{as.Graph}}, \code{\link[SeuratObject]{as.Neighbor}}, \code{\link[SeuratObject]{as.Seurat}}, \code{\link[SeuratObject]{as.sparse}}, \code{\link[SeuratObject:ObjectAccess]{Assays}}, \code{\link[SeuratObject]{Cells}}, \code{\link[SeuratObject]{CellsByIdentities}}, \code{\link[SeuratObject]{Command}}, \code{\link[SeuratObject]{CreateAssayObject}}, \code{\link[SeuratObject]{CreateDimReducObject}}, \code{\link[SeuratObject]{CreateSeuratObject}}, \code{\link[SeuratObject]{DefaultAssay}}, \code{\link[SeuratObject:DefaultAssay]{DefaultAssay<-}}, \code{\link[SeuratObject]{Distances}}, \code{\link[SeuratObject]{Embeddings}}, \code{\link[SeuratObject]{FetchData}}, \code{\link[SeuratObject:AssayData]{GetAssayData}}, \code{\link[SeuratObject]{GetImage}}, \code{\link[SeuratObject]{GetTissueCoordinates}}, \code{\link[SeuratObject:VariableFeatures]{HVFInfo}}, \code{\link[SeuratObject]{Idents}}, \code{\link[SeuratObject:Idents]{Idents<-}}, \code{\link[SeuratObject]{Images}}, \code{\link[SeuratObject:NNIndex]{Index}}, \code{\link[SeuratObject:NNIndex]{Index<-}}, \code{\link[SeuratObject]{Indices}}, \code{\link[SeuratObject]{IsGlobal}}, \code{\link[SeuratObject]{JS}}, \code{\link[SeuratObject:JS]{JS<-}}, \code{\link[SeuratObject]{Key}}, \code{\link[SeuratObject:Key]{Key<-}}, \code{\link[SeuratObject]{Loadings}}, \code{\link[SeuratObject:Loadings]{Loadings<-}}, \code{\link[SeuratObject]{LogSeuratCommand}}, \code{\link[SeuratObject]{Misc}}, \code{\link[SeuratObject:Misc]{Misc<-}}, \code{\link[SeuratObject:ObjectAccess]{Neighbors}}, \code{\link[SeuratObject]{Project}}, \code{\link[SeuratObject:Project]{Project<-}}, \code{\link[SeuratObject]{Radius}}, \code{\link[SeuratObject:ObjectAccess]{Reductions}}, \code{\link[SeuratObject]{RenameCells}}, \code{\link[SeuratObject:Idents]{RenameIdents}}, \code{\link[SeuratObject:Idents]{ReorderIdent}}, \code{\link[SeuratObject]{RowMergeSparseMatrices}}, \code{\link[SeuratObject:AssayData]{SetAssayData}}, \code{\link[SeuratObject:Idents]{SetIdent}}, \code{\link[SeuratObject:VariableFeatures]{SpatiallyVariableFeatures}}, \code{\link[SeuratObject:Idents]{StashIdent}}, \code{\link[SeuratObject]{Stdev}}, \code{\link[SeuratObject:VariableFeatures]{SVFInfo}}, \code{\link[SeuratObject]{Tool}}, \code{\link[SeuratObject:Tool]{Tool<-}}, \code{\link[SeuratObject]{UpdateSeuratObject}}, \code{\link[SeuratObject]{VariableFeatures}}, \code{\link[SeuratObject:VariableFeatures]{VariableFeatures<-}}, \code{\link[SeuratObject]{WhichCells}}} }} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 540e5c2d8..7a3302c6b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -402,7 +402,7 @@ BEGIN_RCPP END_RCPP } -RcppExport SEXP isnull(void *); +RcppExport SEXP isnull(SEXP); static const R_CallMethodDef CallEntries[] = { {"_Seurat_RunModularityClusteringCpp", (DL_FUNC) &_Seurat_RunModularityClusteringCpp, 9}, diff --git a/tests/testthat.R b/tests/testthat.R index 94db12da2..0bea94fda 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,10 +1,10 @@ library(testthat) library(Seurat) - -# Run tests for 'v3' -message('Run tests for v3 assay') -options(Seurat.object.assay.version = 'v3') -test_check("Seurat") +# +# # Run tests for 'v3' +# message('Run tests for v3 assay') +# options(Seurat.object.assay.version = 'v3') +# test_check("Seurat") # Run tests for 'v5' message('Run tests for v5 assay') diff --git a/vignettes/COVID_SCTMapping.Rmd b/vignettes/COVID_SCTMapping.Rmd old mode 100755 new mode 100644 index b7fc59575..cc09ebd74 --- a/vignettes/COVID_SCTMapping.Rmd +++ b/vignettes/COVID_SCTMapping.Rmd @@ -33,7 +33,7 @@ knitr::opts_chunk$set( ``` ```{r, warning=F, message=F} -library(Seurat) +devtools::load_all() library(BPCells) library(dplyr) library(patchwork) @@ -56,7 +56,7 @@ We first load the reference (available [here](https://zenodo.org/record/7779017# ```{r load.data} reference <- readRDS("/brahms/hartmana/vignette_data/pbmc_multimodal_2023.rds") -object <- readRDS("/brahms/hartmana/vignette_data/merged_covid_object.rds") +object <- readRDS("/brahms/mollag/seurat_v5/vignette_data/merged_covid_object.rds") object <- NormalizeData(object, verbose = FALSE) ``` @@ -137,7 +137,9 @@ p2 <- ggplot(data = df_comp_relative, mapping = aes(x = disease, y = Plasmablas p1 + p2 + plot_layout(ncol = 2) ``` - +```{r, include=FALSE} +#saveRDS(object, '/home/lis/seurat-private/output/covid_object.rds') +``` ## Differential expression analysis In addition to composition analysis, we use an aggregation-based (pseudobulk) workflow to explore differential genes between healthy individuals and COVID-19 donors. We aggregate all cells within the same cell type and donor using the `AggregateExpression` function. This returns a Seurat object where each ‘cell’ represents the pseudobulk profile of one cell type in one individual. @@ -147,15 +149,11 @@ bulk <- AggregateExpression(object, assays = 'RNA', group.by = c("predicted.celltype.l2", "donor_id", "disease") ) - -bulk$celltype <- sapply(strsplit(Cells(bulk), split = "_"), '[', 1) -bulk$donor <- sapply(strsplit(Cells(bulk), split = "_"), '[', 2) -bulk$disease <- sapply(strsplit(Cells(bulk), split = "_"), '[', 3) ``` ```{r} bulk <- subset(bulk, subset = disease %in% c('normal', 'COVID-19') ) -bulk <- subset(bulk, subset = celltype != c('Doublet') ) +bulk <- subset(bulk, subset = predicted.celltype.l2 != 'Doublet') bulk$disease <- factor(bulk$disease, levels = c('normal', 'COVID-19')) ``` @@ -163,13 +161,13 @@ Once a pseudobulk object is created, we can perform cell type-specific different ```{r, fig.width=10, fig.height=12} p1 <- VlnPlot( - object = bulk, features = 'IFI6', group.by = 'celltype', + object = bulk, features = 'IFI6', group.by = 'predicted.celltype.l2', split.by = 'disease', cols = c("#377eb8", "#e41a1c")) p2 <- VlnPlot( - object = bulk, features = c('ISG15'), group.by = 'celltype', + object = bulk, features = c('ISG15'), group.by = 'predicted.celltype.l2', split.by = 'disease', cols = c("#377eb8", "#e41a1c")) p3 <- VlnPlot( - object = bulk, features = c('IFIT5'), group.by = 'celltype', + object = bulk, features = c('IFIT5'), group.by = 'predicted.celltype.l2', split.by = 'disease', cols = c("#377eb8", "#e41a1c")) p1 + p2 + p3 + plot_layout(ncol = 1) ``` diff --git a/vignettes/ParseBio_sketch_integration.Rmd b/vignettes/ParseBio_sketch_integration.Rmd old mode 100755 new mode 100644 index 9c576ae45..07b4df155 --- a/vignettes/ParseBio_sketch_integration.Rmd +++ b/vignettes/ParseBio_sketch_integration.Rmd @@ -52,7 +52,6 @@ library(ggrepel) library(patchwork) # set this option when analyzing large datasets options(future.globals.maxSize = 3e9) -options(Seurat.object.assay.version = "v5") ``` ## Create a Seurat object containing data from 24 patients We downloaded the original dataset and donor metadata from [Parse Biosciences](https://cdn.parsebiosciences.com/1M_PBMC_T1D_Parse.zip). While the BPCells package can work directly with h5ad files, for optimal performance, we converted the dataset to the compressed sparse format used by BPCells, as described [here](seurat5_bpcells_interaction_vignette.html). @@ -117,7 +116,6 @@ plot.s1 + plot.s2 + plot_layout(ncol = 1) Now that we have integrated the subset of atoms of each dataset, placing them each in an integrated low-dimensional space, we can now place each cell from each dataset in this space as well. We load the full datasets back in individually, and use the `ProjectIntegration` function to integrate all cells. After this function is run, the `integrated.rpca.full` space now embeds all cells in the dataset.Even though all cells in the dataset have been integrated together, the non-sketched cells are not loaded into memory. Users can still switch between the `sketch` (sketched cells, in-memory) and `RNA` (full dataset, on disk) for analysis. After integration, we can also project cell type labels from the sketched cells onto the full dataset using `ProjectData`. ```{r} - # resplit the sketched cell assay into layers # this is required to project the integration onto all cells object[['sketch']] <- split(object[['sketch']], f = object$sample) @@ -144,7 +142,7 @@ object <- ProjectData(object = object, object <- RunUMAP(object, reduction = 'integrated.rpca.full', dims = 1:30 , reduction.name = 'umap.full', reduction.key = 'UMAP_full_') ``` -```{r, fig.width=10, fig.height=10} +```{r, fig.width=10, fig.height=10, eval = FALSE} p1 <- DimPlot(object, reduction = 'umap.full', group.by = 'sample',alpha = 0.1) p2 <- DimPlot(object, reduction = 'umap.full', group.by = 'celltype.full', alpha = 0.1) p1 + p2 + plot_layout(ncol = 1) @@ -159,17 +157,12 @@ After we aggregate cells, we can perform celltype-specific differential expressi ```{r} bulk <- AggregateExpression(object, return.seurat = T, slot = 'counts', assays = 'RNA', group.by = c("celltype.full","sample", 'disease')) - +``` +```{r} # each sample is an individual-specific celltype-specific pseudobulk profile tail(Cells(bulk)) -bulk$celltype <- sapply(strsplit(Cells(bulk), split = "_"), '[', 1) -bulk$donor <- sapply(strsplit(Cells(bulk), split = "_"), '[', 2) -bulk$disease <- sapply(strsplit(bulk$donor, split = "-"), '[', 1) -bulk$disease <- factor(x = bulk$disease, levels = c('H', 'D')) - - -cd14.bulk <- subset(bulk,celltype == "CD14 Mono") +cd14.bulk <- subset(bulk,celltype.full == "CD14 Mono") Idents(cd14.bulk) <- 'disease' de_markers <- FindMarkers(cd14.bulk, ident.1 = 'D',ident.2 = 'H', slot = 'counts', test.use = 'DESeq2', verbose = F ) de_markers$gene <- rownames(de_markers) @@ -177,19 +170,9 @@ ggplot(de_markers, aes(avg_log2FC, -log10(p_val))) + geom_point(size=0.5, alpha= ``` -We do not necessarily expect to see a strong transcriptomic signature of diabetes in the blood, but our analyses reveals multiple genes that are up-regulated in diabetic patients, and are consistent across multiple individuals. Some of these genes, including the complement subcomponent C1R, have been [previously associated with diabetes](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6927818/). Others, including the transcription factor SPDEF and the receptor RAPSN, are also diabetic biomarkers in multiple cell types, including CD14 monocytes. - -```{r,height = 12, width=6} -# each dot represents a pseudobulk average from an individual -VlnPlot(bulk, features = c("C1R"),group.by = 'celltype', split.by = 'disease', cols = c('#377eb8','#e41a1c')) - - -``` - -
**Session Info** -```{r} +```{r, eval = TRUE} sessionInfo() ``` -
\ No newline at end of file + diff --git a/vignettes/announcements.Rmd b/vignettes/announcements.Rmd new file mode 100644 index 000000000..e6c0c8e13 --- /dev/null +++ b/vignettes/announcements.Rmd @@ -0,0 +1,50 @@ +--- +title: "News" +output: + html_document: + theme: united + df_print: kable +--- + +## **Changes in Seurat v5** + +We are excited to release Seurat v5 on CRAN, where it is now the default version for new installs. Seurat v5 is designed to be backwards compatible with Seurat v4 so existing code will continue to run, but we have made some changes to the software that will affect user results. We note that users who aim to reproduce their previous workflows in Seurat v4 can still install this version using the instructions on our [install page](link). + +In particular, we have made changes to: + +* **Seurat Object and Assay class:** +\ +Seurat v5 now includes support for additional assay and data types, including on-disk matrices. To facilitate this, we have introduced an updated Seurat v5 assay. Users can check out this [vignette for more information]. Briefly, Seurat v5 assays store data in layers (previously referred to as 'slots'). + + For example, these layers can store: raw counts `(layer='counts')`, normalized data `(layer='data')`, or z-scored/variance-stabilized data `(layer='scale.data')`. + + Data can be accessed using the `$` accessor (i.e. `obj[["RNA"]]$counts`), or the ``LayerData` function (i.e. `LayerData(obj, assay="RNA", layer='counts')`. +\ + + We've designed these updates to minimize changes for users. Existing Seurat functions and workflows from v4 continue to work in v5. For example, the command `GetAssayData(obj, assay="RNA", slot='counts')`, will run successfully in both Seurat v4 and Seurat v5. + + +* **Integration workflow:** +\ +Seurat v5 introduces a [streamlined integration](integration_introduction.html) and [data transfer](intregration_mapping.html) workflows that performs integration in low-dimensional space, and improves speed and memory efficiency. The results of integration are not identical between the two workflows, but users can still run the [v4 integration workflow](integration_introduction.html) in Seurat v5 if they wish. +\ + + In previous versions of Seurat, the integration workflow required a list of multiple Seurat objects as input. In Seurat v5, all the data can be kept as a single object, but prior to integration, users can simply split the layers. See our [introduction to integration](integration_introduction.html) vignette for more information. + + +* **Differential expression:** +\ +Seurat v5 now uses the [presto package](https://github.com/immunogenomics/presto) (from the Korunsky and Raychaudhari labs), when available, to perform differential expression analysis. Using presto can dramatically speed up DE testing, and we encourage users to install it. +\ +In addition, in Seurat v5 we implement a pseudocount (when calculating log-FC) at the group level instead of the cell level. As a result, users will observe higher logFC estimates in v5 - but should note that these estimates may be more unstable - particularly for genes that are very lowly expressed in one of the two groups. We gratefully acknowledge feedback from the McCarthy and Pachter labs on this topic. + +* **SCTransform v2:** +\ +In [Choudhary and Satija, Genome Biology, 2022](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02584-9), we implement an updated version 2 of sctransform. This is now the default version when running `SCTransform` in Seurat v5. Users who wish to run the previous workflow can set the `vst.flavor = "v1"` argument in the `SCTransform` function. +\ +\ +* **Pseudobulk analysis:** +\ +Once a single-cell dataset has been analyzed to annotate cell subpopulations, pseudobulk analyses (i.e. aggregating together cells within a given subpopulation and sample) can reduce noise, improve quantification of lowly expressed genes, and reduce the size of the data matrix. In Seurat v5, we encourage the use of the `AggregateExpression` function to perform pseudobulk analysis. +\ +Check out our [differential expression vignette](de_vignette.html) as well as our [pancreatic/healthy PBMC comparison](ParseBio_sketch_integration.html), for examples of how to use `AggregateExpression` to perform robust differential expression of scRNA-seq data from multiple different conditions. \ No newline at end of file diff --git a/vignettes/archive.yaml b/vignettes/archive.yaml index 868c8e1dc..790109365 100644 --- a/vignettes/archive.yaml +++ b/vignettes/archive.yaml @@ -1,25 +1,25 @@ vignettes: - name: atacseq_integration_vignette title: PBMC scATAC-seq Vignette - versions: [v3.2, v3.1, v3.0] + versions: [v4.3, v3.2, v3.1, v3.0] - name: cell_cycle_vignette title: Cell-Cycle Scoring and Regression - versions: [v3.2, v3.1, v3.0, v2.4] + versions: [v4.3, v3.2, v3.1, v3.0, v2.4] - name: conversion_vignette title: Interoperability between single-cell object formats - versions: [v3.2, v3.1, v3.0, v2.4] + versions: [v4.3, v4.3, v3.2, v3.1, v3.0, v2.4] - name: de_vignette title: Differential expression testing - versions: [v3.2, v3.1, v3.0, v2.4] + versions: [v4.3, v3.2, v3.1, v3.0, v2.4] - name: dim_reduction_vignette title: Seurat - Dimensional Reduction Vignette - versions: [v3.2, v3.1, v3.0, v2.4] + versions: [v4.3, v3.2, v3.1, v3.0, v2.4] - name: future_vignette title: Parallelization in Seurat with future - versions: [v3.2, v3.1, v3.0] + versions: [v4.3, v3.2, v3.1, v3.0] - name: hashing_vignette title: Demultiplexing with hashtag oligos (HTOs) - versions: [v3.2, v3.1, v3.0, v2.4] + versions: [v4.3, v3.2, v3.1, v3.0, v2.4] - name: immune_alignment title: "Tutorial: Integrating stimulated vs. control PBMC datasets to learn cell- type specific responses" @@ -27,29 +27,47 @@ type specific responses" - name: integration title: Integration and Label Transfer versions: [v3.2, v3.1, v3.0] + - name: integration_mapping + title: Mapping and annotating query datasets + versions: [v4.3] + - name: integration_large_datasets + title: Tips for integrating large datasets + versions: [v4.3] + - name: integration_rpca + title: Fast integration using reciprocal PCA (RPCA) + versions: [v4.3] - name: interaction_vignette title: Seurat - Interaction Tips - versions: [v3.2, v3.1, v3.0, v2.4] + versions: [v4.3, v3.2, v3.1, v3.0, v2.4] - name: mca title: Guided Clustering of the Microwell-seq Mouse Cell Atlas versions: [v3.2, v3.1, v3.0, v2.4] + - name: merge + title: Seurat - Combining Two 10X Runs + versions: [v4.3] - name: mixscape title: Mixscape Vignette - versions: [v3.2, v3.1] + versions: [v4.3, v3.2, v3.1] + - name: multimodal_reference_mapping + title: Multimodal reference mapping + versions: [v4.3] - name: multimodal_vignette title: Using Seurat with multi-modal data - versions: [v3.2, v3.1, v3.0, v2.4] + versions: [v4.3, v3.2, v3.1, v3.0, v2.4] - name: pbmc3k_tutorial title: Guided tutorial --- 2,700 PBMCs - versions: [v3.2, v3.1, v3.0, v2.4, v1.4] + versions: [v4.3, v3.2, v3.1, v3.0, v2.4, v1.4] - name: sctransform_vignette title: Using sctransform in Seurat - versions: [v3.2, v3.1, v3.0] + versions: [v4.3, v3.2, v3.1, v3.0] - name: spatial_vignette title: Analysis, visualization, and integration of spatial datasets with Seurat - versions: [v3.2] + versions: [v4.3, v3.2] - name: visualization_vignette title: New data visualization methods in v3.0 - versions: [v3.2, v3.1, v3.0, v2.4] + versions: [v4.3, v3.2, v3.1, v3.0, v2.4] + - name: weighted_nearest_neighbor_analysis + title: Weighted Nearest Neighbor Analysis + versions: [v4.3] diff --git a/vignettes/cell_cycle_vignette.Rmd b/vignettes/cell_cycle_vignette.Rmd index 3c0d026b6..bb3340f62 100644 --- a/vignettes/cell_cycle_vignette.Rmd +++ b/vignettes/cell_cycle_vignette.Rmd @@ -39,7 +39,7 @@ library(Seurat) # Read in the expression matrix # The first row is a header row, the first column is rownames -exp.mat <- read.table(file = "/Users/sli/seurat-private/data/cell_cycle_vignette_files/nestorawa_forcellcycle_expressionMatrix.txt", header = TRUE, as.is = TRUE, row.names = 1) +exp.mat <- read.table(file = "/brahms/shared/vignette-data/cell_cycle_vignette_files/nestorawa_forcellcycle_expressionMatrix.txt", header = TRUE, as.is = TRUE, row.names = 1) # A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat. # We can segregate this list into markers of G2/M phase and markers of S phase diff --git a/vignettes/de_vignette.Rmd b/vignettes/de_vignette.Rmd index d4a3906b5..eb350de5a 100644 --- a/vignettes/de_vignette.Rmd +++ b/vignettes/de_vignette.Rmd @@ -8,136 +8,189 @@ output: date: 'Compiled: `r Sys.Date()`' --- -```{r setup, include=FALSE} -all_times <- list() # store the time for each chunk -knitr::knit_hooks$set(time_it = local({ - now <- NULL - function(before, options) { - if (before) { - now <<- Sys.time() - } else { - res <- difftime(Sys.time(), now, units = "secs") - all_times[[options$label]] <<- res - } - } -})) -knitr::opts_chunk$set( - tidy = TRUE, - tidy.opts = list(width.cutoff = 95), - message = FALSE, - warning = FALSE, - time_it = TRUE, - error = TRUE -) -``` - -```{r, include = FALSE} -options(SeuratData.repo.use = "http://satijalab04.nygenome.org") -``` - -# Load in the data +### Load in the data -This vignette highlights some example workflows for performing differential expression in Seurat. For demonstration purposes, we will be using the 2,700 PBMC object that is available via the [SeuratData](https://github.com/satijalab/seurat-data) package). +This vignette highlights some example workflows for performing differential expression in Seurat. For demonstration purposes, we will be using the interferon-beta stimulated human PBMCs dataset ([ifnb](https://www.nature.com/articles/nbt.4042)) that is available via the [SeuratData](https://github.com/satijalab/seurat-data) package. -```{r load_data} +```{r echo=TRUE, message=FALSE, warning=FALSE} library(Seurat) library(SeuratData) -pbmc <- LoadData("pbmc3k", type = "pbmc3k.final") +library(ggplot2) +ifnb <- LoadData("ifnb") ``` -# Perform default differential expression tests +### Perform default differential expression tests -The bulk of Seurat's differential expression features can be accessed through the `FindMarkers()` function. As a default, Seurat performs differential expression based on the non-parametric Wilcoxon rank sum test. This replaces the previous default test ('bimod'). To test for differential expression between two specific groups of cells, specify the `ident.1` and `ident.2` parameters. +The bulk of Seurat’s differential expression features can be accessed through the `FindMarkers()` function. By default, Seurat performs differential expression (DE) testing based on the non-parametric Wilcoxon rank sum test. To test for DE genes between two specific groups of cells, specify the `ident.1` and `ident.2` parameters. -```{r basic_de} -# list options for groups to perform differential expression on -levels(pbmc) -# Find differentially expressed features between CD14+ and FCGR3A+ Monocytes -monocyte.de.markers <- FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono") +```{r echo=TRUE, message=FALSE, warning=FALSE,results=TRUE} +# Normalize the data +ifnb <- NormalizeData(ifnb) + +# Find DE features between CD16 Mono and CD1 Mono +Idents(ifnb) <- "seurat_annotations" +monocyte.de.markers <- FindMarkers(ifnb, ident.1 = "CD16 Mono", ident.2 = "CD14 Mono") # view results head(monocyte.de.markers) ``` -The results data frame has the following columns : +The results data frame has the following columns : - * p_val : p_val (unadjusted) - * avg_log2FC : log fold-change of the average expression between the two groups. Positive values indicate that the feature is more highly expressed in the first group. - * pct.1 : The percentage of cells where the feature is detected in the first group - * pct.2 : The percentage of cells where the feature is detected in the second group - * p_val_adj : Adjusted p-value, based on Bonferroni correction using all features in the dataset. +* p_val : p-value (unadjusted) +* avg_log2FC : log fold-change of the average expression between the two groups. Positive values indicate that the feature is more highly expressed in the first group. +* pct.1 : The percentage of cells where the feature is detected in the first group +* pct.2 : The percentage of cells where the feature is detected in the second group +* p_val_adj : Adjusted p-value, based on Bonferroni correction using all features in the dataset. -If the `ident.2` parameter is omitted or set to NULL, `FindMarkers()` will test for differentially expressed features between the group specified by `ident.1` and all other cells. +If the `ident.2` parameter is omitted or set to `NULL`, `FindMarkers()` will test for differentially expressed features between the group specified by `ident.1` and all other cells. Additionally, the parameter `only.pos` can be set to `TRUE` to only search for positive markers, i.e. features that are more highly expressed in the `ident.1` group. -```{r basic_de_2} -# Find differentially expressed features between CD14+ Monocytes and all other cells, only search for positive markers -monocyte.de.markers <- FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = NULL, only.pos = TRUE) +```{r echo=TRUE, message=FALSE, warning=FALSE,results=TRUE} +# Find differentially expressed features between CD14+ Monocytes and all other cells, only +# search for positive markers +monocyte.de.markers <- FindMarkers(ifnb, ident.1 = "CD16 Mono", ident.2 = NULL, only.pos = TRUE) # view results head(monocyte.de.markers) ``` -# Prefilter features or cells to increase the speed of DE testing +### Perform DE analysis within the same cell type across conditions + +Since this dataset contains treatment information (control versus stimulated with interferon-beta), we can also ask what genes change in different conditions for cells of the same type. First, we create a column in the `meta.data` slot to hold both the cell type and treatment information and switch the current `Idents` to that column. Then we use `FindMarkers()` to find the genes that are different between control and stimulated CD14 monocytes. + +```{r echo=TRUE, message=FALSE, warning=FALSE,results=TRUE} +ifnb$celltype.stim <- paste(ifnb$seurat_annotations, ifnb$stim, sep = "_") +Idents(ifnb) <- "celltype.stim" +mono.de <- FindMarkers(ifnb, ident.1 = "CD14 Mono_STIM", ident.2 = "CD14 Mono_CTRL", verbose = FALSE) +head(mono.de, n = 10) +``` -To increase the speed of marker discovery, particularly for large datasets, Seurat allows for pre-filtering of features or cells. For example, features that are very infrequently detected in either group of cells, or features that are expressed at similar average levels, are unlikely to be differentially expressed. Example use cases of the `min.pct`, `logfc.threshold`, `min.diff.pct`, and `max.cells.per.ident` parameters are demonstrated below. +However, the p-values obtained from this analysis should be interpreted with caution, because these tests treat each cell as an independent replicate and ignore inherent correlations between cells originating from the same sample. Such analyses have been shown to find a large number of false positive associations, as has been demonstrated by [Squair et al., 2021](https://www.nature.com/articles/s41467-021-25960-2), [Zimmerman et al., 2021](https://www.nature.com/articles/s41467-021-21038-1), [Junttila et al., 2022](https://academic.oup.com/bib/article/23/5/bbac286/6649780), and others. Below, we show how pseudobulking can be used to account for such within-sample correlation. -```{r prefilter} -# Pre-filter features that are detected at <50% frequency in either CD14+ Monocytes or FCGR3A+ Monocytes -head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", min.pct = 0.5)) +#### Perform DE analysis after pseudobulking -# Pre-filter features that have less than a two-fold change between the average expression of CD14+ Monocytes vs FCGR3A+ Monocytes -head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", logfc.threshold = log(2))) +To pseudobulk, we will use [AggregateExpression()](https://satijalab.org/seurat/reference/aggregateexpression) to sum together gene counts of all the cells from the same sample for each cell type. This results in one gene expression profile per sample and cell type. We can then perform DE analysis using [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) on the sample level. This treats the samples, rather than the individual cells, as independent observations. -# Pre-filter features whose detection percentages across the two groups are similar (within 0.25) -head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", min.diff.pct = 0.25)) +First, we need to retrieve the sample information for each cell. This is not loaded in the metadata, so we will load it from the [Github repo](https://github.com/yelabucsf/demuxlet_paper_code/tree/master/) of the source data for the original paper. -# Increasing min.pct, logfc.threshold, and min.diff.pct, will increase the speed of DE testing, but could also miss features that are prefiltered +
**Add sample information to the dataset** -# Subsample each group to a maximum of 200 cells. Can be very useful for large clusters, or computationally-intensive DE tests -head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", max.cells.per.ident = 200)) +```{r echo=TRUE, message=FALSE, warning=FALSE,results=TRUE} +# load the inferred sample IDs of each cell +ctrl <- read.table(url("https://raw.githubusercontent.com/yelabucsf/demuxlet_paper_code/master/fig3/ye1.ctrl.8.10.sm.best"), head = T, stringsAsFactors = F) +stim <- read.table(url("https://raw.githubusercontent.com/yelabucsf/demuxlet_paper_code/master/fig3/ye2.stim.8.10.sm.best"), head = T, stringsAsFactors = F) +info <- rbind(ctrl, stim) + +# rename the cell IDs by substituting the '-' into '.' +info$BARCODE <- gsub(pattern = "\\-", replacement = "\\.", info$BARCODE) + +# only keep the cells with high-confidence sample ID +info <- info[grep(pattern = "SNG", x = info$BEST), ] + +# remove cells with duplicated IDs in both ctrl and stim groups +info <- info[!duplicated(info$BARCODE) & !duplicated(info$BARCODE, fromLast = T), ] + +# now add the sample IDs to ifnb +rownames(info) <- info$BARCODE +info <- info[, c("BEST"), drop = F] +names(info) <- c("donor_id") +ifnb <- AddMetaData(ifnb, metadata = info) + +# remove cells without donor IDs +ifnb$donor_id[is.na(ifnb$donor_id)] <- "unknown" +ifnb <- subset(ifnb, subset = donor_id != "unknown") ``` +
+\ + +We can now perform pseudobulking (`AggregateExpression()`) based on the donor IDs. -# Perform DE analysis using alternative tests - -The following differential expression tests are currently supported: - - * "wilcox" : Wilcoxon rank sum test (default) - * "bimod" : Likelihood-ratio test for single cell feature expression, [(McDavid et al., Bioinformatics, 2013)](https://www.ncbi.nlm.nih.gov/pubmed/23267174) - * "roc" : Standard AUC classifier - * "t" : Student's t-test - * "poisson" : Likelihood ratio test assuming an underlying negative binomial distribution. Use only for UMI-based datasets - * "negbinom" : Likelihood ratio test assuming an underlying negative binomial distribution. Use only for UMI-based datasets - * "LR" : Uses a logistic regression framework to determine differentially expressed genes. Constructs a logistic regression model predicting group membership based on each feature individually and compares this to a null model with a likelihood ratio test. - * "MAST" : GLM-framework that treates cellular detection rate as a covariate [(Finak et al, Genome Biology, 2015)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4676162/) ([Installation instructions](https://github.com/RGLab/MAST)) - * "DESeq2" : DE based on a model using the negative binomial distribution [(Love et al, Genome Biology, 2014)](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) ([Installation instructions](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)) - -For MAST and DESeq2, please ensure that these packages are installed separately in order to use them as part of Seurat. Once installed, use the `test.use` parameter can be used to specify which DE test to use. - -```{r include = TRUE} -# necessary to get MAST to work properly -library(SingleCellExperiment) +```{r echo=TRUE, message=FALSE, warning=FALSE,results=TRUE} +# pseudobulk the counts based on donor-condition-celltype +pseudo_ifnb <- AggregateExpression(ifnb, assays = "RNA", return.seurat = T, group.by = c("stim", "donor_id", "seurat_annotations")) + +# each 'cell' is a donor-condition-celltype pseudobulk profile +tail(Cells(pseudo_ifnb)) +pseudo_ifnb$celltype.stim <- paste(pseudo_ifnb$seurat_annotations, pseudo_ifnb$stim, sep = "_") ``` - -```{r multiple test} -# Test for DE features using the MAST package -head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", test.use = "MAST")) -# Test for DE features using the DESeq2 package. Throws an error if DESeq2 has not already been installed -# Note that the DESeq2 workflows can be computationally intensive for large datasets, but are incompatible with some feature pre-filtering options -# We therefore suggest initially limiting the number of cells used for testing -head(FindMarkers(pbmc, ident.1 = "CD14+ Mono", ident.2 = "FCGR3A+ Mono", test.use = "DESeq2", max.cells.per.ident = 50)) +Next, we perform DE testing on the pseudobulk level for CD14 monocytes, and compare it against the previous single-cell-level DE results. + +```{r echo=TRUE, message=FALSE, warning=FALSE,results=TRUE} +Idents(pseudo_ifnb) <- "celltype.stim" + +bulk.mono.de <- FindMarkers(object = pseudo_ifnb, + ident.1 = "CD14 Mono_STIM", + ident.2 = "CD14 Mono_CTRL", + test.use = "DESeq2") +head(bulk.mono.de, n = 15) + +# compare the DE P-values between the single-cell level and the pseudobulk level results +names(bulk.mono.de) <- paste0(names(bulk.mono.de), ".bulk") +bulk.mono.de$gene <- rownames(bulk.mono.de) + +names(mono.de) <- paste0(names(mono.de), ".sc") +mono.de$gene <- rownames(mono.de) + +merge_dat <- merge(mono.de, bulk.mono.de, by = "gene") +merge_dat <- merge_dat[order(merge_dat$p_val.bulk), ] + +# Number of genes that are marginally significant in both; marginally significant only in bulk; and marginally significant only in single-cell +common <- merge_dat$gene[which(merge_dat$p_val.bulk < 0.05 & + merge_dat$p_val.sc < 0.05)] +only_sc <- merge_dat$gene[which(merge_dat$p_val.bulk > 0.05 & + merge_dat$p_val.sc < 0.05)] +only_bulk <- merge_dat$gene[which(merge_dat$p_val.bulk < 0.05 & + merge_dat$p_val.sc > 0.05)] +print(paste0('# Common: ',length(common))) +print(paste0('# Only in single-cell: ',length(only_sc))) +print(paste0('# Only in bulk: ',length(only_bulk))) ``` - -# Acknowledgements -We thank the authors of the MAST and DESeq2 packages for their kind assistance and advice. We also point users to the following [study](https://www.nature.com/articles/nmeth.4612) by Charlotte Soneson and Mark Robinson, which performs careful and extensive evaluation of methods for single cell differential expression testing. +We can see that while the p-values are correlated between the single-cell and pseudobulk data, the single-cell p-values are often smaller and suggest higher levels of significance. In particular, there are 3,519 genes with evidence of differential expression (prior to multiple hypothesis testing) in both analyses, 1,649 genes that only appear to be differentially expressed in the single-cell analysis, and just 204 genes that only appear to be differentially expressed in the bulk analysis. We can investigate these discrepancies using `VlnPlot`. + +First, we can examine the top genes that are differentially expressed in both analyses. + +```{r echo=TRUE, message=FALSE, warning=FALSE,results=TRUE} +# create a new column to annotate sample-condition-celltype in the single-cell dataset +ifnb$donor_id.stim <- paste0(ifnb$stim, "-", ifnb$donor_id) -```{r save.times, include = FALSE} -write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/de_vignette_times.csv") +# generate violin plot +Idents(ifnb) <- "celltype.stim" +print(merge_dat[merge_dat$gene%in%common[1:2],c('gene','p_val.sc','p_val.bulk')]) +VlnPlot(ifnb, features = common[1:2], idents = c("CD14 Mono_CTRL", "CD14 Mono_STIM"), group.by = "stim") +VlnPlot(ifnb, features = common[1:2], idents = c("CD14 Mono_CTRL", "CD14 Mono_STIM"), group.by = "donor_id.stim", ncol = 1) ``` -
- **Session Info** -```{r} -sessionInfo() +In both the pseudobulk and single-cell analyses, the p-values for these two genes are astronomically small. For both of these genes, when just comparing all stimulated CD4 monocytes to all control CD4 monocytes across samples, we see much higher expression in the stimulated cells. When breaking down these cells by sample, we continue to see consistently higher expression levels in the stimulated samples compared to the control samples; in other words, this finding is not driven by just one or two samples. Because of this consistency, we find this signal in both analyses. + +By contrast, we can examine examples of genes that are only DE under the single-cell analysis. + +```{r echo=TRUE, message=FALSE, warning=FALSE,results=TRUE} +print(merge_dat[merge_dat$gene%in%c('SRGN','HLA-DRA'),c('gene','p_val.sc','p_val.bulk')]) +VlnPlot(ifnb, features <- c('SRGN','HLA-DRA'), idents = c("CD14 Mono_CTRL", "CD14 Mono_STIM"), group.by = "stim") +VlnPlot(ifnb, features <- c('SRGN','HLA-DRA'), idents = c("CD14 Mono_CTRL", "CD14 Mono_STIM"), group.by = "donor_id.stim", ncol = 1) +``` + +Here, SRGN and HLA-DRA both have very small p-values in the single-cell analysis (on the orders of $10^{-21}$ and $10^{-9}$), but much larger p-values around 0.18 in the pseudobulk analysis. While there appears to be a difference between control and simulated cells when ignoring sample information, the signal is much weaker on the sample level, and we can see notable variability from sample to sample. + +### Perform DE analysis using alternative tests + +Finally, we also support many other DE tests using other methods. For completeness, the following tests are currently supported: + +* "wilcox" : Wilcoxon rank sum test (default, using '[presto](https://github.com/immunogenomics/presto)' package) +* "wilcox_limma" : Wilcoxon rank sum test (using '[limma](https://bioconductor.org/packages/release/bioc/html/limma.html)' package) +* "bimod" : Likelihood-ratio test for single cell feature expression, (McDavid et al., Bioinformatics, 2013) +* "roc" : Standard AUC classifier +* "t" : Student’s t-test +* "poisson" : Likelihood ratio test assuming an underlying negative binomial distribution. Use only for UMI-based datasets +* "negbinom" : Likelihood ratio test assuming an underlying negative binomial distribution. Use only for UMI-based datasets +* "LR" : Uses a logistic regression framework to determine differentially expressed genes. Constructs a logistic regression model predicting group membership based on each feature individually and compares this to a null model with a likelihood ratio test. +* "MAST" : GLM-framework that treates cellular detection rate as a covariate (Finak et al, Genome Biology, 2015) (Installation instructions) +* "DESeq2" : DE based on a model using the negative binomial distribution (Love et al, Genome Biology, 2014) (Installation instructions) +For MAST and DESeq2, please ensure that these packages are installed separately in order to use them as part of Seurat. Once installed, use the test.use parameter can be used to specify which DE test to use. + +```{r echo=TRUE, message=FALSE,warning=FALSE,results=TRUE,eval=FALSE} +# Test for DE features using the MAST package +Idents(ifnb) <- "seurat_annotations" +head(FindMarkers(ifnb, ident.1 = "CD14 Mono", ident.2 = "CD16 Mono", test.use = "MAST")) ``` -
diff --git a/vignettes/essential_commands.Rmd b/vignettes/essential_commands.Rmd index 563b7c65b..f33e5d494 100644 --- a/vignettes/essential_commands.Rmd +++ b/vignettes/essential_commands.Rmd @@ -24,151 +24,306 @@ knitr::opts_chunk$set( ```{r load-data, echo=FALSE} library(Seurat) library(ggplot2) -pbmc <- readRDS(file = '~/Downloads/pbmc3k/pbmc3k_final.rds') -cbmc.rna <- as.sparse(x = read.csv("~/Downloads/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz", sep = ",", header = TRUE, row.names = 1)) +library(SeuratData) -# To make life a bit easier going forward, we're going to discard all but the top 100 most highly expressed mouse genes, and remove the "HUMAN_" from the CITE-seq prefix -cbmc.rna <- CollapseSpeciesExpressionMatrix(object = cbmc.rna) +# install datasets +InstallData('pbmc3k') +InstallData('cbmc') +InstallData('ifnb') -# Load in the ADT UMI matrix -cbmc.adt <- as.sparse(x = read.csv("~/Downloads/GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz", sep = "," ,header = TRUE, row.names = 1)) +#load in pbmc example +pbmc <- LoadData('pbmc3k') -# When adding multimodal data to Seurat, it's okay to have duplicate feature names. Each set of modal data (eg. RNA, ADT, etc.) is stored in its own Assay object. -# One of these Assay objects is called the "default assay", meaning it's used for all analyses and visualization. -# To pull data from an assay that isn't the default, you can specify a key that's linked to an assay for feature pulling. -# To see all keys for all objects, use the Key function. -# Lastly, we observed poor enrichments for CCR5, CCR7, and CD10 - and therefore remove them from the matrix (optional) -cbmc.adt <- cbmc.adt[setdiff(x = rownames(x = cbmc.adt), y = c('CCR5', 'CCR7', 'CD10')), ] -``` +# load in multimodal example +# type ?cbmc for more info +cbmc <- LoadData('cbmc') -# Seurat Standard Worflow +#load in case/control example +# type ?ifnb for more info +ifnb <- LoadData('ifnb') -The standard Seurat workflow takes raw single-cell expression data and aims to find clusters within the data. For full details, please read our tutorial. This process consists of data normalization and variable feature selection, data scaling, a PCA on variable features, construction of a shared-nearest-neighbors graph, and clustering using a modularity optimizer. Finally, we use a t-SNE to visualize our clusters in a two-dimensional space. +``` -```{r seurat-standard-workflow} -pbmc.counts <- Read10X(data.dir = "~/Downloads/pbmc3k/filtered_gene_bc_matrices/hg19/") -pbmc <- CreateSeuratObject(counts = pbmc.counts) +# Standard `Seurat` workflow +```{r seurat-standard-workflow-2} pbmc <- NormalizeData(object = pbmc) pbmc <- FindVariableFeatures(object = pbmc) pbmc <- ScaleData(object = pbmc) pbmc <- RunPCA(object = pbmc) -pbmc <- FindNeighbors(object = pbmc) +pbmc <- FindNeighbors(object = pbmc, dims = 1:30) pbmc <- FindClusters(object = pbmc) -pbmc <- RunTSNE(object = pbmc) -DimPlot(object = pbmc, reduction = 'tsne') +pbmc <- RunUMAP(object = pbmc, dims = 1:30) +DimPlot(object = pbmc, reduction = 'umap') + ``` -# `Seurat` Object Interaction +## SCtransform version +```{r seurat-standard-workflow} +pbmc <- SCTransform(object = pbmc) +pbmc <- RunPCA(object = pbmc) +pbmc <- FindNeighbors(object = pbmc, dims = 1:30) +pbmc <- FindClusters(object = pbmc) +pbmc <- RunUMAP(object = pbmc, dims = 1:30) +DimPlot(object = pbmc, reduction = 'umap') +``` -Since Seurat v3.0, we’ve made improvements to the Seurat object, and added new methods for user interaction. We also introduce simple functions for common tasks, like subsetting and merging, that mirror standard R functions. +```{r chaincmds,eval=FALSE} +# note that you can chain multiple commands together with %>% +pbmc <- SCTransform(pbmc) %>% RunPCA() %>% FindNeighbors(dims=1:30) %>% + FindClusters() %>% RunUMAP(dims=1:30) +``` - +# `Seurat` Object Data Access +## Cell, feature, and layer names ```{r features-and-cells} # Get cell and feature names, and total numbers -colnames(x = pbmc) -Cells(object = pbmc) -rownames(x = pbmc) -ncol(x = pbmc) -nrow(x = pbmc) +# We show multiple ways to get the same output +# cell names +colnames(pbmc) +Cells(pbmc) + +# feature names +Features(pbmc) +rownames(pbmc) + +# number of cells/features +num_cells <- ncol(pbmc) +num_features <- nrow(pbmc) + +# List of object layers +Layers(pbmc) + +# working with multimodal objects +# list assays +Assays(cbmc) + +# Assay-specific features (genes/ADT) +Features(cbmc[["RNA"]]) +Features(cbmc[["ADT"]]) + +# Variable feature names +VariableFeatures(pbmc) + ``` +```{r setvarfeatures, eval=FALSE} +# Set variable features +VariableFeatures(cbmc) <- var.gene.names + +# set for a specific assay +VariableFeatures(cbmc[["ADT"]]) <- var.gene.names +``` + +## Identity class labels ```{r idents} +# Setting and retrieving cell identities + +# Set identity classes to an existing column in meta data +Idents(object = pbmc) <- 'seurat_annotations' + +# View cell identities, get summary table +Idents(pbmc) +table(Idents(pbmc)) + +# Set identity to CD4 T cells for all cells +Idents(pbmc) <- 'CD4 T cells' + +# Set for a selected group of cells +pbmc.cells <- Cells(pbmc) +Idents(object = pbmc, cells = pbmc.cells[1:10]) <- 'CD4 T cells' + # Get cell identity classes Idents(object = pbmc) levels(x = pbmc) -# Stash cell identity classes +# Stash cell identity classes in metadata pbmc[['old.ident']] <- Idents(object = pbmc) pbmc <- StashIdent(object = pbmc, save.name = 'old.ident') -# Set identity classes -Idents(object = pbmc) <- 'CD4 T cells' -Idents(object = pbmc, cells = 1:10) <- 'CD4 T cells' - -# Set identity classes to an existing column in meta data -Idents(object = pbmc, cells = 1:10) <- 'orig.ident' -Idents(object = pbmc) <- 'orig.ident' - # Rename identity classes pbmc <- RenameIdents(object = pbmc, 'CD4 T cells' = 'T Helper cells') ``` +## Cell metadata +```{r metadata} +# View metadata data frame, stored in object@meta.data +pbmc[[]] + +# Retrieve specific values from the metadata +pbmc$nCount_RNA +pbmc[[c('percent.mito', 'nFeature_RNA')]] + +# Add metadata, see ?AddMetaData +random_group_labels <- sample(x = c('g1', 'g2'), size = ncol(x = pbmc), replace = TRUE) +pbmc$groups <- random_group_labels +``` + +## Expression data (stored as layers in Seurat v5) + +```{r expression-matrices, eval=FALSE} +# Retrieve data in an expression matrix +# RNA counts matrix +pbmc[["RNA"]]$counts + +# Alternate accessor function with the same result +LayerData(pbmc,assay = 'RNA',layer = 'counts') + +# GetAssayData from Seurat v4 is still supported +GetAssayData(object = pbmc, assay = 'RNA', slot = 'counts') + +# ADT counts matrix (multimodal object) +cbmc[["ADT"]]$counts + +``` + +```{r setdata, eval = FALSE} +# Set expression data +# assume new.data is a new expression matrix +pbmc[["RNA"]]$counts <- new.data + +# Alternate setter function with the same result +LayerData(pbmc,assay = 'RNA',layer = 'counts') <- new.data + +# SetAssayData from Seurat v4 is still supported +pbmc <- SetAssayData(object = pbmc, slot = 'counts',new.data = new.data) + +``` + +## Dimensional reductions +```{r embeddings-loadings} +# Get cell embeddings and feature loadings +# stored on pbmc[["pca"]]@cell.embeddings +Embeddings(pbmc, reduction = 'pca') + +# stored in pbmc[["pca]]@feature.loadings +Loadings(pbmc, reduction = 'pca') +``` + +```{r customreduc, eval=FALSE} +# Create custom dimensional reduction +# loadings matrix is optional +new_reduction <- CreateDimReducObject(embeddings = new.embeddings,loadings = new.loadings, key = "custom_pca") +pbmc[["custom_pca"]] <- new_reduction +``` + +## FetchData +```{r fetchdata} +# FetchData can access anything from expression matrices, cell embeddings, or metadata +# Use the previously listed commands to access entire matrices +# Use FetchData to access individual/small groups of variables +FetchData(object = pbmc, vars = c('PC_1', 'nFeature_RNA', 'MS4A1'),layer = 'counts') +``` + +# Subsetting and merging + +## Subset Seurat objects ```{r subsetting} # Subset Seurat object based on identity class, also see ?SubsetData -subset(x = pbmc, idents = 'B cells') -subset(x = pbmc, idents = c('CD4 T cells', 'CD8 T cells'), invert = TRUE) +subset(x = pbmc, idents = 'B') +subset(x = pbmc, idents = c('Naive CD4 T', 'CD8 T'), invert = TRUE) # Subset on the expression level of a gene/feature -subset(x = pbmc, subset = MS4A1 > 3) +subset(x = pbmc, subset = MS4A1 > 2.5) # Subset on a combination of criteria -subset(x = pbmc, subset = MS4A1 > 3 & PC1 > 5) -subset(x = pbmc, subset = MS4A1 > 3, idents = 'B cells') +subset(x = pbmc, subset = MS4A1 > 2.5 & PC_1 > 5) +subset(x = pbmc, subset = MS4A1 > 2.5, idents = 'B') # Subset on a value in the object meta data -subset(x = pbmc, subset = orig.ident == "Replicate1") +subset(x = pbmc, subset = groups == "g1") # Downsample the number of cells per identity class subset(x = pbmc, downsample = 100) + +``` + +## Split layers +```{r splitlayers, results = 'hide'} +# In Seurat v5, users can now split in object directly into different layers +# keeps expression data in one object, but splits multiple samples into layers +# can proceed directly to integration workflow after splitting layers +ifnb[["RNA"]] <- split(ifnb[["RNA"]],f = ifnb$stim) +Layers(ifnb) + +# If desired, for example after intergation, the layers can be joined together again +ifnb <- JoinLayers(ifnb) +``` + +## Split objects +```{r splitting, results = 'hide'} +# In line with prior workflows, you can also into split your object into a list of multiple objects +# based on a metadata column +# creates a list of two objects +ifnb_list <- SplitObject(ifnb,split.by = 'stim') +ifnb_list$CTRL +ifnb_list$STIM ``` -```{r merging, eval=FALSE} +## Merge objects (without integration) +In Seurat v5, merging creates a single object, but keeps the expression information split into different layers for integration. If not proceeding with integration, rejoin the layers after merging. + +```{r merging-2, eval=FALSE, results = 'hide'} # Merge two Seurat objects -merge(x = pbmc1, y = pbmc2) -# Merge more than two Seurat objects +merged_obj <- merge(x = ifnb_list$CTRL, y = ifnb_list$STIM) +merged_obj[["RNA"]] <- JoinLayers(merged_obj) + +# Example to merge more than two Seurat objects merge(x = pbmc1, y = list(pbmc2, pbmc3)) ``` -# Data Access +## Merge objects (with integration) +See [introduction to integration](integration_introduction.html) for more information. -Accessing data in Seurat is simple, using clearly defined accessors and setters to quickly find the data needed. +```{r merging-3, eval=FALSE, results = 'hide'} +merged_obj <- merge(x = ifnb_list$CTRL, y = ifnb_list$STIM) +merged_obj <- NormalizeData(merged_obj) +merged_obj <- FindVariableFeatures(merged_obj) +merged_obj <- ScaleData(merged_obj) +merged_obj <- RunPCA(merged_obj) +merged_obj <- IntegrateLayers( + object = obj, method = RPCAIntegration, + orig.reduction = "pca", new.reduction = 'integrated.rpca', + verbose = FALSE) -```{r metadata} -# View metadata data frame, stored in object@meta.data -pbmc[[]] +# now that integration is complete, rejoin layers +merged_obj[["RNA"]] <- JoinLayers(merged_obj) -# Retrieve specific values from the metadata -pbmc$nCount_RNA -pbmc[[c('percent.mito', 'nFeature_RNA')]] - -# Add metadata, see ?AddMetaData -random_group_labels <- sample(x = c('g1', 'g2'), size = ncol(x = pbmc), replace = TRUE) -pbmc$groups <- random_group_labels ``` -```{r expression-matrices, eval=FALSE} -# Retrieve or set data in an expression matrix ('counts', 'data', and 'scale.data') -GetAssayData(object = pbmc, slot = 'counts') -pbmc <- SetAssayData(object = pbmc, slot = 'scale.data', new.data = new.data) -``` +# Pseudobulk analysis -```{r embeddings-loadings} -# Get cell embeddings and feature loadings -Embeddings(object = pbmc, reduction = 'pca') -Loadings(object = pbmc, reduction = 'pca') -Loadings(object = pbmc, reduction = 'pca', projected = TRUE) -``` +## Group cells together, based on multiple categories +See [DE vignette](de_vignette.html) for information on how to add the `donor_id` column to meta data. -```{r fetchdata} -# FetchData can pull anything from expression matrices, cell embeddings, or metadata -FetchData(object = pbmc, vars = c('PC_1', 'percent.mito', 'MS4A1')) +```{r, eval=FALSE} +# pseudobulk cells only by cell type +bulk <- AggregateExpression(ifnb,group.by ='seurat_annotations', return.seurat = TRUE) +Cells(bulk) + +# pseudobulk cells by stimulation condition AND cell type +bulk <- AggregateExpression(ifnb,group.by = c('stim','seurat_annotations'), return.seurat = TRUE) +Cells(bulk) + +# pseudobulk cells by stimulation condition AND cell type AND donor +bulk <- AggregateExpression(ifnb,group.by = c('stim','seurat_annotations',"donor_id"), return.seurat = TRUE) +Cells(bulk) ``` -# Visualization in Seurat +# Visualization in `Seurat` Seurat has a vast, ggplot2-based plotting library. All plotting functions will return a ggplot2 plot by default, allowing easy customization with ggplot2. ```{r visualization} -# Dimensional reduction plot for PCA or tSNE -DimPlot(object = pbmc, reduction = 'tsne') +# Dimensional reduction plot DimPlot(object = pbmc, reduction = 'pca') # Dimensional reduction plot, with cells colored by a quantitative feature +# Defaults to UMAP if available FeaturePlot(object = pbmc, features = "MS4A1") -# Scatter plot across single cells, replaces GenePlot +# Scatter plot across single cells FeatureScatter(object = pbmc, feature1 = "MS4A1", feature2 = "PC_1") FeatureScatter(object = pbmc, feature1 = "MS4A1", feature2 = "CD3D") @@ -180,18 +335,33 @@ VariableFeaturePlot(object = pbmc) #Violin and Ridge plots VlnPlot(object = pbmc, features = c("LYZ", "CCL5", "IL32")) RidgePlot(object = pbmc, feature = c("LYZ", "CCL5", "IL32")) +``` -# Heatmaps -DoHeatmap(object = pbmc,features = heatmap_markers) +```{r heatmaps} +# Heatmaps (visualize scale.data slot) DimHeatmap(object = pbmc,reduction = 'pca', cells = 200) +# standard workflow +pbmc <- ScaleData(pbmc,features = heatmap_markers) +DoHeatmap(object = pbmc,features = heatmap_markers) + +# sctransform workflow +pbmc <- GetResidual(pbmc,features = heatmap_markers) +DoHeatmap(object = pbmc,features = heatmap_markers) + +# heatmap with maximum of 100 cells per group +DoHeatmap(pbmc,heatmap_markers,cells = subset(pbmc,downsample = 100)) + +``` + +```{r newthings} # New things to try! # Note that plotting functions now return ggplot2 objects, so you can add themes, titles, and options onto them VlnPlot(object = pbmc, features = "MS4A1", split.by = "groups") DotPlot(object = pbmc, features = c("LYZ", "CCL5", "IL32"), split.by = "groups") FeaturePlot(object = pbmc, features = c("MS4A1", "CD79A"), blend = TRUE) DimPlot(object = pbmc) + DarkTheme() -DimPlot(object = pbmc) + labs(title = '2,700 PBMCs clustered using Seurat and viewed\non a two-dimensional tSNE') +DimPlot(object = pbmc) + labs(title = '2,700 PBMCs clustered using Seurat and viewed\non a two-dimensional UMAP') ``` Seurat provides many prebuilt themes that can be added to ggplot2 plots for quick customization @@ -245,36 +415,12 @@ FetchData(object = cbmc, vars = c('rna_CD3E', 'adt_CD3')) FeatureScatter(object = cbmc, feature1 = "rna_CD3E", feature2 = "adt_CD3") ``` -# Seurat v2.X vs v3.X - -| Seurat v2.X | Seurat v3.X | -| ----------- | ----------- | -| `object@data` | `GetAssayData(object = object)` | -| `object@raw.data` | `GetAssayData(object = object, slot = "counts")` | -| `object@scale.data` | `GetAssayData(object = object, slot = "scale.data")` | -| `object@cell.names` | `colnames(x = object)` | -| `rownames(x = object@data)` | `rownames(x = object)` | -| `object@var.genes` | `VariableFeatures(object = object)` | -| `object@hvg.info` | `HVFInfo(object = object)` | -| `object@assays$assay.name` | `object[["assay.name"]]` | -| `object@dr$pca` | `object[["pca"]]` | -| `GetCellEmbeddings(object = object, reduction.type = "pca")` | `Embeddings(object = object, reduction = "pca")` | -| `GetGeneLoadings(object = object, reduction.type = "pca")` | `Loadings(object = object, reduction = "pca")` | -| `AddMetaData(object = object, metadata = vector, col.name = "name")` | `object$name <- vector` | -| `object@meta.data$name` | `object$name` | -| `object@idents` | `Idents(object = object)` | -| `SetIdent(object = object, ident.use = "new.idents")` | `Idents(object = object) <- "new.idents"` | -| `SetIdent(object = object, cells.use = 1:10, ident.use = "new.idents")` | `Idents(object = object, cells = 1:10) <- "new.idents"` | -| `StashIdent(object = object, save.name = "saved.idents")` | `object$saved.idents <- Idents(object = object)` | -| `levels(x = object@idents)` | `levels(x = object)` | -| `RenameIdent(object = object, old.ident.name = "old.ident", new.ident.name = "new.ident")` | `RenameIdents(object = object, "old.ident" = "new.ident")` | -| `WhichCells(object = object, ident = "ident.keep")` | `WhichCells(object = object, idents = "ident.keep")` | -| `WhichCells(object = object, ident.remove = "ident.remove")` | `WhichCells(object = object, idents = "ident.remove", invert = TRUE)` | -| `WhichCells(object = object, max.cells.per.ident = 500)` | `WhichCells(object = object, downsample = 500)` | -| `WhichCells(object = object, subset.name = "name", low.threshold = low, high.threshold = high)` | `WhichCells(object = object, expression = name > low & name < high)` | -| `FilterCells(object = object, subset.names = "name", low.threshold = low, high.threshold = high)` | `subset(x = object, subset = name > low & name < high)` | -| `SubsetData(object = object, subset.name = "name", low.threshold = low, high.threshold = high)` | `subset(x = object, subset = name > low & name < high)` | -| `MergeSeurat(object1 = object1, object2 = object2)` | `merge(x = object1, y = object2)` | +# Additional resources + +Users who are particularly interested in some of the technical changes to data storage in Seurat v5 can explore the following resources: + +* [SeuratObject manual](https://mojaveazure.github.io/seurat-object/) +* [Seurat v5 and Assay5 introductory vignette ](seurat5_essential_commands.html)
**Session Info** diff --git a/vignettes/extensions.Rmd b/vignettes/extensions.Rmd index 787bdcdb6..a99215424 100644 --- a/vignettes/extensions.Rmd +++ b/vignettes/extensions.Rmd @@ -33,3 +33,9 @@ The SeuratDisk package introduces the h5Seurat file format for the storage and a Azimuth is a web application that uses an annotated reference dataset to automate the processing, analysis, and interpretation of a new single-cell RNA-seq experiment. Azimuth leverages a 'reference-based mapping' pipeline that inputs a counts matrix of gene expression in single cells, and performs normalization, visualization, cell annotation, and differential expression (biomarker discovery). All results can be explored within the app, and easily downloaded for additional downstream analysis. To use the Azimuth web app, visit the Azimuth website [here](https://azimuth.hubmapconsortium.org/). +# BPCells + +BPCells is an R package that allows for computationally efficient single-cell analysis. It utilizes bit-packing compression to store counts matrices on disk and C++ code to cache operations. BPCells is an R package that allows for computationally efficient single-cell analysis. It utilizes bit-packing compression to store counts matrices on disk and C++ code to cache operations. + +# presto +Presto performs a fast Wilcoxon rank sum test and auROC analysis. Seurat uses the presto package to perform fast differential expression. \ No newline at end of file diff --git a/vignettes/get_started_v5_new.Rmd b/vignettes/get_started_v5_new.Rmd new file mode 100644 index 000000000..652676190 --- /dev/null +++ b/vignettes/get_started_v5_new.Rmd @@ -0,0 +1,172 @@ +--- +title: "Getting Started with Seurat" +output: + html_document: + theme: united + df_print: kable + pdf_document: default +--- + +```{r fxns, include = FALSE} +library('htmlTable') +make_list <- function(items) { + paste0("
    ", sprintf('
  • %s
  • ', items), '
', collapse = '') +} +make_href <- function(url, text){ + paste0("") +} +make_href2 <- function(url, text){ + paste0("", text, "") +} +process_entry <- function(dat) { + if (grepl(pattern = "https://satijalab.org/img/vignette_images", x = dat$image)) { + img <- paste0('![](', dat$image, '){width=3000px}') + } else if (grepl(pattern = "assets/", x= dat$image)) { + img <- paste0('![](', dat$image, '){width=3000px}') + } else { + img <- paste0('![](', '../output/images/', dat$image, '){width=3000px}') + } + if (grepl(pattern = "https://satijalab.org/", x = dat$name)) { + link <- dat$name + } else { + link <- paste0(dat$name, ".html") + } + go.button <- paste0('GO') + data.frame( + title = make_href(url = link, text = dat$title), + img = img, + desc = dat$summary, + btn = go.button + ) +} +process_wrapper_entry <- function(dat) { + data.frame( + Package = dat$name, + Vignette = make_href2(url = dat$link, text = dat$title), + Reference = make_href2(url = dat$reference, text = dat$citation), + Source = make_href2(url = dat$source, text = dat$source) + ) +} +make_vignette_card_section <- function(vdat, cat) { + vignettes <- vdat[[cat]]$vignettes + dat <- data.frame(title = character(), img = character(), desc = character()) + for (v in 1:length(x = vignettes)) { + dat <- rbind(dat, process_entry(vignettes[[v]])) + if(nrow(x = dat) == 3 | v == length(x = vignettes)){ + colnames(dat) <- NULL + dat <- t(dat) + if (ncol(x = dat) == 2) { + print(htmlTable( + dat, + align = '|l|l|', + css.cell = "padding-left: .75em; width: 50%", + css.class = "two-column-htmltable" + )) + } else if (ncol(x = dat) == 1){ + print(htmlTable( + dat, + align = '|l|', + css.cell = "padding-left: .75em; width: 100%", + css.class = "one-column-htmltable" + )) + } else { + print(htmlTable( + dat, + align = '|l|l|l|', + css.cell = "padding-left: .75em; width: 30%" + )) + } + dat <- data.frame(title = character(), img = character(), desc = character()) + } + } +} +``` + +```{r yaml, include = FALSE} +library(yaml) +vdat <- read_yaml(file = "vignettes_v5_new.yaml") +``` + +```{=html} + +``` + +We provide a series of vignettes, tutorials, and analysis walkthroughs to help users get started with Seurat. You can also check out our [Reference page](../reference/index.html) which contains a full list of functions available to users. + +Our previous Get Started page for Seurat v4 is archived [here](get_started.html). + +# Introductory Vignettes + +For new users of Seurat, we suggest starting with a guided walk through of a dataset of 2,700 Peripheral Blood Mononuclear Cells (PBMCs) made publicly available by 10X Genomics. This tutorial implements the major components of a standard unsupervised clustering workflow including QC and data filtration, calculation of high-variance genes, dimensional reduction, graph-based clustering, and the identification of cluster markers. We provide additional vignettes introducing visualization techniques in Seurat, the sctransform normalization workflow, and storage/interaction with multimodal datasets. We also provide an 'essential commands cheatsheet' as a quick reference. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 1) +``` + +# scRNA Data Integration + +We have developed [computational methods](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8) for integrated analysis of single-cell datasets generated across different conditions, technologies, or species. As an example, we provide a guided walk through for integrating and comparing PBMC datasets generated under different stimulation conditions. We provide additional vignettes demonstrating how to leverage an annotated scRNA-seq reference to map and label cells from a query, and to efficiently integrate large datasets. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 2) +``` + +# Multi-assay data + +Seurat also offers support for a suite of statistical methods for analyzing multimodal single-cell data. These include methods to integrate modalities that are simultaneously measured in the same cells, modalities that are measured in different cells, and techniques to analyze pooled CRISPR screens. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 3) +``` + +# Flexible analysis of massively scalable datasets + +In Seurat v5, we introduce new infrastructure and methods to analyze, interpret, and explore datasets that extend to millions of cells. We introduce support for 'sketch-based' techniques, where a subset of representative cells are stored in memory to enable rapid and iterative exploration, while the remaining cells are stored on-disk. Users can flexibly switch between both data representations, and we leverage the [BPCells package](https://bnprks.github.io/BPCells/) from Ben Parks in the Greenleaf lab to enable high-performance analysis of disk-backed data. + +The vignettes below demonstrate three scalable analyses in Seurat v5: Unsupervised clustering analysis of a large dataset (1.3M neurons), Unsupervised integration and comparison of 1M PBMC from healthy and diabetic patients, and Supervised mapping of 1.5M immune cells from healthy and COVID donors. In all cases, the vignettes perform these analyses without ever loading the full datasets into memory. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 4) +``` + +# Spatial analysis + +These vignettes will help introduce users to the analysis of spatial datasets in Seurat v5, including technologies that leverage sequencing-based readouts, as well as technologies that leverage in-situ imaging-based readouts. The vignettes introduce data from multiple platforms including 10x Visium, SLIDE-seq, Vizgen MERSCOPE, 10x Xenium, Nanostring CosMx, and Akoya CODEX. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 5) +``` + +# Other + +Here we provide a series of short vignettes to demonstrate a number of features that are commonly used in Seurat. We've focused the vignettes around questions that we frequently receive from users. + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +make_vignette_card_section(vdat = vdat, cat = 6) +``` + +# SeuratWrappers + +In order to facilitate the use of community tools with Seurat, we provide the Seurat Wrappers package, which contains code to run other analysis tools on Seurat objects. For the initial release, we provide wrappers for a few packages in the table below but would encourage other package developers interested in interfacing with Seurat to check out our contributor guide [here](https://github.com/satijalab/seurat.wrappers/wiki/Submission-Process). + +```{r results='asis', echo=FALSE, warning=FALSE, message = FALSE} +library(knitr) +library(kableExtra) +cat <- 7 +vignettes <- vdat[[cat]]$vignettes +dat <- data.frame(Package = character(), Vignette = character(), Reference = character(), Source = character()) +for (v in 1:length(x = vignettes)) { + dat <- rbind(dat, process_wrapper_entry(vignettes[[v]])) +} +dat %>% + kable(format = "html", escape = FALSE) %>% + kable_styling(bootstrap_options = c("striped", "hover")) +``` \ No newline at end of file diff --git a/vignettes/hashing_vignette.Rmd b/vignettes/hashing_vignette.Rmd index af0ba6d2f..0a34ae283 100644 --- a/vignettes/hashing_vignette.Rmd +++ b/vignettes/hashing_vignette.Rmd @@ -72,11 +72,11 @@ Read in data ```{r read_Data} # Load in the UMI matrix -pbmc.umis <- readRDS("../data/pbmc_umi_mtx.rds") +pbmc.umis <- readRDS("/brahms/shared/vignette-data/pbmc_umi_mtx.rds") # For generating a hashtag count matrix from FASTQ files, please refer to https://github.com/Hoohm/CITE-seq-Count. # Load in the HTO count matrix -pbmc.htos <- readRDS("../data/pbmc_hto_mtx.rds") +pbmc.htos <- readRDS("/brahms/shared/vignette-data/pbmc_hto_mtx.rds") # Select cell barcodes detected by both RNA and HTO # In the example datasets we have already filtered the cells for you, but perform this step for clarity. @@ -94,7 +94,7 @@ Setup Seurat object and add in the HTO data ```{r hashtag_setup} # Setup Seurat object -pbmc.hashtag <- CreateSeuratObject(counts = pbmc.umis) +pbmc.hashtag <- CreateSeuratObject(counts = Matrix::Matrix(as.matrix(pbmc.umis),sparse = T)) # Normalize RNA data with log normalization pbmc.hashtag <- NormalizeData(pbmc.hashtag) @@ -154,7 +154,7 @@ Idents(pbmc.hashtag) <- 'HTO_classification.global' VlnPlot(pbmc.hashtag, features = 'nCount_RNA', pt.size = 0.1, log = TRUE) ``` -Generate a two dimensional tSNE embedding for HTOs.Here we are grouping cells by singlets and doublets for simplicity. +Generate a two dimensional tSNE embedding for HTOs. Here we are grouping cells by singlets and doublets for simplicity. ```{r hashtag_sub_tsne, fig.width=9} #First, we will remove negative cells from the object @@ -220,16 +220,16 @@ DimPlot(pbmc.singlet, group.by = "HTO_classification") ```{r hto_setup} # Read in UMI count matrix for RNA -hto12.umis <- readRDS("../data/hto12_umi_mtx.rds") +hto12.umis <- readRDS("/brahms/shared/vignette-data/hto12_umi_mtx.rds") # Read in HTO count matrix -hto12.htos <- readRDS("../data/hto12_hto_mtx.rds") +hto12.htos <- readRDS("/brahms/shared/vignette-data/hto12_hto_mtx.rds") # Select cell barcodes detected in both RNA and HTO cells.use <- intersect(rownames(hto12.htos), colnames(hto12.umis)) # Create Seurat object and add HTO data -hto12 <- CreateSeuratObject(counts = hto12.umis[, cells.use], min.features = 300) +hto12 <- CreateSeuratObject(counts = Matrix::Matrix(as.matrix(hto12.umis[, cells.use]), sparse = T), min.features = 300) hto12[['HTO']] <- CreateAssayObject(counts = t(x = hto12.htos[colnames(hto12), 1:12])) # Normalize data diff --git a/vignettes/install_v5.Rmd b/vignettes/install_v5.Rmd new file mode 100644 index 000000000..c9efee6a9 --- /dev/null +++ b/vignettes/install_v5.Rmd @@ -0,0 +1,87 @@ +--- +title: "Installation Instructions for Seurat" +output: html_document +--- + +To install Seurat, [R](https://www.r-project.org/) version 4.0 or greater is required. We also recommend installing [R Studio](https://www.rstudio.com/). + +# ![Seurat v5:](../output/images/SeuratV5.png){#id .class width=60 height=60} Seurat 5: Install from CRAN + +Seurat is available on [CRAN](https://cran.r-project.org/package=Seurat) for all platforms. To install, run: + +```{r, eval = FALSE} +# Enter commands in R (or R studio, if installed) +install.packages('Seurat') +library(Seurat) +``` + +Seurat does not require, but makes use of, packages developed by other labs that can substantially enhance speed and performance. These include [presto](https://github.com/immunogenomics/presto) (Korunsky/Raychaudhari labs), [BPCells](https://github.com/bnprks/BPCells) (Greenleaf Lab), and [glmGamPoi](https://github.com/const-ae/glmGamPoi) (Huber Lab). We recommend users install these along with users: + +```{r, eval = FALSE} +setRepositories(ind = 1:3, addURLs = c('https://satijalab.r-universe.dev', 'https://bnprks.r-universe.dev/')) +install.packages(c("BPCells", "presto", "glmGamPoi")) +``` + +We also recommend installing these additional packages, which are used in our vignettes, and enhance the functionality of Seurat: + +* [Signac](https://github.com/stuart-lab/signac): analysis of single-cell chromatin data +* [SeuratData](https://github.com/satijalab/seurat-data): automatically load datasets pre-packaged as Seurat objects +* [Azimuth](https://github.com/satijalab/azimuth): local annotation of scRNA-seq and scATAC-seq queries across multiple organs and tissues +* [SeuratWrappers](https://github.com/satijalab/seurat-wrappers): enables use of additional integration and differential expression methods + +```{r additional, eval=FALSE} +install.packages('Signac') +remotes::install_github("satijalab/seurat-data", quiet = TRUE) +remotes::install_github("satijalab/azimuth", quiet = TRUE) +remotes::install_github("satijalab/seurat-wrappers", quiet = TRUE) +``` + +# Install previous versions of Seurat + +## Install Seurat v4 + +Seurat v4.4.0 can be installed with the following command: + +```{r eval = FALSE} +install.packages('Seurat', repos = c('https://satijalab.r-universe.dev', 'https://cloud.r-project.org')) +``` + +## Older versions of Seurat +Old versions of Seurat, from Seurat v2.0.1 and up, are hosted in CRAN's archive. To install an old version of Seurat, run: + +```{r eval = FALSE} +# Install the remotes package +install.packages('remotes') +# Replace 'X.X.X' with your desired version +remotes::install_version(package = 'Seurat', version = package_version('X.X.X')) +``` + +For versions of Seurat older than those not hosted on CRAN (versions 1.3.0 and 1.4.0), please download the packaged source code from our [releases page](https://github.com/satijalab/seurat/releases) and [install from the tarball](https://stackoverflow.com/questions/4739837/how-do-i-install-an-r-package-from-the-source-tarball-on-windows). + +# Install the development version of Seurat + +Install the development version of Seurat - directly from [GitHub](https://github.com/satijalab/seurat/tree/develop). + +```{r eval = FALSE} +# Enter commands in R (or R studio, if installed) +# Install the remotes package +install.packages('remotes') +remotes::install_github(repo = 'satijalab/seurat', ref = 'develop') +library(Seurat) +``` + +# Docker + +We provide docker images for Seurat via [dockerhub](https://hub.docker.com/r/satijalab/seurat). + +To pull the latest image from the command line: + +```sh +docker pull satijalab/seurat:latest +``` + +To use as a base image in a new Dockerfile: + +```sh +FROM satijalab/seurat:latest +``` \ No newline at end of file diff --git a/vignettes/integration_introduction.Rmd b/vignettes/integration_introduction.Rmd index fe8c1e92f..8a814ebf7 100644 --- a/vignettes/integration_introduction.Rmd +++ b/vignettes/integration_introduction.Rmd @@ -33,25 +33,23 @@ knitr::opts_chunk$set( # Introduction to scRNA-seq integration -The joint analysis of two or more single-cell datasets poses unique challenges. In particular, identifying cell populations that are present across multiple datasets can be problematic under standard workflows. Seurat v4 includes a set of methods to match (or ‘align’) shared cell populations across datasets. These methods first identify cross-dataset pairs of cells that are in a matched biological state (‘anchors’), can be used both to correct for technical differences between datasets (i.e. batch effect correction), and to perform comparative scRNA-seq analysis of across experimental conditions. - -Below, we demonstrate methods for scRNA-seq integration as described in [Stuart\*, Butler\* et al, 2019](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8) to perform a comparative analysis of human immune cells (PBMC) in either a [resting or interferon-stimulated state](https://www.nature.com/articles/nbt.4042). +Integration of single-cell sequencing datasets, for example across experimental batches, donors, or conditions, is often an important step in scRNA-seq workflows. Integrative analysis can help to match shared cell types and states across datasets, which can boost statistical power, and most importantly, facilitate accurate comparative analysis across datasets. In previous versions of Seurat we introduced methods for integrative analysis, including our ‘anchor-based’ integration workflow. Many labs have also published powerful and pioneering methods, including [Harmony](https://portals.broadinstitute.org/harmony/) and [scVI](https://docs.scvi-tools.org/en/stable/index.html), for integrative analysis. Please see our [Integrating scRNA-seq data with multiple tools](https://satijalab.org/seurat/articles/seurat5_integration) vignette. -## Integration goals + +# Integration goals The following tutorial is designed to give you an overview of the kinds of comparative analyses on complex cell types that are possible using the Seurat integration procedure. Here, we address a few key goals: -* Create an 'integrated' data assay for downstream analysis -* Identify cell types that are present in both datasets +* Identify cell subpopulations that are present in both datasets * Obtain cell type markers that are conserved in both control and stimulated cells * Compare the datasets to find cell-type specific responses to stimulation -## Setup the Seurat objects +# Setup the Seurat objects For convenience, we distribute this dataset through our [SeuratData](https://github.com/satijalab/seurat-data) package. ```{r, include = FALSE} -options(SeuratData.repo.use = "http://satijalab04.nygenome.org") +options(SeuratData.repo.use = "http://seurat.nygenome.org") ``` ```{r data} @@ -65,197 +63,186 @@ library(patchwork) InstallData('ifnb') ``` +The object contains data from human PBMC from two conditions, interferon-stimulated and control cells (stored in the `stim` column in the object metadata). We will aim to integrate the two conditions together, so that we can jointly identify cell subpopulations across datasets, and then explore how each group differs across conditions + +In previous versions of Seurat, we would require the data to be represented as two different Seurat objects. In Seurat v5, we keep all the data in one object, but simply split it into multiple 'layers'. To learn more about layers, check out our [Seurat object interaction vignette](https://satijalab.org/seurat/articles/interaction_vignette). + ```{r init, results='hide', message=FALSE, fig.keep='none'} # load dataset ifnb <- LoadData('ifnb') +# split the RNA measurements into two layers +# one for control cells, one for stimulated cells -# split the dataset into a list of two seurat objects (stim and CTRL) -ifnb.list <- SplitObject(ifnb, split.by = "stim") - -# normalize and identify variable features for each dataset independently -ifnb.list <- lapply(X = ifnb.list, FUN = function(x) { - x <- NormalizeData(x) - x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000) -}) - -# select features that are repeatedly variable across datasets for integration -features <- SelectIntegrationFeatures(object.list = ifnb.list) +ifnb[["RNA"]] <- split(ifnb[["RNA"]], f = ifnb$stim) +ifnb ``` -## Perform integration +# Perform analysis without integration + +We can first analyze the dataset without integration. The resulting clusters are defined both by cell type and stimulation condition, which creates challenges for downstream analysis. -We then identify anchors using the `FindIntegrationAnchors()` function, which takes a list of Seurat objects as input, and use these anchors to integrate the two datasets together with `IntegrateData()`. +```{r} -```{r find.anchors} -immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list, anchor.features = features) +# run standard anlaysis workflow +ifnb <- NormalizeData(ifnb) +ifnb <- FindVariableFeatures(ifnb) +ifnb <- ScaleData(ifnb) +ifnb <- RunPCA(ifnb) ``` -```{r integrate.data} -# this command creates an 'integrated' data assay -immune.combined <- IntegrateData(anchorset = immune.anchors) +```{r unintegratedUMAP, fig.height=5} +ifnb <- FindNeighbors(ifnb, dims=1:30, reduction = 'pca') +ifnb <- FindClusters(ifnb, resolution = 2, cluster.name = "unintegrated_clusters") +ifnb <- RunUMAP(ifnb, dims = 1:30, reduction = 'pca', reduction.name = 'umap.unintegrated') +DimPlot(ifnb, reduction = 'umap.unintegrated', group.by=c('stim','seurat_clusters')) ``` -## Perform an integrated analysis +# Perform integration + +We now aim to integrate data from the two conditions, so that cells from the same cell type/subpopulation will cluster together. -Now we can run a single integrated analysis on all cells! +We often refer to this procedure as intergration/alignment. When aligning two genome sequences together, identification of shared/homologous regions can help to interpret differences between the sequences as well. Similarly for scRNA-seq integration, our goal is not to remove biological differences across conditions, but to learn shared cell types/states in an initial step - specifically because that will enable us to compare control stimulated and control profiles for these individual cell types. -```{r clustering, results='hide', message=FALSE} -# specify that we will perform downstream analysis on the corrected data -# note that the original unmodified data still resides in the 'RNA' assay -DefaultAssay(immune.combined) <- "integrated" +The Seurat v5 integration procedure aims to return a single dimensional reduction that captures the shared sources of variance across multiple layers, so that cells in a similar biological state will cluster. The method returns a dimensional reduction (i.e. `integrated.cca`) which can be used for visualization and unsupervised clustering analysis. For evaluating performance, we can use cell type labels that are pre-loaded in the `seurat_annotations` metadata column. -# Run the standard workflow for visualization and clustering -immune.combined <- ScaleData(immune.combined, verbose = FALSE) -immune.combined <- RunPCA(immune.combined, npcs = 30, verbose = FALSE) -immune.combined <- RunUMAP(immune.combined, reduction = "pca", dims = 1:30) -immune.combined <- FindNeighbors(immune.combined, reduction = "pca", dims = 1:30) -immune.combined <- FindClusters(immune.combined, resolution = 0.5) +```{r} +ifnb <- IntegrateLayers( + object = ifnb, method = CCAIntegration, + orig.reduction = "pca", new.reduction = 'integrated.cca', + verbose = FALSE) + +# re-join layers after integration +ifnb[["RNA"]] <- JoinLayers(ifnb[["RNA"]]) + +ifnb <- FindNeighbors(ifnb, reduction = "integrated.cca", dims = 1:30) +ifnb <- FindClusters(ifnb, resolution = 1) +ifnb <- RunUMAP(ifnb,dims = 1:30,reduction = 'integrated.cca') ``` ```{r viz, results='hide', message=FALSE} # Visualization -p1 <- DimPlot(immune.combined, reduction = "umap", group.by = "stim") -p2 <- DimPlot(immune.combined, reduction = "umap", label = TRUE, repel = TRUE) -p1 + p2 +DimPlot(ifnb, reduction = 'umap', group.by=c('stim','seurat_annotations')) + ``` To visualize the two conditions side-by-side, we can use the `split.by` argument to show each condition colored by cluster. ```{r split.dim} -DimPlot(immune.combined, reduction = "umap", split.by = "stim") +DimPlot(ifnb, reduction = "umap", split.by = "stim") ``` -## Identify conserved cell type markers +# Identify conserved cell type markers To identify canonical cell type marker genes that are conserved across conditions, we provide the `FindConservedMarkers()` function. This function performs differential gene expression testing for each dataset/group and combines the p-values using meta-analysis methods from the MetaDE R package. For example, we can calculated the genes that are conserved markers irrespective of stimulation condition in cluster 6 (NK cells). ```{r conserved.markers, warning=FALSE} -# For performing differential expression after integration, we switch back to the original data -DefaultAssay(immune.combined) <- "RNA" -nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 6, grouping.var = "stim", verbose = FALSE) +Idents(ifnb) <- "seurat_annotations" +nk.markers <- FindConservedMarkers(ifnb, ident.1 = "NK", grouping.var = "stim", verbose = FALSE) head(nk.markers) ``` -We can explore these marker genes for each cluster and use them to annotate our clusters as specific cell types. - -```{r annotate, results = 'hide', message=FALSE, fig.height = 8} -FeaturePlot(immune.combined, features = c("CD3D", "SELL", "CREM", "CD8A", "GNLY", "CD79A", "FCGR3A", "CCL2", "PPBP"), min.cutoff = "q9") -immune.combined <- RenameIdents(immune.combined, "0" = "CD14 Mono", "1" = "CD4 Naive T", "2" = "CD4 Memory T", "3" = "CD16 Mono", "4" = "B", "5" = "CD8 T", "6" = "NK" , "7" = "T activated", "8" = "DC", "9" = "B Activated", "10" = "Mk", "11" = "pDC", "12" = "Eryth", "13" = "Mono/Mk Doublets", "14" = "HSPC") -DimPlot(immune.combined, label = TRUE) -``` +You can perform these same analysis on the unsupervised clustering results (stored in `seurat_clusters`), and use these conserved markers to annotate cell types in your dataset. The `DotPlot()` function with the `split.by` parameter can be useful for viewing conserved cell type markers across conditions, showing both the expression level and the percentage of cells in a cluster expressing any given gene. Here we plot 2-3 strong marker genes for each of our 14 clusters. ```{r splitdotplot, fig.height = 10} -Idents(immune.combined) <- factor( - Idents(immune.combined), - levels = c("HSPC", "Mono/Mk Doublets", "pDC", "Eryth","Mk", "DC", "CD14 Mono", "CD16 Mono", "B Activated", "B", "CD8 T", "NK", "T activated", "CD4 Naive T", "CD4 Memory T")) + +#NEEDS TO BE FIXED AND SET ORDER CORRECTLY +Idents(ifnb) <- factor( + Idents(ifnb), + levels = c("pDC", "Eryth","Mk", "DC", "CD14 Mono", "CD16 Mono", "B Activated", "B", "CD8 T", "NK", "T activated", "CD4 Naive T", "CD4 Memory T")) + markers.to.plot <- c("CD3D","CREM","HSPH1","SELL","GIMAP5","CACYBP","GNLY","NKG7","CCL5","CD8A","MS4A1","CD79A","MIR155HG","NME1","FCGR3A","VMO1","CCL2","S100A9","HLA-DQA1","GPR183","PPBP","GNG11","HBA2","HBB","TSPAN13","IL3RA","IGJ","PRSS57") -DotPlot(immune.combined, features = markers.to.plot, cols = c('blue', 'red'), dot.scale = 8, split.by = "stim") + RotatedAxis() +DotPlot(ifnb, features = markers.to.plot, cols = c('blue', 'red'), dot.scale = 8, split.by = "stim") + RotatedAxis() ``` -```{r save.img, include = TRUE} -library(ggplot2) -plot <- DotPlot(immune.combined, features = markers.to.plot, cols = c('blue', 'red'), - dot.scale = 6, split.by = "stim") + RotatedAxis() -ggsave(filename = "../output/images/pbmc_alignment.jpg", height = 7, width = 12, plot = plot, quality = 50) -``` +# Identify differential expressed genes across conditions -### Identify differential expressed genes across conditions +Now that we've aligned the stimulated and control cells, we can start to do comparative analyses and look at the differences induced by stimulation. + +We can aggregate cells of a similar type and condition together to create "pseudobulk" profiles using the `AggregateExpression` command. As an initial exploratory analysis, we can compare pseudobulk profiles of two cell types (naive CD4 T cells, and CD14 monocytes), and compare their gene expression profiles before and after stimulation. We highlight genes that exhibit dramatic responses to interferon stimulation. As you can see, many of the same genes are upregulated in both of these cell types and likely represent a conserved interferon response pathway, though CD14 monocytes exhibit a stronger transcriptional response. -Now that we've aligned the stimulated and control cells, we can start to do comparative analyses and look at the differences induced by stimulation. One way to look broadly at these changes is to plot the average expression of both the stimulated and control cells and look for genes that are visual outliers on a scatter plot. Here, we take the average expression of both the stimulated and control naive T cells and CD14 monocyte populations and generate the scatter plots, highlighting genes that exhibit dramatic responses to interferon stimulation. ```{r scatterplots, results = 'hide', message=FALSE} library(ggplot2) library(cowplot) theme_set(theme_cowplot()) -t.cells <- subset(immune.combined, idents = "CD4 Naive T") -Idents(t.cells) <- "stim" -avg.t.cells <- as.data.frame(log1p(AverageExpression(t.cells, verbose = FALSE)$RNA)) -avg.t.cells$gene <- rownames(avg.t.cells) - -cd14.mono <- subset(immune.combined, idents = "CD14 Mono") -Idents(cd14.mono) <- "stim" -avg.cd14.mono <- as.data.frame(log1p(AverageExpression(cd14.mono, verbose = FALSE)$RNA)) -avg.cd14.mono$gene <- rownames(avg.cd14.mono) +aggregate_ifnb <- AggregateExpression(ifnb,group.by = c("seurat_annotations","stim"),return.seurat = TRUE) genes.to.label = c("ISG15", "LY6E", "IFI6", "ISG20", "MX1", "IFIT2", "IFIT1", "CXCL10", "CCL8") -p1 <- ggplot(avg.t.cells, aes(CTRL, STIM)) + geom_point() + ggtitle("CD4 Naive T Cells") -p1 <- LabelPoints(plot = p1, points = genes.to.label, repel = TRUE) -p2 <- ggplot(avg.cd14.mono, aes(CTRL, STIM)) + geom_point() + ggtitle("CD14 Monocytes") -p2 <- LabelPoints(plot = p2, points = genes.to.label, repel = TRUE) -p1 + p2 + +p1 <- CellScatter(aggregate_ifnb, "CD14 Mono_CTRL","CD14 Mono_STIM",highlight = genes.to.label) +p2 <- LabelPoints(plot = p1, points = genes.to.label, repel = TRUE) + +p3 <- CellScatter(aggregate_ifnb, "CD4 Naive T_CTRL","CD4 Naive T_STIM",highlight = genes.to.label) +p4 <- LabelPoints(plot = p3, points = genes.to.label, repel = TRUE) + +p2 + p4 ``` -As you can see, many of the same genes are upregulated in both of these cell types and likely represent a conserved interferon response pathway. -Because we are confident in having identified common cell types across condition, we can ask what genes change in different conditions for cells of the same type. First, we create a column in the meta.data slot to hold both the cell type and stimulation information and switch the current ident to that column. Then we use `FindMarkers()` to find the genes that are different between stimulated and control B cells. Notice that many of the top genes that show up here are the same as the ones we plotted earlier as core interferon response genes. Additionally, genes like CXCL10 which we saw were specific to monocyte and B cell interferon response show up as highly significant in this list as well. +We can now ask what genes change in different conditions for cells of the same type. First, we create a column in the meta.data slot to hold both the cell type and stimulation information and switch the current ident to that column. Then we use `FindMarkers()` to find the genes that are different between stimulated and control B cells. Notice that many of the top genes that show up here are the same as the ones we plotted earlier as core interferon response genes. Additionally, genes like CXCL10 which we saw were specific to monocyte and B cell interferon response show up as highly significant in this list as well. + +Please note that p-values obtained from this analysis should be interpreted with caution, as these tests treat each cell as an independent replicate, and ignore inherent correlations between cells originating from the same sample. As discussed [here](https://pubmed.ncbi.nlm.nih.gov/33257685/), DE tests across multiple conditions should expressly utilize multiple samples/replicates, and can be performed after aggregating ('pseudobulking') cells from the same sample and subpopulation together. We do not perform this analysis here, as there is a single replicate in the data, but please see our [vignette comparing healthy and diabetic samples](https://satijalab.org/seurat/articles/parsebio_sketch_integration) as an example for how to perform DE analysis across conditions. + ```{r de.genes} -immune.combined$celltype.stim <- paste(Idents(immune.combined), immune.combined$stim, sep = "_") -immune.combined$celltype <- Idents(immune.combined) -Idents(immune.combined) <- "celltype.stim" -b.interferon.response <- FindMarkers(immune.combined, ident.1 = "B_STIM", ident.2 = "B_CTRL", verbose = FALSE) +ifnb$celltype.stim <- paste(ifnb$seurat_annotations, ifnb$stim, sep = "_") +Idents(ifnb) <- "celltype.stim" +b.interferon.response <- FindMarkers(ifnb, ident.1 = "B_STIM", ident.2 = "B_CTRL", verbose = FALSE) head(b.interferon.response, n = 15) ``` + Another useful way to visualize these changes in gene expression is with the `split.by` option to the `FeaturePlot()` or `VlnPlot()` function. This will display FeaturePlots of the list of given genes, split by a grouping variable (stimulation condition here). Genes such as CD3D and GNLY are canonical cell type markers (for T cells and NK/CD8 T cells) that are virtually unaffected by interferon stimulation and display similar gene expression patterns in the control and stimulated group. IFI6 and ISG15, on the other hand, are core interferon response genes and are upregulated accordingly in all cell types. Finally, CD14 and CXCL10 are genes that show a cell type specific interferon response. CD14 expression decreases after stimulation in CD14 monocytes, which could lead to misclassification in a supervised analysis framework, underscoring the value of integrated analysis. CXCL10 shows a distinct upregulation in monocytes and B cells after interferon stimulation but not in other cell types. -```{r feature.heatmaps, fig.height = 14} -FeaturePlot(immune.combined, features = c("CD3D", "GNLY", "IFI6"), split.by = "stim", max.cutoff = 3, cols = c("grey", "red")) +```{r feature.heatmaps, fig.height = 10, fig.width=10} +FeaturePlot(ifnb, features = c("CD3D", "GNLY", "IFI6"), split.by = "stim", max.cutoff = 3, cols = c("grey", "red"), reduction='umap') ``` ```{r splitvln, fig.height = 12} -plots <- VlnPlot(immune.combined, features = c("LYZ", "ISG15", "CXCL10"), split.by = "stim", group.by = "celltype", pt.size = 0, combine = FALSE) +plots <- VlnPlot(ifnb, features = c("LYZ", "ISG15", "CXCL10"), split.by = "stim", group.by = "seurat_annotations", pt.size = 0, combine = FALSE) wrap_plots(plots = plots, ncol = 1) ``` ```{r save, include = TRUE} -saveRDS(immune.combined, file = "../output/immune.combined.rds") ``` -# Performing integration on datasets normalized with SCTransform +# Perform integration with SCTransform-normalized datasets -In [Hafemeister and Satija, 2019](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1), we introduced an improved method for the normalization of scRNA-seq, based on regularized negative binomial regression. The method is named 'sctransform', and avoids some of the pitfalls of standard normalization workflows, including the addition of a pseudocount, and log-transformation. You can read more about sctransform in the [manuscript](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1) or our [SCTransform vignette](sctransform_vignette.html). +As an alternative to log-normalization, Seurat also includes support for preprocessing of scRNA-seq using the [sctransform workflow](https://satijalab.org/seurat/articles/sctransform_vignette). The `IntegrateLayers` function also supports SCTransform-normalized data, by setting the `normalization.method` parameter, as shown below. -Below, we demonstrate how to modify the Seurat integration workflow for datasets that have been normalized with the sctransform workflow. The commands are largely similar, with a few key differences: +```{r sct, fig.height = 5} -* Normalize datasets individually by `SCTransform()`, instead of `NormalizeData()` prior to integration -* As discussed further in our [SCTransform vignette](sctransform_vignette.html), we typically use 3,000 or more features for analysis downstream of sctransform. -* Run the `PrepSCTIntegration()` function prior to identifying anchors -* When running `FindIntegrationAnchors()`, and `IntegrateData()`, set the `normalization.method` parameter to the value `SCT`. -* When running sctransform-based workflows, including integration, do not run the `ScaleData()` function +ifnb <- LoadData('ifnb') +# split datasets and process without integration +ifnb[["RNA"]] <- split(ifnb[["RNA"]], f=ifnb$stim) +ifnb <- SCTransform(ifnb) +ifnb <- RunPCA(ifnb) +ifnb <- RunUMAP(ifnb,dims = 1:30) +DimPlot(ifnb,reduction = 'umap',group.by = c("stim","seurat_annotations")) -```{r panc8.cca.sct.init, results='hide', message=FALSE, fig.keep='none'} -ifnb <- LoadData('ifnb') -ifnb.list <- SplitObject(ifnb, split.by = "stim") -ifnb.list <- lapply(X = ifnb.list, FUN = SCTransform) -features <- SelectIntegrationFeatures(object.list = ifnb.list, nfeatures = 3000) -ifnb.list <- PrepSCTIntegration(object.list = ifnb.list, anchor.features = features) -``` +# integrate datasets +ifnb <- IntegrateLayers(object = ifnb, + method = CCAIntegration, + normalization.method="SCT", + verbose = F) +ifnb <- FindNeighbors(ifnb,reduction = 'integrated.dr',dims = 1:30) +ifnb <- FindClusters(ifnb,resolution = 0.6) +ifnb <- RunUMAP(ifnb,dims = 1:30,reduction = 'integrated.dr') +DimPlot(ifnb,reduction = 'umap',group.by = c("stim","seurat_annotations")) -```{r ifnb.cca.sct.anchors} -immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list, normalization.method = 'SCT', anchor.features = features) -immune.combined.sct <- IntegrateData(anchorset = immune.anchors, normalization.method = 'SCT') -``` -```{r ifnb.cca.sct.clustering, results='hide', message=FALSE} -immune.combined.sct <- RunPCA(immune.combined.sct, verbose = FALSE) -immune.combined.sct <- RunUMAP(immune.combined.sct, reduction = "pca", dims = 1:30) -``` +# perform differential expression +ifnb <- PrepSCTFindMarkers(ifnb) +ifnb$celltype.stim <- paste(ifnb$seurat_annotations, ifnb$stim, sep = "_") +Idents(ifnb) <- 'celltype.stim' +b.interferon.response <- FindMarkers(ifnb, ident.1 = "B_STIM", ident.2 = "B_CTRL", verbose = FALSE) -```{r immunesca.cca.sct.split.dims} -p1 <- DimPlot(immune.combined.sct, reduction = "umap", group.by = "stim") -p2 <- DimPlot(immune.combined.sct, reduction = "umap", group.by = 'seurat_annotations',label = TRUE, repel = TRUE) -p1 + p2 ``` -Now that the datasets have been integrated, you can follow the previous steps in this vignette identify cell types and cell type-specific responses. - -```{r save.times, include = FALSE} -write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/integration_introduction.csv") ```
diff --git a/vignettes/integration_mapping.Rmd b/vignettes/integration_mapping.Rmd index 1d44a9fe2..9702c213c 100644 --- a/vignettes/integration_mapping.Rmd +++ b/vignettes/integration_mapping.Rmd @@ -7,7 +7,7 @@ output: pdf_document: default date: 'Compiled: `r Sys.Date()`' --- -*** + ```{r setup, include=FALSE} all_times <- list() # store the time for each chunk knitr::knit_hooks$set(time_it = local({ @@ -44,76 +44,44 @@ For the purposes of this example, we've chosen human pancreatic islet cell datas ```{r libraries, message=FALSE, warning=FALSE} library(Seurat) library(SeuratData) +library(ggplot2) ``` ```{r install.data, eval=FALSE} InstallData('panc8') ``` -To construct a reference, we will identify 'anchors' between the individual datasets. First, we split the combined object into a list, with each dataset as an element (this is only necessary because the data was bundled together for easy distribution). +As a demonstration, we will use a subset of technologies to construct a reference. We will then map the remaining datasets onto this reference. We start by selecting cells from four technologies, and performing an analysis without integration. ```{r preprocessing1} panc8 <- LoadData('panc8') -pancreas.list <- SplitObject(panc8, split.by = "tech") -pancreas.list <- pancreas.list[c("celseq", "celseq2", "fluidigmc1", "smartseq2")] -``` - -Prior to finding anchors, we perform standard preprocessing (log-normalization), and identify variable features individually for each. Note that Seurat implements an improved method for variable feature selection based on a variance stabilizing transformation (`"vst"`) - -```{r preprocessing3} -for (i in 1:length(pancreas.list)) { - pancreas.list[[i]] <- NormalizeData(pancreas.list[[i]], verbose = FALSE) - pancreas.list[[i]] <- FindVariableFeatures(pancreas.list[[i]], selection.method = "vst", - nfeatures = 2000, verbose = FALSE) -} -``` - -# Integration of 3 pancreatic islet cell datasets - -Next, we identify anchors using the `FindIntegrationAnchors()` function, which takes a list of Seurat objects as input. Here, we integrate three of the objects into a reference (we will use the fourth later in this vignette as a query dataset to demonstrate mapping). - -* We use all default parameters here for identifying anchors, including the 'dimensionality' of the dataset (30; feel free to try varying this parameter over a broad range, for example between 10 and 50). - -```{r integration.anchors, warning = FALSE, message = FALSE} -reference.list <- pancreas.list[c("celseq", "celseq2", "smartseq2")] -pancreas.anchors <- FindIntegrationAnchors(object.list = reference.list, dims = 1:30) -``` - -We then pass these anchors to the `IntegrateData()` function, which returns a Seurat object. - -* The returned object will contain a new `Assay`, which holds an integrated (or 'batch-corrected') expression matrix for all cells, enabling them to be jointly analyzed. - -```{r data.integration, warning = FALSE, message = FALSE} -pancreas.integrated <- IntegrateData(anchorset = pancreas.anchors, dims = 1:30) -``` - -After running `IntegrateData()`, the `Seurat` object will contain a new `Assay` with the integrated expression matrix. Note that the original (uncorrected values) are still stored in the object in the "RNA" assay, so you can switch back and forth. - -We can then use this new integrated matrix for downstream analysis and visualization. Here we scale the integrated data, run PCA, and visualize the results with UMAP. The integrated datasets cluster by cell type, instead of by technology. - -```{r analysis, message = FALSE, warning=FALSE, fig.width=10} -library(ggplot2) -library(cowplot) -library(patchwork) -#switch to integrated assay. The variable features of this assay are automatically -#set during IntegrateData -DefaultAssay(pancreas.integrated) <- 'integrated' -# Run the standard workflow for visualization and clustering -pancreas.integrated <- ScaleData(pancreas.integrated, verbose = FALSE) -pancreas.integrated <- RunPCA(pancreas.integrated, npcs = 30, verbose = FALSE) -pancreas.integrated <- RunUMAP(pancreas.integrated, reduction = "pca", dims = 1:30, - verbose = FALSE) -p1 <- DimPlot(pancreas.integrated, reduction = "umap", group.by = "tech") -p2 <- DimPlot(pancreas.integrated, reduction = "umap", group.by = "celltype", - label = TRUE, repel = TRUE) + NoLegend() -p1 + p2 -``` - -```{r save.img, include = TRUE} -plot <- DimPlot(pancreas.integrated, reduction = "umap", label = TRUE, label.size = 4.5) + xlab("UMAP 1") + ylab("UMAP 2") + - theme(axis.title = element_text(size = 18), legend.text = element_text(size = 18)) + - guides(colour = guide_legend(override.aes = list(size = 10))) -ggsave(filename = "pancreas_integrated_umap.jpg", height = 7, width = 12, plot = plot, quality = 50) +table(panc8$tech) + +# we will use data from 2 technologies for the reference +pancreas.ref <- subset(panc8, tech %in% c("celseq2", "smartseq2")) +pancreas.ref[["RNA"]] <- split(pancreas.ref[["RNA"]], f = pancreas.ref$tech) + +# pre-process dataset (without integration) +pancreas.ref <- NormalizeData(pancreas.ref) +pancreas.ref <- FindVariableFeatures(pancreas.ref) +pancreas.ref <- ScaleData(pancreas.ref) +pancreas.ref <- RunPCA(pancreas.ref) +pancreas.ref <- FindNeighbors(pancreas.ref, dims=1:30) +pancreas.ref <- FindClusters(pancreas.ref) +pancreas.ref <- RunUMAP(pancreas.ref, dims = 1:30) +DimPlot(pancreas.ref,group.by = c("celltytpe","tech")) +``` + +Next, we integrate the datasets into a shared reference. Please see our [introduction to integration vignette](https://satijalab.org/seurat/articles/integration_introduction) +```{r preprocessing3, fig.height = 4} +pancreas.ref <- IntegrateLayers( + object = pancreas.ref, method = CCAIntegration, + orig.reduction = "pca", new.reduction = 'integrated.cca', + verbose = FALSE) +pancreas.ref <- FindNeighbors(pancreas.ref,reduction='integrated.cca',dims=1:30) +pancreas.ref <- FindClusters(pancreas.ref) +pancreas.ref <- RunUMAP(pancreas.ref,reduction='integrated.cca',dims=1:30) +DimPlot(pancreas.ref,group.by = c("tech","celltype")) ``` # Cell type classification using an integrated reference @@ -126,9 +94,12 @@ Seurat also supports the projection of reference data (or meta data) onto a quer After finding anchors, we use the `TransferData()` function to classify the query cells based on reference data (a vector of reference cell type labels). `TransferData()` returns a matrix with predicted IDs and prediction scores, which we can add to the query metadata. ```{r label.transfer, warning = FALSE, message = FALSE} -pancreas.query <- pancreas.list[["fluidigmc1"]] -pancreas.anchors <- FindTransferAnchors(reference = pancreas.integrated, query = pancreas.query, dims = 1:30, reference.reduction = "pca") -predictions <- TransferData(anchorset = pancreas.anchors, refdata = pancreas.integrated$celltype, dims = 1:30) + +# select two technologies for the query datasets +pancreas.query <- subset(panc8, tech %in% c("fluidigmc1","celseq")) +pancreas.query <- NormalizeData(pancreas.query) +pancreas.anchors <- FindTransferAnchors(reference = pancreas.ref, query = pancreas.query, dims = 1:30, reference.reduction="pca") +predictions <- TransferData(anchorset = pancreas.anchors, refdata = pancreas.ref$celltype, dims = 1:30) pancreas.query <- AddMetaData(pancreas.query, metadata = predictions) ``` @@ -148,16 +119,16 @@ VlnPlot(pancreas.query, c("REG1A", "PPY", "SST", "GHRL", "VWF", "SOX10"), group. # Unimodal UMAP Projection -In Seurat v4, we also enable projection of a query onto the reference UMAP structure. This can be achieved by computing the reference UMAP model and then calling `MapQuery()` instead of `TransferData()`. +We also enable projection of a query onto the reference UMAP structure. This can be achieved by computing the reference UMAP model and then calling `MapQuery()` instead of `TransferData()`. ```{r label.transfer.v4, warning = FALSE, message = FALSE} -pancreas.integrated <- RunUMAP(pancreas.integrated, dims = 1:30, reduction = "pca", return.model = TRUE) +pancreas.ref <- RunUMAP(pancreas.ref, dims = 1:30, reduction = "integrated.cca", return.model = TRUE) pancreas.query <- MapQuery( anchorset = pancreas.anchors, - reference = pancreas.integrated, + reference = pancreas.ref, query = pancreas.query, refdata = list(celltype = 'celltype'), - reference.reduction = 'pca', + reference.reduction = "pca", reduction.model = 'umap' ) ``` @@ -170,20 +141,20 @@ pancreas.query <- MapQuery( ```{r, eval=FALSE} pancreas.query <- TransferData( anchorset = pancreas.anchors, - reference = pancreas.integrated, + reference = pancreas.ref, query = pancreas.query, refdata = list(celltype = "celltype") ) pancreas.query <- IntegrateEmbeddings( anchorset = pancreas.anchors, - reference = pancreas.integrated, + reference = pancreas.ref, query = pancreas.query, new.reduction.name = "ref.pca" ) pancreas.query <- ProjectUMAP( query = pancreas.query, query.reduction = "ref.pca", - reference = pancreas.integrated, + reference = pancreas.ref, reference.reduction = "pca", reduction.model = "umap" ) @@ -193,7 +164,7 @@ pancreas.query <- ProjectUMAP( We can now visualize the query cells alongside our reference. ```{r panc.refdimplots, fig.width=10} -p1 <- DimPlot(pancreas.integrated, reduction = "umap", group.by = "celltype", label = TRUE, +p1 <- DimPlot(pancreas.ref, reduction = "umap", group.by = "celltype", label = TRUE, label.size = 3 ,repel = TRUE) + NoLegend() + ggtitle("Reference annotations") p2 <- DimPlot(pancreas.query, reduction = "ref.umap", group.by = "predicted.celltype", label = TRUE, label.size = 3 ,repel = TRUE) + NoLegend() + ggtitle("Query transferred labels") diff --git a/vignettes/mixscape_vignette.Rmd b/vignettes/mixscape_vignette.Rmd index fdbc622e3..18bd24863 100644 --- a/vignettes/mixscape_vignette.Rmd +++ b/vignettes/mixscape_vignette.Rmd @@ -26,12 +26,12 @@ knitr::opts_chunk$set( time_it = TRUE, error = TRUE ) -options(SeuratData.repo.use = 'satijalab04.nygenome.org') +options(SeuratData.repo.use = 'seurat.nygenome.org') ``` # Overview -This tutorial demonstrates how to use mixscape for the analyses of single-cell pooled CRSIPR screens. We introduce new Seurat functions for: +This tutorial demonstrates how to use mixscape for the analyses of single-cell pooled CRISPR screens. We introduce new Seurat functions for: 1. Calculating the perturbation-specific signature of every cell. 2. Identifying and removing cells that have 'escaped' CRISPR perturbation. @@ -304,9 +304,11 @@ VlnPlot( ```{r save.img, include=TRUE} p <- VlnPlot(object = eccite, features = "adt_PDL1", idents = c("NT","JAK2","STAT1","IFNGR1","IFNGR2", "IRF1"), group.by = "gene", pt.size = 0.2, sort = T, split.by = "mixscape_class.global", cols = c("coral3","grey79","grey39")) +ggtitle("PD-L1 protein") +theme(axis.text.x = element_text(angle = 0, hjust = 0.5)) -ggsave(filename = "../output/images/mixscape_vignette.jpg", height = 7, width = 12, plot = p, quality = 50) ``` +```{r, include=FALSE} +ggsave(filename = "../output/images/mixscape_vignette.jpg", height = 7, width = 12, plot = p, quality = 50) +``` # Visualizing perturbation responses with Linear Discriminant Analysis (LDA) We use LDA as a dimensionality reduction method to visualize perturbation-specific clusters. LDA is trying to maximize the separability of known labels (mixscape classes) using both gene expression and the labels as input. @@ -361,7 +363,7 @@ p2 <- p+ p2 ``` -```{r save.times, include = TRUE} +```{r save.times, include = FALSE} write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/mixscape_vignette_times.csv") ``` diff --git a/vignettes/multimodal_reference_mapping.Rmd b/vignettes/multimodal_reference_mapping.Rmd index f7928bb42..0decf0ee0 100644 --- a/vignettes/multimodal_reference_mapping.Rmd +++ b/vignettes/multimodal_reference_mapping.Rmd @@ -42,30 +42,24 @@ In this vignette, we demonstrate how to use a previously established reference t To run this vignette please install Seurat v4, available on CRAN. Additionally, you will need to install the `SeuratDisk` package. -```{r install, eval = FALSE} -install.packages("Seurat") -remotes::install_github("mojaveazure/seurat-disk") -``` - ```{r packages, cache=FALSE} library(Seurat) -library(SeuratDisk) library(ggplot2) library(patchwork) ``` ```{r, include = TRUE, cache=FALSE} -options(SeuratData.repo.use = "http://satijalab04.nygenome.org") +options(SeuratData.repo.use = "http://seurat.nygenome.org") ``` # Example 1: Mapping human peripheral blood cells ## A Multimodal PBMC Reference Dataset -We load the reference (download [here](https://atlas.fredhutch.org/data/nygc/multimodal/pbmc_multimodal.h5seurat)) from our recent [paper](https://doi.org/10.1016/j.cell.2021.04.048), and visualize the pre-computed UMAP. This reference is stored as an h5Seurat file, a format that enables on-disk storage of multimodal Seurat objects (more details on h5Seurat and `SeuratDisk` can be found [here](https://mojaveazure.github.io/seurat-disk/index.html)). +We load the reference from our recent [paper](https://doi.org/10.1016/j.cell.2021.04.048), and visualize the pre-computed UMAP. ```{r pbmc.ref} -reference <- readRDS("../data/pbmc_multimodal_2023.rds") +reference <- readRDS("/brahms/hartmana/vignette_data/pbmc_multimodal_2023.rds") ``` ```{r ref.dimplot} @@ -79,6 +73,9 @@ To demonstrate mapping to this multimodal reference, we will use a dataset of 2, ```{r 3k.load} library(SeuratData) InstallData('pbmc3k') + +pbmc3k <- LoadData('pbmc3k') +pbmc3k <- UpdateSeuratObject(pbmc3k) ``` The reference was normalized using `SCTransform()`, so we use the same approach to normalize the query here. @@ -284,8 +281,8 @@ If you want to save and load a cached index for a `Neighbor` object generated wi ```{r neighbor.demo} bm[["spca.annoy.neighbors"]] -SaveAnnoyIndex(object = bm[["spca.annoy.neighbors"]], file = "../data/reftmp.idx") -bm[["spca.annoy.neighbors"]] <- LoadAnnoyIndex(object = bm[["spca.annoy.neighbors"]], file = "../data/reftmp.idx") +SaveAnnoyIndex(object = bm[["spca.annoy.neighbors"]], file = "/brahms/shared/vignette-data/reftmp.idx") +bm[["spca.annoy.neighbors"]] <- LoadAnnoyIndex(object = bm[["spca.annoy.neighbors"]], file = "/brahms/shared/vignette-data/reftmp.idx") ```
@@ -378,7 +375,7 @@ p5 <- FeaturePlot(hcabm40k, features = c("CD45RA", "CD16", "CD161"), reduction = p3 / p4 / p5 ``` -```{r save.times, include = TRUE} +```{r save.times, include = FALSE} write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/reference_mapping_times.csv") ``` diff --git a/vignettes/multimodal_vignette.Rmd b/vignettes/multimodal_vignette.Rmd index 98fdd1c21..7f9193618 100644 --- a/vignettes/multimodal_vignette.Rmd +++ b/vignettes/multimodal_vignette.Rmd @@ -7,7 +7,6 @@ output: pdf_document: default date: 'Compiled: `r Sys.Date()`' --- -*** ```{r setup, include=FALSE} all_times <- list() # store the time for each chunk @@ -28,16 +27,16 @@ knitr::opts_chunk$set( message = FALSE, warning = FALSE, time_it = TRUE, - fig.width = 10, + fig.width = 9, error = TRUE ) ``` # Load in the data -The ability to make simultaneous measurements of multiple data types from the same cell, known as multimodal analysis, represents a new and exciting frontier for single-cell genomics. For example, [CITE-seq](http://www.nature.com/nmeth/journal/v14/n9/full/nmeth.4380.html) enables the simultaneous measurements of transcriptomes and cell-surface proteins from the same cell. Other exciting multimodal technologies, such as the [10x multiome kit](https://www.10xgenomics.com/products/single-cell-multiome-atac-plus-gene-expression) allow for the paired measurements of cellular transcriptome and chromatin accessibility (i.e scRNA-seq+scATAC-seq). Other modalities that can be measured alongside cellular transcriptomes include genetic perturbations, cellular methylomes, and hashtag oligos from [Cell Hashing](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1603-1). We have designed Seurat4 to enable for the seamless storage, analysis, and exploration of diverse multimodal single-cell datasets. +The ability to make simultaneous measurements of multiple data types from the same cell, known as multimodal analysis, represents a new and exciting frontier for single-cell genomics. For example, [CITE-seq](http://www.nature.com/nmeth/journal/v14/n9/full/nmeth.4380.html) enables the simultaneous measurements of transcriptomes and cell-surface proteins from the same cell. Other exciting multimodal technologies, such as the [10x multiome kit](https://www.10xgenomics.com/products/single-cell-multiome-atac-plus-gene-expression) allow for the paired measurements of cellular transcriptome and chromatin accessibility (i.e scRNA-seq+scATAC-seq). Other modalities that can be measured alongside cellular transcriptomes include genetic perturbations, cellular methylomes, and hashtag oligos from [Cell Hashing](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1603-1). We have designed Seurat to enable for the seamless storage, analysis, and exploration of diverse multimodal single-cell datasets. -In this vignette, we present an introductory workflow for creating a multimodal Seurat object and performing an initial analysis. For example, we demonstrate how to cluster a CITE-seq dataset on the basis of the measured cellular transcriptomes, and subsequently discover cell surface proteins that are enriched in each cluster. We note that Seurat4 also enables more advanced techniques for the analysis of multimodal data, in particular the application of our [Weighted Nearest Neighbors (WNN) approach](https://doi.org/10.1016/j.cell.2021.04.048) that enables simultaneous clustering of cells based on a weighted combination of both modalities, and you can explore this functionality [here](weighted_nearest_neighbor_analysis.html). +In this vignette, we present an introductory workflow for creating a multimodal Seurat object and performing an initial analysis. For example, we demonstrate how to cluster a CITE-seq dataset on the basis of the measured cellular transcriptomes, and subsequently discover cell surface proteins that are enriched in each cluster. We note that Seurat also enables more advanced techniques for the analysis of multimodal data, in particular the application of our [Weighted Nearest Neighbors (WNN) approach](https://doi.org/10.1016/j.cell.2021.04.048) that enables simultaneous clustering of cells based on a weighted combination of both modalities, and you can explore this functionality [here](weighted_nearest_neighbor_analysis.html). Here, we analyze a dataset of 8,617 cord blood mononuclear cells (CBMCs), where transcriptomic measurements are paired with abundance estimates for 11 surface proteins, whose levels are quantified with DNA-barcoded antibodies. First, we load in two count matrices : one for the RNA measurements, and one for the antibody-derived tags (ADT). You can download the ADT file [here](ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100866/suppl/GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz) and the RNA file [here](ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100866/suppl/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz) @@ -51,13 +50,13 @@ library(patchwork) # Load in the RNA UMI matrix # Note that this dataset also contains ~5% of mouse cells, which we can use as negative controls for the protein measurements. For this reason, the gene expression matrix has HUMAN_ or MOUSE_ appended to the beginning of each gene. -cbmc.rna <- as.sparse(read.csv(file = '../data/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz', sep = ',', header = TRUE, row.names = 1)) +cbmc.rna <- as.sparse(read.csv(file = '/brahms/shared/vignette-data/GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz', sep = ',', header = TRUE, row.names = 1)) # To make life a bit easier going forward, we're going to discard all but the top 100 most highly expressed mouse genes, and remove the "HUMAN_" from the CITE-seq prefix cbmc.rna <- CollapseSpeciesExpressionMatrix(cbmc.rna) # Load in the ADT UMI matrix -cbmc.adt <- as.sparse(read.csv(file = '../data/GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz', sep = ',', header = TRUE, row.names = 1)) +cbmc.adt <- as.sparse(read.csv(file = '/brahms/shared/vignette-data/GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz', sep = ',', header = TRUE, row.names = 1)) # Note that since measurements were made in the same cells, the two matrices have identical column names all.equal(colnames(cbmc.rna),colnames(cbmc.adt)) @@ -75,7 +74,7 @@ cbmc <- CreateSeuratObject(counts = cbmc.rna) Assays(cbmc) # create a new assay to store ADT information -adt_assay <- CreateAssayObject(counts = cbmc.adt) +adt_assay <- CreateAssay5Object(counts = cbmc.adt) # add this assay to the previously created Seurat object cbmc[["ADT"]] <- adt_assay @@ -187,7 +186,7 @@ FeatureScatter(cbmc, feature1 = 'adt_CD4', feature2 = 'adt_CD8', slot = 'counts' Seurat is also able to analyze data from multimodal 10X experiments processed using CellRanger v3; as an example, we recreate the plots above using a dataset of 7,900 peripheral blood mononuclear cells (PBMC), freely available from 10X Genomics [here](https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.0/pbmc_10k_protein_v3). ```{r pbmc10x, fig.height=4.5, fig.width=10} -pbmc10k.data <- Read10X(data.dir = '../data/pbmc10k/filtered_feature_bc_matrix/') +pbmc10k.data <- Read10X(data.dir = '/brahms/shared/vignette-data/pbmc10k/filtered_feature_bc_matrix/') rownames(x = pbmc10k.data[['Antibody Capture']]) <- gsub( pattern = '_[control_]*TotalSeqB', replacement = '', @@ -211,7 +210,7 @@ plot <- FeatureScatter(cbmc, feature1 = "adt_CD19", feature2 = "adt_CD3") + NoLe ggsave(filename = "../output/images/citeseq_plot.jpg", height = 7, width = 12, plot = plot, quality = 50) ``` -```{r save.times, include = TRUE} +```{r save.times, include = FALSE} write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/multimodal_vignette_times.csv") ``` @@ -220,11 +219,12 @@ write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/multimodal_ Seurat v4 also includes additional functionality for the analysis, visualization, and integration of multimodal datasets. For more information, please explore the resources below: * Defining cellular identity from multimodal data using WNN analysis in Seurat v4 [vignette](weighted_nearest_neighbor_analysis.html) -* Mapping scRNA-seq data onto CITE-seq references [[vignette](reference_mapping.html)] -* Introduction to the analysis of spatial transcriptomics analysis [[vignette](spatial_vignette.html)] -* Analysis of 10x multiome (paired scRNA-seq + ATAC) using WNN analysis [[vignette](weighted_nearest_neighbor_analysis.html)] -* Signac: Analysis, interpretation, and exploration of single-cell chromatin datasets [[package](https://satijalab.org/signac/)] -* Mixscape: an analytical toolkit for pooled single-cell genetic screens [[vignette](mixscape_vignette.html)] +* Mapping scRNA-seq data onto CITE-seq references [vignette](multimodal_reference_mapping.html) +* Introduction to the analysis of spatial transcriptomics analysis [vignette](spatial_vignette.html) +* Analysis of 10x multiome (paired scRNA-seq + ATAC) using WNN analysis [vignette](weighted_nearest_neighbor_analysis.html) +* Signac: Analysis, interpretation, and exploration of single-cell chromatin datasets [package](https://satijalab.org/signac/) +* Mixscape: an analytical toolkit for pooled single-cell genetic screens [vignette](mixscape_vignette.html) +* Bridge integration: mapping multi-omic datasets across molecular modalities [vignette](https://satijalab.org/seurat/articles/seurat5_integration_bridge)
**Session Info** diff --git a/vignettes/pbmc3k_tutorial.Rmd b/vignettes/pbmc3k_tutorial.Rmd index 1998ad9fb..777303216 100644 --- a/vignettes/pbmc3k_tutorial.Rmd +++ b/vignettes/pbmc3k_tutorial.Rmd @@ -6,7 +6,6 @@ output: df_print: kable date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' --- -*** ```{r setup, include=FALSE} all_times <- list() # store the time for each chunk @@ -35,9 +34,9 @@ knitr::opts_chunk$set( For this tutorial, we will be analyzing the a dataset of Peripheral Blood Mononuclear Cells (PBMC) freely available from 10X Genomics. There are 2,700 single cells that were sequenced on the Illumina NextSeq 500. The raw data can be found [here](https://cf.10xgenomics.com/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz). -We start by reading in the data. The `Read10X()` function reads in the output of the [cellranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) pipeline from 10X, returning a unique molecular identified (UMI) count matrix. The values in this matrix represent the number of molecules for each feature (i.e. gene; row) that are detected in each cell (column). +We start by reading in the data. The `Read10X()` function reads in the output of the [cellranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) pipeline from 10X, returning a unique molecular identified (UMI) count matrix. The values in this matrix represent the number of molecules for each feature (i.e. gene; row) that are detected in each cell (column). Note that more recent versions of cellranger now also output using the [h5 file format](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/h5_matrices), which can be read in using the `Read10X_h5()` function in Seurat. -We next use the count matrix to create a `Seurat` object. The object serves as a container that contains both data (like the count matrix) and analysis (like PCA, or clustering results) for a single-cell dataset. For a technical discussion of the `Seurat` object structure, check out our [GitHub Wiki](https://github.com/satijalab/seurat/wiki). For example, the count matrix is stored in `pbmc[["RNA"]]@counts`. +We next use the count matrix to create a `Seurat` object. The object serves as a container that contains both data (like the count matrix) and analysis (like PCA, or clustering results) for a single-cell dataset. For more information, check out our [Seurat object interaction vignette], or our [GitHub Wiki](https://github.com/satijalab/seurat/wiki). For example, in Seurat v5, the count matrix is stored in `pbmc[["RNA"]]$counts`. ```{r init} library(dplyr) @@ -45,7 +44,7 @@ library(Seurat) library(patchwork) # Load the PBMC dataset -pbmc.data <- Read10X(data.dir = "../data/pbmc3k/filtered_gene_bc_matrices/hg19/") +pbmc.data <- Read10X(data.dir = "/brahms/mollag/practice/filtered_gene_bc_matrices/hg19/") # Initialize the Seurat object with the raw (non-normalized data). pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200) pbmc @@ -87,7 +86,7 @@ Seurat allows you to easily explore QC metrics and filter cells based on any use + We calculate mitochondrial QC metrics with the `PercentageFeatureSet()` function, which calculates the percentage of counts originating from a set of features + We use the set of all genes starting with `MT-` as a set of mitochondrial genes -```{r mito, fig.height=7, fig.width=13} +```{r mito} # The [[ operator can add columns to object metadata. This is a great place to stash QC stats pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-") ``` @@ -97,7 +96,7 @@ pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-") * The number of unique genes and total molecules are automatically calculated during `CreateSeuratObject()` + You can find them stored in the object meta data -```{r qc, fig.height=7, fig.width=13} +```{r qc} # Show QC metrics for the first 5 cells head(pbmc@meta.data, 5) ``` @@ -127,7 +126,9 @@ pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent # Normalizing the data -After removing unwanted cells from the dataset, the next step is to normalize the data. By default, we employ a global-scaling normalization method "LogNormalize" that normalizes the feature expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result. Normalized values are stored in `pbmc[["RNA"]]@data`. +After removing unwanted cells from the dataset, the next step is to normalize the data. By default, we employ a global-scaling normalization method "LogNormalize" that normalizes the feature expression measurements for each cell by the total expression, multiplies this by a scale factor (10,000 by default), and log-transforms the result. In Seurat v5, Normalized values are stored in `pbmc[["RNA"]]$data`. + + ```{r normalize} pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 1e4) @@ -138,13 +139,16 @@ For clarity, in this previous line of code (and in future commands), we provide pbmc <- NormalizeData(pbmc) ``` +While this method of normalization is standard and widely used in scRNA-seq analysis, global-scaling relies on an assumption that each cell originally contains the same number of RNA molecules. We and others have developed alternative workflows for the single cell preprocessing that do not make these assumptions. For users who are interested, please check out our `SCTransform()` normalization workflow. The method is described in our[paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02584-9), with a separate vignette using Seurat [here](https://satijalab.org/seurat/articles/sctransform_vignette.html). The use of `SCTransform` replaces the need to run `NormalizeData`, `FindVariableFeatures`, or `ScaleData` (described below.) + + # Identification of highly variable features (feature selection) We next calculate a subset of features that exhibit high cell-to-cell variation in the dataset (i.e, they are highly expressed in some cells, and lowly expressed in others). We and [others](https://www.nature.com/articles/nmeth.2645) have found that focusing on these genes in downstream analysis helps to highlight biological signal in single-cell datasets. Our procedure in Seurat is described in detail [here](https://doi.org/10.1016/j.cell.2019.05.031), and improves on previous versions by directly modeling the mean-variance relationship inherent in single-cell data, and is implemented in the `FindVariableFeatures()` function. By default, we return 2,000 features per dataset. These will be used in downstream analysis, like PCA. -```{r var_features, fig.height=5, fig.width=11} +```{r var_features, fig.height=5, fig.width=10} pbmc <- FindVariableFeatures(pbmc, selection.method = 'vst', nfeatures = 2000) # Identify the 10 most highly variable genes @@ -165,38 +169,32 @@ Next, we apply a linear transformation ('scaling') that is a standard pre-proces * Shifts the expression of each gene, so that the mean expression across cells is 0 * Scales the expression of each gene, so that the variance across cells is 1 + This step gives equal weight in downstream analyses, so that highly-expressed genes do not dominate -* The results of this are stored in `pbmc[["RNA"]]@scale.data` +* The results of this are stored in `pbmc[["RNA"]]$scale.data` +* By default, only variable features are scaled. +* You can specify the `features` argument to scale additional features -```{r regress, fig.height=7, fig.width=11, results='hide'} +```{r regress, results='hide'} all.genes <- rownames(pbmc) pbmc <- ScaleData(pbmc, features = all.genes) ``` -
- **This step takes too long! Can I make it faster?** -Scaling is an essential step in the Seurat workflow, but only on genes that will be used as input to PCA. Therefore, the default in `ScaleData()` is only to perform scaling on the previously identified variable features (2,000 by default). To do this, omit the `features` argument in the previous function call, i.e. -```{r regressvar, fig.height=7, fig.width=11, results='hide',eval = FALSE} -pbmc <- ScaleData(pbmc) -``` -Your PCA and clustering results will be unaffected. However, Seurat heatmaps (produced as shown below with `DoHeatmap()`) require genes in the heatmap to be scaled, to make sure highly-expressed genes don't dominate the heatmap. To make sure we don't leave any genes out of the heatmap later, we are scaling all genes in this tutorial. -
-\
- **How can I remove unwanted sources of variation, as in Seurat v2?** + **How can I remove unwanted sources of variation** -In `Seurat v2` we also use the `ScaleData()` function to remove unwanted sources of variation from a single-cell dataset. For example, we could 'regress out' heterogeneity associated with (for example) cell cycle stage, or mitochondrial contamination. These features are still supported in `ScaleData()` in `Seurat v3`, i.e.: -```{r regressvarmt, fig.height=7, fig.width=11, results='hide',eval = FALSE} +In Seurat, we also use the `ScaleData()` function to remove unwanted sources of variation from a single-cell dataset. For example, we could 'regress out' heterogeneity associated with (for example) [cell cycle stage](https://satijalab.org/seurat/articles/cell_cycle_vignette), or mitochondrial contamination i.e.: +```{r regressvarmt, eval = FALSE} pbmc <- ScaleData(pbmc, vars.to.regress = 'percent.mt') ``` -However, particularly for advanced users who would like to use this functionality, we strongly recommend the use of our new normalization workflow, `SCTransform()`. The method is described in our [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1), with a separate vignette using Seurat v3 [here](sctransform_vignette.html). As with `ScaleData()`, the function `SCTransform()` also includes a `vars.to.regress` parameter. +However, particularly for advanced users who would like to use this functionality, we strongly recommend the use of our new normalization workflow, `SCTransform()`. The method is described in our [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02584-9), with a separate vignette using Seurat [here](https://satijalab.org/seurat/articles/sctransform_vignette). As with `ScaleData()`, the function `SCTransform()` also includes a `vars.to.regress` parameter.
-\ *** # Perform linear dimensional reduction -Next we perform PCA on the scaled data. By default, only the previously determined variable features are used as input, but can be defined using `features` argument if you wish to choose a different subset. +Next we perform PCA on the scaled data. By default, only the previously determined variable features are used as input, but can be defined using `features` argument if you wish to choose a different subset (if you do want to use a custom subset of features, make sure you pass these to `ScaleData` first). + +For the first principal components, Seurat outputs a list of genes with the most positive and negative loadings, representing modules of genes that exhibit either correlation (or anti-correlation) across single-cells in the dataset. ```{r pca,results='hide'} pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc)) @@ -204,16 +202,16 @@ pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc)) Seurat provides several useful ways of visualizing both cells and features that define the PCA, including `VizDimReduction()`, `DimPlot()`, and `DimHeatmap()` -```{r pca_viz, message=TRUE} +```{r pca_viz, fig.height=8, fig.width=10, message=TRUE} # Examine and visualize PCA results a few different ways print(pbmc[['pca']], dims = 1:5, nfeatures = 5) VizDimLoadings(pbmc, dims = 1:2, reduction = 'pca') -DimPlot(pbmc, reduction = 'pca') +DimPlot(pbmc, reduction = 'pca') + NoLegend() ``` In particular `DimHeatmap()` allows for easy exploration of the primary sources of heterogeneity in a dataset, and can be useful when trying to decide which PCs to include for further downstream analyses. Both cells and features are ordered according to their PCA scores. Setting `cells` to a number plots the 'extreme' cells on both ends of the spectrum, which dramatically speeds plotting for large datasets. Though clearly a supervised analysis, we find this to be a valuable tool for exploring correlated feature sets. -```{r single-heatmap} +```{r single-heatmap, fig.height=6, fig.width=9} DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE) ``` @@ -225,19 +223,8 @@ DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE) To overcome the extensive technical noise in any single feature for scRNA-seq data, Seurat clusters cells based on their PCA scores, with each PC essentially representing a 'metafeature' that combines information across a correlated feature set. The top principal components therefore represent a robust compression of the dataset. However, how many components should we choose to include? 10? 20? 100? -In [Macosko *et al*](http://www.cell.com/abstract/S0092-8674(15)00549-8), we implemented a resampling test inspired by the JackStraw procedure. We randomly permute a subset of the data (1% by default) and rerun PCA, constructing a 'null distribution' of feature scores, and repeat this procedure. We identify 'significant' PCs as those who have a strong enrichment of low p-value features. - -```{r jackstraw, fig.height=6, fig.width=10} -# NOTE: This process can take a long time for big datasets, comment out for expediency. More approximate techniques such as those implemented in ElbowPlot() can be used to reduce computation time -pbmc <- JackStraw(pbmc, num.replicate = 100) -pbmc <- ScoreJackStraw(pbmc, dims = 1:20) -``` - -The `JackStrawPlot()` function provides a visualization tool for comparing the distribution of p-values for each PC with a uniform distribution (dashed line). 'Significant' PCs will show a strong enrichment of features with low p-values (solid curve above the dashed line). In this case it appears that there is a sharp drop-off in significance after the first 10-12 PCs. +In [Macosko *et al*](http://www.cell.com/abstract/S0092-8674(15)00549-8), we implemented a resampling test inspired by the JackStraw procedure. While still available in Seurat [(see previous vignette)](https://satijalab.org/seurat/articles/pbmc3k_tutorial), this is a slow and computationally expensive procedure, and we is no longer routinely used in single cell analysis. -```{r jsplots, fig.height=6, fig.width=10} -JackStrawPlot(pbmc, dims = 1:15) -``` An alternative heuristic method generates an 'Elbow plot': a ranking of principle components based on the percentage of variance explained by each one (`ElbowPlot()` function). In this example, we can observe an 'elbow' around PC9-10, suggesting that the majority of true signal is captured in the first 10 PCs. @@ -245,7 +232,7 @@ An alternative heuristic method generates an 'Elbow plot': a ranking of principl ElbowPlot(pbmc) ``` -Identifying the true dimensionality of a dataset -- can be challenging/uncertain for the user. We therefore suggest these three approaches to consider. The first is more supervised, exploring PCs to determine relevant sources of heterogeneity, and could be used in conjunction with GSEA for example. The second implements a statistical test based on a random null model, but is time-consuming for large datasets, and may not return a clear PC cutoff. The third is a heuristic that is commonly used, and can be calculated instantly. In this example, all three approaches yielded similar results, but we might have been justified in choosing anything between PC 7-12 as a cutoff. +Identifying the true dimensionality of a dataset -- can be challenging/uncertain for the user. We therefore suggest these multiple approaches for users. The first is more supervised, exploring PCs to determine relevant sources of heterogeneity, and could be used in conjunction with GSEA for example. The second (`ElbowPlot`) The third is a heuristic that is commonly used, and can be calculated instantly. In this example, we might have been justified in choosing anything between PC 7-12 as a cutoff. We chose 10 here, but encourage users to consider the following: @@ -257,14 +244,14 @@ We chose 10 here, but encourage users to consider the following: # Cluster the cells -Seurat v3 applies a graph-based clustering approach, building upon initial strategies in ([Macosko *et al*](http://www.cell.com/abstract/S0092-8674(15)00549-8)). Importantly, the *distance metric* which drives the clustering analysis (based on previously identified PCs) remains the same. However, our approach to partitioning the cellular distance matrix into clusters has dramatically improved. Our approach was heavily inspired by recent manuscripts which applied graph-based clustering approaches to scRNA-seq data [[SNN-Cliq, Xu and Su, Bioinformatics, 2015]](http://bioinformatics.oxfordjournals.org/content/early/2015/02/10/bioinformatics.btv088.abstract) and CyTOF data [[PhenoGraph, Levine *et al*., Cell, 2015]](http://www.ncbi.nlm.nih.gov/pubmed/26095251). Briefly, these methods embed cells in a graph structure - for example a K-nearest neighbor (KNN) graph, with edges drawn between cells with similar feature expression patterns, and then attempt to partition this graph into highly interconnected 'quasi-cliques' or 'communities'. +Seurat applies a graph-based clustering approach, building upon initial strategies in ([Macosko *et al*](http://www.cell.com/abstract/S0092-8674(15)00549-8)). Importantly, the *distance metric* which drives the clustering analysis (based on previously identified PCs) remains the same. However, our approach to partitioning the cellular distance matrix into clusters has dramatically improved. Our approach was heavily inspired by recent manuscripts which applied graph-based clustering approaches to scRNA-seq data [[SNN-Cliq, Xu and Su, Bioinformatics, 2015]](http://bioinformatics.oxfordjournals.org/content/early/2015/02/10/bioinformatics.btv088.abstract) and CyTOF data [[PhenoGraph, Levine *et al*., Cell, 2015]](http://www.ncbi.nlm.nih.gov/pubmed/26095251). Briefly, these methods embed cells in a graph structure - for example a K-nearest neighbor (KNN) graph, with edges drawn between cells with similar feature expression patterns, and then attempt to partition this graph into highly interconnected 'quasi-cliques' or 'communities'. As in PhenoGraph, we first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). This step is performed using the `FindNeighbors()` function, and takes as input the previously defined dimensionality of the dataset (first 10 PCs). To cluster the cells, we next apply modularity optimization techniques such as the Louvain algorithm (default) or SLM [[SLM, Blondel *et al*., Journal of Statistical Mechanics]](http://dx.doi.org/10.1088/1742-5468/2008/10/P10008), to iteratively group cells together, with the goal of optimizing the standard modularity function. The `FindClusters()` function implements this procedure, and contains a resolution parameter that sets the 'granularity' of the downstream clustering, with increased values leading to a greater number of clusters. We find that setting this parameter between 0.4-1.2 typically returns good results for single-cell datasets of around 3K cells. Optimal resolution often increases for larger datasets. The clusters can be found using the `Idents()` function. -```{r cluster, fig.height=5, fig.width=7} +```{r cluster} pbmc <- FindNeighbors(pbmc, dims = 1:10) pbmc <- FindClusters(pbmc, resolution = 0.5) @@ -276,14 +263,17 @@ head(Idents(pbmc), 5) # Run non-linear dimensional reduction (UMAP/tSNE) -Seurat offers several non-linear dimensional reduction techniques, such as tSNE and UMAP, to visualize and explore these datasets. The goal of these algorithms is to learn the underlying manifold of the data in order to place similar cells together in low-dimensional space. Cells within the graph-based clusters determined above should co-localize on these dimension reduction plots. As input to the UMAP and tSNE, we suggest using the same PCs as input to the clustering analysis. +Seurat offers several non-linear dimensional reduction techniques, such as tSNE and UMAP, to visualize and explore these datasets. The goal of these algorithms is to learn underlying structure in the dataset, in order to place similar cells together in low-dimensional space. Therefore, cells that are grouped together within graph-based clusters determined above should co-localize on these dimension reduction plots. + +While we and others have routinely found 2D visualization techniques like tSNE and UMAP to be valuable tools for exploring datasets, all visualization techniques have limitations, and cannot fully represent the complexity of the underlying data. In particular, these methods aim to preserve local distances in the dataset (i.e. ensuring that cells with very similar gene expression profiles co-localize), but often do not preserve more global relationships. We encourage users to leverage techniques like UMAP for visualization, but to avoid drawing biological conclusions solely on the basis of visualization techniques. + + -```{r tsne, fig.height=5, fig.width=7} -# If you haven't installed UMAP, you can do so via reticulate::py_install(packages = "umap-learn") +```{r umap} pbmc <- RunUMAP(pbmc, dims = 1:10) ``` -```{r tsneplot, fig.height=5, fig.width=7} +```{r umapplot, fig.height=8, fig.width=8} # note that you can set `label = TRUE` or use the LabelClusters function to help label individual clusters DimPlot(pbmc, reduction = 'umap') ``` @@ -298,41 +288,49 @@ saveRDS(pbmc, file = "../output/pbmc_tutorial.rds") # Finding differentially expressed features (cluster biomarkers) -Seurat can help you find markers that define clusters via differential expression. By default, it identifies positive and negative markers of a single cluster (specified in `ident.1`), compared to all other cells. `FindAllMarkers()` automates this process for all clusters, but you can also test groups of clusters vs. each other, or against all cells. +Seurat can help you find markers that define clusters via differential expression (DE). By default, it identifies positive and negative markers of a single cluster (specified in `ident.1`), compared to all other cells. `FindAllMarkers()` automates this process for all clusters, but you can also test groups of clusters vs. each other, or against all cells. -The `min.pct` argument requires a feature to be detected at a minimum percentage in either of the two groups of cells, and the thresh.test argument requires a feature to be differentially expressed (on average) by some amount between the two groups. You can set both of these to 0, but with a dramatic increase in time - since this will test a large number of features that are unlikely to be highly discriminatory. As another option to speed up these computations, `max.cells.per.ident` can be set. This will downsample each identity class to have no more cells than whatever this is set to. While there is generally going to be a loss in power, the speed increases can be significant and the most highly differentially expressed features will likely still rise to the top. +In Seurat v5, we use the presto package (as described [here](https://www.biorxiv.org/content/10.1101/653253v1) and available for installation [here](https://github.com/immunogenomics/presto)), to dramatically improve the speed of DE analysis, particularly for large datasets. For users who are not using presto, you can examine the documentation for this function (`?FindMarkers`) to explore the `min.pct` and `logfc.threshold` parameters, which can be increased in order to increase the speed of DE testing. -```{r markers1, fig.height=8, fig.width=15} +```{r markers1} # find all markers of cluster 2 -cluster2.markers <- FindMarkers(pbmc, ident.1 = 2, min.pct = 0.25) +cluster2.markers <- FindMarkers(pbmc, ident.1 = 2) head(cluster2.markers, n = 5) # find all markers distinguishing cluster 5 from clusters 0 and 3 -cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25) +cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3)) head(cluster5.markers, n = 5) # find markers for every cluster compared to all remaining cells, report only the positive ones -pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25) -pbmc.markers %>% group_by(cluster) %>% slice_max(n = 2, order_by = avg_log2FC) +pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE) +pbmc.markers %>% + group_by(cluster) %>% + dplyr::filter(avg_log2FC > 1) ``` Seurat has several tests for differential expression which can be set with the test.use parameter (see our [DE vignette](de_vignette.html) for details). For example, the ROC test returns the 'classification power' for any individual marker (ranging from 0 - random, to 1 - perfect). -```{r markersroc, fig.height=8, fig.width=15} +```{r markersroc} cluster0.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE) ``` We include several tools for visualizing marker expression. `VlnPlot()` (shows expression probability distributions across clusters), and `FeaturePlot()` (visualizes feature expression on a tSNE or PCA plot) are our most commonly used visualizations. We also suggest exploring `RidgePlot()`, `CellScatter()`, and `DotPlot()` as additional methods to view your dataset. -```{r markerplots, fig.height=10, fig.width=15} +```{r markerplots, fig.height=5, fig.width=10} VlnPlot(pbmc, features = c("MS4A1", "CD79A")) # you can plot raw counts as well VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = 'counts', log = TRUE) -FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A")) ``` +```{r, fig.height=8, fig.width=10} +FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A")) +``` `DoHeatmap()` generates an expression heatmap for given cells and features. In this case, we are plotting the top 20 markers (or all markers if less than 20) for each cluster. -```{r clusterHeatmap, fig.height=8, fig.width=15} -pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC) -> top10 +```{r clusterHeatmap, fig.height=8, fig.width=10} +pbmc.markers %>% + group_by(cluster) %>% + dplyr::filter(avg_log2FC > 1) %>% + slice_head(n = 10) %>% + ungroup() -> top10 DoHeatmap(pbmc, features = top10$gene) + NoLegend() ``` @@ -373,11 +371,7 @@ ggsave(filename = "../output/images/pbmc3k_umap.jpg", height = 7, width = 12, pl saveRDS(pbmc, file = "../output/pbmc3k_final.rds") ``` -```{r save2, include = TRUE} -saveRDS(pbmc, file = "../data/pbmc3k_final.rds") -``` - -```{r save.times, include = TRUE} +```{r save.times, include = FALSE} write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/pbmc3k_tutorial_times.csv") ``` diff --git a/vignettes/sctransform_vignette.Rmd b/vignettes/sctransform_vignette.Rmd index fdcc38d92..8b9230432 100644 --- a/vignettes/sctransform_vignette.Rmd +++ b/vignettes/sctransform_vignette.Rmd @@ -1,6 +1,6 @@ --- title: "Using sctransform in Seurat" -author: Christoph Hafemeister & Rahul Satija +author: Saket Choudhary, Christoph Hafemeister & Rahul Satija output: html_document: theme: united @@ -33,10 +33,11 @@ knitr::opts_chunk$set( Biological heterogeneity in single-cell RNA-seq data is often confounded by technical factors including sequencing depth. The number of molecules detected in each cell can vary significantly between cells, even within the same celltype. Interpretation of scRNA-seq data requires effective pre-processing and normalization to remove this technical variability. -In [Hafemeister and Satija, 2019](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1) we introduce a modeling framework for the normalization and variance stabilization of molecular count data from scRNA-seq experiment. -This procedure omits the need for heuristic steps including pseudocount addition or log-transformation and improves common downstream analytical tasks such as variable gene selection, dimensional reduction, and differential expression. -In this vignette, we demonstrate how using [sctransform](https://github.com/ChristophH/sctransform/) based normalization enables recovering sharper biological distinction compared to log-normalization. +In [our manuscript](https://genomebiology-biomedcentral-com/articles/10.1186/s13059-021-02584-9) we introduce a modeling framework for the normalization and variance stabilization of molecular count data from scRNA-seq experiments. This procedure omits the need for heuristic steps including pseudocount addition or log-transformation and improves common downstream analytical tasks such as variable gene selection, dimensional reduction, and differential expression. We named this method `sctransform`. + +Inspired by important and rigorous work from [Lause et al](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02451-7), we released an [updated manuscript](https://link.springer.com/article/10.1186/s13059-021-02584-9) and updated the sctransform software to a v2 version, which is now the default in Seurat v5. + ```{r packages} library(Seurat) @@ -44,18 +45,20 @@ library(ggplot2) library(sctransform) ``` -Load data and create Seurat object +# Load data and create Seurat object ```{r load_data, warning=FALSE, message=FALSE} -pbmc_data <- Read10X(data.dir = "../data/pbmc3k/filtered_gene_bc_matrices/hg19/") +pbmc_data <- Read10X(data.dir = "/brahms/shared/vignette-data/pbmc3k/filtered_gene_bc_matrices/hg19/") pbmc <- CreateSeuratObject(counts = pbmc_data) ``` -Apply sctransform normalization +# Apply sctransform normalization * Note that this single command replaces `NormalizeData()`, `ScaleData()`, and `FindVariableFeatures()`. * Transformed data will be available in the SCT assay, which is set as the default after running sctransform * During normalization, we can also remove confounding sources of variation, for example, mitochondrial mapping percentage + * In Seurat v5, SCT v2 is applied by default. You can revert to v1 by setting `vst.flavor = 'v1'` + * The [glmGamPoi](https://bioconductor.org/packages/release/bioc/html/glmGamPoi.html) package substantially improves speed and is used by default if installed, with instructions [here](install.html) ```{r apply_sct, warning=FALSE, message=FALSE} # store mitochondrial percentage in object meta data @@ -65,19 +68,8 @@ pbmc <- PercentageFeatureSet(pbmc, pattern = "^MT-", col.name = 'percent.mt') pbmc <- SCTransform(pbmc, vars.to.regress = "percent.mt", verbose = FALSE) ``` -The latest version of `sctransform` also supports using [glmGamPoi](https://bioconductor.org/packages/release/bioc/html/glmGamPoi.html) -package which substantially improves the speed of the learning procedure. It can be invoked by specifying -`method="glmGamPoi"`. - -```{r eval=FALSE} -if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager") - -BiocManager::install("glmGamPoi") -pbmc <- SCTransform(pbmc, method="glmGamPoi", vars.to.regress = "percent.mt", verbose = FALSE) -``` +# Perform dimensionality reduction by PCA and UMAP embedding -Perform dimensionality reduction by PCA and UMAP embedding ```{r pca, fig.width=5, fig.height=5} # These are now standard steps in the Seurat workflow for visualization and clustering pbmc <- RunPCA(pbmc, verbose = FALSE) @@ -85,7 +77,7 @@ pbmc <- RunUMAP(pbmc, dims = 1:30, verbose = FALSE) pbmc <- FindNeighbors(pbmc, dims = 1:30, verbose = FALSE) pbmc <- FindClusters(pbmc, verbose = FALSE) -DimPlot(pbmc, label = TRUE) + NoLegend() +DimPlot(pbmc, label = TRUE) ```
@@ -110,28 +102,24 @@ pbmc <- CreateSeuratObject(pbmc_data) %>% PercentageFeatureSet(pattern = "^MT-",
**Where are normalized values stored for sctransform?** -As described in our [paper](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1874-1), sctransform calculates a model of technical noise in scRNA-seq data using 'regularized negative binomial regression'. The residuals for this model are normalized values, and can be positive or negative. Positive residuals for a given gene in a given cell indicate that we observed more UMIs than expected given the gene’s average expression in the population and cellular sequencing depth, while negative residuals indicate the converse. The results of sctransfrom are stored in the "SCT" assay. You can learn more about multi-assay data and commands in Seurat in our [vignette](multimodal_vignette.html), [command cheat sheet](essential_commands.html#multi-assay-features), or [developer guide](https://github.com/satijalab/seurat/wiki/Assay). -* `pbmc[["SCT"]]@scale.data` contains the residuals (normalized values), and is used directly as input to PCA. Please note that this matrix is non-sparse, and can therefore take up a lot of memory if stored for all genes. To save memory, we store these values only for variable genes, by setting the return.only.var.genes = TRUE by default in the `SCTransform()` function call. -* To assist with visualization and interpretation. we also convert Pearson residuals back to ‘corrected’ UMI counts. You can interpret these as the UMI counts we would expect to observe if all cells were sequenced to the same depth. If you want to see exactly how we do this, please look at the correct function [here](https://github.com/ChristophH/sctransform/blob/master/R/denoise.R). -* The 'corrected' UMI counts are stored in `pbmc[["SCT"]]@counts`. We store log-normalized versions of these corrected counts in `pbmc[["SCT"]]@data`, which are very helpful for visualization. -* You can use the corrected log-normalized counts for differential expression and integration. However, in principle, it would be most optimal to perform these calculations directly on the residuals (stored in the `scale.data` slot) themselves. This is not currently supported in Seurat v3, but will be soon. +* `pbmc[["SCT"]]$scale.data` contains the residuals (normalized values), and is used directly as input to PCA. Please note that this matrix is non-sparse, and can therefore take up a lot of memory if stored for all genes. To save memory, we store these values only for variable genes, by setting the return.only.var.genes = TRUE by default in the `SCTransform()` function call. +* To assist with visualization and interpretation, we also convert Pearson residuals back to ‘corrected’ UMI counts. You can interpret these as the UMI counts we would expect to observe if all cells were sequenced to the same depth. If you want to see exactly how we do this, please look at the correct function [here](https://github.com/ChristophH/sctransform/blob/master/R/denoise.R). +* The 'corrected' UMI counts are stored in `pbmc[["SCT"]]$counts`. We store log-normalized versions of these corrected counts in `pbmc[["SCT"]]$data`, which are very helpful for visualization. ------
\ - Users can individually annotate clusters based on canonical markers. However, the sctransform normalization reveals sharper biological distinctions compared to the [standard Seurat workflow](pbmc3k_tutorial.html), in a few ways: - * Clear separation of at least 3 CD8 T cell populations (naive, memory, effector), based on CD8A, GZMK, CCL5, GZMK expression + * Clear separation of at least 3 CD8 T cell populations (naive, memory, effector), based on CD8A, GZMK, CCL5, CCR7 expression * Clear separation of three CD4 T cell populations (naive, memory, IFN-activated) based on S100A4, CCR7, IL32, and ISG15 * Additional developmental sub-structure in B cell cluster, based on TCL1A, FCER2 * Additional separation of NK cells into CD56dim vs. bright clusters, based on XCL1 and FCGR3A - ```{r fplot, fig.width = 10, fig.height=6} # These are now standard steps in the Seurat workflow for visualization and clustering # Visualize canonical marker genes as violin plots. @@ -142,7 +130,6 @@ FeaturePlot(pbmc, features = c("CD8A", "GZMK", "CCL5", "S100A4", "ANXA1", "CCR7" FeaturePlot(pbmc, features = c("CD3D", "ISG15", "TCL1A", "FCER2", "XCL1", "FCGR3A"), pt.size = 0.2, ncol = 3) ``` - ```{r save.times, include = FALSE} write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/sctransform_vignette_times.csv") ``` diff --git a/vignettes/seurat5_atacseq_integration_vignette.Rmd b/vignettes/seurat5_atacseq_integration_vignette.Rmd index 3dc33d667..7da517636 100644 --- a/vignettes/seurat5_atacseq_integration_vignette.Rmd +++ b/vignettes/seurat5_atacseq_integration_vignette.Rmd @@ -29,7 +29,7 @@ knitr::opts_chunk$set( time_it = TRUE, error = TRUE ) -options(SeuratData.repo.use = 'satijalab04.nygenome.org') +options(SeuratData.repo.use = 'seurat.nygenome.org') ``` Single-cell transcriptomics has transformed our ability to characterize cell states, but deep biological understanding requires more than a taxonomic listing of clusters. As new methods arise to measure distinct cellular modalities, a key analytical challenge is to integrate these datasets to better understand cellular identity and function. For example, users may perform scRNA-seq and scATAC-seq experiments on the same biological system and to consistently annotate both datasets with the same set of cell type labels. This analysis is particularly challenging as scATAC-seq datasets are difficult to annotate, due to both the sparsity of genomic data collected at single-cell resolution, and the lack of interpretable gene markers in scRNA-seq data. @@ -40,7 +40,7 @@ In [Stuart\*, Butler\* et al, 2019](https://www.cell.com/cell/fulltext/S0092-867 * How to co-visualize (co-embed) cells from scRNA-seq and scATAC-seq * How to project scATAC-seq cells onto a UMAP derived from an scRNA-seq experiment -This vignette makes extensive use of the [Signac package](https://satijalab.org/signac/), recently developed for the analysis of chromatin datasets collected at single-cell resolution, including scATAC-seq. Please see the Signac website for additional [vignettes](https://satijalab.org/signac/articles/pbmc_vignette.html) and documentation for analyzing scATAC-seq data. +This vignette makes extensive use of the [Signac package](https://stuartlab.org/signac/), recently developed for the analysis of chromatin datasets collected at single-cell resolution, including scATAC-seq. Please see the Signac website for additional [vignettes](https://stuartlab.org/signac/articles/pbmc_vignette.html) and documentation for analyzing scATAC-seq data. We demonstrate these methods using a publicly available ~12,000 human PBMC 'multiome' dataset from 10x Genomics. In this dataset, scRNA-seq and scATAC-seq profiles were simultaneously collected in the same cells. For the purposes of this vignette, we treat the datasets as originating from two different experiments and integrate them together. Since they were originally measured in the same cells, this provides a ground truth that we can use to assess the accuracy of integration. We emphasize that our use of the multiome dataset here is for demonstration and evaluation purposes, and that users should apply these methods to scRNA-seq and scATAC-seq datasets that are collected separately. We provide a separate [weighted nearest neighbors vignette (WNN)](weighted_nearest_neighbor_analysis.html) that describes analysis strategies for multi-omic single-cell data. @@ -56,7 +56,6 @@ InstallData('pbmcMultiome') ```{r loadpkgs} library(Seurat) -options(Seurat.object.assay.version = "v5") library(Signac) library(EnsDb.Hsapiens.v86) library(ggplot2) @@ -113,7 +112,6 @@ ggsave(filename = "../output/images/atacseq_integration_vignette.jpg", height = # Identifying anchors between scRNA-seq and scATAC-seq datasets In order to identify 'anchors' between scRNA-seq and scATAC-seq experiments, we first generate a rough estimate of the transcriptional activity of each gene by quantifying ATAC-seq counts in the 2 kb-upstream region and gene body, using the `GeneActivity()` function in the Signac package. The ensuing gene activity scores from the scATAC-seq data are then used as input for canonical correlation analysis, along with the gene expression quantifications from scRNA-seq. We perform this quantification for all genes identified as being highly variable from the scRNA-seq dataset. - ```{r gene.activity} # quantify gene activity gene.activities <- GeneActivity(pbmc.atac, features = VariableFeatures(pbmc.rna)) @@ -221,7 +219,7 @@ coembed <- RunUMAP(coembed, dims = 1:30) DimPlot(coembed, group.by = c("orig.ident","seurat_annotations")) ``` -```{r save.times, include = TRUE} +```{r save.times, include = FALSE} write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_atacseq_integration_vignette.csv") ``` diff --git a/vignettes/seurat5_bpcells_interaction_vignette.Rmd b/vignettes/seurat5_bpcells_interaction_vignette.Rmd index 280418d7b..51e0004a7 100644 --- a/vignettes/seurat5_bpcells_interaction_vignette.Rmd +++ b/vignettes/seurat5_bpcells_interaction_vignette.Rmd @@ -38,16 +38,15 @@ We leverage the high performance capabilities of BPCells to work with Seurat obj We will show the methods for interacting with both a single dataset in one file or multiple datasets across multiple files using BPCells. BPCells allows us to easily analyze these large datasets in memory, and we encourage users to check out some of our other vignettes [here]() and [here]() to see further applications. - -```{r install, message = FALSE, warning = FALSE} +```{r, eval=FALSE} devtools::install_github("bnprks/BPCells") +``` +```{r install, message = FALSE, warning = FALSE} library(BPCells) library(Seurat) library(SeuratObject) library(SeuratDisk) library(Azimuth) - -options(Seurat.object.assay.version = "v5") ``` We use BPCells functionality to both load in our data and write the counts layers to bitpacked compressed binary files on disk to improve computation speeds. BPCells has multiple functions for reading in files. @@ -65,8 +64,7 @@ brain.data <- open_matrix_10x_hdf5( # Write the matrix to a directory write_matrix_dir( mat = brain.data, - dir = '/brahms/hartmana/vignette_data/bpcells/brain_counts', - overwrite = TRUE) + dir = '/brahms/hartmana/vignette_data/bpcells/brain_counts') # Now that we have the matrix on disk, we can load it brain.mat <- open_matrix_dir(dir = "/brahms/hartmana/vignette_data/bpcells/brain_counts") brain.mat <- Azimuth:::ConvertEnsembleToSymbol(mat = brain.mat, species = "mouse") @@ -106,15 +104,14 @@ VlnPlot(brain, features = c("Sox10", "Slc17a7", "Aif1"), ncol = 3, layer = "data ### Saving Seurat objects with on-disk layers -If you save your object and load it in in the future, Seurat will access the on-disk matrices by their path, which is stored in the assay level data. To make it easy to ensure these are saved in the same place, we provide new functionality to the saveRDS function. In this function, you specify your filename and the destination directory. The pointer to the path in the Seurat object will change to the destination directory. +If you save your object and load it in in the future, Seurat will access the on-disk matrices by their path, which is stored in the assay level data. To make it easy to ensure these are saved in the same place, we provide new functionality to the SaveSeuratRds function. In this function, you specify your filename. The pointer to the path in the Seurat object will change to the current directory. This also makes it easy to share your Seurat objects with BPCells matrices by sharing a folder that contains both the object and the BPCells directory. ```{r} -saveRDS( +SaveSeuratRds( object = brain, - file = "obj.Rds", - destdir = "/brahms/hartmana/vignette_data/bpcells/brain_object") + file = "obj.Rds") ``` @@ -170,16 +167,14 @@ metadata <- Reduce(rbind, metadata.list) When we create the Seurat object with the list of matrices from each publication, we can then see that multiple counts layers exist that represent each dataset. This object contains over a million cells, yet only takes up minimal space in memory! ```{r} -options(Seurat.object.assay.version = "v5") merged.object <- CreateSeuratObject(counts = data.list, meta.data = metadata) merged.object ``` ```{r save_merged, eval=FALSE} -saveRDS( +SaveSeuratRds( object = merged.object, - file = "obj.Rds", - destdir = "/brahms/hartmana/vignette_data/bpcells/merged_object") + file = "obj.Rds") ``` ## Parse Biosciences @@ -203,10 +198,9 @@ parse.object <- CreateSeuratObject(counts = parse.mat, meta.data = metadata) ``` ```{r save_parse, eval=FALSE} -saveRDS( +SaveSeuratRds( object = parse.object, - file = "obj.Rds", - destdir = "/brahms/hartmana/vignette_data/bpcells/parse_object") + file = "obj.Rds") ``` diff --git a/vignettes/seurat5_essential_commands.Rmd b/vignettes/seurat5_essential_commands.Rmd index 09d2121a9..885e13d71 100644 --- a/vignettes/seurat5_essential_commands.Rmd +++ b/vignettes/seurat5_essential_commands.Rmd @@ -24,6 +24,7 @@ knitr::knit_hooks$set(time_it = local({ knitr::opts_chunk$set( tidy = 'styler', warning = FALSE, + eval = FALSE, error = TRUE, message = FALSE, fig.width = 8, @@ -114,7 +115,7 @@ annotations <- pbmc3k$seurat_annotations ## Create Seurat or Assay objects -By setting a global option (`Seurat.object.assay.version`), you can default to creating either Seurat v3 assays, or Seurat v5 assays. The use of v3 assays is set by default upon package loading, which ensures backwards compatibiltiy with existing workflows. +By setting a global option (`Seurat.object.assay.version`), you can default to creating either Seurat v3 assays, or Seurat v5 assays. The use of v5 assays is set by default upon package loading, which ensures backwards compatibiltiy with existing workflows. ```{r create} # create v3 assays diff --git a/vignettes/seurat5_integration.Rmd b/vignettes/seurat5_integration.Rmd index ec1acd246..b03134b6d 100644 --- a/vignettes/seurat5_integration.Rmd +++ b/vignettes/seurat5_integration.Rmd @@ -22,7 +22,7 @@ knitr::knit_hooks$set(time_it = local({ })) knitr::opts_chunk$set( tidy = 'styler', - fig.width = 10, + fig.width = 8, message = FALSE, warning = FALSE, time_it = TRUE, @@ -38,11 +38,10 @@ library(Azimuth) library(ggplot2) library(patchwork) options(future.globals.maxSize = 1e9) -options(Seurat.object.assay.version = "v5") ``` ## Introduction -Integration of single-cell sequencing datasets, for example across experimental batches, donors, or conditions, is often an important step in scRNA-seq workflows. Integrative analysis can help to match shared cell types and states across datasets, which can boost statistical power, and most importantly, facilitate accurate comparative analysis across datasets. In previous versions of Seurat we introduced methods for integrative analysis, including our ‘anchor-based’ integration workflow. Many labs have also published powerful and pioneering methods, including Harmony and scVI, for integrative analysis. +Integration of single-cell sequencing datasets, for example across experimental batches, donors, or conditions, is often an important step in scRNA-seq workflows. Integrative analysis can help to match shared cell types and states across datasets, which can boost statistical power, and most importantly, facilitate accurate comparative analysis across datasets. In previous versions of Seurat we introduced methods for integrative analysis, including our ‘anchor-based’ integration workflow. Many labs have also published powerful and pioneering methods, including [Harmony](https://github.com/immunogenomics/harmony) and [scVI](https://yoseflab.github.io/software/scvi-tools/), for integrative analysis. We recognize that while the goal of matching shared cell types across datasets may be important for many problems, users may also be concerned about which method to use, or that integration could result in a loss of biological resolution. In Seurat v5, we introduce more flexible and streamlined infrastructure to run different integration algorithms with a single line of code. This makes it easier to explore the results of different integration methods, and to compare these results to a workflow that excludes integration steps. For this vignette, we use a [dataset of human PBMC profiled with seven different technologies](https://www.nature.com/articles/s41587-020-0465-8), profiled as part of a systematic comparative analysis (`pbmcsca`). The data is available as part of our [SeuratData](https://github.com/satijalab/seurat-data) package. @@ -91,39 +90,42 @@ Seurat v5 enables streamlined integrative analysis using the `IntegrateLayers` f * FastMNN (`method= FastMNNIntegration`) * scVI (`method=scVIIntegration`) -Note that you can find more detail on each method, and any installation prerequisites, in Seurat's documentation (for example, `?scVIIntegration`). For example, scVI integration requires `reticulate` which can be installed from CRAN (`install.packages("reticulate")`) as well as `scvi-tools` and its dependencies installed in a conda environment. Please see scVI installation instructions [here](https://docs.scvi-tools.org/en/stable/installation.html). +Note that our anchor-based RPCA integration represents a faster and more conservative (less correction) method for integration. For interested users, we discuss this method in more detail in our [previous RPCA vignette](https://satijalab.org/seurat/articles/integration_rpca) + +You can find more detail on each method, and any installation prerequisites, in Seurat's documentation (for example, `?scVIIntegration`). For example, scVI integration requires `reticulate` which can be installed from CRAN (`install.packages("reticulate")`) as well as `scvi-tools` and its dependencies installed in a conda environment. Please see scVI installation instructions [here](https://docs.scvi-tools.org/en/stable/installation.html). + Each of the following lines perform a new integration using a single line of code: -```{r integratelayerscca} +```{r integratelayerscca, results='hide'} obj <- IntegrateLayers( object = obj, method = CCAIntegration, orig.reduction = "pca", new.reduction = 'integrated.cca', verbose = FALSE) ``` -```{r integratelayersrpca} +```{r integratelayersrpca, results='hide'} obj <- IntegrateLayers( object = obj, method = RPCAIntegration, orig.reduction = "pca", new.reduction = 'integrated.rpca', verbose = FALSE) ``` -```{r integratelayersharmony} +```{r integratelayersharmony, results='hide'} obj <- IntegrateLayers( object = obj, method = HarmonyIntegration, orig.reduction = "pca", new.reduction = 'harmony', verbose = FALSE) ``` -```{r integratelayersfastmnn} +```{r integratelayersfastmnn, results='hide'} obj <- IntegrateLayers( object = obj, method = FastMNNIntegration, new.reduction = 'integrated.mnn', verbose = FALSE) ``` -```{r integratelayersscvi, eval=FALSE} +```{r integratelayersscvi, eval=FALSE, results='hide'} obj <- IntegrateLayers( object = obj, method = scVIIntegration, new.reduction = 'integrated.scvi', @@ -136,7 +138,7 @@ scvi.reduc <- scvi.reduc[Cells(obj),] obj[["integrated.scvi"]] <- CreateDimReducObject(embeddings = scvi.reduc) ``` -For any of the methods, we can now visualize and cluster the datasets. We show this for CCA integration and scVI, but you can do this for any method +For any of the methods, we can now visualize and cluster the datasets. We show this for CCA integration and scVI, but you can do this for any method: ```{r integratedprojections, fig.height=16, fig.width=16} obj <- FindNeighbors(obj, reduction = 'integrated.cca', dims = 1:30) @@ -145,7 +147,7 @@ obj <- RunUMAP(obj, reduction = "integrated.cca", dims = 1:30, reduction.name = p1 <- DimPlot( obj, reduction = "umap.cca", group.by = c("Method", "predicted.celltype.l2", "cca_clusters"), - combine = FALSE) + combine = FALSE, label.size = 2) obj <- FindNeighbors(obj, reduction = 'integrated.scvi', dims = 1:30) obj <- FindClusters(obj,resolution = 2, cluster.name = 'scvi_clusters') @@ -153,9 +155,9 @@ obj <- RunUMAP(obj, reduction = "integrated.scvi", dims = 1:30, reduction.name = p2 <- DimPlot( obj, reduction = "umap.scvi", group.by = c("Method", "predicted.celltype.l2", "scvi_clusters"), - combine = FALSE) + combine = FALSE, label.size = 2) -wrap_plots(c(p1, p2), ncol = 2) +wrap_plots(c(p1, p2), ncol = 2, byrow = F) ``` We hope that by simplifying the process of performing integrative analysis, users can more carefully evaluate the biological information retained in the integrated dataset. For example, users can compare the expression of biological markers based on different clustering solutions, or visualize one method's clustering solution on different UMAP visualizations. @@ -188,6 +190,27 @@ obj <- JoinLayers(obj) obj ``` +Lastly, users can also perform integration using sctransform-normalized data (see our [SCTransform vignette](https://satijalab.org/seurat/articles/sctransform_vignette) for more information), by first running SCTransform normalization, and then setting the `normalization.method` argument in `IntegrateLayers`. + +```{r, include=FALSE} +obj <- LoadData("pbmcsca") +obj <- subset(obj, nFeature_RNA > 1000) +obj[["RNA"]] <- split(obj[["RNA"]], f = obj$Method) +``` + +```{r sct} +options(future.globals.maxSize = 3e+09) +obj <- SCTransform(obj) +obj <- RunPCA(obj, npcs = 30, verbose = F) +obj <- IntegrateLayers(object = obj, + method = RPCAIntegration, + normalization.method="SCT", + verbose = F) +obj <- FindNeighbors(obj, dims = 1:30,reduction = 'integrated.dr') +obj <- FindClusters(obj, resolution = 2) +obj <- RunUMAP(obj, dims = 1:30,reduction = 'integrated.dr') +``` +
**Session Info** ```{r} diff --git a/vignettes/seurat5_integration_bridge.Rmd b/vignettes/seurat5_integration_bridge.Rmd index e90f7547a..2453d060b 100644 --- a/vignettes/seurat5_integration_bridge.Rmd +++ b/vignettes/seurat5_integration_bridge.Rmd @@ -42,13 +42,12 @@ In this vignette we demonstrate: * Mapping the scATAC-seq dataset via bridge integration * Exploring and assessing the resulting annotations -### Azimuth ATAC for Bridge Integration +## Azimuth ATAC for Bridge Integration Users can now automatically run bridge integration for PBMC and Bone Marrow scATAC-seq queries with the newly released Azimuth ATAC workflow on the [Azimuth website](https://azimuth.hubmapconsortium.org/) or in R. For more details on running locally in R, see the section on ATAC data in this [vignette](https://satijalab.github.io/azimuth/articles/run_azimuth_tutorial.html). ```{r, message=FALSE, warning=FALSE} library(Seurat) -options(Seurat.object.assay.version = "v5") library(Signac) library(EnsDb.Hsapiens.v86) library(dplyr) @@ -57,7 +56,7 @@ library(ggplot2) ## Load the bridge, query, and reference datasets -We start by loading a 10x multiome dataset, consisting of ~12,000 PBMC from a healthy donor. The dataset measures RNA-seq and ATAC-seq in the same cell, and is available for download from 10x Genomics [here](https://www.10xgenomics.com/resources/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-10-k-1-standard-2-0-0). We follow the loading instructions from the [Signac package vignettes](https://satijalab.org/signac/articles/pbmc_multiomic.html). Note that when using Signac, please make sure you are using the [latest version of Bioconductor]([http://www.bioconductor.org/news/bioc_3_14_release/]), as [users have reported errors](https://github.com/timoast/signac/issues/687) when using older BioC versions. +We start by loading a 10x multiome dataset, consisting of ~12,000 PBMC from a healthy donor. The dataset measures RNA-seq and ATAC-seq in the same cell, and is available for download from 10x Genomics [here](https://www.10xgenomics.com/resources/datasets/pbmc-from-a-healthy-donor-granulocytes-removed-through-cell-sorting-10-k-1-standard-2-0-0). We follow the loading instructions from the [Signac package vignettes](https://stuartlab.org/signac/articles/pbmc_multiomic.html). Note that when using Signac, please make sure you are using the [latest version of Bioconductor](https://www.bioconductor.org/install/), as [users have reported errors](https://github.com/timoast/signac/issues/687) when using older BioC versions.
**Load and setup the 10x multiome object** @@ -111,7 +110,7 @@ obj.multi <- subset( The scATAC-seq query dataset represents ~10,000 PBMC from a healthy donor, and is available for download [here](https://www.10xgenomics.com/resources/datasets/10-k-human-pbm-cs-atac-v-1-1-chromium-x-1-1-standard-2-0-0). We load in the peak/cell matrix, store the path to the fragments file, and add gene annotations to the object, following the steps as with the ATAC data in the multiome experiment. -We note that it is important to quantify the same set of genomic features in the query dataset as are quantified in the multi-omic bridge. We therefore requantify the set of scATAC-seq peaks using the `FeatureMatrix` command. This is also described in the [Signac vignettes](https://satijalab.org/signac/articles/integrate_atac.html) and shown below. +We note that it is important to quantify the same set of genomic features in the query dataset as are quantified in the multi-omic bridge. We therefore requantify the set of scATAC-seq peaks using the `FeatureMatrix` command. This is also described in the [Signac vignettes](https://stuartlab.org/signac/articles/integrate_atac.html) and shown below.
**Load and setup the 10x scATAC-seq query** @@ -193,9 +192,8 @@ obj.atac <- RunTFIDF(obj.atac) Now that we have the reference, query, and bridge datasets set up, we can begin integration. The bridge dataset enables translation between the scRNA-seq reference and the scATAC-seq query, effectively augmenting the reference so that it can map a new data type. We call this an extended reference, and first set it up. Note that you can save the results of this function and map multiple scATAC-seq datasets without having to rerun. - +First, we drop the first dimension of the ATAC reduction. ```{r, message=FALSE, warning=FALSE} -# Drop first dimension for ATAC reduction dims.atac <- 2:50 dims.rna <- 1:50 DefaultAssay(obj.multi) <- "RNA" diff --git a/vignettes/seurat5_sketch_analysis.Rmd b/vignettes/seurat5_sketch_analysis.Rmd index ca092445a..ccbdfefe6 100644 --- a/vignettes/seurat5_sketch_analysis.Rmd +++ b/vignettes/seurat5_sketch_analysis.Rmd @@ -35,13 +35,17 @@ knitr::opts_chunk$set( As single-cell sequencing technologies continue to improve in scalability in throughput, the generation of datasets spanning a million or more cells is becoming increasingly routine. In Seurat v5, we introduce new infrastructure and methods to analyze, interpret, and explore these exciting datasets. In this vignette, we introduce a sketch-based analysis workflow to analyze a 1.3 million cell dataset of the developing mouse brain, freely available from 10x Genomics. Analyzing datasets of this size with standard workflows can be challenging, slow, and memory-intensive. Here we introduce an alternative workflow that is highly scalable, even to datasets ranging beyond 10 million cells in size. -Our 'sketch-based' workflow involves three new features in Seurat v5 +Our 'sketch-based' workflow involves three new features in Seurat v5: * Infrastructure for on-disk storage of large single-cell datasets + Storing expression matrices in memory can be challenging for extremely large scRNA-seq datasets. In Seurat v5, we introduce support for multiple on-disk storage formats. + * 'Sketching' methods to subsample cells from large datasets while preserving rare populations + As introduced in [Hie et al, 2019](https://www.sciencedirect.com/science/article/pii/S2405471219301528), cell sketching methods aim to compactly summarize large single-cell datasets in a small number of cells, while preserving the presence of both abundant and rare cell types. In Seurat v5, we leverage this idea to select subsamples ('sketches') of cells from large datasets that are stored on-disk. However, after sketching, the subsampled cells can be stored in-memory, allowing for interactive and rapid visualization and exploration. -We store sketched cells (in-memory) and the full dataset (on-disk) as two assays in the same Seurat object. Users can then easily switch between the two versions, providing the flexibiltiy to perform quick analyses on a subset of cells in-memory, while retaining access to the full dataset on-disk. +We store sketched cells (in-memory) and the full dataset (on-disk) as two assays in the same Seurat object. Users can then easily switch between the two versions, providing the flexibility to perform quick analyses on a subset of cells in-memory, while retaining access to the full dataset on-disk. + * Support for 'bit-packing' compression and infrastructure We demonstrate the on-disk capabilities in Seurat v5 using the [BPCells package](https://github.com/bnprks/BPCells) developed by Ben Parks in the Greenleaf Lab. This package utilizes bit-packing compression and optimized, streaming-compatible C++ code to substantially improve I/O and computational performance when working with on-disk data. @@ -59,11 +63,7 @@ options(future.globals.maxSize = 1e9) We start by loading the 1.3M dataset from 10x Genomics using the `open_matrix_dir` function from `BPCells`. Note that this function does not load the dataset into memory, but instead, creates a connection to the data stored on-disk. We then store this on-disk representation in the Seurat object. Note that in our [Introduction to on-disk storage vignette](seurat5_bpcells_interaction_vignette.html), we demonstrate how to create this on-disk representation. -```{r} -# specify that you would like to create a Seurat v5 assay -# note that we require setting this option to ensure that existing pipelines are not affected -options(Seurat.object.assay.version = 'v5') - +```{r load.obj} # Read the Seurat object, which contains 1.3M cells stored on-disk as part of the 'RNA' assay obj <- readRDS("/brahms/hartmana/vignette_data/1p3_million_mouse_brain.rds") obj diff --git a/vignettes/seurat5_spatial_vignette_2.Rmd b/vignettes/seurat5_spatial_vignette_2.Rmd index 7a04b971f..f3e765f21 100644 --- a/vignettes/seurat5_spatial_vignette_2.Rmd +++ b/vignettes/seurat5_spatial_vignette_2.Rmd @@ -45,7 +45,6 @@ First, we load the packages necessary for this vignette. ```{r init, message=FALSE, warning=FALSE} library(Seurat) -options(Seurat.object.assay.version = "v5") library(future) plan("multisession", workers = 10) library(ggplot2) @@ -325,7 +324,7 @@ query <- SpatialRNA(coords, query.counts, colSums(query.counts)) ```{r rctd.reference, eval=FALSE} # allen.corted.ref can be downloaded here: https://www.dropbox.com/s/cuowvm4vrf65pvq/allen_cortex.rds?dl=1 -allen.cortex.ref <- readRDS("/home/hartmana/github/seurat-private/data/allen_cortex.rds") +allen.cortex.ref <- readRDS("/brahms/shared/vignette-data/allen_cortex.rds") allen.cortex.ref <- UpdateSeuratObject(allen.cortex.ref) Idents(allen.cortex.ref) <- "subclass" @@ -542,6 +541,6 @@ sessionInfo() ```
-```{r save.times, include=TRUE} +```{r save.times, include=FALSE} write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/seurat5_spatial_vignette_2.csv") ``` diff --git a/vignettes/spatial_vignette.Rmd b/vignettes/spatial_vignette.Rmd index 787847e12..7edb15b34 100644 --- a/vignettes/spatial_vignette.Rmd +++ b/vignettes/spatial_vignette.Rmd @@ -100,7 +100,7 @@ wrap_plots(plot1, plot2) These plots demonstrate that the variance in molecular counts across spots is not just technical in nature, but also is dependent on the tissue anatomy. For example, regions of the tissue that are depleted for neurons (such as the cortical white matter), reproducibly exhibit lower molecular counts. As a result, standard approaches (such as the `LogNormalize()` function), which force each data point to have the same underlying 'size' after normalization, can be problematic. -As an alternative, we recommend using sctransform (Hafemeister and Satija, Genome Biology 2019), which which builds regularized negative binomial models of gene expression in order to account for technical artifacts while preserving biological variance. For more details on sctransform, please see the paper [here](https://doi.org/10.1186/s13059-019-1874-1) and the Seurat vignette [here](sctransform_vignette.html). sctransform normalizes the data, detects high-variance features, and stores the data in the `SCT` assay. +As an alternative, we recommend using sctransform (Hafemeister and Satija, Genome Biology 2019), which which builds regularized negative binomial models of gene expression in order to account for technical artifacts while preserving biological variance. For more details on sctransform, please see the paper [here](https://genomebiology-biomedcentral-com/articles/10.1186/s13059-021-02584-9) and the Seurat vignette [here](sctransform_vignette.html). sctransform normalizes the data, detects high-variance features, and stores the data in the `SCT` assay. ```{r preprocess} brain <- SCTransform(brain, assay = "Spatial", verbose = FALSE) @@ -265,12 +265,12 @@ p1 + p2 ## Integration with single-cell data At ~50um, spots from the visium assay will encompass the expression profiles of multiple cells. For the growing list of systems where scRNA-seq data is available, users may be interested to 'deconvolute' each of the spatial voxels to predict the underlying composition of cell types. In preparing this vignette, we tested a wide variety of decovonlution and integration methods, using a [reference scRNA-seq dataset](https://www.nature.com/articles/nn.4216) of ~14,000 adult mouse cortical cell taxonomy from the Allen Institute, generated with the SMART-Seq2 protocol. -We consistently found superior performance using integration methods (as opposed to deconvolution methods), likely because of substantially different noise models that characterize spatial and single-cell datasets, and integration methods are specifiically designed to be robust to these differences. We therefore apply the 'anchor'-based integration workflow introduced in [Seurat v3](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8), that enables the probabilistic transfer of annotations from a reference to a query set. We therefore follow the label transfer workflow introduced [here](reference_mapping.html), taking advantage of sctransform normalization, but anticipate new methods to be developed to accomplish this task. +We consistently found superior performance using integration methods (as opposed to deconvolution methods), likely because of substantially different noise models that characterize spatial and single-cell datasets, and integration methods are specifiically designed to be robust to these differences. We therefore apply the 'anchor'-based integration workflow introduced in [Seurat v3](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8), that enables the probabilistic transfer of annotations from a reference to a query set. We therefore follow the label transfer workflow introduced [here](integration_mapping.html), taking advantage of sctransform normalization, but anticipate new methods to be developed to accomplish this task. We first load the data (download available [here](https://www.dropbox.com/s/cuowvm4vrf65pvq/allen_cortex.rds?dl=1)), pre-process the scRNA-seq reference, and then perform label transfer. The procedure outputs, for each spot, a probabilistic classification for each of the scRNA-seq derived classes. We add these predictions as a new assay in the Seurat object. ```{r sc.data} -allen_reference <- readRDS("../data/allen_cortex.rds") +allen_reference <- readRDS("/brahms/shared/vignette-data/allen_cortex.rds") ``` ```{r sc.data2} @@ -410,7 +410,7 @@ SpatialDimPlot(slide.seq, cells.highlight = CellsByIdentities(object = slide.seq To facilitate cell-type annotation of the Slide-seq dataset, we are leveraging an existing mouse single-cell RNA-seq hippocampus dataset, produced in [Saunders\*, Macosko\*, et al. 2018](https://doi.org/10.1016/j.cell.2018.07.028). The data is available for download as a processed Seurat object [here](https://www.dropbox.com/s/cs6pii5my4p3ke3/mouse_hippocampus_reference.rds?dl=0), with the raw count matrices available on the [DropViz website](http://dropviz.org/). ```{r ref.saunders} -ref <- readRDS("../data/mouse_hippocampus_reference.rds") +ref <- readRDS("/brahms/shared/vignette-data/mouse_hippocampus_reference.rds") ref <- UpdateSeuratObject(ref) ``` @@ -507,7 +507,7 @@ p2 <- SpatialDimPlot(slide.seq, group.by = "second_type") p1 | p2 ``` -```{r save.times, include = TRUE} +```{r save.times, include = FALSE} write.csv(x = t(as.data.frame(all_times)), file = "../output/timings/spatial_vignette_times.csv") ``` diff --git a/vignettes/vignettes_v5_new.yaml b/vignettes/vignettes_v5_new.yaml new file mode 100644 index 000000000..64f51e5a8 --- /dev/null +++ b/vignettes/vignettes_v5_new.yaml @@ -0,0 +1,261 @@ +- category: Introduction + vignettes: + - title: Guided tutorial --- 2,700 PBMCs + name: pbmc3k_tutorial + summary: | + A basic overview of Seurat that includes an introduction to common analytical workflows. + image: pbmc3k_umap.jpg + + - title: Multimodal analysis + name: multimodal_vignette + summary: | + An introduction to working with multi-modal datasets in Seurat. + image: citeseq_plot.jpg + + - title: Visualization + name: visualization_vignette + summary: | + An overview of the visualization capabilities within Seurat. + image: visualization_vignette.jpg + + - title: SCTransform + name: sctransform_vignette + summary: | + Examples of how to perform normalization, feature selection, integration, and differential expression with an updated version of sctransform. + image: assets/sctransform_v2_new.png + + - title: Essential Commands Cheat Sheet + name: essential_commands + summary: | + Reference list of commonly used commands to store, access, explore, and analyze datasets. + image: commands.png + + +- category: scRNA-seq Data Integration + vignettes: + - title: Introduction to scRNA-seq integration + name: integration_introduction + summary: | + An introduction to integrating scRNA-seq datasets in order to identify and compare shared cell types across experiments. + image: pbmc_alignment.jpg + + - title: scRNA-seq Integration + name: seurat5_integration + summary: | + Integrate scRNA-seq datasets using a variety of computational methods. + image: integration_seurat5.jpg + + - title: Mapping and annotating query datasets + name: integration_mapping + summary: | + Learn how to map a query scRNA-seq dataset onto a reference in order to automate the annotation and visualization of query cells. + image: assets/anchorsb_2018.png + +- category: Multi-assay data + vignettes: + - title: Cross-modality Bridge Integration + name: seurat5_integration_bridge + summary: | + Map scATAC-seq onto an scRNA-seq reference using a multi-omic bridge dataset in Seurat v5. + image: bridge_integration.png + + - title: Weighted Nearest Neighbor Analysis + name: weighted_nearest_neighbor_analysis + summary: | + Analyze multimodal single-cell data with weighted nearest neighbor analysis in Seurat v4. + image: weighted_nearest_neighbor_analysis.jpg + + - title: Integrating scRNA-seq and scATAC-seq data + name: atacseq_integration_vignette + summary: | + Annotate, visualize, and interpret an scATAC-seq experiment using scRNA-seq data from the same biological system in Seurat v3. + image: atacseq_integration_vignette.jpg + + - title: Reference Mapping for Multimodal Data + name: multimodal_reference_mapping + summary: | + Analyze query data in the context of multimodal reference atlases. + image: multimodal_reference_mapping.jpg + + - title: Mixscape + name: mixscape_vignette + summary: | + Explore new methods to analyze pooled single-celled perturbation screens. + image: mixscape_vignette.jpg + +- category: Flexible analysis of massively scalable datasets + vignettes: + - title: Unsupervised clustering of 1.3M neurons + name: seurat5_sketch_analysis + summary: | + Analyze a 1.3 million cell mouse brain dataset using on-disk capabilities powered by BPCells. + image: sketch_1p3.png + + - title: Integrating/comparing healthy and diabetic samples + name: ParseBio_sketch_integration + summary: | + Perform sketch integration on a large dataset from Parse Biosciences. + image: sketch.png + + - title: Supervised mapping of 1.5M immune cells + name: COVID_SCTMapping + summary: | + Map PBMC datasets from COVID-19 patients to a healthy PBMC reference. + image: COVID_SCTMapping.png + + - title: BPCells Interaction + name: seurat5_bpcells_interaction_vignette + summary: | + Load and save large on-disk matrices using BPCells. + image: bpcells.png + +- category: Spatial analysis + vignettes: + - title: Analysis of spatial datasets (Imaging-based) + name: seurat5_spatial_vignette_2 + summary: | + Learn to explore spatially-resolved data from multiplexed imaging technologies, including MERSCOPE, Xenium, CosMx SMI, and CODEX. + image: spatial_vignette_2.jpg + + - title: Analysis of spatial datasets (Sequencing-based) + name: spatial_vignette + summary: | + Learn to explore spatially-resolved transcriptomic data with examples from 10x Visium and Slide-seq v2. + image: spatial_vignette_ttr.jpg + +- category: Other + vignettes: + - title: Cell Cycle Regression + name: cell_cycle_vignette + summary: | + Mitigate the effects of cell cycle heterogeneity by computing cell cycle phase scores based on marker genes. + image: cell_cycle_vignette.jpg + + - title: Differential Expression Testing + name: de_vignette + summary: | + Perform differential expression (DE) testing in Seurat using a number of frameworks. + image: assets/de_vignette.png + + - title: Demultiplex Cell Hashing data + name: hashing_vignette + summary: | + Learn how to work with data produced with Cell Hashing. + image: assets/hashing_vignette.png + +- category: Seurat Wrappers + hash: seurat-wrappers + description: | + In order to facilitate the use of community tools with Seurat, we provide the Seurat Wrappers package, which contains code to run other analysis tools on Seurat objects. For the initial release, we provide wrappers for a few packages in the table below but would encourage other package developers interested in interfacing with Seurat to check out our contributor guide [here](https://github.com/satijalab/seurat.wrappers/wiki/Submission-Process).

+ vignettes: + - name: alevin + title: Import alevin counts into Seurat + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/alevin.html + reference: https://doi.org/10.1186/s13059-019-1670-y + citation: Srivastava et. al., Genome Biology 2019 + source: https://github.com/k3yavi/alevin-Rtools + + - name: ALRA + title: Zero-preserving imputation with ALRA + link: https://htmlpreview.github.io/?https://github.com/satijalab/seurat.wrappers/blob/master/docs/alra.html + reference: https://doi.org/10.1101/397588 + citation: Linderman et al, bioRxiv 2018 + source: https://github.com/KlugerLab/ALRA + + - name: CoGAPS + title: Running CoGAPS on Seurat Objects + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/cogaps.html + reference: https://doi.org/10.1016/j.cels.2019.04.004 + citation: Stein-O’Brien et al, Cell Systems 2019 + source: https://www.bioconductor.org/packages/release/bioc/html/CoGAPS.html + + - name: Conos + title: Integration of datasets using Conos + link: https://htmlpreview.github.io/?https://github.com/satijalab/seurat.wrappers/blob/master/docs/conos.html + reference: https://doi.org/10.1038/s41592-019-0466-z + citation: Barkas et al, Nature Methods 2019 + source: https://github.com/hms-dbmi/conos + + - name: fastMNN + title: Running fastMNN on Seurat Objects + link: https://htmlpreview.github.io/?https://github.com/satijalab/seurat.wrappers/blob/master/docs/fast_mnn.html + reference: https://doi.org/10.1038/nbt.4091 + citation: Haghverdi et al, Nature Biotechnology 2018 + source: https://bioconductor.org/packages/release/bioc/html/scran.html + + - name: glmpca + title: Running GLM-PCA on a Seurat Object + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/glmpca.html + reference: https://doi.org/10.1186/s13059-019-1861-6 + citation: Townes et al, Genome Biology 2019 + source: https://github.com/willtownes/glmpca + + - name: Harmony + title: Integration of datasets using Harmony + link: https://htmlpreview.github.io/?https://github.com/satijalab/seurat.wrappers/blob/master/docs/harmony.html + reference: https://doi.org/10.1038/s41592-019-0619-0 + citation: Korsunsky et al, Nature Methods 2019 + source: https://github.com/immunogenomics/harmony + + - name: LIGER + title: Integrating Seurat objects using LIGER + link: https://htmlpreview.github.io/?https://github.com/satijalab/seurat.wrappers/blob/master/docs/liger.html + reference: https://doi.org/10.1016/j.cell.2019.05.006 + citation: Welch et al, Cell 2019 + source: https://github.com/MacoskoLab/liger + + - name: Monocle3 + title: Calculating Trajectories with Monocle 3 and Seurat + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/monocle3.html + reference: https://doi.org/10.1038/s41586-019-0969-x + citation: Cao et al, Nature 2019 + source: https://cole-trapnell-lab.github.io/monocle3 + + - name: Nebulosa + title: Visualization of gene expression with Nebulosa + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/nebulosa.html + reference: https://github.com/powellgenomicslab/Nebulosa + citation: Jose Alquicira-Hernandez and Joseph E. Powell, Under Review + source: https://github.com/powellgenomicslab/Nebulosa + + - name: schex + title: Using schex with Seurat + link: https://htmlpreview.github.io/?https://github.com/satijalab/seurat.wrappers/blob/master/docs/schex.html + reference: https://doi.org/0.1242/dev.173807 + citation: Freytag, R package 2019 + source: https://github.com/SaskiaFreytag/schex + + - name: scVelo + title: Estimating RNA Velocity using Seurat and scVelo + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/scvelo.html + reference: https://doi.org/10.1101/820936 + citation: Bergen et al, bioRxiv 2019 + source: https://scvelo.readthedocs.io/ + + - name: Velocity + title: Estimating RNA Velocity using Seurat + link: https://htmlpreview.github.io/?https://github.com/satijalab/seurat.wrappers/blob/master/docs/velocity.html + reference: 10.1038/s41586-018-0414-6 + citation: La Manno et al, Nature 2018 + source: https://velocyto.org + + - name: CIPR + title: Using CIPR with human PBMC data + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/cipr.html + reference: https://doi.org/10.1186/s12859-020-3538-2 + citation: Ekiz et. al., BMC Bioinformatics 2020 + source: https://github.com/atakanekiz/CIPR-Package + + - name: miQC + title: Running miQC on Seurat objects + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/miQC.html + reference: https://www.biorxiv.org/content/10.1101/2021.03.03.433798v1 + citation: Hippen et. al., bioRxiv 2021 + source: https://github.com/greenelab/miQC + + - name: tricycle + title: Running estimate_cycle_position from tricycle on Seurat Objects + link: http://htmlpreview.github.io/?https://github.com/satijalab/seurat-wrappers/blob/master/docs/tricycle.html + reference: https://doi.org/10.1101/2021.04.06.438463 + citation: Zheng et. al., bioRxiv 2021 + source: https://www.bioconductor.org/packages/release/bioc/html/tricycle.html \ No newline at end of file diff --git a/vignettes/visualization_vignette.Rmd b/vignettes/visualization_vignette.Rmd index dd6988ecc..cf1808f27 100644 --- a/vignettes/visualization_vignette.Rmd +++ b/vignettes/visualization_vignette.Rmd @@ -32,7 +32,7 @@ knitr::opts_chunk$set( time_it = TRUE, error = TRUE ) -options(SeuratData.repo.use = 'satijalab04.nygenome.org') +options(SeuratData.repo.use = 'seurat.nygenome.org') ``` We'll demonstrate visualization techniques in Seurat using our previously computed Seurat object from the 2,700 PBMC tutorial. You can download this dataset from [SeuratData](https://github.com/satijalab/seurat-data) @@ -153,7 +153,7 @@ ggsave(filename = "../output/images/visualization_vignette.jpg", height = 7, wid Seurat utilizes R's plotly graphing library to create interactive plots. This interactive plotting feature works with any ggplot2-based scatter plots (requires a `geom_point` layer). To use, simply make a ggplot2-based scatter plot (such as `DimPlot()` or `FeaturePlot()`) and pass the resulting plot to `HoverLocator()` ```{r hover} -# Include additional data to display alongside cell names by passing in a data frame of information +# Include additional data to display alongside cell names by passing in a data frame of information. # Works well when using FetchData plot <- FeaturePlot(pbmc3k.final, features = 'MS4A1') HoverLocator(plot = plot, information = FetchData(pbmc3k.final, vars = c('ident', 'PC_1', 'nFeature_RNA'))) @@ -161,7 +161,7 @@ HoverLocator(plot = plot, information = FetchData(pbmc3k.final, vars = c('ident' Another interactive feature provided by Seurat is being able to manually select cells for further investigation. We have found this particularly useful for small clusters that do not always separate using unbiased clustering, but which look tantalizingly distinct. You can now select these cells by creating a ggplot2-based scatter plot (such as with `DimPlot()` or `FeaturePlot()`, and passing the returned plot to `CellSelector()`. `CellSelector()` will return a vector with the names of the points selected, so that you can then set them to a new identity class and perform differential expression. -For example, lets pretend that DCs had merged with monocytes in the clustering, but we wanted to see what was unique about them based on their position in the tSNE plot. +For example, let's pretend that DCs had merged with monocytes in the clustering, but we wanted to see what was unique about them based on their position in the tSNE plot. ```{r identify, eval=FALSE} pbmc3k.final <- RenameIdents(pbmc3k.final, "DC" = "CD14+ Mono") @@ -224,6 +224,7 @@ Plotting multiple plots was previously achieved with the `CombinePlot()` functio ```{r combining_plots, fig.height = 5} plot1 <- DimPlot(pbmc3k.final) +# Create scatter plot with the Pearson correlation value as the title plot2 <- FeatureScatter(pbmc3k.final, feature1 = 'LYZ', feature2 = 'CCL5') # Combine two plots plot1 + plot2 diff --git a/vignettes/weighted_nearest_neighbor_analysis.Rmd b/vignettes/weighted_nearest_neighbor_analysis.Rmd index 68e155d1f..aa1e98997 100644 --- a/vignettes/weighted_nearest_neighbor_analysis.Rmd +++ b/vignettes/weighted_nearest_neighbor_analysis.Rmd @@ -37,18 +37,13 @@ This vignette introduces the WNN workflow for the analysis of multimodal single- We demonstrate the use of WNN analysis to two single-cell multimodal technologies: CITE-seq and 10x multiome. We define the cellular states based on both modalities, instead of either individual modality. - # WNN analysis of CITE-seq, RNA + ADT We use the CITE-seq dataset from ([Stuart\*, Butler\* et al, Cell 2019](https://www.cell.com/cell/fulltext/S0092-8674(19)30559-8)), which consists of 30,672 scRNA-seq profiles measured alongside a panel of 25 antibodies from bone marrow. The object contains two assays, RNA and antibody-derived tags (ADT). -To run this vignette please install Seurat v4, available on [CRAN](https://cran.r-project.org/web/packages/Seurat/index.html), and SeuratData, available on [GitHub](https://github.com/satijalab/seurat-data). - -```{r install, eval = FALSE} -install.packages("Seurat") -``` +To run this vignette please install SeuratData, available on [GitHub](https://github.com/satijalab/seurat-data). ```{r, include = FALSE, cache=FALSE} -options(SeuratData.repo.use = "http://satijalab04.nygenome.org") +options(SeuratData.repo.use = "http://seurat.nygenome.org") ``` ```{r packages, cache=FALSE} @@ -163,8 +158,8 @@ You can download the dataset from the 10x Genomics website [here](https://suppor Finally, in order to run the vignette, make sure the following packages are installed: -* [Seurat v4](install.html) -* [Signac](https://satijalab.org/signac/) for the analysis of single-cell chromatin datasets +* [Seurat](install.html) +* [Signac](https://stuartlab.org/signac/) for the analysis of single-cell chromatin datasets * [EnsDb.Hsapiens.v86](https://bioconductor.org/packages/release/data/annotation/html/EnsDb.Hsapiens.v86.html) for a set of annotations for hg38 * [dplyr](https://cran.r-project.org/web/packages/dplyr/index.html) to help manipulate data tables @@ -178,7 +173,7 @@ library(ggplot2) -We'll create a Seurat object based on the gene expression data, and then add in the ATAC-seq data as a second assay. You can explore the [Signac getting started vignette](https://satijalab.org/signac/articles/pbmc_vignette.html) for more information on the creation and processing of a ChromatinAssay object. +We'll create a Seurat object based on the gene expression data, and then add in the ATAC-seq data as a second assay. You can explore the [Signac getting started vignette](https://stuartlab.org/signac/articles/pbmc_vignette.html) for more information on the creation and processing of a ChromatinAssay object. ```{r CreateObject} # the 10x hdf5 file contains both data types. @@ -253,7 +248,7 @@ pbmc <- RunUMAP(pbmc, nn.name = "weighted.nn", reduction.name = "wnn.umap", redu pbmc <- FindClusters(pbmc, graph.name = "wsnn", algorithm = 3, verbose = FALSE) ``` -We annotate the clusters below. Note that you could also annotate the dataset using our supervised mapping pipelines, using either our [vignette](multimodal_reference_mapping.html), or [automated web tool, Azimuth](www.satijalab.org/azimuth). +We annotate the clusters below. Note that you could also annotate the dataset using our supervised mapping pipelines, using either our [vignette](multimodal_reference_mapping.html), or [automated web tool, Azimuth](https://satijalab.org/azimuth). ```{r Annotate, results = 'hide'} # perform sub-clustering on cluster 6 to find additional structure @@ -283,7 +278,7 @@ p3 <- DimPlot(pbmc, reduction = "wnn.umap", group.by = "celltype", label = TRUE, p1 + p2 + p3 & NoLegend() & theme(plot.title = element_text(hjust = 0.5)) ``` -For example, the ATAC-seq data assists in the separation of CD4 and CD8 T cell states. This is due to the presence of multiple loci that exhibit differential accessibility between different T cell subtypes. For example, we can visualize 'pseudobulk' tracks of the CD8A locus alongside violin plots of gene expression levels, using tools in the [Signac visualization vignette](https://satijalab.org/signac/articles/visualization.html). +For example, the ATAC-seq data assists in the separation of CD4 and CD8 T cell states. This is due to the presence of multiple loci that exhibit differential accessibility between different T cell subtypes. For example, we can visualize 'pseudobulk' tracks of the CD8A locus alongside violin plots of gene expression levels, using tools in the [Signac visualization vignette](https://stuartlab.org/signac/articles/visualization.html). ```{r coverageplotcd8, fig.width=10} ## to make the visualization easier, subset T cell clusters @@ -293,7 +288,7 @@ tcells <- subset(pbmc, idents = tcell.names) CoveragePlot(tcells, region = 'CD8A', features = 'CD8A', assay = 'ATAC', expression.assay = 'SCT', peaks = FALSE) ``` -Next, we will examine the accessible regions of each cell to determine enriched motifs. As described in the [Signac motifs vignette](https://satijalab.org/signac/articles/motif_vignette.html), there are a few ways to do this, but we will use the [chromVAR](https://www.nature.com/articles/nmeth.4401) package from the Greenleaf lab. This calculates a per-cell accessibility score for known motifs, and adds these scores as a third assay (`chromvar`) in the Seurat object. +Next, we will examine the accessible regions of each cell to determine enriched motifs. As described in the [Signac motifs vignette](https://stuartlab.org/signac/articles/motif_vignette.html), there are a few ways to do this, but we will use the [chromVAR](https://www.nature.com/articles/nmeth.4401) package from the Greenleaf lab. This calculates a per-cell accessibility score for known motifs, and adds these scores as a third assay (`chromvar`) in the Seurat object. To continue, please make sure you have the following packages installed.