Pseduo-bulk Differential Expression on Integrated Data #8507

IzaakH-A1 · 2024-02-20T18:53:08Z

IzaakH-A1
Feb 20, 2024

Hi, I am following this Vignette on differential gene expression analysis - https://satijalab.org/seurat/articles/de_vignette.

I am successful up to the point of pseudo-bulk analysis - i.e. I have been able to do non bulk analysis. I have two conditions HEB knock-out and WT. I want to compare the differential genes in cluster 2 (and other clusters) between the two conditions.

However, I am running into a lot of issues with the aggregation step.

This is what I have included for the pseudo-bulk De in my script:

pseudo_bulk_data <- AggregateExpression(integrated_data, assays = "RNA", return.seurat = T, group.by = c("seurat_clusters", "orig.ident"))

tail(Cells(pseudo_bulk_data))

pseudo_bulk_data$combined_ident <- paste(pseudo_bulk_data$seurat_clusters, pseudo_bulk_data$orig.ident, sep = "_")

Idents(pseudo_bulk_data) <- "combined_ident"

bulk.mono.de <- FindMarkers(object = pseudo_bulk_data, 
                            ident.1 = "2_FoxP3_HEB_KO", 
                            ident.2 = "2_WT",
                            test.use = "DESeq2")

But when I look at pseudo_bulk_data the metadata only contains the orig.ident in the metadata, not the seaurat_clusters. So therefore the next line isn't going to work, since seurat_clusters and orig.ident (not properly at least) aren't defined in the pseudo_bulk_data object. The orig.ident seems to be 0-9 (10 clusters, which is what I have); but shouldn't the orig.ident be WT or HEB_KO? So I am assuming something is going wrong with the aggregation.

Here is my whole script (below), I would greatly appreciate advice on how to modify this for my indented goal. Thank you in advance.

# Load libraries
library(Seurat)
library(SeuratData)
library(patchwork)
library(sctransform)
library(ggplot2)
library(harmony)
library(loupeR)
library(dplyr)
library(openxlsx)
library(DeSeq2)


# Define file paths
foxp3_heb_ko_1_file <- "/Volumes/SHARED/[01] Seq data/5. Single_Cell-Seq/Exp_David/10x Cell Ranger Anaylsis/GEX_HEB_KO_1_10x/filtered_feature_bc_matrix.h5"

wt_file_1 <- "/Volumes/SHARED/[01] Seq data/5. Single_Cell-Seq/Exp_Osman/B6 - From Archive - Cell Ranger ONLY/Dup 1 - Cell Ranger /filtered_feature_bc_matrix.h5"

# Read data
foxp3_heb_ko_1.data <- Read10X_h5(filename = foxp3_heb_ko_1_file)

wt_1.data <- Read10X_h5(filename = wt_file_1)

# Create Seurat objects
foxp3_heb_ko_1 <- CreateSeuratObject(counts = foxp3_heb_ko_1.data, project = "FoxP3_HEB_KO", min.cells = 3, min.features = 200)

wt_1 <- CreateSeuratObject(counts = wt_1.data, project = "WT", min.cells = 3, min.features = 200)

#QC on Seurat Objects 

foxp3_heb_ko_1[["percent.mt"]] <- PercentageFeatureSet(foxp3_heb_ko_1, pattern = "^mt-")

wt_1[["percent.mt"]] <- PercentageFeatureSet(wt_1, pattern = "^mt-")

foxp3_heb_ko_1 <- subset(foxp3_heb_ko_1, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)

wt_1 <- subset(wt_1, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)


#Normalize 
foxp3_heb_ko_1 <- NormalizeData(foxp3_heb_ko_1)

wt_1 <- NormalizeData(wt_1)

#Find variable features 
foxp3_heb_ko_1 <- FindVariableFeatures(foxp3_heb_ko_1)

wt_1 <- FindVariableFeatures(wt_1)

top10_foxp3 <- head(VariableFeatures(foxp3_heb_ko_1), 10)

top10_wt <- top10 <- head(VariableFeatures(wt_1), 10)

#Merge into one seurat object 
integrated_data <- merge(
  x = foxp3_heb_ko_1, 
  y = list(wt_1), 
  add.cell.ids = c("foxp3_heb_ko", "wt")
)



# Scale the data and run PCA
integrated_data <- ScaleData(integrated_data)
integrated_data <- RunPCA(integrated_data)

# Integration of datasets

integrated_data <- IntegrateLayers(
  object = integrated_data, method = HarmonyIntegration,
  orig.reduction = "pca", new.reduction = "harmony",
  verbose = FALSE
)

# Find Neighbors
integrated_data <- FindNeighbors(integrated_data, reduction = "harmony", dims = 1:30)

# Find Clusters
integrated_data <- FindClusters(integrated_data, resolution = 0.25, cluster.name = "harmony_clusters")

# Insert the cluster_cell_counts code block here
cluster_cell_counts <- data.frame(Sample = character(), Cluster = integer(), NumberOfCells = integer(), stringsAsFactors = FALSE)

# Get unique samples
samples <- unique(integrated_data$orig.ident)

# Loop through each sample
for (sample in samples) {
  # Subset the integrated data for the current sample
  sample_data <- subset(integrated_data, subset = orig.ident == sample)
  
  # Find the number of cells per cluster within the current sample
  cluster_counts <- table(Idents(sample_data))
  
  # Create a data frame for the current sample
  sample_cluster_counts <- data.frame(
    Sample = rep(sample, length(cluster_counts)),
    Cluster = as.integer(names(cluster_counts)),
    NumberOfCells = as.integer(cluster_counts),
    stringsAsFactors = FALSE
  )
  
  # Append the sample_cluster_counts to the cluster_cell_counts data frame
  cluster_cell_counts <- rbind(cluster_cell_counts, sample_cluster_counts)
}

write.csv(cluster_cell_counts, "/Users/M300189/Downloads/cells_per_cluster2.csv", row.names = FALSE)


# Run UMAP using Harmony embeddings

integrated_data <- RunUMAP(integrated_data, reduction = "harmony", dims = 1:30, reduction.name = "harmony")

# Visualization
DimPlot(integrated_data, reduction = "harmony", group.by = "orig.ident") + ggtitle("UMAP with Harmony Integration")

DimPlot(integrated_data, reduction = "harmony", group.by = "orig.ident", split.by = "orig.ident") + NoLegend()

DimPlot(integrated_data, reduction = "harmony", group.by = "harmony_clusters") + ggtitle("UMAP with Clusters")

DimPlot(integrated_data, reduction = "harmony", group.by = "harmony_clusters", split.by = "orig.ident")


# Differential Expression for Clusters against ALL other clusters 

integrated_data <- JoinLayers(integrated_data)

markers <- FindAllMarkers(integrated_data, only.pos = TRUE)

markers %>%
  group_by(cluster) %>%
  dplyr::filter(avg_log2FC > 1)

write.csv(markers, "/Users/M300189/Downloads/markers_heb_v_WT.csv")

#DE Analysis between clusters

cluster5v3.markers <- FindMarkers(integrated_data, ident.1 = 5, ident.2 = 3)

write.csv(cluster5v3.markers, "/Users/M300189/Downloads/cluster5v3.markers.csv")

cluster5v2.markers <- FindMarkers(integrated_data, ident.1 = 5, ident.2 = 2)

write.csv(cluster5v2.markers, "/Users/M300189/Downloads/cluster5v2.markers.csv")

cluster3v2.markers <- FindMarkers(integrated_data, ident.1 = 3, ident.2 = 2)

write.csv(cluster3v2.markers, "/Users/M300189/Downloads/cluster3v2.markers.csv")

#DE Analysis between Conditions (WITHOUT Pseduobulk)

integrated_data$combined_idents <- paste(integrated_data$seurat_clusters, integrated_data$orig.ident, sep = "_")

Idents(integrated_data) <- "combined_idents"

# Subset to include only WT and HEB_KO samples

de_results <- FindMarkers(integrated_data, ident.1 = "0_FoxP3_HEB_KO", ident.2 = "2_WT", verbose = FALSE)

#DE Differential Expression WITH Pseudo-bulking (recommended)

pseudo_bulk_data <- AggregateExpression(integrated_data, assays = "RNA", return.seurat = T, group.by = c("seurat_clusters", "orig.ident"))

tail(Cells(pseudo_bulk_data))

pseudo_bulk_data$combined_ident <- paste(pseudo_bulk_data$seurat_clusters, pseudo_bulk_data$orig.ident, sep = "_")

Idents(pseudo_bulk_data) <- "combined_ident"

bulk.mono.de <- FindMarkers(object = pseudo_bulk_data, 
                            ident.1 = "2_FoxP3_HEB_KO", 
                            ident.2 = "2_WT",
                            test.use = "DESeq2")

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Pseduo-bulk Differential Expression on Integrated Data #8507

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Replies: 0 comments

Select a reply

Pseduo-bulk Differential Expression on Integrated Data #8507

IzaakH-A1 Feb 20, 2024

Replies: 0 comments

IzaakH-A1
Feb 20, 2024