knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
MacBook Pro (Big Sur, 16-inch, 2019), Processor (2.4 GHz 8-Core Intel Core i9), Memory (64 GB 2667 MHz DDR4).
Attach necessary libraries:
library(ASURAT) library(SingleCellExperiment) library(SummarizedExperiment)
Create random variables following Normal distributions and compute separation indices.
size <- 1000 for(i in c(3, 4, 5, 9)){ # Preparation set.seed(1) scores_0 <- as.numeric(rnorm(size, mean = 5, sd = 1)) names(scores_0) <- seq_len(size) scores_1 <- as.numeric(rnorm(size, mean = 1 + 1 * (i - 1), sd = 1)) names(scores_1) <- size + seq_len(size) scores <- c(scores_0, scores_1) scores <- sort(scores, decreasing = FALSE) vec <- ifelse(as.integer(names(scores)) <= size, 0, 1) # Count the number of steps in bubble sort. dist1 <- bubble_sort(list(vec, 0))[[2]] # dist(vec, (0,...,1)). dist2 <- bubble_sort(list(1 - vec, 0))[[2]] # dist(vec, (1,...,0)). sepI <- round((dist2 - dist1) / (dist2 + dist1), digits = 6) # Plot the result. df <- data.frame(s = scores, l = vec) mytext <- paste("I = ", round(sepI, digits = 3), sep = "") mycolors <- c(rainbow(3)[1], rainbow(3)[3]) color <- factor(df$l, levels = c(1, 0)) p <- ggplot2::ggplot(df, ggplot2::aes(x = s, y = 0, color = color)) + ggplot2::geom_jitter(width = 0, alpha = 0.4, size = .5) + ggplot2::theme_classic(base_size = 18) + ggplot2::scale_y_continuous(breaks = NULL) + ggplot2::scale_colour_manual(values = mycolors) + ggplot2::labs(title = "", x = "Score", y = "", color = "Label") + ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5, vjust = -1, size = 16)) + ggplot2::annotate("text", x = Inf, y = Inf, label = mytext, hjust = 1, vjust = 1, size = 5) + ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4))) p <- ggExtra::ggMarginal(p, type = "density", margins = "x", size = 0.8, groupColour = TRUE, groupFill = TRUE) filename <- sprintf("figures/figure_90_%04d.png", 19 + i) ggplot2::ggsave(file = filename, plot = p, dpi = 50, width = 4, height = 3) }
Select GO IDs of interest.
GOOI <- c("GO:0070371", # ERK1 and ERK2 cascade "GO:1904666") # regulation of ubiquitin protein ligase activity
Load the result of the section "Normalize data" in SCLC data analyses.
sclc <- readRDS("backup/01_003_sclc_normalized.rds")
sclc <- readRDS("<file path>")
Load the result of the section "Compute correlation matrices" in SCLC data analyses.
cormat <- readRDS("backup/01_003_sclc_cormat.rds")
cormat <- readRDS("<file path>")
Load databases.
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/" load(url(paste0(urlpath, "20201213_human_GO_red.rda?raw=TRUE"))) # GO
The reformatted knowledge-based data were available from the following repositories:
Add formatted databases to metadata(sce)$sign
.
sclcs <- list(GO = sclc) metadata(sclcs$GO) <- list(sign = human_GO[["BP"]])
Redefines functional gene sets for the input database by removing genes,
which are not included in rownames(sce)
, and further removes biological terms
including too few or too many genes.
sclcs$GO <- remove_signs(sce = sclcs$GO, min_ngenes = 2, max_ngenes = 1000)
Clusters functional gene sets using a correlation graph-based decomposition method, producing strongly, variably, and weakly correlated gene sets (SCG, VCG, and WCG, respectively).
set.seed(1) sclcs$GO <- cluster_genesets(sce = sclcs$GO, cormat = cormat, th_posi = 0.20, th_nega = -0.20)
Select IDs and prepare a correlation submatrix.
goid <- GOOI[1] df <- metadata(sclcs$GO)$sign goi_strg <- unlist(strsplit(df[df$SignID == goid, ]$StrgCorrGene, "/")) goi_vari <- unlist(strsplit(df[df$SignID == goid, ]$VariCorrGene, "/")) goi_weak <- unlist(strsplit(df[df$SignID == goid, ]$WeakCorrGene, "/")) goi_strg <- goi_strg[seq_len(6)] goi_weak <- goi_weak[seq_len(10)] goi_all <- c(goi_weak[1], goi_weak[2], goi_strg[1:6], goi_weak[3:7], goi_vari[1:2], goi_weak[8:10]) #goi_all <- unique(c(goi_strg, goi_vari, goi_weak)) cmat <- cormat[goi_all, goi_all] diag(cmat) <- 0
Plot the correlation graph.
label <- goi_all label[which(goi_all %in% goi_strg)] <- 1 label[which(goi_all %in% goi_vari)] <- 2 label[which(goi_all %in% goi_weak)] <- 3 colors <- label colors[which(colors == 1)] <- "grey20" colors[which(colors == 2)] <- "grey70" colors[which(colors == 3)] <- "white" text_colors <- label text_colors[which(text_colors == "1")] <- "white" text_colors[which(text_colors == "2")] <- "black" text_colors[which(text_colors == "3")] <- "black" node_sizes <- 9 title <- "GO:0070371 (ERK1 and ERK2 cascade)" filename <- "figures/figure_90_0001" qgraph::qgraph(input = cmat, title = title, title.cex = 0.3, filename = filename, filetype = "png", height = 1, width = 1, color = colors, vsize = node_sizes, label.color = text_colors, border.width = 3, edge.labels = FALSE, posCol = "red", negCol = "blue", threshold = 0)
where the average correlation coefficients of SCG, VCG, and WCG are 0.3389764, 0.2280671, and 0.011150431, respectively.
Perform the following ASURAT functions.
# Keep only GO terms of interest. df <- metadata(sclcs$GO)$sign df <- df[which(df$SignID %in% GOOI), ] metadata(sclcs$GO)$sign <- df # Creates signs. sclcs$GO <- create_signs(sce = sclcs$GO, min_cnt_strg = 2, min_cnt_vari = 2) # Create sign-by-sample matrices. sclcs$GO <- makeSignMatrix(sce = sclcs$GO, weight_strg = 0.5, weight_vari = 0.5)
# Save data. saveRDS(sclcs, file = "backup/90_001_sclc_asurat.rds") # Load data. sclcs <- readRDS("backup/90_001_sclc_asurat.rds")
Install an escape package v1.5, which requires R (>= 4.1).
# devtools::install_github("ncborcherding/escape") packageVersion("escape")
[1] ‘1.5.1’
Set a GO term of interest.
GOOI <- c("GOBP_ERK1_AND_ERK2_CASCADE", "GOBP_REGULATION_OF_UBIQUITIN_PROTEIN_LIGASE_ACTIVITY")
Load the result of the section "Normalize data" in SCLC data analyses.
sclc <- readRDS("backup/01_003_sclc_normalized.rds")
sclc <- readRDS("<file path>")
Create Seurat objects.
sclc <- Seurat::CreateSeuratObject(counts = as.matrix(assay(sclc, "normalized")), project = "SCLC")
Perform getGeneSets()
with an argument library = "C5"
("ontology gene sets" in
MSigDB) and
select GO terms of interest.
sclc@misc[["getGeneSets"]] <- escape::getGeneSets(species = "Homo sapiens", library = "C5") df <- sclc@misc[["getGeneSets"]] df <- df[grepl("GOBP", names(df))] sclc@misc[["getGeneSets"]] <- df sclc@misc[["GO_TERMS"]] <- df[grepl(paste(GOOI, collapse="|"), names(df))]
Perform enrichIt()
, estimating ssGSEA scores, in which the arguments are
the same with those in the vignettes in escape package.
set.seed(1) ES <- escape::enrichIt(obj = sclc, gene.sets = sclc@misc[["GO_TERMS"]], groups = 1000, cores = 4) sclc@misc[["enrichIt"]] <- ES
[1] "Using sets of 1000 cells. Running 3 times." Setting parallel calculations through a SnowParam back-end with workers=4 and tasks=100. Estimating ssGSEA scores for 2 gene sets. [1] "Calculating ranks..." [1] "Calculating absolute values from ranks..." Setting parallel calculations through a SnowParam back-end with workers=4 and tasks=100. Estimating ssGSEA scores for 2 gene sets. [1] "Calculating ranks..." [1] "Calculating absolute values from ranks..." Setting parallel calculations through a SnowParam back-end with workers=4 and tasks=100. Estimating ssGSEA scores for 2 gene sets. [1] "Calculating ranks..." [1] "Calculating absolute values from ranks..."
# Save data. saveRDS(sclc, file = "backup/90_002_sclc_ssGSEA.rds") # Load data. sclc <- readRDS("backup/90_002_sclc_ssGSEA.rds")
packageVersion("pagoda2")
[1] ‘1.0.10’
Set GO terms of interest.
GOOI <- c("GO:0070371", # ERK1 and ERK2 cascade "GO:1904666") # regulation of ubiquitin protein ligase activity
Load the result of the section "Normalize data" in SCLC data analyses.
sclc <- readRDS("backup/01_003_sclc_normalized.rds")
sclc <- readRDS("<file path>")
Create a pagoda2 object.
mat <- as.matrix(assay(sclc, "normalized")) sclc <- pagoda2::Pagoda2$new(x = mat, modelType = "plain", n.cores = 1, log.scale = TRUE, min.cells.per.gene = 0)
2283 cells, 6346 genes; normalizing ... Using plain model log scale ... done.
Adjust the variance.
sclc$adjustVariance(plot = FALSE)
calculating variance fit ... using gam 1278 overdispersed genes ... 1278 persisting ... done.
Perform principal component analysis, where n.odgenes
is the number of
overdispersed genes.
sclc$calculatePcaReduction(nPcs = 50, n.odgenes = 1278)
running PCA using 1278 OD genes . . . . done
Perform pathway overdispersion analysis. Here, computations were partially performed on the IPR server at Institute for Protein Research, Osaka University.
# Translate gene names to ids. suppressMessages(library(org.Hs.eg.db)) ids <- unlist(lapply(mget(colnames(sclc$counts), org.Hs.egALIAS2EG, ifnotfound = NA), function(x) x[1])) # Reverse map rids <- names(ids) names(rids) <- ids # List all the ids per GO category. go.env <- list2env(eapply(org.Hs.egGO2ALLEGS, function(x) as.character(na.omit(rids[x])))) # Remove unused GO terms. goterms <- names(go.env) for(i in seq_along(goterms)){ if(goterms[i] %in% GOOI){ next }else{ rlang::env_unbind(go.env, goterms[i]) } } # Pathway overdispersion analysis sclc$testPathwayOverdispersion(setenv = go.env, verbose = TRUE, correlation.distance.threshold = 0.8, min.pathway.size = 1, max.pathway.size = 100, recalculate.pca = FALSE)
determining valid pathways processing 2 valid pathways scoring pathway od signifcance compiling pathway reduction clustering aspects based on gene loading ... 2 aspects remaining clustering aspects based on pattern similarity ... 2 aspects remaining
# Save data. saveRDS(sclc, file = "backup/90_003_sclc_pagoda2.rds") # Load data. sclc <- readRDS("backup/90_003_sclc_pagoda2.rds")
Load the above computational results.
sclc_asurat <- readRDS("backup/90_001_sclc_asurat.rds") sclc_ssgsea <- readRDS("backup/90_002_sclc_ssGSEA.rds") sclc_pagoda <- readRDS("backup/90_003_sclc_pagoda2.rds")
sclc_asurat <- readRDS("<file path>") sclc_ssgsea <- readRDS("<file path>") sclc_pagoda <- readRDS("<file path>")
Create score-by-sample (cell) matrices.
mat_asurat <- as.matrix(assay(sclc_asurat$GO, "counts")) mat_ssgsea <- t(as.matrix(sclc_ssgsea@misc[["enrichIt"]])) mat_pagoda <- sclc_pagoda[["misc"]][["pathwayOD"]][["xv"]] rownames(mat_ssgsea) <- c("GO:0070371", "GO:1904666") rownames(mat_pagoda) <- c(sclc_pagoda[["misc"]][["pathwayOD"]][["cnam"]][["aspect1"]], sclc_pagoda[["misc"]][["pathwayOD"]][["cnam"]][["aspect2"]]) # Select rows. submat_asurat <- as.matrix(mat_asurat[c(1, 3), ]) submat_ssgsea <- as.matrix(mat_ssgsea[1, ]) submat_pagoda <- as.matrix(mat_pagoda[2, ]) submat_ssgsea <- t(submat_ssgsea) ; rownames(submat_ssgsea) <- rownames(mat_ssgsea)[1] submat_pagoda <- t(submat_pagoda) ; rownames(submat_pagoda) <- rownames(mat_pagoda)[2] # Select columns by random sampling. set.seed(1) inds <- sample(ncol(mat_asurat), size = 1000, replace = FALSE) submat_asurat <- as.matrix(submat_asurat[, inds]) submat_ssgsea <- as.matrix(submat_ssgsea[, inds]) submat_pagoda <- as.matrix(submat_pagoda[, inds]) submat_ssgsea <- t(submat_ssgsea) ; rownames(submat_ssgsea) <- rownames(mat_ssgsea)[1] submat_pagoda <- t(submat_pagoda) ; rownames(submat_pagoda) <- rownames(mat_pagoda)[2] # Scale data. submat_asurat <- t(scale(t(submat_asurat))) submat_ssgsea <- t(scale(t(submat_ssgsea))) submat_pagoda <- t(scale(t(submat_pagoda)))
Create gene expression matrices.
genes <- c("JUN", "MIF") mat_geneex <- assay(altExp(sclc_asurat$GO), "counts") mat_geneex <- mat_geneex[genes, ] # Select columns. submat_geneex <- as.matrix(mat_geneex[, inds]) # Scale data. submat_geneex <- t(scale(t(submat_geneex)))
Examine the scores of samples (cells).
suppressMessages(library(ComplexHeatmap)) filename <- "figures/figure_90_0010.png" png(file = filename, height = 120, width = 400, res = 80) #png(file = filename, height = 600, width = 2000, res = 400) p <- ComplexHeatmap::Heatmap(submat_asurat, column_title = "ASURAT", name = "Scaled\nSign scores", cluster_rows = FALSE, show_row_names = TRUE, row_names_side = "right", show_row_dend = FALSE, show_column_names = FALSE, column_dend_side = "top", show_parent_dend_line = FALSE) q <- ComplexHeatmap::Heatmap(submat_geneex, name = "Scaled\nExpression", cluster_rows = FALSE, show_row_names = TRUE, row_names_side = "right", show_row_dend = FALSE, show_column_names = FALSE, show_parent_dend_line = FALSE) p <- p %v% q p dev.off() filename <- "figures/figure_90_0011.png" png(file = filename, height = 120, width = 400, res = 80) #png(file = filename, height = 600, width = 2000, res = 400) p <- ComplexHeatmap::Heatmap(submat_ssgsea, column_title = "ssGSEA", name = "Scaled\nssGSEA score", cluster_rows = FALSE, show_row_names = TRUE, row_names_side = "right", show_row_dend = FALSE, show_column_names = FALSE, column_dend_side = "top", show_parent_dend_line = FALSE) q <- ComplexHeatmap::Heatmap(submat_geneex, name = "Scaled\nExpression", cluster_rows = FALSE, show_row_names = TRUE, row_names_side = "right", show_row_dend = FALSE, show_column_names = FALSE, show_parent_dend_line = FALSE) p <- p %v% q p dev.off() filename <- "figures/figure_90_0012.png" png(file = filename, height = 120, width = 440, res = 80) #png(file = filename, height = 620, width = 2200, res = 400) p <- ComplexHeatmap::Heatmap(submat_pagoda, column_title = "PAGODA2", name = "Scaled\nPC1 score", cluster_rows = FALSE, show_row_names = TRUE, row_names_side = "right", show_row_dend = FALSE, show_column_names = FALSE, column_dend_side = "top", show_parent_dend_line = FALSE) q <- ComplexHeatmap::Heatmap(submat_geneex, name = "Scaled\nExpression", cluster_rows = FALSE, show_row_names = TRUE, row_names_side = "right", show_row_dend = FALSE, show_column_names = FALSE, show_parent_dend_line = FALSE) p <- p %v% q p dev.off()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.