This vignette covers the main function and workflow of SCAVENGE. The standard processed input data including fine-mapped variants and single-cell epigenomic profiles. For fine-mapped variants of the trait of interest, we typically need information of genomic locations of variants and their corresponding posterior propability of causality. A peak-by-cell matrix of scATAC-seq profiles is needed. To walk through the workflow of SCAVENGE, we provided a blood cell trait of monocyte count and a 10X PBMC dataset as an example.
library(SCAVENGE) library(chromVAR) library(gchromVAR) library(BuenColors) library(SummarizedExperiment) library(data.table) library(dplyr) library(BiocParallel) library(BSgenome.Hsapiens.UCSC.hg19) library(igraph) set.seed(9527)
The PBMC data was processed using ArchR package. The peak-by-cell count matrix and corresponding meta data were extracted and stored in a RangedSummarizedExperiment object (for more details please follow our paper). The typical input of fine-mapped data for SCAVENGE or g-chromVAR can be found here (https://github.com/sankaranlab/SCAVENGE-reproducibility/tree/main/data/finemappedtraits_hg19). You can use this (trait_import <- importBedScore(rowRanges(SE_pbmc5k), trait_file, colidx=5)) to import your fine-mapped data into SummarizedExperiment when you have your data ready (trait_file is the path of your file).
trait_import <- example_data(name="mono.PP001.bed") SE_pbmc5k <- example_data(name="pbmc5k_SE.rda")
SE_pbmc5k <- addGCBias(object = SE_pbmc5k, genome = BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19) SE_pbmc5k_bg <- getBackgroundPeaks(object = SE_pbmc5k, niterations = 200) SE_pbmc5k_DEV <- computeWeightedDeviations(object = SE_pbmc5k, weights = trait_import, background_peaks = SE_pbmc5k_bg)
Reformat results
z_score_mat <- data.frame(SummarizedExperiment::colData(SE_pbmc5k), z_score=t(SummarizedExperiment::assays(SE_pbmc5k_DEV)[["z"]]) |> c()) head(z_score_mat)
seed_idx <- seedindex(z_score = z_score_mat$z_score, percent_cut = 0.05)
calculate scale factor
scale_factor <- cal_scalefactor(z_score = z_score_mat$z_score, percent_cut = 0.01)
Calculate tfidf-mat
peak_by_cell_mat <- SummarizedExperiment::assay(SE_pbmc5k) tfidf_mat <- tfidf(bmat=peak_by_cell_mat, mat_binary=TRUE, TF=TRUE, log_TF=TRUE)
Calculate lsi-mat
lsi_mat <- do_lsi(mat = tfidf_mat, dims = 30)
Please be sure that there is no potential batch effects for cell-to-cell graph construction. If the cells are from different samples or different conditions etc., please consider using Harmony analysis (HarmonyMatrix
from Harmony package). Typically you could take the lsi_mat as the input with parameter do_pca = FALSE
and provide meta data describing extra data such as sample and batch for each cell. Finally, a harmony-fixed LSI matrix can be used as input for the following analysis.
Calculate m-knn graph
mutualknn30 <- getmutualknn(lsimat = lsi_mat, num_k = 30)
np_score <- randomWalk_sparse(intM = mutualknn30, queryCells = rownames(mutualknn30)[seed_idx], gamma = 0.05)
Trait relevant score (TRS) with scaled and normalized
A few cells are singletons are removed from further analysis, this will lead very few cells be removed for the following analysis. You can always recover those cells with a unified score of 0 and it will not impact the following analysis.
omit_idx <- np_score==0 sum(omit_idx) mutualknn30 <- mutualknn30[!omit_idx, !omit_idx] np_score <- np_score[!omit_idx] TRS <- capOutlierQuantile(x = np_score, q_ceiling = 0.95) |> max_min_scale() TRS <- TRS * scale_factor mono_mat <- data.frame(z_score_mat[!omit_idx, ], seed_idx[!omit_idx], np_score, TRS) head(mono_mat)
Cell type annotation
p <- ggplot(data=mono_mat, aes(x, y, color=color)) + geom_point(size=1, na.rm = TRUE) + pretty_plot() + theme(legend.title = element_blank()) + labs(x="UMAP 1",y="UMAP 2") p
Visualize cell-to-cell graph if you have low-dimensional coordinates such as UMAP1 and UMAP2
mutualknn30_graph <- graph_from_adjacency_matrix(adjmatrix = mutualknn30, mode = "undirected", diag = FALSE) igraph::plot.igraph(x = mutualknn30_graph, vertex.size=0.8, vertex.label=NA, vertex.color=adjustcolor("#c7ce3d", alpha.f = 1), vertex.frame.color=NA, edge.color=adjustcolor("#443dce", alpha.f = 1), edge.width=0.3, edge.curved=.5, layout=as.matrix(data.frame(mono_mat$x, mono_mat$y)))
p1 <- ggplot(data=mono_mat, aes(x, y, color=z_score)) + geom_point(size=1, na.rm = TRUE, alpha = 0.6) + scale_color_viridis_c() + scale_alpha() + pretty_plot() + theme(legend.title = element_blank()) + labs(x="UMAP 1", y="UMAP 2") p1
Bar plot
pp1 <- ggplot(data=mono_mat, aes(x=color, y=z_score)) + geom_boxplot(aes(fill=color, color=color), outlier.shape=NA) + guides(fill=FALSE) + pretty_plot(fontsize = 10) + stat_summary(geom = "crossbar", width=0.65, fatten=0, color="black", fun.data = function(x){ return(c(y=median(x), ymin=median(x), ymax=median(x))) }) + theme(legend.position = "none") pp1
p2 <- ggplot(data=mono_mat, aes(x, y, color=TRS)) + geom_point(size=1, na.rm = TRUE, alpha = 0.6) + scale_color_viridis_c() + scale_alpha() + pretty_plot() + theme(legend.title = element_blank()) + labs(x="UMAP 1", y="UMAP 2") p2
Bar plot
pp2 <- ggplot(data=mono_mat, aes(x=color, y=TRS)) + geom_boxplot(aes(fill=color, color=color), outlier.shape=NA) + guides(fill=FALSE) + pretty_plot(fontsize = 10) + stat_summary(geom = "crossbar", width=0.65, fatten=0, color="black", fun.data = function(x){ return(c(y=median(x), ymin=median(x), ymax=median(x))) }) + theme(legend.position = "none") pp2
About 2 mins
please set @mycores >= 1 and @permutation_times >= 1,000 in the real setting
mono_permu <- get_sigcell_simple(knn_sparse_mat=mutualknn30, seed_idx=mono_mat$seed_idx, topseed_npscore=mono_mat$np_score, permutation_times=100, # Increase to >=1000 in practice true_cell_significance=0.05, rda_output=FALSE, # mycores=8,# Increase in practice rw_gamma=0.05) mono_mat2 <- data.frame(mono_mat, mono_permu)
Enriched cells
mono_mat2 |> dplyr::group_by(color) |> dplyr::summarise(enriched_cell=sum(true_cell_top_idx)) |> ggplot(aes(x=color, y=enriched_cell, fill=color)) + geom_bar(stat="identity") + theme_classic()
Depleted cells
mono_mat2$rev_true_cell_top_idx <- !mono_mat2$true_cell_top_idx mono_mat2 |> dplyr::group_by(color) |> dplyr::summarise(depleted_cell=sum(rev_true_cell_top_idx)) |> ggplot(aes(x=color, y=depleted_cell, fill=color)) + geom_bar(stat="identity") + theme_classic()
utils::sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.