Overview

This vignette covers the main function and workflow of SCAVENGE. The standard processed input data including fine-mapped variants and single-cell epigenomic profiles. For fine-mapped variants of the trait of interest, we typically need information of genomic locations of variants and their corresponding posterior propability of causality. A peak-by-cell matrix of scATAC-seq profiles is needed. To walk through the workflow of SCAVENGE, we provided a blood cell trait of monocyte count and a 10X PBMC dataset as an example.

Load required packages

library(SCAVENGE)
library(chromVAR)
library(gchromVAR)
library(BuenColors)
library(SummarizedExperiment)
library(data.table)
library(dplyr)
library(BiocParallel)
library(BSgenome.Hsapiens.UCSC.hg19)
library(igraph)

set.seed(9527)

Load example data

The PBMC data was processed using ArchR package. The peak-by-cell count matrix and corresponding meta data were extracted and stored in a RangedSummarizedExperiment object (for more details please follow our paper). The typical input of fine-mapped data for SCAVENGE or g-chromVAR can be found here (https://github.com/sankaranlab/SCAVENGE-reproducibility/tree/main/data/finemappedtraits_hg19). You can use this (trait_import <- importBedScore(rowRanges(SE_pbmc5k), trait_file, colidx=5)) to import your fine-mapped data into SummarizedExperiment when you have your data ready (trait_file is the path of your file).

trait_import <- example_data(name="mono.PP001.bed")
SE_pbmc5k <- example_data(name="pbmc5k_SE.rda")

gchromVAR analysis

SE_pbmc5k <- addGCBias(object = SE_pbmc5k, 
                       genome = BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19)
SE_pbmc5k_bg <- getBackgroundPeaks(object = SE_pbmc5k, 
                                   niterations = 200)
SE_pbmc5k_DEV <- computeWeightedDeviations(object = SE_pbmc5k, 
                                           weights = trait_import, 
                                           background_peaks = SE_pbmc5k_bg)

Reformat results

z_score_mat <- data.frame(SummarizedExperiment::colData(SE_pbmc5k), 
                          z_score=t(SummarizedExperiment::assays(SE_pbmc5k_DEV)[["z"]]) |> c())
head(z_score_mat)

Generate the seed cell index (using the top 5% if too many cells are eligible)

seed_idx <- seedindex(z_score = z_score_mat$z_score,
                      percent_cut = 0.05)

calculate scale factor

scale_factor <- cal_scalefactor(z_score = z_score_mat$z_score, 
                                percent_cut = 0.01)

Construct m-knn graph

Calculate tfidf-mat

peak_by_cell_mat <- SummarizedExperiment::assay(SE_pbmc5k)
tfidf_mat <- tfidf(bmat=peak_by_cell_mat, 
                   mat_binary=TRUE, 
                   TF=TRUE, 
                   log_TF=TRUE)

Calculate lsi-mat

lsi_mat <- do_lsi(mat = tfidf_mat, 
                  dims = 30)

Please be sure that there is no potential batch effects for cell-to-cell graph construction. If the cells are from different samples or different conditions etc., please consider using Harmony analysis (HarmonyMatrix from Harmony package). Typically you could take the lsi_mat as the input with parameter do_pca = FALSE and provide meta data describing extra data such as sample and batch for each cell. Finally, a harmony-fixed LSI matrix can be used as input for the following analysis.
Calculate m-knn graph

mutualknn30 <- getmutualknn(lsimat = lsi_mat, 
                            num_k = 30)

Network propagation

np_score <- randomWalk_sparse(intM = mutualknn30, 
                              queryCells = rownames(mutualknn30)[seed_idx], 
                              gamma = 0.05)

Trait relevant score (TRS) with scaled and normalized
A few cells are singletons are removed from further analysis, this will lead very few cells be removed for the following analysis. You can always recover those cells with a unified score of 0 and it will not impact the following analysis.

omit_idx <- np_score==0
sum(omit_idx)
mutualknn30 <- mutualknn30[!omit_idx, !omit_idx]
np_score <- np_score[!omit_idx]
TRS <- capOutlierQuantile(x = np_score, 
                          q_ceiling = 0.95) |> max_min_scale()
TRS <- TRS * scale_factor
mono_mat <- data.frame(z_score_mat[!omit_idx, ], 
                       seed_idx[!omit_idx], 
                       np_score, 
                       TRS)
head(mono_mat)

UMAP plots of cell type annotation and cell-to-cell graph

Cell type annotation

p <- ggplot(data=mono_mat, aes(x, y, color=color)) + 
  geom_point(size=1, na.rm = TRUE) + 
  pretty_plot() + 
  theme(legend.title = element_blank()) + 
  labs(x="UMAP 1",y="UMAP 2")
p

Visualize cell-to-cell graph if you have low-dimensional coordinates such as UMAP1 and UMAP2

mutualknn30_graph <- graph_from_adjacency_matrix(adjmatrix = mutualknn30, 
                                                 mode = "undirected", 
                                                 diag = FALSE)
igraph::plot.igraph(x = mutualknn30_graph, 
                    vertex.size=0.8, 
                    vertex.label=NA, 
                    vertex.color=adjustcolor("#c7ce3d", alpha.f = 1), 
                    vertex.frame.color=NA, 
                    edge.color=adjustcolor("#443dce", alpha.f = 1), 
                    edge.width=0.3, edge.curved=.5, 
                    layout=as.matrix(data.frame(mono_mat$x, mono_mat$y)))

Comparsion before and after SCAVENGE analysis

p1 <- ggplot(data=mono_mat, aes(x, y, color=z_score)) + 
  geom_point(size=1, na.rm = TRUE, alpha = 0.6) + 
  scale_color_viridis_c() + 
  scale_alpha() +
  pretty_plot() + 
  theme(legend.title = element_blank()) + 
  labs(x="UMAP 1", y="UMAP 2")
p1

Bar plot

pp1 <- ggplot(data=mono_mat,  aes(x=color, y=z_score))  +
  geom_boxplot(aes(fill=color, color=color), outlier.shape=NA) + 
  guides(fill=FALSE) + 
  pretty_plot(fontsize = 10) +
  stat_summary(geom = "crossbar", width=0.65, fatten=0, color="black", 
               fun.data = function(x){ return(c(y=median(x), ymin=median(x), ymax=median(x))) }) + theme(legend.position = "none")
pp1
p2 <- ggplot(data=mono_mat, aes(x, y, color=TRS)) + 
  geom_point(size=1, na.rm = TRUE, alpha = 0.6) + 
  scale_color_viridis_c() + 
  scale_alpha() + pretty_plot() + 
  theme(legend.title = element_blank()) + 
  labs(x="UMAP 1", y="UMAP 2")
p2

Bar plot

pp2 <- ggplot(data=mono_mat,  aes(x=color, y=TRS))  +
  geom_boxplot(aes(fill=color, color=color), outlier.shape=NA) + 
  guides(fill=FALSE) + 
  pretty_plot(fontsize = 10) +
  stat_summary(geom = "crossbar", width=0.65, fatten=0, color="black", fun.data = function(x){ return(c(y=median(x), ymin=median(x), ymax=median(x))) }) + theme(legend.position = "none")
pp2

Trait relevant cell determination from permutation test

About 2 mins
please set @mycores >= 1 and @permutation_times >= 1,000 in the real setting

mono_permu <- get_sigcell_simple(knn_sparse_mat=mutualknn30, 
                                 seed_idx=mono_mat$seed_idx, 
                                 topseed_npscore=mono_mat$np_score, 
                                 permutation_times=100, # Increase to >=1000 in practice
                                 true_cell_significance=0.05, 
                                 rda_output=FALSE, 
                                 # mycores=8,# Increase in practice
                                 rw_gamma=0.05)
mono_mat2 <- data.frame(mono_mat, mono_permu)

Look at the distribution of statistically significant phenotypically enriched and depleted cells

Enriched cells

mono_mat2 |>
  dplyr::group_by(color) |> 
  dplyr::summarise(enriched_cell=sum(true_cell_top_idx)) |> 
  ggplot(aes(x=color, y=enriched_cell, fill=color)) + 
  geom_bar(stat="identity") + 
  theme_classic()

Depleted cells

mono_mat2$rev_true_cell_top_idx <- !mono_mat2$true_cell_top_idx
mono_mat2 |>
  dplyr::group_by(color) |> 
  dplyr::summarise(depleted_cell=sum(rev_true_cell_top_idx)) |> 
  ggplot(aes(x=color, y=depleted_cell, fill=color)) + 
  geom_bar(stat="identity") + 
  theme_classic()

Sesssion info

utils::sessionInfo()




sankaranlab/SCAVENGE documentation built on March 2, 2023, 2:17 a.m.