# March 8, 2023
#' Gene set analysis using pre-computed test statistic
#'
#' Perform gene set analysis on the result of a pre-computed test statistic. Test whether statistics in a gene set are larger/smaller than statistics not in the set.
#'
#' @param statistics pre-computed test statistics
#' @param ids name of gene for each entry in \code{statistics}
#' @param geneSets \code{GeneSetCollection}
#' @param use.ranks do a rank-based test (\code{TRUE}) or a parametric test (\code{FALSE})? default: FALSE
#' @param n_genes_min minimum number of genes in a gene set
#' @param progressbar if TRUE, show progress bar
#' @param inter.gene.cor correlation of test statistics within a gene set
#' @param coef.name name of column to store test statistic
#'
#' @details
#' This is the same as \code{zenith_gsa()}, but uses pre-computed test statistics. Note that \code{zenithPR_gsa()} may give slightly different results for small sample sizes if it is fed t-statistics instead of z-statistics.
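#'
#' For example, the \code{t} column returned by \code{limma::topTable()} can be passed directly as \code{statistics}, with the corresponding gene identifiers as \code{ids} (see the examples below).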
#'
#' @return
#' \itemize{
#' \item \code{NGenes}: number of genes in this set
#' \item \code{Correlation}: mean correlation between expression of genes in this set
#' \item \code{delta}: difference in mean t-statistic for genes in this set compared to genes not in this set
#' \item \code{se}: standard error of \code{delta}
#' \item \code{p.less}: one-sided p-value testing whether \code{delta < 0}
#' \item \code{p.greater}: one-sided p-value testing whether \code{delta > 0}
#' \item \code{PValue}: two-sided p-value testing whether \code{delta != 0}
#' \item \code{Direction}: direction of effect based on sign(delta)
#' \item \code{FDR}: false discovery rate based on Benjamini-Hochberg method in \code{p.adjust}
#' \item \code{Geneset}: name of the gene set
#' \item \code{coef}: name for the pre-computed test statistics, taken from \code{coef.name}. Default: \code{zenithPR}
#' }
#'
#' @examples
#' # Load packages
#' library(edgeR)
#' library(variancePartition)
#' library(tweeDEseqCountData)
#'
#' # Load RNA-seq data from LCLs
#' data(pickrell)
#' geneCounts = exprs(pickrell.eset)
#' df_metadata = pData(pickrell.eset)
#'
#' # Filter genes
#' # Note this is low-coverage data, so treat this only as a code example
#' dsgn = model.matrix(~ gender, df_metadata)
#' keep = filterByExpr(geneCounts, dsgn, min.count=5)
#'
#' # Compute library size normalization
#' dge = DGEList(counts = geneCounts[keep,])
#' dge = calcNormFactors(dge)
#'
#' # Estimate precision weights using voom
#' vobj = voomWithDreamWeights(dge, ~ gender, df_metadata)
#'
#' # Apply dream analysis
#' fit = dream(vobj, ~ gender, df_metadata)
#' fit = eBayes(fit)
#'
#' # Load Hallmark genes from MSigDB
#' # use gene 'SYMBOL' or 'ENSEMBL' id
#' # use get_GeneOntology() to load Gene Ontology
#' gs = get_MSigDB("H", to="ENSEMBL")
#'
#' # Run zenithPR analysis with a test statistic for each gene
#' tab = topTable(fit, coef='gendermale', number=Inf)
#'
#' res.gsa = zenithPR_gsa(tab$t, rownames(tab), gs)
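#'
#' # Inspect the top gene sets (results are sorted by p-value)
#' head(res.gsa)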
#'
#' @seealso \code{zenith_gsa()}, \code{limma::cameraPR()}
#' @importFrom Rdpack reprompt
#' @importFrom GSEABase geneIds
#' @importFrom limma ids2indices
#' @importFrom progress progress_bar
#' @importFrom stats runif pt var p.adjust
#'
#' @export
zenithPR_gsa = function(statistics, ids, geneSets, use.ranks = FALSE, n_genes_min = 10, progressbar=TRUE, inter.gene.cor = 0.01, coef.name = "zenithPR"){
if( length(statistics) != length(ids) ){
stop("statsitics and ids must be the same length")
}
if( any(duplicated(ids)) ){
stop("All entries in ids must be unique")
}
if( is.null(ids) || any(is.na(ids)) ){
stop("All entries in ids must be characters and not NULL or NA")
}
# Global statistics
meanStat <- mean(statistics)
varStat <- var(statistics)
G = length( statistics )
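# degrees of freedom for the two-group comparison of the G statistics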
df.camera = G - 2
allow.neg.cor = FALSE
# convert GeneSetCollection to list
geneSets.lst = geneIds( geneSets )
# Map from genes to gene sets
index = ids2indices( geneSets.lst, ids)
# filter by size of gene set
index = index[vapply(index, length, FUN.VALUE=numeric(1)) >= n_genes_min]
nsets = length(index)
if( nsets == 0 ){
stop("No sets contain genes matching entries in ids")
}
# set up progress bar
pb <- progress_bar$new(format = ":current/:total [:bar] :percent ETA::eta", total = nsets, width= 60, clear=FALSE)
# Analysis for each gene set
tab = lapply(seq_len(nsets), function(i){
if( progressbar & runif(1) < .01 ){
pb$update(ratio = i / nsets )
}
iset <- index[[i]]
if(is.character(iset)) iset <- which(ids %in% iset)
StatInSet <- statistics[iset]
m <- length(StatInSet)
m2 <- G - m
# convert correlation to VIF
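# vif = 1 + (m-1)*rho is the variance inflation factor for the mean of m correlated statistics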
correlation = inter.gene.cor
vif = 1 + inter.gene.cor * (m - 1)
if(use.ranks) {
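# Rank-based test: correlation-adjusted Wilcoxon rank-sum test of statistics in the set versus the rest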
corr.use = correlation
if( ! allow.neg.cor ) corr.use <- max(0,corr.use)
res = .rankSumTestWithCorrelation(iset, statistics=statistics, correlation=corr.use, df=df.camera)
df = data.frame( NGenes = m,
Correlation = correlation,
delta = res$effect,
se = res$se,
p.less = res$less,
p.greater = res$greater )
}else{
if( ! allow.neg.cor ) vif <- max(1,vif)
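# Parametric test: compare the mean statistic in the set to the mean of the remaining genes,
# with the standard error inflated by the VIF (as in cameraPR)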
meanStatInSet <- mean(StatInSet)
delta <- G/m2*(meanStatInSet-meanStat)
varStatPooled <- ( (G-1)*varStat - delta^2*m*m2/G ) / (G-2)
delta.se = sqrt( varStatPooled * (vif/m + 1/m2) )
two.sample.t = delta / delta.se
df = data.frame(NGenes = m,
Correlation = correlation,
delta = delta,
se = delta.se,
p.less = pt(two.sample.t,df=df.camera),
p.greater = pt(two.sample.t,df=df.camera,lower.tail=FALSE) )
}
df
})
tab = do.call(rbind, tab)
rownames(tab) <- names(index)
if( progressbar & ! pb$finished ) pb$update(ratio = 1)
pb$terminate()
# Post-process results
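# Two-sided p-value is twice the smaller one-sided p-value; Direction follows the smaller tail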
tab$PValue = 2*pmin(tab$p.less, tab$p.greater)
tab$Direction = ifelse(tab$p.less < tab$p.greater, "Down", "Up")
tab$FDR = p.adjust(tab$PValue, "BH")
if( progressbar & ! pb$finished ) pb$update( 1.0 )
pb$terminate()
# Sort by p-value
o <- order(tab$PValue)
tab <- tab[o,]
# Make results compatible with plotZenithResults
tab$Geneset <- rownames(tab)
tab$coef <- coef.name
tab
}