# functions that have to do with the permutation test
# and getting p values from that test with the gamma distribution
#' Run COCOA permutations to get p-values
#'
#' This is a convenience function that runs multiple steps of the
#' permutation process together: it runs COCOA permutations, converts these
#' to null distributions, gets the empirical p value (which is limited by the
#' number of permutations), gets z scores, and fits a gamma distribution
#' to each null distribution to estimate p values (not limited by the
#' number of permutations),
#' Requires that the user has previously calculated the real COCOA scores.
#' See these individual functions for more info on each step: runCOCOA,
#' convertToFromNullDist, getPermStat, and getGammaPVal.
#'
#'
#' For reproducibility, set seed with 'set.seed()' function before running.
#' @template genomicSignal
#' @template signalCoord
#' @template GRList
#' @template rsScores
#' @template targetVar
#' @templateVar usesTargetVar TRUE
#' @template signalCol
#' @template scoringMetric
#' @template absVal
#' @template olList
#' @param nPerm Numeric. The number of permutations to do.
#' @param minRSCov Numeric. This parameter specifies the minimum coverage
#' (number of overlapping positions) required for a region to be considered
#' in the analysis.
#' @param centerGenomicSignal Logical. Should rows in genomicSignal
#' be centered based on
#' their means? (subtracting row mean from each row)
#' @param centerTargetVar Logical. Should columns in targetVar be
#' centered based
#' on their means? (subtract column mean from each column)
#' @param dataID Character. A unique identifier for this dataset
#' (for saving results with simpleCache)
#' @template variationMetric
#' @param useSimpleCache Logical. Whether to use save caches. Caches
#' will be created for each permutation so that if the function is disrupted
#' it can restart where it left off. The final results are also saved
#' as a cache. See simpleCache package for more details.
#' @param cacheDir Character. The path for the directory in which the
#' caches should be saved.
# @param correctionMethod Character. P value correction method. Default
# is "BH" for Benjamini and Hochberg false discovery rate. For acceptable
# arguments and more info see ?stats::p.adjust() (method parameter)
#' @param testType Character. Parameter for `getPermStat`. Whether to
#' create p values based on one a two sided test or a lesser/greater one
#' sided test. Options are: "greater", "lesser", "two-sided"
#' @param gammaFitMethod Character. method to use for fitting the gamma
#' distribution to null distribution. Options are
#' "mme" (moment matching estimation), "mle" (maximum likelihood estimation),
#' "qme" (quantile matching estimation), and "mge" (maximum goodness-of-fit
#' estimation). See ?COCOA::getGammaPVal and
#' ?fitdistrplus::fitdist() for more info.
#' @param realScoreInDist Logical. Should the actual score (from
#' test with no permutations) be included in the null distribution
#' when fitting the gamma distribution. realScoreInDist=TRUE is
#' recommended.
#' @param force Logical. If force=TRUE, when fitting the gamma distribution
#' returns an error (as may happen when a method other than "mme"
#' is used) then allow the error. If force=FALSE, when fitting the
#' gamma distribution returns an error then don't return an error but
#' instead use the "mme" method
#' for fitting that specific gamma distribution.
#' @template verbose
#' @template returnCovInfo
#' @param ... Character. Optional additional arguments for simpleCache.
#'
#' @return Returns a list with the following 4 items:
#' 1. a list of length nPerm where each item is a data.frame
#' of the COCOA scores from a single permutation.
#' Each data.frame is the output of `runCOCOA()`
#' 2. a data.table/data.frame of empirical p-values (the
#' output of `getPermStat`)
#' 3. a data.table/data.frame of z-scores (the output of `getPermStat`.
#' 4. a data.frame of p-values based on the gamma approximation
#' (the output of getGammaPVal().
#'
#' @examples
#' data("esr1_chr1")
#' data("nrf1_chr1")
#' data("brcaMethylData1")
#' data("brcaMCoord1")
#' pcScores <- prcomp(t(brcaMethylData1))$x
#' targetVarCols <- c("PC1", "PC2")
#' targetVar <- pcScores[, targetVarCols]
#'
#' # give the actual order of samples to `runCOCOA` to get the real scores
#' correctSampleOrder=1:nrow(targetVar)
#' realRSScores <- runCOCOA(genomicSignal=brcaMethylData1,
#' signalCoord=brcaMCoord1,
#' GRList=GRangesList(esr1_chr1, nrf1_chr1),
#' signalCol=c("PC1", "PC2"),
#' targetVar=targetVar,
#' sampleOrder=correctSampleOrder,
#' variationMetric="cor")
#'
#' # give random order of samples to get random COCOA scores
#' # so you start building a null distribution for each region set
#' # (see vignette for example of building a null distribution with `runCOCOA`)
#' randomOrder <- sample(1:nrow(targetVar),
#' size=nrow(targetVar),
#' replace=FALSE)
#' randomRSScores <- runCOCOA(genomicSignal=brcaMethylData1,
#' signalCoord=brcaMCoord1,
#' GRList=GRangesList(esr1_chr1, nrf1_chr1),
#' signalCol=c("PC1", "PC2"),
#' targetVar=targetVar,
#' sampleOrder=randomOrder,
#' variationMetric="cor")
#'
#' # runCOCOAPerm
#' permResults <- runCOCOAPerm(genomicSignal=brcaMethylData1,
#' signalCoord=brcaMCoord1,
#' GRList=GRangesList(esr1_chr1=esr1_chr1, nrf1_chr1=nrf1_chr1),
#' rsScores=realRSScores,
#' targetVar=targetVar,
#' signalCol=c("PC1", "PC2"),
#' variationMetric="cor",
#' nPerm = 10,
#' useSimpleCache=FALSE)
#' permResults
#'
#'
#' @export
runCOCOAPerm <- function(genomicSignal,
signalCoord,
GRList,
rsScores,
targetVar,
signalCol,
scoringMetric="default",
absVal=TRUE,
olList=NULL,
centerGenomicSignal=TRUE,
centerTargetVar=TRUE,
variationMetric="cor",
nPerm=NULL, # NULL by default
useSimpleCache=TRUE,
cacheDir=getwd(),
dataID="",
testType="greater",
gammaFitMethod="mme",
realScoreInDist=TRUE,
force=FALSE,
verbose=TRUE,
returnCovInfo=FALSE, minRSCov=100, ...) {
# if doing matrix calculations of cor/cov, center and/or scale only once ahead of time
alreadyCenteredFM <- FALSE
alreadyCenteredDM <- FALSE
alreadyScaledFM <- FALSE
alreadyScaledDM <- FALSE
colsToAnnotate <- signalCol
allResultsList <- list()
## Initialize variables for centering and scaling flags and a list to store results.
if (is(rsScores, "data.table")) {
rsScores = as.data.frame(rsScores)
}
# Make sure signalCol is a character vector
if (!is.character(colsToAnnotate)) {
colsToAnnotate <- as.character(colsToAnnotate)
}
rsScores = rsScores[, colsToAnnotate, drop = FALSE] # prevents error that occurs if extra column is factor
## Convert rsScores to data frame if needed and retain only specified columns.
# more efficient to only do once (not that high impact though)
if (centerGenomicSignal) {
cpgMeans <- rowMeans(genomicSignal, na.rm = TRUE)
# centering before calculating correlation
genomicSignal <- apply(X = genomicSignal, MARGIN = 2, function(x) x - cpgMeans)
# don't do later
centerGenomicSignal <- FALSE
}
# check that targetVar is not empty
if (centerTargetVar) {
if (nrow(targetVarDF) == 0) {
stop("Error: targetVarDF is empty. Make sure it contains at least one row.")
}
}
# check if targetVar contains numeric data
if (centerTargetVar) {
# Save the orginical colnames
original_colnames <- colnames(targetVar)
featureMeans <- colMeans(targetVar, na.rm = TRUE)
# centering before calculating correlation (also, t() converts to matrix)
targetVar <- t(apply(X = t(targetVar), MARGIN = 2, function(x) x - featureMeans))
if (dim(targetVar)[1] == 1) {
targetVar <- t(targetVar)
}
colnames(targetVar) <- original_colnames
centerTargetVar <- FALSE
}
## Center and scale genomicSignal and targetVar if specified.
checkConvertInputClasses(signal=genomicSignal,
signalCoord=signalCoord,
regionSet=NULL,
signalCol=signalCol,
rsOL=NULL)
## Call a function to check and convert input data classes
# detect signalCoordType
# when signalCoord is a GRanges object
if (any(start(signalCoord) != end(signalCoord))) {
signalCoordType <- "multiBase"
} else {
signalCoordType <- "singleBase"
}
## Determine the type of signalCoord based on the coordinates.
# if "default" scoring method is given, choose based on signalCoordType
if (scoringMetric == "default") {
if (signalCoordType == "singleBase") {
scoringMetric <- "regionMean"
} else if (signalCoordType == "multiBase") {
scoringMetric <- "proportionWeightedMean"
} else {
stop(cleanws("signalCoordType not recognized.
Check spelling/capitalization."))
}
}
## Determine the scoring metric based on the signalCoordType.
# if no NA, NaN or +/-Inf, can do matrix correlation calculations
# different COCOA step than matrix scoring of region sets
noNA <- all(is.finite(genomicSignal))
if (noNA & (variationMetric %in% c("cov", "cor"))) {
# more efficient to only do once
original_colnames <- colnames(targetVar)
cpgMeans <- rowMeans(genomicSignal, na.rm = TRUE)
# centering before calculating correlation
genomicSignal <- apply(X = genomicSignal, MARGIN = 2, function(x) x - cpgMeans)
# don't do later
alreadyCenteredDM <- TRUE
featureMeans <- colMeans(targetVar, na.rm = TRUE)
# centering before calculating correlation (also, t() converts to matrix)
targetVar <- t(apply(X = t(targetVar), MARGIN = 2, function(x) x - featureMeans))
if (dim(targetVar)[1] == 1) {
targetVar <- t(targetVar)
}
# don't do later
alreadyCenteredFM <- TRUE
colnames(targetVar) <- original_colnames
}
if (noNA & (variationMetric == "cor")) {
original_colnames <- colnames(targetVar)
targetVar <- scale(x = targetVar, center = FALSE, scale = TRUE)
colnames(targetVar) <- original_colnames
alreadyScaledFM <- TRUE
featSD <- apply(X = genomicSignal, MARGIN = 1, FUN = sd)
featSD[featSD == 0] = 1 # 0 sd if no variation, 1 will leave unchanged and not cause error
genomicSignal <- genomicSignal / featSD
alreadyScaledDM <- TRUE
}
## Perform matrix calculations for correlation and covariance, if applicable.
# matrix-based region set scoring only works for mean-based scoringMetric's
if (scoringMetric %in% c("simpleMean",
"regionMean",
"proportionWeightedMean")) {
doRSMatScore <- TRUE
} else {
doRSMatScore <- FALSE
}
## Determine if matrix-based region set scoring is applicable.
if (!noNA) {
# orientation for "cor()" and "cov()"
# for these functions, samples need to be rows for both objects
# must be transposed now so it is not (and not copied) during each permutation
genomicSignal <- t(genomicSignal)
}
# for matrix correlation, samples should be rows for only one object (genSig and tarVar)
# data is already in correct orientation
########################################################################
# create region set overlap matrix
# this code should be after code modifying "signal"
if (doRSMatScore) {
olMatRes <- olToMat(signalCoord = signalCoord,
GRList = GRList,
scoringMetric = scoringMetric, minRSCov=minRSCov)
rsMatList <- olMatRes[[1]]
rsInfo <- olMatRes[[2]]
GRList <- NULL
if (!is.null(rsScores)) {
if (nrow(rsInfo) != nrow(rsScores)) {
stop("rsScores should be filtered beforehand to remove region sets with less than minRSCov coverage.")
}
}
## NOTE: there might be an error if rsScores is given but not screened by minRSCov
}
## Generate overlap matrices and associated information if matrix-based region
# set scoring is required.
if (is.null(olList)) {
#######
# must take out NA rows before getting OL list. Otherwise later calculations
# will use wrong indices.
# what happens if there are NAs or Inf in `signal`?
# any NAs that overlap the regionSet will cause the score to be NA
if (is(genomicSignal, "data.table")) {
naRows = apply(X = genomicSignal[, , with=FALSE, drop=FALSE],
MARGIN = 1, FUN = function(x) any(is.na(x)))
} else {
naRows = apply(X = genomicSignal[, , drop=FALSE],
MARGIN = 1, FUN = function(x) any(is.na(x)))
}
if (any(naRows)) {
genomicSignal <- genomicSignal[!naRows, ]
signalCoord <- signalCoord[!naRows]
warning("Removing rows with NA from `genomicSignal`")
}
########################################################################
# calculate overlaps only once
# region set must be subject to fit with scoring functions
olList <- lapply(X = GRList, FUN = function(x) findOverlaps(query = signalCoord,
subject = x))
totalRegionNumber = sapply(X = GRList, length)
meanRegionSize = sapply(X = GRList, function(x) round(mean(width(x))))
}
# also calculate coverage info
# @param rsOL
calculateCovInfo <- function(rsOL,
scoringMetric=scoringMetric,
pOlap=NULL) {
covInfo <- data.frame(signalCoverage=length(unique(queryHits(rsOL))),
regionSetCoverage=length(unique(subjectHits(rsOL))))
if (scoringMetric == "proportionWeightedMean") {
covInfo$sumProportionOverlap <- sum(pOlap)
}
}
covInfo <- lapply(X = olList, FUN = function(x) calculateCovInfo(rsOL=x,
scoringMetric = scoringMetric))
if (scoringMetric == "proportionWeightedMean") {
getPOlap <- function(rsOL, signalGR, regionSet) {
olap <- pintersect(signalGR[queryHits(rsOL)],
regionSet[subjectHits(rsOL)])
pOlap <- width(olap) / width(regionSet[subjectHits(rsOL)])
return(pOlap)
}
# list
pOlapList <- mapply(FUN = function(x, y) getPOlap(rsOL = x,
signalGR=signalCoord,
regionSet = y),
x=olList, y=GRList, SIMPLIFY = FALSE)
covInfo$sumProportionOverlap <- sapply(X = pOlapList, FUN = sum)
} else {
pOlapList <- NULL
}
########################################################################
# Permutations part
if (!is.null(nPerm) && nPerm > 0) {
indList <- list()
for (i in seq(nPerm)) {
indList[[i]] <- sample(1:nrow(targetVar), nrow(targetVar))
}
}
## Generate indices for permutation tests if nPerm is specified.
if (useSimpleCache) {
if (is.null(rsScores)) {
simpleCache(paste0("rsScores_", variationMetric, "_", dataID), {
rsScores <- runCOCOA(sampleOrder=1:nrow(targetVar),
genomicSignal=genomicSignal,
signalCoord=signalCoord,
GRList=GRList,
signalCol=colsToAnnotate,
targetVar=targetVar,
variationMetric = variationMetric,
scoringMetric=scoringMetric,
absVal=absVal,
verbose=verbose,
rsMatList = rsMatList,
rsInfo = rsInfo,
returnCovInfo = returnCovInfo,
noNA=noNA,
alreadyCenteredDM = alreadyCenteredDM,
alreadyScaledDM = alreadyScaledDM,
alreadyCenteredFM = alreadyCenteredFM,
alreadyScaledFM = alreadyScaledFM)
rsScores
}, assignToVariable="rsScores")
}
## Use caching to store results if requested.
if (!is.null(nPerm) && nPerm > 0) {
# create the main permutation cache
simpleCache(paste0("rsPermScores_", nPerm, "Perm_", variationMetric, "_", dataID), {
helperFun <- function(x, y, ...) {
onePermCacheName <- paste0("rsPermScores_", variationMetric, "_", dataID, "_Cache", y)
simpleCache(onePermCacheName, cacheSubDir = paste0("rsPermScores_", variationMetric, "_", dataID), {
tmp <- runCOCOA(sampleOrder=x,
genomicSignal=genomicSignal,
signalCoord=signalCoord,
GRList=GRList,
signalCol=colsToAnnotate,
targetVar=targetVar,
variationMetric = variationMetric,
scoringMetric=scoringMetric,
absVal=absVal,
verbose=verbose,
rsMatList = rsMatList,
rsInfo = rsInfo,
returnCovInfo = returnCovInfo,
noNA=noNA,
alreadyCenteredDM = alreadyCenteredDM,
alreadyScaledDM = alreadyScaledDM,
alreadyCenteredFM = alreadyCenteredFM,
alreadyScaledFM = alreadyScaledFM)
rownames(tmp) <- seq_len(nrow(tmp))
colnames(tmp)[colnames(tmp)== "rsName"] <- "regionSetName"
message(y) # must be ahead of object that is saved as cache, not after
tmp
}, cacheDir=cacheDir, assignToVariable="tmp", ...)
return(tmp)
}
rsPermScores = mapply(FUN = helperFun, x=indList, y=seq_along(indList), ..., SIMPLIFY=FALSE)
}, assignToVariable="rsPermScores", cacheDir=cacheDir, ...)
} else {
rsPermScores = rsScores
}
} else {
if (is.null(rsScores)) {
rsScores <- runCOCOA(sampleOrder=1:nrow(targetVar),
genomicSignal=genomicSignal,
signalCoord=signalCoord,
GRList=GRList,
signalCol=colsToAnnotate,
targetVar=targetVar,
variationMetric = variationMetric,
scoringMetric=scoringMetric,
absVal=absVal,
verbose=verbose,
rsMatList = rsMatList,
rsInfo = rsInfo,
returnCovInfo = returnCovInfo,
noNA=noNA,
alreadyCenteredDM = alreadyCenteredDM,
alreadyScaledDM = alreadyScaledDM,
alreadyCenteredFM = alreadyCenteredFM,
alreadyScaledFM = alreadyScaledFM)
}
if (!is.null(nPerm) && nPerm > 0) {
helperFun2 <- function(x) {
tmp <- runCOCOA(sampleOrder=x,
genomicSignal=genomicSignal,
signalCoord=signalCoord,
GRList=GRList,
signalCol=colsToAnnotate,
targetVar=targetVar,
variationMetric = variationMetric,
scoringMetric=scoringMetric,
absVal=absVal,
verbose=verbose,
rsMatList=rsMatList,
rsInfo = rsInfo,
returnCovInfo = returnCovInfo,
noNA=noNA,
alreadyCenteredDM = alreadyCenteredDM,
alreadyScaledDM = alreadyScaledDM,
alreadyCenteredFM = alreadyCenteredFM,
alreadyScaledFM = alreadyScaledFM)
rownames(tmp) <- seq_len(nrow(tmp))
colnames(tmp)[colnames(tmp)== "rsName"] <- "regionSetName"
message(":", appendLF=FALSE)
return(tmp)
}
rsPermScores <- lapply(X = indList, helperFun2)
} else {
rsPermScores = rsScores
}
}
## Run the COCOA analysis, either using caching or not, for empirical p-values and scores.
.analysisID <- paste0("_", nPerm, "Perm_", variationMetric, "_", dataID)
if (is.null(nPerm) || nPerm == 0) {
rsScores <- NULL
rsPVals <- NULL
rsZScores <- NULL
gPValDF <- NULL
} else {
nullDistList <- convertToFromNullDist(rsPermScores)
print(rsInfo)
if (useSimpleCache) {
simpleCache(paste0("empiricalPValsUncorrected", .analysisID), {
rsPVals <- getPermStat(rsScores=rsScores, nullDistList=nullDistList,
signalCol=colsToAnnotate, whichMetric = "pval",
testType=testType)
rsPVals
}, assignToVariable="rsPVals", cacheDir=cacheDir, ...)
simpleCache(paste0("permZScores", .analysisID), {
rsZScores <- getPermStat(rsScores=rsScores, nullDistList=nullDistList,
signalCol=colsToAnnotate, whichMetric = "zscore")
rsZScores
}, assignToVariable="rsZScores", cacheDir=cacheDir, ...)
simpleCache(paste0("gammaPValsUncorrected", .analysisID), {
gPValDF <- getGammaPVal(rsScores = rsScores,
nullDistList = nullDistList,
signalCol = colsToAnnotate,
method = gammaFitMethod,
realScoreInDist = realScoreInDist,
force=force)
gPValDF <- cbind(gPValDF,
rsScores[, colnames(rsScores)[!(colnames(rsScores)
%in% colsToAnnotate)]])
gPValDF
}, assignToVariable="gPValDF", cacheDir=cacheDir, ...)
} else {
rsPVals <- getPermStat(rsScores=rsScores, nullDistList=nullDistList,
signalCol=colsToAnnotate, whichMetric = "pval",
testType = testType)
rsPVals
rsZScores <- getPermStat(rsScores=rsScores, nullDistList=nullDistList,
signalCol=colsToAnnotate, whichMetric = "zscore")
rsZScores
gPValDF <- getGammaPVal(rsScores = rsScores,
nullDistList = nullDistList,
signalCol = colsToAnnotate,
method = gammaFitMethod,
realScoreInDist = realScoreInDist,
force=force)
gPValDF <- cbind(gPValDF,
rsScores[, colnames(rsScores)[!(colnames(rsScores)
%in% colsToAnnotate)]])
gPValDF
}
}
# Add region set names
rsScores$regionSetName <- rsInfo$rsName
rsPVals$regionSetName <- rsInfo$rsName
rsZScores$regionSetName <- rsInfo$rsName
gPValDF$regionSetName <- rsInfo$rsName
# Sort by first column
rsScores[order(rsScores[,1], decreasing=TRUE), ]
rsPVals[order(rsPVals[,1], decreasing=TRUE), ]
rsZScores[order(rsZScores[,1], decreasing=TRUE), ]
gPValDF[order(gPValDF[,1], decreasing=TRUE), ]
# Create one list of results
allResultsList$rsScores <- rsScores
allResultsList$permRSScores <- rsPermScores
allResultsList$empiricalPVals <- rsPVals
allResultsList$zScores <- rsZScores
allResultsList$gammaPVal <- gPValDF
return(allResultsList)
}
#' Run COCOA: quantify inter-sample variation, score region sets
#'
#' A convenience function that does the two steps of COCOA: it 1) quantifies
#' the epigenetic variation; and 2) scores the region sets. This function
#' returns the real COCOA scores if using the default `sampleOrder` parameter
#' value. With the sampleOrder parameter, you can shuffle the samples, then
#' run COCOA to get permuted scores for each region set, which generates null
#' distributions to evaluate statistical significance.
#'
#' There are multiple options for quantifying the epigenetic variation,
#' specified by the `variationMetric` parameter. Quantifying the variation for
#' the real/non-permuted COCOA scores should be done with the same variation
#' metric as is used for the random permutations. For an unsupervised analysis
#' using dimensionality reduction, first, the dimensionality reduction is done
#' outside `runCOCOA`, then the latent factors/principal components are input
#' to `runCOCOA` as the sample labels (targetVar parameter) when calculating
#' both the real and also the permutated region set scores. For a supervised
#' analysis, the target variables/phenotypes are the targetVar. See the
#' vignettes for examples.
#'
#' @param sampleOrder numeric. A vector of length (number of samples). If
#' sampleOrder is 1:(number of samples) then this function will return the
#' real COCOA scores.
#' To generate random COCOA scores in order to make
#' null distributions, shuffle the samples in a random order.
#' E.g. sampleOrder = sample(1:ncol(genomicSignal), ncol(genomicSignal))
#' where ncol(genomicSignal) is the number of samples.
#' Set the seed with set.seed() before making sampleOrder to ensure reproducibility.
#' @template genomicSignal
#' @template signalCoord
#' @template GRList
#' @templateVar usesTargetVar TRUE
#' @template signalCol
#' @template targetVar
#' @template variationMetric
#' @template scoringMetric
#' @template verbose
#' @template absVal
#' @template olList
#' @template pOlapList
#' @permutations numeric -- specify the number of permutations. Leave NULL for no permutations (NULL by default).
#' @cores numeric -- specify the number of cores, default: cores=1
#' @param centerGenomicSignal Logical. Should rows in genomicSignal
#' be centered based on
#' their means? (subtracting row mean from each row)
#' @param centerTargetVar Logical. Should columns in targetVar be
#' centered based
#' on their means? (subtract column mean from each column)
#' @template returnCovInfo
#' @return data.frame. The output of aggregateSignalGRList for one permutation.
#' @examples
#' data("esr1_chr1")
#' data("nrf1_chr1")
#' data("brcaMethylData1")
#' data("brcaMCoord1")
#' pcScores <- prcomp(t(brcaMethylData1))$x
#' targetVarCols <- c("PC1", "PC2")
#' targetVar <- pcScores[, targetVarCols]
#'
#' # give the actual order of samples to `runCOCOA` to get the real scores
#' correctSampleOrder=1:nrow(targetVar)
#' realRSScores <- runCOCOA(genomicSignal=brcaMethylData1,
#' signalCoord=brcaMCoord1,
#' GRList=GRangesList(esr1_chr1, nrf1_chr1),
#' signalCol=c("PC1", "PC2"),
#' targetVar=targetVar,
#' sampleOrder=correctSampleOrder,
#' variationMetric="cor")
#' realRSScores
#'
#' # give random order of samples to get random COCOA scores
#' # so you start building a null distribution for each region set
#' # (see vignette for example of building a null distribution with `runCOCOA`)
#' randomOrder <- sample(1:nrow(targetVar),
#' size=nrow(targetVar),
#' replace=FALSE)
#' randomRSScores <- runCOCOA(genomicSignal=brcaMethylData1,
#' signalCoord=brcaMCoord1,
#' GRList=GRangesList(esr1_chr1, nrf1_chr1),
#' signalCol=c("PC1", "PC2"),
#' targetVar=targetVar,
#' sampleOrder=randomOrder,
#' variationMetric="cor")
#' randomRSScores
#' @export
runCOCOA <- function(genomicSignal,
signalCoord, GRList, signalCol,
targetVar,
sampleOrder=1:nrow(targetVar),
variationMetric = "cor",
scoringMetric="default", verbose=TRUE,
absVal=TRUE, olList=NULL, pOlapList=NULL,
centerGenomicSignal=TRUE,
centerTargetVar=TRUE,
returnCovInfo=TRUE,
rsMatList=NULL,
rsInfo=NULL,
alreadyCenteredDM = FALSE,
alreadyScaledDM = FALSE,
alreadyCenteredFM = FALSE,
alreadyScaledFM = FALSE,
noNA=FALSE) {
signalList <- NULL
# if vector is given, return error
if (is.null(dim(targetVar))) {
stop("`targetVar` should be a matrix or data.frame")
}
if (any(!(signalCol %in% colnames(targetVar)))) {
stop("Not all specified columns are present in `targetVar`")
}
# if vector is given, return error
if (is.null(dim(targetVar))) {
stop("`targetVar` should be a matrix or data.frame")
}
if (any(!(signalCol %in% colnames(targetVar)))) {
stop("Not all specified columns are present in `targetVar`")
}
# subset to only signalCol
targetVar <- as.data.frame(targetVar[, signalCol, drop=FALSE])
# because names are dropped for a single column data.frame when indexing
# single col data.frame is automatically converted to numeric
featureNames <- colnames(targetVar)
# reorder the sample labels
targetVar <- data.frame(targetVar[sampleOrder, ])
colnames(targetVar) <- featureNames
# calculate correlation
featureLabelCor <- createCorFeatureMat(dataMat = genomicSignal,
featureMat = targetVar,
testType = variationMetric)
# more efficient to do only once instead of for each region set later on
if (absVal) {
featureLabelCor <- abs(featureLabelCor)
absVal <- FALSE
}
if (!is.null(rsMatList) && (scoringMetric %in% c("simpleMean", "regionMean",
"proportionWeightedMean"))) {
if (is.null(signalList)) {
signalList <- splitSignal(signal = featureLabelCor,
maxRow = ncol(rsMatList[[1]]))
}
}
# run COCOA
thisPermRes <- aggregateSignalGRList(signal=featureLabelCor,
signalCoord=signalCoord, GRList=GRList,
signalCol = signalCol,
scoringMetric = scoringMetric, verbose = verbose,
absVal = absVal,
rsMatList=rsMatList,
rsInfo = rsInfo,
signalList=signalList,
returnCovInfo=returnCovInfo)
# return
return(thisPermRes)
}
#' Converts COCOA permutation results to null distributions and vice versa
#'
#' This function will take a list of results of permutation tests that included
#' many region sets and return a list of data.frames where each data.frame
#' contains the null distribution for a single region set.
#' The function can also convert in the reverse order from
#' a list of null distributions to a list of COCOA results.
#'
#' @param rsScoresList each item in the list is a data.frame, one item for
#' each permutation with the results of that permutation. Each row in the
#' data.frame is a region set. All data.frames should be the same size and
#' each data.frame's rows should be in the same order
#'
#' @return a list of data.frames. The function returns a list of data.frames.
#' If the input is a list of data.frames from COCOA permutations, the output
#' will be a list of data.frames containing null distributions for each region set,
#' with the same columns as the input data.frames. If the input is a list of data.frames
#' with null distributions for individual region sets, the output will be a list of data.frames,
#' each with one row for each region set, representing results for a single COCOA permutation.
#'
#' @examples
#' # six region sets (rows), 2 signals (columns)
#' fakePermScores <- data.frame(abs(rnorm(6)), abs(rnorm(6)))
#' fakePermScores2 <- data.frame(abs(rnorm(6)), abs(rnorm(6)))
#' # 2 fake COCOA results (i.e. nPerm=2)
#' permRSScores <- list(fakePermScores, fakePermScores2)
#' convertToFromNullDist(permRSScores)
#'
#' @export
convertToFromNullDist <- function(rsScoresList) {
nullDistList <- lapply(X = 1:nrow(rsScoresList[[1]]),
FUN = function(x) permListToOneNullDist(resultsList=rsScoresList,
rsInd = x))
return(nullDistList)
}
# @param rsInd numeric. The row number for the region set of interest.
# do for only one region set
permListToOneNullDist <- function(resultsList, rsInd) {
rowList <- lapply(resultsList, FUN = function(x) x[rsInd, ])
rsNullDist <- as.data.frame(rbindlist(rowList))
return(rsNullDist)
}
# for gammaNormalize()
pGammaSingle <- function(scoreVec, oneDist) {
pValVec <- pgamma(q = scoreVec, shape = oneDist$estimate["shape"],
rate = oneDist$estimate["rate"],
lower.tail = FALSE) # lower.tail=FALSE means 1-x is given
return(pValVec)
}
# get gamma p-value for each region set based on feature values
# of all CpGs. This is done to normalize for the mean and variance of
# permutation feature value distributions
# @param featureVals matrix/data.frame. Should
gammaNormalize <- function(rsScores, featureVals, method="mme", force=FALSE) {
# estimate distribution parameters
# returns a list where each item is a gamma dist for a column of featureVals
gList <- fitGammaNullDist(nullDistDF = featureVals,
method=method,
force=force)
#### get p-val for score
pValList = list()
# once for each column
for (i in 1:ncol(rsScores)) {
pValList[[i]] <- as.data.frame(pGammaSingle(scoreVec = as.numeric(rsScores[, i]),
oneDist = gList[[i]]))
}
pValDF <- as.data.frame(do.call("cbind", pValList))
colnames(pValDF) <- colnames(rsScores)
return(pValDF)
}
################################################################################
# p value functions
#' Get a p-value for region set scores based on a gamma distribution.
#'
#' First fit a gamma distribution to each region set's null distribution/s
#' (nullDistList). Then use this gamma distribution to convert scores in
#' rsScores to p-values.
#'
#' @template rsScores
#' @param nullDistList list of data.frames. Each list item
#' has null distributions for a single
#' region set (list items should be in the same order as rows of rsScores).
#' Has same score columns as rsScores.
#' Each column corresponds to a null distribution for that
#' region set for a given sample variable of interest/target variable
#' (e.g. PC or sample phenotype).
#' @templateVar usesRSScores TRUE
#' @template signalCol
#' @param method Character. Has the method to use to fit the gamma
#' distribution to the null distribution.
#' Options are
#' "mme" (moment matching estimation), "mle" (maximum likelihood estimation),
#' "qme" (quantile matching estimation), and "mge" (maximum goodness-of-fit
#' estimation). See ?fitdistrplus::fitdist() for
#' available options and meaning.
#' @param realScoreInDist logical. Should the actual score (from
#' test with no permutations) be included in the null distribution
#' when fitting the gamma distribution. realScoreInDist=TRUE is
#' recommended.
#' @param force logical. If force=TRUE, when fitting the gamma distribution
#' returns an error (as may happen when a method other than "mme"
#' is used) then allow the error. If force=FALSE, when fitting the
#' gamma distribution returns an error then don't return an error but
#' instead use the "mme" method
#' for fitting that specific gamma distribution.
#
#' @return Returns a data.frame with p values, one column for each signalCol in
#' rsScores
#'
#' @examples
# fakeOriginalScores <- data.frame(PC1=abs(rnorm(6)), PC2=abs(rnorm(6)))
# fakePermScores <- data.frame(PC1=abs(rnorm(6)), PC2=abs(rnorm(6)))
# fakePermScores2 <- data.frame(PC1=abs(rnorm(6)), PC2=abs(rnorm(6)))
# fakePermScores3 <- data.frame(PC1=abs(rnorm(6)), PC2=abs(rnorm(6)))
# permRSScores <- list(fakePermScores, fakePermScores2, fakePermScores3)
# nullDistList <- convertToFromNullDist(permRSScores)
# getGammaPVal(rsScores=fakeOriginalScores,
# nullDistList=nullDistList,
# signalCol=c("PC1", "PC2"))
#'
#' @export
getGammaPVal <- function(rsScores, nullDistList, signalCol, method="mme",
realScoreInDist=TRUE, force=FALSE) {
# take absolute value since gamma distribution cannot have negative values
if (any(rsScores < 0, na.rm = TRUE)) {
rsScores <- abs(rsScores)
}
conditionalAbs <- function(x) {
if (any(x < 0, na.rm = TRUE)) {
return(abs(x))
} else {
return(x)
}
}
nullDistList <- lapply(X = nullDistList, FUN = conditionalAbs)
# make sure the same columns are present/in the same order
colsToAnnotate <- signalCol[signalCol %in% colnames(rsScores)]
if (realScoreInDist) {
# to get a more accurate gamma distribution, include the score from unpermuted test.
# add to each null distribution
for (i in 1:nrow(rsScores)) {
nullDistList[[i]] <- rbind(nullDistList[[i]], as.numeric(rsScores[i, ]))
}
}
rsScores <- rsScores[, colsToAnnotate, drop=FALSE]
# returns list, each item in list is also a list.
# in sub list: each col in nullDistDF has one list item
fittedDistList <- lapply(X = nullDistList,
function(x) fitGammaNullDist(nullDistDF =
x[, colsToAnnotate,
drop=FALSE],
method=method,
force=force))
# region sets with no coverage have NA for fittedDistList value
# pGammaList cannot assign these column names so assign them ahead of time
# and don't run pGammaList on those region sets
naInd = vapply(X = fittedDistList,
FUN = function(x) all(is.na(x)),
FUN.VALUE = TRUE)
pValList <- list()
# assign na items
naEntry = as.data.frame(t(rep(NA, length(colsToAnnotate))))
colnames(naEntry) = colsToAnnotate
pValList[naInd] = list(naEntry)
# once for each region set
for (i in seq_along(nullDistList)[!naInd]) {
pValList[[i]] <- as.data.frame(t(pGammaList(scoreVec = as.numeric(rsScores[i, ]),
fitDistrList = fittedDistList[[i]])))
}
pValDF <- as.data.frame(rbindlist(pValList))
# Check the NA values and convert them to numeric type (0)
if(any(is.na(pValDF))) {
pValDF[is.na(pValDF)] <- 0.0000
}
return(pValDF)
}
# @param nullDistDF a data.frame. Has null distributions for a single
# region set. Each column corresponds to a null distribution for that
# region set for a given variable/sample attribute.
#
# @return Returns a list or NA. Each list item is a "fitdist" object which is
# a fitted function. If there are any NA's in nullDistDF, then NA will
# be returned instead.
# for one of the columns in nullDistDF (output of fitdist() from
# fitdistrplus.
# These are gamma distributions and can be used to get p values for the
# null distribution so that a large number of permutations
# are not required.The list is in order of the columns and will
# have the names of the data.frame columns.
fitGammaNullDist <- function(nullDistDF, method="mme", force=FALSE) {
processedList <- lapply(nullDistDF, function(x) {
# Replace NA and NaN with a small number close to zero
if(any(is.na(x))) {
x[is.na(x) | is.nan(x)] <- 0.0000
}
return(x)
})
modelList <- lapply(processedList, function(x) {
tryCatch({
x <- as.numeric(x)
fitdistrplus::fitdist(data=x, distr = "gamma", method=method)
}, error = function(e) {
print(paste("Error in fitting column:", e$message))
return(NA) # Return NA in case of an error
})
})
return(modelList)
}
pGammaList <- function(scoreVec, fitDistrList) {
if (any(is.na(scoreVec))) {
naVec <- rep(NA, length(scoreVec))
names(naVec) <- names(scoreVec)
return(naVec)
}
pValVec <- mapply(FUN = function(x, y) pgamma(q = y,
shape = x$estimate["shape"],
rate = x$estimate["rate"],
lower.tail = FALSE), x = fitDistrList, y = scoreVec)
return(pValVec)
}
#' Get p-value or z-score based on permutation results
#'
#' This function starts with real COCOA scores for each
#' region set and null distributions for each
#' region set that comes from running COCOA on permuted data.
#' Then this function uses the null distributions to get
#' an empirical p-value or z-score for each region set.
#' See vignettes for the workflow that leads to this function.
#' The calculation of the p-value/z-score does not
#' include the real region set score in the null distribution.
#'
#' @template rsScores
#' @param nullDistList List. one item per region set. Each item is a
#' data.frame with the null distribution/s for a single region set.
#' Each column in the data.frame is for a target variable
#' (e.g. PC or phenotype), which is given by the `signalCol` parameter
#' (each target variable has a different null distribution for a given region set).
#' @templateVar usesRSScores TRUE
#' @template signalCol
#' @param testType Character. "greater", "lesser", "two-sided" Whether to
#' create p values based on one sided test or not. Only applies when
#' whichMetric="pval".
#' @param whichMetric Character. Can be "pval" or "zscore"
#' @return A data.table/data.frame.
#' If whichMetric="pval", returns the empirical p-value for
#' each region set in `rsScores`. If the region set score is more extreme
#' than all scores in the null distribution, a p-value of 0 is returned but
#' this simply means the p-value is the minimum detectable p-value with
#' the given number of permutations used to make the null distributions. If
#' whichMetric="zscore", the function returns a z-score for each region set
#' score: ((region set score) - mean(null distribution)) / sd(null distribution)
#' @examples
#' fakeOriginalScores <- data.frame(PC1=abs(rnorm(6)), PC2=abs(rnorm(6)))
#' fakePermScores <- data.frame(PC1=abs(rnorm(6)), PC2=abs(rnorm(6)))
#' fakePermScores2 <- data.frame(PC1=abs(rnorm(6)), PC2=abs(rnorm(6)))
#' fakePermScores3 <- data.frame(PC1=abs(rnorm(6)), PC2=abs(rnorm(6)))
#' permRSScores <- list(fakePermScores, fakePermScores2, fakePermScores3)
#' nullDistList <- convertToFromNullDist(permRSScores)
#' getPermStat(rsScores=fakeOriginalScores, nullDistList=nullDistList,
#' signalCol=c("PC1", "PC2"), whichMetric="pval")
#' getPermStat(rsScores=fakeOriginalScores, nullDistList=nullDistList,
#' signalCol=c("PC1", "PC2"), whichMetric="zscore")
#'
#' @export
getPermStat <- function(rsScores, nullDistList, signalCol,
testType="greater", whichMetric = "pval") {
if (is(rsScores, "data.table")) {
rsScores <- as.data.frame(rsScores)
}
# do once for each region set
thisStatList <- list()
for (i in 1:nrow(rsScores)) {
thisStatList[[i]] <- as.data.frame(t(getPermStatSingle(rsScore=rsScores[i, signalCol],
nullDist = nullDistList[[i]],
signalCol = signalCol,
whichMetric=whichMetric)))
colnames(thisStatList[[i]]) <- signalCol
}
thisStat <- rbindlist(thisStatList)
# add back on annotation info
thisStat <- cbind(thisStat, rsScores[, colnames(rsScores)[!(colnames(rsScores) %in% signalCol)]])
# pVals <- mapply(FUN = function(x, y) getPermPvalSingle(rsScore=x,
# nullDist = y,
# signalCol = signalCol),
# x = rsScores[, signalCol], y=nullDistList)
return(thisStat)
}
# get p values for a single region set (can get p val for multiple columns)
# @param rsScore a row of values for a single region set. One
# value for each signalCol
getPermStatSingle <- function(rsScore, nullDist,
signalCol, testType="greater", whichMetric = "pval") {
# Check if nullDist is a list and convert it to a data frame
if (is.list(nullDist)) {
nullDist <- data.frame(do.call(cbind, nullDist))
} else if (is(nullDist, "data.table")) {
nullDist <- as.data.frame(nullDist)
}
# Replace NaN values with 0.0000 in each column
nullDist[] <- lapply(nullDist, function(x) replace(x, is.nan(x), 0.0000))
# if score was NA, return NA
if (any(is.na(rsScore))) {
return(rep(NA, length(rsScore)))
}
if (whichMetric == "pval") {
pVal <- rep(-1, length(rsScore))
if (testType == "greater") {
for (i in seq_along(pVal)) {
# only for one sided test (greater than)
pVal[i] <- 1 - ecdf(x = nullDist[, signalCol[i]])(rsScore[i])
}
} else if (testType == "lesser") {
for (i in seq_along(pVal)) {
# only for one sided test (less than)
pVal[i] <- ecdf(x = nullDist[, signalCol[i]])(rsScore[i])
}
} else if (testType == "two-sided") {
for (i in seq_along(pVal)) {
cdfVal <- ecdf(x = nullDist[, signalCol[i]])(rsScore[i])
# convert to two sided pval
pVal[i] <- -2 * abs(cdfVal - 0.5) + 1
}
}
thisStat <- pVal
}
if (whichMetric == "zscore") {
zScore <- rep(NA, length(rsScore))
for (i in seq_along(zScore)) {
if (is.nan(mean(nullDist[, signalCol[i]])) || is.nan(sd(x = nullDist[, signalCol[i]]))) {
zScore[i] <- 0.0000
} else {
zScore[i] <- (rsScore[i] - mean(nullDist[, signalCol[i]])) / sd(x = nullDist[, signalCol[i]])
}
}
# Condition to check NA in zScore
if(any(is.na(zScore))) {
zScore[is.na(zScore)] <- 0.0000
}
thisStat <- as.numeric(zScore)
}
return(thisStat)
}
##############################################################################
############### old ###################
# Run COCOA permutations to get null distributions for each region set.
# For one permutation, there are several steps:
# First, we shuffle the sample labels. Second, we calculate the association
# between the epigenetic data and the shuffled sample labels using the
# chosen metric (e.g. correlation). Third, the resulting feature coefficients
# are used as input to the aggregateSignalGRList function to score each region set.
# This process is repeated `nPerm` times.
#
# @param groupByRS logical
# @return A list where each item is a data.frame with the null
# distribution for a single region set. The length of the list
# is equal to the number of region sets. The number of rows of
# each data.frame is equal to the number of permutations.
# getNullDist <- function(groupByRS=TRUE) {
#
# }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.