R/permutation.R

Defines functions getPermStatSingle getPermStat pGammaList fitGammaNullDist getGammaPVal permListToOneNullDist convertToFromNullDist runCOCOA runCOCOAPerm detectSignalCoordType

Documented in convertToFromNullDist getGammaPVal getPermStat runCOCOA runCOCOAPerm

# functions that have to do with the permutation test
# and getting p values from that test with the gamma distribution





#' Run COCOA permutations to get p-values
#' 
#' This is a convenience function that runs multiple steps of the 
#' permutation process together: it runs COCOA permutations, converts these
#' to null distributions, gets the empirical p value (which is limited by the
#' number of permutations), gets z scores, and fits a gamma distribution 
#' to each null distribution to estimate p values (not limited by the 
#' number of permutations),
#' Requires that the user has previously calculated the real COCOA scores. 
#' See these individual functions for more info on each step: runCOCOA, 
#' convertToFromNullDist, getPermStat, and getGammaPVal. 
#' 
#' 
#' For reproducibility, set seed with 'set.seed()' function before running.
#' @param nPerm Numeric. The number of permutations to do.
#' @template genomicSignal
#' @template signalCoord
#' @template GRList
#' @template rsScores
#' @template targetVar
#' @templateVar usesTargetVar TRUE
#' @template signalCol
#' @template scoringMetric
#' @template absVal
#' @template olList
#' @param centerGenomicSignal Logical. Should rows in genomicSignal
#' be centered based on
#' their means? (subtracting row mean from each row)
#' @param centerTargetVar Logical. Should columns in targetVar be 
#' centered based
#' on their means? (subtract column mean from each column)
#' @param dataID Character. A unique identifier for this dataset 
#' (for saving results with simpleCache)
#' @template variationMetric
#' @param useSimpleCache Logical. Whether to use save caches. Caches
#' will be created for each permutation so that if the function is disrupted
#' it can restart where it left off. The final results are also saved 
#' as a cache. See simpleCache package for more details.
#' @param cacheDir Character. The path for the directory in which the
#' caches should be saved. 
# @param correctionMethod Character. P value correction method. Default
# is "BH" for Benjamini and Hochberg false discovery rate. For acceptable 
# arguments and more info see ?stats::p.adjust() (method parameter) 
#' @param testType Character. Parameter for `getPermStat`. Whether to
#' create p values based on one a two sided test or a lesser/greater one
#' sided test. Options are: "greater", "lesser", "two-sided" 
#' @param gammaFitMethod Character. method to use for fitting the gamma
#' distribution to null distribution. Options are 
#' "mme" (moment matching estimation), "mle" (maximum likelihood estimation), 
#' "qme" (quantile matching estimation), and "mge" (maximum goodness-of-fit 
#' estimation). See ?COCOA::getGammaPVal and 
#' ?fitdistrplus::fitdist() for more info.
#' @param realScoreInDist Logical. Should the actual score (from 
#' test with no permutations) be included in the null distribution 
#' when fitting the gamma distribution. realScoreInDist=TRUE is 
#' recommended.
#' @param force Logical. If force=TRUE, when fitting the gamma distribution
#' returns an error (as may happen when a method other than "mme"
#' is used) then allow the error. If force=FALSE, when fitting the 
#' gamma distribution returns an error then don't return an error but 
#' instead use the "mme" method
#' for fitting that specific gamma distribution.
#' @template verbose
#' @template returnCovInfo
#' @param ... Character. Optional additional arguments for simpleCache.
#'
#' @return Returns a list with the following 4 items: 
#' 1. a list of length nPerm where each item is a data.frame 
#' of the COCOA scores from a single permutation. 
#' Each data.frame is the output of `runCOCOA()` 
#' 2. a data.table/data.frame of empirical p-values (the
#' output of `getPermStat`) 
#' 3. a data.table/data.frame of z-scores (the output of `getPermStat`. 
#' 4. a data.frame of p-values based on the gamma approximation 
#' (the output of getGammaPVal(). 
#' 
#' @examples 
#' data("esr1_chr1")
#' data("nrf1_chr1")
#' data("brcaMethylData1")
#' data("brcaMCoord1")
#' pcScores <- prcomp(t(brcaMethylData1))$x
#' targetVarCols <- c("PC1", "PC2")
#' targetVar <- pcScores[, targetVarCols]
#' 
#' # give the actual order of samples to `runCOCOA` to get the real scores
#' correctSampleOrder=1:nrow(targetVar)
#' realRSScores <- runCOCOA(genomicSignal=brcaMethylData1,
#'                         signalCoord=brcaMCoord1,
#'                         GRList=GRangesList(esr1_chr1, nrf1_chr1),
#'                         signalCol=c("PC1", "PC2"),
#'                         targetVar=targetVar,
#'                         sampleOrder=correctSampleOrder,
#'                         variationMetric="cor")
#'         
#' # give random order of samples to get random COCOA scores 
#' # so you start building a null distribution for each region set 
#' # (see vignette for example of building a null distribution with `runCOCOA`)
#' randomOrder <- sample(1:nrow(targetVar), 
#'                       size=nrow(targetVar),
#'                       replace=FALSE)
#' randomRSScores <- runCOCOA(genomicSignal=brcaMethylData1,
#'                           signalCoord=brcaMCoord1,
#'                           GRList=GRangesList(esr1_chr1, nrf1_chr1),
#'                           signalCol=c("PC1", "PC2"),
#'                           targetVar=targetVar,
#'                           sampleOrder=randomOrder,
#'                           variationMetric="cor")
#' 
#' # runCOCOAPerm
#' permResults <- runCOCOAPerm(genomicSignal=brcaMethylData1,
#'                            signalCoord=brcaMCoord1,
#'                            GRList=GRangesList(esr1_chr1, nrf1_chr1),
#'                            rsScores=realRSScores,
#'                            targetVar=targetVar,
#'                            signalCol=c("PC1", "PC2"),
#'                            variationMetric="cor",
#'                            nPerm = 10,
#'                            useSimpleCache=FALSE)
#' permResults
#'   
#' 
#' @export

# Function to detect signalCoordType
detectSignalCoordType <- function(signalCoord) {
    if (any(start(signalCoord) != end(signalCoord))) {
        return("multiBase")
    } else if (length(signalCoord) == 1) {
        return("singleBase")
    } else {
        stop("Unable to determine signalCoordType. Please check the input.")
    }
}

runCOCOAPerm <- function(genomicSignal,
                         signalCoord,
                         GRList,
                         rsScores,
                         targetVar,
                         signalCol=c("PC1", "PC2"),
                         scoringMetric="default",
                         absVal=TRUE,
                         olList=NULL,
                         centerGenomicSignal=TRUE,
                         centerTargetVar=TRUE,
                         variationMetric="cor",
                         nPerm=300,
                         useSimpleCache=TRUE,
                         cacheDir=getwd(),
                         dataID="",
                         testType="greater",
                         gammaFitMethod="mme",
                         realScoreInDist=TRUE,
                         force=FALSE,
                         verbose=TRUE,
                         returnCovInfo=FALSE, ...) {
    
    
    colsToAnnotate <- signalCol
    allResultsList <- list()
    
    if (is(rsScores, "data.table")) {
        rsScores = as.data.frame(rsScores)
    }
    rsScores = rsScores[, colsToAnnotate] # prevents error that occurs if extra column is factor
    
    # more efficient to only do once (not that high impact though)
    if (centerGenomicSignal) {
        cpgMeans <- rowMeans(genomicSignal, na.rm = TRUE)
        # centering before calculating correlation
        genomicSignal <- apply(X = genomicSignal, MARGIN = 2, function(x) x - cpgMeans)
        # don't do later
        centerGenomicSignal <- FALSE
    }

    # check that targetVar is not empty
    if (centerTargetVar) {
        if (nrow(targetVarDF) == 0) {
            stop("Error: targetVarDF is empty. Make sure it contains at least one row.")
            }
    } 
                               
    # check if targetVar contains numeric data                           
    if (centerTargetVar) {
        if (all(is.numeric(targetVar))) {
            featureMeans <- colMeans(targetVar, na.rm = TRUE)
            # centering before calculating correlation (also, t() converts to matrix)
            targetVar <- t(apply(X = t(targetVar), MARGIN = 2, function(x) x - featureMeans))
            if (dim(targetVar)[1] == 1) {
                targetVar <- t(targetVar)
            }
            # don't do later
            centerTargetVar <- FALSE
        } else {
            cat("Error: 'targetVar' must contain only numeric data.\n")
            
            # Find non-numeric columns and display their indexes
            nonNumericCols <- which(!is.numeric(targetVar))
            cat("Columns with non-numeric data: ", paste(nonNumericCols,collapse = ", "), "\n")            
        }
    }
    

    checkConvertInputClasses(signal=genomicSignal,
                             signalCoord=signalCoord,
                             regionSet=NULL,
                             signalCol=signalCol,
                             rsOL=NULL)
    
        # detect signalCoordType
        # when signalCoord is a GRanges object
        if (is(signalCoord, "GRanges")) {
          signalCoordType <- detectSignalCoordType(signalCoord)
      } else {
          stop("Error: 'signalCoord' must be a GRanges object.")
      }
      
      # if "default" scoring method is given, choose based on signalCoordType
      if (scoringMetric == "default") {
          if (signalCoordType == "singleBase") {
              scoringMetric <- "regionMean"   
          } else if (signalCoordType == "multiBase") {
              scoringMetric <- "proportionWeightedMean"
          } else {
              stop("Error: signalCoordType not recognized. Check spelling/capitalization.")
          }
      }
    
    #################
    if (is.null(olList)) {
        
        #######
        # must take out NA rows before getting OL list. Otherwise later calculations 
        # will use wrong indices. 
        
        # what happens if there are NAs or Inf in `signal`?
        # any NAs that overlap the regionSet will cause the score to be NA
        if (is(genomicSignal, "data.table")) {
            naRows = apply(X = genomicSignal[, , with=FALSE, drop=FALSE], 
                           MARGIN = 1, FUN = function(x) any(is.na(x)))
        } else {
            naRows = apply(X = genomicSignal[, , drop=FALSE], 
                           MARGIN = 1, FUN = function(x) any(is.na(x)))    
        }
        
        if (any(naRows)) {
            genomicSignal <- genomicSignal[!naRows, ]
            signalCoord <- signalCoord[!naRows]
            warning("Removing rows with NA from `genomicSignal`")
        }
        
        #################################################################
        
        # calculate overlaps only once
        # region set must be subject to fit with scoring functions
        olList <- lapply(X = GRList, FUN = function(x) findOverlaps(query = signalCoord, 
                                                                 subject = x))
        totalRegionNumber = sapply(X = GRList, length)
        meanRegionSize = sapply(X = GRList, function(x) round(mean(width(x))))
    }
    
    # also calculate coverage info
    # @param rsOL 
    calculateCovInfo <- function(rsOL, 
                                 scoringMetric=scoringMetric, 
                                 pOlap=NULL) {
        
        covInfo <- data.frame(signalCoverage=length(unique(queryHits(rsOL))), 
                              regionSetCoverage=length(unique(subjectHits(rsOL))))
        
        if (scoringMetric == "proportionWeightedMean") {
            
            covInfo$sumProportionOverlap <- sum(pOlap)
        }
        
    }
    
    covInfo <- lapply(X = olList, FUN = function(x) calculateCovInfo(rsOL=x, 
                                                          scoringMetric = scoringMetric))
    
    
    if (scoringMetric == "proportionWeightedMean") {
        getPOlap <- function(rsOL, signalGR, regionSet) {
            olap  <- pintersect(signalGR[queryHits(rsOL)],
                                regionSet[subjectHits(rsOL)])
            pOlap <- width(olap) / width(regionSet[subjectHits(rsOL)])
            return(pOlap)
        }
        
        # list
        pOlapList <- mapply(FUN = function(x, y) getPOlap(rsOL = x, 
                                                     signalGR=signalCoord, 
                                                     regionSet = y), 
                       x=olList, y=GRList, SIMPLIFY = FALSE)
        covInfo$sumProportionOverlap <- sapply(X = pOlapList, FUN = sum)
    } else {
        pOlapList <- NULL
    }
    
    #################
    
        
    indList <- list()
    # generate random indices for shuffling of samples
    for (i in 1:nPerm) {
        indList[[i]] <- sample(1:nrow(targetVar), nrow(targetVar))
    }
    # replicate(10, sample(seq_len(nrow(targetVar)))

    if (useSimpleCache) {
        
        # create the main permutation cache
        simpleCache(paste0("rsPermScores_", nPerm, "Perm_", variationMetric, "_", dataID), {
            
        helperFun <- function(x, y, ...) {
            # for (i in (length(rsPermScores) + 1):nPerm) {
            onePermCacheName <- paste0("rsPermScores_", nPerm, "Perm_", variationMetric, "_", dataID, "_Cache", y)
            # create sub caches, one for each permutation
            simpleCache(onePermCacheName, cacheSubDir = paste0("rsPermScores_", nPerm, "Perm_", variationMetric, "_", dataID), {
                
                tmp <- runCOCOA(sampleOrder=x,
                                genomicSignal=genomicSignal,
                                signalCoord=signalCoord,
                                GRList=GRList,
                                signalCol=colsToAnnotate,
                                targetVar=targetVar,
                                variationMetric = variationMetric,
                                scoringMetric=scoringMetric,
                                absVal=absVal,
                                centerGenomicSignal = centerGenomicSignal,
                                centerTargetVar = centerTargetVar,
                                verbose=verbose,
                                olList=olList,
                                pOlapList=pOlapList,
                                returnCovInfo = returnCovInfo)
                message(y) # must be ahead of object that is saved as cache, not after
                tmp
                
            }, cacheDir=cacheDir, assignToVariable="tmp", ...)
            return(tmp)
        }
        rsPermScores = mapply(FUN = helperFun, x=indList, y=seq_along(indList), ..., SIMPLIFY=FALSE)
        rsPermScores
            
        }, assignToVariable="rsPermScores", cacheDir=cacheDir, ...)    
    } else {
        
        helperFun2 <- function(x) {
            tmp <- runCOCOA(sampleOrder=x,
                          genomicSignal=genomicSignal,
                          signalCoord=signalCoord,
                          GRList=GRList,
                          signalCol=colsToAnnotate,
                          targetVar=targetVar,
                          variationMetric = variationMetric,
                          scoringMetric=scoringMetric,
                          absVal=absVal,
                          verbose=verbose,
                          olList=olList,
                          pOlapList=pOlapList,
                          returnCovInfo = returnCovInfo)
            message(".")
            return(tmp)
        }
        
        rsPermScores <- lapply(X = indList, helperFun2)
 
    }
    
    
    .analysisID <- paste0("_", nPerm, "Perm_", variationMetric, "_", dataID)
    
    # # remove region sets that had no overlap
    # keepInd <- apply(rsPermScores[[1]], MARGIN = 1, FUN = function(x) !any(is.na(x)))
    # 
    # # screen out region sets with no overlap
    # nullDistList <- nullDistList[keepInd]
    # rsScores <- rsScores[keepInd, ]
    
    nullDistList <- convertToFromNullDist(rsPermScores)
    if (useSimpleCache) {

        simpleCache(paste0("empiricalPValsUncorrected", .analysisID), {
            rsPVals <- getPermStat(rsScores=rsScores, nullDistList=nullDistList,
                                  signalCol=colsToAnnotate, whichMetric = "pval",
                                  testType=testType)
            rsPVals
        }, assignToVariable="rsPVals", cacheDir=cacheDir, ...)
        
        simpleCache(paste0("permZScores", .analysisID), {
            rsZScores <- getPermStat(rsScores=rsScores, nullDistList=nullDistList,
                                    signalCol=colsToAnnotate, whichMetric = "zscore")
            rsZScores
            
        }, assignToVariable="rsZScores", cacheDir=cacheDir, ...)
    
        simpleCache(paste0("gammaPValsUncorrected", .analysisID), {
            # p-values based on fitted gamma distributions
            gPValDF <- getGammaPVal(rsScores = rsScores, 
                                   nullDistList = nullDistList, 
                                   signalCol = colsToAnnotate, 
                                   method = gammaFitMethod, 
                                   realScoreInDist = realScoreInDist,
                                   force=force)
            # gPValDF <- apply(X = gPValDF, MARGIN = 2, 
            #                 FUN = function(x) p.adjust(p = x, method = correctionMethod))
            gPValDF <- cbind(gPValDF, 
                            rsScores[, colnames(rsScores)[!(colnames(rsScores) 
                                                                    %in% colsToAnnotate)]])
            gPValDF
        }, assignToVariable="gPValDF", cacheDir=cacheDir, ...)    
        
    } else {
        rsPVals <- getPermStat(rsScores=rsScores, nullDistList=nullDistList,
                              signalCol=colsToAnnotate, whichMetric = "pval",
                              testType = testType)
        
        rsZScores <- getPermStat(rsScores=rsScores, nullDistList=nullDistList,
                                signalCol=colsToAnnotate, whichMetric = "zscore")
        
        # p-values based on fitted gamma distributions
        gPValDF <- getGammaPVal(rsScores = rsScores, 
                               nullDistList = nullDistList, 
                               signalCol = colsToAnnotate, 
                               method = gammaFitMethod, 
                               realScoreInDist = realScoreInDist,
                               force=force)
        gPValDF <- cbind(gPValDF, 
                        rsScores[, colnames(rsScores)[!(colnames(rsScores) 
                                                                %in% colsToAnnotate)]])
        
    }
    
    allResultsList$permRSScores <- rsPermScores
    allResultsList$empiricalPVals <- rsPVals
    allResultsList$zScores <- rsZScores
    allResultsList$gammaPVal <- gPValDF
    return(allResultsList)

    # return(named list)
    # definitely: rsPermScores or nullDistList, 
    # also uncorrected perm pvals, z scores, uncorrected gamma pvals
    # possible outputs:  
    # uncorrected pvals perm and gamma, corrected pvals perm and gamma
    # zscores
}

#' Run COCOA: quantify inter-sample variation, score region sets
#' 
#' Run COCOA easily with two steps: quantify epigenetic variation and score 
#' region sets. Get real COCOA scores with default settings. Generate null 
#' distributions by shuffling samples and running COCOA multiple times 
#' to evaluate statistical significance. Choose different options for 
#' epigenetic variation using the variationMetric parameter. For unsupervised 
#' analysis, perform dimensionality reduction first and use the output 
#' as sample labels in runCOCOA. For supervised analysis, use target 
#' variables/phenotypes as targetVar. Check vignettes for examples.
#' 
#' 
#' @param sampleOrder numeric. A vector of length (number of samples). If
#' sampleOrder is 1:(number of samples) then this function will return the
#' real COCOA scores.
#' To generate random COCOA scores in order to make 
#' null distributions, shuffle the samples in a random order.
#' E.g. sampleOrder = sample(1:ncol(genomicSignal), ncol(genomicSignal))
#' where ncol(genomicSignal) is the number of samples. 
#' Set the seed with set.seed() before making sampleOrder to ensure reproducibility.
#' @template genomicSignal
#' @template signalCoord
#' @template GRList
#' @templateVar usesTargetVar TRUE
#' @template signalCol
#' @template targetVar
#' @template variationMetric
#' @template scoringMetric
#' @template verbose
#' @template absVal
#' @template olList
#' @template pOlapList
#' @permutations numeric -- specify the number of permutations. Leave NULL for no permutations (NULL by default).
#' @cores numeric -- specify the number of cores, default: cores=1
#' @param centerGenomicSignal Logical. Should rows in genomicSignal
#' be centered based on
#' their means? (subtracting row mean from each row)
#' @param centerTargetVar Logical. Should columns in targetVar be 
#' centered based
#' on their means? (subtract column mean from each column)
#' @template returnCovInfo
#' @return data.frame. The output of aggregateSignalGRList for one permutation.
#' @examples
#' data("esr1_chr1")
#' data("nrf1_chr1")
#' data("brcaMethylData1")
#' data("brcaMCoord1")
#' pcScores <- prcomp(t(brcaMethylData1))$x
#' targetVarCols <- c("PC1", "PC2")
#' targetVar <- pcScores[, targetVarCols]
#' 
#' # give the actual order of samples to `runCOCOA` to get the real scores
#' correctSampleOrder=1:nrow(targetVar)
#' realRSScores <- runCOCOA(genomicSignal=brcaMethylData1,
#'                         signalCoord=brcaMCoord1,
#'                         GRList=GRangesList(esr1_chr1, nrf1_chr1),
#'                         signalCol=c("PC1", "PC2"),
#'                         targetVar=targetVar,
#'                         sampleOrder=correctSampleOrder,
#'                         variationMetric="cor")
#' realRSScores
#'         
#' # give random order of samples to get random COCOA scores 
#' # so you start building a null distribution for each region set 
#' # (see vignette for example of building a null distribution with `runCOCOA`)
#' randomOrder <- sample(1:nrow(targetVar), 
#'                       size=nrow(targetVar),
#'                       replace=FALSE)
#' randomRSScores <- runCOCOA(genomicSignal=brcaMethylData1,
#'                           signalCoord=brcaMCoord1,
#'                           GRList=GRangesList(esr1_chr1, nrf1_chr1),
#'                           signalCol=c("PC1", "PC2"),
#'                           targetVar=targetVar,
#'                           sampleOrder=randomOrder,
#'                           variationMetric="cor")
#' randomRSScores
#' @export
# Load the required library
library(parallel)

runCOCOA <- function(genomicSignal, 
                    signalCoord, GRList, signalCol,
                    targetVar, 
                    sampleOrder=1:nrow(targetVar),
                    variationMetric = "cor", 
                    scoringMetric="default", verbose=TRUE,
                    absVal=TRUE, olList=NULL, pOlapList=NULL,
                    centerGenomicSignal=TRUE,
                    centerTargetVar=TRUE, 
                    returnCovInfo=TRUE,
                    cores=1,
                    permutations=1) { 
    
    # check if targetVar is empty
    if (is.null(targetVar) || length(targetVar) == 0) {
        stop("`targetVar` is empty")
    }
    
    # if vector is given, return error
    if (is.null(dim(targetVar))) {
        stop("`targetVar` should be a matrix or data.frame")
    }
    
    if (any(!(signalCol %in% colnames(targetVar)))) {
        stop("Not all specified columns are present in `targetVar`")
    }
    
    
    # subset to only signalCol
    targetVar <- as.data.frame(targetVar[, signalCol, drop=FALSE])
    
    # because names are dropped for a single column data.frame when indexing
    # single col data.frame is automatically converted to numeric
    featureNames <- colnames(targetVar)
    # reorder the sample labels
    targetVar <- data.frame(targetVar[sampleOrder, ])
    colnames(targetVar) <- featureNames
    
    # calculate correlation
    featureLabelCor <- createCorFeatureMat(dataMat = genomicSignal, 
                                          featureMat = targetVar, 
                                          centerDataMat = centerGenomicSignal, 
                                          centerFeatureMat = centerTargetVar,
                                          testType = variationMetric)
    
    # more efficient to do only once instead of for each region set later on
    if (absVal) {
        featureLabelCor <- abs(featureLabelCor)
        absVal <- FALSE    
    }
    
    
    # list for results
    permResults <- list()
    
    # Loop through permutations
    for (i in 1:permutations) {
        # Shuffling sample labels
        sampleOrder <- sample(1:nrow(targetVar), nrow(targetVar))
        targetVarPerm <- targetVar[sampleOrder, ]  

        # Run COCOA with parallel processing (if cores > 1)
        if (cores > 1) {
        cl <- makeCluster(cores)
        clusterEvalQ(cl, library(CoCoh))
        thisPermRes <- parLapply(cl, GRList, function(GR) {
            aggregateSignalGRList(signal = featureLabelCor, 
                                signalCoord = signalCoord, GRList = GR, 
                                signalCol = signalCol, 
                                scoringMetric = scoringMetric, verbose = verbose,
                                absVal = absVal, olList = olList, pOlapList = pOlapList,
                                returnCovInfo = returnCovInfo)
        })
        stopCluster(cl)
        } else {
        thisPermRes <- aggregateSignalGRList(signal = featureLabelCor, 
                                            signalCoord = signalCoord, GRList = GRList, 
                                            signalCol = signalCol, 
                                            scoringMetric = scoringMetric, verbose = verbose,
                                            absVal = absVal, olList = olList, pOlapList = pOlapList,
                                            returnCovInfo = returnCovInfo)
        }

        # Store this permutation's result in the list
        permResults[[i]] <- thisPermRes
        
    } 
    # Return the list of permutation results
    return(permResults)
}


#' Converts COCOA permutation results to null distributions and vice versa
#' 
#' This function will take a list of results of permutation tests that included
#' many region sets and return a list of data.frames where each data.frame
#' contains the null distribution for a single region set.
#' The function can also convert in the reverse order from 
#' a list of null distributions to a list of COCOA results. 
#'  
#' @param rsScoresList each item in the list is a data.frame, one item for
#' each permutation with the results of that permutation. Each row in the 
#' data.frame is a region set. All data.frames should be the same size and
#' each data.frame's rows should be in the same order
#'                       
#' @return a list of data.frames. The function returns a list of data.frames. 
#' If the input is a list of data.frames from COCOA permutations, the output 
#' will be a list of data.frames containing null distributions for each region set, 
#' with the same columns as the input data.frames. If the input is a list of data.frames 
#' with null distributions for individual region sets, the output will be a list of data.frames, 
#' each with one row for each region set, representing results for a single COCOA permutation.
#' 
#' @examples
#' # six region sets (rows), 2 signals (columns)
#' fakePermScores <- data.frame(abs(rnorm(6)), abs(rnorm(6)))
#' fakePermScores2 <- data.frame(abs(rnorm(6)), abs(rnorm(6)))
#' # 2 fake COCOA results (i.e. nPerm=2)
#' permRSScores <- list(fakePermScores, fakePermScores2)
#' convertToFromNullDist(permRSScores)
#' 
#' @export

convertToFromNullDist <- function(rsScoresList) {

    nullDistList <- lapply(X = 1:nrow(rsScoresList[[1]]),
                          FUN = function(x) permListToOneNullDist(resultsList=rsScoresList, 
                                                                  rsInd = x))
    return(nullDistList)
}


# @param rsInd numeric. The row number for the region set of interest.
# do for only one region set
permListToOneNullDist <- function(resultsList, rsInd) {
    
    rowList <- lapply(resultsList, FUN = function(x) x[rsInd, ])
    rsNullDist <- as.data.frame(rbindlist(rowList))
    return(rsNullDist)
}





################################################################################
# p value functions 


#' Get a p-value for region set scores based on a gamma distribution. 
#' 
#' First fit a gamma distribution to each region set's null distribution/s
#' (nullDistList). Then use this gamma distribution to convert scores in
#' rsScores to p-values.
#' 
#' @template rsScores 
#' @param nullDistList list of data.frames. Each list item 
#' has null distributions for a single 
#' region set (list items should be in the same order as rows of rsScores). 
#' Has same score columns as rsScores. 
#' Each column corresponds to a null distribution for that 
#' region set for a given sample variable of interest/target variable
#' (e.g. PC or sample phenotype).  
#' @templateVar usesRSScores TRUE
#' @template signalCol
#' @param method Character. Has the method to use to fit the gamma 
#' distribution to the null distribution.
#' Options are 
#' "mme" (moment matching estimation), "mle" (maximum likelihood estimation), 
#' "qme" (quantile matching estimation), and "mge" (maximum goodness-of-fit 
#' estimation). See ?fitdistrplus::fitdist() for
#' available options and meaning.
#' @param realScoreInDist logical. Should the actual score (from 
#' test with no permutations) be included in the null distribution 
#' when fitting the gamma distribution. realScoreInDist=TRUE is 
#' recommended.
#' @param force logical. If force=TRUE, when fitting the gamma distribution
#' returns an error (as may happen when a method other than "mme"
#' is used) then allow the error. If force=FALSE, when fitting the 
#' gamma distribution returns an error then don't return an error but 
#' instead use the "mme" method
#' for fitting that specific gamma distribution.
# 
#' @return Returns a data.frame with p values, one column for each signalCol in
#' rsScores 
#' 
#' @examples 
# fakeOriginalScores <- data.frame(PC1=abs(rnorm(6)), PC2=abs(rnorm(6)))
# fakePermScores <- data.frame(PC1=abs(rnorm(6)), PC2=abs(rnorm(6)))
# fakePermScores2 <- data.frame(PC1=abs(rnorm(6)), PC2=abs(rnorm(6)))
# fakePermScores3 <- data.frame(PC1=abs(rnorm(6)), PC2=abs(rnorm(6)))
# permRSScores <- list(fakePermScores, fakePermScores2, fakePermScores3)
# nullDistList <- convertToFromNullDist(permRSScores)
# getGammaPVal(rsScores=fakeOriginalScores,
#              nullDistList=nullDistList,
#              signalCol=c("PC1", "PC2"))
#' 
#' @export

getGammaPVal <- function(rsScores, nullDistList, signalCol, method="mme", 
                         realScoreInDist=TRUE, force=FALSE) {
    
    
    # take absolute value since gamma distribution cannot have negative values
    if (any(rsScores < 0, na.rm = TRUE)) {
        rsScores <- abs(rsScores)
    }
    conditionalAbs <- function(x) {
        if (any(x < 0, na.rm = TRUE)) {
            return(abs(x))
        } else {
            return(x)
        }
    }
    nullDistList <- lapply(X = nullDistList, FUN = conditionalAbs)
    
    
    # make sure the same columns are present/in the same order
    
    
    colsToAnnotate <- signalCol[signalCol %in% colnames(rsScores)]
    
    if (realScoreInDist) {
        # to get a more accurate gamma distribution, include the score from unpermuted test.
        # add to each null distribution
        for (i in 1:nrow(rsScores)) {
            nullDistList[[i]] <- rbind(nullDistList[[i]], as.numeric(rsScores[i, ]))
        }
    }
    
    rsScores <- rsScores[, colsToAnnotate, drop=FALSE]  
    
    # returns list, each item in list is also a list.
    # in sub list: each col in nullDistDF has one list item
    fittedDistList <- lapply(X = nullDistList, 
                             function(x) fitGammaNullDist(nullDistDF = 
                                                              x[, colsToAnnotate, 
                                                                drop=FALSE], 
                                                          method=method, 
                                                          force=force))
    
    # region sets with no coverage have NA for fittedDistList value
    # pGammaList cannot assign these column names so assign them ahead of time
    # and don't run pGammaList on those region sets
    naInd = vapply(X = fittedDistList, 
                   FUN = function(x) all(is.na(x)), 
                   FUN.VALUE = TRUE)
    
    pValList <- list()
    
    # assign na items
    naEntry = as.data.frame(t(rep(NA, length(colsToAnnotate))))
    colnames(naEntry) = colsToAnnotate
    pValList[naInd] = list(naEntry)
    
    # once for each region set
    for (i in seq_along(nullDistList)[!naInd]) {
        
        pValList[[i]] <- as.data.frame(t(pGammaList(scoreVec = as.numeric(rsScores[i, ]), 
                                                   fitDistrList = fittedDistList[[i]])))
    }
    pValDF <- as.data.frame(rbindlist(pValList))
    
    
    return(pValDF)
}


# @param nullDistDF a data.frame. Has null distributions for a single 
# region set. Each column corresponds to a null distribution for that 
# region set for a given variable/sample attribute.   
# 
# @return Returns a list or NA. Each list item is a "fitdist" object which is 
# a fitted function. If there are any NA's in nullDistDF, then NA will 
# be returned instead.
# for one of the columns in nullDistDF (output of fitdist() from
# fitdistrplus.
# These are gamma distributions and can be used to get p values for the 
# null distribution so that a large number of permutations 
# are not required.The list is in order of the columns and will
# have the names of the data.frame columns. 

fitGammaNullDist <- function(nullDistDF, method="mme", force=FALSE) {
    
    if (any(is.na(nullDistDF))) {
        
        return(NA)
    }
    
    if (force) {
        # apply returns a list of "fitdist" objects, one list item for each column
        modelList <- apply(X = nullDistDF, 
                          MARGIN = 2, 
                          FUN = function(x) fitdistrplus::fitdist(data=x, 
                                                                  distr = "gamma", 
                                                                  method=method))
    } else {
        # try "method". if it fails, do method = "mme"
        modelList <- apply(X = nullDistDF, 
                          MARGIN = 2, 
                          FUN = function(x) tryCatch({fitdistrplus::fitdist(data=x, 
                                                                            distr = "gamma", 
                                                                            method=method)}, 
                                                     error = function(e) {fitdistrplus::fitdist(data=x, 
                                                                                                distr = "gamma", 
                                                                                                method="mme")}))   
    }
    
    return(modelList)
}

pGammaList <- function(scoreVec, fitDistrList) {
    
    if (any(is.na(scoreVec))) {
        naVec <- rep(NA, length(scoreVec))
        names(naVec) <- names(scoreVec)
        return(naVec)
    }
    
    pValVec <- mapply(FUN = function(x, y) pgamma(q = y, 
                                                 shape = x$estimate["shape"], 
                                                 rate = x$estimate["rate"], 
                                                 lower.tail = FALSE), x = fitDistrList, y = scoreVec)
    return(pValVec)
}

#' Get p-value or z-score based on permutation results
#' 
#' This function starts with real COCOA scores for each
#' region set and null distributions for each
#' region set that comes from running COCOA on permuted data. 
#' Then this function uses the null distributions to get 
#' an empirical p-value or z-score for each region set. 
#' See vignettes for the workflow that leads to this function. 
#' The calculation of the p-value/z-score does not 
#' include the real region set score in the null distribution.
#'
#' @template rsScores 
#' @param nullDistList List. one item per region set. Each item is a 
#' data.frame with the null distribution/s for a single region set. 
#' Each column in the data.frame is for a target variable 
#' (e.g. PC or phenotype), which is given by the `signalCol` parameter 
#' (each target variable has a different null distribution for a given region set).
#' @templateVar usesRSScores TRUE
#' @template signalCol 
#' @param testType Character. "greater", "lesser", "two-sided" Whether to
#' create p values based on one sided test or not. Only applies when
#' whichMetric="pval".
#' @param whichMetric Character. Can be "pval" or "zscore"
#' @return A data.table/data.frame. 
#' If whichMetric="pval", returns the empirical p-value for
#' each region set in `rsScores`. If the region set score is more extreme
#' than all scores in the null distribution, a p-value of 0 is returned but
#' this simply means the p-value is the minimum detectable p-value with
#' the given number of permutations used to make the null distributions. If
#' whichMetric="zscore", the function returns a z-score for each region set
#' score: ((region set score) - mean(null distribution)) / sd(null distribution)
#' @examples 
#' fakeOriginalScores <- data.frame(PC1=abs(rnorm(6)), PC2=abs(rnorm(6)))
#' fakePermScores <- data.frame(PC1=abs(rnorm(6)), PC2=abs(rnorm(6)))
#' fakePermScores2 <- data.frame(PC1=abs(rnorm(6)), PC2=abs(rnorm(6)))
#' fakePermScores3 <- data.frame(PC1=abs(rnorm(6)), PC2=abs(rnorm(6)))
#' permRSScores <- list(fakePermScores, fakePermScores2, fakePermScores3)
#' nullDistList <- convertToFromNullDist(permRSScores)
#' getPermStat(rsScores=fakeOriginalScores, nullDistList=nullDistList, 
#'             signalCol=c("PC1", "PC2"), whichMetric="pval") 
#' getPermStat(rsScores=fakeOriginalScores, nullDistList=nullDistList, 
#'             signalCol=c("PC1", "PC2"), whichMetric="zscore") 
#' 
#' @export

getPermStat <- function(rsScores, nullDistList, signalCol, 
                        testType="greater", whichMetric = "pval") {
    
    if (is(rsScores, "data.table")) {
        rsScores <- as.data.frame(rsScores)
    }
    
    # do once for each region set
    thisStatList <- list()
    for (i in 1:nrow(rsScores)) {
        thisStatList[[i]] <- as.data.frame(t(getPermStatSingle(rsScore=rsScores[i, signalCol], 
                                                              nullDist = nullDistList[[i]],
                                                              signalCol = signalCol,
                                                              whichMetric=whichMetric)))
        colnames(thisStatList[[i]]) <- signalCol
    }
    thisStat <- rbindlist(thisStatList)
    # add back on annotation info
    thisStat <- cbind(thisStat, rsScores[, colnames(rsScores)[!(colnames(rsScores) %in% signalCol)]])
    
    # pVals <- mapply(FUN = function(x, y) getPermPvalSingle(rsScore=x, 
    #                                            nullDist = y,
    #                                            signalCol = signalCol), 
    #        x = rsScores[, signalCol], y=nullDistList)
    
    
    return(thisStat)
} 


# get p values for a single region set (can get p val for multiple columns)
# @param rsScore a row of values for a single region set. One 
# value for each signalCol
getPermStatSingle <- function(rsScore, nullDist, 
                              signalCol, testType="greater", whichMetric = "pval") {
    
    if (is(nullDist, "data.table")) {
        nullDist <- as.data.frame(nullDist)
    }
    
    # if score was NA, return NA
    if (any(is.na(rsScore))) {
        return(rep(NA, length(rsScore)))
    }
    
    
    if (whichMetric == "pval") {
        
        pVal <- rep(-1, length(rsScore))
        if (testType == "greater") {
            for (i in seq_along(pVal)) {
                # only for one sided test (greater than)
                pVal[i] <- 1 - ecdf(x = nullDist[, signalCol[i]])(rsScore[i])
            }
        } else if (testType == "lesser") {
            for (i in seq_along(pVal)) {
                # only for one sided test (less than)
                pVal[i] <- ecdf(x = nullDist[, signalCol[i]])(rsScore[i])
            }
        } else if (testType == "two-sided") {
            
            for (i in seq_along(pVal)) {
                cdfVal <- ecdf(x = nullDist[, signalCol[i]])(rsScore[i])
                # convert to two sided pval
                pVal[i] <- -2 * abs(cdfVal - 0.5) + 1
            }

        }
        
        thisStat <- pVal
    }
    
    if (whichMetric == "zscore") {
        
        zScore <- rep(NA, length(rsScore))
        for (i in seq_along(zScore)) {
            # greater than/less than hypothesis does not matter for z-score
            zScore[i] <- (rsScore[i] - mean(nullDist[, signalCol[i]])) / sd(x = nullDist[, signalCol[i]])
        }
        
        thisStat <- as.numeric(zScore)
    }
    
    return(thisStat)
}


##############################################################################
############### old ###################

# Run COCOA permutations to get null distributions for each region set.
# For one permutation, there are several steps: 
# First, we shuffle the sample labels. Second, we calculate the association
# between the epigenetic data and the shuffled sample labels using the 
# chosen metric (e.g. correlation). Third, the resulting feature coefficients
# are used as input to the aggregateSignalGRList function to score each region set.
# This process is repeated `nPerm` times.
# 
# @param groupByRS logical

# @return A list where each item is a data.frame with the null 
# distribution for a single region set. The length of the list
# is equal to the number of region sets. The number of rows of 
# each data.frame is equal to the number of permutations.
# getNullDist <- function(groupByRS=TRUE) {
#     
# }
databio/COCOA documentation built on Sept. 1, 2023, 5:50 p.m.