R/egseaUtils.R

Defines functions makeContrastPairs runStandardLimmaDEA get.toptables createComparison runbaseGSEA extractPvaluesRanks extractAdjPvaluesRanks voteRank combineBaseGSEAs combinePvalues runegsea runbaseGSEAParallelWorker egsea.selectTopGeneSets getBaseInfo calculateSetScores optimizeNumThreads egsea.main

#Ensemble of Gene Set Enrichment Analyses 
# 
# Author: Monther Alhamdoosh, E:m.hamdoosh@gmail.com


egsea.main <- function(voom.results, contrast, gs.annots, baseGSEAs, 
                combineMethod, combineWeights, sort.by, report.dir, 
                kegg.dir, logFC, symbolsMap, minSize, display.top, 
                logFC.cutoff, fdr.cutoff, sum.plot.cutoff, sum.plot.axis, 
                vote.bin.width, keep.base, verbose, num.threads, 
                report, keep.limma, keep.set.scores, interactive){    
    message("EGSEA analysis has started")
    start.time <- proc.time()
    timestamp()
    # check arguments are valid    
#    print(class(voom.results))
    stopifnot((is(voom.results, "list") && 
                        "ids" %in% names(voom.results)) 
                    || is(voom.results, "EList"))  
    #stopifnot(!is.null(contrast))
    stopifnot(!is.null(gs.annots))
    stopifnot(length(baseGSEAs) > 0 && length(setdiff(baseGSEAs, egsea.base())) == 0)
    stopifnot(combineMethod %in% egsea.combine())
    stopifnot(sort.by %in% c(egsea.sort()[1:8], baseGSEAs))
    baseGSEAs = unique(sapply(baseGSEAs, tolower))    
    combineMethod = tolower(combineMethod)  
    sort.by = tolower(sort.by)   
    if (length(baseGSEAs) <= 1){
        warning("The ensemble mode was disabled. No sufficient base methods.")
        if (! sort.by %in% c("p.adj", "p.value")){
            sort.by = "p.adj"
            warning("The argument 'sort.by' was set to \"p.adj\"")
        }
    }
    keep.set.scores = (keep.set.scores && length(intersect(baseGSEAs, 
                    c("ssgsea")))) # , "gsva", "plage", "zscore"  
    if (is.null(sum.plot.axis))
        sum.plot.axis = "p.adj"
    # create a list of GSCollectionIndex objects
    if (is(gs.annots, "GSCollectionIndex")){
        gs.annot = gs.annots
        gs.annots = list()
        gs.annots[[gs.annot@label]] = gs.annot
    }  
    # format the contrast matrix/vector and create contrast names
    if (is.null(contrast)){
        if (is.null(voom.results$targets) || 
                is.null(voom.results$targets$group))
            stop(paste0("The data frame 'targets' of the object 'voom.results' ",
                            "must have a column named 'group'."))
        group.levels = levels(factor(voom.results$targets$group))
        if (all(group.levels %in% colnames(voom.results$design))){
            contrast = makeContrastPairs(
                    colnames(voom.results$design),
                    group.levels)
        }else
            contrast = 2:length(group.levels)        
    }
    if (!is.matrix(contrast)){
        stopifnot(max(contrast) <= ncol(voom.results$design))
        message("The argument 'contrast' is recommended to be a matrix object.\n",
                    "See Vignette or Help.")
        if (is.null(names(contrast))){
            if (is.null(voom.results$targets) || 
                    is.null(voom.results$targets$group))
                stop("The data frame 'targets' of the object 'voom.results' ",
                                "must have a column named 'group'.")
            ref.group = levels(factor(voom.results$targets$group))[1]  
            names(contrast) = paste0(colnames(voom.results$design)[contrast], 
                    "vs", ref.group)
        }
        contr.names = gsub("[[:punct:]]+", "", names(contrast))
        contr.names = gsub("[[:space:]]+", "", names(contrast))
        names(contrast) = contr.names
        colnames(voom.results$design)[contrast] = contr.names
    }else {
        stopifnot(nrow(contrast) == ncol(voom.results$design))
        if (is.null(colnames(contrast)))
            colnames(contrast) = paste0("Contrast", rep(1, ncol(contrast)))
        contr.names = gsub("[[:punct:]]", "", colnames(contrast))
        contr.names = gsub("[[:space:]]+", "", colnames(contrast))
        colnames(contrast) = contr.names
    }   
    # calculate logFC from limma DE analysis 
    logFC.calculated = "No"
    if (is.null(logFC)){
        #row names should be Entrez Gene IDs in order to plot KEGG pathways
        message("Log fold changes are estimated using limma package ... ")
        tmp = runStandardLimmaDEA(voom.results, contrast, logFC.cutoff, fdr.cutoff)
        logFC = tmp$logFC
        limma.results = tmp$limma.results
        limma.tops = tmp$limma.tops
        logFC.calculated = "Yes"
    }else if (!is.matrix(logFC) || !identical(colnames(logFC), contr.names)){
        stop(paste0("logFC should be a matrix object with column names equal ", 
                        "to the (column) names of the argument 'contrast'."))
    }else if (is(voom.results, "EList")){
        tmp = runStandardLimmaDEA(voom.results, contrast, logFC.cutoff, fdr.cutoff)
        limma.results = tmp$limma.results
        limma.tops = tmp$limma.tops
    }else{
        limma.results = new("MArrayLM")
        limma.tops = list()
    }    
    # check featureIDs and logFC matrix row names
    featureIDs = as.character(gs.annots[[1]]@featureIDs)
    if (!identical(rownames(logFC), featureIDs)){     
        logFC = logFC[match(featureIDs, rownames(logFC)) , ]    
        if (!identical(rownames(logFC), featureIDs)){
            stop("The row names of the fold change matrix should \n", 
                  "match the featureIDs vector in the gs.annot list.\n",
                 paste(rownames(logFC)[1:10], collapse=' '), "\n", 
                 paste(featureIDs[1:10], collapse= ' '))          
        }
    }    
    # check the 'symbolsMap' argument and replace NA symbols 
    if (is.null(symbolsMap)){
        symbolsMap = data.frame()
    }else if (nrow(symbolsMap) > 0 && ncol(symbolsMap) >= 2){
        na.sym = is.na(symbolsMap[, 2])
        if (sum(na.sym) > 0){
            warning("Some \"NA\" Gene Symbols were replaced with Feature IDs")
            symbolsMap[na.sym, 2] = symbolsMap[na.sym, 1]
        }
    }    
    if (is.null(report.dir))
      report.dir = ""
    # optimize number of cores to be used 
    num.threads = optimizeNumThreads(num.threads, length(baseGSEAs), 
            length(contr.names), verbose)    
    # create the EGSEAResults object to be populated
    gsas = EGSEAResults(contr.names = contr.names, 
            contrast = contrast,
            sampleSize = getNumberofSamples(voom.results, contrast), 
            gs.annots = gs.annots, baseMethods=baseGSEAs, 
            baseInfo = getBaseInfo(baseGSEAs),
            combineMethod = combineMethod, sort.by = sort.by,   
            symbolsMap = symbolsMap,
            logFC = logFC, logFC.calculated = logFC.calculated,
            sum.plot.axis = sum.plot.axis, 
            sum.plot.cutoff = sum.plot.cutoff,
            report = report, report.dir = report.dir,
            egsea.version = paste0(packageVersion("EGSEA")),
            egseaData.version = paste0(packageVersion("EGSEAdata"))
            )  
    if (keep.limma && length(limma.results) > 0)
        gsas@limmaResults = limma.results
    ###### START THE EGSEA ANALYSIS ON INDIVIDUAL COLLECTIONS ######
    skipped = c()    
    for (gs.annot in gs.annots){
        gs.annot = selectGeneSets(gs.annot, min.size=minSize)  
        if (length(gs.annot@idx) == 0){
            message("No gene sets in ", gs.annot@label, 
                    " meets the minimum size criterion.")
            skipped = c(skipped, gs.annot@label)
            next
        }
        # run egsea and write out ranked gene sets for all contrasts
        message("EGSEA is running on the provided data and ",
                        gs.annot@label, " collection")   
        
        results <- runegsea(voom.results = voom.results, 
                contrast = contrast, limma.tops = limma.tops,
                baseGSEAs = baseGSEAs,
                combineMethod = combineMethod, combineWeights = NULL,
                gs.annot = gs.annot, logFC = logFC, 
                logFC.cutoff = logFC.cutoff, fdr.cutoff = fdr.cutoff,
                vote.bin.width=vote.bin.width, keep.base = keep.base,
                num.workers = num.threads, verbose=verbose)            
             
        egsea.results = results[["egsea.results"]]
        # order results based on the sort.by argument
        for (i in 1:length(egsea.results)){
            # sort based on the average ranking
            egsea.results[[i]] = egsea.results[[i]][
                                order(egsea.results[[i]][,sort.by],
                                decreasing=(sort.by == "significance")), 
                                ]      
        }  
        
        gsa = list("test.results"=egsea.results)
        if (keep.base)
            gsa[["base.results"]] = results[["base.results"]]        
        if (keep.set.scores){
            set.methods = intersect(baseGSEAs, 
                    c("ssgsea")) # , "gsva", "plage", "zscore"
            gsa[["set.scores"]] = calculateSetScores(voom.results, gs.annot, 
                    set.methods, num.threads, verbose)
        }  
        # Comparison analysis reports generated here
        if (length(egsea.results) > 1){         
            egsea.comparison = createComparison(egsea.results, 
                    combineMethod = combineMethod, 
                    display.top=Inf, sort.by = sort.by)
            gsa[["comparison"]] = list()
            gsa$comparison[["test.results"]] = egsea.comparison
            egsea.comparison.all = egsea.comparison            
            egsea.comparison = egsea.comparison[1:ifelse(nrow(egsea.comparison) 
                > display.top, 
                display.top, nrow(egsea.comparison)), ]
            # gsa$comparison[["top.gene.sets"]] = rownames(egsea.comparison)
        }   
        gsas = addEGSEAResult(gsas, gs.annot@label, gsa)
        #gsas[[gs.annot@label]] = gsa
    }    
    elapsed.time = proc.time() - start.time    
    timestamp()
    message("EGSEA analysis took ", elapsed.time["elapsed"], " seconds.")
    message("EGSEA analysis has completed")
    if (report){
      generateMainReport(gsas, limma.tops = limma.tops,
               display.top = display.top,
               kegg.dir = kegg.dir, num.threads = num.threads, 
               print.base = TRUE,
               interactive = interactive,
               verbose = verbose) 
    }
    return(gsas)
}

optimizeNumThreads <- function(num.threads, base.num, contr.num, verbose=TRUE){    
    cores = detectCores()    
    exp.threads.num = min(num.threads, base.num) * 
                    min(num.threads, contr.num)
    if (verbose)
        message("Expected number of running processes: ", 
                        exp.threads.num)
    if (cores < exp.threads.num ){
        new.threads.num = 1
        exp.threads.num = min(new.threads.num, base.num) * 
                min(new.threads.num, contr.num)
        while (new.threads.num < cores &&
                exp.threads.num  < cores ){
            new.threads.num = new.threads.num + 1
            exp.threads.num = min(new.threads.num, base.num) * 
                    min(new.threads.num, contr.num)
#            print(new.threads.num)
#            print(exp.threads.num)
        }        
    } else if (cores < num.threads){
        new.threads.num = cores
    }
    else
        new.threads.num = num.threads
    if (new.threads.num  != num.threads){      
        message("Number of used cores has changed to ", 
                        new.threads.num , "\nin order to avoid ",
                        "CPU overloading.")
    }
    return(new.threads.num)
}

calculateSetScores <- function(voom.results, gs.annot, set.methods,
        num.workers, verbose = TRUE){    
    set.scores = list()
    if (length(set.methods) == 0){
        message("The parameter keep.set.scores has nothing to keep.")
        return(set.scores)
    }
    data.log = voom.results$E
    rownames(data.log) = as.character(seq(1, nrow(data.log)))
    gsets = list()
    for (j in 1:length(gs.annot@idx)){
        gsets[[j]] = as.character(gs.annot@idx[[j]])
    }
    names(gsets) = names(gs.annot@idx)
    for (method in set.methods){
        if (verbose)
            message("Gene set enrichment scores per sample are\n", 
                    "being calculated using ", 
                        method, "...")
        gs.es = calculateSetScores.parallel(data.log, gsets, method, num.workers)
        set.scores[[method]] = gs.es
    }
    return(set.scores)
}

getBaseInfo <- function(baseGSEAs){
    basePkg = list()
    basePkg[["camera"]] = "limma"
    basePkg[["roast"]] = "limma"
    basePkg[["safe"]] = "safe"
    basePkg[["gage"]] = "gage"
    basePkg[["padog"]] = "PADOG"
    basePkg[["plage"]] = "GSVA"
    basePkg[["zscore"]] = "GSVA"
    basePkg[["gsva"]] = "GSVA"
    basePkg[["ssgsea"]] = "GSVA"
    basePkg[["globaltest"]] = "globaltest"
    basePkg[["ora"]] = "stats"
    basePkg[["fry"]] = "limma"    
    baseInfo = list()
    for (baseGSEA in baseGSEAs){
        baseInfoi = list()
        pkg = basePkg[[baseGSEA]]
        baseInfoi$version = packageVersion(pkg)        
        baseInfoi$package = pkg
        baseInfo[[baseGSEA]] = baseInfoi
    }
    return(baseInfo)
}


egsea.selectTopGeneSets <- function(egsea.results, display.top, gs.annot, 
 verbose=TRUE){
    contrast.names = names(egsea.results)
    top.gene.sets = list()
    
    for(i in 1:length(egsea.results)){      
        #num.gene.sets.fdr = sum(egsea.results[[i]]$FDR < fdr[i])
        num.gene.sets.fdr = ifelse(length(gs.annot@idx) > display.top, 
display.top,length(gs.annot@idx))
        top.print = ifelse(length(gs.annot@idx) >= 10, 10,length(gs.annot@idx))
        if (top.print > num.gene.sets.fdr)
            top.print = num.gene.sets.fdr
        if (num.gene.sets.fdr > 0){
            egsea.results.top = egsea.results[[i]][1:num.gene.sets.fdr,]    
        
            top.gene.sets[[i]] = rownames(egsea.results.top) 
            if (verbose){
                print(paste0("The top gene sets for contrast ", 
                                contrast.names[i], " are:"))
                if (length(grep("^kegg", gs.annot@label)) == 0){
                    top.table = cbind(gs.annot@anno[
                    match(rownames(egsea.results.top), gs.annot@anno[,2])
                                    ,-6],
                            egsea.results.top)
                    print(top.table[1:top.print, c("ID", "p.adj")])
                }else{
                    top.table = cbind(gs.annot@anno[
                                    match(rownames(egsea.results.top), 
                                        gs.annot@anno[,2]), -2],
                            egsea.results.top)
                    print(top.table[1:top.print, c("Type", "p.adj")])
                }                
            }
            
        }else{            
            message("No gene sets found")
            top.gene.sets[[i]] = NA
        }
        
    }
    
    names(top.gene.sets) = contrast.names
    return(top.gene.sets)
}


runbaseGSEAParallelWorker <- function(args){
    #print(paste0("Running ", toupper(args$baseGSEA), " on all contrasts ... "))
    tryCatch({
                #t = system.time(
                temp.result <- runbaseGSEA(method=args$baseGSEA, 
                        args$voom.results, args$contrast, args$gs.annot,                        
                        num.threads = args$num.threads, verbose=args$verbose)
                #)
                #print(paste0(args$baseGSEA, " ", args$gs.annot$name, " ", t[3]))
                if (args$verbose)
                    message(paste0("Running ", toupper(args$baseGSEA), " on all ", 
							"contrasts ... COMPLETED"))
                else{
                    message(args$baseGSEA, "*", appendLF = FALSE)
                }
                return(temp.result)
            }, 
            error = function(e) {
                message(toupper(args$baseGSEA), 
                           " encountered an error -> ", e )
            })
    return(NULL)
}

runegsea <- function(voom.results, contrast, limma.tops,
            baseGSEAs, combineMethod, 
            combineWeights=NULL, gs.annot, logFC, 
            logFC.cutoff, fdr.cutoff, 
            vote.bin.width, keep.base=TRUE, 
            num.workers=8, verbose=FALSE){     
    stopifnot(is(gs.annot, "GSCollectionIndex"))
    if (!is.null(voom.results$E))
        geneIDs = rownames(voom.results$E)
    else 
        geneIDs = voom.results$ids
    stopifnot(identical(geneIDs, gs.annot@featureIDs))
    if (is.matrix(contrast)){
        contr.names = colnames(contrast)        
    }else{
        contr.names = names(contrast)        
    }
    # egsea.results.details stores Contrasts ==> Individual Results
    egsea.results.details = vector("list", length(contr.names))  
    names(egsea.results.details) = contr.names
    for (i in 1:length(contr.names)){
        egsea.results.details[[i]] = vector("list", length(baseGSEAs))
        names(egsea.results.details[[i]]) = baseGSEAs
    }
    # run enrichment analysis using base methods
    args.all = list()
    threads.per.base = num.workers / 2 # ceiling(num.workers / length(baseGSEAs))
    for (baseGSEA in baseGSEAs){
        args.all[[baseGSEA]] = list(baseGSEA=baseGSEA, 
                    voom.results=voom.results,
                    contrast=contrast, gs.annot=gs.annot,                                        
                    num.threads=threads.per.base,
                    verbose=verbose)
    }
    # temp.results stores Methods ==> Contrasts
    if (Sys.info()['sysname'] == "Windows" || num.workers <= 1 || 
length(baseGSEAs) == 1)
        # sequential processing
        temp.results = lapply(args.all, runbaseGSEAParallelWorker) 
    else
        # parallel processing
        temp.results = mclapply(args.all, runbaseGSEAParallelWorker, 
mc.cores=num.workers)   
    if (!verbose)
        message("")
    # collect results   
    #print(names(temp.results))
    for (baseGSEA in baseGSEAs){
        for (i in 1:length(contr.names)){
            # order is important when combine
            if (is.null(temp.results[[baseGSEA]])){
                stop("ERROR: One of the base methods failed on this ", 
                "dataset (", baseGSEA , 
                ").\nRemove it and try again.\nSee error messages for ", 
                "more information.")
            }
#            print(paste0(baseGSEA, colnames(contrast)[i]))
#            if (baseGSEA == "globaltest")
#                print(temp.results[[baseGSEA]][[i]])
            egsea.results.details[[i]][[baseGSEA]] = 
temp.results[[baseGSEA]][[i]][names(gs.annot@idx),]         
        }
    }   
  
    # combine the results of base methods 
#    print(names(egsea.results.details))
    egsea.results = combineBaseGSEAs(results.multi=egsea.results.details, 
            combineMethod=combineMethod, combineWeights=combineWeights, 
            bin.width = vote.bin.width) 
    # calculate additional stats
#    print(names(egsea.results))    
    for (i in 1:length(egsea.results)){ # i over contrasts
        gs.avg.fcs = numeric(0)  
        gs.avg.fcs.dir = numeric(0)
        gs.dirs = numeric(0)
        gsets = as.character(rownames(egsea.results[[i]]))
        fc = logFC[, i]
        if (length(limma.tops) > 0)
            t = limma.tops[[i]]        
        for (j in 1:length(gsets)){             
            sel.genes = gs.annot@idx[[gsets[j]]]
            gset.fc = fc[sel.genes]    
            if (length(limma.tops) > 0){
                temp = gset.fc[t[sel.genes, "adj.P.Val"] <= fdr.cutoff]
                if (length(temp) > 0)
                    gset.fc = temp            
                temp = abs(gset.fc)
                temp = temp[temp >= logFC.cutoff]
                if (length(temp) == 0)
                    temp = abs(gset.fc)
            }else
                temp = abs(gset.fc)
            gs.avg.fcs = c(gs.avg.fcs, mean(temp, na.rm=TRUE))          
            up = sum(gset.fc > logFC.cutoff, na.rm=TRUE)
            dn = sum(gset.fc < -logFC.cutoff, na.rm=TRUE)
            if (up > dn){
                gs.dirs = c(gs.dirs, 1)
                gs.avg.fcs.dir = c(gs.avg.fcs.dir, 
                        mean(gset.fc[gset.fc > logFC.cutoff], na.rm=TRUE))
            }else{
                gs.dirs = c(gs.dirs, -1)
                gs.avg.fcs.dir = c(gs.avg.fcs.dir, 
                        mean(gset.fc[gset.fc < -logFC.cutoff], na.rm=TRUE))
            }            
        }   
#        print(head(egsea.results[[i]]))
        pvalues = egsea.results[[i]][, "p.adj"]        
        pvalues = -1 * log10(pvalues)
        pvalues[pvalues == Inf] = max(pvalues[pvalues != Inf]) + 50
        pvalues[is.na(pvalues)] = 0 
        sig = pvalues * gs.avg.fcs
        if (max(sig, na.rm=TRUE) != min(sig, na.rm=TRUE))
            sig = (sig - min(sig, na.rm=TRUE)) / (max(sig, na.rm=TRUE) - 
                    min(sig, na.rm=TRUE)) * 100
        m = length(baseGSEAs)       
        if (m == 1){            
            egsea.results[[i]] = cbind(egsea.results[[i]], 
                        "avg.logfc" = gs.avg.fcs,
                        "avg.logfc.dir" = gs.avg.fcs.dir,
                    "direction" = gs.dirs, "significance" = sig)
        }
        else{
            n = ncol(egsea.results[[i]])
            # insert stat columns in the middle
            egsea.results[[i]] = cbind(egsea.results[[i]][, 1:(n-m)], 
                            "avg.logfc" = gs.avg.fcs,
                            "avg.logfc.dir" = gs.avg.fcs.dir,
                    "direction" = gs.dirs, "significance" = sig, 
                    egsea.results[[i]][, (n-m+1):n] )
        }        
    }
    
    names(egsea.results) = contr.names
    results = list("egsea.results"=egsea.results)
    if (keep.base){
        results[["base.results"]] = egsea.results.details
    }
    return(results)
}


combinePvalues <- function(data, combineMethod, combineWeights = NULL){
    if (ncol(data) > 1){
        if (combineMethod == "average"){
            #print(head(data))
            pvalues =sapply(1:nrow(data),  
                            function(i) {
                                x = data[i, ]                                
                                return(ifelse(length(x[!is.na(x)]) >= 4, 
                                        meanp(x[!is.na(x)])$p,
                                        mean(x)
                                      )) 
                             }
                           )    
        } else if (combineMethod == "fisher"){        
            data[data == 0] = 1*10^-22           
            pvalues = sapply(apply(data,  1, function(y) sumlog(y[!is.na(y)])), 
                    function(x) x$p)            
        } else if (combineMethod == "logitp"){        
            data[data == 0] = 1*10^-22
            data[data == 1] = 1 - 1*10^-5
            pvalues = sapply(apply(data,  1, function(y) logitp(y[!is.na(y)])), 
                    function(x) x$p)            
        }else if (combineMethod == "sump"){
            pvalues = sapply(apply(data,  1, function(y) sump(y[!is.na(y)])), 
                    function(x) x$p)            
        }else if (combineMethod == "sumz"){        
            data[data == 0] = 1*10^-22
            data[data == 1] = 1 - 1*10^-5
            pvalues = sapply(apply(data,  1, function(y) sumz(y[!is.na(y)])), #, combineWeights 
                    function(x) x$p)           
        }else if (combineMethod == "wilkinson"){        
            data[data == 0] = 1*10^-22
            data[data == 1] = 1 - 1*10^-5
            pvalues = sapply(apply(data,  1, function(y) wilkinsonp(y[!is.na(y)], r = 1)), 
                    function(x) x$p)            
        } else if (combineMethod == "votep"){        
          data[data == 0] = 1*10^-22
          data[data == 1] = 1 - 1*10^-5
          pvalues = sapply(apply(data,  1, function(y) votep(y[!is.na(y)])), 
                           function(x) x$p)            
        } else if (combineMethod == "median"){        
          data[data == 0] = 1*10^-22
          data[data == 1] = 1 - 1*10^-5
          pvalues = sapply(apply(data,  1, function(y) median(y[!is.na(y)])), 
                           function(x) x$p)            
        }    
    } else{
        pvalues = data[, 1]
    }
    #print(class(pvalues))
    #print(pvalues)
    adj.pvals = p.adjust(pvalues, method="BH")
    return(list(pvalues=pvalues, adj.pvals = adj.pvals))
}

combineBaseGSEAs <- function(results.multi, combineMethod, combineWeights=NULL, 
                    bin.width=5){
    if (length(results.multi) == 1 && 
        length(results.multi[[1]]) == 1 && 
        names(results.multi[[1]]) == "ora"){
        results.multi[[1]] = results.multi[[1]][[1]]
        results.multi[[1]] = results.multi[[1]][, 
                colnames(results.multi[[1]]) != "Rank"]
        return(results.multi)
    }
  
    if (length(results.multi[[1]]) == 1){
        results.combined = vector('list', length(results.multi))
        names(results.combined) = names(results.multi)
        for (i in 1:length(results.combined)){
            results.combined[[i]] = results.multi[[i]][[1]]
            results.combined[[i]] = results.combined[[i]][, 
                            colnames(results.combined[[i]]) != "Rank"]
        }
        return(results.combined)
    }
    # compress detailed results into matrices
    temp = extractPvaluesRanks(results.multi) 
    results.comp = temp$pvalues
    results.ranked = temp$ranks # gene sets ranked for each method
    results.combined = vector('list', length(results.ranked))
    names(results.combined) = names(results.comp)
#   results.comp = extractFDRs(results.multi)
    
    for (i in 1:length(results.comp)){ # i over contrasts
        temp = combinePvalues(results.comp[[i]], combineMethod, combineWeights)
        pvalues = temp$pvalues
        adj.pvals = temp$adj.pvals
        results.combined[[i]] = data.frame(cbind(
                "p.value"=pvalues, 
                "p.adj"=adj.pvals))
        if (ncol(results.ranked[[i]]) > 1){
            results.combined[[i]] = cbind(results.combined[[i]],  
                "vote.rank" = voteRank(results.ranked[[i]], 
                                        bin.width=bin.width),
                "avg.rank"=rowMeans(results.ranked[[i]], 
                            na.rm=TRUE), 
                "med.rank"=rowMedians(results.ranked[[i]], 
                            na.rm=TRUE),
                "min.pvalue"=sapply(1:nrow(results.comp[[i]]), 
                        function(x) min(results.comp[[i]][x, ], na.rm=TRUE)),
                "min.rank"=sapply(1:nrow(results.ranked[[i]]), 
                        function(x) min(results.ranked[[i]][x, ], na.rm=TRUE)),
                results.ranked[[i]]
            )
        }
        rownames(results.combined[[i]]) = rownames(results.comp[[i]])   
        
    }    
    
    #TODO: weighted average ranking
    #"wavg.rank"=rowMeans(sweep(results.ranked[[i]], MARGIN=2, combineWeights, 
    #`*`)), 
    
    return(results.combined)
}

voteRank <- function(results.ranked, bin.width=5){
    # convert to bins of bin.width 
    results.votes = numeric(nrow(results.ranked))
    if (bin.width != -1)
        results.ranked = (floor((results.ranked-1) / bin.width) + 1) * bin.width
    
    # apply majority voting: find frequencies. if tie, ignore 
    for (i in 1:nrow(results.ranked)){
        votes = results.ranked[i, ]
        cnts = list()
        for (j in 1:length(votes)){
            if (!is.null(cnts[[paste0(votes[j])]]))
                cnts[[paste0(votes[j])]] = cnts[[paste0(votes[j])]] + 1
            else
                cnts[[paste0(votes[j])]] = 1
        }       
        #if (max(as.numeric(cnts)) > floor(ncol(results.ranked) / 2)){
        indx = which.max(as.numeric(cnts))
        results.votes[i] = names(cnts)[indx]
#       }else
#           results.votes[i] = NA
    }
    # return the new ranking
    return(as.numeric(results.votes))
}



extractAdjPvaluesRanks <- function(results.multi){
    results.comp = vector("list", length(results.multi))
    names(results.comp) = names(results.multi)
    results.ranked = vector("list", length(results.comp))
    names(results.ranked) = names(results.comp)
    for (i in 1:length(results.multi)){ # i over contrasts
        results.comp[[i]] = matrix(0, nrow(results.multi[[i]][[1]]), 
                length(results.multi[[i]]))
        rownames(results.comp[[i]]) = rownames(results.multi[[i]][[1]])
        colnames(results.comp[[i]]) = names(results.multi[[i]]) 
        results.ranked[[i]] = matrix(0, nrow(results.comp[[i]]), 
                ncol(results.comp[[i]]))    
        rownames(results.ranked[[i]]) = rownames(results.comp[[i]])
        colnames(results.ranked[[i]]) = colnames(results.comp[[i]])
        j = 1
        #print(names(results.comp)[i])
        for (method in names(results.multi[[i]])){ # j is over base methods           
            results.comp[[i]][,j] = as.numeric(results.multi[[i]][[j]][, 
                        "p.adj"])           
            results.ranked[[i]][,j] = as.numeric(results.multi[[i]][[j]][, 
                            "Rank"])
            if (max(results.comp[[i]][,j], na.rm = TRUE) < 0.000001)
                warning(method, 
                    " produces very low p-values on ", 
                    names(results.comp)[i])
            j = j + 1
        }
    }
    return(list(pvalues=results.comp, ranks=results.ranked))
}

extractPvaluesRanks <- function(results.multi){
    results.comp = vector("list", length(results.multi))
    names(results.comp) = names(results.multi)
    results.ranked = vector("list", length(results.comp))
    names(results.ranked) = names(results.comp)
    for (i in 1:length(results.multi)){ # i over contrasts
        results.comp[[i]] = matrix(0, nrow(results.multi[[i]][[1]]), 
                                    length(results.multi[[i]]))
        rownames(results.comp[[i]]) = rownames(results.multi[[i]][[1]])
        colnames(results.comp[[i]]) = names(results.multi[[i]]) 
        results.ranked[[i]] = matrix(0, nrow(results.comp[[i]]), 
                                        ncol(results.comp[[i]]))    
        rownames(results.ranked[[i]]) = rownames(results.comp[[i]])
        colnames(results.ranked[[i]]) = colnames(results.comp[[i]])
        j = 1
        #print(names(results.comp)[i])
        for (method in names(results.multi[[i]])){ # j is over base methods           
            results.comp[[i]][,j] = as.numeric(results.multi[[i]][[j]][, 
                            "p.value"]) 
            results.ranked[[i]][,j] = as.numeric(results.multi[[i]][[j]][, 
                "Rank"])
            if (max(results.comp[[i]][,j], na.rm = TRUE) < 0.000001)
                warning(method, 
                      " produces very low p-values on ", 
                      names(results.comp)[i])
            j = j + 1
        }
    }
    return(list(pvalues=results.comp, ranks=results.ranked))
}

runbaseGSEA <- function(method, voom.results, contrast, gs.annot,                         
                        num.threads = 4, verbose=TRUE){
    if (method == "camera"){        
        return(runcamera(voom.results = voom.results, contrast = contrast, 
            gs.annot = gs.annot, num.workers = num.threads, verbose = verbose))     
    }else if (method == "roast"){       
        return(runroast(voom.results = voom.results, contrast = contrast, 
            gs.annot = gs.annot, num.workers = num.threads, verbose = verbose))     
    }else if (method == "fry"){
        return(runfry(voom.results = voom.results, contrast = contrast, 
            gs.annot = gs.annot, num.workers = num.threads, verbose = verbose))
    }else if (method == "gage"){        
        return(rungage(voom.results = voom.results, contrast = contrast, 
            gs.annot = gs.annot, num.workers = num.threads, verbose = verbose))     
    }else if (method == "padog"){       
        return(runpadog(voom.results = voom.results, contrast = contrast, 
            gs.annot = gs.annot, num.workers = num.threads, verbose = verbose))
    }else if (method == "plage"){       
        return(rungsva(method="plage", voom.results = voom.results, contrast = 
            contrast, gs.annot = gs.annot, num.workers = num.threads, 
            verbose = verbose))
    }else if (method == "zscore"){  
        return(rungsva(method="zscore", voom.results = voom.results, contrast = 
            contrast, gs.annot = gs.annot, num.workers = num.threads, 
            verbose = verbose))
    }else if (method == "gsva"){        
        return(rungsva(method="gsva", voom.results = voom.results, contrast = 
            contrast, gs.annot = gs.annot, num.workers = num.threads, 
            verbose = verbose))
    }else if (method == "ssgsea"){  
        return(rungsva(method="ssgsea", voom.results = voom.results, contrast = 
            contrast, gs.annot = gs.annot, num.workers = num.threads, 
            verbose = verbose))
    }else if (method == "globaltest"){      
        return(runglobaltest(voom.results = voom.results, contrast = contrast, 
            gs.annot = gs.annot, num.workers = num.threads, verbose = verbose))
    }else if (method == "safe"){        
        return(runsafe(voom.results = voom.results, contrast = contrast, 
            gs.annot = gs.annot, num.workers = num.threads, verbose = verbose))
    }else if (method == "ora"){
        return (runora(voom.results = voom.results, contrast = contrast, 
            gs.annot = gs.annot, num.workers = num.threads, verbose = verbose))         
    }else{
        stop("Method not recognized. Type egsea.base() to see supported base 
methods.")
    }
}

createComparison <- function(egsea.results, combineMethod="fisher", display.top=100, 
        sort.by="p.value"){
    egsea.comparison = numeric(0)
    col.names = colnames(egsea.results[[1]])
    col.names.sel = character(0)
    gset.names =  rownames(egsea.results[[1]])
    for (i in 1:length(col.names)){ # iterate over egsea.results columns
        if (col.names[i] == "p.adj")
            next
        temp= numeric(0)
        for (j in 1:length(egsea.results)){ # iterate over contrasts
            temp = cbind(temp, egsea.results[[j]][gset.names, i])
        }
        
        if (col.names[i] == "p.value"){         
            temp = combinePvalues(temp, combineMethod)
            pvalues = temp$pvalues
            adj.pvals = temp$adj.pvals
            egsea.comparison = cbind(egsea.comparison, 
                    pvalues, adj.pvals)
            col.names.sel = c(col.names.sel, col.names[i], "p.adj")
        }
        else if (col.names[i] == "med.rank"){
            egsea.comparison = cbind(egsea.comparison, rowMedians(temp, 
                    na.rm=TRUE))
            col.names.sel = c(col.names.sel, col.names[i])
        }
        else if (length(grep("min", col.names[i])) > 0){
            minVals = sapply(1:nrow(temp), function(x) min(temp[x, ], 
                    na.rm=TRUE))
            egsea.comparison = cbind(egsea.comparison, minVals)
            col.names.sel = c(col.names.sel, col.names[i])
        }
        else if (col.names[i] %in% c("avg.rank", "direction", "significance", 
                "avg.logfc", "avg.logfc.dir")){
            egsea.comparison = cbind(egsea.comparison, rowMeans(temp, 
                na.rm=TRUE))
            col.names.sel = c(col.names.sel, col.names[i]) 
        }else if (col.names[i] == "vote.rank"){
            if (length(egsea.results) > 2){
                egsea.comparison = cbind(egsea.comparison, voteRank(temp, 
                    bin.width = -1))
                col.names.sel = c(col.names.sel, col.names[i])
            }else{
                egsea.comparison = cbind(egsea.comparison, rowMeans(temp, 
                            na.rm=TRUE))
                col.names.sel = c(col.names.sel, "vote.rank.mean")
            }
        }else{
            next
        }
    }
    rownames(egsea.comparison) = gset.names
#   print(colnames(egsea.comparison))
#   print(col.names)
    colnames(egsea.comparison) = col.names.sel
    egsea.comparison = egsea.comparison[order(egsea.comparison[,sort.by]), ]
    display.top = ifelse(nrow(egsea.comparison) > display.top, display.top, 
nrow(egsea.comparison))
    return(egsea.comparison[1:display.top, ])
    
}

get.toptables <- function(ebayes.results, contrast){
    if (is.matrix(contrast)){        
        contr.names = colnames(contrast)
        coefs = 1:ncol(contrast)
# between the groups
    }else{
        contr.names = names(contrast)
        coefs = contrast
    }
    limma.tops = list()
    for (i in 1:length(coefs)){
        top.table = topTable(ebayes.results, coef=coefs[i], 
                number=Inf, sort.by="none")        
        rownames(top.table) = rownames(ebayes.results)
        limma.tops[[contr.names[i]]] = top.table
    }
    return(limma.tops)
}

runStandardLimmaDEA <- function(voom.results, contrast, 
        logFC.cutoff, fdr.cutoff){
    # to be changed for gene symbols support
    stopifnot(is(voom.results, "EList"))
    message("limma DE analysis is carried out ... ")
    # fit linear model for each gene using limma package functions
    vfit = lmFit(voom.results, design=voom.results$design) # Fit linear model 
# for each gene given a series of arrays
    if (is.matrix(contrast)){
        vfit = contrasts.fit(vfit, contrast) # make all pair-wise comparisons
        contr.names = colnames(contrast)    
        contr.num =  ncol(contrast)
        coefs = 1:ncol(contrast)
# between the groups
    }else{
        contr.names = names(contrast)    
        contr.num = length(contrast)
        coefs = contrast
    }
    ebayes.results = eBayes(vfit) # compute moderated t-statistics, moderated 
#F-statistic, and log-odds of differential expression by empirical 
#    Bayes moderation of the standard errors towards a common value  
    logFC = matrix(0, nrow(ebayes.results), contr.num)
    limma.tops = list()
    for (i in 1:length(coefs)){
        top.table = topTable(ebayes.results, coef=coefs[i], 
                number=Inf, sort.by="none")
        limma.fc = top.table$logFC      
        names(limma.fc) = rownames(ebayes.results)  
        logFC[, i] = limma.fc
        rownames(top.table) = rownames(ebayes.results)
        limma.tops[[contr.names[i]]] = top.table
        de.genes = top.table[top.table[, "adj.P.Val"] <= fdr.cutoff, ]
        if (nrow(de.genes) == 0)
            warning("It seems the contrast ", 
                    contr.names[i], 
                    " has no DE genes at the selected 'fdr.cutoff'.\n",
                    "The 'fdr.cutoff' was ignored in the calculations.")
        if (nrow(de.genes) > 0){
            de.genes = de.genes[abs(de.genes[, "logFC"]) >= logFC.cutoff, ]
            if (nrow(de.genes) == 0)
                warning("It seems the contrast ", 
                    contr.names[i], 
                    " has no DE genes at the selected 'logFC.cutoff'.\n",
                    "The 'logFC.cutoff' was ignored in the calculations.")
        }
    }
    
    rownames(logFC) = rownames(ebayes.results)
    colnames(logFC) = contr.names
    
    return(list(logFC=logFC, limma.results=ebayes.results, limma.tops=limma.tops))
}

# Adapted from Gordon Smyth https://support.bioconductor.org/p/9228/
makeContrastPairs <- function(
        design.cols,
        group.levels){
    n = length(group.levels)
    stopifnot(identical(group.levels, design.cols[1:n]))
    contr = matrix(0, length(design.cols), choose(n, 2))
    rownames(contr) = design.cols    
    colnames(contr) = 1:choose(n,2)
    k = 0
    for (i in 1:(n-1)){
        for (j in (i+1):n){
            k = k + 1
            contr[j, k] = 1
            contr[i, k] = -1
#            print(contr)
            colnames(contr)[k] = paste0(group.levels[j],
                            "vs", group.levels[i])          
        }
    }        
    return(contr)
}

Try the EGSEA package in your browser

Any scripts or data that you put into this service are public.

EGSEA documentation built on Jan. 30, 2021, 2:01 a.m.