Mitch: Multi-Contrast Gene Set Enrichment Analysis

Documented in gmt_import mitch_calc mitch_import mitch_plots mitch_report

#' mitch: An R package for multi-dimensional pathway enrichment analysis
#'
#' mitch is an R package for multi-dimensional enrichment analysis. At it's 
#' heart, it uses a rank-MANOVA based statistical approach to detect sets of 
#' genes that exhibit enrichment in the multidimensional space as compared to 
#' the background. mitch is useful for pathway analysis of profiling studies 
#' with two to or more contrasts, or in studies with multiple omics profiling, 
#' for example proteomic, transcriptomic, epigenomic analysis of the same 
#' samples. mitch is perfectly suited for pathway level differential analysis 
#' of scRNA-seq data.
#'
#' A typical mitch workflow consists of:
#' 1) Import gene sets with gmt_import()
#' 2) Import profiling data with mitch_import()
#' 3) Calculate enrichments with mitch_calc()
#' 4) And generate plots and reports with mitch_plots() and mitch_report()
#'
#' More documentation on the github page https://github.com/markziemann/mitch
#' or with ?<function>, eg: ?mitch_import
#'
#' @docType package
#' @name mitch
#' @examples
#' # Example workflow
#' # Import some gene sets
#' genesetsExample<-gmt_import(system.file('extdata/sample_genesets.gmt', 
#' package = 'mitch'))
#' # Load some edgeR tables (rna, k9a, k36a). 
#' data(rna,k9a,k36a)
#' # Create a list of differential profiles
#' myList<-list('rna'=rna,'k9a'=k9a,'k36a'=k36a)
#' # Import as edgeR table 
#' myImportedData<-mitch_import(myList,DEtype='edger')
#' # Calculate enrichment using MANOVA
#' resExample<-mitch_calc(myImportedData,genesetsExample,priority='effect',
#' resrows=5,cores=2)
#' # Generate some high res plots in PDF format
#' mitch_plots(resExample,outfile='outres.pdf')
#' #' Generate a report of the analysis in HTML format
#' mitch_report(resExample,'outres.html')
NULL

#' @import utils
utils::globalVariables(c("p.adjustMANOVA", "effect", "p.adjustANOVA", "contrast",
    "value", "..density..","dummy_x","dummy_y"))


mapGeneIds <- function(y, z) {
    if (!is.null(attributes(y)$geneTable)) {
        gt <- attributes(y)$geneTable
        col1 <- length(which(z$geneidentifiers %in% gt[, 1]))
        col2 <- length(which(z$geneidentifiers %in% gt[, 2]))
        
        if (col1 + col2 < (nrow(y)/2)) {
            stop("Error it looks as if the Gene IDs in the profile don't match
            the geneTable")
        }
        
        if (col1 > col2) {
            colnames(gt) <- c("geneidentifiers", "GeneSymbol")
            z <- merge(gt, z, by = "geneidentifiers")
            z$geneidentifiers <- NULL
        } else {
            colnames(gt) <- c("GeneSymbol", "geneidentifiers")
            z <- merge(gt, z, by = "geneidentifiers")
            z$geneidentifiers <- NULL
        }
        z <- aggregate(. ~ GeneSymbol, z, function(x) {
            mean(as.numeric(as.character(x)))
        })

    }
    colnames(z) <- c("geneidentifiers", "y")
    z
}


edger_score <- function(y , geneIDcol = geneIDcol) {
    
    NCOL <- ncol(y)
    if (NCOL < 2) {
        stop("Error: there are <2 columns in the input, 'PValue' and 'logFC' 
        are required ")
    }
    
    PCOL <- length(which(names(y) == "PValue"))
    if (PCOL > 1) {
        stop("Error, there is more than 1 column named 'PValue' in the input")
    }
    if (PCOL < 1) {
        stop("Error, there is no column named 'PValue' in the input")
    }
    
    FCCOL <- length(which(names(y) == "logFC"))
    if (FCCOL > 1) {
        stop("Error, there is more than 1 column named 'logFC' in the input")
    }
    if (FCCOL < 1) {
        stop("Error, there is no column named 'logFC' in the input")
    }
    
    s <- sign(y$logFC) * -log10(y$PValue)
    
    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


deseq2_score <- function(y , geneIDcol = geneIDcol ) {

    ZCOL <- length(which(names(y) == "stat"))
    if (ZCOL > 1) {
        stop("Error, there is more than 1 column named 'stat' in the input")
    }
    if (ZCOL < 1) {
        stop("Error, there is no column named 'stat' in the input")
    }

    s <- y$stat

    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


absseq_score <- function(y, geneIDcol = geneIDcol ) {
    
    NCOL <- ncol(y)
    if (NCOL < 2) {
        stop("Error: there are <2 columns in the input, 'pvalue' and 
        'foldChange' are required ")
    }
    
    PCOL <- length(which(names(y) == "pvalue"))
    if (PCOL > 1) {
        stop("Error, there is more than 1 column named 'pvalue' in the input")
    }
    if (PCOL < 1) {
        stop("Error, there is no column named 'pvalue' in the input")
    }
    
    FCCOL <- length(which(names(y) == "foldChange"))
    if (FCCOL > 1) {
        stop("Error, there is more than 1 column named 'foldChange' in
        the input")
    }
    if (FCCOL < 1) {
        stop("Error, there is no column named 'foldChange' in the input")
    }
    
    s <- sign(y$foldChange) * -log10(y$pvalue)
    
    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


sleuth_score <- function(y , geneIDcol = geneIDcol ) {
    
    NCOL <- ncol(y)
    if (NCOL < 2) {
        stop("Error: there are <2 columns in the input, 'pval' and 'b'
        are required ")
    }
    
    PCOL <- length(which(names(y) == "pval"))
    if (PCOL > 1) {
        stop("Error, there is more than 1 column named 'pval' in the input")
    }
    if (PCOL < 1) {
        stop("Error, there is no column named 'pval' in the input")
    }
    
    FCCOL <- length(which(names(y) == "b"))
    if (FCCOL > 1) {
        stop("Error, there is more than 1 column named 'b' in the input")
    }
    if (FCCOL < 1) {
        stop("Error, there is no column named 'b' in the input")
    }
    
    s <- sign(y$b) * -log10(y$pval)
    
    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


topconfect_score <- function(y , geneIDcol = geneIDcol ) {
    
    FCCOL <- length(which(names(y) == "confect"))
    if (FCCOL > 1) {
        stop("Error, there is more than 1 column named 'confect' in the input")
    }
    if (FCCOL < 1) {
        stop("Error, there is no column named 'confect' in the input")
    }
    
    # better to get the sign of fold change from the effect column
    FCCOL <- length(which(names(y) == "effect"))
    if (FCCOL > 1) {
        stop("Error, there is more than 1 column named 'effect' in the input")
    }
    if (FCCOL < 1) {
        stop("Error, there is no column named 'effect' in the input")
    }
    
    # there is a problem with topconfects having some NA values
    yy <- y[!is.na(y$effect), ]
    
    pos <- subset(yy, effect > 0)
    neg <- subset(yy, effect < 0)
    pos$mitchrank <- rev(seq(from = 1, to = nrow(pos)))
    neg$mitchrank <- rev(seq(from = -1, to = -nrow(neg)))
    yy <- rbind(pos, neg)
    s <- yy$mitchrank
    
    if (!is.null(attributes(y)$geneIDcol)) {
        g <- yy[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(yy)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


ballgown_score <- function(y , geneIDcol = geneIDcol ) {

    NCOL <- ncol(y)
    if (NCOL < 2) {
        stop("Error: there are <2 columns in the input, 'pval' and 'fc' are
        required ")
    }

    PCOL <- length(which(names(y) == "pval"))
    if (PCOL > 1) {
        stop("Error, there is more than 1 column named 'pval' in the input")
    }
    if (PCOL < 1) {
        stop("Error, there is no column named 'pval' in the input")
    }

    FCCOL <- length(which(names(y) == "fc"))
    if (FCCOL > 1) {
        stop("Error, there is more than 1 column named 'fc' in the input")
    }
    if (FCCOL < 1) {
        stop("Error, there is no column named 'fc' in the input")
    }

    s <- sign(log2(y$fc)) * -log10(y$pval)

    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


noiseq_score <- function(y , geneIDcol = geneIDcol ) {

    ZCOL <- length(which(names(y) == "ranking"))
    if (ZCOL > 1) {
        stop("Error, there is more than 1 column named 'ranking' in the input")
    }
    if (ZCOL < 1) {
        stop("Error, there is no column named 'ranking' in the input")
    }

    s <- y$ranking

    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


tcc_score <- function(y , geneIDcol = geneIDcol ) {

    NCOL <- ncol(y)
    if (NCOL < 2) {
        stop("Error: there are <2 columns in the input, 'p.value' and 'm.value'
        are required ")
    }

    PCOL <- length(which(names(y) == "p.value"))
    if (PCOL > 1) {
        stop("Error, there is more than 1 column named 'p.value' in the input")
    }
    if (PCOL < 1) {
        stop("Error, there is no column named 'p.value' in the input")
    }

    FCCOL <- length(which(names(y) == "m.value"))
    if (FCCOL > 1) {
        stop("Error, there is more than 1 column named 'm.value' in the input")
    }
    if (FCCOL < 1) {
        stop("Error, there is no column named 'm.value' in the input")
    }

    s <- sign(y$m.value) * -log10(y$p.value)

    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


deds_score <- function(y , geneIDcol = geneIDcol ) {

    ZCOL <- length(which(names(y) == "t"))
    if (ZCOL > 1) {
        stop("Error, there is more than 1 column named 't' in the input")
    }
    if (ZCOL < 1) {
        stop("Error, there is no column named 't' in the input")
    }

    s <- y$t

    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


cuffdiff_score <- function(y , geneIDcol = geneIDcol ) {

    ZCOL <- length(which(names(y) == "test_stat"))
    if (ZCOL > 1) {
        stop("Error, there is more than 1 column named 'test_stat' in the
        input")
    }
    if (ZCOL < 1) {
        stop("Error, there is no column named 'test_stat' in the input")
    }

    s <- y$test_stat

    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


seurat_score <- function(y , geneIDcol = geneIDcol ) {

    colnames(y) <- gsub("avg_log2FC","avg_logFC",colnames(y))
    NCOL <- ncol(y)
    if (NCOL < 2) {
        stop("Error: there are <2 columns in the input, 'p_val' and 'avg_logFC'
        are required ")
    }
    
    PCOL <- length(which(names(y) == "p_val"))
    if (PCOL > 1) {
        stop("Error, there is more than 1 column named 'p_val' in the input")
    }
    if (PCOL < 1) {
        stop("Error, there is no column named 'p_val' in the input")
    }
    
    FCCOL <- length(which(names(y) == "avg_logFC"))
    if (FCCOL > 1) {
        stop("Error, there is more than 1 column named 'avg_logFC' in the
        input")
    }
    if (FCCOL < 1) {
        stop("Error, there is no column named 'avg_logFC' in the input")
    }
    
    s <- sign(y$avg_logFC) * -log10(y$p_val)
    
    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    
    z$y[is.infinite(z$y) & z$y < 0] <- min(z$y[!is.infinite(z$y)]) - 0.01
    z$y[is.infinite(z$y) & z$y > 0] <- max(z$y[!is.infinite(z$y)]) + 0.01
    z
}


muscat_score <- function(y , geneIDcol = geneIDcol ) {
    
    NCOL <- ncol(y)
    if (NCOL < 2) {
        stop("Error: there are <2 columns in the input, 'p_val' and 'logFC'
        are required ")
    }
    
    PCOL <- length(which(names(y) == "p_val"))
    if (PCOL > 1) {
        stop("Error, there is more than 1 column named 'p_val' in the input")
    }
    if (PCOL < 1) {
        stop("Error, there is no column named 'p_val' in the input")
    }
    
    FCCOL <- length(which(names(y) == "logFC"))
    if (FCCOL > 1) {
        stop("Error, there is more than 1 column named 'logFC' in the input")
    }
    if (FCCOL < 1) {
        stop("Error, there is no column named 'logFC' in the input")
    }
    
    s <- sign(y$logFC) * -log10(y$p_val)
    
    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}

scde_score <- function(y , geneIDcol = geneIDcol ) {

    ZCOL <- length(which(names(y) == "Z"))
    if (ZCOL > 1) {
        stop("Error, there is more than 1 column named 'Z' in the input")
    }
    if (ZCOL < 1) {
        stop("Error, there is no column named 'Z' in the input")
    }

    s <- y$Z

    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


mast_score <- function(y , geneIDcol = geneIDcol ) {

    NCOL <- ncol(y)
    if (NCOL < 2) {
        stop("Error: there are <2 columns in the input, 'Pr(>Chisq)' and 'coef'
        are required ")
    }

    PCOL <- length(which(names(y) == "Pr(>Chisq)"))
    if (PCOL > 1) {
        stop("Error, there is more than 1 column named 'Pr(>Chisq)' in the
        input")
    }
    if (PCOL < 1) {
        stop("Error, there is no column named 'Pr(>Chisq)' in the input")
    }

    FCCOL <- length(which(names(y) == "coef"))
    if (FCCOL > 1) {
        stop("Error, there is more than 1 column named 'coef' in the input")
    }
    if (FCCOL < 1) {
        stop("Error, there is no column named 'coef' in the input")
    }

    # because data.table object  doesn't seem to work
    y<-as.data.frame(y)

    s <- sign(y$coef) * -log10(y[,"Pr(>Chisq)"])

    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


desingle_score <- function(y , geneIDcol = geneIDcol ) {

    NCOL <- ncol(y)
    if (NCOL < 2) {
        stop("Error: there are <2 columns in the input, 'pvalue' and 
        'foldChange' are required ")
    }

    PCOL <- length(which(names(y) == "pvalue"))
    if (PCOL > 1) {
        stop("Error, there is more than 1 column named 'pvalue' in the input")
    }
    if (PCOL < 1) {
        stop("Error, there is no column named 'pvalue' in the input")
    }

    FCCOL <- length(which(names(y) == "foldChange"))
    if (FCCOL > 1) {
        stop("Error, there is more than 1 column named 'foldChange' in the
        input")
    }
    if (FCCOL < 1) {
        stop("Error, there is no column named 'foldChange' in the input")
    }

    s <- sign(log2(y$foldChange)) * -log10(y$pvalue)

    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


dmrcate_score <- function(y , geneIDcol = geneIDcol ) {

    NCOL <- ncol(y)
    if (NCOL < 2) {
        stop("Error: there are <2 columns in the input, 'Stouffer' and 
        'meanbetafc' are required ")
    }

    PCOL <- length(which(names(y) == "Stouffer"))
    if (PCOL > 1) {
        stop("Error, there is more than 1 column named 'Stouffer' in the
        input")
    }
    if (PCOL < 1) {
        stop("Error, there is no column named 'Stouffer' in the input")
    }

    FCCOL <- length(which(names(y) == "meanbetafc"))
    if (FCCOL > 1) {
        stop("Error, there is more than 1 column named 'meanbetafc' in the
        input")
    }
    if (FCCOL < 1) {
        stop("Error, there is no column named 'meanbetafc' in the input")
    }

    y<-as.data.frame(y)
    s <- sign(y$meanbetafc) * -log10(y$Stouffer)

    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


dep_score <- function(y , geneIDcol = geneIDcol ) {

    NCOL <- ncol(y)
    if (NCOL < 2) {
        stop("Error: there are <2 columns in the input, '*p.val' and '*ratio' 
        are required ")
    }

    PCOL <- length(grep("p.val",names(y)))
    if (PCOL > 1) {
        message("Note, using the leftmost column with '*p.val' in the name")
    }
    if (PCOL < 1) {
        stop("Error, there is no column named '*p.val' in the input")
    }

    FCCOL <- length(grep("ratio",names(y)))
    if (FCCOL > 1) {
        message("Note, using the leftmost column with 'ratio' in the name")
    }
    if (FCCOL < 1) {
        stop("Error, there is no column named 'ratio' in the input")
    }

    PCOL <- grep("p.val",names(y))[1]

    FCCOL <- grep("ratio",names(y))[1]

    s <- sign(y[,FCCOL]) * -log10(y[,PCOL])

    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


msmstests_score <- function(y , geneIDcol = geneIDcol ) {

    NCOL <- ncol(y)
    if (NCOL < 2) {
        stop("Error: there are <2 columns in the input, 'p.value' and 'LogFC'
        are required ")
    }

    PCOL <- length(which(names(y) == "p.value"))
    if (PCOL > 1) {
        stop("Error, there is more than 1 column named 'p.value' in the input")
    }
    if (PCOL < 1) {
        stop("Error, there is no column named 'p.value' in the input")
    }

    FCCOL <- length(which(names(y) == "LogFC"))
    if (FCCOL > 1) {
        stop("Error, there is more than 1 column named 'LogFC' in the input")
    }
    if (FCCOL < 1) {
        stop("Error, there is no column named 'LogFC' in the input")
    }

    s <- sign(y$LogFC) * -log10(y$p.value)

    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


plgem_score <- function(y , geneIDcol = geneIDcol ) {

    LEN <- length(y)
    if (LEN < 2) {
        stop("Error: there are <2 items in the input list, '$p.value' and
        '$PLGEM.STN' are required")
    }

    PCOL <- length(which(names(y) == "p.value"))
    if (PCOL > 1) {
        stop("Error, there is more than 1 item in the list named 'p.value'")
    }
    if (PCOL < 1) {
        stop("Error, there is no item in the list named 'p.value'")
    }

    FCCOL <- length(which(names(y) == "PLGEM.STN"))
    if (FCCOL > 1) {
        stop("Error, there is more than 1 column named 'PLGEM.STN' in the
        input")
    }
    if (FCCOL < 1) {
        stop("Error, there is no column named 'PLGEM.STN' in the input")
    }

    s <- sign(y$PLGEM.STN) * -log10(y$p.value)
    g <- rownames(y$PLGEM.STN)
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


sdams_score <- function(y , geneIDcol = geneIDcol ) {

    LEN <- length(y)
    if (LEN < 2) {
        stop("Error: there are <2 items in the input list, '$pv_2part' and
        '$beta' are required")
    }

    PCOL <- length(which(names(y) == "pv_2part"))
    if (PCOL > 1) {
        stop("Error, there is more than 1 item in the list named 'pv_2part'")
    }
    if (PCOL < 1) {
        stop("Error, there is no item in the list named 'pv_2part'")
    }

    FCCOL <- length(which(names(y) == "beta"))
    if (FCCOL > 1) {
        stop("Error, there is more than 1 column named 'beta' in the input")
    }
    if (FCCOL < 1) {
        stop("Error, there is no column named 'beta' in the input")
    }

    s <- sign(y$beta) * -log10(y$pv_2part)
    g <- y$feat.names
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


diffbind_score <- function(y , geneIDcol = geneIDcol ) {

    NCOL <- ncol(y)
    if (NCOL < 2) {
        stop("Error: there are <2 columns in the input, 'p.value' and 'Fold'
        are required ")
    }

    PCOL <- length(which(names(y) == "p.value"))
    if (PCOL > 1) {
        stop("Error, there is more than 1 column named 'p.value' in the input")
    }
    if (PCOL < 1) {
        stop("Error, there is no column named 'p.value' in the input")
    }

    FCCOL <- length(which(names(y) == "Fold"))
    if (FCCOL > 1) {
        stop("Error, there is more than 1 column named 'Fold' in the input")
    }
    if (FCCOL < 1) {
        stop("Error, there is no column named 'Fold' in the input")
    }

    y <- as.data.frame(y)
    s <- sign(y$Fold) * -log10(y$p.value)

    if (!is.null(attributes(y)$geneIDcol)) {
        g <- y[, attributes(y)$geneIDcol]
    } else {
        g <- rownames(y)
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    z <- mapGeneIds(y, z)
    z
}


preranked_score <- function(y, joinType , geneIDcol ) {

    if (!is.null(attributes(y)$geneIDcol)) {
        NCOL <- ncol(y)
        if (NCOL > 2) {
            stop("Error: there are >2 columns in the input. Your files need to 
            have only the gene ID and rank stat ")
        }
        g <- y[, attributes(y)$geneIDcol]
        s <- y[ , !(names(y) %in% geneIDcol)]
    } else {
        NCOL <- ncol(y)
        if (NCOL > 1) {
            stop("Error: there are >1 columns in the input. Should only contain
            the gene ID and rank stat ")
        }
        g <- rownames(y)
        s <- y[,1]
    }
    z <- data.frame(g, s, stringsAsFactors = FALSE)
    colnames(z) <- c("geneidentifiers", "y")
    if ( is.null(joinType) ) {
        joinType <- "inner" 
    }
    if ( joinType == "inner" ) {
        z <- na.omit(z)
    }
    z <- mapGeneIds(y, z)
    z
}


#' mitch_import
#'
#' This function imports differential omics data from common differential tools
#' like edgeR, limma and DESeq2. It calculates a summarised differential
#' expression metric by multiplying the sign of the log fold change by the 
#' -log10 of the p-value. If this behaviour is not desired, mitch_import can be
#' bypassed in favour of another scoring metric.
#' @param x a list of differential expression tables.
#' @param DEtype the program that generated the differential expression table
#' Valid options are 'edgeR', 'DESeq2', 'limma', 'ABSSeq', 'Sleuth', 'Seurat',
#' 'topConfects', 'muscat', 'Swish', 'scDE', 'MAST', 'DEsingle', 'ballgown',
#' 'NOIseq', 'TCC', 'DEDS', 'cuffdiff', 'fishpond', 'missMethyl', 'DMRcate',
#' 'DEP', 'msmsTests', 'plgem', 'SDAMS', 'DEqMS', 'DiffBind' and 'prescored'. 
#' Where 'prescored' is a dataframe containing the test statistic and gene ID 
#' (either in rowname or separate column) and nothing else. 'preranked' is an 
#' alias for 'prescored'.
#' @param geneIDcol the column containing gene names. If gene names are 
#' specified as row names, then geneIDcol=NULL.
#' @param joinType the type of join to perform, either 'inner' or 'full'.
#' By default, joins are 'inner' except for Seurat and muscat where full is 
#' used.
#' @param geneTable a 2 column table mapping gene identifiers in the profile to
#' gene identifiers in the gene sets. 
#' @return a multi-column table compatible with mitch_calc analysis.
#' @keywords import mitch
#' @export
#' @examples
#' # first step is to create a list of differential profiles
#' data(rna,k9a,k36a)
#' x<-list('rna'=rna,'k9a'=k9a,'k36a'=k36a)
#' # import as edgeR table 
#' imported<-mitch_import(x,DEtype='edger')
#' @importFrom plyr join_all
mitch_import <- function(x, DEtype, geneIDcol = NULL, geneTable = NULL, 
    joinType = NULL) {
    
    if (is.data.frame(x)) {
        message("The input is a single dataframe; one contrast only. Converting
        it to a list for you.")
        NAME <- deparse(substitute(x))
        x <- list(x = x)
    }
    
    if (!is.list(x)) {
        stop("Error: Input (x) must be a LIST of dataframes.")
    }
    
    if (is.null(names(x))) {
        stop("Error: Input (x) must be a NAMED list of dataframes.")
    }

    if ( length(x)>69 ) {
        stop("Error: mitch is currently limited to 69 dimensions or fewer.")
    }
    
    if (!is.null(geneTable) && !is.data.frame(geneTable)) {
        stop("Error: the geneTable needs to be a dataframe.")
    }
    
    if (!is.null(geneTable) && (ncol(geneTable) < 2 || ncol(geneTable) > 2)) {
        stop("Error: the geneTable needs to be a dataframe of 2 columns.")
    }
    
    # the geneIDcol should be an attribute added to each list item
    for (i in seq_len(length(x))) {
        if (!is.null(geneIDcol)) {
            LEN <- length(which(names(x[[i]]) %in% geneIDcol))
            if (LEN < 1) {
                stop("Error: the specified geneIDcol doesn't seem to exist")
            }
            if (LEN > 1) {
                stop("Error: there are multiple matches for the specified
                geneIDcol")
            }
            attributes(x[[i]])$geneIDcol <- which(names(x[[i]]) %in% geneIDcol)
        } else {
            attributes(x[[i]])$geneIDcol <- NULL
        }
        if (!is.null(geneTable)) {
            if (!is.data.frame(geneTable)) {
                stop("Error: geneTable is not a data frame.")
            }
            if (ncol(geneTable) != 2) {
                stop("Error: geneTable must be a 2 column dataframe.")
            }
            attributes(x[[i]])$geneTable <- geneTable
        }
    }
    
    DEtype <- tolower(DEtype)
    
    validDEtype <- c("edger", "deseq2", "limma", "absseq", "sleuth", "seurat",
        "topconfects", "muscat", "swish", "scde", "mast", "desingle",
        "ballgown", "noiseq", "tcc", "deds", "cuffdiff", "preranked", 
        "prescored","fishpond", "missmethyl", "dmrcate", "dep", "msmstests",
        "plgem", "sdams", "deqms", "diffbind")

    if (DEtype == "edger") {
        xx <- lapply(x, edger_score)
    } else if (DEtype == "deseq2" || DEtype == "swish" || 
    DEtype == "fishpond" ) {
        xx <- lapply(x, deseq2_score)
    } else if (DEtype == "absseq") {
        xx <- lapply(x, absseq_score)
    } else if (DEtype == "sleuth") {
        xx <- lapply(x, sleuth_score)
    } else if (DEtype == "seurat") {
        xx <- lapply(x, seurat_score)
    } else if (DEtype == "topconfects") {
        xx <- lapply(x, topconfect_score)
    } else if (DEtype == "muscat") {
        xx <- lapply(x, muscat_score)
    } else if (DEtype == "scde") {
        xx <- lapply(x, scde_score)
    } else if (DEtype == "mast" ) {
        xx <- lapply(x, mast_score)
    } else if (DEtype == "desingle" ) {
        xx <- lapply(x, desingle_score)
    } else if (DEtype == "ballgown" ) {
        xx <- lapply(x, ballgown_score)
    } else if (DEtype == "noiseq" ) {
        xx <- lapply(x, noiseq_score)
    } else if (DEtype == "tcc" ) {
        xx <- lapply(x, tcc_score)
    } else if (DEtype == "deds" || DEtype == "missmethyl" || 
    DEtype == "limma" || DEtype == "deqms" ) {
        xx <- lapply(x, deds_score)
    } else if (DEtype == "cuffdiff" ) {
        xx <- lapply(x, cuffdiff_score)
    } else if (DEtype == "dmrcate" ) {
        xx <- lapply(x, dmrcate_score)
    } else if (DEtype == "dep" ) {
        xx <- lapply(x, dep_score)
    } else if (DEtype == "msmstests" ) {
        xx <- lapply(x, msmstests_score)
    } else if (DEtype == "plgem" ) {
        xx <- lapply(x, plgem_score)
    } else if (DEtype == "sdams" ) {
        xx <- lapply(x, sdams_score)
    } else if (DEtype == "diffbind" ) {
        xx <- lapply(x, diffbind_score)
    } else if (DEtype == "preranked" || DEtype == "prescored") {
        xx <- lapply(x, preranked_score, joinType = joinType, geneIDcol = geneIDcol)
    } else {
        stop(paste("Specified DEtype does not match one of the following:",
        validDEtype))
    }
    
    # give the colums a unique name otherwise join_all will fail
    for (i in seq_len(length(xx))) {
        colnames(xx[[i]]) <- c("geneidentifiers", paste("y", i, sep = ""))
    }
    
    if (is.null(joinType)) {
        if (DEtype == "seurat" || DEtype == "muscat" || DEtype == "scde" ||
        DEtype == "mast" || DEtype == "desingle" ) {
            xxx <- join_all(xx, by = "geneidentifiers", type = "full")
        } else {
            xxx <- join_all(xx, by = "geneidentifiers", type = "inner")
        }
    } else {
        xxx <- join_all(xx, by = "geneidentifiers", type = joinType)
    }
    rownames(xxx) <- xxx$geneidentifiers
    xxx$geneidentifiers <- NULL
    colnames(xxx) <- names(x)
    
    STARTSWITHNUM <- length(grep("^[0-9]", colnames(xxx)))
    if (STARTSWITHNUM > 0) {
        stop("Error: it looks like one or more column names starts with a
        number. This is incompatible with downstream analysis. Please modify")
    }
    
    MEAN_N_GENES_IN <- mean(unlist(lapply(x, nrow)))
    N_GENES_OUT <- nrow(xxx)
    PROP <- signif(N_GENES_OUT/MEAN_N_GENES_IN, 3)
    message(paste("Note: Mean no. genes in input =", MEAN_N_GENES_IN))
    message(paste("Note: no. genes in output =", N_GENES_OUT))
    if (PROP < 0.05) {
        warning("Warning: less than half of the input genes are also in the
        output")
    } else {
        message(paste("Note: estimated proportion of input genes in output =",
        PROP))
    }
    return(xxx)
}

#' gmt_import
#'
#' This function imports GMT files into a list of character vectors for mitch 
#' analysis. GMT files are a commonly used
#' format for lists of genes used in pathway enrichment analysis. GMT files can
#' be obtained from Reactome, MSigDB, etc.
#' @param gmtfile a gmt file.
#' @return a list of gene sets.
#' @keywords import genesets
#' @export
#' @examples
#' # Import some gene sets
#' genesetsExample<-gmt_import(system.file('extdata/sample_genesets.gmt', 
#' package = 'mitch'))
#' @import utils
gmt_import <- function(gmtfile) {
    genesetLines <- strsplit(readLines(gmtfile), "\t")
    genesets <- lapply(genesetLines, utils::tail, -2)
    names(genesets) <- unlist(lapply(genesetLines, head, 1))
    attributes(genesets)$originfile <- gmtfile
    genesets <- genesets[lapply(genesets,length)>0]
    if( any(duplicated(names(genesets))) ) {
        warning("Duplicated gene sets names detected")
    }
    genesets
}

MANOVA <- function(x, genesets, minsetsize = 10, cores = detectCores() - 1,
    priority = NULL) {
    
    STARTSWITHNUM <- length(grep("^[0-9]", colnames(x)))
    if (STARTSWITHNUM > 0) {
        stop("Error: it looks like one or more column names starts with a
        number. This is incompatible with downstream analysis. Please modify")
    }
    
    sets <- names(genesets)
    
    if (is.null(priority)) {
        priority <- "significance"
    }
    
    if (priority !="significance" && priority !="effect" && priority !="SD"){
        stop("Error: Parameter 'priority' must be either 'significance'(the 
        default), 'effect' or 'SD'.")
    }
    
    hypotenuse <- function(x) {
        sqrt(sum(unlist(lapply(x, function(x) { x^2 })),na.rm = TRUE ))
    }
    
    # calculate the hypotenuse for downstream use
    HYPOT <- hypotenuse(apply(x, 2, length))
    
    res <- mclapply(sets, function(set) {
        
        inset <- rownames(x) %in% as.character(unlist(genesets[set]))
        
        NROW <- nrow(x)
        
        if (length(which(inset)) >= minsetsize) {
            fit <- manova(x ~ inset)
            sumMANOVA <- summary.manova(fit)
            sumAOV <- summary.aov(fit)
            pMANOVA <- sumMANOVA$stats[1, "Pr(>F)"]
            raov <- lapply(sumAOV, function(zz) {
                zz[1, "Pr(>F)"]
            })
            raov <- unlist(raov)
            names(raov) <- gsub("^ Response ", "p.", names(raov))
            # S coordinates
            NOTINSET <- colMeans(x[!inset, ], na.rm=TRUE)
            scord <- (2 * (colMeans(x[inset, ], na.rm=TRUE) - NOTINSET))/NROW
            names(scord) <- paste0("s-", names(scord))
            # calculate the hypotenuse length of s scores
            s.dist <- hypotenuse(scord)
            names(s.dist) <- "s.dist"
            mysd <- sd(scord)
            names(mysd) <- "SD"
            
            return(data.frame(set, setSize = sum(inset), pMANOVA, t(scord), 
                t(raov),t(s.dist), t(mysd), stringsAsFactors = FALSE))
        }
    }, mc.cores = cores)
    
    fres <- ldply(res, data.frame)
    
    if (nrow(fres) < 1) {
        
        message("Warning: No results found. Check that the gene names in the
            profile match the gene sets and consider loosening the minsetsize
            parameter.")
        
    } else {
        fres$p.adjustMANOVA <- p.adjust(fres$pMANOVA, "fdr")
        
        # prioritisation
        if (priority == "significance") {
            fres <- fres[order(fres$pMANOVA), ]
            message("Note: When prioritising by significance (ie: small 
            p-values), large effect sizes might be missed.")
        }
        if (priority == "effect") {
            fres <- fres[order(-fres$s.dist), ]
            message("Note: Enrichments with large effect sizes may not be 
            statistically significant.")
        }
        if (priority == "SD") {
            fres <- fres[order(-fres$SD), ]
            fres <- subset(fres, p.adjustMANOVA <= 0.05)
            message("Note: Prioritisation by SD after selecting sets with 
            p.adjustMANOVA<=0.05.")
        }
        attributes(fres)$priority <- priority
        return(fres)
    }
}


ANOVA <- function(x, genesets, minsetsize = 10, cores = detectCores() - 1, 
priority = NULL) {
    
    STARTSWITHNUM <- length(grep("^[0-9]", colnames(x)))
    if (STARTSWITHNUM > 0) {
        stop("Error: it looks like one or more column names starts with a
        number. This is incompatible with downstream analysis. Please modify")
    }
    
    sets <- names(genesets)
    
    if (is.null(priority)) {
        priority <- "significance"
    }
    
    if (priority != "significance" && priority != "effect") {
        stop("Error: Parameter 'priority' must be either 'significance'
        (default) or 'effect'.")
    }

    x <- x[which(!is.na(x)),,drop=FALSE]
    
    res <- mclapply(sets, function(set) {
        resample <- function(x, set) {
            sss <- x[which(rownames(x) %in% 
                as.character(unlist(genesets[set]))),]
            mysample <- sample(sss, length(sss), replace = TRUE)
            mean(mysample)
        }
        
        inset <- rownames(x) %in% as.character(unlist(genesets[set]))
        
        NROW <- nrow(x)
        
        if (length(which(inset)) >= minsetsize) {
            fit <- aov(x[, 1] ~ inset)
            pANOVA <- summary(fit)[[1]][, 5][1]
            NOTINSET <- mean(x[!inset, ])
            s.dist <- (2 * (mean(x[inset, ]) - NOTINSET))/NROW
            gres <- data.frame(set, setSize = sum(inset), pANOVA, s.dist,
            stringsAsFactors = FALSE)
            gres
        }
    }, mc.cores = cores)
    
    fres <- ldply(res, data.frame)
    
    if (nrow(fres) < 1) {
        
        message("Warning: No results found. Check that the gene names in the
        profile match the gene sets and consider loosening the minsetsize
        parameter.")
        
    } else {
        fres$p.adjustANOVA <- p.adjust(fres$pANOVA, "fdr")
        
        # prioritisation
        if (priority == "significance") {
            fres <- fres[order(fres$pANOVA), ]
            message("Note: When prioritising by significance (ie: small
            p-values), large effect sizes might be missed.")
        }
        if (priority == "effect") {
            fres <- fres[order(-abs(fres$s.dist)), ]
            message("Note: Enrichments with large effect sizes may not be
            statistically significant.")
        }
        attributes(fres)$priority <- priority
        return(fres)
    }
}

mitch_metrics_calc<-function(x, genesets, enrichment_result, minsetsize = 10) {
    
    if (!is.null(enrichment_result)) {
        
        num_genesets <- length(genesets)
        included_genesets <- nrow(enrichment_result)
        geneset_counts <- as.data.frame(as.vector(unlist(lapply(genesets, 
        function(set) {
            length(which(as.vector(unlist(set)) %in% rownames(x)))
        }))))
        rownames(geneset_counts) <- names(genesets)
        colnames(geneset_counts) <- "count"
        genesets_excluded <-
            names(genesets)[which(geneset_counts$count<minsetsize)]
        genesets_included <-
            names(genesets)[which(geneset_counts$count>=minsetsize)]
        num_genesets_excluded <- length(genesets_excluded)
        num_genesets_included <- length(genesets_included)
        num_genes_in_genesets <- length(unique(as.vector(unlist(genesets))))
        num_genes_in_profile <- length(unique(rownames(x)))
        duplicated_genes_present <- length(rownames(x)) > num_genes_in_profile
        num_profile_genes_in_sets <- length(which(rownames(x) %in% 
        as.vector(unlist(genesets))))
        num_profile_genes_not_in_sets <- num_genes_in_profile - 
        num_profile_genes_in_sets
        num_sets_significant <- 
            nrow(enrichment_result[which(enrichment_result$p.adjustMANOVA < 
            0.05), ])
        profile_pearson_correl <- cor(x, method = "p")[2, 1]
        profile_spearman_correl <- cor(x, method = "s")[2, 1]
        
        # genes in each quadrant
        g1 <- length(which(x[, 1] > 0 & x[, 2] > 0))
        g2 <- length(which(x[, 1] > 0 & x[, 2] < 0))
        g3 <- length(which(x[, 1] < 0 & x[, 2] < 0))
        g2 <- length(which(x[, 1] < 0 & x[, 2] > 0))
        
        # genesets in each quadrant
        ns1 <- nrow(subset(enrichment_result, p.adjustMANOVA < 0.05 &
            enrichment_result[,4] > 0 & enrichment_result[, 5] > 0))
        ns2 <- nrow(subset(enrichment_result, p.adjustMANOVA < 0.05 & 
            enrichment_result[,4] > 0 & enrichment_result[, 5] < 0))
        ns3 <- nrow(subset(enrichment_result, p.adjustMANOVA < 0.05 &
            enrichment_result[,4] < 0 & enrichment_result[, 5] < 0))
        ns4 <- nrow(subset(enrichment_result, p.adjustMANOVA < 0.05 &
            enrichment_result[,4] < 0 & enrichment_result[, 5] > 0))
        num_sets_significant_by_quadrant <- paste(ns1, ns2, ns3, ns4, sep = ",")
        
        dat <- list(num_genesets = num_genesets, 
            num_genes_in_profile = num_genes_in_profile, 
            duplicated_genes_present = duplicated_genes_present, 
            num_profile_genes_in_sets = num_profile_genes_in_sets, 
            num_profile_genes_not_in_sets = num_profile_genes_not_in_sets,
            num_genesets_excluded = num_genesets_excluded, 
            num_genesets_included = num_genesets_included, 
            num_genes_in_genesets = num_genes_in_genesets, 
            genesets_excluded = genesets_excluded,
            genesets_included = genesets_included, 
            profile_pearson_correl = profile_pearson_correl,
            profile_spearman_correl = profile_spearman_correl, 
            num_sets_significant = num_sets_significant,
            num_sets_significant_by_quadrant=num_sets_significant_by_quadrant,
            geneset_counts = geneset_counts)
        dat
    }
}

mitch_metrics_calc1d <- function(x, genesets, anova_result, minsetsize = 10) {
    
    if (!is.null(anova_result)) {
        
        num_genesets <- length(genesets)
        included_genesets <- nrow(anova_result)
        geneset_counts <- as.data.frame(as.vector(unlist(lapply(genesets,
            function(set) {
            length(which(as.vector(unlist(set)) %in% rownames(x)))
        }))))
        rownames(geneset_counts) <- names(genesets)
        colnames(geneset_counts) <- "count"
        genesets_excluded <-
            names(genesets)[which(geneset_counts$count<minsetsize)]
        genesets_included <-
            names(genesets)[which(geneset_counts$count >= minsetsize)]
        num_genesets_excluded <- length(genesets_excluded)
        num_genesets_included <- length(genesets_included)
        num_genes_in_genesets <- length(unique(as.vector(unlist(genesets))))
        num_genes_in_profile <- length(unique(rownames(x)))
        duplicated_genes_present <- length(rownames(x)) > num_genes_in_profile
        num_profile_genes_in_sets <- length(which(rownames(x) %in% 
            as.vector(unlist(genesets))))
        num_profile_genes_not_in_sets <- num_genes_in_profile-num_profile_genes_in_sets
        num_sets_significant <-
            nrow(anova_result[which(anova_result$p.adjustANOVA<0.05),])
        
        # genes up and down
        g1 <- length(which(x[, 1] > 0))
        g2 <- length(which(x[, 1] < 0))
        
        # genesets in each quadrant
        num_sets_up <- nrow(subset(anova_result, p.adjustANOVA < 0.05 &
            anova_result[,4] > 0))
        num_sets_dn <- nrow(subset(anova_result, p.adjustANOVA < 0.05 &
            anova_result[,4] < 0))
        
        dat <- list(num_genesets = num_genesets,
            num_genes_in_profile = num_genes_in_profile, 
            duplicated_genes_present = duplicated_genes_present,
            num_profile_genes_in_sets = num_profile_genes_in_sets, 
            num_profile_genes_not_in_sets = num_profile_genes_not_in_sets,
            num_genesets_excluded = num_genesets_excluded, 
            num_genesets_included = num_genesets_included,
            num_genes_in_genesets = num_genes_in_genesets, 
            genesets_excluded = genesets_excluded,
            genesets_included = genesets_included, 
            num_sets_significant = num_sets_significant,
            num_sets_up = num_sets_up, 
            num_sets_dn = num_sets_dn,
            geneset_counts = geneset_counts)
        dat
    }
}


mitch_rank <- function(x) {
    
    for (i in seq_len(ncol(x))) {
        LEN <- length(x[, i])
        UNIQLEN <- length(unique(x[, i]))
        if (UNIQLEN/LEN < 0.4) {
            warning("Warning: >60% of genes have the same score. This isn't
            optimal for rank based enrichment analysis.")
        }
    }
    
    rank_adj <- function(x) {
        xx <- rank(x,na.last = "keep")
        num_neg <- length(which(x < 0))
        num_zero <- length(which(x == 0))
        num_adj <- num_neg + (num_zero/2)
        adj <- xx - num_adj
        adj
    }
    adj <- apply(x, 2, rank_adj)
    adj
}


detailed_sets <- function(res, resrows = 50) {
    # collect ranked genelist of each genest
    genesets <- res$input_genesets
    ss <- res$ranked_profile
    mykeys <- as.character(res$enrichment_result[seq_len(resrows), 1])
    dat <- vector(mode = "list", length = resrows)
    names(dat) <- mykeys
    
    for (i in seq_len(resrows)) {
        sss <- ss[which(rownames(ss) %in% genesets[[which(names(genesets) %in%
        as.character(res$enrichment_result[i, 
            1]))]]), ]
        dat[[i]] <- sss
    }
    dat
}


get_os <- function(){
    sysinf <- Sys.info()
    if (!is.null(sysinf)){
        os <- sysinf['sysname']
        if (os == 'Darwin')
            os <- "osx"
    } else { ## mystery machine
        os <- .Platform$OS.type
        if (grepl("^darwin", R.version$os))
            os <- "osx"
        if (grepl("linux-gnu", R.version$os))
            os <- "linux"
    }
    tolower(os)
}


#' mitch_calc
#'
#' This function performs multivariate gene set enrichment analysis. 
#' @param x a multicolumn numerical table with each column containing 
#' differential expression scores for a contrast.
#' Rownames must match genesets.
#' @param genesets lists of genes imported by the gmt_imprt function or
#' similar.
#' @param minsetsize the minimum number of genes required in a set for it to be
#' included in the statistical analysis.
#' Default is 10.
#' @param cores the number of parallel threads for computation. Defaults to 1.
#' @param resrows an integer value representing the number of top genesets for
#' which a detailed report is to be 
#' generated. Default is 50.
#' @param priority the prioritisation metric used to selecting top gene sets. 
#' Valid options are 'significance', 
#' 'effect' and 'SD'. 
#' @return mitch res object with the following parts:
#' $input_profile: the supplied input differential profile
#' $input_genesets: the supplied input gene sets
#' $ranked_profile: the differential profile after ranking
#' $enrichment_result: the table of MANOVA/ANOVA enrichment results for each
#' gene set
#' $analysis_metrics:  several metrics that are important to the interpretation
#' of the results
#' $detailed_sets: a list of dataframes containing ranks of members of
#' prioritised gene sets.
#' @keywords mitch calc calculate manova 
#' @import parallel
#' @import stats
#' @importFrom plyr ldply
#' @export
#' @examples
#' # Example using mitch to calculate multivariate enrichments and
#' # prioritise based on effect size 
#' data(myImportedData,genesetsExample)
#' resExample<-mitch_calc(myImportedData,genesetsExample,priority='effect',
#' minsetsize=5,cores=2)
mitch_calc <- function(x, genesets, minsetsize = 10, cores = 1,
    resrows = 50, priority = NULL) {
    
    colnames(x) <- gsub("[[:punct:]]", "_", colnames(x))
    colnames(x) <- substr(colnames(x), 1, 14)
    if ( any(duplicated(colnames(x)))) { stop("Duplicate column names.") }
    input_profile <- x
    input_genesets <- genesets
    ranked_profile <- mitch_rank(input_profile)
    if (get_os() == "windows") { cores <- 1 }
    
    if ( ncol(x)>69 ) {
        stop("Error: mitch is currently limited to 69 dimensions or fewer.")
    }

    if (ncol(x) > 1) {

    corx <- cor(x,use="pairwise.complete.obs")
    diag(corx) <- 0

        if ( max(corx) == 1 ) {
            stop("Error: It looks like two or more contrasts are identical. Aborting.")
        }

        enrichment_result <- MANOVA(ranked_profile, genesets, 
            minsetsize <- minsetsize, cores = cores, priority = priority)
        
        if (!is.null(enrichment_result)) {
            mitch_metrics <- mitch_metrics_calc(x, genesets, enrichment_result)
            dat <- list(input_profile = input_profile,
                input_genesets = input_genesets,
                ranked_profile = ranked_profile,
                enrichment_result = enrichment_result, 
                analysis_metrics = mitch_metrics)
            
            if (nrow(enrichment_result) < resrows) {
                resrows <- nrow(enrichment_result)
            }
            dat$detailed_sets <- detailed_sets(dat, resrows)
            attr(dat, "profile_dimensions") <- colnames(dat$input_profile)
            dat
        }
    } else if (ncol(x) == 1) {
        
        enrichment_result <- ANOVA(ranked_profile, genesets,
            minsetsize <- minsetsize, cores = cores, priority = priority)
        
        if (!is.null(enrichment_result)) {
            mitch_metrics <- mitch_metrics_calc1d(x, genesets,
            enrichment_result)
            dat <- list(input_profile = input_profile,
                input_genesets = input_genesets, 
                ranked_profile = ranked_profile,
                enrichment_result = enrichment_result, 
                analysis_metrics = mitch_metrics)
            
            if (nrow(enrichment_result) < resrows) {
                resrows <- nrow(enrichment_result)
            }
            dat$detailed_sets <- detailed_sets(dat, resrows)
            attr(dat, "profile_dimensions") <- colnames(dat$input_profile)
            dat
        }
    }
}

plot1d_profile_dist <- function(res) {
    par(mfrow = c(2, 1))
    hist(res$input_profile[, 1], breaks = 50,
        main = "Distribution of DE scores", xlab = paste("DE score for ",
        colnames(res$input_profile)))
    plot(res$input_profile, xlab = paste("DE score for ",
        colnames(res$input_profile)), 
        pch = "|", frame.plot = FALSE)
    UPS <- length(which(res$input_profile > 0))
    DNS <- length(which(res$input_profile < 0))
    TOTAL <- nrow(res$input_profile)
    mtext(paste(TOTAL, "genes in total,", UPS, "trending up-regulated,", 
    DNS, "trending down-regulated"))
    pl <- recordPlot()
    pl
}

plot_geneset_hist <- function(res) {
    par(mfrow = c(3, 1))
    geneset_counts <- res$analysis_metrics$geneset_counts
    boxplot(geneset_counts$count, horizontal = TRUE, frame = FALSE, 
        main = "Gene set size", 
        xlab = "number of member genes included in profile")
    hist(geneset_counts$count, 100, xlab = "geneset size",
        main = "Histogram of geneset size")
    hist(geneset_counts$count, 100, xlim = c(0, 500), xlab = "geneset size",
        main = "Trimmed histogram of geneset size")
    pl <- recordPlot()
    pl
}

plot1d_volcano <- function(res) {
    par(mfrow = c(1, 1))
    sig <- subset(res$enrichment_result, p.adjustANOVA <= 0.05)
    plot(res$enrichment_result$s.dist, -log10(res$enrichment_result$pANOVA),
        xlab = "s score", ylab = "-log10(p-value)",
        main = "volcano plot of gene set enrichments",
        pch = 19, cex = 0.8)
    points(sig$s.dist, -log10(sig$pANOVA), pch = 19, cex = 0.85, col = "red")
    TOTAL <- nrow(res$enrichment_result)
    SIG <- nrow(sig)
    UP <- length(which(sig$s.dist > 0))
    DN <- length(which(sig$s.dist < 0))
    SUBHEADER <- paste(TOTAL, "gene sets in total,", UP, "upregulated and ",
        DN, "downregulated (FDR<=0.05)")
    mtext(SUBHEADER)
    pl <- recordPlot()
    pl
}

plot1d_detailed <- function(res, i) {
    par(mfrow = c(3, 1))
    ss <- res$ranked_profile
    sss <- res$detailed_sets[[i]]
    set <- names(res$detailed_sets[i])
    size <- length(sss)
    
    beeswarm(sss, vertical = FALSE, cex = 0.75, xlim = c(min(ss), max(ss)),
        col = "darkgray", pch = 19, main = set, cex.main = 1.5,
        xlab = paste("ranked DE score in:", colnames(ss)))
    mtext("beeswarm plot", cex = 0.8)
    
    hist(sss, xlim = c(min(ss), max(ss)), breaks = 15, col = "darkgray", 
        main = NULL, border = "black", 
        xlab = paste("ranked DE score in:", colnames(ss)))
    mtext("histogram", cex = 0.8)
    
    plot(sss, rep(1, length(sss)), type = "n", xlim = c(min(ss), max(ss)), 
        frame = FALSE, axes = FALSE, ylab = "", 
        xlab = paste("ranked DE score in:", colnames(ss)))
    rug(sss, ticksize = 0.9)
    axis(1)
    mtext("rugplot", cex = 0.8)
    pl <- recordPlot()
    pl
}

plot2d_profile_dist <- function(res) {
    plot(res$input_profile, pch = 19, 
        col = rgb(red = 0, green = 0, blue = 0, alpha = 0.2), 
        main = "Scatterplot of all genes")
    abline(v = 0, h = 0, lty = 2, lwd = 2, col = "blue")
    pl <- recordPlot()
    pl
}


plot2d_profile_density <- function(res) {
    palette <- colorRampPalette(c("white", "yellow", "orange", "red", 
        "darkred", "black"))
    
    ss <- res$ranked_profile
    xmin <- min(ss[, 1])
    xmax <- max(ss[, 1])
    ymin <- min(ss[, 2])
    ymax <- max(ss[, 2])
    
    k <- MASS::kde2d(ss[, 1], ss[, 2])
    X_AXIS <- paste("Rank in contrast", colnames(ss)[1])
    Y_AXIS <- paste("Rank in contrast", colnames(ss)[2])
    
    filled.contour(k, xlim = c(xmin, xmax), ylim = c(ymin, ymax), 
        color.palette = palette, plot.title = {
            abline(v = 0, h = 0, lty = 2, lwd = 2, col = "blue")
            title(main = "Rank-rank plot of all genes", xlab = X_AXIS, 
            ylab = Y_AXIS)
        })
    pl <- recordPlot()
    pl
}

plot2d_gene_quadrant_barchart <- function(res) {
    uu <- length(which(res$input_profile[, 1] > 0 & res$input_profile[, 2] > 0))
    ud <- length(which(res$input_profile[, 1] > 0 & res$input_profile[, 2] < 0))
    dd <- length(which(res$input_profile[, 1] < 0 & res$input_profile[, 2] < 0))
    du <- length(which(res$input_profile[, 1] < 0 & res$input_profile[, 2] > 0))
    a <- as.data.frame(c(uu, ud, dd, du))
    rownames(a) <- c("top-right", "bottom-right", "bottom-left", "top-left")
    colnames(a) <- "a"
    barplot(a$a, names.arg = rownames(a), 
        main = "number of genes in each quadrant")
    pl <- recordPlot()
    pl
}

plot2d_set_quadrant_barchart <- function(res) {
    par(mfrow = c(1, 1))
    a <- res$analysis_metrics[14]
    a <- as.data.frame(as.numeric(unlist(strsplit(as.character(a), ","))), 
    stringsAsFactors = FALSE)
    rownames(a) <- c("top-right", "bottom-right", "bottom-left", "top-left")
    colnames(a) <- "a"
    barplot(a$a, names.arg = rownames(a), main = "number of genesets FDR<0.05")
    pl <- recordPlot()
    pl
}


plot2d_set_scatter <- function(res) {
    sig <- subset(res$enrichment_result, p.adjustMANOVA < 0.05)
    plot(res$enrichment_result[, 4:5], pch = 19, col = rgb(red = 0, green = 0, 
        blue = 0, alpha = 0.2),
        main = "Scatterplot of all gene sets; FDR<0.05 in red")
    abline(v = 0, h = 0, lty = 2, lwd = 2, col = "blue")
    points(sig[, 4:5], pch = 19, col = rgb(red = 1, green = 0, blue = 0,
    alpha = 0.5))
    pl <- recordPlot()
    pl
}

plot2d_set_scatter_top <- function(res) {
    resrows <- length(res$detailed_sets)
    top <- head(res$enrichment_result, resrows)
    plot(res$enrichment_result[, 4:5], pch = 19, col = rgb(red = 0, green = 0,
        blue = 0, alpha = 0.2),
        main = paste("Scatterplot of all gene sets; top", resrows, 
        "in red"))
    abline(v = 0, h = 0, lty = 2, lwd = 2, col = "blue")
    points(top[, 4:5], pch = 19, col = rgb(red = 1, green = 0, blue = 0,
    alpha = 0.5))
    pl <- recordPlot()
    pl
}

plot2d_heatmap <- function(res) {
    d <- ncol(res$input_profile)
    resrows <- length(res$detailed_sets)
    pl <- NULL
    if (resrows > 2) {
        hmapx <- head(res$enrichment_result[, 4:(4 + d - 1)], resrows)
        rownames(hmapx) <- head(res$enrichment_result$set, resrows)
        colnames(hmapx) <- gsub("^s.", "", colnames(hmapx))
        my_palette <- colorRampPalette(c("blue", "white", "red"))(n = 25)
        heatmap.2(as.matrix(hmapx), scale = "none", margins = c(10, 25),
            cexRow = 0.8, trace = "none", cexCol = 0.8, col = my_palette)
        pl <- recordPlot()
    }
    pl
}

plot_effect_vs_significance <- function(res) {
    par(mfrow = c(1, 1))
    plot(res$enrichment_result$s.dist,
        -log(res$enrichment_result$p.adjustMANOVA), 
        xlab = "s.dist (effect size)", 
        ylab = "-log(p.adjustMANOVA) (significance)", 
        pch = 19, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.2),
        main = "effect size versus statistical significance")
    pl <- recordPlot()
    pl
}


plot2d_detailed_density <- function(res, i) {
    palette <- colorRampPalette(c("white", "yellow", "orange", "red",
        "darkred", "black"))
    ss <- res$ranked_profile
    xmin <- min(ss[, 1])
    xmax <- max(ss[, 1])
    ymin <- min(ss[, 2])
    ymax <- max(ss[, 2])
    ll <- res$enrichment_result[i, ]
    size <- ll$setSize
    sss <- res$detailed_sets[[i]]
    X_AXIS <- paste("Rank in contrast", colnames(ss)[1])
    Y_AXIS <- paste("Rank in contrast", colnames(ss)[2])
    par(mar = c(5, 4, 4, 2))
    k <- MASS::kde2d(sss[, 1], sss[, 2])
    filled.contour(k, color.palette = palette, xlim = c(xmin, xmax),
        ylim = c(ymin, ymax), plot.title = {
        abline(v = 0, h = 0, lty = 2, lwd = 2, col = "blue")
        title(main = ll$set, xlab = X_AXIS, ylab = Y_AXIS)
    })
    pl <- recordPlot()
    pl
}


plot2d_detailed_scatter <- function(res, i) {
    ss <- res$ranked_profile
    xmin <- min(ss[, 1])
    xmax <- max(ss[, 1])
    ymin <- min(ss[, 2])
    ymax <- max(ss[, 2])
    sss <- res$detailed_sets[[i]]
    X_AXIS <- paste("Rank in contrast", colnames(ss)[1])
    Y_AXIS <- paste("Rank in contrast", colnames(ss)[2])
    ll <- res$enrichment_result[i, ]
    plot(sss, pch = 19, col = rgb(red = 0, green = 0, blue = 0, alpha = 0.2),
        main = ll$set, xlim = c(xmin, xmax), ylim = c(ymin, ymax), 
        xlab = X_AXIS, ylab = Y_AXIS)
    abline(v = 0, h = 0, lty = 2, lwd = 2, col = "blue")
    pl <- recordPlot()
    pl
}


plot2d_detailed_violin <- function(res, i) {
    pl <- list()
    
    ss <- res$ranked_profile
    xmin <- min(ss[, 1])
    xmax <- max(ss[, 1])
    ymin <- min(ss[, 2])
    ymax <- max(ss[, 2])
    ll <- res$enrichment_result[i, ]
    size <- ll$setSize
    sss <- res$detailed_sets[[i]]
    X_AXIS <- paste("Rank in contrast", colnames(ss)[1])
    Y_AXIS <- paste("Rank in contrast", colnames(ss)[2])
    ss_long <- melt(ss)
    colnames(ss_long) <- c("gene","contrast","value")
    sss_long <- melt(sss)
    colnames(sss_long) <- c("gene","contrast","value")
    
    p <- ggplot(ss_long, aes(contrast, value)) + geom_violin(data = ss_long,
        fill = "grey", colour = "grey") + geom_boxplot(data = ss_long, 
        width = 0.9, fill = "grey", outlier.shape = NA, 
        coef = 0) + geom_violin(data = sss_long, fill = "black", 
        colour = "black") + geom_boxplot(data = sss_long, width = 0.1, 
        outlier.shape = NA) + labs(y = "Position in rank", title = ll[, 1])
    
    print(p + theme_bw() + theme(axis.text = element_text(size = 14), 
        axis.title = element_text(size = 15), 
        plot.title = element_text(size = 20)))
    pl <- recordPlot()
    pl
}


ggpairs_points <- function(res) {
    ggpairs_points_plot <- function(data, mapping, ...) {
        p <- ggplot(data = data, mapping = mapping) + 
            geom_point(alpha = 0.05) + 
            geom_vline(xintercept = 0, linetype = "dashed") +
            geom_hline(yintercept = 0, 
            linetype = "dashed")
    }
    
    p <- ggpairs(as.data.frame(res$input_profile),
        title = "Scatterplot of all genes", 
        lower = list(continuous = ggpairs_points_plot))
    print(p + theme_bw())
}


ggpairs_points_subset <- function(res) {
    d <- ncol(res$ranked_profile)
    ggpairs_points_plot <- function(data, mapping, ...) {
        p <- ggplot(data = data, mapping = mapping) +
            geom_point(alpha = 0.05) + 
            geom_vline(xintercept = 0, linetype = "dashed") +
            geom_hline(yintercept = 0, 
            linetype = "dashed")
    }
    enrichment_result_clipped <- res$enrichment_result[, 4:(3 + d)]
    colnames(enrichment_result_clipped) <- colnames(res$input_profile)
    p <- ggpairs(enrichment_result_clipped,
        title = "Scatterplot of all genessets; FDR<0.05 in red", 
        lower = list(continuous = ggpairs_points_plot))
    print(p + theme_bw())
}


ggpairs_contour <- function(res) {
    palette <- colorRampPalette(c("white", "yellow", "orange", "red",
        "darkred", "black"))
    ggpairs_func <- function(data, mapping, ...) {
        p <- ggplot(data = data, mapping = mapping) +
            stat_density2d(aes(fill = ..density..), 
            geom = "tile", contour = FALSE) +
            geom_vline(xintercept = 0, linetype = "dashed") + 
            geom_hline(yintercept = 0, linetype = "dashed") +
            scale_fill_gradientn(colours = palette(25))
        p
    }
    ss <- res$ranked_profile
    p <- ggpairs(as.data.frame(ss),
        title = "Contour plot of all genes after ranking", 
        lower = list(continuous = ggpairs_func),
            diag = list(continuous = wrap("barDiag", 
            binwidth = nrow(ss)/100)))
    print(p + theme_bw())
}


colname_substitute <- function(res) {
    d <- ncol(res$ranked_profile)
    if (d > 5) {
        mydims <- data.frame(attributes(res)$profile_dimensions)
        colnames(mydims) <- "dimensions"
        colnames(res$input_profile) <- paste("d",
            seq_len(ncol(res$input_profile)), sep = "")
        colnames(res$ranked_profile) <- paste("d",
            seq_len(ncol(res$ranked_profile)), sep = "")
    }
    res
}


gene_sector_table <- function(res) {
    mytheme <- gridExtra::ttheme_default(
        core = list(fg_params = list(cex = 0.5)), 
        colhead = list(fg_params = list(cex = 0.7)),
        rowhead = list(fg_params = list(cex = 0.7)))
    
    d <- ncol(res$ranked_profile)
    ss <- res$ranked_profile
    sig <- sign(ss)
    if (d < 6) {
        sig <- sign(ss)
        sector_count <- aggregate(seq(from = 1, to = nrow(sig)) ~ ., 
            sig, FUN = length)
        colnames(sector_count)[ncol(sector_count)] <- 
            "Number of genes in each sector"
        grid.newpage()
        grid.table(sector_count, theme = mytheme)
    }
}


geneset_sector_table <- function(res) {
    d <- ncol(res$ranked_profile)
    mytheme <- gridExtra::ttheme_default(
        core = list(fg_params = list(cex = 0.5)), 
        colhead = list(fg_params = list(cex = 0.7)),
        rowhead = list(fg_params = list(cex = 0.7)))
    
    sig <- 
        sign(res$enrichment_result[which(res$enrichment_result$p.adjustMANOVA< 
        0.05), 4:(4 + d - 1)])
    if (d < 6) {
        if (nrow(sig) > 0) {
            sector_count <- aggregate(seq(from = 1, to = nrow(sig)) ~ ., 
                sig, FUN = length)
            colnames(sector_count)[ncol(sector_count)] <-
                "Number of gene sets in each sector"
            grid.newpage()
            grid.table(sector_count, theme = mytheme)
        }
    }
}

heatmapx <- function(res) {
    d <- ncol(res$ranked_profile)
    resrows <- length(res$detailed_sets)
    hmapx <- head(res$enrichment_result[, 4:(4 + d - 1)], resrows)
    rownames(hmapx) <- head(res$enrichment_result$set, resrows)
    colnames(hmapx) <- gsub("^s.", "", colnames(hmapx))
    my_palette <- colorRampPalette(c("blue", "white", "red"))(n = 25)
    heatmap.2(as.matrix(hmapx), scale = "none", margins = c(10, 25),
        cexRow = 0.8, trace = "none", cexCol = 0.8, col = my_palette)
    pl <- recordPlot()
    pl
}

plot3d_detailed_density <- function(res, i) {
    palette <- colorRampPalette(c("white", "yellow", "orange", "red",
        "darkred", "black"))
    d <- ncol(res$ranked_profile)
    ss <- res$ranked_profile
    ll <- res$enrichment_result[i, ]
    size <- ll$setSize
    sss <- res$detailed_sets[[i]]
    
    empty_cnt <- length(which(is.na(cor(sss,use="pairwise.complete.obs"))))
    if ( empty_cnt > 0 ) { return("Too few genes. Skipping density plot.") }

    if (d > 5) {
        colnames(sss) <- paste("d", seq_len(ncol(res$input_profile)), sep = "")
    }
    
    ggpairs_contour_limit_range <- function(data, mapping, ...) {
        p <- ggplot(data = data, mapping = mapping) +
            stat_density2d(aes(fill = ..density..), 
            geom = "tile", contour = FALSE) +
            geom_vline(xintercept = 0, linetype = "dashed") + 
            geom_hline(yintercept = 0, linetype = "dashed") +
            scale_fill_gradientn(colours = palette(25)) + 
            scale_x_continuous(limits = 
                range(min(ss[, gsub("~", "", as.character(mapping[1]))]), 
                max(ss[, gsub("~", "", 
                as.character(mapping[1]))]))) +
            scale_y_continuous(limits = range(min(ss[, 
            gsub("~", "", as.character(mapping[2]))]),
            max(ss[, gsub("~", "", as.character(mapping[2]))])))
        p
    }
    
    p <- ggpairs(as.data.frame(sss), title = ll[, 1],
        lower = list(continuous = ggpairs_contour_limit_range), 
        diag = list(continuous = wrap("barDiag", binwidth = nrow(ss)/10)))
    print(p + theme_bw())
}


plot3d_detailed_points <- function(res, i) {
    d <- ncol(res$ranked_profile)
    ss <- res$ranked_profile
    ll <- res$enrichment_result[i, ]
    size <- ll$setSize
    sss <- res$detailed_sets[[i]]

    empty_cnt <- length(which(is.na(cor(sss,use="pairwise.complete.obs"))))
    if ( empty_cnt > 0 ) { return("Too few genes. Skipping ggpairs plot.") }
    
    if (d > 5) {
        colnames(sss) <- paste("d", seq_len(ncol(res$input_profile)), sep = "")
    }
    
    ggpairs_points_limit_range <- function(data, mapping, ...) {
        p <- ggplot(data = data, mapping = mapping) +
            geom_point(alpha = 0.1) +
            geom_vline(xintercept = 0, linetype = "dashed") +
            geom_hline(yintercept = 0, linetype = "dashed") + 
            scale_x_continuous(limits =
                range(min(ss[, gsub("~", "", as.character(mapping[1]))]), 
                max(ss[, gsub("~", "", as.character(mapping[1]))]))) +
            scale_y_continuous(limits = range(min(ss[, 
            gsub("~", "", as.character(mapping[2]))]), 
            max(ss[, gsub("~", "", as.character(mapping[2]))])))
        p
    }
    
    p <- ggpairs(as.data.frame(sss), title = ll[, 1],
        lower = list(continuous = ggpairs_points_limit_range), 
        diag = list(continuous = wrap("barDiag", binwidth = nrow(ss)/10)))
    print(p + theme_bw())
}


plot3d_detailed_violin <- function(res, i) {
    d <- ncol(res$ranked_profile)
    ss <- res$ranked_profile
    ll <- res$enrichment_result[i, ]
    sss <- res$detailed_sets[[i]]

    empty_cnt <- apply(sss,2,function(x) sum(as.numeric(is.finite(x))))
    empty_cnt <- length(which(empty_cnt==0))
    if ( empty_cnt > 0 ) { return("Too few genes. Skipping violin plot.") }
    
    if (d > 5) {
        colnames(sss) <- paste("d", seq_len(ncol(res$input_profile)), sep = "")
    }
    ss_long <- melt(ss)
colnames(ss_long) <- c("gene","contrast","value")
    sss_long <- melt(sss)
colnames(sss_long) <- c("gene","contrast","value")
    p <- ggplot(ss_long, aes(contrast, value)) +
        geom_violin(data = ss_long, fill = "grey", colour = "grey") +
        geom_boxplot(data = ss_long, width = 0.9, fill = "grey", 
        outlier.shape = NA, coef = 0) +
        geom_violin(data = sss_long, fill = "black", colour = "black") +
        geom_boxplot(data = sss_long, width = 0.1, outlier.shape = NA) + 
        labs(y = "Position in rank", title = ll[, 1])
    
    print(p + theme_bw() +
        theme(axis.text = element_text(size = 14),
        axis.title = element_text(size = 15), 
        plot.title = element_text(size = 20)))
}


#' mitch_plots
#'
#' This function generates several plots of multivariate gene set enrichment in
#' high resolution PDF format.
#' The number of detailed sets to generate is dictated by the resrows set in
#' the mitch_calc command.
#' @param res a mitch results object.
#' @param outfile the destination file for the plots in PDF format. should
#' contain 'pdf' suffix. Defaults to 
#' 'Rplots.pdf'
#' @return generates a PDF file containing enrichment plots.
#' @keywords mitch plot plots pdf 
#' @export
#' @examples
#' data(resExample)
#' mitch_plots(resExample,outfile='outres.pdf')
#' @import grDevices
#' @import graphics
#' @import GGally
#' @import grid
#' @import gridExtra
#' @importFrom beeswarm beeswarm
#' @importFrom gplots heatmap.2
#' @importFrom reshape2 melt
#' @import ggplot2
#' @importFrom MASS kde2d
mitch_plots <- function(res, outfile = "Rplots.pdf") {
    
    resrows <- length(res$detailed_sets)
    d <- ncol(res$ranked_profile)
    pdf(outfile)
    
    if ( d>20 ) {
        stop("Error: mitch plotting features are impractical for over 20
        dimensions.")
    }

    if (d == 1) {
        
        plot1d_profile_dist(res)
        plot_geneset_hist(res)
        plot1d_volcano(res)
        lapply(seq_len(resrows), function(i) {
            plot1d_detailed(res, i)
        })
        
    } else if (d == 2) {
        
        plot2d_profile_dist(res)
        plot2d_profile_density(res)
        plot2d_gene_quadrant_barchart(res)
        plot_geneset_hist(res)
        plot2d_set_quadrant_barchart(res)
        plot2d_set_scatter(res)
        plot2d_set_scatter_top(res)
        plot2d_heatmap(res)
        plot_effect_vs_significance(res)
        
        lapply(seq_len(resrows), function(i) {
            plot2d_detailed_density(res, i)
            plot2d_detailed_scatter(res, i)
            plot2d_detailed_violin(res, i)
        })
    } else if (d > 2) {
        
        res <- colname_substitute(res)
        ggpairs_points(res)
        ggpairs_contour(res)
        gene_sector_table(res)
        plot_geneset_hist(res)
        geneset_sector_table(res)
        ggpairs_points_subset(res)
        heatmapx(res)
        plot_effect_vs_significance(res)
        
        lapply(seq_len(resrows), function(i) {
            plot3d_detailed_density(res, i)
            plot3d_detailed_points(res, i)
            plot3d_detailed_violin(res, i)
        })
    }
    dev.off()
}


#' mitch_report
#'
#' This function generates an R markdown based html report containing tables
#' and several plots of mitch results 
#' The plots are in png format, so are not as high in resolution as compared to
#' the PDF generated by mitch_plots 
#' function. The number of detailed sets to generate is dictated by the resrows
#' set in the mitch_calc command.
#' @param res a mitch results object.
#' @param outfile the destination file for the html report. should contain
#' 'html' suffix. Defaults to 
#' 'Rplots.pdf'
#' @param overwrite should overwrite the report file if it already exists?
#' @return generates a HTML file containing enrichment plots.
#' @keywords mitch report html markdown knitr
#' @export
#' @examples
#' data(resExample)
#' mitch_report(resExample,'outres2.html')
#' @import knitr
#' @importFrom rmarkdown render
#' @import echarts4r
#' @import kableExtra
mitch_report <- function(res, outfile , overwrite=FALSE) {

    df <- data.frame(dummy_x = seq(20), dummy_y = rnorm(20, 10, 3))
    trash<-df %>% 
        e_charts(dummy_x) %>% 
        e_scatter(dummy_y, symbol_size = 10)

    DIRNAME <- normalizePath(dirname(outfile))
    HTMLNAME <- paste( basename(outfile), ".html", sep = "")
    HTMLNAME <- gsub(".html.html$", ".html", HTMLNAME)
    
    if (file.exists(HTMLNAME)) {
        if (overwrite==FALSE) {
            stop("Error: the output HTML file aready exists.")
        } else {
            message("Note: overwriting existing report")
        }
    }

    if (!file.exists(DIRNAME)) {
        stop("Error: the output folder does not exist.")
    }
    
    rmd_tmpdir <- tempdir()
    rmd_tmpfile <- paste(rmd_tmpdir, "/mitch.Rmd", sep = "")
    html_tmp <- paste(paste(rmd_tmpdir, "/mitch_report.html", sep = ""))
    
    DATANAME <- gsub(".html$", ".rds", HTMLNAME)
    DATANAME <- paste(rmd_tmpdir, "/", DATANAME, sep = "")
    saveRDS(res,DATANAME)
    MYMESSAGE <- paste("Dataset saved as \"", DATANAME, "\".")
    message(MYMESSAGE)
    
    knitrenv <- new.env()
    assign("DATANAME", DATANAME, knitrenv)
    assign("res", res, knitrenv)
    
    rmd <- system.file("mitch.Rmd", package = "mitch")
    rmarkdown::render(rmd, intermediates_dir = "." , output_file = html_tmp)
    file.copy(html_tmp, outfile, overwrite=overwrite)
}



#' Reactome gene sets
#'
#' Genesets from Reactome database suitable for enrichment analysis.
#' Acquired August 2019. The structure of this data is a named list of
#' vectors, containing human gene names (character strings). This is 
#' a sample of 200 gene sets from the approximately 2000 present in the
#' full dataset.
#' @docType data
#' @usage data(genesetsExample)
#' @format A list of gene sets
#' @keywords datasets
#' @references Fabregat et al. (2017) BMC Bioinformatics volume 18, 
#' Article number: 142, https://www.ncbi.nlm.nih.gov/pubmed/28249561
#' @source Reactome website: https://reactome.org/
#' @examples
#' data(genesetsExample)
"genesetsExample"

#' H3K36ac profile
#' 
#' Example edgeR result of differential ChIP-seq H3K36ac.
#' This is a dataframe which contains columns for log fold change, log counts
#' per million, p-value and FDR adjusted p-value. These columns consist of 
#' numerical values. The row names represent human gene names. This is a sample
#' of 1000 gene of an original dataset that contains measurements of ~30000 
#' genes.
#' @docType data
#' @usage data(k36a)
#' @format data frame
#' @keywords datasets
#' @examples
#' data(k36a)
"k36a"

#' H3K9ac profile
#' 
#' Example edgeR result of differential ChIP-seq H3K9ac.
#' This is a dataframe which contains columns for log fold change, log counts
#' per million, p-value and FDR adjusted p-value. These columns consist of 
#' numerical values. The row names represent human gene names. This is a sample
#' of 1000 gene of an original dataset that contains measurements of ~30000 
#' genes.
#' @docType data
#' @usage data(k9a)
#' @format data frame     
#' @keywords datasets
#' @examples
#' data(k9a)
"k9a"

#' RNA profile
#' 
#' Example edgeR result of differential RNA expression.
#' This is a dataframe which contains columns for log fold change, log counts
#' per million, p-value and FDR adjusted p-value. These columns consist of 
#' numerical values. The row names represent human gene names. This is a sample
#' of 1000 gene of an original dataset that contains measurements of ~15000 
#' genes.
#' @docType data
#' @usage data(rna)
#' @format data frame     
#' @keywords datasets
#' @examples
#' data(rna)
"rna"

#' myList: A list of three edgeR results 
#' 
#' Example edgeR results of differential RNA, H3K9ac and H3K36ac profiling.
#' The structure of this data is a list of three dataframes. Each data frame
#' is 1000 lines only.
#' @docType data
#' @usage data(myList)
#' @format data frame     
#' @keywords datasets
#' @examples
#' data(myList)
"myList"

#' myImportedData: Example imported profiles
#' 
#' Example of three edgeR profiles imported using mitch.
#' The structure of this data is a dataframe where each column represents
#' one of the following profiling datasets after scoring: RNA, H3K9ac and
#' H3K36ac. Each row represents one gene and this dataset contains just 1000
#' rows to keep the example dataset small.
#' @docType data
#' @usage data(myImportedData)
#' @format data frame     
#' @keywords datasets
#' @examples
#' data(myImportedData)
"myImportedData"

#' resExample: Example mitch result
#' 
#' Example of mitch results. Enrichment of the Reactome gene sets in the RNA,
#' H3K9ac and H3K36ac datasets. The structure of this data set is a list where
#' the 1st element is "input_profile" that has been imported (data frame), 2nd
#' element is the "input_genesets" (names list of gene names [character 
#' vectors]), 3rd is "ranked_profile" which is the input profiling data after
#' ranking (data frame), 4th is "enrichment_result" which is a data frame which
#' provides enrichment information on each gene set in the profiling data
#' including s scores, p-values and FDR adjusted p-values. 5th is 
#' "analysis_metrics" (list). The 6th slot is "detailed_sets" which is a list
#' of 5 matrices which details the enrichment of members of selected gene sets.
#' @docType data
#' @usage data(resExample)
#' @format list of mixed data types
#' @keywords datasets
#' @examples
#' data(resExample)
"resExample"