R/checks.R
In ADAM: ADAM: Activity and Diversity Analysis Module

##########
#Argument check functions
##########

if(getRversion() >= "3.5"){
    utils::globalVariables(c("GOID","Ontology","Term","mappedkeys",
                            "org.At.tairENTREZID","org.Sc.sgdENTREZID",
                            "org.Pf.plasmoALIAS2ORF"))
}

.checkExpressionData <- function(ExpressionData){
    
    DataFile <- ExpressionData
    
    if(is(DataFile,"SummarizedExperiment") | 
        is(DataFile,"RangedSummarizedExperiment")){
        DataFile <- as.data.frame(assay(DataFile))
        colnames(DataFile)[1] <- "gene"
        GeneColumn <- as.data.frame(as.character(DataFile$gene))
        DataFile$gene <- NULL
        DataFile <- as.matrix(DataFile)
        DataFile <- apply(DataFile,2,as.numeric)
        DataFile <- as.data.frame(DataFile)
        DataFile <- cbind.data.frame(GeneColumn,DataFile)
    }else if(!is.data.frame(DataFile)){
        if(!file.exists(DataFile)){
            stop("Please, check the format of the expression data. 
                ExpressionData argument must be a valid file or a 
                datra frame.")
        }else{
            DataFile <- read.table(ExpressionData, header = TRUE, sep = "\t",
                                    quote = "", stringsAsFactors = FALSE)
        }
    }
    colnames(DataFile)[1] <- "gene"
    
    if(ncol(DataFile)<3){
        stop("Check the data format to Conditions! The Expression data must 
            have at list 3 columns.")
    }else{
        ExpressionColumns <- DataFile    
        ExpressionColumns$gene <- NULL
        ExpressionColumns <- as.list(ExpressionColumns)
        ExpressionColumns <- sapply(ExpressionColumns,
                                function(ExpColumn) is.numeric(ExpColumn))
        if(sum(ExpressionColumns)!=(ncol(DataFile)-1)){
            stop("Check the format data! Expression values must be numeric.")
        }else if(length(unique(as.character(DataFile$gene))) != 
                length(as.character(DataFile$gene))){
            stop("Gene identifiers must be unique. Check for duplicated
            gene IDs.")
        }
    }
    return(DataFile)
}

.checkComparisonID <- function(ComparisonID,ExpressionData){
    ExpressionData <- .checkExpressionData(ExpressionData)
    ListComparisons <- as.list(ComparisonID)
    ListComparisons <- lapply(ListComparisons,function(CompID) length(unlist(
                                strsplit(CompID,","))))
    ListComparisons <- sum(ifelse(ListComparisons!=2,1,0))
    ExpDataColumns <- colnames(ExpressionData)[-1]
    if(!is.vector(ComparisonID)){
        stop("Check the format data. ComparisonID argument must be a vector!")
    }else if(ListComparisons>0){
        stop("Comparison IDs must have 2 elements!")
    }else if(sum(ifelse(unique(unlist(strsplit(ComparisonID,","))) %in% 
            ExpDataColumns,0,1)) > 0){
        stop("Invalid Comparison IDs! The IDs must correspond to the
                Expression Data column names.")
    }
    return(ComparisonID)
}


.checkGeneNumbers <- function(MinGene,MaxGene){
    if(!is.integer(MinGene) | !is.integer(MaxGene)){
        if(!is.numeric(MinGene) | !is.numeric(MaxGene)){
            stop("Please, check the format of Conditions! MinGene and MaxGene 
            arguments must be integer.")
        }else{
            MinGene <- as.integer(MinGene)
            MaxGene <- as.integer(MaxGene)
        }
    }
    if(MinGene<=0 | MaxGene<=0){
        stop("Check the Data format to Conditions! The number of genes must be
            positive and different from zero.")
    }else if(MinGene>MaxGene){
        stop("Check the Data format to Conditions! The maximum number of genes
            must be greater than the minimum number.")
    }
    return(c(MinGene,MaxGene))
}

.checkSeedNumber <- function(SeedNumber){
    if(!is.numeric(SeedNumber)){
        stop("Please, check the format of Conditions! SeedNumber argument must
            be numeric.")
    }else if(SeedNumber<0){
            stop("Check the Data format Conditions!The seed must be a positive
                value.")
    }
    return(SeedNumber)
}

.checkBootstrapNumber <- function(BootstrapNumber){
    if(!is.integer(BootstrapNumber)){
        if(!is.numeric(BootstrapNumber)){
            stop("Please, check the format of Conditions! BootstrapNumber
                argument must be integer.")
        }else{
            BootstrapNumber<-as.integer(BootstrapNumber)
        }
    }
    if(BootstrapNumber<=0){
        stop("Check the Data format Conditions! The number of bootstrap steps
            genes must be positive and different from zero.")
    }
    return(BootstrapNumber)
}

.checkPCorrection <- function(PCorrection){
    if(!is.numeric(PCorrection)){
        stop("Please, check the format of Conditions! PCorrection argument
            must be numeric.")
    }else if(PCorrection<0 || PCorrection>1){
        stop("Check the Data format Conditions! Correction value must be 
            between zero and one.")
    }
    return(PCorrection)
}

.checkPCorrectionMethod <- function(PCorrectionMethod){
    options <- c("holm", "hochberg", "hommel", "bonferroni", "bh", "by","fdr")
    OptionsPAdjust <- c("holm", "hochberg", "hommel", "bonferroni", "BH",
                        "BY","fdr")
    if  (!(tolower(PCorrectionMethod) %in% options)){
        stop(paste("Correction method",PCorrectionMethod,"not found."))
    }
    return(OptionsPAdjust[grep(tolower(PCorrectionMethod),options)])
}

.checkWilcoxonTest <- function(WilcoxonTest){
    if(!is.logical(WilcoxonTest)){
        stop("Please, check the format of Conditions! WilcoxonTest argument 
            must be logical.")
    }
    return(WilcoxonTest)
}

.checkFisherTest <- function(FisherTest){
    if(!is.logical(FisherTest)){
        stop("Please, check the format of Conditions! FisherTest argument must
            be logical.")
    }
    return(FisherTest)
}

###
# Creates a data frame relating Species and gene annotation package. This 
# data frame is used by .checkAnalysisDomain_DBSpecies_GeneIdentifier funcion.
###
.CreateDataSpecies <-function(){
    SpeciesDescription <- c("Anopheles gambiae", "Arabdopsis thaliana",
                        "Bos taurus", "Caenorhabditis elegans",
                        "Canis familiaris", "Drosophila melanogaster",
                        "Danio rerio", "Escherichia coli K12",
                        "Escherichia coli Sakai", "Gallus gallus",
                        "Homo sapiens", "Mus musculus", "Macaca mulatta",
                        "Plasmodium falciparum", "Pan troglodytes",
                        "Rattus norvegicus", "Saccharomyces cerevisiae",
                        "Sus scrofa","Xenopus laevis")
    SpeciesID <- c("aga","ath","bta","cel","cfa","dme","dre","eco","ecs",
                "gga","hsa","mmu","mcc","pfa","ptr","rno","sce","ssc","xla")
    DBSpeciesList <- c("org.Ag.eg.db", "org.At.tair.db", "org.Bt.eg.db",
                    "org.Ce.eg.db", "org.Cf.eg.db","org.Dm.eg.db",
                    "org.Dr.eg.db", "org.EcK12.eg.db", "org.EcSakai.eg.db",
                    "org.Gg.eg.db", "org.Hs.eg.db", "org.Mm.eg.db",
                    "org.Mmu.eg.db", "org.Pf.plasmo.db", "org.Pt.eg.db",
                    "org.Rn.eg.db", "org.Sc.sgd.db", "org.Ss.eg.db",
                    "org.Xl.eg.db")

    DataFrameSpecies <- data.frame(DBSpeciesList, SpeciesDescription,
                                    SpeciesID)
    return(DataFrameSpecies)
}

.checkAnalysisDomain_DBSpecies_GeneIdentifier <- function(AnalysisDomain,
                                                        DBSpecies,
                                                        GeneIdentifier,
                                                        GeneExpressionData){
    #Creates a data frame relating Species and gene annotation packages
    DataFrameSpecies <- .CreateDataSpecies()
    
    #Available domains
    options <- c("gobp","gocc","gomf","kegg","own")
    
    #Available nomenclatures
    GeneNomenclatures <- c("symbol","entrez","tair","orf")
    
    SpeciesID <- DBSpecies
    if(!(tolower(AnalysisDomain) %in% options)){
        stop(paste("Analysis domain",AnalysisDomain,"does not exist!"))
    }else if(AnalysisDomain!=options[5]){
        if(!is.data.frame(SpeciesID)){
            if(!file.exists(SpeciesID)){
                if(!SpeciesID %in% DataFrameSpecies$DBSpeciesList){
                    if(tolower(SpeciesID)=="eck12"){
                        SpeciesID <- "EcK12"
                    }else if(tolower(SpeciesID)=="ecsakai"){
                        SpeciesID <- "EcSakai"
                    }else if(tolower(SpeciesID)=="mmu"){
                        SpeciesID <- "Mmu"
                    }else{
                        SpeciesID <- unlist(strsplit(SpeciesID,""))
                        SpeciesID <- paste0(toupper(SpeciesID[1]),
                            paste0(tolower(SpeciesID[2:length(SpeciesID)]),
                            collapse = ""))
                    }
                    SpeciesID <- ifelse(length(DataFrameSpecies$DBSpeciesList[
                        grep(SpeciesID,DataFrameSpecies$DBSpeciesList)])==0,
                        DBSpecies,
                        as.character(DataFrameSpecies$DBSpeciesList[
                        grep(SpeciesID,DataFrameSpecies$DBSpeciesList)]))
                    if(!SpeciesID %in% DataFrameSpecies$DBSpeciesList){
                        stop("Gene annotation package does not supported! 
                            Please check it.")
                    }
                }
            }else{
                stop("Please inform a valid annotation package name!")
            }
        }else{
            stop("Please inform a valid annotation package name!")
        }
    }

    ExpressionData <- GeneExpressionData
    if(!(is.data.frame(GeneExpressionData))){
        ExpressionData <- read.table(GeneExpressionData, sep = "\t",
                                    header = TRUE, quote = "",
                                    stringsAsFactors = FALSE)
    }
    colnames(ExpressionData)[1] <- "gene"
    
    if(tolower(AnalysisDomain)==options[5]){
        if(!is.data.frame(SpeciesID)){
            if(!file.exists(SpeciesID)){
                stop("Own functions file does not exist!")
            }else{
                DBFile <- read.table(SpeciesID, header = TRUE,
                                    sep = "\t", quote = "",
                                    stringsAsFactors = FALSE)
                colnames(DBFile) <- c("gene","ID","Description")
                DBFile <- .checkDB(DBFile = DBFile,
                        ExpressionData = ExpressionData)
            }
        }else{
            DBFile <- SpeciesID
            colnames(DBFile) <- c("gene","ID","Description")
            DBFile <- .checkDB(DBFile=DBFile,
                                ExpressionData = ExpressionData)
        }
    }else if(.checkPackage(NamePackage = SpeciesID)){
            if(!GeneIdentifier %in% GeneNomenclatures){
                stop("Gene identifier invalid!")
            }else if(GeneIdentifier==GeneNomenclatures[3] & 
                    SpeciesID!="org.At.tair.db"){
                stop("Incorrect GeneIdentifier!")
            }else if(GeneIdentifier==GeneNomenclatures[4] & 
                    SpeciesID!="org.Sc.sgd.db" & 
                    SpeciesID!="org.Pf.plasmo.db"){
                    stop("Incorrect GeneIdentifier!")
            }else{
                if(SpeciesID=="org.At.tair.db"){
                    SymbolEGClass <- org.At.tairENTREZID #EntrezID and tairID
                    SymbolEGList <- unlist(as.list(SymbolEGClass[mappedkeys(
                                    SymbolEGClass)]))
                    SymbolEGList <-data.frame(tair=names(SymbolEGList),
                                            entrez=as.vector(SymbolEGList))
            }else if(SpeciesID=="org.Sc.sgd.db"){
                    SymbolEGClass <- org.Sc.sgdENTREZID #EntrezID/ORF numbers
                    SymbolEGList <- unlist(as.list(SymbolEGClass[mappedkeys(
                                    SymbolEGClass)]))
                    SymbolEGList <-data.frame(orf=names(SymbolEGList),
                                    entrez=as.vector(SymbolEGList))
            }else if(SpeciesID=="org.Pf.plasmo.db"){
                    SymbolEGClass<-org.Pf.plasmoALIAS2ORF #GeneSymbol / ORF ID
                    SymbolEGList <- unlist(as.list(SymbolEGClass[mappedkeys(
                                    SymbolEGClass)]))
                    SymbolEGList <-data.frame(symbol=names(SymbolEGList),
                                    orf=as.vector(SymbolEGList))
            }else{
                SymbolEGClass <- get(paste0("org.",unlist(strsplit(SpeciesID,
                                "[.]"))[2],".egSYMBOL2EG")) #GeneSymbol/Entrez
                SymbolEGList <- unlist(as.list(SymbolEGClass[mappedkeys(
                                SymbolEGClass)]))
                SymbolEGList <-data.frame(symbol=names(SymbolEGList),
                                entrez=as.vector(SymbolEGList))
                
                if(SpeciesID=="org.Ag.eg.db"){
                    GenesAg<-str_split(as.character(SymbolEGList[,1]),"AgaP_")
                    GenesAg <- lapply(GenesAg, function(GeneNameAg){
                        if(length(GeneNameAg) > 1){
                            return(GeneNameAg[2])
                        }else{
                            return(GeneNameAg[1])
                        }
                    })
                    SymbolEGList$symbol <- unlist(GenesAg)
                }
            }

            if(sum(ifelse(ExpressionData$gene %in% 
                            SymbolEGList[,c(grep(tolower(GeneIdentifier),
                            colnames(SymbolEGList)))] |
                            tolower(ExpressionData$gene) %in% 
                            SymbolEGList[,c(grep(tolower(GeneIdentifier),
                            colnames(SymbolEGList)))] |
                            toupper(ExpressionData$gene) %in% 
                            SymbolEGList[,c(grep(tolower(GeneIdentifier),
                            colnames(SymbolEGList)))],1,0)) < 1){
                stop("The genes in the Expression Data are not in the species 
                    database. Check the gene identifiers!")
            }else{
                DBFile <- .checkGenesDomain(AnalysisDomain = 
                                                    tolower(AnalysisDomain), 
                DBSpecies = SpeciesID, 
                IDGene = tolower(GeneIdentifier), 
                ListGenes = SymbolEGList,
                keggID=as.character(DataFrameSpecies$SpeciesID[grep(SpeciesID,
                                    DataFrameSpecies$DBSpeciesList)]),
                ExpressionData = ExpressionData)
            }
        }
    }

    return(list(options[grep(tolower(AnalysisDomain),options)],DBFile[[1]],
            DBFile[[2]], tolower(GeneIdentifier)))
}

.checkPackage <- function(NamePackage){
    if(!suppressMessages(requireNamespace(NamePackage))){
        stop(paste("Annotation package",NamePackage,"not availlable.
                    Please install it!"))
    }else{
        suppressMessages(require(NamePackage, character.only = TRUE,
                                warn.conflicts = FALSE))
    }
    return(TRUE)
}

.checkDB <- function(DBFile,ExpressionData){
    if(ncol(DBFile)!=3){
        stop("Own functions file must have 3 columns!")
    }else if(nrow(DBFile)<1){
        stop("Own functions file is empty!")
    }else if(sum(ifelse(ExpressionData$gene %in% DBFile$gene | 
        tolower(ExpressionData$gene) %in% DBFile$gene |
        toupper(ExpressionData$gene) %in% DBFile$gene,1,0)) < 1){
        stop("The genes in the Expression Data are not in the species
            database. Check the gene identifiers!")
        }else if(length(colnames(DBFile))==length(colnames(ExpressionData))){
                if(sum(colnames(DBFile)==colnames(ExpressionData))==3){
                    stop("Check the ExpressionData and DBSpecies arguments.
                        The column names of both files are the same!!")
                }
    }

    DBFunctionsList <- lapply(as.vector(unique(as.character(DBFile$ID))), 
                        FUN = .ListConversionDB, DBFile = DBFile)
    names(DBFunctionsList) <- as.vector(unique(as.character(DBFile$ID)))
    TermOntologies <- DBFile
    TermOntologies$gene <- NULL 
    TermOntologies <- unique(TermOntologies)

    DBFunctionsListSample <- lapply(DBFunctionsList, function(ListElement,
                                    SampleGenes) {SampleGenes[SampleGenes %in% 
                                    ListElement]},
                                    SampleGenes = ExpressionData$gene)
    names(DBFunctionsListSample) <- vapply(names(DBFunctionsList),
                function(ListElement,TermOntologies){
                SubTermOntologies<-subset(TermOntologies,
                            TermOntologies$ID == ListElement)
                return(paste0(as.character(SubTermOntologies$ID),
                    " <==> ",as.character(SubTermOntologies$Description)))},
                TermOntologies=TermOntologies, FUN.VALUE = character(1))

    DBFunctionsListRaw <- lapply(DBFunctionsList,function(ListElement){
                                length(ListElement)})
    names(DBFunctionsListRaw) <- names(DBFunctionsList)

    return(list(DBFunctionsListSample,DBFunctionsListRaw))
}

.ListConversionDB <- function(ListElement,DBFile){
    ListResult <- subset(DBFile, DBFile$ID == ListElement)
    ListResult <- ListResult$gene
    return(ListResult)
}

.checkGenesDomain<-function(AnalysisDomain,DBSpecies,IDGene,ListGenes,keggID,
                            ExpressionData){

    #Each annotation package has its own set of functions that relates genes
    #to ontologies and kegg pathways. This way, we create a list linking the 
    #package to its functions. Besides that, the list also presents the 
    #default and alternative nomenclature supported according to each DB 
    #species. For example, let's consider Arabdopsis thaliana: List element
    #name: org.At.tair.db; Function to relate genes to gene ontologies terms:
    #org.At.tairGO2ALLTAIRS; Default gene nomenclature: "tair" ID;
    #Alternative gene nomenclature: "entrez" ID; Function to relate genes to
    #kegg pathways: org.At.tairARACYC.
    ListDBPackages <- list(org.At.tair.db = c("org.At.tairGO2ALLTAIRS","tair",
                                            "entrez","org.At.tairPATH2TAIR"),
                org.Sc.sgd.db = c("org.Sc.sgdGO2ALLORFS","orf",
                                "entrez","org.Sc.sgdPATH2ORF"),
                org.Pf.plasmo.db = c("org.Pf.plasmoGO2ALLORFS","orf",
                                "symbol","org.Pf.plasmoPATH2ORF"),
                otherDB=c(paste0("org.",unlist(strsplit(DBSpecies,"[.]"))[2],
                        ".egGO2ALLEGS"),"entrez","symbol",
                        paste0("org.",unlist(strsplit(DBSpecies,"[.]"))[2],
                        ".egPATH2EG")))

    if(DBSpecies %in% names(ListDBPackages)){
        OntologyFunction <- c(ListDBPackages[[grep(DBSpecies,names(
                            ListDBPackages))]][1],
                            ListDBPackages[[grep(DBSpecies,names(
                            ListDBPackages))]][2],
                            ListDBPackages[[grep(DBSpecies,names(
                            ListDBPackages))]][3],
                            ListDBPackages[[grep(DBSpecies,names(
                            ListDBPackages))]][4])
    }else{
        OntologyFunction <- c(ListDBPackages[[4]][1],ListDBPackages[[4]][2],
        ListDBPackages[[4]][3],ListDBPackages[[4]][4])
    }

    if(AnalysisDomain == "kegg"){
        keggPathways <- as.data.frame(keggList("pathway", keggID))
        colnames(keggPathways)[1] <- "ID"
        keggPathwaysID <- unlist(lapply(rownames(keggPathways),
                        function(KPathway)
                        unlist(strsplit(KPathway,paste0(":",keggID)))[2]))
        
        keggPathwaysDescription<-unlist(lapply(as.character(keggPathways$ID),
                                function(KPathway) unlist(strsplit(KPathway,
                                paste0("- ",keggID)))[1]))
        
        TermOntologies <- data.frame(ID = keggPathwaysID, 
                                    Description = keggPathwaysDescription)


        DBKeggClass <- get(OntologyFunction[4])
        
        DBFunctionsList <- as.list(DBKeggClass[mappedkeys(DBKeggClass)])

        DBFunctionsList <- lapply(DBFunctionsList, function(DBFunction) 
                                unique(as.vector(DBFunction)))
        
        DBFunctionsList <- mapply(.CheckTerm, ListElement = DBFunctionsList, 
                                ListElementName = names(DBFunctionsList),
                                MoreArgs =  list(VectorTerms = 
                                as.character(TermOntologies$ID)))

        TermOntologies <- TermOntologies[TermOntologies$ID %in% 
                            names(DBFunctionsList),]
        TermOntologies$ID <- paste0(keggID,TermOntologies$ID)

        DBFunctionsList<-DBFunctionsList[lapply(DBFunctionsList, length) > 0]
        names(DBFunctionsList) <- paste0(keggID,names(DBFunctionsList))
    }else{
        infoGO <- GOTERM
        TermOntologies <- data.frame(t(vapply(infoGO, function(GOntology) 
                    c(trimws(GOID(GOntology)), trimws(Ontology(GOntology)), 
                    trimws(Term(GOntology))), FUN.VALUE = character(3))))

        TermOntologies <- subset(TermOntologies, TermOntologies$X2 == 
                            toupper(unlist(strsplit(AnalysisDomain,"go"))[2]))
        colnames(TermOntologies) <- c("ID","Domain","Description")
        TermOntologies$Domain <- NULL
        rownames(TermOntologies) <- NULL

        DBFunctions <- get(OntologyFunction[1])
        DBFunctionsList <- as.list(DBFunctions[mappedkeys(DBFunctions)])
        DBFunctionsList <- lapply(DBFunctionsList, function(DBFunction)
                            unique(as.vector(DBFunction)))
        DBFunctionsList <- mapply(.CheckTerm, ListElement = DBFunctionsList,
                            ListElementName = names(DBFunctionsList),
                            MoreArgs =  list(VectorTerms = 
                            as.character(TermOntologies$ID)))
        DBFunctionsList<-DBFunctionsList[lapply(DBFunctionsList, length) > 0]
    }

    if(IDGene != OntologyFunction[2]){
        DBFunctionsList <- lapply(DBFunctionsList, FUN = .GeneIDConversion, 
                            ListGenes = ListGenes,
                            StandardGeneID = OntologyFunction[2], 
                            AlternativeGeneID = OntologyFunction[3])
    }

    TermOntologies <- subset(TermOntologies, TermOntologies$ID %in%
                        names(DBFunctionsList))

    DBFunctionsListSample <- lapply(DBFunctionsList, 
                            function(ListElement,SampleGenes){
                            SampleGenes[SampleGenes %in% 
                            ListElement]},
                            SampleGenes = ExpressionData$gene)

    CheckSample<-DBFunctionsListSample[lapply(DBFunctionsListSample,
                                                        length) > 0]
    
    if(length(CheckSample) == 0){
        stop(paste("Genes in the Expression Data file are not related to any",
                   AnalysisDomain,"term. Please check the genes in your
                   sample!"))
    }
    
    names(DBFunctionsListSample) <- vapply(names(DBFunctionsList),
                function(ListElement,TermOntologies){
                SubTermOntologies<-subset(TermOntologies, 
                            TermOntologies$ID==ListElement)
                return(paste0(as.character(SubTermOntologies$ID),
                    " <==> ",as.character(SubTermOntologies$Description)))},
                TermOntologies=TermOntologies, FUN.VALUE = character(1))

    
    DBFunctionsListRaw <- lapply(DBFunctionsList,function(ListElement){
                                length(ListElement)})
    names(DBFunctionsListRaw) <- names(DBFunctionsList)

    DBFunctionsListSample<-DBFunctionsListSample[lapply(DBFunctionsListSample,
                                                length) > 0]

    return(list(DBFunctionsListSample,DBFunctionsListRaw))
}

.CheckTerm <- function(ListElement,ListElementName,VectorTerms){
    if(as.character(ListElementName) %in% as.character(VectorTerms)){
        result <- ListElement
    }else{
        result <- NULL
    }
    return(result)
}

.GeneIDConversion <- function(ListElement,ListGenes,StandardGeneID,
                            AlternativeGeneID){
    SubListGenes <- subset(ListGenes, ListGenes[,grep(StandardGeneID,
                    colnames(ListGenes))] %in% ListElement)
    return(as.character(SubListGenes[,grep(AlternativeGeneID,
        colnames(SubListGenes))]))
}