R/parsePhyloProfile.R

Defines functions fromInputToProfile reduceProfile filterProfileData parseInfoProfile calcPresSpec sortInputTaxa getSelectedTaxonNames getInputTaxaName getInputTaxaID getTaxonomyMatrix getNameList getTaxonomyRanks

Documented in calcPresSpec filterProfileData fromInputToProfile getInputTaxaID getInputTaxaName getNameList getSelectedTaxonNames getTaxonomyMatrix getTaxonomyRanks parseInfoProfile reduceProfile sortInputTaxa

# Functions for parsing and pre-processing PhyloProfile input

#' Create a list containing all main taxanomy ranks
#' @export
#' @return A list of all main ranks (from strain to superkingdom)
#' @author Carla Mölbert (carla.moelbert@gmx.de)
#' @examples
#' getTaxonomyRanks()

getTaxonomyRanks <- function(){
    allRanks <- list(
        "Strain " = "strain",
        "Species" = "species",
        "Genus" = "genus",
        "Family" = "family",
        "Order" = "order",
        "Class" = "class",
        "Phylum" = "phylum",
        "Kingdom" = "kingdom",
        "Superkingdom" = "superkingdom",
        "unselected" = ""
    )
    return(allRanks)
}

#' Get list of pre-installed NCBI taxon names
#' @description Get all NCBI taxon names from
#' "PhyloProfile/data/taxonNamesReduced.txt"
#' @export
#' @param taxDB Path to the taxonomy DB files
#' @return List of taxon IDs, their full names, taxonomy ranks and parent IDs
#' obtained from "PhyloProfile/data/taxonNamesReduced.txt"
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @examples
#' getNameList()

getNameList <- function(taxDB = NULL) {
    if (is.null(taxDB)) {
        nameReducedFile <- paste(
            system.file(package = "PhyloProfile"),
            "PhyloProfile/data/taxonNamesReduced.txt", sep = "/"
        )
    } else nameReducedFile <- paste(taxDB, "taxonNamesReduced.txt", sep = "/")

    if (!file.exists(nameReducedFile)) {
        utils::data(taxonNamesReduced)
    } else {
        taxonNamesReduced <- data.table::fread(
            nameReducedFile, sep = "\t", header = TRUE, fill = TRUE,
            showProgress = FALSE
        )
    }

    taxonNamesReduced$fullName <- as.character(taxonNamesReduced$fullName)
    taxonNamesReduced$rank <- as.character(taxonNamesReduced$rank)
    taxonNamesReduced <- unique(taxonNamesReduced)

    return(taxonNamesReduced)
}

#' Get taxonomy matrix
#' @description Get the (full or subset) taxonomy matrix from
#' "data/taxonomyMatrix.txt" based on an input taxon list
#' @export
#' @param taxDB Path to the taxonomy DB files
#' @param subsetTaxaCheck TRUE/FALSE subset taxonomy matrix based on input taxon
#' IDs. Default = FALSE
#' @param taxonIDs list of input taxon IDs (e.g. ncbi1234). Default = NULL
#' @return Data frame contains the (subset of) taxonomy matrix for list of
#' input taxa.
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @examples
#' # get full pre-installed taxonomy matrix
#' getTaxonomyMatrix()
#' # get taxonomy matrix for a list of taxon IDs
#' taxonIDs <- c("ncbi9606", "ncbi10116")
#' getTaxonomyMatrix(NULL, TRUE, taxonIDs)

getTaxonomyMatrix <- function(
        taxDB = NULL, subsetTaxaCheck = FALSE, taxonIDs = NULL
){
    taxonomyMatrixFile <- if (is.null(taxDB)) {
        file.path(
            system.file(package = "PhyloProfile"),
            "PhyloProfile/data/taxonomyMatrix.txt"
        )
    } else {
        file.path(taxDB, "taxonomyMatrix.txt")
    }
    if (!file.exists(taxonomyMatrixFile)) {
        utils::data(taxonomyMatrix)
    } else {
        taxonomyMatrix <- data.table::fread(
            taxonomyMatrixFile, sep = "\t", header = TRUE,
            stringsAsFactors = TRUE, showProgress = FALSE
        )
    }
    if (subsetTaxaCheck) {
        if (!missing(taxonIDs)) {
            taxonomyMatrix <- taxonomyMatrix[
                taxonomyMatrix$abbrName %in% taxonIDs,]
        }
    }
    return(data.frame(taxonomyMatrix))
}

#' Get ID list of input taxa from the main input
#' @param rawProfile A dataframe of input phylogenetic profile in long format
#' @return List of all input taxon IDs (e.g. ncbi1234). Default = NULL.
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @export
#' @seealso \code{\link{createLongMatrix}}, \code{\link{mainLongRaw}}
#' @examples
#' data("mainLongRaw", package="PhyloProfile")
#' getInputTaxaID(mainLongRaw)

getInputTaxaID <- function(rawProfile = NULL){
    if (is.null(rawProfile)) stop("Input profile data cannot be NULL!")
    inputTaxa <- unique(rawProfile$ncbiID)
    inputTaxa <- inputTaxa[inputTaxa != "geneID"]
    # return input taxa
    return(inputTaxa)
}

#' Get NCBI taxon names for a selected list of taxa
#' @description Get NCBI taxon names from
#' "PhyloProfile/data/taxonNamesReduced.txt" for a list of input taxa
#' @param rankName taxonomy rank (e.g. "species","phylum",...)
#' @param taxonIDs list of taxon IDs (e.g. ncbi1234). Default = NULL
#' @param taxDB Path to the taxonomy DB files
#' @return Data frame contains a list of full names, taxonomy ranks and parent
#' IDs for the input taxa.
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @export
#' @seealso \code{\link{getInputTaxaID}} for getting input taxon IDs,
#' \code{\link{getNameList}} for getting the full taxon name list
#' @examples
#' taxonIDs <- c("ncbi9606", "ncbi10116")
#' getInputTaxaName("species", taxonIDs)

getInputTaxaName <- function(rankName, taxonIDs = NULL, taxDB = NULL){
    # check input parameters
    if (missing(rankName)) stop("No taxonomy rank name given!")
    allMainRanks <- getTaxonomyRanks()
    if (!(rankName[1] %in% allMainRanks)) stop("Invalid taxonomy rank given!")
    # load list of unsorted taxa
    taxMatrix <- getTaxonomyMatrix(
        taxDB, subsetTaxaCheck = TRUE, taxonIDs = taxonIDs
    )
    # load list of taxon name
    nameList <- getNameList(taxDB)
    # return
    choice <- data.frame(
        "ncbiID" = unlist(taxMatrix[[rankName]]), stringsAsFactors = FALSE
    )
    choice <- merge(choice, nameList, by = "ncbiID", all = FALSE)
    return(choice)
}

#' Get a subset of input taxa based on a selected taxonomy rank
#' @description Get a subset of taxon ncbi IDs and names from an input list of
#' taxa based on a selected supertaxon (identified by its taxonomy rank and
#' supertaxon name or supertaxon ID).
#' @usage getSelectedTaxonNames(inputTaxonIDs = NULL, rank = NULL,
#'     higherRank = NULL, higherID = NULL, higherName = NULL, taxDB = NULL)
#' @param inputTaxonIDs list of input taxon IDs (e.g. c("10116", "122586"))
#' @param rank taxonomy rank of input taxa (e.g. "species")
#' @param higherRank selected taxonomy rank (e.g. "phylum")
#' @param higherID supertaxon ID (e.g. 7711). NOTE: either supertaxon ID or
#' name is required, not neccessary to give both
#' @param higherName supertaxon name (e.g. "Chordata"). NOTE: either
#' supertaxon ID or name is required, not neccessary to give both
#' @param taxDB Path to the taxonomy DB files
#' @export
#' @return A data frame contains ncbi IDs and names of taxa from the input taxon
#' list that belong to the selected supertaxon.
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @examples
#' inputTaxonIDs <- c("10116", "122586", "123851", "13616", "188937", "189518",
#' "208964", "224129", "224324", "237631", "243230")
#' rank <- "species"
#' higherRank <- "phylum"
#' higherID <- 7711
#' getSelectedTaxonNames(inputTaxonIDs, rank, higherRank, higherID, NULL)
#' higherName <- "Chordata"
#' getSelectedTaxonNames(inputTaxonIDs, rank, higherRank, NULL, higherName,NULL)

getSelectedTaxonNames <- function(
    inputTaxonIDs = NULL, rank = NULL,
    higherRank = NULL, higherID = NULL, higherName = NULL, taxDB = NULL
) {
    rankName <- NULL
    if (is.null(inputTaxonIDs) | is.null(rank))
        stop("Input taxa and taxonomy rank cannot be NULL!")
    taxDf <- getTaxonomyMatrix(taxDB, TRUE, paste0("ncbi", inputTaxonIDs))
    if (is.null(higherID) & is.null(higherName))
        return(
            data.frame(
                ncbiID = taxDf$ncbiID[
                    taxDf$abbrName %in% paste0("ncbi", inputTaxonIDs)],
                name = taxDf$fullName[
                    taxDf$abbrName %in% paste0("ncbi", inputTaxonIDs)],
                stringsAsFactors = FALSE))
    if (is.null(higherRank)) {
        return(
            data.frame(
                ncbiID = taxDf$ncbiID[
                    taxDf$abbrName %in% paste0("ncbi", inputTaxonIDs)],
                name = taxDf$fullName[
                    taxDf$abbrName %in% paste0("ncbi", inputTaxonIDs)],
                stringsAsFactors = FALSE))
    } else {
        if (!is.null(higherName) & is.null(higherID)) {
            taxaList <- getNameList(taxDB)
            superID <- taxaList$ncbiID[
                taxaList$fullName == higherName
                & taxaList$rank %in% c(higherRank, "norank")]
            customizedtaxaID <- levels(
                as.factor(taxDf[rank][taxDf[higherRank] == superID, ]))
            return(
                data.frame(
                    ncbiID = taxaList$ncbiID[
                        taxaList$rank %in% c(rank, "norank")
                        & taxaList$ncbiID %in% customizedtaxaID],
                    name = taxaList$fullName[
                        taxaList$rank %in% c(rank, "norank")
                        & taxaList$ncbiID %in% customizedtaxaID],
                    stringsAsFactors = FALSE))
        } else if (!is.null(higherID)) {
            return(
                data.frame(
                    ncbiID = taxDf$ncbiID[taxDf[,higherRank] == higherID],
                    name = taxDf$fullName[taxDf[,higherRank] == higherID],
                    stringsAsFactors = FALSE))
        }
    }
}

#' Sort list of (super)taxa based on a selected reference (super)taxon
#' @usage sortInputTaxa(taxonIDs = NULL, rankName, refTaxon = NULL,
#'     taxaTree = NULL, sortedTaxonList = NULL, taxDB = NULL)
#' @param taxonIDs list of taxon IDs (e.g.: ncbi1234, ncbi9999, ...). Default =
#' NULL
#' @param rankName working taxonomy rank (e.g. "species", "phylum",...)
#' @param refTaxon selected reference taxon. Default = NULL
#' @param taxaTree taxonomy tree for the input taxa (optional). Default = NULL
#' @param sortedTaxonList list of sorted taxa (optional). Default = NULL
#' @param taxDB Path to the taxonomy DB files
#' @return A taxonomy matrix for the input taxa ordered by the selected
#' reference taxon. This matrix is sorted either based on the NCBI taxonomy
#' info, or based on an user-defined taxonomy tree (if provided).
#' @importFrom ape read.tree root
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @export
#' @seealso \code{\link{getNameList}}, \code{\link{getTaxonomyMatrix}},
#' \code{\link{createUnrootedTree}}, \code{\link{sortTaxaFromTree}},
#' \code{\link{getInputTaxaName}}, \code{\link{getInputTaxaID}},
#' \code{\link{createLongMatrix}}
#' @examples
#' taxonIDs <- c(
#'     "ncbi10116", "ncbi123851", "ncbi3702", "ncbi13616", "ncbi9606"
#' )
#' sortInputTaxa(taxonIDs, "species", "Homo sapiens", NULL, NULL)

sortInputTaxa <- function(
    taxonIDs = NULL, rankName, refTaxon = NULL, taxaTree = NULL,
    sortedTaxonList = NULL, taxDB = NULL
){
    ncbiID <- fullName <- abbrName <- NULL
    if (missing(rankName)) return("No taxonomy rank name given!")
    allMainRanks <- getTaxonomyRanks()
    if (!(rankName[1] %in% allMainRanks)) return("Invalid taxonomy rank given!")
    # get list of taxon names
    fullnameList <- getNameList(taxDB)
    taxonNames <- getInputTaxaName(rankName, taxonIDs, taxDB)
    if (is.null(refTaxon))  refTaxon <- taxonNames$fullName[1]
    # get selected supertaxon ID(s)
    rankNameTMP <- taxonNames$rank[taxonNames$fullName == refTaxon]
    if (rankName == "strain") {
        superID <- fullnameList$ncbiID[fullnameList$fullName == refTaxon]
    } else
        superID <- fullnameList$ncbiID[
            fullnameList$fullName == refTaxon
            & fullnameList$rank == rankNameTMP[1]]
    # get full taxonomy data & representative taxon
    Dt <- getTaxonomyMatrix(taxDB)
    repTaxon <- Dt[Dt[, rankName] == superID, ][1, ]
    # THEN, SORT TAXON LIST BASED ON TAXONOMY TREE or SORTED TAXON LIST
    if (is.null(sortedTaxonList)) {
        if (is.null(taxaTree)) {
            if (is.null(taxDB)) {
                preCalcTreeFile <- paste(
                    system.file(package = "PhyloProfile"),
                    "PhyloProfile/data/preCalcTree.nw", sep = "/"
                )
            } else preCalcTreeFile <- paste(taxDB, "preCalcTree.nw", sep = "/")

            if (file.exists(preCalcTreeFile)) {
                preTree <- ape::read.tree(preCalcTreeFile)
            } else preTree <- createUnrootedTree(Dt)

            if (!(repTaxon$abbrName %in% preTree$tip.label))
                message(c(repTaxon$abbrName, " not found in ", preCalcTreeFile))
            taxaTree <- ape::root(
                preTree, outgroup=as.character(repTaxon$abbrName),
                resolve.root=TRUE
            )
        } else
            taxaTree <- ape::root(
                taxaTree, outgroup=as.character(repTaxon$abbrName),
                resolve.root=TRUE
            )
        taxonList <- sortTaxaFromTree(taxaTree)
    } else {
        taxonList <- sortedTaxonList
    }
    sortedDt <- Dt[match(taxonList, Dt$abbrName), ]
    # subset to get list of input taxa only
    sortedDt <- subset(sortedDt, abbrName %in% taxonIDs)
    # get only taxonIDs list of selected rank and rename columns
    sortedOut <- subset(
        sortedDt,
        select = c("abbrName", "ncbiID", "fullName", as.character(rankName)))
    colnames(sortedOut) <- c("abbrName", "species", "fullName", "ncbiID")
    # add name of supertaxa into sortedOut list
    sortedOut <- merge(
        sortedOut, fullnameList, by = "ncbiID", all.x = TRUE, sort = FALSE)
    sortedOut$species <- paste0("ncbi", sortedOut$species)
    ## create new column for sorted supertaxon
    indexSpec <- unlist(lapply(
        seq_len(nlevels(as.factor(sortedOut$fullName.y))),function (x) 100000+x)
    )
    indexSpecDf <- data.frame(
        fullName.y = unique(as.character(sortedOut$fullName.y)),
        sortedSupertaxon = paste0(
            indexSpec, "_", unique(as.character(sortedOut$fullName.y))
        ), stringsAsFactors = FALSE)
    sortedOut <- merge(indexSpecDf, sortedOut, by = "fullName.y")
    # final sorted supertaxa list
    sortedOut$taxonID <- 0
    sortedOut$category <- "cat"
    sortedOut <- sortedOut[, c(
        "abbrName", "taxonID", "fullName.x", "species", "ncbiID",
        "sortedSupertaxon", "rank", "category")]
    colnames(sortedOut) <- c(
        "abbrName", "taxonID", "fullName", "ncbiID", "supertaxonID",
        "supertaxon", "rank", "category")
    sortedOut$ncbiID <- as.factor(sortedOut$ncbiID)
    sortedOut$supertaxon <- as.factor(sortedOut$supertaxon)
    sortedOut$category <- as.factor(sortedOut$category)
    return(sortedOut)
}

#' Calculate percentage of present species in each super taxon
#' @export
#' @param profileWithTax data frame of main PhyloProfile input together
#' with their taxonomy info (see ?profileWithTaxonomy)
#' @param taxaCount number of species occur in each supertaxon (e.g. phylum
#' or kingdom)
#' @return A data frame with % of present species in each supertaxon
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @importFrom dplyr count select distinct summarise group_by inner_join mutate
#' filter
#' @seealso \code{\link{profileWithTaxonomy}} for a demo input data
#' @examples
#' # NOTE: for internal testing only
#' library(dplyr)
#' data("profileWithTaxonomy", package="PhyloProfile")
#' taxaCount <- profileWithTaxonomy %>% dplyr::count(supertaxon)
#' taxaCount$n <- 1
#' calcPresSpec(profileWithTaxonomy, taxaCount)

calcPresSpec <- function(profileWithTax, taxaCount) {
    if (missing(profileWithTax)) stop("No input data given")
    if (missing(taxaCount)) stop("No supertaxon count given")
    geneID <- supertaxon <- paralog <- abbrName <- NULL
    presSpec <- presentTaxa <- totalTaxa <- NULL
    # Check if  working with the lowest taxonomy rank
    if (all(profileWithTax$numberSpec == 1)) {
        profileWithTax <- profileWithTax %>% select(geneID, supertaxon) %>%
            mutate(presentTaxa = 1, presSpec = as.double(1), totalTaxa = 1) %>%
            arrange(geneID, supertaxon) %>% distinct()
        return(profileWithTax)
    }
    # Filter out rows with missing orthoID to reduce subsequent data size
    profileWithTax <- profileWithTax[
        !is.na(profileWithTax$orthoID) & profileWithTax$orthoID != "NA",
    ]
    # Pre-calculate geneID and supertaxon combinations for merging later
    geneIDSupertaxon <- profileWithTax %>%
        select(geneID, supertaxon, paralog, abbrName) %>% distinct()
    # Use summarise directly instead of count for better control
    geneSupertaxonCount <- geneIDSupertaxon %>% group_by(geneID, supertaxon) %>%
        summarise(presentTaxa = n(), .groups = "drop")
    # Perform the join with taxaCount and calculate presSpec efficiently
    presSpecDt <- geneSupertaxonCount %>%
        inner_join(taxaCount, by = "supertaxon") %>%
        mutate(presSpec = presentTaxa / n, totalTaxa = n) %>%
        select(-n) %>% filter(presSpec <= 1)
    # Merge with geneIDSupertaxon for missing combinations and fill defaults
    finalPresSpecDt <- geneIDSupertaxon %>%
        left_join(presSpecDt, by = c("geneID", "supertaxon")) %>%
        mutate(
            presSpec = ifelse(is.na(presSpec), 0, presSpec),
            presentTaxa = ifelse(is.na(presentTaxa), 0, presentTaxa),
            totalTaxa = ifelse(is.na(totalTaxa), 0, totalTaxa)
        ) %>% select(-c(abbrName, paralog))
    # Remove duplicates and sort results
    finalPresSpecDt <- finalPresSpecDt %>% arrange(geneID, supertaxon) %>%
        distinct()
    return(finalPresSpecDt)
}

#' Parsing info for phylogenetic profiles
#' @description Creating main dataframe for the input phylogenetic profiles
#' based on selected input taxonomy level (e.g. strain, species) and reference
#' taxon. The output contains the number of paralogs, the max/min/mean/median
#' of VAR1 and VAR2.
#' @usage parseInfoProfile(inputDf, sortedInputTaxa, taxaCount, coorthoCOMax)
#' @param inputDf input profiles in long format
#' @param sortedInputTaxa sorted taxonomy data for the input taxa
#' (check sortInputTaxa())
#' @param taxaCount dataframe counting present taxa in each supertaxon
#' @param coorthoCOMax maximum number of co-orthologs allowed
#' @return A dataframe contains all info for the input phylogenetic profiles.
#' This full processed profile that is required for several profiling analyses
#' e.g. estimation of gene age (?estimateGeneAge) or identification of core gene
#' (?getCoreGene).
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @importFrom dplyr count
#' @export
#' @seealso \code{\link{createLongMatrix}}, \code{\link{sortInputTaxa}},
#' \code{\link{calcPresSpec}}, \code{\link{mainLongRaw}}
#' @examples
#' library(dplyr)
#' data("mainLongRaw", package="PhyloProfile")
#' taxonIDs <- getInputTaxaID(mainLongRaw)
#' sortedInputTaxa <- sortInputTaxa(
#'     taxonIDs, "class", "Mammalia", NULL, NULL
#' )
#' taxaCount <- sortedInputTaxa %>% dplyr::group_by(supertaxon) %>%
#'     summarise(n = n(), .groups = "drop")
#' coorthoCOMax <- 999
#' parseInfoProfile(
#'     mainLongRaw, sortedInputTaxa, taxaCount, coorthoCOMax
#' )

parseInfoProfile <- function(
        inputDf, sortedInputTaxa, taxaCount, coorthoCOMax = 9999
) {
    if (is.null(inputDf) | is.null(sortedInputTaxa))
        stop("Input profiles and sorted taxonomy data cannot be NULL!")
    geneID <- ncbiID <- paralog <- geneName <- NULL
    if (ncol(inputDf) > 3) {
        if (ncol(inputDf) < 5) {colnames(inputDf)[4] <- "var1"}
        else if (ncol(inputDf) < 6) {
            colnames(inputDf)[c(4,5)] <- c("var1", "var2")
        }
    }
    if (coorthoCOMax < 1) coorthoCOMax <- 1
    # count number of inparalogs & calculate frequency of all supertaxa
    inputDf <- inputDf %>%
        dplyr::group_by(geneID, ncbiID) %>%
        dplyr::mutate(paralog = dplyr::n()) %>%
        dplyr::ungroup()
    # filter by number of coorthologs
    inputDf <- inputDf %>% filter(!(paralog > coorthoCOMax))
    # add tax info from sortedInputTaxa and number of species from taxaCount
    fullMdData <- inputDf %>%
        left_join(sortedInputTaxa, by = "ncbiID") %>%
        left_join(taxaCount, by = "supertaxon") %>%
        distinct()
    fullMdData$fullName <- as.character(fullMdData$fullName)
    # add geneName column if not yet exist
    if (!("geneName" %in% colnames(fullMdData)))
        fullMdData$geneName <- fullMdData$geneID
    # Sort `geneName` based on the order of `geneID`
    # Extract unique gene names for each geneID
    orderedName <- fullMdData %>%
        dplyr::group_by(geneID) %>%
        dplyr::reframe(geneName = unique(as.character(geneName))) %>%
        dplyr::pull(geneName)
    # Update `geneName` as a factor with the ordered levels
    fullMdData$geneName <- factor(fullMdData$geneName, levels = orderedName)
    # Rename column "n" to "numberSpec"
    fullMdData <- fullMdData %>% dplyr::rename(numberSpec = n)
    return(fullMdData)
}


#' Filter phylogentic profiles
#' @description Create a filtered data needed for plotting or clustering
#' phylogenetic profiles. NOTE: this function require some intermediate steps
#' using the results from other functions. If you would like to get a full
#' processed data from the raw input, please use the function
#' fromInputToProfile() instead!
#' @usage filterProfileData(DF, taxaCount, refTaxon = NULL,
#'     percentCO = c(0, 1), coorthoCOMax = 9999,
#'     var1CO  = c(0, 1), var2CO = c(0, 1), var1Rel = "protein",
#'     var2Rel = "protein", groupByCat = FALSE, catDt = NULL,
#'     var1AggregateBy = "max", var2AggregateBy = "max")
#' @param DF a reduced dataframe contains info for all phylogenetic
#' profiles in the selected taxonomy rank.
#' @param taxaCount dataframe counting present taxa in each supertaxon
#' @param refTaxon selected reference taxon. NOTE: This taxon will not be
#' affected by the filtering. If you want to filter all, set refTaxon <- NULL.
#' Default = NULL.
#' @param percentCO min and max cutoffs for percentage of species present
#' in a supertaxon. Default = c(0, 1).
#' @param coorthoCOMax maximum number of co-orthologs allowed. Default =
#' 9999.
#' @param var1CO min and max cutoffs for var1. Default = c(0, 1).
#' @param var2CO min anc max cutoffs for var2. Default = c(0, 1).
#' @param var1Rel relation of var1 ("protein" for protein-protein or
#' "species" for protein-species). Default = "protein".
#' @param var2Rel relation of var2 ("protein" for protein-protein or
#' "species" for protein-species). Default = "protein".
#' @param groupByCat group genes by their categories (TRUE or FALSE). Default =
#' FALSE.
#' @param catDt dataframe contains gene categories
#' (optional, NULL if groupByCat = FALSE or no info provided). Default = NULL.
#' @param var1AggregateBy aggregate method for VAR1 (max, min, mean
#' or median), applied for calculating var1 of supertaxa. Default = "max".
#' @param var2AggregateBy aggregate method for VAR2 (max, min, mean
#' or median), applied for calculating var2 of supertaxa. Default = "max".
#' @return A filtered dataframe for generating profile plot including seed gene
#' IDs (or orthologous group IDs), their ortholog IDs and the corresponding
#' (super)taxa, (super)taxon IDs, number of co-orthologs in each (super)taxon,
#' values for two additional variables var1, var2, % of species present in each
#' supertaxon, and the categories of seed genes (or ortholog groups).
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @importFrom stats complete.cases aggregate
#' @export
#' @seealso \code{\link{parseInfoProfile}} and \code{\link{reduceProfile}}
#' for generating input dataframe, \code{\link{fullProcessedProfile}} for a
#' demo full processed profile dataframe, \code{\link{fromInputToProfile}} for
#' generating fully processed data from raw input.
#' @examples
#' # NOTE: this function require some intermediate steps using the results from
#' # other functions. If you would like to get a full processed data from the
#' # raw input, please use the function fromInputToProfile() instead!
#' library(dplyr)
#' data("fullProcessedProfile", package="PhyloProfile")
#' rankName <- "class"
#' refTaxon <- "Mammalia"
#' percentCutoff <- c(0.0, 1.0)
#' coorthologCutoffMax <- 10
#' var1Cutoff <- c(0.75, 1.0)
#' var2Cutoff <- c(0.5, 1.0)
#' var1Relation <- "protein"
#' var2Relation <- "species"
#' groupByCat <- FALSE
#' catDt <- NULL
#' var1AggregateBy <- "max"
#' var2AggregateBy <- "max"
#' taxonIDs <- levels(as.factor(fullProcessedProfile$ncbiID))
#' sortedInputTaxa <- sortInputTaxa(
#'     taxonIDs, rankName, refTaxon, NULL, NULL
#' )
#' taxaCount <- sortedInputTaxa %>% dplyr::group_by(supertaxon) %>%
#'     summarise(n = n(), .groups = "drop")
#' filterProfileData(
#'     fullProcessedProfile,
#'     taxaCount,
#'     refTaxon,
#'     percentCutoff,
#'     coorthologCutoffMax,
#'     var1Cutoff,
#'     var2Cutoff,
#'     var1Relation,
#'     var2Relation,
#'     groupByCat,
#'     catDt,
#'     var1AggregateBy,
#'     var2AggregateBy
#' )

filterProfileData <- function(
    DF, taxaCount, refTaxon = NULL, percentCO = c(0, 1), coorthoCOMax = 9999,
    var1CO = c(0, 1), var2CO = c(0, 1), var1Rel = "protein",
    var2Rel = "protein", groupByCat = FALSE, catDt = NULL,
    var1AggregateBy = "max", var2AggregateBy = "max"
) {
    if (is.null(DF)) stop("Profile data cannot be NULL!")
    if (is.null(refTaxon)) refTaxon <- "NA"
    geneID <- supertaxon <- value <- NULL
    flag <- ifelse(all(DF$numberSpec == 1), 0, 1)

    DF$taxonMod <- sub("^[[:digit:]]*_", "", DF$supertaxon)
    filter_var <- function(var, varCO, rel, refTaxon, taxonMod) {
        if (rel == "protein") {
            var[taxonMod != refTaxon & (var < varCO[1] | var > varCO[2])] <- NA
        } else {
            var[taxonMod != refTaxon & (var < varCO[1] | var > varCO[2])] <- 0
        }
        return(var)
    }

    if (flag == 0) {
        DF$var1 <- filter_var(DF$var1, var1CO, var1Rel, refTaxon, DF$taxonMod)
        DF$var2 <- filter_var(DF$var2, var2CO, var2Rel, refTaxon, DF$taxonMod)
    } else {
        DF$var1 <- filter_var(DF$var1, var1CO, var1Rel, NA, NA)
        DF$var2 <- filter_var(DF$var2, var2CO, var2Rel, NA, NA)
    }

    DFtmp <- DF[stats::complete.cases(DF), ]
    finalPresSpecDt <- calcPresSpec(DFtmp, taxaCount)
    DF <- DF %>% left_join(finalPresSpecDt, by = c("geneID", "supertaxon"))
    DF$presSpec <- ifelse(
        DF$presSpec < percentCO[1] | DF$presSpec > percentCO[2], 0, DF$presSpec
    )

    # Convert orthoID to character and set to NA where presSpec == 0
    DF$orthoID <- ifelse(DF$presSpec == 0, NA, as.character(DF$orthoID))
    # Conditionally update var1 and var2 based on orthoID being NA
    if (var1Rel == "protein") DF$var1[is.na(DF$orthoID)] <- NA
    if (var2Rel == "protein") DF$var2[is.na(DF$orthoID)] <- NA
    # Set presSpec to 0 where orthoID is NA
    DF$presSpec[is.na(DF$orthoID)] <- 0
    # Set paralog to 1 if flag == 1
    if (flag == 1) DF$paralog <- 1
    # Dropping unused levels and ensuring geneID and supertaxon are factors
    DF <- DF %>% droplevels() %>% mutate(
        geneID = factor(geneID), supertaxon = factor(supertaxon)
    )

    aggregate_scores <- function(df, var, aggBy) {
        if (nrow(df) == 0) {
            # Return an empty data frame
            return(data.frame(
                supertaxon=character(0), geneID=character(0), value=numeric(0)
            ))
        }
        aggregated <- stats::aggregate(
            df[[var]], by = list(df$supertaxon, df$geneID), FUN = aggBy
        )
        colnames(aggregated) <- c("supertaxon", "geneID", "value")
        return(aggregated)
    }

    mVar1Dt <- aggregate_scores(
        DF[!is.na(DF$var1), ], "var1", var1AggregateBy
    ) %>% dplyr::rename(mVar1 = value)

    mVar2Dt <- aggregate_scores(DF[!is.na(DF$var2), ], "var2", var2AggregateBy)
    if (nrow(mVar2Dt) == 0) {
        mVar2Dt <- unique(DF[, c("supertaxon", "geneID")]) %>%
            dplyr::mutate(mVar2 = 0)
    } else {
        mVar2Dt <- mVar2Dt %>% dplyr::rename(mVar2 = value)
    }

    scoreDf <- dplyr::full_join(mVar1Dt, mVar2Dt, by = c("supertaxon","geneID"))
    DF <- dplyr::left_join(DF, scoreDf, by = c("geneID", "supertaxon"))

    if (groupByCat) {
        if (is.null(catDt)) {
            catDt <- data.frame(geneID = levels(DF$geneID), group="noCategory")
        }
        dfCat <- merge(
            expand.grid(
                supertaxon = levels(DF$supertaxon), geneID = levels(DF$geneID)
            ), catDt, by = "geneID", all.x = TRUE
        )
        DF <- dplyr::left_join(dfCat, DF, by = c("geneID", "supertaxon"))
        DF$category <- DF$group
    }

    return(DF)
}

#' Reduce the filtered profile data into supertaxon level
#' @description Reduce data of the processed phylogenetic profiles from input
#' taxonomy rank into supertaxon level (e.g. from species to phylum)
#' @param filteredProfile dataframe contains the filtered profiles (see
#' ?parseInfoProfile, ?filterProfileData and ?filteredProfile)
#' @return A reduced dataframe contains only profile data for the selected
#' supertaxon rank. This dataframe contains only supertaxa and their value
#' (mVar1 & mVar2) for each gene.
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @export
#' @seealso \code{\link{parseInfoProfile}} for creating a full processed
#' profile dataframe, \code{\link{filterProfileData}} for filter processed
#' profile and \code{\link{filteredProfile}} for a demo filtered
#' profile dataframe
#' @examples
#' data("filteredProfile", package="PhyloProfile")
#' reduceProfile(filteredProfile)

reduceProfile <- function(filteredProfile) {
    if (is.null(filteredProfile)) stop("Profile data cannot be NULL!")
    # check if working with the lowest taxonomy rank; 1 for NO; 0 for YES
    flag <- 1
    if (length(unique(levels(as.factor(filteredProfile$numberSpec)))) == 1) {
        if (unique(levels(as.factor(filteredProfile$numberSpec))) == 1) {
            superDfExt <- filteredProfile[, c(
                "geneID", "supertaxon", "supertaxonID", "orthoID", "category",
                "presSpec", "paralog", "mVar1", "mVar2", "geneName"
            )]
            superDfExt$presentTaxa <- 1
            superDfExt$totalTaxa <- 1
            # rename mVar to var
            names(superDfExt)[names(superDfExt) == "mVar1"] <- "var1"
            names(superDfExt)[names(superDfExt) == "mVar2"] <- "var2"
            flag <- 0
        }
    }
    if (flag == 1) {
        # get representative orthoID that has m VAR1 for each supertaxon
        mOrthoID <- filteredProfile[, c(
            "geneID", "supertaxon", "var1", "mVar1", "orthoID", "presSpec",
            "geneName"
        )]
        mOrthoID$sVar1 <- mOrthoID$mVar1 - mOrthoID$var1
        # mOrthoID <- subset(mOrthoID, mOrthoID$var1 == mOrthoID$mVar1)
        mOrthoID <- subset(mOrthoID, mOrthoID$sVar1 <= 0)
        colnames(mOrthoID) <- c(
            "geneID","supertaxon","var1","mVar1","orthoID","presSpec","sVar1",
            "geneName"
        )
        mOrthoID <- mOrthoID[order(mOrthoID$sVar1, decreasing = TRUE), ]
        mOrthoID <- mOrthoID[!is.na(mOrthoID$orthoID), ]
        mOrthoID <- mOrthoID[, c("geneID", "supertaxon", "orthoID", "presSpec")]
        mOrthoID <- mOrthoID[!duplicated(mOrthoID[, seq_len(2)]), ]
        # get data set for PhyloProfile plotting (contains only supertaxa info)
        superDf <- subset(filteredProfile, select = c(
            "geneID", "supertaxon", "supertaxonID", "mVar1", "category",
            "mVar2", "paralog", "presentTaxa", "totalTaxa", "geneName"
        ))
        superDf <- superDf[!duplicated(superDf), ]
        superDfExt <- merge(
            superDf, mOrthoID, by = c("geneID", "supertaxon"), all.x = TRUE
        )
        superDfExt <- superDfExt[, c(
            "geneID", "supertaxon", "supertaxonID",
            "mVar1", "presSpec", "category", "orthoID", "mVar2", "paralog",
            "presentTaxa", "totalTaxa", "geneName"
        )]
        # rename mVar to var
        names(superDfExt)[names(superDfExt) == "mVar1"] <- "var1"
        names(superDfExt)[names(superDfExt) == "mVar2"] <- "var2"
    }
    superDfExt$orthoID <- as.character(superDfExt$orthoID)
    return(superDfExt)
}

#' Complete processing of raw input phylogenetic profiles
#' @description Create a processed and filtered data for plotting or analysing
#' phylogenetic profiles from raw input file (from raw input to final filtered
#' dataframe)
#' @usage fromInputToProfile(rawInput, rankName, refTaxon = NULL,
#'     taxaTree = NULL, sortedTaxonList = NULL, var1AggregateBy = "max",
#'     var2AggregateBy = "max", percentCutoff = c(0, 1),
#'     coorthologCutoffMax = 9999, var1Cutoff = c(0, 1), var2Cutoff = c(0, 1),
#'     var1Relation = "protein", var2Relation = "protein", groupByCat = FALSE,
#'     catDt = NULL, taxDB = NULL)
#' @param rawInput input file (in long, wide, multi-fasta or orthoxml format)
#' @param rankName taxonomy rank (e.g. "species","phylum",...)
#' @param refTaxon selected reference taxon name (used for sorting and will be
#' protected from filtering). Default = NULL.
#' @param taxaTree input taxonomy tree for taxa in input profiles (optional).
#' Default = NULL.
#' @param sortedTaxonList list of sorted taxa (optional). Default = NULL.
#' @param var1AggregateBy aggregate method for var1 (min, max, mean or median).
#' Default = "max".
#' @param var2AggregateBy aggregate method for VAR2 (min, max, mean or median).
#' Default = "max".
#' @param percentCutoff min and max cutoffs for percentage of species present
#' in a supertaxon. Default = c(0, 1).
#' @param coorthologCutoffMax maximum number of co-orthologs allowed. Default =
#' 9999.
#' @param var1Cutoff min and max cutoffs for var1. Default = c(0, 1).
#' @param var2Cutoff min and max cutoffs for var2. Default = c(0, 1).
#' @param var1Relation relation of var1 ("protein" for protein-protein or
#' "species" for protein-species). Default = "protein".
#' @param var2Relation relation of var2 ("protein" for protein-protein or
#' "species" for protein-species). Default = "protein".
#' @param groupByCat group genes by their categories (TRUE or FALSE). Default =
#' FALSE.
#' @param catDt dataframe contains gene categories. Default = NULL
#' @param taxDB Path to the taxonomy DB files
#' @return Dataframe required for generating phylogenetic profile plot or
#' clustering analysis. It contains seed gene IDs (or orthologous group IDs),
#' their ortholog IDs and the corresponding (super)taxa, (super)taxon IDs,
#' number of co-orthologs in each (super)taxon, values for two additional
#' variables var1, var2, % of species present in each supertaxon, and the
#' categories of seed genes (or ortholog groups).
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @importFrom dplyr count
#' @export
#' @seealso \code{\link{createLongMatrix}}, \code{\link{getInputTaxaID}},
#' \code{\link{getInputTaxaName}}, \code{\link{sortInputTaxa}},
#' \code{\link{parseInfoProfile}}, \code{\link{reduceProfile}},
#' \code{\link{filterProfileData}}
#' @examples
#' rawInput <- system.file(
#'     "extdata", "test.main.long", package = "PhyloProfile", mustWork = TRUE
#' )
#' rankName <- "class"
#' refTaxon <- "Mammalia"
#' taxaTree <- NULL
#' sortedTaxonList <- NULL
#' var1AggregateBy <- "max"
#' var2AggregateBy <- "mean"
#' percentCutoff <- c(0.0, 1.0)
#' coorthologCutoffMax <- 10
#' var1Cutoff <- c(0.75, 1.0)
#' var2Cutoff <- c(0.5, 1.0)
#' var1Relation <- "protein"
#' var2Relation <- "species"
#' groupByCat <- FALSE
#' catDt <- NULL
#' fromInputToProfile(
#'     rawInput,
#'     rankName,
#'     refTaxon,
#'     taxaTree,
#'     sortedTaxonList,
#'     var1AggregateBy,
#'     var2AggregateBy,
#'     percentCutoff,
#'     coorthologCutoffMax,
#'     var1Cutoff,
#'     var2Cutoff,
#'     var1Relation,
#'     var2Relation,
#'     groupByCat,
#'     catDt
#' )

fromInputToProfile <- function(
    rawInput,
    rankName,
    refTaxon = NULL,
    taxaTree = NULL,
    sortedTaxonList = NULL,
    var1AggregateBy = "max",
    var2AggregateBy = "max",
    percentCutoff = c(0, 1),
    coorthologCutoffMax = 9999,
    var1Cutoff = c(0, 1),
    var2Cutoff = c(0, 1),
    var1Relation = "protein",
    var2Relation = "protein",
    groupByCat = FALSE,
    catDt = NULL,
    taxDB = NULL
) {
    if (missing(rawInput) | missing(rankName)) return("Missing input")
    if (is.null(rawInput) | is.null(rankName)) return("Missing input")
    supertaxon <- NULL
    # convert raw input into long format
    inputDf <- createLongMatrix(rawInput)
    # get input taxon IDs and names
    inputTaxonID <- getInputTaxaID(inputDf)
    # sort input taxa based on selected reference taxon or input taxonomy tree
    sortedInputTaxa <- sortInputTaxa(
        inputTaxonID, rankName, refTaxon, taxaTree, sortedTaxonList, taxDB
    )
    # count present taxa in each supertaxon
    taxaCount <- sortedInputTaxa %>% dplyr::group_by(supertaxon) %>%
        summarise(n = n(), .groups = "drop")
    # parse info (additional values...) into profile df
    fullMdData <- parseInfoProfile(inputDf, sortedInputTaxa, taxaCount)
    # filter profile
    filteredDf <- filterProfileData(
        fullMdData,
        taxaCount,
        refTaxon,
        percentCutoff,
        coorthologCutoffMax,
        var1Cutoff,
        var2Cutoff,
        var1Relation,
        var2Relation,
        groupByCat = FALSE,
        catDt = NULL,
        var1AggregateBy, var2AggregateBy
    )
    # reduce profile df into supertaxon level
    dataHeat <- reduceProfile(filteredDf)
    return(dataHeat)
}
BIONF/PhyloProfile documentation built on Feb. 11, 2025, 7:53 p.m.