# Functions for parsing and pre-processing PhyloProfile input
#' Create a list containing all main taxanomy ranks
#' @export
#' @return A list of all main ranks (from strain to superkingdom)
#' @author Carla Mölbert (carla.moelbert@gmx.de)
#' @examples
#' getTaxonomyRanks()
getTaxonomyRanks <- function(){
allRanks <- list(
"Strain " = "strain",
"Species" = "species",
"Genus" = "genus",
"Family" = "family",
"Order" = "order",
"Class" = "class",
"Phylum" = "phylum",
"Kingdom" = "kingdom",
"Superkingdom" = "superkingdom",
"unselected" = ""
)
return(allRanks)
}
#' Get list of pre-installed NCBI taxon names
#' @description Get all NCBI taxon names from
#' "PhyloProfile/data/taxonNamesReduced.txt"
#' @export
#' @param taxDB Path to the taxonomy DB files
#' @return List of taxon IDs, their full names, taxonomy ranks and parent IDs
#' obtained from "PhyloProfile/data/taxonNamesReduced.txt"
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @examples
#' getNameList()
getNameList <- function(taxDB = NULL) {
if (is.null(taxDB)) {
nameReducedFile <- paste(
system.file(package = "PhyloProfile"),
"PhyloProfile/data/taxonNamesReduced.txt", sep = "/"
)
} else nameReducedFile <- paste(taxDB, "taxonNamesReduced.txt", sep = "/")
if (!file.exists(nameReducedFile)) {
utils::data(taxonNamesReduced)
} else {
taxonNamesReduced <- data.table::fread(
nameReducedFile, sep = "\t", header = TRUE, fill = TRUE,
showProgress = FALSE
)
}
taxonNamesReduced$fullName <- as.character(taxonNamesReduced$fullName)
taxonNamesReduced$rank <- as.character(taxonNamesReduced$rank)
taxonNamesReduced <- unique(taxonNamesReduced)
return(taxonNamesReduced)
}
#' Get taxonomy matrix
#' @description Get the (full or subset) taxonomy matrix from
#' "data/taxonomyMatrix.txt" based on an input taxon list
#' @export
#' @param taxDB Path to the taxonomy DB files
#' @param subsetTaxaCheck TRUE/FALSE subset taxonomy matrix based on input taxon
#' IDs. Default = FALSE
#' @param taxonIDs list of input taxon IDs (e.g. ncbi1234). Default = NULL
#' @return Data frame contains the (subset of) taxonomy matrix for list of
#' input taxa.
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @examples
#' # get full pre-installed taxonomy matrix
#' getTaxonomyMatrix()
#' # get taxonomy matrix for a list of taxon IDs
#' taxonIDs <- c("ncbi9606", "ncbi10116")
#' getTaxonomyMatrix(NULL, TRUE, taxonIDs)
getTaxonomyMatrix <- function(
taxDB = NULL, subsetTaxaCheck = FALSE, taxonIDs = NULL
){
taxonomyMatrixFile <- if (is.null(taxDB)) {
file.path(
system.file(package = "PhyloProfile"),
"PhyloProfile/data/taxonomyMatrix.txt"
)
} else {
file.path(taxDB, "taxonomyMatrix.txt")
}
if (!file.exists(taxonomyMatrixFile)) {
utils::data(taxonomyMatrix)
} else {
taxonomyMatrix <- data.table::fread(
taxonomyMatrixFile, sep = "\t", header = TRUE,
stringsAsFactors = TRUE, showProgress = FALSE
)
}
if (subsetTaxaCheck) {
if (!missing(taxonIDs)) {
taxonomyMatrix <- taxonomyMatrix[
taxonomyMatrix$abbrName %in% taxonIDs,]
}
}
return(data.frame(taxonomyMatrix))
}
#' Get ID list of input taxa from the main input
#' @param rawProfile A dataframe of input phylogenetic profile in long format
#' @return List of all input taxon IDs (e.g. ncbi1234). Default = NULL.
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @export
#' @seealso \code{\link{createLongMatrix}}, \code{\link{mainLongRaw}}
#' @examples
#' data("mainLongRaw", package="PhyloProfile")
#' getInputTaxaID(mainLongRaw)
getInputTaxaID <- function(rawProfile = NULL){
if (is.null(rawProfile)) stop("Input profile data cannot be NULL!")
inputTaxa <- unique(rawProfile$ncbiID)
inputTaxa <- inputTaxa[inputTaxa != "geneID"]
# return input taxa
return(inputTaxa)
}
#' Get NCBI taxon names for a selected list of taxa
#' @description Get NCBI taxon names from
#' "PhyloProfile/data/taxonNamesReduced.txt" for a list of input taxa
#' @param rankName taxonomy rank (e.g. "species","phylum",...)
#' @param taxonIDs list of taxon IDs (e.g. ncbi1234). Default = NULL
#' @param taxDB Path to the taxonomy DB files
#' @return Data frame contains a list of full names, taxonomy ranks and parent
#' IDs for the input taxa.
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @export
#' @seealso \code{\link{getInputTaxaID}} for getting input taxon IDs,
#' \code{\link{getNameList}} for getting the full taxon name list
#' @examples
#' taxonIDs <- c("ncbi9606", "ncbi10116")
#' getInputTaxaName("species", taxonIDs)
getInputTaxaName <- function(rankName, taxonIDs = NULL, taxDB = NULL){
# check input parameters
if (missing(rankName)) stop("No taxonomy rank name given!")
allMainRanks <- getTaxonomyRanks()
if (!(rankName[1] %in% allMainRanks)) stop("Invalid taxonomy rank given!")
# load list of unsorted taxa
taxMatrix <- getTaxonomyMatrix(
taxDB, subsetTaxaCheck = TRUE, taxonIDs = taxonIDs
)
# load list of taxon name
nameList <- getNameList(taxDB)
# return
choice <- data.frame(
"ncbiID" = unlist(taxMatrix[[rankName]]), stringsAsFactors = FALSE
)
choice <- merge(choice, nameList, by = "ncbiID", all = FALSE)
return(choice)
}
#' Get a subset of input taxa based on a selected taxonomy rank
#' @description Get a subset of taxon ncbi IDs and names from an input list of
#' taxa based on a selected supertaxon (identified by its taxonomy rank and
#' supertaxon name or supertaxon ID).
#' @usage getSelectedTaxonNames(inputTaxonIDs = NULL, rank = NULL,
#' higherRank = NULL, higherID = NULL, higherName = NULL, taxDB = NULL)
#' @param inputTaxonIDs list of input taxon IDs (e.g. c("10116", "122586"))
#' @param rank taxonomy rank of input taxa (e.g. "species")
#' @param higherRank selected taxonomy rank (e.g. "phylum")
#' @param higherID supertaxon ID (e.g. 7711). NOTE: either supertaxon ID or
#' name is required, not neccessary to give both
#' @param higherName supertaxon name (e.g. "Chordata"). NOTE: either
#' supertaxon ID or name is required, not neccessary to give both
#' @param taxDB Path to the taxonomy DB files
#' @export
#' @return A data frame contains ncbi IDs and names of taxa from the input taxon
#' list that belong to the selected supertaxon.
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @examples
#' inputTaxonIDs <- c("10116", "122586", "123851", "13616", "188937", "189518",
#' "208964", "224129", "224324", "237631", "243230")
#' rank <- "species"
#' higherRank <- "phylum"
#' higherID <- 7711
#' getSelectedTaxonNames(inputTaxonIDs, rank, higherRank, higherID, NULL)
#' higherName <- "Chordata"
#' getSelectedTaxonNames(inputTaxonIDs, rank, higherRank, NULL, higherName,NULL)
getSelectedTaxonNames <- function(
inputTaxonIDs = NULL, rank = NULL,
higherRank = NULL, higherID = NULL, higherName = NULL, taxDB = NULL
) {
rankName <- NULL
if (is.null(inputTaxonIDs) | is.null(rank))
stop("Input taxa and taxonomy rank cannot be NULL!")
taxDf <- getTaxonomyMatrix(taxDB, TRUE, paste0("ncbi", inputTaxonIDs))
if (is.null(higherID) & is.null(higherName))
return(
data.frame(
ncbiID = taxDf$ncbiID[
taxDf$abbrName %in% paste0("ncbi", inputTaxonIDs)],
name = taxDf$fullName[
taxDf$abbrName %in% paste0("ncbi", inputTaxonIDs)],
stringsAsFactors = FALSE))
if (is.null(higherRank)) {
return(
data.frame(
ncbiID = taxDf$ncbiID[
taxDf$abbrName %in% paste0("ncbi", inputTaxonIDs)],
name = taxDf$fullName[
taxDf$abbrName %in% paste0("ncbi", inputTaxonIDs)],
stringsAsFactors = FALSE))
} else {
if (!is.null(higherName) & is.null(higherID)) {
taxaList <- getNameList(taxDB)
superID <- taxaList$ncbiID[
taxaList$fullName == higherName
& taxaList$rank %in% c(higherRank, "norank")]
customizedtaxaID <- levels(
as.factor(taxDf[rank][taxDf[higherRank] == superID, ]))
return(
data.frame(
ncbiID = taxaList$ncbiID[
taxaList$rank %in% c(rank, "norank")
& taxaList$ncbiID %in% customizedtaxaID],
name = taxaList$fullName[
taxaList$rank %in% c(rank, "norank")
& taxaList$ncbiID %in% customizedtaxaID],
stringsAsFactors = FALSE))
} else if (!is.null(higherID)) {
return(
data.frame(
ncbiID = taxDf$ncbiID[taxDf[,higherRank] == higherID],
name = taxDf$fullName[taxDf[,higherRank] == higherID],
stringsAsFactors = FALSE))
}
}
}
#' Sort list of (super)taxa based on a selected reference (super)taxon
#' @usage sortInputTaxa(taxonIDs = NULL, rankName, refTaxon = NULL,
#' taxaTree = NULL, sortedTaxonList = NULL, taxDB = NULL)
#' @param taxonIDs list of taxon IDs (e.g.: ncbi1234, ncbi9999, ...). Default =
#' NULL
#' @param rankName working taxonomy rank (e.g. "species", "phylum",...)
#' @param refTaxon selected reference taxon. Default = NULL
#' @param taxaTree taxonomy tree for the input taxa (optional). Default = NULL
#' @param sortedTaxonList list of sorted taxa (optional). Default = NULL
#' @param taxDB Path to the taxonomy DB files
#' @return A taxonomy matrix for the input taxa ordered by the selected
#' reference taxon. This matrix is sorted either based on the NCBI taxonomy
#' info, or based on an user-defined taxonomy tree (if provided).
#' @importFrom ape read.tree root
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @export
#' @seealso \code{\link{getNameList}}, \code{\link{getTaxonomyMatrix}},
#' \code{\link{createUnrootedTree}}, \code{\link{sortTaxaFromTree}},
#' \code{\link{getInputTaxaName}}, \code{\link{getInputTaxaID}},
#' \code{\link{createLongMatrix}}
#' @examples
#' taxonIDs <- c(
#' "ncbi10116", "ncbi123851", "ncbi3702", "ncbi13616", "ncbi9606"
#' )
#' sortInputTaxa(taxonIDs, "species", "Homo sapiens", NULL, NULL)
sortInputTaxa <- function(
taxonIDs = NULL, rankName, refTaxon = NULL, taxaTree = NULL,
sortedTaxonList = NULL, taxDB = NULL
){
ncbiID <- fullName <- abbrName <- NULL
if (missing(rankName)) return("No taxonomy rank name given!")
allMainRanks <- getTaxonomyRanks()
if (!(rankName[1] %in% allMainRanks)) return("Invalid taxonomy rank given!")
# get list of taxon names
fullnameList <- getNameList(taxDB)
taxonNames <- getInputTaxaName(rankName, taxonIDs, taxDB)
if (is.null(refTaxon)) refTaxon <- taxonNames$fullName[1]
# get selected supertaxon ID(s)
rankNameTMP <- taxonNames$rank[taxonNames$fullName == refTaxon]
if (rankName == "strain") {
superID <- fullnameList$ncbiID[fullnameList$fullName == refTaxon]
} else
superID <- fullnameList$ncbiID[
fullnameList$fullName == refTaxon
& fullnameList$rank == rankNameTMP[1]]
# get full taxonomy data & representative taxon
Dt <- getTaxonomyMatrix(taxDB)
repTaxon <- Dt[Dt[, rankName] == superID, ][1, ]
# THEN, SORT TAXON LIST BASED ON TAXONOMY TREE or SORTED TAXON LIST
if (is.null(sortedTaxonList)) {
if (is.null(taxaTree)) {
if (is.null(taxDB)) {
preCalcTreeFile <- paste(
system.file(package = "PhyloProfile"),
"PhyloProfile/data/preCalcTree.nw", sep = "/"
)
} else preCalcTreeFile <- paste(taxDB, "preCalcTree.nw", sep = "/")
if (file.exists(preCalcTreeFile)) {
preTree <- ape::read.tree(preCalcTreeFile)
} else preTree <- createUnrootedTree(Dt)
if (!(repTaxon$abbrName %in% preTree$tip.label))
message(c(repTaxon$abbrName, " not found in ", preCalcTreeFile))
taxaTree <- ape::root(
preTree, outgroup=as.character(repTaxon$abbrName),
resolve.root=TRUE
)
} else
taxaTree <- ape::root(
taxaTree, outgroup=as.character(repTaxon$abbrName),
resolve.root=TRUE
)
taxonList <- sortTaxaFromTree(taxaTree)
} else {
taxonList <- sortedTaxonList
}
sortedDt <- Dt[match(taxonList, Dt$abbrName), ]
# subset to get list of input taxa only
sortedDt <- subset(sortedDt, abbrName %in% taxonIDs)
# get only taxonIDs list of selected rank and rename columns
sortedOut <- subset(
sortedDt,
select = c("abbrName", "ncbiID", "fullName", as.character(rankName)))
colnames(sortedOut) <- c("abbrName", "species", "fullName", "ncbiID")
# add name of supertaxa into sortedOut list
sortedOut <- merge(
sortedOut, fullnameList, by = "ncbiID", all.x = TRUE, sort = FALSE)
sortedOut$species <- paste0("ncbi", sortedOut$species)
## create new column for sorted supertaxon
indexSpec <- unlist(lapply(
seq_len(nlevels(as.factor(sortedOut$fullName.y))),function (x) 100000+x)
)
indexSpecDf <- data.frame(
fullName.y = unique(as.character(sortedOut$fullName.y)),
sortedSupertaxon = paste0(
indexSpec, "_", unique(as.character(sortedOut$fullName.y))
), stringsAsFactors = FALSE)
sortedOut <- merge(indexSpecDf, sortedOut, by = "fullName.y")
# final sorted supertaxa list
sortedOut$taxonID <- 0
sortedOut$category <- "cat"
sortedOut <- sortedOut[, c(
"abbrName", "taxonID", "fullName.x", "species", "ncbiID",
"sortedSupertaxon", "rank", "category")]
colnames(sortedOut) <- c(
"abbrName", "taxonID", "fullName", "ncbiID", "supertaxonID",
"supertaxon", "rank", "category")
sortedOut$ncbiID <- as.factor(sortedOut$ncbiID)
sortedOut$supertaxon <- as.factor(sortedOut$supertaxon)
sortedOut$category <- as.factor(sortedOut$category)
return(sortedOut)
}
#' Calculate percentage of present species in each super taxon
#' @export
#' @param profileWithTax data frame of main PhyloProfile input together
#' with their taxonomy info (see ?profileWithTaxonomy)
#' @param taxaCount number of species occur in each supertaxon (e.g. phylum
#' or kingdom)
#' @return A data frame with % of present species in each supertaxon
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @importFrom dplyr count select distinct summarise group_by inner_join mutate
#' filter
#' @seealso \code{\link{profileWithTaxonomy}} for a demo input data
#' @examples
#' # NOTE: for internal testing only
#' library(dplyr)
#' data("profileWithTaxonomy", package="PhyloProfile")
#' taxaCount <- profileWithTaxonomy %>% dplyr::count(supertaxon)
#' taxaCount$n <- 1
#' calcPresSpec(profileWithTaxonomy, taxaCount)
calcPresSpec <- function(profileWithTax, taxaCount) {
if (missing(profileWithTax)) stop("No input data given")
if (missing(taxaCount)) stop("No supertaxon count given")
geneID <- supertaxon <- paralog <- abbrName <- NULL
presSpec <- presentTaxa <- totalTaxa <- NULL
# Check if working with the lowest taxonomy rank
if (all(profileWithTax$numberSpec == 1)) {
profileWithTax <- profileWithTax %>% select(geneID, supertaxon) %>%
mutate(presentTaxa = 1, presSpec = as.double(1), totalTaxa = 1) %>%
arrange(geneID, supertaxon) %>% distinct()
return(profileWithTax)
}
# Filter out rows with missing orthoID to reduce subsequent data size
profileWithTax <- profileWithTax[
!is.na(profileWithTax$orthoID) & profileWithTax$orthoID != "NA",
]
# Pre-calculate geneID and supertaxon combinations for merging later
geneIDSupertaxon <- profileWithTax %>%
select(geneID, supertaxon, paralog, abbrName) %>% distinct()
# Use summarise directly instead of count for better control
geneSupertaxonCount <- geneIDSupertaxon %>% group_by(geneID, supertaxon) %>%
summarise(presentTaxa = n(), .groups = "drop")
# Perform the join with taxaCount and calculate presSpec efficiently
presSpecDt <- geneSupertaxonCount %>%
inner_join(taxaCount, by = "supertaxon") %>%
mutate(presSpec = presentTaxa / n, totalTaxa = n) %>%
select(-n) %>% filter(presSpec <= 1)
# Merge with geneIDSupertaxon for missing combinations and fill defaults
finalPresSpecDt <- geneIDSupertaxon %>%
left_join(presSpecDt, by = c("geneID", "supertaxon")) %>%
mutate(
presSpec = ifelse(is.na(presSpec), 0, presSpec),
presentTaxa = ifelse(is.na(presentTaxa), 0, presentTaxa),
totalTaxa = ifelse(is.na(totalTaxa), 0, totalTaxa)
) %>% select(-c(abbrName, paralog))
# Remove duplicates and sort results
finalPresSpecDt <- finalPresSpecDt %>% arrange(geneID, supertaxon) %>%
distinct()
return(finalPresSpecDt)
}
#' Parsing info for phylogenetic profiles
#' @description Creating main dataframe for the input phylogenetic profiles
#' based on selected input taxonomy level (e.g. strain, species) and reference
#' taxon. The output contains the number of paralogs, the max/min/mean/median
#' of VAR1 and VAR2.
#' @usage parseInfoProfile(inputDf, sortedInputTaxa, taxaCount, coorthoCOMax)
#' @param inputDf input profiles in long format
#' @param sortedInputTaxa sorted taxonomy data for the input taxa
#' (check sortInputTaxa())
#' @param taxaCount dataframe counting present taxa in each supertaxon
#' @param coorthoCOMax maximum number of co-orthologs allowed
#' @return A dataframe contains all info for the input phylogenetic profiles.
#' This full processed profile that is required for several profiling analyses
#' e.g. estimation of gene age (?estimateGeneAge) or identification of core gene
#' (?getCoreGene).
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @importFrom dplyr count
#' @export
#' @seealso \code{\link{createLongMatrix}}, \code{\link{sortInputTaxa}},
#' \code{\link{calcPresSpec}}, \code{\link{mainLongRaw}}
#' @examples
#' library(dplyr)
#' data("mainLongRaw", package="PhyloProfile")
#' taxonIDs <- getInputTaxaID(mainLongRaw)
#' sortedInputTaxa <- sortInputTaxa(
#' taxonIDs, "class", "Mammalia", NULL, NULL
#' )
#' taxaCount <- sortedInputTaxa %>% dplyr::group_by(supertaxon) %>%
#' summarise(n = n(), .groups = "drop")
#' coorthoCOMax <- 999
#' parseInfoProfile(
#' mainLongRaw, sortedInputTaxa, taxaCount, coorthoCOMax
#' )
parseInfoProfile <- function(
inputDf, sortedInputTaxa, taxaCount, coorthoCOMax = 9999
) {
if (is.null(inputDf) | is.null(sortedInputTaxa))
stop("Input profiles and sorted taxonomy data cannot be NULL!")
geneID <- ncbiID <- paralog <- geneName <- NULL
if (ncol(inputDf) > 3) {
if (ncol(inputDf) < 5) {colnames(inputDf)[4] <- "var1"}
else if (ncol(inputDf) < 6) {
colnames(inputDf)[c(4,5)] <- c("var1", "var2")
}
}
if (coorthoCOMax < 1) coorthoCOMax <- 1
# count number of inparalogs & calculate frequency of all supertaxa
inputDf <- inputDf %>%
dplyr::group_by(geneID, ncbiID) %>%
dplyr::mutate(paralog = dplyr::n()) %>%
dplyr::ungroup()
# filter by number of coorthologs
inputDf <- inputDf %>% filter(!(paralog > coorthoCOMax))
# add tax info from sortedInputTaxa and number of species from taxaCount
fullMdData <- inputDf %>%
left_join(sortedInputTaxa, by = "ncbiID") %>%
left_join(taxaCount, by = "supertaxon") %>%
distinct()
fullMdData$fullName <- as.character(fullMdData$fullName)
# add geneName column if not yet exist
if (!("geneName" %in% colnames(fullMdData)))
fullMdData$geneName <- fullMdData$geneID
# Sort `geneName` based on the order of `geneID`
# Extract unique gene names for each geneID
orderedName <- fullMdData %>%
dplyr::group_by(geneID) %>%
dplyr::reframe(geneName = unique(as.character(geneName))) %>%
dplyr::pull(geneName)
# Update `geneName` as a factor with the ordered levels
fullMdData$geneName <- factor(fullMdData$geneName, levels = orderedName)
# Rename column "n" to "numberSpec"
fullMdData <- fullMdData %>% dplyr::rename(numberSpec = n)
return(fullMdData)
}
#' Filter phylogentic profiles
#' @description Create a filtered data needed for plotting or clustering
#' phylogenetic profiles. NOTE: this function require some intermediate steps
#' using the results from other functions. If you would like to get a full
#' processed data from the raw input, please use the function
#' fromInputToProfile() instead!
#' @usage filterProfileData(DF, taxaCount, refTaxon = NULL,
#' percentCO = c(0, 1), coorthoCOMax = 9999,
#' var1CO = c(0, 1), var2CO = c(0, 1), var1Rel = "protein",
#' var2Rel = "protein", groupByCat = FALSE, catDt = NULL,
#' var1AggregateBy = "max", var2AggregateBy = "max")
#' @param DF a reduced dataframe contains info for all phylogenetic
#' profiles in the selected taxonomy rank.
#' @param taxaCount dataframe counting present taxa in each supertaxon
#' @param refTaxon selected reference taxon. NOTE: This taxon will not be
#' affected by the filtering. If you want to filter all, set refTaxon <- NULL.
#' Default = NULL.
#' @param percentCO min and max cutoffs for percentage of species present
#' in a supertaxon. Default = c(0, 1).
#' @param coorthoCOMax maximum number of co-orthologs allowed. Default =
#' 9999.
#' @param var1CO min and max cutoffs for var1. Default = c(0, 1).
#' @param var2CO min anc max cutoffs for var2. Default = c(0, 1).
#' @param var1Rel relation of var1 ("protein" for protein-protein or
#' "species" for protein-species). Default = "protein".
#' @param var2Rel relation of var2 ("protein" for protein-protein or
#' "species" for protein-species). Default = "protein".
#' @param groupByCat group genes by their categories (TRUE or FALSE). Default =
#' FALSE.
#' @param catDt dataframe contains gene categories
#' (optional, NULL if groupByCat = FALSE or no info provided). Default = NULL.
#' @param var1AggregateBy aggregate method for VAR1 (max, min, mean
#' or median), applied for calculating var1 of supertaxa. Default = "max".
#' @param var2AggregateBy aggregate method for VAR2 (max, min, mean
#' or median), applied for calculating var2 of supertaxa. Default = "max".
#' @return A filtered dataframe for generating profile plot including seed gene
#' IDs (or orthologous group IDs), their ortholog IDs and the corresponding
#' (super)taxa, (super)taxon IDs, number of co-orthologs in each (super)taxon,
#' values for two additional variables var1, var2, % of species present in each
#' supertaxon, and the categories of seed genes (or ortholog groups).
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @importFrom stats complete.cases aggregate
#' @export
#' @seealso \code{\link{parseInfoProfile}} and \code{\link{reduceProfile}}
#' for generating input dataframe, \code{\link{fullProcessedProfile}} for a
#' demo full processed profile dataframe, \code{\link{fromInputToProfile}} for
#' generating fully processed data from raw input.
#' @examples
#' # NOTE: this function require some intermediate steps using the results from
#' # other functions. If you would like to get a full processed data from the
#' # raw input, please use the function fromInputToProfile() instead!
#' library(dplyr)
#' data("fullProcessedProfile", package="PhyloProfile")
#' rankName <- "class"
#' refTaxon <- "Mammalia"
#' percentCutoff <- c(0.0, 1.0)
#' coorthologCutoffMax <- 10
#' var1Cutoff <- c(0.75, 1.0)
#' var2Cutoff <- c(0.5, 1.0)
#' var1Relation <- "protein"
#' var2Relation <- "species"
#' groupByCat <- FALSE
#' catDt <- NULL
#' var1AggregateBy <- "max"
#' var2AggregateBy <- "max"
#' taxonIDs <- levels(as.factor(fullProcessedProfile$ncbiID))
#' sortedInputTaxa <- sortInputTaxa(
#' taxonIDs, rankName, refTaxon, NULL, NULL
#' )
#' taxaCount <- sortedInputTaxa %>% dplyr::group_by(supertaxon) %>%
#' summarise(n = n(), .groups = "drop")
#' filterProfileData(
#' fullProcessedProfile,
#' taxaCount,
#' refTaxon,
#' percentCutoff,
#' coorthologCutoffMax,
#' var1Cutoff,
#' var2Cutoff,
#' var1Relation,
#' var2Relation,
#' groupByCat,
#' catDt,
#' var1AggregateBy,
#' var2AggregateBy
#' )
filterProfileData <- function(
DF, taxaCount, refTaxon = NULL, percentCO = c(0, 1), coorthoCOMax = 9999,
var1CO = c(0, 1), var2CO = c(0, 1), var1Rel = "protein",
var2Rel = "protein", groupByCat = FALSE, catDt = NULL,
var1AggregateBy = "max", var2AggregateBy = "max"
) {
if (is.null(DF)) stop("Profile data cannot be NULL!")
if (is.null(refTaxon)) refTaxon <- "NA"
geneID <- supertaxon <- value <- NULL
flag <- ifelse(all(DF$numberSpec == 1), 0, 1)
DF$taxonMod <- sub("^[[:digit:]]*_", "", DF$supertaxon)
filter_var <- function(var, varCO, rel, refTaxon, taxonMod) {
if (rel == "protein") {
var[taxonMod != refTaxon & (var < varCO[1] | var > varCO[2])] <- NA
} else {
var[taxonMod != refTaxon & (var < varCO[1] | var > varCO[2])] <- 0
}
return(var)
}
if (flag == 0) {
DF$var1 <- filter_var(DF$var1, var1CO, var1Rel, refTaxon, DF$taxonMod)
DF$var2 <- filter_var(DF$var2, var2CO, var2Rel, refTaxon, DF$taxonMod)
} else {
DF$var1 <- filter_var(DF$var1, var1CO, var1Rel, NA, NA)
DF$var2 <- filter_var(DF$var2, var2CO, var2Rel, NA, NA)
}
DFtmp <- DF[stats::complete.cases(DF), ]
finalPresSpecDt <- calcPresSpec(DFtmp, taxaCount)
DF <- DF %>% left_join(finalPresSpecDt, by = c("geneID", "supertaxon"))
DF$presSpec <- ifelse(
DF$presSpec < percentCO[1] | DF$presSpec > percentCO[2], 0, DF$presSpec
)
# Convert orthoID to character and set to NA where presSpec == 0
DF$orthoID <- ifelse(DF$presSpec == 0, NA, as.character(DF$orthoID))
# Conditionally update var1 and var2 based on orthoID being NA
if (var1Rel == "protein") DF$var1[is.na(DF$orthoID)] <- NA
if (var2Rel == "protein") DF$var2[is.na(DF$orthoID)] <- NA
# Set presSpec to 0 where orthoID is NA
DF$presSpec[is.na(DF$orthoID)] <- 0
# Set paralog to 1 if flag == 1
if (flag == 1) DF$paralog <- 1
# Dropping unused levels and ensuring geneID and supertaxon are factors
DF <- DF %>% droplevels() %>% mutate(
geneID = factor(geneID), supertaxon = factor(supertaxon)
)
aggregate_scores <- function(df, var, aggBy) {
if (nrow(df) == 0) {
# Return an empty data frame
return(data.frame(
supertaxon=character(0), geneID=character(0), value=numeric(0)
))
}
aggregated <- stats::aggregate(
df[[var]], by = list(df$supertaxon, df$geneID), FUN = aggBy
)
colnames(aggregated) <- c("supertaxon", "geneID", "value")
return(aggregated)
}
mVar1Dt <- aggregate_scores(
DF[!is.na(DF$var1), ], "var1", var1AggregateBy
) %>% dplyr::rename(mVar1 = value)
mVar2Dt <- aggregate_scores(DF[!is.na(DF$var2), ], "var2", var2AggregateBy)
if (nrow(mVar2Dt) == 0) {
mVar2Dt <- unique(DF[, c("supertaxon", "geneID")]) %>%
dplyr::mutate(mVar2 = 0)
} else {
mVar2Dt <- mVar2Dt %>% dplyr::rename(mVar2 = value)
}
scoreDf <- dplyr::full_join(mVar1Dt, mVar2Dt, by = c("supertaxon","geneID"))
DF <- dplyr::left_join(DF, scoreDf, by = c("geneID", "supertaxon"))
if (groupByCat) {
if (is.null(catDt)) {
catDt <- data.frame(geneID = levels(DF$geneID), group="noCategory")
}
dfCat <- merge(
expand.grid(
supertaxon = levels(DF$supertaxon), geneID = levels(DF$geneID)
), catDt, by = "geneID", all.x = TRUE
)
DF <- dplyr::left_join(dfCat, DF, by = c("geneID", "supertaxon"))
DF$category <- DF$group
}
return(DF)
}
#' Reduce the filtered profile data into supertaxon level
#' @description Reduce data of the processed phylogenetic profiles from input
#' taxonomy rank into supertaxon level (e.g. from species to phylum)
#' @param filteredProfile dataframe contains the filtered profiles (see
#' ?parseInfoProfile, ?filterProfileData and ?filteredProfile)
#' @return A reduced dataframe contains only profile data for the selected
#' supertaxon rank. This dataframe contains only supertaxa and their value
#' (mVar1 & mVar2) for each gene.
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @export
#' @seealso \code{\link{parseInfoProfile}} for creating a full processed
#' profile dataframe, \code{\link{filterProfileData}} for filter processed
#' profile and \code{\link{filteredProfile}} for a demo filtered
#' profile dataframe
#' @examples
#' data("filteredProfile", package="PhyloProfile")
#' reduceProfile(filteredProfile)
reduceProfile <- function(filteredProfile) {
if (is.null(filteredProfile)) stop("Profile data cannot be NULL!")
# check if working with the lowest taxonomy rank; 1 for NO; 0 for YES
flag <- 1
if (length(unique(levels(as.factor(filteredProfile$numberSpec)))) == 1) {
if (unique(levels(as.factor(filteredProfile$numberSpec))) == 1) {
superDfExt <- filteredProfile[, c(
"geneID", "supertaxon", "supertaxonID", "orthoID", "category",
"presSpec", "paralog", "mVar1", "mVar2", "geneName"
)]
superDfExt$presentTaxa <- 1
superDfExt$totalTaxa <- 1
# rename mVar to var
names(superDfExt)[names(superDfExt) == "mVar1"] <- "var1"
names(superDfExt)[names(superDfExt) == "mVar2"] <- "var2"
flag <- 0
}
}
if (flag == 1) {
# get representative orthoID that has m VAR1 for each supertaxon
mOrthoID <- filteredProfile[, c(
"geneID", "supertaxon", "var1", "mVar1", "orthoID", "presSpec",
"geneName"
)]
mOrthoID$sVar1 <- mOrthoID$mVar1 - mOrthoID$var1
# mOrthoID <- subset(mOrthoID, mOrthoID$var1 == mOrthoID$mVar1)
mOrthoID <- subset(mOrthoID, mOrthoID$sVar1 <= 0)
colnames(mOrthoID) <- c(
"geneID","supertaxon","var1","mVar1","orthoID","presSpec","sVar1",
"geneName"
)
mOrthoID <- mOrthoID[order(mOrthoID$sVar1, decreasing = TRUE), ]
mOrthoID <- mOrthoID[!is.na(mOrthoID$orthoID), ]
mOrthoID <- mOrthoID[, c("geneID", "supertaxon", "orthoID", "presSpec")]
mOrthoID <- mOrthoID[!duplicated(mOrthoID[, seq_len(2)]), ]
# get data set for PhyloProfile plotting (contains only supertaxa info)
superDf <- subset(filteredProfile, select = c(
"geneID", "supertaxon", "supertaxonID", "mVar1", "category",
"mVar2", "paralog", "presentTaxa", "totalTaxa", "geneName"
))
superDf <- superDf[!duplicated(superDf), ]
superDfExt <- merge(
superDf, mOrthoID, by = c("geneID", "supertaxon"), all.x = TRUE
)
superDfExt <- superDfExt[, c(
"geneID", "supertaxon", "supertaxonID",
"mVar1", "presSpec", "category", "orthoID", "mVar2", "paralog",
"presentTaxa", "totalTaxa", "geneName"
)]
# rename mVar to var
names(superDfExt)[names(superDfExt) == "mVar1"] <- "var1"
names(superDfExt)[names(superDfExt) == "mVar2"] <- "var2"
}
superDfExt$orthoID <- as.character(superDfExt$orthoID)
return(superDfExt)
}
#' Complete processing of raw input phylogenetic profiles
#' @description Create a processed and filtered data for plotting or analysing
#' phylogenetic profiles from raw input file (from raw input to final filtered
#' dataframe)
#' @usage fromInputToProfile(rawInput, rankName, refTaxon = NULL,
#' taxaTree = NULL, sortedTaxonList = NULL, var1AggregateBy = "max",
#' var2AggregateBy = "max", percentCutoff = c(0, 1),
#' coorthologCutoffMax = 9999, var1Cutoff = c(0, 1), var2Cutoff = c(0, 1),
#' var1Relation = "protein", var2Relation = "protein", groupByCat = FALSE,
#' catDt = NULL, taxDB = NULL)
#' @param rawInput input file (in long, wide, multi-fasta or orthoxml format)
#' @param rankName taxonomy rank (e.g. "species","phylum",...)
#' @param refTaxon selected reference taxon name (used for sorting and will be
#' protected from filtering). Default = NULL.
#' @param taxaTree input taxonomy tree for taxa in input profiles (optional).
#' Default = NULL.
#' @param sortedTaxonList list of sorted taxa (optional). Default = NULL.
#' @param var1AggregateBy aggregate method for var1 (min, max, mean or median).
#' Default = "max".
#' @param var2AggregateBy aggregate method for VAR2 (min, max, mean or median).
#' Default = "max".
#' @param percentCutoff min and max cutoffs for percentage of species present
#' in a supertaxon. Default = c(0, 1).
#' @param coorthologCutoffMax maximum number of co-orthologs allowed. Default =
#' 9999.
#' @param var1Cutoff min and max cutoffs for var1. Default = c(0, 1).
#' @param var2Cutoff min and max cutoffs for var2. Default = c(0, 1).
#' @param var1Relation relation of var1 ("protein" for protein-protein or
#' "species" for protein-species). Default = "protein".
#' @param var2Relation relation of var2 ("protein" for protein-protein or
#' "species" for protein-species). Default = "protein".
#' @param groupByCat group genes by their categories (TRUE or FALSE). Default =
#' FALSE.
#' @param catDt dataframe contains gene categories. Default = NULL
#' @param taxDB Path to the taxonomy DB files
#' @return Dataframe required for generating phylogenetic profile plot or
#' clustering analysis. It contains seed gene IDs (or orthologous group IDs),
#' their ortholog IDs and the corresponding (super)taxa, (super)taxon IDs,
#' number of co-orthologs in each (super)taxon, values for two additional
#' variables var1, var2, % of species present in each supertaxon, and the
#' categories of seed genes (or ortholog groups).
#' @author Vinh Tran tran@bio.uni-frankfurt.de
#' @importFrom dplyr count
#' @export
#' @seealso \code{\link{createLongMatrix}}, \code{\link{getInputTaxaID}},
#' \code{\link{getInputTaxaName}}, \code{\link{sortInputTaxa}},
#' \code{\link{parseInfoProfile}}, \code{\link{reduceProfile}},
#' \code{\link{filterProfileData}}
#' @examples
#' rawInput <- system.file(
#' "extdata", "test.main.long", package = "PhyloProfile", mustWork = TRUE
#' )
#' rankName <- "class"
#' refTaxon <- "Mammalia"
#' taxaTree <- NULL
#' sortedTaxonList <- NULL
#' var1AggregateBy <- "max"
#' var2AggregateBy <- "mean"
#' percentCutoff <- c(0.0, 1.0)
#' coorthologCutoffMax <- 10
#' var1Cutoff <- c(0.75, 1.0)
#' var2Cutoff <- c(0.5, 1.0)
#' var1Relation <- "protein"
#' var2Relation <- "species"
#' groupByCat <- FALSE
#' catDt <- NULL
#' fromInputToProfile(
#' rawInput,
#' rankName,
#' refTaxon,
#' taxaTree,
#' sortedTaxonList,
#' var1AggregateBy,
#' var2AggregateBy,
#' percentCutoff,
#' coorthologCutoffMax,
#' var1Cutoff,
#' var2Cutoff,
#' var1Relation,
#' var2Relation,
#' groupByCat,
#' catDt
#' )
fromInputToProfile <- function(
rawInput,
rankName,
refTaxon = NULL,
taxaTree = NULL,
sortedTaxonList = NULL,
var1AggregateBy = "max",
var2AggregateBy = "max",
percentCutoff = c(0, 1),
coorthologCutoffMax = 9999,
var1Cutoff = c(0, 1),
var2Cutoff = c(0, 1),
var1Relation = "protein",
var2Relation = "protein",
groupByCat = FALSE,
catDt = NULL,
taxDB = NULL
) {
if (missing(rawInput) | missing(rankName)) return("Missing input")
if (is.null(rawInput) | is.null(rankName)) return("Missing input")
supertaxon <- NULL
# convert raw input into long format
inputDf <- createLongMatrix(rawInput)
# get input taxon IDs and names
inputTaxonID <- getInputTaxaID(inputDf)
# sort input taxa based on selected reference taxon or input taxonomy tree
sortedInputTaxa <- sortInputTaxa(
inputTaxonID, rankName, refTaxon, taxaTree, sortedTaxonList, taxDB
)
# count present taxa in each supertaxon
taxaCount <- sortedInputTaxa %>% dplyr::group_by(supertaxon) %>%
summarise(n = n(), .groups = "drop")
# parse info (additional values...) into profile df
fullMdData <- parseInfoProfile(inputDf, sortedInputTaxa, taxaCount)
# filter profile
filteredDf <- filterProfileData(
fullMdData,
taxaCount,
refTaxon,
percentCutoff,
coorthologCutoffMax,
var1Cutoff,
var2Cutoff,
var1Relation,
var2Relation,
groupByCat = FALSE,
catDt = NULL,
var1AggregateBy, var2AggregateBy
)
# reduce profile df into supertaxon level
dataHeat <- reduceProfile(filteredDf)
return(dataHeat)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.