#' Pre-processing for DAtest
#'
#' Pre-process the count table before running \code{DAtest} functions
#' @param data Either a matrix with counts/abundances, OR a \code{phyloseq} object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples
#' @param min.samples Minimum number of samples the features should be present in. Default 0
#' @param min.reads Minimum number of total reads the features should have. Default 0
#' @param min.abundance Minimum mean relative abundance features should have. Default 0
#' @return Similar to input, but with features not reaching the criteria given grouped as "Others"
#' @examples
#' # Creating random count_table with many low abundant
#' set.seed(4)
#' mat <- matrix(rnbinom(1000, size = 0.05, mu = 500), nrow = 100, ncol = 10)
#' # Keep only those present in at least 3 samples
#' res <- preDA(mat, min.samples = 3)
#'
#' # Last feature is now called 'Others' and is a sum of all features present in less than 3 samples
#' rownames(res)[nrow(res)]
#' @export
preDA <- function(data, min.samples = 0, min.reads = 0, min.abundance = 0){
# Extract from phyloseq
if(is(data, "phyloseq")){
loadNamespace("phyloseq")
count_table <- phyloseq::otu_table(data)
if(!phyloseq::taxa_are_rows(data)) count_table <- t(count_table)
} else {
count_table <- data
}
# Check
if(any("Others" %in% rownames(count_table))) stop("There is already a feature named 'Others' in data. Maybe data has been pre-processed already")
# Exclude by samples
exclude.samples <- which(rowSums(count_table > 0) < min.samples)
# Exclude by reads
exclude.reads <- which(rowSums(count_table) < min.reads)
# Exclude by abundance
count_rel <- apply(count_table, 2, function(x) x/sum(x))
exclude.abundance <- which(rowMeans(count_rel) < min.abundance)
# Which to exclude
exclude <- unique(c(exclude.samples,exclude.reads,exclude.abundance))
if(length(exclude) == 0) stop("No features to group!")
if(length(exclude) == nrow(count_table)) stop("All features would be grouped as 'Others'!")
message(paste(length(exclude),"features grouped as 'Others' in the output"))
# Make new count_table
count.keep <- count_table[-exclude,]
count.other <- count_table[exclude,]
count.other <- colSums(count.other)
count.new <- rbind(count.keep,count.other)
rownames(count.new)[nrow(count.new)] <- "Others"
# Output
if(is(data, "phyloseq")){
# Fix tax_table
tax <- as.data.frame(unclass(phyloseq::tax_table(data)))
tax.keep <- tax[-exclude,]
tax.new <- rbind(tax.keep,NA)
rownames(tax.new)[nrow(tax.new)] <- "Others"
if(phyloseq::taxa_are_rows(data)){
data.new <- phyloseq::phyloseq(phyloseq::otu_table(count.new, taxa_are_rows = TRUE),
phyloseq::sample_data(data),
phyloseq::tax_table(as.matrix(tax.new)))
}
if(!phyloseq::taxa_are_rows(data)){
data.new <- phyloseq::phyloseq(phyloseq::otu_table(t(count.new), taxa_are_rows = FALSE),
phyloseq::sample_data(data),
phyloseq::tax_table(as.matrix(tax.new)))
}
return(data.new)
} else {
return(count.new)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.