R/select_samples.r

Defines functions select.samples

Documented in select.samples

#!/usr/bin/Rscript
### SIAMCAT - Statistical Inference of Associations between
### Microbial Communities And host phenoTypes R flavor EMBL
### Heidelberg 2012-2018 GNU GPL 3.0

#' @title Select samples based on metadata
#'
#' @description This function select samples based on information given in the
#' metadata
#'
#' @usage select.samples(siamcat, filter, allowed.set = NULL,
#' allowed.range = NULL, verbose = 1)
#'
#' @param siamcat an object of class \link{siamcat-class}
#'
#' @param filter string, name of the meta variable on which the selection 
#' should be done
#'
#' @param allowed.set a vector of allowed values
#'
#' @param allowed.range a range of allowed values
#'
#' @param verbose integer, control output: \code{0} for no output at all,
#' \code{1} for only information about progress and success, \code{2} for
#' normal level of information and \code{3} for full debug information,
#' defaults to \code{1}
#'
#' @keywords SIAMCAT select.samples
#'
#' @export select.samples
#'
#' @encoding UTF-8
#'
#' @return an object of class \link{siamcat-class} with labels and metadata 
#' filtered in order to contain only allowed values
#'
#' @details This functions selects labels and metadata based on a specific
#' column in the metadata. Provided with a column-name in the metadata and a
#' range or a set of allowed values, the function will filter the
#' \link{siamcat-class} object accordingly.
#'
#' @examples
#' data(siamcat_example)
#'
#' # Select all samples that fall into an Age-range between 25 and 80 years
#' siamcat_selected <- select.samples(siamcat_example,
#'     filter='Age',
#'     allowed.range=c(25, 80))
#'
#' # Select only female samples
#' siamcat_female <- select.samples(siamcat_example,
#'     filter='Gender',
#'     allowed.set=c('F'))
select.samples <- function(siamcat, filter, allowed.set = NULL,
    allowed.range = NULL, verbose = 1) {

    if (verbose > 1)
        message("+ starting select.samples")
    s.time <- proc.time()[3]

    # checks
    if (is.null(meta(siamcat))){
        stop('SIAMCAT needs to have metadata for select.samples. Exiting...')
    }
    if (!filter %in% colnames(meta(siamcat)))
        stop("! The filter name is not present in colnames of the sample",
            " data. Stopping.\n")
    if (!xor(is.null(allowed.range), is.null(allowed.set))) {
        stop("Neither allowed.range nor allowed.set (or both at the same",
            " time) have been provided, exiting!")
    }
    if (!is.null(data_split(siamcat, verbose=0)) |
        !is.null(eval_data(siamcat, verbose=0)) |
        !is.null(models(siamcat, verbose=0))){
            warning("The machine learning pipeline has to be run again ",
                "after filtering the samples")
    }
    if (!is.null(filt_feat(siamcat, verbose=0)) |
        !is.null(norm_feat(siamcat, verbose=0))){
            warning("Selcting samples may affect the results of feature ",
                "filtering and normalization\nFor sanity, results from ",
                "previous analyses (e.g. filtered features) will be removed!")
        siamcat <- siamcat(
            phyloseq=physeq(siamcat),
            label=label(siamcat),
            validate=FALSE, verbose=0)
    }



    if (verbose > 2)
        message("+++ checking allowed values")
    if (!is.null(allowed.range)) {
        stopifnot(length(allowed.range) == 2)
        stopifnot(all(is.numeric(allowed.range)))
        allowed.range <- sort(allowed.range)
        stopifnot(allowed.range[1] <= allowed.range[2])
    }

    if (!is.null(allowed.set)) {
        allowed.set <- sort(unique(allowed.set))
        stopifnot(all(is.character(allowed.set)))
    }

    # get meta column
    filter.var <- meta(siamcat)[[filter]]

    if (!is.null(allowed.range)) {
        if (!is.numeric(filter.var)){
            stop('A numerical column is needed for allowed range. Exiting...')
        }
        if (verbose > 2)
            message(paste0("+++ allowed.range  = [", paste(allowed.range,
                collapse = ","), "]"))
    } else {
        if (!any(allowed.set %in% unique(filter.var))){
            stop('The allowed set has no overlap with the chosen
                metadata. Exiting...')
        }
        if (verbose > 2)
            message(paste0("+++ allowed.set = {", paste(allowed.set,
                collapse = ","), "}"))
    }

    # select samples on range
    if (!is.null(allowed.range)) {

        s.idx <- !is.na(filter.var) & filter.var >= allowed.range[1] &
            filter.var <= allowed.range[2]

        if (verbose > 1) {
            message(paste0("+++ removed ", sum(!s.idx), " samples with ",
                    filter, " not in [", paste(allowed.range, collapse = ", "),
                    "] (retaining ", sum(s.idx), ")"))
        }

    } else {

        s.idx <- !is.na(filter.var) & filter.var %in% allowed.set

        if (verbose > 1) {
            message(paste0("+++ removed ",sum(!s.idx), " samples with ",
                    filter, " not in {", paste(allowed.set, collapse = ", "),
                    "} (retaining ", sum(s.idx), ")"))
        }
    }

    s.names <- rownames(meta(siamcat))[s.idx]

    # prune phyloseq object
    physeq(siamcat) <-
        prune_samples(x = physeq(siamcat), samples = s.names)
    # filter label object
    siamcat <- filter.label(siamcat, s.names, verbose = verbose)

    e.time <- proc.time()[3]
    if (verbose > 1){
        msg <- paste( "+ finished select.samples in",
            formatC(e.time - s.time, digits = 3), "s")
        message(msg)
    }

    if (verbose == 1)
        message("Selecting samples finished")

    return(siamcat)
}
zellerlab/siamcat documentation built on Feb. 1, 2024, 2:21 a.m.