R/MTXmodel.R

Defines functions MTXmodel option_not_valid_error

#!/usr/bin/env Rscript

###############################################################################
# MTX modeling: run model with covariate feature-DNA

# Copyright (c) 2018 Harvard School of Public Health

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
###############################################################################

# load in the required libraries, report an error if they are not installed

for (lib in c('optparse', 'logging', 'data.table', 'dplyr')) {
    suppressPackageStartupMessages(require(lib, character.only = TRUE))
}

###############################################################
# If running on the command line, load other modules #
###############################################################

# this evaluates to true if script is being called directly as an executable
if (identical(environment(), globalenv()) &&
        !length(grep("^source\\(", sys.calls()))) {
    script_options <- commandArgs(trailingOnly = FALSE)
    script_path <-
        sub("--file=", "", script_options[grep("--file=", script_options)])
    script_dir <- dirname(script_path)
    script_name <- basename(script_path)
    
    for (R_file in dir(script_dir, pattern = "*.R"))
    {
        if (!(R_file == script_name))
            source(file.path(script_dir, R_file))
    }
}

###########################
# Set the default options #
###########################
# covariate-DNA
normalization_choices <- c("TSS", "CLR", "CSS", "NONE", "TMM")
analysis_method_choices_names <-
    #c("LM", "CPLM", "NEGBIN", "ZINB")
    c("LM", "CPLM", "NEGBIN", "ZINB", "LOGIT")
#transform_choices <- c("LOG", "LOGIT", "AST", "NONE")
transform_choices <- c("LOG", "LOGIT", "AST", "NONE", "PA")
valid_choice_method_norm <- hash::hash()
valid_choice_method_norm[[analysis_method_choices_names[3]]] <-
    normalization_choices[3:5]
valid_choice_method_norm[[analysis_method_choices_names[4]]] <-
    normalization_choices[3:5]
#valid_choice_method_transform <- analysis_method_choices_names[1:2]
valid_choice_method_transform <- analysis_method_choices_names[c(1, 2, 5)]
valid_choice_transform_norm <- hash::hash()
valid_choice_transform_norm[[transform_choices[2]]] <-
    normalization_choices[c(1, 4)]
valid_choice_transform_norm[[transform_choices[3]]] <-
    normalization_choices[c(1, 4)]
correction_choices <-
    c("BH", "holm", "hochberg", "hommel", "bonferroni", "BY")

# covariate-DNA
rna_dna_flt_choices <- 
    c("none", "lenient", "semi_strict", "strict")

# set the default run options
args <- list()
args$input_data <- NULL
args$input_metadata <- NULL
args$output <- NULL
args$min_abundance <- 0.0
args$min_prevalence <- 0.1
args$min_variance <- 0.0
args$max_significance <- 0.25
args$normalization <- normalization_choices[1]
args$transform <- transform_choices[1]
args$analysis_method <- analysis_method_choices_names[1]
args$random_effects <- NULL
args$fixed_effects <- NULL
args$correction <- correction_choices[1]
args$standardize <- TRUE
args$plot_heatmap <- TRUE
args$heatmap_first_n <- 50
args$plot_scatter <- TRUE
args$cores <- 1
args$reference <- NULL
# covariate-DNA
args$input_dnadata <- "none"
args$rna_dna_flt <- rna_dna_flt_choices[2]


##############################
# Add command line arguments #
##############################

options <-
    optparse::OptionParser(usage = paste(
        "%prog [options]",
        " <data.tsv> ",
        "<metadata.tsv> ",
        "<output_folder>" 
        )
    )
options <-
    optparse::add_option(
        options,
        c("-a", "--min_abundance"),
        type = "double",
        dest = "min_abundance",
        default = args$min_abundance,
        help = paste0("The minimum abundance for each feature",
            " [ Default: %default ]"
        )
    )
options <-
    optparse::add_option(
        options,
        c("-p", "--min_prevalence"),
        type = "double",
        dest = "min_prevalence",
        default = args$min_prevalence,
        help = paste0("The minimum percent of samples for which",
            "a feature is detected at minimum abundance",
            " [ Default: %default ]"
        )
    )
options <-
    optparse::add_option(
        options,
        c("-b", "--min_variance"),
        type = "double",
        dest = "min_variance",
        default = args$min_variance,
        help = paste0("Keep features with variances",
            "greater than value",
            " [ Default: %default ]"
        )
    )
options <-
    optparse::add_option(
        options,
        c("-s", "--max_significance"),
        type = "double",
        dest = "max_significance",
        default = args$max_significance,
        help = paste0("The q-value threshold for significance",
            " [ Default: %default ]"
        )
    )
options <-
    optparse::add_option(
        options,
        c("-n", "--normalization"),
        type = "character",
        dest = "normalization",
        default = args$normalization,
        help = paste(
            "The normalization method to apply",
            " [ Default: %default ] [ Choices:",
            toString(normalization_choices),
            "]"
        )
    )
options <-
    optparse::add_option(
        options,
        c("-t", "--transform"),
        type = "character",
        dest = "transform",
        default = args$transform,
        help = paste(
            "The transform to apply [ Default: %default ] [ Choices:",
            toString(transform_choices),
            "]"
        )
    )
options <-
    optparse::add_option(
        options,
        c("-m", "--analysis_method"),
        type = "character",
        dest = "analysis_method",
        default = args$analysis_method,
        help = paste(
            "The analysis method to apply [ Default: %default ] [ Choices:",
            toString(analysis_method_choices_names),
            "]"
        )
    )
options <-
    optparse::add_option(
        options,
        c("-r", "--random_effects"),
        type = "character",
        dest = "random_effects",
        default = args$random_effects,
        help = paste("The random effects for the model, ",
            "comma-delimited for multiple effects",
            " [ Default: none ]"
        )
    )
options <-
    optparse::add_option(
        options,
        c("-f", "--fixed_effects"),
        type = "character",
        dest = "fixed_effects",
        default = args$fixed_effects,
        help = paste("The fixed effects for the model,",
            " comma-delimited for multiple effects",
            " [ Default: all ]"
        )
    )
options <-
    optparse::add_option(
        options,
        c("-c", "--correction"),
        type = "character",
        dest = "correction",
        default = args$correction,
        help = paste("The correction method for computing",
            " the q-value [ Default: %default ]"
        )
    )
options <-
    optparse::add_option(
        options,
        c("-z", "--standardize"),
        type = "logical",
        dest = "standardize",
        default = args$standardize,
        help = paste("Apply z-score so continuous metadata are on",
            " the same scale [ Default: %default ]"
        )
    )
options <-
    optparse::add_option(
        options,
        c("-l", "--plot_heatmap"),
        type = "logical",
        dest = "plot_heatmap",
        default = args$plot_heatmap,
        help = paste("Generate a heatmap for the significant ",
            "associations [ Default: %default ]"
        )
    )
options <-
    optparse::add_option(
        options,
        c("-i", "--heatmap_first_n"),
        type = "double",
        dest = "heatmap_first_n",
        default = args$heatmap_first_n,
        help = paste("In heatmap, plot top N features with significant ",
            "associations [ Default: %default ]"
        )
    )
options <-
    optparse::add_option(
        options,
        c("-o", "--plot_scatter"),
        type = "logical",
        dest = "plot_scatter",
        default = args$plot_scatter,
        help = paste("Generate scatter plots for the significant",
            " associations [ Default: %default ]"
        )
    )
options <-
    optparse::add_option(
        options,
        c("-e", "--cores"),
        type = "double",
        dest = "cores",
        default = args$cores,
        help = paste("The number of R processes to ",
            "run in parallel [ Default: %default ]"
        )
    )

options <-
    optparse::add_option(
        options,
        c("-d", "--reference"),
        type = "character",
        dest = "reference",
        default = args$reference,
        help = paste("The factor to use as a reference for",
            "a variable with more than two levels",
            "provided as a string of 'variable,reference'",
            "semi-colon delimited for multiple variables [ Default: NA ]"
        )
    )

# covariate-DNA
options <-
    optparse::add_option(
        options,
        c("-x", "--input_dnadata"),
        type = "character",
        dest = "input_dnadata",
        default = args$input_dnadata,
        help = paste0("The covariate DNA abundance for each feature",
                      " [ Default: %default ]"
        )
    )
options <-
    optparse::add_option(
        options,
        c("-y", "--rna_dna_flt"),
        type = "character",
        dest = "rna_dna_flt",
        default = args$rna_dna_flt,
        help = paste0("Filtering features/samples based on the detectable abundance of RNA and covariate DNA per feature",
                      " [ Default: %default ]\n",
					  "\t\t[ Choices: none, lenient, semi_strict, strict]\n",
					  "\t\tnone: do not apply filtering\n",
					  "\t\tlenient: ignore features that are not detected at both DNA and RNA levels across all samples\n",
					  "\t\tsemi_strict: ignore the sample where both feature's DAN and RNA are not detected\n",
					  "\t\tstrict: ignore the sample where either feature's DAN or RNA is not detected"
        )
    )

option_not_valid_error <- function(message, valid_options) {
    logging::logerror(paste(message, ": %s"), toString(valid_options))
    stop("Option not valid", call. = FALSE)
}

#######################################################
# Main function (defaults same command line) #
#######################################################

MTXmodel <-
    function(
        input_data,
        input_metadata,
        output,
        min_abundance = 0.0,
        min_prevalence = 0.1,
        min_variance = 0.0,
        normalization = "TSS",
        transform = "LOG",
        analysis_method = "LM",
        max_significance = 0.25,
        random_effects = NULL,
        fixed_effects = NULL,
        correction = "BH",
        standardize = TRUE,
        cores = 1,
        plot_heatmap = TRUE,
        plot_scatter = TRUE,
        heatmap_first_n = 50,
        reference = NULL,
        # covariate-DNA
        input_dnadata = "none",
        rna_dna_flt = "none")
    {
        # Allow for lower case variables
        normalization <- toupper(normalization)
        transform <- toupper(transform)
        analysis_method <- toupper(analysis_method)
        correction <- toupper(correction)

        #################################################################
        # Read in the data and metadata, create output folder, init log #
        #################################################################
        # if a character string then this is a file name, else it 
        # is a data frame
        if (is.character(input_data)) {
            data <-
                data.frame(data.table::fread(
                    input_data, header = TRUE, sep = "\t"),
                        row.names = 1)
            if (nrow(data) == 1) {
                # read again to get row name
                data <- read.table(input_data, header = TRUE, row.names = 1)
            }
        } else {
            data <- input_data
        }
        if (is.character(input_metadata)) {
            metadata <-
                data.frame(data.table::fread(
                    input_metadata, header = TRUE, sep = "\t"),
                        row.names = 1)
            if (nrow(metadata) == 1) {
                metadata <- read.table(input_metadata,
                    header = TRUE,
                    row.names = 1)
            }
        } else {
            metadata <- input_metadata
        }
        
        # covariate-DNA
        if (is.character(input_dnadata)) {
        	if (input_dnadata != "none") {
                dnadata <-
                    data.frame(data.table::fread(
                        input_dnadata, header = TRUE, sep = "\t"),
                        row.names = 1)
                if (nrow(dnadata) == 1) {
                    dnadata <- read.table(input_dnadata,
                                          header = TRUE,
                                          row.names = 1)
                }
            } else {
            	dnadata <- "none"
            }
        } else {
			dnadata <- input_dnadata
        }
        
        # create an output folder and figures folder if it does not exist
        if (!file.exists(output)) {
            print("Creating output folder")
            dir.create(output)
        }

        if (plot_heatmap || plot_scatter){
            figures_folder <- file.path(output,"figures")
        if (!file.exists(figures_folder)) {
            print("Creating output figures folder")
            dir.create(figures_folder)
        }
        }
        
        # create log file (write info to stdout and debug level to log file)
        # set level to finest so all log levels are reviewed
        log_file <- file.path(output, "mtx_model.log")
        # remove log file if already exists (to avoid append)
        if (file.exists(log_file)) {
            print(paste("Warning: Deleting existing log file:", log_file))
            unlink(log_file)
        }
        logging::basicConfig(level = 'FINEST')
        logging::addHandler(logging::writeToFile, 
            file = log_file, level = "DEBUG")
        logging::setLevel(20, logging::getHandler('basic.stdout'))
        
        #####################
        # Log the arguments #
        #####################
        
        logging::loginfo("Writing function arguments to log file")
        logging::logdebug("Function arguments")
        if (is.character(input_data)) {
            logging::logdebug("Input data file: %s", input_data)
        }
        if (is.character(input_metadata)) {
            logging::logdebug("Input metadata file: %s", input_metadata)
        }
        logging::logdebug("Output folder: %s", output)
        logging::logdebug("Min Abundance: %f", min_abundance)
        logging::logdebug("Min Prevalence: %f", min_prevalence)
        logging::logdebug("Normalization: %s", normalization)
        logging::logdebug("Transform: %s", transform)
        logging::logdebug("Analysis method: %s", analysis_method)
        logging::logdebug("Max significance: %f", max_significance)
        logging::logdebug("Random effects: %s", random_effects)
        logging::logdebug("Fixed effects: %s", fixed_effects)
        logging::logdebug("Correction method: %s", correction)
        logging::logdebug("Standardize: %s", standardize)
        logging::logdebug("Cores: %d", cores)
        # covariate-DNA
        if (is.character(input_dnadata)) {
            logging::logdebug("Input dnadata file: %s", input_dnadata)
        }
        
        ####################################
        # Check valid options are selected #
        ####################################
        
        # Check valid normalization option selected
        logging::loginfo("Verifying options selected are valid")
        if (!normalization %in% normalization_choices) {
            option_not_valid_error(
                paste(
                    "Please select a normalization",
                    "from the list of available options"),
                toString(normalization_choices)
            )
        }
        
        # check valid transform option selected
        if (!transform %in% transform_choices) {
            option_not_valid_error(
                "Please select a transform from the list of available options",
                toString(transform_choices)
            )
        }
        
        # check valid method option selected
        if (!analysis_method %in% analysis_method_choices_names) {
            option_not_valid_error(
                paste(
                    "Please select an analysis method",
                    "from the list of available options"),
                toString(analysis_method_choices_names)
            )
        }
        
        # check valid correction method selected
        if (!correction %in% correction_choices) {
            option_not_valid_error(
                paste("Please select a correction method",
                    "from the list of available options"),
                toString(correction_choices)
            )
        }
        
        # check a valid choice combination is selected
        for (limited_method in hash::keys(
            valid_choice_method_norm)) {
            if (analysis_method == limited_method) {
                if (!normalization %in% 
                    valid_choice_method_norm[[limited_method]]) {
                    option_not_valid_error(
                        paste0("This method can only be used ",
                            "with a subset of normalizations. ",
                            "Please select from the following list"
                        ),
                        toString(valid_choice_method_norm[[limited_method]])
                    )
                }
            }
        }
        for (limited_transform in hash::keys(valid_choice_transform_norm)) {
            if (transform == limited_transform) {
                if (!normalization %in% 
                    valid_choice_transform_norm[[limited_transform]]) {
                    option_not_valid_error(
                        paste0("This transform can only be used",
                            " with a subset of normalizations. ",
                            "Please select from the following list"
                        ),
                        toString(
                            valid_choice_transform_norm[[limited_transform]])
                    )
                }
            }
        }
        
        # check that the transform can be applied to the method selected
        if (transform != "NONE")
        {
            if (!analysis_method %in% valid_choice_method_transform) {
                option_not_valid_error(
                    paste0("The transform selected can only be used",
                        " with some methods. ",
                        "Please select from the following list"
                    ),
                    toString(valid_choice_method_transform)
                )
            }
        }
        
        # covariate-DNA
        if (!rna_dna_flt %in% rna_dna_flt_choices) {
            option_not_valid_error(
                paste(
                    "Please select an rna_dna_flt method",
                    "from the list of available options"),
                toString(rna_dna_flt_choices)
            )
        }
        
        ###############################################################
        # Determine orientation of data in input and reorder to match #
        ###############################################################
        
        logging::loginfo("Determining format of input files")
        samples_row_row <- intersect(rownames(data), rownames(metadata))
        if (length(samples_row_row) > 0) {
            # this is the expected formatting so do not modify data frames
            logging::loginfo(
                paste(
                    "Input format is data samples",
                    "as rows and metadata samples as rows"))
        } else {
            samples_column_row <- intersect(colnames(data), rownames(metadata))
            if (length(samples_column_row) == 0) { 
              # modify possibly included special chars in sample names in metadata
              rownames(metadata) <- make.names(rownames(metadata))
              samples_column_row <- intersect(colnames(data), rownames(metadata))
            }
            if (length(samples_column_row) > 0) {
                logging::loginfo(
                    paste(
                        "Input format is data samples",
                        "as columns and metadata samples as rows"))
                # transpose data frame so samples are rows
                data <- as.data.frame(t(data))
                logging::logdebug(
                    "Transformed data so samples are rows")
            } else {
                samples_column_column <- 
                    intersect(colnames(data), colnames(metadata))
                if (length(samples_column_column) > 0) {
                    logging::loginfo(
                        paste(
                            "Input format is data samples",
                            "as columns and metadata samples as columns"))
                    data <- as.data.frame(t(data))
                    metadata <- as.data.frame(t(metadata))
                    logging::logdebug(
                        "Transformed data and metadata so samples are rows")
                } else {
                    samples_row_column <- 
                        intersect(rownames(data), colnames(metadata))
                    if (length(samples_row_column) == 0) {
                      # modify possibly included special chars in sample names in data
                      rownames(data) <- make.names(rownames(data))
                      samples_row_column <- intersect(rownames(data), colnames(metadata))
                    }
                    if (length(samples_row_column) > 0) {
                        logging::loginfo(
                            paste(
                                "Input format is data samples",
                                "as rows and metadata samples as columns"))
                        metadata <- as.data.frame(t(metadata))
                        logging::logdebug(
                            "Transformed metadata so samples are rows")
                    } else {
                        logging::logerror(
                            paste("Unable to find samples in data and",
                                "metadata files.",
                                "Rows/columns do not match."))
                        logging::logdebug(
                            "Data rows: %s", 
                            paste(rownames(data), collapse = ","))
                        logging::logdebug(
                            "Data columns: %s", 
                            paste(colnames(data), collapse = ","))
                        logging::logdebug(
                            "Metadata rows: %s", 
                            paste(rownames(metadata), collapse = ","))
                        logging::logdebug(
                            "Metadata columns: %s",
                            paste(colnames(metadata), collapse = ","))
                        stop()
                    }
                }
            }
        }
       
        # covariate-DNA
        if (length(dnadata) > 1) {
          samples_row_row <- intersect(rownames(dnadata), rownames(metadata))
          if (length(samples_row_row) > 0) {
            # this is the expected formatting so do not modify dnadata frames
            logging::loginfo(
              paste(
                "Input format is dnadata samples",
                "as rows and metadata samples as rows"))
          } else {
            samples_column_row <- intersect(colnames(dnadata), rownames(metadata))
            if (length(samples_column_row) == 0) { 
              # modify possibly included special chars in sample names in metadata
              rownames(metadata) <- make.names(rownames(metadata))
              samples_column_row <- intersect(colnames(dnadata), rownames(metadata))
            }
            if (length(samples_column_row) > 0) {
              logging::loginfo(
                paste(
                  "Input format is dnadata samples",
                  "as columns and metadata samples as rows"))
              # transpose dnadata frame so samples are rows
              dnadata <- as.data.frame(t(dnadata))
              logging::logdebug(
                "Transformed dnadata so samples are rows")
            } else {
              samples_column_column <- intersect(colnames(dnadata), colnames(metadata))
              if (length(samples_column_column) > 0) {
                logging::loginfo(
                  paste(
                    "Input format is dnadata samples",
                    "as columns and metadata samples as columns"))
                dnadata <- as.data.frame(t(dnadata))
                metadata <- as.data.frame(t(metadata))
                logging::logdebug(
                  "Transformed dnadata and metadata so samples are rows")
              } else {
                samples_row_column <- intersect(rownames(dnadata), colnames(metadata))
                if (length(samples_row_column) == 0) {
                  # modify possibly included special chars in sample names in dnadata
                  rownames(dnadata) <- make.names(rownames(dnadata))
                  samples_row_column <- intersect(rownames(dnadata), colnames(metadata))
                }
                if (length(samples_row_column) > 0) {
                  logging::loginfo(
                    paste(
                      "Input format is dnadata samples",
                      "as rows and metadata samples as columns"))
                  metadata <- as.data.frame(t(metadata))
                  logging::logdebug(
                    "Transformed metadata so samples are rows")
                } else {
                  logging::logerror(
                    paste("Unable to find samples in dnadata and",
                          "metadata files.",
                          "Rows/columns do not match."))
                  logging::logdebug(
                    "DNAData rows: %s", 
                    paste(rownames(dnadata), collapse = ","))
                  logging::logdebug(
                    "DNAData columns: %s", 
                    paste(colnames(dnadata), collapse = ","))
                  logging::logdebug(
                    "Metadata rows: %s", 
                    paste(rownames(metadata), collapse = ","))
                  logging::logdebug(
                    "Metadata columns: %s",
                    paste(colnames(metadata), collapse = ","))
                  stop()
                }
              }
            }
          }
        }
          
        
        # replace unexpected characters in feature names
        colnames(data) <- make.names(colnames(data))
 
        # check for samples without metadata
        extra_feature_samples <-
            setdiff(rownames(data), rownames(metadata))
        if (length(extra_feature_samples) > 0)
            logging::logdebug(
                paste("The following samples were found",
                    "to have features but no metadata.",
                    "They will be removed. %s"),
                paste(extra_feature_samples, collapse = ",")
            )
        
        # check for metadata samples without features
        extra_metadata_samples <-
            setdiff(rownames(metadata), rownames(data))
        if (length(extra_metadata_samples) > 0)
            logging::logdebug(
                paste("The following samples were found",
                    "to have metadata but no features.",
                    "They will be removed. %s"),
                paste(extra_metadata_samples, collapse = ",")
            )
        
        # get a set of the samples with both metadata and features
        intersect_samples <- intersect(rownames(data), rownames(metadata))
        # covariate-DNA
        #if (dnadata != "none") {
        if (length(dnadata) > 1) {
            # replace unexpected characters in feature names
            colnames(dnadata) <- make.names(colnames(dnadata))
            intersect_samples <- intersect(rownames(dnadata), intersect_samples)
        }
        logging::logdebug(
            "A total of %s samples were found in both the data and metadata",
            length(intersect_samples)
        )
        
        # now order both data and metadata with the same sample ordering
        logging::logdebug(
            "Reordering data/metadata to use same sample ordering")
        data <- data[intersect_samples, , drop = FALSE]
        metadata <- metadata[intersect_samples, , drop = FALSE]
        # covariate-DNA
        if (length(dnadata) > 1) {
            dnadata <- dnadata[intersect_samples, , drop = FALSE]
        }
        
        ###########################################
        # Compute the formula based on user input #
        ###########################################
        
        random_effects_formula <- NULL
        # use all metadata if no fixed effects are provided
        if (is.null(fixed_effects)) {
            fixed_effects <- colnames(metadata)
        } else {
            fixed_effects <- unlist(strsplit(fixed_effects, ",", fixed = TRUE))
            # remove any fixed effects not found in metadata names
            to_remove <- setdiff(fixed_effects, colnames(metadata))
            if (length(to_remove) > 0)
                logging::logwarn(
                    paste("Feature name not found in metadata",
                        "so not applied to formula as fixed effect: %s"),
                    paste(to_remove, collapse = " , ")
                )
            fixed_effects <- setdiff(fixed_effects, to_remove)
            # covariate-DNA
			#if (length(fixed_effects) == 0) {
            #    logging::logerror("No fixed effects included in formula.")
            #    stop()
            #}
        }
        
        if (!is.null(random_effects)) {
            random_effects <-
                unlist(strsplit(random_effects, ",", fixed = TRUE))
            # subtract random effects from fixed effects
            fixed_effects <- setdiff(fixed_effects, random_effects)
            # remove any random effects not found in metadata
            to_remove <- setdiff(random_effects, colnames(metadata))
            if (length(to_remove) > 0)
                logging::logwarn(
                    paste("Feature name not found in metadata",
                        "so not applied to formula as random effect: %s"),
                    paste(to_remove, collapse = " , ")
                )
            random_effects <- setdiff(random_effects, to_remove)
            
            # create formula
            if (length(random_effects) > 0) {
                random_effects_formula_text <-
                    paste(
                        "expr ~ (1 | ",
                        paste(
                            random_effects,
                            ")",
                            sep = '',
                            collapse = " + (1 | "
                        ),
                        sep = '')
                logging::loginfo("Formula for random effects: %s",
                    random_effects_formula_text)
                random_effects_formula <-
                    tryCatch(
                        as.formula(random_effects_formula_text),
                        error = function(e)
                            stop(
                                paste(
                                    "Invalid formula for random effects: ",
                                    random_effects_formula_text
                                )
                            )
                    )
            }
        }
        
        # reduce metadata to only include fixed/random effects in formula
        effects_names <- union(fixed_effects, random_effects)
        metadata <- metadata[, effects_names, drop = FALSE]
        
        # create the fixed effects formula text
        # covariate-DNA 
		if (! is.null(fixed_effects) && length(fixed_effects) > 0) {
			formula_text <-
            	paste("expr ~ ", paste(fixed_effects, collapse = " + "))
        	logging::loginfo("Formula for fixed effects: %s", formula_text)
        	formula <-
            	tryCatch(
                	as.formula(formula_text),
                	error = function(e)
                    	stop(
                        	paste(
                            	"Invalid formula.",
                            	"Please provide a different formula: ",
                            	formula_text
                        	)
                    	)
            	)
        } else {
			formula <- NULL
		}

        #########################################################
        # Filter data based on min abundance and min prevalence #
        #########################################################

        # use ordered factor for variables with more than two levels
        # find variables with more than two levels
        if (is.null(reference)) {
            reference <- ","
        }

        for ( i in colnames(metadata) ) {
            mlevels <- unique(na.omit(metadata[,i]))
            numeric_levels <- grep('^-?[0-9.]+[eE+-]?', mlevels, value = T)
            if ( ( length(mlevels[! (mlevels %in% c("UNK"))]) > 2 ) &&
                 ( i %in% fixed_effects ) &&
                 ( length(numeric_levels) == 0)) {
                 split_reference <- unlist(strsplit(reference, "[,;]"))
                if (! i %in% split_reference ) {
                    stop(
                      paste("Please provide the reference for the variable '",
                        i, "' which includes more than 2 levels: ",
                        paste(as.character(mlevels), collapse=", "), ".", sep="")
                    )
                } else {
                    ref <- split_reference[match(i,split_reference)+1]
                    other_levels <- as.character(mlevels)[! as.character(mlevels) == ref]
                    metadata[,i] = factor(metadata[,i], levels=c(ref, other_levels))
                }
            }
        }       
 
        unfiltered_data <- data
        unfiltered_metadata <- metadata
        # covariate-DNA
        unfiltered_dnadata <- dnadata
        
        # require at least total samples * min prevalence values 
        # for each feature to be greater than min abundance
        logging::loginfo(
            "Filter data based on min abundance and min prevalence")
        total_samples <- nrow(unfiltered_data)
        logging::loginfo("Total samples in data: %d", total_samples)
        min_samples <- total_samples * min_prevalence
        logging::loginfo(
            paste("Min samples required with min abundance",
                "for a feature not to be filtered: %f"),
            min_samples
        )
        
        # Filter by abundance using zero as value for NAs
        data_zeros <- unfiltered_data
        data_zeros[is.na(data_zeros)] <- 0
        filtered_data <-
            unfiltered_data[, 
                colSums(data_zeros > min_abundance) > min_samples,
                drop = FALSE]
        total_filtered_features <-
            ncol(unfiltered_data) - ncol(filtered_data)
        logging::loginfo("Total filtered features: %d", total_filtered_features)
        filtered_feature_names <-
            setdiff(names(unfiltered_data), names(filtered_data))
        logging::loginfo("Filtered feature names from abundance and prevalence filtering: %s",
            toString(filtered_feature_names))
        
        #################################
        # Filter data based on variance #
        #################################
        
        sds <- apply(filtered_data, 2, sd)
        variance_filtered_data <- filtered_data[, which(sds > min_variance), drop = FALSE]
        variance_filtered_features <- ncol(filtered_data) - ncol(variance_filtered_data)
        logging::loginfo("Total filtered features with variance filtering: %d", variance_filtered_features)
        variance_filtered_feature_names <- setdiff(names(filtered_data), names(variance_filtered_data))
        logging::loginfo("Filtered feature names from variance filtering: %s",
                         toString(variance_filtered_feature_names))
        filtered_data <- variance_filtered_data
       
        ######################
        # Normalize features #
        ######################
        
        logging::loginfo(
            "Running selected normalization method: %s", normalization)
        filtered_data_norm <-
            normalizeFeatures(filtered_data, normalization = normalization)
        
        # covariate-DNA
        if (length(dnadata) > 1) {
            dnadata_norm <-
                normalizeFeatures(dnadata, normalization = normalization)
        }
        
        ################################
        # Standardize metadata, if set #
        ################################
        
        if (standardize) {
            logging::loginfo(
                "Applying z-score to standardize continuous metadata")
            metadata <- metadata %>% dplyr::mutate_if(is.numeric, scale)
        } else {
            logging::loginfo("Bypass z-score application to metadata")
        }
        
        ############################
        # Transform and run method #
        ############################
       
        # transform features
        logging::loginfo("Running selected transform method: %s", transform)
        # covariate-DNA
        if (transform == "PA") {
            filtered_data_norm_transformed <-
                transformFeatures_ext(filtered_data_norm, transformation = transform)
            if (length(dnadata) > 1) {
                logging::loginfo("Running selected transform method for covaraite abundance: %s", "LOG")
                dnadata_norm_transformed <- transformFeatures(dnadata_norm, transformation = "LOG")
            }
        } else {
            filtered_data_norm_transformed <-
                transformFeatures(filtered_data_norm, transformation = transform)
            if (length(dnadata) > 1) {
                dnadata_norm_transformed <- transformFeatures(dnadata_norm, transformation = transform)
            }
        }
        
		if (ncol(filtered_data_norm) < 1) { 
            all_outfile <- file.path(output, "all_results.tsv")
            mycolnames = c( 
                "feature",
                "metadata",
                "value",
                "coef",
                "stderr",
                "N", 
                "N.not.0",
                "pval",
                "qval")
            write.table(
                t(mycolnames),
                file = all_outfile,
                sep = "\t",
                quote = FALSE,
                col.names = FALSE,
                row.names = FALSE)

            residuals_file = file.path(output, "residuals.rds")
            file.create(residuals_file)
            return(NULL)
        }		

        # apply the method to the data with the correction
        logging::loginfo(
            "Running selected analysis method: %s", analysis_method)
        # covariate-DNA
        if (length(dnadata) > 1) {
            fit_data <-
                fit.dnadata(
                    filtered_data,
                    dnadata,
                    filtered_data_norm_transformed,
                    metadata,
                    dnadata_norm_transformed,
                    rna_dna_flt,
                    min_abundance,
                    min_samples,
                    analysis_method,
                    fixed_effects,
                    effects_names,
                    random_effects_formula = random_effects_formula,
                    correction = correction,
                    cores = cores
                )
        } else {
            fit_data <-
                fit.data(
                    filtered_data_norm_transformed,
                    metadata,
                    analysis_method,
                    formula = formula,
                    random_effects_formula = random_effects_formula,
                    correction = correction,
                    cores = cores
                )
        }

        #################################################################
        # Count the total values for each feature (untransformed space) #
        #################################################################
        
        logging::loginfo("Counting total values for each feature")
        fit_data$results$N <-
            apply(
                fit_data$results,
                1,
                FUN = function(x)
                    length(filtered_data_norm[, x[1]])
            )
        fit_data$results$N.not.zero <-
            apply(
                fit_data$results,
                1,
                FUN = function(x)
                    length(which(filtered_data_norm[, x[1]] > 0))
            )
        
        #########################
        # Write out the results #
        #########################
        
        ###########################
        # write residuals to file #
        ###########################
        
        residuals_file = file.path(output, "residuals.rds")
        # remove residuals file if already exists (since residuals append)
        if (file.exists(residuals_file)) {
            logging::logwarn(
                "Deleting existing residuals file: %s", residuals_file)
            unlink(residuals_file)
        }
        logging::loginfo("Writing residuals to file %s", residuals_file)
        saveRDS(fit_data$residuals, file = residuals_file)
        
        ###############################
        # write fitted values to file #
        ###############################
        
        fitted_file = file.path(output, "fitted.rds")
        # remove fitted file if already exists (since fitted append)
        if (file.exists(fitted_file)) {
          logging::logwarn(
            "Deleting existing fitted file: %s", fitted_file)
          unlink(fitted_file)
        }
        logging::loginfo("Writing fitted values to file %s", fitted_file)
        saveRDS(fit_data$fitted, file = fitted_file)
        
        ########################################################
        # write extracted random effects to file (if specified) #
        ########################################################
        
        if (!is.null(random_effects)) {
          ranef_file = file.path(output, "ranef.rds")
          # remove ranef file if already exists (since ranef append)
          if (file.exists(ranef_file)) {
            logging::logwarn(
              "Deleting existing ranef file: %s", ranef_file)
            unlink(ranef_file)
          }
          logging::loginfo("Writing extracted random effects to file %s", ranef_file)
          saveRDS(fit_data$ranef, file = ranef_file)
        }
        
        #############################
        # write all results to file #
        #############################
        
        results_file <- file.path(output, "all_results.tsv")
        # covariate-DNA
        results_file <- file.path(output, "all_results.tsv.tmp")
        logging::loginfo(
            "Writing all results to file (ordered by increasing q-values): %s",
            results_file)
        ordered_results <- fit_data$results[order(fit_data$results$qval), ]
        # Remove any that are NA for the q-value
        ordered_results <-
            ordered_results[!is.na(ordered_results$qval), ]
        write.table(
            ordered_results[c(
                "feature",
                "metadata",
                "value",
                "coef",
                "stderr",
                "N",
                "N.not.zero",
                "pval",
                "qval")],
            file = results_file,
            sep = "\t",
            quote = FALSE,
            col.names = c(
                "feature",
                "metadata",
                "value",
                "coef",
                "stderr",
                "N",
                "N.not.0",
                "pval",
                "qval"
            ),
            row.names = FALSE
        )
        
        ###########################################
        # write results passing threshold to file #
        ###########################################
        
        significant_results <-
            ordered_results[ordered_results$qval <= max_significance, ]
        significant_results_file <-
            file.path(output, "significant_results.tsv")
        # covariate-DNA
        significant_results_file <-
            file.path(output, "significant_results.tsv.tmp")
        logging::loginfo(
            paste("Writing the significant results",
                "(those which are less than or equal to the threshold",
                "of %f ) to file (ordered by increasing q-values): %s"),
            max_significance,
            significant_results_file
        )
        write.table(
            significant_results[c(
                "feature",
                "metadata",
                "value",
                "coef",
                "stderr",
                "N",
                "N.not.zero",
                "pval",
                "qval")],
            file = significant_results_file,
            sep = "\t",
            quote = FALSE,
            col.names = c(
                "feature",
                "metadata",
                "value",
                "coef",
                "stderr",
                "N",
                "N.not.0",
                "pval",
                "qval"
            ),
            row.names = FALSE
        )
        
        # covariate-DNA
        # filter out DNA covariate
        all_outfile <- file.path(output, "all_results.tsv")
        sig_outfile <- file.path(output, "significant_results.tsv")
        rerun_multiplicity_correction (results_file, all_outfile, sig_outfile, fixed_effects, max_significance, correction)
        significant_results_file <- sig_outfile 
        
        
        #######################################################
        # Create visualizations for results passing threshold #
        #######################################################
        
        if (plot_heatmap) {
            heatmap_file <- file.path(output, "heatmap.pdf")
            logging::loginfo(
                "Writing heatmap of significant results to file: %s",
                heatmap_file)
            save_heatmap(significant_results_file, heatmap_file, figures_folder,
                first_n = heatmap_first_n)
        }
        
        if (plot_scatter) {
            logging::loginfo(
                paste("Writing association plots",
                    "(one for each significant association)",
                    "to output folder: %s"),
                output
            )
            maaslin2_association_plots(
                unfiltered_metadata,
                filtered_data,
                significant_results_file,
                output,
                figures_folder)
        }
        
        return(fit_data)
    }

###########################################################################
# If running on the command line, get arguments and call function #
###########################################################################

# this evaluates to true if script is being called directly as an executable
if (identical(environment(), globalenv()) &&
        !length(grep("^source\\(", sys.calls()))) {
    # get command line options and positional arguments
    parsed_arguments = optparse::parse_args(options, 
        positional_arguments = TRUE)
    current_args <- parsed_arguments[["options"]]
    positional_args <- parsed_arguments[["args"]]
    # check three positional arguments are provided
    if (length(positional_args) != 3) {
        optparse::print_help(options)
        stop(
            paste("Please provide the required",
                "positional arguments",
                "<data.tsv> <metadata.tsv> <output_folder>")
        )
    }
    
    # call MTXmodel with the command line options
    fit_data <-
        MTXmodel(
            positional_args[1],
            positional_args[2],
            positional_args[3],
            current_args$min_abundance,
            current_args$min_prevalence,
            current_args$min_variance,
            current_args$normalization,
            current_args$transform,
            current_args$analysis_method,
            current_args$max_significance,
            current_args$random_effects,
            current_args$fixed_effects,
            current_args$correction,
            current_args$standardize,
            current_args$cores,
            current_args$plot_heatmap,
            current_args$plot_scatter,
            current_args$heatmap_first_n,
            current_args$reference,
            # covariate-DNA
            current_args$input_dnadata,
            current_args$rna_dna_flt
        )
}
biobakery/MTX_model documentation built on July 22, 2024, 11:13 p.m.