R/fit.R

Defines functions fit.data

# Load Required Packages
for (lib in c(
    'dplyr',
    'pbapply',
    'lmerTest',
    'car',
    'parallel',
    'MuMIn',
    'glmmTMB',
    'MASS',
    'cplm',
    'pscl'
)) {
    suppressPackageStartupMessages(require(lib, character.only = TRUE))
}

# fit the data using the model selected and applying the correction
fit.data <-
    function(
        features,
        metadata,
        model,
        formula = NULL,
        random_effects_formula = NULL,
        correction = "BH",
        cores = 1) {

        # set the formula default to all fixed effects if not provided
        if (is.null(formula))
            formula <-
                as.formula(paste(
                    "expr ~ ", 
                    paste(colnames(metadata), 
                    collapse = "+")))

        if (!(is.null(random_effects_formula))) {
            formula <-
                paste(
                    '. ~', 
                    paste(all.vars(formula)[-1], collapse = ' + '), 
                    '.', 
                    sep = ' + ')
	    formula <- update(random_effects_formula, formula)
	}
        
        #############################################################
        # Determine the function and summary for the model selected #
        #############################################################
        
        if (model == "LM") {
            if (is.null(random_effects_formula)) {
                model_function <-
                    function(formula, data, na.action) {
                        return(glm(
                            formula,
                            data = data,
                            family = 'gaussian',
                            na.action = na.action
                        ))
                    }
                summary_function <- function(fit) {
                    lm_summary <- summary(fit)$coefficients
                    para <- as.data.frame(lm_summary)[-1, -3]
                    para$name <- rownames(lm_summary)[-1]
                    return(para)
                }
            } else {
                model_function <-
                    function(formula, data, na.action) {
                        return(lmerTest::lmer(
                            formula, 
                            data = data, 
                            na.action = na.action))
                    }
                summary_function <- function(fit) {
                    lm_summary <- coef(summary(fit))
                    para <- as.data.frame(lm_summary)[-1, -c(3:4)]
                    para$name <- rownames(lm_summary)[-1]
                    return(para)
                }
            }
        }
        
        
        if (model == "SLM") {
            # simple "lm" linear model
            if (is.null(random_effects_formula)) {
                model_function <-
                    function(formula, data, na.action) {
                        return(lm(
                            formula,
                            data = data,
                            family = 'gaussian',
                            na.action = na.action
                        ))
                    }
                summary_function <- function(fit) {
                    lm_summary <- summary(fit)$coefficients
                    r2 <- summary(fit)$r.squared
                    para <- as.data.frame(lm_summary)[-1, -3]
                    para$name <- rownames(lm_summary)[-1]
                    para$r2 <- r2
                    return(para)
                }
            } else {
                #need to be tested
                model_function <-
                    function(formula, data, na.action) {
                        return(lmerTest::lmer(
                            formula, 
                            data = data, 
                            na.action = na.action))
                    }
                summary_function <- function(fit) {
                    lm_summary <- coef(summary(fit))
                    para <- as.data.frame(lm_summary)[-1, -c(3:4)]
                    para$name <- rownames(lm_summary)[-1]
                    para$r2 <- MuMIn::r.squaredGLMM(fit)
                    return(para)
                }
            }
        }
       
        if (model == "CPLM") {
            if (is.null(random_effects_formula)) {
                model_function <- cplm::cpglm
                summary_function <- function(fit) {
                    cplm_out <-
                        capture.output(
                            cplm_summary <- cplm::summary(fit)$coefficients)
                    para <- as.data.frame(cplm_summary)[-1, -3]
                    para$name <- rownames(cplm_summary)[-1]
                    logging::logdebug(
                        "Summary output\n%s", 
                        paste(cplm_out, collapse = "\n"))
                    return(para)
                }
            } else {
	        model_function <-
                    function(formula, data, na.action) {
                        return(glmmTMB::glmmTMB(
                            formula,
                            data = data,
                            family=glmmTMB::tweedie(link = "log"),
                            ziformula = ~0,
                            na.action = na.action
                        ))
                    }
                summary_function <- function(fit) {
		    glmmTMB_summary <- coef(summary(fit))
	            para <- as.data.frame(glmmTMB_summary$cond)[-1, -3]; 
		    para$name <- rownames(glmmTMB_summary$cond)[-1];
                    return(para)
                }
	    }
        }
        
        if (model == "NEGBIN") {
            if (is.null(random_effects_formula)) {
                model_function <- MASS::glm.nb
                summary_function <- function(fit) {
                    glm_summary <- summary(fit)$coefficients
                    para <- as.data.frame(glm_summary)[-1, -3]
                    para$name <- rownames(glm_summary)[-1]
                    return(para)
                }
            } else {
	        model_function <-
                    function(formula, data, na.action) {
                        return(glmmTMB::glmmTMB(
                            formula,
                            data = data,
                            family=glmmTMB::nbinom2(link = "log"),
			    ziformula = ~0,
                            na.action = na.action
                        ))
                    }
                summary_function <- function(fit) {
		    glmmTMB_summary <- coef(summary(fit))
	            para <- as.data.frame(glmmTMB_summary$cond)[-1, -3]; 
		    para$name <- rownames(glmmTMB_summary$cond)[-1];
                    return(para)
                }
	    }
        }
        
        if (model == "ZICP") {
            if (is.null(random_effects_formula)) {
                model_function <- cplm::zcpglm
                summary_function <- function(fit) {
                    zicp_out <- capture.output(
                        zicp_summary <- cplm::summary(fit)$coefficients$tweedie)
                    para <- as.data.frame(zicp_summary)[-1, -3]
                    para$name <- rownames(zicp_summary)[-1]
                    logging::logdebug(
                        "Summary output\n%s", 
                        paste(zicp_out, collapse = "\n"))
                    return(para)
                }
            } else {
	        model_function <-
                    function(formula, data, na.action) {
                        return(glmmTMB::glmmTMB(
                            formula,
                            data = data,
                            family=glmmTMB::tweedie(link = "log"),
			    ziformula = ~1,
                            na.action = na.action
                        ))
                    }
                summary_function <- function(fit) {
		    glmmTMB_summary <- coef(summary(fit))
	            para <- as.data.frame(glmmTMB_summary$cond)[-1, -3]
		    para$name <- rownames(glmmTMB_summary$cond)[-1]
                    return(para)
                }
	    }
        }
        
        if (model == "ZINB") {
            if (is.null(random_effects_formula)) {
                model_function <-
                    function(formula, data, na.action) {
                        return(pscl::zeroinfl(
				formula,
				data = data,
				dist = "negbin",
				na.action = na.action))
                     }
                summary_function <- function(fit) {
                    pscl_summary <- summary(fit)$coefficients$count
                    para <-as.data.frame(pscl_summary)[-c(1, (ncol(metadata) + 2)), -3]
	            para$name <- rownames(pscl_summary)[-c(1, (ncol(metadata) + 2))]
                    return(para)
                }
            } else {
	        model_function <-
                    function(formula, data, na.action) {
                        return(glmmTMB::glmmTMB(
                            formula,
                            data = data,
                            family=glmmTMB::nbinom2(link = "log"),
			    ziformula = ~1,
                            na.action = na.action
                        ))
                    }
                summary_function <- function(fit) {
		    glmmTMB_summary <- coef(summary(fit))
	            para <- as.data.frame(glmmTMB_summary$cond)[-1, -3]
		    para$name <- rownames(glmmTMB_summary$cond)[-1]
                    return(para)
                }
            }
	}
        
        #######################################
        # Init cluster for parallel computing #
        #######################################
        
        cluster <- NULL
        if (cores > 1)
        {
            logging::loginfo("Creating cluster of %s R processes", cores)
            cluster <- parallel::makeCluster(cores)
        }
        
        ##############################
        # Apply per-feature modeling #
        ##############################
        outputs <-
            pbapply::pblapply(seq_len(ncol(features)), cl = cluster, function(x) {
                # Extract Features One by One
                featuresVector <- features[, x]
                
                # Fit Model
                logging::loginfo(
                    "Fitting model to feature number %d, %s",
                    x,
                    colnames(features)[x])
                dat_sub <-
                    data.frame(expr = as.numeric(featuresVector), metadata)
                fit <- tryCatch({
                    fit1 <-
                        model_function(
                            formula, 
                            data = dat_sub, 
                            na.action = na.exclude)
                }, error = function(err) {
                    fit1 <-
                        try({
                            model_function(
                                formula, 
                                data = dat_sub, 
                                na.action = na.exclude)
                        })
                    return(fit1)
                })
                
                # Gather Output
                output <- list()
                if (all(!inherits(fit, "try-error"))) {
                    output$para <- summary_function(fit)
                    output$residuals <- residuals(fit)
                }
                else{
                    logging::logwarn(paste(
                        "Fitting problem for feature", 
                        x, 
                        "returning NA"))
                    output$para <-
                        as.data.frame(matrix(NA, 
                            nrow = ncol(metadata), ncol = 3))
                    output$para$name <- colnames(metadata)
                    output$residuals <- NA
                }
                if (model == 'SLM')
                    colnames(output$para) <-
                        c('coef', 'stderr' , 'pval', 'name', 'r2')
                else
                    colnames(output$para) <- 
                        c('coef', 'stderr' , 'pval', 'name')
                output$para$feature <- colnames(features)[x]
                return(output)
            })
        
        # stop the cluster
        if (!is.null(cluster))
            parallel::stopCluster(cluster)
        
        # bind the results for each feature
        paras <-
            do.call(rbind, lapply(outputs, function(x) {
                return(x$para)
            }))
        residuals <-
            do.call(rbind, lapply(outputs, function(x) {
                return(x$residuals)
            }))
        row.names(residuals) <- colnames(features)
        
        ################################
        # Apply correction to p-values #
        ################################
        
        paras$qval <- as.numeric(p.adjust(paras$pval, method = correction))
        
        #####################################################
        # Determine the metadata names from the model names #
        #####################################################
        
        metadata_names <- colnames(metadata)
        # order the metadata names by decreasing length
        metadata_names_ordered <-
            metadata_names[order(
                nchar(metadata_names), decreasing = TRUE)]
        # find the metadata name based on the match 
        # to the beginning of the string
        extract_metadata_name <- function(name) {
            return(metadata_names_ordered[mapply(
                startsWith, 
                name, 
                metadata_names_ordered)][1])
        }
        paras$metadata <- unlist(lapply(paras$name, extract_metadata_name))
        # compute the value as the model contrast minus metadata
        paras$value <-
            mapply(function(x, y) {
                if (x == y)
                    x
                else
                    gsub(x, "", y)
            }, paras$metadata, paras$name)
        
        ##############################
        # Sort by decreasing q-value #
        ##############################
        
        paras <- paras[order(paras$qval, decreasing = FALSE), ]
        paras <-
            dplyr::select(
                paras,
                c('feature', 'metadata', 'value'),
                dplyr::everything())
        rownames(paras)<-NULL
        return(list("results" = paras, "residuals" = residuals))
    }

Try the Maaslin2 package in your browser

Any scripts or data that you put into this service are public.

Maaslin2 documentation built on Nov. 8, 2020, 6:31 p.m.