R/scone_main.R
In scone: Single Cell Overview of Normalized Expression data

#' Normalize Expression Data and Evaluate Normalization Performance
#' 
#' This function applies and evaluates a variety of normalization schemes
#' with respect to a specified SconeExperiment containing scRNA-Seq data.
#' Each normalization consists of three main steps: \itemize{ \item{Impute:}{
#' Replace observations of zeroes with expected expression values. }
#' \item{Scale:}{ Match sample-specific expression scales or quantiles. }
#' \item{Adjust:}{ Adjust for sample-level batch factors / unwanted variation.}
#' } Following completion of each step, the normalized expression matrix is
#' scored based on SCONE's data-driven evaluation criteria.
#' 
#' @param x a \code{\link{SconeExperiment}} object.
#' @param ... see specific S4 methods for additional arguments.
#' 
#' @param imputation list or function. (A list of) function(s) to be used for
#'   imputation. By default only scone::impute_null is included.
#' @param impute_args arguments passed to all imputation functions.
#' 
#' @param zero character. Zero-handling option, see Details.
#' 
#' @param scaling list or function. (A list of) function(s) to be used for
#'   scaling normalization step.
#'   
#' @param k_ruv numeric. The maximum number of factors of unwanted variation.
#'   Adjustment step models will include a range of 1 to k_ruv factors of
#'   unwanted variation. If 0, RUV adjustment will not be performed.
#' @param k_qc numeric. The maximum number of quality metric PCs.
#'   Adjustment step models will include a range of 1 to k_qc quality metric
#'   PCs. If 0, QC factor adjustment will not be performed.
#' @param adjust_bio character. If 'no', bio will not be included in Adjustment
#'   step models; if 'yes', both models with and without 'bio' will be run; if
#'   'force', only models with 'bio' will be run.
#' @param adjust_batch character. If 'no', batch will not be included in 
#'   Adjustment step models; if 'yes', both models with and without 'batch'
#'   will be run; if 'force', only models with 'batch' will be run.
#'   
#' @param evaluate logical. If FALSE the normalization methods will not be
#'   evaluated.
#' @param eval_pcs numeric. The number of principal components to use for
#'   evaluation. Ignored if evaluate=FALSE.
#' @param eval_proj function. Projection function for evaluation  (see 
#'   \code{\link{score_matrix}} for details). If NULL, PCA is used for 
#'   projection.
#' @param eval_proj_args list. List of arguments passed to projection function
#'   as eval_proj_args.
#' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam
#'   tightness evaluation. If an array of integers, largest average silhouette
#'   width (tightness) will be reported. 
#'   If NULL, tightness will be returned NA.
#' @param stratified_pam logical. If TRUE then maximum ASW for PAM_SIL is
#'   separately computed for each biological-cross-batch stratum (accepting
#'   NAs), and a weighted average is returned as PAM_SIL.
#' @param stratified_cor logical. If TRUE then cor metrics are separately
#'   computed for each biological-cross-batch stratum (accepts NAs), and
#'   weighted averages are returned for EXP_QC_COR, EXP_UV_COR, & EXP_WV_COR.
#'   Default FALSE.
#' @param stratified_rle logical. If TRUE then rle metrics are separately 
#'   computed for each biological-cross-batch stratum (accepts NAs), and
#'   weighted averages are returned for RLE_MED & RLE_IQR. Default FALSE.
#' @param verbose logical. If TRUE some messagges are printed.
#' @param run logical. If FALSE the normalization and evaluation are not run,
#'   but normalization parameters are returned in the output object for 
#'   inspection by the user.
#' @param return_norm character. If "no" the normalized values will not be
#'   returned with the output object. This will create a much smaller object
#'   and may be useful for large datasets and/or when many combinations are
#'   compared. If "in_memory" the normalized values will be returned as part
#'   of the output. If "hdf5" they will be written on file using the 
#'   \code{rhdf5} package.
#' @param hdf5file character. If \code{return_norm="hdf5"}, the name of the
#'   file onto which to save the normalized matrices.
#' @param bpparam object of class \code{bpparamClass} that specifies the
#'   back-end to be used for computations. See 
#'   \code{\link[BiocParallel]{bpparam}} for details.
#'   
#' @return A \code{\link{SconeExperiment}} object with the log-scaled
#'   normalized data matrix as elements of the \code{assays} slot, if
#'   \code{return_norm} is "in_memory", and with the performance 
#'   metrics and scores.
#'   
#' @details If \code{run=FALSE} only the \code{scone_params} slot of the
#'   output object is populated with a \code{data.frame}, each row 
#'   corresponding to a set of normalization parameters.
#'   
#' @details If \code{x} has a non-empty \code{scone_params} slot, only the
#'   subset of normalizations specified in \code{scone_params} are performed
#'   and evaluated.
#'   
#' @details The \code{zero} arguments supports 3 zero-handling options:
#' \itemize{ 
#' \item{none:}{ Default. No special zero-handling.}
#' \item{preadjust:}{ Restore prior zero observations to zero following
#' Impute and Scale steps.}
#' \item{postadjust:}{ Set prior zero observations and all negative
#' expression values to zero following the Adjust Step. }
#' \item{strong:}{ Apply both preadjust and postadjust options.}
#' }
#' 
#' @details Evaluation metrics are defined in \code{\link{score_matrix}}. Each
#'   metric is assigned a +/- signature for conversion to scores: Positive-
#'   signature metrics increase with improving performance, including BIO_SIL,
#'   PAM_SIL, and EXP_WV_COR. Negative-signature metrics decrease with
#'   improving performance, including BATCH_SIL, EXP_QC_COR, EXP_UV_COR,
#'   RLE_MED, and RLE_IQR. Scores are computed so that higer-performing methods
#'   are assigned higher scores.
#'   
#' @details Note that if one wants to include the unnormalized data in the 
#'   final comparison of normalized matrices, the identity function must be
#'   included in the scaling list argument. Analogously, if one wants to
#'   include non-imputed data in the comparison, the scone::impute_null
#'   function must be included.
#'   
#' @details If \code{return_norm="hdf5"}, the normalized matrices will be 
#'   written to the \code{hdf5file} file. This must be a string specifying (a 
#'   path to) a new file. If the file already exists, it will return error. In 
#'   this case, the \code{\link{SconeExperiment}} object will not contain the
#'   normalized counts.
#'   
#' @details If \code{return_norm="no"} the normalized matrices are computed to 
#'   copmute the scores and then discarded.
#'   
#' @details In all cases, the normalized matrices can be retrieved via the 
#'   \code{\link{get_normalized}} function.
#'   
#' @aliases scone scone,SconeExperiment-method
#'   
#' @name scone
#'   
#' @seealso \code{\link{get_normalized}}, \code{\link{get_design}}
#'   
#' @importFrom RUVSeq RUVg
#' @importFrom matrixStats rowMedians
#' @import BiocParallel
#' @importFrom graphics abline arrows barplot hist par plot text
#' @importFrom stats approx as.formula binomial coefficients contr.sum cor dist
#'   dnbinom fitted.values glm mad median model.matrix na.omit p.adjust pnorm 
#'   prcomp quantile quasibinomial sd
#' @importFrom utils capture.output
#' @importFrom rhdf5 h5createFile h5write.default h5write H5close
#' @importFrom rARPACK svds
#' @export
#'
#'
#' @examples
#' 
#' mat <- matrix(rpois(1000, lambda = 5), ncol=10)
#' colnames(mat) <- paste("X", 1:ncol(mat), sep="")
#' obj <- SconeExperiment(mat)
#' no_results <- scone(obj, scaling=list(none=identity,
#'            uq=UQ_FN, deseq=DESEQ_FN),
#'            run=FALSE, k_ruv=0, k_qc=0, eval_kclust=2)
#'            
#' results <- scone(obj, scaling=list(none=identity,
#'            uq=UQ_FN, deseq=DESEQ_FN),
#'            run=TRUE, k_ruv=0, k_qc=0, eval_kclust=2,
#'            bpparam = BiocParallel::SerialParam())
#'            
#' results_in_memory <- scone(obj, scaling=list(none=identity,
#'            uq=UQ_FN, deseq=DESEQ_FN),
#'            k_ruv=0, k_qc=0, eval_kclust=2,
#'            return_norm = "in_memory",
#'            bpparam = BiocParallel::SerialParam())
#' 

setMethod(
  f = "scone",
  signature = signature(x = "SconeExperiment"),
  definition = function(x, imputation=list(none=impute_null),
                        impute_args = NULL,
                        zero = c("none", "preadjust","postadjust","strong"),
                        scaling, k_ruv=5, k_qc=5,
                        adjust_bio=c("no", "yes", "force"),
                        adjust_batch=c("no", "yes", "force"),
                        run=TRUE, evaluate=TRUE, eval_pcs=3, 
                        eval_proj = NULL,
                        eval_proj_args = NULL, eval_kclust=2:10,
                        verbose=FALSE, stratified_pam = FALSE,
                        stratified_cor = FALSE,
                        stratified_rle = FALSE,
                        return_norm = c("no", "in_memory", "hdf5"),
                        hdf5file,
                        bpparam=BiocParallel::bpparam()) {

    zero <- match.arg(zero)
    rezero <- (zero %in% c("preadjust","strong"))
    fixzero <- (zero %in% c("postadjust","strong"))

    if(x@is_log) {
      stop("At the moment, scone is implemented only for non-log counts.")
    }
    
    if (any(apply(assay(x), 1, function(x)
      max(x) - min(x)) < 1e-3)) {
      stop("Expression data can't contain constant features, range < 1e-3.")
    }
    
    return_norm <- match.arg(return_norm)
    x@scone_run <- return_norm
    
    if(!is.null(impute_args)) {
      x@impute_args <- as.list(impute_args)
    }
    
    if(is.null(rownames(x))) {
      rownames(x) <- as.character(seq_len(NROW(x)))
    }
    if(is.null(colnames(x))) {
      colnames(x) <- paste0("Sample", seq_len(NCOL(x)))
    }
    
    if(return_norm == "hdf5") {
      if(missing(hdf5file)) {
        stop("If `return_norm='hdf5'`, `hdf5file` must be specified.")
      } else {
        if(BiocParallel::bpnworkers(bpparam) > 1) {
          stop(paste0("At the moment, `return_norm='hdf5'` does ",
                      "not support multicores."))
        }
        stopifnot(h5createFile(hdf5file))
        h5write(rownames(x), hdf5file, "genes")
        h5write(colnames(x), hdf5file, "samples")
        x@hdf5_pointer <- hdf5file
        H5close()
      }
    }
    
    if(!is.function(imputation)) {
      if(is.list(imputation)) {
        if(!all(sapply(imputation, is.function))) {
          stop("'imputation' must be a function or a list of functions.")
        }
        if(is.null(names(imputation))) {
          names(imputation) <- paste("imputation", 
                                     seq_along(imputation), 
                                     sep="")
        }
      } else {
        stop("'imputation' must be a function or a list of functions.")
      }
    }
    
    if(is.function(imputation)) {
      l <- list(imputation)
      names(l) <- deparse(substitute(imputation))
      imputation <- l
    }
    
    x@imputation_fn <- imputation
    
    if(!is.function(scaling)) {
      if(is.list(scaling)) {
        if(!all(sapply(scaling, is.function))) {
          stop("'scaling' must be a function or a list of functions.")
        }
        if(is.null(names(scaling))) {
          names(scaling) <- paste("scaling", seq_along(scaling), sep="")
        }
      } else {
        stop("'scaling' must be a function or a list of functions.")
      }
    }
    
    if(is.function(scaling)) {
      l <- list(scaling)
      names(l) <- deparse(substitute(scaling))
      scaling <- l
    }
    
    x@scaling_fn <- scaling
    
    if(k_ruv < 0) stop("'k_ruv' must be non-negative.")
    if(k_qc < 0) stop("'k_qc' must be non-negative.")
    
    if(k_ruv > 0) {
      if(length(x@which_negconruv) == 0) {
        stop("If k_ruv>0, negative controls must be specified.")
      } else {
        ruv_negcon <- get_negconruv(x)
      }
    }
    
    qc <- get_qc(x)
    
    if(k_qc > 0) {
      if(length(x@which_qc) == 0) {
        stop("If k_qc>0, QC metrics must be specified.")
      }
    }
    
    if(!is.null(qc)) {
      qc_pcs <- prcomp(qc, center=TRUE, scale=TRUE)$x
    } else {
      qc_pcs <- NULL
    }
    
    adjust_batch <- match.arg(adjust_batch)
    adjust_bio <- match.arg(adjust_bio)
    
    if(adjust_bio != "no") {
      if(length(x@which_bio) == 0) {
        stop("if adjust_bio is 'yes' or 'force', 'bio' must be specified.")
      }
    }
    bio <- get_bio(x)
    
    if(adjust_batch != "no") {
      if(length(x@which_batch) == 0) {
        stop("if adjust_batch is 'yes' or 'force', 'batch' must be specified.")
      }
    }
    batch <- get_batch(x)
    
    if(!is.null(batch)) {
      if(!is.null(bio)) {
        batch <- factor(batch, levels = unique(batch[order(bio)]))
      } else {
        batch <- factor(batch, levels = sort(levels(batch)))
      }
    }
    
    colData(x)[,x@which_batch] <- batch
    
    if(evaluate) {
      if(length(x@which_negconeval) == 0) {
        if(verbose) message(paste0("Negative controls will ",
                                   "not be used in evaluation (correlations ",
                                   "with negative controls will be returned",
                                   " as NA)"))
      }
      eval_negcon <- get_negconeval(x)
      
      if(eval_pcs > ncol(x)) {
        stop("'eval_pcs' must be less or equal than the number of samples.")
      }
      
      if(any(eval_kclust >= ncol(x))) {
        stop("'eval_kclust' must be less than the number of samples.")
      }
      
      if(stratified_rle) {
        if(is.null(bio) & is.null(batch)){
          stop("For stratified_rle, bio and/or batch must be specified")
        }
      }
      
      if(stratified_cor) {
        if(is.null(bio) & is.null(batch)){
          stop("For stratified_cor, bio and/or batch must be specified")
        }
      }
      
      if(!is.null(eval_kclust) & stratified_pam) {
        if(is.null(bio) & is.null(batch)){
          stop("For stratified_pam, bio and/or batch must be specified")
        }
        biobatch = as.factor(paste(bio,batch,sep = "_"))
        if(max(eval_kclust) >= min(table(biobatch))) {
          stop(paste0("For stratified_pam, max 'eval_kclust' ",
                      "must be smaller than bio-cross-batch ",
                      "stratum size"))
        }
      }
    }
    
    # Check Design: Confounded design or Nesting 
    # of Batch Condition and Biological Condition
    nested <- FALSE
    
    if(!is.null(bio) & !is.null(batch)) {
      tab <- table(bio, batch)
      if(all(colSums(tab>0)==1)){ # On
        if(nlevels(bio) == nlevels(batch)) {
          if(adjust_bio != "no" & adjust_batch != "no") {
            stop(paste0("Biological conditions ",
                        "and batches are confounded. They cannot both ",
                        "be included in the model, please set at least",
                        " one of 'adjust_bio' and 'adjust_batch' to 'no.'"))
          } else {
            warning(paste0("Biological conditions and batches are confounded.",
                           " Removing batch ",
                           "effects may remove true biological",
                           " signal and/or inferred differences may be ",
                           "inflated because of batch effects."))
          }
        } else {
          nested <- TRUE
          x@nested <- TRUE
          if(verbose) message("Detected nested design...")
        }
      }
    }
    
    # Step 0: compute the parameter matrix
    if(NROW(x@scone_params) == 0) {
      bi <- switch(adjust_bio,
                   no="no_bio",
                   yes=c("no_bio", "bio"),
                   force="bio"
      )
      
      ba <- switch(adjust_batch,
                   no="no_batch",
                   yes=c("no_batch", "batch"),
                   force="batch"
      )
      
      ruv_qc <- "no_uv"
      if(k_ruv > 0) {
        ruv_qc <- c(ruv_qc, paste("ruv_k=", 1:k_ruv, sep=""))
      }
      if(k_qc > 0) {
        ruv_qc <- c(ruv_qc, paste("qc_k=", 1:k_qc, sep=""))
      }
      
      params <- expand.grid(imputation_method=names(imputation),
                            scaling_method=names(scaling),
                            uv_factors=ruv_qc,
                            adjust_biology=bi,
                            adjust_batch=ba,
                            stringsAsFactors=FALSE
      )
      rownames(params) <- apply(params, 1, paste, collapse=',')
      
      ## if adjust_bio = "yes", meaning 
      ## both bio and no_bio, than remove bio when
      ## no batch or uv correction
      if(adjust_bio == "yes") {
        remove_params <- which(params$uv_factors=="no_uv" &
                                 params$adjust_batch=="no_batch" & 
                                 params$adjust_biology=="bio")
        params <- params[-remove_params,]
      }
      x@scone_params <- data.frame(params)
    } else {
      params <- x@scone_params
    }
    
    
    if(!run) {
      return(x)
    }
    
    ## add a check to make sure that design matrix is full rank
    
    ## Step 1: imputation
    if(verbose) message("Imputation step...")
    im_params <- unique(params[,1])
    
    imputed <- lapply(seq_along(im_params), 
                      function(i) imputation[[im_params[i]]](assay(x),
                                                             impute_args))
    names(imputed) <- im_params
    # output: a list of imputed matrices
    
    ## Step 2: scaling
    if(verbose) message("Scaling step...")
    sc_params <- unique(params[,1:2])
    
    scaled <- bplapply(seq_len(NROW(sc_params)), 
                       function(i) scaling[[sc_params[i,2]]](
                         imputed[[sc_params[i,1]]]
                       ), 
                       BPPARAM=bpparam)
    if(rezero){
      if(verbose) message("Re-zero step...")
      toz = assay(x) <= 0
      scaled <- lapply(scaled,function(x,toz){
        x[toz] = 0
        x
      },toz = toz)
      x@rezero <- TRUE
    }

    names(scaled) <- apply(sc_params, 1, paste, collapse="_")
    # output: a list of normalized expression matrices
    
    failing <- bplapply(scaled, function(x) any(is.na(x)), BPPARAM=bpparam)
    failing <- simplify2array(failing)
    if(any(failing)) {
      idx <- which(failing)
      failed_names = names(scaled)[idx]
      warning(paste(failed_names, paste0("returned at least one NA value.",
                                         " Consider removing it from the",
                                         " comparison.\n"),
                    paste0("In the meantime, failed ",
                           "methods have been filtered from the output.")))
      
      remove_params = paste(params$imputation_method, 
                            params$scaling_method, sep="_") %in% 
        failed_names
      params <- params[!remove_params,]
      
      scaled = scaled[!failing]
    }
    
    if(verbose) message("Computing RUV factors...")
    ## compute RUV factors
    if(k_ruv > 0) {
      ruv_factors <- bplapply(scaled,
                              function(x) RUVg(log1p(x), 
                                               ruv_negcon, 
                                               k_ruv, isLog=TRUE)$W,
                              BPPARAM=bpparam)
    }
    
    if(evaluate) {
      
      if(verbose) message("Computing factors for evaluation...")
      
      ## generate factors: eval_pcs pcs per gene set
      uv_factors <- wv_factors <- NULL
      eval_poscon <- get_poscon(x)
      
      if(!is.null(eval_negcon)) {
        uv_factors <- svds(scale(t(log1p(assay(x)[eval_negcon,])),
                                center = TRUE,
                                scale = TRUE),
                           k=eval_pcs, nu=eval_pcs, nv=0)$u
      }
      
      if(!is.null(eval_poscon)) {
        wv_factors <- svds(scale(t(log1p(assay(x)[eval_poscon,])),
                                center = TRUE,
                                scale = TRUE),
                           k=eval_pcs, nu=eval_pcs, nv=0)$u
      }
      
    }
    
    if(verbose) message("Factor adjustment and evaluation...")
    
    if(fixzero){
      if(verbose) message("...including re-zero step...")
      x@fixzero <- TRUE
    }
    
    outlist <- bplapply(seq_len(NROW(params)), function(i) {
      
      parsed <- .parse_row(params[i,], bio, batch, ruv_factors, qc_pcs)
      design_mat <- make_design(parsed$bio, parsed$batch, 
                                parsed$W,
                                nested=(nested &
                                          !is.null(parsed$bio) &
                                          !is.null(parsed$batch)))
      sc_name <- paste(params[i,1:2], collapse="_")
      adjusted <- lm_adjust(log1p(scaled[[sc_name]]), design_mat, batch)
      
      if(fixzero){
        toz = assay(x) <= 0
        adjusted[toz] = 0
        adjusted[adjusted <= 0] = 0
      }
      
      if(evaluate) {
        score <- score_matrix(expr=adjusted, eval_pcs = eval_pcs, 
                              eval_proj = eval_proj, 
                              eval_proj_args = eval_proj_args,
                              eval_kclust = eval_kclust,
                              bio = bio, batch = batch,
                              qc_factors = qc_pcs, 
                              uv_factors = uv_factors, wv_factors = wv_factors,
                              stratified_pam = stratified_pam,
                              stratified_cor = stratified_cor,
                              stratified_rle = stratified_rle,
                              is_log = TRUE)
      } else {
        score <- NULL
      }
      
      if(verbose) message(paste0("Processed: ", rownames(params)[i]))
      if(return_norm == "in_memory") {
        return(list(score=score, adjusted=adjusted))
      } else {
        if(return_norm == "hdf5") {
          h5write(expm1(adjusted), hdf5file, rownames(params)[i])
          H5close()
        }
        return(list(score=score))
      }
    }, BPPARAM=bpparam)
    
    if(return_norm == "in_memory") {
      adjusted <- lapply(outlist, function(x) expm1(x$adjusted))
      names(adjusted) <- apply(params, 1, paste, collapse=',')
      assays(x) <- adjusted
    } else {
      adjusted <- NULL
    }
    
    if(evaluate) {
      evaluation <- lapply(outlist, function(x) x$score)
      names(evaluation) <- apply(params, 1, paste, collapse=',')
      evaluation <- simplify2array(evaluation)
      
      scores <- evaluation * c(1, -1, 1,  #BIO_SIL,BATCH_SIL,PAM_SIL
                               -1, -1, 1, #EXP_QC_COR,EXP_UV_COR,EXP_WV_COR
                               -1, -1) #RLE_MED,RLE_IQR
      
      # Mean Score Rank per Method
      ranked_scores = apply(na.omit(scores),1,rank)
      if(is.null(dim(ranked_scores))){
        mean_score_rank <- mean(ranked_scores)
      }else{
        mean_score_rank <- rowMeans(ranked_scores)
      }
      
      scores <- cbind(t(scores), mean_score_rank)[order(mean_score_rank,
                                                        decreasing = TRUE), ,
                                                  drop=FALSE]
      
      evaluation <- t(evaluation[,order(mean_score_rank,
                                        decreasing = TRUE),
                                 drop=FALSE])
      
      if(return_norm == "in_memory") {
        adjusted <- adjusted[order(mean_score_rank, decreasing = TRUE)]
        assays(x) <- adjusted
      }
      
      params <- params[order(mean_score_rank, decreasing = TRUE),]
      x@scone_metrics <- evaluation
      x@scone_scores <- scores
    }
    
    x@scone_params <- data.frame(params)
    
    if(verbose) message("Done!")
    
    
    return(x)
  }
)