Nothing
#' Normalize Expression Data and Evaluate Normalization Performance
#'
#' This function applies and evaluates a variety of normalization schemes
#' with respect to a specified SconeExperiment containing scRNA-Seq data.
#' Each normalization consists of three main steps: \itemize{ \item{Impute:}{
#' Replace observations of zeroes with expected expression values. }
#' \item{Scale:}{ Match sample-specific expression scales or quantiles. }
#' \item{Adjust:}{ Adjust for sample-level batch factors / unwanted variation.}
#' } Following completion of each step, the normalized expression matrix is
#' scored based on SCONE's data-driven evaluation criteria.
#'
#' @param x a \code{\link{SconeExperiment}} object.
#' @param ... see specific S4 methods for additional arguments.
#'
#' @param imputation list or function. (A list of) function(s) to be used for
#' imputation. By default only scone::impute_null is included.
#' @param impute_args arguments passed to all imputation functions.
#'
#' @param zero character. Zero-handling option, see Details.
#'
#' @param scaling list or function. (A list of) function(s) to be used for
#' scaling normalization step.
#'
#' @param k_ruv numeric. The maximum number of factors of unwanted variation.
#' Adjustment step models will include a range of 1 to k_ruv factors of
#' unwanted variation. If 0, RUV adjustment will not be performed.
#' @param k_qc numeric. The maximum number of quality metric PCs.
#' Adjustment step models will include a range of 1 to k_qc quality metric
#' PCs. If 0, QC factor adjustment will not be performed.
#' @param adjust_bio character. If 'no', bio will not be included in Adjustment
#' step models; if 'yes', both models with and without 'bio' will be run; if
#' 'force', only models with 'bio' will be run.
#' @param adjust_batch character. If 'no', batch will not be included in
#' Adjustment step models; if 'yes', both models with and without 'batch'
#' will be run; if 'force', only models with 'batch' will be run.
#'
#' @param evaluate logical. If FALSE the normalization methods will not be
#' evaluated.
#' @param eval_pcs numeric. The number of principal components to use for
#' evaluation. Ignored if evaluate=FALSE.
#' @param eval_proj function. Projection function for evaluation (see
#' \code{\link{score_matrix}} for details). If NULL, PCA is used for
#' projection.
#' @param eval_proj_args list. List of arguments passed to projection function
#' as eval_proj_args.
#' @param eval_kclust numeric. The number of clusters (> 1) to be used for pam
#' tightness evaluation. If an array of integers, largest average silhouette
#' width (tightness) will be reported.
#' If NULL, tightness will be returned NA.
#' @param stratified_pam logical. If TRUE then maximum ASW for PAM_SIL is
#' separately computed for each biological-cross-batch stratum (accepting
#' NAs), and a weighted average is returned as PAM_SIL.
#' @param stratified_cor logical. If TRUE then cor metrics are separately
#' computed for each biological-cross-batch stratum (accepts NAs), and
#' weighted averages are returned for EXP_QC_COR, EXP_UV_COR, & EXP_WV_COR.
#' Default FALSE.
#' @param stratified_rle logical. If TRUE then rle metrics are separately
#' computed for each biological-cross-batch stratum (accepts NAs), and
#' weighted averages are returned for RLE_MED & RLE_IQR. Default FALSE.
#' @param verbose logical. If TRUE some messagges are printed.
#' @param run logical. If FALSE the normalization and evaluation are not run,
#' but normalization parameters are returned in the output object for
#' inspection by the user.
#' @param return_norm character. If "no" the normalized values will not be
#' returned with the output object. This will create a much smaller object
#' and may be useful for large datasets and/or when many combinations are
#' compared. If "in_memory" the normalized values will be returned as part
#' of the output. If "hdf5" they will be written on file using the
#' \code{rhdf5} package.
#' @param hdf5file character. If \code{return_norm="hdf5"}, the name of the
#' file onto which to save the normalized matrices.
#' @param bpparam object of class \code{bpparamClass} that specifies the
#' back-end to be used for computations. See
#' \code{\link[BiocParallel]{bpparam}} for details.
#'
#' @return A \code{\link{SconeExperiment}} object with the log-scaled
#' normalized data matrix as elements of the \code{assays} slot, if
#' \code{return_norm} is "in_memory", and with the performance
#' metrics and scores.
#'
#' @details If \code{run=FALSE} only the \code{scone_params} slot of the
#' output object is populated with a \code{data.frame}, each row
#' corresponding to a set of normalization parameters.
#'
#' @details If \code{x} has a non-empty \code{scone_params} slot, only the
#' subset of normalizations specified in \code{scone_params} are performed
#' and evaluated.
#'
#' @details The \code{zero} arguments supports 3 zero-handling options:
#' \itemize{
#' \item{none:}{ Default. No special zero-handling.}
#' \item{preadjust:}{ Restore prior zero observations to zero following
#' Impute and Scale steps.}
#' \item{postadjust:}{ Set prior zero observations and all negative
#' expression values to zero following the Adjust Step. }
#' \item{strong:}{ Apply both preadjust and postadjust options.}
#' }
#'
#' @details Evaluation metrics are defined in \code{\link{score_matrix}}. Each
#' metric is assigned a +/- signature for conversion to scores: Positive-
#' signature metrics increase with improving performance, including BIO_SIL,
#' PAM_SIL, and EXP_WV_COR. Negative-signature metrics decrease with
#' improving performance, including BATCH_SIL, EXP_QC_COR, EXP_UV_COR,
#' RLE_MED, and RLE_IQR. Scores are computed so that higer-performing methods
#' are assigned higher scores.
#'
#' @details Note that if one wants to include the unnormalized data in the
#' final comparison of normalized matrices, the identity function must be
#' included in the scaling list argument. Analogously, if one wants to
#' include non-imputed data in the comparison, the scone::impute_null
#' function must be included.
#'
#' @details If \code{return_norm="hdf5"}, the normalized matrices will be
#' written to the \code{hdf5file} file. This must be a string specifying (a
#' path to) a new file. If the file already exists, it will return error. In
#' this case, the \code{\link{SconeExperiment}} object will not contain the
#' normalized counts.
#'
#' @details If \code{return_norm="no"} the normalized matrices are computed to
#' copmute the scores and then discarded.
#'
#' @details In all cases, the normalized matrices can be retrieved via the
#' \code{\link{get_normalized}} function.
#'
#' @aliases scone scone,SconeExperiment-method
#'
#' @name scone
#'
#' @seealso \code{\link{get_normalized}}, \code{\link{get_design}}
#'
#' @importFrom RUVSeq RUVg
#' @importFrom matrixStats rowMedians
#' @import BiocParallel
#' @importFrom graphics abline arrows barplot hist par plot text
#' @importFrom stats approx as.formula binomial coefficients contr.sum cor dist
#' dnbinom fitted.values glm mad median model.matrix na.omit p.adjust pnorm
#' prcomp quantile quasibinomial sd
#' @importFrom utils capture.output
#' @importFrom rhdf5 h5createFile h5write.default h5write H5close
#' @importFrom rARPACK svds
#' @export
#'
#'
#' @examples
#'
#' mat <- matrix(rpois(1000, lambda = 5), ncol=10)
#' colnames(mat) <- paste("X", 1:ncol(mat), sep="")
#' obj <- SconeExperiment(mat)
#' no_results <- scone(obj, scaling=list(none=identity,
#' uq=UQ_FN, deseq=DESEQ_FN),
#' run=FALSE, k_ruv=0, k_qc=0, eval_kclust=2)
#'
#' results <- scone(obj, scaling=list(none=identity,
#' uq=UQ_FN, deseq=DESEQ_FN),
#' run=TRUE, k_ruv=0, k_qc=0, eval_kclust=2,
#' bpparam = BiocParallel::SerialParam())
#'
#' results_in_memory <- scone(obj, scaling=list(none=identity,
#' uq=UQ_FN, deseq=DESEQ_FN),
#' k_ruv=0, k_qc=0, eval_kclust=2,
#' return_norm = "in_memory",
#' bpparam = BiocParallel::SerialParam())
#'
setMethod(
f = "scone",
signature = signature(x = "SconeExperiment"),
definition = function(x, imputation=list(none=impute_null),
impute_args = NULL,
zero = c("none", "preadjust","postadjust","strong"),
scaling, k_ruv=5, k_qc=5,
adjust_bio=c("no", "yes", "force"),
adjust_batch=c("no", "yes", "force"),
run=TRUE, evaluate=TRUE, eval_pcs=3,
eval_proj = NULL,
eval_proj_args = NULL, eval_kclust=2:10,
verbose=FALSE, stratified_pam = FALSE,
stratified_cor = FALSE,
stratified_rle = FALSE,
return_norm = c("no", "in_memory", "hdf5"),
hdf5file,
bpparam=BiocParallel::bpparam()) {
zero <- match.arg(zero)
rezero <- (zero %in% c("preadjust","strong"))
fixzero <- (zero %in% c("postadjust","strong"))
if(x@is_log) {
stop("At the moment, scone is implemented only for non-log counts.")
}
if (any(apply(assay(x), 1, function(x)
max(x) - min(x)) < 1e-3)) {
stop("Expression data can't contain constant features, range < 1e-3.")
}
return_norm <- match.arg(return_norm)
x@scone_run <- return_norm
if(!is.null(impute_args)) {
x@impute_args <- as.list(impute_args)
}
if(is.null(rownames(x))) {
rownames(x) <- as.character(seq_len(NROW(x)))
}
if(is.null(colnames(x))) {
colnames(x) <- paste0("Sample", seq_len(NCOL(x)))
}
if(return_norm == "hdf5") {
if(missing(hdf5file)) {
stop("If `return_norm='hdf5'`, `hdf5file` must be specified.")
} else {
if(BiocParallel::bpnworkers(bpparam) > 1) {
stop(paste0("At the moment, `return_norm='hdf5'` does ",
"not support multicores."))
}
stopifnot(h5createFile(hdf5file))
h5write(rownames(x), hdf5file, "genes")
h5write(colnames(x), hdf5file, "samples")
x@hdf5_pointer <- hdf5file
H5close()
}
}
if(!is.function(imputation)) {
if(is.list(imputation)) {
if(!all(sapply(imputation, is.function))) {
stop("'imputation' must be a function or a list of functions.")
}
if(is.null(names(imputation))) {
names(imputation) <- paste("imputation",
seq_along(imputation),
sep="")
}
} else {
stop("'imputation' must be a function or a list of functions.")
}
}
if(is.function(imputation)) {
l <- list(imputation)
names(l) <- deparse(substitute(imputation))
imputation <- l
}
x@imputation_fn <- imputation
if(!is.function(scaling)) {
if(is.list(scaling)) {
if(!all(sapply(scaling, is.function))) {
stop("'scaling' must be a function or a list of functions.")
}
if(is.null(names(scaling))) {
names(scaling) <- paste("scaling", seq_along(scaling), sep="")
}
} else {
stop("'scaling' must be a function or a list of functions.")
}
}
if(is.function(scaling)) {
l <- list(scaling)
names(l) <- deparse(substitute(scaling))
scaling <- l
}
x@scaling_fn <- scaling
if(k_ruv < 0) stop("'k_ruv' must be non-negative.")
if(k_qc < 0) stop("'k_qc' must be non-negative.")
if(k_ruv > 0) {
if(length(x@which_negconruv) == 0) {
stop("If k_ruv>0, negative controls must be specified.")
} else {
ruv_negcon <- get_negconruv(x)
}
}
qc <- get_qc(x)
if(k_qc > 0) {
if(length(x@which_qc) == 0) {
stop("If k_qc>0, QC metrics must be specified.")
}
}
if(!is.null(qc)) {
qc_pcs <- prcomp(qc, center=TRUE, scale=TRUE)$x
} else {
qc_pcs <- NULL
}
adjust_batch <- match.arg(adjust_batch)
adjust_bio <- match.arg(adjust_bio)
if(adjust_bio != "no") {
if(length(x@which_bio) == 0) {
stop("if adjust_bio is 'yes' or 'force', 'bio' must be specified.")
}
}
bio <- get_bio(x)
if(adjust_batch != "no") {
if(length(x@which_batch) == 0) {
stop("if adjust_batch is 'yes' or 'force', 'batch' must be specified.")
}
}
batch <- get_batch(x)
if(!is.null(batch)) {
if(!is.null(bio)) {
batch <- factor(batch, levels = unique(batch[order(bio)]))
} else {
batch <- factor(batch, levels = sort(levels(batch)))
}
}
colData(x)[,x@which_batch] <- batch
if(evaluate) {
if(length(x@which_negconeval) == 0) {
if(verbose) message(paste0("Negative controls will ",
"not be used in evaluation (correlations ",
"with negative controls will be returned",
" as NA)"))
}
eval_negcon <- get_negconeval(x)
if(eval_pcs > ncol(x)) {
stop("'eval_pcs' must be less or equal than the number of samples.")
}
if(any(eval_kclust >= ncol(x))) {
stop("'eval_kclust' must be less than the number of samples.")
}
if(stratified_rle) {
if(is.null(bio) & is.null(batch)){
stop("For stratified_rle, bio and/or batch must be specified")
}
}
if(stratified_cor) {
if(is.null(bio) & is.null(batch)){
stop("For stratified_cor, bio and/or batch must be specified")
}
}
if(!is.null(eval_kclust) & stratified_pam) {
if(is.null(bio) & is.null(batch)){
stop("For stratified_pam, bio and/or batch must be specified")
}
biobatch = as.factor(paste(bio,batch,sep = "_"))
if(max(eval_kclust) >= min(table(biobatch))) {
stop(paste0("For stratified_pam, max 'eval_kclust' ",
"must be smaller than bio-cross-batch ",
"stratum size"))
}
}
}
# Check Design: Confounded design or Nesting
# of Batch Condition and Biological Condition
nested <- FALSE
if(!is.null(bio) & !is.null(batch)) {
tab <- table(bio, batch)
if(all(colSums(tab>0)==1)){ # On
if(nlevels(bio) == nlevels(batch)) {
if(adjust_bio != "no" & adjust_batch != "no") {
stop(paste0("Biological conditions ",
"and batches are confounded. They cannot both ",
"be included in the model, please set at least",
" one of 'adjust_bio' and 'adjust_batch' to 'no.'"))
} else {
warning(paste0("Biological conditions and batches are confounded.",
" Removing batch ",
"effects may remove true biological",
" signal and/or inferred differences may be ",
"inflated because of batch effects."))
}
} else {
nested <- TRUE
x@nested <- TRUE
if(verbose) message("Detected nested design...")
}
}
}
# Step 0: compute the parameter matrix
if(NROW(x@scone_params) == 0) {
bi <- switch(adjust_bio,
no="no_bio",
yes=c("no_bio", "bio"),
force="bio"
)
ba <- switch(adjust_batch,
no="no_batch",
yes=c("no_batch", "batch"),
force="batch"
)
ruv_qc <- "no_uv"
if(k_ruv > 0) {
ruv_qc <- c(ruv_qc, paste("ruv_k=", 1:k_ruv, sep=""))
}
if(k_qc > 0) {
ruv_qc <- c(ruv_qc, paste("qc_k=", 1:k_qc, sep=""))
}
params <- expand.grid(imputation_method=names(imputation),
scaling_method=names(scaling),
uv_factors=ruv_qc,
adjust_biology=bi,
adjust_batch=ba,
stringsAsFactors=FALSE
)
rownames(params) <- apply(params, 1, paste, collapse=',')
## if adjust_bio = "yes", meaning
## both bio and no_bio, than remove bio when
## no batch or uv correction
if(adjust_bio == "yes") {
remove_params <- which(params$uv_factors=="no_uv" &
params$adjust_batch=="no_batch" &
params$adjust_biology=="bio")
params <- params[-remove_params,]
}
x@scone_params <- data.frame(params)
} else {
params <- x@scone_params
}
if(!run) {
return(x)
}
## add a check to make sure that design matrix is full rank
## Step 1: imputation
if(verbose) message("Imputation step...")
im_params <- unique(params[,1])
imputed <- lapply(seq_along(im_params),
function(i) imputation[[im_params[i]]](assay(x),
impute_args))
names(imputed) <- im_params
# output: a list of imputed matrices
## Step 2: scaling
if(verbose) message("Scaling step...")
sc_params <- unique(params[,1:2])
scaled <- bplapply(seq_len(NROW(sc_params)),
function(i) scaling[[sc_params[i,2]]](
imputed[[sc_params[i,1]]]
),
BPPARAM=bpparam)
if(rezero){
if(verbose) message("Re-zero step...")
toz = assay(x) <= 0
scaled <- lapply(scaled,function(x,toz){
x[toz] = 0
x
},toz = toz)
x@rezero <- TRUE
}
names(scaled) <- apply(sc_params, 1, paste, collapse="_")
# output: a list of normalized expression matrices
failing <- bplapply(scaled, function(x) any(is.na(x)), BPPARAM=bpparam)
failing <- simplify2array(failing)
if(any(failing)) {
idx <- which(failing)
failed_names = names(scaled)[idx]
warning(paste(failed_names, paste0("returned at least one NA value.",
" Consider removing it from the",
" comparison.\n"),
paste0("In the meantime, failed ",
"methods have been filtered from the output.")))
remove_params = paste(params$imputation_method,
params$scaling_method, sep="_") %in%
failed_names
params <- params[!remove_params,]
scaled = scaled[!failing]
}
if(verbose) message("Computing RUV factors...")
## compute RUV factors
if(k_ruv > 0) {
ruv_factors <- bplapply(scaled,
function(x) RUVg(log1p(x),
ruv_negcon,
k_ruv, isLog=TRUE)$W,
BPPARAM=bpparam)
}
if(evaluate) {
if(verbose) message("Computing factors for evaluation...")
## generate factors: eval_pcs pcs per gene set
uv_factors <- wv_factors <- NULL
eval_poscon <- get_poscon(x)
if(!is.null(eval_negcon)) {
uv_factors <- svds(scale(t(log1p(assay(x)[eval_negcon,])),
center = TRUE,
scale = TRUE),
k=eval_pcs, nu=eval_pcs, nv=0)$u
}
if(!is.null(eval_poscon)) {
wv_factors <- svds(scale(t(log1p(assay(x)[eval_poscon,])),
center = TRUE,
scale = TRUE),
k=eval_pcs, nu=eval_pcs, nv=0)$u
}
}
if(verbose) message("Factor adjustment and evaluation...")
if(fixzero){
if(verbose) message("...including re-zero step...")
x@fixzero <- TRUE
}
outlist <- bplapply(seq_len(NROW(params)), function(i) {
parsed <- .parse_row(params[i,], bio, batch, ruv_factors, qc_pcs)
design_mat <- make_design(parsed$bio, parsed$batch,
parsed$W,
nested=(nested &
!is.null(parsed$bio) &
!is.null(parsed$batch)))
sc_name <- paste(params[i,1:2], collapse="_")
adjusted <- lm_adjust(log1p(scaled[[sc_name]]), design_mat, batch)
if(fixzero){
toz = assay(x) <= 0
adjusted[toz] = 0
adjusted[adjusted <= 0] = 0
}
if(evaluate) {
score <- score_matrix(expr=adjusted, eval_pcs = eval_pcs,
eval_proj = eval_proj,
eval_proj_args = eval_proj_args,
eval_kclust = eval_kclust,
bio = bio, batch = batch,
qc_factors = qc_pcs,
uv_factors = uv_factors, wv_factors = wv_factors,
stratified_pam = stratified_pam,
stratified_cor = stratified_cor,
stratified_rle = stratified_rle,
is_log = TRUE)
} else {
score <- NULL
}
if(verbose) message(paste0("Processed: ", rownames(params)[i]))
if(return_norm == "in_memory") {
return(list(score=score, adjusted=adjusted))
} else {
if(return_norm == "hdf5") {
h5write(expm1(adjusted), hdf5file, rownames(params)[i])
H5close()
}
return(list(score=score))
}
}, BPPARAM=bpparam)
if(return_norm == "in_memory") {
adjusted <- lapply(outlist, function(x) expm1(x$adjusted))
names(adjusted) <- apply(params, 1, paste, collapse=',')
assays(x) <- adjusted
} else {
adjusted <- NULL
}
if(evaluate) {
evaluation <- lapply(outlist, function(x) x$score)
names(evaluation) <- apply(params, 1, paste, collapse=',')
evaluation <- simplify2array(evaluation)
scores <- evaluation * c(1, -1, 1, #BIO_SIL,BATCH_SIL,PAM_SIL
-1, -1, 1, #EXP_QC_COR,EXP_UV_COR,EXP_WV_COR
-1, -1) #RLE_MED,RLE_IQR
# Mean Score Rank per Method
ranked_scores = apply(na.omit(scores),1,rank)
if(is.null(dim(ranked_scores))){
mean_score_rank <- mean(ranked_scores)
}else{
mean_score_rank <- rowMeans(ranked_scores)
}
scores <- cbind(t(scores), mean_score_rank)[order(mean_score_rank,
decreasing = TRUE), ,
drop=FALSE]
evaluation <- t(evaluation[,order(mean_score_rank,
decreasing = TRUE),
drop=FALSE])
if(return_norm == "in_memory") {
adjusted <- adjusted[order(mean_score_rank, decreasing = TRUE)]
assays(x) <- adjusted
}
params <- params[order(mean_score_rank, decreasing = TRUE),]
x@scone_metrics <- evaluation
x@scone_scores <- scores
}
x@scone_params <- data.frame(params)
if(verbose) message("Done!")
return(x)
}
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.