This vignette explores the design of the normalization workflow and functions in hermes.
normalize()
is a generic function, so we just want to create a new method for our HermesData
class.qcfilterApply
argument - the filtering will be done outside this method so that it is transparent to the user.control_normalize <- function(log = TRUE, lib_sizes = NULL, prior_count = 1) { assert_that( is.flag(log), is.null(lib_sizes) || utils.nest::is_integer_vector(lib_sizes), is.number(prior_count) && prior_count >= 0 ) list( log = log, lib_sizes = lib_sizes, prior_count = prior_count ) } cont <- control_normalize()
Just to simplify our life.
is_hermes_data <- function(object) { is_class(object, "AnyHermesData") } h <- hermes_data is_hermes_data(h)
For detailed background on the normalization methods, see here.
edgeR::cpm()
.h_cpm <- function(object, control) { assert_that( is_hermes_data(object), utils.nest::is_fully_named_list(control) ) edgeR::cpm( y = counts(object), lib.size = control$lib_sizes, log = control$log, prior.count = control$prior_count ) } counts_cpm <- h_cpm(h, cont) str(counts_cpm)
edgeR::rpkm()
.h_rpkm <- function(object, control) { assert_that( is_hermes_data(object), utils.nest::is_fully_named_list(control), noNA(rowData(object)$WidthBP) ) edgeR::rpkm( y = counts(object), gene.length = rowData(object)$WidthBP, lib.size = control$lib_sizes, log = control$log, prior.count = control$prior_count ) } counts_rpkm <- h_rpkm(h, cont) str(counts_rpkm)
h_tpm <- function(object, control) { rpkm <- h_rpkm(object, control_normalize(log = FALSE)) rpkm_sums <- colSums(rpkm, na.rm = TRUE) tpm <- sweep(rpkm, MARGIN = 2, STATS = rpkm_sums, FUN = "/") * 1e6 if (control$log) { log2(tpm + control$prior_count) } else { tpm } } counts_tpm <- h_tpm(h, cont) str(counts_tpm)
Double check that this gives the same as the more manual calculation:
widths <- rowData(h)$WidthBP geneLengthPerKb <- 1000 / widths rate <- sweep(counts(h), 1, geneLengthPerKb,"*") mls <- colSums(rate, na.rm = TRUE) tpm <- sweep(rate, 2, mls, "/") * 1E6 tpm2 <- h_tpm(h, control_normalize(log = FALSE)) tpm[1:3, 1:3] tpm2[1:3, 1:3]
limma::voom()
.h_voom <- function(object, control) { assert_that( is_hermes_data(object), utils.nest::is_fully_named_list(control) ) norm_log2 <- limma::voom( counts = counts(object), lib.size = control$lib_sizes )$E if (control$log) { norm_log2 } else { 2^norm_log2 } } counts_voom <- h_voom(h, cont) str(counts_voom)
Compare with CPM:
cnts <- counts(h) lib.size <- colSums(cnts) y <- t(log2(t(cnts + 0.5)/(lib.size + 1) * 1e+06)) y[1:3, 1:3] counts_voom[1:3, 1:3] h_cpm(h, control_normalize(log = TRUE, prior_count = 0.5, lib_sizes = as.integer(lib.size + 1L)))[1:3, 1:3]
So that means this is just a very slight variation of CPM. We might therefore drop this later if not needed.
Let's put this all together in the method.
setMethod( f = "normalize", signature = "AnyHermesData", definition = function(object, methods = c("tpm", "cpm", "rpkm", "voom"), control = control_normalize(), ...) { methods <- match.arg(methods, several.ok = TRUE) for (method in methods) { fun_name <- paste0("h_", method) assay(object, method) <- do.call(fun_name, list(object = object, control = control)) } metadata(object) <- c(metadata(object), list(control_normalize = control)) object } )
Let's try it out.
h_norm <- normalize(h) assayNames(h_norm) assay(h_norm, "tpm")[1:3, 1:3]
So that looks good.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.