#' Normalise NanoString mRNA data using multiple algorithms
#'
#' @description Performs nCounter scaling-factor normalisations based on
#' spike-in controls and housekeeping genes, and provides wrappers for geNorm,
#' variance stabilising normalisation (vsn), cyclic loess, quantile and
#' RUV-III normalisation.
#'
#' @param count_set A count_set (SummarizedExperiment) of mRNA counts generated
#' using count_set()
#' @param norm_method Types of normalisation to perform. DEFAULT = "all",
#' other = c("housekeeping_scaled", "all_endogenous_scaled",
#' "geNorm_housekeeping", "loess", "vsn", "quantile", "ruvIII")
#' @param background_correct Background correction method, based on the spike-in
#' negative controls: "proportional" scales each sample by the proportion of its
#' counts that are not background, while "mean2sd" subtracts the mean + 2SD of
#' the negative controls. DEFAULT = "proportional", other = "mean2sd", "none"
#' @param positive_control_scaling Option to perform scaling factor
#' normalisation to the geometric mean of the positive controls. TRUE/FALSE
#' Default = TRUE
#' @param count_threshold Threshold above which a gene is considered to be
#' expressed. Options are "mean2sd" or any number. Default = 0. To apply no
#' threshold, use -1.
#' @param geNorm_n Number of housekeeping genes to keep after ranking with
#' the geNorm algorithm. Default = 5.
#' @param plot_dir Directory to write the diagnostic plot files to. The
#' directory must already exist, e.g. "/full/path/to/my/plots/". Default =
#' NULL; no plots are saved if NULL.
#' @param ruv_k k value for RUVIII normalisation. Default = 1. See RUV package
#' for more details.
#'
#' @examples
#'
#' # biological groups
#' rnf5_group <- c(rep("WT", 5), rep("KO", 5))
#'
#' # sample ids
#' rnf5_sampleid <- c("GSM3638131", "GSM3638132", "GSM3638133", "GSM3638134",
#' "GSM3638135", "GSM3638136", "GSM3638137", "GSM3638138",
#' "GSM3638139", "GSM3638140")
#'
#' # build count_set
#' rnf5_count_set <- count_set(count_data = Rnf5,
#'                             group = rnf5_group,
#'                             samp_id = rnf5_sampleid)
#' # normalise
#' rnf5_count_set_norm <- multi_norm(count_set = rnf5_count_set,
#'                                   positive_control_scaling = TRUE,
#'                                   background_correct = "mean2sd")
#' # optionally set plot_dir to an existing directory to save diagnostic plots,
#' # e.g. plot_dir = "/full/path/to/my/plots/"
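#'
#' # a further illustrative call running only a subset of the normalisation
#' # methods (argument values are examples, not recommendations)
#' rnf5_subset_norm <- multi_norm(count_set = rnf5_count_set,
#'                                norm_method = c("housekeeping_scaled", "quantile"),
#'                                count_threshold = 0)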
#'
#' @return Returns a count_set containing log2 transformed, normalised data and
#' a diagnostic plot report
#'
#' @export multi_norm
#'
#' @importFrom SummarizedExperiment assays colData rowData colData<- rowData<-
#' assays<-
#' @importFrom FSA geomean
#' @importFrom matrixStats rowMins
#' @importFrom affy normalize.loess
#' @importFrom vsn justvsn
#' @importFrom preprocessCore normalize.quantiles
#' @importFrom ruv RUVIII replicate.matrix
#' @importFrom stats sd var
#' @importFrom methods is
#' @importFrom graphics par axis mtext
#' @importFrom ctrlGene geNorm
#'
#' @references
#' Wolfgang Huber, Anja von Heydebreck, Holger Sueltmann, Annemarie Poustka and Martin Vingron. Variance Stabilization
#' Applied to Microarray Data Calibration and to the Quantification of Differential Expression. Bioinformatics 18, S96-S104
#' (2002).
#'
#' Gautier, L., Cope, L., Bolstad, B. M., and Irizarry, R. A. 2004. affy---analysis of Affymetrix GeneChip data at the probe
#' level. Bioinformatics 20, 3 (Feb. 2004), 307-315.
#'
#' Ben Bolstad (2018). preprocessCore: A collection of pre-processing functions. R package version 1.44.0.
#' https://github.com/bmbolstad/preprocessCore
#'
#' Johann Gagnon-Bartsch (2018). ruv: Detect and Remove Unwanted Variation using Negative Controls. R package version 0.9.7.
#' https://CRAN.R-project.org/package=ruv
#'
multi_norm <- function(count_set = NULL,
                       norm_method = "all",
                       background_correct = "proportional",
                       positive_control_scaling = TRUE,
                       count_threshold = 0,
                       geNorm_n = 5,
                       plot_dir = NULL,
                       ruv_k = 1) {
####### Input checks #######------------------------------------------------
# check that a count_set is provided and is a SummarizedExperiment
if(is.null(count_set)) stop("A count_set generated by count_set() must be provided.")
if(!methods::is(count_set, "SummarizedExperiment")){
stop(paste("The object provided is not a SummarizedExperiment,
produce a SummarizedExperiment using count_set().", "\n",
sep=""))
}
#check norm_method
if(!any(c("all","housekeeping_scaled", "all_endogenous_scaled",
"geNorm_housekeeping", "loess", "vsn", "quantile", "RUVIII") %in% norm_method)){
stop(paste("norm_method must be one or more of c(\"all\",\"housekeeping_scaled\",
\"all_endogenous_scaled\", \"geNorm_housekeeping\", \"loess\", \"vsn\",
\"quantile\", \"RUVIII\"). "))
}
#check background_correct
if(!(background_correct %in% c("proportional", "mean2sd", "none"))) {
stop(paste("background_correct must be \"proportional\",\"mean2sd\" or \"none\".",
"\n", sep= ""))
}
#check geNorm
if(!is.numeric(geNorm_n)){
stop(paste("geNorm_n must be numeric", "\n",
sep = ""))
}
#check count_threshold
if(!identical(count_threshold, "mean2sd") && !is.numeric(count_threshold)){
stop(paste("count_threshold must be \"mean2sd\" or a number",
"\n", sep = "" ))
}
#checks for code_class
if(sum(grepl("Endogenous", unique(rowData(count_set)$Code_Class))) < 1 ){
stop(paste("Endogenous probes missing in Code_Class"))
}
if(!("Positive" %in% unique(rowData(count_set)$Code_Class))){
stop(paste("Positive probes missing in Code_Class"))
}
if(!("Negative" %in% unique(rowData(count_set)$Code_Class))){
stop(paste("Negative probes missing in Code_Class"))
}
if(!("Housekeeping" %in% unique(rowData(count_set)$Code_Class))){
stop(paste("Housekeeping probes missing in Code_Class"))
}
#check plot_dir
if(is.null(plot_dir)){
message("### No plot directory provided. Plots will not be saved", "\n", sep = "" )
}
# set variables for plotting results
plot_this <- FALSE
# if dir is not null, set plot_this to TRUE
if(!is.null(plot_dir)){
plot_this <- TRUE
}
####### Input checks finished #######---------------------------------------
#log2 transform counts for plotting, remove pos and neg probes
count_set_in <- count_set
assays(count_set_in)$counts <- log2((assays(count_set_in)$counts)+1)
count_set_in <- count_set_in[!grepl("NEG..", rownames(rowData(count_set_in))) ,]
count_set_in <- count_set_in[!grepl("POS..", rownames(rowData(count_set_in))) ,]
####### Optional pre-processing ######--------------------------------------
### 1. Background correction ###
if(background_correct == "proportional") {
message("### Performing background correction. Proportional", "\n", sep = "")
# calculate ratio of negative counts to total counts
negatives_se <- count_set[rowData(count_set)$Code_Class == "Negative" ,]
sum_negatives <- apply(assays(negatives_se)$counts, 2, function(x) sum(x))
sum_total_counts <- apply(assays(count_set)$counts, 2, function(x) sum(x))
ratio <- 1 - (sum_negatives/sum_total_counts)
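# illustrative example: a lane with 500 negative-control counts out of
# 100,000 total counts gives ratio = 1 - 500/100000 = 0.995, so every count
# in that lane is scaled down to 99.5% of its raw value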
#remove this proportion of counts from every count
background_corrected <- t(apply(assays(count_set)$counts, 1, function(x){
x*ratio
}))
#add to count_set
assays(count_set)$background_corrected <- background_corrected
}
if(background_correct == "mean2sd") {
message("### Performing background correction. Mean+2SD", "\n", sep = "")
#calculate mean + 2sd of negatives
negatives_se <- count_set[rowData(count_set)$Code_Class == "Negative" ,]
background <- apply(assays(negatives_se)$counts, 2, function(x) mean(x) + 2*sd(x))
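# illustrative example: negative-control counts of 5, 7 and 9 in a lane give
# mean 7 and sd 2, so 7 + 2*2 = 11 is subtracted from every count in that lane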
background_corrected <- t(apply(assays(count_set)$counts, 1, "-", background))
#convert all negative numbers to 0
background_corrected[background_corrected < 0] <- 0
#add to count_set
assays(count_set)$background_corrected <- background_corrected
}
if(background_correct == "none") {
#add to count_set
assays(count_set)$background_corrected <- assays(count_set)$counts
}
### apply threshold of expression. Default > 0 ###
if(count_threshold == "mean2sd"){
#calculate mean + 2sd of negatives
negatives_se <- count_set[rowData(count_set)$Code_Class == "Negative" ,]
background <- apply(assays(negatives_se)$background_corrected, 2, function(x) mean(x) + 2*sd(x))
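# keep a gene only if its minimum background-corrected count across all samples
# exceeds the average of the per-lane mean + 2SD negative-control levels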
expressed <- matrixStats::rowMins(assays(count_set)$background_corrected) > mean(background)
count_set <- count_set[expressed, ]
}
#remove negative controls from count_set
count_set <- count_set[!grepl("NEG..", rownames(rowData(count_set))) ,]
# if threshold of expression is numeric
if(is.numeric(count_threshold)){
expressed <- matrixStats::rowMins(assays(count_set)$background_corrected) > count_threshold
count_set <- count_set[expressed, ]
}
### 2. Scaling factor spike-in positive controls ###
if(positive_control_scaling == TRUE){
message("### Performing positive control scaling.", "\n", sep = "")
#select positive controls
positives_se <- count_set[rowData(count_set)$Code_Class == "Positive" ,]
#calculate geometric mean of positive controls for each lane
geomean <- apply(assays(positives_se)$background_corrected, 2,
function(x) FSA::geomean(x, na.rm = TRUE, zneg.rm = TRUE))
#Divide the arithmetic mean of these geometric means by the geometric mean of
##each lane to generate a lane-specific normalization factor.
am_geomeans <- mean(geomean)
sf <- sapply(geomean, function(x) {am_geomeans/x})
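# illustrative example: lane geometric means of 400, 500 and 625 have an
# arithmetic mean of ~508.3, giving scaling factors of ~1.27, ~1.02 and ~0.81
# (counts in low-yield lanes are scaled up, high-yield lanes scaled down)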
#multiply all counts by sf
positive_control_scaled <- t(apply(assays(count_set)$background_corrected, 1, "*", sf))
assays(count_set)$positive_control_scaled <- positive_control_scaled
}
#remove positive controls from count_set
count_set <- count_set[!grepl("POS..", rownames(rowData(count_set))), ]
#select pre-processed data for input to filtering and normalisations
if(positive_control_scaling == FALSE ){
assays(count_set)$normalisation_input <- assays(count_set)$background_corrected
} else if(positive_control_scaling == TRUE){
assays(count_set)$normalisation_input <- assays(count_set)$positive_control_scaled
}
#remove ligation and miRNA spike-in controls from count_set; their use is currently not supported
count_set <- count_set[!grepl("LIG_POS_.",rownames(rowData(count_set))), ]
count_set <- count_set[!grepl("LIG_NEG_.",rownames(rowData(count_set))), ]
count_set <- count_set[!grepl("osa-miR414", rownames(rowData(count_set))), ]
count_set <- count_set[!grepl("osa-miR442", rownames(rowData(count_set))), ]
count_set <- count_set[!grepl("cel-miR-254", rownames(rowData(count_set))), ]
count_set <- count_set[!grepl("cel-miR-248", rownames(rowData(count_set))), ]
count_set <- count_set[!grepl("ath-miR159a", rownames(rowData(count_set))), ]
####### Normalisations ####### --------------------------------------------------------
message("### Performing scaling factor normalisations.", "\n", sep = "")
if(any(c("all", "housekeeping_scaled") %in% norm_method) == TRUE){
### 1. Scaling factor normalisation based on the geometric mean of all housekeeping genes ###
#select housekeeping genes
housekeeping_se <- count_set[rowData(count_set)$Code_Class == "Housekeeping" ,]
#calculate geometric mean of housekeeping genes for each lane
geomean <- apply(assays(housekeeping_se)$normalisation_input, 2, function(x) FSA::geomean(x, na.rm = TRUE, zneg.rm = TRUE))
#Divide the arithmetic mean of these geometric means by the geometric mean of each lane to generate a lane-specific normalization factor.
am_geomeans <- mean(geomean)
sf <- sapply(geomean, function(x) {am_geomeans/x})
#multiply all counts by sf
housekeeping_scaled <- t(apply(assays(count_set)$normalisation_input, 1, "*", sf))
#log2 transform, add to counts_set
assays(count_set)$housekeeping_scaled <- log2(housekeeping_scaled + 1)
}
if(any(c("all", "all_endogenous_scaled") %in% norm_method) == TRUE){
### 2. Scaling factor normalisation based on the geometric mean of total counts ###
#calculate geometric mean of all genes for each lane
geomean <- apply(assays(count_set)$normalisation_input, 2, function(x) FSA::geomean(x, na.rm = TRUE, zneg.rm = TRUE))
#Divide the arithmetic mean of these geometric means by the geometric mean of each lane to generate a lane-specific normalization factor.
am_geomeans <- mean(geomean)
sf <- sapply(geomean, function(x) {am_geomeans/x})
#multiply all counts by sf
all_endogenous_scaled <- t(apply(assays(count_set)$normalisation_input, 1, "*", sf))
#log2 transform, add to count_set
assays(count_set)$all_endogenous_scaled <- log2(all_endogenous_scaled + 1)
}
if(any(c("all", "geNorm_housekeeping") %in% norm_method) == TRUE){
### 3. Scaling factor with geNorm selected HKs ###
#select housekeeping genes here so this step does not depend on the
#housekeeping_scaled step having been run
housekeeping_se <- count_set[rowData(count_set)$Code_Class == "Housekeeping" ,]
#rank genes by M value using geNorm algorithm
M_vals <- ctrlGene::geNorm(t(assays(housekeeping_se)$normalisation_input), ctVal = FALSE)
M_vals <- M_vals[order(M_vals$Avg.M),]
stable_hk_genes <- as.character(M_vals$Genes)
stable_hk_genes <- unlist(strsplit(stable_hk_genes, "[-]"))
stable_hk_genes <- stable_hk_genes[1:geNorm_n]
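# illustrative note: ctrlGene::geNorm reports the two most stable genes as a
# single pair joined by "-", so splitting on "-" recovers individual gene names
# before keeping the geNorm_n most stable housekeeping genes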
#select these genes from count_set
stable_hk_se <- count_set[rownames(rowData(count_set)) %in% stable_hk_genes ,]
#calculate geometric mean of housekeeping genes for each lane
geomean <- apply(assays(stable_hk_se)$normalisation_input, 2, function(x) FSA::geomean(x, na.rm = TRUE, zneg.rm = TRUE))
#Divide the arithmetic mean of these geometric means by the geometric mean of each lane to generate a lane-specific normalization factor.
am_geomeans <- mean(geomean)
sf <- sapply(geomean, function(x) {am_geomeans/x})
#multiply all counts by sf
geNorm_housekeeping <- t(apply(assays(count_set)$normalisation_input, 1, "*", sf))
#log2 transform
assays(count_set)$geNorm_housekeeping <- log2(geNorm_housekeeping + 1)
}
if(any(c("all", "loess") %in% norm_method) == TRUE){
### 4. Loess ###
message("### Performing loess normalisation.", "\n", sep = "")
loess <- affy::normalize.loess(((assays(count_set)$normalisation_input)+1), verbose = FALSE)
#add loess normalised counts to count_set (log2; the +1 offset was applied above)
assays(count_set)$loess <- log2(loess)
}
if(any(c("all", "vsn") %in% norm_method) == TRUE){
### 5. VSN ###
if (nrow(assays(count_set)$normalisation_input) > 42){
message("### Performing VSN normalisation.", "\n", sep = "")
vsn <- vsn::justvsn(((assays(count_set)$normalisation_input)+1), verbose = FALSE)
#add vsn normalised counts to count_set (justvsn outputs glog2 counts)
assays(count_set)$vsn <- vsn
} else {
message("### Performing VSN normalisation.", "\n", sep = "")
message("#####The number of genes in this count_set after background correction
is too low for reliable estimation of the vsn transformation parameters. Check the
vsn package for more details (Huber et al., 2002).", "\n", sep = "")
message("##### VSN normalisation not performed.", "\n", sep = "")
}
}
if(any(c("all", "quantile") %in% norm_method) == TRUE){
### 6. Quantile ###
message("### Performing quantile normalisation.", "\n", sep = "")
log_counts <- log2((assays(count_set)$normalisation_input)+1)
quant_norm <- preprocessCore::normalize.quantiles(log_counts) #outputs log2 normalised counts
colnames(quant_norm)<-colnames(log_counts)
rownames(quant_norm)<-rownames(log_counts)
#add quantile normalised (log2) counts to count_set
assays(count_set)$quantile <- quant_norm
}
if(any(c("all", "ruvIII") %in% norm_method) == TRUE){
### 8. RUVIII ###
#use RUV if replicate samples are present based on sample id and grouping
sample_names <- paste(count_set$samp_id, count_set$group, sep = "_")
not_replicates <- unique(sample_names)
if(length(not_replicates) < length(count_set$samp_id)) { #are there replicates?
message("### Replicates present. Performing RUV")
rep_mat <- ruv::replicate.matrix(sample_names)
message("### RUVIII normalisation: There are ", nrow(rep_mat),
" samples; ", ncol(rep_mat)," unique samples with ",
nrow(rep_mat)-ncol(rep_mat), " replicate(s).")
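# illustrative example: sample_names of c("s1_WT", "s1_WT", "s2_KO") give a
# 3 x 2 replicate matrix, with the two "s1_WT" technical replicates sharing a
# column so RUVIII can treat their differences as unwanted variation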
#log2 transform the normalisation input (defined here so RUVIII does not
#depend on the quantile step having been run)
log_counts <- log2((assays(count_set)$normalisation_input)+1)
#Identifying Control Genes
##1. Use all genes as negative controls
RUV_norm <- ruv::RUVIII(Y = t(log_counts),
M = rep_mat,
ctl = c(1:nrow(assays(count_set)$normalisation_input)))
RUV_norm <- t(RUV_norm)
#2. Select the most stably expressed genes
stable_genes <- apply(RUV_norm, 1, var)
control_genes <- which(stable_genes < 0.5)
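# genes whose variance is below 0.5 across the first-pass corrected data are
# treated as empirical negative controls for the final RUVIII fit (the 0.5
# cut-off is the heuristic used here, not a value mandated by the ruv package)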
#3. RUVIII correction using control_genes
##k defaults to 1; see the ruv_k argument and the ruv package documentation
RUV_norm <-ruv::RUVIII(Y = t(log_counts), M = rep_mat, ctl = control_genes, k = ruv_k)
RUV_norm <- t(RUV_norm) #outputs log2, RUVIII transformed, normalised counts
#add RUVIII normalised counts to count_set
assays(count_set)$ruvIII <- RUV_norm
} else if(length(not_replicates) == length(count_set$samp_id)) {
message("### No replicates present. RUV not performed")
}
}
# remove normalisation input assay
assays(count_set)$normalisation_input <- NULL
###### Return all counts in log2 ######--------------------------------------
#log2 transform raw count data
assays(count_set)$counts <- log2((assays(count_set)$counts)+1)
#log2 transform background_corrected_counts
assays(count_set)$background_corrected <- log2((assays(count_set)$background_corrected)+1)
#log2 transform positive_control_scaled
if(positive_control_scaling == TRUE){
assays(count_set)$positive_control_scaled <- log2((assays(count_set)$positive_control_scaled)+1)
}
if(background_correct == "none" & !(count_threshold == "mean2sd") & (!is.numeric(count_threshold))) {
assays(count_set)$background_corrected <- NULL
}
####### plots #######---------------------------------------------------------
if (plot_this == TRUE){
#refactorise groups & batches in normalized and input count_sets
count_set_in$group <- factor(count_set_in$group)
count_set_in$batch <- factor(count_set_in$batch)
count_set$group <- factor(count_set$group)
count_set$batch <- factor(count_set$batch)
#create vector of count_set assays after normalisations
assays_all <- names(assays(count_set)[2:length(assays(count_set))])
#Density Plot
grDevices::pdf(file = paste(plot_dir, "Density.pdf", sep = ""), width = 10, height = 10)
par(mfrow = c(3,3), mar = c(7,4.1,4.1,2.1))
density_plot_wrap(count_set = count_set_in,
norm_method = "counts",
title = "counts",
colors = RColorBrewer::brewer.pal(max(length(unique(count_set$group)),3),"Set1"))
lapply(seq_along(assays_all), function(i){
data_in <- assays_all[i]
density_plot_wrap(count_set = count_set,
norm_method = data_in,
title = data_in,
colors = RColorBrewer::brewer.pal(max(length(unique(count_set$group)),3),"Set1"),
legend = FALSE)
})
grDevices::dev.off()
#PCA
grDevices::pdf(file = paste(plot_dir, "PCA.pdf", sep = ""), width = 10, height = 10)
par(mfrow = c(3,3), mar = c(7,4.1,4.1,2.1))
pca_plot_wrap(count_set = count_set_in,
norm_method = "counts",
title = "counts",
colors = RColorBrewer::brewer.pal(max(length(unique(count_set$group)),3),"Set1"))
lapply(seq_along(assays_all), function(i){
data_in <- assays_all[i]
pca_plot_wrap(count_set = count_set,
norm_method = data_in,
title = data_in,
colors = RColorBrewer::brewer.pal(max(length(unique(count_set$group)),3),"Set1"),
legend = FALSE)
})
grDevices::dev.off()
#RLE
grDevices::pdf(file = paste(plot_dir, "RLE.pdf", sep = ""), width = 10, height = 10)
par(mfrow = c(3,3), mar = c(7,4.1,4.1,2.1))
rle_plot_wrap(count_set = count_set_in,
norm_method = "counts",
title = "counts",
colors = RColorBrewer::brewer.pal(max(length(unique(count_set$group)),3),"Set1"))
lapply(seq_along(assays_all), function(i){
data_in <- assays_all[i]
rle_plot_wrap(count_set = count_set,
norm_method = data_in,
title = data_in,
colors = RColorBrewer::brewer.pal(max(length(unique(count_set$group)),3),"Set1"),
legend = FALSE)
})
grDevices::dev.off()
#Clustering
grDevices::pdf(file = paste(plot_dir, "hclust.pdf", sep = ""), width = 10, height = 10)
par(mfrow = c(3,3), mar = c(7,4.1,4.1,2.1))
hclust_plot_wrap(count_set = count_set_in,
norm_method = "counts",
title = "counts",
colors = RColorBrewer::brewer.pal(max(length(unique(count_set$group)),3),"Set1"))
lapply(seq_along(assays_all), function(i){
data_in <- assays_all[i]
hclust_plot_wrap(count_set = count_set,
norm_method = data_in,
title = data_in,
colors = RColorBrewer::brewer.pal(max(length(unique(count_set$group)),3),"Set1"),
legend = FALSE)
})
grDevices::dev.off()
}
return(count_set)
}