knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width=5, fig.height=5 )
This is a step by step tutorial on how to assess and/or correct signal drift and batch effects within/across a multi-batch direct infusion mass spectrometry (DIMS) dataset. The same approach can be used on liquid chromatography mass spectrometry (LCMS) peak table as well.
Deeper details on how the algorithm works are detailed in \@ref(algorithm)
You should have R version 4.0.0 or above and Rstudio installed to be able to run this notebook.
Execute following commands from the R terminal.
install.packages("gridExtra") if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("pmp")
Load the required libraries into the R environment
library(S4Vectors) library(SummarizedExperiment) library(pmp) library(ggplot2) library(reshape2) library(gridExtra)
In this tutorial we will be using a direct infusion mass spectrometry (DIMS)
dataset consisting of 172 samples measured across 8 batches and is included in
pmp
package as SummarizedExperiemnt
class object MTBLS79
.
More detailed description of the dataset is available from @kirwan2014,
MTBLS79 and R man page.
help ("MTBLS79")
feature_names <- c("70.03364", "133.07379", "146.16519", "163.04515", "174.89483", "200.03196", "207.07818", "221.05062", "240.02445", "251.03658", "266.01793", "304.99115", "321.07923", "338.98131", "376.03962", "393.35878", "409.05716", "430.24353", "451.01086", "465.14937") summary(t(SummarizedExperiment::assay(MTBLS79[feature_names, ])))
#number of samples: ncol(MTBLS79)
#Batches: unique(MTBLS79$Batch)
#Sample classes: unique(MTBLS79$Class)
A more detailed overview and guidelines on strategies for quality control of mass spectrometry assays is detailed in recent work by @broadhurst2018.
To evaluate if the data needs correction, it is common practice to examine the relative standard deviation (RSD) of the quality control (QC) samples and biological samples. RSD% is also sometimes referred to as the coefficient of variation (CV). An RSD% for the QC samples below 20-30% is commonly used as an acceptable level of technical variation where signal correction is not required.
The following code calculates and plots the RSD% values of the features within the dataset.
# separate the LCMS data from the meta data data(MTBLS79) data <- SummarizedExperiment::assay(MTBLS79[feature_names, ]) class <- SummarizedExperiment::colData(MTBLS79)$Class batch <- SummarizedExperiment::colData(MTBLS79)$Batch order <- c(1:ncol(data)) # get index of QC samples QChits <- which(class == "QC") # small function to calculate RSD% FUN <- function(x) sd(x, na.rm=TRUE) / mean(x, na.rm=TRUE) * 100 # RSD% of biological and QC samples within all 8 batches: out <- matrix(ncol=2, nrow=nrow(data)) colnames(out) <- c("Sample","QC") rownames(out) <- rownames(data) # for each feature calculate RSD% for the samples and the QCs for (i in 1:nrow(data)) { out[i, 1] <- FUN(data[i, -QChits]) # for samples out[i, 2] <- FUN(data[i, QChits]) # for QCs } # prepare data for plotting plotdata <- melt(data.frame(out), variable.name="Class", value.name="RSD") plotdata$feature <- rownames(data) plotdata$RSD <- round(plotdata$RSD,0) plotdata$feature <- factor(plotdata$feature, ordered=TRUE, levels=unique(plotdata$feature)) # plot ggplot(data=plotdata, aes(x=Class, y=feature, fill=RSD)) + geom_tile() + geom_text(aes(label=RSD)) + scale_fill_gradient2(low="black", mid="white", high="red")
A violin plot is a useful way of summarising the RSD% over all samples/QCs in the data set. Note a very high QC sample RSD% value for feature '409.05716'.
ggplot(data=plotdata, aes(x=Class, y=RSD, fill=Class)) + geom_violin(draw_quantiles=c(0.25,0.5,0.75)) + ylab("RSD%") + guides(fill=FALSE) + theme(panel.background=element_blank())
The plots indicates that most features have a QC RSD% lower than 30%, which is a commonly accepted threshold, but for some features the QC RSD% exceeds 30% and is more similar to the signal variation of the biological samples. We can calculate similar statistics per batch and visualise the results with a box plot.
# prepare some matrices to store the results RSDQC <- matrix(ncol=8, nrow=nrow(data)) RSDsample <- matrix(ncol=8, nrow=nrow(data)) colnames(RSDQC) <- unique(batch) colnames(RSDsample) <- unique(batch) rownames(RSDQC) <- rownames(data) rownames(RSDsample) <- rownames(data) # for each feature for (i in 1:nrow(data)) { # for each batch for (nb in 1:8) { # RSD% of QCs in this batch RSDQC[i, nb] <- FUN(data[i, which(class == "QC" & batch == nb)]) # RSD% of samples in this batch RSDsample[i, nb] <- FUN(data[i, which(!class == "QC" & batch == nb)]) } } # prepare results for plotting plotdataQC <- melt(as.data.frame(RSDQC), variable.name="batch", value.name="RSD") plotdataQC$Class <- "QC" plotdataBio <- melt(as.data.frame(RSDsample), variable.name="batch", value.name="RSD") plotdataBio$Class <- "Sample" plotdata <- rbind(plotdataQC, plotdataBio) plotdata$Class <- as.factor(plotdata$Class) # plot ggplot(data=plotdata, aes(x=Class, y=RSD, fill=Class)) + geom_boxplot() + facet_wrap(~ batch, ncol=3) + ylab("RSD%") + xlab("") + scale_x_discrete(labels=NULL) + theme(panel.background=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank())
Summary of RSD% of QC samples
summary(RSDQC)
Summary of RSD% of biological samples
summary(RSDsample)
From the above we can conclude that for every analytical batch RSD% tends to be higher in the analytical samples than it is in the QC samples for all 20 measured features. A few outlier QC samples can be observed.
An alternative measure of QC and biological sample variability is the so called D-ratio, which indicates if the technical variation within the QC samples exceeds the biological variation within biological samples.
# prepare a list of colours for plotting manual_color = c("#386cb0", "#ef3b2c", "#7fc97f", "#fdb462", "#984ea3", "#a6cee3", "#778899", "#fb9a99", "#ffff33") # Function to calculate median absolute deviation DRatfun <- function(samples, qcs) mad(qcs) / mad(samples) # prepare matrix for dratio output dratio <- matrix(ncol=8, nrow=nrow(data)) colnames(dratio) <- unique(batch) rownames(dratio) <- rownames(data) # calculate dratio for each feature, per batch for (i in 1:nrow(dratio)){ for (nb in 1:8) { dratio[i, nb] <- DRatfun(samples=data[i, which(!class == "QC" & batch == nb)], qcs=data[i, which(class == "QC" & batch == nb)]) } } # prepare data for plotting dratio <- as.data.frame(round(dratio, 2)) plotdata2 <- melt(dratio, variable.name="batch") plotdata2$index <- rownames(data) plotdata2$index <- factor(plotdata2$index, ordered=TRUE, levels=unique(plotdata2$index)) ggplot(data=plotdata2, aes(x=index, y=value, color=batch)) + geom_point(size=2) + xlab("index") + ylab("D-ratio") + geom_hline(yintercept=1) + theme(panel.background=element_blank()) + scale_color_manual(values=manual_color) + theme(axis.text.x=element_text(angle=90))
The D-ratio is a convenient measure to assess if technical variation in the QC samples (MAD QC) is higher than the variation within the biological samples (MAD sample). Ratio values close to, or higher than 1 indicate that technical variation of the measured feature is higher than the biological variation and therefore should be treated carefully during interperation of the dataset. The colors in the figure above indicate different analytical batches. In the example above we can see that feature '409.05716' has a D-ratio value above 1 in four batches, while for features '70.0336' and '393.35878' the D-ratio is reproducibly low within all eight batches.
Principal components analysis (PCA) can be used to check common trends in the data. Let's inspect the scores of the first two principal components and samples colored by batch and class. For PCA model data should be normalised and missing values should be replaced using imputation, followed by data scaling. We will use the probabilistic quotient normalisation (PQN) method to normalise the data, k-nearest neighbours (KNN) for missing value imputation and finally the glog method to stabilise the variance across low and high intensity mass spectral features. See @guida2016 for a more detailed review on common pre-processing steps and methods.
pca_data <- MTBLS79[feature_names, ] pca_data <- pqn_normalisation(pca_data, classes=class, qc_label="QC") pca_data <- mv_imputation(pca_data, method="KNN", k=5, rowmax=0.5, colmax=0.5, maxp=NULL, check_df=FALSE) pca_data <- glog_transformation(pca_data, classes=class, qc_label="QC") pca_data <- prcomp(t(SummarizedExperiment::assay(pca_data)), center=TRUE, scale=FALSE) exp_var_pca <- round(((pca_data$sdev^2)/sum(pca_data$sdev^2)*100)[1:2], 2) plots <- list() plotdata <- data.frame(PC1=pca_data$x[, 1], PC2=pca_data$x[, 2], batch=as.factor(batch), class=class) plots[[1]] <- ggplot(data=plotdata, aes(x=PC1, y=PC2, col=batch)) + geom_point(size=2) + theme(panel.background=element_blank()) + scale_color_manual(values=manual_color) + ggtitle("PCA scores, before correction") + xlab(paste0("PC1 (", exp_var_pca[1] ," %)")) + ylab(paste0("PC2 (", exp_var_pca[2] ," %)")) plots[[2]] <- ggplot(data=plotdata, aes(x=PC1, y=PC2, col=class)) + geom_point(size=2) + theme(panel.background=element_blank()) + scale_color_manual(values=manual_color) + ggtitle("PCA scores plot, before correction") + xlab(paste0("PC1 (", exp_var_pca[1] ," %)")) + ylab(paste0("PC2 (", exp_var_pca[2] ," %)")) grid.arrange(ncol=2, plots[[1]], plots[[2]])
Left side plot above clearly shows that samples measured in batches 7 and 8 are differentiating from bathes 1 to 6. On the right hand side plot the seperation between samples classes is still visible, but also seperation between measurement batches is clearly visible across PC2 axis.
Alternatively, trends in measured signal related to injection order could indicate if signal drif and/or batch effect correction is required. The plot below illustrates the measured signal of QC samples across all 8 batches. To be able to compare all 20 features measured at different signal ranges, the data will be scaled to unit variance (UV).
# autoscale the QC data QCdata <- data[ ,QChits] QCdata2 <- as.data.frame(scale(t(QCdata), scale=TRUE, center=TRUE)) # prepare the data for plotting plotdata <- melt(QCdata2, value.name="intensity") plotdata$index <- rep(1:nrow(QCdata2), ncol(QCdata2)) plotdata$batch <- as.factor(batch[QChits]) # plot ggplot(data=plotdata, aes(x=index, y=intensity, col=batch)) + geom_point(size=2) + facet_wrap(~ variable, ncol=4) + theme(panel.background=element_blank()) + scale_color_manual(values=manual_color)
This figure indicates that there is some fluctuation in the measured signal across the eight batches, and that some features are following a similar pattern, i.e. they are correlated. We can create a similar plot to the one above including linear regression fit between measured data points.
ggplot(data=plotdata, aes(x=index, y=intensity, col=batch)) + geom_point(size=2) + facet_wrap(~ variable, ncol=4) + geom_smooth(method="lm", se=TRUE, colour="black") + theme(panel.background=element_blank()) + scale_color_manual(values=manual_color)
The plot above indicates that some trends can be observed. It is possible to calculate actual correlation values within QC samples for each measured feature, and we will use Kendall's tau statistic to estimate a rank-based measure of association.
sampleorder <- c(1:ncol(QCdata)) correlations <- matrix(ncol=2, nrow=nrow(data)) rownames(correlations) <- rownames(data) colnames(correlations) <- c("tau","p.value") correlations <- as.data.frame(correlations) for (coln in 1:nrow(data)) { stat <- cor.test(sampleorder, QCdata[coln, ], method="kendall") correlations$tau[coln] <- stat$estimate correlations$p.value[coln] <- stat$p.value } correlations
While most of the calculated tau values and corresponding p-values indicate that there is not a strong trend between injection order and the measured QC sample signal, some of the values don't match the trends we can observe in figure above. For features "133.07379" and "376.03962" calculated correlation values are aroun 0.6 and in figure above intensity increasing tren can be observed. For other features correlation values are relatively low, but in figure above clear trends in signal changes in batches 7 and 8 can be osberved.
Alternatively it is possible to calculate correlation statistics per batch and visualise the results.
correlations <- matrix(ncol=8, nrow=nrow(data)) rownames(correlations) <- rownames(data) colnames(correlations) <- unique(batch) QCbatch <- batch[QChits] for (coln in 1:nrow(data)) { for (bch in 1:8) { sampleorder <- scale(c(1:length(which(QCbatch==bch))), center=TRUE, scale=TRUE) if ((length(sampleorder) - length(which(is.na(QCdata[coln, which(QCbatch==bch)])))) >= 3){ correlations[coln, bch] <- cor.test(sampleorder, QCdata[coln, which(QCbatch==bch)], method="kendall")$estimate } } } round(correlations, 2)
plotdata <- as.data.frame(correlations) plotdata$feature <- rownames(plotdata) plotdata <- melt(plotdata, variable.name="batch") plotdata$feature <- factor(plotdata$feature, ordered=TRUE, levels = unique(plotdata$feature)) ggplot(data=plotdata, aes(x=batch, y=feature, fill=value)) + geom_tile() + scale_fill_gradient2()
Figure above indicates that there are significant acquisition order related trends within some batches. Fore example, bathes 4 and 6 for features '451.01081' and '409.05716'.
It is possible to apply univariate regression to the QC sample injection order and signal intensity, to estimate correlation and spread (R2) of the measured data points.
sampleorder <- c(1:ncol(QCdata)) regressionout <- matrix(ncol=3, nrow=nrow(data)) rownames(regressionout) <- rownames(data) colnames(regressionout) <- c("R2.adj","coefficient","p.value") regressionout <- as.data.frame(regressionout) for (coln in 1:nrow(data)) { tempdat <- data.frame(x=sampleorder, y=QCdata[coln, ]) stat <- lm(x ~ y, data=tempdat) stat <- summary(stat) regressionout$R2.adj[coln] <- stat$adj.r.squared regressionout$coefficient[coln] <- stat$coefficients[2,1] regressionout$p.value[coln] <- stat$coefficients[2,4] } regressionout
And regression statistics per batch.
regPerBatch <- matrix(ncol=8, nrow=nrow(data)) rownames(regPerBatch) <- rownames(data) colnames(regPerBatch) <- unique(batch) QCbatch <- MTBLS79$Batch[QChits] for (coln in 1:nrow(data)) { for (bch in 1:8) { sampleorder <- c(1:length(which(QCbatch == bch))) tempdat <- data.frame(x=sampleorder, y=QCdata[coln, which(QCbatch==bch)]) stat <- lm(x ~ y, data=tempdat) stat <- summary(stat) regPerBatch[coln,bch] <- stat$adj.r.squared } } round(regPerBatch,2)
plotdata <- as.data.frame(regPerBatch) plotdata$feature <- rownames(plotdata) plotdata <- melt(plotdata, variable.name="batch") plotdata$feature <- factor(plotdata$feature, ordered=TRUE, levels=unique(plotdata$feature)) ggplot(data=plotdata, aes(x=batch, y=feature, fill=value)) + geom_tile() + scale_fill_gradient2()
Let's have a closer look to '451.01086' measured feature and how signal correction can be applied.
data <- data.frame(data= as.vector(SummarizedExperiment::assay(MTBLS79["451.01086", ])), batch=batch, class=factor(class, ordered=TRUE)) data$order <- c(1:nrow(data)) data$batch <- as.factor(data$batch) ggplot(data=data, aes(x=order, y=log(data,10), col=batch, shape=class)) + geom_point(size=2) + theme(panel.background=element_blank()) + scale_color_manual(values=manual_color)
In this plot circles and squares represent biological samples and triangles are the QC samples. Analytical batches are represented by colours. Differences in measured intensities can be observed between analytical batches.
Similar plot for QC samples only
QCdata <- data[data$class == "QC",] ggplot(data=QCdata, aes(x=order, y=log(data,10), col=batch, shape=class)) + geom_point(size=2) + theme(panel.background=element_blank()) + scale_color_manual(values=manual_color, drop=FALSE) + scale_shape_manual(values=c(16, 17, 15), drop=FALSE)
This figure indicates that there is signal drift present within each analytical batch and between analytical batches for this feature. Let's have a look at the RSD% for all QC samples and QC samples within each analytical batch.
FUN <- function(x) sd(x, na.rm=TRUE)/mean(x, na.rm=TRUE) * 100 # RSD% of biological and QC samples within all 6 batches: out <- c(NA,NA) names(out) <- c("Biological","QC") out[1] <-FUN(data$data[-QChits]) out[2] <-FUN(data$data[QChits]) out
# RSD% per batch: out <- matrix(ncol=8,nrow=2) colnames(out) <- unique(batch) rownames(out) <- c("Biological","QC") for (i in 1:8) { out[1, i] <- FUN(data$data[which(!class=="QC" & batch==i)]) out[2, i] <- FUN(data$data[which(class=="QC" & batch==i)]) } out
From the outputs above it's clear that variance of measured QC sample intensities between batches is high (RSD% = 50), and is relatively high within batches 1 to 5. For batch 3 QC variation exceeds that of the biological samples.
We will apply QC-RSC signal correction method as it is described in @kirwan2013.
The first step involves extracting QC sample data
qcData <- data$data[class == "QC"] qc_batch <- batch[class == "QC"] qc_order <- order[class == "QC"] qcData
Note that the QC data has 1 missing value. Smoothed spline regression doesn't support missing values, so the workaround is to apply missing value imputation or remove the NA values from input to the smoothed spline fit function (which we will do here). We recommend at least 4 QC values be present per batch for the fit to be reliable.
The next step involves applying the smoothed spline fit function to the QC sample data within each batch. We will look at the data for batch 6 in detail.
nbatch <- unique(qc_batch) nb <- 6 # Sample injection order x <- qc_order[qc_batch==nbatch[nb]] # Measured peak intensity or area y <- qcData[qc_batch==nbatch[nb]] y
In this example, signal for 1 QC sample wasn't measured, so these samples need to be removed. The smoothed spline regression input will look like this:
NAhits <- which(is.na(y)) if (length(NAhits)>0) { x <- x[-c(NAhits)] y <- y[-c(NAhits)] rbind(x,y) }
We will apply a log transformation to the data before fitting
y <- log((y + sqrt(y^2)) / 2) y
Fit a smoothed cubic spline using internal cross-validation for parameter estimation
sp.obj <- smooth.spline(x, y, cv=TRUE) sp.obj out <- rbind(y,sp.obj$y) row.names(out) <- c("measured","fitted") out
Now the smoothed spline fit is used to predict values for the biological sample for the current batch.
valuePredict=predict(sp.obj, order[batch==nb]) plotchr <- as.numeric(data$class) # reverse the log transformation to convert the predictions back to the # original scale valuePredict$y <- exp(valuePredict$y) plotdata <- data.frame(measured=data$data[batch==nb], fitted=valuePredict$y, Class=class[batch==nb], order=order[batch==nb]) plotdata2 <- melt(plotdata, id.vars=c("Class","order"), value.name="intensity", variable.name="data") ggplot(data=plotdata2, aes(x=order, y=log(intensity,10), color=data, shape=Class)) + geom_point(size=2) + theme(panel.background=element_blank()) + scale_color_manual(values=manual_color) + scale_shape_manual(values=c(16, 17, 15), drop=FALSE)
The figure above shows the original data points in blue and the fitted data in red. Triangles represent QC samples.
The next step in signal correction is to "flatten" the fitted curve to correct for signal drift. This can usually be done by subtracting the fitted values from the actual measured values for each feature. To avoid getting negative values we will add the median value of the feature to the corrected data.
fitmedian <- median(plotdata$measured, na.rm=TRUE) plotdata$corrected_subt <- (plotdata$measured - plotdata$fitted) + fitmedian plotdata2 <- melt(plotdata, id.vars=c("Class","order"), value.name="intensity", variable.name="data") plotdata_class <- as.character(plotdata2$Class) plotdata_class[plotdata_class == "S"] <- "Sample" plotdata_class[plotdata_class == "C"] <- "Sample" plotdata2$Class <- factor(plotdata_class) ggplot(data=plotdata2, aes(x=order, y=intensity, color=data, shape=Class)) + geom_point(size=2) + theme(panel.background=element_blank()) + scale_color_manual(values=manual_color) + facet_grid(Class ~ .) + scale_shape_manual(values=c(17, 16), drop=FALSE)
An alternative to subtraction of the fitted values is to divide them by the median of the fit and use the resulting coefficients to correct the data points. The same general relative trends should be observed in either case.
plotdata$corrected_div <- plotdata$measured/(plotdata$fitted/fitmedian) plotdata3 <- plotdata[,c("Class", "order", "corrected_subt", "corrected_div")] plotdata3 <- melt(plotdata3, id.vars=c("Class","order"), value.name="intensity", variable.name="data") plotdata_class <- as.character(plotdata3$Class) plotdata_class[plotdata_class=="S"] <- "Sample" plotdata_class[plotdata_class=="C"] <- "Sample" plotdata3$Class <- factor(plotdata_class) ggplot(data=plotdata3, aes(x=order, y=intensity, color=data, shape=Class)) + geom_point(size=2) + theme(panel.background=element_blank()) + scale_color_manual(values=manual_color) + geom_smooth(se=FALSE) + facet_grid(Class ~ .)
So far we have applied signal correction for data points within one analytical batch. The code below will perform the same steps for each of the 8 batches.
outl <- rep(NA, nrow(data)) for (nb in 1:length(nbatch)){ # assigning sample injection order for a batch to 'x', and corresponding # intensities to 'y' x <- qc_order[qc_batch == nbatch[nb]] y <- qcData[qc_batch == nbatch[nb]] # remove measurements with missing values NAhits <- which(is.na(y)) if (length(NAhits) > 0) { x <- x[-c(NAhits)] y <- y[-c(NAhits)] } # require at least 4 data points for QC fit if (length(y) >= 4) { range <- c(batch == nbatch[nb]) # Order is a vector of sample indices for the current batch outl[range] <- pmp:::splineSmoother(x=x, y=y, newX=order[range], log=TRUE, a=1, spar=0) # If less than 5 data points are present, return empty values } else { range <- c(batch == nbatch[nb]) outl[range] <- rep(NA, nrow(data))[range] } } plotdata <- data.frame(measured=data$data, fitted=outl, Class=class, batch=batch, order=c(1:nrow(data))) plotdata2 <- melt(plotdata, id.vars=c("batch","Class","order"), value.name="intensity", variable.name="data") ggplot(data=plotdata2, aes(x=order, y=log(intensity,10), color=data, shape=Class)) + geom_point(alpha=0.5, size=2) + theme(panel.background=element_blank()) + scale_color_manual(values=manual_color)
After smoothed spline fit per each batch is calculated, we can apply signal correction within each batch.
# median intensity value is used to adjust batch effect mpa <- rep(NA, nrow(data)) for (bch in 1:8) { mpa[batch==bch] <- median(data$data[batch==bch], na.rm=TRUE) } QC_fit <- outl/mpa # and correct actual data res <- data$data/QC_fit # correct data using subtratcion res2 <- (data$data -outl) +mpa plotdata <- data.frame(measured=data$data, corrected_subt=res2, corrected_div=res, Class=class, batch=batch, order=c(1:nrow(data))) plotdata2 <- melt(plotdata, id.vars=c("batch","Class","order"), value.name="intensity", variable.name="data") ggplot(data=plotdata2, aes(x=order, y=log(intensity,10), color=data, shape=Class)) + geom_point(alpha=0.2, size=2) + theme(panel.background=element_blank()) + scale_color_manual(values=manual_color) + geom_smooth(se=FALSE) + facet_grid(Class ~ .)
The figure above shows the measured data points in blue and the corrected values using subtraction (red) or division(green). Fitted smoothed spline curves of the corrected data over all batches still indicates that there is batch related signal drift in the data. This can be corrected using the "grand median".
First, a grand median is calculated across all batches, and then difference between each batch median and the grand median is subtracted from all the samples in that batch, to remove the difference.
mpa <- rep(NA, nrow(data)) for (bch in 1:8) { mpa[batch == bch] <- median(res2[batch == bch], na.rm=TRUE) } grandMedian <- median(res2, na.rm=TRUE) mpa <- mpa - grandMedian plotdata$corrected_subt <- plotdata$corrected_subt - mpa mpa <- rep(NA, nrow(data)) for (bch in 1:8) { mpa[batch == bch] <- median(res[batch == bch], na.rm=TRUE) } grandMedian <- median(res, na.rm=TRUE) mpa <- mpa - grandMedian plotdata$corrected_div <- plotdata$corrected_div - mpa plotdata2 <- melt(plotdata, id.vars=c("batch","Class","order"), value.name="intensity", variable.name="data") ggplot(data=plotdata2, aes(x=order, y=log(intensity,10), color=data, shape=Class)) + geom_point(alpha=0.2, size=2) + theme(panel.background=element_blank()) + scale_color_manual(values=manual_color) + geom_smooth(se=FALSE) + facet_grid(Class ~ .)
We can calculate RSD% before and after correction.
FUN <- function(x) sd(x, na.rm=TRUE)/mean(x, na.rm=TRUE) * 100 # RSD% of biological and QC samples within all 6 batches: out <- matrix(nrow=2, ncol=2) colnames(out) <- c("Biological","QC") rownames(out) <- c("measured", "corrected") out[1,1] <-FUN(data$data[-QChits]) out[1,2] <-FUN(data$data[QChits]) out[2,1] <-FUN(res[-QChits]) out[2,2] <-FUN(res[QChits]) round(out, 2) # RSD% per batch: out <- matrix(ncol=8,nrow=4) colnames(out) <- unique(batch) rownames(out) <- c("Biological","QC","Corrected biological","Corrected QC") for(i in 1:8) { out[1, i] <- FUN(data$data[which(!class=="QC" & batch==i)]) out[2, i] <- FUN(data$data[which(class=="QC" & batch==i)]) out[3, i] <- FUN(res[which(!class=="QC" & batch==i)]) out[4, i] <- FUN(res[which(class=="QC" & batch==i)]) } round(out, 2)
The data has now been corrected for batch and signal drift effects.
All the steps from example above can be applied to all or susbet of features in the data set using function "QCRSC".
data <- MTBLS79[feature_names,] class <- MTBLS79$Class batch <- MTBLS79$Batch sample_order <- c(1:ncol(data)) corrected_data <- QCRSC(df=data, order=sample_order, batch=batch, classes=class, spar=0, minQC=4)
We can calculate RSD% statistics per batch before and after correction and visualise the results with a box plot.
data <- SummarizedExperiment::assay(data) corrected_data <- SummarizedExperiment::assay(corrected_data) RSDQC <- matrix(ncol=8, nrow=nrow(data)) RSDsample <- matrix(ncol=8, nrow=nrow(data)) colnames(RSDQC) <- unique(batch) colnames(RSDsample) <- unique(batch) RSDQC_corrected <- matrix(ncol=8, nrow=nrow(data)) RSDsample_corrected <- matrix(ncol=8, nrow=nrow(data)) colnames(RSDQC_corrected) <- unique(batch) colnames(RSDsample_corrected) <- unique(batch) rownames(RSDQC) <- rownames(data) rownames(RSDsample) <- rownames(data) rownames(RSDQC_corrected) <- rownames(data) rownames(RSDsample_corrected) <- rownames(data) # for each feature for (i in 1:nrow(data)) { # for each batch for (nb in 1:8) { # RSD% of QCs in this batch RSDQC[i, nb] <- FUN(data[i, which(class == "QC" & batch == nb)]) # RSD% of samples in this batch RSDsample[i, nb] <- FUN(data[i, which(!class == "QC" & batch == nb)]) # RSD% of QCs in this batch after correction RSDQC_corrected[i, nb] <- FUN(corrected_data[i, which(class == "QC" & batch == nb)]) # RSD% of samples in this batch after correction RSDsample_corrected[i, nb] <- FUN(corrected_data[i, which(!class == "QC" & batch == nb)]) } } # prepare results for plotting plotdataQC <- melt(as.data.frame(RSDQC), variable.name="batch", value.name="RSD") plotdataQC$Class <- "QC" plotdataBio <- melt(as.data.frame(RSDsample), variable.name="batch", value.name="RSD") plotdataBio$Class <- "Sample" plotdataQC_corrected <- melt(as.data.frame(RSDQC_corrected), variable.name="batch", value.name="RSD") plotdataQC_corrected$Class <- "QC_corr" plotdataBio_corrected <- melt(as.data.frame(RSDsample_corrected), variable.name="batch", value.name="RSD") plotdataBio_corrected$Class <- "Sample_corr" plotdata <- rbind(plotdataQC, plotdataQC_corrected) plotdata$Class <- as.factor(plotdata$Class) # plot ggplot(data=plotdata, aes(x=Class, y=RSD, fill=Class)) + geom_boxplot() + facet_wrap(~ batch, ncol=3) + ylab("RSD%") + xlab("") + scale_x_discrete(labels=NULL) + theme(panel.background=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) + scale_y_continuous(limits=c(0, 50)) plotdata <- rbind(plotdataBio, plotdataBio_corrected) plotdata$Class <- as.factor(plotdata$Class) # plot ggplot(data=plotdata, aes(x=Class, y=RSD, fill=Class)) + geom_boxplot() + facet_wrap(~ batch, ncol=3) + ylab("RSD%") + xlab("") + theme(panel.background=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank())
Or compare the scores plots of principal components analysis (PCA) before and after correction.
# PQN used to normalise data # KNN for missing value imputation # glog scaling method # A more detailed overview is detailed in # Di Guida et al, Metabolomics, 12:93, 2016 # https://dx.doi.org/10.1007/s11306-016-1030-9 pca_data <- pqn_normalisation(data, classes=class, qc_label="QC") pca_data <- mv_imputation(pca_data, method="KNN", k=5, rowmax=0.5, colmax=0.5, maxp=NULL, check_df=FALSE) pca_data <- glog_transformation(pca_data, classes=class, qc_label="QC") pca_corrected_data <- pqn_normalisation(corrected_data, classes=class, qc_label="QC") pca_corrected_data <- mv_imputation(pca_corrected_data, method="KNN", k=5, rowmax=0.5, colmax=0.5, maxp=NULL, check_df=FALSE) pca_corrected_data <- glog_transformation(pca_corrected_data, classes=class, qc_label="QC") pca_data <- prcomp(t(pca_data), center=TRUE, scale=FALSE) pca_corrected_data <- prcomp(t(pca_corrected_data), center=TRUE, scale=FALSE) # Calculate percentage of explained variance of the first two PC's exp_var_pca <- round(((pca_data$sdev^2)/sum(pca_data$sdev^2)*100)[1:2],2) exp_var_pca_corrected <- round(((pca_corrected_data$sdev^2) / sum(pca_corrected_data$sdev^2)*100)[1:2],2) plots <- list() plotdata <- data.frame(PC1=pca_data$x[, 1], PC2=pca_data$x[, 2], batch=as.factor(batch), class=class) plots[[1]] <- ggplot(data=plotdata, aes(x=PC1, y=PC2, col=batch)) + geom_point(size=2) + theme(panel.background=element_blank()) + scale_color_manual(values=manual_color) + ggtitle("PCA scores, before correction") + xlab(paste0("PC1 (", exp_var_pca[1] ," %)")) + ylab(paste0("PC2 (", exp_var_pca[2] ," %)")) plots[[2]] <- ggplot(data=plotdata, aes(x=PC1, y=PC2, col=class)) + geom_point(size=2) + theme(panel.background=element_blank()) + scale_color_manual(values=manual_color) + ggtitle("PCA scores, before correction") + xlab(paste0("PC1 (", exp_var_pca[1] ," %)")) + ylab(paste0("PC2 (", exp_var_pca[2] ," %)")) plotdata_corr <- data.frame(PC1=pca_corrected_data$x[, 1], PC2=pca_corrected_data$x[, 2], batch=as.factor(batch), class=class) plots[[3]] <- ggplot(data=plotdata_corr, aes(x=PC1, y=PC2, col=batch)) + geom_point(size=2) + theme(panel.background=element_blank()) + scale_color_manual(values=manual_color) + ggtitle("PCA scores, after correction") + xlab(paste0("PC1 (", exp_var_pca_corrected[1] ," %)")) + ylab(paste0("PC2 (", exp_var_pca_corrected[2] ," %)")) plots[[4]] <- ggplot(data=plotdata_corr, aes(x=PC1, y=PC2, col=class)) + geom_point(size=2) + theme(panel.background=element_blank()) + scale_color_manual(values=manual_color) + ggtitle("PCA scores, after correction") + xlab(paste0("PC1 (", exp_var_pca_corrected[1] ," %)")) + ylab(paste0("PC2 (", exp_var_pca_corrected[2] ," %)")) grid.arrange(ncol=2, plots[[1]], plots[[2]], plots[[3]], plots[[4]])
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.