pmp_vignette_sbc_spectral_quality_assessment.R
In pmp: Peak Matrix Processing and signal batch correction for metabolomics datasets

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    fig.width=5,
    fig.height=5
)

## ----echo=TRUE, message=FALSE, warning=FALSE, eval=FALSE----------------------
#  install.packages("gridExtra")
#  
#  if (!requireNamespace("BiocManager", quietly = TRUE))
#      install.packages("BiocManager")
#  BiocManager::install("pmp")

## ---- message=FALSE, warning=FALSE--------------------------------------------
library(S4Vectors)
library(SummarizedExperiment)
library(pmp)
library(ggplot2)
library(reshape2)
library(gridExtra)

## ---- eval=FALSE--------------------------------------------------------------
#  help ("MTBLS79")

## -----------------------------------------------------------------------------
feature_names <- c("70.03364", "133.07379", "146.16519", "163.04515", 
    "174.89483", "200.03196", "207.07818", "221.05062", "240.02445",
    "251.03658", "266.01793", "304.99115", "321.07923", "338.98131", 
    "376.03962", "393.35878", "409.05716", "430.24353", "451.01086",
    "465.14937")
summary(t(SummarizedExperiment::assay(MTBLS79[feature_names, ])))

## -----------------------------------------------------------------------------
#number of samples:
ncol(MTBLS79)

## -----------------------------------------------------------------------------
#Batches:
unique(MTBLS79$Batch)

## -----------------------------------------------------------------------------
#Sample classes:
unique(MTBLS79$Class)

## ----fig.height=5, fig.width=4, message=FALSE, warning=FALSE------------------
#  separate the LCMS data from the meta data
data(MTBLS79)
data <- SummarizedExperiment::assay(MTBLS79[feature_names, ])
class <- SummarizedExperiment::colData(MTBLS79)$Class
batch <- SummarizedExperiment::colData(MTBLS79)$Batch
order <- c(1:ncol(data))

# get index of QC samples
QChits <- which(class == "QC")

# small function to calculate RSD%
FUN <- function(x) sd(x, na.rm=TRUE) / mean(x, na.rm=TRUE) * 100

# RSD% of biological and QC samples within all 8 batches:
out <- matrix(ncol=2, nrow=nrow(data))
colnames(out) <- c("Sample","QC")
rownames(out) <- rownames(data)

# for each feature calculate RSD% for the samples and the QCs
for (i in 1:nrow(data)) {
    out[i, 1] <- FUN(data[i, -QChits]) # for samples
    out[i, 2] <- FUN(data[i, QChits]) # for QCs
}

# prepare data for plotting
plotdata <- melt(data.frame(out), variable.name="Class", value.name="RSD")
plotdata$feature <- rownames(data)

plotdata$RSD <- round(plotdata$RSD,0)
plotdata$feature <- factor(plotdata$feature, ordered=TRUE,
    levels=unique(plotdata$feature))

# plot
ggplot(data=plotdata, aes(x=Class, y=feature, fill=RSD)) + 
    geom_tile() + 
    geom_text(aes(label=RSD)) +
    scale_fill_gradient2(low="black", mid="white", high="red")

## -----------------------------------------------------------------------------
ggplot(data=plotdata, aes(x=Class, y=RSD, fill=Class)) +
    geom_violin(draw_quantiles=c(0.25,0.5,0.75)) +
    ylab("RSD%") + 
    guides(fill=FALSE) +
    theme(panel.background=element_blank())

## ----message=FALSE, warning=FALSE, fig.height=6-------------------------------
# prepare some matrices to store the results
RSDQC <- matrix(ncol=8, nrow=nrow(data))
RSDsample <- matrix(ncol=8, nrow=nrow(data))
colnames(RSDQC) <- unique(batch)
colnames(RSDsample) <- unique(batch)

rownames(RSDQC) <- rownames(data)
rownames(RSDsample) <- rownames(data)

# for each feature
for (i in 1:nrow(data)) {
    # for each batch
    for (nb in 1:8) {
        # RSD% of QCs in this batch
        RSDQC[i, nb] <- FUN(data[i, which(class == "QC" & batch == nb)])
        # RSD% of samples in this batch
        RSDsample[i, nb] <- FUN(data[i, which(!class == "QC" & batch == nb)])
    }
}

# prepare results for plotting
plotdataQC <- melt(as.data.frame(RSDQC), variable.name="batch",
    value.name="RSD")
plotdataQC$Class <- "QC"

plotdataBio <- melt(as.data.frame(RSDsample), variable.name="batch",
    value.name="RSD")
plotdataBio$Class <- "Sample"

plotdata <- rbind(plotdataQC, plotdataBio)

plotdata$Class <- as.factor(plotdata$Class)

# plot
ggplot(data=plotdata, aes(x=Class, y=RSD, fill=Class)) + geom_boxplot() +
    facet_wrap(~ batch, ncol=3) +
    ylab("RSD%") +
    xlab("") +
    scale_x_discrete(labels=NULL) +
    theme(panel.background=element_blank(), axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

## -----------------------------------------------------------------------------
summary(RSDQC)

## -----------------------------------------------------------------------------
summary(RSDsample)

## ----message=FALSE, warning=FALSE---------------------------------------------

# prepare a list of colours for plotting
manual_color = c("#386cb0", "#ef3b2c", "#7fc97f", "#fdb462", "#984ea3", 
    "#a6cee3", "#778899", "#fb9a99", "#ffff33")

# Function to calculate median absolute deviation
DRatfun <- function(samples, qcs) mad(qcs) / mad(samples)

# prepare matrix for dratio output
dratio <- matrix(ncol=8, nrow=nrow(data))
colnames(dratio) <- unique(batch)
rownames(dratio) <- rownames(data)

# calculate dratio for each feature, per batch
for (i in 1:nrow(dratio)){
    for (nb in 1:8) {
        dratio[i, nb] <- DRatfun(samples=data[i, which(!class == "QC" &
        batch == nb)], qcs=data[i, which(class == "QC" & batch == nb)])
    }
}

# prepare data for plotting
dratio <- as.data.frame(round(dratio, 2))

plotdata2 <- melt(dratio, variable.name="batch")
plotdata2$index <- rownames(data)
plotdata2$index <- factor(plotdata2$index, ordered=TRUE,
    levels=unique(plotdata2$index))

ggplot(data=plotdata2, aes(x=index, y=value, color=batch)) +
    geom_point(size=2) +
    xlab("index") + ylab("D-ratio") +
    geom_hline(yintercept=1) + theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    theme(axis.text.x=element_text(angle=90))

## ---- fig.width=6.5, fig.height=5---------------------------------------------
pca_data <- MTBLS79[feature_names, ]

pca_data <- pqn_normalisation(pca_data, classes=class, qc_label="QC")
pca_data <- mv_imputation(pca_data, method="KNN", k=5, rowmax=0.5,
    colmax=0.5, maxp=NULL, check_df=FALSE)
pca_data <- glog_transformation(pca_data, classes=class, qc_label="QC")

pca_data <- prcomp(t(SummarizedExperiment::assay(pca_data)), center=TRUE, 
    scale=FALSE)
exp_var_pca <- round(((pca_data$sdev^2)/sum(pca_data$sdev^2)*100)[1:2], 2)

plots <- list()

plotdata <- data.frame(PC1=pca_data$x[, 1], PC2=pca_data$x[, 2],
    batch=as.factor(batch), class=class)

plots[[1]] <- ggplot(data=plotdata, aes(x=PC1, y=PC2, col=batch)) +
    geom_point(size=2) +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    ggtitle("PCA scores, before correction") +
    xlab(paste0("PC1 (", exp_var_pca[1] ," %)")) +
    ylab(paste0("PC2 (", exp_var_pca[2] ," %)"))

plots[[2]] <- ggplot(data=plotdata, aes(x=PC1, y=PC2, col=class)) +
    geom_point(size=2) +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    ggtitle("PCA scores plot, before correction") +
    xlab(paste0("PC1 (", exp_var_pca[1] ," %)")) +
    ylab(paste0("PC2 (", exp_var_pca[2] ," %)"))

grid.arrange(ncol=2, plots[[1]], plots[[2]])

## ----message=FALSE, warning=FALSE, fig.height=10------------------------------

# autoscale the QC data
QCdata <- data[ ,QChits]
QCdata2 <- as.data.frame(scale(t(QCdata), scale=TRUE, center=TRUE))

# prepare the data for plotting
plotdata <- melt(QCdata2, value.name="intensity")
plotdata$index <- rep(1:nrow(QCdata2), ncol(QCdata2))

plotdata$batch <- as.factor(batch[QChits])

# plot
ggplot(data=plotdata, aes(x=index, y=intensity, col=batch)) +
    geom_point(size=2) +
    facet_wrap(~ variable, ncol=4) +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color)

## ---- warning=FALSE, fig.height=10--------------------------------------------
ggplot(data=plotdata, aes(x=index, y=intensity, col=batch)) + 
    geom_point(size=2) +
    facet_wrap(~ variable, ncol=4) +
    geom_smooth(method="lm", se=TRUE, colour="black") +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color)

## ----message=FALSE, warning=FALSE, fig.height=7.5-----------------------------
sampleorder <- c(1:ncol(QCdata))

correlations <- matrix(ncol=2, nrow=nrow(data))
rownames(correlations) <- rownames(data)
colnames(correlations) <- c("tau","p.value")
correlations <- as.data.frame(correlations)

for (coln in 1:nrow(data)) {
    stat <- cor.test(sampleorder, QCdata[coln, ], method="kendall")
    correlations$tau[coln] <- stat$estimate
    correlations$p.value[coln] <- stat$p.value
}

correlations


## ---- warning=FALSE, message=FALSE--------------------------------------------

correlations <- matrix(ncol=8, nrow=nrow(data))
rownames(correlations) <- rownames(data)
colnames(correlations) <- unique(batch)

QCbatch <- batch[QChits]

for (coln in 1:nrow(data)) {
    for (bch in 1:8) {
        sampleorder <- scale(c(1:length(which(QCbatch==bch))),
        center=TRUE, scale=TRUE)

        if ((length(sampleorder) - 
        length(which(is.na(QCdata[coln, which(QCbatch==bch)])))) >= 3){
            correlations[coln, bch] <- cor.test(sampleorder,
            QCdata[coln, which(QCbatch==bch)], method="kendall")$estimate
        }
    }
}

round(correlations, 2)

## ---- fig.wide=TRUE-----------------------------------------------------------
plotdata <- as.data.frame(correlations)
plotdata$feature <- rownames(plotdata)

plotdata <- melt(plotdata, variable.name="batch")
plotdata$feature <- factor(plotdata$feature, ordered=TRUE,
    levels = unique(plotdata$feature))

ggplot(data=plotdata, aes(x=batch, y=feature, fill=value)) + 
    geom_tile() + scale_fill_gradient2()


## -----------------------------------------------------------------------------
sampleorder <- c(1:ncol(QCdata))

regressionout <- matrix(ncol=3, nrow=nrow(data))
rownames(regressionout) <- rownames(data)
colnames(regressionout) <- c("R2.adj","coefficient","p.value")
regressionout <- as.data.frame(regressionout)

for (coln in 1:nrow(data)) {
    tempdat <- data.frame(x=sampleorder, y=QCdata[coln, ])
    stat <- lm(x ~ y, data=tempdat)
    stat <- summary(stat)

    regressionout$R2.adj[coln] <- stat$adj.r.squared  
    regressionout$coefficient[coln] <- stat$coefficients[2,1]
    regressionout$p.value[coln] <- stat$coefficients[2,4]
}

regressionout

## -----------------------------------------------------------------------------
regPerBatch <- matrix(ncol=8, nrow=nrow(data))
rownames(regPerBatch) <- rownames(data)
colnames(regPerBatch) <- unique(batch)

QCbatch <- MTBLS79$Batch[QChits]

for (coln in 1:nrow(data)) {
    for (bch in 1:8) {
        sampleorder <- c(1:length(which(QCbatch == bch)))
        tempdat <- data.frame(x=sampleorder, y=QCdata[coln, 
            which(QCbatch==bch)])
        stat <- lm(x ~ y, data=tempdat)
        stat <- summary(stat)
        regPerBatch[coln,bch] <- stat$adj.r.squared
    }
}

round(regPerBatch,2)

## ---- fig.wide=TRUE-----------------------------------------------------------
plotdata <- as.data.frame(regPerBatch)
plotdata$feature <- rownames(plotdata)

plotdata <- melt(plotdata, variable.name="batch")
plotdata$feature <- factor(plotdata$feature, ordered=TRUE, 
    levels=unique(plotdata$feature))

ggplot(data=plotdata, aes(x=batch, y=feature, fill=value)) + 
    geom_tile() + scale_fill_gradient2()

## ---- warning=FALSE, message=FALSE--------------------------------------------
data <- data.frame(data=
    as.vector(SummarizedExperiment::assay(MTBLS79["451.01086", ])), batch=batch,
    class=factor(class, ordered=TRUE))
data$order <- c(1:nrow(data))
data$batch <- as.factor(data$batch)

ggplot(data=data, aes(x=order, y=log(data,10), col=batch, shape=class)) +
    geom_point(size=2) + theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color)

## -----------------------------------------------------------------------------

QCdata <- data[data$class == "QC",]

ggplot(data=QCdata, aes(x=order, y=log(data,10), col=batch, shape=class)) +
    geom_point(size=2) + theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color, drop=FALSE) +
    scale_shape_manual(values=c(16, 17, 15), drop=FALSE)

## -----------------------------------------------------------------------------
FUN <- function(x) sd(x, na.rm=TRUE)/mean(x, na.rm=TRUE) * 100

# RSD% of biological and QC samples within all 6 batches:
out <- c(NA,NA)
names(out) <- c("Biological","QC")
out[1] <-FUN(data$data[-QChits])
out[2] <-FUN(data$data[QChits])
out

## -----------------------------------------------------------------------------
# RSD% per batch:
out <- matrix(ncol=8,nrow=2)
colnames(out) <- unique(batch)
rownames(out) <- c("Biological","QC")
for (i in 1:8) {
    out[1, i] <- FUN(data$data[which(!class=="QC" & batch==i)])
    out[2, i] <- FUN(data$data[which(class=="QC" & batch==i)])
}

out

## -----------------------------------------------------------------------------
qcData <- data$data[class == "QC"]
qc_batch <- batch[class == "QC"]
qc_order <- order[class == "QC"]

qcData


## -----------------------------------------------------------------------------
nbatch <- unique(qc_batch)

nb <- 6

# Sample injection order
x <- qc_order[qc_batch==nbatch[nb]]

# Measured peak intensity or area
y <- qcData[qc_batch==nbatch[nb]]
y

## -----------------------------------------------------------------------------
NAhits <- which(is.na(y))
if (length(NAhits)>0) {
    x <- x[-c(NAhits)]
    y <- y[-c(NAhits)]
    rbind(x,y)
}

## -----------------------------------------------------------------------------
y <- log((y + sqrt(y^2)) / 2)
y

## -----------------------------------------------------------------------------
sp.obj <- smooth.spline(x, y, cv=TRUE) 
sp.obj

out <- rbind(y,sp.obj$y)
row.names(out) <- c("measured","fitted")
out

## -----------------------------------------------------------------------------
valuePredict=predict(sp.obj, order[batch==nb])

plotchr <- as.numeric(data$class)

# reverse the log transformation to convert the predictions back to the
# original scale
valuePredict$y <- exp(valuePredict$y)

plotdata <- data.frame(measured=data$data[batch==nb], fitted=valuePredict$y,
    Class=class[batch==nb], order=order[batch==nb])
plotdata2 <- melt(plotdata, id.vars=c("Class","order"), value.name="intensity",
    variable.name="data")

ggplot(data=plotdata2, aes(x=order, y=log(intensity,10), color=data,
    shape=Class)) + geom_point(size=2) +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    scale_shape_manual(values=c(16, 17, 15), drop=FALSE)

## ---- warning=FALSE-----------------------------------------------------------
fitmedian <- median(plotdata$measured, na.rm=TRUE)
plotdata$corrected_subt <- (plotdata$measured - plotdata$fitted) + fitmedian

plotdata2 <- melt(plotdata, id.vars=c("Class","order"), 
    value.name="intensity", variable.name="data")

plotdata_class <- as.character(plotdata2$Class)
plotdata_class[plotdata_class == "S"] <- "Sample"
plotdata_class[plotdata_class == "C"] <- "Sample"
plotdata2$Class <- factor(plotdata_class)

ggplot(data=plotdata2, aes(x=order, y=intensity, color=data, shape=Class)) +
    geom_point(size=2) + theme(panel.background=element_blank()) + 
    scale_color_manual(values=manual_color) +
    facet_grid(Class ~ .) +
    scale_shape_manual(values=c(17, 16), drop=FALSE)

## ---- warning=FALSE-----------------------------------------------------------
plotdata$corrected_div <- plotdata$measured/(plotdata$fitted/fitmedian)

plotdata3 <- plotdata[,c("Class", "order", "corrected_subt", "corrected_div")]

plotdata3 <- melt(plotdata3, id.vars=c("Class","order"), 
    value.name="intensity", variable.name="data")

plotdata_class <- as.character(plotdata3$Class)
plotdata_class[plotdata_class=="S"] <- "Sample"
plotdata_class[plotdata_class=="C"] <- "Sample"
plotdata3$Class <- factor(plotdata_class)

ggplot(data=plotdata3, aes(x=order, y=intensity, color=data, shape=Class)) +
    geom_point(size=2) + theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    geom_smooth(se=FALSE) + facet_grid(Class ~ .)


## ---- warning=FALSE-----------------------------------------------------------

outl <- rep(NA, nrow(data))

for (nb in 1:length(nbatch)){
    # assigning sample injection order for a batch to 'x', and corresponding 
    # intensities to 'y'
    x <- qc_order[qc_batch == nbatch[nb]]
    y <- qcData[qc_batch == nbatch[nb]]

    # remove measurements with missing values
    NAhits <- which(is.na(y))
    if (length(NAhits) > 0) {
        x <- x[-c(NAhits)]
        y <- y[-c(NAhits)]
    }

    # require at least 4 data points for QC fit
    if (length(y) >= 4) {
        range <- c(batch == nbatch[nb])
        # Order is a vector of sample indices for the current batch
        outl[range] <- pmp:::splineSmoother(x=x, y=y, newX=order[range], 
        log=TRUE, a=1, spar=0)

        # If less than 5 data points are present, return empty values   
    } else {
        range <- c(batch == nbatch[nb])
        outl[range] <- rep(NA, nrow(data))[range]
    }
}

plotdata <- data.frame(measured=data$data, fitted=outl, Class=class, 
    batch=batch, order=c(1:nrow(data)))
plotdata2 <- melt(plotdata, id.vars=c("batch","Class","order"),
    value.name="intensity", variable.name="data")

ggplot(data=plotdata2, aes(x=order, y=log(intensity,10), 
    color=data, shape=Class)) + geom_point(alpha=0.5, size=2) +
    theme(panel.background=element_blank()) + 
    scale_color_manual(values=manual_color)


## ---- warning=FALSE-----------------------------------------------------------

# median intensity value is used to adjust batch effect

mpa <- rep(NA, nrow(data))

for (bch in 1:8) {
    mpa[batch==bch] <- median(data$data[batch==bch], na.rm=TRUE)
}


QC_fit <- outl/mpa

# and correct actual data
res <- data$data/QC_fit

# correct data using subtratcion
res2 <- (data$data -outl) +mpa

plotdata <- data.frame(measured=data$data, corrected_subt=res2, 
    corrected_div=res, Class=class, batch=batch, order=c(1:nrow(data)))
plotdata2 <- melt(plotdata, id.vars=c("batch","Class","order"), 
    value.name="intensity", variable.name="data")

ggplot(data=plotdata2, aes(x=order, y=log(intensity,10), 
    color=data, shape=Class)) + geom_point(alpha=0.2, size=2) +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    geom_smooth(se=FALSE) +
    facet_grid(Class ~ .)

## ---- warning=FALSE-----------------------------------------------------------
mpa <- rep(NA, nrow(data))

for (bch in 1:8) {
    mpa[batch == bch] <- median(res2[batch == bch], na.rm=TRUE)
}

grandMedian <- median(res2, na.rm=TRUE)

mpa <- mpa - grandMedian

plotdata$corrected_subt <- plotdata$corrected_subt - mpa

mpa <- rep(NA, nrow(data))

for (bch in 1:8) {
    mpa[batch == bch] <- median(res[batch == bch], na.rm=TRUE)
}

grandMedian <- median(res, na.rm=TRUE)

mpa <- mpa - grandMedian

plotdata$corrected_div <- plotdata$corrected_div - mpa

plotdata2 <- melt(plotdata, id.vars=c("batch","Class","order"),
    value.name="intensity", variable.name="data")

ggplot(data=plotdata2, aes(x=order, y=log(intensity,10),
    color=data, shape=Class)) + geom_point(alpha=0.2, size=2) +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    geom_smooth(se=FALSE) +
    facet_grid(Class ~ .)

## ---- warning=FALSE-----------------------------------------------------------
FUN <- function(x) sd(x, na.rm=TRUE)/mean(x, na.rm=TRUE) * 100

# RSD% of biological and QC samples within all 6 batches:
out <- matrix(nrow=2, ncol=2)
colnames(out) <- c("Biological","QC")
rownames(out) <- c("measured", "corrected")
out[1,1] <-FUN(data$data[-QChits])
out[1,2] <-FUN(data$data[QChits])
out[2,1] <-FUN(res[-QChits])
out[2,2] <-FUN(res[QChits])

round(out, 2)

# RSD% per batch:
out <- matrix(ncol=8,nrow=4)
colnames(out) <- unique(batch)
rownames(out) <- c("Biological","QC","Corrected biological","Corrected QC")
for(i in 1:8) {
    out[1, i] <- FUN(data$data[which(!class=="QC" & batch==i)])
    out[2, i] <- FUN(data$data[which(class=="QC" & batch==i)])
    out[3, i] <- FUN(res[which(!class=="QC" & batch==i)])
    out[4, i] <- FUN(res[which(class=="QC" & batch==i)])
}

round(out, 2)

## ----message=FALSE, warning=FALSE, include=TRUE-------------------------------
data <- MTBLS79[feature_names,]

class <- MTBLS79$Class
batch <- MTBLS79$Batch
sample_order <- c(1:ncol(data))

corrected_data <- QCRSC(df=data, order=sample_order, batch=batch,
    classes=class, spar=0, minQC=4)

## ---- warning=FALSE-----------------------------------------------------------
data <- SummarizedExperiment::assay(data)
corrected_data <- SummarizedExperiment::assay(corrected_data)
RSDQC <- matrix(ncol=8, nrow=nrow(data))
RSDsample <- matrix(ncol=8, nrow=nrow(data))
colnames(RSDQC) <- unique(batch)
colnames(RSDsample) <- unique(batch)

RSDQC_corrected <- matrix(ncol=8, nrow=nrow(data))
RSDsample_corrected <- matrix(ncol=8, nrow=nrow(data))
colnames(RSDQC_corrected) <- unique(batch)
colnames(RSDsample_corrected) <- unique(batch)

rownames(RSDQC) <- rownames(data)
rownames(RSDsample) <- rownames(data)
rownames(RSDQC_corrected) <- rownames(data)
rownames(RSDsample_corrected) <- rownames(data)

# for each feature
for (i in 1:nrow(data)) {
    # for each batch
    for (nb in 1:8) {
        # RSD% of QCs in this batch
        RSDQC[i, nb] <- FUN(data[i, which(class == "QC" & batch == nb)])
        # RSD% of samples in this batch
        RSDsample[i, nb] <- FUN(data[i, which(!class == "QC" & batch == nb)])
        # RSD% of QCs in this batch after correction
        RSDQC_corrected[i, nb] <- FUN(corrected_data[i, which(class == "QC"
            & batch == nb)])
        # RSD% of samples in this batch after correction
        RSDsample_corrected[i, nb] <- FUN(corrected_data[i, which(!class == "QC"
            & batch == nb)]) 
    }
}

# prepare results for plotting
plotdataQC <- melt(as.data.frame(RSDQC), variable.name="batch",
    value.name="RSD")
plotdataQC$Class <- "QC"

plotdataBio <- melt(as.data.frame(RSDsample), variable.name="batch",
    value.name="RSD")
plotdataBio$Class <- "Sample"

plotdataQC_corrected <- melt(as.data.frame(RSDQC_corrected),
    variable.name="batch", value.name="RSD")
plotdataQC_corrected$Class <- "QC_corr"

plotdataBio_corrected <- melt(as.data.frame(RSDsample_corrected),
    variable.name="batch", value.name="RSD")
plotdataBio_corrected$Class <- "Sample_corr"

plotdata <- rbind(plotdataQC, plotdataQC_corrected)

plotdata$Class <- as.factor(plotdata$Class)

# plot
ggplot(data=plotdata, aes(x=Class, y=RSD, fill=Class)) + geom_boxplot() +
    facet_wrap(~ batch, ncol=3) +
    ylab("RSD%") +
    xlab("") +
    scale_x_discrete(labels=NULL) +
    theme(panel.background=element_blank(), axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) +
    scale_y_continuous(limits=c(0, 50))

plotdata <- rbind(plotdataBio, plotdataBio_corrected)

plotdata$Class <- as.factor(plotdata$Class)

# plot
ggplot(data=plotdata, aes(x=Class, y=RSD, fill=Class)) + geom_boxplot() +
    facet_wrap(~ batch, ncol=3) +
    ylab("RSD%") +
    xlab("") +
    theme(panel.background=element_blank(), axis.text.x=element_blank(),
        axis.ticks.x=element_blank())


## ---- fig.width=6, fig.height=8-----------------------------------------------
# PQN used to normalise data
# KNN for missing value imputation
# glog scaling method 
# A more detailed overview is detailed in
# Di Guida et al, Metabolomics, 12:93, 2016
# https://dx.doi.org/10.1007/s11306-016-1030-9

pca_data <- pqn_normalisation(data, classes=class, qc_label="QC")
pca_data <- mv_imputation(pca_data, method="KNN", k=5, rowmax=0.5,
    colmax=0.5, maxp=NULL, check_df=FALSE)
pca_data <- glog_transformation(pca_data, classes=class, qc_label="QC")

pca_corrected_data <- pqn_normalisation(corrected_data, classes=class,
    qc_label="QC")
pca_corrected_data <- mv_imputation(pca_corrected_data, method="KNN", k=5,
    rowmax=0.5, colmax=0.5, maxp=NULL, check_df=FALSE)
pca_corrected_data <- glog_transformation(pca_corrected_data, 
    classes=class, qc_label="QC")

pca_data <- prcomp(t(pca_data), center=TRUE, scale=FALSE)
pca_corrected_data <- prcomp(t(pca_corrected_data), center=TRUE, scale=FALSE)

# Calculate percentage of explained variance of the first two PC's
exp_var_pca <- round(((pca_data$sdev^2)/sum(pca_data$sdev^2)*100)[1:2],2)
exp_var_pca_corrected <- round(((pca_corrected_data$sdev^2) /
    sum(pca_corrected_data$sdev^2)*100)[1:2],2)

plots <- list()

plotdata <- data.frame(PC1=pca_data$x[, 1], PC2=pca_data$x[, 2], 
    batch=as.factor(batch), class=class)

plots[[1]] <- ggplot(data=plotdata, aes(x=PC1, y=PC2, col=batch)) +
    geom_point(size=2) + theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    ggtitle("PCA scores, before correction") +
    xlab(paste0("PC1 (", exp_var_pca[1] ," %)")) +
    ylab(paste0("PC2 (", exp_var_pca[2] ," %)"))

plots[[2]] <- ggplot(data=plotdata, aes(x=PC1, y=PC2, col=class)) +
    geom_point(size=2) + theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    ggtitle("PCA scores, before correction") +
    xlab(paste0("PC1 (", exp_var_pca[1] ," %)")) +
    ylab(paste0("PC2 (", exp_var_pca[2] ," %)"))

plotdata_corr <- data.frame(PC1=pca_corrected_data$x[, 1], 
    PC2=pca_corrected_data$x[, 2], batch=as.factor(batch), class=class)

plots[[3]] <- ggplot(data=plotdata_corr, aes(x=PC1, y=PC2, col=batch)) +
    geom_point(size=2) +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    ggtitle("PCA scores, after correction") +
    xlab(paste0("PC1 (", exp_var_pca_corrected[1] ," %)")) +
    ylab(paste0("PC2 (", exp_var_pca_corrected[2] ," %)"))

plots[[4]] <- ggplot(data=plotdata_corr, aes(x=PC1, y=PC2, col=class)) +
    geom_point(size=2) +
    theme(panel.background=element_blank()) +
    scale_color_manual(values=manual_color) +
    ggtitle("PCA scores, after correction") +
    xlab(paste0("PC1 (", exp_var_pca_corrected[1] ," %)")) +
    ylab(paste0("PC2 (", exp_var_pca_corrected[2] ," %)"))

grid.arrange(ncol=2, plots[[1]], plots[[2]], plots[[3]], plots[[4]])

## -----------------------------------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
pmp documentation built on April 1, 2021, 6:01 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
pmp
Peak Matrix Processing and signal batch correction for metabolomics datasets

inst/doc/pmp_vignette_sbc_spectral_quality_assessment.R
In pmp: Peak Matrix Processing and signal batch correction for metabolomics datasets

Try the pmp package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

pmp Peak Matrix Processing and signal batch correction for metabolomics datasets

inst/doc/pmp_vignette_sbc_spectral_quality_assessment.R In pmp: Peak Matrix Processing and signal batch correction for metabolomics datasets

Try the pmp package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

pmp
Peak Matrix Processing and signal batch correction for metabolomics datasets

inst/doc/pmp_vignette_sbc_spectral_quality_assessment.R
In pmp: Peak Matrix Processing and signal batch correction for metabolomics datasets