#' Input the RGlist raw data, build a Rnits object and perform filtering and normalization
#'
#' This function takes high-dimensional expression data as a RGList, creates a
#' \code{\linkS4class{Rnits}} object for subsequent filtering and normalization
#'
#' See the Limma User's Guide for more details on \code{read.maimages}, \code{normalizeBetweenArrays},
#' \code{normalizeWithinArrays} and \code{RGList}. For importing microarray raw data,
#' use the 'Targets file' to specify experimental design. The target file has columns
#' SlideNumber, FileName, Cy3 (description of Cy3 channel ref/control/treatment), Cy5
#' (description of Cy3 channel ref/control/treatment) and Time. Time values should be
#' identical for control and treatment.
#'
#' @param obj Raw expression data in \code{RGlist}, \code{AffyBatch} or simple data
#' frame format
#' @param probedata A data frame containing the probe names that should match the probe
#' names in raw data (optional)
#' @param phenodata A data frame with information about sample names. The rownames of
#' the data frame must match column names of the expression values. If input data is
#' data frame of log ratios, this is required.
#' @param filter An argument to perform background filtering of probes. If \code{NULL},
#' no filtering is done. If an integer (0-500), probes are flagged based on raw channel
#' intensity. If a vector of two numbers is provided, the first will be used for red channel
#' and the second for green channel. If \code{'background'}, probes whose intensities are
#' lower than 2 standard deviations less than the mean of the background intensity for
#' the channel are flagged.
#' @param normalize Character string specifying the normalization method for raw data.
#' If \code{Intensity}, the reference channels for all arrays are used to construct an
#' array-specific smoothing function which is then applied to normalize the sample channel.
#' If \code{Between}, the normalization method \code{normalizeBetweenArrays} in the
#' LIMMA package is used (use \code{normmethod} to further specify normalization
#' methods. See packaged LIMMA for details.). If \code{Within}, the normalization
#' method \code{normalizeWithinArrays} in the LIMMA package is used.
#' @param normmethod Normalization method for input data. Default \code{NULL}.
#' Can be one of 'quantile', 'vsn', 'Between'
#' @param background Only for AffyBatch data. If \code{TRUE}, background filtering
#' will be done on Affy data.
#' @param center If \code{TRUE}, the log-ratio data will be mean centered to
#' 0 in the column space.
#' @param plot If \code{TRUE}, boxplots of normalized channel intensities and
#' log-ratios are drawn.
#' @param threshold Default \code{0.8}. Fraction of samples with missing data
#' for individual probes to be filtered out.
#' @param logscale Default \code{FALSE}. Is the data in logscale? If FALSE,
#' log2 transformation is done on the data.
#'
#' @return An object of S4 class \code{\linkS4class{Rnits}} (which is
#' derived from class \code{\linkS4class{exprSet}}), containing the probe data,
#' design data, expression data, phenotypical data (i.e. Time).
#' @export
#' @docType methods
#' @seealso \code{ExpressionSet}
#'
#' @importFrom limma normalizeBetweenArrays normalizeWithinArrays normalizeQuantiles normalizeVSN
#' @importFrom affy rma
#' @importFrom impute impute.knn
#' @import Biobase
#' @importFrom graphics boxplot
#'
#' @examples
#' # load pre-compiled expressionSet object for Ronen and Botstein yeast chemostat data
#' data(yeastchemostat)
#' rnitsobj = build.Rnits(yeastchemostat, logscale = TRUE, normmethod = 'Between')
build.Rnits <- function(obj, probedata = NULL, phenodata = NULL, filter = NULL, normalize = NULL,
normmethod = NULL, plot = FALSE, center = FALSE, background = NULL, threshold = 0.8,
logscale = FALSE) {
callData <- list()
fdata <- pdata <- AnnotatedDataFrame(data = data.frame())
## Check necessary arguments
if (!"ExpressionSet" %in% class(obj)) {
if (is.null(phenodata))
stop("Phenotype data table must be provided for object\n")
if (!"Sample" %in% colnames(phenodata))
stop("Phenotype data table must have \"Sample\" column\n")
if (is.null(probedata))
cat("Probedata not provided\n")
}
if (!is.null(probedata) & is.null(probedata$ProbeID))
stop("Probe data table has no column named \"ProbeID")
if ("data.frame" %in% class(obj) | "matrix" %in% class(obj)) {
cat("Log scale is", logscale, "\n")
## Data frame checks
if (nrow(phenodata) != ncol(obj))
stop("Phenotype data table must have same number of columns as data table\n")
if (all(rownames(phenodata) != colnames(obj)))
stop("Phenotype data table must have the same column names as data table\n")
if (!is.null(probedata) && nrow(probedata) != nrow(obj))
stop("Probe data table must equal number of rows as data table\n")
## Data matrix checks
if (!logscale && min(obj, na.rm = TRUE) < 0)
stop("Data is indicated to be NOT in log scale, but negative values observed")
if (!logscale)
obj <- log2(obj)
callData$Type = "matrix"
if (is.null(normmethod)) {
cat("No filtering or normalization done for input object\n")
} else {
if (!normmethod %in% c("quantile", "vsn", "Between"))
stop("Invalid normmethod argument for data frame. Must be vsn, quantile or Between")
if (normmethod == "quantile") {
cat("Quantile normalization\n")
obj <- normalizeQuantiles(obj)
}
if (normmethod == "vsn") {
cat("VSN normalization\n")
obj <- normalizeVSN(obj)
}
if (normmethod == "Between") {
cat("Between Array normalization\n")
obj <- normalizeBetweenArrays(data.matrix(obj))
}
}
if (!is.null(probedata)) {
rownames(obj) <- probedata$ProbeID
rownames(probedata) <- probedata$ProbeID
fdata <- AnnotatedDataFrame(data = probedata)
} else {
probedata <- data.frame(ProbeID = paste0("p", 1:nrow(obj)))
fdata <- AnnotatedDataFrame(data = probedata)
if (is.null(rownames(obj)))
rownames(obj) <- probedata$ProbeID
}
pdata <- AnnotatedDataFrame(data = phenodata)
to <- new("Rnits", exprs = obj, phenoData = pdata, featureData = fdata, callData = callData)
## Probe filter
if (!is.null(filter)) {
if (!filter %in% c(TRUE, FALSE))
stop("filter argument must be TRUE or FALSE")
if (filter) {
cat("Probe filter is TRUE. Applying probe threshold of ", 100 * threshold,
"% samples\n")
probefilt <- which(apply(exprs(to), 1, function(x) sum(is.na(x))) >
threshold * ncol(exprs(to)))
cat(length(probefilt), " probes have NA values in more than ", ceiling(threshold *
ncol(exprs(to))), "samples\n")
if (length(probefilt) > 0)
to <- to[-probefilt, ]
}
}
} else if ("RGList" %in% class(obj)) {
callData$Type = "RGList"
## RGlist checks
if (nrow(phenodata) != ncol(obj))
stop("Phenotype data table must have same number of columns as data table\n")
if (!is.null(probedata) && nrow(probedata) != nrow(obj))
stop("Probe data table must have equal number of rows as data table\n")
if (!is.null(probedata) & !all(probedata$ProbeID == obj$genes$ID))
cat("Probe ID's do not match RGList gene ID's\n")
## Filter
if (is.null(filter)) {
callData$Filter = "None"
cat("No probe filtering\n")
} else {
if (filter[1] == "background") {
nacount <- rep(0, nrow(obj))
cat("Performing background filtering on data\n")
callData$Filter = "Background"
for (i in 1:ncol(obj$R)) {
r.id = which(obj$R[, i] <= median(obj$Rb[, i]) + 2 * sd(obj$Rb[,
i]))
g.id = which(obj$G[, i] <= median(obj$Gb[, i]) + 2 * sd(obj$Gb[,
i]))
nacount[r.id] <- nacount[r.id] + 1
nacount[g.id] <- nacount[g.id] + 1
}
} else if (is.numeric(filter)) {
callData$Filter = "Intensity"
if (max(filter) > 500 | min(filter) <= 0)
stop("Filter intensity should be between 0 and 500\n")
if (length(filter) == 2) {
filtR = filter[1]
filtG = filter[2]
} else if (length(filter) == 1) {
filtR = filtG = filter
} else stop("filter intensity should be either one or two integers")
nacount <- rep(0, nrow(obj))
cat("Performing Intensity filtering on data\n")
for (i in 1:ncol(obj$R)) {
r.id = which(obj$R[, i] <= filtR)
g.id = which(obj$G[, i] <= filtG)
nacount[r.id] <- nacount[r.id] + 1
nacount[g.id] <- nacount[g.id] + 1
}
} else stop("Filter argument should either be 'background' or an intensity value between 0 and 500")
# Probe filter
thr = threshold
thr = ncol(obj$R) * thr
filt = which(nacount <= thr)
cat(paste0("\tProbes before filtering: ", nrow(obj)), "\n")
obj = obj[filt, ]
if (!is.null(probedata))
probedata = probedata[filt, ]
cat(paste0("\tProbes after filtering: ", nrow(obj), "\n"))
}
## Normalize
if (is.null(normalize)) {
callData$Normalize = "None"
cat("No normalization done\n")
R <- obj$R
G <- obj$G
lr <- log2(R/G)
} else {
if (normalize == "Between") {
callData$Normalize = "Between"
cat("Between Array Normalization from Limma\n")
if (is.null(normmethod)) {
normmethod = "Aquantile"
} else if (!normmethod %in% c("none", "scale", "quantile", "Aquantile",
"Gquantile", "Rquantile", "Tquantile", "vsn")) {
cat("Normalization Method ", normmethod, " not a valid option for Between Array normalization. Continuing with Aquantile\n")
normmethod = "Aquantile"
}
callData$NormMethod = normmethod
MAobj <- normalizeBetweenArrays(obj, method = normmethod)
lr <- MAobj$M
R <- MAobj$A + lr/2
G <- R - lr
} else if (normalize == "Within") {
callData$Normalize = "Within"
cat("Within Normalization from Limma\n")
if (is.null(normmethod)) {
normmethod = "loess"
} else if (!normmethod %in% c("none", "median", "loess", "printtiploess",
"composite", "control", "robustspline")) {
cat("Normalization Method ", normmethod, " not a valid option for Within Array normalization. Continuing with loess.\n")
normmethod = "loess"
}
callData$NormMethod = normmethod
MAobj <- normalizeWithinArrays(obj, method = normmethod)
lr <- MAobj$M
R <- MAobj$A + lr/2
G <- R - lr
} else if (normalize == "Intensity" | normalize == "IntensityGreen") {
cat("Intensity Normalization on Green Channel\n")
callData$Normalize = "Intensity"
outList <- ranknormalization(ch1 = obj$G, ch2 = obj$R, plot = FALSE)
R <- outList$Pch2
G <- outList$Pch1
lr <- outList$logratio
} else if (normalize == "IntensityRed") {
cat("Intensity Normalization on Red Channel\n")
outList <- ranknormalization(ch1 = obj$R, ch2 = obj$G, plot = FALSE)
R <- outList$Pch1
G <- outList$Pch2
lr <- -outList$logratio
} else stop("Invalid normalization argument")
}
obj$R <- R
obj$G <- G
if (!is.null(probedata)) {
rownames(lr) <- rownames(obj$R) <- rownames(obj$G) <- probedata$ProbeID
rownames(probedata) <- probedata$ProbeID
fdata <- AnnotatedDataFrame(data = probedata)
} else {
probedata <- obj$genes
rownames(lr) <- rownames(obj$R) <- rownames(obj$G) <- rownames(probedata) <- obj$genes$ID
fdata <- AnnotatedDataFrame(data = probedata)
}
pdata <- AnnotatedDataFrame(data = phenodata)
to <- new("Rnits", rawTwoColor = obj, exprs = lr, phenoData = pdata, featureData = fdata,
callData = callData)
} else if ("AffyBatch" %in% class(obj)) {
calldata$Type = "AffyBatch"
nmOpt <- bgOpt <- TRUE
if (is.null(normalize)) {
cat("Setting normalization ON for Affy object. Disable by settingn normalize = FALSE\n")
nmOpt <- TRUE
}
if (is.null(background)) {
cat("Setting background correction ON for Affy object. Disable by setting background = FALSE\n")
bgOpt <- TRUE
}
rmaset <- rma(obj, verbose = FALSE, background = bgOpt, normalize = nmOpt)
to <- new("Rnits", rawAffy = obj, exprs = exprs(rmaset), featureData = featureData(rmaset),
phenoData = phenoData(rmaset), callData = callData)
} else if ("ExpressionSet" %in% class(obj)) {
callData$Type = "ExpressionSet"
phenodata = pData(obj)
featuredata = fData(obj)
if (!"Sample" %in% colnames(phenodata))
stop("Phenotype data table must have Sample column\n")
if (!"Time" %in% colnames(phenodata))
stop("Phenotype data table must have Time column\n")
dat <- exprs(obj)
if (!logscale && min(dat, na.rm = TRUE) < 0)
stop("Data is indicated to be NOT in log scale, but negative values observed")
if (!logscale)
dat <- log2(dat)
callData$Type = "ExpressionSet"
if (is.null(normmethod)) {
cat("No filtering or normalization done for input object\n")
exprs(obj) <- dat
} else {
if (!normmethod %in% c("quantile", "vsn", "Between"))
stop("Invalid normmethod argument for data frame. Must be vsn, quantile or Between")
if (normmethod == "quantile") {
cat("Quantile normalization\n")
dat <- normalizeQuantiles(dat)
}
if (normmethod == "vsn") {
cat("VSN normalization\n")
dat <- normalizeVSN(dat)
}
if (normmethod == "Between") {
cat("Between Array normalization\n")
dat <- normalizeBetweenArrays(data.matrix(dat))
}
exprs(obj) <- dat
}
if (!"GeneName" %in% colnames(featuredata)) {
if (!"Gene Symbol" %in% colnames(featuredata)) {
stop("GeneName or Gene Symbol column not found in Probe data.\n")
} else {
featuredata[["GeneName"]] <- as.character(featuredata[["Gene Symbol"]])
}
}
to <- new("Rnits", exprs = exprs(obj), featureData = as(featuredata, "AnnotatedDataFrame"),
phenoData = as(phenodata, "AnnotatedDataFrame"), callData = callData)
## Probe filter
if (!is.null(filter)) {
if (!filter %in% c(TRUE, FALSE))
stop("filter argument must be TRUE or FALSE")
if (filter) {
cat("Probe filter is TRUE. Applying probe threshold of ", 100 * threshold,
"% samples\n")
probefilt <- which(apply(exprs(to), 1, function(x) sum(is.na(x))) >
threshold * ncol(exprs(to)))
cat(length(probefilt), " probes have NA values in more than ", ceiling(threshold *
ncol(exprs(to))), "samples\n")
if (length(probefilt) > 0)
to <- to[-probefilt, ]
}
}
} else stop("Input obj must be either a data frame, matrix, RGList, ExpressionSet or AffyBatch.")
pdata <- pData(to)
## Replicate sample data
samp_time <- with(pdata, paste0(Sample, "_", Time))
if (max(table(samp_time)) > 1) {
#stop("Some series have replicate samples.")
#print("Some series have replicate samples.")
if(length(unique(table(samp_time))) > 1) stop("Uneven number of replicates")
if(!'Replicate' %in% colnames(pdata)) stop("Phenotype data must have 'Replicate' column for replicate data")
Nrep=unique(table(samp_time))
sort_order <- order(pdata$Sample, pdata$Replicate, as.numeric(pdata$Time))
rep=TRUE
} else{
Nrep=1
rep=FALSE
sort_order <- order(pdata$Sample, as.numeric(pdata$Time))
}
## Sort sample data
to <- to[, sort_order]
pdata <- pData(to)
utime <- unique(as.numeric(pdata$Time))
usamp <- sort(unique(pdata$Sample))
Nsets <- length(usamp) - 1
Nc <- length(utime)*Nrep
breaks = 1
for (i in 1:Nsets) breaks[i + 1] = breaks[i] + Nc
if ("control" %in% tolower(usamp)) {
control_samp <- which(tolower(pdata$Sample) %in% "control")
} else {
control_samp <- 1:Nc
}
callData$Nsets <- Nsets
callData$Nc <- Nc
callData$breaks <- breaks
callData$sample_names <- usamp
callData$time_vec <- sort(utime)
callData$control_samp <- control_samp
callData$Nrep <- Nrep
to$callData <- callData
cat("Dataset with ", nrow(to), " features and ", Nsets + 1, " samples in ", Nrep, "replicates\n")
if (center) {
cat("Centering data\n")
globalmean <- mean(colMeans(exprs(to), na.rm = TRUE))
exprs(to) <- globalmean + scale(exprs(to), center = TRUE, scale = FALSE)
}
if (plot)
boxplot(exprs(to), main = "Expression", xlab = "Samples", cex = 0.5, pch = 19)
return(to)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.