#' Load and preprocess microarray data
#' Loads, preprocesses and annotates microarray data to be further used by
#' downstream functions in the \pkg{\link{piano}} package.
#' This function requires at least two inputs: (1) data, either CEL files in
#' the directory specified by \code{datadir} or normalized data specified by
#' \code{dataNorm}, and (2) experimental setup specified by \code{setup}.
#' The setup should be either a tab-delimited text file with column headers or a
#' \code{data.frame}. The first column should contain the names of the CEL
#' files or the column names used for the normalized data, please be sure to
#' use names valid as column names, e.g. avoid names starting with numbers.
#' Additional columns should assign attributes in some category to each array.
#' (For an example run the example below and look at the object
#' \code{myArrayData$setup}.)
#' The \pkg{piano} package is customized for yeast 2.0 arrays and annotation
#' will work automatically, if the cdfName of the arrays equals \emph{Yeast_2}.
#' If using normalized yeast 2.0 data as input, the user needs to set the
#' argument \code{platform="yeast2"} to tell the function to use yeast
#' annotation. If a platform other than yeast 2.0 is used, set
#' \code{platform="NULL"} (default) and supply appropriate annotation by the
#' argument \code{annotation}. Note that the cdfName will override
#' \code{platform}, so it can still be set to \code{"NULL"} for yeast 2.0 CEL
#' files. Note also that \code{annotation} overrides \code{platform}, so if the
#' user wants to use an alternative annotation for yeast, this can be done
#' simply by specifying this in \code{annotation}.
#' The annotation should have the column headers \emph{Gene name},
#' \emph{Chromosome} and \emph{Chromosome location}. The \emph{Gene name} is
#' used in the heatmap in \code{diffExp} and the \emph{Chromosome} and
#' \emph{Chromosome location} are used by the \code{polarPlot}. The rownames (or
#' first column if using a text file) should contain the \emph{probe IDs}. If
#' using a text file the first column should have the header \emph{probeID} or
#' similar. The filtering step discards all probes not listed in the
#' annotation.
#' Normalization is performed on all CEL file data using one of the Affymetrix
#' methods: PLIER (\code{"plier"}) as implemented by
#' \code{\link[plier:justPlier]{justPlier}}, RMA (Robust Multi-Array Average)
#' (\code{"rma"}) expression measure as implemented by
#' \code{\link[affy:rma]{rma}} or MAS 5.0 expression measure \code{"mas5"} as
#' implemented by \code{\link[affy:mas5]{mas5}}.
#' It is possible to pass additional arguments to
#' \code{\link[affy:read.affybatch]{ReadAffy}}, e.g. \code{cdfname} as this
#' might be required for some types of CEL files.
#' @param datadir character string giving the directory in which to look for
#' the data. Defaults to \code{getwd()}.
#' @param setup character string giving the name of the file containing the
#' experimental setup, or an object of class \code{data.frame} or similar
#' containing the experimental setup. Defaults to \code{"setup.txt"}, see
#' details below for more information.
#' @param dataNorm character string giving the name of the normalized data, or
#' an object of class \code{data.frame} or similar containing the normalized
#' data. Only to be used if the user wishes to start with normalized data
#' rather than CEL files.
#' @param platform character string giving the name of the platform, can be
#' either \code{"yeast2"} or \code{"NULL"} (the default). See details below for more
#' information.
#' @param annotation character string giving the name of the annotation file,
#' or an object of class \code{data.frame} or similar containing the annotation
#' information. The annotation should consist of the columns \emph{Gene name},
#' \emph{Chromosome} and \emph{Chromosome location}. Not required if
#' \code{platform="yeast2"}.
#' @param normalization character string giving the normalization method, can
#' be either \code{"plier"}, \code{"rma"} or \code{"mas5"}. Defaults to
#' \code{"plier"}.
#' @param filter should the data be filtered? If \code{TRUE} then probes not
#' present in the annotation will be discarded. Defaults to \code{TRUE}.
#' @param verbose verbose? Defaults to \code{TRUE}.
#' @param \dots additional arguments to be passed to \code{ReadAffy}.
#' @return An \code{ArrayData} object (which is essentially a \code{list}) with
#' the following elements:
#' \item{dataRaw}{raw data as an AffyBatch object}
#' \item{dataNorm}{\code{data.frame} containing normalized expression values}
#' \item{setup}{\code{data.frame} containing experimental setup}
#' \item{annotation}{\code{data.frame} containing annotation}
#' Depending on input arguments the \code{ArrayData} object may not include
#' \code{dataRaw} and/or \code{annotation}.
#' @author Leif Varemo \email{piano.rpkg@@gmail.com} and Intawat Nookaew
#' \email{piano.rpkg@@gmail.com}
#' @seealso \pkg{\link{piano}}, \code{\link{runQC}}, \code{\link{diffExp}},
#' \code{\link[affy:read.affybatch]{ReadAffy}},
#' \code{\link[affy:expresso]{expresso}},
#' \code{\link[plier:justPlier]{justPlier}}, \code{\link[yeast2.db:yeast2BASE]{yeast2.db}}
#' @references Gautier, L., Cope, L., Bolstad, B. M., and Irizarry, R. A. affy
#' - analysis of Affymetrix GeneChip data at the probe level.
#' \emph{Bioinformatics.} \bold{20}, 3, 307-315 (2004).
#' @examples
#' # Get path to example data and setup files:
#' dataPath <- system.file("extdata", package="piano")
#' # Load normalized data:
#' myArrayData <- loadMAdata(datadir=dataPath, dataNorm="norm_data.txt.gz", platform="yeast2")
#' # Print to look at details:
#' myArrayData
loadMAdata <- function(datadir=getwd(), setup="setup.txt", dataNorm,
platform="NULL", annotation, normalization="plier",
filter=TRUE, verbose=TRUE, ...) {
#if(!try(require(affy))) stop("package affy is missing") # old, line below is preferred:
if (!requireNamespace("affy", quietly = TRUE)) stop("package affy is missing")
#if(!try(require(plier))) stop("package plier is missing") # old, line below is preferred:
if (!requireNamespace("plier", quietly = TRUE)) stop("package plier is missing")
# Argument check:
if(!normalization %in% c("plier","rma","mas5")) {
stop("incorrect value of argument normalization")
if(!platform %in% c("NULL","yeast2")) {
stop("incorrect value of argument platform")
# Verbose function:
.verb <- function(mes, verbose) {
if(verbose == TRUE) {
# Load the data:
nCelFiles <- length(dir(path=datadir, pattern = ".*cel", all.files=FALSE,
full.names=FALSE, ignore.case = TRUE, recursive=FALSE))
if(nCelFiles > 0 & missing(dataNorm)) {
# Load CEL-files
.verb("Loading CEL files...", verbose)
dataRaw <- affy::ReadAffy(celfile.path=datadir, ...)
colnames(exprs(dataRaw)) <- gsub("\\.CEL","",colnames(exprs(dataRaw)), ignore.case=TRUE)
colnames(exprs(dataRaw)) <- gsub("\\.gz","",colnames(exprs(dataRaw)), ignore.case=TRUE)
if(sum(duplicated(colnames(exprs(dataRaw)))) > 0) stop("found samples with identical names")
.verb("...done", verbose)
} else if(!missing(dataNorm)) {
if(is(dataNorm, "character")) {
# If no CEL-files, or if selected, load txt-file
dataFilePath <- paste(datadir, "/", dataNorm, sep="")
if(!file.exists(dataFilePath)) {
stop("could not find the data file")
.verb("Loading data in text file...", verbose)
dataNorm <- as.data.frame(read.delim(dataFilePath, header=TRUE, sep="\t",
row.names=1, as.is=TRUE,quote=""),stringsAsFactors=FALSE)
colnames(dataNorm) = gsub("\\.CEL","",colnames(dataNorm), ignore.case=TRUE)
colnames(dataNorm) = gsub("\\.gz","",colnames(dataNorm), ignore.case=TRUE)
if(sum(duplicated(colnames(dataNorm))) > 0) stop("found samples with identical names")
.verb("...done", verbose)
} else {
dataNorm <- as.data.frame(dataNorm,stringsAsFactors=FALSE)
} else {
stop("could not find any data files in directory")
# This (above) creates object 'dataRaw' or 'dataNorm' depending on the input
# (cel or txt).
# Load the setup:
if(is(setup, "character")) {
setupFilePath <- paste(datadir, "/", setup, sep="")
if(!file.exists(setupFilePath)) {
stop("could not find the setup file")
.verb("Loading setup file...", verbose)
setup <- as.data.frame(read.delim(setupFilePath, header=TRUE, sep="\t",
row.names=1, as.is=TRUE,quote=""),stringsAsFactors=FALSE)
rownames(setup) = gsub("\\.CEL","",rownames(setup), ignore.case=TRUE)
rownames(setup) = gsub("\\.gz","",rownames(setup), ignore.case=TRUE)
.verb("...done", verbose)
} else {
setup <- as.data.frame(setup, stringsAsFactors=FALSE)
# Normalize the raw data:
if(exists("dataRaw", inherits=FALSE)) {
# iterplier qubic spline
if(normalization == "plier") {
.verb("Preprocessing using PLIER with cubic spline normalization...", verbose)
dataNorm <- affy::normalize.AffyBatch.qspline(dataRaw, type="pmonly", verbose=FALSE)
tmp <- suppressWarnings(tmp <- capture.output(dataNorm <- plier::justPlier(dataNorm,normalize=FALSE,
usemm=FALSE, concpenalty=0.08,
dataNorm <- as.data.frame(exprs(dataNorm),stringsAsFactors=FALSE)
colnames(dataNorm) <- gsub("\\.CEL","",colnames(dataNorm), ignore.case=TRUE)
colnames(dataNorm) <- gsub("\\.gz","",colnames(dataNorm), ignore.case=TRUE)
.verb("...done", verbose)
} else if(normalization == "rma") {
.verb("Preprocessing using RMA with quantile normalization...", verbose)
dataNorm <- affy::rma(dataRaw,verbose=FALSE)
dataNorm <- as.data.frame(exprs(dataNorm),stringsAsFactors=FALSE)
colnames(dataNorm) <- gsub("\\.CEL","",colnames(dataNorm), ignore.case=TRUE)
colnames(dataNorm) <- gsub("\\.gz","",colnames(dataNorm), ignore.case=TRUE)
.verb("...done", verbose)
} else if(normalization == "mas5") {
.verb("Preprocessing using MAS 5.0 with quantile normalization...", verbose)
dataNorm <- affy::mas5(dataRaw,verbose=FALSE)
dataNorm <- as.data.frame(log2(exprs(dataNorm)),stringsAsFactors=FALSE)
colnames(dataNorm) <- gsub("\\.CEL","",colnames(dataNorm), ignore.case=TRUE)
colnames(dataNorm) <- gsub("\\.gz","",colnames(dataNorm), ignore.case=TRUE)
.verb("...done", verbose)
} else {
.verb("Text file data: No normalization performed.", verbose)
# Check annotation
if(exists("dataRaw", inherits=FALSE)) {
if(affy::cdfName(dataRaw) == "Yeast_2") {
platform <- "yeast2"
if(platform == "yeast2" & missing(annotation)) {
annotationInfo <- "yeast2"
} else if(!missing(annotation)) {
annotationInfo <- "asArgument"
} else {
annotationInfo <- "none"
if(annotationInfo != "none") {
if(annotationInfo == "yeast2") {
#if(!try(require(yeast2.db))) stop("package yeast2.db is needed for annotationInfo='yeast2'") # old, line below is preferred:
if (!requireNamespace("yeast2.db", quietly = TRUE)) stop("package yeast2.db is missing")
if (!requireNamespace("AnnotationDbi", quietly = TRUE)) stop("package AnnotationDbi is missing")
# Annotate the probes using the yeast2.db package:
.verb("Creating annotation...", verbose)
# Gene name
geneName <- yeast2.db::yeast2ORF
geneName <- AnnotationDbi::toTable(geneName)
# Chromosome location
chromosome <- yeast2.db::yeast2CHRLOC
chromosome <- AnnotationDbi::toTable(chromosome)
chromosome <- chromosome[,c(1,3,2)]
# Probe id:s (corresponding to those in dataNorm)
probeID <- as.data.frame(rownames(dataNorm),stringsAsFactors=FALSE)
colnames(probeID) <- "probeID"
# Annotation data frame
annot <- merge(probeID,geneName,by.x="probeID",by.y="probe_id",all.x=TRUE)
annot <- merge(annot,chromosome,by.x="probeID",by.y="probe_id",all.x=TRUE)
rownames(annot) <- annot$probeID
annot <- annot[2:ncol(annot)]
colnames(annot) <- c("geneName","chromosome","start") # <- remove sys.name?
.verb("...done", verbose)
} else if(annotationInfo == "asArgument") {
# Else annotate from annotation-argument:
if(is(annotation, "character")) {
.verb("Creating annotation...", verbose)
annotFilePath <- paste(datadir, "/", annotation, sep="")
if(!file.exists(annotFilePath)) {
stop("could not find the annotation file")
annot <- as.data.frame(read.delim(annotFilePath, header=TRUE, sep="\t",
row.names=1, as.is=TRUE,quote=""),stringsAsFactors=FALSE)
if(ncol(annot) != 3) {
stop("provided annotation has to contain 3 columns")
colnames(annot) <- c("geneName","chromosome","start")
.verb("...done", verbose)
} else {
annot <- annotation
annot[,1] <- as.character(annotation[,1])
annot[,2] <- as.character(annotation[,2])
# Check for NAs:
suppressWarnings(tmp <- as.numeric(as.character(annot[,3])))
if(!all(!is.na(tmp))) stop("the chromosome location in annotation has to be numerical")
annot[,3] <- tmp
# Remove mappings not in data:
annot <- annot[rownames(annot)%in%rownames(dataNorm),]
# Check for duplicates:
if(length(rownames(annot))!=length(unique(rownames(annot)))) {
stop("the annotation contains Gene name duplicates")
} else {
warning("no annotation created, may cause limitation in downstream functions")
if(filter == TRUE & exists("annot", inherits=FALSE)) {
# Remove unmapped probes:
.verb("Removing unmapped probes...", verbose)
mappedProbes <- rownames(annot)[!is.na(annot[,1])]
probes <- rownames(dataNorm)
dataNorm <- dataNorm[probes %in% mappedProbes,]
# Remove mappings not in data:
annot <- annot[rownames(annot) %in% rownames(dataNorm),]
.verb("..done", verbose)
} else if(filter == TRUE & !exists("annot", inherits=FALSE)) {
warning("annotation required for filtering, filtering step is omitted")
# Check sample name consistency:
tmp1 <- length(rownames(setup))
tmp2 <- length(colnames(dataNorm))
tmp3 <- length(c(1:tmp1)[rownames(setup) %in% colnames(dataNorm)])
tmp4 <- length(c(1:tmp2)[colnames(dataNorm) %in% rownames(setup)])
if(tmp1 != tmp2 | tmp3 != tmp4) {
stop("inconsistant sample names in dataNorm and setup")
# Construct ArrayData object as return:
if(exists("dataRaw", inherits=FALSE) & exists("annot", inherits=FALSE)) {
arrayData <- list(dataRaw=dataRaw, dataNorm=dataNorm, setup=setup, annotation=annot)
} else if(exists("annot", inherits=FALSE)){
arrayData <- list(dataNorm=dataNorm, setup=setup, annotation=annot)
} else if(exists("dataRaw", inherits=FALSE)) {
arrayData <- list(dataRaw=dataRaw, dataNorm=dataNorm, setup=setup)
} else {
arrayData <- list(dataNorm=dataNorm, setup=setup)
class(arrayData) <- "ArrayData"
