#' peptideSet constructor
#'
#' This function reads GenePix results (.gpr) files and creates a peptideSet object
#' gathering experiment information.
#'
#' @param files A \code{character} vector. If NULL, all files with a .gpr extension
#' in the specified path will be read.
#' @param path A \code{character} string. The directory where the .gpr files to
#' read are located.
#' @param mapping.file A \code{character} string or \code{data.frame}. A mapping file
#' that gives information for each sample. See details section below for a list of
#' required fields.
#' @param use.flags A \code{logical}. Should spots with flag value -99 or lower
#' be excluded?
#' @param rm.control.list A \code{character} vector. The name of the controls to
#' be excluded from the peptideSet.
#' @param empty.control.list A \code{character} vector. The name of the empty
#' controls. If non NULL, the intensity of these empty spots will be substracted
#' from remaining intensities.
#' @param bgCorrect.method A \code{character} string. The name of the method used
#' for background correction. This is passed to limma's backgroundCorrect method.
#' See details section below and ?backgroundCorrect for more information.
#' @param log A \code{logical}. If TRUE, intensities will be log2 transformed after
#' BG correction.
#' @param check.row.order A \code{logical}. Should slides be reduced to a common
#' set of peptides?
#' @param verbose A \code{logical}. Displays progress and additional information.
#'
#' @details
#' GenePix results files (.gpr) are read when found in either the files or path
#' arguments. By default, the foreground and background median intensities of the
#' "red" channels, "F635 Median" and "B635 Median", are read. The background
#' correction specified in bgCorrect.method is passed to the backgroundCorrect
#' method in the limma package.
#'
#' The mapping.file can be either a filename or a data.frame. In any case, it should
#' contain at least three columns labeled "filename", "ptid" and "visit". The
#' filenames given here should match those read from the path or files argument,
#' or be a subset of it. For each ptid (patient ID), the visit column should have at
#' least one "pre" and one "post" sample. Any additional column will be kept into
#' the resulting \code{peptideSet} and can be used later on for groupping.
#'
#' If check.row.order = TRUE, the final set of probes is taken to be those with
#' IDs found in all arrays that were read.
#'
#' Row, Column and Block spatial array position for each probe are stored in the
#' \code{featureRanges} slot of the returned object.
#'
#'
#' @return A \code{peptideSet} object that contain the intensities, peptide
#' sequences and annotations available in the mapping file.
#'
#' @examples
#' # Read gpr files
#' library(pepDat)
#' mapFile <- system.file("extdata/mapping.csv", package = "pepDat")
#' dirToParse <- system.file("extdata/gpr_samples", package = "pepDat")
#' pSet <- makePeptideSet(files = NULL, path = dirToParse,
#' mapping.file = mapFile, log=TRUE)
#'
#' # Specify controls to be removed and empty controls
#' # to be used for background correction.
#' pSet <- makePeptideSet(files = NULL, path = dirToParse,
#' mapping.file = mapFile, log = TRUE,
#' rm.control.list = c("JPT-control", "Ig", "Cy3"),
#' empty.control.list= c("empty", "blank control"))
#'
#' @seealso \code{\link{peptideSet}}, \code{\link{read.maimages}},
#' \code{\link{backgroundCorrect}}
#'
#' @author Raphael Gottardo, Gregory Imholte
#'
#' @rdname makePeptideSet
#' @export
#' @importFrom tools file_path_sans_ext file_ext
#' @importFrom limma read.maimages backgroundCorrect
#' @importFrom Biobase preproc preproc<- assayData phenoData experimentData
#' protocolData sampleNames sampleNames<- pData pData<- exprs exprs<- rowMedians
#' @importFrom IRanges IRanges space
#' @importFrom GenomicRanges GRanges
#'
makePeptideSet<-function(files=NULL, path=NULL, mapping.file=NULL, use.flags=FALSE,
rm.control.list=NULL, empty.control.list=NULL,
bgCorrect.method="normexp", log=TRUE, check.row.order=FALSE,
verbose=FALSE){
# There is some ambiguity with respect to what is Name and ID
# ID -> peptide
# Name -> annotation
f <- function(x) as.numeric(x$Flags > -99)
# before reading in files, check whether mapping.file is accessible,
# to save user time in case they made a mistake.
if (!is.null(mapping.file)){
mapping.file<-.sanitize_mapping_file2(mapping.file)
files <- file_path_sans_ext(mapping.file$filename)
}
if (!check.row.order) { # Assume that the design is the same and don't check rows, order, etc.
RG <- read.maimages(files=files,
source="genepix", path=path, ext="gpr",
columns=list(R="F635 Median",Rb="B635 Median"),
wt.fun=f,verbose=verbose)
} else { # Used if the arrays don't exactly contain the same feature (e.g. the design has changed)
files <- grep("gpr",list.files(path),value=TRUE)
RG.list <- lapply(files, read.maimages, source="genepix",
path=path, columns=list(R="F635 Median",Rb="B635 Median"),
wt.fun=f, verbose=verbose)
if(verbose){
cat("Reordering all peptides\n")
}
RG <- RG.list[[1]]
# Find the common target
target.id <- Reduce(intersect,lapply(RG.list,function(x){x$genes$ID}))
if(length(target.id)==0){
stop("No common features found across slides")
}
# subset all
RG.list <- lapply(RG.list,function(x, target){
ind <- x$genes$ID%in%target;
x$genes <- x$genes[ind,];
x$Eb <- x$Eb[ind];
x$E <- x$E[ind];
x
}, target.id)
# order all
RG.list <- lapply(RG.list, function(x, target){
ind <- order(x$genes$ID);
x$genes <- x$genes[ind,];
x$Eb <- x$Eb[ind];
x$E <- x$E[ind];
x
})
RG$E <- do.call(cbind,lapply(RG.list,function(x){x$E}))
RG$Eb <- do.call(cbind,lapply(RG.list,function(x){x$Eb}))
RG$targets <- do.call(rbind,lapply(RG.list,function(x){x$targets}))
# Make sure the sample names are consitent across objects
colnames(RG$E) <- rownames(RG$targets)
colnames(RG$Eb) <- rownames(RG$targets)
RG$genes <- RG.list[[1]]$genes
}
offset <- 0.5
if (bgCorrect.method=="half") offset <- .5 else offset <- 1
RG <- try(backgroundCorrect(RG, method=bgCorrect.method, offset=offset, verbose=verbose))
myDesc <- new("MIAME")
## Put NA instead of flags
if (use.flags)
{
RG$E[RG$weights==0] <- NA
}
# Keep track of printer and source
preproc(myDesc)$source<-RG$source
preproc(myDesc)$printer<-RG$printer
## Fill in the details of the preprocessing
## Put NA instead of flags
if (use.flags)
{
RG$E[RG$weights==0] <- NA
}
if (log) {
RG$E <- log2(RG$E)
preproc(myDesc) <- c(preproc(myDesc),list(transformation="log", normalization="none"))
} else {
preproc(myDesc) <- c(preproc(myDesc),list(transformation="none", normalization="none"))
}
if(!is.null(empty.control.list)){
norm.empty <- TRUE
} else{
norm.empty <- FALSE
}
preproc(myDesc)$bgCorrect.method <- bgCorrect.method
preproc(myDesc)$norm.empty <- norm.empty
if (norm.empty) {
#Check both name and ID for the control list
index <- RG$genes$Name%in%empty.control.list | RG$genes$ID%in%empty.control.list
if(sum(index)==0){
warning("No empty controls matching the given list were found.")
mean.empty <- rep(0, ncol(as.matrix(RG$E)))
} else{
if (verbose) {
cat("** Background corrected using ", sum(index), " empty splots **\n")
}
mean.empty <- matrix(colMeans(as.matrix(RG$E[index,])),
nrow=nrow(as.matrix(RG$E)),
ncol=ncol(as.matrix(RG$E)),
byrow=TRUE)
}
} else {
mean.empty <- rep(0, ncol(as.matrix(RG$E)))
}
# Keep the layout
layout <- lapply(RG$genes[, c("Block","Row","Column")],as.factor)
# Remove controls
ind.keep <- rep(TRUE,nrow(RG$E))
if (!is.null(rm.control.list)) {
ind.keep <- lapply(rm.control.list,
function(x, Name, ID){
!grepl(x,Name) & !grepl(x,ID)},
as.character(RG$genes$Name),
as.character(RG$genes$ID))
ind.keep <- do.call(cbind,ind.keep)
ind.keep <- apply(ind.keep,1,all)
}
## See note above about Name and ID
# ID -> peptide
# Name -> annotation
featureSequence <- as.character(RG$genes$ID)[ind.keep]
featureID <- as.character(RG$genes$Name)[ind.keep]
nPep <- length(which(ind.keep))
# Positions are set to zero before the information is provided in summarize pSet
pSet <- new('peptideSet',
featureRange = GRanges(seqnames = " ", strand = "*",
ranges = IRanges(rep(0,nPep),rep(0,nPep)), featureID,
peptide = featureSequence, block = layout$Block[ind.keep],
row = layout$Row[ind.keep], column = layout$Column[ind.keep]),
exprs = as.matrix(RG$E-mean.empty)[ind.keep, ],
experimentData = myDesc)
# Make sure everything is stored as lower
sampleNames(pSet)<-tolower(sampleNames(pSet))
if(!is.null(mapping.file)){
pData(pSet) <- mapping.file[match(sampleNames(pSet), rownames(mapping.file)), ]
}
return(pSet)
}
.sanitize_mapping_file2 <- function(mapping.file){
if(is.character(mapping.file)){
map <- read.csv(mapping.file, header=TRUE)
} else if(is.data.frame(mapping.file)){
map <- mapping.file
} else {
stop("The mapping file should be a character vector or a data.frame")
}
colnames(map) <- tolower(colnames(map))
if(!all(c("filename", "ptid", "visit") %in% colnames(map))){
stop("The mapping file should include at least the 3 mandatory columns: 'filename', 'ptid' and 'visit'")
}
row.names(map) <- file_path_sans_ext(tolower(map$filename))
return(map)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.