Nothing
# e: expressionSet object
# reference$var: variable that for which computations (averages, diff to control,...) will be computed and displayed displayed in graph
# reference$level: specific level of var from which differences are computed
# across: vector of variables that make combinations of grouping for computations
# var and across variables MUST be present in pData(ExpressionSetObject)
# within: reference$var and across could have same combinations in different subgroups of columns such as those that come from celllines - we first within data, make computations per within and merge back at the end ; this is a vector of withinting variables
## Note: if there are additional pheno data than the ones used in within,var,across their content could be lost in aggregated pData used to store statistics
## <DEBUG>
# data(e)
# reference=list(var='dose',level=10)
# within=c('cellline')
# across=NULL
# nReplicatesVar=3
## </DEBUG>
#' Summary statistics for gene expression
#'
#' Compute summary statistics per gene of expression data in a ExpressionSet object.
#' @param e An object of class ExpressionSe
#' @param reference A list with two items: var and level - See details
#' @param within Character vector - names of pData columns - See details
#' @param across Character vector - names of pData columns - See details
#' @param nReplicatesVar Integer - Minimum number of replicates to compute variance
#' @param ... \dots
#' @details
#' Summary statistics (mean, variances and difference to reference or control) will be
#' computed on the 'exprs' slot of the ExpressionSet object. The parameters of the
#' computation are specified by the parameters 'reference', 'within' and 'across'.\\
#' The design of the computations is such that the differences and pooled variances are calculated
#' against the sample(s) that was(were) chosen as reference. The reference is specified by
#' the level of a certain variable in the phenoData slot (e.g.: column 'control' and level 'WT'
#' of the phenoData slot or a boolean ('ref') variable with 0 or 1) -- the list object of 'var' and
#' 'level' together determine the reference group. \\
#'
#' All groups determined by combining the \code{reference$var} and \code{across} variables will be
#' compared to the reference group. Two different approaches to obtain necessary computations:
#' \itemize{
#' \item{}{Prepare a boolean variable that reflects only the reference group and specify all groupings
#' in the across arguments. E.g.: \code{reference=list(var = 'boolean', level = 1),
#' across = c('compound','dose')}}
#' \item{}{Add an extra column to the phenoData slot that contains all combinations, with a specific one
#' for the reference group: for example,
#' \code{pData(e)['refvar'] <- paste(pData(e)['compound'], pData(e)['dose'],sep='.')}
#' so as to use \code{reference = list(var = 'refvar', level ='comp1.dose1')} as argument for reference.}
#' } \\
#'
#' Sometimes computations need to be conducted within groups, and are thus nested. For example,
#' when comparing treament values of different cell lines, each will have gene expression values
#' for its own reference. The parameter 'within' allows to define such subgroups, for which
#' computations will be done separately and combined afterwards. Both parameters 'within' and
#' 'across' can be a vector of column names, whose unique combinations will be used for groupings.
#' @return Returns an object of class ExpressionSet with pData inherited from the submitted ExpressionSet object,
#' supplemented by the computed statistics in the 'exprs' slot and info thereof in the 'phenoData' slot.
#' @seealso \code{\link{plotLogRatio}}
#' @examples
#' if (require(ALL)){
#' data(ALL, package = "ALL")
#' ALL <- addGeneInfo(ALL)
#' ALL$BTtype <- as.factor(substr(ALL$BT,0,1))
#' ALL2 <- ALL[,ALL$BT != 'T1'] # omit subtype T1 as it only contains one sample
#' ALL2$BTtype <- as.factor(substr(ALL2$BT,0,1)) # create a vector with only T and B
#'
#' # Test for differential expression between B and T cells
#' tTestResult <- tTest(ALL, "BTtype", probe2gene = FALSE)
#' topGenes <- rownames(tTestResult)[1:20]
#'
#' # plot the log ratios versus subtype B of the top genes
#' LogRatioALL <- computeLogRatio(ALL2, reference=list(var='BT',level='B'))
#' a <- plotLogRatio(e=LogRatioALL[topGenes,],openFile=FALSE, tooltipvalues=FALSE, device='pdf',
#' colorsColumnsBy=c('BTtype'), main = 'Top 20 genes most differentially between T- and B-cells',
#' orderBy = list(rows = "hclust"),
#' probe2gene = TRUE)
#' }
#' @author Eric Lecoutre
#' @keywords manip data dplot
#' @importFrom Biobase sampleNames pData exprs AnnotatedDataFrame featureData experimentData annotation
#' @export
computeLogRatio <- function(e,reference,
within=NULL,across=NULL,nReplicatesVar=3,...){
stopifnot(is(e, "ExpressionSet"))
stopifnot(colnames(e)==sampleNames(e))
if (!all(c(reference$var,reference$across) %in% colnames(pData(e))))
{
stop(paste('\n reference var or any variable in across not present in phenotype data',' (',
paste(c(reference$var, across), collapse=' '),')\n',sep=''))
}
if (!(reference$level %in% unique(pData(e)[,reference$var])))
{
stop(paste('\n reference level is not define for reference$var'))
}
# if(is.null(within)) {
# pData(e)<-cbind(pData(e),'.tmpwithin'=1)
#
# }
pData <- pData(e)
pData <- pData[do.call('order',as.list(pData[,c(within,reference$var,across),drop=FALSE])),]
# here we use names with . just in case phenoData already contains such a variable
# adding reference level of var in pData to use it in further computations (make the code more readable)
pData[, ".reference"] <- pData[,reference$var]==reference$level
# careful if NA is one of the possible levels for var... then our variable reference would have some NA
if (is.na(reference$level)) pData$.reference[is.na(pData$.reference)] <- TRUE else
pData$.reference[is.na(pData$.reference)] <- FALSE
# adding combinations of within reference$var & across for further combinations
pData$.groups <- do.call('paste',
c(as.list(pData[,c(reference$var,across),drop=FALSE]),sep='.'))
# adding replicates with the selected parameters
pData$.replicates <- replicates(do.call('paste',
c(as.list(pData[,c(within,reference$var,across),drop=FALSE],sep='.'))))
# preparing withinted data, (could be more than one within variables)
if (is.null(within)){
bywithin <- rep(1, nrow(pData))
} else {
bywithin <- do.call('paste',
c(as.list(pData[, within, drop=FALSE]), sep = "."))
pData$.within <- bywithin
}
pData$.withingroups <- if (is.null(within)){
paste(pData$.groups, sep = ".")
} else {
paste(bywithin, pData$.groups, sep = ".")
}
pDatawithin <- split(pData, bywithin)
# check there is at least one reference in each within groups
check <- sapply(pDatawithin, function(block) any(block$.reference))
if (any(!check)) stop('Error in computation: within variables define groups for which there are not any reference to make compare with.')
# check if defined reference level and across are compatible: level must define
# groups as little as possible - in other words, we can't have two groups 'across'
# that are reference level for a certain within
check <- any(
sapply(split(pData,pData$.within),
function(biggroup) length(unique(biggroup[biggroup$.reference,'.groups']))>1))
if (check) stop('Incompatibility between reference level defined by var and across variables. \nReference level must constitute a group as little as possible.\nIt appears here one across variable splits the reference level into several subgroups.\nPlease modify your call.')
exprs.new <- c()
pData.new <- c()
for (iwithin in seq(along = pDatawithin)){
# cat('\n***DEBUG - iwithin: ',iwithin)
#E iwithin <- 1 # iwithin <- 2
iwithinname <- names(pDatawithin)[iwithin]
ipData <- pDatawithin[[iwithin]]
ipData <- ipData[order(ipData$.withingroups),] # necessary so that further tapply do have same order than within (alphabetic order)
# prepare pData that will be used for computed aggregated statistics
ireference <- tapply(ipData$.reference,ipData$.groups,function(ref)sum(ref)>0)
ireplicates <- tapply(ipData$.replicates,ipData$.groups,max)
statpData <- ipData[!duplicated(ipData[,c(within,reference$var,across),drop=FALSE]),,drop=FALSE]
# remove columns that were added before as they will be replaced
statpData <- statpData[,-which(colnames(statpData)=='.reference')]
statpData <- statpData[,-which(colnames(statpData)=='.replicates')]
if ('.oldcolnames' %in% colnames(statpData)){
statpData <- statpData[,-which(colnames(statpData)=='.oldcolnames')]
}
statpData <- cbind(statpData,.reference=ireference,.replicates=ireplicates)
# withinting data in groups of across (within within) to compute statistics per groups in a row
igroups <- split(ipData,ipData$.groups)
# prepare root names that will be used to store statistics
irootnames <- do.call('paste',
c(as.list(statpData[,c(within,reference$var,across),drop=FALSE]),sep='.'))
# irootnames <- do.call('paste',
# c(as.list(statpData[,c(within,across),drop=FALSE]),sep='.'))
# here begin the computations
# each time we also prepare adequate pData
## computing averages
exprs.mean <- do.call('cbind',
lapply(igroups,function(group)
apply(as.matrix(exprs(e)[, rownames(group)]),1,mean)))
colnames(exprs.mean) <- paste(irootnames, 'mean', sep='.')
pData.mean <- cbind(statpData, statistic='mean')
rownames(pData.mean) <- colnames(exprs.mean)
## computing variances
# we have to ensure sufficient replicates!
which.enough.replicates <- which(statpData$.replicate>=nReplicatesVar)
tmp.pData <- statpData[which.enough.replicates,,drop=FALSE]
if (nrow(tmp.pData)>0){
exprs.var <- do.call('cbind',
lapply(igroups[which.enough.replicates],function(group)
apply(exprs(e)[,rownames(group)],1,stats::var)))
colnames(exprs.var) <- paste(irootnames[which.enough.replicates],'var',sep='.')
pData.var <- cbind(tmp.pData,statistic='var')
rownames(pData.var) <- colnames(exprs.var)
} else {
exprs.var <- c()
pData.var <- c()
}
## computing diff to ref
reference.which <- which(pData.mean$.reference)
exprs.diffref <- exprs.mean[,-reference.which,drop=FALSE]- exprs.mean[,reference.which]
colnames(exprs.diffref) <- paste(irootnames[-reference.which],'diffref',sep='.')
pData.diffref <- cbind(statpData[!statpData$.reference,,drop=FALSE],statistic='diffref')
rownames(pData.diffref) <- colnames(exprs.diffref)
## computing pooled standard deviations -- we have to check for replicates
# first ensure there are enough replicates for reference
### <TODO> add sign for pooledSD
ref.replicates <- statpData[reference.which,'.replicates']
if (ref.replicates<nReplicatesVar){
warnings(paste('only ',ref.replicates,' replicates in the group ',iwithinname, ' to compute pooled standard errors( ',nReplicatesVar,'required)',sep=''))
exprs.pooledSD <- c()
pData.pooledSD <- c()
exprs.spooledSD <- c()
pData.spooledSD <- c()
} else {
# then select only groups with enough replicates
tmp.pData <- pData.var[pData.var$.replicates>=nReplicatesVar & !pData.var$.reference,,drop=FALSE]
if (nrow(tmp.pData)>0){
# ok, here there are some groups with enough replicates to compute pooled sd
# formula: # pooled variance =
# sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
var1 <- exprs.var[,rownames(pData.var[pData.var$.reference,])]
n1 <- pData.var[pData.var$.reference,'.replicates']
exprs.pooledSD <- sapply(rownames(tmp.pData),
function(varvar){
n2 <- tmp.pData[varvar,'.replicates']
var2 <- exprs.var[,varvar]
out <- sqrt(((n1-1)*var1 + (n2-1)*var2)/(n1+n2-2))
return(out)
})
# then look at corresponding diff to reference and prepare signed SD for plot
corresponding.diffref <- paste(substr(rownames(tmp.pData),1,nchar(rownames(tmp.pData))-4),'diffref',sep='.')
exprs.spooledSD <- sapply(seq(along=rownames(tmp.pData)),
function(ivarvar){
varvar <- rownames(tmp.pData)[ivarvar]
out <- exprs.pooledSD[,varvar]
# diffname <- paste(tmp.pData[varvar,'.withingroups'],'diffref',sep='.')
# diffname <- paste(irootnames[ivarvar],'diffref',sep='.')
diffname <- corresponding.diffref[ivarvar]
diffref <- exprs.diffref[,diffname]
out[diffref<0] <- -out[diffref<0]
return(out)
})
# prepare correct pData
cnames <- do.call('paste',
c(as.list(tmp.pData[,c(within,reference$var,across),drop=FALSE]),'pooledSD',sep='.'))
colnames(exprs.pooledSD) <- cnames
pData.pooledSD <- tmp.pData
pData.pooledSD$statistic <- 'pooledSD'
rownames(pData.pooledSD) <- colnames(exprs.pooledSD)
cnames <- do.call('paste',
c(as.list(tmp.pData[,c(within,reference$var,across),drop=FALSE]),'spooledSD',sep='.'))
colnames(exprs.spooledSD) <- cnames
pData.spooledSD <- tmp.pData
pData.spooledSD$statistic <- 'signedpooledSD'
rownames(pData.spooledSD) <- colnames(exprs.spooledSD)
}
else {
exprs.pooledSD <- c()
pData.pooledSD <- c()
exprs.spooledSD <- c()
pData.spooledSD <- c()
}
}
# we have done computations for this within / merge them to output object
exprs.new <- cbind(exprs.new,exprs.mean,exprs.var,exprs.diffref,exprs.pooledSD,exprs.spooledSD)
pData.new <- rbind(pData.new,pData.mean,pData.var,pData.diffref,pData.pooledSD,pData.spooledSD)
}
phenoData <- AnnotatedDataFrame(data=pData.new)
out <- new('ExpressionSetWithComputation',
exprs=as.matrix(exprs.new),
phenoData=phenoData,
featureData=featureData(e),
experimentData=experimentData(e),
annotation=annotation(e)) # ,
# protocolData = new('AnnotatedDataFrame', data = data.frame()))
invisible(out)
}
# compute(e,reference=reference,within=within)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.