Nothing
# Copyright (C) 2013 Klambauer Guenter and Thomas Unterthiner
# <klambauer@bioinf.jku.at>
#' @title Detection of Differential Expression in a semi-supervised Setting
#' @description Performs the DEXSS algorithm for detection of differentially
#' expressed genes in RNA-seq data for a semi-supervised setting, i.e. that
#' the condition of some samples is known, and for some samples the condition
#' is unkown.
#'
#' @details The read count \eqn{x} is explained by
#' a finite mixture of negative binomials:
#'
#' \deqn{
#' p(x) = \sum_{i=1} ^n \alpha_i\ \mathrm{NB}(x;\ \mu_i, r_i),
#' }
#'
#' where \eqn{\alpha_i} is the weight of the mixture component, \eqn{\mathrm{NB}}
#' is the negative binomial with mean parameter \eqn{\mu_i} and size parameter
#' \eqn{r_i}. The parameters are selected by an EM algorithm in a Baysian
#' framework.
#'
#' Each component in the mixture model corresponds to one condition.
#'
#' \itemize{
#' \item If the groups, conditions, replicate status, or labels are unknown, DEXUS
#' tries to estimate these conditions. For each transcript DEXUS tries to
#' explain the read counts by one negative binomial distribution. If this is
#' possible, the transcript is explained by one condition and therefore it
#' is not differentially expressed. If more than one negative binomial
#' distribution is needed to explain the read counts of a transcript, this
#' transcript indicates that it is differentially expressed. Evidence for
#' differential expression is strong if a large amount of samples participate
#' in each condition and the mean expression values are well separated. Both
#' of these criteria are measured by the informative/non-informative (I/NI)
#' call.
#'
#' \item If there are more than two groups given by the vector \env{labels},
#' DEXUS uses a generalized linear model to explain the data in analogy to
#' (McCarthy, 2012).
#'
#' \item If there are two groups given by the vector \env{labels}, DEXUS
#' uses the exact test for count data to test between the
#' sample groups, as implemented by (Anders and Huber, 2010) in the package
#' "DESeq".
#' }
#'
#'
#'
#' @param X either a vector of counts or a raw data matrix, where
#' columns are interpreted as samples and rows as genomic regions. An instance
#' of "countDataSet" is also accepted.
#' @param nclasses The number of conditions, i.e. mixture components. (Default = 2)
#' @param G The weight of the prior distribution of the mixture weights.
#' Not used in the supervised case. (Default = 1).
#' @param cyc Positive integer that sets the number of cycles of the EM algorithm.
#' (Default = 20).
#' @param alphaInit The initial estimates of the condition sizes, i.e., mixture weights.
#' Not used in the supervised case. (Default = c(0.5,0.5)) .
#' @param labels The labels for the classes, will be coerced into an integer.
#' For this semi-supervised version the known labels/conditions
#' must be coded as integers starting with 1. The samples with the label 1 will
#' be considered as being in the "major condition".
#' For the samples with unknown labels/conditions an "NA" must be set.
#' @param normalization method used for normalizing the reads.
#' "RLE" is the method used by (Anders and Huber, 2010),
#' "upperquartile" is the Upper-Quartile method by (Bullard et al., 2010), and none
#' deactivates normalization. (Default = "RLE").
#' @param kmeansIter number of times the K-Means algorithm is run. (Default = 10).
#' @param ignoreIfAllCountsSmaller Ignores transcript for which all read counts
#' are smaller than this value. These transcripts are considered as "not expressed"
#' (Default = 1).
#' @param theta The weight of the prior on the size parameter or inverse
#' dispersion parameter. Theta is adjusted to each transcript by dividing
#' by the mean read count of the transcript. The higher \code{theta}, the lower \code{r} and
#' the higher the overdispersion will be. (Default = 2.5).
#' @param minMu Minimal mean for all negative binomial distributions. (Default = 0.5).
#' @param rmax Maximal value for the size parameter. The inverse of this parameter
#' is the lower bound on the dispersion. In analogy to (Anders and Huber, 2010) we
#' use 13 as default. (Default = 13).
#' @param initialization Method used to find the initial clusters.
#' Dexus can either use the quantiles of the readcounts of each gene or
#' run k-means on the counts. (Default = "kmeans").
#' @param multiclassPhiPoolingFunction In "multiClass" mode the dispersion
#' is either estimated across all classes at once (NULL), or separately for
#' each condition, i.e., class. The size parameters or dispersion per class are
#' then joined to one estimate by the mean ("mean"), minimum ("min") or maximum ("max").
#' In our investigations estimation across all classes at once performed best.
#' (Default = NULL).
#' @param quiet Logical that indicates whether dexus should report the steps
#' of the algorithm. Supresses messages from the program if set to TRUE. (Default = FALSE).
#' @param resultObject Type of the result object; can either be a list ("list")
#' or an instance of "DEXUSResult" ("S4"). (Default="S4").
#' @return "list" or "DEXUSResult".
#' A list containing the results and the parameters of the algorithm or an
#' instance of "DEXUSResult".
#' @references
#'
#' Anders, S. and Huber, W. (2010). \emph{Differential expression analysis
#' for sequence count data.} Genome Biol, 11(10), R106.
#'
#' Bullard, J. H., Purdom, E., Hansen, K. D., and Dudoit, S. (2010).
#' \emph{Evaluation of statistical methods for normalization and differential
#' expression in mRNA-seq experiments.}
#' BMC Bioinformatics, 11, 94.
#'
#' McCarthy, D. J., Chen, Y., and Smyth, G. K. (2012).
#' \emph{Differential expression analysis of multifactor RNA-Seq experiments
#' with respect to biological variation.} Nucleic Acids Res, 40(10),
#' 4288-4297.
#'
#' @examples
#' data(dexus)
#' labels1 <- substr(colnames(countsBottomly),1,2)
#' labels2 <- c()
#' labels2[which(labels1=="D2")] <- 1
#' labels2[which(labels1=="B6")] <- 2
#' labels2[c(3,7,8,10,12,15)] <- NA
#' res <- dexss(countsBottomly[1:100, ],labels=labels2,nclasses=2,G=0)
#' @aliases DEXSS, dexss
#' @useDynLib dexus
#' @author Guenter Klambauer \email{klambauer@@bioinf.jku.at} and Thomas
#' Unterthiner \email{unterthiner@@bioinf.jku.at}
#' @export
dexss <- function(X, nclasses=2, G=1, alphaInit,
cyc=20, labels, normalization = "RLE",
kmeansIter=10, ignoreIfAllCountsSmaller=1, theta=2.5, minMu=0.5,rmax=13.0,
initialization="kmeans",
multiclassPhiPoolingFunction=NULL, quiet=FALSE, resultObject="S4"){
### check input ############################################################
if (is.matrix(X)){
if (is.null(rownames(X))){
rownames(X) <- paste("Transcript_",1:nrow(X),sep="")
} else {
if (length(unique(rownames(X)))!=length(rownames(X)))
stop("Transcript names must be unique.")
}
if (is.null(colnames(X))){
colnames(X) <- paste("Sample_",1:ncol(X),sep="")
} else {
if (length(unique(colnames(X)))!=length(colnames(X)))
stop("Sample names must be unique.")
}
} else if (is.vector(X)) {
X <- matrix(X,nrow=1)
rownames(X) <- "Transcript1"
colnames(X) <- paste("Sample_",1:ncol(X),sep="")
normalization <- "none"
} else if (inherits(X,"CountDataSet")) {
object <- X
X <- DESeq::counts(X)
labels <- DESeq::conditions(object)
if (is.null(rownames(X))){
rownames(X) <- paste("Transcript_",1:nrow(X),sep="")
} else {
if (length(unique(rownames(X)))!=length(rownames(X)))
stop("Transcript names must be unique.")
}
if (is.null(colnames(X))){
colnames(X) <- paste("Sample_",1:ncol(X),sep="")
} else {
if (length(unique(colnames(X)))!=length(colnames(X)))
stop("Sample names must be unique.")
}
} else {
stop("Input data types must be a matrix or vector!")
}
if (!is.numeric(nclasses) | nclasses < 1 | nclasses!=as.integer(nclasses)){
stop("\"nclasses\" must be integer greater or equal to 1.")
}
if (is.null(labels)){
stop("Labels must be given for DEXSS!")
}
if (!(is.integer(labels) | is.numeric(labels))){
stop("Labels must be integers!")
}
if (!any(is.na(labels))){
stop("Some labels must be NA.")
}
if(length(labels)!=ncol(X)){
stop("Length of labels must be equal to number of rows in count matrix!")
}
labels <- labels-1
labels[which(is.na(labels))] <- (-1)
labels <- as.integer(labels)
if ((length(unique(labels))-1) > nclasses){
stop("\"nclasses\" must be greater or equal number of different labels!")
}
if (nclasses > ncol(X))
stop("Number of conditions greater than the number of observations.")
## if (!is.numeric(alphaInit) | any(alphaInit < 0) | length(alphaInit)!=nclasses){
## message("\"alphaINIT\" must be numeric greater and all values greater 0.")
## stop("Length of this vector must be \"nclasses\"")
## }
if (!is.numeric(G) | length(G) !=1 | G < 0){
stop("\"G\" must be a numeric greater or equal to 0.")
}
if (!is.numeric(cyc) | length(cyc) !=1 | cyc < 1){
stop("\"cyc\" must be an integer greater or equal to 1.")
}
if (missing(alphaInit))
alphaInit <- rep(1,nclasses)
if (!is.numeric(alphaInit) | any(alphaInit < 0) | length(alphaInit)!=nclasses){
message("\"alphaINIT\" must be numeric greater and all values greater 0.")
stop("Length of this vector must be \"nclasses\"")
}
if (!(normalization %in% c("none","RLE","upperquartile"))){
stop("\"normalization\" must be \"none\",\"RLE\" or \"upperquartile\".")
}
if (!is.numeric(kmeansIter) | length(kmeansIter) !=1 | kmeansIter < 1){
stop("\"kmeansIter\" must be an integer greater or equal to 1.")
}
if (!is.numeric(ignoreIfAllCountsSmaller) | length(ignoreIfAllCountsSmaller) !=1){
stop("\"ignoreIfAllCountsSmaller\" must be a numeric value.")
}
if (!is.numeric(theta) | length(theta) !=1 | theta < 0){
stop("\"theta\" must be a positive numeric value.")
}
if (!is.numeric(minMu) | length(minMu) !=1 | minMu < 0){
stop("\"minMu\" must be a positive numeric value.")
}
if (!is.numeric(rmax) | length(rmax) !=1){
stop("\"rmax\" must be a positive numeric value.")
}
if (rmax==Inf) rmax <- -1
if (!(initialization %in% c("kmeans","quantiles"))){
stop("\"initialization\" must be \"kmeans\" or \"quantiles\".")
}
if (!(is.null(multiclassPhiPoolingFunction))){
if (!(multiclassPhiPoolingFunction %in% c("min","max","mean"))){
stop(paste("\"multiclassPhiPoolingFunction\" must be \"min\"",
"\"max\", or \"mean\" or NULL"))
}
}
if (!is.logical(quiet)){
stop("\"quiet\" must be a logical.")
}
############################################################################
# Normalization
X.raw <- X
norm <- normalizeData(X, normalization)
X <- norm$X
gamma <- rep(1,nclasses)
gamma[1] <- 1+G
#eps1 <- 10^(-(1:nclasses)-6)
#gamma <- gamma + eps1
mainClassIdx <- 1
trows <- nrow(X)
exclIdx <- (apply(X,1,max) < ignoreIfAllCountsSmaller)
if (all(exclIdx)){
message("All transcripts filtered out because of low counts.")
stop("Check your count matrix or change \"ignoreIfAllCountsSmaller\".")
}
if (length(exclIdx) & !quiet)
message(paste("Filtered out",100*round(mean(exclIdx),4),
"% of the genes due to low counts"))
X.all <- X
X <- pmax(X,minMu)
X <- X[!exclIdx, ,drop=FALSE]
N <- ncol(X)
g <- nrow(X)
etaPerGene <- theta/((1+rowMeans(X)))
varToMeanT <- 1
### Semi-supervised############################################################
mode <- "semi-supervised"
if (!quiet) message("Semi-supervised mode.")
clusterFunc <- switch(initialization,
kmeans=initialize.clusters.kmeans,
quantiles=initialize.clusters.quantiles)
clusterResults <- (apply(X,1,clusterFunc, n=nclasses,
mainClassIdx=mainClassIdx, rmax=rmax,kmeansIter))
meansINIT <- sapply(clusterResults, function(x) x$means)
rINIT <- sapply(clusterResults, function(x) x$r)
alphaInit <- as.numeric(alphaInit/sum(alphaInit))
res <- .Call("dexss",as.vector(t(X)), nrow(X), ncol(X),
alphaInit, as.vector(rINIT), as.vector(meansINIT),
as.numeric(gamma), as.integer(nclasses), as.integer(cyc),
as.numeric(varToMeanT), as.integer(labels),
as.numeric(etaPerGene), as.numeric(minMu),as.numeric(rmax))
## rerun if the major class is not mainClassIdx after learning
alpha <- matrix(res$alpha,nrow=g,byrow=TRUE)
A <- array(res$A,dim=c(nclasses,N,g)) ##right!
r <- matrix(res$r,nrow=g,byrow=TRUE)
means <- matrix(res$means,nrow=g,byrow=TRUE)
resp <- t(sapply(1:g,function(i) apply(A[,,i],2,which.max) ))
rerunIdx <- which(apply(resp,1,function(x) any(table(x) > table(x)[mainClassIdx])))
if (length(rerunIdx)>0){
reorder <- t(apply(resp,1,function(x){
h <- hist(x,breaks=seq(0.5,nclasses+0.5,1),plot=FALSE)$counts
order(h,decreasing=TRUE)[order(gamma,decreasing=TRUE)]
}
))
res2 <- .Call("dexss",as.vector(t(X[rerunIdx, ,drop=FALSE])),
length(rerunIdx), ncol(X),
alphaInit,
as.vector(t(r[rerunIdx,reorder[rerunIdx, ,drop=FALSE]])),
as.vector(t(means[rerunIdx,reorder[rerunIdx, ,drop=FALSE] ])),
as.numeric(gamma), as.integer(nclasses), as.integer(cyc),
as.numeric(varToMeanT), as.integer(labels), as.numeric(etaPerGene),
as.numeric(minMu),as.numeric(rmax))
#implant res2 into res...
implantIdx <- as.vector(sapply(rerunIdx*nclasses,
function(x) (x-nclasses+1):x ))
res$r[implantIdx] <- res2$r
res$means[implantIdx] <- res2$means
res$alpha[implantIdx] <- res2$alpha
implantIdx2 <- as.vector(sapply((rerunIdx-1)*N*nclasses,
function(x) (x+1):(x+N*(nclasses) )))
res$A[implantIdx2] <- res2$A
#check: res$A[implantIdx2]==as.vector(A[,,rerunIdx])
}
pval <- rep(NA,nrow(X))
# extracting from result object
alpha <- matrix(res$alpha,nrow=g,byrow=TRUE)
# if (mode=="semi-supervised" & nclasses >2){
# oo <- t(apply(alpha,1,order,decreasing=TRUE))
# alpha <- t(sapply(seq(g), function(x) alpha[x, oo[x, ]] ))
# }
#
A <- array(res$A,dim=c(nclasses,N,g)) ##right!
# if (mode=="semi-supervised" & nclasses >2){
# A <- lapply(seq(g), function(x) A[oo[x, ], ,x] )
# A <- array(unlist(A),dim=c(nclasses,N,g))
# }
#
#
poisIdx <- which(res$r < 0)
res$r[poisIdx] <- Inf
r <- matrix(res$r,nrow=g,byrow=TRUE)
# if (mode=="semi-supervised" & nclasses >2)
# r <- t(sapply(seq(g), function(x) r[x, oo[x, ]] ))
#
means <- matrix(res$means,nrow=g,byrow=TRUE)
# if (mode=="semi-supervised" & nclasses >2)
# means <- t(sapply(seq(g), function(x) means[x, oo[x, ]] ))
#
p <- res$means / (res$means + res$r)
sds <- sqrt(res$means * (res$means + res$r) / res$r)
sds[which(r==Inf)] <- sqrt(res$means[which(r==Inf)])
sds <- matrix(sds,nrow=g,byrow=TRUE)
resp <- t(sapply(1:g,function(i) apply(A[,,i],2,which.max) ))
fc <- log((means+1e-4)/(means[,mainClassIdx]+1e-4))
INI <- sapply(1:g, function(i) sum(alpha[i, ] * abs(fc)[i, ]))
### reorder
### re-insert excluded genes
pFinal <- matrix(NA,nrow=trows,ncol=nclasses)
pFinal[exclIdx, ] <- rep(NA,nclasses)
pFinal[!exclIdx, ] <- p
rownames(pFinal) <- rownames(X.all)
colnames(pFinal) <- paste("Condition",1:nclasses,sep="_")
alphaFinal <- matrix(NA,nrow=trows,ncol=nclasses)
alphaFinal[exclIdx, ] <- rep(0,nclasses)
alphaFinal[exclIdx,mainClassIdx] <- 1
alphaFinal[!exclIdx, ] <- alpha
rownames(alphaFinal) <- rownames(X.all)
colnames(alphaFinal) <- paste("Condition",1:nclasses,sep="_")
AFinal <- array(NA,dim=c(nclasses,N,trows))
AFinal[, ,exclIdx] <- matrix(NA,nrow=nclasses,ncol=N)
AFinal[, ,!exclIdx] <- A
respFinal <- matrix(NA,nrow=trows,ncol=N)
respFinal[exclIdx, ] <- rep(mainClassIdx,N)
#browser()
#respFinal[exclIdx, ] <- matrix(rep(labels,length(which(exclIdx))),nrow=length(which(exclIdx)))
respFinal[!exclIdx, ] <- resp
rownames(respFinal) <- rownames(X.all)
colnames(respFinal) <- colnames(X.all)
rFinal <- matrix(NA,nrow=trows,ncol=nclasses)
rFinal[exclIdx, ] <- rep(NA,nclasses)
rFinal[!exclIdx, ] <- r
rownames(rFinal) <- rownames(X.all)
colnames(rFinal) <- paste("Condition",1:nclasses,sep="_")
meansFinal <- matrix(NA,nrow=trows,ncol=nclasses)
if (any(exclIdx))
meansFinal[exclIdx, ] <-
t(sapply(rowMeans(X.all[exclIdx, ,drop=FALSE]),rep,nclasses))
meansFinal[!exclIdx, ] <- means
rownames(meansFinal) <- rownames(X.all)
colnames(meansFinal) <- paste("Condition",1:nclasses,sep="_")
sdsFinal <- matrix(NA,nrow=trows,ncol=nclasses)
sdsFinal[exclIdx, ] <- rep(NA,nclasses)
sdsFinal[!exclIdx, ] <- sds
rownames(sdsFinal) <- rownames(X.all)
colnames(sdsFinal) <- paste("Condition",1:nclasses,sep="_")
fcFinal <- matrix(NA,nrow=trows,ncol=nclasses)
fcFinal[exclIdx, ] <- rep(NA,nclasses)
fcFinal[!exclIdx, ] <- fc
rownames(fcFinal) <- rownames(X.all)
colnames(fcFinal) <- paste("Condition",1:nclasses,sep="_")
INIFinal <- vector("numeric", length=trows)
INIFinal[!exclIdx] <- INI
if (length(which(exclIdx))>0)
INIFinal[exclIdx] <- runif(n=length(which(exclIdx)),min=0,max=min(INI))
names(INIFinal) <- rownames(X.all)
pvalFinal <- rep(1, length=trows)
pvalFinal[!exclIdx] <- pval
if (all(is.na(pval))){
pvalFinal[exclIdx] <- NA
} else{
pvalFinal[exclIdx] <- runif(n=length(which(exclIdx)),0.01,1.00)
}
names(pvalFinal) <- rownames(X.all)
params <- list(nclasses, alphaInit, G, cyc, labels, normalization,
kmeansIter, ignoreIfAllCountsSmaller, theta, minMu, rmax, initialization,
mode, multiclassPhiPoolingFunction,quiet,resultObject)
names(params) <- c("nclasses","alphaINIT","G","cyc","labels",
"normalization","kmeansIter","ignoreIfAllCountsSmaller",
"theta","minMu","rmax","initialization","mode",
"multiclassPhiPoolingFunction","quiet","resultObject")
if (resultObject!="S4"){
res <- list(params, X.raw, X.all, norm$sizeFactors,
AFinal, respFinal, fcFinal, INIFinal, pvalFinal,
t(rINIT), t(meansINIT), pFinal, alphaFinal,
rFinal, meansFinal, sdsFinal)
names(res) <- c("params","X","X.norm", "sizeFactors",
"A","resp","logfc","INI","pval",
"rINIT", "meansINIT","p","alpha",
"r","means","sds")
return(res)
} else {
res <- new("DEXUSResult")
res@transcriptNames <- rownames(X.all)
res@sampleNames <- colnames(X.all)
res@inputData <- X.raw
res@normalizedData <- X.all
res@sizeFactors <- norm$sizeFactors
res@INIValues <- INIFinal
res@INICalls <- INIFinal>0.1
res@INIThreshold <- 0.1
res@pvals <- pvalFinal
res@responsibilities <- respFinal
res@posteriorProbs <- AFinal
res@logFC <- fcFinal
res@conditionSizes <- alphaFinal
res@sizeParameters <- rFinal
res@means <- meansFinal
res@dispersions <- 1/rFinal
res@params <- params
return(res)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.