#' @title Estimates the proportions of mixed samples for each mixing component
#'
#' @description This function is designed to estimate the deconvolved
#' expressions of individual mixed tumor samples for unknown component
#' for each gene.
#'
#' @param data.Y A SummarizedExperiment object of expression data from mixed
#' tumor samples. It is a \eqn{G} by \eqn{My} matrix where \eqn{G} is the number
#' of genes and \eqn{My} is the number of mixed samples. Samples with the same
#' tissue type should be placed together in columns.
#' @param data.N1 A SummarizedExperiment object of expression data
#' from reference component 1 (e.g., normal). It is a \eqn{G} by \eqn{M1} matrix
#' where \eqn{G} is the number of genes and \eqn{M1} is the number of samples
#' for component 1.
#' @param data.N2 A SummarizedExperiment object of expression data from
#' additional reference samples. It is a \eqn{G} by \eqn{M2} matrix where
#' \eqn{G} is the number of genes and \eqn{M2} is the number of samples for
#' component 2. Component 2 is needed only for running a three-component model.
#' @param niter The maximum number of iterations used in the algorithm of
#' iterated conditional modes. A larger value better guarantees
#' the convergence in estimation but increases the running time. The default is
#' 10.
#' @param nbin The number of bins used in numerical integration for computing
#' complete likelihood. A larger value increases accuracy in estimation but
#' increases the running time, especially in a three-component deconvolution
#' problem. The default is 50.
#' @param if.filter The logical flag indicating whether a predetermined filter
#' rule is used to select genes for proportion estimation. The default is TRUE.
#' @param filter.sd The cut-off for the standard deviation of lognormal
#' distribution. Genes whose log transferred standard deviation smaller than
#' the cut-off will be selected into the model. The default is 0.5.
#' @param ngene.selected.for.pi The percentage or the number of genes used for
#' proportion estimation. The difference between the expression levels from
#' mixed tumor samples and the known component(s) are evaluated, and the most
#' differential expressed genes are selected, which is called DE. It is enabled
#' when if.filter = TRUE. The default is \eqn{min(1500, 0.3*G)}, where
#' \eqn{G} is the number of genes. Users can also try using more genes,
#' ranging from \eqn{0.3*G} to \eqn{0.5*G}, and evaluate the outcome.
#' @param nspikein The number of spikes in normal reference used for proportion
#' estimation. The default value is \eqn{ min(200, 0.3*My)}, where
#' \eqn{My} the number of mixed samples. If it is set to 0, proportion
#' estimation is performed without any spike in normal reference.
#' @param mean.diff.in.CM Threshold of expression difference for selecting genes
#' in the component merging strategy. We merge three-component to two-component
#' by selecting genes with similar expressions for the two known components.
#' Genes with the mean differences less than the threshold will be selected for
#' component merging. It is used in the three-component setting, and is enabled
#' when if.filter = TRUE. The default is 0.25.
#' @param tol The convergence criterion. The default is 10^(-5).
#' @param pi01 Initialized proportion for first kown component. The default is
#' \eqn{Null} and pi01 will be generated randomly from uniform distribution.
#' @param pi02 Initialized proportion for second kown component. pi02 is needed
#' only for running a three-component model. The default is \eqn{Null} and pi02
#' will be generated randomly from uniform distribution.
#' @param nthread The number of threads used for deconvolution when OpenMP is
#' available in the system. The default is the number of whole threads minus one.
#' In our no-OpenMP version, it is set to 1.
#'
#' @return
#' \item{pi}{A matrix of estimated proportion. First row and second row
#' corresponds to the proportion estimate for the known components and unkown
#' component respectively for two or three component settings, and each column
#' corresponds to one sample.}
#' \item{pi.iter}{Estimated proportions in each iteration. It is a \eqn{niter
#' *Ny*p} array, where \eqn{p} is the number of components. This is
#' enabled only when output.more.info = TRUE.}
#' \item{gene.name}{The names of genes used in estimating the proportions.
#' If no gene names are provided in the original data set, the genes will
#' be automatically indexed.}
#'
#' @author Zeya Wang, Wenyi Wang
#'
#' @seealso http://bioinformatics.mdanderson.org/main/DeMixT
#'
#' @examples
#' # Example 1: estimate proportions for simulated two-component data
#' # with spike-in normal reference
#' data(test.data.2comp)
#' # res.DE = DeMixT_DE(data.Y = test.data.2comp$data.Y,
#' # data.N1 = test.data.2comp$data.N1,
#' # niter = 10, nbin = 50, nspikein = 50,
#' # if.filter = TRUE,
#' # mean.diff.in.CM = 0.25, ngene.selected.for.pi = 150,
#' # tol = 10^(-5))
#' #
#' # Example 2: estimate proportions for simulated two-component data
#' # without spike-in normal reference
#' # data(test.data.2comp)
#' # res.DE = DeMixT_DE(data.Y = test.data.2comp$data.Y,
#' # data.N1 = test.data.2comp$data.N1,
#' # niter = 10, nbin = 50, nspikein = 0,
#' # if.filter = TRUE,
#' # mean.diff.in.CM = 0.25, ngene.selected.for.pi = 150,
#' # tol = 10^(-5))
#' #
#' # Example 3: estimate proportions for simulated three-component
#' # mixed cell line data
#' # data(test.data.3comp)
#' # res.DE <- DeMixT_DE(data.Y = test.data.3comp$data.Y,
#' # data.N1 = test.data.3comp$data.N1,
#' # data.N2 = test.data.3comp$data.N2,
#' # if.filter = TRUE)
#'
#' @references Wang Z, Cao S, Morris J S, et al. Transcriptome Deconvolution of
#' Heterogeneous Tumor Samples with Immune Infiltration. iScience, 2018, 9: 451-460.
#'
#' @keywords DeMixT_DE
#'
#' @export
DeMixT_DE <- function (data.Y, data.N1, data.N2 = NULL, niter = 10,
nbin = 50, if.filter = TRUE, filter.sd = 0.5,
ngene.selected.for.pi = NA, nspikein = NULL,
mean.diff.in.CM = 0.25, tol = 10^(-5),
pi01 = NULL, pi02 = NULL,
nthread = parallel::detectCores() - 1)
{
filter.option = 1
## Transfering data format
data.Y <- SummarizedExperiment::assays(data.Y)[[1]]
data.N1 <- SummarizedExperiment::assays(data.N1)[[1]]
if (is.na(ngene.selected.for.pi)){
ngene.selected.for.pi = min(1500, round(0.3*nrow(data.Y)))
}
nS = ncol(data.Y)
## Generate Spike-in normal reference
if (!is.null(data.N2)) nspikein = 0
if (is.null(nspikein)) nspikein = min(200, ceiling(ncol(data.Y)*0.3))
if (nspikein > 0){
MuN = apply(log2(data.N1), 1, mean)
MuN[which(!is.finite(MuN))] =
apply(log2(data.N1[which(!is.finite(MuN)), ] + 0.1), 1, mean)
SigmaN = apply(log2(data.N1), 1, sd)
SigmaN[which(!is.finite(SigmaN))] =
apply(log2(data.N1[which(!is.finite(SigmaN)), ] + 0.1), 1, sd)
Spikein.normal = array(0, c(nrow(data.Y), nspikein))
for(k in 1:nrow(data.Y)) {
Spikein.normal[k, ] = 2^rnorm(n = nspikein, mean = MuN[k], sd = SigmaN[k])
}
data.Y = cbind(data.Y, Spikein.normal)
}
if (!is.null(data.N2))
data.N2 <- SummarizedExperiment::assays(data.N2)[[1]]
## Creat row and column names for input data
if (is.null(rownames(data.Y))) {
rownames(data.Y) <- paste('Gene', seq = seq(1, nrow(data.Y)))
rownames(data.N1) <- paste('Gene', seq = seq(1, nrow(data.N1)))
if(!is.null(data.N2)){
rownames(data.N2) <- paste('Gene', seq = seq(1, nrow(data.N2)))
}
}
if (is.null(colnames(data.Y))) {
colnames(data.Y) <- paste('Sample', seq = seq(1, ncol(data.Y)))
colnames(data.N1) <- paste('Sample', seq = seq(1, ncol(data.N1)))
if(!is.null(data.N2)){
colnames(data.N2) <- paste('Sample', seq = seq(1, ncol(data.N2)))
}
}
## Merge the inputdata and creat group id
if (is.null(data.N2)) {
inputdata <- cbind(data.N1, data.Y)
groupid <- c(rep(1, ncol(data.N1)), rep(3, ncol(data.Y)))
}else {
inputdata <- cbind(data.N1, data.N2, data.Y)
groupid <- c(rep(1, ncol(data.N1)), rep(2, ncol(data.N2)),
rep(3, ncol(data.Y)))
}
## filter.option: 1 - remove genes containing zero; 2 - add 1 to to kill zeros
if (filter.option == 1) {
index <- apply(inputdata, 1, function(x) sum(x <= 0) ==
0)
inputdata <- inputdata[index, ]
data.N1 <- data.N1[index, ]
if (!is.null(data.N2))
data.N2 <- data.N2[index, ]
data.Y <- data.Y[index, ]
}else if (filter.option == 2) {
inputdata <- inputdata + 1
data.N1 <- data.N1 + 1
if (!is.null(data.N2))
data.N2 <- data.N2 + 1
data.Y <- data.Y + 1
}else {
stop("The argument filter.option can only be 1 or 2")
}
## filter out genes with constant value across all samples
if (is.null(data.N2)) {
if (dim(inputdata)[1] == 1) {
inputdata <- t(as.matrix(inputdata[apply(data.N1,
1, function(x) length(unique(x)) > 1), ]))
}
else {
inputdata <- inputdata[apply(data.N1, 1, function(x)
length(unique(x)) > 1), ]
}
}else{
if (dim(inputdata)[1] == 1) {
inputdata <- t(as.matrix(inputdata[apply(data.N1,
1, function(x) length(unique(x)) > 1) & apply(data.N2,
1, function(x) length(unique(x)) > 1), ]))
}
else {
inputdata <- inputdata[apply(data.N1, 1, function(x)
length(unique(x)) > 1) & apply(data.N2, 1, function(x)
length(unique(x)) > 1), ]
}
}
filter2 <- function(inputdata1r, ngene.selected.for.pi, n = 1) {
if ((ngene.selected.for.pi > 1) & (ngene.selected.for.pi%%1 ==
0)) {
id2 <- order(inputdata1r, decreasing = TRUE)
id2 <- id2[seq(1, min(n * ngene.selected.for.pi,
length(inputdata1r)))]
}
else if ((ngene.selected.for.pi < 1) & (ngene.selected.for.pi >
0)) {
id2 <- (inputdata1r > quantile(inputdata1r, probs = 1 -
n * ngene.selected.for.pi))
}
else {
stop("The argument ngene.selected.for.pi can only be
an integer or a percentage between 0 and 1")
}
if (sum(id2) < 20)
stop("there are too few genes for filtering stage 1.\n
Please relax threshold for filtering ")
return(inputdatamat1[id2, ])
}
inputdata = as.matrix(inputdata)
if (if.filter == FALSE) {
gene.name <- rownames(inputdata)
res <- Optimum_KernelC(inputdata, groupid,
setting.pi = 0, nspikein = nspikein,
givenpi = rep(0, 2 * ncol(data.Y)), givenpiT = rep(0, ncol(data.Y)),
niter = niter, ninteg = nbin, tol = tol, pi01 = pi01, pi02 = pi02,
nthread = nthread)
}
if (if.filter == TRUE & is.null(data.N2)) {
#inputdatans <- rowSds(log2(data.N1))
inputdatans <- apply(log2(data.N1), 1, sd)
id1 <- (inputdatans < filter.sd)
if (sum(id1) < 20)
stop("The threshold of standard variation is too stringent. \n
Please provide a larger threshold. ")
inputdatamat1 <- inputdata[id1, ]
inputdatamat1nm <- rowMeans(inputdatamat1[, groupid ==
1])
inputdatamat1ym <- rowMeans(inputdatamat1[, groupid ==
3])
inputdata1r <- inputdatamat1ym/inputdatamat1nm
inputdata2 <- filter2(inputdata1r, ngene.selected.for.pi)
gene.name <- rownames(inputdata2)
res <- Optimum_KernelC(inputdata2, groupid, setting.pi = 0, nspikein = nspikein,
givenpi = rep(0, 2 * ncol(data.Y)), givenpiT = rep(0, ncol(data.Y)),
niter = niter, ninteg = nbin, tol = tol, pi01 = pi01, pi02 = pi02,
nthread = nthread)
}
if (if.filter == TRUE & !is.null(data.N2)) {
message("Fitering stage 1 starts\n")
inputdatan1m <- rowMeans(log2(inputdata[, groupid ==
1]))
inputdatan2m <- rowMeans(log2(inputdata[, groupid ==
2]))
#inputdatan1s <- rowSds(log2(inputdata[, groupid == 1]))
inputdatan1s <- apply(log2(inputdata[, groupid == 1]), 1, sd)
#inputdatan2s <- rowSds(log2(inputdata[, groupid == 2]))
inputdatan2s <- apply(log2(inputdata[, groupid == 2]), 1, sd)
id1 <- ((abs(inputdatan1m - inputdatan2m) < mean.diff.in.CM) &
(inputdatan1s < filter.sd) & (inputdatan2s < filter.sd))
if (sum(id1) < 20)
stop("The thresholds of standard variation and \n
mean difference are too stringent. \n
Please provide larger thresholds")
inputdatamat1 <- inputdata[id1, ]
inputdatamat1nm <- rowMeans(inputdatamat1[, (groupid ==
1) | (groupid == 2)])
inputdatamat1ym <- rowMeans(inputdatamat1[, (groupid ==
3)])
#inputdata1r <- rowSds(inputdatamat1[, groupid == 3])
inputdata1r <- apply(inputdatamat1[, groupid == 3], 1, sd)
inputdatamat2 <- filter2(inputdata1r, ngene.selected.for.pi)
cnvgroup <- groupid
cnvgroup[groupid == 2] <- 1
res1 <- Optimum_KernelC(inputdatamat2, cnvgroup, setting.pi = 0, nspikein = nspikein,
givenpi = rep(0, ncol(data.Y)), givenpiT = rep(0, ncol(data.Y)),
niter = niter, ninteg = nbin, tol = tol, pi01 = pi01, pi02 = pi02,
nthread = nthread)
fixed.piT <- 1 - as.numeric(res1$pi[1, ])
message("Filtering stage 1 is finished\n")
message("Filtering stage 2 starts\n")
id3 <- ((inputdatan1s < filter.sd) & (inputdatan2s <
filter.sd))
if (sum(id3) < 20)
stop("The thresholds of standard variation and \n
mean difference are too stringent. \n
Please provide larger thresholds")
inputdatamat1 <- inputdata[id3, ]
inputdatan1m <- rowMeans(log2(inputdatamat1[, groupid ==
1]))
inputdatan2m <- rowMeans(log2(inputdatamat1[, groupid ==
2]))
inputdata1d <- abs(inputdatan1m - inputdatan2m)
inputdatamat2 <- filter2(inputdata1d, ngene.selected.for.pi,
2)
#inputdata1s <- rowSds(inputdatamat2[, groupid == 3])
inputdata1s <- apply(inputdatamat2[, groupid == 3], 1, sd)
id5 <- (inputdata1s > quantile(inputdata1s, probs = 0.5))
inputdatamat3 <- inputdatamat2[id5, ]
gene.name <- rownames(inputdatamat3)
res <- Optimum_KernelC(inputdatamat3, groupid,
nspikein = nspikein, setting.pi = 2,
givenpi = rep(0, 2 * ncol(data.Y)), givenpiT = fixed.piT,
niter = niter, ninteg = nbin, tol = tol, pi01 = pi01, pi02 = pi02,
nthread = nthread)
message("Filtering stage 2 is finished")
}
if(nspikein == 0){
pi <- rbind(t(as.matrix(res$pi[1, ])),
1 - t(as.matrix(res$pi[1, ])))
row.names(pi) <- c("PiN1", "PiT")
colnames(pi) <- colnames(data.Y)
pi1 <- as.matrix(res$pi1)
colnames(pi1) <- as.character(seq(1, ncol(pi1)))
row.names(pi1) = colnames(data.Y)
## Save pi.iter
pi.iter <- array(0, c(ncol(res$pi1), nrow(res$pi1), 2))
pi.iter[ , , 1] <- t(res$pi1)
pi.iter[ , , 2] <- 1 - pi.iter[ , , 1]
colnames(pi.iter) <- colnames(data.Y)
row.names(pi.iter) <- paste('iter ', seq = c(1:nrow(pi.iter)))
}else{
pi <- rbind(t(as.matrix(res$pi[1, c(1:nS)])),
1 - t(as.matrix(res$pi[1, c(1:nS)])))
row.names(pi) <- c("PiN1", "PiT")
colnames(pi) <- colnames(data.Y)[c(1:nS)]
pi1 <- as.matrix(res$pi1[c(1:nS),])
colnames(pi1) <- as.character(seq(1, ncol(pi1)))
row.names(pi1) = colnames(data.Y[,c(1:nS)])
## Save pi.iter
pi.iter <- array(0, c(ncol(res$pi1), nrow(res$pi1)-nspikein, 2))
pi.iter[ , , 1] <- t(res$pi1)[,c(1:nS)]
pi.iter[ , , 2] <- 1 - pi.iter[ , , 1]
colnames(pi.iter) <- colnames(data.Y)[1:nS]
row.names(pi.iter) <- paste('iter ', seq = c(1:nrow(pi.iter)))
}
if(IQR(pi[2,]) < 0.01){
stop('DeMixT Estimation Failed, try different initial values for pi')
}
if (!is.null(data.N2)) {
pi <- rbind(res$pi,
t(as.matrix(apply(res$pi, 2, function(x) 1-sum(x)))))
row.names(pi) <- c("PiN1" ,"PiN2", "PiT")
colnames(pi) <- colnames(data.Y)
pi2 <- as.matrix(res$pi2)
colnames(pi2) <- as.character(seq(1, ncol(pi2)))
row.names(pi2) = colnames(data.Y)
## Save pi.iter
pi.iter <- array(0, c(ncol(res$pi1), nrow(res$pi1), 3))
pi.iter[ , , 1] <- t(res$pi1)
pi.iter[ , , 2] <- t(res$pi2)
pi.iter[ , , 3] <- 1 - (pi.iter[ , , 1] + pi.iter[ , , 2])
# pi.iter <- array(t(rbind(res$pi1, res$pi2)),
# dim <- c(ncol(res$pi1), nrow(res$pi1), 2))
colnames(pi.iter) <- colnames(data.Y)
row.names(pi.iter) <- paste('iter ', seq = c(1:nrow(pi.iter)))
}
cat("Estimation of Proportions finished:\n")
print(round(t(pi),4))
return(list(pi = pi, pi.iter = pi.iter, gene.name = gene.name))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.