#' Two Sample Test
#'
#' @param sce a \code{SingleCellExperiment} object
#' @param sampleKey a string specifying the factor column in \code{colData(sce)}
#' that codes for samples
#' @param sample0 a string specifying the level for the baseline sample
#' @param sample1 a string specifying the level for the contrasting sample
#' @param statistic one of "WD" (Wasserstein Distance), "UI" (Usage Index),
#' or "WUI" (Weighted Usage Index). If "UI" or "WUI" is specified, then the
#' \code{featureIndex} argument must also be provided.
#' @param nBootstraps number of bootstraps samples to use estimate pvalues
#' @param geneKey a string specifying the factor column in \code{rowData(sce)}
#' that specifies the gene with which each transcript is associated
#' @param featureIndex a string specifying a column in \code{rowData(sce)}
#' that specifies either which isoforms are indexed ("UI") or the weight
#' for each isoform within the gene ("WUI"). For "UI" this should be a logical
#' that is TRUE for at least one isoform per gene, otherwise the index will constant.
#' For "WUI" mode, this should be numeric.
#' @param featureExclude a string specifying a logical column in \code{rowData(sce)}
#' of features to exclude. All isoforms with a TRUE value in the column will
#' be removed.
#' @param minCellsPerGene a numeric specifying the minimum number of cells
#' that express a gene to consider the gene testable.
#' @param assayName the assay in the \code{SingleCellExperiment} to use
#'
#' @return a \code{DataFrame} with columns
#' - \code{gene}
#' - \code{stat}
#' - \code{pval}
#'
#'
#' @importFrom Matrix fac2sparse Diagonal drop0
#' @importFrom sparseMatrixStats rowAlls colAnys
#' @import SingleCellExperiment
#' @export
testTwoSample <- function (sce, sampleKey, sample0, sample1,
statistic=c("WUI", "WD", "UI"),
nBootstraps=10000L,
geneKey="gene_id", featureIndex=NULL,
featureExclude=NULL, minCellsPerGene=50L,
assayName="counts", pseudocount=1) {
## check arguments
.validateArguments.testTwoSample(statistic, featureIndex)
## filter testable transcripts
sce <- .filterTestableTxs(sce, sampleKey, sample0, sample1, geneKey,
featureExclude, minCellsPerGene, assayName)
## compute observed statistics
cts_tx_cell <- assay(sce, assayName)
M_gene_tx <- fac2sparse(rowData(sce)[[geneKey]])
M_cell_sample <- t(fac2sparse(colData(sce)[[sampleKey]]))[,c(sample0, sample1)]
cts_tx_sample <- cts_tx_cell %*% M_cell_sample
cts_gene_sample <- M_gene_tx %*% cts_tx_sample
usage_tx_sample <- (t(M_gene_tx) %*% (1/cts_gene_sample)) * cts_tx_sample
dUsage_tx <- usage_tx_sample[,sample1] - usage_tx_sample[,sample0]
if (statistic == "WD") {
stat_gene <- M_gene_tx %*% abs(dUsage_tx) / 2.0
} else {
## technically
D_index <- Diagonal(nrow(sce), rowData(sce)[[featureIndex]])
stat_gene <- M_gene_tx %*% D_index %*% dUsage_tx
}
## bootstrap statistics
ncells_sample <- colSums(M_cell_sample)
M_cell_bs0 <- Matrix(sparse=TRUE,
data=rmultinom(nBootstraps,
ncells_sample[[sample0]],
rep(1, sum(ncells_sample))))
cts_tx_bs0 <- cts_tx_cell %*% M_cell_bs0
cts_gene_bs0 <- M_gene_tx %*% cts_tx_bs0
usage_tx_bs0 <- (t(M_gene_tx) %*% (1/cts_gene_bs0)) * cts_tx_bs0
rm(M_cell_bs0, cts_tx_bs0, cts_gene_bs0); gc()
M_cell_bs1 <- Matrix(sparse=TRUE,
data=rmultinom(nBootstraps,
ncells_sample[[sample1]],
rep(1, sum(ncells_sample))))
cts_tx_bs1 <- cts_tx_cell %*% M_cell_bs1
cts_gene_bs1 <- M_gene_tx %*% cts_tx_bs1
usage_tx_bs1 <- (t(M_gene_tx) %*% (1/cts_gene_bs1)) * cts_tx_bs1
rm(M_cell_bs1, cts_tx_bs1, cts_gene_bs1, cts_tx_cell); gc()
## Note: switching order of Kronecker product effectively computes all
## combinations of columns
#K_repeat <- matrix(rep(1, nBootstraps), nrow=1)
#dUsage_tx_bs <- (K_repeat %x% usage_tx_bs1) - (usage_tx_bs0 %x% K_repeat)
dUsage_tx_bs <- usage_tx_bs1 - usage_tx_bs0
rm(usage_tx_bs1, usage_tx_bs0); gc()
if (statistic == "WD") {
stat_gene_bs <- M_gene_tx %*% abs(dUsage_tx_bs) / 2.0
} else {
stat_gene_bs <- M_gene_tx %*% D_index %*% dUsage_tx_bs
}
rm(dUsage_tx_bs); gc()
pval_gene <- rowMeans(abs(stat_gene_bs) - abs(stat_gene) >= 0, na.rm=TRUE)
nbs_true <- rowSums(!is.na(stat_gene_bs))
## adjust by pseudocount
##pval_gene <- (pval_gene * nBootstraps^2 + pseudocount) / (nBootstraps^2 + pseudocount)
pval_gene <- (pval_gene * nbs_true + pseudocount) / (nbs_true + pseudocount)
## format dataframe
DataFrame(gene=rownames(M_gene_tx), stat=stat_gene[,1],
pval=pval_gene, bootstraps=nbs_true)
}
#' Validate arguments to testTwoSample
#'
#' @param statistic character argument from \code{testTwoSample}
#' @param featureIndex character argument from \code{testTwoSample}
#'
#' @return
#'
#' @noRd
.validateArguments.testTwoSample <- function (statistic, featureIndex) {
if (statistic == 'WUI') {
if (is.null(featureIndex)) {
stop("The 'WUI' (Weighted Usage Index) statistic requires a `featureIndex` to be provided!")
}
} else if (statistic == 'UI') {
if(is.null(featureIndex)) {
stop("The 'UI' (Usage Index) statistic requires a `featureIndex` to be provided!")
}
} else if (statistic == 'WD') {
if (!is.null(featureIndex)) {
warning("The 'WD' (Wasserstein Distance) statistic ignores the `featureIndex` argument.")
}
}
}
#' Filter to testable transcripts
#'
#' @param sce a \code{SingleCellExperiment} object
#' @param sampleKey a string specifying the factor column in \code{colData(sce)}
#' that codes for samples
#' @param sample0 a string specifying the level for the baseline sample
#' @param sample1 a string specifying the level for the contrasting sample
#' @param geneKey a string specifying the factor column in \code{rowData(sce)}
#' that specifies the gene with which each transcript is associated
#' @param featureExclude a string specifying a logical column in \code{rowData(sce)}
#' of features to exclude. All isoforms with a TRUE value in the column will
#' be removed.
#' @param minCellsPerGene a numeric specifying the minimum number of cells
#' that express a gene to consider the gene testable.
#' @param assayName the assay in the \code{SingleCellExperiment} to use
#'
#' @return
#'
#' @noRd
#' @importFrom Matrix fac2sparse Diagonal drop0
#' @importFrom sparseMatrixStats rowAlls colAnys
#' @import SingleCellExperiment
.filterTestableTxs <- function (sce, sampleKey, sample0, sample1, geneKey,
featureExclude, minCellsPerGene,
assayName) {
## filter cells
sce <- sce[,colData(sce)[[sampleKey]] %in% c(sample0, sample1)]
## filter excluded transcripts
if (!is.null(featureExclude)) {
if (is.logical(featureExclude)) {
sce <- sce[!featureExclude,]
} else if (featureExclude %in% names(rowData(sce))) {
sce <- sce[!rowData(sce)[[featureExclude]],]
} else {
warning("The `featureExclude` argument must either be a logical or",
" a column name of `rowData(sce)`! Ignoring argument.")
}
}
## filter unexpressed transcripts
sce <- sce[rowSums(assay(sce, assayName)) > 0,]
## filter untestable transcripts
cts_tx_cell <- assay(sce, assayName)
M_gene_tx <- fac2sparse(rowData(sce)[[geneKey]])
M_cell_sample <- t(fac2sparse(colData(sce)[[sampleKey]]))[,c(sample0, sample1)]
ncells_gene_sample <- ((M_gene_tx %*% cts_tx_cell) > 0) %*% M_cell_sample
idxExpressedGenes <- rowAlls(drop0(ncells_gene_sample >= minCellsPerGene))
idxMultiTxGenes <- rowSums(M_gene_tx) > 1
idxTestableTxs <- colAnys(M_gene_tx[idxExpressedGenes & idxMultiTxGenes,])
sce[idxTestableTxs,]
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.