##' Prepare the transcript expression matrix. Transcripts with low
##' expression are removed. Genes expressing only one transcript
##' are removed because they are not informative for splicing analysis.
##' The relative transcript expression is retrieved and genes with
##' low variability in splicing are removed. Also genes with a small number (<25) of
##' different splicing patterns are removed as they don't fit the
##' permutation process.
##'
##' After removal of inappropriate transcripts/genes, transcript
##' relative expression is computed by dividing its expression
##' by the total expression of all transcripts for the gene.
##' @title Transcript expression preparation
##' @param te.df a data.frame with the transcript expression. The first
##' two columns are the transcript and gene ids, named 'trId' and
##' 'geneId'. The rest of the columns contain the expression values for each sample.
##' @param min.transcript.exp the minimum transcript expression. Transcripts
##' with lower expression in all samples are removed.
##' @param min.gene.exp the minimum gene expression. Samples with lower
##' gene expression are removed from the analysis of this gene. If after this step
##' the proportion of retained samples for a given gene is smaller than
##' \code{min.prop} (see below), the gene is removed from the analysis.
##' @param min.prop the minimum proportion of samples with gene expression above
##' \code{min.gene.exp} for the gene to be analyzed.
##' @param min.dispersion the minimum dispersion of the transcript
##' relative expression. Genes with lower dispersion are removed.
##' @param verbose If \code{TRUE} the names of filtered genes and transcripts will be displayed.
##' Default is \code{FALSE}.
##' @return a data.frame with the relative transcript expression for the
##' genes to study.
##' @author Jean Monlong, Diego Garrido-Martín
##' @export
prepare.trans.exp <- function(te.df, min.transcript.exp = 0.01, min.gene.exp = 0.01,
min.prop = 0.8, min.dispersion = 0.1, verbose = FALSE)
{
if(!all(c("geneId", "trId") %in% colnames(te.df))){
stop("Missing column in 'te.df' : 'geneId' and 'trId' are required.")
}
if(min.transcript.exp < 0 || min.gene.exp < 0){
stop("transcript/gene minimum expression should be greater or equal to 0.")
}
if(min.prop < 0 || min.prop > 1){
stop("'min.prop' value should be in [0, 1].")
}
te.df$geneId <- as.character(te.df$geneId)
te.df$trId <- as.character(te.df$trId)
samples <- setdiff(colnames(te.df), c("chr", "start", "end", "geneId", "trId"))
if(length(samples) < 5){
stop("Not enough samples; at least 5 samples required (although at least 20 is recommended).")
}
if(length(samples) < 20){
warning("Low sample size : it's recommended to have at least 20 samples.")
}
trans.to.keep <- apply(te.df[, samples], 1, function(r) any(r >= min.transcript.exp))
if(all(!trans.to.keep)){
stop("No transcripts with expression above threshold")
}
if(verbose && any(!trans.to.keep)){
message("Filtered transcripts due to low expression : ",
paste(te.df$trId[which(!trans.to.keep)], collapse = " "))
}
te.df <- te.df[which(trans.to.keep), ]
nb.trans <- table(te.df$geneId)
trans2 <- names(which(nb.trans > 1))
if(verbose && any(nb.trans <= 1)){
message("Filtered single-transcript genes : ",
paste(setdiff(unique(te.df$geneId), trans2), collapse = " "))
}
te.df <- te.df[which(te.df$geneId %in% trans2), ]
relativize.filter.dispersion <- function(df) {
df[, samples] <- apply(df[, samples], 2, relativize, min.gene.exp = min.gene.exp)
disp <- te.dispersion(df[, samples])
non.na.p <- sum(!is.na(df[1, samples])) / length(samples)
if (disp >= min.dispersion && non.na.p >= min.prop &&
nbDiffPt(df[, samples]) >= min(25, length(samples) * 0.8)){
return(df)
} else {
return(data.frame())
}
}
te.df <- plyr::ldply(lapply(unique(te.df$geneId), function(gene.i){
df <- te.df[which(te.df$geneId == gene.i), ]
relativize.filter.dispersion(df)
}), identity)
if(verbose && length(unique(te.df$geneId)) != length(trans2)){
message("Filtered low exp/disp genes : ",
paste(setdiff(trans2,unique(te.df$geneId)), collapse = " "))
}
if(nrow(te.df) == 0){
stop("No genes found with suitable transcript expression.")
}
return(te.df)
}
##' Compute the relative expression of transcripts.
##' @title Relative expression computation
##' @param x a vector of transcript expression.
##' @param min.gene.exp the minimum gene expression in the sample. If the
##' gene expression is too low it is safer to remove the sample from the
##' analysis.
##' @return a vector with the relative expression.
##' @author Jean Monlong
##' @keywords internal
relativize <- function(x, min.gene.exp = 0.01)
{
x <- as.numeric(x)
if (!any(is.na(x)) && sum(x) > min.gene.exp) {
x/sum(x)
} else {
return(rep(NA, length(x)))
}
}
##' Compute the dispersion from a gene's transcript expression matrix.
##' Dispersion is computed as the mean Hellinger distance to the centroid.
##' @title Splicing dispersion computation
##' @param tr a data.frame with the splicing ratios (transcript x sample).
##' @return a value for the dispersion.
##' @author Diego Garrido-Martín
##' @keywords internal
te.dispersion <- function (tr)
{
c <- as.numeric(apply(tr, 1, function(x)(mean(x, na.rm = T))))
d <- mean(apply(tr, 2, function(x)(hellingerDist.p(x, c))), na.rm = T)
return(d)
}
##' Compute the number of non-identical splicing ratios. Too few splicing ratios are
##' problematic for the permutation process. If many samples have exactly the same
##' splicing ratios, the permutation might create many identical permuted F scores.
##' @title Number of non-identical splicing ratio points
##' @param sr a data.frame with the splicing ratios (transcript x sample).
##' @return the number of non-identical splicing ratios.
##' @author Jean Monlong
##' @keywords internal
nbDiffPt <- function(sr)
{
length(unique(apply(sr, 2, paste, collapse = "-")))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.