Nothing
#' @title Filtering FraserDataSets
#'
#' @description This method can be used to filter out introns that are not
#' reliably detected and to remove introns with no variablity between samples.
#'
#' @inheritParams countRNA
#' @param object A \code{\link{FraserDataSet}} object
#' @param minExpressionInOneSample The minimal read count in at least one
#' sample that is required for an intron to pass the filter.
#' @param quantile Defines which quantile should be considered for the filter.
#' @param quantileMinExpression The minimum read count an intron needs to have
#' at the specified quantile to pass the filter.
#' @param minDeltaPsi Only introns for which the maximal difference in the psi
#' value of a sample to the mean psi of the intron is larger than this value
#' pass the filter.
#' @param filter If TRUE, a subsetted fds containing only the introns that
#' passed all filters is returned. If FALSE, no subsetting is done and the
#' information of whether an intron passed the filters is only stored in the
#' mcols.
#' @param delayed If FALSE, count matrices will be loaded into memory,
#' otherwise the function works on the delayedMatrix representations. The
#' default value depends on the number of samples in the fds-object.
#'
#' @return A FraserDataSet with information about which junctions passed the
#' filters. If \code{filter=TRUE}, the filtered FraserDataSet is returned.
#'
#' @examples
#' fds <- createTestFraserDataSet()
#' fds <- filterExpressionAndVariability(fds, minDeltaPsi=0.1, filter=FALSE)
#' mcols(fds, type="psi5")[, c(
#' "maxCount", "passedExpression", "maxDPsi3", "passedVariability")]
#'
#' plotFilterExpression(fds)
#' plotFilterVariability(fds)
#'
#' @name filtering
#' @rdname filtering
#'
NULL
#' @describeIn filtering This functions filters out both introns with low
#' read support and introns that are not variable across samples.
#' @export
filterExpressionAndVariability <- function(object, minExpressionInOneSample=20,
quantile=0.05, quantileMinExpression=1, minDeltaPsi=0,
filter=TRUE,
delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
BPPARAM=bpparam()){
# filter introns with low read support and corresponding splice sites
object <- filterExpression(object,
minExpressionInOneSample=minExpressionInOneSample,
quantile=quantile,
quantileMinExpression=quantileMinExpression,
filter=filter, delayed=delayed,
BPPARAM=BPPARAM)
# filter introns that are not variable across samples
object <- filterVariability(object, minDeltaPsi=minDeltaPsi, filter=filter,
delayed=delayed, BPPARAM=BPPARAM)
# return fds
message(date(), ": Filtering done!")
return(object)
}
filterExpression.FRASER <- function(object, minExpressionInOneSample=20,
quantile=0.05, quantileMinExpression=1, filter=TRUE,
delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
BPPARAM=bpparam()){
stopifnot(is(object, "FraserDataSet"))
message(date(), ": Filtering out introns with low read support ...")
# extract counts
cts <- K(object, type="j")
ctsN5 <- N(object, type="psi5")
ctsN3 <- N(object, type="psi3")
if(isFALSE(delayed)){
cts <- as.matrix(cts)
ctsN5 <- as.matrix(ctsN5)
ctsN3 <- as.matrix(ctsN3)
}
# cutoff functions
f1 <- function(cts, ...){
rowMaxs(cts) }
f2 <- function(cts, ctsN5, quantile, ...){
rowQuantiles(ctsN5, probs=quantile, drop=FALSE)[,1] }
f3 <- function(cts, ctsN3, quantile, ...) {
rowQuantiles(ctsN3, probs=quantile, drop=FALSE)[,1] }
funs <- c(maxCount=f1, quantileValue5=f2, quantileValue3=f3)
# run it in parallel
cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM,
cts=cts, ctsN3=ctsN3, ctsN5=ctsN5, quantile=quantile)
# add annotation to object
for(n in names(cutoffs)){
mcols(object, type="j")[n] <- cutoffs[[n]]
}
mcols(object, type="j")[['passedExpression']] <-
cutoffs$maxCount >= minExpressionInOneSample &
(cutoffs$quantileValue5 >= quantileMinExpression |
cutoffs$quantileValue3 >= quantileMinExpression)
if("passedVariability" %in% colnames(mcols(object, type="j"))){
mcols(object, type="j")[['passed']] <-
mcols(object, type="j")[['passedExpression']] &
mcols(object, type="j")[['passedVariability']]
} else{
mcols(object, type="j")[['passed']] <-
mcols(object, type="j")[['passedExpression']]
}
# filter if requested
if(isTRUE(filter)){
object <- applyExpressionFilters(object, minExpressionInOneSample,
quantileMinExpression)
}
validObject(object)
return(object)
}
#' @describeIn filtering This function filters out introns and corresponding
#' splice sites that have low read support in all samples.
#' @export
setMethod("filterExpression", signature="FraserDataSet",
filterExpression.FRASER)
#' @describeIn filtering This function filters out introns and corresponding
#' splice sites which do not show variablity across samples.
#' @export
filterVariability <- function(object, minDeltaPsi=0, filter=TRUE,
delayed=ifelse(ncol(object) <= 300, FALSE, TRUE),
BPPARAM=bpparam()){
message(date(), ": Filtering out non-variable introns ...")
# extract counts
cts <- K(object, type="j")
ctsSE <- K(object, type="ss")
ctsN5 <- N(object, type="psi5")
ctsN3 <- N(object, type="psi3")
ctsNSE <- N(object, type="theta")
if(isFALSE(delayed)){
cts <- as.matrix(cts)
ctsN5 <- as.matrix(ctsN5)
ctsN3 <- as.matrix(ctsN3)
ctsSE <- as.matrix(ctsSE)
ctsNSE <- as.matrix(ctsNSE)
}
# cutoff functions
f1 <- function(cts, ctsN3, ...) {
psi <- cts/ctsN3
rowMaxs(abs(psi - rowMeans2(psi, na.rm=TRUE)), na.rm=TRUE) }
f2 <- function(cts, ctsN5, ...) {
psi <- cts/ctsN5
rowMaxs(abs(psi - rowMeans2(psi, na.rm=TRUE)), na.rm=TRUE) }
f3 <- function(ctsSE, ctsNSE, ...) {
theta <- ctsSE/ctsNSE
dTheta <- rowMaxs(abs(theta - rowMeans2(theta, na.rm=TRUE)),
na.rm=TRUE) }
funs <- c(maxDPsi3=f1, maxDPsi5=f2, maxDTheta=f3)
# run it in parallel
cutoffs <- bplapply(funs, function(f, ...) f(...), BPPARAM=BPPARAM,
cts=cts, ctsN3=ctsN3, ctsN5=ctsN5,
ctsSE=ctsSE, ctsNSE=ctsNSE)
# add annotation to object
for(n in names(cutoffs)){
if(n == "maxDTheta"){
mcols(object, type="ss")[n] <- cutoffs[[n]]
} else{
mcols(object, type="j")[n] <- cutoffs[[n]]
}
}
# add annotation of theta on splice sites of introns to mcols
intron_dt <- as.data.table(rowRanges(object, type="j"))
ss_dt <- as.data.table(rowRanges(object, type="ss"))
mcols(object, type="j")["maxDThetaDonor"] <-
merge(intron_dt, ss_dt, by.x="startID", by.y="spliceSiteID",
all.x=TRUE, sort=FALSE)[,maxDTheta]
mcols(object, type="j")["maxDThetaAcceptor"] <-
merge(intron_dt, ss_dt, by.x="endID", by.y="spliceSiteID",
all.x=TRUE, sort=FALSE)[,maxDTheta]
# check which introns pass the filter
mcols(object, type="j")[['passedVariability']] <- pmax(na.rm=TRUE,
cutoffs$maxDPsi3,
cutoffs$maxDPsi5,
mcols(object, type="j")$maxDThetaDonor,
mcols(object, type="j")$maxDThetaAcceptor,
0) >= minDeltaPsi
if("passedExpression" %in% colnames(mcols(object, type="j"))){
mcols(object, type="j")[['passed']] <-
mcols(object, type="j")[['passedExpression']] &
mcols(object, type="j")[['passedVariability']]
} else{
mcols(object, type="j")[['passed']] <-
mcols(object, type="j")[['passedVariability']]
}
# filter if requested
if(isTRUE(filter)){
object <- applyVariabilityFilters(object, minDeltaPsi)
}
validObject(object)
return(object)
}
#' Applies previously calculated filters for expression filters
#' @noRd
applyExpressionFilters <- function(fds, minExpressionInOneSample,
quantileMinExpression){
maxCount <- mcols(fds, type="j")[['maxCount']]
quantileValue5 <- mcols(fds, type="j")[['quantileValue5']]
quantileValue3 <- mcols(fds, type="j")[['quantileValue3']]
# report rare junctions that passed minExpression filter but not
# quantileFilter as SE obj
junctionsToReport <- maxCount >= minExpressionInOneSample &
!(quantileValue5 >= quantileMinExpression |
quantileValue3 >= quantileMinExpression)
outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds))
if(any(junctionsToReport)){
# get SE object of junctions to report
rareJunctions <- asSE(fds[junctionsToReport, by="j"])
for(aname in assayNames(rareJunctions)){
if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5",
"rawOtherCounts_psi3", "psi5", "psi3",
"delta_psi5", "delta_psi3"))){
assay(rareJunctions, aname) <- NULL
}
}
rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions,
dir=file.path(tempdir(), "tmp_rJ"),
replace=TRUE)
# check if folder already exists from previous filtering
rareJctsDir <- file.path(outputDir, "rareJunctions")
if(dir.exists(rareJctsDir)){
warning("Filtering has already been applied previously. Introns ",
"that were already filtered out but should be kept now ",
"cannot be restored.")
rJ_stored <- loadHDF5SummarizedExperiment(dir=rareJctsDir)
toReport <- mcols(rJ_stored)$maxCount >= minExpressionInOneSample &
!(mcols(rJ_stored)$quantileValue5 >= quantileMinExpression |
mcols(rJ_stored)$quantileValue3 >= quantileMinExpression)
rJ_tmp <- rbind(rJ_stored[toReport,], rareJunctions)
for(aname in assayNames(rJ_tmp)){
assay(rJ_tmp, aname) <-
rbind(as.matrix(assay(rareJunctions, aname)),
as.matrix(assay(rJ_stored[toReport,], aname)) )
}
rareJunctions <- rJ_tmp
rm(rJ_tmp)
}
rareJunctions <- saveHDF5SummarizedExperiment(rareJunctions,
dir=rareJctsDir, replace=TRUE)
}
# apply filter
numFilt <- sum(mcols(fds, type="j")[['passedExpression']])
message(paste0("Keeping ", numFilt, " junctions out of ", length(fds),
". This is ", signif(numFilt/length(fds)*100, 3),
"% of the junctions"))
fds <- fds[mcols(fds, type="j")[['passedExpression']], by="psi5"]
return(fds)
}
#' Applies previously calculated variablilty filters
#' @noRd
applyVariabilityFilters <- function(fds, minDeltaPsi){
#
passedVariability <- mcols(fds, type="j")[['passedVariability']]
# maxDPsi3 <- mcols(fds, type="j")[['maxDPsi3']]
# maxDPsi5 <- mcols(fds, type="j")[['maxDPsi5']]
# maxDThetaDonor <- mcols(fds, type="j")[['maxDThetaDonor']]
# maxDThetaAcceptor <- mcols(fds, type="j")[['maxDThetaAcceptor']]
# store information of non-variable junctions
filtered <- !passedVariability
# filtered <- (pmax(maxDPsi3, maxDPsi5, maxDThetaDonor, maxDThetaAcceptor)
# < minDeltaPsi)
outputDir <- file.path(workingDir(fds), "savedObjects", nameNoSpace(fds))
if(any(filtered)){
# get SE object of junctions to report
nonVariableJunctions <- asSE(fds[filtered, by="j"])
for(aname in assayNames(nonVariableJunctions)){
if(!(aname %in% c("rawCountsJ", "rawOtherCounts_psi5",
"rawOtherCounts_psi3", "psi5", "psi3",
"delta_psi5", "delta_psi3"))){
assay(nonVariableJunctions, aname) <- NULL
}
}
nonVariableJunctions <- saveHDF5SummarizedExperiment(replace=TRUE,
nonVariableJunctions,
dir=file.path(tempdir(), "tmp_nvJ"))
# check if folder already exists from previous filtering
nonVarJctsDir <- file.path(outputDir, "nonVariableJunctions")
if(dir.exists(nonVarJctsDir)){
warning("Filtering has already been applied previously. Introns ",
"that were already filtered out but should be kept now ",
"cannot be restored.")
nV_stored <- loadHDF5SummarizedExperiment(dir=nonVarJctsDir)
toReport <- mcols(nV_stored)$maxDPsi5 < minDeltaPsi &
mcols(nV_stored)$maxDPsi3 < minDeltaPsi &
mcols(nV_stored)$maxDThetaDonor < minDeltaPsi &
mcols(nV_stored)$maxDThetaAcceptor < minDeltaPsi
nVJunctions <- rbind(nonVariableJunctions, nV_stored[toReport,])
for(aname in assayNames(nVJunctions)){
assay(nVJunctions, aname) <-
rbind(as.matrix(assay(nonVariableJunctions, aname)),
as.matrix(assay(nV_stored[toReport,], aname)) )
}
nonVariableJunctions <- nVJunctions
rm(nVJunctions)
}
nonVariableJunctions <- saveHDF5SummarizedExperiment(dir=nonVarJctsDir,
x=nonVariableJunctions, replace=TRUE)
}
# apply filtering
numFilt <- sum(passedVariability)
message(paste0("Keeping ", numFilt, " junctions out of ", length(fds),
". This is ", signif(numFilt/length(fds)*100, 3),
"% of the junctions"))
fds <- fds[mcols(fds, type="j")[['passedVariability']], by="psi5"]
return(fds)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.