Nothing
#'
#' Merge external data
#'
#' To boost its own sequencing data, one can download existing and precounted
#' data. This function merges the existing \code{FraserDataSet} with
#' external count data.
#'
#' For more details on existing datasets have a look at:
#' <https://github.com/gagneurlab/drop#datasets>
#'
#' Since FRASER can not hand NA values, the merge will return only the
#' intersecting regions and will drop any non overlapping features. This has to
#' be kept in mind when analysing rare disease samples.
#'
#' @param fds A \code{FraserDataSet}
#' @param countFiles A character vector of file names pointing to the external
#' count data. The vector has to be names or the files have to start
#' with \code{k_j}, \code{k_theta}, \code{n_psi3}, \code{n_psi5},
#' \code{n_theta}.
#' @param sampleIDs The samples to be merged from the external data.
#' @param annotation A sample annotation of the external data (optional).
#'
#' @return Merged \code{FraserDataSet} object.
#'
#' @examples
#' anno <- fread(system.file("extdata", "externalCounts",
#' "annotation.tsv.gz", package="FRASER"))
#' ctsFiles <- list.files(full.names = TRUE, pattern="counts",
#' system.file("extdata", "externalCounts", package="FRASER"))
#'
#' fds <- createTestFraserDataSet()
#' fds_merged <- mergeExternalData(fds, ctsFiles, anno[,sampleID], anno)
#'
#' K(fds, "psi5")
#' K(fds_merged, "psi5")
#'
#' @export
mergeExternalData <- function(fds, countFiles, sampleIDs, annotation=NULL){
# check input
checkFraserDataSet(fds)
checkCountData(fds)
if(length(countFiles) != 5){
stop("You have to provide 5 files, but only ", length(countFiles),
" were provided.")
}
if(is.null(names(countFiles))){
names(countFiles) <- gsub("(_counts)?.tsv.gz", "", basename(countFiles))
}
reqNames <- c("k_j", "k_theta", "n_psi3", "n_psi5", "n_theta")
if(any(!reqNames %in% unique(names(countFiles)))){
stop("Please name the input or the files correctly. We are missing: ",
paste(collapse=", ",
reqNames[!reqNames %in% names(countFiles)]))
}
if(any(!file.exists(countFiles))){
stop("Provided files are missing! We are missing: ",
paste(collapse=", ", countFiles[!file.exists(countFiles)]))
}
if(any(unique(sampleIDs) != sampleIDs)){
stop("Provided sampleIDs have to be unique!")
}
sampleIDs <- as.character(sampleIDs)
# load external counts
message("Loading provided counts")
names(reqNames) <- reqNames
extCts <- lapply(reqNames, function(id){
gr <- makeGRangesFromDataFrame(fread(countFiles[id]),
keep.extra.columns=TRUE)
if(any(!sampleIDs %in% colnames(mcols(gr)))){
stop("Can not find provided sampleID in count data. Missing IDs: ",
paste(collapse=", ",
sampleIDs[!sampleIDs %in% colnames(mcols(gr))]))
}
gr[,sampleIDs]
})
stopifnot(all(granges(extCts[['k_j']]) == granges(extCts[['n_psi3']])))
stopifnot(all(granges(extCts[['k_j']]) == granges(extCts[['n_psi5']])))
stopifnot(all(granges(extCts[['k_theta']]) == granges(extCts[['n_theta']])))
#
# merging colData
#
message(date(), ": Merging data ...")
if(!is.null(annotation)){
annotation <- DataFrame(annotation)
} else {
annotation <- DataFrame(sampleID=as.character(sampleIDs))
}
rownames(annotation) <- annotation[,"sampleID"]
newColData <- DataFrame(rbind(fill=TRUE,
as.data.table(colData(fds)),
as.data.table(annotation[sampleIDs,])))
rownames(newColData) <- newColData[,"sampleID"]
#
# merge psi5/psi3 data
#
extractExtData <- function(fds, countFun, type, ov, extData, extName){
ans <- as.matrix(cbind(countFun(fds, type=type)[from(ov),],
mcols(extData[[extName]])[to(ov),]))
mode(ans) <- "integer"
ans
}
# find overlap
if(all(strand(rowRanges(fds, type="j")) == "*")){
for(id in reqNames){
strand(extCts[[id]]) <- "*"
}
}
ov <- findOverlaps(rowRanges(fds, type="j"), extCts[['k_j']], type="equal")
newCtsK_J <- extractExtData(fds, K, "j", ov, extCts, "k_j")
newCtsN_psi5 <- extractExtData(fds, N, "psi5", ov, extCts, "n_psi5")
newCtsN_psi3 <- extractExtData(fds, N, "psi3", ov, extCts, "n_psi3")
#
# merge theta data
#
# find overlap
ovss <- findOverlaps(rowRanges(fds, type="theta"),
extCts[['k_theta']], type="equal")
newCtsK_theta <- extractExtData(fds, K, "theta", ovss, extCts, "k_theta")
newCtsN_theta <- extractExtData(fds, N, "theta", ovss, extCts, "n_theta")
#
# finalize merged FraserDataObject
#
nsr <- SummarizedExperiment(
colData=newColData,
assays=SimpleList(
rawCountsSS=newCtsK_theta,
rawOtherCounts_theta=newCtsN_theta - newCtsK_theta),
rowRanges=rowRanges(fds, type="theta")[
from(ovss),c("spliceSiteID", "type")])
ans <- new("FraserDataSet",
name = name(fds),
bamParam = scanBamParam(fds),
strandSpecific = strandSpecific(fds),
workingDir = workingDir(fds),
colData = newColData,
assays=Assays(SimpleList(
rawCountsJ=newCtsK_J,
rawOtherCounts_psi5=newCtsN_psi5 - newCtsK_J,
rawOtherCounts_psi3=newCtsN_psi3 - newCtsK_J)),
nonSplicedReads = nsr,
rowRanges = rowRanges(fds)[from(ov),c("startID", "endID")],
elementMetadata = DataFrame(newCtsK_J[,integer(0)]))
#
# compute new psi values
#
ans <- calculatePSIValues(ans)
ans
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.