#' Scale transcript coverages by abundance estimate
#'
#' This function takes a list of predicted transcript coverage profiles (from
#' \code{\link{predictTxCoverage}}) and corresponding transcript abundance
#' estimates (from, e.g., Salmon, kallisto, StringTie, RSEM, or any other
#' transcript abundance estimation software), and scales the coverage profiles
#' by the abundance estimates in order to determine the predicted number of
#' reads covering each position in the transcript. In addition, it extracts the
#' predicted number of reads spanning each junction in each transcript, and sums
#' up these for each annotated junction, across all transcripts containing the
#' junction.
#'
#' @param txCoverageProfiles A \code{list} of predicted transcript coverage
#' profiles (output from \code{\link{predictTxCoverage}}).
#' @param txQuants A \code{data.frame} with estimated transcript abundances.
#' Must contain columns named \code{transcript}, \code{TPM} and \code{count},
#' all other columns will be ignored.
#' @param tx2gene A \code{data.frame} with transcript-to-gene mapping. Must have
#' at least two columns, named \code{tx} and \code{gene}.
#' @param strandSpecific Logical, whether or not the data is strand-specific.
#' Determines whether or not the strand of a junction should be taken into
#' account when the coverages are summed up across transcripts.
#' @param methodName Name to use for the abundance estimation method in the
#' output table.
#' @param verbose Logical, whether to print progress messages.
#'
#' @return A list with two elements:
#' \describe{
#' \item{\code{junctionPredCovs}:}{A \code{tibble} with the predicted coverage
#' for each junction, scaled by the estimated transcript abundances and
#' summarized across all transcripts.}
#' \item{\code{txQuants}:}{A \code{tibble} with estimated transcript abundances,
#' including information about whether or not the coverage profile could be
#' predicted ('covOK') or if a uniform coverage was imposed ('covError' or
#' 'covNA').}
#' }
#'
#' @author Charlotte Soneson
#'
#' @export
#'
#' @references
#' Soneson C, Love MI, Patro R, Hussain S, Malhotra D, Robinson MD: A junction
#' coverage compatibility score to quantify the reliability of transcript
#' abundance estimates and annotation catalogs. bioRxiv doi:10.1101/378539
#' (2018)
#'
#' @examples
#' \dontrun{
#' gtf <- system.file("extdata/Homo_sapiens.GRCh38.90.chr22.gtf.gz",
#' package = "jcc")
#' bam <- system.file("extdata/reads.chr22.bam", package = "jcc")
#' biasMod <- fitAlpineBiasModel(gtf = gtf, bam = bam,
#' organism = "Homo_sapiens",
#' genome = Hsapiens, genomeVersion = "GRCh38",
#' version = 90, minLength = 230,
#' maxLength = 7000, minCount = 10,
#' maxCount = 10000, subsample = TRUE,
#' nbrSubsample = 30, seed = 1, minSize = NULL,
#' maxSize = 220, verbose = TRUE)
#' tx2gene <- readRDS(system.file("extdata/tx2gene.sub.rds", package = "jcc"))
#' predCovProfiles <- predictTxCoverage(biasModel = biasMod$biasModel,
#' exonsByTx = biasMod$exonsByTx,
#' bam = bam, tx2gene = tx2gene,
#' genome = Hsapiens,
#' genes = c("ENSG00000070371",
#' "ENSG00000093010"),
#' nCores = 1, verbose = TRUE)
#' txQuants <- readRDS(system.file("extdata/quant.sub.rds", package = "jcc"))
#' txsc <- scaleTxCoverages(txCoverageProfiles = predCovProfiles,
#' txQuants = txQuants, tx2gene = tx2gene,
#' strandSpecific = TRUE, methodName = "Salmon",
#' verbose = TRUE)
#' }
#'
#' @importFrom dplyr mutate group_by ungroup left_join select distinct select
#' rename
#'
scaleTxCoverages <- function(txCoverageProfiles, txQuants, tx2gene,
strandSpecific, methodName, verbose = FALSE) {
## Keep only transcript ID, count and TPM columns in quantification table
txQuants <- txQuants %>% dplyr::select(transcript, count, TPM)
transcripts <- names(txCoverageProfiles)
names(transcripts) <- transcripts
## Go through all transcripts and scale predicted junction coverage by the
## estimated abundance
scaledcovs <- lapply(transcripts, function(tx) {
tryCatch({
ab <- txQuants$count[txQuants$transcript == tx]
## if the transcript is not present in the quantification file
if (length(ab) == 0 || is.na(ab)) ab <- 0
m <- txCoverageProfiles[[tx]]$junctionCov
m$pred.cov <- m$pred.cov / max(1e-10,
sum(txCoverageProfiles[[tx]]$predCov)) *
ab * txCoverageProfiles[[tx]]$aveFragLength
as.data.frame(m) %>% dplyr::mutate(transcript = tx)
}, error = function(e) NULL)
})
## Combine estimates for all transcripts
junctionPredCovs <- do.call(rbind, scaledcovs)
if (!strandSpecific) {
junctionPredCovs$strand <- "*"
}
## Sum coverages of the same junction from different transcripts. Each
## junction is present once per gene it is included in.
junctionPredCovs <- junctionPredCovs %>%
dplyr::group_by(seqnames, start, end, strand) %>%
dplyr::mutate(pred.cov = sum(pred.cov)) %>%
dplyr::rename(predCov = pred.cov) %>%
dplyr::ungroup() %>%
dplyr::left_join(tx2gene %>% dplyr::select(tx, gene),
by = c("transcript" = "tx")) %>%
dplyr::group_by(seqnames, start, end, strand, gene) %>%
dplyr::mutate(transcript = paste(transcript, collapse = ",")) %>%
dplyr::ungroup() %>%
dplyr::distinct() %>%
dplyr::mutate(method = methodName)
## Get coverage "note" for each transcript
covNotes <- vapply(transcripts, function(tx) {
txCoverageProfiles[[tx]]$note
}, character(1))
## Add gene info to transcript quant table
txQuants <- txQuants %>% dplyr::left_join(tx2gene %>% dplyr::select(tx, gene),
by = c("transcript" = "tx")) %>%
dplyr::mutate(method = methodName) %>%
dplyr::inner_join(data.frame(transcript = names(covNotes),
covNote = covNotes,
stringsAsFactors = FALSE),
by = "transcript") %>%
dplyr::select(transcript, gene, count, TPM, method, covNote)
list(junctionPredCovs = as.data.frame(junctionPredCovs),
txQuants = as.data.frame(txQuants))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.