## Wrapper for tximport package for reading in transcript quantification data
#' Read transcript quantification data with tximport package
#'
#' After generating transcript/feature abundance results using kallisto, Salmon,
#' Sailfish or RSEM for a batch of samples, read these abundance values into an
#' \code{SCESet} object.
#'
#' @param samples character vector providing a set of sample names to use for
#' the abundance results.
#' @param files character vector providing a set of filenames containing
#' kallisto abundance results to be read in.
#' @param log list (optional), generated by \code{runKallisto}. If provided, then
#' \code{samples} and \code{files} arguments are ignored.
#' @param type character, the type of software used to generate the abundances.
#' Options are "kallisto", "salmon", "sailfish", "rsem". This argument is passed
#' to \code{\link[tximport]{tximport}}.
#' @param txOut logical, whether the function should just output transcript-level (default TRUE)
#' @param logExprsOffset numeric scalar, providing the offset used when doing
#' log2-transformations of expression data to avoid trying to take logs of zero.
#' Default offset value is \code{1}.
#' @param verbose logical, should function provide output about progress?
#' @param ... optional parameters passed to \code{\link[tximport]{tximport}}.
#' See documentation for \code{\link[tximport]{tximport}} for options and
#' details.
#'
#' @details Note: tximport does not import bootstrap estimates from kallisto,
#' Salmon, or Sailfish. If you want bootstrap estimates use the
#' \code{\link{readKallistoResults}} or \code{\link{readSalmonResults}}
#' functions.
#'
#' @return an \code{SCESet} object containing the abundance, count and feature
#' length data from the supplied samples.
#'
#' @importFrom tximport tximport
#' @export
#' @references
#' Soneson C, Love MI, Robinson MD. Differential analyses for RNA-seq: transcript-level estimates improve gene-level inferences. F1000Res. 2015;4: 1521.
#'
#' @examples
#' \dontrun{
#' ## this example requires installation of the tximportData package from
#' ## Bioconductor
#' library(tximportData)
#' dir <- system.file("extdata", package = "tximportData")
#' list.files(dir)
#' samples <- read.table(file.path(dir, "samples.txt"), header = TRUE)
#' samples
#' directories <- file.path(dir, "kallisto", samples$run)
#' names(directories) <- paste0("sample", 1:6)
#' files <- file.path(directories, "abundance.tsv")
#' sce_example <- readTxResults(samples = names(directories),
#' files = files, type = "kallisto")
#'
#' ## for faster reading of results use the read_tsv function from the readr pkg
#' library(readr)
#' sce_example <- readTxResults(samples = names(directories),
#' files = files, type = "kallisto", reader = read_tsv)
#' }
#'
readTxResults <- function(samples = NULL, files = NULL,
log = NULL, type = "kallisto", txOut = TRUE,
logExprsOffset = 1, verbose = TRUE, ...) {
## allow for some failed runs
tx_fail <- rep(FALSE, length(samples))
## Checks on arguments
if ( !is.null(log) && verbose ) {
cat("Using log argument to define samples and results directories.")
if ( !is.list(log) )
stop("The kallisto_log argument should be a list returned by runKallisto()")
samples <- names(log)
directories <- vapply(log, function(x) {x$output_dir}, character(1))
logs <- lapply(log, function(x) {x$kallisto_log})
## Can only check kallisto fail if log provided
tx_fail <- vapply(logs, function(x) {
any(grepl("[wW]arning|[eE]rror", x))}, logical(1))
if ( any(tx_fail) ) {
warning(paste0("The transcript quantification job failed for the following samples:\n ",
paste0(names(logs)[tx_fail], collapse = "\n"),
"\n It is recommended that you inspect the log object for these samples."))
}
} else {
if ( verbose )
cat("Kallisto log not provided - assuming all runs successful")
if ( is.null(samples) | is.null(files) )
stop("If kallisto_log argument is not used, then both samples and files must be provided.")
if ( length(samples) != length(files) )
stop("samples and files arguments must be the same length")
}
if ( !is.null(log) ) {
## define sample names and directories
samples <- samples[!tx_fail]
directories <- directories[!tx_fail]
files <- file.path(directories, "abundance.tsv")
}
## Read tx results into results objects
names(samples) <- NULL # found that named vector can mess things up
if ( !all(file.exists(files)) )
stop("Some of the files provided do not exist!")
txi <- tximport::tximport(files, type = type, txOut = txOut, ...)
## define pdata and fdata
nsamples <- length(samples)
nfeatures <- nrow(txi$abundance)
nbootstraps <- rep(0, nsamples)
## Set up results objects
pdata <- data.frame(n_features = rep(nfeatures, nsamples),
n_bootstraps = nbootstraps)
rownames(pdata) <- samples
fdata <- data.frame(feature_id = rownames(txi$abundance),
ave_feature_length = rowMeans(txi$length))
rownames(fdata) <- rownames(txi$abundance)
est_counts <- txi$counts
tpm <- txi$abundance
feature_length <- txi$length
colnames(est_counts) <- colnames(tpm) <- colnames(feature_length) <- samples
rownames(est_counts) <- rownames(tpm) <- rownames(feature_length) <-
rownames(txi$abundance)
## Produce SCESet object
pdata <- new("AnnotatedDataFrame", pdata)
fdata <- new("AnnotatedDataFrame", fdata)
sce_out <- newSCESet(exprsData = log2(tpm + logExprsOffset),
phenoData = pdata, featureData = fdata,
countData = est_counts,
logExprsOffset = logExprsOffset,
lowerDetectionLimit = 0)
tpm(sce_out) <- tpm
set_exprs(sce_out, "feature_length") <- feature_length
if ( verbose )
cat("Using log2(TPM + logExprsOffset) as 'exprs' values in output.")
## Return SCESet object
sce_out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.