## Wrappers for kallisto quantification of abundance of RNA-seq reads
################################################################################
#' Run kallisto on FASTQ files to quantify feature abundance
#'
#' Run the abundance quantification tool \code{kallisto} on a set of FASTQ
#' files. Requires \code{kallisto} (\url{http://pachterlab.github.io/kallisto/})
#' to be installed, and a kallisto feature index must have been generated prior
#' to using this function. See the kallisto website for installation and basic
#' usage instructions.
#'
#' @param targets_file character string giving the path to a tab-delimited text
#' file with either 2 columns (single-end reads) or 3 columns (paired-end reads)
#' that gives the sample names (first column) and FASTQ file names (column 2
#' and, if applicable, column 3). The file is assumed to have column headers, although these
#' are not used.
#' @param transcript_index character string giving the path to the kallisto
#' index to be used for the feature abundance quantification.
#' @param single_end logical, are single-end reads used, or paired-end reads?
#' @param output_prefix character string giving the prefix for the output folder
#' that will contain the kallisto results. The default is \code{"output"} and
#' the sample name (column 1 of \code{targets_file}) is appended (preceded by an
#' underscore).
#' @param fragment_length scalar integer or numeric giving the estimated
#' average fragment length. Required argument if \code{single_end} is \code{TRUE},
#' optional if \code{FALSE} (kallisto default for paired-end data is that the
#' value is estimated from the input data).
#' @param fragment_standard_deviation scalar numeric giving the estimated
#' standard deviation of read fragment length. Required argument if
#' \code{single_end} is \code{TRUE}, optional if \code{FALSE} (kallisto default
#' for paired-end data is that the value is estimated from the input data).
#' @param n_cores integer giving the number of cores (nodes/threads) to use for
#' the kallisto jobs. The package \code{parallel} is used. Default is 2 cores.
#' @param n_bootstrap_samples integer giving the number of bootstrap samples
#' that kallisto should use (default is 0). With bootstrap samples, uncertainty
#' in abundance can be quantified.
#' @param bootstrap_seed scalar integer or numeric giving the seed to use for
#' the bootstrap sampling (default used by kallisto is 42). Optional argument.
#' @param correct_bias logical, should kallisto's option to model and correct
#' abundances for sequence-specific bias be used? Requires kallisto version
#' 0.42.2 or higher.
#' @param plaintext logical, if \code{TRUE} then bootstrapping results are
#' returned in a plain text file rather than an HDF5
#' \url{https://www.hdfgroup.org/HDF5/} file.
#' @param kallisto_version character string indicating whether the version of
#' kallisto to be used is \code{"pre-0.42.2"} or \code{"current"}. This is
#' required because the kallisto developers changed the output file extensions
#' and added features in version 0.42.2.
#' @param verbose logical, should timings for the run be printed?
#' @param dry_run logical, if \code{TRUE} then a list containing the kallisto
#' commands that would be run and the output directories is returned. Can be
#' used to read in results if kallisto is run outside an R session or to produce
#' a script to run outside of an R session.
#' @param kallisto_cmd (optional) character string giving the full command to
#' use to call kallisto, if simply typing "kallisto" at the command line does
#' not give the required version of kallisto or does not work. Default is
#' simply "kallisto". If used, this argument should give the full path to the
#' desired kallisto binary.
#'
#' @details A kallisto transcript index can be built from a FASTA file:
#' \code{kallisto index [arguments] FASTA-file}. See the kallisto documentation
#' for further details.
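#'
#' A minimal sketch of building an index from within R, assuming the
#' \code{kallisto} binary is on the \code{PATH}; the file names
#' \code{"transcripts.fasta.gz"} and \code{"transcripts.idx"} are illustrative
#' only:
#' \preformatted{
#' system2("kallisto", c("index", "-i", "transcripts.idx", "transcripts.fasta.gz"))
#' }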
#'
#' @return A list with one element per sample for which feature abundance has
#' been quantified, each itself a list containing: (1) \code{kallisto_call},
#' the call used for kallisto, (2) \code{kallisto_log}, the log generated by
#' kallisto, and (3) \code{output_dir}, the directory in which the kallisto
#' results can be found.
#'
#' @importFrom parallel makeCluster
#' @importFrom parallel stopCluster
#' @importFrom parallel parLapply
#' @importFrom utils read.delim
#' @export
#' @examples
#' \dontrun{
#' ## If in kallisto's 'test' directory, then try these calls:
#' ## Generate 'targets.txt' file:
#' write.table(data.frame(Sample="sample1", File1="reads_1.fastq.gz", File2="reads_2.fastq.gz"),
#' file="targets.txt", quote=FALSE, row.names=FALSE, sep="\t")
#' kallisto_log <- runKallisto("targets.txt", "transcripts.idx", single_end=FALSE,
#' output_prefix="output", verbose=TRUE, n_bootstrap_samples=10,
#' dry_run = FALSE)
#' }
runKallisto <- function(targets_file, transcript_index, single_end = TRUE,
output_prefix = "output", fragment_length = NULL,
fragment_standard_deviation = NULL,
n_cores = 2, n_bootstrap_samples = 0,
bootstrap_seed = NULL, correct_bias = TRUE,
plaintext = FALSE, kallisto_version = "current",
verbose = TRUE, dry_run = FALSE,
kallisto_cmd = "kallisto") {
targets_dir <- paste0(dirname(targets_file), "/")
targets <- read.delim(targets_file, stringsAsFactors = FALSE, header = TRUE)
if ( !(ncol(targets) == 2 || ncol(targets) == 3) )
stop("Targets file must have either 2 columns (single-end reads) or
3 columns (paired-end reads). File should be tab-delimited with
column headers")
if ( ncol(targets) == 2 && !single_end ) {
warning("targets only has two columns; proceeding assuming single-end
reads")
single_end <- TRUE
}
if ( ncol(targets) == 3 && single_end ) {
warning("targets only has three columns, but 'single_end' was TRUE;
proceeding assuming paired-end reads")
single_end <- FALSE
}
samples <- targets[,1]
## For single-end reads, fragment_length and fragment_standard_deviation must be defined
if ( single_end && is.null(fragment_length) )
stop("If single-end reads are used, then fragment_length must be
defined. Either a scalar giving the average fragment length to use for all
samples, or a vector providing the average fragment length for each sample.")
if ( single_end && is.null(fragment_standard_deviation) )
stop("If single-end reads are used, then fragment_standard_deviation must be defined.")
paired_end <- !single_end
if ( correct_bias && kallisto_version == "pre-0.42.2" )
stop("Bias correction requires kallisto version 0.42.2 or higher.")
## Make sure that we'll be able to find the fastq files
targets[, 2] <- paste0(targets_dir, targets[, 2])
if ( ncol(targets) == 3 )
targets[, 3] <- paste0(targets_dir, targets[, 3])
## Generate calls to kallisto
output_dirs <- paste(output_prefix, samples, sep = "_")
kallisto_args <- paste("quant -i", transcript_index, "-o",
output_dirs, "-b", n_bootstrap_samples)
names(kallisto_args) <- samples
if ( single_end )
kallisto_args <- paste(kallisto_args, "--single")
if ( !is.null(fragment_length) )
kallisto_args <- paste(kallisto_args, "-l", fragment_length)
if ( !is.null(fragment_standard_deviation) )
kallisto_args <- paste(kallisto_args, "-s", fragment_standard_deviation)
if ( !is.null(bootstrap_seed) )
kallisto_args <- paste0(kallisto_args, " --seed=", bootstrap_seed)
if ( correct_bias && kallisto_version != "pre-0.42.2" )
kallisto_args <- paste0(kallisto_args, " --bias")
if ( plaintext ) # output bootstrap results in a plaintext file
kallisto_args <- paste0(kallisto_args, " --plaintext" )
kallisto_args <- paste(kallisto_args, targets[, 2])
if (paired_end)
kallisto_args <- paste(kallisto_args, targets[, 3])
##
if ( dry_run ) {
kallisto_log <- vector("list", length(samples))
names(kallisto_log) <- samples
kallisto_calls <- paste("kallisto", kallisto_args)
for (i in seq_len(length(kallisto_calls))) {
kallisto_log[[i]]$output_dir <- output_dirs[i]
kallisto_log[[i]]$kallisto_call <- kallisto_calls[i]
kallisto_log[[i]]$kallisto_log <-
"Dry run: kallisto commands not executed."
}
} else {
if (verbose)
print(paste("Analysis started: ", Sys.time()))
cl <- parallel::makeCluster(n_cores)
# one or more parLapply calls to kallisto
kallisto_log <- parallel::parLapply(cl, kallisto_args, .call_kallisto,
kallisto_cmd, verbose)
parallel::stopCluster(cl)
## Return log of kallisto jobs, so user knows where to find results
names(kallisto_log) <- samples
if (verbose) {
print(paste("Analysis completed: ", Sys.time()))
print(paste("Processed", length(samples), "samples"))
}
for (i in seq_len(length(kallisto_log))) {
kallisto_log[[i]]$output_dir <- output_dirs[i]
}
}
kallisto_log
}
.call_kallisto <- function(kcall, kallisto_cmd, verbose = TRUE) {
out <- tryCatch(system2(kallisto_cmd, kcall, stdout = TRUE, stderr = TRUE),
warning = function(w) {w}, error = function(e) {e})
list(kallisto_call = paste(kallisto_cmd, kcall), kallisto_log = out)
}
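### Hedged sketch (not run as package code): using dry_run = TRUE to inspect the
### kallisto commands that runKallisto would execute, and writing them out as a
### shell script to run outside of R. The file names 'targets.txt',
### 'transcripts.idx' and 'run_kallisto.sh' are illustrative only.
# dry <- runKallisto("targets.txt", "transcripts.idx", single_end = FALSE,
#                    output_prefix = "output", n_bootstrap_samples = 10,
#                    dry_run = TRUE)
# cmds <- vapply(dry, function(x) x$kallisto_call, character(1))
# writeLines(cmds, "run_kallisto.sh")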
### Possible test for kallisto wrapper, but seems to leave tests hanging (not clear why)
# context("kallisto tests on inputs")
#
# test_that("tests for presence of average fragment length if single-end reads", {
# expect_that(runKallisto("../extdata/targets.txt",
# "../extdata/transcripts.idx",
# output_prefix="../extdata/output", n_cores=12,
# fragment_length=NULL, verbose=TRUE),
# gives_warning("targets has three columns, but 'single_end' was TRUE; proceeding assuming paired-end reads"))
# })
#
#
################################################################################
#' Read kallisto results for a single sample into a list
#'
#' @param directory character string giving the path to the directory containing
#' the kallisto results for the sample.
#' @param read_h5 logical, if \code{TRUE} then read in bootstrap results from
#' the HDF5 object produced by kallisto.
#' @param kallisto_version character string indicating whether the version of
#' kallisto to be used is \code{"pre-0.42.2"} or \code{"current"}. This is
#' required because the kallisto developers changed the output file extensions
#' and added features in version 0.42.2.
#'
#' @details The directory is expected to contain results for just a single
#' sample. Putting more than one sample's results in the directory will result
#' in unpredictable behaviour with this function. The function looks for the
#' files (with the default names given by kallisto) 'abundance.tsv' (or
#' 'abundance.txt' for kallisto versions before 0.42.2), 'run_info.json' and
#' (if \code{read_h5=TRUE}) 'abundance.h5'. If these files are missing, or if
#' results files have different names, then this function will not find them.
#'
#' @return A list with two elements: (1) a data.frame \code{abundance} with
#' columns for 'target_id' (feature, transcript, gene etc), 'length' (feature
#' length), 'eff_length' (effective feature length), 'est_counts' (estimated
#' feature counts), 'tpm' (transcripts per million) and possibly many columns
#' containing bootstrap estimated counts; and (2) a list \code{run_info} with
#' details about the kallisto run that generated the results.
#'
#' @importFrom data.table fread
#' @importFrom rhdf5 h5read
#' @importFrom rhdf5 H5close
#' @importFrom rjson fromJSON
#' @export
#' @examples
#' # If kallisto results are in the directory "output", then call:
#' # readKallistoResultsOneSample("output")
readKallistoResultsOneSample <- function(directory, read_h5=FALSE,
kallisto_version = "current") {
## Read in abundance information for the sample
if ( kallisto_version == "pre-0.42.2" )
file_to_read <- paste0(directory, "/abundance.txt")
else
file_to_read <- paste0(directory, "/abundance.tsv")
if ( file.exists(file_to_read) ) {
## columns are target_id, length, eff_length, est_counts, tpm
abundance <- data.table::fread(file_to_read, sep = "\t",
colClasses = c("character", "numeric", "numeric", "numeric", "numeric"))
abundance$est_counts <- as.numeric(abundance$est_counts)
abundance$tpm <- as.numeric(abundance$tpm)
abundance$eff_length <- as.numeric(abundance$eff_length)
} else
stop(paste("File", file_to_read, "not found or does not exist. Please check directory is correct."))
## Read in run information
run_info <- rjson::fromJSON(file = paste0(directory, "/run_info.json"))
## Read in HDF5 data file with bootstrap results
if ( read_h5 ) {
file_to_read <- paste0(directory, "/abundance.h5")
if ( file.exists(file_to_read) ) {
h5 <- rhdf5::h5read(file_to_read, "/")
rhdf5::H5close()
} else {
stop(paste("File", file_to_read, "not found or does not exist. Please check directory is correct and that you want to read results in HDF5 format."))
}
if ( h5$aux$num_bootstrap > 0 ) {
boot_mat <- data.frame(matrix(unlist(h5$bootstrap),
nrow = length(h5$est_counts),
ncol = h5$aux$num_bootstrap))
colnames(boot_mat) <- names(h5$bootstrap)
abundance <- cbind(abundance, boot_mat)
}
}
list(abundance = abundance, run_info = run_info)
}
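### Hedged sketch (not run as package code): reading results for one sample,
### including bootstrap estimates from the HDF5 file. The directory name
### "output_sample1" is illustrative; it should be a directory produced by
### runKallisto (or by kallisto run directly).
# res <- readKallistoResultsOneSample("output_sample1", read_h5 = TRUE)
# head(res$abundance)        ## target_id, length, eff_length, est_counts, tpm (+ bootstrap columns)
# res$run_info$n_bootstraps  ## number of bootstrap samples recorded by kallisto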
################################################################################
#' Read kallisto results from a batch of jobs
#'
#' After generating transcript/feature abundance results using kallisto for a
#' batch of samples, read these abundance values into an \code{SCESet} object.
#'
#' @param kallisto_log list, generated by \code{runKallisto}. If provided, then
#' \code{samples} and \code{directories} arguments are ignored.
#' @param samples character vector providing a set of sample names to use for
#' the abundance results.
#' @param directories character vector providing a set of directories containing
#' kallisto abundance results to be read in.
#' @param read_h5 logical, should the bootstrap results be read in from the HDF5
#' objects produced by kallisto?
#' @param kallisto_version character string indicating whether the version of
#' kallisto to be used is \code{"pre-0.42.2"} or \code{"current"}. This is
#' required because the kallisto developers changed the output file extensions
#' and added features in version 0.42.2.
#' @param logExprsOffset numeric scalar, providing the offset used when doing
#' log2-transformations of expression data to avoid trying to take logs of zero.
#' Default offset value is \code{1}.
#' @param verbose logical, should function provide output about progress?
#'
#' @details This function expects to find only one set of kallisto abundance
#' results per directory; multiple abundance results in a given directory will
#' be problematic.
#'
#' @return an SCESet object
#'
#' @importFrom matrixStats rowMedians
#' @export
#' @examples
#' \dontrun{
#' kallisto_log <- runKallisto("targets.txt", "transcripts.idx", single_end=FALSE,
#' output_prefix="output", verbose=TRUE, n_bootstrap_samples=10)
#' sceset <- readKallistoResults(kallisto_log)
#' }
#'
readKallistoResults <- function(kallisto_log = NULL, samples = NULL,
directories = NULL, read_h5 = FALSE,
kallisto_version = "current",
logExprsOffset = 1,
verbose = TRUE) {
kallisto_fail <- rep(FALSE, length(samples))
## Checks on arguments
if ( !is.null(kallisto_log) ) {
cat("Using kallisto_log argument to define samples and results directories.")
if ( !is.list(kallisto_log) )
stop("The kallisto_log argument should be a list returned by runKallisto()")
samples <- names(kallisto_log)
directories <- sapply(kallisto_log, function(x) {x$output_dir})
logs <- lapply(kallisto_log, function(x) {x$kallisto_log})
## Can only check kallisto fail if log provided
kallisto_fail <- sapply(logs, function(x) {
any(grepl("[wW]arning|[eE]rror", x))})
if ( any(kallisto_fail) ) {
warning(paste0("The kallisto job failed for the following samples:\n ",
paste0(names(logs)[kallisto_fail], collapse = "\n"),
"\n It is recommended that you inspect kallisto_log for these samples."))
}
} else {
cat("Kallisto log not provided - assuming all runs successful")
if ( is.null(samples) | is.null(directories) )
stop("If kallisto_log argument is not used, then both samples and directories must be provided.")
if ( length(samples) != length(directories) )
stop("samples and directories arguments must be the same length")
}
samples <- samples[!kallisto_fail]
directories <- directories[!kallisto_fail]
if ( !all(dir.exists(directories)) )
stop("Some of the desired directories to import do not exist!")
## Read first file to get size of feature set
s1 <- readKallistoResultsOneSample(directories[1], read_h5 = read_h5,
kallisto_version = kallisto_version)
nsamples <- length(samples)
nfeatures <- nrow(s1$abundance)
nbootstraps <- s1$run_info$n_bootstraps
navec_samples <- rep(NA, nsamples)
## Set up results objects
pdata <- data.frame(n_targets = navec_samples, n_processed = navec_samples,
n_bootstraps = navec_samples, kallisto_version = navec_samples,
index_version = navec_samples, start_time = navec_samples,
call = navec_samples)
rownames(pdata) <- samples
fdata <- data.frame(feature_id = s1$abundance$target_id,
feature_length = s1$abundance$length,
feature_eff_length = s1$abundance$eff_length)
rownames(fdata) <- s1$abundance$target_id
est_counts <- tpm <- feat_eff_len <-
matrix(NA, nrow = nfeatures, ncol = nsamples)
colnames(est_counts) <- colnames(tpm) <- colnames(feat_eff_len) <- samples
rownames(est_counts) <- rownames(tpm) <- rownames(feat_eff_len) <-
s1$abundance$target_id
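## 3-D array to hold bootstrap estimates: features x samples x bootstrap draws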
if ( read_h5 ) {
bootstraps <- array(NA, dim = c(nfeatures, nsamples, nbootstraps))
rownames(bootstraps) <- s1$abundance$target_id
colnames(bootstraps) <- samples
}
## Read kallisto results into results objects
if ( verbose )
cat(paste("\nReading results for", nsamples, "samples:\n"))
for (i in seq_len(nsamples)) {
tmp_samp <- readKallistoResultsOneSample(
directories[i], read_h5 = read_h5,
kallisto_version = kallisto_version)
## counts
if ( length(tmp_samp$abundance$est_counts) != nfeatures )
warning(paste("Results for directory", directories[i], "do not match dimensions of other samples."))
else
est_counts[,i] <- tmp_samp$abundance$est_counts
## tpm
if ( length(tmp_samp$abundance$est_counts) == nfeatures )
tpm[,i] <- tmp_samp$abundance$tpm
## feature effective length
if ( length(tmp_samp$abundance$eff_length) == nfeatures )
feat_eff_len[, i] <- tmp_samp$abundance$eff_length
## run info
pdata$n_targets[i] <- tmp_samp$run_info$n_targets
pdata$n_processed[i] <- tmp_samp$run_info$n_processed
pdata$n_bootstraps[i] <- tmp_samp$run_info$n_bootstraps
pdata$kallisto_version[i] <- tmp_samp$run_info$kallisto_version
pdata$index_version[i] <- tmp_samp$run_info$index_version
pdata$start_time[i] <- tmp_samp$run_info$start_time
pdata$call[i] <- tmp_samp$run_info$call
## bootstraps
if ( read_h5 )
bootstraps[,i,] <- as.matrix(tmp_samp$abundance[,-c(1:5)])
if ( verbose ) {
cat(".")
if ( i %% 80 == 0)
cat("\n")
}
}
## Add median feature effective length to fData
fdata$median_effective_length <- matrixStats::rowMedians(feat_eff_len)
if ( verbose )
cat("\n")
## Produce SCESet object
pdata <- new("AnnotatedDataFrame", pdata)
fdata <- new("AnnotatedDataFrame", fdata)
sce_out <- newSCESet(exprsData = log2(tpm + logExprsOffset),
phenoData = pdata, featureData = fdata,
countData = est_counts,
logExprsOffset = logExprsOffset,
lowerDetectionLimit = 0)
tpm(sce_out) <- tpm
set_exprs(sce_out, "feature_effective_length") <- feat_eff_len
if ( verbose )
cat("Using log2(TPM + 1) as 'exprs' values in output.")
if ( read_h5 )
bootstraps(sce_out) <- bootstraps
## Return SCESet object
sce_out
}
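### Hedged sketch (not run as package code): a typical end-to-end workflow,
### quantifying abundances with runKallisto and collecting them into an SCESet.
### "targets.txt" and "transcripts.idx" are illustrative file names; accessors
### such as exprs() and tpm() are those used with this package's SCESet class.
# kallisto_log <- runKallisto("targets.txt", "transcripts.idx",
#                             single_end = FALSE, output_prefix = "output",
#                             n_bootstrap_samples = 10, verbose = TRUE)
# sce <- readKallistoResults(kallisto_log)
# head(exprs(sce))  ## log2(TPM + logExprsOffset) values
# head(tpm(sce))    ## transcripts-per-million matrix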