#' Import transcript-level abundances and counts
#'
#' Import RNA-seq counts using
#' [tximport](https://doi.org/doi:10.18129/B9.bioc.tximport).
#'
#' Normalized counts are loaded as length-scaled transcripts per million.
#' Consult this [vignette](https://goo.gl/h6fm15) for more information.
#'
#' @author Michael Steinbaugh, Rory Kirchner
#' @keywords internal
#' @noRd
#'
#' @note Ignoring transcript versions should work by default. There may be
#' an issue with genomes containing non-Ensembl transcript IDs, such as
#' C. elegans, although we need to double check.
#' @note Updated 2022-03-07.
#'
#' @param sampleDirs `character`.
#' Sample directories to import.
#'
#' @param type `character(1)`.
#' Expression caller to use.
#'
#' @seealso
#' - `tximport::tximport()`.
#'
#' @return `list`.
.tximport <-
function(sampleDirs,
type = c("salmon", "kallisto", "sailfish"),
txOut = FALSE,
countsFromAbundance = "lengthScaledTPM",
tx2gene,
ignoreTxVersion = TRUE) {
assert(
all(isDir(sampleDirs)),
identical(names(sampleDirs), makeNames(basename(sampleDirs))),
isFlag(txOut)
)
if (isTRUE(txOut)) {
assert(is.null(tx2gene))
} else {
assert(is(tx2gene, "TxToGene"))
}
type <- match.arg(type)
countsFromAbundance <- match.arg(
arg = countsFromAbundance,
choices = eval(formals(tximport)[["countsFromAbundance"]])
)
## Locate the counts files ---------------------------------------------
subdirs <- file.path(sampleDirs, type)
assert(all(isDir(subdirs)))
basename <- ifelse(
test = identical(type, "kallisto"),
yes = "abundance.h5",
## salmon, sailfish.
no = "quant.sf"
)
files <- file.path(subdirs, basename)
assert(all(isFile(files)))
names(files) <- names(sampleDirs)
## tx2gene -------------------------------------------------------------
if (isFALSE(txOut)) {
tx2gene <- as.data.frame(tx2gene)
if (!identical(
x = c("txId", "txName"),
y = colnames(tx2gene)
)) {
colnames(tx2gene) <- c("txId", "txName")
}
## Ensure transcript IDs are stripped from tx2gene, if desired.
if (isTRUE(ignoreTxVersion)) {
tx2gene[["txId"]] <- stripTranscriptVersions(tx2gene[["txId"]])
rownames(tx2gene) <- tx2gene[["txId"]]
}
}
## Import counts using tximport ----------------------------------------
## Note that this step can take a long time when processing a lot of
## samples, and is recommended to be run on an HPC cluster, rather than
## locally.
alert(sprintf(
fmt = paste(
"Importing {.pkg %s} transcript-level counts from {.file %s}",
"files using {.pkg tximport} %s."
),
type, basename(files[[1L]]), packageVersion("tximport")
))
dl(c(
"countsFromAbundance" = countsFromAbundance,
"txOut" = txOut
))
## We're using a `do.call()` approach here so we can apply version-
## specific tximport fixes, if necessary.
args <- list(
"files" = files,
"type" = type,
"txIn" = TRUE,
"txOut" = txOut,
"countsFromAbundance" = countsFromAbundance,
"tx2gene" = tx2gene,
"ignoreTxVersion" = ignoreTxVersion,
"geneIdCol" = "geneId",
"txIdCol" = "txId"
)
txi <- do.call(what = tximport, args = args)
assert(isSubset(c("abundance", "counts", "length"), names(txi)))
## Handle edge case kallisto GENCODE transcript identifier sanitization.
## https://support.bioconductor.org/p/9149475/
if (allAreMatchingFixed(x = rownames(txi[["counts"]]), pattern = "|")) {
alert(sprintf(
"Sanitizing transcript identifiers in {.fun %s} return.",
"tximport"
))
sanitizeTranscriptIds <- function(x) {
sub(pattern = "^([^\\|]+)\\|.+$", replacement = "\\1", x = x)
}
rownames(txi[["abundance"]]) <-
sanitizeTranscriptIds(rownames(txi[["abundance"]]))
rownames(txi[["counts"]]) <-
sanitizeTranscriptIds(rownames(txi[["counts"]]))
rownames(txi[["length"]]) <-
sanitizeTranscriptIds(rownames(txi[["length"]]))
}
## Run assert checks before return.
invisible(lapply(
X = txi[c("abundance", "counts", "length")],
FUN = function(x) {
assert(
is.matrix(x),
identical(colnames(x), names(sampleDirs))
)
}
))
assert(isTximport(txi))
txi
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.