# Make a summary table for a selection of barcodes in cbtable
# colName should be the name of a logical column in cbtable, indicating
# inclusion/exclusion with respect to the barcode collection
# cbName will be used for display purposes, to identify the barcode collection
.makeSummaryTable <- function(cbtable, colName, cbName = "",
countCol = "collapsedFreq", quantmat = NULL) {
stopifnot(colName %in% colnames(cbtable))
stopifnot(is.logical(cbtable[[colName]]))
if (countCol == "nbrMappedUMI") {
readtype <- " mapped"
} else {
readtype <- ""
}
df <- list()
df[[paste0("Number of barcodes", cbName)]] <-
as.character(sum(cbtable[[colName]], na.rm = TRUE))
df[[paste0("Number of barcodes with quantification", cbName)]] <-
as.character(sum(cbtable[[colName]] & !is.na(cbtable$mappingRate),
na.rm = TRUE))
df[[paste0("Fraction", readtype, " reads in barcodes", cbName)]] <-
as.character(paste0(signif(
100 * sum(cbtable[[countCol]][cbtable[[colName]]],
na.rm = TRUE) /
sum(cbtable$originalFreq, na.rm = TRUE), 4), "%"))
df[[paste0("Mean number of", readtype, " reads per cell", cbName)]] <-
as.character(round(mean(
cbtable[[countCol]][cbtable[[colName]]],
na.rm = TRUE)))
df[[paste0("Median number of", readtype, " reads per cell", cbName)]] <-
as.character(round(stats::median(
cbtable[[countCol]][cbtable[[colName]]],
na.rm = TRUE)))
df[[paste0("Mean number of detected genes per cell", cbName)]] <-
as.character(round(mean(
cbtable$nbrGenesAboveZero[cbtable[[colName]]],
na.rm = TRUE)))
df[[paste0("Median number of detected genes per cell", cbName)]] <-
as.character(round(stats::median(
cbtable$nbrGenesAboveZero[cbtable[[colName]]],
na.rm = TRUE)))
if (!is.null(quantmat)) {
df[[paste0("Total number of detected genes", cbName)]] <-
as.character(sum(
rowSums(quantmat[, colnames(quantmat) %in%
cbtable$CB[cbtable[[colName]]],
drop = FALSE]) > 0))
}
df[[paste0("Mean UMI count per cell", cbName)]] <-
as.character(round(mean(
cbtable$totalUMICount[cbtable[[colName]]],
na.rm = TRUE)))
df[[paste0("Median UMI count per cell", cbName)]] <-
as.character(round(stats::median(
cbtable$totalUMICount[cbtable[[colName]]],
na.rm = TRUE)))
df <- t(data.frame(df,
stringsAsFactors = FALSE,
check.names = FALSE))
df
}
#' Read alevin data required to generate summary report
#'
#' Read all alevin output files required to generate the summary report or shiny
#' app.
#'
#' @param baseDir Path to the output directory from the alevin run (should be
#' the directory containing the \code{alevin} directory).
#' @param customCBList Named list with custom set(s) of barcodes to provide
#' summary statistics/plots for, in addition to the whitelists generated by
#' alevin.
#'
#' @author Charlotte Soneson
#'
#' @export
#'
#' @importFrom utils read.delim
#' @import dplyr
#' @importFrom rjson fromJSON
#' @importFrom tximport tximport
#'
#' @return A list collecting all necessary information for generating the
#' summary report/shiny app.
#'
#' @examples
#' alevin <- readAlevinQC(system.file("extdata/alevin_example_v0.14",
#' package = "alevinQC"))
#'
readAlevinQC <- function(baseDir, customCBList = list()) {
if (!is.list(customCBList)) {
stop("'customCBList' must be a (possibly empty) list")
}
if (length(customCBList) > 0 && (any(is.null(names(customCBList))) ||
any(names(customCBList) == ""))) {
stop("'customCBList' must be a named list")
}
## Check that all required files are available, stop if not
infversion <- checkAlevinInputFiles(baseDir)
## Depending on the inferred version, read alevin output files
if (infversion == "pre0.14") {
## pre-v0.14
.readAlevinQC_pre0.14(baseDir = baseDir, customCBList = customCBList)
} else if (infversion == "v0.14") {
## v0.14 or newer, final whitelist inferred from data
.readAlevinQC_v0.14(baseDir = baseDir, customCBList = customCBList,
type = "standard")
} else if (infversion == "v0.14extwl") {
## v0.14 or newer, external whitelist provided
.readAlevinQC_v0.14(baseDir = baseDir, customCBList = customCBList,
type = "extwl")
} else if (infversion == "v0.14nowl") {
## v0.14 or newer, no whitelist.txt file but no external whitelist
.readAlevinQC_v0.14(baseDir = baseDir, customCBList = customCBList,
type = "nowl")
} else {
stop("Unidentifiable alevin output")
}
}
.readAlevinQC_pre0.14 <- function(baseDir, customCBList = list()) {
## No longer supported by tximport (version >= 1.17.4)
# nocov start
alevinDir <- file.path(baseDir, "alevin")
## Raw CB frequencies (assumed to be in descending order)
rawcbfreq <- utils::read.delim(file.path(alevinDir, "raw_cb_frequency.txt"),
header = FALSE, as.is = TRUE) %>%
dplyr::rename(CB = V1, originalFreq = V2) %>%
dplyr::mutate(ranking = seq_len(length(CB)))
## First set of whitelisted CBs (quantified)
filtcbfreq <- utils::read.delim(file.path(alevinDir,
"filtered_cb_frequency.txt"),
header = FALSE, as.is = TRUE) %>%
dplyr::rename(CB = V1, collapsedFreq = V2)
## FeatureDump
## dedupRate = nbr deduplicated UMI counts/nbr mapped reads
## nbrGenesAboveMean = nbr genes with count > mean gene count
featuredump <- utils::read.delim(file.path(alevinDir, "featureDump.txt"),
header = FALSE, as.is = TRUE) %>%
dplyr::rename(CB = V1, mappingRate = V2, duplicationRate = V3,
dedupRate = V4, nbrGenesAboveMean = V5)
## Mapped UMI
mappedumi <- utils::read.delim(file.path(alevinDir, "MappedUmi.txt"),
header = FALSE, as.is = TRUE) %>%
dplyr::rename(CB = V1, nbrMappedUMI = V2)
## Final set of whitelisted CBs
finalwhitelist <- utils::read.delim(file.path(alevinDir, "whitelist.txt"),
header = FALSE, as.is = TRUE)$V1
## Quantification
quantmat <- tximport::tximport(file.path(baseDir, "alevin/quants_mat.gz"),
type = "alevin")$counts
quants <- data.frame(CB = colnames(quantmat),
totalUMICount = colSums(quantmat),
nbrGenesAboveZero = colSums(quantmat > 0),
stringsAsFactors = FALSE)
## Merge information about quantified CBs
quantbcs <- filtcbfreq %>%
dplyr::full_join(featuredump, by = "CB") %>%
dplyr::full_join(mappedumi, by = "CB") %>%
dplyr::full_join(quants, by = "CB") %>%
dplyr::mutate(inFinalWhiteList = CB %in% finalwhitelist) %>%
dplyr::mutate(inFirstWhiteList = TRUE)
cbtable <- dplyr::full_join(
rawcbfreq,
quantbcs
) %>% dplyr::mutate(inFirstWhiteList = replace(inFirstWhiteList,
is.na(inFirstWhiteList),
FALSE),
inFinalWhiteList = replace(inFinalWhiteList,
is.na(inFinalWhiteList),
FALSE))
## Add information from custom barcode sets
customCBsummary <- list()
for (i in seq_along(customCBList)) {
nm <- paste0("customCB__", names(customCBList)[i])
cbtable[[nm]] <- cbtable$CB %in% customCBList[[i]]
efr <- signif(100 * sum(cbtable[[nm]])/length(customCBList[[i]]), 4)
qfr <- signif(100 * sum(cbtable[[nm]] & !is.na(cbtable$mappingRate))/
length(customCBList[[i]]), 4)
message(efr, "% of barcodes in custom barcode set ",
names(customCBList)[i], " were found in the data set (",
qfr, "% were quantified)")
customCBsummary[[nm]] <- .makeSummaryTable(
cbtable = cbtable,
colName = nm,
cbName = paste0(" (", gsub("customCB__", "", nm), ")"),
countCol = "collapsedFreq",
quantmat = quantmat)
}
## Meta information and command information
metainfo <- rjson::fromJSON(file = file.path(baseDir,
"aux_info/meta_info.json"))
cmdinfo <- rjson::fromJSON(file = file.path(baseDir, "cmd_info.json"))
## Create "version info" table
versiontable <- t(data.frame(
`Start time` = metainfo$start_time,
`Salmon version` = metainfo$salmon_version,
`Index` = cmdinfo$index,
`R1file` = paste(cmdinfo$mates1,
collapse = ", "),
`R2file` = paste(cmdinfo$mates2,
collapse = ", "),
`tgMap` = cmdinfo$tgMap,
`Library type` = metainfo$library_types,
stringsAsFactors = FALSE,
check.names = FALSE
))
## Create summary tables
summarytable_full <- t(data.frame(
`Total number of processed reads` =
as.character(metainfo$num_processed),
`Number of reads with valid cell barcode (no Ns)` =
as.character(round(sum(rawcbfreq$originalFreq, na.rm = TRUE))),
`Number of mapped reads` = metainfo$num_mapped,
`Percent mapped` = metainfo$percent_mapped,
`Total number of observed cell barcodes` =
as.character(length(unique(cbtable$CB))),
stringsAsFactors = FALSE,
check.names = FALSE
))
summarytable_initialwl <- .makeSummaryTable(
cbtable = cbtable,
colName = "inFirstWhiteList",
cbName = " (initial whitelist)",
countCol = "collapsedFreq",
quantmat = quantmat
)
summarytable_finalwl <- .makeSummaryTable(
cbtable = cbtable,
colName = "inFinalWhiteList",
cbName = " (final whitelist)",
countCol = "collapsedFreq",
quantmat = quantmat
)
## Return
list(cbTable = cbtable, versionTable = versiontable,
summaryTables = c(list(fullDataset = summarytable_full,
initialWhitelist = summarytable_initialwl,
finalWhitelist = summarytable_finalwl),
customCBsummary),
type = "pre0.14"
)
# nocov end
}
.readAlevinQC_v0.14 <- function(baseDir, customCBList = list(), type = "standard") {
alevinDir <- file.path(baseDir, "alevin")
## Raw CB frequencies (assumed to be in descending order)
rawcbfreq <- utils::read.delim(file.path(alevinDir, "raw_cb_frequency.txt"),
header = FALSE, as.is = TRUE) %>%
dplyr::rename(CB = V1, originalFreq = V2) %>%
dplyr::mutate(ranking = seq_len(length(CB)))
if (!all(diff(rawcbfreq$originalFreq) <= 0)) {
warning("The raw CB frequencies are not sorted in decreasing order")
}
## FeatureDump
featuredump <- try({
utils::read.delim(file.path(alevinDir, "featureDump.txt"),
header = TRUE, as.is = TRUE, sep = "\t")
}, silent = TRUE)
if (is(featuredump, "try-error")) {
if (grepl("more columns than column names", featuredump)) {
warning("The 'featureDump.txt' file could not be cleanly read. ",
"Please check results carefully (e.g. by reading the ",
"input manually and checking the 'cbTable' output.")
## In alevin 1.0.0 (fixed in 1.1.0), with certain flags,
## the ArborescenceCount column could be split over multiple columns
tmpcn <- unlist(utils::read.delim(
file.path(alevinDir, "featureDump.txt"),
header = FALSE, as.is = TRUE, sep = "\t", nrows = 1
))
tmpdt <- utils::read.delim(
file.path(alevinDir, "featureDump.txt"),
header = FALSE, as.is = TRUE, sep = "\t", skip = 1
)
if (length(tmpcn) != ncol(tmpdt)) {
tmpdt <- tmpdt[, seq_len(length(tmpcn)), drop = FALSE]
}
colnames(tmpdt) <- tmpcn
featuredump <- tmpdt
} else {
stop("The 'featureDump.txt' file could not be read.")
}
}
featuredump <- featuredump %>%
dplyr::rename(mappingRate = MappingRate,
collapsedFreq = CorrectedReads,
dedupRate = DedupRate,
nbrGenesAboveMean = NumGenesOverMean,
nbrMappedUMI = MappedReads,
totalUMICount = DeduplicatedReads,
nbrGenesAboveZero = NumGenesExpressed)
if (type == "standard") {
## Final set of whitelisted CBs
finalwhitelist <- utils::read.delim(
file.path(alevinDir, "whitelist.txt"),
header = FALSE, as.is = TRUE)$V1
}
## Meta information and command information
metainfo <- rjson::fromJSON(file = file.path(baseDir,
"aux_info/meta_info.json"))
cmdinfo <- rjson::fromJSON(file = file.path(baseDir, "cmd_info.json"))
alevinmetainfo <- rjson::fromJSON(
file = file.path(baseDir, "aux_info/alevin_meta_info.json"))
## Merge information about quantified CBs
cbtable <- dplyr::full_join(
rawcbfreq,
featuredump,
by = "CB"
)
if (type == "standard") {
## we have a whitelist.txt file representing the final whitelist
cbtable <- cbtable %>%
dplyr::mutate(inFinalWhiteList = CB %in% finalwhitelist) %>%
dplyr::mutate(
inFirstWhiteList = ranking <= alevinmetainfo$initial_whitelist
)
## Check if there is any barcode that is not in the first whitelist,
## but which has an original ranking lower than any barcode that is
## in the first whitelist, and remove it.
toremove <-
!cbtable$inFirstWhiteList &
cbtable$ranking <= max(cbtable$ranking[cbtable$inFirstWhiteList])
if (any(toremove)) {
warning("Excluding ", sum(toremove), " unquantified barcode",
ifelse(sum(toremove) > 1, "s", ""),
" with higher original frequency than barcodes ",
"included in the first whitelist: ",
paste0(cbtable$CB[toremove], collapse = ", "))
cbtable <- cbtable[!toremove, ]
}
} else if (type == "nowl") {
## we have an indication of the size of the initial whitelist, but
## no final whitelist (and no whitelist.txt file).
## the final number of considered CBs should be those in the initial
## whitelist
cbtable <- cbtable %>%
dplyr::mutate(
inFirstWhiteList = ranking <= alevinmetainfo$initial_whitelist
) %>%
dplyr::mutate(inFinalWhiteList = inFirstWhiteList)
} else if (type == "extwl") {
## the CBs included in the featureDump file are the ones included
## in the external whitelist. There is no initial whitelisting step
cbtable <- cbtable %>%
dplyr::mutate(inFirstWhiteList = CB %in% featuredump$CB) %>%
dplyr::mutate(inFinalWhiteList = inFirstWhiteList)
}
## Add information from custom barcode sets
customCBsummary <- list()
for (i in seq_along(customCBList)) {
nm <- paste0("customCB__", names(customCBList)[i])
cbtable[[nm]] <- cbtable$CB %in% customCBList[[i]]
efr <- signif(100 * sum(cbtable[[nm]])/length(customCBList[[i]]), 4)
qfr <- signif(100 * sum(cbtable[[nm]] & !is.na(cbtable$mappingRate))/
length(customCBList[[i]]), 4)
message(efr, "% of barcodes in custom barcode set ",
names(customCBList)[i], " were found in the data set (",
qfr, "% were quantified)")
customCBsummary[[nm]] <- .makeSummaryTable(
cbtable = cbtable,
colName = nm,
cbName = paste0(" (", gsub("customCB__", "", nm), ")"),
countCol = "collapsedFreq",
quantmat = NULL)
}
## Create "version info" table
versiontable <- t(data.frame(
`Start time` = metainfo$start_time,
`Salmon version` = metainfo$salmon_version,
`Index` = cmdinfo$index,
`R1file` = paste(cmdinfo$mates1,
collapse = ", "),
`R2file` = paste(cmdinfo$mates2,
collapse = ", "),
`tgMap` = cmdinfo$tgMap,
`Library type` = metainfo$library_types,
stringsAsFactors = FALSE,
check.names = FALSE
))
if (type == "extwl") {
versiontable <- rbind(
versiontable,
t(data.frame(
`External whitelist` = cmdinfo$whitelist,
stringsAsFactors = FALSE,
check.names = FALSE
))
)
}
if ("expectCells" %in% names(cmdinfo)) {
versiontable <- rbind(
versiontable,
t(data.frame(
`Expected number of cells` = cmdinfo$expectCells,
stringsAsFactors = FALSE,
check.names = FALSE
))
)
}
if ("forceCells" %in% names(cmdinfo)) {
versiontable <- rbind(
versiontable,
t(data.frame(
`Forced number of cells` = cmdinfo$forceCells,
stringsAsFactors = FALSE,
check.names = FALSE
))
)
}
## Create summary tables
summarytable_full <- t(data.frame(
`Total number of processed reads` =
as.character(alevinmetainfo$total_reads),
`Number of reads with Ns` =
as.character(alevinmetainfo$reads_with_N),
`Number of reads with valid cell barcode (no Ns)` =
as.character(round(sum(rawcbfreq$originalFreq, na.rm = TRUE))),
`Number of mapped reads` = alevinmetainfo$reads_in_eqclasses,
`Percent mapped (of all reads)` =
paste0(signif(as.numeric(alevinmetainfo$mapping_rate), 4), "%"),
`Number of noisy CB reads` =
as.character(alevinmetainfo$noisy_cb_reads),
`Number of noisy UMI reads` =
as.character(alevinmetainfo$noisy_umi_reads),
`Total number of observed cell barcodes` =
as.character(length(unique(cbtable$CB))),
stringsAsFactors = FALSE,
check.names = FALSE
))
## If used_reads is reported, add actual mapping rate
if (!is.null(alevinmetainfo$used_reads)) {
summarytable_full <- rbind(
summarytable_full,
t(data.frame(
`Number of used reads` = alevinmetainfo$used_reads,
`Percent mapped (of used reads)` =
paste0(signif(100 * alevinmetainfo$reads_in_eqclasses /
alevinmetainfo$used_reads, 4), "%"),
stringsAsFactors = FALSE,
check.names = FALSE
))
)
}
summarytable_initialwl <- .makeSummaryTable(
cbtable = cbtable,
colName = "inFirstWhiteList",
cbName = " (initial whitelist)",
countCol = "collapsedFreq",
quantmat = NULL
)
summarytable_finalwl <- .makeSummaryTable(
cbtable = cbtable,
colName = "inFinalWhiteList",
cbName = " (final whitelist)",
countCol = "collapsedFreq",
quantmat = NULL
)
## Return
list(cbTable = cbtable, versionTable = versiontable,
summaryTables = c(list(fullDataset = summarytable_full,
initialWhitelist = summarytable_initialwl,
finalWhitelist = summarytable_finalwl),
customCBsummary),
type = type
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.