#' Read alevin-fry data required to generate summary report
#'
#' Read all alevin-fry output files required to generate the
#' summary report or shiny app.
#'
#' @param mapDir Path to the output directory from the \code{salmon alevin}
#' run (should be the directory containing the \code{alevin} folder).
#' @param permitDir Path to the output directory from the
#' \code{generate-permit-list} and \code{collate} runs.
#' @param quantDir Path to the output directory from the
#' \code{alevin-fry quant} run (should be the directory containing the
#' \code{alevin} folder).
#'
#' @author Charlotte Soneson
#'
#' @export
#'
#' @importFrom utils read.delim
#' @import dplyr
#' @importFrom rjson fromJSON
#'
#' @return A list collecting all necessary information for generating the
#' summary report/shiny app.
#'
#' @examples
#' alevinfry <- readAlevinFryQC(
#' mapDir = system.file("extdata/alevinfry_example_v0.5.0/map",
#' package = "alevinQC"),
#' permitDir = system.file("extdata/alevinfry_example_v0.5.0/permit",
#' package = "alevinQC"),
#' quantDir = system.file("extdata/alevinfry_example_v0.5.0/quant",
#' package = "alevinQC"))
#'
readAlevinFryQC <- function(mapDir, permitDir, quantDir) {
## Check that all required files are available, stop if not
infversion <- checkAlevinFryInputFiles(mapDir = mapDir,
permitDir = permitDir,
quantDir = quantDir)
## Depending on the inferred version, read alevin output files
if (infversion == "v0.4.3") {
## v0.4.3 or newer
.readAlevinFryQC_v0.4.3(mapDir = mapDir, permitDir = permitDir,
quantDir = quantDir)
} else if (infversion == "v0.5.0") {
## v0.5.0 or newer
.readAlevinFryQC_v0.5.0(mapDir = mapDir, permitDir = permitDir,
quantDir = quantDir)
} else if (infversion == "piscem_v0.6.0") {
## v0.5.0 or newer
.readAlevinFryQC_piscemv0.6.0(mapDir = mapDir, permitDir = permitDir,
quantDir = quantDir)
} else {
stop("Unidentifiable alevin-fry output")
}
}
.readAlevinFryQC_v0.4.3 <- function(mapDir, permitDir, quantDir) {
## Raw CB frequencies (in descending order)
rawcbfreq <- utils::read.delim(file.path(permitDir, "all_freq.tsv"),
header = FALSE, as.is = TRUE) %>%
dplyr::rename(CB = V1, originalFreq = V2) %>%
dplyr::arrange(dplyr::desc(originalFreq)) %>%
dplyr::mutate(ranking = seq_len(length(CB)))
if (!all(diff(rawcbfreq$originalFreq) <= 0)) {
warning("The raw CB frequencies are not sorted in decreasing order")
}
## FeatureDump
featuredump <- utils::read.delim(file.path(quantDir, "featureDump.txt"),
header = TRUE, as.is = TRUE, sep = "\t")
featuredump <- featuredump %>%
dplyr::rename(mappingRate = MappingRate,
collapsedFreq = CorrectedReads,
dedupRate = DedupRate,
nbrGenesAboveMean = NumGenesOverMean,
nbrMappedUMI = MappedReads,
totalUMICount = DeduplicatedReads,
nbrGenesAboveZero = NumGenesExpressed)
permitlist <- utils::read.delim(file.path(quantDir, "alevin",
"quants_mat_rows.txt"),
header = FALSE, as.is = TRUE) %>%
dplyr::pull(V1)
## Meta information and command information
metainfo <- rjson::fromJSON(file = file.path(mapDir,
"aux_info/meta_info.json"))
cmdinfo <- rjson::fromJSON(file = file.path(mapDir, "cmd_info.json"))
quantinfo <- rjson::fromJSON(file = file.path(quantDir, "quant.json"))
## Merge information about quantified CBs
cbtable <- dplyr::full_join(
rawcbfreq,
featuredump,
by = "CB"
)
cbtable <- cbtable %>%
dplyr::mutate(inPermitList = CB %in% permitlist)
## Check if there is any barcode that is not in the permitlist,
## but which has an original ranking lower than any barcode that is
## in the permitlist, and remove it.
# toremove <-
# !cbtable$inPermitList &
# cbtable$ranking <= max(cbtable$ranking[cbtable$inPermitList])
# if (any(toremove)) {
# warning("Excluding ", sum(toremove), " unquantified barcode",
# ifelse(sum(toremove) > 1, "s", ""),
# " with higher original frequency than barcodes ",
# "included in the permitlist: ",
# paste0(cbtable$CB[toremove], collapse = ", "))
# cbtable <- cbtable[!toremove, ]
# }
## Add information from custom barcode sets (not implemented)
customCBsummary <- list()
## Create "version info" table
versiontable <- t(data.frame(
`Start time` = metainfo$start_time,
`Salmon version` = metainfo$salmon_version,
`alevin-fry version (quant)` = quantinfo$version_str,
`Index` = cmdinfo$index,
`R1file` = paste(cmdinfo$mates1,
collapse = ", "),
`R2file` = paste(cmdinfo$mates2,
collapse = ", "),
`tgMap` = ifelse(is.null(cmdinfo$tgMap), "NA",
cmdinfo$tgMap),
`Library type` = metainfo$library_types,
stringsAsFactors = FALSE,
check.names = FALSE
))
## Create summary tables
summarytable_full <- t(data.frame(
`Total number of processed reads` =
as.character(metainfo$num_processed),
`Number of mapped reads` = metainfo$num_mapped,
`Total number of observed cell barcodes` =
as.character(length(unique(cbtable$CB))),
stringsAsFactors = FALSE,
check.names = FALSE
))
summarytable_permitlist <- .makeSummaryTable(
cbtable = cbtable,
colName = "inPermitList",
cbName = " (permitlist)",
countCol = "nbrMappedUMI",
quantmat = NULL
)
## Return
list(cbTable = cbtable, versionTable = versiontable,
summaryTables = c(list(fullDataset = summarytable_full,
permitlist = summarytable_permitlist),
customCBsummary),
type = "alevin-fry"
)
}
.readAlevinFryQC_v0.5.0 <- function(mapDir, permitDir, quantDir) {
## Raw CB frequencies (in descending order)
if (file.exists(file.path(permitDir, "all_freq.bin"))) {
rawcbfreq <- cpp_get_permit_freq_info(file.path(permitDir,
"all_freq.bin"))
} else {
message(file.path(permitDir, "all_freq.bin"),
" is missing - using permit_freq.bin")
rawcbfreq <- cpp_get_permit_freq_info(file.path(permitDir,
"permit_freq.bin"))
}
rawcbfreq <- data.frame(CB = rawcbfreq[[1]],
originalFreq = rawcbfreq[[2]]) %>%
dplyr::arrange(dplyr::desc(originalFreq)) %>%
dplyr::mutate(ranking = seq_len(length(CB)))
if (!all(diff(rawcbfreq$originalFreq) <= 0)) {
warning("The raw CB frequencies are not sorted in decreasing order")
}
## FeatureDump
featuredump <- utils::read.delim(file.path(quantDir, "featureDump.txt"),
header = TRUE, as.is = TRUE, sep = "\t")
featuredump <- featuredump %>%
dplyr::rename(mappingRate = MappingRate,
collapsedFreq = CorrectedReads,
dedupRate = DedupRate,
nbrGenesAboveMean = NumGenesOverMean,
nbrMappedUMI = MappedReads,
totalUMICount = DeduplicatedReads,
nbrGenesAboveZero = NumGenesExpressed)
permitlist <- cpp_get_permit_freq_info(file.path(permitDir,
"permit_freq.bin"))[[1]]
## Meta information and command information
metainfo <- rjson::fromJSON(file = file.path(mapDir,
"aux_info/meta_info.json"))
cmdinfo <- rjson::fromJSON(file = file.path(mapDir, "cmd_info.json"))
quantinfo <- rjson::fromJSON(file = file.path(quantDir, "quant.json"))
quantinfo <- .parse_quant_t2gmap(quantinfo)
## Merge information about quantified CBs
cbtable <- dplyr::full_join(
rawcbfreq,
featuredump,
by = "CB"
)
cbtable <- cbtable %>%
dplyr::mutate(inPermitList = CB %in% permitlist)
## Check if there is any barcode that is not in the permitlist,
## but which has an original ranking lower than any barcode that is
## in the permitlist, and remove it.
# toremove <-
# !cbtable$inPermitList &
# cbtable$ranking <= max(cbtable$ranking[cbtable$inPermitList])
# if (any(toremove)) {
# warning("Excluding ", sum(toremove), " unquantified barcode",
# ifelse(sum(toremove) > 1, "s", ""),
# " with higher original frequency than barcodes ",
# "included in the permitlist: ",
# paste0(cbtable$CB[toremove], collapse = ", "))
# cbtable <- cbtable[!toremove, ]
# }
## Add information from custom barcode sets (not implemented)
customCBsummary <- list()
## Create "version info" table
versiontable <- t(data.frame(
`Start time` = metainfo$start_time,
`Salmon version` = metainfo$salmon_version,
`alevin-fry version (quant)` = quantinfo$version_str,
`Index` = cmdinfo$index,
`R1file` = paste(cmdinfo$mates1,
collapse = ", "),
`R2file` = paste(cmdinfo$mates2,
collapse = ", "),
`tgMap` = quantinfo$tgMap,
`Library type` = metainfo$library_types,
stringsAsFactors = FALSE,
check.names = FALSE
))
## Create summary tables
summarytable_full <- t(data.frame(
`Total number of processed reads` =
as.character(metainfo$num_processed),
`Number of mapped reads` = metainfo$num_mapped,
`Total number of observed cell barcodes` =
as.character(length(unique(cbtable$CB))),
stringsAsFactors = FALSE,
check.names = FALSE
))
summarytable_permitlist <- .makeSummaryTable(
cbtable = cbtable,
colName = "inPermitList",
cbName = " (permitlist)",
countCol = "nbrMappedUMI",
quantmat = NULL
)
## Return
list(cbTable = cbtable, versionTable = versiontable,
summaryTables = c(list(fullDataset = summarytable_full,
permitlist = summarytable_permitlist),
customCBsummary),
type = "alevin-fry"
)
}
.readAlevinFryQC_piscemv0.6.0 <- function(mapDir, permitDir, quantDir) {
## Raw CB frequencies (in descending order)
if (file.exists(file.path(permitDir, "all_freq.bin"))) {
rawcbfreq <- cpp_get_permit_freq_info(file.path(permitDir,
"all_freq.bin"))
} else {
message(file.path(permitDir, "all_freq.bin"),
" is missing - using permit_freq.bin")
rawcbfreq <- cpp_get_permit_freq_info(file.path(permitDir,
"permit_freq.bin"))
}
rawcbfreq <- data.frame(CB = rawcbfreq[[1]],
originalFreq = rawcbfreq[[2]]) %>%
dplyr::arrange(dplyr::desc(originalFreq)) %>%
dplyr::mutate(ranking = seq_len(length(CB)))
if (!all(diff(rawcbfreq$originalFreq) <= 0)) {
warning("The raw CB frequencies are not sorted in decreasing order")
}
## FeatureDump
featuredump <- utils::read.delim(file.path(quantDir, "featureDump.txt"),
header = TRUE, as.is = TRUE, sep = "\t")
featuredump <- featuredump %>%
dplyr::rename(mappingRate = MappingRate,
collapsedFreq = CorrectedReads,
dedupRate = DedupRate,
nbrGenesAboveMean = NumGenesOverMean,
nbrMappedUMI = MappedReads,
totalUMICount = DeduplicatedReads,
nbrGenesAboveZero = NumGenesExpressed)
permitlist <- cpp_get_permit_freq_info(file.path(permitDir,
"permit_freq.bin"))[[1]]
mapinfo <- rjson::fromJSON(file = file.path(mapDir, "map_info.json"))
mapinfo <- .parse_piscem_map_info(mapinfo)
quantinfo <- rjson::fromJSON(file = file.path(quantDir, "quant.json"))
quantinfo <- .parse_quant_t2gmap(quantinfo)
## Merge information about quantified CBs
cbtable <- dplyr::full_join(
rawcbfreq,
featuredump,
by = "CB"
)
cbtable <- cbtable %>%
dplyr::mutate(inPermitList = CB %in% permitlist)
## Add information from custom barcode sets (not implemented)
customCBsummary <- list()
## Create "version info" table
versiontable <- t(data.frame(
`Run time (seconds)` = mapinfo$runtime_seconds,
# `Salmon version` = metainfo$salmon_version,
`alevin-fry version (quant)` = quantinfo$version_str,
`Index` = mapinfo$Index,
`R1file` = mapinfo$R1file,
`R2file` = mapinfo$R2file,
`tgMap` = quantinfo$tgMap,
stringsAsFactors = FALSE,
check.names = FALSE
))
## Create summary tables
summarytable_full <- t(data.frame(
`Total number of processed reads` =
as.character(mapinfo$num_reads),
`Number of mapped reads` = mapinfo$num_mapped,
`Percent of mapped reads` = mapinfo$percent_mapped,
`Total number of observed cell barcodes` =
as.character(length(unique(cbtable$CB))),
stringsAsFactors = FALSE,
check.names = FALSE
))
summarytable_permitlist <- .makeSummaryTable(
cbtable = cbtable,
colName = "inPermitList",
cbName = " (permitlist)",
countCol = "nbrMappedUMI",
quantmat = NULL
)
## Return
list(cbTable = cbtable, versionTable = versiontable,
summaryTables = c(list(fullDataset = summarytable_full,
permitlist = summarytable_permitlist),
customCBsummary),
type = "alevin-fry"
)
}
#' @keywords internal
#' @noRd
.parse_piscem_map_info <- function(mapinfo) {
cmdline <- strsplit(mapinfo$cmdline," ")[[1]][-1]
mapinfo$Index <- cmdline[which(cmdline == "-i" | cmdline == "--index") + 1]
mapinfo$R1file <- cmdline[which(cmdline == "-1" | cmdline == "--read1") + 1]
mapinfo$R2file <- cmdline[which(cmdline == "-2" | cmdline == "--read2") + 1]
mapinfo
}
#' @keywords internal
#' @noRd
.parse_quant_t2gmap <- function(quantinfo) {
if (is.null(quantinfo$quant_options$tg_map)) {
cmd <- strsplit(quantinfo$cmd, " ")[[1]]
quantinfo$tgMap <- cmd[which(cmd == "-m" | cmd == "--tg-map") + 1]
} else {
quantinfo$tgMap <- quantinfo$quant_options$tg_map
}
quantinfo
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.