#' Process an experiment with multiple variable sequences
#'
#' This function enables the processing of data sets with multiple
#' variable sequences, which should potentially be handled in different
#' ways. For example, a barcode association experiment
#' with two variable sequences (the barcode and the biological variant)
#' that need to be processed differently, e.g. in terms of matching to
#' wildtype sequences or collapsing of similar sequences.
#' In contrast, while \code{digestFastqs} allow the specification
#' of multiple variable sequences (within each of the forward and reverse
#' reads), they will be concatenated and processed as a single unit.
#'
#' linkMultipleVariants will process the input in the following way:
#' \itemize{
#' \item First, run \code{digestFastqs} with the parameters provided
#' in \code{combinedDigestParams}. Typically, this will be a
#' "naive" counting run, where the frequencies of all observed
#' variants are tabulated. The variable sequences
#' within the forward and reverse reads, respectively, will be
#' processed as a single sequence.
#' \item Next, run \code{digestFastqs} with each of the additional
#' parameter sets provided (\code{...}). Each of these should
#' correspond to a single variable sequence from the combined
#' run (i.e., if there are two Vs in the element specifications
#' in the combined run, there should be two additional
#' parameter sets provided, each corresponding to the
#' processing of one variable sequence part). It is assumed
#' that the order of the additional arguments correspond to the
#' order of the variable sequences in the combined run, in such a way
#' that if the variable sequences extracted in each of the separate
#' runs are concatenated in the order that the parameter sets are
#' provided to \code{linkMultipleVariants}, they will form the variable
#' sequence extracted in the combined run.
#' \item The result of each of the separate runs is a 'conversion table',
#' containing the final set of identified sequence variants as well
#' as all individual sequences corresponding to each of them. This
#' is then combined with the count table from the combined, "naive"
#' run in order to create an aggregated count table. More precisely,
#' each sequence in the combined run is split into the constituent
#' variable sequences, and
#' each variable sequence is then matched to the output from the right
#' separate run, from which the final feature ID (mutant name, or
#' collapsed sequence) will be extracted and used to replace the original
#' sequence in the combined count table. Once all the matches are done,
#' rows with NAs (where no match could be found in the separate run)
#' are removed and the counts are aggregated across all identical
#' combinations of variable sequences.
#' }
#'
#' In order to define the \code{elementsForward} and \code{elementsReverse}
#' arguments for the separate runs, a strategy that often works is to simply
#' copy the arguments from the combined run, and successively replace all
#' but one of the 'V's by 'S'. This will effectively process one variable
#' sequence at the time, while keeping all other elements of the reads
#' consistent (since this can affect e.g. filtering criteria). Note that
#' to process individual variable sequences in the reverse read, you also
#' need to swap the 'forward' and 'reverse' specifications (since
#' \code{digestFastqs} requires a forward read).
#'
#' @param combinedDigestParams A named list of arguments to
#' \code{digestFastqs} for the combined ("naive") run.
#' @param ... Additional arguments providing arguments to \code{digestFastqs}
#' for the separate runs (processing each variable sequence in turn).
#' Each argument must be a named list of arguments to \code{digestFastqs}.
#' In addition, arguments \code{collapseMaxDist}, \code{collapseMinScore}
#' and \code{collapseMinRatio} can be specified, and will be passed on
#' to \code{collapseMutantsBySimilarity}.
#'
#' @author Charlotte Soneson, Michael Stadler
#'
#' @export
#'
#' @return A list with the following elements:
#' \itemize{
#' \item countAggregated - a \code{tibble} with columns corresponding to
#' each of the variable sequences, and a column with the total observed
#' read count for the combination.
#' \item convSeparate - a list of conversion tables from the respective
#' separate runs.
#' \item outCombined - the \code{digestFastqs} output for the combined run.
#' }
#'
#' @importFrom dplyr select rename group_by summarize across matches mutate
#' @importFrom tidyr separate separate_rows
#' @importFrom rlang .data
#'
linkMultipleVariants <- function(combinedDigestParams = list(), ...) {
## Process additional arguments
paramsSeparate <- list(...)
collapseParams <- c("collapseMaxDist",
"collapseMinScore", "collapseMinRatio")
## --------------------------------------------------------------------- ##
## Initial checks
## --------------------------------------------------------------------- ##
## combinedDigestParams and each element of paramsSeparate must be named
## lists
.assertVector(x = combinedDigestParams, type = "list")
defaults <- formals(digestFastqs)
.assertVector(x = names(combinedDigestParams), type = "character",
allowNULL = FALSE, validValues = names(defaults))
for (parms in paramsSeparate) {
.assertVector(x = parms, type = "list")
.assertVector(x = names(parms), type = "character", allowNULL = FALSE,
validValues = c(names(defaults), collapseParams))
}
## Fill the parameter lists with default values for all arguments
## that are not provided, to avoid having to check for possible
## NULL values downstream
paramsSeparate <- lapply(paramsSeparate, function(parm) {
c(parm, as.list(defaults[!(names(defaults) %in% names(parm))]))
})
combinedDigestParams <- c(combinedDigestParams,
as.list(defaults[!(names(defaults) %in%
names(combinedDigestParams))]))
## --------------------------------------------------------------------- ##
## Checks
## --------------------------------------------------------------------- ##
## We can't have a fwd and a rev V in the same run unless they are merged
## since this would disagree with the order in the concatenated V
## Currently, each separate run can only have one V in the fwd and/or one
## in the rev (both allowed if they are matched); otherwise the lengths
## extracted from the combined run don't match with those from the
## separate runs
for (parm in paramsSeparate) {
nbrFwdV <- sum(strsplit(parm$elementsForward, "")[[1]] == "V")
nbrRevV <- sum(strsplit(parm$elementsReverse, "")[[1]] == "V")
if (nbrFwdV == 0 && nbrRevV == 0) {
stop("Each separate run must contain at least one variable ",
"segment.")
}
if (nbrFwdV > 0 && nbrRevV > 0 &&
!parm$mergeForwardReverse) {
stop("A separate run can not have variable sequences in both ",
"the forward and reverse reads unless 'mergeForwardReverse' ",
"is TRUE.")
}
if (nbrFwdV > 1 || nbrRevV > 1) {
stop("A separate run can have at most one variable segment ",
"in each of the forward and reverse reads, respectively.")
}
}
## Summing UMI counts after the processing may not be accurate - will
## always use read counts
if (length(grep("U", paste0(combinedDigestParams$elementsForward,
combinedDigestParams$elementsReverse))) > 0) {
warning("Aggregating UMI counts may not be accurate - will use ",
"read counts instead")
}
## --------------------------------------------------------------------- ##
## Combined quantification - just tabulate combined variable sequences
## --------------------------------------------------------------------- ##
outCombined <- do.call(digestFastqs, combinedDigestParams)
## Get count matrix with "raw" (uncorrected) sequences
countCombined <- outCombined$summaryTable %>%
dplyr::select("sequence", "nbrReads", "varLengths") %>%
dplyr::mutate(idx = paste0("I", seq_along(.data$sequence)))
## If applicable, separate into forward and reverse sequences
if (any(grepl("_", countCombined$sequence))) {
countCombined <- countCombined %>%
tidyr::separate(.data$sequence, into = c("sequenceForward",
"sequenceReverse"),
sep = "_") %>%
tidyr::separate(.data$varLengths, into = c("varLengthsForward",
"varLengthsReverse"),
sep = "_") %>%
dplyr::mutate(
nCompForward = vapply(strsplit(.data$varLengthsForward, ","),
length, 0),
nCompReverse = vapply(strsplit(.data$varLengthsReverse, ","),
length, 0))
} else {
countCombined <- countCombined %>%
dplyr::rename(sequenceForward = "sequence",
varLengthsForward = "varLengths") %>%
dplyr::mutate(
nCompForward = vapply(strsplit(.data$varLengthsForward, ","),
length, 0))
}
offsetForward <- 0
## Split forward and reverse sequences, respectively
if ("sequenceForward" %in% colnames(countCombined)) {
stopifnot("varLengthsForward" %in% colnames(countCombined))
## Check that all sequences have the same number of components
stopifnot(length(unique(countCombined$nCompForward)) == 1)
offsetForward <- countCombined$nCompForward[1]
tmp <- split(countCombined, countCombined$varLengthsForward)
countCombined <- unsplit(lapply(tmp, function(df) {
w <- as.numeric(strsplit(df$varLengthsForward[1], ",")[[1]])
df <- df %>%
tidyr::separate(
.data$sequenceForward,
into = names(paramsSeparate)[seq_along(w)],
# into = paste0("V", seq_along(w)),
sep = cumsum(w[seq_len(length(w) - 1)]))
}), countCombined$varLengthsForward)
}
if ("sequenceReverse" %in% colnames(countCombined)) {
stopifnot("varLengthsReverse" %in% colnames(countCombined))
## Check that all sequences have the same number of components
stopifnot(length(unique(countCombined$nCompReverse)) == 1)
tmp <- split(countCombined, countCombined$varLengthsReverse)
countCombined <- unsplit(lapply(tmp, function(df) {
w <- as.numeric(strsplit(df$varLengthsReverse[1], ",")[[1]])
df <- df %>%
tidyr::separate(
.data$sequenceReverse,
into = names(paramsSeparate)[offsetForward + seq_along(w)],
# into = paste0("V", offsetForward + seq_along(w)),
sep = cumsum(w[seq_len(length(w) - 1)]))
}), countCombined$varLengthsReverse)
}
## --------------------------------------------------------------------- ##
## Quantify each variable sequence separately
## --------------------------------------------------------------------- ##
outSeparate <- lapply(paramsSeparate, function(ps) {
tmp <- do.call(digestFastqs, ps[!names(ps) %in% collapseParams])
if (any(collapseParams %in% names(ps))) {
## Collapse this variable region -> need to call groupSimilarSequences
tbl <- do.call(groupSimilarSequences,
c(list(seqs = tmp$summaryTable$sequence,
scores = tmp$summaryTable$nbrReads, verbose = FALSE),
ps[names(ps) %in% collapseParams]))
colnames(tbl)[colnames(tbl) == "representative"] <- "mutantName"
tmp$summaryTable <- tbl
}
tmp
})
## Filter tables
filtSeparate <- lapply(outSeparate, function(out) out$filterSummary)
## Conversion tables
convSeparate <- lapply(outSeparate, function(out) {
out$summaryTable %>%
dplyr::select("mutantName", "sequence") %>%
tidyr::separate_rows("sequence", sep = ",")
})
## --------------------------------------------------------------------- ##
## Replace naive sequences with corrected ones
## --------------------------------------------------------------------- ##
for (i in names(paramsSeparate)) {
countCombined[[i]] <-
convSeparate[[i]]$mutantName[match(countCombined[[i]],
convSeparate[[i]]$sequence)]
}
## Filter out rows with NAs (the sequence was not retained)
countCombined <- countCombined[rowSums(is.na(countCombined)) == 0, ,
drop = FALSE]
## Aggregate counts
countAggregated <- countCombined %>%
dplyr::group_by(dplyr::across(names(paramsSeparate))) %>%
dplyr::summarize(nbrReads = sum(.data$nbrReads), .groups = "drop")
list(countAggregated = countAggregated, convSeparate = convSeparate,
outCombined = outCombined, filtSeparate = filtSeparate)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.