#' Collapse mutants by similarity
#'
#' These functions can be used to collapse variants, either by similarity or
#' according to a pre-defined grouping. The functions \code{collapseMutants} and
#' \code{collapseMutantsByAA} assume that a grouping variable is available as a
#' column in \code{rowData(se)} (\code{collapseMutantsByAA} is a convenience
#' function for the case when this column is "mutantNameAA", and is provided
#' for backwards compatibility). The \code{collapseMutantsBySimilarity} will
#' generate the grouping variable based on user-provided thresholds on the
#' sequence similarity (defined by the Hamming distance), and subsequently
#' collapse based on the derived grouping.
#'
#' @param se A \code{\link[SummarizedExperiment]{SummarizedExperiment}}
#' generated by \code{\link{summarizeExperiment}}
#' @param assayName The name of the assay that will be used to calculate a
#' "score" (typically derived from the read counts) for each variant.
#' @param scoreMethod Character scalar giving the approach used to calculate
#' ranking scores from the assay defined by \code{assayName}. Currently,
#' this can be one of \code{"rowSum"} or \code{"rowMean"}. All filtering
#' criteria will be applied to these scores.
#' @param sequenceCol Character scalar giving the name of the column in
#' \code{rowData(se)} that contains the nucleotide sequence of the
#' variants.
#' @param collapseMaxDist Numeric scalar defining the tolerance for collapsing
#' similar sequences. If the value is in [0, 1), it defines the maximal
#' Hamming distance in terms of a fraction of sequence length:
#' (\code{round(collapseMaxDist * nchar(sequence))}).
#' A value greater or equal to 1 is rounded and directly used as the maximum
#' allowed Hamming distance. Note that sequences can only be
#' collapsed if they are all of the same length.
#' @param collapseMinScore Numeric scalar, indicating the minimum score for the
#' sequence to be considered for collapsing with similar sequences.
#' @param collapseMinRatio Numeric scalar. During collapsing of
#' similar sequences, a low-frequency sequence will be collapsed
#' with a higher-frequency sequence only if the ratio between the
#' high-frequency and the low-frequency scores is at least this
#' high. The default value of 0 indicates that no such check is performed.
#' @param verbose Logical, whether to print progress messages.
#'
#' @author Charlotte Soneson, Michael Stadler
#'
#' @export
#'
#' @importFrom SummarizedExperiment assayNames assay rowData
#' @importFrom Matrix rowSums rowMeans
#'
#' @examples
#' se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds",
#' package = "mutscan"))[1:200, ]
#' ## The rows of this object correspond to individual codon variants
#' dim(se)
#' head(rownames(se))
#'
#' ## Collapse by amino acid
#' sec <- collapseMutantsByAA(se)
#' ## The rows of the collapsed object correspond to amino acid variants
#' dim(sec)
#' head(rownames(sec))
#' ## The mutantName column contains the individual codon variants that were
#' ## collapsed
#' head(SummarizedExperiment::rowData(sec))
#'
#' ## Collapse similar sequences
#' sec2 <- collapseMutantsBySimilarity(
#' se = se, assayName = "counts", scoreMethod = "rowSum",
#' sequenceCol = "sequence", collapseMaxDist = 2,
#' collapseMinScore = 0, collapseMinRatio = 0)
#' dim(sec2)
#' head(rownames(sec2))
#' head(SummarizedExperiment::rowData(sec2))
#'
collapseMutantsBySimilarity <- function(se, assayName, scoreMethod = "rowSum",
sequenceCol = "sequence",
collapseMaxDist = 0,
collapseMinScore = 0,
collapseMinRatio = 0,
verbose = TRUE) {
## Check input arguments
.assertVector(x = se, type = "SummarizedExperiment")
.assertScalar(x = assayName, type = "character",
validValues = SummarizedExperiment::assayNames(se))
.assertScalar(x = scoreMethod, type = "character",
validValues = c("rowSum", "rowMean"))
.assertScalar(x = sequenceCol, type = "character",
validValues = colnames(SummarizedExperiment::rowData(se)))
.assertScalar(x = collapseMaxDist, type = "numeric")
.assertScalar(x = collapseMinScore, type = "numeric")
.assertScalar(x = collapseMinRatio, type = "numeric")
.assertScalar(x = verbose, type = "logical")
## All sequences must be the same length, and can only consist of ACGT_
seqs <- SummarizedExperiment::rowData(se)[, sequenceCol]
if (length(unique(nchar(seqs))) != 1) {
stop("All entries in the sequence column must have the same length")
}
if (!all(vapply(seqs,
function(x) grepl("^[ACGT_]*$", x), FALSE))) {
stop("All entries in the sequence column must consist of [ACGT_]")
}
## Get matching table
if (scoreMethod == "rowSum") {
scores <- Matrix::rowSums(SummarizedExperiment::assay(se, assayName))
} else if (scoreMethod == "rowMean") {
scores <- Matrix::rowMeans(SummarizedExperiment::assay(se, assayName))
}
matchTable <- groupSimilarSequences(
seqs = seqs,
scores = scores,
collapseMaxDist = collapseMaxDist,
collapseMinScore = collapseMinScore,
collapseMinRatio = collapseMinRatio,
verbose = verbose
)
matchTable$representative <- rownames(se)[
match(matchTable$representative,
SummarizedExperiment::rowData(se)[, sequenceCol])]
## Add grouping variable
SummarizedExperiment::rowData(se)$collapseCol <-
matchTable$representative[match(SummarizedExperiment::rowData(se)[, sequenceCol],
matchTable$sequence)]
## Collapse
se <- collapseMutants(se, nameCol = "collapseCol")
se
}
#' @rdname collapseMutantsBySimilarity
#' @export
#'
collapseMutantsByAA <- function(se) {
collapseMutants(se = se, nameCol = "mutantNameAA")
}
#' Collapse counts by mutated amino acid
#'
#' @param se A \code{\link[SummarizedExperiment]{SummarizedExperiment}}
#' generated by \code{\link{summarizeExperiment}}
#' @param nameCol A character scalar providing the column of
#' \code{rowData(se)} that contains the amino acid mutant names (that will
#' be the new row names).
#'
#' @return A \code{\link[SummarizedExperiment]{SummarizedExperiment}} where
#' counts have been aggregated by the mutated amino acid(s).
#'
#' @author Charlotte Soneson, Michael Stadler
#'
#' @export
#'
#' @importFrom DelayedArray rowsum
#' @importFrom S4Vectors metadata
#' @importFrom SummarizedExperiment assays rowData SummarizedExperiment colData
#' @importFrom dplyr across group_by summarize
#' @importFrom stats setNames
#'
#' @rdname collapseMutantsBySimilarity
#'
collapseMutants <- function(se, nameCol) {
.assertVector(x = se, type = "SummarizedExperiment")
.assertScalar(x = nameCol, type = "character",
validValues = colnames(SummarizedExperiment::rowData(se)))
## Collapse assays
## Matrix.utils has been removed from CRAN
# aList <- lapply(SummarizedExperiment::assays(se), function(a) {
# Matrix.utils::aggregate.Matrix(
# x = a,
# groupings = factor(SummarizedExperiment::rowData(se)[[nameCol]]),
# fun = "colSums")
# })
aList <- lapply(SummarizedExperiment::assays(se), function(a) {
DelayedArray::rowsum(
x = a,
group = factor(SummarizedExperiment::rowData(se)[[nameCol]])
)
})
## Collapse rowData - simple columns
rd <- mergeValues(SummarizedExperiment::rowData(se)[[nameCol]],
SummarizedExperiment::rowData(se)$sequence) %>%
stats::setNames(c(nameCol, "sequence"))
for (v in setdiff(
intersect(c("mutantName", "mutantNameBase", "mutantNameBaseHGVS",
"mutantNameCodon", "mutantNameAA", "mutantNameAAHGVS",
"sequenceAA", "mutationTypes",
"nbrMutBases", "nbrMutCodons", "nbrMutAAs", "varLengths"),
colnames(SummarizedExperiment::rowData(se))),
nameCol)) {
tmp <- mergeValues(SummarizedExperiment::rowData(se)[[nameCol]],
SummarizedExperiment::rowData(se)[[v]])
rd[[v]] <- tmp$valueColl[match(rd[[nameCol]], tmp$mutantNameColl)]
}
rd0 <- as.data.frame(SummarizedExperiment::rowData(se)) %>%
dplyr::group_by(.data[[nameCol]]) %>%
dplyr::summarize(
dplyr::across(c("minNbrMutBases", "minNbrMutCodons",
"minNbrMutAAs"),
function(x) min(x)),
dplyr::across(c("maxNbrMutBases", "maxNbrMutCodons",
"maxNbrMutAAs"),
function(x) max(x)))
rd <- rd %>%
dplyr::full_join(rd0, by = nameCol)
rd <- S4Vectors::DataFrame(rd)
rownames(rd) <- rd[[nameCol]]
## Create SummarizedExperiment
rn1 <- rownames(aList[[1]])
cn1 <- colnames(aList[[1]])
for (i in seq_along(aList)) {
stopifnot(all(rownames(aList[[i]]) == rn1))
stopifnot(all(colnames(aList[[i]]) == cn1))
}
stopifnot(cn1 == rownames(SummarizedExperiment::colData(se)))
SummarizedExperiment::SummarizedExperiment(
assays = aList,
colData = SummarizedExperiment::colData(se),
metadata = S4Vectors::metadata(se),
rowData = rd[rn1, ]
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.