R/collapseMutantsByAA.R

Defines functions collapseMutants collapseMutantsByAA collapseMutantsBySimilarity

Documented in collapseMutants collapseMutantsByAA collapseMutantsBySimilarity

#' Collapse mutants by similarity
#' 
#' These functions can be used to collapse variants, either by similarity or 
#' according to a pre-defined grouping. The functions \code{collapseMutants} and 
#' \code{collapseMutantsByAA} assume that a grouping variable is available as a 
#' column in \code{rowData(se)} (\code{collapseMutantsByAA} is a convenience 
#' function for the case when this column is "mutantNameAA", and is provided 
#' for backwards compatibility). The \code{collapseMutantsBySimilarity} will 
#' generate the grouping variable based on user-provided thresholds on the 
#' sequence similarity (defined by the Hamming distance), and subsequently
#' collapse based on the derived grouping. 
#' 
#' @param se A \code{\link[SummarizedExperiment]{SummarizedExperiment}}
#'     generated by \code{\link{summarizeExperiment}}
#' @param assayName The name of the assay that will be used to calculate a 
#'     "score" (typically derived from the read counts) for each variant.
#' @param scoreMethod Character scalar giving the approach used to calculate 
#'     ranking scores from the assay defined by \code{assayName}. Currently, 
#'     this can be one of \code{"rowSum"} or \code{"rowMean"}. All filtering
#'     criteria will be applied to these scores.
#' @param sequenceCol Character scalar giving the name of the column in 
#'     \code{rowData(se)} that contains the nucleotide sequence of the 
#'     variants. 
#' @param collapseMaxDist Numeric scalar defining the tolerance for collapsing 
#'     similar sequences. If the value is in [0, 1), it defines the maximal 
#'     Hamming distance in terms of a fraction of sequence length:
#'     (\code{round(collapseMaxDist * nchar(sequence))}).
#'     A value greater or equal to 1 is rounded and directly used as the maximum
#'     allowed Hamming distance. Note that sequences can only be
#'     collapsed if they are all of the same length.
#' @param collapseMinScore Numeric scalar, indicating the minimum score for the 
#'     sequence to be considered for collapsing with similar sequences.
#' @param collapseMinRatio Numeric scalar. During collapsing of
#'     similar sequences, a low-frequency sequence will be collapsed 
#'     with a higher-frequency sequence only if the ratio between the 
#'     high-frequency and the low-frequency scores is at least this 
#'     high. The default value of 0 indicates that no such check is performed.
#' @param verbose Logical, whether to print progress messages.
#' 
#' @author Charlotte Soneson, Michael Stadler
#' 
#' @export
#' 
#' @importFrom SummarizedExperiment assayNames assay rowData
#' @importFrom Matrix rowSums rowMeans
#' 
#' @examples
#' se <- readRDS(system.file("extdata", "GSE102901_cis_se.rds",
#'                           package = "mutscan"))[1:200, ]
#' ## The rows of this object correspond to individual codon variants
#' dim(se)
#' head(rownames(se))
#' 
#' ## Collapse by amino acid
#' sec <- collapseMutantsByAA(se)
#' ## The rows of the collapsed object correspond to amino acid variants
#' dim(sec)
#' head(rownames(sec))
#' ## The mutantName column contains the individual codon variants that were 
#' ## collapsed
#' head(SummarizedExperiment::rowData(sec))
#' 
#' ## Collapse similar sequences
#' sec2 <- collapseMutantsBySimilarity(
#'     se = se, assayName = "counts", scoreMethod = "rowSum",
#'     sequenceCol = "sequence", collapseMaxDist = 2,
#'     collapseMinScore = 0, collapseMinRatio = 0)
#' dim(sec2)
#' head(rownames(sec2))
#' head(SummarizedExperiment::rowData(sec2))
#'
collapseMutantsBySimilarity <- function(se, assayName, scoreMethod = "rowSum",
                                        sequenceCol = "sequence", 
                                        collapseMaxDist = 0,
                                        collapseMinScore = 0, 
                                        collapseMinRatio = 0,
                                        verbose = TRUE) {
    ## Check input arguments
    .assertVector(x = se, type = "SummarizedExperiment")
    .assertScalar(x = assayName, type = "character", 
                  validValues = SummarizedExperiment::assayNames(se))
    .assertScalar(x = scoreMethod, type = "character",
                  validValues = c("rowSum", "rowMean"))
    .assertScalar(x = sequenceCol, type = "character",
                  validValues = colnames(SummarizedExperiment::rowData(se)))
    .assertScalar(x = collapseMaxDist, type = "numeric")
    .assertScalar(x = collapseMinScore, type = "numeric")
    .assertScalar(x = collapseMinRatio, type = "numeric")
    .assertScalar(x = verbose, type = "logical")
    
    ## All sequences must be the same length, and can only consist of ACGT_
    seqs <- SummarizedExperiment::rowData(se)[, sequenceCol]
    if (length(unique(nchar(seqs))) != 1) {
        stop("All entries in the sequence column must have the same length")
    }
    if (!all(vapply(seqs, 
                    function(x) grepl("^[ACGT_]*$", x), FALSE))) {
        stop("All entries in the sequence column must consist of [ACGT_]")
    }
    
    ## Get matching table
    if (scoreMethod == "rowSum") {
        scores <- Matrix::rowSums(SummarizedExperiment::assay(se, assayName))
    } else if (scoreMethod == "rowMean") {
        scores <- Matrix::rowMeans(SummarizedExperiment::assay(se, assayName))
    }
    matchTable <- groupSimilarSequences(
        seqs = seqs,
        scores = scores,
        collapseMaxDist = collapseMaxDist,
        collapseMinScore = collapseMinScore,
        collapseMinRatio = collapseMinRatio,
        verbose = verbose
    )
    matchTable$representative <- rownames(se)[
        match(matchTable$representative, 
              SummarizedExperiment::rowData(se)[, sequenceCol])]

    ## Add grouping variable
    SummarizedExperiment::rowData(se)$collapseCol <- 
        matchTable$representative[match(SummarizedExperiment::rowData(se)[, sequenceCol], 
                                        matchTable$sequence)]
    
    ## Collapse
    se <- collapseMutants(se, nameCol = "collapseCol")
    
    se
}

#' @rdname collapseMutantsBySimilarity
#' @export
#' 
collapseMutantsByAA <- function(se) {
    collapseMutants(se = se, nameCol = "mutantNameAA")
}

#' Collapse counts by mutated amino acid
#'
#' @param se A \code{\link[SummarizedExperiment]{SummarizedExperiment}}
#'     generated by \code{\link{summarizeExperiment}}
#' @param nameCol A character scalar providing the column of 
#'     \code{rowData(se)} that contains the amino acid mutant names (that will 
#'     be the new row names).
#'
#' @return A \code{\link[SummarizedExperiment]{SummarizedExperiment}} where
#'     counts have been aggregated by the mutated amino acid(s).
#'
#' @author Charlotte Soneson, Michael Stadler
#'
#' @export
#'
#' @importFrom DelayedArray rowsum
#' @importFrom S4Vectors metadata
#' @importFrom SummarizedExperiment assays rowData SummarizedExperiment colData
#' @importFrom dplyr across group_by summarize
#' @importFrom stats setNames
#'
#' @rdname collapseMutantsBySimilarity
#' 
collapseMutants <- function(se, nameCol) {
    .assertVector(x = se, type = "SummarizedExperiment")
    .assertScalar(x = nameCol, type = "character",
                  validValues = colnames(SummarizedExperiment::rowData(se)))
    
    ## Collapse assays
    ## Matrix.utils has been removed from CRAN
    # aList <- lapply(SummarizedExperiment::assays(se), function(a) {
    #     Matrix.utils::aggregate.Matrix(
    #         x = a,
    #         groupings = factor(SummarizedExperiment::rowData(se)[[nameCol]]),
    #         fun = "colSums")
    # })
    aList <- lapply(SummarizedExperiment::assays(se), function(a) {
        DelayedArray::rowsum(
            x = a,
            group = factor(SummarizedExperiment::rowData(se)[[nameCol]])
        )
    })
    
    ## Collapse rowData - simple columns
    rd <- mergeValues(SummarizedExperiment::rowData(se)[[nameCol]],
                      SummarizedExperiment::rowData(se)$sequence) %>%
        stats::setNames(c(nameCol, "sequence"))
    for (v in setdiff(
        intersect(c("mutantName", "mutantNameBase", "mutantNameBaseHGVS",
                    "mutantNameCodon", "mutantNameAA", "mutantNameAAHGVS",
                    "sequenceAA", "mutationTypes",
                    "nbrMutBases", "nbrMutCodons", "nbrMutAAs", "varLengths"),
                  colnames(SummarizedExperiment::rowData(se))),
        nameCol)) {
        tmp <- mergeValues(SummarizedExperiment::rowData(se)[[nameCol]],
                           SummarizedExperiment::rowData(se)[[v]])
        rd[[v]] <- tmp$valueColl[match(rd[[nameCol]], tmp$mutantNameColl)]
    }
    rd0 <- as.data.frame(SummarizedExperiment::rowData(se)) %>%
        dplyr::group_by(.data[[nameCol]]) %>%
        dplyr::summarize(
            dplyr::across(c("minNbrMutBases", "minNbrMutCodons",
                            "minNbrMutAAs"),
                          function(x) min(x)),
            dplyr::across(c("maxNbrMutBases", "maxNbrMutCodons",
                            "maxNbrMutAAs"),
                          function(x) max(x)))
    rd <- rd %>% 
        dplyr::full_join(rd0, by = nameCol)
    
    rd <- S4Vectors::DataFrame(rd)
    rownames(rd) <- rd[[nameCol]]
    
    ## Create SummarizedExperiment
    rn1 <- rownames(aList[[1]])
    cn1 <- colnames(aList[[1]])
    for (i in seq_along(aList)) {
        stopifnot(all(rownames(aList[[i]]) == rn1))
        stopifnot(all(colnames(aList[[i]]) == cn1))
    }
    stopifnot(cn1 == rownames(SummarizedExperiment::colData(se)))
    SummarizedExperiment::SummarizedExperiment(
        assays = aList,
        colData = SummarizedExperiment::colData(se),
        metadata = S4Vectors::metadata(se),
        rowData = rd[rn1, ]
    )
}
fmicompbio/mutscan documentation built on Oct. 24, 2024, 2:41 p.m.