R/calc.R
In CAGEfightR: Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

Documented in calcComposition calcPooled calcSupport calcTotalTags calcTPM

#' Calculate support of CAGE data.
#'
#' Calculate the number of samples expression a feature above a certain level.
#' This number is refered to as the 'support'.
#'
#' @param object RangedSummarizedExperiment: CAGE data quantified at CTSS,
#'   cluster or gene-level.
#' @param inputAssay character: Name of assay holding input expression values.
#' @param outputColumn character: Name of column in rowRanges to hold support
#'   values.
#' @param unexpressed numeric: Support will be calculated based on features
#'   larger than this cutoff.
#'
#' @importClassesFrom Matrix dgCMatrix
#' @return object with support added as a column in rowRanges.
#' @family Calculation functions
#' @export
#' @examples
#' data(exampleUnidirectional)
#'
#' # Count samples with at at least a single tags
#' exampleUnidirectional <- calcSupport(exampleUnidirectional,
#'                                      inputAssay='counts',
#'                                      unexpressed=0)
#'
#' # Count number of samples with more than 1 TPM and save as a new column.
#' exampleUnidirectional <- calcTPM(exampleUnidirectional,
#'                                  totalTags = 'totalTags')
#' exampleUnidirectional <- calcSupport(exampleUnidirectional,
#'                                      inputAssay='TPM',
#'                                      unexpressed=1,
#'                                      outputColumn='TPMsupport')
calcSupport <- function(object, inputAssay = "counts", outputColumn = "support",
    unexpressed = 0) {
    # Prechecks
    assert_that(methods::is(object, "SummarizedExperiment"), is.string(inputAssay),
        inputAssay %in% assayNames(object), is.string(outputColumn), is.number(unexpressed))

    if (outputColumn %in% colnames(rowData(object))) {
        warning("object already has a column named ", outputColumn, " in rowData: It will be overwritten!")
    }

    # Calculate support
    rowData(object)[, outputColumn] <- as.integer(Matrix::rowSums(assay(object, inputAssay) >
        unexpressed))

    # Return
    object
}

#' Calculate the total number of CAGE tags across samples.
#'
#' For each CAGE library, calculate the total number of tags.
#'
#' @param object RangedSummarizedExperiment: CAGE data quantified at CTSS,
#'   cluster or gene-level.
#' @param inputAssay character: Name of assay holding input expression values.
#' @param outputColumn character: Name of column in colData to hold number of
#'   total tags.
#'
#' @importClassesFrom Matrix dgCMatrix
#' @return object with total tags per library added as a column in
#'   colData.
#' @family Calculation functions
#' @export
#' @examples
#' data(exampleUnidirectional)
#' calcTotalTags(exampleUnidirectional)
calcTotalTags <- function(object, inputAssay = "counts", outputColumn = "totalTags") {
    # Prechecks
    assert_that(methods::is(object, "RangedSummarizedExperiment"),
    						not_empty(object),
    						inputAssay %in% assayNames(object),
    						is.string(inputAssay),
    						is.string(outputColumn))

    if (outputColumn %in% colnames(colData(object))) {
        warning("object already has a column named ", outputColumn, " in colData: It will be overwritten!")
    }

    # Calculate colSums
    colData(object)[, outputColumn] <- Matrix::colSums(assay(object, inputAssay))

    # Return
    object
}

#' Calculate CAGE Tags-Per-Million (TPM)
#'
#' Normalize CAGE-tag counts into TPM values.
#'
#' @param object RangedSummarizedExperiment: CAGE data quantified at CTSS,
#'   cluster or gene-level.
#' @param inputAssay character: Name of assay holding input expression values.
#' @param outputAssay character: Name of assay to hold TPM values.
#' @param totalTags character or NULL: Column in colData holding the total
#'   number of tags for each samples. If NULL, this will be calculated using
#'   calcTotalTags.
#' @param outputColumn character: Name of column in colData to hold number of
#'   total tags, only used if totalTags is NULL.
#'
#' @return object with TPM-values added as a new assay. If totalTags is NULL,
#'   total tags added as a column in colData.
#' @family Calculation functions
#'
#' @importClassesFrom Matrix dgCMatrix
#' @export
#' @examples
#' data(exampleUnidirectional)
#'
#' # Calculate TPM:
#' calcTPM(exampleUnidirectional)
#'
#' # Use pre-calculated total number of tags:
#' calcTPM(exampleUnidirectional,
#'         outputAssay='TPMsupplied',
#'         totalTags='totalTags')
calcTPM <- function(object, inputAssay = "counts", outputAssay = "TPM", totalTags = NULL,
    outputColumn = "totalTags") {
    # Prechecks
    assert_that(methods::is(object, "RangedSummarizedExperiment"),
    						not_empty(object),
    						is.string(inputAssay),
    						inputAssay %in% assayNames(object),
    						is.string(outputAssay))

    if (is.null(totalTags)) {
        message("Calculating library sizes...")
        object <- calcTotalTags(object = object, inputAssay = inputAssay, outputColumn = outputColumn)
        totalTags <- outputColumn
    } else if (is.string(totalTags)) {
        message("Using supplied library sizes...")
        assert_that(totalTags %in% colnames(colData(object)), is.numeric(colData(object)[,
            totalTags]), all(colData(object)[, totalTags] >= 0))
    } else {
        stop("totalTags should NULL or a column in colData!")
    }

    if (outputAssay %in% assayNames(object)) {
        warning("object already has an assay named ", outputAssay, ": It will be overwritten!")
    }

    # Scale counts to TPM
    message("Calculating TPM...")
    assay(object, outputAssay) <- Matrix::t(Matrix::t(assay(object, inputAssay))/(colData(object)[,
        totalTags]/1000000))

    # Return
    object
}

#' Calculate pooled expression across all samples.
#'
#' Sum expression of features across all samples to obtain a 'pooled' signal.
#'
#' @param object RangedSummarizedExperiment: CAGE data quantified at CTSS,
#'   cluster or gene-level.
#' @param inputAssay character: Name of assay holding input expression values.
#' @param outputColumn character: Name of column in rowRanges to hold pooled
#'   expression.
#'
#' @importClassesFrom Matrix dgCMatrix
#' @return object with pooled expression added as a column
#'   in rowRanges.
#' @family Calculation functions
#' @export
#' @examples
#' data(exampleCTSSs)
#'
#' # Calculate TPM using supplied total number of tags:
#' exampleCTSSs <- calcTPM(exampleCTSSs, totalTags='totalTags')
#'
#' # Sum TPM values over samples:
#' calcPooled(exampleCTSSs)
calcPooled <- function(object, inputAssay = "TPM", outputColumn = "score") {
    # Prechecks
    assert_that(methods::is(object, "RangedSummarizedExperiment"),
    						not_empty(object),
    						is.string(inputAssay),
    						inputAssay %in% assayNames(object),
    						is.string(outputColumn))

    if (outputColumn %in% colnames(rowData(object))) {
        warning("object already has a column named ", outputColumn, " in rowData: It will be overwritten!")
    }

    # Calculate colSums
    rowData(object)[, outputColumn] <- Matrix::rowSums(assay(object, inputAssay))

    # Return
    object
}

#' Calculate composition of CAGE data.
#'
#' For every feature, count in how many samples it is expressed above a certain
#' fraction (e.g. 10 percent) within a grouping, usually genes. This count is
#' refered to as the 'composition' value.
#'
#' @param object RangedSummarizedExperiment: CAGE data quantified at CTSS,
#'   cluster or gene-level.
#' @param inputAssay character: Name of assay holding input expression values.
#' @param outputColumn character: Name of column in rowRanges to hold
#'   composition values.
#' @param unexpressed numeric: Composition will be calculated based on features
#'   larger than this cutoff.
#' @param genes character: Name of column in rowData holding genes (NAs are not
#'   currently allowed.)
#'
#' @return object with composition added as a column in rowData.
#' @family Calculation functions
#' @export
#' @examples
#' data(exampleUnidirectional)
#'
#' # Annotate clusters with geneIDs:
#' library(TxDb.Mmusculus.UCSC.mm9.knownGene)
#' txdb <- TxDb.Mmusculus.UCSC.mm9.knownGene
#' exampleUnidirectional <- assignGeneID(exampleUnidirectional,
#'                                       geneModels=txdb,
#'                                       outputColumn='geneID',
#'                                       swap='thick')
#'
#' # Calculate composition values:
#' exampleUnidirectional <- subset(exampleUnidirectional, !is.na(geneID))
#' calcComposition(exampleUnidirectional)
#'
#' # Use a lower threshold
#' calcComposition(exampleUnidirectional,
#'                 unexpressed=0.05,
#'                 outputColumn='lenientComposition')
calcComposition <- function(object, inputAssay = "counts", outputColumn = "composition",
    unexpressed = 0.1, genes = "geneID") {
    assert_that(methods::is(object, "SummarizedExperiment"), is.string(inputAssay),
        inputAssay %in% assayNames(object), is.string(outputColumn), is.number(unexpressed),
        unexpressed >= 0 & unexpressed <= 1, is.string(genes), is.element(genes,
            colnames(rowData(object))), is.character(rowData(object)[, genes]), noNA(rowData(object)[,
            genes]))

    if (outputColumn %in% colnames(rowData(object))) {
        warning("object already has a column named ", outputColumn, " in rowData: It will be overwritten!")
    }

    # Extract gene-wise matrices
    L <- splitAsList(x = assay(object, inputAssay), f = rowData(object)[, genes],
        drop = FALSE)

    # Scale and find high compositions Note: Scale always coerces to a base::matrix!
    L <- endoapply(L, function(x) scale(x, center = FALSE, scale = Matrix::colSums(x)) >
        unexpressed)

    # Calculate count high compositions
    L <- endoapply(L, rowSums, na.rm = TRUE)

    # Back to integer vector
    L <- unsplit(L, f = rowData(object)[, genes], drop = FALSE)
    L <- as.integer(L)

    # Post-checks
    stopifnot(length(L) == nrow(object), noNA(L))

    # Append
    rowData(object)[, outputColumn] <- L

    # Return
    object
}

Any scripts or data that you put into this service are public.

CAGEfightR documentation built on Nov. 8, 2020, 5:42 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

CAGEfightR
Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

R/calc.R
In CAGEfightR: Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

Defines functions calcComposition calcPooled calcTPM calcTotalTags calcSupport

Documented in calcComposition calcPooled calcSupport calcTotalTags calcTPM

Try the CAGEfightR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CAGEfightR Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

R/calc.R In CAGEfightR: Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

Defines functions calcComposition calcPooled calcTPM calcTotalTags calcSupport

Documented in calcComposition calcPooled calcSupport calcTotalTags calcTPM

Try the CAGEfightR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CAGEfightR
Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

R/calc.R
In CAGEfightR: Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor