R/assign.R
In CAGEfightR: Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

#### Helpers ####

checkGenomes <- function(x, y){
    tmp <- checkCompatibleSeqinfo(x,  y)
    methods::is(tmp, "Seqinfo")
}

#### txType ####

#' Annotate ranges with transcript type.
#'
#' Annotate a set of ranges in a GRanges object with transcript type (txType)
#' based on their genic context. Transcripts are obtained from a TxDb object,
#' but can alternatively be specified manually as a GRangesList.
#'
#' @param object GRanges or RangedSummarizedExperiment: Ranges to be annotated.
#' @param txModels TxDb or GRangesList: Transcript models via a TxDb, or
#'   manually specified as a GRangesList.
#' @param outputColumn character: Name of column to hold txType values.
#' @param swap character or NULL: If not NULL, use another set of ranges
#'   contained in object to calculate overlaps, for example peaks in the thick
#'   column.
#' @param tssUpstream integer: Distance to extend annotated promoter upstream.
#' @param tssDownstream integer: Distance to extend annotated promoter
#'   downstream.
#' @param proximalUpstream integer: Maximum distance upstream of promoter to be
#'   considered proximal.
#' @param detailedAntisense logical: Wether to mirror all txType categories in
#'   the antisense direction (TRUE) or lump them all together (FALSE).
#' @param noOverlap character: In case categories are manually supplied with as
#'   a GRangesList, what to call regions with no overlap.
#' @param ... additional arguments passed to methods.
#'
#' @return object with txType added as factor column in rowData (or mcols)
#'
#' @family Annotation functions
#' @export
#' @examples
#' \dontrun{
#' data(exampleUnidirectional)
#'
#' # Obtain transcript models from a TxDb-object:
#' library(TxDb.Mmusculus.UCSC.mm9.knownGene)
#' txdb <- TxDb.Mmusculus.UCSC.mm9.knownGene
#'
#' # Assign txIDs
#' assignTxType(exampleUnidirectional,
#'              txModels=txdb)
#'
#' # Assign txIDs using only TC peaks:
#' exampleUnidirectional <- assignTxType(exampleUnidirectional,
#'                                       txModels=txdb,
#'                                       swap='thick')
#'
#' # The 'promoter' and 'proximal' category boundaries can be changed:
#' assignTxType(exampleUnidirectional,
#'              txModels=txdb,
#'              swap='thick',
#'              tssUpstream=50,
#'              tssDownstream=50,
#'              proximalUpstream=100)
#'
#' # Annotation using complete antisense categories:
#' exampleUnidirectional <- assignTxType(exampleUnidirectional,
#'                                     txModels=txdb,
#'                                     outputColumn='txTypeExtended',
#'                                     swap='thick',
#'                                     detailedAntisense=TRUE)
#'
#' # The output is always a factor added as a column:
#' summary(rowRanges(exampleUnidirectional)$txType)
#' summary(rowRanges(exampleUnidirectional)$txTypeExtended)
#'
#' # To avoid using a TxDb-object, a GRangesList can be supplied:
#' custom_hierarchy <- GRangesList(promoters=granges(promoters(txdb)),
#'                                 exons=granges(exons(txdb)))
#' assignTxType(exampleUnidirectional,
#'              txModels=custom_hierarchy,
#'              outputColumn='customType',
#'              swap='thick',
#'              noOverlap = 'intergenic')
#' }
setGeneric("assignTxType", function(object, txModels, ...) {
    standardGeneric("assignTxType")
})

#' @rdname assignTxType
setMethod("assignTxType", signature(object = "GenomicRanges", txModels = "GenomicRangesList"),
    function(object, txModels, outputColumn = "txType", swap = NULL, noOverlap = "intergenic") {
        # Pre-checks
        assert_that(!is.null(names(txModels)),
                    is.string(outputColumn),
                    is.string(noOverlap),
                    checkGenomes(object, txModels))
                    #identical(seqlengths(object), seqlengths(txModels)))

        # Warnings
        if (outputColumn %in% colnames(mcols(object))) {
            warning("object already has a column named ",
                    outputColumn,
                    " in mcols: It will be overwritten!")
        }

        # Find overlaps
        if (is.null(swap)) {
            message("Finding hierachical overlaps...")
            hits <- findOverlaps(object, txModels)
        } else {
            message("Finding hierachical overlaps with swapped ranges...")
            hits <- findOverlaps(swapRanges(object, inputColumn = swap), txModels)
        }

        # Choose highest in hierachy
        hits <- breakTies(hits, method = "first")

        # Extract corresponding categories
        hits <- methods::as(hits, "List")
        hits <- extractList(names(txModels), hits)
        hits <- as.character(hits)

        # Replace NAs and format as factor
        hits <- ifelse(is.na(hits), noOverlap, hits)
        hits <- factor(hits, levels = c(names(txModels), noOverlap))

        # Append to object
        mcols(object)[, outputColumn] <- hits

        # Summarise annotation
        s <- as.data.frame(table(hits))
        colnames(s) <- c("txType", "count")
        s$percentage <- round((s$count / sum(s$count)) * 100, digits = 1)
        s <- paste(utils::capture.output(print(s)), collapse = "\n")
        message("### Overlap summary: ###")
        message(s)

        # Return
        object
    })

#' @rdname assignTxType
setMethod("assignTxType",
          signature(object = "RangedSummarizedExperiment",
                    txModels = "GenomicRangesList"),
    function(object, txModels, ...) {
        rowRanges(object) <- assignTxType(rowRanges(object),
                                          txModels = txModels,
            ...)

        # Return
        object
    })

#' @rdname assignTxType
setMethod("assignTxType", signature(object = "GenomicRanges", txModels = "TxDb"),
    function(object, txModels, outputColumn = "txType", swap = NULL,
             tssUpstream = 100, tssDownstream = 100, proximalUpstream = 1000,
             detailedAntisense = FALSE) {
        # Pre-checks
        assert_that(is.count(tssUpstream),
                    is.count(tssDownstream),
                    is.count(proximalUpstream),
                    is.flag(detailedAntisense),
                    checkGenomes(object, txModels))

        # Hierachy
        hierachy <- utilsSimplifyTxDb(object=txModels,
                                      tssUpstream=tssUpstream,
                                      tssDownstream=tssDownstream,
                                      proximalUpstream=proximalUpstream,
                                      detailedAntisense=detailedAntisense)

        # Build sense hierachy
        # hierachy <- GRangesList(promoter = granges(trim(promoters(txModels, upstream = tssUpstream,
        #     downstream = tssDownstream))), proximal = granges(trim(promoters(txModels,
        #     upstream = proximalUpstream, downstream = 0))), fiveUTR = granges(unlist(fiveUTRsByTranscript(txModels))),
        #     threeUTR = granges(unlist(threeUTRsByTranscript(txModels))), CDS = granges(cds(txModels)),
        #     exon = granges(exons(txModels)), intron = granges(unlist(intronsByTranscript(txModels))))
        #
        # Build sense hierachy message('Extracting txType categories...') hierachy <-
        # List(promoter=trim(promoters(txModels, upstream=tssUpstream,
        # downstream=tssDownstream)), proximal=trim(promoters(txModels,
        # upstream=proximalUpstream, downstream=0)),
        # fiveUTR=fiveUTRsByTranscript(txModels),
        # threeUTR=threeUTRsByTranscript(txModels), CDS=cds(txModels),
        # exon=exons(txModels), intron=intronsByTranscript(txModels)) # Coerce to
        # GRangesList #message('Coercing to GRangesList...') hierachy <- lapply(hierachy,
        # unlist) hierachy <- lapply(hierachy, granges) hierachy <- GRangesList(hierachy)

        # Build antisense hierachy message('Adding antisense categories...')
        # if (detailedAntisense) {
        #     antisense <- invertStrand(hierachy)
        #     names(antisense) <- paste0("antisense_", names(antisense))
        #     hierachy <- c(hierachy, antisense)
        # } else if (!detailedAntisense) {
        #     antisense <- invertStrand(granges(transcripts(txModels)))
        #     hierachy$antisense <- antisense
        # } else {
        #     stop("detailedAntisense must be either TRUE/FALSE!")
        # }
        # rm(antisense)

        # Overlap
        object <- assignTxType(object = object,
                               txModels = hierachy,
                               outputColumn = outputColumn,
                               swap = swap,
                               noOverlap = "intergenic")

        # Return
        object
    })

#' @rdname assignTxType
setMethod("assignTxType", signature(object = "RangedSummarizedExperiment", txModels = "TxDb"),
    function(object, txModels, ...) {
        rowRanges(object) <- assignTxType(rowRanges(object), txModels = txModels,
            ...)

        # Return
        object
    })

#### geneID ####

#' Annotate ranges with gene ID.
#'
#' Annotate a set of ranges in a GRanges object with gene IDs (i.e. Entrez Gene
#' Identifiers) based on their genic context. Features overlapping multiple
#' genes are resolved by distance to the nearest TSS. Genes are obtained from a
#' TxDb object, or can manually supplied as a GRanges.
#'
#' @param object GRanges or RangedSummarizedExperiment: Ranges to be annotated.
#' @param geneModels TxDb or GRanges: Gene models via a TxDb, or manually
#'   specified as a GRangesList.
#' @param outputColumn character: Name of column to hold geneID values.
#' @param swap character or NULL: If not NULL, use another set of ranges
#'   contained in object to calculate overlaps, for example peaks in the thick
#'   column.
#' @param upstream integer: Distance to extend annotated promoter upstream.
#' @param downstream integer: Distance to extend annotated promoter downstream.
#' @param ... additional arguments passed to methods.
#'
#' @return object with geneID added as a column in rowData (or mcols).
#'
#' @family Annotation functions
#' @export
#' @examples
#' data(exampleUnidirectional)
#'
#' # Obtain gene models from a TxDb-object:
#' library(TxDb.Mmusculus.UCSC.mm9.knownGene)
#' txdb <- TxDb.Mmusculus.UCSC.mm9.knownGene
#'
#' # Assign geneIDs
#' assignGeneID(exampleUnidirectional,
#'              geneModels=txdb,
#'              outputColumn='geneID')
#'
#' # Assign geneIDs using only TC peaks:
#' assignGeneID(exampleUnidirectional,
#'              geneModels=txdb,
#'              outputColumn='geneID',
#'              swap='thick')
setGeneric("assignGeneID", function(object, geneModels, ...) {
    standardGeneric("assignGeneID")
})

#' @rdname assignGeneID
setMethod("assignGeneID", signature(object = "GenomicRanges", geneModels = "GenomicRanges"),
    function(object, geneModels, outputColumn = "geneID", swap = NULL, upstream = 1000,
        downstream = 100) {
        # Pre-checks
        assert_that(!is.null(names(geneModels)),
                    is.string(outputColumn),
                    is.number(upstream),
                    is.number(downstream),
                    checkGenomes(object, geneModels))
                    #identical(seqlengths(object), seqlengths(geneModels)))

        # Warnings
        if (outputColumn %in% colnames(mcols(object))) {
            warning("object already has a column named ",
                    outputColumn,
                    " in mcols: It will be overwritten!")
        }

        # Extract anchor points
        message("Overlapping while taking distance to nearest TSS into account...")
        extendedGeneModels <- punion(geneModels,
                                     promoters(geneModels,
                                               upstream = upstream,
                                               downstream = downstream))
        extendedGeneModels <- trim(extendedGeneModels)
        TSSs <- resize(geneModels, width = 1, fix = "start")

        # Find overlaps
        if (is.null(swap)) {
            message("Finding hierachical overlaps...")
            hits <- findOverlaps(object, extendedGeneModels)
        } else {
            message("Finding hierachical overlaps with swapped ranges...")
            hits <- findOverlaps(swapRanges(object, inputColumn = swap),
                                 extendedGeneModels)
        }

        # Calculate distances to TSSs
        mcols(hits)$distance <- distance(Pairs(object, TSSs, hits = hits))
        rm(TSSs, extendedGeneModels)

        # Resolve by distance to nearest TSS by split apply, THIS CAN BE UPDATED IN NEXT
        # VERSION OF S4Vectors!
        hits <- hits[which.min(splitAsList(mcols(hits)$distance, queryHits(hits)),
            global = TRUE)]

        # Extract ids
        hits <- methods::as(hits, "List")
        hits <- extractList(names(geneModels), hits)
        hits <- as.character(hits)
        stopifnot(length(hits) == length(object))

        # Append to object
        mcols(object)[, outputColumn] <- hits

        # Return
        message("### Overlap Summary: ###")
        message("Features overlapping genes: ",
                round(mean(!is.na(hits)) * 100, digits = 2),
                " %")
        message("Number of unique genes: ", length(unique(na.omit(hits))))

        # Return
        object
    })

#' @rdname assignGeneID
setMethod("assignGeneID", signature(object = "RangedSummarizedExperiment", geneModels = "GenomicRanges"),
    function(object, geneModels, ...) {
        rowRanges(object) <- assignGeneID(rowRanges(object),
                                          geneModels = geneModels,
                                          ...)

        # Return
        object
    })

#' @rdname assignGeneID
setMethod("assignGeneID", signature(object = "GenomicRanges", geneModels = "TxDb"),
    function(object, geneModels, outputColumn = "geneID", swap = NULL, upstream = 1000,
        downstream = 100) {
        # Pre-checks
        assert_that(is.string(outputColumn),
                    is.number(upstream),
                    is.number(downstream),
                    checkGenomes(object, geneModels))
            #identical(seqlengths(object), seqlengths(geneModels)))

        # Extract anchor points
        message("Extracting genes...")
        as_gr <- genes(geneModels)

        # Overlap
        object <- assignGeneID(object,
                               geneModels = as_gr,
                               outputColumn = outputColumn,
                               swap = swap,
                               upstream = upstream,
                               downstream = downstream)

        # Return
        object
    })

#' @rdname assignGeneID
setMethod("assignGeneID", signature(object = "RangedSummarizedExperiment", geneModels = "TxDb"),
    function(object, geneModels, ...) {
        rowRanges(object) <- assignGeneID(rowRanges(object),
                                          geneModels = geneModels,
                                          ...)

        # Return
        object
    })

#### txID ####

#' Annotate ranges with transcript ID.
#'
#' Annotate a set of ranges in a GRanges object with transcript IDs based on
#' their genic context. All overlapping transcripts are returned. Transcripts
#' are obtained from a TxDb object, or can manually supplied as a GRanges.
#'
#' @param object GRanges or RangedSummarizedExperiment: Ranges to be annotated.
#' @param txModels TxDb or GRanges: Transcript models via a TxDb, or manually
#'   specified as a GRanges.
#' @param outputColumn character: Name of column to hold txID values.
#' @param swap character or NULL: If not NULL, use another set of ranges
#'   contained in object to calculate overlaps, for example peaks in the thick
#'   column.
#' @param upstream integer: Distance to extend annotated promoter upstream.
#' @param downstream integer: Distance to extend annotated promoter downstream.
#' @param ... additional arguments passed to methods.
#'
#' @return object with txID added as a column in rowData (or mcols)
#'
#' @family Annotation functions
#' @export
#' @examples
#' data(exampleUnidirectional)
#'
#' # Obtain transcript models from a TxDb-object:
#' library(TxDb.Mmusculus.UCSC.mm9.knownGene)
#' txdb <- TxDb.Mmusculus.UCSC.mm9.knownGene
#'
#' # Assign txIDs
#' assignTxID(exampleUnidirectional,
#'            txModels=txdb,
#'            outputColumn='geneID')
#'
#' # Assign txIDs using only TC peaks:
#' assignTxID(exampleUnidirectional,
#'              txModels=txdb,
#'              outputColumn='geneID',
#'              swap='thick')
setGeneric("assignTxID", function(object, txModels, ...) {
    standardGeneric("assignTxID")
})

#' @rdname assignTxID
setMethod("assignTxID", signature(object = "GenomicRanges", txModels = "GenomicRanges"),
    function(object, txModels, outputColumn = "txID", swap = NULL) {
        # Pre-checks
        assert_that(!is.null(names(txModels)),
                    is.string(outputColumn),
                    checkGenomes(object, txModels))
                    #identical(seqlengths(object), seqlengths(txModels)))

        # Warnings
        if (outputColumn %in% colnames(mcols(object))) {
            warning("object already has a column named ",
                    outputColumn,
                    " in mcols: It will be overwritten!")
        }

        # Find overlaps
        if (is.null(swap)) {
            message("Finding hierachical overlaps...")
            hits <- findOverlaps(object, txModels)
        } else {
            message("Finding hierachical overlaps with swapped ranges...")
            hits <- findOverlaps(swapRanges(object, inputColumn = swap),
                                 txModels)
        }

        # Count unique transcrips for later
        nTxs <- length(unique(to(hits)))

        # Extract names
        hits <- methods::as(hits, "List")
        hits <- extractList(names(txModels), hits)
        hits <- paste(hits, collapse = ";")
        hits <- ifelse(hits == "", NA, hits)

        # Checks
        stopifnot(length(hits) == length(object), is.character(hits))

        # Append to object
        mcols(object)[, outputColumn] <- hits

        # Return
        message("### Overlap Summary: ###")
        message("Features overlapping transcripts: ",
                round(mean(!is.na(hits)) * 100, digits = 2),
                " %")
        message("Number of unique transcripts: ", nTxs)

        # Return
        object
    })

#' @rdname assignTxID
setMethod("assignTxID", signature(object = "RangedSummarizedExperiment", txModels = "GenomicRanges"),
    function(object, txModels, ...) {
        rowRanges(object) <- assignTxID(rowRanges(object),
                                        txModels = txModels,
                                        ...)

        # Return
        object
    })

#' @rdname assignTxID
setMethod("assignTxID", signature(object = "GenomicRanges", txModels = "TxDb"),
          function(object, txModels, outputColumn = "txID", swap = NULL, upstream = 1000, downstream = 0) {
    # Pre-checks
    assert_that(is.string(outputColumn),
                is.number(upstream),
                upstream >= 0,
                is.number(downstream),
                downstream >= 0,
                checkGenomes(object, txModels))
                #identical(seqlengths(object), seqlengths(txModels)))

    # Extract anchor points
    message("Extracting transcripts...")
    txs <- transcripts(txModels, columns = "tx_name")
    names(txs) <- txs$tx_name
    txs <- punion(txs, promoters(txs, upstream = upstream, downstream = downstream))
    txs <- trim(txs)

    # Overlap
    object <- assignTxID(object, txModels = txs, outputColumn = outputColumn)

    # Return
    object
})

#' @rdname assignTxID
setMethod("assignTxID", signature(object = "RangedSummarizedExperiment", txModels = "TxDb"),
    function(object, txModels, ...) {
        rowRanges(object) <- assignTxID(rowRanges(object), txModels = txModels, ...)

        # Return
        object
    })

#### NAs ####

#' Annotate ranges with missing IDs.
#'
#' This function can relabel ranges with missing IDs (i.e. returned by
#' assignTxID and assignGeneID), in case they need to be retained for further
#' analysis.
#'
#' @param object character, GRanges or RangedSummarizedExperiment: IDs to have
#'   NAs replaces with new IDs.
#' @param outputColumn character: Name of column to hold txID values.
#' @param prefix character: New name to assign to ranges with missing IDs, in
#'   the style prefix1, prefix2, etc.
#' @param ... additional arguments passed to methods.
#'
#' @return object with NAs replaced in outputColumn
#'
#' @family Annotation functions
#' @export
#' @examples
#' data(exampleUnidirectional)
#'
#' # Obtain gene models from a TxDb-object:
#' library(TxDb.Mmusculus.UCSC.mm9.knownGene)
#' txdb <- TxDb.Mmusculus.UCSC.mm9.knownGene
#'
#' # Assign geneIDs using only TC peaks:
#' exampleUnidirectional <- assignGeneID(exampleUnidirectional,
#'                                       geneModels=txdb,
#'                                       outputColumn='geneID',
#'                                       swap='thick')
#'
#' # Replace NAs with 'Novel'
#' assignMissingID(exampleUnidirectional)
#'
#' # Replace NAs with 'NovelTSS'
#' assignMissingID(exampleUnidirectional, prefix = 'NovelTSS')
setGeneric("assignMissingID", function(object, ...) {
    standardGeneric("assignMissingID")
})

#' @rdname assignMissingID
setMethod("assignMissingID", signature(object = "character"), function(object, prefix = "Novel") {
    # Pre-checks
    assert_that(is.string(prefix))

    missingIDs <- is.na(object)
    totalMissing <- sum(missingIDs)
    object[missingIDs] <- paste0(prefix, seq_len(totalMissing))

    message("Assigned ", totalMissing, " missing IDs")
    object
})

#' @rdname assignMissingID
setMethod("assignMissingID", signature(object = "GenomicRanges"), function(object,
    outputColumn = "geneID", prefix = "Novel") {
    # Pre-checks
    assert_that(is.string(outputColumn),
                outputColumn %in% colnames(mcols(object)))

    # Replace
    mcols(object)[, outputColumn] <- assignMissingID(mcols(object)[, outputColumn],
        prefix = prefix)

    # Return
    object
})

#' @rdname assignMissingID
setMethod("assignMissingID", signature(object = "RangedSummarizedExperiment"), function(object,
    outputColumn = "geneID", prefix = "Novel") {
    rowRanges(object) <- assignMissingID(rowRanges(object), outputColumn = outputColumn,
        prefix = prefix)

    # Return
    object
})
Any scripts or data that you put into this service are public.
CAGEfightR documentation built on Nov. 8, 2020, 5:42 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
CAGEfightR
Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

R/assign.R
In CAGEfightR: Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

Defines functions checkGenomes

Try the CAGEfightR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CAGEfightR Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

R/assign.R In CAGEfightR: Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

Defines functions checkGenomes

Try the CAGEfightR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CAGEfightR
Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor

R/assign.R
In CAGEfightR: Analysis of Cap Analysis of Gene Expression (CAGE) data using Bioconductor