R/trimTo.R

Defines functions asymmetricPercentiles symmetricPercentiles

symmetricPercentiles <- function(r, prop) {
    # Convert to normal vector
    r <- as.vector(r)

    # Calculate proportions
    r_sum <- sum(r)
    r_prop <- r/r_sum

    # Cumulate sum from either side
    f_cum <- cumsum(r_prop)
    r_cum <- rev(cumsum(rev(r_prop)))

    # Trim both sides
    half_prop <- prop/2
    left <- Position(function(x) x >= half_prop, f_cum, right = FALSE, nomatch = 1)
    right <- Position(function(x) x >= half_prop, r_cum, right = TRUE, nomatch = length(r))

    # Return range
    c(left, right)
}

asymmetricPercentiles <- function(r, prop) {
    # Convert to normal vector
    r <- as.vector(r)

    # Calculate proportions
    r_sum <- sum(r)
    r_prop <- r/r_sum

    # Sort be minimum cumsum from either side
    f_cum <- cumsum(r_prop)
    r_cum <- rev(cumsum(rev(r_prop)))
    min_cum <- pmin(f_cum, r_cum)
    o_cum <- order(min_cum)

    # Order original proportions and cut
    r_ord <- cumsum(r_prop[o_cum])
    left <- Position(function(x) x > prop, r_ord, right = FALSE, nomatch = 1)
    o_out <- o_cum[left:length(r_ord)]

    # Return range
    range(o_out)
}

#' Trim width of TCs to expression percentiles
#'
#' Given a set of TCs and genome-wide CTSS coverage, reduce the width of TC
#' until a certain amount of expression has been removed.
#'
#' @param object GenomicRanges or RangedSummarizedExperiment: TCs to be trimmed.
#' @param pooled GenomicRanges or RangedSummarizedExperiment: CTSS coverage.
#' @param percentile numeric: Fraction of expression to remove from TCs.
#' @param symmetric logical: Whether to trim the same amount from both edges of
#'   the TC (TRUE) or always trim from the least expressed end (FALSE).
#' @param ... additional arguments passed to methods.
#'
#' @return GRanges with trimmed TCs, including recalculated peaks and scores.
#' @family Clustering functions
#' @family Trimming functions
#' @export
#' @examples
#' data(exampleCTSSs)
#' data(exampleBidirectional)
#'
#' # Calculate pooled CTSSs
#' exampleCTSSs <- calcTPM(exampleCTSSs, totalTags='totalTags')
#' exampleCTSSs <- calcPooled(exampleCTSSs)
#'
#' # Choose a few wide clusters:
#' TCs <- subset(exampleUnidirectional, width >= 100)
#'
#' # Symmetric trimming (same percentage from each side):
#' TCs_sym <- trimToPercentiles(TCs, pooled=exampleCTSSs, symmetric=FALSE)
#'
#' # Assymmetric trimming (always trim from lowest side):
#' TCs_asym <- trimToPercentiles(TCs, pooled=exampleCTSSs, symmetric=TRUE)
#'
#' # Compare the two results sets of widths:
#' summary(width(TCs_sym) - width(TCs_asym))
setGeneric("trimToPercentiles", function(object, pooled, ...) {
    standardGeneric("trimToPercentiles")
})

#' @rdname trimToPercentiles
setMethod("trimToPercentiles", signature(object = "GRanges", pooled = "GRanges"),
    function(object, pooled, percentile = 0.1, symmetric = FALSE) {
        # Pre-checks
        assert_that(checkPooled(pooled),
                    percentile > 0 & percentile <= 1,
                    is.logical(symmetric),
                    identical(seqlengths(object), seqlengths(pooled)))

        # Split by strand
        message("Splitting by strand...")
        covByStrand <- splitPooled(pooled)
        TCsByStrand <- splitByStrand(object)

        # coverage_stranded <- splitByStrand(pooled)
        # coverage_plus <- coverage(coverage_stranded$`+`, weight = "score")
        # coverage_minus <- coverage(coverage_stranded$`-`, weight = "score")
        # tcs_stranded <- splitByStrand(object)
        # rm(coverage_stranded)

        # Convert to IRangesList
        irl_plus <- methods::as(TCsByStrand$`+`,'IRangesList')
        irl_minus <- methods::as(TCsByStrand$`-`,'IRangesList')

        # Tempoary workwaround
        # irl_plus <- split(ranges(tcs_stranded$`+`), seqnames(tcs_stranded$`+`))
        # irl_minus <- split(ranges(tcs_stranded$`-`), seqnames(tcs_stranded$`-`))
        # rm(tcs_stranded)

        # Obtain views
        # views_plus <- Views(coverage_plus, irl_plus)
        # views_minus <- Views(coverage_minus, irl_minus)

        # Views
        views_plus <- Views(covByStrand$`+`, irl_plus)
        views_minus <- Views(covByStrand$`-`, irl_minus)

        # Extract Adjustments
        if (symmetric) {
            message(paste0("Symmetric trimming to percentile: ",
                           percentile * 100, "%"))
            adjust_plus <- viewApply(views_plus, symmetricPercentiles,
                                     prop = percentile, simplify = FALSE)
            adjust_minus <- viewApply(views_minus, symmetricPercentiles,
                                      prop = percentile, simplify = FALSE)
        } else if (!symmetric) {
            message(paste0("Asymmetric trimming to percentile: ",
                           percentile * 100, "%"))
            adjust_plus <- viewApply(views_plus, asymmetricPercentiles,
                                     prop = percentile, simplify = FALSE)
            adjust_minus <- viewApply(views_minus, asymmetricPercentiles,
                                      prop = percentile, simplify = FALSE)
        } else {
            stop("Additional percentile functions not yet implemented!")
        }
        rm(views_plus, views_minus)

        # Adjustments as IntegerLists
        message("Adjusting ranges...")
        left_plus <- methods::as(lapply(adjust_plus, function(x) lapply(x, function(x) x[1])),
            "IntegerList")
        right_plus <- methods::as(lapply(adjust_plus, function(x) lapply(x, function(x) x[2])),
            "IntegerList")
        left_minus <- methods::as(lapply(adjust_minus, function(x) lapply(x, function(x) x[1])),
            "IntegerList")
        right_minus <- methods::as(lapply(adjust_minus, function(x) lapply(x, function(x) x[2])),
            "IntegerList")

        # Narrow ranges
        irl_plus <- narrow(irl_plus, start = left_plus, end = right_plus)
        irl_minus <- narrow(irl_minus, start = left_minus, end = right_minus)
        rm(left_plus, right_plus, left_minus, right_minus)

        # Calculate new stats
        message("Calculating new stats...")
        trimmedTCs <- TCstats(coverage_plus = covByStrand$`+`,
                              coverage_minus = covByStrand$`-`,
                              tcs_plus = irl_plus,
                              tcs_minus = irl_minus)
        rm(covByStrand, irl_plus, irl_minus)

        # Carry over seqinfo and sort
        message("Preparing output...")
        seqinfo(trimmedTCs) <- seqinfo(object)
        trimmedTCs <- sort(trimmedTCs)

        # Print some basic stats
        message("Tag clustering summary:")
        summarizeWidths(trimmedTCs)

        # Return
        trimmedTCs
    })

#' @rdname trimToPercentiles
setMethod("trimToPercentiles", signature(object = "GRanges", pooled = "GPos"),
          function(object, pooled, ...) {
              trimToPercentiles(object, methods::as(pooled, "GRanges"), ...)
          })

#' @rdname trimToPercentiles
setMethod("trimToPercentiles", signature(object = "RangedSummarizedExperiment", pooled = "GenomicRanges"),
    function(object, pooled, ...) {
        trimToPercentiles(rowRanges(object), pooled, ...)
    })

#' @rdname trimToPercentiles
setMethod("trimToPercentiles", signature(object = "GRanges", pooled = "RangedSummarizedExperiment"),
    function(object, pooled, ...) {
        trimToPercentiles(object, rowRanges(pooled), ...)
    })

#' @rdname trimToPercentiles
setMethod("trimToPercentiles", signature(object = "RangedSummarizedExperiment", pooled = "RangedSummarizedExperiment"),
    function(object, pooled, ...) {
        trimToPercentiles(rowRanges(object), rowRanges(pooled), ...)
    })

#' Trim width of TCs by distance from TC peak
#'
#' Trim the width of TCs by distance from the TC peaks.
#'
#' @param object GenomicRanges or RangedSummarizedExperiment: Tag clusters.
#' @param pooled GenomicRanges or RangedSummarizedExperiment: Basepair-wise
#'   pooled CTSS (stored in the score column).
#' @param upstream integer: Maximum upstream distance from TC peak.
#' @param downstream integer: Maximum downstream distance from TC peak.
#' @param ... additional arguments passed to methods.
#'
#' @return data.frame with two columns: threshold and nTCs (number of Tag
#'   Clusters)
#' @family Clustering functions
#' @family Trimming functions
#' @export
#' @examples
#' data(exampleCTSSs)
#' data(exampleBidirectional)
#'
#' # Calculate pooled CTSSs
#' exampleCTSSs <- calcTPM(exampleCTSSs, totalTags='totalTags')
#' exampleCTSSs <- calcPooled(exampleCTSSs)
#'
#' # Choose a few wide clusters:
#' TCs <- subset(exampleUnidirectional, width >= 100)
#'
#' # Trim to +/- 10 bp of TC peak
#' trimToPeak(TCs, pooled=exampleCTSSs, upstream=10, downstream=10)
setGeneric("trimToPeak", function(object, pooled, ...) {
    standardGeneric("trimToPeak")
})

#' @rdname trimToPeak
setMethod("trimToPeak", signature(object = "GRanges", pooled = "GRanges"),
    function(object, pooled, upstream, downstream) {
        # Pre-Checks
        assert_that(checkPeaked(object),
                    checkPooled(pooled),
                    is.count(upstream),
                    is.count(downstream),
                    identical(seqlengths(object), seqlengths(pooled)))

        # Extract peaks
        message("Trimming TCs around peaks...")
        peaks <- swapRanges(object)

        # Expand peaks
        expanded <- promoters(peaks,
                              upstream = upstream,
                              downstream = downstream)

        # Intersect to obtain filtered peaks.
        ranges(peaks) <- pintersect(ranges(object), ranges(expanded))

        # Split by strand
        message("Splitting by strand...")
        covByStrand <- splitPooled(pooled)
        TCsByStrand <- splitByStrand(peaks)

         # coverage_stranded <- splitByStrand(pooled)
        # coverage_plus <- coverage(coverage_stranded$`+`, weight = "score")
        # coverage_minus <- coverage(coverage_stranded$`-`, weight = "score")
        # tcs_stranded <- splitByStrand(peaks)
        # rm(coverage_stranded, peaks)

        # Convert to IRangesList
        irl_plus <- methods::as(TCsByStrand$`+`,'IRangesList')
        irl_minus <- methods::as(TCsByStrand$`-`,'IRangesList')
        rm(TCsByStrand)

        # irl_plus <- split(ranges(tcs_stranded$`+`), seqnames(tcs_stranded$`+`))
        # irl_minus <- split(ranges(tcs_stranded$`-`), seqnames(tcs_stranded$`-`))
        # rm(tcs_stranded)

        # Calculate new stats
        message("Calculating new stats...")
        trimmedTCs <- TCstats(coverage_plus = covByStrand$`+`,
                              coverage_minus = covByStrand$`-`,
                              tcs_plus = irl_plus,
                              tcs_minus = irl_minus)
        rm(covByStrand, irl_plus, irl_minus)

        # Carry over seqinfo and sort
        message("Preparing output...")
        seqinfo(trimmedTCs) <- seqinfo(object)
        trimmedTCs <- sort(trimmedTCs)

        # Print some basic stats
        message("Tag clustering summary:")
        summarizeWidths(trimmedTCs)

        # Return
        trimmedTCs
    })

#' @rdname trimToPeak
setMethod("trimToPeak", signature(object = "GRanges", pooled = "GPos"),
          function(object, pooled, ...) {
              trimToPeak(object, methods::as(pooled, "GRanges"), ...)
          })

#' @rdname trimToPeak
setMethod("trimToPeak", signature(object = "RangedSummarizedExperiment", pooled = "GenomicRanges"),
    function(object, pooled, ...) {
        trimToPeak(rowRanges(object), pooled, ...)
    })

#' @rdname trimToPeak
setMethod("trimToPeak", signature(object = "GRanges", pooled = "RangedSummarizedExperiment"),
    function(object, pooled, ...) {
        trimToPeak(object, rowRanges(pooled), ...)
    })

#' @rdname trimToPeak
setMethod("trimToPeak", signature(object = "RangedSummarizedExperiment", pooled = "RangedSummarizedExperiment"),
    function(object, pooled, ...) {
        trimToPeak(rowRanges(object), rowRanges(pooled), ...)
    })
MalteThodberg/CAGEfightR documentation built on Sept. 11, 2021, 4:42 a.m.