R/aicc_loess.R

Defines functions loessFit .getLoess .localResolution .aiccOpt .minTwo .localMin .numDecimals .loessFitForChr

Documented in loessFit

#' Perform optimized Loess regression for each chromosome
#'
#' Called after the \code{\link{calculateDistance}} step and before
#' \code{\link{prePeak}}.
#'
#' @param mmapprData The \code{\linkS4class{MmapprData}} object to be analyzed.
#'
#' @return A \code{\linkS4class{MmapprData}} object with the \code{$loess} element
#'   of the \code{distance} slot list filled.
#'
#' @examples
#' if (requireNamespace('MMAPPR2data', quietly=TRUE)
#'         & all(Sys.which(c("samtools", "vep")) != "")) {
#'     mmappr_param <- MmapprParam(refFasta = MMAPPR2data::goldenFasta(),
#'                                 wtFiles = MMAPPR2data::exampleWTbam(),
#'                                 mutFiles = MMAPPR2data::exampleMutBam(),
#'                                 species = "danio_rerio",
#'                                 outputFolder = tempOutputFolder())
#' }
#' \dontrun{
#' md <- new('MmapprData', param = mmappr_param)
#' postCalcDistMD <- calculateDistance(md)
#'
#' postLoessMD <- loessFit(postCalcDistMD)
#' }
#' @export
loessFit <- function(mmapprData) {
    loessOptResolution <- mmapprData@param@loessOptResolution
    loessOptCutFactor <- mmapprData@param@loessOptCutFactor

    # each item (chr) of distance list has mutCounts, wtCounts,
    # distanceDf going in
    mmapprData@distance <-
        BiocParallel::bplapply(mmapprData@distance,
                               FUN=.loessFitForChr,
                               loessOptResolution=loessOptResolution,
                               loessOptCutFactor=loessOptCutFactor)
    #.loessFitForChr returns list with mutCounts, wtCounts, loess, aicc

    return(mmapprData)
}


.getLoess <- function(s, pos, euc_dist, ...){
    x <- suppressWarnings(try(loess(euc_dist ~ pos, span=s, degree=1,
                   family="symmetric", ...), silent=TRUE))
    return(x)
}

# gets greater of the two differences to either side of a span in a
# vector of spans
.localResolution <- function(spans, span) {
    stopifnot(is.numeric(spans))
    stopifnot(all(spans >= 0 & spans <= 1))

    spans <- unique(sort(spans))
    index <- which(spans %in% span)
    stopifnot(length(index) == 1)

    result <- max(c(diff(spans)[index-1], diff(spans)[index]), na.rm=TRUE)
    stopifnot(is.numeric(result))
    return(result)
}

#returns a list of aicc and span values as well as the time it took
#define needed functions
.aiccOpt <- function(distanceDf, spans, resolution, cutFactor) {
    aiccValues <- vapply(spans, .aicc, FUN.VALUE=numeric(1),
                         euc_dist=distanceDf$distance, pos=distanceDf$pos)
    if (length(spans) != length(aiccValues))
        stop("AICc values and spans don't match")
    aiccDf <- data.frame(spans, aiccValues)

    #finds two lowest local minima for finer attempt
    minVals <- .minTwo(.localMin(aiccDf$aiccValues))
    # gets first column of dataframe after selecting for rows that
    # match the two lowest localMins
    minSpans <- aiccDf[aiccDf$aiccValues %in% minVals, 1]

    # goes through each minimum and goes deeper if resolution isn't fine enough
    for (minSpan in minSpans) {
        # get resolution at that point, which is greater of differences
        # to either side
        localResolution <- round(.localResolution(spans, minSpan),
                                 digits=.numDecimals(resolution))
        stopifnot(is.numeric(localResolution))
        if (abs(localResolution) > resolution){
            addVector <- ((localResolution * cutFactor) * seq(1, 9))
            newSpans <- c(minSpan - addVector, minSpan + addVector)
            newSpans <- newSpans[newSpans > 0 & newSpans <= 1]
            newSpans <- round(newSpans, digits = .numDecimals(resolution))
            newSpans <- unique(newSpans)
            #recursive call
            aiccDf <- rbind(aiccDf, .aiccOpt(distanceDf, spans=newSpans,
                                             resolution=resolution,
                                             cutFactor=cutFactor))
        }
    }
    return(aiccDf)
}

#returns minimum two elements (useful for .aiccOpt)
.minTwo <- function(x){
    len <- length(x)
    if(len < 2){
        return(x)
    }
    sort(x,partial=c(1, 2))[c(1, 2)]
}

#returns all local minima (problem if repeated local maxima on end)
.localMin <- function(x){
    indices <- which(diff(c(FALSE,diff(x)>0,TRUE))>0)
    return(x[indices])
}

.numDecimals <- function(x) {
    stopifnot(is(x, "numeric"))
    if (!grepl("[.]", x)) return(0)
    x <- sub("0+$","",x)
    x <- sub("^.+[.]","",x)
    nchar(x)
}

.aicc <- function (s, euc_dist, pos) {
    # extract values from loess object
    x <- .getLoess(s, pos, euc_dist)
    if(is(x, "try-error")) return(NA)
    span <- x$pars$span
    n <- x$n
    traceL <- x$trace.hat
    sigma2 <- sum( x$residuals^2 ) / (n-1)
    #return aicc value
    return(log(sigma2) + 1 + 2* (2*(traceL+1)) / (n-traceL-2))
}


# the function that gets run for each chromosome
# takes element of mmapprData@distance (with mutCounts, wtCounts, distanceDf)
# outputs complete element of mmapprData@distance
# with columns (mutcounts, wtCounts, loess, aicc)
.loessFitForChr <- function(resultList, loessOptResolution, loessOptCutFactor){
    startTime <- proc.time()
    tryCatch({
        if(is(resultList, 'character')) stop('--Loess fit failed')

        #returns dataframe with spans and aicc values for each loess
        startSpans <- c(seq(.01, .16, .01), seq(.21, .91, .10))
        resultList$aicc <- .aiccOpt(distanceDf = resultList$distanceDf,
                                              spans = startSpans,
                                              resolution = loessOptResolution,
                                              cutFactor = loessOptCutFactor)

        #now get loess for best aicc
        minAicc <- min(resultList$aicc$aiccValues, na.rm=TRUE)
        bestSpan <- round(
            resultList$aicc[resultList$aicc$aiccValues == minAicc, 'spans'],
            digits = .numDecimals(loessOptResolution)
        )
        bestSpan <- mean(bestSpan, na.rm = TRUE)
        resultList$loess <- .getLoess(bestSpan, resultList$distanceDf$pos,
                                      resultList$distanceDf$distance,
                                      surface = "direct"
        )

        #no longer needed
        resultList$distanceDf <- NULL
        resultList$seqname <- NULL

        resultList$loessTime <- proc.time() - startTime
        return(resultList)
    },
    error = function(e) {
        if (is(resultList, "character"))
            return(paste0(resultList, e$message))
        else return(e$message)
    }
    )
}
kjohnsen/MMAPPR2 documentation built on Oct. 1, 2019, 7:01 p.m.