R/BSmooth.fstat.R
In bsseq: Analyze, manage and store bisulfite sequencing data

Documented in BSmooth.fstat computeStat smoothSds

BSmooth.fstat <- function(BSseq, design, contrasts, verbose = TRUE){
    stopifnot(is(BSseq, "BSseq"))
    stopifnot(hasBeenSmoothed(BSseq))

    ## if(any(rowSums(getCoverage(BSseq)[, unlist(groups)]) == 0))
    ##     warning("Computing t-statistics at locations where there is no data; consider subsetting the 'BSseq' object first")

    if(verbose) cat("[BSmooth.fstat] fitting linear models ... ")
    ptime1 <- proc.time()
    allPs <- getMeth(BSseq, type = "smooth", what = "perBase",
                     confint = FALSE)
    allPs <- as.array(allPs)
    fit <- lmFit(allPs, design)
    fitC <- contrasts.fit(fit, contrasts)
    ## Need
    ##   fitC$coefficients, fitC$stdev.unscaled, fitC$sigma, fitC$cov.coefficients
    ## actuall just need
    ##   tstats <- fitC$coefficients / fitC$stdev.unscaled / fitC$sigma
    ##   rawSds <- fitC$sigma
    ##   cor.coefficients <- cov2cor(fitC$cov.coefficients)
    rawSds <- as.matrix(fitC$sigma)
    cor.coefficients <- cov2cor(fitC$cov.coefficients)
    rawTstats <- fitC$coefficients / fitC$stdev.unscaled / fitC$sigma
    names(dimnames(rawTstats)) <- NULL
    ptime2 <- proc.time()
    stime <- (ptime2 - ptime1)[3]
    if(verbose) cat(sprintf("done in %.1f sec\n", stime))

    parameters <- c(BSseq@parameters,
                    list(design = design, contrasts = contrasts))
    stats <- list(rawSds = rawSds,
                  cor.coefficients = cor.coefficients,
                  rawTstats = rawTstats)
    out <- BSseqStat(gr = granges(BSseq),
                     stats = stats,
                     parameters = parameters)
    out
}

smoothSds <- function(BSseqStat, k = 101, qSd = 0.75, mc.cores = 1,
                      maxGap = 10^8, verbose = TRUE) {
    smoothSd <- function(Sds, k, qSd) {
        k0 <- floor(k/2)
        if(all(is.na(Sds))) return(Sds)
        thresSD <- pmax(Sds, quantile(Sds, qSd, na.rm = TRUE), na.rm = TRUE)
        addSD <- rep(median(Sds, na.rm = TRUE), k0)
        sSds <- as.vector(runmean(Rle(c(addSD, thresSD, addSD)), k = k))
        sSds
    }
    if(is.null(maxGap))
        maxGap <- BSseqStat@parameters[["maxGap"]]
    if(is.null(maxGap))
        stop("need to set argument 'maxGap'")
    if(verbose) cat("[smoothSds] preprocessing ... ")
    ptime1 <- proc.time()
    clusterIdx <- makeClusters(granges(BSseqStat), maxGap = maxGap)
    ptime2 <- proc.time()
    stime <- (ptime2 - ptime1)[3]
    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
    smoothSds <- do.call("c",
                         mclapply(clusterIdx, function(idx) {
                             rawSds <- getStats(BSseqStat,
                                                what = "rawSds")[idx, ]
                             rawSds <- as.array(rawSds)
                             smoothSd(rawSds, k = k, qSd = qSd)
                         }, mc.cores = mc.cores))
    smoothSds <- as.matrix(smoothSds)
    if("smoothSds" %in% names(getStats(BSseqStat)))
        BSseqStat@stats[["smoothSds"]] <- smoothSds
    else
        BSseqStat@stats <- c(getStats(BSseqStat), list(smoothSds = smoothSds))
    BSseqStat
}

# Quieten R CMD check
globalVariables("tstat")

# NOTE: Realises in memory a matrix with nrow = length(BSseqStat) and
#       ncol = length(coef)
computeStat <- function(BSseqStat, coef = NULL) {
    stopifnot(is(BSseqStat, "BSseqStat"))
    if(is.null(coef)) {
        coef <- 1:ncol(getStats(BSseqStat, what = "rawTstats"))
    }
    raw_tstats <- getStats(BSseqStat, what = "rawTstats")[, coef, drop = FALSE]
    scaled_sds <- getStats(BSseqStat, what = "rawSds") /
        getStats(BSseqStat, what = "smoothSds")
    scaled_sds_matrix <- matrix(
        rep(scaled_sds, ncol(raw_tstats)),
        ncol = ncol(raw_tstats))
    tstats <- raw_tstats * scaled_sds_matrix
    if(length(coef) > 1) {
        cor.coefficients <- getStats(BSseqStat,
                                     what = "cor.coefficients")[coef,coef]
        # NOTE: classifyTestsF() calls as.matrix(tstats) and so realises this
        #       array
        stat <- as.matrix(
            classifyTestsF(tstats, cor.coefficients, fstat.only = TRUE))
        stat.type <- "fstat"
    } else {
        stat <- tstats
        stat.type <- "tstat"
    }
    if("stat" %in% names(getStats(BSseqStat))) {
        BSseqStat@stats[["stat"]] <- stat
        BSseqStat@stats[["stat.type"]] <- stat.type
    } else {
        BSseqStat@stats <- c(getStats(BSseqStat),
                             list(stat = stat, stat.type = stat.type))
    }
    BSseqStat
}

localCorrectStat <- function(BSseqStat, threshold = c(-15,15), mc.cores = 1, verbose = TRUE) {
    compute.correction <- function(idx) {
        xx <- start(BSseqTstat)[idx]
        yy <- tstat[idx] ## FIXME
        if(!is.null(threshold)) {
            stopifnot(is.numeric(threshold) && length(threshold) == 2)
            stopifnot(threshold[1] < 0 && threshold[2] > 0)
            yy[yy < threshold[1]] <- threshold[1]
            yy[yy > threshold[2]] <- threshold[2]
        }
        suppressWarnings({
            drange <- diff(range(xx, na.rm = TRUE))
        })
        if(drange <= 25000)
            return(yy)
        tstat.function <- approxfun(xx, yy)
        xx.reg <- seq(from = min(xx), to = max(xx), by = 2000)
        yy.reg <- tstat.function(xx.reg)
        fit <- locfit(yy.reg ~ lp(xx.reg, h = 25000, deg = 2, nn = 0),
                      family = "huber", maxk = 50000)
        correction <- predict(fit, newdata = data.frame(xx.reg = xx))
        yy - correction
    }
    maxGap <- BSseqStat$parameters$maxGap
    if(verbose) cat("[BSmooth.tstat] preprocessing ... ")
    ptime1 <- proc.time()
    clusterIdx <- makeClusters(BSseqStat$gr, maxGap = maxGap)
    ptime2 <- proc.time()
    stime <- (ptime2 - ptime1)[3]
    if(verbose) cat(sprintf("done in %.1f sec\n", stime))
    stat <- BSseqStat$stat
    stat.corrected <- do.call(c, mclapply(clusterIdx, compute.correction,
                                          mc.cores = mc.cores))
    BSseqTstat@stats <- cbind(getStats(BSseqTstat),
                              "tstat.corrected" = stat.corrected)
    BSseqTstat@parameters$local.local <- TRUE
    BSseqTstat
}

fstat.pipeline <- function(BSseq, design, contrasts, cutoff, fac, nperm = 1000,
                           coef = NULL, maxGap.sd = 10 ^ 8, maxGap.dmr = 300,
                           type = "dmrs", mc.cores = 1) {
    type <- match.arg(type, c("dmrs", "blocks"))
    bstat <- BSmooth.fstat(BSseq = BSseq, design = design,
                           contrasts = contrasts)
    bstat <- smoothSds(bstat)
    bstat <- computeStat(bstat, coef = coef)
    dmrs <- dmrFinder(bstat, cutoff = cutoff, maxGap = maxGap.dmr)
    if (is.null(dmrs)) {
        stop("No DMRs identified. Consider reducing the 'cutoff' from (",
             paste0(cutoff, collapse = ", "), ")")
    }
    idxMatrix <- permuteAll(nperm, design)
    nullDist <- getNullDistribution_BSmooth.fstat(BSseq = BSseq,
                                                  idxMatrix = idxMatrix,
                                                  design = design,
                                                  contrasts = contrasts,
                                                  coef = coef,
                                                  cutoff = cutoff,
                                                  maxGap.sd = maxGap.sd,
                                                  maxGap.dmr = maxGap.dmr,
                                                  mc.cores = mc.cores)
    fwer <- getFWER.fstat(null = c(list(dmrs), nullDist), type = type)
    dmrs$fwer <- fwer
    meth <- getMeth(BSseq, dmrs, what = "perRegion")
    meth <- t(apply(meth, 1, function(xx) tapply(xx, fac, mean)))
    dmrs <- cbind(dmrs, meth)
    dmrs$maxDiff <- rowMaxs(meth) - rowMins(meth)
    list(bstat = bstat, dmrs = dmrs, idxMatrix = idxMatrix, nullDist = nullDist)
}

fstat.comparisons.pipeline <- function(BSseq, design, contrasts, cutoff, fac,
                                       maxGap.sd = 10 ^ 8, maxGap.dmr = 300,
                                       verbose = TRUE) {
    bstat <- BSmooth.fstat(BSseq = BSseq,
                           design = design,
                           contrasts = contrasts,
                           verbose = verbose)
    bstat <- smoothSds(bstat, maxGap = maxGap.sd, verbose = verbose)
    # NOTE: Want to keep the bstat object corresponding to the original fstat
    #       and not that from the last t-tsts in the following lapply()
    bstat_f <- bstat
    if (verbose) {
        cat(paste0("[fstat.comparisons.pipeline] making ", ncol(contrasts),
                   " comparisons ... \n"))
    }
    l <- lapply(seq_len(ncol(contrasts)), function(coef) {
        if (verbose) {
            cat(paste0("[fstat.comparisons.pipeline] ",
                       colnames(contrasts)[coef], "\n"))
        }
        bstat <- computeStat(bstat, coef = coef)
        dmrs <- dmrFinder(bstat, cutoff = cutoff, maxGap = maxGap.dmr,
                          verbose = verbose)
        if (!is.null(dmrs)) {
            meth <- getMeth(BSseq, dmrs, what = "perRegion")
            meth <- t(apply(meth, 1, function(xx) tapply(xx, fac, mean)))
            dmrs <- cbind(dmrs, meth)
            dmrs$maxDiff <- rowMaxs(meth) - rowMins(meth)
        }
        dmrs
    })
    names(l) <- colnames(contrasts)
    list(bstat = bstat, dmrs = l)
}