R/dmrFinder.R

Defines functions regionFinder3 clusterMaker dmrFinder

Documented in dmrFinder

# NOTE: Realises in memory a matrix with nrow = length(bstat) and ncol = 1
dmrFinder <- function(bstat, cutoff = NULL, qcutoff = c(0.025, 0.975),
                      maxGap = 300, stat = "tstat.corrected", verbose = TRUE) {
    if(is(bstat, "BSseqTstat")) {
        if(! stat %in% colnames(bstat@stats))
            stop("'stat' needs to be a column of 'bstat@stats'")
        dmrStat <- as.array(bstat@stats[, stat, drop = FALSE])
    }
    if(is(bstat, "BSseqStat")) {
        dmrStat <- as.array(getStats(bstat, what = "stat"))
    }
    subverbose <- max(as.integer(verbose) - 1L, 0L)
    if(is.null(cutoff))
        cutoff <- .quantile(dmrStat, qcutoff)
    if(length(cutoff) == 1)
        cutoff <- c(-cutoff, cutoff)
    direction <- as.integer(dmrStat >= cutoff[2])
    direction[as.array(dmrStat <= cutoff[1])] <- -1L
    direction[is.na(direction)] <- 0L
    chrs <- as.character(seqnames(bstat))
    positions <- start(bstat)
    regions <- regionFinder3(direction, chr = chrs, positions = positions,
                             maxGap = maxGap, verbose = subverbose)
    if(is.null(regions$down) && is.null(regions$up))
        return(NULL)
    if(verbose) cat("[dmrFinder] creating dmr data.frame\n")
    regions <- do.call(rbind, regions)
    rownames(regions) <- NULL
    regions$width <- regions$end - regions$start + 1
    regions$invdensity <- regions$width / regions$n
    regions$chr <- as.character(regions$chr)
    if(is(bstat, "BSseqTstat")) {
        stats <- getStats(bstat, regions, stat = stat)
        regions <- cbind(regions, stats)
        if(stat %in% c("tstat.corrected", "tstat")) {
            regions$direction <- ifelse(regions$meanDiff > 0, "hyper", "hypo")
        }
        regions <- regions[order(abs(regions$areaStat), decreasing = TRUE),]
    }
    if(is(bstat, "BSseqStat")) {
        stats <- getStats(bstat, regions)
        regions <- cbind(regions, stats)
        regions <- regions[order(abs(regions$areaStat), decreasing = TRUE),]
    }
    regions
}


clusterMaker <- function(chr, pos, order.it=TRUE, maxGap=300){
    nonaIndex <- which(!is.na(chr) & !is.na(pos))
    Indexes <- split(nonaIndex, chr[nonaIndex])
    clusterIDs <- rep(NA, length(chr))
    LAST <- 0
    for(i in seq(along = Indexes)){
        Index <- Indexes[[i]]
        x <- pos[Index]
        if(order.it){
            Index <- Index[order(x)]
            x <- pos[Index]
        }
        y <- as.numeric(diff(x) > maxGap)
        z <- cumsum(c(1, y))
        clusterIDs[Index] <- z + LAST
        LAST <- max(z) + LAST
    }
    clusterIDs
}


regionFinder3 <- function(x, chr, positions, keep, maxGap = 300, verbose = TRUE) {
    getSegments3 <- function(x, cid, verbose = TRUE){
        if(verbose) cat("[regionFinder3] segmenting\n")
        segments <- cumsum(c(1, diff(x) != 0)) +
            cumsum(c(1, diff(cid) != 0))
        names(segments) <- NULL
        if(verbose) cat("[regionFinder3] splitting\n")
        out <- list(up = split(which(x > 0), segments[x > 0]),
                    down = split(which(x < 0), segments[x < 0]),
                    nochange = split(which(x == 0), segments[x == 0]))
        names(out[[1]]) <- NULL
        names(out[[2]]) <- NULL
        names(out[[3]]) <- NULL
        out
    }
    cid0 <- clusterMaker(chr, positions, maxGap = maxGap)
    segments <- getSegments3(x = x, cid = cid0, verbose = verbose)
    out <- vector("list", 2)
    names(out) <- c("up", "down")
    for(ii in 1:2) {
        idx <- segments[[ii]]
        if(length(idx) > 0) {
            out[[ii]] <- data.frame(chr = sapply(idx, function(jj) chr[jj[1]]),
                                    start = sapply(idx, function(jj) min(positions[jj])),
                                    end = sapply(idx, function(jj) max(positions[jj])),
                                    idxStart = sapply(idx, function(jj) min(jj)),
                                    idxEnd = sapply(idx, function(jj) max(jj)),
                                    cluster = sapply(idx, function(jj) cid0[jj[1]]),
                                    n = sapply(idx, length))
        }
    }
    out
}
kasperdanielhansen/bsseq documentation built on Jan. 18, 2025, 3:27 a.m.