R/plotting.R

#' Methimpute plotting functions
#' 
#' This page provides an overview of all \pkg{\link{methimpute}} plotting functions.
#'
#' @param model A \code{\link{methimputeBinomialHMM}} object.
#' @param datapoints The number of randomly selected datapoints for the plot.
#' @param binwidth The bin width for the histogram/boxplot.
#' @return A \code{\link[ggplot2]{ggplot}} object.
#' @name plotting
#' @import ggplot2
#' @examples 
#'## Get some toy data
#'file <- system.file("data","arabidopsis_toydata.RData",
#'                     package="methimpute")
#'data <- get(load(file))
#'print(data)
#'model <- callMethylation(data)
#'## Make nice plots
#'plotHistogram(model, total.counts=5)
#'plotScatter(model)
#'plotTransitionProbs(model)
#'plotConvergence(model)
#'plotPosteriorDistance(model$data)
#'
#'## Get annotation data and make an enrichment profile
#'# Note that this looks a bit ugly because our toy data
#'# has only 200000 datapoints.
#'data(arabidopsis_genes)
#'plotEnrichment(model, annotation=arabidopsis_genes)
#'
NULL

#' Get state colors
#'
#' Get the colors that are used for plotting.
#'
#' @param states A character vector.
#' @return A character vector with colors.
#' @seealso \code{\link{plotting}}
#' @export
#'@examples
#'cols <- getStateColors()
#'pie(1:length(cols), col=cols, labels=names(cols))
getStateColors <- function(states=NULL) {
    state.colors <- c("Background" = "gray", "Unmethylated" = "red","Methylated" = "blue","Intermediate" = "green", "Total" = "black", "Signal" = "red")
    if (is.null(states)) {
        return(state.colors)
    } else {
        states <- as.character(states)
        states.other <- setdiff(states, names(state.colors))
        if (length(states.other) > 0) {
            state.colors.other <- getDistinctColors(states.other)
            state.colors <- c(state.colors, state.colors.other)
        }
        return(state.colors[states])
    }
}


#' Transform genomic coordinates
#'
#' Add two columns with transformed genomic coordinates to the \code{\link{GRanges-class}} object. This is useful for making genomewide plots.
#'
#' @param gr A \code{\link{GRanges-class}} object.
#' @return The input \code{\link{GRanges-class}} with two additional metadata columns 'start.genome' and 'end.genome'.
transCoord <- function(gr) {
    cum.seqlengths <- cumsum(as.numeric(seqlengths(gr)))
    cum.seqlengths.0 <- c(0,cum.seqlengths[-length(cum.seqlengths)])
    names(cum.seqlengths.0) <- seqlevels(gr)
    gr$start.genome <- start(gr) + cum.seqlengths.0[as.character(seqnames(gr))]
    gr$end.genome <- end(gr) + cum.seqlengths.0[as.character(seqnames(gr))]
    return(gr)
}


#' @describeIn plotting Plot a histogram of count values and fitted distributions.
#' @param total.counts The number of total counts for which the histogram is to be plotted.
#' @importFrom stats dbinom
#' @export
plotHistogram <- function(model, total.counts, binwidth=1) {

    if (class(model) == 'GRanges') {
        model <- list(data=model)
    }
    ## Get cross section at total counts ##
    counts <- model$data$counts
    mask.crosssec <- counts[,'total'] == total.counts
    counts.methylated <- counts[mask.crosssec,'methylated']
    
    contexts <- intersect(levels(model$data$context), unique(model$data$context))
    ## Histogram ##
    df <- data.frame(counts=counts.methylated, context=model$data$context[mask.crosssec])
    ggplt <- ggplot(df) + geom_histogram(aes_string(x='counts', y='..density..'), binwidth=binwidth, color='black', fill='white')
    ggplt <- ggplt + theme_bw()
    ggplt <- ggplt + xlab(paste0("methylated counts\nat positions with total counts = ", total.counts))
    ggplt <- ggplt + coord_cartesian(xlim=c(0,total.counts))
    ggplt <- ggplt + facet_wrap(~ context)
    
    ## Fits ##
    if (!is.null(model$params$emissionParams)) {
        x <- seq(0, total.counts, by = binwidth)
        distr.list <- list()
        for (context in contexts) {
            distr <- list(x=x)
            for (i1 in 1:length(model$params$emissionParams)) {
                e <- model$params$emissionParams[[i1]]
                distr[[names(model$params$emissionParams)[i1]]] <- model$params$weights[[context]][i1] * stats::dbinom(x, size=total.counts, prob=e[context,'prob'])
            }
            distr <- as.data.frame(distr)
            distr$Total <- rowSums(distr[,-1])
            distr$context <- context
            distr.list[[context]] <- distr
        }
        distr <- do.call(rbind, distr.list)
        distr <- reshape2::melt(distr, id.vars=c('x', 'context'), variable.name='components')
        distr$components <- sub('^X', '', distr$components)
        distr$components <- factor(distr$components, levels=c('Total', levels(model$data$status)))
        ggplt <- ggplt + geom_line(data=distr, mapping=aes_string(x='x', y='value', col='components'))
        
        # ## Make legend
        # lprobs <- round(sapply(model$params$emissionParams, function(x) { x[,'prob'] }), 4)
        # lweights <- round(model$params$weights[[context]], 2)
        # legend <- paste0(names(model$params$emissionParams), ", prob=", lprobs, ", weight=", lweights)
        # legend <- c(paste0("Total, weight=", round(sum(lweights))), legend)
        # ggplt <- ggplt + scale_color_manual(name="components", values=getStateColors(levels(distr$components)), labels=legend) + theme(legend.position=c(0.5,0.99), legend.justification=c(0.5,1))
        
        ## Make legend
        # lprobs <- round(sapply(model$params$emissionParams, function(x) { x[,'prob'] }), 4)
        # rownames(lprobs) <- rownames(model$params$emissionParams[[1]])
        # lprobs <- lprobs[contexts,]
        # lweights <- round(t(as.data.frame(model$params$weights)), 2)
        # lweights <- lweights[contexts,]
        ggplt <- ggplt + scale_color_manual(name="components", values=getStateColors(levels(distr$components))) + theme(legend.position=c(0.5,0.99), legend.justification=c(0.5,1))
    }
    ggplt <- ggplt + ggtitle('Fitted distributions')
        
    return(ggplt)

}


#' @describeIn plotting Plot a scatter plot of read counts colored by methylation status.
#' @export
plotScatter <- function(model, datapoints=1000) {

    contexts <- intersect(levels(model$data$context), unique(model$data$context))
    ggplts <- list()
    limits <- list()
    data <- model$data
    ## Find sensible limits
    xmax <- quantile(data$counts[,'total']-data$counts[,'methylated'], 0.99)
    ymax <- quantile(data$counts[,'methylated'], 0.99)
    limits <- c(xmax, ymax)
    df <- data.frame(status=data$status, unmethylated=data$counts[,'total']-data$counts[,'methylated'], methylated=data$counts[,'methylated'], context=data$context)
    if (datapoints < nrow(df)) {
        df <- df[sample(1:nrow(df), datapoints, replace = FALSE), ]
    }
    
    ggplt <- ggplot(df, aes_string(x='methylated', y='unmethylated', col='status'))
    ggplt <- ggplt + geom_point(alpha=0.3)
    ggplt <- ggplt + coord_cartesian(xlim=c(0,xmax), ylim=c(0,ymax))
    ggplt <- ggplt + theme_bw()
    ggplt <- ggplt + xlab('methylated counts') + ylab('unmethylated counts')
    ggplt <- ggplt + facet_wrap(~ context)
    ggplt <- ggplt + scale_color_manual(values=getStateColors(levels(df$status)))
    
    # ## Legend
    # lweights <- round(model$params$weights[[context]], 2)
    # legend <- paste0(names(model$params$weights[[context]]), ", weight=", lweights)
    # ggplt <- ggplt + scale_color_manual(values=getStateColors(names(model$params$weights[[context]])), labels=legend)
    # ggplt <- ggplt + theme(legend.position=c(1,1), legend.justification=c(1,1))
    
    ggplt <- ggplt + ggtitle('Classification')
    
    return(ggplt)
    
}


#' @describeIn plotting Plot a heatmap of transition probabilities.
#' @importFrom reshape2 melt
#' @export
plotTransitionProbs <- function(model) {

    if (is.list(model$params$transProbs)) {
        As <- model$params$transProbs
    } else if (is.matrix(model$params$transProbs)) {
        As <- list(transitions=model$params$transProbs)
    }

    A <- reshape2::melt(As, varnames=c('from','to'), value.name='prob')
    names(A)[4] <- 'transitionContext'
    A$from <- factor(A$from, levels=rev(unique(A$from)))
    A$to <- factor(A$to, levels=rev(unique(A$from)))
    ggplt <- ggplot(data=A) + geom_tile(aes_string(x='to', y='from', fill='prob')) + scale_fill_gradient(low="white", high="blue", limits=c(0,1)) + theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=0.5))
    ggplt <- ggplt + facet_wrap(~ transitionContext)
    
    ggplt <- ggplt + ggtitle('Transition probabilities')
    
    return(ggplt)

}

insertNULL <- function(plotlist) {
    n <- 0.5 * ( sqrt(8*length(plotlist)+1) - 1 )
    plotlist2 <- list()
    i3 <- 1
    for (i1 in 1:n) {
        if (i1 > 1) {
            for (i2 in 1:(n-i1)) {
                plotlist2[length(plotlist2)+1] <- list(NULL)
            }
        }
        for (i2 in i1:n) {
            plotlist2[[length(plotlist2)+1]] <- plotlist[[i3]]
            i3 <- i3 + 1
        }
    }
    return(plotlist2)
}

#' @describeIn plotting Plot the convergence of the probability parameters.
#' @export
plotConvergence <- function(model) {
    
    info <- model$convergenceInfo$prefit
    if (is.null(info)) {
        info <- model$convergenceInfo
    }
    # ## Plot loglikelihood
    # df <- data.frame(iteration=0:(length(info$logliks)-1), loglik=info$logliks)
    # ggplt <- ggplot(df) + geom_line(aes_string(x='iteration', y='loglik')) + theme_bw()
    # ggplt <- ggplt + scale_x_continuous(breaks=df$iteration, limits=c(2, length(info$logliks)-1))
    
    ## Plot parameters
    df <- reshape2::melt(info$parameterInfo, value.name='prob')
    ggplt <- ggplot(df) + geom_line(aes_string(x='iteration', y='prob', col='status')) + theme_bw()
    ggplt <- ggplt + theme(panel.grid.minor.x = element_blank())
    ggplt <- ggplt + facet_wrap(~ context)
    ggplt <- ggplt + scale_y_continuous(limits=c(0,1))
    ggplt <- ggplt + scale_color_manual(values=getStateColors(unique(df$status)))
    
    ggplt <- ggplt + ggtitle('Baum-Welch convergence')
    
    return(ggplt)
}


#' @describeIn plotting Plot an enrichment profile around an annotation.
#' @param annotation A \code{\link[GenomicRanges]{GRanges-class}} object with coordinates for the annotation.
#' @param windowsize Resolution in base-pairs for the curve upstream and downstream of the annotation.
#' @param insidewindows Number of data points for the curve inside the annotation.
#' @param range Distance upstream and downstream for which the enrichment profile is calculated.
#' @param category.column The name of a column in \code{data} that will be used for facetting of the plot.
#' @param plot Logical indicating whether a plot or the underlying data.frame is to be returned.
#' @param df.list A list() of data.frames, output from \code{plotEnrichment(..., plot=FALSE)}. If specified, option \code{data} will be ignored.
#' @export
plotEnrichment <- function(model, annotation, windowsize=100, insidewindows=20, range=1000, category.column=NULL, plot=TRUE, df.list=NULL) {
    ## For debugging
    # windowsize=100; insidewindows=20; range=1000; category.column=NULL
    # annotation <- arabidopsis_genes
    # model$data$imputed <- FALSE
    # model$data$imputed[model$data$counts[,'total'] == 0] <- TRUE
    # model$data$imputed <- factor(model$data$imputed)
    # category.column <- 'imputed'
    # data <- model$data

    if (class(model) == 'methimputeBinomialHMM') {
        data <- model$data
        distr <- model$params$emissionParams
    } else if (class(model) == 'GRanges') {
        data <- model
        distr <- NULL
    } else {
        stop("Argument 'model' needs to be of class 'GRanges' or 'methimputeBinomialHMM'.")
    }
    if (is.null(df.list)) {
        ## Variables
        categories <- 'none'
        if (!is.null(category.column)) {
            categories <- levels(mcols(data)[,category.column])
        }
        if (is.null(data$meth.lvl)) {
            data$meth.lvl <- data$counts[,'methylated'] / data$counts[,'total']
        }
        if (is.null(data$rc.meth.lvl) & !is.null(distr)) {
            if (!is.null(distr$Intermediate)) {
                data$rc.meth.lvl <- distr$Unmethylated[data$context,] * data$posteriorUnmeth + distr$Intermediate[data$context,] * (1 - data$posteriorUnmeth - data$posteriorMeth) + distr$Methylated[data$context,] * data$posteriorMeth
            } else {
                data$rc.meth.lvl <- distr$Unmethylated[data$context,] * data$posteriorUnmeth + distr$Methylated[data$context,] * data$posteriorMeth
            }
        }
        
        ## Subset annotation to data range
        annotation <- subsetByOverlaps(annotation, data)

        overlaps.rc.meth.lvl <- array(NA, dim=c(length(levels(data$context)), length(categories), range%/%windowsize, 2, 2), dimnames=list(context=levels(data$context), category=categories, distance=1:(range%/%windowsize), what=c('mean', 'weight'), where=c('upstream', 'downstream')))
        overlaps.meth.lvl <- array(NA, dim=c(length(levels(data$context)), length(categories), range%/%windowsize, 2, 2), dimnames=list(context=levels(data$context), category=categories, distance=1:(range%/%windowsize), what=c('mean', 'weight'), where=c('upstream', 'downstream')))
        ## Upstream and downstream annotation
        annotation.up <- resize(x = annotation, width = 1, fix = 'start')
        annotation.down <- resize(x = annotation, width = 1, fix = 'end')
        for (context in levels(data$context)) {
            message("Upstream and downstream counts for context ", context)
            data.context <- data[data$context == context]
            for (category in categories) {
                message("  category ", category)
                if (!is.null(category.column)) {
                    data.category <- data.context[mcols(data.context)[,category.column] == category]
                } else {
                    data.category <- data.context
                }
                for (i1 in 1:(range %/% windowsize)) {
                    anno.up <- suppressWarnings( promoters(x = annotation.up, upstream = i1*windowsize, downstream = 0) )
                    anno.up <- suppressWarnings( resize(x = anno.up, width = windowsize, fix = 'start') )
                    ind <- findOverlaps(anno.up, data.category)
                    overlaps.rc.meth.lvl[context, category, as.character(i1), 'mean', 'upstream'] <- mean(data.category$rc.meth.lvl[ind@to], na.rm=TRUE)
                    overlaps.rc.meth.lvl[context, category, as.character(i1), 'weight', 'upstream'] <- length(ind@to) / sum(as.numeric(width(anno.up)))
                    overlaps.meth.lvl[context, category, as.character(i1), 'mean', 'upstream'] <- mean(data.category$meth.lvl[ind@to], na.rm=TRUE)
                    overlaps.meth.lvl[context, category, as.character(i1), 'weight', 'upstream'] <- length(ind@to) / sum(as.numeric(width(anno.up)))
                }
                for (i1 in 1:(range %/% windowsize)) {
                    anno.down <- suppressWarnings( promoters(x = annotation.down, upstream = 0, downstream = i1*windowsize) )
                    anno.down <- suppressWarnings( resize(x = anno.down, width = windowsize, fix = 'end') )
                    ind <- findOverlaps(anno.down, data.category)
                    overlaps.rc.meth.lvl[context, category, as.character(i1), 'mean', 'downstream'] <- mean(data.category$rc.meth.lvl[ind@to], na.rm=TRUE)
                    overlaps.rc.meth.lvl[context, category, as.character(i1), 'weight', 'downstream'] <- length(ind@to) / sum(as.numeric(width(anno.down)))
                    overlaps.meth.lvl[context, category, as.character(i1), 'mean', 'downstream'] <- mean(data.category$meth.lvl[ind@to], na.rm=TRUE)
                    overlaps.meth.lvl[context, category, as.character(i1), 'weight', 'downstream'] <- length(ind@to) / sum(as.numeric(width(anno.down)))
                }
            }
        }
    
        ## Inside annotation
        ioverlaps.rc.meth.lvl <- array(NA, dim=c(length(levels(data$context)), length(categories), insidewindows, 2, 1), dimnames=list(context=levels(data$context), category=categories, distance=1:insidewindows, what=c('mean', 'weight'), where=c('inside')))
        ioverlaps.meth.lvl <- array(NA, dim=c(length(levels(data$context)), length(categories), insidewindows, 2, 1), dimnames=list(context=levels(data$context), category=categories, distance=1:insidewindows, what=c('mean', 'weight'), where=c('inside')))
        ioverlaps.status.Methylated <- array(NA, dim=c(length(levels(data$context)), length(categories), insidewindows, 2, 1), dimnames=list(context=levels(data$context), category=categories, distance=1:insidewindows, what=c('mean', 'weight'), where=c('inside')))
        ioverlaps.status.Intermediate <- array(NA, dim=c(length(levels(data$context)), length(categories), insidewindows, 2, 1), dimnames=list(context=levels(data$context), category=categories, distance=1:insidewindows, what=c('mean', 'weight'), where=c('inside')))
        mask.plus <- as.logical(strand(annotation) == '+' | strand(annotation) == '*')
        mask.minus <- !mask.plus
        widths <- width(annotation)
        for (context in levels(data$context)) {
            message("Inside counts for context ", context)
            data.context <- data[data$context == context]
            for (category in categories) {
                message("  category ", category)
                if (!is.null(category.column)) {
                    data.category <- data.context[mcols(data.context)[,category.column] == category]
                } else {
                    data.category <- data.context
                }
                for (i1 in 1:insidewindows) {
                    anno.in <- annotation
                    end(anno.in[mask.plus]) <- start(annotation[mask.plus]) + round(widths[mask.plus] * i1 / insidewindows)
                    start(anno.in[mask.plus]) <- start(annotation[mask.plus]) + round(widths[mask.plus] * (i1-1) / insidewindows)
                    start(anno.in[mask.minus]) <- end(annotation[mask.minus]) - round(widths[mask.minus] * i1 / insidewindows)
                    end(anno.in[mask.minus]) <- end(annotation[mask.minus]) - round(widths[mask.minus] * (i1-1) / insidewindows)
                    ind <- findOverlaps(anno.in, data.category)
                    ioverlaps.rc.meth.lvl[context, category, as.character(i1), 'mean', 'inside'] <- mean(data.category$rc.meth.lvl[ind@to], na.rm=TRUE)
                    ioverlaps.rc.meth.lvl[context, category, as.character(i1), 'weight', 'inside'] <- length(ind@to) / sum(as.numeric(width(anno.in)))
                    ioverlaps.meth.lvl[context, category, as.character(i1), 'mean', 'inside'] <- mean(data.category$meth.lvl[ind@to], na.rm=TRUE)
                    ioverlaps.meth.lvl[context, category, as.character(i1), 'weight', 'inside'] <- length(ind@to) / sum(as.numeric(width(anno.in)))
                    ioverlaps.status.Methylated[context, category, as.character(i1), 'mean', 'inside'] <- mean(data.category$status[ind@to] == "Methylated", na.rm=TRUE)
                    ioverlaps.status.Methylated[context, category, as.character(i1), 'weight', 'inside'] <- length(ind@to) / sum(as.numeric(width(anno.in)))
                    ioverlaps.status.Intermediate[context, category, as.character(i1), 'mean', 'inside'] <- mean(data.category$status[ind@to] == "Intermediate", na.rm=TRUE)
                    ioverlaps.status.Intermediate[context, category, as.character(i1), 'weight', 'inside'] <- length(ind@to) / sum(as.numeric(width(anno.in)))
                }
            }
        }
        
        overlaps.list <- list(meth.lvl = overlaps.meth.lvl, rc.meth.lvl = overlaps.rc.meth.lvl)
        ioverlaps.list <- list(meth.lvl = ioverlaps.meth.lvl, rc.meth.lvl = ioverlaps.rc.meth.lvl)
        
        df.list <- list()
        for (i1 in 1:length(overlaps.list)) {
            name <- names(overlaps.list)[i1]
            overlaps <- overlaps.list[[name]]
            ioverlaps <- ioverlaps.list[[name]]
            
            ## Normalize weight over categories
            overlaps[,,,what='weight',] <- sweep(x = overlaps[,,,what='weight',, drop=FALSE], MARGIN = c(1,3,4,5), STATS = apply(overlaps[,,,what='weight',, drop=FALSE], c(1,3,4,5), sum), FUN = '/')
            ioverlaps[,,,what='weight',] <- sweep(x = ioverlaps[,,,what='weight',, drop=FALSE], MARGIN = c(1,3,4,5), STATS = apply(ioverlaps[,,,what='weight',, drop=FALSE], c(1,3,4,5), sum), FUN = '/')
            
            ## Prepare data.frames for plotting
            dfo <- reshape2::melt(overlaps)
            df <- dfo[dfo$what == 'mean',]
            names(df)[6] <- 'mean'
            df$weight <- dfo$value[dfo$what == 'weight']
            df$distance <- as.numeric(df$distance)
            # Adjust distances for plot
            df$distance[df$where == 'upstream'] <- -df$distance[df$where == 'upstream'] * windowsize
            df$distance[df$where == 'downstream'] <- (df$distance[df$where == 'downstream']-1) * windowsize + range
            
            dfo <- reshape2::melt(ioverlaps)
            idf <- dfo[dfo$what == 'mean',]
            names(idf)[6] <- 'mean'
            idf$weight <- dfo$value[dfo$what == 'weight']
            idf$distance <- as.numeric(idf$distance)
            # Adjust distances for plot
            idf$distance <- (idf$distance-1) * range / insidewindows
            
            df.list[[name]] <- rbind(df, idf)
        }
    }
    
    if (!plot) {
        return(df.list)
    }
    
    plotlist <- list()
    for (i1 in 1:length(df.list)) {
        name <- names(df.list)[i1]
        df <- df.list[[name]]
        
        breaks <- c(c(-1, -0.5, 0, 0.5, 1, 1.5, 2) * range)
        labels <- c(-range, -range/2, '0%', '50%', '100%', range/2, range)
        ylabs <- c('Mean methylation\nlevel', 'Mean recalibrated\nmethylation level')
            
        ggplts <- list()
        # Enrichment profile
        ggplt <- ggplot(df) + geom_line(aes_string(x='distance', y='mean', col='context'))
        ggplt <- ggplt + scale_x_continuous(breaks=breaks, labels=labels)
        ggplt <- ggplt + scale_y_continuous(limits=c(0,1))
        ggplt <- ggplt + xlab('Distance from annotation')
        ggplt <- ggplt + ylab(ylabs[i1])
        if (!is.null(category.column)) {
            ggplt <- ggplt + facet_grid(.~category)
        }
        ggplt <- ggplt + theme_bw()
        ggplts[['profile']] <- ggplt
        plotlist[[name]] <- ggplt
        
        # # Cytosine density
        # ggplt <- ggplot(df) + geom_line(aes_string(x='distance', y='weight', col='context'))
        # ggplt <- ggplt + scale_x_continuous(breaks=breaks, labels=labels)
        # ggplt <- ggplt + xlab('Distance from annotation') + ylab('Normalized C frequency\nper bp of annotation')
        # if (!is.null(category.column)) {
        #     ggplt <- ggplt + facet_grid(.~category)
        # }
        # ggplts[['weight']] <- ggplt
        
        # cowplt <- cowplot::plot_grid(plotlist = ggplts, align = 'v', nrow=2, rel_heights=c(2,1))
        # plotlist[[name]] <- cowplt
    }

    if (plot) {
        return(plotlist)
    }
}


plotTransitionDistances <- function(model) {

    df <- as.data.frame(mcols(model$data)[c('transitionContext', 'distance')])
    df <- df[!is.na(df$transitionContext),]
    ggplt <- ggplot(df) + geom_histogram(aes_string(x='distance', y='..density..'), binwidth=1, fill='white', col='black') + theme_bw()
    ggplt <- ggplt + facet_wrap(~transitionContext, ncol = length(levels(model$data$context)))
    return(ggplt)

}


#' @describeIn plotting Maximum posterior vs. distance to nearest covered cytosine.
#' @param max.coverage.y Maximum coverage for positions on the y-axis.
#' @param min.coverage.x Minimum coverage for positions on the x-axis.
#' @param xmax Upper limit for the x-axis.
#' @param xbreaks.interval Interval for breaks on the x-axis.
#' @param cutoffs A vector with values that are plotted as horizontal lines. The names of the vector must match the context levels in \code{data$context}.
#' @export
plotPosteriorDistance <- function(model, datapoints=1e6, binwidth=5, max.coverage.y=0, min.coverage.x=3, xmax=200, xbreaks.interval=xmax/10, cutoffs=NULL) {

    if (class(model) == 'methimputeBinomialHMM') {
        data <- model$data
    } else if (class(model) == 'GRanges') {
        data <- model
    } else {
        stop("Argument 'model' needs to be of class 'GRanges' or 'methimputeBinomialHMM'.")
    }
    if (!is.null(cutoffs)) {
        if (length(setdiff(levels(data$context), names(cutoffs))) > 0 ) {
            stop("names(cutoffs) must be equal to levels(data$context)")
        }
    }
    ptm <- startTimedMessage("Plotting maximum posterior vs. distance to nearest covered ...")
    if (length(data) > datapoints) {
        data.small <- data[sort(sample(length(data), datapoints))]
    } else {
        data.small <- data
    }
    datac <- data.small[data.small$counts[,'total'] <= max.coverage.y]
    # Distance of nearest covered cytosine
    cov <- data[data$counts[,'total'] >= min.coverage.x]
    near <- distanceToNearest(datac, cov)
    datac$distance.nearest <- NA
    datac$distance.nearest[near@from] <- mcols(near)$distance
    # Plot
    df <- data.frame(posteriorMax=datac$posteriorMax, distance.nearest=datac$distance.nearest, context=datac$context)
    df$distance.nearest <- df$distance.nearest %/% binwidth * binwidth
    df$distance.nearest.factor <- factor(df$distance.nearest)
    ggplt <- ggplot(df[df$distance.nearest <= xmax,]) + geom_boxplot(aes_string(x='distance.nearest.factor', y='posteriorMax', fill='context'), outlier.shape = NA) + theme_bw()
    ggplt <- ggplt + facet_grid(.~context)
    ggplt <- ggplt + xlab(paste0('Distance in [bp] to nearest position with coverage >= ', min.coverage.x)) + ylab(paste0('Maximum posterior\nat positions with coverage <= ', max.coverage.y))
    ggplt <- ggplt + scale_x_discrete(breaks=seq(0,max(df$distance.nearest, na.rm=TRUE), by=xbreaks.interval))
    if (!is.null(cutoffs)) {
        df <- data.frame(context=names(cutoffs), cutoff=cutoffs)
        ggplt <- ggplt + geom_hline(data=df, mapping=aes_string(yintercept='cutoff'), linetype=1)
    }
    ggplt <- ggplt + ggtitle('Confidence vs. distance')
    stopTimedMessage(ptm)
    return(ggplt)
}


#' #' Plot a count histogram
#' #'
#' #' Plot a histogram of count values and fitted distributions.
#' #'
#' #' @importFrom stats dnbinom
#' plotHistogram2 <- function(model, binwidth=10) {
#' 
#'     ## Assign variables
#'     counts <- model$data$observable
#'     maxcounts <- max(counts)
#' 
#'     ## Plot histogram
#'     ggplt <- ggplot(data.frame(counts)) + geom_histogram(aes_string(x='counts', y='..density..'), binwidth=binwidth, color='black', fill='white') + xlab("counts")
#'     ggplt <- ggplt + coord_cartesian(xlim=c(0,quantile(counts, 0.995)))
#'     ggplt <- ggplt + theme_bw()
#' 
#'     ## Add distributions
#'     if (!is.null(model$params$emissionParams)) {
#'         x <- seq(0, maxcounts, by = binwidth)
#'         distr <- list(x=x)
#'         for (irow in 1:nrow(model$params$emissionParams)) {
#'             e <- model$params$emissionParams
#'             if (e$type[irow] == 'delta') {
#'                 distr[[rownames(model$params$emissionParams)[irow]]] <- rep(0, length(x))
#'                 distr[[rownames(model$params$emissionParams)[irow]]][1] <- model$params$weights[irow]
#'             } else if (e$type[irow] == 'dnbinom') {
#'                 distr[[rownames(model$params$emissionParams)[irow]]] <- model$params$weights[irow] * stats::dnbinom(x, size=e[irow,'size'], prob=e[irow,'prob'])
#'             }
#'         }
#'         distr <- as.data.frame(distr)
#'         distr$Total <- rowSums(distr[,2:(1+nrow(model$params$emissionParams))])
#'         distr <- reshape2::melt(distr, id.vars='x', variable.name='components')
#'         distr$components <- sub('^X', '', distr$components)
#'         distr$components <- factor(distr$components, levels=c(levels(model$data$state), 'Total'))
#'         ggplt <- ggplt + geom_line(data=distr, mapping=aes_string(x='x', y='value', col='components'))
#' 
#'         ## Make legend
#'         lmeans <- round(model$params$emissionParams[,'mu'], 2)
#'         lvars <- round(model$params$emissionParams[,'var'], 2)
#'         lweights <- round(model$params$weights, 2)
#'         legend <- paste0(rownames(model$params$emissionParams), ", mean=", lmeans, ", var=", lvars, ", weight=", lweights)
#'         legend <- c(legend, paste0('Total, mean=', round(mean(counts),2), ', var=', round(var(counts),2)))
#'         ggplt <- ggplt + scale_color_manual(name="components", values=getStateColors(c(rownames(model$params$emissionParams),'Total')), labels=legend) + theme(legend.position=c(1,1), legend.justification=c(1,1))
#'     }
#' 
#'     return(ggplt)
#' }
#' 
#' 
#' plotBoxplot <- function(model, datapoints=1000) {
#'   
#'     df <- data.frame(state=model$data$state, observable=model$data$observable)
#'     if (datapoints < nrow(df)) {
#'         df <- df[sample(1:nrow(df), datapoints, replace = FALSE), ]
#'     }
#'     df <- suppressMessages( reshape2::melt(df) )
#'     names(df) <- c('state', 'status', 'observable')
#'     ggplt <- ggplot(df) + geom_boxplot(aes_string(x='state', y='observable', fill='status'))
#'     return(ggplt)
#'     
#' }

#' #' Histograms faceted by state
#' #' 
#' #' Make histograms of variables in \code{bins} faceted by state in \code{segmentation}.
#' #' 
#' #' @param segmentation A segmentation with metadata column 'state'.
#' #' @param bins A \code{\link{binnedMethylome}}.
#' #' @param plotcol A character vector specifying the meta-data columns for which to produce the faceted plot.
#' #' @param binwidth Bin width for the histogram.
#' #' @return A \code{\link[ggplot2]{ggplot}}.
#' #' @importFrom reshape2 melt
#' plotStateHistograms <- function(segmentation, bins, plotcol, binwidth) {
#'   
#'     states <- levels(segmentation$state)
#'     segmentation.split <- split(segmentation, segmentation$state)
#'     mind <- findOverlaps(bins, segmentation.split, select='first')
#'     bins$state <- factor(names(segmentation.split)[mind], levels=states)
#'     
#'     df <- data.frame(state=bins$state, mcols(bins)[,plotcol])
#'     names(df)[2:ncol(df)] <- plotcol
#'     dfm <- reshape2::melt(df, id.vars = 'state', measure.vars = plotcol, variable.name = 'plotcol')
#'     ggplt <- ggplot(dfm) + geom_freqpoly(aes_string(x='value', y='..density..', col='plotcol'), binwidth = binwidth)
#'     ggplt <- ggplt + theme_bw()
#'     ggplt <- ggplt + facet_wrap(~ state)
#'     
#'     return(ggplt)
#' }


#' #' Scatter plots faceted by state
#' #' 
#' #' Make scatter plots of two variables in \code{bins} faceted by state in \code{segmentation}.
#' #' 
#' #' @param segmentation A segmentation with metadata column 'state'.
#' #' @param bins A \code{\link{binnedMethylome}}.
#' #' @param x A character specifying the meta-data column that should go on the x-axis.
#' #' @param y A character specifying the meta-data column that should go on the y-axis.
#' #' @param col A character specifying the meta-data column that should be used for coloring.
#' #' @return A \code{\link[ggplot2]{ggplot}}.
#' plotStateScatter <- function(segmentation, bins, x, y, col=NULL, datapoints=1000) {
#'   
#'     states <- levels(segmentation$state)
#'     segmentation.split <- split(segmentation, segmentation$state)
#'     mind <- findOverlaps(bins, segmentation.split, select='first')
#'     bins$seg.state <- factor(names(segmentation.split)[mind], levels=states)
#'     
#'     df <- data.frame(state = bins$seg.state, x = mcols(bins)[,x], y = mcols(bins)[,y], color=mcols(bins)[,col])
#'     df <- df[sample(1:nrow(df), size=datapoints, replace=FALSE),]
#'     if (!is.null(col)) {
#'         ggplt <- ggplot(df) + geom_jitter(aes_string(x='x', y='y', col='color'), alpha=0.1)
#'     } else {
#'         ggplt <- ggplot(df) + geom_jitter(aes_string(x='x', y='y'), alpha=0.1)
#'     }
#'     ggplt <- ggplt + theme_bw()
#'     ggplt <- ggplt + xlab(x) + ylab(y)
#'     ggplt <- ggplt + facet_wrap(~ state)
#'     ggplt <- ggplt + scale_color_manual(values=getDistinctColors(levels(df$color)))
#'     
#'     return(ggplt)
#' }









#' #' @describeIn enrichment_analysis Compute the fold enrichment of combinatorial states for multiple annotations.
#' #' @param model A \code{\link{combinedMultimodel}} or \code{\link{multimodel}} object or a file that contains such an object.
#' #' @param annotations A \code{list()} with \code{\link{GRanges-class}} objects containing coordinates of multiple annotations The names of the list entries will be used to name the return values.
#' #' @param plot A logical indicating whether the plot or an array with the fold enrichment values is returned.
#' #' @param logscale Whether (\code{TRUE}) or not (\code{FALSE}) to plot on log scale.
#' #' @param cluster Whether (\code{TRUE}) or not (\code{FALSE}) to cluster the annotations.
#' #' @importFrom S4Vectors subjectHits queryHits
#' #' @importFrom IRanges subsetByOverlaps
#' #' @importFrom reshape2 melt
#' #' @importFrom stats hclust as.dendrogram
#' #' @export
#' heatmapEnrichment <- function(model, annotations, plot=TRUE, logscale=TRUE, cluster=TRUE) {
#'   
#'   ## Variables
#'   bins <- model$data
#'   genome <- sum(as.numeric(width(bins)))
#'   annotationsAtBins <- lapply(annotations, function(x) { IRanges::subsetByOverlaps(x, bins) })
#'   feature.lengths <- lapply(annotationsAtBins, function(x) { sum(as.numeric(width(x))) })
#'   state.levels <- levels(bins$state)
#'   # Only state levels that actually appear
#'   state.levels <- state.levels[state.levels %in% unique(bins$state)]
#'   
#'   ggplts <- list()
#'   folds <- list()
#'   maxfolds <- list()
#'   
#'   ptm <- startTimedMessage("Calculating fold enrichment ...")
#'   fold <- array(NA, dim=c(length(annotationsAtBins), length(state.levels)), dimnames=list(annotation=names(annotationsAtBins), state=state.levels))
#'   for (istate in 1:length(state.levels)) {
#'     mask <- bins$state == state.levels[istate]
#'     bins.mask <- bins[mask]
#'     state.length <- sum(as.numeric(width(bins.mask)))
#'     for (ifeat in 1:length(annotationsAtBins)) {
#'       feature <- annotationsAtBins[[ifeat]]
#'       ind <- findOverlaps(bins.mask, feature)
#'       
#'       binsinfeature <- bins.mask[unique(S4Vectors::queryHits(ind))]
#'       sum.binsinfeature <- sum(as.numeric(width(binsinfeature)))
#'       
#'       featuresinbins <- feature[unique(S4Vectors::subjectHits(ind))]
#'       sum.featuresinbins <- sum(as.numeric(width(featuresinbins)))
#'       
#'       fold[ifeat,istate] <- sum.binsinfeature / state.length / feature.lengths[[ifeat]] * genome
#'     }
#'   }
#'   stopTimedMessage(ptm)
#'   
#'   ## Clustering
#'   if (cluster) {
#'     hc.anno <- stats::hclust(dist(fold))
#'     fold <- fold[hc.anno$order, ]
#'   }
#'   
#'   if (plot) {
#'     df <- reshape2::melt(fold, value.name='fold')
#'     if (logscale) {
#'       df$fold <- log(df$fold)
#'     }
#'     df$state <- factor(df$state, levels=colnames(fold))
#'     # Transform annotation to numeric for compatibility with the dendrogram
#'     df$ylabel <- df$annotation
#'     df$annotation <- as.numeric(df$annotation)
#'     maxfold <- max(df$fold, na.rm=TRUE)
#'     minfold <- max(df$fold, na.rm=TRUE)
#'     limits <- max(abs(maxfold), abs(minfold))
#'     ggplt <- ggplot(df) + geom_tile(aes_string(x='state', y='annotation', fill='fold'))
#'     ggplt <- ggplt + scale_y_continuous(breaks=1:length(unique(df$annotation)), labels=unique(df$ylabel), position='right')
#'     ggplt <- ggplt + theme_bw()
#'     ggplt <- ggplt + theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5))
#'     if (logscale) {
#'       ggplt <- ggplt + scale_fill_gradientn(name='log(enrichment)', colors=grDevices::colorRampPalette(c("blue","white","red"))(20), values=c(seq(-limits,0,length.out=10), seq(0,limits,length.out=10)), rescaler=function(x,...) {x}, oob=identity, limits=c(-limits,limits), na.value="blue")
#'     } else {
#'       ggplt <- ggplt + scale_fill_gradientn(name='enrichment', colors=grDevices::colorRampPalette(c("blue","white","red"))(20), values=c(seq(0,1,length.out=10), seq(1,maxfold,length.out=10)), rescaler=function(x,...) {x}, oob=identity, limits=c(0,maxfold))
#'     }
#'     cowplt <- ggplt
#'     ## Dendrograms
#'     if (cluster) {
#'       theme_none <- theme(axis.text = element_blank(),
#'                           axis.ticks = element_blank(),
#'                           panel.background = element_blank(),
#'                           axis.title = element_blank())
#'       # Row dendrogram
#'       df.dendro <- ggdendro::segment(ggdendro::dendro_data(stats::as.dendrogram(hc.anno)))
#'       row.dendro <- ggplot(df.dendro) +  geom_segment(aes_string(x='y', y='x', xend='yend', yend='xend')) +  theme_none + scale_x_reverse() + coord_cartesian(ylim=c(0.5,max(df$annotation)+0.5))
#'       # Column dendrogram
#'       # col.dendro <- ggplot(ggdendro::segment(ggdendro::dendro_data(stats::as.dendrogram(hc.state)))) +  geom_segment(aes_string(x='x', y='y', xend='xend', yend='yend')) + theme_none
#'       cowplt <- cowplot::plot_grid(row.dendro, ggplt, align = 'h', ncol = 2, rel_widths = c(1,3))
#'     }
#'     return(cowplt)
#'   } else {
#'     return(fold)
#'   }
#' }
ataudt/popmeth documentation built on May 10, 2019, 2:07 p.m.