Repitools: Epigenomic tools

setGeneric("clusterPlots", function(scores.list, ...){standardGeneric("clusterPlots")})

setMethod("clusterPlots", "ClusteredScoresList",
	function(scores.list, plot.ord = 1:length(scores.list), plot.type =
                 c("heatmap", "line", "by.cluster"), heat.bg.col = "black",
                 summarize = c("mean", "median"), symm.scale = FALSE, cols = NULL,
                 t.name = NULL, verbose = TRUE, ...)
{
    scores.list <- scores.list[plot.ord]
    plot.type <- match.arg(plot.type)
    summarize <- match.arg(summarize)

    n.marks <- length(scores.list)

    if(is.null(cols) == TRUE)
    {
	if(plot.type %in% c("by.cluster", "line"))
	{
	    cols <- colorpanel(n.marks, "blue", "green", "red")
	} else {
	    cols <- colorpanel(100, "blue", "white", "red")
	}
    }

    tbls <- tables(scores.list)

    # Get summary expression for each cluster. Find descending order.
    expr <- scores.list@expr
    expr.name <- scores.list@expr.name
    cl.id <- scores.list@cluster.id
    cl.levels <- levels(factor(cl.id))
    n.clusters <- length(cl.levels)

    keep <- !is.na(cl.id)
    tbls <- lapply(tbls, function(x) x[keep, ])
    cl.id <- cl.id[keep]

    if(!is.null(expr))
    {
        expr <- expr[keep]
        if(summarize == "mean")
            cl.expr <- tapply(expr, factor(cl.id), mean, na.rm = TRUE)
        else
            cl.expr <- tapply(expr, factor(cl.id), median, na.rm = TRUE)
        cl.ord <- order(cl.expr, decreasing = TRUE)
    } else {
        cl.expr <- numeric()
        cl.ord <- 1:n.clusters
    }

    # Get x-axis pos and x, score labels.
    pos.labels <- colnames(tbls[[1]])
    pos <- as.integer(gsub('%', '', pos.labels)) # Get raw position if labels have
                                                 # percentage signs.

    if(symm.scale)
        score.labels <- c("-100%", "0%", "100%")
    else
        score.labels <- c("0%", "50%", "100%")
    
    if(verbose) message("Generating plot.")
    if(plot.type == "by.cluster")
    {
	# Group each cluster from all epigenetic marks.

	profiles <- list() 
	for(i in 1:n.clusters)
	    profiles[[i]] <- sapply(tbls, function(x)
                                    if(summarize == "mean")
                                        colMeans(x[cl.id == cl.levels[cl.ord[i]], , drop = FALSE], na.rm = TRUE)
                                    else
                                        apply(x[cl.id == cl.levels[cl.ord[i]], , drop = FALSE], 2, median, na.rm = TRUE)
                                   )

	# Plot the lineplots by cluster.
	invisible(mapply(function(x, y)
	{
            y.max = max(unlist(x)) * 1.1
            if(symm.scale) y.min = -y.max else y.min = 0
            layout(rbind(2:1), widths = c(3, 1))
           
	    par(mai = c(1, 0, 0.8, 0))
	    plot.new()
            legend("topleft", legend = names(scores.list), title = "Mark", col = cols, ...)
	    par(mai = c(1, 1, 0.8, 0.1))
            if(!is.na(cl.expr[y]))
            {
                cl.text <- paste('(',
                                ifelse(summarize == "mean", "Mean", "Median"),
                                " Expression: ", round(cl.expr[y], 2), ')', sep = '')
            } else {
                cl.text <- paste("Cluster", cl.levels[y])
            }
	    matplot(pos, x, ylim = c(y.min, y.max), type = 'l', col = cols,
                    xlab = "Relative Position", ylab = "Score", yaxt = 'n',
                    xaxs = 'i', yaxs = 'i', ...)
	    axis(2, at = c(y.min, (y.min + y.max) / 2, y.max), labels = score.labels)

            mtext(paste("Within Cluster Coverage", cl.text), outer = TRUE, line = -2)
	}, profiles, as.list(cl.ord)))
    } else if(plot.type == "line") # Plot a table of lineplots.
    {
        old.par <- par(no.readonly = TRUE)
        par(oma = c(5, 6, 5, 2), mar = c(0, 0, 0, 0), xpd = NA)
        if(is.null(scores.list@.old.ranges))
            ranges <- lapply(tbls, range, na.rm = TRUE)
        else
            ranges <- scores.list@.old.ranges

	      profiles <- lapply(tbls, function(x)
                           {
                               lapply(cl.ord, function(y)
                               {
                                   if(summarize == "mean")
                                       colMeans(x[cl.id == y, , drop = FALSE], na.rm = TRUE)
                                   else
                                       apply(x[cl.id == y, , drop = FALSE], 2, function(z) median(z), na.rm = TRUE)
                               })
                           })

        n.boxes <- (n.marks) * n.clusters
        layout(matrix(c(1:n.boxes, rep(n.boxes + 1, n.clusters), if(!is.null(expr)) rep(n.boxes + 2, n.clusters)),
                      nrow = n.clusters),
               widths = c(rep(0.8 / n.marks, n.marks), 0.05, if(!is.null(expr)) 0.15))

        x.lims <- c(pos[1], pos[length(pos)])
        for(col.index in 1:n.marks)
        {
            for(row.index in 1:n.clusters)
            {
                matplot(pos, profiles[[col.index]][[row.index]], type = 'l', ylim = ranges[[col.index]],
                        xlab = "", ylab = "", xaxs = 'i', yaxs = 'i', xaxt = 'n', yaxt = 'n',
                        col = cols[col.index], ...)
                if(row.index == n.clusters && col.index == 1)
                {
                    axis(1, pos, pos.labels)
                    mtext("Relative Position", 1, 3)
                }
                if(row.index == 1 && col.index == 1)
                {
                    labels.ys <- par("usr")[3:4]
                    axis(2, c(labels.ys[1], mean(labels.ys), labels.ys[2]), labels = score.labels, las = 2)
                    mtext("Score", 2, 3)
                }
                if(row.index == 1)
                    title(names(scores.list)[col.index], line = 2)
            }    
        }

        plot.new()
        plot.window(xlim = c(0, 2), ylim = c(0, n.clusters), xaxs = 'i', yaxs = 'i')
        title("ID", line = 2)
        text(rep(1, n.clusters), (n.clusters:1) - 0.5, 1:n.clusters)

        if(!is.null(expr))
        {
            box.data <- lapply(cl.levels[rev(cl.ord)], function(x) expr[cl.id == x])
            box.data <- boxplot(box.data, plot = FALSE)

            plot.locs <- c(unlist(box.data$stats), unlist(box.data$out))
            max.expr <- 1.1 * max(plot.locs)
            min.data <- min(plot.locs)
            min.expr <- ifelse(min.data < 0, min.data * 1.1, min.data * 0.9)
            
            plot.new()
            plot.window(xlim = c(min.expr, max.expr), ylim = c(0, n.clusters),
                        xaxs = 'i', yaxs = 'i')
            axis(1)
            if(!is.null(expr.name)) mtext(expr.name, 1, 3)
            bxp(box.data, at = (1:n.clusters) - 0.5, add = TRUE, horizontal = TRUE,
               yaxt = 'n')
        }
        mtext(t.name, font = 2, line = 3, outer = TRUE)
        par(old.par)
    } else { # Plot a heatmap.
        if(is.null(scores.list@.old.ranges))
            ranges <- lapply(tbls, range, na.rm = TRUE)
        else
            ranges <- scores.list@.old.ranges

        plot.ranges <- lapply(ranges, function(x)
                                      {
                                         if(symm.scale)
                                             c(-1, 1) * max(abs(x))
                                         else
                                             range(x)
                                      })
	
        par(oma = c(1, 1, 3, 1))
	sort.data <- scores.list@sort.data
	sort.name <- scores.list@sort.name
	# Get order of all features next.
	if(length(sort.data) == 0)
	    ord <- order(factor(cl.id, levels = cl.levels[rev(cl.ord)]))
	else
	    ord <- order(factor(cl.id, levels = cl.levels[rev(cl.ord)]), sort.data)
	
	# Re-arrange the ChIP and expression data and vector of sort data.
	tbls <- lapply(tbls, function(x) x[ord, ])
	if(!is.null(expr)) expr <- expr[ord]
	if(length(sort.data) > 0) sort.data <- sort.data[ord]

	# Plot heatmap.
        extras <- sum(!is.null(expr), !is.null(sort.data))
	switch(as.character(extras),
            `0` = layout(rbind(1:(n.marks + 2)), widths=c(1, rep(3, n.marks)), 0.4),
	    `1` = layout(rbind(1:(n.marks + 3)), widths=c(1, rep(3, n.marks), 0.4, 2)),
	    `2` = layout(rbind(1:(n.marks + 4)), widths=c(1, rep(3, n.marks), 0.4, 2, 1)))
	par(mai = c(1.02, 0.50, 0.82, 0.05))
  	
	n.bins = length(cols)
	image(y=seq(1/n.bins/2, 1-(1/n.bins/2), 1/n.bins), z=rbind(1:n.bins),
              col = cols, axes = FALSE, xlab = "Score", ylab = NA)
	axis(2, at=c(0, 0.5, 1), labels = score.labels, las = 2)

	par(mai=c(1.02,0.05,0.82,0.05))
        cl.sizes <- table(cl.id)[rev(cl.ord)]
	bounds <- cumsum(cl.sizes)
        mapply(function(x, y, z)
	{
	    image(pos, 1:nrow(x), t(x), zlim = z, xlab = "Relative Position",
                  xaxt = 'n', yaxt = 'n', col = cols, main = y)
            usr <- par("usr")
            rect(usr[1], usr[3], usr[2], usr[4], col = heat.bg.col)
	    image(pos, 1:nrow(x), t(x), zlim = z, xaxt = 'n', yaxt = 'n', col = cols, add = TRUE)
            axis(1, pos, labels = pos.labels)		

	    # Add lines delimiting the cluster boundaries.
	    abline(h = bounds[-n.clusters], lwd = 3)
	}, tbls, names(scores.list), plot.ranges)

        cl.midpts <- bounds - cl.sizes / 2
        plot.new()
        plot.window(xlim = c(0, 2), ylim = c(0, bounds[length(bounds)]), xaxs = 'i', yaxs = 'i')
        text(rep(1, n.clusters), cl.midpts, labels = cl.levels[rev(cl.ord)], cex = 2)
        title("ID")

	par(mai = c(1.02, 0.05, 0.82, 0.50))
        if(!is.null(expr))
	    plot(expr, y = 1:length(expr), yaxs = 'i', xlab = expr.name, ylab = NA,
                 yaxt = 'n', ...)
	if(!is.null(sort.data)) plot(sort.data, y = 1:length(sort.data), yaxs = 'i',
                     xlab = sort.name, ylab = NA, yaxt = 'n', ...)	

        if(!is.null(t.name))
	    mtext(t.name, line = 0, font = 2, cex = 1.5, outer = TRUE)
    }
})

setMethod("clusterPlots", "ScoresList", function(scores.list, scale = function(x) x,
          cap.q = 0.95, cap.type = c("sep", "all"), all.mappable = FALSE, n.clusters = NULL,
          plot.ord = 1:length(scores.list), expr = NULL, expr.name = NULL,
          sort.data = NULL, sort.name = NULL, plot.type = c("heatmap", "line", "by.cluster"),
          summarize = c("mean", "median"), cols = NULL, t.name = NULL,
          verbose = TRUE, ...)
{
    scores.list <- scores.list[plot.ord]
    plot.type <- match.arg(plot.type)
    cap.type <- match.arg(cap.type)
    summarize <- match.arg(summarize)
    tbls <- tables(scores.list)

    if(is.null(n.clusters))
	stop("Number of clusters not given.\n")
    if(!is.null(expr) && nrow(tbls[[1]]) != length(expr))
	stop("Number of features does not match length of expression vector.\n")
    if(!is.null(sort.data) && is.null(sort.name))
    	stop("'sort.data' provided, 'sort.name' is not.")
    if(!is.null(sort.data) && !is.null(expr) && length(expr) != length(sort.data))
	stop("'sort.data' length not the same as number of features in coverage",
             "matrices or expression data.\n")

    # Get rid of any rows that have 1/2 NAs or more.
    # Ensures a distance measure can be computed between all pairs of transcript scores.

    if(all.mappable == FALSE)
    {
        rows.na <- lapply(tbls, function(x)
                                apply(x, 1, function(y) length(which(is.na(y)))))
        drop <- apply(do.call(cbind, rows.na), 1, function(x) any(x >= ncol(tbls[[1]]) / 2))
    } else { # Only use transcripts that have all positions with a score.
        rows.na <- lapply(tbls, function(x)
                            apply(x, 1, function(y) any(is.na(y))))
        drop <- apply(do.call(cbind, rows.na), 1, any)
    }
    
    tbls <- lapply(tbls, function(x) x[!drop, ])
    tbls <- lapply(tbls, scale)   
    
    if(cap.type == "all")
        max.cvg <- quantile(abs(do.call(cbind, tbls)), cap.q, na.rm = TRUE)

    # Find the maximum score allowable, then make any scores bigger be the
    # maximum score.
    tbls <- lapply(tbls, function(x)
    {
        if(cap.type == "sep") max.cvg <- quantile(abs(x), cap.q, na.rm = TRUE)
        x[x > max.cvg] = max.cvg
        x[x < -max.cvg] = -max.cvg
        x / max.cvg
    })

    # Do some kind of clustering for all marks together.
    all <- do.call(cbind, tbls)
    any.na <- any(is.na(all))
    if(any.na)
    {
        if(verbose) message("Doing PAM clustering.")
        cl.id <- pam(all, n.clusters, cluster.only = TRUE, do.swap = FALSE)
    } else { # No NAs in the data. Use k-means for speed.
        if(verbose) message("Doing k-means clustering.")
        cl.id <- kmeans(all, n.clusters, iter.max = 100)$cluster
    }
    cl.id <- factor(cl.id)

    # Make sure order of IDs correspond to increasing expression, if present.
    if(!is.null(expr))
    {
        expr <- expr[!drop]
        if(summarize == "mean")
            cl.expr <- tapply(expr, cl.id, mean, na.rm = TRUE)
        else
            cl.expr <- tapply(expr, cl.id, median, na.rm = TRUE)
        cl.ord <- order(cl.expr, decreasing = TRUE)
        cl.id <- factor(sapply(cl.id, function(x) match(x, cl.ord)))
    }

    csl <- ClusteredScoresList(scores.list, anno = scores.list@anno[!drop], scores = tbls,
                               cluster.id = cl.id, expr = expr, expr.name = expr.name,
                               sort.data = sort.data, sort.name = sort.name)
    clusterPlots(csl, plot.type = plot.type, summarize = summarize, cols = cols,
                 t.name = t.name, verbose = verbose, ...)
    invisible(csl)
})