R/view_motifs.R

Defines functions convert_mat_type_from_ppm check_right_side check_mot_sizes make_multi_letter_polygon_data make_letter_polygon_data make_position_polygon_data make_matrix_polygon_data prep_single_motif_plot_data view_motifs

Documented in view_motifs

#' Plot motif logos.
#'
#' Show sequence logo. If given a list of more than one motif, then the motifs
#' are aligned with the first in the list.
#'
#' @param motifs See [convert_motifs()] for acceptable motif formats.
#' @param use.type `character(1)` One of `c('PCM', 'PPM', 'PWM', 'ICM')`.
#' @param return.raw `logical(1)` Instead of returning a plot, return the
#'    aligned named matrices used to generate the plot. This can be useful
#'    if you wish to use [view_motifs()] alignment capabilities for custom
#'    plotting uses. Alignment is performed by adding empty columns to the
#'    left or right of motifs to generate matrices of equal length.
#' @param dedup.names `logical(1)` Plotting motifs with duplicated names is
#'    not allowed. Setting this to `TRUE` allows the names to be modified
#'    for plotting.
#' @param show.positions `logical(1)` Show x-axis position tick labels.
#' @param show.positions.once `logical(1)` When plotting multiple motifs,
#'    show x-axis position tick labels only once. If `FALSE`, then
#'    x-axis tick labels are specific to each motif.
#' @param show.names `logical(1)` Add motif names when plotting multiple
#'    motifs.
#' @param names.pos `character(1)` Motif name locations. Either above (`top`)
#'    or to the right (`right`) of the logos.
#' @param use.freq `numeric(1)` Plot higher order motifs from the `multifreq`
#'    slot.
#' @param colour.scheme `character` A named character vector of colour names.
#'    Default colours are provided for DNA, RNA, and AA motifs if left `NULL`.
#' @param fontDF `data.frame` or `DataFrame` Polygon data for letters used
#'    for plotting, as generated by the `createPolygons()` function from the
#'    `gglogo` package. See the `fontDFroboto` data object (which is used
#'    by default when `fontDF = NULL`). See `Examples` for how to generate
#'    your own font set. Expected columns: `x`, `y`, `order`, `group`;
#'    additional columns will be ignored.
#' @param min.height `numeric(1)` Minimum height for a letter to be plotted.
#'    The number is taken as the fraction of the total height of the plot.
#'    The default value is to not show letters which take up 1% or less of the
#'    vertical space. For smaller figures it is recommended to increase this
#'    value, and vice versa for larger figures.
#' @param x.spacer `numeric(1)` Add horizontal spacing between letters. The
#'    number is taken as the fraction of the width of an individual position.
#'    Increasing this value is recommended for plotting multifreq motifs.
#' @param y.spacer `numeric(1)` Add vertical spacing between letters. The
#'    number is taken as the fraction nof the total height of the plot.
#' @param sort.positions `logical(1)` Sort letters vertically per position
#'    by height.
#' @param sort.positions.decreasing `logical(1)` Sort in decreasing or
#'    increasing order based on letter height.
#' @param text.size `numeric(1)` Text size for labels.
#' @param fit.to.height `numeric(1)` Normalize the per position height to
#'    this value. If `NULL`, no normalization is applied. Note that this
#'    parameter is ignored if `use.type = c("PWM", "ICM")`.
#' @param RC.text `character(1)` The text to display alongside the name
#'    of motifs shown as their reverse complement.
#' @param ... Unused. Was previously in place to allow extra args to be given
#'    to `ggseqlogo::ggseqlogo`, however `universalmotif` now implements its
#'    own motif plotting code directly with `ggplot2`.
#'
#' @return A `ggplot` object. If `return.raw = TRUE`, a list of matrices.
#'
#' @details
#' See [compare_motifs()] for more info on comparison parameters.
#'
#' See [view_logo()] to plot from a numeric matrix with arbitrary values instead
#' of a motif object.
#'
#' Note: `score.strat = "a.mean"` is NOT recommended, as [view_motifs()] will
#' not discriminate between two alignments with equal mean scores, even if one
#' alignment is longer than the other.
#'
#' Note: if you want to plot the motifs yourself, you can set
#' `return.raw=TRUE` to get the numeric motif matrices and calculate
#' the polygon paths on your own or access the polygon path data directly from
#' the final `ggplot` object using `$data`.
#'
#' @examples
#' ## Plotting multifreq motifs:
#' data(examplemotif2)
#' view_motifs(examplemotif2, use.freq = 2)
#'
#' ## Generate your own letter set:
#' \dontrun{
#'
#' library(gglogo)  # install from CRAN first if needed
#' fontDFtimes <- createPolygons(LETTERS, "Times", 800, scale = TRUE)
#' view_motifs(examplemotif2, fontDF = fontDFtimes)
#'
#' ## Note: setting `scale = TRUE` is necessary to properly align letters
#' ## vertically, but this has the effect of horizontally stretching out
#' ## letters which shouldn't be stretched (such as "I"). If you need to plot
#' ## letters which have been badly horizontally scaled, you can fix them
#' ## manually as demonstrated here:
#'
#' # Retrieve the x-coordinates for the desired letter:
#' tofix <- fontDFtimes$x[fontDFtimes$group == "I"]
#' # Scale the letter x-coordinates:
#' tofix <- tofix * 0.35
#' # Remember to center the letter around 0.5 again:
#' tofix <- tofix + (1 - max(tofix)) / 2
#' # Apply the fix:
#' fontDFtimes$x[fontDFtimes$group == "I"] <- tofix
#' view_motifs(create_motif("AIG", alphabet = "AA"), fontDF = fontDFtimes)
#'
#' }
#'
#' @seealso [compare_motifs()], [add_multifreq()], [view_logo()]
#' @author Benjamin Jean-Marie Tremblay, \email{benjamin.tremblay@@uwaterloo.ca}
#' @inheritParams compare_motifs
#' @export
view_motifs <- function(motifs, use.type = "ICM", method = "ALLR",
  tryRC = TRUE, min.overlap = 6, min.mean.ic = 0.25, relative_entropy = FALSE,
  normalise.scores = FALSE, min.position.ic = 0, score.strat = "sum",
  return.raw = FALSE, dedup.names = TRUE, show.positions = TRUE,
  show.positions.once = TRUE, show.names = TRUE, names.pos = c("top", "right"),
  use.freq = 1, colour.scheme = NULL, fontDF = NULL, min.height = 0.01,
  x.spacer = if (use.freq == 1) 0.04 else 0.1,
  y.spacer = 0.01, sort.positions = !use.type %in% c("PCM", "PPM"),
  sort.positions.decreasing = TRUE, text.size = 16,
  fit.to.height = if (use.type == "PPM") 1 else NULL, RC.text = " [RC]", ...) {

  # TODO: give option to only use limits as breaks (eg 0-2 instead of 0-1-2)

  # TODO: think about plugging view_motifs() into ggtree somehow

  # TODO: need to implement arg checks for new options

  # param check --------------------------------------------
  args <- as.list(environment())
  all_checks <- character(0)
  if (!method %in% COMPARE_METRICS) {
    method_check <- " * Incorrect 'method'"
    all_checks <- c(all_checks, method_check)
  }
  if (!use.type %in% c("PPM", "ICM", "PWM", "PCM")) {
    use.type_check <- paste0(" * Incorrect 'use.type': expected `PPM`, `PCM`, ",
                             "`PWM` or `ICM`; got `",
                             use.type, "`")
    use.type_check <- wmsg2(use.type_check, 4, 2)
    all_checks <- c(all_checks, use.type_check)
  }
  char_check <- check_fun_params(list(use.type = args$use.type,
                                      method = args$method,
                                      RC.text = args$RC.text,
                                      score.strat = args$score.strat),
                                 numeric(), logical(), TYPE_CHAR)
  num_check <- check_fun_params(list(min.overlap = args$min.overlap,
                                     min.mean.ic = args$min.mean.ic,
                                     min.position.ic = args$min.position.ic),
                                numeric(), logical(), TYPE_NUM)
  logi_check <- check_fun_params(list(tryRC = args$tryRC,
                                      relative_entropy = args$relative_entropy,
                                      normalise.scores = args$normalise.scores,
                                      return.raw = args$return.raw,
                                      dedup.names = args$dedup.names),
                                 numeric(), logical(), TYPE_LOGI)
  all_checks <- c(all_checks, char_check, num_check, logi_check)
  if (length(all_checks) > 0) stop(all_checks_collapse(all_checks))
  #---------------------------------------------------------

  dots <- list(...)
  if (length(dots))
    warning(wmsg("Found unknown parameters; please note that as of v1.10.0 ",
        "the ggseqlogo package is no longer used for plotting motifs and thus ",
        "the previously intended purpose of allowing for extra ggseqlogo ",
        "arguments via `...` is now no longer needed [unknown arguments: ",
        paste0(names(dots), collapse = ", "), "]"), call. = FALSE, immediate. = TRUE)

  names.pos <- match.arg(names.pos)

  if (!score.strat %in% c("sum", "a.mean", "g.mean", "median", "wa.mean",
                          "wg.mean", "fzt"))
    stop("'score.strat' must be one of 'sum', 'a.mean', 'g.mean', 'median', ",
         "'wa.mean', 'wg.mean', 'fzt'")

  if (score.strat %in% c("g.mean", "wg.mean") && method %in%
      c("ALLR", "ALLR_LL", "PCC"))
    stop(wmsg("'g.mean'/'wg.mean' is not allowed for methods which can generate ",
        "negative values: ALLR, ALLR_LL, PCC"))

  motifs <- convert_motifs(motifs)
  motifs <- convert_type_internal(motifs, "PPM")
  if (!is.list(motifs)) motifs <- list(motifs)

  if (use.type == "PWM" && length(motifs) > 1 && method == "KL") {
    stop("cannot use method 'KL' with 'PWM' matrices")
  }

  if (use.type == "PWM" && !is.null(fit.to.height))
    stop("`fit.to.height` must be NULL if `use.type = \"PWM\"`")

  mot.nsites <- lapply(motifs, function(x) x@nsites)
  mot.pseudo <- lapply(motifs, function(x) x@pseudocount)

  if (use.freq == 1) {
    mot.mats <- lapply(motifs, function(x) x@motif)
    mot.bkgs <- get_bkgs(motifs)
    for (i in seq_along(mot.mats)) {
      mot.mats[[i]] <- convert_mat_type_from_ppm(mot.mats[[i]], use.type,
        mot.nsites[[i]], mot.bkgs[[i]], mot.pseudo[[i]], relative_entropy)
    }
  } else if (use.freq > 1) {
    if (any(vapply(motifs, function(x) length(x@multifreq) == 0, logical(1))))
      stop("missing multifreq slots")
    check_multi <- vapply(
      motifs,
      function(x) any(names(x@multifreq) %in% as.character(use.freq)),
      logical(1)
    )
    if (!any(check_multi))
      stop("not all motifs have corresponding multifreq matrix")
    mot.mats <- lapply(motifs,
      function(x) x@multifreq[[as.character(use.freq)]])
    if (use.type != "PPM") {
      for (i in seq_along(mot.mats)) {
        rn <- rownames(mot.mats[[i]])
        if (use.type == "PWM") {
          mot.mats[[i]] <- MATRIX_ppm_to_pwm(
            mot.mats[[i]], nsites = motifs[[i]]@nsites,
            pseudocount = motifs[[i]]@pseudocount,
            bkg = motifs[[i]]@bkg[rownames(mot.mats[[i]])])
        } else if (use.type == "ICM") {
          mot.mats[[i]] <- MATRIX_ppm_to_icm(
            mot.mats[[i]], bkg = motifs[[i]]@bkg[rownames(mot.mats[[i]])],
            relative_entropy = relative_entropy
          )
        } else if (use.type == "PCM") {
          mot.mats[[i]] <- MATRIX_ppm_to_pcm(
            mot.mats[[i]], nsites = motifs[[i]]@nsites
          )
        }
        rownames(mot.mats[[i]]) <- rn
      }
    }
  } else {
    stop("`use.freq` must be a positive integer")
  }

  mot.names <- vapply(motifs, function(x) x@name, character(1))
  if (length(mot.names) != length(unique(mot.names))) {
    if (!dedup.names) stop(wmsg(
      "All motifs must have unique names when `dedup.names=FALSE`."
    ), call. = FALSE)
    mot.names <- make.unique(mot.names)
  }

  mot.alph <- unique(vapply(motifs, function(x) x@alphabet, character(1)))
  if (length(mot.alph) > 1) stop("can only have one alphabet")
  use.custom <- FALSE

  alph <- switch(mot.alph, "DNA" = DNA_BASES, "RNA" = RNA_BASES,
    "AA" = AA_STANDARD2, rownames(mot.mats[[1]]))

  ylim2 <- NULL
  breaks <- NULL
  switch(use.type,
    "PPM" = {
      yname <- "probability"
      ylim2 <- 0:1
      breaks <- c(0, 0.5, 1)
    },
    "ICM" = {
      yname <- "bits"
      ylim2 <- c(0, log2(nrow(mot.mats[[1]])))
      breaks <- unique(round(c(0, ylim2[2] * 0.5, ylim2[2])))
    },
    "PWM" = {
      yname <- "logodds"
    },
    "PCM" = {
      yname <- "counts"
    },
    stop("'use.type' must be one of 'PCM', 'PPM', 'PWM', 'ICM'")
  )

  if (is.null(fontDF)) fontDF <- fontDFroboto
  fontDF <- as.data.frame(fontDF)
  if (!any(c("x", "y", "order", "group") %in% colnames(fontDF)))
    stop(wmsg("Expected columns [x, y, order, group] in `fontDF` object"))
  fontDF <- fontDF[, c("x", "y", "order", "group")]

  if (is.null(colour.scheme)) {
    colour.scheme <- switch(mot.alph, "DNA" = DNA_COLOURS, "RNA" = RNA_COLOURS,
      "AA" = AA_COLOURS, NULL)
  } else {
    if (any(!names(colour.scheme) %in% alph))
      stop("colour.scheme must be a named vector with all possible letters present")
  }

  if (use.type %in% c("PWM", "ICM") && !is.null(fit.to.height)) {
    warning("`fit.to.height` is ignored for `use.type = c(\"ICM\", \"PWM\")`")
    fit.to.height <- NULL
  }

  if (length(motifs) == 1) {

    if (return.raw) {
      colnames(mot.mats[[1]]) <- NULL
      names(mot.mats) <- mot.names
      return(mot.mats)
    }

    if (use.type == "PWM" && any(is.infinite(mot.mats[[1]])))
      stop("Found non-finite values in motif")

    plotobj <- prep_single_motif_plot_data(mot.mats[[1]], use.type, fontDF,
      min.height, x.spacer, y.spacer, sort.positions, sort.positions.decreasing,
      fit.to.height, 1)

    breaks2 <- seq_len(ncol(mot.mats[[1]]))
    limits2 <- c(min(breaks2) - 0.501, max(breaks2) + 0.501)

    if (use.type == "PCM") {
      breaks <- c(0, round(max(colSums(mot.mats[[1]])) / 2), max(colSums(mot.mats[[1]])))
      ylim2 <- breaks[-2]
      breaks <- unique(breaks)
    } else if (use.type == "PWM") {
      breaks <- c(round(min(colSums(mot.mats[[1]]))) - 1, round(min(colSums(mot.mats[[1]])) / 2), 0,
        round(log2(nrow(mot.mats[[1]])) / 2), log2(nrow(mot.mats[[1]])))
      mot.mats.tmp <- mot.mats[[1]]
      mot.mats.tmp[mot.mats.tmp > 0] <- 0
      minval <- min(colSums(mot.mats.tmp))
      mot.mats.tmp <- mot.mats[[1]]
      mot.mats.tmp[mot.mats.tmp < 0] <- 0
      maxval <- max(colSums(mot.mats.tmp))
      breaks <- round(c(minval, minval / 2, 0, maxval / 2, maxval))
      breaks[1] <- breaks[1] - max(c(1, breaks[1] * 0.1))
      ylim2 <- c(breaks[1], maxval)
      breaks <- unique(breaks)
    }

    # not sure why but letters glitch out unless you do this
    plotobj$y <- plotobj$y * 0.999

    plotobj <- ggplot(plotobj, aes(.data$x, .data$y, group = .data$letter.id, fill = .data$group)) +
      geom_polygon() +
      ylab(yname) +
      xlab(element_blank()) +
      theme_minimal() +
      scale_y_continuous(breaks = breaks, limits = ylim2, expand = c(0, 0)) +
      scale_x_continuous(breaks = breaks2, limits = limits2, expand = c(0.02, 0)) +
      theme(axis.line.y = element_line(size = 0.25),
        panel.grid = element_blank(),
        text = element_text(size = text.size),
        axis.ticks.y = element_line(size = 0.25), legend.position = "none",
        axis.text.x = element_text(colour = "black"),
        axis.text.y = element_text(colour = "black", margin = margin(r = 1)))

    if (!show.positions) plotobj <- plotobj + theme(axis.text.x = element_blank())

    if (!is.null(colour.scheme)) plotobj <- plotobj +
      scale_fill_manual(values = colour.scheme[alph], limits = alph)

    if (use.type == "PWM")
      plotobj <- plotobj + geom_hline(yintercept = 0, size = 0.25, colour = "grey75")

  } else {

    # TODO: Use PPM to compare motifs but visualize using ICM?
    res <- view_motifs_prep(mot.mats, method, tryRC, min.overlap,
      min.mean.ic, min.position.ic, mot.bkgs, relative_entropy,
      normalise.scores, alph, get_nsites(motifs), score.strat)
    which.rc <- res$motIsRC
    mots <- res$motifs
    mots <- check_mot_sizes(mots)

    if (method %in% c("KL", "ALLR", "ALLR_LL")) {
      for (i in seq_along(mots)) {
        mots[[i]][mots[[i]] > 0] <- mots[[i]][mots[[i]] > 0] - 0.01
      }
    }

    # mots <- mapply(function(x1, x2, x3, x4)
    #                  convert_mat_type_from_ppm(x1, use.type, x2, x3, x4, relative_entropy),
    #                mots, mot.nsites, mot.bkgs, mot.pseudo, SIMPLIFY = FALSE)

    for (i in seq_along(which.rc)) {
      if (which.rc[i]) mot.names[i + 1] <- paste0(mot.names[i + 1], RC.text)
    }
    names(mots) <- mot.names

    if (return.raw) return(mots)

    if (use.type == "PWM") {
      for (i in seq_along(mots)) {
        if (any(is.infinite(mots[[i]])))
          stop("Found non-finite values in motif")
      }
    }

    plotobj_multi <- vector("list", length(mots))
    for (i in seq_along(plotobj_multi)) {
      plotobj_multi[[i]] <- prep_single_motif_plot_data(
        mots[[i]], use.type, fontDF, min.height, x.spacer, y.spacer, sort.positions,
        sort.positions.decreasing, fit.to.height, 1
      )
      plotobj_multi[[i]]$motif.id <- names(mots)[i]
    }
    plotobj <- do.call(rbind, plotobj_multi)
    plotobj$y <- plotobj$y * 0.999

    breaks2 <- seq_len(ncol(mots[[1]]))
    limits2 <- c(min(breaks2) - 0.501, max(breaks2) + 0.501)

    mot_colsums <- vapply(mots, function(x) max(colSums(x)), numeric(1))
    if (use.type == "PCM") {
      breaks <- c(0, round(max(mot_colsums) / 2), max(mot_colsums))
      ylim2 <- breaks[-2]
      breaks <- unique(breaks)
    } else if (use.type == "PWM") {
      breaks <- c(round(min(mot_colsums)) - 1, round(min(mot_colsums) / 2), 0,
        round(log2(nrow(mots[[1]])) / 2), log2(nrow(mots[[1]])))
      mots.tmp <- mots
      minvals <- numeric(length(mots))
      for (i in seq_along(mots.tmp)) {
        mots.tmp[[i]][mots.tmp[[i]] > 0] <- 0
        minvals[i] <- min(colSums(mots.tmp[[i]]))
      }
      mots.tmp <- mots
      maxvals <- numeric(length(mots.tmp))
      for (i in seq_along(mots.tmp)) {
        mots.tmp[[i]][mots.tmp[[i]] < 0] <- 0
        maxvals[i] <- max(colSums(mots.tmp[[i]]))
      }
      breaks <- round(c(min(minvals), min(minvals) / 2, 0, max(maxvals) / 2, max(maxvals)))
      breaks[1] <- breaks[1] - max(c(1, breaks[1] * 0.1))
      ylim2 <- c(breaks[1], max(maxvals))
      breaks <- unique(breaks)
    }

    # Proper handling of x-limits/breaks/labels for motifs with added on blank
    # edges (if show.positions.once = FALSE):
    # - adjust input data so actual start of motif is always 1; if right shifted,
    #   then those blank left positions should start in the negatives
    # - don't feed limits anything
    # - don't feed breaks anything
    # - feed labels a function to replace numbers which don't belong to motif
    #   with blanks (not sure how to make this work with left shifted motifs)

    plotobj$motif.id <- factor(plotobj$motif.id, levels = mot.names)

    if (!show.positions.once) {
      plabs <- res$offsets
      plotcounter <- new.env()
      plotcounter$p <- 0
      labfun <- function(x) {
        pcount <- get("p", envir = plotcounter)
        pcount <- pcount + 1
        assign("p", pcount, envir = plotcounter)
        y <- plabs[[pcount]]
        z <- seq(from = x[1], to = x[length(x)], by = 1)
        y <- c(rep(TRUE, y), rep(FALSE, ncol(mot.mats[[pcount]])))
        y <- c(y, rep(TRUE, length(x) - length(y)))
        z[y] <- ""
        z[!y] <- seq_len(sum(!y))
        z
      }
    }
    plotobj <- ggplot(plotobj, aes(.data$x, .data$y, group = .data$letter.id,
        fill = .data$group)) +
      geom_polygon() +
      ylab(yname) +
      xlab(element_blank()) +
      theme_minimal() +
      scale_y_continuous(breaks = breaks, limits = ylim2, expand = c(0, 0)) +
      scale_x_continuous(breaks = breaks2, limits = limits2, expand = c(0.02, 0),
        labels = if (!show.positions.once) labfun else waiver()) +
      theme(axis.line.y = element_line(size = 0.25),
        panel.grid = element_blank(),
        text = element_text(size = text.size),
        axis.ticks.y = element_line(size = 0.25), legend.position = "none",
        axis.text.x = element_text(colour = "black"),
        axis.text.y = element_text(colour = "black", margin = margin(r = 1))) +
      facet_wrap(~motif.id, ncol = 1, strip.position = names.pos,
        scales = if (!show.positions.once) "free_x" else "fixed")

    if (!show.names && (!show.positions || show.positions.once))
      plotobj <- plotobj + theme(panel.spacing = unit(1, "lines"))
    else if ((!show.names || names.pos == "right") &&
             show.positions && show.positions.once)
      plotobj <- plotobj + theme(panel.spacing = unit(1, "lines"))

    if (show.names && names.pos == "right")
      plotobj <- plotobj +
        theme(strip.text.y.right = element_text(angle = 0, hjust = 0))

    if (!show.positions) plotobj <- plotobj + theme(axis.text.x = element_blank())

    if (!is.null(colour.scheme)) plotobj <- plotobj +
      scale_fill_manual(values = colour.scheme[alph], limits = alph)

    if (use.type == "PWM")
      plotobj <- plotobj + geom_hline(yintercept = 0, size = 0.25, colour = "grey75")

    if (!show.names) plotobj <- plotobj + theme(strip.text = element_blank())

  }

  plotobj

}

#-----------------------------------------------------------------------------

prep_single_motif_plot_data <- function(mat, use.type, fontDF, min.height = 0.03,
  x.spacer = 0.04, y.spacer = 0.01, sort.positions = FALSE,
  sort.positions.decreasing = TRUE, fit.to.height = NULL, fit.to.width = 1) {

  if (use.type == "PPM") {

    mat[mat > 1] <- 1
    make_matrix_polygon_data(mat, fontDF, min.height, x.spacer, y.spacer,
      sort.positions, sort.positions.decreasing, fit.to.height, fit.to.width)

  } else if (use.type == "PCM") {

    maxheight <- max(colSums(mat))
    make_matrix_polygon_data(mat, fontDF, min.height * maxheight,
      x.spacer, y.spacer * maxheight, sort.positions,
      sort.positions.decreasing, fit.to.height, fit.to.width)

  } else if (use.type == "PWM") {

    mat[mat > log2(nrow(mat))] <- log2(nrow(mat))
    maxheight <- log2(nrow(mat))
    if (any(mat < 0)) maxheight <- maxheight - min(mat)
    make_matrix_polygon_data(mat, fontDF, min.height * log2(nrow(mat)),
      x.spacer, y.spacer * maxheight, sort.positions,
      sort.positions.decreasing, NULL, fit.to.width)

  } else if (use.type == "ICM") {

    maxheight <- log2(nrow(mat))
    mat[mat > maxheight] <- maxheight
    make_matrix_polygon_data(mat, fontDF, min.height * maxheight,
      x.spacer, y.spacer * maxheight, sort.positions,
      sort.positions.decreasing, NULL, fit.to.width)

  }

}

make_matrix_polygon_data <- function(mat, fontDF, min.height = 0.03,
  x.spacer = 0.04, y.spacer = 0.01, sort.positions = FALSE,
  sort.positions.decreasing = TRUE, fit.to.height = NULL,
  fit.to.width = 1) {

  out <- vector("list", ncol(mat))

  for (i in seq_along(out)) {
    out[[i]] <- make_position_polygon_data(mat[, i, drop = TRUE], fontDF,
      min.height, x.spacer, y.spacer, sort.positions,
      sort.positions.decreasing, fit.to.height, fit.to.width, i)
    if (nrow(out[[i]])) {
      out[[i]]$x <- out[[i]]$x + i - 1
    }
  }

  do.call(rbind, out)

}

make_position_polygon_data <- function(vec, fontDF, min.height = 0.03,
  x.spacer = 0.04, y.spacer = 0.01, sort.positions = FALSE,
  sort.positions.decreasing = TRUE, fit.to.height = NULL, fit.to.width = 1,
  position.id, trim.top = FALSE, trim.bot = FALSE) {

  anyNeg <- any(vec < 0) && any(abs(vec[vec < 0]) >= 0.03)
  if (anyNeg) {
    negvec <- abs(vec[vec < 0])
    negout <- make_position_polygon_data(negvec, fontDF, min.height,
      x.spacer, y.spacer, sort.positions, !sort.positions.decreasing, NULL,
      fit.to.width, position.id, TRUE)
    negout$y <- negout$y - max(negout$y)
    vec <- vec[vec >= 0]
  } else {
    vec[vec < 0] <- 0
  }

  if (is.null(fit.to.height)) fit.to.height <- sum(vec)
  vec <- vec[vec >= min.height]

  if (all(vec == 0)) return(
    data.frame(x = NA_real_, y = NA_real_, order = NA_integer_,
      group = NA_character_, letter.id = NA_character_)
      # letter.id = NA_character_, position.id = position.id)
  )

  vec <- (vec / sum(vec)) * fit.to.height

  if (sort.positions) vec <- sort(vec, decreasing = sort.positions.decreasing)

  # Some low prio stuff that could be fixed eventually:

    # The current y.spacer implementation is technically introducing some
    # incorrectness, since middle letters get squished more than outer letters
    # and they then lose their real proportions. Shouldn't be too noticeable
    # when y.spacer is small, but worth going back and fixing later. Of course,
    # a quick fix would be to squish the outer letters just as much as the inner
    # ones but I don't think that looks as nice.

    # Make it so the y.spacer is implemented between the bottom letter of
    # +0 and the top letter of -0, possible via setting the trim.top/bot flags.
    # -- Maybe only apply y.spacer to -0 during regular use?

  y.spacer2 <- 0
  if (length(vec) > 1)
    y.spacer2 <- y.spacer * ((length(vec) - 1) / length(vec))

  adjheights <- c(rev(cumsum(rev(vec))) + y.spacer2 / 2, 0)

  out <- vector("list", length(vec))
  for (i in seq_along(out)) {

    tmplet <- names(vec)[i]
    tmpyspacer <- y.spacer2
    if ((i == 1 || i == length(vec)) && length(vec) > 1) {
      tmpyspacer <- tmpyspacer - y.spacer2 / 2
    }
    tmpletheight <- unname(vec[i]) - tmpyspacer
    tmpletid <- paste0(position.id, ".", tmplet)

    if (nchar(tmplet) > 1) {
      out[[i]] <- make_multi_letter_polygon_data(
        tmplet, fontDF, tmpletheight, fit.to.width, x.spacer, tmpletid)
    } else {
      out[[i]] <- make_letter_polygon_data(tmplet, fontDF, tmpletheight,
        fit.to.width, x.spacer, paste0(tmpletid, ".1"))
    }

    out[[i]]$x <- out[[i]]$x + 0.5
    out[[i]]$y <- out[[i]]$y + adjheights[i + 1]

  }

  if (anyNeg) out <- c(out, list(negout))
  out <- do.call(rbind, out)

  out

}

# Since x.spacer is not applied to outer edges of first/last letters, handle
# it outside the individual letter functions.

make_letter_polygon_data <- function(letter, fontDF, height, fit.to.width,
  x.spacer, letter.id, centre.letter = TRUE) {

  if (!letter %in% fontDF$group) {
    warning(wmsg("Letter \"", letter, "\" was not found in `fontDF`"),
      call. = FALSE)
    return(data.frame(x = NA_real_, y = NA_real_, order = NA_integer_,
      group = NA_character_, letter.id = letter.id))
  }
  fontDF <- fontDF[fontDF$group == letter, ]
  fontDF$y <- fontDF$y * height
  fit.to.width <- fit.to.width - x.spacer
  fontDF$x <- fontDF$x * fit.to.width
  if (centre.letter) fontDF$x <- fontDF$x + (1 - fit.to.width) / 2

  fontDF$letter.id <- letter.id

  fontDF

}

## Keep this around in case the intra-position x.spacer implementation is broken
# make_multi_letter_polygon_data <- function(multiletter, fontDF, height,
#   fit.to.width, x.spacer, letter.id) {
#
#   splitlets <- strsplit(multiletter, "")[[1]]
#   out <- vector("list", length(splitlets))
#   for (i in seq_along(out)) {
#     out[[i]] <- make_letter_polygon_data(splitlets[i], fontDF, height,
#       fit.to.width / length(out), x.spacer,
#       paste0(letter.id, ".", i), centre.letter = FALSE)
#     out[[i]]$x <- out[[i]]$x + (fit.to.width / length(out)) * (i - 1)
#     out[[i]]$x <- out[[i]]$x + x.spacer / 2
#     out[[i]]$x <- out[[i]]$x + (1 - fit.to.width) / 2
#   }
#
#   out <- do.call(rbind, out)
#
#   # out$x <- out$x + x.spacer / 2
#   # out$x <- out$x + (1 - fit.to.width) / 2
#
#   out
#
# }

make_multi_letter_polygon_data <- function(multiletter, fontDF, height,
  fit.to.width, x.spacer, letter.id) {

  splitlets <- strsplit(multiletter, "")[[1]]
  out <- vector("list", length(splitlets))
  for (i in seq_along(out)) {
    out[[i]] <- make_letter_polygon_data(splitlets[i], fontDF, height,
      fit.to.width / length(out), x.spacer / 10,
      paste0(letter.id, ".", i), centre.letter = FALSE)
    out[[i]]$x <- out[[i]]$x + (fit.to.width / length(out)) * (i - 1)
    out[[i]]$x <- out[[i]]$x + (1 - fit.to.width) / 2
  }

  out <- do.call(rbind, out)

  out$x <- out$x * (fit.to.width - x.spacer)
  out$x <- out$x + (1 - fit.to.width + x.spacer) / 2

  out

}

#------------------------------------------------------------------------------

check_mot_sizes <- function(mots) {

  sizes <- vapply(mots, ncol, integer(1))
  msize <- max(sizes)

  if (length(unique(sizes)) == 1) {
    mots <- check_right_side(mots, msize)
    return(mots)
  }

  for (i in seq_along(sizes)) {
    if (sizes[i] < msize) {
      mots[[i]] <- cbind(mots[[i]], matrix(0, nrow = nrow(mots[[i]]),
                                           ncol = msize - sizes[i]))
    }
  }

  check_right_side(mots, msize)

}

check_right_side <- function(mots, msize) {

  ok <- FALSE
  thiscol <- msize

  while (!ok) {

    for (i in seq_along(mots)) {
      if (any(mots[[i]][, thiscol] > 0)) ok <- TRUE
    }

    if (!ok) {
      for (i in seq_along(mots)) {
        mots[[i]] <- mots[[i]][, -thiscol]
      }
      thiscol <- thiscol - 1
    }

  }

  mots

}

convert_mat_type_from_ppm <- function(mot.mat, type, nsites, bkg, pseudocount,
                                      relative_entropy) {

  which.zero <- apply(mot.mat, 2, function(x) any(x != 0))

  if (length(nsites) == 0 || nsites == 1) nsites <- 100
  if (length(pseudocount) == 0) pseudocount <- 1

  mot.mat2 <- mot.mat[, which.zero, drop = FALSE]
  mot.mat2 <- switch(type,
                       "PCM" = apply(mot.mat2,  2,
                                     ppm_to_pcmC, nsites = nsites),
                       "PWM" = apply(mot.mat2, 2,
                                     ppm_to_pwmC, bkg = bkg,
                                     pseudocount = pseudocount,
                                     nsites = nsites),
                       "ICM" = apply(mot.mat2, 2,
                                     ppm_to_icmC, bkg = bkg,
                                     relative_entropy = relative_entropy),
                        mot.mat2
                     )

  mot.mat[, which.zero] <- mot.mat2

  mot.mat

}
bjmt/universalmotif documentation built on Nov. 16, 2024, 7:38 a.m.