R/statepaintr.R

#' Run a StatePaintR decisionMatrix against a group of files listed in a
#' manifest.
#'
#' @param manifest A character vector containing the filename of the manifest
#'   file. Alternatively, it can be a \code{data.frame}, with the same format as
#'   the text manifest. See details for the expected format.
#' @param decisionMatrix An object of class \code{\linkS4class{decisionMatrix}}.
#' @param scoreStates logical; if scores have been specified in the
#'   decisionMatrix, should the states be scored?
#' @param progress logical; show progress bar and timing details?
#'
#' @return A \code{\link{GRangesList}}, each \code{\link{GRanges}} object
#'   describing the states of all segements for a sample. Each range is
#'   described by the fields \code{name} indicating the sample, \code{state}
#'   indicating the state, and optionally \code{score} indicating the score of
#'   the state. The \code{\linkS4class{decisionMatrix}} and the manifest used to
#'   run the segmentation are included in the \code{attributes} of the output
#'   object.
#' @details The manifest is a tab delimited file containing five fields; SAMPLE,
#'   MARK, SRC, BUILD, and FILE. \cr `SAMPLE` refers to the sample to which the
#'   marks are related, like a cell line, or tissue. \cr `MARK` refers to the
#'   chromatin mark, or other feature described in the
#'   \code{\linkS4class{decisionMatrix}} \code{abstractionLayer} \cr `SRC`
#'   refers to the source of the data, retained for documentation, but not used
#'   in the functions. \cr `BUILD` refers to the genome build of the data
#'   tracks. All tracks under a single sample must be of the same genome build.
#'   \cr `FILE` refers to the location of the file describing the mark. Can be
#'   .bed, .narrowPeak, .gappedPeak, etc. Only the first three columns are used,
#'   `chromosome`, `start`, and `end`, unless \code{scoreStates = TRUE}, in
#'   which case a narrowPeak file is required.
#' @examples
#' manifest <- system.file("extdata", "manifest.hmec.txt", package = "StatePaintR")
#' load(system.file("extdata", "poised.promoter.model.rda", package = "StatePaintR"))
#' states <- PaintStates(manifest = manifest,
#'                       decisionMatrix = poised.promoter.model,
#'                       scoreStates = FALSE, progress = FALSE)
#' states
#' attributes(states)$manifest
#' attributes(states)$decisionMatrix
#'
#' @importFrom GenomicRanges disjoin findOverlaps reduce binnedAverage coverage
#' @importFrom S4Vectors from to DataFrame
#' @importFrom stringr str_detect coll str_replace
#' @importFrom data.table frankv
#' @importFrom matrixStats rowMedians rowMaxs
#' @importFrom Matrix rowMeans
#' @importFrom IRanges %outside%
#' @importFrom utils txtProgressBar setTxtProgressBar
#'
#' @export
PaintStates <- function(manifest, decisionMatrix, scoreStates = FALSE, progress = TRUE) {
  start.time <- Sys.time()
  ## check arguments
  if (missing(manifest)) {stop("provide a manifest describing the location of your files \n",
                              "and the mark that was investigated")}
  if (missing(decisionMatrix)) {stop("provide a decisionMatrix object")}
  if (is(decisionMatrix, "decisionMatrix")) {
    deflookup <- reverse_tl(abstractionLayer(decisionMatrix))
    names(deflookup) <- tolower(names(deflookup))
    d <- decisionMatrix(decisionMatrix)
  } else {
    stop("arg: decisionMatrix must be object of class decisionMatrix")
  }

  ## set up samples from manifest
  samples <- parse.manifest(manifest)
  sample.genomes <- as.list(unique(unlist(sapply(samples, "[", "BUILD"))))
  sample.genomes.names <- lapply(sample.genomes, function(x) {str_replace(tolower(x), "grch38", "hg38")})
  do.seqinfo <- try(Seqinfo(genome = sample.genomes.names[[1]]))
  if (inherits(do.seqinfo, "try-error")) {
    sample.genomes <- lapply(sample.genomes.names, function(x) NULL)
    warning("could not download seqinfo for genome")
  } else {
    sample.genomes <- lapply(sample.genomes.names, function(x) Seqinfo(genome = x))
  }
  names(sample.genomes) <- sample.genomes.names

  ## prep output + skipped samples
  output <- list()
  skipped <- list()

  ## prep progess bars
  lc <- 0
  if (progress) pb <- txtProgressBar(min = 0, max = length(samples) * 3, style = 3)

  for (cell.sample in seq_along(samples)) {
    lc <- lc + 1
    if (progress) setTxtProgressBar(pb, lc)
    skipname <- cell.sample
    cell.sample <- samples[[cell.sample]]
    x <- GetBioFeatures(manifest = cell.sample, dm = decisionMatrix, my.seqinfo = sample.genomes[[cell.sample[1, "BUILD"]]])
    if (is.null(x)) {
      warning(cell.sample[1, "SAMPLE"], " has no MARKs that are present in the translation layer \n",
              " MARKs included are ", paste(cell.sample[, "MARK"], collapse = " "))
      skipped <- c(skipped, skipname)
      lc <- lc + 2
      next()
    }
    ## normalize sample names
    names(x) <- tolower(names(x))
    inputset <- names(x)
    inputset <- sapply(inputset, function(x, y = deflookup) {
      out <- unlist(y[str_detect(x, paste0(names(y), "$"))], use.names = FALSE)
      return(out)
      })
    ## filter data sets that do not have a role in the abstractionLayer
    notInTranslationLayer <- sapply(inputset, is.null)
    if (all(sapply(inputset, is.null))) {
      warning(cell.sample[1, "SAMPLE"], " has no MARKs that are present in the translation layer \n",
              " MARKs included are ", paste(cell.sample[, "MARK"], collapse = " "))
      skipped <- c(skipped, skipname)
      lc <- lc + 2
      next()
    }
    x <- x[!notInTranslationLayer]
    inputset <- inputset[!notInTranslationLayer]
    names(x) <- inputset
    ## if there are multiple sources of data with the same meaning in the abstractionLayer
    ## merge them by averaging thier scores across bins
    to.merge <- inputset[duplicated(inputset)]

    merged <- MergeDataSets(input = x, input.desc = inputset, merge.groups = to.merge, score = scoreStates)
    x <- merged[["input"]]
    inputset <- merged[["input.desc"]]

    if (length(to.merge) >= 1) {
      to.merge <- unique(unlist(to.merge))
      for (mark in to.merge) {
        x.merge <- unlist(x[names(x) %in% mark])
        x.names <- unique(x.merge$feature)
        x <- x[!(names(x) %in% mark)]
        names(x.merge) <- NULL
        if (scoreStates) {
          x.merge <- binnedAverage(reduce(x.merge), coverage(x.merge, weight = "signalValue"), varname = "signalValue")
          mcols(x.merge)$feature <- paste(x.names, collapse = "|")
          mcols(x.merge) <- mcols(x.merge)[, c(2,1)]
        } else {
          x.merge <- reduce(x.merge)
          mcols(x.merge)$feature <- paste(x.names, collapse = "|")
          mcols(x.merge)$signalValue <- NA
        }
        x.merge <- GRangesList(x.merge)
        names(x.merge) <- mark
        x <- append(x, x.merge)
        rm(x.merge)
        inputset <- inputset[inputset != mark]
        inputset <- c(inputset, merged = mark)
      }
    }

    inputset.c <- names(inputset)
    names(inputset.c) <- inputset
    x.f <- disjoin(unlist(x))

    dontuseScore.i <- grep(pattern = "\\*$", inputset)
    inputset <- str_replace(inputset, "\\*$", "")
    dontbreak.i <- grep(pattern = "^\\[.*\\]$", inputset)
    inputset <- str_replace(inputset, "^\\[", "")
    inputset <- str_replace(inputset, "\\]$", "")

    dontbreak <- inputset[dontbreak.i]
    dontuseScore <- inputset[dontuseScore.i]

    names(x) <- inputset
    if (length(dontbreak) > 0) {
      x.f <- x.f[x.f %outside% x[dontbreak]]
      unfrag <- reduce(sort(do.call(c, list(unlist(x[dontbreak]), ignore.mcols = TRUE))))
      x.f <- c(x.f, unfrag, ignore.mcols = TRUE)
      x.f <- sort(x.f)
    }
    overlaps <- findOverlaps(x.f, x)
    resmatrix <- make.overlap.matrix(from(overlaps), to(overlaps), names(x))
    resmatrix <- resmatrix[, colnames(d)[colnames(d) %in% colnames(resmatrix)], drop = FALSE]
    if (scoreStates) {
      scorematrix <- resmatrix
      signalCol <- sapply(x, function(x) !is.na(mcols(x)[1, "signalValue"]))
      signalCol <- signalCol[!(names(signalCol) %in% dontuseScore)]
      signalCol <- which(signalCol[colnames(scorematrix)])
      for (feature in names(signalCol)) {
        scoreOverlaps <- findOverlaps(x.f, x[[feature]])
        scorematrix[from(scoreOverlaps), feature] <- mcols(x[[feature]])[to(scoreOverlaps), "signalValue"]
        scorematrix[, feature] <- frankv(scorematrix[, feature], ties.method = "average")
      }
      scorematrix <- scorematrix[, names(signalCol), drop = FALSE]
    }

    resmatrix <- resmatrix + 2L

    missing.data <- colnames(d)[!(colnames(d) %in% colnames(resmatrix))]
    if (any(is.na(d))) {
      d[is.na(d)] <- 1L
    }

    lmd <- length(missing.data)
    if (lmd > 0) {
      dl <- d[-which(matrix(bitwAnd(d[, missing.data, drop = FALSE], 2L) == 2L, ncol = lmd), arr.ind = TRUE)[,1], -which(colnames(d) %in% missing.data), drop = FALSE]
    } else {
      dl <- d
    }
    d.order <- dl
    d.order[dl == 1] <- 0
    d.order[dl == 0] <- 1
    dl <- dl[order(rowSums(d.order, na.rm = TRUE), decreasing = FALSE), , drop = FALSE]
    resmatrix <- t(resmatrix)
    mcols(x.f)$name <- cell.sample[1, "SAMPLE"]
    lc <- lc + 1
    if (progress) setTxtProgressBar(pb, lc)
    if (scoreStates) {
      dl.score <- dl[, names(signalCol)]
      score.cells <- which(dl.score == 3L, arr.ind = TRUE)
      if (length(signalCol) == 1) {
        score.cells <- matrix(score.cells, ncol = 1, dimnames = list(names(score.cells), c("row")))
        score.cells <- cbind(score.cells, col = 1)
      }
      segments <- data.frame(state = flookup(resmatrix, dl)[, 1], score = NA, stringsAsFactors = FALSE)
      #segments <- data.frame(state = flookup(resmatrix, dl)[, 1], median = NA, mean = NA, max = NA, stringsAsFactors = FALSE)
      score.features <- unique(segments$state)[(unique(segments$state) %in% rownames(score.cells))]
      for (score.feature in score.features) {
        scorematrix.rows <- segments$state == score.feature
        feature.scores <- scorematrix[segments$state == score.feature,
                                      score.cells[rownames(score.cells) %in% score.feature, "col"],
                                      drop = FALSE]
        #feature.scores.df <- data.frame(median = numeric(), mean = numeric(), max = numeric())
        if (is(feature.scores, "matrix")) {
          feature.scores <- rowMaxs(feature.scores)
          #feature.scores.df <- rbind.data.frame(feature.scores.df, data.frame(median = rowMedians(feature.scores), mean = rowMeans(feature.scores), max = rowMaxs(feature.scores)))
        } else {
          feature.scores <- feature.scores
          #feature.scores <- feature.scores / 2
          #feature.scores.df <- rbind.data.frame(feature.scores.df, data.frame(median = feature.scores / 2, mean = feature.scores / 2, max = feature.scores))
        }
        #if(cell.sample[1,1] == "thyroid gland" & score.feature == "PPR") browser()
        segments[segments$state == score.feature, "score"] <- feature.scores
        #segments[segments$state == score.feature, c("median", "mean", "max")] <- feature.scores.df
      }
      # feature.score.bm2 <<- feature.score.tmp1
      mcols(x.f)$state <- segments$state
      mcols(x.f)$score <- segments$score
      #mcols(x.f)[, c("median", "mean", "max")] <- DataFrame(segments[, c("median", "mean", "max")])
    } else {
      mcols(x.f)$state <- flookup(resmatrix, dl)[, 1]
    }
    x.f.l <- split(x.f, x.f$state)
    if (scoreStates) {
      x.f.ll <- lapply(x.f.l, reduce, with.revmap = TRUE)
      for (state.name in names(x.f.ll)) {
        if (state.name %in% score.features) {
          score.feature <- state.name
          revmap <- mcols(x.f.ll[[score.feature]])$revmap
          my.scores <- sapply(revmap, function(my.splits, orig.gr.cols) {
            if(length(my.splits) > 1) {
              if(zero_range(orig.gr.cols[my.splits])) {
                my.splits[1]
              } else {
                my.splits[which(orig.gr.cols[my.splits] == max(orig.gr.cols[my.splits]))[[1]]]
              }
            } else {
              my.splits
            }
          }, orig.gr.cols = mcols(x.f.l[[score.feature]])$score)
          my.scores <- mcols(x.f.l[[score.feature]])[my.scores, "score"]
          #my.scores <- mcols(x.f.l[[score.feature]])[revmap, "score"]
          #my.scores <- mcols(x.f.l[[score.feature]])[revmap, c("median", "mean", "max")]
        } else {
          my.scores <- 0
          #my.scores <- DataFrame(median = rep.int(0, length(x.f.ll[[state.name]])), mean = rep.int(0, length(x.f.ll[[state.name]])), max = rep.int(0, length(x.f.ll[[state.name]])))
        }
        mcols(x.f.ll[[state.name]])$revmap <- NULL
        mcols(x.f.ll[[state.name]])$name <- cell.sample[1, "SAMPLE"]
        mcols(x.f.ll[[state.name]])$state <- state.name
        if(max(my.scores) > 0) {
          mcols(x.f.ll[[state.name]])$score <- ceiling((my.scores/max(my.scores)) * 1000)
        } else {
          mcols(x.f.ll[[state.name]])$score <- 0
        }
        #mcols(x.f.ll[[state.name]])[, c("median", "mean", "max")] <- my.scores
      }
      x.f.l <- x.f.ll
    } else {
      x.f.l <- lapply(x.f.l, reduce)
      for (state.name in names(x.f.l)) {
        mcols(x.f.l[[state.name]])$name <- cell.sample[1, "SAMPLE"]
        mcols(x.f.l[[state.name]])$state <- state.name
        mcols(x.f.l[[state.name]])$score <- 0
      }
    }
    x.f <- sort(do.call(c, unlist(x.f.l, use.names = FALSE)))
    #x.f$score <- (x.f$score/max(x.f$score)) * 1000
    # mcols(x.f)[, c("median", "mean", "max")] <- lapply(mcols(x.f)[, c("median", "mean", "max")], function(x) {
    #   x/max(x) * 1000
    # })
    output <- c(output, x.f)
    lc <- lc + 1
    if (progress) setTxtProgressBar(pb, lc)
  }
  if (length(samples) > 1) {
    output <- GRangesList(output)
  }
  if (progress) close(pb)

  skipped <- unlist(skipped)
  if (is.null(skipped)) {
    names(output) <- names(samples)
  } else {
    names(output) <- names(samples)[-skipped]
    samples <- samples[-skipped]
  }

  attributes(output)$manifest <- samples
  attributes(output)$decisionMatrix <- decisionMatrix
  done.time <- Sys.time() - start.time
  if (progress) message("processed ", length(samples), " in ", round(done.time, digits = 2), " ", attr(done.time, "units"))
  return(output)
}

#' Write StatePaintR object to bedfiles
#'
#' @param states A GRangesList produced by \code{\link{PaintStates}}
#' @param output.dir A character string indicating the directory to save the
#'   exported states. The directory will be created if it does not exist.
#' @param progress logical; show progress bar and timing details?
#'
#' @importFrom dplyr left_join
#' @importFrom stringr str_replace_all
#' @importClassesFrom rtracklayer BasicTrackLine
#' @return Invisibly returns the states object.
#' @export
#' @examples
#' \dontrun{
#' ExportStatePaintR(states = states,
#'                   output.dir = tempdir())
#' }
ExportStatePaintR <- function(states, output.dir, progress = TRUE) {
  start.time <- Sys.time()
  if (missing(output.dir)) { stop("please indicate output directory") }
  if (!dir.exists(output.dir)) { dir.create(output.dir) }
  m.data <- attributes(states)$manifest
  decisionMatrix <- attributes(states)$decisionMatrix
  decisionMatrix@abstraction.layer <- lapply(abstractionLayer(decisionMatrix), tolower)
  m.data <- lapply(m.data, function(manifest, dm = decisionMatrix) {
    manifest <- manifest[tolower(manifest$MARK) %in% unlist(abstractionLayer(dm), use.names = FALSE), , drop = FALSE]
    if (nrow(manifest) < 1) {
      return(NULL)
    } else {
      return(manifest)
    }
  })
  m.data <- m.data[!sapply(m.data, is.null)]
  color.key <- stateColors(decisionMatrix)
  hub.id <- decisionMatrix@id
  if (progress) pb <- txtProgressBar(min = 0, max = length(states), style = 3)
  for (state in seq_along(states)) {
    if (progress) setTxtProgressBar(pb, state)
    s.name <- str_replace_all(names(m.data)[state], "/", "-")
    s.name <- str_replace_all(s.name, " ", "_")
    s.m.data <- m.data[[state]]
    state <- states[[state]]
    write.state(state, s.m.data, color.key, hub.id,
                file.path(output.dir,
                          paste0(s.name, ".", nrow(s.m.data), "mark.segmentation.bed")))
  }
  if (progress) close(pb)
  done.time <- Sys.time() - start.time
  if (progress) message("processed ", length(states), " in ", round(done.time, digits = 2), " ", attr(done.time, "units"))
  message("segmentation files written to: ", output.dir)
  return(invisible(states))
}

#' Retrieve Decision Matrix from StateHub
#'
#' @param search character string of either unique id, or search term.
#'
#' @return \code{\linkS4class{decisionMatrix}} object
#' @importFrom httr content GET http_error modify_url
#' @importFrom jsonlite toJSON fromJSON
#' @importFrom stringr str_extract
#' @export
#'
#' @examples
#' get.decision.matrix(search = "Cedars-Sinai")
#' poised.promoter.model <- get.decision.matrix("5813b67f46e0fb06b493ceb0")
#' poised.promoter.model
get.decision.matrix <- function(search) {
  if (missing(search)) { stop("missing search argument") }
  stateHub <- modify_url("http://statehub.org/statehub", path = "statehub/getmodel.jsp")
  query <- GET(stateHub, query = list(id = search))
  if (length(content(query)) < 1) {
    query <- GET(stateHub, query = list(search = search))
    if (http_error(query)) { stop("site not availible") }
    if (length(content(query)) < 1) { stop("no search results found") }
  }
  query <- content(query)
  for (query.i in seq_along(query)) {
    query.result <- query[[query.i]]
    state.names <- sapply(query.result$states, function(x) {x$name})
    for (state in seq_along(query.result$states)) {
      if (state == 1) {
        decision.matrix <- produce.state(query.result$states, state)
      } else {
        decision.matrix <- rbind(decision.matrix, produce.state(query.result$states, state))
      }
    }
    rownames(decision.matrix) <- state.names
    state.colors <- sapply(query.result$states, "[[", "format")
    state.colors <- str_extract(state.colors, "#([a-f]|[A-F]|[0-9]){3}(([a-f]|[A-F]|[0-9]){3})?\\b")
    names(state.colors) <- state.names
    decision.matrix <- new("decisionMatrix",
                           id = query.result[["id"]],
                           name = query.result[["name"]],
                           author = query.result[["author"]],
                           revision = query.result[["revision"]],
                           description = query.result[["description"]],
                           abstraction.layer = fromJSON(toJSON(query.result[["translation"]], auto_unbox = TRUE)),
                           decision.matrix = decision.matrix,
                           state.colors = state.colors)
    if (query.i <= 1) {
      output <- decision.matrix
    } else {
      output <- c(output, decision.matrix)
    }
  }
  return(output)
}

#' decisionMatrix class
#'
#' @slot id character. a unique ID that refers to a model revision
#' @slot name character. The name of the model on StateHub
#' @slot author character. The author of the model on StateHub
#' @slot revision character. The revision of the model. A time stamp
#' @slot description character. A short description of the model.
#' @slot abstraction.layer list. A description of the relationship between the
#'   precise data type (e.g. a chromatin mark like H3K27ac) and feature
#'   describing the functional category in the decision.matrix (e.g.
#'   Regulatory).
#' @slot decision.matrix matrix. The description of what combination of features
#'   are required to call a state.
#' @slot state.colors character. The color to assign to each state.
#'
#' @import methods
#' @return an object of class decisionMatrix, normally made in response to a
#'   query to StateHub with \code{\link{get.decision.matrix}}
#' @export
#'
setClass(Class = "decisionMatrix",
         slots = list(id = "character",
                      name = "character",
                      author = "character",
                      revision = "character",
                      description = "character",
                      abstraction.layer = "list",
                      decision.matrix = "matrix",
                      state.colors = "character"))




#' decisionMatrix
#'
#' @param object of class decisionMatrix
#'
#' @return a matrix; The description of what combination of features are required to call a state.
#' @export
#'
#' @examples
#' load(system.file("extdata", "poised.promoter.model.rda", package = "StatePaintR"))
#' poised.promoter.model
#' decisionMatrix(poised.promoter.model)
setGeneric("decisionMatrix", function(object) standardGeneric("decisionMatrix"))
#' @describeIn decisionMatrix Extract Descision Matrix from descisionMatrix object
#' @export
#' @examples
#' load(system.file("extdata", "poised.promoter.model.rda", package = "StatePaintR"))
#' poised.promoter.model
#' decisionMatrix(poised.promoter.model)
setMethod("decisionMatrix",
          signature = signature(object = "decisionMatrix"),
          function(object) {
            object@decision.matrix
          })


#' abstractionLayer
#'
#' @param object of class decisionMatrix
#'
#' @return a list; A description of the relationship between the precise data
#'   type (e.g. a chromatin mark like H3K27ac) and feature describing the
#'   functional category in the decision.matrix (e.g. Regulatory).
#' @details In the abstraction layer one may indicate if a functional category
#'   should not be used for scoring states. This is done by placing an asterisk
#'   at the end of it's name (e.g. Regulatory*). One may also specify that a
#'   certain type of functional category never be split into smaller units by
#'   its overlapping with other features. This is done by placing the name of
#'   the functional category in square brackets (e.g. [Core])
#' @export
#'
#' @examples
#' load(system.file("extdata", "poised.promoter.model.rda", package = "StatePaintR"))
#' poised.promoter.model
#' abstractionLayer(poised.promoter.model)
setGeneric("abstractionLayer", function(object) standardGeneric("abstractionLayer"))
#' @describeIn decisionMatrix Extract abstraction layer from descisionMatrix object
#' @export
#' @examples
#' abstractionLayer(poised.promoter.model)
setMethod("abstractionLayer",
          signature = signature(object = "decisionMatrix"),
          function(object) {
            object@abstraction.layer
          })

#' doNotScore
#'
#' @param decisionMatrix of class decisionMatrix
#' @param functionalCategory a character vector indicating the functional
#'   category to be excluded from scoring.
#' @return a decisionMatrix; the abstractionLayer of the decision matrix will be
#'   altered to indicate that the functional category indicated must not be used
#'   in scoring calculations.
#' @details Sometimes one may wish to use a functional category for calling a
#'   state, but not use the functional category for any scoring. This allows the
#'   user that option.
#' @export
#'
#' @examples
#' dm <- get.decision.matrix("5813b67f46e0fb06b493ceb0")
#' dm <- doNotScore(decisionMatrix = dm, functionalCategory = "Regulatory")
#' dm
doNotScore <- function(decisionMatrix, functionalCategory) {
  if (missing(decisionMatrix)) {
    stop("provide a decisionMatrix object")
  }
  if (missing(functionalCategory)) {
    stop("provide a functionalCategory to modify")
  }
  if (is(decisionMatrix, "decisionMatrix")) {
    categories <- grep(pattern = functionalCategory, x = names(abstractionLayer(decisionMatrix)))
    if (length(colnames) > 0) {
      for (category.i in categories) {
        category <- names(abstractionLayer(decisionMatrix))[category.i]
        names(decisionMatrix@abstraction.layer)[category.i] <- paste0(category, "*")
      }
    }
  } else {
    stop("decisionMatrix must be an object of class decisionMatrix")
  }
  return(decisionMatrix)
}

#' doNotSplit
#'
#' @param decisionMatrix of class decisionMatrix
#' @param functionalCategory a character vector indicating intervals described
#'   by the functional category will never be split into smaller intervals
#' @return a decisionMatrix; the abstractionLayer of the decision matrix will be
#'   altered to indicate that the functional category indicated must not be
#'   split.
#' @details Some functional categories described in an abstraction layer may
#'   perform better or more like one expects if they are never split into
#'   smaller intervals due to being overlapped by other functional categories.
#'   One may indicate if this is the case with this function
#' @export
#'
#' @examples
#' dm <- get.decision.matrix("5813b67f46e0fb06b493ceb0")
#' dm <- doNotSplit(decisionMatrix = dm, functionalCategory = "Core")
#' dm
doNotSplit <- function(decisionMatrix, functionalCategory) {
  if (missing(decisionMatrix)) {
    stop("provide a decisionMatrix object")
  }
  if (missing(functionalCategory)) {
    stop("provide a functionalCategory to modify")
  }
  if (is(decisionMatrix, "decisionMatrix")) {
    categories <- grep(pattern = functionalCategory, x = names(abstractionLayer(decisionMatrix)))
    if (length(colnames) > 0) {
      for (category.i in categories) {
        category <- names(abstractionLayer(decisionMatrix))[category.i]
        if (grepl("\\*$", category)) {
          category <- str_replace(category, "\\*$", "")
          category <- paste0("[", category, "]*")
        } else {
          category <- paste0("[", category, "]")
        }
        names(decisionMatrix@abstraction.layer)[category.i] <- category
      }
    }
  } else {
    stop("decisionMatrix must be an object of class decisionMatrix")
  }
  return(decisionMatrix)
}

#' stateColors
#'
#' @param object of class decisionMatrix
#'
#' @return character. The color to assign to each state.
#' @export
#'
#' @examples
#' load(system.file("extdata", "poised.promoter.model.rda", package = "StatePaintR"))
#' poised.promoter.model
#' abstractionLayer(poised.promoter.model)
setGeneric("stateColors", function(object) standardGeneric("stateColors"))
#' @describeIn decisionMatrix Extract color information from descisionMatrix object
#' @param object decisionMatrix.
#' @export
#' @examples
#' stateColors(poised.promoter.model)
setMethod("stateColors",
          signature = signature(object = "decisionMatrix"),
          function(object) {
            data.frame(STATE = names(object@state.colors), COLOR = object@state.colors, stringsAsFactors = FALSE)
          })

#' PRG - generate a Precision Recall Gain object to show accuracy of states
#'
#' @param states A GRangesList produced by \code{\link{PaintStates}} or a list
#'   of GRanges, or GRangesList that has a "score" column, and optionally a
#'   "state" column, that is filtered on if the \code{state.select} argument is
#'   used
#' @param comparison A GRanges object that indicates TRUE or FALSE for a
#'   specific genomic intervals. The names of the the \code{states} object must
#'   be reflected in the columns of the \code{comparison} object, Where each
#'   interval is indicated as being validated with 1 or negative with 0
#' @param state.select A character vector that will filter the states object if
#'   the states object has a "state" column. For example if the output of
#'   \code{\link{PaintStates}} has segments with 'EAR' and 'PAR', and one wants
#'   to select only 'EAR', then \code{state.select = c("EAR")} would only
#'   compare the 'EAR' scores in \code{states} to the \code{comparison} set.
#' @param comparison.select a list of numeric vectors that can be used to select
#'   a subset of \code{comparison} for each element in the list of \code{states}
#'
#' @return a list that contains the precision gain and recall gain, the convex
#'   hull, and the area under the precision recall gain curve (auprg).
#' @details This code incorprates Precision-Recall-Gain analysis from
#'   \url{https://github.com/meeliskull/prg/tree/master/R_package}. \cr From the
#'   abstract of the referenced paper: "Precision-Recall analysis abounds in
#'   applications of binary classification where true negatives do not add value
#'   and hence should not affect assessment of the classifier's performance.
#'   Perhaps inspired by the many advantages of receiver operating
#'   characteristic (ROC) curves and the area under such curves for
#'   accuracy-based performance assessment, many researchers have taken to
#'   report Precision-Recall (PR) curves and associated areas as performance
#'   metric. We demonstrate in this paper that this practice is fraught with
#'   difficulties, mainly because of incoherent scale assumptions -- e.g., the
#'   area under a PR curve takes the arithmetic mean of precision values whereas
#'   the F_beta score applies the harmonic mean. We show how to fix this by plotting
#'   PR curves in a different coordinate system, and demonstrate that the new
#'   Precision-Recall-Gain curves inherit all key advantages of ROC curves. In
#'   particular, the area under Precision-Recall-Gain curves conveys an expected
#'   F1 score on a harmonic scale, and the convex hull of a
#'   Precision-Recall-Gain curve allows us to calibrate the classifier's scores
#'   so as to determine, for each operating point on the convex hull, the
#'   interval of beta values for which the point optimises F_beta. We demonstrate
#'   experimentally that the area under traditional PR curves can easily favour
#'   models with lower expected F1 score than others, and so the use of
#'   Precision-Recall-Gain curves will result in better model selection."
#' @seealso \url{http://www.cs.bris.ac.uk/~flach/PRGcurves/}
#' @references Flach, P., & Kull, M. (2015). Precision-Recall-Gain Curves: PR
#'   Analysis Done Right. In C. Cortes, N. D. Lawrence, D. D. Lee, M. Sugiyama,
#'   & R. Garnett (Eds.), Advances in Neural Information Processing Systems 28
#'   (pp. 838-846). Curran Associates, Inc.
#'
#' @export
#'
#' @examples
#' load(system.file("extdata", "heart.states.rda", package = "StatePaintR"))
#' load(system.file("extdata", "vista.heart.enhancers.rda", package = "StatePaintR"))
#' PRG(states = heart.states,
#'    comparison = heart.enhancers,
#'    state.select = c("EAR", "EARC", "AR", "ARC"))
#'
PRG <- function(states, comparison, state.select = NULL, comparison.select = NULL) {
  if (inherits(states, "GRangesList") || inherits(states, "list")) {
    plot.data <- list()
    for (tissue in names(states)) {
      if (!is.null(comparison.select)) {
        select.comparison <- comparison[comparison.select[[tissue]],]
      } else {
        select.comparison <- comparison
      }
      state <- states[[tissue]]
      plot.tissue <- PRG_helper(state, state.select, tissue, select.comparison)
      plot.data <- c(plot.data, plot.tissue)
    }
    precision.recall.gain <- lapply(plot.data, PRG_prg)
    precision.recall.gain <- do.call("rbind", precision.recall.gain)
    convex.hull <- lapply(plot.data, PRG_convex_hull)
    convex.hull <- do.call("rbind", convex.hull)
    auprg <- sapply(plot.data, function(x) x$auprg)
  } else {
    stop("states object must be a list or GRangesList")
  }
  return(list(precision.recall.gain = precision.recall.gain,
              convex.hull = convex.hull,
              auprg = auprg))
}



#' PlotStates - plotting the results of PaintStates
#'
#' @param states A GRangesList produced by \code{\link{PaintStates}} or a list
#'   of GRanges, or GRangesList that has a "score" column, and optionally a
#'   "state" column, that is filtered on if the \code{state.select} argument is
#'   used
#' @param location A GRanges object of length 1, or a character vector of the
#'   form: "chr:start-end", e.g., "chr12:51267156-52080611"
#' @param gene.track optionally, an object created by BiomartGeneRegionTrack() to plot the gene context in addition to segmentation.
#' @param additional.tracks optionally, an object of class "AnnotationTrack" with additional data to plot in addition to segmentation.
#'
#' @return Plots the states, of the samples in the states object, and if possible the genes in the region
#' @export
#' @importFrom Gviz AnnotationTrack GenomeAxisTrack IdeogramTrack
#'   BiomartGeneRegionTrack plotTracks
#' @importFrom IRanges subsetByOverlaps
#' @importFrom GenomeInfoDb genome
#' @importFrom dplyr left_join
#' @importFrom GenomeInfoDb seqlevels
#' @importFrom stringr str_length
#' @examples
#' load(system.file("extdata", "heart.states.rda", package = "StatePaintR"))
#' load(system.file("extdata", "vista.heart.enhancers.rda", package = "StatePaintR"))
#' mylocation <- GRanges("chr1:42623508-42754885")
#' vista.in.range <- subsetByOverlaps(heart.enhancers, mylocation)
#' vista.anno <- AnnotationTrack(range = vista.in.range,
#'                               fill = mcols(vista.in.range)$validated + 2,
#'                               stacking = "dense",
#'                               col = NULL,
#'                               col.line = NULL,
#'                               stackHeight = 1,
#'                               shape = "box",
#'                               rotation.title = 360,
#'                               background.title = "transparent", col.title = "black",
#'                               cex.title = 0.5,
#'                               name = "VISTA enhancers")
#' PlotStates(states = heart.states,
#'            location = mylocation,
#'            additional.tracks = vista.anno)

PlotStates <- function(states, location = NULL, gene.track = NULL, additional.tracks = NULL) {
  if (is.null(location)) {
    stop("choose a genomic location to plot")
  }
  if (is.character(location)) {
    location <- GRanges(location)
  } else if (is(location, "GRanges")) {
    if (length(location) > 1) stop("location must only be a single genomic location")
    location <- location
  } else {
    stop("location must be either a character vector in the form 'chr:start-end' or a GRanges object of length 1")
  }
  dm <- attributes(states)$decisionMatrix
  my.len <- max(str_length(names(states)))/3
  atracks <- lapply(states, function(my.range, loc = location, col = stateColors(dm), len = my.len) {
    my.range <- subsetByOverlaps(my.range, location)
    my.col <- data.frame(state = mcols(my.range)$state)
    my.col <- suppressWarnings(left_join(my.col, col, by = c("state" = "STATE"))$COLOR)
    my.gen <- genome(my.range)[[1]]
    my.chr <- as.character(unique(seqnames(my.range)))
    my.name <- mcols(my.range)$name[[1]]
    atrack <- AnnotationTrack(range = my.range,
                              feature = as.factor(mcols(my.range)$state),
                              stacking = "dense",
                              fill = my.col,
                              col = NULL,
                              col.line = NULL,
                              stackHeight = 1,
                              shape = "box",
                              rotation.title = 360,
                              background.title = "transparent", col.title = "black",
                              cex.title = 0.5,
                              name = my.name)
  })
  my.genome <- genome(atracks[[1]])
  my.chr <- seqlevels(location)
  gtrack <- GenomeAxisTrack()
  itrack <- IdeogramTrack(genome = my.genome, chromosome = my.chr)
  if (is.null(gene.track)) {
    grtrack <- try(BiomartGeneRegionTrack(start = start(location), end = end(location),
                                          chromosome = seqlevels(location),
                                          genome = my.genome,
                                          col.line = NULL, col = NULL,
                                          stackHeight = 0.5,
                                          rotation.title = 360, col.title = "black", cex.title = 0.5,
                                          filters = list(biotype = c("protein_coding", "lincRNA")),
                                          transcriptAnnotation = "symbol",
                                          name = "ENSEMBL genes", stacking = "squish"))
    trycount <- 1
    while (inherits(grtrack, "try-error")) {
      grtrack <- try(BiomartGeneRegionTrack(start = start(location), end = end(location),
                                            chr = seqlevels(location),
                                            genome = my.genome,
                                            col.line = NULL, col = NULL,
                                            stackHeight = 0.5,
                                            genome = my.genome,
                                            rotation.title = 360, col.title = "black", cex.title = 0.5,
                                            filter = list(biotype = c("protein_coding", "lincRNA"),
                                                          transcriptAnnotation = "symbol",
                                                          name = "ENSEMBL genes", stacking = "squish")))
      if (trycount >= 3) {
        plotTracks(c(list(itrack, gtrack), c(atracks)), title.width = my.len)
        warning("could not find GeneRegionTrack on dynamically queried Biomart Ensembl data source\n",
                "a GeneRegionTrack can be passed manually to the gene.track parameter")
        return(invisible(c(list(itrack, gtrack), c(atracks))))
      } else {
        trycount <- trycount + 1
      }
    }
  } else if (inherits(gene.track, "GeneRegionTrack")) {
    if (genome(gene.track)[[1]] == my.genome) {
      grtrack <- gene.track
    } else {
      stop("gene.track genome: ", genome(gene.track)[[1]], " does not match states genome: ", my.genome)
    }
  } else {
    stop("gene.track is not a valid GeneRegionTrack")
  }
  if (!is.null(additional.tracks)) {
    if (inherits(additional.tracks, "AnnotationTrack")) {
      atracks <- c(additional.tracks, atracks)
    } else {
      stop("additional.tracks is not a valid AnnotationTrack object")
    }
  }
plotTracks(c(list(itrack, gtrack, grtrack), c(atracks)),
           collapseTranscripts = "meta",
           from = start(location),
           to = end(location),
           title.width = my.len)
return(invisible(c(list(itrack, gtrack, grtrack), c(atracks))))
}
Simon-Coetzee/StatePaintR documentation built on May 9, 2019, 1:31 p.m.