R/AllUtilities.R

### =========================================================================
### Helper functions not exported.
### For emphasis, __no function in this file will be exported__, because 
### `@keywords internal` applies to the whole file 
### (https://github.com/ramhiser/sparsediscrim/issues/26).
### This means that these functions will not be documented by roxygen2, even
### though the functions have roxygen2 tags.
### =========================================================================

#' Check whether all elements of a numeric vector are identical (within machine precision)
#' @param x a numeric vector.
# 
#' @return TRUE if all elements of the vector are identical (within machine 
#' precision). FALSE in all other cases, including if the vector contains any 
#' NAs.
#' 
#' @export
#' @keywords internal
#' 
#' @note This function is based on Hadley and John's answer to 
#' http://stackoverflow.com/q/4752275
.zero_range <- function(x, tol = .Machine$double.eps ^ 0.5) {
  if (length(x) == 1) {
    val <- TRUE
  } 
  if (any(is.na(x)) & any(is.na(x))){
    val <- FALSE
  } else{
    val <- (abs(max(x) - min(x)) < tol)
  }
  
  return(val)
}

## TODO: Check documentation and re-write as necessary.
#' An internal function used when constructing/combining CoMeth objects
#' 
#' Combine list-wise data into matrix-like data whilst taking care of common and sample-specific m-tuples, i.e. filling in NA for 'counts' when a sample doesn't have any observations for that m-tuple. 
#' 
#' @param m An \code{integer} storing the size of the m-tuples, i.e. the \code{m} in m-tuple. Only a single value is accepted, and not a list, because \code{m} must be the same for all samples in a \code{CoMeth} object.
#' @param sample_names A \code{character} vector containing the names of the samples. Sample names must be unique.
#' @param pos A \code{\link[IRanges]{DataFrameList}} \code{pos} must be named and the names must match those given in \code{sample_names}. The columns of each DataFrame must be: \code{seqnames}, \code{pos1}, ..., \code{posm}, where, for example, \code{posm} is \code{pos3} if \code{m} = 3. \code{seqnames} stored the sequence/chromosome name of the m-tuples. Therefore, the number of columns of each DataFrame is \code{m} + 1 and the number of rows is equal to the number of m-tuples for that particular sample.
#' @param counts A \code{\link[IRanges]{DataFrameList}}. \code{counts} must be named and the names must match those given in \code{sample_names}. The entry in each DataFrame corresponds to the number of times that particular co-methylation pattern (columns) was observed for that particular m-tuple (rows). Therefore, each DataFrame must have the same number of rows as its corresponding DataFrame in \code{pos} and have \eqn{2 ^ \code{m}} columns. The column names of each DataFrame must match that given by \code{make_m_tuple_names(m)}. 
#' @param strand An (optional)\code{\link[IRanges]{RleList}} object containing the strand information of each m-tuple. \strong{WARNING}: If \code{strand} is missing, all m-tuples in the resulting \code{\link{CoMeth}} object will have their strand set to \code{*} to signify that the strand is unknown or irrelevant (such as when methylation measurements have been combined across strands). \strong{WARNING}: m-tuples will not be combined across samples if they are on different strands.
#' 
#' @keywords internal
#' 
#' @return A list containing the combined seqnames, pos, counts and strand
.combine <- function(sample_names, seqnames, pos, strand, counts){
    
  ## Combine the data
  if (length(sample_names) > 1L){
    combined_coordinates <- DataFrame(seqnames = seqnames[[1L]], pos[[1L]], strand = strand[[1L]])
    combined_counts <- lapply(counts[[sample_names[[1L]]]], as.matrix)
    
    for (i in seq(from = 2L, to = length(sample_names), by = 1L)){
      ## Find any identical m-tuples between the current "combined" data and the "new sample"
      pair_coordinates <- rbind(combined_coordinates, DataFrame(seqnames = seqnames[[sample_names[[i]]]], pos[[sample_names[[i]]]], strand = strand[[sample_names[[i]]]]))
      in_both_idx1 <- duplicated(pair_coordinates) # Indexes *[[sample_names[[i]]]] (need to offset by NROW(combined_*))
      in_both_idx2 <- duplicated(pair_coordinates, fromLast = TRUE)
      in_both_combined_idx <- which(in_both_idx2[seq_len(nrow(combined_coordinates))])
      in_both_new_sample_idx <- which(in_both_idx1[seq(from = nrow(combined_coordinates) + 1L, to = length(in_both_idx1), by = 1L)])
      in_just_combined_idx <- which(!in_both_idx2[seq_len(nrow(combined_coordinates))])
      in_just_new_sample_idx <- which(!in_both_idx1[seq(from = nrow(combined_coordinates) + 1L, to = length(in_both_idx1), by = 1L)])
      
      ## Combine the "combined" data and the "new sample" data to make the "new combined" data
      ## Always combine things in this order: (1) In both, (2) In just combined_*, (3) In just *[[sample_names[[i]]]]
      combined_coordinates <- rbind(combined_coordinates[in_both_combined_idx, ], combined_coordinates[in_just_combined_idx, ], DataFrame(seqnames = seqnames[[sample_names[[i]]]], pos[[sample_names[[i]]]], strand = strand[[sample_names[[i]]]])[in_just_new_sample_idx, ])
      combined_counts <- mapply(FUN = function(combined_counts, this_sample_counts, in_both_combined_idx, in_both_new_sample_idx, in_just_combined_idx, in_just_new_sample_idx){
      val <- rbind(cbind(combined_counts[in_both_combined_idx, , drop = FALSE], this_sample_counts[in_both_new_sample_idx]),
                   cbind(combined_counts[in_just_combined_idx, , drop = FALSE], matrix(NA_integer_, nrow = length(in_just_combined_idx), ncol = 1L)), 
                   cbind(matrix(NA_integer_, nrow = length(in_just_new_sample_idx), ncol = ncol(combined_counts)), this_sample_counts[in_just_new_sample_idx, , drop = FALSE]))
      return(val)
      }, combined_counts = combined_counts, this_sample_counts = lapply(counts[[sample_names[[i]]]], as.matrix), MoreArgs = list(in_both_combined_idx = in_both_combined_idx, in_both_new_sample_idx = in_both_new_sample_idx, in_just_combined_idx = in_just_combined_idx, in_just_new_sample_idx = in_just_new_sample_idx), SIMPLIFY = FALSE)
    }
    
    seqnames <- combined_coordinates[, 1L]
    pos <- as.matrix(combined_coordinates[, -c(1L, ncol(combined_coordinates))])
    strand <- combined_coordinates[, ncol(combined_coordinates)]
    counts <- combined_counts # list elements are already matrices  
    # TODO: Add names to counts?
  } else if (length(sample_names) == 1L){
    seqnames <- seqnames[[sample_names[[1L]]]]
    pos <- as.matrix(pos[[sample_names[[1L]]]])
    strand <- strand[[sample_names[[1L]]]]
    counts <- lapply(counts[[sample_names[[1L]]]], function(x){
      dim(x) <- c(length(x), 1L)
      return(x)
      }) # counts will go in the assays slot of a SummarizedExperiment-type object. Therefore counts must be coerced to matrices - this hack is a fast way to turn a vector into a column matrix.
      # TODO: Add names to counts?
  } else{
    seqnames <- Rle()
    pos <- matrix()
    strand <- Rle()
    counts <- list()
  }
  
  ## Return list of objects
  return(list(seqnames = seqnames, pos = pos, strand = strand, counts = counts))
}

## TODO: Write documentation
## NOTE: This function should only be called if m >= 3.
#' @export
#' 
#' @keywords internal
.findIdentical.MTuples <- function(query, subject, select){
  ## Idea1: Create seqnames:strand:pos vectors for both query and subject and then find all matches  
  ## This is ~6x slower than Idea 2 for a query with 1300 elements and a subject with 100000 elements. 
#   q_pos <- getPos(query)
#   q_k <- paste(seqnames(query), strand(query), do.call("paste", c(split(q_pos, c(col(q_pos))), list(sep = ":"))), sep = ":")
#   s_pos <- getPos(subject)
#   s_k <- paste(seqnames(subject), strand(subject), do.call("paste", c(split(s_pos, c(col(s_pos))), list(sep = ":"))), sep = ":")
#   ## Now, Find all matches and return a Hits object. This seems harder than it should be. It's easy to find the *first* match (using match(q_k, s_k)) but hard to find *all* matches.
#   tmp <- lapply(q_k, function(q, s_k){
#     which(s_k %in% q)
#   }, s_k = s_k)
  
  ## Idea 2: Create (m - 1) x 2-tuples (pairs) from the m-tuples as GRanges and run findOverlaps parameters chosen to select only exact matches.
  ## Then, intersect the Hits objects from each pair to create the final Hits object.
  m <- getM(query) # Assumes m is identical for query and subject
  
  ## Create an index variable
  ii <- seq_len(m - 1L)
  
  ## Create all pairs and run findOverlaps
  pair_hits <- lapply(ii, function(i, q_seqnames, s_seqnames, q_strand, s_strand, q_pos, s_pos){
    findOverlaps(query = GRanges(seqnames = q_seqnames, ranges = IRanges(start = q_pos[, i], end = q_pos[, i + 1]), strand = q_strand), subject = GRanges(seqnames = s_seqnames, ranges = IRanges(start = s_pos[, i], end = s_pos[, i + 1]), strand = s_strand), maxgap = 0L, minoverlap = 1L, type = "equal", select = select)
  }, q_seqnames = seqnames(query), s_seqnames = seqnames(subject), q_strand = strand(query), s_strand = strand(subject), q_pos = getPos(query), s_pos = getPos(subject))
  
  ## Intersect the Hits objects
  tuples_hits <- Reduce(intersect, pair_hits)
  
  return(tuples_hits)
}

#' Make m-tuple names
#'
#' This helper function constructs m-tuple names in the correct (alphabetical) order for a given value of m
#' @param m The size of the m-tuple. Must be an int.
#' 
#' @export
#' 
#' @keywords internal
#' 
#' @examples
#' .make_m_tuple_names(1L)
#' .make_m_tuple_names(2L)
#' .make_m_tuple_names(3L)
#' @return A character vector
.make_m_tuple_names <- function(m){
  if (!is.integer(m) || m < 1){
    stop("'m' must be an int. 'm' must be greater than 0.")
  }
  sort(do.call(paste0, expand.grid(lapply(seq_len(m), function(x){c('M', 'U')}))))
}

## TODO: Document
## TOOD: Test
#' @export
#' @keywords internal
.EP <- function(x){
  p <- lapply(X = x, FUN = function(xx, y){
    xx / y
  }, y = Reduce(x = x, '+'))
  val <- 1 - Reduce(x = lapply(p, function(pp){pp^2}), f = '+')
  
  return(val)
}

## TODO: Document
#' @export
#' @keywords internal
.beta <- function(x){
  val <- x[['M']] / (x[['M']] + x[['U']])
  
  return(val)
}

## TODO: Document
## TODO: Tests
#' Compute a log odds ratio
#' 
#' Uses base-2 logarithms.
#' The default offset, which is used to avoid zero-counts, is 0.5.
.LOR <- function(x, offset = 0.5){
  # NOTE: Without the outer brackets the newline character is parsed and only 
  # the numerator is evaluated when computing LOR(!)
  lor <- (log2(x[['MM']] + offset) + log2(x[['UU']] + offset)
          - log2(x[['MU']] + offset) - log2(x[['UM']] + offset))
  return(lor)
}

## TODO: Document
## TODO: Tests
#' Compute the average methylation level (zeta) of an m-tuple
#' 
#' Definition based on Landan et al. (2012). zeta differs from mean(beta) 
#' becuase it only uses reads containing all methylation loci in the m-tuple.
.zeta <- function(x, m){
  ## Count how many methylated bases in the m-tuple
  ## From http://stackoverflow.com/a/12427831
  nm <- sapply(regmatches(names(x), gregexpr("M", names(x))), length)
  
  numerator <- Reduce(f = '+', mapply(FUN = function(nm, x){nm * x}, nm = nm, 
                                      x = x, SIMPLIFY = FALSE)) 
  denominator <- m * Reduce(f = '+', x)
  val <- numerator / denominator
  
  return(val)
}

## TODO: Tests
#' Return the valid methylation types
#' 
#' @param methylation_type A character.
#' @export
#' @keywords internal
#' @return Returns \code{TRUE} if a valid methylation type, otherwise 
#' \code{FALSE}.
.valid_methylation_type <- function(methylation_type) {
  valid_methylation_types <- c('CG', 'CHG', 'CHH', 'CNN', 'CG/CHG', 'CG/CHH', 
                               'CG/CNN', 'CHG/CHH',  'CHG/CNN', 'CHH/CNN', 
                               'CG/CHG/CHH', 'CG/CHG/CNN', 'CHG/CHH/CNN', 
                               'CG/CHG/CHH/CNN')
  val <- methylation_type %in% valid_methylation_types
  
  return(val)
}

## TODO: This currently breaks when strand == '-'.
## See email to Bioc-Devel https://stat.ethz.ch/pipermail/bioc-devel/2014-May/005820.html
#' A helper function used by \code{\link{makeMLS}}.
#'
#' \code{\link{makeMLS}} uses \code{\link[BSgenome]{bsapply}} to apply 
#' \code{\link[Biostrings]{matchPDict}} to all chromosomes of a 
#' \code{\link[BSgenome]{bsgenome}} object. This returns a list of 
#' \code{\link[IRanges]{IRanges}} that must then be converted to 
#' \code{\link[GenomicRanges]{GRanges}}. This helper function does that.
#' 
#' @param irl A list of \code{\link[IRanges]{IRanges}} objects.
#' @param strand A character vector of length 1. The strand of all methylation 
#' loci in \code{irl}.
#' @param seqinfo The \code{\link[GenomicRanges]{Seqinfo}} of all methylation 
#' loci in \code{irl}.
#' 
#' @param Note, \code{.irl2gr} differs from calling 
#' \code{as(IRangesList(irl), "GRanges")}, where \code{irl} is a list of 
#' \code{\link[IRanges]{IRanges}} objects. Specifically, it requires the user 
#' to specify the strand rather than simply setting it to \code{*}, it makes 
#' all ranges of width = 1, with the start being the cytosine in the relevant 
#' strand, and it requires the user to specify the 
#' \code{\link[GenomicRanges]{Seqinfo}}.
#' @export
#' 
.irl2gr <- function(irl, strand, seqinfo) {
  
  if (!is(seqinfo, "Seqinfo")){
    stop(sQuote('seqinfo'), " must be a ", sQuote("Seqinfo"), " object.")
  }
  
  # NOTE: Need to unname irl otherwise do.call returns a list (which is not 
  # what I expected nor wanted). 
  if (strand == '+') {
    GRanges(seqnames = Rle(names(irl), sapply(irl, length)), 
            ranges = resize(do.call("c", unname(unlist(irl))), width = 1, 
                            fix = 'start'), 
            strand = strand,
            seqinfo = seqinfo)
  } else if (strand == '-') {
    GRanges(seqnames = Rle(names(irl), sapply(irl, length)), 
            ranges = resize(do.call("c", unname(unlist(irl))), width = 1, 
                            fix = 'end'),
            strand = strand,
            seqinfo = seqinfo)
  } else {
    stop("Unexpected ", sQuote('strand'))
  }
}
PeteHaitch/cometh documentation built on May 8, 2019, 1:32 a.m.