R/GTuples-comparison.R

Defines functions sort.GTuples .sort.GTuples .order.GTuples .GTuples.asIntegerTuples duplicated.GTuples .duplicated.GTuples .pcompare_GTuples

Documented in duplicated.GTuples sort.GTuples

### =========================================================================
### Comparing and ordering genomic tuples
### -------------------------------------------------------------------------
### I. UNIQUE AND DUPLICATED ELEMENTS WITHIN A GTuples OBJECT
### ---------------------------------------------------------------
### Two elements of a GTuples object (i.e. two genomic tuples) are
### considered equal iff they are on the same underlying sequence and strand,
### and have the same positions. duplicated() and unique() on a
### GTuples object are conforming to this.
###
### II. ORDERING THE ELEMENTS OF A GTuples OBJECT
### ---------------------------------------------------
### The "natural order" for the elements of a GTuples object is to order
### them (a) first by sequence level, (b) then by strand, (c) then by pos_{1}, 
### ..., pos_{m}. This way, the space of genomic tuples is totally ordered.
### order(), sort(), and rank() on a GTuples object are using this
### "natural order".
###
### III. ELEMENT-WISE (AKA "PARALLEL") COMPARISON OF 2 GTuples OBJECTS
### ------------------------------------------------------------------------
### We want the "==", "!=", "<=", ">=", "<" and ">" operators between 2
### GTuples objects to be compatible with the "natural order" defined
### previously. Defining those operators when the 2 objects have *identical*
### seqlevels() is straighforward but we can in fact extend this comparison
### to the following situation:
###   (A) 'e1' and 'e2' have compatible sets of underlying sequences, that is,
###       'seqinfo(e1)' and 'seqinfo(e2)' can be merged.
###   (B) 'seqlevels(e1)' and 'seqlevels(e2)' are in the same order. Note that
###       (A) guarantees that the seqlevels of one is a subset of the seqlevels 
###       of the other. (B) is saying that this subset should be a subsequence.
### Pre-comparison step: if (A) and (B) are satisfied, then the 2 seqinfo() are
### merged and the seqlevels() of the result is assigned back to each object
### to pcompare. This is a way to have 2 objects with identical seqlevels()
### before the comparison can actually be performed and meaningful.
### The reason (B) is required for the pre-comparison step is because we want
### this step to preserve the original order of the seqlevels() in *both*
### objects. Without this precaution, the expected anti-symetric property of
### some operators would not be satisfied e.g. 'any(e1 < e2 & e2 < e1)' could
### be TRUE.

### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### pcompare() and related methods.
###
### pcompare() is based on the method defined for GRanges objects but cannot 
### explicity inherit the method due to the internalPos slot in GTuples 
### objects. However, unlike the pcompare() method defined for GRanges, the 
### pcompare() method for GTuples requires that both x and y have the same 
### length. I also define the element wise (aka "parallel") operators '<=' and 
### '=='. The other element wise operators (`!=`, `>=`, `<`, `>`) work 
### out-of-the-box on GTuples objects via inheritance from GRanges -> Vector.

# .pcompare_GTuples is adapted from GenomicRanges:::.pcompare_GenomicRanges.
# On 2 GTuples objects, it returns one of the 3 codes: If the first tuple in 
# the pair is "<" than the second tuple then the return value for that element 
# is < 0, if the first tuple in the pair is "==" the second tuple then the 
# return value is 0, and if the first tuple is ">" that the second tuple then 
# the return value is > 0.
#' @importMethodsFrom GenomeInfoDb seqlevels "seqlevels<-" seqinfo
#' @importMethodsFrom GenomicRanges granges
#' @importMethodsFrom S4Vectors pcompare
.pcompare_GTuples <- function(x, y) {
  
  # Different to comparing GRanges, I only allow comparison if x, y have same
  # length.
  # shortened error message because a long error trigger line formatting
  # that breaks the testthat error parser.
  
  # This is where .pcompare_GTuples really differs from .pcompare_GenomicRanges
  # NOTE: moved this up because the next 'if' will fail on NA != NA
  if (is.na(size(x)) || is.na(size(y))) {
    stop("Cannot pcompare empty '", class(x), "'.")
  }
  
  # Check 'size' is identical
  if (size(x) != size(y)) {
    stop("Cannot pcompare '", class(x), "' objects of different 'size'.")
  }
  
  if (size(x) <= 2L) {
    # Use the GRanges comparison method 
    # Can't use callNextMethod() because it breaks "<=", "<", etc.
    #callNextMethod()
    pcompare(granges(x), granges(y))
  } else {
    # Otherwise, use method specifically written for m-tuples (m > 2)
    
    # Pre-comparison step (see above for details).
    # merge() will fail if 'x' and 'y' don't have compatible underlying
    # sequences.
    seqinfo <- merge(seqinfo(x), seqinfo(y))
    seqlevels <- seqlevels(seqinfo)
    if (any(diff(match(seqlevels(y), seqlevels)) < 0L)) {
      stop("the 2 objects to pcompare have seqlevels in incompatible orders")
    }
    # This should only insert new seqlevels in the existing ones i.e. it
    # should NEVER drop or reorder existing levels
    seqlevels(x) <- seqlevels(y) <- seqlevels
    
    if (size(x) == 1L) {
      val <- .pcompareGTuplesCpp(
        int_seqnames = as.integer(seqnames(x)) - as.integer(seqnames(y)), 
        int_strand = as.integer(strand(x)) - as.integer(strand(y)), 
        int_pos = as.matrix(start(x) - start(y))
      )
    } else if (size(x) > 1L) {
      # If lengths are equal then no need to recycle, which is faster.
      if (isTRUE(length(x) == length(y))) {
        val <- .pcompareGTuplesCpp(
          int_seqnames = as.integer(seqnames(x)) - as.integer(seqnames(y)),
          int_strand = as.integer(strand(x)) - as.integer(strand(y)),
          int_pos = cbind(start(x) - start(y), x@internalPos - y@internalPos, 
                          end(x) - end(y))
        )
      } else {
        # Lengths are not equal so must recycle, which is slower.
        int_internal_pos <- .matrixDiffWithRecycling(x@internalPos, 
                                                     y@internalPos)
        val <- .pcompareGTuplesCpp(
          int_seqnames = as.integer(seqnames(x)) - as.integer(seqnames(y)), 
          int_strand = as.integer(strand(x)) - as.integer(strand(y)),
          int_pos = cbind(start(x) - start(y), int_internal_pos, 
                          end(x) - end(y))
        )
      }
    }
    val
  }
}

#' @importMethodsFrom S4Vectors pcompare
#' 
#' @export
setMethod("pcompare", 
          c("GTuples", "GTuples"), 
          function(x, y) {
            .pcompare_GTuples(x, y)
          }
)

### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### duplicated()  
###
### Can't rely on using duplicated() via inheritance from Vector because 
### GTuples inherits from GRanges, rather than from `Vector` directly.
### Furthermore, simply deferring to the duplicated() method for GRanges would 
### only check pos1, posm of of each tuple, which is okay provided size < 3 but 
### not if size >= 3.
###
### unique() will work out-of-the-box on an GTuples object thanks to the 
### method for GRanges objects, which inherits from Vector objects.

# NOTE: Don't explicitly import duplicated.data.table but rely on the method 
#       being available since the S3 generic is available via base.
.duplicated.GTuples <- function(x, incomparables = FALSE, fromLast = FALSE) {
  if (!identical(incomparables, FALSE)) 
    stop("\"duplicated\" method for '", class(x), "' objects only accepts ", 
         "'incomparables = FALSE'")
  duplicated(.GT2DT(x), incomparables = incomparables, fromLast = fromLast)
}

# S3/S4 combo for duplicated.GTuples

# NOTE: This should really just use the @export tag. However, because 
#       BiocGenerics redefines duplicated as an S4 generic, roxygen2 doesn't 
#       see this as an S3 method but rather as a function, and so adds 
#       export(duplicated.GTuples) to the NAMESPACE rather than 
#       S3method(duplicated, GTuples).
# NOTE: Both export and define duplicated.GTuples as an S3 method so that (a) 
#       it can be called directly, (b) tab-completion on the name of the generic 
#       shows it, and (c) methods() doesn't asterisk it (based on advice 
#       from GenomicRanges' NAMESPACE).
#' @export
#' @rawNamespace S3method(duplicated, GTuples)
duplicated.GTuples <- function(x, incomparables = FALSE, ...) {
  .duplicated.GTuples(x, incomparables = incomparables, ...)
}

#' @export
setMethod("duplicated", 
          "GTuples", 
          .duplicated.GTuples
)

### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### match()
###
### %in%, findMatches(), countMatches() will work out-of-the-box on GTuples 
### objects thanks to the method for Vector objects.

# Effectively just calls findOverlaps with type = equal.
#' @importFrom S4Vectors isSingleNumberOrNA isTRUEorFALSE selectHits
#' @importMethodsFrom GenomeInfoDb seqinfo
#' @importMethodsFrom IRanges findOverlaps
#' @importMethodsFrom S4Vectors from to
#' @export
setMethod("match", 
          c("GTuples", "GTuples"), 
          function(x, table, nomatch = NA_integer_, incomparables = NULL, 
                   ignore.strand = FALSE) {
            
            if (!isSingleNumberOrNA(nomatch)) {
              stop("'nomatch' must be a single number or NA")
            }
            if (!is.integer(nomatch)) {
              nomatch <- as.integer(nomatch)
            }
            if (!is.null(incomparables)) {
              stop("\"match\" method for GenomicRanges objects only accepts ",
                   "'incomparables = NULL'")
            }
            if (!isTRUEorFALSE(ignore.strand)) {
              stop("'ignore.strand' must be TRUE or FALSE")
            }
            ## Calling merge() is the way to check that 'x' and 'table' are 
            ## based on the same reference genome.
            ## That is, it's called for its side-effect (generally, not a good 
            ## programming strategy)
            merge(seqinfo(x), seqinfo(table))
            
            ol <- findOverlaps(query = x,
                               subject = table,
                               type = "equal", 
                               ignore.strand = ignore.strand)
            if (!ignore.strand) {
              # Only keep those matches where the strands are identical
              x_strand <- strand(x)[from(ol)]
              table_strand <- strand(table)[to(ol)]
              ol <- ol[x_strand == table_strand]
            }
            val <- selectHits(ol, "first")
            val[is.na(val)] <- nomatch
            val
          }
)

# NOTE: Need to explicitly import %in%.
#' @importMethodsFrom S4Vectors %in%

### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### selfmatch()
###

# Can't defer to selfMatch,GenomicRanges-method introduced in 
# https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/GenomicRanges@116115 
# so include this kludge to restore previous behaviour
#' @importFrom S4Vectors selfmatch
#' @export
setMethod("selfmatch", 
          "GTuples",
          function(x, ignore.strand = FALSE, ...) {
            if (!isTRUEorFALSE(ignore.strand)) {
              stop("'ignore.strand' must be TRUE or FALSE")
            }
            match(x, x, ignore.strand = ignore.strand, ...)
          }
)

### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### order() and related methods.
###
### The order() and rank() methods for GTuples objects are consistent with the 
### order implied by pcompare().
### is.unsorted() is a quick/cheap way of checking whether a GTuples
### object is already sorted, e.g., called prior to a costly sort.
### sort is defined via inheritance to GRanges

#' @importFrom S4Vectors decode
.GTuples.asIntegerTuples <- function(x, ignore.strand = FALSE) {
  a <- decode(seqnames(x))
  if (ignore.strand) {
    b <- integer(length(x))
  } else {
    b <- decode(strand(x))
  }
  c <- start(x)
  if (size(x) > 2L) {
    ipd <- IPD(x)
    # NOTE: This is somewhat inefficient since it constructs the matrix and then
    #       splits it into a list of vectors. 
    # TODO (longterm): Do this without the aforementioned overhead.
    rest <- split(ipd, rep(seq_len(ncol(ipd)), each = nrow(ipd)))
  } else {
    rest <- list()
  }
  c(list(a, b, c), rest)
}

#' @importFrom data.table := .GRP 
#' @importFrom S4Vectors isTRUEorFALSE decode
#' 
#' @export
setMethod("is.unsorted", "GTuples", 
          function(x, na.rm = FALSE, strictly = FALSE, ignore.strand = FALSE) {
            
            if (!identical(na.rm, FALSE)) {
              warning("\"is.unsorted\" method for '", class(x), "' objects ",
                      "ignores the 'na.rm' argument")
            }
            if (!isTRUEorFALSE(strictly)) {
              stop("'strictly' must be TRUE of FALSE")
            }
            if (is.na(size(x))) {
              return(FALSE)
            }
            
            if (size(x) < 3L) {
              callNextMethod()
            } else {
              # NOTE: It is tempting to use
              #         if (ignore.strand) {
              #           x <- unstrand(x)
              #         }
              #         is.unsorted(order(x), strictly = strictly)
              #       However, this is incorrect when there are repeated tuples 
              #       because order assigns these distinct values
              #       This implementation is clunky but has correct behaviour. 
              #       The idea is to check that each 2-tuple within the m-tuple 
              #       is unsorted and return TRUE as soon as this is identified.
              # NOTE: a and b are the same for all 2-tuples whereas c and d are 
              #       updated as we move through the 2-tuples
              a <- decode(seqnames(x))
              if (ignore.strand) {
                b <- integer(length(x))
              } else {
                b <- decode(strand(x))
              }
              c <- start(x)
              d <- x@internalPos[, 1L]
              unsorted <- !S4Vectors:::sortedIntegerQuads(a, b, c, d, 
                                                          strictly = strictly)
              if (unsorted) {
                return(TRUE)
              }
              if (size(x) > 3) {
                for (j in seq(2L, size(x) - 2L, 1L)) {
                  c <- x@internalPos[, j - 1]
                  d <- x@internalPos[, j]
                }
                unsorted <- !S4Vectors:::sortedIntegerQuads(a, b, c, d, 
                                                            strictly = strictly)
                if (unsorted) {
                  return(TRUE)
                }
              }
              c <- x@internalPos[, size(x) - 2L]
              d <- end(x)
              !S4Vectors:::sortedIntegerQuads(a, b, c, d, strictly = strictly)
            }
          }
)


#' @importFrom data.table := .I setorderv
#' @importFrom S4Vectors isTRUEorFALSE
.order.GTuples <- function(x, decreasing = FALSE, ignore.strand = FALSE) {
  if (!isTRUEorFALSE(decreasing)) {
    stop("'decreasing' must be TRUE or FALSE")
  }
  key <- c("seqnames", "strand", paste0("pos", seq_len(size(x))))
  dt <- .GT2DT(x, ignore.strand)[, idx := .I]
  if (decreasing) {
    order <- -1L
  } else {
    order <- 1L
  }
  setorderv(dt, key, order = order)
  dt[, idx]
  # TODO: Use this slightly faster version once data.table:::forderv() is 
  #       exported.
  # data.table:::forderv(.GT2DT(x, ignore.strand), 
  #                      by = c("seqnames", "strand", paste0("pos", seq_len(size(x)))), 
  #                      sort = TRUE, 
  #                      retGrp = FALSE, 
  #                      order = order), 
  #                      na.last = na.last)
}
#' @importFrom utils globalVariables
globalVariables("idx")

# TODO: Support the 'ignore.strand' argument once order,GenomicRanges-method 
#       does.
#' @importFrom S4Vectors isTRUEorFALSE
#' @importMethodsFrom GenomeInfoDb seqnames
#' 
#' @export
setMethod("order", 
          "GTuples", 
          function(..., na.last = TRUE, decreasing = FALSE, 
                   method = c("auto", "shell", "radix")) {
            
            if (!isTRUEorFALSE(decreasing)) {
              stop("'decreasing' must be TRUE or FALSE")
            }
            
            args <- list(...)
            
            sizes <- vapply(args, size, integer(1L))
            
            if (all(is.na(sizes))) {
              return(integer(0L))
            }
            
            if (!.zero_range(sizes)) {
              stop("All '", class(args[[1]]), "' objects must have the same ", 
                   "'size'.")
            }
            
            if (length(args) == 1L) {
              return(.order.GTuples(args[[1L]], decreasing))
            } else {
              # TODO: Pass ignore.strand to .GTuples.asIntegerTuples once 
              #       order() supports this argument.
              order_args <- c(unlist(lapply(args, .GTuples.asIntegerTuples),
                                     recursive = FALSE, use.names = FALSE),
                              list(na.last = na.last, 
                                   decreasing = decreasing))
              do.call(order, order_args)
            }
          }
)

# NOTE: Need to (temporarily?) redefine sort,GTuples-method (see 
#       https://github.com/PeteHaitch/GenomicTuples/issues/31)
#' @importMethodsFrom S4Vectors extractROWS
.sort.GTuples <- function(x, decreasing = FALSE, ignore.strand = FALSE, by) {
  if (is.na(size(x))) {
    return(x)
  }
  if (missing(by)) {
    oo <- .order.GTuples(x, decreasing, ignore.strand)
  } else {
    if (!identical(ignore.strand, FALSE)) {
      warning("'ignore.strand' ignored when 'by' is specified")
    }
    oo <- S4Vectors:::orderBy(by, x, decreasing = decreasing)
  }
  extractROWS(x, oo)
}
### S3/S4 combo for sort.GenomicRanges
# NOTE: Both export and define sort.GTuples as an S3 method so that (a) it can 
#       be called directly, (b) tab-completion on the name of the generic 
#       shows it, and (c) methods() doesn't asterisk them (based on advice 
#       from GenomicRanges' NAMESPACE).
#' @rawNamespace S3method(sort, GTuples)
#' @export
sort.GTuples <- function(x, decreasing = FALSE, ...)
  .sort.GTuples(x, decreasing = decreasing, ...)

#' @export
setMethod("sort", "GTuples", .sort.GTuples)
PeteHaitch/GenomicTuples documentation built on Nov. 5, 2024, 2 p.m.