Nothing
### =========================================================================
### Comparing and ordering genomic tuples
### -------------------------------------------------------------------------
### I. UNIQUE AND DUPLICATED ELEMENTS WITHIN A GTuples OBJECT
### ---------------------------------------------------------------
### Two elements of a GTuples object (i.e. two genomic tuples) are
### considered equal iff they are on the same underlying sequence and strand,
### and have the same positions. duplicated() and unique() on a
### GTuples object are conforming to this.
###
### II. ORDERING THE ELEMENTS OF A GTuples OBJECT
### ---------------------------------------------------
### The "natural order" for the elements of a GTuples object is to order
### them (a) first by sequence level, (b) then by strand, (c) then by pos_{1},
### ..., pos_{m}. This way, the space of genomic tuples is totally ordered.
### order(), sort(), and rank() on a GTuples object are using this
### "natural order".
###
### III. ELEMENT-WISE (AKA "PARALLEL") COMPARISON OF 2 GTuples OBJECTS
### ------------------------------------------------------------------------
### We want the "==", "!=", "<=", ">=", "<" and ">" operators between 2
### GTuples objects to be compatible with the "natural order" defined
### previously. Defining those operators when the 2 objects have *identical*
### seqlevels() is straighforward but we can in fact extend this comparison
### to the following situation:
### (A) 'e1' and 'e2' have compatible sets of underlying sequences, that is,
### 'seqinfo(e1)' and 'seqinfo(e2)' can be merged.
### (B) 'seqlevels(e1)' and 'seqlevels(e2)' are in the same order. Note that
### (A) guarantees that the seqlevels of one is a subset of the seqlevels
### of the other. (B) is saying that this subset should be a subsequence.
### Pre-comparison step: if (A) and (B) are satisfied, then the 2 seqinfo() are
### merged and the seqlevels() of the result is assigned back to each object
### to pcompare. This is a way to have 2 objects with identical seqlevels()
### before the comparison can actually be performed and meaningful.
### The reason (B) is required for the pre-comparison step is because we want
### this step to preserve the original order of the seqlevels() in *both*
### objects. Without this precaution, the expected anti-symetric property of
### some operators would not be satisfied e.g. 'any(e1 < e2 & e2 < e1)' could
### be TRUE.
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### pcompare() and related methods.
###
### pcompare() is based on the method defined for GRanges objects but cannot
### explicity inherit the method due to the internalPos slot in GTuples
### objects. However, unlike the pcompare() method defined for GRanges, the
### pcompare() method for GTuples requires that both x and y have the same
### length. I also define the element wise (aka "parallel") operators '<=' and
### '=='. The other element wise operators (`!=`, `>=`, `<`, `>`) work
### out-of-the-box on GTuples objects via inheritance from GRanges -> Vector.
# .pcompare_GTuples is adapted from GenomicRanges:::.pcompare_GenomicRanges.
# On 2 GTuples objects, it returns one of the 3 codes: If the first tuple in
# the pair is "<" than the second tuple then the return value for that element
# is < 0, if the first tuple in the pair is "==" the second tuple then the
# return value is 0, and if the first tuple is ">" that the second tuple then
# the return value is > 0.
#' @importMethodsFrom GenomeInfoDb seqlevels "seqlevels<-" seqinfo
#' @importMethodsFrom GenomicRanges granges
#' @importMethodsFrom S4Vectors pcompare
.pcompare_GTuples <- function(x, y) {
# Different to comparing GRanges, I only allow comparison if x, y have same
# length.
# shortened error message because a long error trigger line formatting
# that breaks the testthat error parser.
# This is where .pcompare_GTuples really differs from .pcompare_GenomicRanges
# NOTE: moved this up because the next 'if' will fail on NA != NA
if (is.na(size(x)) || is.na(size(y))) {
stop("Cannot pcompare empty '", class(x), "'.")
}
# Check 'size' is identical
if (size(x) != size(y)) {
stop("Cannot pcompare '", class(x), "' objects of different 'size'.")
}
if (size(x) <= 2L) {
# Use the GRanges comparison method
# Can't use callNextMethod() because it breaks "<=", "<", etc.
#callNextMethod()
pcompare(granges(x), granges(y))
} else {
# Otherwise, use method specifically written for m-tuples (m > 2)
# Pre-comparison step (see above for details).
# merge() will fail if 'x' and 'y' don't have compatible underlying
# sequences.
seqinfo <- merge(seqinfo(x), seqinfo(y))
seqlevels <- seqlevels(seqinfo)
if (any(diff(match(seqlevels(y), seqlevels)) < 0L)) {
stop("the 2 objects to pcompare have seqlevels in incompatible orders")
}
# This should only insert new seqlevels in the existing ones i.e. it
# should NEVER drop or reorder existing levels
seqlevels(x) <- seqlevels(y) <- seqlevels
if (size(x) == 1L) {
val <- .pcompareGTuplesCpp(
int_seqnames = as.integer(seqnames(x)) - as.integer(seqnames(y)),
int_strand = as.integer(strand(x)) - as.integer(strand(y)),
int_pos = as.matrix(start(x) - start(y))
)
} else if (size(x) > 1L) {
# If lengths are equal then no need to recycle, which is faster.
if (isTRUE(length(x) == length(y))) {
val <- .pcompareGTuplesCpp(
int_seqnames = as.integer(seqnames(x)) - as.integer(seqnames(y)),
int_strand = as.integer(strand(x)) - as.integer(strand(y)),
int_pos = cbind(start(x) - start(y), x@internalPos - y@internalPos,
end(x) - end(y))
)
} else {
# Lengths are not equal so must recycle, which is slower.
int_internal_pos <- .matrixDiffWithRecycling(x@internalPos,
y@internalPos)
val <- .pcompareGTuplesCpp(
int_seqnames = as.integer(seqnames(x)) - as.integer(seqnames(y)),
int_strand = as.integer(strand(x)) - as.integer(strand(y)),
int_pos = cbind(start(x) - start(y), int_internal_pos,
end(x) - end(y))
)
}
}
val
}
}
#' @importMethodsFrom S4Vectors pcompare
#'
#' @export
setMethod("pcompare",
c("GTuples", "GTuples"),
function(x, y) {
.pcompare_GTuples(x, y)
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### duplicated()
###
### Can't rely on using duplicated() via inheritance from Vector because
### GTuples inherits from GRanges, rather than from `Vector` directly.
### Furthermore, simply deferring to the duplicated() method for GRanges would
### only check pos1, posm of of each tuple, which is okay provided size < 3 but
### not if size >= 3.
###
### unique() will work out-of-the-box on an GTuples object thanks to the
### method for GRanges objects, which inherits from Vector objects.
# NOTE: Don't explicitly import duplicated.data.table but rely on the method
# being available since the S3 generic is available via base.
.duplicated.GTuples <- function(x, incomparables = FALSE, fromLast = FALSE) {
if (!identical(incomparables, FALSE))
stop("\"duplicated\" method for '", class(x), "' objects only accepts ",
"'incomparables = FALSE'")
duplicated(.GT2DT(x), incomparables = incomparables, fromLast = fromLast)
}
# S3/S4 combo for duplicated.GTuples
# NOTE: This should really just use the @export tag. However, because
# BiocGenerics redefines duplicated as an S4 generic, roxygen2 doesn't
# see this as an S3 method but rather as a function, and so adds
# export(duplicated.GTuples) to the NAMESPACE rather than
# S3method(duplicated, GTuples).
# NOTE: Both export and define duplicated.GTuples as an S3 method so that (a)
# it can be called directly, (b) tab-completion on the name of the generic
# shows it, and (c) methods() doesn't asterisk it (based on advice
# from GenomicRanges' NAMESPACE).
#' @export
#' @rawNamespace S3method(duplicated, GTuples)
duplicated.GTuples <- function(x, incomparables = FALSE, ...) {
.duplicated.GTuples(x, incomparables = incomparables, ...)
}
#' @export
setMethod("duplicated",
"GTuples",
.duplicated.GTuples
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### match()
###
### %in%, findMatches(), countMatches() will work out-of-the-box on GTuples
### objects thanks to the method for Vector objects.
# Effectively just calls findOverlaps with type = equal.
#' @importFrom S4Vectors isSingleNumberOrNA isTRUEorFALSE selectHits
#' @importMethodsFrom GenomeInfoDb seqinfo
#' @importMethodsFrom IRanges findOverlaps
#' @importMethodsFrom S4Vectors from to
#' @export
setMethod("match",
c("GTuples", "GTuples"),
function(x, table, nomatch = NA_integer_, incomparables = NULL,
ignore.strand = FALSE) {
if (!isSingleNumberOrNA(nomatch)) {
stop("'nomatch' must be a single number or NA")
}
if (!is.integer(nomatch)) {
nomatch <- as.integer(nomatch)
}
if (!is.null(incomparables)) {
stop("\"match\" method for GenomicRanges objects only accepts ",
"'incomparables = NULL'")
}
if (!isTRUEorFALSE(ignore.strand)) {
stop("'ignore.strand' must be TRUE or FALSE")
}
## Calling merge() is the way to check that 'x' and 'table' are
## based on the same reference genome.
## That is, it's called for its side-effect (generally, not a good
## programming strategy)
merge(seqinfo(x), seqinfo(table))
ol <- findOverlaps(query = x,
subject = table,
type = "equal",
ignore.strand = ignore.strand)
if (!ignore.strand) {
# Only keep those matches where the strands are identical
x_strand <- strand(x)[from(ol)]
table_strand <- strand(table)[to(ol)]
ol <- ol[x_strand == table_strand]
}
val <- selectHits(ol, "first")
val[is.na(val)] <- nomatch
val
}
)
# NOTE: Need to explicitly import %in%.
#' @importMethodsFrom S4Vectors %in%
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### selfmatch()
###
# Can't defer to selfMatch,GenomicRanges-method introduced in
# https://hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/GenomicRanges@116115
# so include this kludge to restore previous behaviour
#' @importFrom S4Vectors selfmatch
#' @export
setMethod("selfmatch",
"GTuples",
function(x, ignore.strand = FALSE, ...) {
if (!isTRUEorFALSE(ignore.strand)) {
stop("'ignore.strand' must be TRUE or FALSE")
}
match(x, x, ignore.strand = ignore.strand, ...)
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### order() and related methods.
###
### The order() and rank() methods for GTuples objects are consistent with the
### order implied by pcompare().
### is.unsorted() is a quick/cheap way of checking whether a GTuples
### object is already sorted, e.g., called prior to a costly sort.
### sort is defined via inheritance to GRanges
#' @importFrom S4Vectors decode
.GTuples.asIntegerTuples <- function(x, ignore.strand = FALSE) {
a <- decode(seqnames(x))
if (ignore.strand) {
b <- integer(length(x))
} else {
b <- decode(strand(x))
}
c <- start(x)
if (size(x) > 2L) {
ipd <- IPD(x)
# NOTE: This is somewhat inefficient since it constructs the matrix and then
# splits it into a list of vectors.
# TODO (longterm): Do this without the aforementioned overhead.
rest <- split(ipd, rep(seq_len(ncol(ipd)), each = nrow(ipd)))
} else {
rest <- list()
}
c(list(a, b, c), rest)
}
#' @importFrom data.table := .GRP
#' @importFrom S4Vectors isTRUEorFALSE decode
#'
#' @export
setMethod("is.unsorted", "GTuples",
function(x, na.rm = FALSE, strictly = FALSE, ignore.strand = FALSE) {
if (!identical(na.rm, FALSE)) {
warning("\"is.unsorted\" method for '", class(x), "' objects ",
"ignores the 'na.rm' argument")
}
if (!isTRUEorFALSE(strictly)) {
stop("'strictly' must be TRUE of FALSE")
}
if (is.na(size(x))) {
return(FALSE)
}
if (size(x) < 3L) {
callNextMethod()
} else {
# NOTE: It is tempting to use
# if (ignore.strand) {
# x <- unstrand(x)
# }
# is.unsorted(order(x), strictly = strictly)
# However, this is incorrect when there are repeated tuples
# because order assigns these distinct values
# This implementation is clunky but has correct behaviour.
# The idea is to check that each 2-tuple within the m-tuple
# is unsorted and return TRUE as soon as this is identified.
# NOTE: a and b are the same for all 2-tuples whereas c and d are
# updated as we move through the 2-tuples
a <- decode(seqnames(x))
if (ignore.strand) {
b <- integer(length(x))
} else {
b <- decode(strand(x))
}
c <- start(x)
d <- x@internalPos[, 1L]
unsorted <- !S4Vectors:::sortedIntegerQuads(a, b, c, d,
strictly = strictly)
if (unsorted) {
return(TRUE)
}
if (size(x) > 3) {
for (j in seq(2L, size(x) - 2L, 1L)) {
c <- x@internalPos[, j - 1]
d <- x@internalPos[, j]
}
unsorted <- !S4Vectors:::sortedIntegerQuads(a, b, c, d,
strictly = strictly)
if (unsorted) {
return(TRUE)
}
}
c <- x@internalPos[, size(x) - 2L]
d <- end(x)
!S4Vectors:::sortedIntegerQuads(a, b, c, d, strictly = strictly)
}
}
)
#' @importFrom data.table := .I setorderv
#' @importFrom S4Vectors isTRUEorFALSE
.order.GTuples <- function(x, decreasing = FALSE, ignore.strand = FALSE) {
if (!isTRUEorFALSE(decreasing)) {
stop("'decreasing' must be TRUE or FALSE")
}
key <- c("seqnames", "strand", paste0("pos", seq_len(size(x))))
dt <- .GT2DT(x, ignore.strand)[, idx := .I]
if (decreasing) {
order <- -1L
} else {
order <- 1L
}
setorderv(dt, key, order = order)
dt[, idx]
# TODO: Use this slightly faster version once data.table:::forderv() is
# exported.
# data.table:::forderv(.GT2DT(x, ignore.strand),
# by = c("seqnames", "strand", paste0("pos", seq_len(size(x)))),
# sort = TRUE,
# retGrp = FALSE,
# order = order),
# na.last = na.last)
}
#' @importFrom utils globalVariables
globalVariables("idx")
# TODO: Support the 'ignore.strand' argument once order,GenomicRanges-method
# does.
#' @importFrom S4Vectors isTRUEorFALSE
#' @importMethodsFrom GenomeInfoDb seqnames
#'
#' @export
setMethod("order",
"GTuples",
function(..., na.last = TRUE, decreasing = FALSE,
method = c("auto", "shell", "radix")) {
if (!isTRUEorFALSE(decreasing)) {
stop("'decreasing' must be TRUE or FALSE")
}
args <- list(...)
sizes <- vapply(args, size, integer(1L))
if (all(is.na(sizes))) {
return(integer(0L))
}
if (!.zero_range(sizes)) {
stop("All '", class(args[[1]]), "' objects must have the same ",
"'size'.")
}
if (length(args) == 1L) {
return(.order.GTuples(args[[1L]], decreasing))
} else {
# TODO: Pass ignore.strand to .GTuples.asIntegerTuples once
# order() supports this argument.
order_args <- c(unlist(lapply(args, .GTuples.asIntegerTuples),
recursive = FALSE, use.names = FALSE),
list(na.last = na.last,
decreasing = decreasing))
do.call(order, order_args)
}
}
)
# NOTE: Need to (temporarily?) redefine sort,GTuples-method (see
# https://github.com/PeteHaitch/GenomicTuples/issues/31)
#' @importMethodsFrom S4Vectors extractROWS
.sort.GTuples <- function(x, decreasing = FALSE, ignore.strand = FALSE, by) {
if (is.na(size(x))) {
return(x)
}
if (missing(by)) {
oo <- .order.GTuples(x, decreasing, ignore.strand)
} else {
if (!identical(ignore.strand, FALSE)) {
warning("'ignore.strand' ignored when 'by' is specified")
}
oo <- S4Vectors:::orderBy(by, x, decreasing = decreasing)
}
extractROWS(x, oo)
}
### S3/S4 combo for sort.GenomicRanges
# NOTE: Both export and define sort.GTuples as an S3 method so that (a) it can
# be called directly, (b) tab-completion on the name of the generic
# shows it, and (c) methods() doesn't asterisk them (based on advice
# from GenomicRanges' NAMESPACE).
#' @rawNamespace S3method(sort, GTuples)
#' @export
sort.GTuples <- function(x, decreasing = FALSE, ...)
.sort.GTuples(x, decreasing = decreasing, ...)
#' @export
setMethod("sort", "GTuples", .sort.GTuples)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.