R/closest.R
In HelloRanges: Introduce *Ranges to bedtools users

Documented in bedtools_closest R_bedtools_closest

bedtools_closest <- function(cmd = "--help") {
    do_R_call(R_bedtools_closest, BEDTOOLS_CLOSEST_DOC, cmd)
}

## Essentially an LOJ based on closest match between 'a' and 'b'

### NOTE: When ranges overlap, bedtools selects the 'b' range that
### overlaps the largest fraction of the 'a' range. That behavior
### seems somewhat arbitrary, so we do not imitate it.

### NOTE: bedtools distances are one larger than those from
### distance(), except in the case of overlap, where both yield
### zero. Our goal is analogous behavior, not identical behavior, so
### this is fine. But it does mean that the negative distances
### generated by -D no longer make sense.

R_bedtools_closest <- function(a, b, s=FALSE, S=FALSE,
                               d=FALSE, D=c("none", "ref", "a", "b"),
                               io=FALSE, iu=FALSE, id=FALSE,
                               fu=FALSE, fd=FALSE,
                               t=c("all", "first", "last"),
                               mdb=c("each", "all"), k=1L,
                               names=NULL, filenames=FALSE,
                               N=FALSE)
{
    D <- match.arg(D)
    t <- match.arg(t)
    mdb <- match.arg(mdb)
    
    stopifnot(isSingleString(a) || hasRanges(a),
              (is.character(b) && !anyNA(b) && length(b) >= 1L) ||
                  hasRanges(b),
              isTRUEorFALSE(s),
              isTRUEorFALSE(S), !(s && S),
              isTRUEorFALSE(d), !d || D == "none",
              isTRUEorFALSE(io),
              isTRUEorFALSE(iu), !iu || D != "none", !(iu && fu),
              isTRUEorFALSE(id), !id || D != "none", !(io && iu && id),
                                 !(id && fd),
              isTRUEorFALSE(fu),
              isTRUEorFALSE(fd), !(fu && fd),
              isSingleNumber(k), k >= 1L,
              isTRUEorFALSE(filenames),
              isTRUEorFALSE(N))

    if (k > 1L) {
        stop("'-k' > 1 not yet supported")
    }
    if (N) {
        stop("'-N' not yet supported, ",
             "but see ?nearest for when 'subject' is missing")
    }
    
    if (!is.null(names)) {
        stopifnot(is.character(names), !anyNA(names),
                  length(names) == length(b))
    }
    
    R(genome <- Seqinfo(genome=NA_character_)) # no 'g' parameter
    
    a <- normA(a)
    b <- normB(b)

    .gr_a <- importA(a)

    .gr_b <- importB(b, names, filenames)
    loopOverB <- length(b) > 1L && mdb == "each"
    if (loopOverB) {
        .preamble <- .code
        rm(.code)
        .gr_b_orig <- .gr_b
        .gr_b <- as.name(paste0(.gr_b, "_i"))
    }

    .gr_a_o <- prepOverlapRanges(a, FALSE)
    .gr_b_o <- prepOverlapRanges(b, FALSE)
    
    if (S) {
        .gr_b_o <- .R(invertStrand(.gr_b_o))
    }
    
    ignore.strand <- !(s || S)
    deferRestriction <- !ignore.strand && D == "ref" && (id || iu)
    useNearest <- !(iu || id || io || fu || fd)
    .hits <- quote(hits)
    if (useNearest) { 
        R(.hits <- nearest(.gr_a_o, .gr_b_o, ignore.strand=ignore.strand,
                           select="all"))
    } else {
        hits <- NULL
        if (!io) {
            R(overlaps <- findOverlaps(.gr_a_o, .gr_b_o,
                                       ignore.strand=ignore.strand))
            hits <- quote(overlaps)
        }
        ignore.strand <- ignore.strand && !(D %in% c("a", "b"))
        if (!(s || S)) {
            if (D == "a")
                .gr_b_o <- .R(unstrand(.gr_b_o))
            else if (D == "b")
                .gr_a_o <- .R(unstrand(.gr_a_o))
        }
        transpose <- D == "b" && (!iu || !id)
        if (transpose) {
            .gr_tmp_o <- .gr_a_o
            .gr_a_o <- .gr_b_o
            .gr_b_o <- .gr_tmp_o
            .findUp <- quote(precede)
            .findDown <- quote(follow)
            .aHits <- quote(subjectHits)
        } else {
            .findUp <- quote(follow)
            .findDown <- quote(precede)
            .aHits <- quote(queryHits)
        }
        if (!iu || deferRestriction) {
            R(upstream <- .findUp(.gr_a_o, .gr_b_o,
                                  ignore.strand=ignore.strand,
                                  select="all"))
            hits <- c(hits, quote(upstream))
        }
        if (!id || deferRestriction) {
            R(downstream <- .findDown(.gr_a_o, .gr_b_o,
                                      ignore.strand=ignore.strand,
                                      select="all"))
            hits <- c(hits, quote(downstream))
        }
        if (fu) {
            if (!io) {
                R(overlaps <-
                      overlaps[!queryHits(overlaps) %in% .aHits(upstream)])
            }
            if (!id) {
                R(downstream <-
                      downstream[!.aHits(downstream) %in% .aHits(upstream)])
            }
        }
        if (fd) {
            if (!io) {
                R(overlaps <-
                      overlaps[!queryHits(overlaps) %in% .aHits(downstream)])
            }
            if (!iu) {
                R(upstream <-
                      upstream[!.aHits(upstream) %in% .aHits(downstream)])
            }
        }
        if (length(hits) > 1L) {
            .combine <- as.call(c(quote(c), hits))
            R(.hits <- .combine)
            unresolved <- length(hits) == 3L || !(fd || fu)
            if (unresolved) {
                R(.hits <- selectNearest(.hits, .gr_a_o, .gr_b_o))
            }
        } else {
            .hits <- hits[[1L]]
        }
        if (transpose) {
            .t <- quote(t)
            .hits <- .R(.t(.hits))
        }
    }

    if (t != "all") {
        .breakTies <- .R(.hits <- breakTies(.hits, t))
        if (!deferRestriction) {
            R(.breakTies)
        }
    }
    
    R(ans <- pair(.gr_a, .gr_b, .hits, all.x=TRUE))

    if (d || D != "none") {
        R(mcols(ans)$distance <- distance(ans))
    }
    
    if (deferRestriction) {
        .id <- quote(start(second) < end(first))
        .iu <- quote(end(first) > start(second))
        .restriction <- if (id) .R(.id) else if (iu) .R(.iu) else .R(.id & .iu)
        if (t == "all") {
            R(ans <- subset(ans, .restriction))
        } else {
            R(.hits <- .hits[with(ans, .restriction)])
            R(.breakTies)
            R(ans <- pair(.gr_a, .gr_b, .hits, all.x=TRUE))
            R(mcols(ans)$distance <- distance(ans))
        }
    }

    if (loopOverB) {
        .code[[length(.code)]] <- .code[[length(.code)]][[3]]
        args <- as.pairlist(setNames(alist(bi=), as.character(.gr_b)))
        loopFun <- as.call(c(quote(`function`), list(args), list(.code)))
        .code <- .preamble
        rm(b)
        R(ans <- unlist(List(lapply(split(.gr_b_orig, ~ b), loopFun)),
                        use.names=FALSE))
    }

    R(ans)
}

BEDTOOLS_CLOSEST_DOC <-
    "Usage:
       bedtools_closest [options]
     Options:
       -a <FILE>  BAM/BED/GFF/VCF file A. Each feature in A is compared to B
          in search of overlaps. Use 'stdin' if passing A with a UNIX pipe.
       -b <FILE1,...>  One or more BAM/BED/GFF/VCF file(s) B. Use 'stdin' if
          passing B with a UNIX pipe. -b may be followed with multiple
          databases and/or wildcard (*) character(s).
       -s  Require same strandedness. That is, find the closest feature in B
           that overlaps A on the _same_ strand. By default, overlaps are
           reported without respect to strand.
       -S  Require opposite strandedness. That is, find the closest feature in
           B that overlaps A on the _opposite_ strand. By default, overlaps are
           reported without respect to strand.
       -d  In addition to the closest feature in B, report its distance to A
           as an extra column. The reported distance for overlapping features
           will be 0.
       -D <mode>  Like -d, report the closest feature in B, and its distance to
          A as an extra column. However unlike -d, -D conveys a notion of
          upstream that is useful with other arguments.
                The options for defining which orientation is \"upstream\" are:
                * ref: Report distance with respect to the reference genome.
                       B features with a lower (start, stop) are upstream.
                * a: Report distance with respect to A.
                     When A is on the - strand, \"upstream\" means B has a
                     higher (start,stop).
                * b: Report distance with respect to B.
                     When B is on the - strand, \"upstream\" means A has a
                     higher (start,stop).
       --io  Ignore features in B that overlap A. That is, we want close,
             yet not touching features only.
       --iu  Ignore features in B that are upstream of features in A. This
             option requires -D and follows its orientation rules for
             determining what is \"upstream\".
       --id  Ignore features in B that are downstream of features in A. This
             option requires -D and follows its orientation rules for
             determining what is \"downstream\".
       --fu  Choose first from features in B that are upstream of features in A.
             This option requires -D and follows its orientation rules for
             determining what is \"upstream\".
       --fd  Choose first from features in B that are downstream of features in
             A. This option requires -D and follows its orientation rules for
             determining what is \"downstream\".
       -t <mode>  Specify how ties for closest feature should be handled. This
          occurs when two features in B have exactly the same \"closeness\"
         with A. By default, all such features in B are reported.
                Here are all the options:
                * all: Report all ties (default).
                * first: Report the first tie that occurred in the B file.
                * last: Report the last tie that occurred in the B file.
       --mdb <mode>  Specifiy how multiple databases should be resolved.
                * each: Report closest records for each database (default).
                * all: Report closest records among all databases.
       -k <number>  Report the k closest hits. Default is 1. If
          tieMode = \"all\", all ties will still be reported.
       --names <NAME1,...>  When using multiple databases (-b), provide an
               alias for each that will appear instead of a fileId when also
               printing the DB record.
       --filenames  When using multiple databases (-b), show each complete
                    filename instead of a fileId when also printing the DB
                    record.
       -N  Require that the query and the closest hit have different names.
           For BED, the 4th column is compared."

do_bedtools_closest <- make_do(R_bedtools_closest)