# This file was generated by Rcpp::compileAttributes
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
binary_search = function(breaks, search, left_index = TRUE) {
.Call('epik_binary_search', PACKAGE = 'epik', breaks, search, left_index) + 1
}
# == title
# Extract subset of sites in a set of intervals
#
# == param
# -start start positions, a single integer or a numeric vector
# -end end positions, a single integer or a numeric vector
# -site positions of all sites, a numeric vector, should be sorted increasingly.
# -return_index whether return the index in the position vector or just the position itself?
# -min_sites minimal number of sites in an interval, regions which contain sites less than this value will be filtered out.
#
# == details
# Providing a huge vector of genomic positions, we want to extract subset of positions which
# locate in a specific group of regions (e.g. extract CpG sites in DMRs). The simplest way is to use:
#
# site = sort(sample(10000000, 1000000))
# start = 123456
# end = 654321
# subsite = site[site >= start & site <= end]
#
# Unfortunately, in above code, the whole vector ``site`` will be scanned four times
# (``>=``, ``<=``, ``&`` and ``[``).
# If you want to look for sites in more than one regions (e.g. 1000 regions), in every
# loop, the whole ``site`` vector will be re-scanned again and again which is very time-consuming.
#
# Here we have `extract_sites` function which uses binary search to do subsetting.
# Of course, ``site`` should be sorted non-decreasing beforehand.
#
# subsite = extract_sites(start, end, site, index = FALSE)
#
# Not only for single interval, you can also extract sites in multiple genomic regins,
# by setting ``start`` and ``end`` as vectors. E.g. extracting CpG sites in exons in every gene.
#
# start = c(123456, 234567, 345678)
# end = c(133456, 244567, 355678)
# subsite = extract_sites(start, end, site)
#
# You can choose to return index only or positions.
#
# subsite = extract_sites(start, end, site, return_index = FALSE)
# head(subsite)
# subsite_index = extract_sites(start, end, site, return_index = TRUE)
# head(subsite_index)
# head(site[subsite_index])
#
# Regions that include sites less than ``min_site`` will be filtered out.
#
# This kind of overlapping can also be done by defining an `IRanges::IRanges` or `GenomicRanges::GRanges`
# object and calling `GenomicRanges::findOverlaps` function. Following compares the running time for using `IRanges::findOverlaps`
# and `extract_sites`:
#
# set.seed(123)
# site = sort(sample(1000, 100))
# pos = do.call("rbind", lapply(1:10, function(i) sort(sample(max(site), 2))))
# pos_ir = IRanges(pos[, 1], pos[, 2])
# site_ir = IRanges(site, site)
# library(microbenchmark)
# microbenchmark(f1 = {r1 = extract_sites(pos[, 1], pos[, 2], site)},
# f2 = {
# mtch = findOverlaps(pos_ir, site_ir)
# r2 = start(site_ir[unique(subjectHits(mtch))])
# })
# # Unit: microseconds
# # expr min lq mean median uq max neval cld
# # f1 6.052 8.3925 13.99696 15.5855 18.132 29.366 100 a
# # f2 1105.678 1175.9220 1237.87162 1192.6070 1271.972 2054.595 100 b
#
# == value
# A vector of positions or index.
#
# == author
# Zuguang Gu <z.gu@dkfz.de>
#
# == example
# site = sort(sample(1000, 100))
# pos = do.call("rbind", lapply(1:10, function(i) sort(sample(max(site), 2))))
# extract_sites(pos[, 1], pos[, 2], site)
extract_sites = function(start, end, site, return_index = FALSE, min_sites = 0) {
.Call('epik_extract_sites', PACKAGE = 'epik', start, end, site, return_index, min_sites)
}
rowWhichMax = function(m) {
.Call('epik_rowWhichMax', PACKAGE = 'epik', m)
}
dist_by_closeness <- function(mat) {
as.dist(.Call('epik_dist_by_closeness', PACKAGE = 'epik', mat))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.