# Old, slow version based on GRanges methods
#
# Find the distance to the nearest genomic feature.
#
# For a given query set of genomic regions, and a given feature set of
# regions, this function will return the distance for each query region to its
# closest feature. It ignores strand and returns the distance as positive or
# negative, depending on whether the feature is upstream or downstream.
#
# This function is similar to the bioconductor distanceToNearest function, but
# returns negative values for downstream distances instead of absolute values.
# This allows you to assess the relative location.
#
# @param query A GRanges or GRangesList object with query sets
# @param features A GRanges object with features to test distance to
#
# @return A vector of genomic distances for each query region relative to its
# closest feature.
calcFeatureDistBioc = function(query, features) {
.validateInputs(list(query=c("GRangesList","GRanges")))
if (is(query, "GRangesList")) {
# Recurse over each GRanges object
x = lapply(query, calcFeatureDist, features)
return(x)
}
precedeInd = precede(query, features)
preIndNA = is.na(precedeInd)
followInd = follow(query, features)
folIndNA = is.na(followInd)
preDist = rep(NA, length(query))
preDist[!preIndNA] = -distance(query[!preIndNA],
features[precedeInd[!preIndNA]])
postDist = rep(NA, length(query))
postDist[!folIndNA] = distance(query[!folIndNA],
features[followInd[!folIndNA]])
postHits = -preDist > postDist
postHitsNA = is.na(postHits)
dists = preDist
dists[postHits[!postHitsNA]] = postDist[postHits[!postHitsNA]]
return(dists)
}
#' Find the distance to the nearest genomic feature
#'
#' For a given query set of genomic regions, and a given feature set of
#' regions, this function will return the distance for each query region to its
#' closest feature. It ignores strand and returns the distance as positive or
#' negative, depending on whether the feature is upstream or downstream
#'
#' This function is similar to the bioconductor distanceToNearest function, but
#' returns negative values for downstream distances instead of absolute values.
#' This allows you to assess the relative location.
#'
#' @param query A GRanges or GRangesList object with query sets
#' @param features A GRanges object with features to test distance to
#'
#' @return A vector of genomic distances for each query region relative to its
#' closest feature.
#' @export
#' @examples
#' vistaSftd = GenomicRanges::shift(vistaEnhancers, 100000)
#' calcFeatureDist(vistaEnhancers, vistaSftd)
calcFeatureDist = function(query, features) {
.validateInputs(list(query=c("GRangesList","GRanges")))
if (is(query, "GRangesList")) {
# Recurse over each GRanges object
x = lapply(query, calcFeatureDist, features)
return(x)
}
queryDT = grToDt(query)
featureDT = grToDt(features)
queryDTs = splitDataTable(queryDT, "chr")
featureDTs = splitDataTable(featureDT, "chr")
as.vector(unlist(mapply(queryDTs, featureDTs[names(queryDTs)],
FUN=DTNearest)))
}
# Function uses data.table rolling join to identify the nearest features
# really quickly.
#
# @param DT1 A data.table object to be joined to a second data.table object.
# @param DT2 A second data.table object to join with DT1.
#
# @return A rolling joined data.table object.
DTNearest = function(DT1, DT2) {
#data.table::set(DT1, j=mid, value=start + round((end-start)/2))
#data.table::set(DT2, j=mid, value=start + round((end-start)/2))
if (is.null(DT1)) {
return(NULL)
}
if (is.null(DT2)) {
return(rep(NA, nrow(DT1)))
}
DT1[, mid:=start + round((end-start)/2)]
DT2[, mid:=start + round((end-start)/2)]
data.table::setorder(DT1, mid)
data.table::setorder(DT2, mid)
data.table::setattr(DT1, "sorted", "mid")
data.table::setattr(DT2, "sorted", "mid")
DT2[J(DT1), roll="nearest"]
DT2[J(DT1), start+round((end-start)/2)-mid, roll="nearest"]
}
#' Calculates the distribution of distances from a query set to closest TSS
#'
#' Given a query GRanges object and an assembly string, this function will grab
#' the TSS list for the given reference assembly and then calculate the distance
#' from each query feature to the closest TSS. It is a wrapper of
#' \code{calcFeatureDist} that uses built-in TSS features for a reference
#' assembly
#'
#' @param query A GenomicRanges or GenomicRangesList object with query regions
#' @param refAssembly A character vector specifying the reference genome
#' assembly (*e.g.* 'hg19'). This will be used to grab chromosome sizes with
#' \code{getTSSs}.
#' @return A vector of distances for each query region relative to TSSs.
#' @export
#' @examples
#' calcFeatureDistRefTSS(vistaEnhancers, "hg19")
calcFeatureDistRefTSS = function(query, refAssembly) {
features = getTSSs(refAssembly)
return(calcFeatureDist(query, features))
}
# Converts a nucleotide count into a label with abbreviation
# @param x base count
# @return A label with 'kb' or 'mb' appended if appropriate
genomeLabel = function(x) {
.validateInputs(list(x="numeric"))
lab = x
if (abs(x) > 1e6){
lab = paste0(round(x/1e6), " mb")
}
else if (abs(x) > 1e3){
lab = paste0(round(x/1e3), " kb")
}
return(lab)
}
#' Plots a histogram of distances to genomic features
#'
#' Given the results from \code{featureDistribution}, plots a histogram of
#' distances surrounding the features of interest
#'
#' @param dists Results from \code{featureDistribution}
#' @param bgdists Background distances. If provided, will plot a background
#' distribution of expected distances
#' @param featureName Character vector for plot labels (optional).
#' @param numbers a logical indicating whether the raw numbers should be
#' displayed, rather than percentages (optional).
#' @param nbins Number of bins on each side of the center point.
#' @param size Number of bases to include in plot on each side of the
#' center point.
#' @param infBins Include catch-all bins on the sides?
#' @param tile Turn on a tile mode, which plots a tiled figure
#' instead of a histogram.
#' @param labelOrder -- Enter "default" to order by order of user input (default);
#' Enter "center" to order by value in tile in the closest proximity to the center
#' of features (in case TSS is used - center is TSS) (center).
#' @return A ggplot2 plot object
#' @export
#' @examples
#' TSSdist = calcFeatureDistRefTSS(vistaEnhancers, "hg19")
#' f = plotFeatureDist(TSSdist, featureName="TSS")
plotFeatureDist = function(dists, bgdists=NULL, featureName="features",
numbers=FALSE, nbins=50, size=100000,
infBins=FALSE, tile=FALSE, labelOrder="default") {
df = cutDists(dists, divisions=NULL, nbins, size, infBins)
if(is.list(dists)){
nplots = length(dists)
} else {
nplots = 1
}
if (!is.null(bgdists)) {
bgDistsDF = cutDists(bgDists, divisions=NULL, nbins, size, infBins)
# bgDistsDF$Freq= scale(bgDistsDF$Freq, center=FALSE)
bgDistsDF$Freq = (bgDistsDF$Freq / sum(bgDistsDF$Freq)) * 100
df$bgFreq = rep(bgDistsDF$Freq, nplots)
df$bgX = rep(seq_len(nrow(bgDistsDF)-1), nplots)
}
if ("name" %in% names(df)){
df$name = sortingFunction(df, labelOrder, nbins)
if (!numbers)
df$Freq = df[, .(Freq.Per = (Freq / sum(Freq)) * 100),
by = name]$"Freq.Per"
df$name = sortingFunction(df, labelOrder, nbins)
# It has multiple regions
g = ggplot(df, aes(x=cuts, y=Freq, fill=name, color = name)) +
facet_grid(. ~name)
} else {
if (!numbers)
df$Freq = (df$Freq / sum(df$Freq)) * 100
g = ggplot(df, aes(x=cuts, y=Freq))
}
if (!is.null(bgdists)) {
# bgtrack = scale(smooth(bgDistsDF$Freq), center=FALSE)
g = g +
geom_line(stat="identity", aes(x=bgX,y=bgFreq),
color="gray", alpha=1, size=1.5) +
geom_bar(stat="identity", aes(x=cuts,y=bgFreq), fill="gray", alpha=0.8)
}
# find midpoint
midx = nrow(df)/2/nplots
barcount = nrow(df)/nplots
minlabel = genomeLabel(-size)
maxlabel = genomeLabel(size)
edgeLabels = c(minlabel, rep("", barcount-2), maxlabel)
if (tile) {
if (!"name" %in% names(df)) {
df$name = "Region set"
}
ncuts = length(unique(df$cuts))
xs = rep(seq_len(ncuts), nplots)
g = ggplot(df) +
geom_raster(aes(x=xs, y=name, fill=Freq)) +
scale_fill_gradient(low="navy", high="orange") +
geom_point(aes(x=midx, y=0.5), color="black",
size=2, shape=17, alpha=0.8) +
theme_classic() +
labs(fill=ifelse(numbers,"Counts","Frequency (%)")) +
theme(legend.position="bottom") +
xlab(paste("Distance to", featureName)) +
theme(axis.text.x=element_text(angle = 0, hjust = 0.5, vjust=0.5)) +
scale_x_continuous(breaks=c(1, ncuts), labels=c(minlabel, maxlabel))
return(g)
}
if ("name" %in% names(df)) {
g = g +
geom_bar(data=df, stat="identity", alpha=0.7)
} else {
g = g +
geom_bar(data=df, stat="identity", fill="darkblue", alpha=0.7)
}
g = g + geom_point(aes(x=midx, y=0), color="tan2", size=2,
shape=17, alpha=0.8) +
guides(fill="none") + # remove legend for geom_point
theme_classic() +
theme(aspect.ratio=1) +
theme_blank_facet_label() +
xlab(paste("Distance to", featureName)) +
ylab(ifelse(numbers,"Counts","Frequency (%)")) +
# theme(axis.text.x=element_text(angle = 90, hjust = 1, vjust=0.5))
theme(axis.text.x=element_text(angle = 0, hjust = 0.5, vjust=0.5)) +
theme(plot.title = element_text(hjust = 0.5)) + # Center title
ggtitle(paste("Distribution relative to", featureName)) +
theme(legend.position="bottom") +
theme(panel.spacing.x=unit(1, "lines")) +
scale_x_discrete(labels=edgeLabels) +
scale_x_discrete(labels=edgeLabels, expand=expansion(mult=0.035))
return(g)
}
# Internal helper function for \code{plotFeatureDist}:
# orderes datasets based on their order in the user provided list,
# or based on the value around feature center (in TSS based on TSS)
#
# @param df A data.table with varibales "cuts" - based on created bins in
# \code{plotFeatureDist} function , "Freq" - either frequency or raw
# counts in aa given bin, "name" - name of the dataset
# @param labelOrder The method used to order datasets. Options: "default"
# orderes datasets in a plot based on order of datasets in GRangesList
# provided by user; "center" orderes datasets based on value in a central
# bin of the plot.
# @param nbins Number of bins on each side of the center point - input in
# \code{plotFeatureDist} function.
# @return A factor of names in "df" input with levels sorted based on
# sorting option.
sortingFunction = function(df, labelOrder="default", nbins=50){
if(labelOrder == "default"){
orderedLabels = unique(df$name)
orderedNames = factor(df$name, levels = orderedLabels)
return(orderedNames)
}
if (labelOrder == "center"){
# get the value around center, sort lables based on
# central values, use the labels as factor levels
centerIndex = seq(nbins, nrow(df), by = (nbins*2))
centerTiles = df[centerIndex,]
orderTiles = centerTiles[order(centerTiles$Freq, decreasing = TRUE),]
orderedLabels = orderTiles$name
orderedNames = factor(df$name, levels = orderedLabels)
return(orderedNames)
}
}
# Internal helper function for \code{plotFeatureDist}
#
# @param dists A vector of genomic distances.
# @param divisions A vector of bin sizes to divide the dists into.
# @param nbins Number of bins on each side of the center point.
# @param size Number of bases to include in plot on
# each side of the center point.
# @param infBins Include catch-all bins on the sides?
# @return A data.frame of the table of the frequency of dists in divisions.
cutDists = function(dists, divisions=NULL, nbins=50,
size=100000, infBins=TRUE) {
if (is.null(divisions)) {
poscuts = seq(0, size, by=size/nbins)
divisions = sort(unique(c(-poscuts, poscuts)))
if (infBins) {
divisions = c(-Inf, divisions, Inf)
}
}
if (is.list(dists)) {
x = lapply(dists, cutDists, divisions)
# To accommodate multiple lists, we'll need to introduce a new 'name'
# column to distinguish them.
nameList = names(dists)
if(is.null(nameList)) {
nameList = seq_along(dists) # Fallback to sequential numbers
}
# Append names
xb = rbindlist(x)
xb$name = rep(nameList, vapply(x, nrow, integer(1)))
return(xb)
}
labels = labelCuts(sort(divisions), collapse=" to ", infBins=infBins)
cuts = cut(dists, divisions, labels)
df = as.data.frame(table(cuts))
setDT(df)
return(df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.