Nothing
#' Assign peak midpoints to defined gene loci.
#'
#' Determine the midpoints of a set of input regions \code{peaks} and the overlap of the midpoints with a given locus definition \code{locusdef}. Also report the TSS that is nearest each region (peak) overlapping a defined locus and its distance.
#'
#' Typically, this function will not be used alone, but inside \code{chipenrich()}.
#'
#' @param peaks A \code{GRanges} object representing regions to be used for enrichment.
#' @param locusdef A locus definition object from \code{chipenrich.data}.
#' @param tss A \code{GRanges} object representing the TSSs for the genome build. Includes \code{mcols} for Entrez Gene ID \code{gene_id} and gene symbol \code{symbol}.
#' @param weighting A string defining what weighting option they want. Current options are 'multiAssign', 'signalValue', and 'logSignal Value'. Default is NULL.
#'
#' @return A \code{data.frame} with columns for \code{peak_id, chr, peak_start, peak_end, gene_locus_start, gene_locus_end, gene_id, nearest_tss, nearest_tss_gene, dist_to_tss, nearest_tss_gene_strand}. The result is used in \code{num_peaks_per_gene()}.
#'
#' @examples
#'
#' data('locusdef.hg19.nearest_tss', package = 'chipenrich.data')
#' data('tss.hg19', package = 'chipenrich.data')
#'
#' file = system.file('extdata', 'test_assign.bed', package = 'chipenrich')
#' peaks = read_bed(file)
#'
#' assigned_peaks = assign_peaks(
#' peaks = peaks,
#' locusdef = locusdef.hg19.nearest_tss,
#' tss = tss.hg19)
#'
#' @export
assign_peaks = function(peaks, locusdef, tss, weighting=NULL) {
# Extract GRanges of locusdef
# NOTE: locusdef is an environment, so uses @
ldef_gr = locusdef@granges
# Determine midpoints of peaks and construct a GRanges object
# on that basis. Include the peak name for later merging
peak_mids = IRanges::mid(GenomicRanges::ranges(peaks))
mids_gr = GenomicRanges::GRanges(
seqnames = GenomeInfoDb::seqnames(peaks),
ranges = IRanges::IRanges(start = peak_mids, end = peak_mids),
name = GenomicRanges::mcols(peaks)$name
)
if (all(c("signalValue", "logsignalValue") %in% weighting)) {
stop("You can only choose one of signalValue and logsignalValue!")
}
if (any(c("signalValue", "logsignalValue") %in% weighting)) {
if (!("signalValue" %in% colnames(GenomicRanges::mcols(peaks)))) {
stop("No signalValue column!")
}
mids_gr$signalValue = GenomicRanges::mcols(peaks)$signalValue
}
# Determine overlaps of peak midpoints with locus definition
mid_ldef_overlaps = GenomicRanges::findOverlaps(mids_gr, ldef_gr)
if(length(mid_ldef_overlaps) == 0) {
stop('Intersection between peak midpoints and locus definition is empty. Try selecting a different locus definition.')
}
# Determine distance of peak midpoints to nearest TSSs
mid_dist_to_ntss = GenomicRanges::distanceToNearest(mids_gr, tss)
# Sign the distances by up (-) or down (+) stream
mid_starts = GenomicRanges::start(mids_gr)[S4Vectors::queryHits(mid_dist_to_ntss)]
tss_starts = GenomicRanges::start(tss)[S4Vectors::subjectHits(mid_dist_to_ntss)]
tss_pos_strand = GenomicRanges::strand(tss)[S4Vectors::subjectHits(mid_dist_to_ntss)] == "+"
tss_neg_strand = GenomicRanges::strand(tss)[S4Vectors::subjectHits(mid_dist_to_ntss)] == "-"
neg1 = (mid_starts < tss_starts) & (tss_pos_strand)
neg2 = (mid_starts > tss_starts) & (tss_neg_strand)
neg_dists = as.logical(neg1 | neg2)
# Reassign the signed distances
dists = GenomicRanges::mcols(mid_dist_to_ntss)$distance
dists[neg_dists] = dists[neg_dists] * -1
GenomicRanges::mcols(mid_dist_to_ntss)$distance = dists
# Construct data.frames for overlaps and distances
mid_indices = S4Vectors::queryHits(mid_ldef_overlaps)
ldef_indices = S4Vectors::subjectHits(mid_ldef_overlaps)
mid_ldef_df = data.frame(
peak_id = GenomicRanges::mcols(mids_gr)$name[mid_indices],
chr = GenomeInfoDb::seqnames(mids_gr)[mid_indices],
peak_start = GenomicRanges::start(peaks)[mid_indices],
peak_end = GenomicRanges::end(peaks)[mid_indices],
gene_locus_start = GenomicRanges::start(ldef_gr)[ldef_indices],
gene_locus_end = GenomicRanges::end(ldef_gr)[ldef_indices],
gene_id = GenomicRanges::mcols(ldef_gr)$gene_id[ldef_indices],
gene_symbol = GenomicRanges::mcols(ldef_gr)$symbol[ldef_indices],
stringsAsFactors = FALSE
)
# Add the signalValue column if method == "polyenrich_weighted"
# Stops if there isn't one
if (any(c("signalValue","logsignalValue") %in% weighting)) {
mid_ldef_df$signalValue = GenomicRanges::mcols(peaks)$signalValue[mid_indices]
}
mid_indices = S4Vectors::queryHits(mid_dist_to_ntss)
tss_indices = S4Vectors::subjectHits(mid_dist_to_ntss)
mid_dist_df = data.frame(
peak_id = GenomicRanges::mcols(mids_gr)$name[mid_indices],
nearest_tss = GenomicRanges::start(tss)[tss_indices],
nearest_tss_gene_id = GenomicRanges::mcols(tss)$gene_id[tss_indices],
nearest_tss_symbol = GenomicRanges::mcols(tss)$symbol[tss_indices],
dist_to_tss = GenomicRanges::mcols(mid_dist_to_ntss)$distance,
nearest_tss_gene_strand = GenomicRanges::strand(tss)[tss_indices],
stringsAsFactors = FALSE
)
# Merge the overlap and distance data.frames on the peak names
d = base::merge(
x = mid_ldef_df,
y = mid_dist_df,
by = 'peak_id',
all.x = TRUE,
all.y = FALSE,
sort = FALSE
)
# Reorder the columns
column_order = c(
"peak_id",
"chr",
"peak_start",
"peak_end",
"gene_id",
"gene_symbol",
"gene_locus_start",
"gene_locus_end",
"nearest_tss",
"dist_to_tss",
"nearest_tss_gene_id",
"nearest_tss_symbol",
"nearest_tss_gene_strand"
)
if (any(c("signalValue","logsignalValue") %in% weighting)) {
column_order = c(column_order, "signalValue")
}
d = d[, column_order]
return(d)
}
#' Assign whole peaks to all overlapping defined gene loci.
#'
#' Determine all overlaps between the set of input regions \code{peaks} and the given locus definition \code{locusdef}. In addition, report where each overlap begins and ends, as well as the length of the overlap.
#'
#' Typically, this function will not be used alone, but inside \code{chipenrich()} with \code{method = 'broadenrich'}.
#'
#' @param peaks A \code{GRanges} object representing regions to be used for enrichment.
#' @param locusdef A locus definition object from \code{chipenrich.data}.
#'
#' @return A \code{data.frame} with columns for \code{peak_id, chr, peak_start, peak_end, gene_locus_start, gene_locus_end, gene_id, overlap_start, overlap_end, peak_overlap}. The result is used in \code{num_peaks_per_gene()}.
#'
#' @examples
#'
#' data('locusdef.hg19.nearest_tss', package = 'chipenrich.data')
#' data('tss.hg19', package = 'chipenrich.data')
#'
#' file = system.file('extdata', 'test_assign.bed', package = 'chipenrich')
#' peaks = read_bed(file)
#'
#' assigned_peaks = assign_peak_segments(
#' peaks = peaks,
#' locusdef = locusdef.hg19.nearest_tss)
#'
#' @export
assign_peak_segments = function(peaks, locusdef) {
# Extract GRanges of locusdef
# NOTE: locusdef is a class, so uses @
ldef_gr = locusdef@granges
# Determine overlaps of peaks with locus definition
peak_ldef_overlaps = GenomicRanges::findOverlaps(peaks, ldef_gr)
if(length(peak_ldef_overlaps) == 0) {
stop('Intersection between peaks and locus definition is empty. Try selecting a different locus definition.')
}
# Determine start and ends for peaks and genes that overlap
peak_indices = S4Vectors::queryHits(peak_ldef_overlaps)
ldef_indices = S4Vectors::subjectHits(peak_ldef_overlaps)
peak_start = GenomicRanges::start(peaks)[peak_indices]
peak_end = GenomicRanges::end(peaks)[peak_indices]
gene_start = GenomicRanges::start(ldef_gr)[ldef_indices]
gene_end = GenomicRanges::end(ldef_gr)[ldef_indices]
# Use pmin() and pmax() to determine componentwise mins and maxes
# which are also the starts and ends of the overlaps
overlap_start = pmax(peak_start, gene_start)
overlap_end = pmin(peak_end, gene_end)
overlap_length = overlap_end - overlap_start + 1
d = data.frame(
peak_id = GenomicRanges::mcols(peaks)$name[peak_indices],
chr = GenomeInfoDb::seqnames(peaks)[peak_indices],
peak_start = peak_start,
peak_end = peak_end,
gene_locus_start = gene_start,
gene_locus_end = gene_end,
gene_id = GenomicRanges::mcols(ldef_gr)$gene_id[ldef_indices],
gene_symbol = GenomicRanges::mcols(ldef_gr)$symbol[ldef_indices],
overlap_start = overlap_start,
overlap_end = overlap_end,
peak_overlap = overlap_length,
stringsAsFactors = FALSE
)
return(d)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.