#' Infer polyA sites from droplet sequencing data
#'
#' Infer polyA sites from 10X, Drop-seq or similar 3' enriched sequencing data. Simple function
#' that looks for peaks in read coverage to estimate potential polyA sites. Default parameters are
#' chosen because they work reasonably well with the example data, but they should typically be
#' empirically selected by verifying the output.
#'
#' @param genes \code{\link[GenomicRanges:GRangesList-class]{GRangesList}} object containing
#' annotations of genes for which polyA sites are to be estimated.
#' @param bam Path to .bam file containing aligned reads used for polyA site estimation.
#' @param polyA_downstream (numeric) How far downstream of a peak in coverage are polyA sites
#' expected? Somewhat depends on input DNA fragment size. (default: 100). Importantly, this value
#' should not be larger than half of the window size (wdsize), else polyA sites might be moved
#' outside of the transcripts, even if they were extended using the extend_downstream parameter.
#' @param min_cvrg (numeric) Minimal coverage for peaks to be considered for polyA site estimation
#' (default: 0).
#' @param wdsize (numeric) Window size to estimate sequencing coverage along transcripts (default:
#' 200).
#' @param by (numeric) Steps in basepairs in which the sliding window should be moved along
#' transcripts to estimate smooth coverage (default: 1).
#' @param extend_downstream (numeric) To which amount should transcript annotations be extended
#' downstream when estimating polyA sites (default: 0). A reasonable value (e.g. 100-200 bp)
#' allows to account for polyA sites that fall a few basepairs downstream of terminal exons.
#' @param perc_threshold (numeric) Only sequencing coverage peaks within \code{perc_threshold}
#' percentile of coverage are considered for polyA site estimation (default: 0.9). Avoids that
#' small peaks that in coverage are considered, resulting in manby false polyA sites.
#' @param parallel (logical) Triggers parallel computing using the
#' \code{\link[BiocParallel]{BiocParallel-package}} package. This requires that a parallel
#' back-end was registered prior to executing the function. (default: FALSE).
#' @return A \code{\link[GenomicRanges:GRanges-class]{GRanges}} object containing coordinates of
#' estimated polyadenylation sites.
#' @examples
#' library(GenomicRanges)
#'
#' # protein-coding exons of genes within chr11 region
#' data("chr11_genes")
#' target_genes <- split(chr11_genes, f = chr11_genes$gene_name)
#'
#' # subset of target genes for quick example
#' target_genes <- target_genes[18:27]
#'
#' # bam file containing aligned Drop-seq reads
#' dropseq_bam <- system.file("extdata", "chr11_k562_dropseq.bam", package = "TAPseq")
#'
#' # infer polyA sites for all target genes with adjusted parameters. parameter values depend on the
#' # input data and at this stage it's best to try different settings and check the results
#' polyA_sites <- inferPolyASites(target_genes, bam = dropseq_bam, polyA_downstream = 50,
#' wdsize = 100, min_cvrg = 1, parallel = TRUE)
#' @export
inferPolyASites <- function(genes, bam, polyA_downstream = 100, min_cvrg = 0, wdsize = 200,
by = 1, extend_downstream = 0, perc_threshold = 0.9, parallel = FALSE) {
# verify that genes is a GRangesList
if (!is(genes , "GRangesList")) {
stop("genes needs to be a GRangesList containing annotations per gene!", call. = FALSE)
}
# raise warning if polyA_downstream is half or less of wdsize
if (polyA_downstream > wdsize / 2) {
warning("'polyA_downstream' is larger than 'wdsize' / 2. ",
"PolyA sites might be moved outside transcripts and missed.", call. = FALSE)
}
# calculate per base coverage for all genes ------------------------------------------------------
## TO DO: load bam only for chromosomes of target genes
message("Loading reads and computing coverage...")
# load all reads in bam file
reads <- GenomicAlignments::readGAlignments(bam)
# calculate coverage per base
read_cvrg <- GenomicAlignments::coverage(reads)
# find putative polyA sites ----------------------------------------------------------------------
message("Inferring polyA sites...")
# get polyA sites for all genes
if (parallel == TRUE) {
polyA_sites <- BiocParallel::bplapply(X = genes, FUN = polyA_site_gene, coverage = read_cvrg,
polyA_downstream = polyA_downstream, wdsize = wdsize, by = by,
extend_downstream = extend_downstream, perc_threshold = perc_threshold)
} else {
polyA_sites <- lapply(X = genes, FUN = polyA_site_gene, coverage = read_cvrg,
polyA_downstream = polyA_downstream, wdsize = wdsize, by = by,
extend_downstream = extend_downstream, perc_threshold = perc_threshold)
}
# transform output to GRanges object
polyA_sites <- unlist(GRangesList(polyA_sites))
# filter polyA sites according to min_cvrg
polyA_sites <- polyA_sites[polyA_sites$score >= min_cvrg]
# sort according to position
polyA_sites <- sort(polyA_sites)
# return output
message("Done!")
return(polyA_sites)
}
# helper functions =================================================================================
# infer polyA sites for one gene
polyA_site_gene <- function(gene, coverage, polyA_downstream, wdsize, by, extend_downstream,
perc_threshold) {
# get chromosome and strand of gene
chr <- as.character(unique(seqnames(gene)))
strand <- as.character(unique(strand(gene)))
# merge exons of gene
exons <- reduce(gene)
# find putative polyA sites if gene is on the '+' strand -----------------------------------------
if (strand == "+") {
# extend terminal exon downstream
end(exons[length(exons)]) <- end(exons[length(exons)]) + extend_downstream
# exon ends and starts in spliced (concatenated) transcript
exon_ends <- cumsum(width(exons))
exon_starts <- c(1, exon_ends[-length(exon_ends)] + 1)
tx_exons <- IRanges(exon_starts, exon_ends)
# calculate coverage in sliding windows along gene ---------------------------------------------
# coverage of gene
gene_cvrg <- Views(coverage[[chr]], ranges(exons))
# coverage of spliced transcript (concatenated)
tx_cvrg <- unlist(gene_cvrg)
# define windows based on strand of gene, so that the 3' end is for sure
# covered by a window
wd_ends <- seq(from = sum(width(exons)), to = wdsize, by = -by)
wd_starts <- wd_ends - wdsize + 1
wds <- IRanges(wd_starts, wd_ends)
# calculate coverage in each window
wd_cvrg <- sum(Views(tx_cvrg, wds))
# define putative polyA site based on local maxima (peaks) -------------------------------------
# calculate local maxima in smoothed coverage
peaks <- local_maxima(wd_cvrg)
peaks_cvrg <- wd_cvrg[peaks]
# retain peaks which coverage is >= perc_threshold
min_cvrg <- as.numeric(stats::quantile(wd_cvrg, probs = perc_threshold))
peaks <- peaks[peaks_cvrg >= min_cvrg]
# get windows for these peaks
peak_wds <- wds[peaks]
# get center of peaks
peak_centers <- round(start(peak_wds) + wdsize / 2 - 1)
# define putative polyA site
polyA_sites <- peak_centers + polyA_downstream
# if gene is on '-' strand
} else if (strand == "-") {
start(exons[1]) <- start(exons[1]) - extend_downstream
exon_ends <- cumsum(width(exons))
exon_starts <- c(1, exon_ends[-length(exon_ends)] + 1)
tx_exons <- IRanges(exon_starts, exon_ends)
# calculate coverage in sliding windows along gene ---------------------------------------------
gene_cvrg <- Views(coverage[[chr]], ranges(exons))
tx_cvrg <- unlist(gene_cvrg)
wd_starts <- seq(from = 1, to = (sum(width(exons)) - wdsize + 1), by = by)
wd_ends <- wd_starts + wdsize - 1
wds <-IRanges(wd_starts, wd_ends)
wd_cvrg <- sum(Views(tx_cvrg, wds))
# define putative polyA site based on local maxima (peaks) -------------------------------------
peaks <- local_maxima(wd_cvrg)
peaks_cvrg <- wd_cvrg[peaks]
min_cvrg <- as.numeric(stats::quantile(wd_cvrg, probs = perc_threshold))
peaks <- peaks[peaks_cvrg >= min_cvrg]
peak_wds <- wds[peaks]
peak_centers <- round(start(peak_wds) + wdsize / 2 - 1)
polyA_sites <- peak_centers - polyA_downstream + 1
} else {
stop("Strand not '+' or '-' for at least 1 gene!", call. = FALSE)
}
# translate polyA sites to genomic coordinates ---------------------------------------------------
# transform to IRanges object
polyA_sites <- IRanges(start = polyA_sites, end = polyA_sites)
# add metadata
cov_bp <- wd_cvrg[peaks] / wdsize # coverage per bp of peaks
mcols(polyA_sites) <- data.frame("name" = rep_len("polyA_site", length(cov_bp)),
"score" = cov_bp)
# sort and remove possible duplicate polyA sites due to rounding of coordinates
polyA_sites <- unique(sort(polyA_sites))
# get exon overlapping with polyA sites
polyA_overlaps <- findOverlaps(query = tx_exons, subject = polyA_sites)
polyA_exons <- S4Vectors::queryHits(polyA_overlaps)
# only retain polyA sites that overlap any exons
polyA_sites <- polyA_sites[S4Vectors::subjectHits(polyA_overlaps)]
# convert any polyA sites overlapping transcripts to genomic coordinates
if (length(polyA_sites) > 0) {
# calculate distance of polyA site to the start of the overlapping exon
pos_diff <- start(polyA_sites) - start(tx_exons[polyA_exons])
# translate polyA site coordinates to genomic coordinates
polyA_start <- start(exons[polyA_exons]) + pos_diff
# create GRanges object with genomic coordinates of polyA site
polyA_coords <- GRanges(seqnames = chr, strand = strand,
ranges = IRanges(polyA_start, polyA_start),
mcols(polyA_sites))
} else {
# if no polyA sites overlap transcripts, return empty GRanges object
polyA_coords <- GRanges()
}
# return polyA sites with genomic coordinates
return(polyA_coords)
}
## FUN: find local maxima in an ordered data series (after post in:
# https://stackoverflow.com/questions/6836409/finding-local-maxima-and-minima)
local_maxima <- function(x) {
# Use -Inf instead if x is numeric (non-integer)
y <- diff(c(-1, x)) > 0L
# y <- diff(c(-.Machine$integer.max, x)) > 0L
rle(y)$lengths
y <- cumsum(rle(y)$lengths)
y <- y[seq.int(1L, length(y), 2L)]
if (x[[1]] == x[[2]]) {
y <- y[-1]
}
# return vector with indices of local maxima
return(y)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.