##' Annotate peaks
##'
##'
##' @title annotatePeak
##' @param peak peak file or GRanges object
##' @param tssRegion Region Range of TSS
##' @param TxDb TxDb or EnsDb annotation object
##' @param level one of transcript and gene
##' @param assignGenomicAnnotation logical, assign peak genomic annotation or not
##' @param genomicAnnotationPriority genomic annotation priority
##' @param annoDb annotation package
##' @param addFlankGeneInfo logical, add flanking gene information from the peaks
##' @param flankDistance distance of flanking sequence
##' @param sameStrand logical, whether find nearest/overlap gene in the same strand
##' @param ignoreOverlap logical, whether ignore overlap of TSS with peak
##' @param ignoreUpstream logical, if True only annotate gene at the 3' of the peak.
##' @param ignoreDownstream logical, if True only annotate gene at the 5' of the peak.
##' @param overlap one of 'TSS' or 'all', if overlap="all", then gene overlap with peak will be reported as nearest gene, no matter the overlap is at TSS region or not.
##' @param verbose print message or not
##' @param columns names of columns to be obtained from database
##' @return data.frame or GRanges object with columns of:
##'
##' all columns provided by input.
##'
##' annotation: genomic feature of the peak, for instance if the peak is
##' located in 5'UTR, it will annotated by 5'UTR. Possible annotation is
##' Promoter-TSS, Exon, 5' UTR, 3' UTR, Intron, and Intergenic.
##'
##' geneChr: Chromosome of the nearest gene
##'
##' geneStart: gene start
##'
##' geneEnd: gene end
##'
##' geneLength: gene length
##'
##' geneStrand: gene strand
##'
##' geneId: entrezgene ID
##'
##' distanceToTSS: distance from peak to gene TSS
##'
##' if annoDb is provided, extra column will be included:
##'
##' ENSEMBL: ensembl ID of the nearest gene
##'
##' SYMBOL: gene symbol
##'
##' GENENAME: full gene name
##' @import BiocGenerics S4Vectors GenomeInfoDb
##' @examples
##' \dontrun{
##' require(TxDb.Hsapiens.UCSC.hg19.knownGene)
##' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
##' peakfile <- system.file("extdata", "sample_peaks.txt", package="ChIPseeker")
##' peakAnno <- annotatePeak(peakfile, tssRegion=c(-3000, 3000), TxDb=txdb)
##' peakAnno
##' }
##' @seealso \code{\link{plotAnnoBar}} \code{\link{plotAnnoPie}} \code{\link{plotDistToTSS}}
##' @export
##' @author G Yu
annotatePeak <- function(peak,
tssRegion=c(-3000, 3000),
TxDb=NULL,
level = "transcript",
assignGenomicAnnotation=TRUE,
genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron", "Downstream", "Intergenic"),
annoDb=NULL,
addFlankGeneInfo=FALSE,
flankDistance=5000,
sameStrand = FALSE,
ignoreOverlap=FALSE,
ignoreUpstream=FALSE,
ignoreDownstream=FALSE,
overlap = "TSS",
verbose=TRUE,
columns=c("ENTREZID", "ENSEMBL", "SYMBOL", "GENENAME")) {
is_GRanges_of_TxDb <- FALSE
if (is(TxDb, "GRanges")) {
is_GRanges_of_TxDb <- TRUE
assignGenomicAnnotation <- FALSE
annoDb <- NULL
addFlankGeneInfo <- FALSE
message("#\n#.. 'TxDb' is a self-defined 'GRanges' object...\n#")
message("#.. Some parameters of 'annotatePeak' will be disable,")
message("#.. including:")
message("#..\tlevel, assignGenomicAnnotation, genomicAnnotationPriority,")
message("#..\tannoDb, addFlankGeneInfo and flankDistance.")
message("#\n#.. Some plotting functions are designed for visualizing genomic annotation")
message("#.. and will not be available for the output object.\n#")
}
if (is_GRanges_of_TxDb) {
level <- "USER_DEFINED"
} else {
level <- match.arg(level, c("transcript", "gene"))
}
if (assignGenomicAnnotation && all(genomicAnnotationPriority %in% c("Promoter", "5UTR", "3UTR", "Exon", "Intron", "Downstream", "Intergenic")) == FALSE) {
stop('genomicAnnotationPriority should be any order of c("Promoter", "5UTR", "3UTR", "Exon", "Intron", "Downstream", "Intergenic")')
}
if ( is(peak, "GRanges") ){
## this test will be TRUE
## when peak is an instance of class/subclass of "GRanges"
input <- "gr"
peak.gr <- peak
} else {
input <- "file"
peak.gr <- loadPeak(peak, verbose)
}
peakNum <- length(peak.gr)
if (verbose)
cat(">> preparing features information...\t\t",
format(Sys.time(), "%Y-%m-%d %X"), "\n")
if (is_GRanges_of_TxDb) {
features <- TxDb
} else {
TxDb <- loadTxDb(TxDb)
if (level=="transcript") {
features <- getGene(TxDb, by="transcript")
} else {
features <- getGene(TxDb, by="gene")
}
}
if (verbose)
cat(">> identifying nearest features...\t\t",
format(Sys.time(), "%Y-%m-%d %X"), "\n")
## nearest features
idx.dist <- getNearestFeatureIndicesAndDistances(peak.gr, features,
sameStrand, ignoreOverlap,
ignoreUpstream,ignoreDownstream,
overlap=overlap)
if (verbose)
cat(">> calculating distance from peak to TSS...\t",
format(Sys.time(), "%Y-%m-%d %X"), "\n")
## distance
distance <- idx.dist$distance
## update peak, remove un-map peak if exists.
peak.gr <- idx.dist$peak
## annotation
if (assignGenomicAnnotation == TRUE) {
if (verbose)
cat(">> assigning genomic annotation...\t\t",
format(Sys.time(), "%Y-%m-%d %X"), "\n")
anno <- getGenomicAnnotation(peak.gr, distance, tssRegion, TxDb, level, genomicAnnotationPriority, sameStrand=sameStrand)
annotation <- anno[["annotation"]]
detailGenomicAnnotation <- anno[["detailGenomicAnnotation"]]
} else {
annotation <- NULL
detailGenomicAnnotation <- NULL
}
## append annotation to peak.gr
if (!is.null(annotation))
mcols(peak.gr)[["annotation"]] <- annotation
has_nearest_idx <- which(idx.dist$index <= length(features))
nearestFeatures <- features[idx.dist$index[has_nearest_idx]]
## duplicated names since more than 1 peak may annotated by only 1 gene
names(nearestFeatures) <- NULL
nearestFeatures.df <- as.data.frame(nearestFeatures)
if (is_GRanges_of_TxDb) {
colnames(nearestFeatures.df)[1:5] <- c("geneChr", "geneStart", "geneEnd",
"geneLength", "geneStrand")
} else if (level == "transcript") {
if (is(TxDb, "EnsDb")) {
nearestFeatures.df <- nearestFeatures.df[, c("seqnames", "start",
"end", "width",
"strand", "gene_id",
"tx_id", "tx_biotype"),
drop = FALSE]
colnames(nearestFeatures.df) <- c(
"geneChr", "geneStart", "geneEnd", "geneLength", "geneStrand",
"geneId", "transcriptId", "transcriptBiotype")
} else {
colnames(nearestFeatures.df) <- c("geneChr", "geneStart", "geneEnd",
"geneLength", "geneStrand",
"geneId", "transcriptId")
nearestFeatures.df$geneId <- TXID2EG(
as.character(nearestFeatures.df$geneId), geneIdOnly=TRUE)
}
} else {
if (is(TxDb, "EnsDb")) {
nearestFeatures.df <- nearestFeatures.df[, c("seqnames", "start",
"end", "width",
"strand", "gene_id",
"gene_biotype"),
drop = FALSE]
colnames(nearestFeatures.df) <- c("geneChr", "geneStart", "geneEnd",
"geneLength", "geneStrand",
"geneId", "geneBiotype")
} else
colnames(nearestFeatures.df) <- c("geneChr", "geneStart", "geneEnd",
"geneLength", "geneStrand",
"geneId")
}
for(cn in colnames(nearestFeatures.df)) {
mcols(peak.gr)[[cn]][has_nearest_idx] <- unlist(nearestFeatures.df[, cn])
}
mcols(peak.gr)[["distanceToTSS"]] <- distance
if (!is.null(annoDb)) {
if (verbose)
cat(">> adding gene annotation...\t\t\t",
format(Sys.time(), "%Y-%m-%d %X"), "\n")
.idtype <- IDType(TxDb)
if (length(.idtype) == 0 || is.na(.idtype) || is.null(.idtype)) {
n <- length(peak.gr)
if (n > 100)
n <- 100
sampleID <- peak.gr$geneId[1:n]
if (all(grepl('^ENS', sampleID))) {
.idtype <- "Ensembl Gene ID"
} else if (all(grepl('^\\d+$', sampleID))) {
.idtype <- "Entrez Gene ID"
} else {
warning("Unknown ID type, gene annotation will not be added...")
.idtype <- NA
}
}
if (!is.na(.idtype)) {
peak.gr %<>% addGeneAnno(annoDb, .idtype, columns)
}
}
if (addFlankGeneInfo == TRUE) {
if (verbose)
cat(">> adding flank feature information from peaks...\t",
format(Sys.time(), "%Y-%m-%d %X"), "\n")
flankInfo <- getAllFlankingGene(peak.gr, features, level, flankDistance)
if (level == "transcript") {
mcols(peak.gr)[["flank_txIds"]] <- NA
mcols(peak.gr)[["flank_txIds"]][flankInfo$peakIdx] <- flankInfo$flank_txIds
}
mcols(peak.gr)[["flank_geneIds"]] <- NA
mcols(peak.gr)[["flank_gene_distances"]] <- NA
mcols(peak.gr)[["flank_geneIds"]][flankInfo$peakIdx] <- flankInfo$flank_geneIds
mcols(peak.gr)[["flank_gene_distances"]][flankInfo$peakIdx] <- flankInfo$flank_gene_distances
}
if (!is_GRanges_of_TxDb) {
if(verbose)
cat(">> assigning chromosome lengths\t\t\t",
format(Sys.time(), "%Y-%m-%d %X"), "\n")
peak.gr@seqinfo <- seqinfo(TxDb)[names(seqlengths(peak.gr))]
}
if(verbose)
cat(">> done...\t\t\t\t\t",
format(Sys.time(), "%Y-%m-%d %X"), "\n")
if (assignGenomicAnnotation) {
res <- new("csAnno",
anno = peak.gr,
tssRegion = tssRegion,
level=level,
hasGenomicAnnotation = TRUE,
detailGenomicAnnotation=detailGenomicAnnotation,
annoStat=getGenomicAnnoStat(peak.gr),
peakNum=peakNum
)
} else {
res <- new("csAnno",
anno = peak.gr,
tssRegion = tssRegion,
level=level,
hasGenomicAnnotation = FALSE,
peakNum=peakNum
)
}
return(res)
}
##' dropAnno
##'
##' drop annotation exceeding distanceToTSS_cutoff
##' @title dropAnno
##' @param csAnno output of annotatePeak
##' @param distanceToTSS_cutoff distance to TSS cutoff
##' @return csAnno object
##' @export
##' @author Guangchuang Yu
dropAnno <- function(csAnno, distanceToTSS_cutoff=10000) {
idx <- which(abs(mcols(csAnno@anno)[["distanceToTSS"]]) < distanceToTSS_cutoff)
csAnno@anno <- csAnno@anno[idx]
csAnno@peakNum <- length(idx)
if (csAnno@hasGenomicAnnotation) {
csAnno@annoStat <- getGenomicAnnoStat(csAnno@anno)
csAnno@detailGenomicAnnotation = csAnno@detailGenomicAnnotation[idx,]
}
csAnno
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.