suppressPackageStartupMessages({ library(ChIPpeakAnno) library(biomaRt) library(AnnotationHub) library(EnsDb.Hsapiens.v75) library(TxDb.Hsapiens.UCSC.hg19.knownGene) library(BSgenome.Hsapiens.UCSC.hg19) library(org.Hs.eg.db) library(UpSetR) library(seqinr) library(motifStack) }) knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
library(BiocManager) install("jianhong/workshop2020", build_vignettes = TRUE)
The functions, toGRanges
, annotatePeakInBatch
, and addGeneIDs
in the
ChIPpeakAnno, make the annotation of ChIP-Seq peaks streamlined into
four major steps:
Read peak data with toGRanges
Generate annotation data with toGRanges
Annotate peaks with annotatePeakInBatch
Add additional informations with addGeneIDs
## First, load the ChIPpeakAnno package library(ChIPpeakAnno)
GRanges
with the toGRanges
methodHere we use broadPeak file format as example.
## the sample file is included in ChIPpeakAnno package. ## chage the file path into your own file path to handle your data path <- system.file("extdata", "Tead4.broadPeak", package="ChIPpeakAnno") ## toGRanges is overlaoded method, ## by define the correct file format to import the file in correct coordinates peaks <- toGRanges(path, format="broadPeak") ## see top 2 lines of the imported peaks. ## the imported peaks will be packaged into GRanges object head(peaks, n=2)
toGRanges
methodChIP-seq is a kind of DNA-seq. In the mapping step, no transcriptome is used. In the annotation step, both Ensembl or UCSC annotations can be used. Depend on your research, less complex genome annotations, such as UCSC annotations, are prefearable for reproducible and robust gene/transcripts annotation. To discover and explain unknown biological mechanisms, more comprehensive and complex genome annotations are necessary, such as Ensembl.
Here we show how to prepare annotations based on TxDb package with the
toGRanges
method.
library(TxDb.Hsapiens.UCSC.hg19.knownGene) annoDataTxDb <- toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene) head(annoDataTxDb, n=2)
We can also prepare annotations based on EnsDb package with the toGRanges
method.
library(EnsDb.Hsapiens.v75) ##GRCh37.p13 annoDataEnsDb <- toGRanges(EnsDb.Hsapiens.v75) head(annoDataEnsDb, n=2)
The shortage of offline annotation packages is that it may not contain up-to-date data. We can also use online annotation data via the biomaRt or AnnotationHub packages. However, to use the biomaRt package, you must be very clear that the default mart service is based the newrest genome assembly. For example, if you are annotating the data mapped to hg19/GRCh37 or mm9/NCBIM37 assembly, you must change the default server into achive server. For more details, please refer the documentation of the biomaRt package.
library(biomaRt) ## here we use ensembl GRCh37.p13 (V75) service, ## use listEnsemblArchives() function to list the available host ## use listDatasets() function to list the available dataset mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", host = "feb2014.archive.ensembl.org") ## use the getAnnotation function to obtain the TSS for ensembl GRCh37.p13. annoDataMart <- getAnnotation(mart, featureType = "TSS") annoDataMart[head(names(annoDataEnsDb), n=2)]
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", host = "feb2014.archive.ensembl.org") annoDataMart <- readRDS(system.file("extdata", "ChIPpeakAnno", "annoDataMart.rds", package = "workshop2020", mustWork = TRUE)) annoDataMart[head(names(annoDataEnsDb), n=2)]
Here I show some sample code to get annotations from AnnotationHub.
library(AnnotationHub) ah <- AnnotationHub() hg19 <- AnnotationHub::query(ah, c("UCSC", "Hsapiens", "hg19", "knownGene")) hg19 hg19TxDb <- hg19[["AH52258"]] annoDataHg19<- toGRanges(hg19TxDb) identical(annoDataHg19, annoDataTxDb) ## retrieve the gencode v32 annotation GRCh37 <- AnnotationHub::query(ah, c("GRCh37", "GENCODE", "v32", "TxDb")) annoDataGencode <- toGRanges(GRCh37[["AH75188"]]) head(annoDataGencode, n=2)
annotatePeakInBatch
function.Here we will compare the difference among differnt annotation sources.
First we compare the annotations from ensembl GRCh37.p13 between the EnsDb.Hsapiens.v75 package and ensembl archives.
seqlevelsStyle(annoDataEnsDb) seqlevelsStyle(peaks) ## keep the seqnames in the same style if(!identical(seqlevelsStyle(peaks), seqlevelsStyle(annoDataEnsDb))){ seqlevelsStyle(peaks) <- seqlevelsStyle(annoDataEnsDb)[1] } ## do annotation by nearest TSS of Ensembl GRCh37.p13 annoEnsDb <- annotatePeakInBatch(peaks, AnnotationData=annoDataEnsDb) head(annoEnsDb, n=2) # A pie chart can be used to demonstrate the overlap features of the peaks. pie1(table(annoEnsDb$insideFeature))
## keep the seqnames in the same style if(!identical(seqlevelsStyle(peaks), seqlevelsStyle(annoDataMart))){ seqlevelsStyle(peaks) <- seqlevelsStyle(annoDataMart)[1] } ## do annotation by nearest TSS of Ensembl GRCh37.p13 annoMart <- annotatePeakInBatch(peaks, AnnotationData=annoDataMart) head(annoMart, n=2) # A pie chart can be used to demonstrate the overlap features of the peaks. pie1(table(annoMart$insideFeature))
We can see the minor difference between them. Lets check the reasons.
## check how many annotations are different from each other table(start(annoDataMart[names(annoDataEnsDb)])==start(annoDataEnsDb)) table(end(annoDataMart[names(annoDataEnsDb)])==end(annoDataEnsDb)) ## get the samples head(annoDataMart[names(annoDataEnsDb)][ start(annoDataMart[names(annoDataEnsDb)])!=start(annoDataEnsDb)], n=2) head(annoDataEnsDb[start(annoDataMart[names(annoDataEnsDb)])!= start(annoDataEnsDb)], n=2) ## check the biomart output getBM(attributes = c("ensembl_gene_id", 'chromosome_name', "description"), filters = "ensembl_gene_id", values = c("LRG_198", "LRG_262"), mart = mart)
Here we can see that the EnsDb.Hsapiens.v75 package already fixed the issue of chromosome_name issue, where the biomart online still output the wrong chromosome_name.
Now we annotate the peaks by UCSC hg19 annotations.
## keep the seqnames in the same style if(!identical(seqlevelsStyle(peaks), seqlevelsStyle(annoDataTxDb))){ seqlevelsStyle(peaks) <- seqlevelsStyle(annoDataTxDb)[1] } ## do annotation by nearest TSS of UCSC hg19 annotations annoTxDb <- annotatePeakInBatch(peaks, AnnotationData=annoDataTxDb) head(annoTxDb, n=2) # A pie chart can be used to demonstrate the overlap features of the peaks. pie1(table(annoTxDb$insideFeature))
Now we try to use gencode v32 annotations to annotate the peaks.
## keep the seqnames in the same style if(!identical(seqlevelsStyle(peaks), seqlevelsStyle(annoDataGencode))){ seqlevelsStyle(peaks) <- seqlevelsStyle(annoDataGencode)[1] } ## do annotation by nearest TSS of Gencode annoGencode <- annotatePeakInBatch(peaks, AnnotationData=annoDataGencode) head(annoGencode, n=2) # A pie chart can be used to demonstrate the overlap features of the peaks. pie1(table(annoGencode$insideFeature))
We can see the difference between the annotations among Ensembl, UCSC and Gencode in annotation. Let's check the overlaps of annotations.
Ensembl <- unique(annoDataEnsDb) UCSC <- unique(annoDataTxDb) Gencode <- unique(annoDataGencode) ## make the sequence info to same, otherwise can not compare. genome(seqinfo(Gencode)) <- genome(seqinfo(UCSC)) <- genome(seqinfo(Ensembl)) <- factor("hg19") isCircular(seqinfo(UCSC)) <- rep(FALSE, length(seqlevels(UCSC))) isCircular(seqinfo(Gencode)) <- rep(FALSE, length(seqlevels(Gencode))) seqlevels(Ensembl) <- sub("chrMT", "chrM", seqlevels(Ensembl)) seqlengths(Ensembl)["chrM"] <- seqlengths(UCSC)["chrM"] ## find the overlaps by max gap = 0 bp. ol <- findOverlapsOfPeaks(Ensembl, UCSC, Gencode, ignore.strand = FALSE, connectedPeaks="keepAll") ## venn diagram to show the overlaps makeVennDiagram(ol, connectedPeaks = "keepAll")
addGeneIDs
functionlibrary(org.Hs.eg.db) annoEnsDb <- addGeneIDs(annoEnsDb, orgAnn="org.Hs.eg.db", feature_id_type="ensembl_gene_id", IDs2Add=c("symbol")) head(annoEnsDb, n=2) annoTxDb <- addGeneIDs(annoTxDb, orgAnn="org.Hs.eg.db", feature_id_type="entrez_id", IDs2Add=c("symbol")) head(annoTxDb, n=2) annoGencode$entrez_id <- xget(sub("(ENSG\\d{11}).*$", "\\1", as.character(annoGencode$feature)), org.Hs.egENSEMBL2EG, output = "last") annoGencode$symbol[!is.na(annoGencode$entrez_id)] <- xget(annoGencode$entrez_id[!is.na(annoGencode$entrez_id)], org.Hs.egSYMBOL) head(annoGencode) library(UpSetR) upset(fromList(list(Ensembl=unique(annoEnsDb$symbol), UCSC=unique(annoTxDb$symbol), Gencode=unique(annoGencode$symbol))), order.by = "freq")
As a conclusion, annotate with different annotation resources, even the other parameter keep same, the annotations will be different from each other. To improve the reproducibility, accuracy of an annotation source should be provided. Some people were advertizing their tools by playing this trick by using different annotations to mislead users.
The function findOverlapsOfPeaks
returns an object of overlappingPeaks,
which contains there elements: venn_cnt, peaklist (a list of
overlapping peaks or unique peaks), and overlappingPeaks (a list of data frame
consists of the annotation of all the overlapping peaks).
The following examples illustrate the usage of this method to convert BED and
GFF file to GRanges, add metadata from orignal peaks to the overlap GRanges
using function addMetadata
, and visualize the overlapping using function
makeVennDiagram
.
library(ChIPpeakAnno) bed <- system.file("extdata", "MACS_output.bed", package="ChIPpeakAnno") gr1 <- toGRanges(bed, format="BED", header=FALSE) ## one can also try import from rtracklayer gff <- system.file("extdata", "GFF_peaks.gff", package="ChIPpeakAnno") gr2 <- toGRanges(gff, format="GFF", header=FALSE, skip=3) ## must keep the class exactly same as gr1$score, i.e., numeric. gr2$score <- as.numeric(gr2$score) ol <- findOverlapsOfPeaks(gr1, gr2, connectedPeaks = "keepAll") ## add metadata (mean of score) to the overlapping peaks ol <- addMetadata(ol, colNames="score", FUN=mean) head(ol$peaklist[["gr1///gr2"]], n=2) makeVennDiagram(ol, fill=c("#009E73", "#F0E442"), # circle fill color col=c("#D55E00", "#0072B2"), #circle border color cat.col=c("#D55E00", "#0072B2")) # label color, keep same as circle border color
But if we check the sum of the overlaps, the total number for each sample are not identical to inputed peak number. This can be fixed by set 'connectedPeaks' parameter into 'keepAll'.
length(gr1) length(gr2) 62+166 61+166 makeVennDiagram(ol, fill=c("#009E73", "#F0E442"), col=c("#D55E00", "#0072B2"), cat.col=c("#D55E00", "#0072B2"), connectedPeaks = "keepAll")
The assignChromosomeRegion
function can be used to summarize the distribution
of peaks over different type of features such as exon, intron, enhancer,
proximal promoter, 5' UTR and 3' UTR.
This distribution can be summarized in peak centric or nucleotide centric view
using the function assignChromosomeRegion
.
Please note that one peak might span multiple type of features, leading to the
number of annotated features greater than the total number of input peaks.
At the peak centric view, precedence will dictate the annotation order when
peaks span multiple type of features.
The sample code here plots the distribution of peaks are enriched around the promoters.
overlaps <- ol$peaklist[["gr1///gr2"]] ## get the overlapping peaks ## load TxDb to assign genomic elements library(TxDb.Hsapiens.UCSC.hg19.knownGene) aCR<-assignChromosomeRegion(overlaps, nucleotideLevel=FALSE, precedence=c("Promoters", "immediateDownstream", "fiveUTRs", "threeUTRs", "Exons", "Introns"), TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene) barplot(aCR$percentage, las=3)
In addition, the distribution of the distance of overlapped peaks to the nearest
feature such as the transcription start sites (TSS) can be plotted by the
binOverFeature
function.
The sample code here plots the distribution of peaks around the TSS.
binOverFeature(overlaps, annotationData=annoDataTxDb, radius=5000, nbins=20, FUN=length, errFun=0, ylab="count", main="Distribution of aggregated peak numbers around TSS")
The annotatePeakInBatch
function provide multiple methods to annotate the
peaks and those methods can be set by combination of multiple parameters.
The 'output' is the key parameter to determine the annotation method.
The default method is search the nearest features calculated as
'PeakLoc - FeatureLocForDistance'. For more information, please refer the
documentation of the annotatePeakInBatch
function.
As shown from the distribution of aggregated peak numbers around TSS and
the distribution of peaks in different of chromosome regions,
most of the peaks locate around TSS. Therefore, it is reasonable to use
the annotatePeakInBatch
or annoPeaks
to annotate the peaks to the promoter regions of Hg19/GRCh37 genes.
Promoters can be specified with 'bindingRegion'.
For the following example, promoter region is defined as upstream 2000 and
downstream 500 from TSS (bindingRegion=c(-2000, 500)).
overlaps.anno <- annotatePeakInBatch(overlaps, AnnotationData=annoDataTxDb, output="nearestBiDirectionalPromoters", bindingRegion=c(-2000, 500)) library(org.Hs.eg.db) overlaps.anno <- addGeneIDs(overlaps.anno, "org.Hs.eg.db", feature_id_type="entrez_id", IDs2Add = "symbol") head(overlaps.anno, n=2) library(WriteXLS) WriteXLS(as.data.frame(unname(overlaps.anno)), "anno.xls")
The distribution of the common peaks around features can be visualized using a pie chart.
pie1(table(overlaps.anno$insideFeature))
The following example shows how to use getEnrichedGO
to obtain a list of
enriched GO terms with annotated peaks.
For pathway analysis, please use function getEnrichedPATH
with reactome or
KEGG database. Please note that by default feature_id_type is set as
"ensembl_gene_id".
If you are using TxDb as annotation data, please set it to "entrez_id".
over <- getEnrichedGO(overlaps.anno, orgAnn="org.Hs.eg.db", feature_id_type="entrez_id", maxP=.05, minGOterm=10, multiAdjMethod="BH", condense=TRUE) head(over[["bp"]][, -c(3, 10)], n=2) library(reactome.db) path <- getEnrichedPATH(overlaps.anno, "org.Hs.eg.db", "reactome.db", feature_id_type="entrez_id", maxP=.05) head(path, n=2)
There are multiple methods to get the consensus in the peaks:
output the fastq file by the getAllPeakSequence
function and
search the motif by the 3rd program such as homer, MEME, and so on.
test the pre-defined consensus patterns to see if target consensus are
enriched or not by the summarizePatternInPeaks
function.
calculate the z-scores of all combinations of oligonucleotide in a given
length by Markove chain by the oligoSummary
function.
Here is an example to get the Z-scores for short oligos.
library(seqinr) library(BSgenome.Hsapiens.UCSC.hg19) seq <- getAllPeakSequence(overlaps, upstream=20, downstream=20, genome=Hsapiens) ## output the fasta file for the 3nd program write2FASTA(seq, "test.fa") ## summary of the short oligos os <- oligoSummary(seq, oligoLength=6, MarkovOrder=3, quickMotif=TRUE) ## plot the results zscore <- sort(os$zscore) h <- hist(zscore, breaks=100, main="Histogram of Z-score") text(zscore[length(zscore)], max(h$counts)/10, labels=names(zscore[length(zscore)]), srt=90)
## generate the motifs library(motifStack) pfms <- mapply(function(.ele, id) new("pfm", mat=.ele, name=paste("SAMPLE motif", id)), os$motifs, 1:length(os$motifs)) motifStack(pfms)
Bidirectional promoters are the DNA regions located between TSS of two adjacent genes that are transcribed on opposite directions and often co-regulated by this shared promoter region. Here is an example to find peaks near bi-directional promoters.
bdp <- peaksNearBDP(overlaps, annoDataTxDb, maxgap=5000) c(bdp$percentPeaksWithBDP, bdp$n.peaks, bdp$n.peaksWithBDP) head(bdp$peaksWithBDP, n=2)
Given two or more peak lists from different TFs, one may be interested in finding whether DNA binding profile of those TFs are correlated, and if correlated, what is the common binding pattern.
Here we will show how to compare binding profiles from multiple transcription factors (TFs) by ChIP-seq sample data of TAF, YY1 and Tead4 from mouse.
path <- system.file("extdata", package="ChIPpeakAnno") files <- dir(path, "broadPeak") data <- sapply(file.path(path, files), toGRanges, format="broadPeak") (names(data) <- gsub(".broadPeak", "", files))
When we test the association between two sets of data based on hypergeometric
distribution, the number of all potential binding sites is required.
The parameter totalTest in the makeVennDiagram
function indicates how many
potential peaks in total will be used in the hypergeometric test. It should be
larger than the largest number of peaks in the peak list. The smaller it is
set, the more stringent the test is. The time used to calculate p-value does not
depend on the value of the totalTest.
For practical guidance on how to choose totalTest,
please refer to the post.
The following example makes an assumption that there are 3% of coding region
plus promoter region. Because the sample data is only a subset of chromosome 2,
we estimate that the total binding sites is 1/24 of possible binding region
in the genome.
ol <- findOverlapsOfPeaks(data, connectedPeaks="keepAll") averagePeakWidth <- mean(width(unlist(GRangesList(ol$peaklist)))) tot <- ceiling(3.3e+9 * .03 / averagePeakWidth / 24) makeVennDiagram(ol, totalTest=tot, connectedPeaks="keepAll", fill=c("#CC79A7", "#56B4E9", "#F0E442"), # circle fill color col=c("#D55E00", "#0072B2", "#E69F00"), #circle border color cat.col=c("#D55E00", "#0072B2", "#E69F00"))
## see the difference if we set connectedPeaks to "keepFirstListConsistent" ## set connectedPeaks to keepFirstListConsistent will show consistent total ## number of peaks for the first peak list. makeVennDiagram(ol, totalTest=tot, connectedPeaks="keepFirstListConsistent", fill=c("#CC79A7", "#56B4E9", "#F0E442"), col=c("#D55E00", "#0072B2", "#E69F00"), cat.col=c("#D55E00", "#0072B2", "#E69F00"))
The above hypergeometric test requires users to input an estimate of
the total potential binding sites for a given TF.
To circumvent this requirement, we implemented a permutation test called
peakPermTest
.
Before performing a permutation test, users need to generate random peaks
using the distribution discovered from the input peaks for a given feature type
(transcripts or exons), to make sure the binding positions relative to features,
such as TSS and geneEnd, and the width of the random peaks follow the
distribution of that of the input peaks.
Alternatively, a peak pool representing all potential
binding sites can be created with associated binding probabilities for
random peak sampling using the preparePool
function.
Here is an example to build a peak pool for human genome using the
transcription factor binding site clusters (V3) (see ?wgEncodeTfbsV3
)
downloaded from ENCODE
with the HOT spots (?HOT.spots
) removed.
HOT spots are the genomic regions with high probability of being bound by many
TFs in ChIP-seq experiments.
We suggest remove those HOT spots from the peak lists before performing
permutation test to avoid the overestimation of the association between the
two input peak lists.
Users can also choose to remove ENCODE blacklist for a given species.
The blacklists were constructed by identifying consistently problematic regions
over independent cell lines and types of experiments for each species in the
ENCODE and modENCODE datasets.
Please note that some of the blacklists may need to be converted to the correct
genome assembly using liftover utility.
Following are the sample codes to do the permutation test using the permTest
function:
data(HOT.spots) data(wgEncodeTfbsV3) hotGR <- reduce(unlist(HOT.spots)) removeOl <- function(.ele){ ol <- findOverlaps(.ele, hotGR) if(length(ol)>0) .ele <- .ele[-unique(queryHits(ol))] .ele } TAF <- removeOl(data[["TAF"]]) TEAD4 <- removeOl(data[["Tead4"]]) YY1 <- removeOl(data[["YY1"]]) # we subset the pool to save demo time set.seed(1) wgEncodeTfbsV3.subset <- wgEncodeTfbsV3[sample.int(length(wgEncodeTfbsV3), 2000)] pool <- new("permPool", grs=GRangesList(wgEncodeTfbsV3.subset), N=length(YY1)) pt1 <- peakPermTest(YY1, TEAD4, pool=pool, seed=1, force.parallel=FALSE) plot(pt1)
pt2 <- peakPermTest(YY1, TAF, pool=pool, seed=1, force.parallel=FALSE) plot(pt2)
You can easily visualize and compare the binding patterns of raw signals of
multiple ChIP-Seq experiments using function
featureAlignedHeatmap
and featureAlignedDistribution
.
features <- ol$peaklist[[length(ol$peaklist)]] feature.recentered <- reCenterPeaks(features, width=4000) ## here we also suggest importData function in bioconductor trackViewer package ## to import the coverage. ## compare rtracklayer, it will save you time when handle huge dataset. library(rtracklayer) files <- dir(path, "bigWig") if(.Platform$OS.type != "windows"){ cvglists <- sapply(file.path(path, files), import, format="BigWig", which=feature.recentered, as="RleList") }else{## rtracklayer can not import bigWig files on Windows load(file.path(path, "cvglist.rds")) } names(cvglists) <- gsub(".bigWig", "", files) feature.center <- reCenterPeaks(features, width=1) sig <- featureAlignedSignal(cvglists, feature.center, upstream=2000, downstream=2000) ##Because the bw file is only a subset of the original file, ##the signals are not exists for every peak. keep <- rowSums(sig[[2]]) > 0 sig <- sapply(sig, function(.ele) .ele[keep, ], simplify = FALSE) feature.center <- feature.center[keep] heatmap <- featureAlignedHeatmap(sig, feature.center, upstream=2000, downstream=2000, upper.extreme=c(3,.5,4))
featureAlignedDistribution(sig, feature.center, upstream=2000, downstream=2000, type="l")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.