#' Annotate peaks based on regions of peak
#' @description This function first merge and extend footprint regions, and then
#' integrate R package ChIPseeker to get footprint-related genes
#' @param footprints footprints that overlap with peaks, generated by overlap_footprints_peaks() or
#' intersect function of bedtools
#' @param motif motif file, you can choose our bulit-in motif database of
#' 'mus musculus', 'homo sapiens', 'zebrafish' and 'chicken' by 'motif = Tranfac201803_Mm_MotifTFsF',
#' 'motif = Tranfac201803_Hs_MotifTFsF', 'motif = Tranfac201803_Zf_MotifTFsF',
#' 'motif = Tranfac201803_Ch_MotifTFsF' respectively, or you can upload your own motif data base,
#' but the formata use be the same as our built-in motif database.
#' @param Species character, indicating the species of data which is used to
#' choose annodb when annotate peak.
#' @param txdb TxDb object contained transcript-related features of a particular
#' genome. Bioconductor provides several package that containing TxDb object of
#' model organisms with multiple commonly used genome version, for instance
#' TxDb.Hsapiens.UCSC.hg38.knownGene, TxDb.Hsapiens.UCSC.hg19.knownGene for
#' human genome hg38 and hg19, TxDb.Mmusculus.UCSC.mm10.knownGene and TxDb.Mmusculus.UCSC.mm9.knownGene
#' for mouse genome mm10 and mm9, etc.
#' @param tssRegion Region Range of TSS
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
#' @importFrom ChIPseeker annotatePeak
#' @return return a list, first element is bed format datafrmae, second element
#' is annotated footprints dataframe
#' @export
#'
#' @examples #txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene::TxDb.Mmusculus.UCSC.mm10.knownGene
#' load(system.file("extdata", "combined.rda", package = "IReNA"))
#' load(system.file("extdata", "test_peak.rda", package = "IReNA"))
#' peak_bed <- get_bed(test_peak)
#' overlapped <- overlap_footprints_peaks(combined, peak_bed)
#' #list1 <- get_related_genes(overlapped,txdb = txdb,motif=Tranfac201803_Mm_MotifTFsF,Species = 'Mm')
get_related_genes <- function(footprints, motif, Species, txdb, tssRegion = c(-3000, 3000)) {
validInput(footprints,'footprints','df')
validInput(motif,'motif','df')
validInput(Species,'Species','character')
validInput(txdb,'txdb','txdb')
validInput(tssRegion,'tssRegion','vector')
footprintslist <- merge_extent_footprints(footprints, motif)
merged_footprints <- footprintslist[[2]]
if (Species == "Hs") {
annodb <- "org.Hs.eg.db"
} else if (Species == "Mm") {
annodb <- "org.Mm.eg.db"
} else if (Species == "Zf") {
annodb <- "org.Dr.eg.db"
} else if (Species == "Ch") {
annodb <- "org.Gg.eg.db"
}
reference_GRange <- GenomicRanges::GRanges(seqnames = merged_footprints[,1],
IRanges::IRanges(start = as.numeric(merged_footprints[,2]),
end = as.numeric(merged_footprints[,3])),
strand = merged_footprints[,4])
peakAnno <- ChIPseeker::annotatePeak(reference_GRange,
tssRegion = tssRegion,
TxDb = txdb, annoDb = annodb
)
region <- peakAnno@anno@elementMetadata$annotation
gene <- peakAnno@anno@elementMetadata$ENSEMBL
start1 <- peakAnno@anno@ranges@start
merged_footprints2 <- merged_footprints[merged_footprints$V2 %in% start1, ]
exon1 <- grep('exon',region)
Intron1 <- grep('Intron',region)
Intergenic1 <- grep('Intergenic',region)
Downstream1 <- grep('Downstream',region)
Promoter1 <- grep('Promoter',region)
UTR3 <- grep("3' UTR",region)
UTR5 <- grep("5' UTR",region)
region2 <- rep(NA,length(region))
region2[exon1]='Exon'
region2[Intron1]='Intron'
region2[Downstream1]='Downstream'
region2[Promoter1]='Promoter'
region2[UTR3]="3' UTR"
region2[UTR5]="5' UTR"
region2[Intergenic1]='Intergenic'
table(region2)
peak_region1 <- paste(as.character(peakAnno@anno@seqnames),
as.character(peakAnno@anno@ranges),sep = ':')
peak_region2 <- paste0(merged_footprints[,1],':'
,merged_footprints[,2],'-',merged_footprints[,3])
merged_footprints2 <- merged_footprints[peak_region2%in%peak_region1,]
merged_footprints2$gene <- gene
merged_footprints2$region <- region2
merged_footprints2 <- merged_footprints2[, c(9, 8, 1:7)]
colnames(merged_footprints2) <- c(paste0("V", 1:9))
footprintslist[[2]] <- merged_footprints2
footprintslist[[1]] <- footprintslist[[1]][peak_region2%in%peak_region1,]
return(footprintslist)
}
merge_extent_footprints <- function(file1, motif1) {
file1 <- file1[file1[,7] %in% motif1$Accession,]
peak_region <- paste(as.character(file1[,1]), file1[,2],
file1[,3], file1[,4], sep = "\t")
file1$peak_region <- peak_region
file1$tf <- motif1[match(file1[,7],motif1[,1]),5]
group_peak <- dplyr::group_by(file1,peak_region)
group_peak <- group_peak[order(group_peak$peak_region),]
group_peak2 <-dplyr::group_map(group_peak,~merge_group(.x))
group_peak3 <- as.data.frame(group_peak[!duplicated(group_peak$peak_region),])
num1 <- as.integer((as.numeric(group_peak3[,2]) + as.numeric(group_peak3[,3])) / 2)
size1 <- as.numeric(group_peak3[,3]) - as.numeric(group_peak3[,2]) + 1
num11 <- as.integer(num1 - (size1 * 5))
num12 <- as.integer(num1 + (size1 * 5))
peak_index = paste0(group_peak3[,8],':',group_peak3[,9],'-',group_peak3[,10])
Candid <- group_peak3[,1:4]
Candid <- dplyr::mutate(Candid, V5 = num11 ,V6=num12,V7=unlist(group_peak2))
bed <- Candid[,c(1,5,6)]
list1 <- list(bed,Candid)
return(list1)
}
merge_group <- function(data1){
data1 <- as.data.frame(data1)
motif <- as.character(data1[,7])
related_gene <- as.character(data1[,11])
mg <- paste(motif,related_gene,sep = ';',collapse = '|')
return(mg)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.