R/annotation.R

Defines functions merge_group merge_extent_footprints get_related_genes

Documented in get_related_genes

#' Annotate peaks based on regions of peak
#' @description This function first merge and extend footprint regions, and then
#'  integrate R package ChIPseeker to get footprint-related genes
#' @param footprints footprints that overlap with peaks, generated by overlap_footprints_peaks() or
#'  intersect function of bedtools
#' @param motif motif file, you can choose our bulit-in motif database of
#' 'mus musculus', 'homo sapiens', 'zebrafish' and 'chicken' by 'motif = Tranfac201803_Mm_MotifTFsF',
#' 'motif = Tranfac201803_Hs_MotifTFsF', 'motif = Tranfac201803_Zf_MotifTFsF',
#' 'motif = Tranfac201803_Ch_MotifTFsF' respectively, or you can upload your own motif data base,
#'  but the formata use be the same as our built-in motif database.
#' @param Species character, indicating the species of data which is used to
#' choose annodb when annotate peak.
#' @param txdb 	TxDb object contained transcript-related features of a particular
#' genome. Bioconductor provides several package that containing TxDb object of
#' model organisms with multiple commonly used genome version, for instance
#' TxDb.Hsapiens.UCSC.hg38.knownGene, TxDb.Hsapiens.UCSC.hg19.knownGene for
#' human genome hg38 and hg19, TxDb.Mmusculus.UCSC.mm10.knownGene and TxDb.Mmusculus.UCSC.mm9.knownGene
#' for mouse genome mm10 and mm9, etc.
#' @param tssRegion Region Range of TSS
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
#' @importFrom ChIPseeker annotatePeak
#' @return return a list, first element is bed format datafrmae, second element
#' is annotated footprints dataframe
#' @export
#'
#' @examples #txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene::TxDb.Mmusculus.UCSC.mm10.knownGene
#' load(system.file("extdata", "combined.rda", package = "IReNA"))
#' load(system.file("extdata", "test_peak.rda", package = "IReNA"))
#' peak_bed <- get_bed(test_peak)
#' overlapped <- overlap_footprints_peaks(combined, peak_bed)
#' #list1 <- get_related_genes(overlapped,txdb = txdb,motif=Tranfac201803_Mm_MotifTFsF,Species = 'Mm')
get_related_genes <- function(footprints, motif, Species, txdb, tssRegion = c(-3000, 3000)) {
  validInput(footprints,'footprints','df')
  validInput(motif,'motif','df')
  validInput(Species,'Species','character')
  validInput(txdb,'txdb','txdb')
  validInput(tssRegion,'tssRegion','vector')
  footprintslist <- merge_extent_footprints(footprints, motif)
  merged_footprints <- footprintslist[[2]]
  if (Species == "Hs") {
    annodb <- "org.Hs.eg.db"
  } else if (Species == "Mm") {
    annodb <- "org.Mm.eg.db"
  } else if (Species == "Zf") {
    annodb <- "org.Dr.eg.db"
  } else if (Species == "Ch") {
    annodb <- "org.Gg.eg.db"
  }
  reference_GRange <- GenomicRanges::GRanges(seqnames = merged_footprints[,1],
                                             IRanges::IRanges(start = as.numeric(merged_footprints[,2]),
                                                              end = as.numeric(merged_footprints[,3])),
                                             strand = merged_footprints[,4])
  peakAnno <- ChIPseeker::annotatePeak(reference_GRange,
    tssRegion = tssRegion,
    TxDb = txdb, annoDb = annodb
  )
  region <- peakAnno@anno@elementMetadata$annotation
  gene <- peakAnno@anno@elementMetadata$ENSEMBL
  start1 <- peakAnno@anno@ranges@start
  merged_footprints2 <- merged_footprints[merged_footprints$V2 %in% start1, ]
  exon1 <- grep('exon',region)
  Intron1 <- grep('Intron',region)
  Intergenic1 <- grep('Intergenic',region)
  Downstream1 <- grep('Downstream',region)
  Promoter1 <- grep('Promoter',region)
  UTR3 <- grep("3' UTR",region)
  UTR5 <- grep("5' UTR",region)
  region2 <- rep(NA,length(region))
  region2[exon1]='Exon'
  region2[Intron1]='Intron'
  region2[Downstream1]='Downstream'
  region2[Promoter1]='Promoter'
  region2[UTR3]="3' UTR"
  region2[UTR5]="5' UTR"
  region2[Intergenic1]='Intergenic'
  table(region2)
  peak_region1 <- paste(as.character(peakAnno@anno@seqnames),
                        as.character(peakAnno@anno@ranges),sep = ':')
  peak_region2 <- paste0(merged_footprints[,1],':'
                         ,merged_footprints[,2],'-',merged_footprints[,3])
  merged_footprints2 <- merged_footprints[peak_region2%in%peak_region1,]
  merged_footprints2$gene <- gene
  merged_footprints2$region <- region2
  merged_footprints2 <- merged_footprints2[, c(9, 8, 1:7)]
  colnames(merged_footprints2) <- c(paste0("V", 1:9))
  footprintslist[[2]] <- merged_footprints2
  footprintslist[[1]] <- footprintslist[[1]][peak_region2%in%peak_region1,]
  return(footprintslist)
}



merge_extent_footprints <- function(file1, motif1) {
  file1 <- file1[file1[,7] %in% motif1$Accession,]
  peak_region <- paste(as.character(file1[,1]), file1[,2],
                       file1[,3], file1[,4], sep = "\t")
  file1$peak_region <- peak_region
  file1$tf <- motif1[match(file1[,7],motif1[,1]),5]
  group_peak <- dplyr::group_by(file1,peak_region)
  group_peak <- group_peak[order(group_peak$peak_region),]
  group_peak2 <-dplyr::group_map(group_peak,~merge_group(.x))
  group_peak3 <- as.data.frame(group_peak[!duplicated(group_peak$peak_region),])
  num1 <- as.integer((as.numeric(group_peak3[,2]) + as.numeric(group_peak3[,3])) / 2)
  size1 <- as.numeric(group_peak3[,3]) - as.numeric(group_peak3[,2]) + 1
  num11 <- as.integer(num1 - (size1 * 5))
  num12 <- as.integer(num1 + (size1 * 5))
  peak_index = paste0(group_peak3[,8],':',group_peak3[,9],'-',group_peak3[,10])
  Candid <- group_peak3[,1:4]
  Candid <- dplyr::mutate(Candid, V5 = num11 ,V6=num12,V7=unlist(group_peak2))
  bed <- Candid[,c(1,5,6)]
  list1 <- list(bed,Candid)
  return(list1)
}



merge_group <- function(data1){
  data1 <- as.data.frame(data1)
  motif <- as.character(data1[,7])
  related_gene <- as.character(data1[,11])
  mg <- paste(motif,related_gene,sep = ';',collapse = '|')
  return(mg)
}
jiang-junyao/IReNA documentation built on Nov. 4, 2024, 8:29 p.m.