R/annotatePeakInBatch.R

#' Obtain the distance to the nearest TSS, miRNA, and/or exon for a list of 
#' peaks
#' 
#' Obtain the distance to the nearest TSS, miRNA, exon et al for a list of peak 
#' locations leveraging IRanges and biomaRt package
#' 
#' 
#' @param myPeakList A \link[GenomicRanges:GRanges-class]{GRanges} object
#' @param mart A mart object, used if AnnotationData is not supplied, see
#' useMart of bioMaRt package for details
#' @param featureType A charcter vector used with mart argument if
#' AnnotationData is not supplied; choose from "TSS", "miRNA" or "Exon"
#' @param AnnotationData A \link[GenomicRanges:GRanges-class]{GRanges} or
#' \link{annoGR} object. It can be obtained from the function getAnnotation or
#' customized annotation of class GRanges containing additional variable:
#' strand (1 or + for plus strand and -1 or - for minus strand). Pre-compliled
#' annotations, such as TSS.human.NCBI36, TSS.mouse.NCBIM37, TSS.rat.RGSC3.4
#' and TSS.zebrafish.Zv8, are provided by this package (attach them with data()
#' function). Another method to provide annotation data is to obtain through
#' biomaRt in real time by using the mart and featureType option
#' @param output \describe{ 
#' \item{nearestLocation (default)}{will output the nearest features calculated 
#' as PeakLoc - FeatureLocForDistance; when selected, the output can consist of 
#' both "strictly nearest features (non-overlapping)" and "overlapping features"
#' as long as they are the nearest}
#' \item{overlapping}{will output overlapping features with maximum gap
#' specified as maxgap between peak range and feature range; it is possible for 
#' a peak to be annotated with zero ("NA" will be returned) or multiple 
#' overlapping features if exist}
#' \item{both}{will output all the nearest features as well as any features
#' that overlap with the peak that is not the nearest}
#' \item{shortestDistance}{will output the features with the shortest distance;
#' the "shortest distance" is determined from either ends of the feature to 
#' either ends of the peak} 
#' \item{upstream&inside}{will output all upstream and overlapping features with 
#' maximum gap} 
#' \item{inside&downstream}{will output all downstream and overlapping features 
#' with maximum gap}
#' \item{upstream}{will output all upstream features with maximum gap} 
#' \item{downstream}{will output all downstream features with maximum gap} 
#' \item{upstreamORdownstream}{will output all upstream features with maximum 
#' gap or downstream with maximum gap} 
#' \item{nearestBiDirectionalPromoters}{will use \link{annoPeaks} to
#' annotate peaks. Nearest promoters from both direction of the peaks (strand
#' is considered). It will report bidirectional promoters if there are
#' promoters in both directions in the given region (defined by bindingRegion).
#' Otherwise, it will report the closest promoter in one direction.} }
#' @param multiple Not applicable when output is nearest. TRUE: output multiple
#' overlapping features for each peak. FALSE: output at most one overlapping
#' feature for each peak. This parameter is kept for backward compatibility,
#' please use select.
#' @param maxgap The maximum \emph{gap} that is allowed between 2 ranges for
#' the ranges to be considered as overlapping. The \emph{gap} between 2 ranges
#' is the number of positions that separate them. The \emph{gap} between 2
#' adjacent ranges is 0. By convention when one range has its start or end
#' strictly inside the other (i.e. non-disjoint ranges), the \emph{gap} is
#' considered to be -1.
#' @param PeakLocForDistance Specify the location of peak for calculating
#' distance,i.e., middle means using middle of the peak to calculate distance
#' to feature, start means using start of the peak to calculate the distance to
#' feature, endMinusStart means using the end of the peak to calculate the 
#' distance to features on plus strand and the start of the peak to calculate 
#' the distance to features on minus strand. To be compatible with previous 
#' version, by default using start
#' @param FeatureLocForDistance Specify the location of feature for calculating
#' distance,i.e., middle means using middle of the feature to calculate
#' distance of peak to feature, start means using start of the feature to
#' calculate the distance to feature, TSS means using start of feature when
#' feature is on plus strand and using end of feature when feature is on minus
#' strand, geneEnd means using end of feature when feature is on plus strand
#' and using start of feature when feature is on minus strand. To be compatible
#' with previous version, by default using TSS
#' @param select "all" may return multiple overlapping peaks, "first" will
#' return the first overlapping peak, "last" will return the last overlapping
#' peak and "arbitrary" will return one of the overlapping peaks.
#' @param ignore.strand When set to TRUE, the strand information is ignored in
#' the annotation. Unless you have stranded peaks and you are interested in 
#' annotating peaks to the features in the same strand only, you should just 
#' use the default setting ignore.strand = TRUE.
#' @param bindingRegion Annotation range used for \link{annoPeaks}, which is a
#' vector with two integer values, default to c (-5000, 5000). The first one
#' must be no bigger than 0. And the sec ond one must be no less than 1. Once
#' bindingRegion is defined, annotation will based on \link{annoPeaks}. Here is
#' how to use it together with the parameter output and FeatureLocForDistance.
#' \itemize{ \item To obtain peaks with nearest bi-directional promoters within
#' 5kb upstream and 3kb downstream of TSS, set output =
#' "nearestBiDirectionalPromoters" and bindingRegion = c(-5000, 3000) \item To
#' obtain peaks within 5kb upstream and up to 3kb downstream of TSS within the
#' gene body, set output="overlapping", FeatureLocForDistance="TSS" and
#' bindingRegion = c(-5000, 3000) \item To obtain peaks up to 5kb upstream
#' within the gene body and 3kb downstream of gene/Exon End, set
#' output="overlapping", FeatureLocForDistance="geneEnd" and bindingRegion =
#' c(-5000, 3000) \item To obtain peaks from 5kb upstream to 3kb downstream of
#' genes/Exons, set output="overlapping", bindingType = "fullRange" and
#' bindingRegion = c(-5000, 3000) } For details, see \link{annoPeaks}.
#' @param ... Parameters could be passed to \link{annoPeaks}
#' @return An object of \link[GenomicRanges:GRanges-class]{GRanges} with slot
#' start holding the start position of the peak, slot end holding the end
#' position of the peak, slot space holding the chromosome location where the
#' peak is located, slot rownames holding the id of the peak. In addition, the
#' following variables are included.  \item{list("feature")}{id of the feature
#' such as ensembl gene ID} \item{list("insideFeature")}{upstream: peak resides
#' upstream of the feature; downstream: peak resides downstream of the feature;
#' inside: peak resides inside the feature; overlapStart: peak overlaps with
#' the start of the feature; overlapEnd: peak overlaps with the end of the
#' feature; includeFeature: peak include the feature entirely}
#' \item{list("distancetoFeature")}{distance to the nearest feature such as
#' transcription start site.  By default, the distance is calculated as the
#' distance between the start of the binding site and the TSS that is the gene
#' start for genes located on the forward strand and the gene end for genes
#' located on the reverse strand. The user can specify the location of peak and
#' location of feature for calculating this}
#' \item{list("start_position")}{start position of the feature such as gene}
#' \item{list("end_position")}{end position of the feature such as the gene}
#' \item{list("strand")}{1 or + for positive strand and -1 or - for negative
#' strand where the feature is located} \item{list("shortestDistance")}{The
#' shortest distance from either end of peak to either end the feature.  }
#' \item{list("fromOverlappingOrNearest")}{Relevant only when output is set to 
#' "both". If "nearestLocation": indicates this feature's start (feature's end 
#' for features from minus strand) is the closest to the peak start ("strictly 
#' nearest" or "nearest overlapping"); if "Overlapping": indicates this feature 
#' overlaps with this peak although it is not the nearest (non-nearest 
#' overlapping) }
#' @author Lihua Julie Zhu, Jianhong Ou
#' @seealso \link{getAnnotation}, \link{findOverlappingPeaks},
#' \link{makeVennDiagram}, \link{addGeneIDs}, \link{peaksNearBDP},
#' \link{summarizePatternInPeaks}, \link{annoGR}, \link{annoPeaks}
#' @references 1. Zhu L.J. et al. (2010) ChIPpeakAnno: a Bioconductor package
#' to annotate ChIP-seq and ChIP-chip data. BMC Bioinformatics 2010,
#' 11:237doi:10.1186/1471-2105-11-237
#' 
#' 2. Zhu L (2013). "Integrative analysis of ChIP-chip and ChIP-seq dataset."
#' In Lee T and Luk ACS (eds.), Tilling Arrays, volume 1067, chapter 4, pp.
#' -19.  Humana Press. http://dx.doi.org/10.1007/978-1-62703-607-8_8
#' @keywords misc
#' @export
#' @import IRanges
#' @import GenomicRanges
#' @importFrom GenomeInfoDb seqlevels
#' @importFrom BiocGenerics start end width strand
#' @importFrom S4Vectors mcols
#' @importFrom dplyr %>% filter group_by top_n
#' @examples
#' 
#' 
#'     ## example 1: annotate myPeakList by TxDb or EnsDb.
#'     data(myPeakList)
#'     library(ensembldb)
#'     library(EnsDb.Hsapiens.v75)
#'     annoData <- annoGR(EnsDb.Hsapiens.v75)
#'     annotatePeak = annotatePeakInBatch(myPeakList[1:6], AnnotationData=annoData)
#'     annotatePeak
#'     
#'     ## example 2: annotate myPeakList (GRanges) 
#'     ## with TSS.human.NCBI36 (Granges)
#'     data(TSS.human.NCBI36)
#'     annotatedPeak = annotatePeakInBatch(myPeakList[1:6], 
#'                                         AnnotationData=TSS.human.NCBI36)
#'     annotatedPeak
#'     
#'     ## example 3: you have a list of transcription factor biding sites from 
#'     ## literature and are interested in determining the extent of the overlap 
#'     ## to the list of peaks from your experiment. Prior calling the function 
#'     ## annotatePeakInBatch, need to represent both dataset as GRanges 
#'     ## where start is the start of the binding site, end is the end of the 
#'     ## binding site, names is the name of the binding site, space and strand 
#'     ## are the chromosome name and strand where the binding site is located.
#'     
#'     myexp <- GRanges(seqnames=c(6,6,6,6,5,4,4), 
#'                      IRanges(start=c(1543200,1557200,1563000,1569800,
#'                                      167889600,100,1000),
#'                              end=c(1555199,1560599,1565199,1573799,
#'                                    167893599,200,1200),
#'                              names=c("p1","p2","p3","p4","p5","p6", "p7")), 
#'                      strand="+")
#'     literature <- GRanges(seqnames=c(6,6,6,6,5,4,4), 
#'                           IRanges(start=c(1549800,1554400,1565000,1569400,
#'                                           167888600,120,800),
#'                                   end=c(1550599,1560799,1565399,1571199,
#'                                         167888999,140,1400),
#'                                   names=c("f1","f2","f3","f4","f5","f6","f7")),
#'                           strand=rep(c("+", "-"), c(5, 2)))
#'     annotatedPeak1 <- annotatePeakInBatch(myexp, 
#'                                           AnnotationData=literature)
#'     pie(table(annotatedPeak1$insideFeature))
#'     annotatedPeak1
#'     ### use toGRanges or rtracklayer::import to convert BED or GFF format
#'     ###  to GRanges before calling annotatePeakInBatch
#'     test.bed <- data.frame(space=c("4", "6"), 
#'                            start=c("100", "1000"),
#'                            end=c("200", "1100"), 
#'                            name=c("peak1", "peak2"))
#'     test.GR = toGRanges(test.bed)
#'     annotatePeakInBatch(test.GR, AnnotationData = literature)
#'  
#'  library(testthat)
#'   peak <- GRanges(seqnames = "chr1", 
#'                   IRanges(start = 24736757, end=24737528,
#'                           names = "testPeak"))
#'   data(TSS.human.GRCh37)
#'   TSS.human.GRCh37[names(TSS.human.GRCh37)== "ENSG00000001461"]
#'   # GRanges object with 1 range and 1 metadata column:
#'   # seqnames            ranges strand |            description
#'   #<Rle>         <IRanges>  <Rle> |            <character>
#'   # ENSG00000001461        1 24742285-24799466      + | NIPA-like domain con..
#'   peak
#'   #GRanges object with 1 range and 0 metadata columns:
#'   #   seqnames            ranges strand
#'   #<Rle>         <IRanges>  <Rle>
#'   #  testPeak     chr1 24736757-24737528      *
#'   TSS.human.GRCh37[names(TSS.human.GRCh37)== "ENSG00000001460"]
#'   #GRanges object with 1 range and 1 metadata column:
#'   #   seqnames            ranges strand |            description
#'   #<Rle>         <IRanges>  <Rle> |            <character>
#'   #   ENSG00000001460        1 24683490-24743424      - | UPF0490 protein C1or..
#'   ap <- annotatePeakInBatch(peak, Annotation=TSS.human.GRCh37, 
#'                             PeakLocForDistance = "start")
#'   stopifnot(ap$feature=="ENSG00000001461")
#'   ap <- annotatePeakInBatch(peak, Annotation=TSS.human.GRCh37,
#'                             PeakLocForDistance = "end")
#'   stopifnot(ap$feature=="ENSG00000001461")
#'   ap <- annotatePeakInBatch(peak, Annotation=TSS.human.GRCh37,
#'                             PeakLocForDistance = "middle")
#'   stopifnot(ap$feature=="ENSG00000001461")
#'   ap <- annotatePeakInBatch(peak, Annotation=TSS.human.GRCh37,
#'                             PeakLocForDistance = "endMinusStart")
#'   stopifnot(ap$feature=="ENSG00000001461")
#'   ## Let's calculate the distances between the peak and the TSS of the genes
#'   ## in the annotation file used for annotating the peaks.
#'   ## Please note that we need to compute the distance using the annotation
#'   ## file TSS.human.GRCh37.
#'   ## If you would like to use  TxDb.Hsapiens.UCSC.hg19.knownGene, 
#'   ## then you will need to annotate the peaks
#'   ## using TxDb.Hsapiens.UCSC.hg19.knownGene as well.
#'   ### using start
#'   start(peak) -start(TSS.human.GRCh37[names(TSS.human.GRCh37)== 
#'                                       "ENSG00000001461"]) #picked
#'   #[1] -5528
#'   start(peak) -end(TSS.human.GRCh37[names(TSS.human.GRCh37)==
#'                                    "ENSG00000001460"])
#'   #[1] -6667
#'   #### using middle
#'   (start(peak) + end(peak))/2 -
#'       start(TSS.human.GRCh37[names(TSS.human.GRCh37)== "ENSG00000001461"])
#'   #[1] -5142.5
#'   (start(peak) + end(peak))/2 -
#'       end(TSS.human.GRCh37[names(TSS.human.GRCh37)== "ENSG00000001460"])
#'   # [1] 49480566
#'   end(peak) -start(TSS.human.GRCh37[names(TSS.human.GRCh37)==
#'                                    "ENSG00000001461"]) #picked
#'   # [1] -4757
#'   end(peak) -end(TSS.human.GRCh37[names(TSS.human.GRCh37)==
#'                                  "ENSG00000001460"])
#'   # [1] -5896
#'   #### using endMinusStart
#'   end(peak) - start(TSS.human.GRCh37[names(TSS.human.GRCh37)==
#'                                     "ENSG00000001461"]) ## picked
#'   # [1] -4575
#'   start(peak) -end(TSS.human.GRCh37[names(TSS.human.GRCh37)==
#'                                     "ENSG00000001460"])
#'   #[1] -6667
#'   ###### using txdb object to annotate the peaks
#'   library(org.Hs.eg.db)
#'   select(org.Hs.eg.db, key="STPG1", keytype="SYMBOL",
#'          columns=c("ENSEMBL", "ENTREZID", "SYMBOL"))
#'   #  SYMBOL         ENSEMBL ENTREZID
#'   #  STPG1 ENSG00000001460    90529
#'   select(org.Hs.eg.db, key= "ENSG00000001461", keytype="ENSEMBL",
#'          columns=c("ENSEMBL", "ENTREZID", "SYMBOL"))
#'   #ENSEMBL ENTREZID SYMBOL
#'   # ENSG00000001461    57185 NIPAL3
#'   require(TxDb.Hsapiens.UCSC.hg19.knownGene)
#'   txdb.ann <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene)
#'   STPG1 <- select(org.Hs.eg.db, key="STPG1", keytype="SYMBOL",
#'                   columns=c( "SYMBOL", "ENSEMBL", "ENTREZID"))[1,3]
#'   NIPAL3 <- select(org.Hs.eg.db, key="NIPAL3", keytype="SYMBOL",
#'                    columns=c( "SYMBOL", "ENSEMBL", "ENTREZID"))[1,3]
#'   ap <- annotatePeakInBatch(peak, Annotation=txdb.ann,
#'                             PeakLocForDistance = "start")
#'   expect_equal(ap$feature, STPG1)
#'   ap <- annotatePeakInBatch(peak, Annotation=txdb.ann,
#'                             PeakLocForDistance = "end")
#'   expect_equal(ap$feature, STPG1)
#'   ap <- annotatePeakInBatch(peak, Annotation=txdb.ann,
#'                             PeakLocForDistance = "middle")
#'   expect_equal(ap$feature, STPG1)
#'   ap <- annotatePeakInBatch(peak, Annotation=txdb.ann,
#'                             PeakLocForDistance = "endMinusStart")
#'   expect_equal(ap$feature, NIPAL3)
#'   txdb.ann[NIPAL3]
#'   txdb.ann[txdb.ann$gene_id == NIPAL3]
#'   #  GRanges object with 1 range and 1 metadata column:
#'   #    seqnames            ranges strand |     gene_id
#'   #  <Rle>         <IRanges>  <Rle> | <character>
#'   #   57185     chr1 24742245-24799473      + |       57185
#'   #-------
#'   txdb.ann[txdb.ann$gene_id == STPG1]
#'   #   GRanges object with 1 range and 1 metadata column:
#'   #     seqnames            ranges strand |     gene_id
#'   #  <Rle>         <IRanges>  <Rle> | <character>
#'   #     90529     chr1 24683489-24741587      - |       90529
#' 
annotatePeakInBatch <-
    function (myPeakList, mart, featureType = c("TSS", "miRNA", "Exon"),
              AnnotationData, 
              output = c("nearestLocation", "overlapping", "both", 
                         "shortestDistance", "inside",
                         "upstream&inside", "inside&downstream",
                         "upstream", "downstream", 
                         "upstreamORdownstream", 
                         "nearestBiDirectionalPromoters"),
              multiple = c(TRUE,FALSE), maxgap = -1L,
              PeakLocForDistance=c("start","middle","end", "endMinusStart"),
              FeatureLocForDistance=c("TSS","middle","start","end", "geneEnd"),
              select=c("all", "first", "last", "arbitrary"),
              ignore.strand=TRUE, bindingRegion=NULL, ...)
    {
        if(output[1]=="nearestStart") output <- "nearestLocation"
        featureType = match.arg(featureType)
        PeakLocForDistance = match.arg(PeakLocForDistance)
        FeatureLocForDistance = match.arg(FeatureLocForDistance)
        output = match.arg(output)
        select = match.arg(select)
        multiple = multiple[1]
        if (PeakLocForDistance == "endMinusStart")
        {
            if (missing(AnnotationData) || length(AnnotationData) == 0)
             {
              if (missing(mart)) {
                stop("Error in querying biomart database.
                     No valid mart object is passed in!
                     Suggest call getAnnotation before calling
                     annotatePeakInBatch")
              }
              else if(!is(mart, "Mart")){
                if(is(mart, "GRanges")){
                    warning("AnnotationData is missing, and a GRanges is passed
                            to mart parameter. Trying to annotate peaks by
                            the GRanges you input.")
                    AnnotationData <- mart
                    mart <- NULL
                } else {
                    stop("Error in querying biomart database.
                     No valid mart object is passed in!
                     Suggest call getAnnotation before calling
                     annotatePeakInBatch")
                }
            }
            else {
                AnnotationData <- getAnnotation(mart, featureType = featureType)
            }
          } 
          if (!length(names(AnnotationData)) || any(is.na(names(AnnotationData))))
          {
             names(AnnotationData) <- paste0("ann", seq_along(AnnotationData))
          }
          plusAnno = AnnotationData[strand(AnnotationData) == "+"]
          minusAnno = AnnotationData[strand(AnnotationData) == "-"]
          if (length(plusAnno) > 0) 
             r.plus <- annotatePeakInBatch(myPeakList, 
                    AnnotationData =  plusAnno,
                    featureType = featureType,
                    PeakLocForDistance = "end",
                    output = output,
                    multiple = multiple,
                    select = select,
                    maxgap = maxgap,
                    FeatureLocForDistance = FeatureLocForDistance,
                    ignore.strand = TRUE,
                    bindingRegion = bindingRegion, ...)
          if (length(minusAnno) > 0)  
              r.minus <- annotatePeakInBatch(myPeakList, 
                    AnnotationData =  minusAnno,
                    featureType = featureType,
                    PeakLocForDistance = "start",
                    output = output,
                    multiple = multiple,
                    select = select,
                    maxgap = maxgap,
                    FeatureLocForDistance = FeatureLocForDistance,
                    ignore.strand = TRUE,
                    bindingRegion = bindingRegion, ...)
         if (length(plusAnno) > 0 & length(minusAnno) == 0) {
             return(r.plus)
         }
         else if (length(plusAnno) == 0 & length(minusAnno) > 0) {
	     return(r.minus)
         }
         else if (length(plusAnno) > 0 & length(minusAnno) > 0) {
             r.both  <- c(r.plus, r.minus) 
             r.both <- r.both[!is.na(r.both$fromOverlappingOrNearest)]
             r.both.df <- cbind(uid = names(r.both), as.data.frame(r.both))
             r.both.df %>%
                filter(fromOverlappingOrNearest == "NearestLocation") %>%        
    		group_by(peak) %>%
                top_n(-1, abs(distancetoFeature)) -> r.n.df
	     if (nrow(r.n.df) == 0)
		return(r.both)
             else if (length(r.both[r.both$fromOverlappingOrNearest != "NearestLocation"]) == 0)
		return(r.both[names(r.both) %in% r.n.df$uid & !is.na(r.both$feature)]) 
             else
         	return(r.both[(names(r.both) %in% r.n.df$uid |
		    r.both$fromOverlappingOrNearest != "NearestLocation") & !is.na(r.both$feature)])
          }
       } ### end of if endMinusStart
       if(output[1]=="nearestStart") output <- "nearestLocation"
           featureType = match.arg(featureType)
           PeakLocForDistance = match.arg(PeakLocForDistance)
           FeatureLocForDistance = match.arg(FeatureLocForDistance)
           output = match.arg(output)
           select = match.arg(select)
           multiple = multiple[1]

      if ((output == "overlapping" || output == "both")
            && select =="all" && multiple==FALSE) {
            warning("Please use select instead of multiple!")
            select = "first"
      }
      if(output=="upstream&inside"){
            if(FeatureLocForDistance!="TSS") {
                FeatureLocForDistance <- "TSS"
                warning("FeatureLocForDistance is set to TSS")
            }
            select <- "all"
        }
      if(output=="inside&downstream"){
            if(FeatureLocForDistance!="geneEnd"){
                FeatureLocForDistance <- "geneEnd"
                warning("FeatureLocForDistance is set to geneEnd")
            }
            select <- "all"
      }
      if (missing(myPeakList)) stop("Missing required argument myPeakList!")
        if (!is(myPeakList, "GRanges")) {
            stop("No valid myPeakList passed in. It needs to be GRanges object")
        }
        if (missing(AnnotationData)) {
            message("No AnnotationData as GRanges is passed in, 
                    so now querying biomart database for AnnotationData ....")
            if (missing(mart)) {
                stop("Error in querying biomart database. 
                     No valid mart object is passed in! 
                     Suggest call getAnnotation before calling 
                     annotatePeakInBatch")
            }
            if(!is(mart, "Mart")){
                if(is(mart, "GRanges")){
                    warning("AnnotationData is missing, and a GRanges is passed
                            to mart parameter. Trying to annotate peaks by 
                            the GRanges you input.")
                    AnnotationData <- mart
                    mart <- NULL
                }else{
                    stop("Error in querying biomart database. 
                     No valid mart object is passed in! 
                     Suggest call getAnnotation before calling 
                     annotatePeakInBatch")
                }
            }else{
                AnnotationData <- getAnnotation(mart, featureType = featureType)
                message("Done querying biomart database, start annotating ....
                        Better way would be calling getAnnotation before 
                        calling annotatePeakInBatch")
            }
       }
      if (!inherits(AnnotationData, 
                      c("GRanges", "annoGR"))) {
            stop("AnnotationData needs to be GRanges or annoGR object")
       }
       if (!length(names(AnnotationData)) || any(is.na(names(AnnotationData))))
       {
             names(AnnotationData) <- paste0("ann", seq_along(AnnotationData))
       }
       if(inherits(AnnotationData, "annoGR")){
            TSS.ordered <- as(AnnotationData, "GRanges")
       }else{
            TSS.ordered <- AnnotationData
       }
        nAnno <- length(TSS.ordered)
        
        rm(AnnotationData)
        rm(nAnno)
        
      if(length(bindingRegion)>1){
            message("Annotate peaks by annoPeaks, see ?annoPeaks for details.")
            ## FeatureLocForDistance=c("TSS","middle","start","end", "geneEnd")
            ##"startSite", "endSite", "fullRange"
            dots <- list(...)
            if(!"bindingType" %in% names(dots)){
                if(output %in% 
                   c("overlapping", 
                     "nearestBiDirectionalPromoters")){
                    bindingType <- switch(
                        output,
                        nearestBiDirectionalPromoters="nearestBiDirectionalPromoters",
                        overlapping=switch(FeatureLocForDistance,
                                           TSS="startSite",
                                           geneEnd="endSite",
                                           NA),
                        NA
                    )
                }
            }else{
              bindingType <- dots$bindingType
            }
            if(exists("bindingType") && !is.na(bindingType)){
                message("maxgap will be ignored.")
                dots$peak <- myPeakList
                dots$annoData <- TSS.ordered
                dots$bindingType <- bindingType
                dots$bindingRegion <- bindingRegion
                return(do.call(annoPeaks, dots))
            }else{
                message("bindingRegion will be ignored.")
            }
      }
        
      if (is.null(names(TSS.ordered))){
            names(TSS.ordered) <- formatC(seq_along(TSS.ordered),
                                          width=nchar(length(TSS.ordered)),
                                          flag="0")
        }
      if (is.null(names(myPeakList))) {
            names(myPeakList) <- formatC(seq_along(myPeakList),
                                         width = nchar(length(myPeakList)),
                                         flag = "0")
        }
      if(any(duplicated(names(myPeakList)))){
            warning("Found duplicated names in myPeakList. 
                    Changing the peak names ...")
            names(myPeakList) <- formatC(seq_along(myPeakList),
                                         width = nchar(length(myPeakList)),
                                         flag = "0")
       }
        savedNames <- names(myPeakList)
        
        ##clear seqnames, the format should be chr+NUM
        ##TODO, how about the seqname not start with chr?
        ## fix by seqlevelsStyle
        TSS.ordered <- formatSeqnames(TSS.ordered, myPeakList)
        #myPeakList <- formatSeqnames(myPeakList)
      if(!all(seqlevels(myPeakList) %in% seqlevels(TSS.ordered))){
            warning("not all the seqnames of myPeakList is 
                    in the AnnotationData.")
      }
        
      if(length(myPeakList)>10000){
            ##huge dataset
            myPeakList <- split(myPeakList, 
                                cut(seq_along(myPeakList),
                                    ceiling(length(myPeakList)/5000)))
            myPeakList <- lapply(myPeakList, annotatePeakInBatch, 
                                 AnnotationData=TSS.ordered, 
                                 output = output, maxgap = maxgap,
                                 PeakLocForDistance=PeakLocForDistance,
                                 FeatureLocForDistance=FeatureLocForDistance,
                                 select=select,
                                 ignore.strand=ignore.strand)
            names(myPeakList) <- NULL
            myPeakList <- unlist(GRangesList(myPeakList))
            names(myPeakList) <- make.names(paste(myPeakList$peak, 
                                                  myPeakList$feature))
            ##myPeakList
            return(myPeakList)
      }
        ###STEP1 get nearst annotation for each peak, 
        ###use distanceToNearest(query, subject, ignore.strand=T/F, select)
        ## the distance got here is the shortestDistance
        ## select could only be arbitrary or all, 
        ## if it is "first" or "last", use "all" instead.
        ## if output=="nearest", annotation should only consider the start point
        ##         ignore.strand <- all(strand(myPeakList)=="*") || 
        ##             all(strand(TSS.ordered)=="*") || 
        ##             all(strand(myPeakList)=="+")
        nsel <- ifelse(select %in% c("all", "first", "last"), 
                       "all", "arbitrary")
        featureGR <- TSS.ordered
        end(featureGR) <- 
            start(featureGR) <- 
            switch(FeatureLocForDistance,
                   TSS=ifelse(strand(featureGR)=="-", 
                              end(featureGR), 
                              start(featureGR)), 
                   geneEnd=ifelse(strand(featureGR)=="-", 
                                  start(featureGR), 
                                  end(featureGR)),
                   middle=round(rowMeans(cbind(start(featureGR), 
                                               end(featureGR)))),
                   start=start(featureGR),
                   end=end(featureGR) 
            )
        myPeaksGR <- myPeakList
        end(myPeaksGR) <-
            start(myPeaksGR) <-
            switch(PeakLocForDistance,
                   middle=round(rowMeans(cbind(start(myPeaksGR),
                                               end(myPeaksGR)))),
                   start=start(myPeaksGR),
                   end=end(myPeaksGR)
            )
        if(output=="nearestLocation"){
            dist <- as.data.frame(nearest(myPeaksGR, featureGR, 
                                          ignore.strand=ignore.strand, 
                                          select=nsel))            
            if(nrow(dist)==0) dist[1,] <- NA ##in case no match at all
            if(nsel=="arbitrary") {
                dist <- cbind(queryHits=seq_along(myPeakList), 
                              subjectHits=dist)
                colnames(dist) <- c("queryHits", "subjectHits")
            }
            dist$output <- rep("NearestLocation", nrow(dist))
        }
        if(output=="both"){
            distN <- as.data.frame(nearest(myPeaksGR, featureGR,
                                           ignore.strand=ignore.strand,
                                           select=nsel))
            if(nrow(distN)==0) distN[1,]<-NA
            if(nsel=="arbitrary") {
                distN <- cbind(queryHits=seq_along(myPeakList), 
                               subjectHits=distN)
                colnames(distN) <- c("queryHits", "subjectHits")
            }
            distO <- as.data.frame(findOverlaps(myPeakList, TSS.ordered,
                                                maxgap=maxgap,
                                                ignore.strand=ignore.strand,
                                                select=select))
            if(nrow(distO)==0) distO[1,]<-NA
            if(ncol(distO)==1){
                distO <- cbind(queryHits=seq_along(myPeakList), 
                               subjectHits=distO)
                colnames(distO) <- c("queryHits", "subjectHits")
            }
            distN$output <- rep("NearestLocation", nrow(distN))
            distN <- distN[!is.na(distN$subjectHits),]
            distO$output <- rep("Overlapping", nrow(distO))
            distO <- distO[!is.na(distO$subjectHits),]
            dist <- rbind(distN, distO)
            dist <- dist[!duplicated(dist[,c("queryHits", "subjectHits")]),
                         ,drop=FALSE]
            dist <- dist[order(dist$queryHits, dist$subjectHits),,drop=FALSE]
        }
        if(output=="overlapping"){
            dist <- as.data.frame(findOverlaps(myPeakList, TSS.ordered,
                                               maxgap=maxgap,
                                               ignore.strand=ignore.strand,
                                               select=select))
            if(nrow(dist)==0) dist[1,] <- NA
            if(ncol(dist)==1){
                dist <- cbind(queryHits=seq_along(myPeakList), subjectHits=dist)
                colnames(dist) <- c("queryHits", "subjectHits")
            }
            dist$output <- rep("Overlapping", nrow(dist))
        }
        if(output=="shortestDistance"){
            dist <- as.data.frame(nearest(myPeakList, TSS.ordered,
                                          ignore.strand=ignore.strand,
                                          select=nsel))
            if(nrow(dist)==0) dist[1,] <- NA ##in case no match at all
            if(nsel=="arbitrary") {
                dist <- cbind(queryHits=seq_along(myPeakList), subjectHits=dist)
                colnames(dist) <- c("queryHits", "subjectHits")
            }
            dist$output <- rep("shortestDistance", nrow(dist))
        }
        if(output=="upstream&inside"){
            #upstream
            featureGR <- TSS.ordered
            start <- ifelse(strand(featureGR)=="-", 
                            start(featureGR), 
                            start(featureGR)-max(maxgap, 1))
            width <- width(featureGR) + max(maxgap, 1)
            start(featureGR) <- start
            width(featureGR) <- width
            dist <- as.data.frame(findOverlaps(myPeakList, featureGR,
                                                ignore.strand=ignore.strand,
                                                select=select,
                                                type="any"))
            dist$output <- rep("Upstream&Inside", nrow(dist))
        }
        if(output=="upstream"){
            #upstream
            featureGR <- TSS.ordered
            start <- ifelse(strand(featureGR)=="-", 
                            end(featureGR)+1, 
                            start(featureGR)-max(maxgap, 1))
            width <- max(maxgap, 1)
            start(featureGR) <- start
            width(featureGR) <- width
            dist <- as.data.frame(findOverlaps(myPeakList, featureGR,
                                               ignore.strand=ignore.strand,
                                               select=select,
                                               type="any"))
            dist$output <- rep("Upstream", nrow(dist))
        }
        if(output=="inside&downstream"){
            #downstream
            featureGR <- TSS.ordered
            start <- ifelse(strand(featureGR)=="-", 
                            start(featureGR)-max(maxgap, 1), 
                            start(featureGR))
            width <- width(featureGR) + max(maxgap, 1)
            start(featureGR) <- start
            width(featureGR) <- width
            dist <- as.data.frame(findOverlaps(myPeakList, featureGR,
                                                ignore.strand=ignore.strand,
                                                select=select,
                                                type="any"))
            dist$output <- rep("Inside&Downstream", nrow(dist))
        }
        if(output=="downstream"){
            #downstream
            featureGR <- TSS.ordered
            start <- ifelse(strand(featureGR)=="-", 
                            start(featureGR)-max(maxgap, 1), 
                            end(featureGR)+1)
            width <- max(maxgap, 1)
            start(featureGR) <- start
            width(featureGR) <- width
            dist <- as.data.frame(findOverlaps(myPeakList, featureGR,
                                               ignore.strand=ignore.strand,
                                               select=select,
                                               type="any"))
            dist$output <- rep("Downstream", nrow(dist))
        }
        if(output=="upstreamORdownstream"){
            featureGR <- TSS.ordered
            start <- ifelse(strand(featureGR)=="-", 
                            end(featureGR)+1, 
                            start(featureGR)-max(maxgap, 1))
            width <- max(maxgap, 1)
            start(featureGR) <- start
            width(featureGR) <- width
            dist1 <- as.data.frame(findOverlaps(myPeakList, featureGR,
                                               ignore.strand=ignore.strand,
                                               select=select,
                                               type="any"))
            dist1$output <- rep("Upstream", nrow(dist1))
            featureGR <- TSS.ordered
            start <- ifelse(strand(featureGR)=="-", 
                            start(featureGR)-max(maxgap, 1), 
                            end(featureGR)+1)
            width <- max(maxgap, 1)
            start(featureGR) <- start
            width(featureGR) <- width
            dist2 <- as.data.frame(findOverlaps(myPeakList, featureGR,
                                               ignore.strand=ignore.strand,
                                               select=select,
                                               type="any"))
            dist2$output <- rep("Downstream", nrow(dist2))
            dist <- rbind(dist1, dist2)
        }
        if(output=="inside"){
            dist <- as.data.frame(findOverlaps(myPeakList, TSS.ordered,
                                               ignore.strand=ignore.strand,
                                               select=select,
                                               type="within"))
            dist$output <- rep("inside", nrow(dist))
        }
        if(output=="upstream2downstream"){
            featureGR <- TSS.ordered
            start(featureGR) <- 
                apply(cbind(start(featureGR) - maxgap, 1), 1, max)
            end(featureGR) <- end(featureGR) + maxgap
            dist <- as.data.frame(findOverlaps(myPeakList, featureGR,
                                               ignore.strand=ignore.strand,
                                               select=select,
                                               type="any"))
            dist$output <- rep("upstream2downstream", nrow(dist))
        }
        ##  nearest is NOT filtered by maxgap, is this should be changed?
        ##        dist <- dist[!is.na(dist$subjectHits), ]
        ##        distance <- distance(myPeakList[dist$queryHits],
        ##                             TSS.ordered[dist$subjectHits],
        ##                             ignore.strand=ignore.strand)
        ##        dist <- dist[abs(distance) <= maxgap, ]
        if(output=="nearestBiDirectionalPromoters" && length(bindingRegion)<=1){
          stop("If output is nearestBiDirectionalPromoters,",
               "please set bindingRegion. See ?annoPeaks for details.")
        }
        myPeakList.Hit <- 
            myPeakList[dist$queryHits[!is.na(dist$subjectHits)]]
        myPeakList.NA <- 
            myPeakList[!names(myPeakList) %in% names(myPeakList.Hit)]
        subjectHits <- 
            TSS.ordered[dist$subjectHits[!is.na(dist$subjectHits)]]
        mcols(subjectHits)$output <- 
            dist[!is.na(dist$subjectHits),"output"]
        #    myPeakList.Hit$distanceToNearest <- 
        #    dist$distance[!is.na(dist$subjectHits)]
        
        ###STEP2 get distance for each peak and nearest annotation by 
        ## distance(x, y). the distance is calculated by
        ##        PeakLocForDistance=c("start","middle","end"),
        ##        FeatureLocForDistance=c("TSS","middle","start",
        ##                              "end", "geneEnd")
        FeatureLoc <-
            switch(FeatureLocForDistance,
                   middle=as.integer(round(rowMeans(cbind(start(subjectHits), 
                                                          end(subjectHits))))),
                   start=start(subjectHits),
                   end=end(subjectHits),
                   geneEnd=as.integer(ifelse(strand(subjectHits)=="-", 
                                             start(subjectHits), 
                                             end(subjectHits))),
                   TSS=as.integer(ifelse(strand(subjectHits)=="-", 
                                         end(subjectHits), 
                                         start(subjectHits))))
        
        PeakLoc <- 
            switch(PeakLocForDistance,
                   start=start(myPeakList.Hit),
                   end=end(myPeakList.Hit),
                   middle=as.integer(round(rowMeans(
                       cbind(start(myPeakList.Hit), end(myPeakList.Hit))))))
        
        distancetoFeature <- as.numeric(ifelse(strand(subjectHits)=="-", 
                                               FeatureLoc - PeakLoc, 
                                               PeakLoc - FeatureLoc))
        
        ###STEP3 relationship between query and subject:
        ###   "inside", "overlapEnd", "overlapStart", 
        ###    "includeFeature", "upstream", "downstream"
        insideFeature <- getRelationship(myPeakList.Hit, subjectHits)
        myPeakList.Hit$peak <- names(myPeakList.Hit)
        myPeakList.Hit$feature <- names(subjectHits)
        myPeakList.Hit$start_position <- start(subjectHits)
        myPeakList.Hit$end_position <- end(subjectHits)
        myPeakList.Hit$feature_strand <- as.character(strand(subjectHits))
        
        myPeakList.Hit$insideFeature <- insideFeature[, "insideFeature"]
        myPeakList.Hit$distancetoFeature <- distancetoFeature
        
        myPeakList.Hit$shortestDistance <- 
            as.integer(insideFeature[,"shortestDistance"])
        
        myPeakList.Hit$fromOverlappingOrNearest <- subjectHits$output
        ##save oid for select == "first" or "last" filter
        myPeakList.Hit$oid <- insideFeature[, "ss"]
        ###combind data of NA with Hits
        if(length(myPeakList.NA)>0){
            myPeakList.NA$peak <- names(myPeakList.NA)
            for(ncol in c("feature",
                          "start_position",
                          "end_position",
                          "feature_strand",
                          "insideFeature",
                          "distancetoFeature",
                          "shortestDistance",
                          "fromOverlappingOrNearest",
                          "oid")){
                mcols(myPeakList.NA)[, ncol] <- NA
            }
            ## in case duplicated colnames
            colnames(mcols(myPeakList.NA)) <- colnames(mcols(myPeakList.Hit))
            myPeakList <- c(myPeakList.Hit, myPeakList.NA)
        }else{
            myPeakList <- myPeakList.Hit
        }
        ###output the results
        ##select=c("all", "first", "last", "arbitrary"))
        ##output = c("nearest", "overlapping", "both")
        ##if select %in% first or last, must filter the duplicate annotation
        ##if output="both", must filter the duplicate annotation
        ##if input is GRanges, output is GRanges
        ###the order of output should be same as input
        if(select=="first"){
            myPeakList <- 
                myPeakList[order(names(myPeakList), abs(myPeakList$oid))]
            myPeakList <- 
                myPeakList[!duplicated(names(myPeakList))]
        }
        if(select=="last"){
            myPeakList <- 
                myPeakList[order(names(myPeakList), -abs(myPeakList$oid))]
            myPeakList <- 
                myPeakList[!duplicated(names(myPeakList))]
        }
        ##remove column oid
        myPeakList$oid <- NULL
        ##re-order myPeakList as the original order
        oid <- seq_along(savedNames)
        names(oid) <- savedNames
        oid <- oid[names(myPeakList)]
        if(!any(is.na(oid))){
            myPeakList <- myPeakList[order(oid)]
        }
        

        if(output=="nearestLocation"){
            ## remove duplicate annotation, just keep the nearest one
            removeDuplicates <- function(gr){
                dup <- duplicated(gr$peak)
                if(any(dup)){
                    gr$oid <- seq_along(gr)
                    dup <- gr[dup]
                    gr.dup <- gr[gr$peak %in% dup$peak] 
                    ## bugs peak name must be different.
                    gr.NOTdup <- gr[!gr$peak %in% dup$peak]
                    gr.dup <- split(gr.dup, gr.dup$peak)
                    gr.dup <- lapply(gr.dup, function(.ele){
                        .ele[.ele$shortestDistance == 
                                 min(.ele$shortestDistance)]
                    })
                    gr.dup <- unlist(GRangesList(gr.dup))
                    gr <- c(gr.dup, gr.NOTdup)
                    gr <- gr[order(gr$oid)]
                    gr$oid <- NULL  
                }
                gr
            }
            myPeakList <- removeDuplicates(myPeakList)
        }

        
        names(myPeakList) <- 
            make.names(paste(myPeakList$peak, myPeakList$feature))

        ##myPeakList
        return(myPeakList)
    }

globalVariables(c("fromOverlappingOrNearest", "peak"))
jianhong/ChIPpeakAnno documentation built on Jan. 4, 2025, 5:27 p.m.