R/GRB.R
In CNEr: CNE Detection and Visualization

Documented in makeGRBs

### -----------------------------------------------------------------
### make GRBs from CNEs
### Exported!
makeGRBs <- function(x, winSize=NULL, genes=NULL, ratio=1, 
                     background=c("chromosome", "genome"),
                     minCNEs=1L){#,
                     #byChrom=FALSE
  if(!is(x, "GRangesList")){
    stop("The input `x` must be a `GRangesList` object!")
  }
  background <- match.arg(background)
  # if(isTRUE(byChrom)){
  #   if(!all(sapply(cneList, 
  #                  function(x){"seqnames2" %in% colnames(mcols(x))}))){
  #     stop("`x` must have metacolum of `seqnames2` to do",
  #          "byChrom GRB prediction!")
  #   }
  # }
  
  xList <- x
  if(is.null(names(xList))){
    names(xList) <- as.character(1:length(xList))
  }
  x <- unlist(x)
  x <- reduce(x, ignore.strand=TRUE)
  cov <- coverage(x)

  # Guess winSize from genome size if NULL
  # winSize in kb
  if(is.null(winSize)){
    ## Based on 300kb for human
    winSize <- seqlengths(x) / 3e6 * 300
  }
  winSize <- winSize * 1e3
  
  if(!is.null(genes) && !is(genes, "GRanges")){
    stop("The `genes` must be a `GRanges` object!")
  }

  density <- suppressWarnings(runmean(cov, k=winSize,  endrule="constant"))
  if(background == "genome"){
    # calculate the background percentage of coverage
    totalGenomeSize <- 
      sum(as.numeric(seqlengths(x)[as.character(unique(seqnames(x)))]))
    if(is.na(totalGenomeSize)){
      stop("seqlengths must be provided in input x!")
    }
    coveredAll <- sum(sum(cov)) / totalGenomeSize
    # slice the density into GRBs
    slicedDensities <- slice(density, lower=coveredAll*ratio, 
                             includeLower=FALSE)
  }else if(background == "chromosome"){
    # calculate the background percentage of coverage per chromosome
    coveredAllPerChromosome <- sum(cov)/seqlengths(x)[names(cov)]
    slicedDensities <- list()
    for(i in 1:length(density)){
      slicedDensities[[names(density)[i]]] <- slice(density[[i]],
        lower=coveredAllPerChromosome[names(density)[i]]*ratio, 
        includeLower=FALSE)
    }
    slicedDensities <- RleViewsList(slicedDensities)
  }
  
  clusterRanges <- GRanges(seqnames=rep(names(slicedDensities),
                                        elementNROWS(slicedDensities)),
                           ranges=unlist(ranges(slicedDensities)),
                           strand="+",
                           seqinfo=seqinfo(x))

  # shrink the GRBs with actual CNE locations
  hits <- findOverlaps(x, clusterRanges, type="within", select="all",
                       ignore.strand=TRUE)
  mergedGRBCNEsList <- split(x[queryHits(hits)], subjectHits(hits))
  starts <- min(start(mergedGRBCNEsList))
  ends <- max(end(mergedGRBCNEsList))
  seqnames <- sapply(seqnames(mergedGRBCNEsList), runValue)
  GRBMergedClean <- GRanges(seqnames=seqnames,
                            ranges=IRanges(start=starts,
                                           end=ends),
                            strand="+",
                            seqinfo=seqinfo(x))
  clusterRanges <- GRBMergedClean
  names(clusterRanges) <- paste(clusterRanges)
  
  # remove GRBs (which do not encompass any gene)
  if(!is.null(genes)){
    hits <- findOverlaps(genes, clusterRanges, type="within", select="all",
                         ignore.strand=TRUE)
    indexKeep <- unique(subjectHits(hits))
    clusterRanges <- clusterRanges[indexKeep]
  }
  
  # Count the number of CNEs within the GRBs
  for(i in 1:length(xList)){
    hitsCNEs <- findOverlaps(xList[[i]], clusterRanges,
                             ignore.strand=TRUE, type="within")
    
    ## Add the number of CNEs
    cnes <- lengths(split(queryHits(hitsCNEs), subjectHits(hitsCNEs)))
    mcols(clusterRanges)[[names(xList)[i]]] <- 0L
    mcols(clusterRanges)[[names(xList)[i]]][as.integer(names(cnes))] <- cnes
    
    ## Add the CNE ranges
    cnes <- split(xList[[i]][queryHits(hitsCNEs)], subjectHits(hitsCNEs))
    missingCNEs <- setdiff(seq_len(length(clusterRanges)),
                           subjectHits(hitsCNEs))
    for(j in missingCNEs){
      ## This is really a bad implementation. 
      cnes[[as.character(j)]] <- GRanges()
    }
    mcols(clusterRanges)[[paste(names(xList)[i], "CNE")]] <- 
      cnes[order(as.integer(names(cnes)))]
  }
  
  # Filter out the GRBs with few CNEs
  indexToKeep <- apply(as.data.frame(mcols(clusterRanges)[names(xList)]) >= 
                         minCNEs, 1, any)
  clusterRanges <- clusterRanges[indexToKeep]
  
  return(clusterRanges)
}