#' @include help.R
NULL
setClass("AnchoredReadPairs",
representation(zero_end="GAlignmentPairs",
single_end="GAlignmentPairs",
both_ends="GAlignmentPairs"))
AnchoredReadPairs <- function(zero_end, single_end, both_ends=GRanges()){
if(missing(zero_end)) new("AnchoredReadPairs")
new("AnchoredReadPairs", zero_end=zero_end,
single_end=single_end, both_ends=both_ends)
}
setMethod("zeroEndAnchors", "AnchoredReadPairs",
function(object) object@zero_end)
setMethod("singleEndAnchors", "AnchoredReadPairs",
function(object) object@single_end)
setMethod("bothEndAnchors", "AnchoredReadPairs",
function(object) object@both_ends)
FilterEdgeParam <- function(freq=5, minimum_maxdist=50, bad_bins=GRanges()){
list(freq=freq,
minimum_maxdist=minimum_maxdist,
bad_bins=bad_bins)
}
gaps0 <- function(x){
g <- gaps(x)
g <- g[strand(g)=="*"]
g
}
hgnc <- function(g) g$hgnc
isDuplication <- function(object, minimum_foldchange=1){
## - must not already be called an amplicon
## - must be somatic
## - must have elevated copy number
!isAmplicon(object) & (object$seg.mean >= minimum_foldchange) &
!is.na(object$seg.mean) & isSomatic(object) & width(object) < 500e3
}
#' @aliases filterBy,GRanges,GRanges-method
#' @rdname filterBy-methods
setMethod("filterBy", c("GRanges", "GRanges"),
function(query, subject, ...){
dropbins <- unique(queryHits(findOverlaps(query, subject, type=type, ...)))
if(length(dropbins) > 0)
query <- query[-dropbins]
return(query)
##query[!overlapsAny(query, subject, ...)]
})
node2 <- function(name, sep="-") sapply(name, function(x) strsplit(x, sep)[[1]][2])
node1 <- function(name, sep="-") sapply(name, function(x) strsplit(x, sep)[[1]][1])
#' Lists germline filters and parameters for filtering amplicons
#'
#'
#' @export
#'
#' @examples
#' params <- ampliconParams("hg19")
#'
#' @return Returns a list of germline filters and some of the hard
#' thresholds used in the amplicon analysis.
#'
#' @param AMP_THR numeric threshold for high copy amplicon
#'
#' @param LOW_THR numeric a lower threshold used when considering
#' amplicons that are bridged by improper read pairs to a high copy
#' amplicon
#'
#' @param border_size used to construct a query (GRanges object) for
#' additional amplicons neighboring a focal amplicon. TODO: more
#' detail needed.
#'
#' @param overhang An integer indicating how much to expand the
#' germline filter on each size. Using \code{overlapsAny}, a
#' determination is made whether an amplicon is part-germline (any
#' overlap) or no-germline (no overlap). If the amplicon is
#' completely within the extended germline filter, the amplicon is
#' considered fully-germline.
#'
#' @param MIN_WIDTH length-one integer vector indicating the minimum size of amplicon (see also \code{joinNearGRanges})
#'
#' @param MIN_SEGMEAN_DIFF length-one numeric vector. Adjacent segments whose means differ by less than this value are candidates for merging by \code{{joinNearGRanges}}
#'
#' @param min.gapwidth length-one numeric vector passed to the \code{reduce} method that merges adacent segments with comparable segment means
#'
#' @param maxgap length-one numeric vector. Passed to \code{findOverlaps} when
#' evaluating whether the amplicon ranges overlap with other amplicons (see
#' \code{linkNearAmplicons}).
#'
#' @param frequency minimum number of read pairs
#'
#' @param minimum_maxdist TODO
#'
#' @param minimum_count a length-one integer vector: minimum number of
#' improperly paired reads required to link two amplicons
#'
#' @param bad_bins a \code{GRanges} object of problematic bins
ampliconParams <- function(AMP_THR=log2(2.75),
LOW_THR=log2(1.75),
border_size=10e3,
overhang=25e3,
MIN_WIDTH=2000,
MIN_SEGMEAN_DIFF=0.05,
min.gapwidth=3000,
maxgap=500e3,
frequency=5,
minimum_maxdist=50,
minimum_count=5,
bad_bins=GRanges()){
##filters <- listGenomeFilters()
edge <- FilterEdgeParam(freq=5,
minimum_maxdist=minimum_maxdist,
bad_bins=bad_bins)
filters <- list()
filters$border_size <- border_size
filters$overhang <- overhang
filters$AMP_THR <- AMP_THR
filters$LOW_THR <- LOW_THR
filters$MIN_WIDTH <- MIN_WIDTH
filters$MIN_SEGMEAN_DIFF <- MIN_SEGMEAN_DIFF
filters$min.gapwidth <- min.gapwidth
filters$maxgap <- maxgap
filters$minimum_maxdist=minimum_maxdist
filters$minimum_count=minimum_count
filters$edge <- edge
filters
}
overlapsGermline <- function(object, germline, overhang=5e3){
filter_levels <- c("no_germline", "part_germline", "fully_germline")
start(germline) <- pmax(start(germline)-overhang, 1)
sl <- seqlengths(germline)[as.character(seqnames(germline))]
end(germline) <- pmin(end(germline)+overhang, sl)
germline <- reduce(germline)
any_overlap <- overlapsAny(object, germline)
filtered <- ifelse(any_overlap, "part_germline", "no_germline")
filtered[any_overlap] <- "part_germline"
complete_overlap <- findOverlaps(object, germline, type="within")
filtered[unique(queryHits(complete_overlap))] <- "fully_germline"
filtered <- factor(filtered, levels=filter_levels)
filtered
}
standardizeGRangesMetadata <- function(granges){
if(length(granges) == 0) return(granges)
mns <- granges$seg.mean
is_amplicon <- granges$is_amplicon
granges <- reduce(granges)
granges$seg.mean <- mns
granges$is_amplicon <- is_amplicon
names(granges) <- ampliconNames(granges)
granges$hgnc <- as.character(NA)
granges$driver <- as.character(NA)
granges$biol_sign <- as.character(NA)
granges$groups <- as.factor(NA)
granges
}
isNotAmplicon <- function(object) !isAmplicon(object)
isGap <- function(object) width(object)==999 & is.na(object$seg.mean)
gapBetweenLowCopyRanges <- function(g){
index_lowcopy <- which(isNotAmplicon(g))
index_gap1kb <- which(isGap(g))
flanked_by_lowcopy <- (index_gap1kb - 1) %in% index_lowcopy &
(index_gap1kb + 1) %in% index_lowcopy
gap1kb <- g[index_gap1kb]
gap1kb[flanked_by_lowcopy]
}
startAmpliconBoundary <- function(ranges, is_gap=TRUE, extend=2e3){
### NOT RUN: is_gap is never specified when this function is called
### so the value defaults to TRUE
if(!is_gap){
candidate_amplicon <- ranges$seg.mean > 0.8 & !is.na(ranges$seg.mean)
if(any(!candidate_amplicon)){
start(ranges)[!candidate_amplicon] <- end(ranges)[!candidate_amplicon]
}
}
###
is_small <- width(ranges) < extend
if(any(is_small)){
small_amplicons <- ranges[is_small]
if(all(is_small)) return(small_amplicons)
}
amplicons <- ranges[!is_small]
cnv_begin_edge <- amplicons
start(cnv_begin_edge) <- start(amplicons)
end(cnv_begin_edge) <- start(amplicons)+extend
if(any(is_small)){
cnv_begin_edge <- sort(c(cnv_begin_edge, small_amplicons))
}
cnv_begin_edge
}
endAmpliconBoundary <- function(ranges, is_gap=FALSE, extend=2e3){
### NOT RUN: is_gap is never specified when this function is called
### so the value defaults to TRUE
if(!is_gap){
candidate_amplicon <- ranges$seg.mean > 0.8 & !is.na(ranges$seg.mean)
if(any(!candidate_amplicon)){
start(ranges)[!candidate_amplicon]<- end(ranges)[!candidate_amplicon]
}
}
###
is_small <- width(ranges) < extend
if(any(is_small)){
small_amplicons <- ranges[is_small]
if(all(is_small)) return(small_amplicons)
}
amplicons <- ranges[!is_small]
if(length(amplicons) == 0) return(amplicons)
cnv_end_edge <- amplicons
start(cnv_end_edge) <- end(amplicons)-extend
end(cnv_end_edge) <- end(amplicons)
if(any(is_small)){
cnv_end_edge <- sort(c(cnv_end_edge, small_amplicons))
}
cnv_end_edge
}
borderSize <- function(object) object@border_size
germlineCNV <- function(object) object@germline_cnv
outliers <- function(object) object@outliers
ampliconQueryRanges <- function(object, min.fc=1){
g <- ranges(object)
gap1kb <- gapBetweenLowCopyRanges(g)
starts <- startAmpliconBoundary(g, extend=borderSize(object))
ends <- endAmpliconBoundary(g, extend=borderSize(object))
g <- c(starts, ends)
names(g) <- ampliconNames(g)
g <- g[!duplicated(names(g))]
g <- g[width(g) > 1]
g <- g[!is.na(g$seg.mean)]
g <- g[g$seg.mean > min.fc | isGap(g)]
## drop 999bp intervals between low copy ranges
g <- g[!names(g) %in% names(gap1kb)]
assembly_gaps <- assemblyGaps(object)
if(length(assembly_gaps)==0) {
queryRanges(object) <- g
return(object)
}
## drop ranges contained within assembly gaps
assembly_gaps <- reduce(assembly_gaps)
o <- findOverlaps(g, assembly_gaps, type="within")
if(length(o) > 0){
g <- g[-unique(queryHits(o))]
if(length(g)==0) warning("All queryRanges are within assembly gaps")
}
## ## drop ranges contained within germline CNV / outliers gaps
germline <- sort(reduce(c(germlineCNV(object), outliers(object))))
o <- findOverlaps(g, germline, type="within")
if(length(o) > 0){
g <- g[-unique(queryHits(o))]
if(length(g)==0) warning("All queryRanges are within germline")
}
queryRanges(object) <- g
object
}
setMethod("combine", signature(x="GRanges", y="GRanges"),
function(x, y,...){
col.names <- intersect(colnames(mcols(x)), colnames(mcols(y)))
##col.names <- union(colnames(mcols(x)), colnames(mcols(y)))
mcols(x) <- mcols(x)[, col.names]
mcols(y) <- mcols(y)[, col.names]
colnames(mcols(x)) <- colnames(mcols(y)) <- col.names
xy <- c(x, y)
xy
})
.add_mcols <- function(g){
g$seg.mean <- NA
g$is_amplicon <- FALSE
g$hgnc <- as.character(NA)
g$driver <- as.character(NA)
g$biol_sign <- as.character(NA)
g$groups <- as.factor(NA)
g
}
.empty_filters <- function(assembly_gaps=GRanges(),
centromeres=GRanges(),
germline_cnv=GRanges(),
outliers=GRanges()){
list(assembly_gaps=assembly_gaps,
centromeres=centromeres,
germline_cnv=germline_cnv,
outliers=outliers)
}
#' Constructor for AmpliconGraph
#'
#' Constructor for \code{AmpliconGraph}.
#'
#' @param ranges a \code{GRanges} object of the segmented normalized copy number
#' @param filters a list of germline CNV and sequence-based filters
#' @param params a list of parameters
#' @rdname AmpliconGraph-constructor
#' @export
AmpliconGraph <- function(ranges=GRanges(),
filters,
params=ampliconParams(border_size=150e3, overhang=5e3)){
if(missing(filters)){
filters <- .empty_filters()
}
assembly_gaps <- filters[["assembly_gaps"]]
centromeres <- filters[["centromeres"]]
germline_cnv <- filters[["germline_cnv"]]
outliers <- filters[["outliers"]]
border_size <- params[["border_size"]]
overhang <- params[["overhang"]]
if(missing(ranges)) {
ag <- new("AmpliconGraph")
names(ranges(ag)) <- character()
return(ag)
}
ranges <- standardizeGRangesMetadata(ranges)
assembly_gaps <- reduce(sort(combine(assembly_gaps, centromeres)))
##
## Annotate GRanges
##
## gaps
##
g <- gaps0(ranges)
if(length(g) > 0){
g <- .add_mcols(g)
ranges <- sort(c(ranges, g))
}
##
germ <- reduce(c(germline_cnv, outliers))
ranges$overlaps_germline <- overlapsGermline(ranges, germ,
overhang=overhang)
names(ranges) <- ampliconNames(ranges)
amps <- ranges[isAmplicon(ranges) & isSomatic(ranges)]
tmp <- new("AmpliconGraph",
graph=graphNEL(nodes=names(amps)),
ranges=ranges,
border_size=border_size,
assembly_gaps=assembly_gaps,
germline_cnv=germline_cnv,
outliers=outliers)
## initialize query ranges
ampliconQueryRanges(tmp, min.fc = params[["LOW_THR"]])
}
trimRangesOverlappingCentromere <- function(object, centromeres){
if(numEdges(graph(object)) > 0) {
stop("currently this method does not preserve edges")
}
rg <- ranges(object)
sl <- seqlevels(rg)
si <- seqinfo(rg)
spanned_by_gap <- unique(queryHits(findOverlaps(rg, centromeres, type="within")))
if(length(spanned_by_gap) > 0){
rg <- rg[-spanned_by_gap]
}
overlaps_centromere <- overlapsAny(rg, centromeres)
if(any(overlaps_centromere)){
rgs <- rg[overlaps_centromere]
trimmed <- setdiff(rgs, centromeres)
j <- subjectHits(findOverlaps(trimmed, rgs))
## only keep one side
tmp <- sapply(split(trimmed, j), function(g) g[which.max(width(g))])
trimmed <- unlist(GRangesList(tmp))
j <- subjectHits(findOverlaps(trimmed, rgs))
trimmed$seg.mean <- rgs$seg.mean[j]
trimmed$is_amplicon <- rgs$is_amplicon[j]
trimmed$hgnc <- rgs$hgnc
trimmed$driver <- rgs$driver
trimmed$biol_sign <- rgs$biol_sign
trimmed$groups <- rgs$groups
trimmed$overlaps_germline <- rgs$overlaps_germline
## drop regions that were gaps to begin with
trimmed <- trimmed[!is.na(trimmed$seg.mean)]
rg <- filterBy(rg, rgs)
rg <- sort(c(rg, trimmed))
names(rg) <- ampliconNames(rg)
rg <- keepSeqlevels(rg, sl, pruning.mode="coarse")
seqinfo(rg) <- si
ranges(object) <- rg
}
ag <- graphNEL(nodes=names(ampliconRanges(object)))
graph(object) <- ag
object
}
#' Merge adjacent amplicons that have a similar segment mean
#'
#' Merges adjacent amplicons (<= 3kb separation) that have similar
#' segment means.
#'
#' @keywords internal
#' @export
#' @return a \code{GRanges} object of the merged segments
#'
#' @param object a \code{GRanges} object
#'
#' @param thr a length-one numeric vector. If the difference of the average log
#' ratio for two segments is less than this number, combine the segments. The
#' segment mean of the merged GRanges will be the weighted average, where the
#' weights are the segment widths.
#'
#'
#' @param MIN_WIDTH minimum size of amplicon (default 2000)
#'
#' @details Merging is performed by the \code{reduce} operation.
#'
#'
joinNearGRanges <- function(object, params){
##thr=0.05, MIN_WIDTH=2e3, min.gapwidth=3000){
thr <- params[["MIN_SEGMEAN_DIFF"]]
MIN_WIDTH <- params[["MIN_WIDTH"]]
min.gapwidth <- params[["min.gapwidth"]]
k <- which(!is.na(object$seg.mean) & width(object) > MIN_WIDTH)
if(length(k) == 0) return(object)
g <- object[k]
means <- g$seg.mean
d <- diff(means)
if(all(d > thr)) return(object)
regions <- paste0("region", cumsum(c(0, abs(d) > thr)))
regions <- factor(regions, levels=unique(regions))
indexlist <- split(seq_along(regions), regions)
##
## list of regions. Each element is a vector of regions that can be
## joined
##
glist <- vector("list", length(indexlist))
for(i in seq_along(indexlist)){
j <- indexlist[[i]]
if(length(j)==1) next()
ng <- reduce(g[j], min.gapwidth=min.gapwidth)
##
## TODO: combining the metadata. Here, we're just taking the first row
##
mc <- mcols(g)[j[1], , drop=FALSE]
mc$seg.mean <- sum(width(g[j])*g$seg.mean[j])/sum(width(g[j]))
mcols(ng) <- mc
glist[[i]] <- ng
}
gnew <- unlist(GRangesList(unlist(glist)))
## Remove regions in the original object that overlap with the
## joined regions (g contains all the original intervals that were
## subsequently joined)
object2 <- filterBy(object, gnew)
object3 <- c(object2, gnew)
object3$seg.mean <- round(object3$seg.mean, 2)
object3 <- sort(object3)
object3
}
flankingRanges <- function(object, shift=2){
rgs <- ranges(object)
amplicons <- ampliconRanges(object)
index <- match(names(amplicons), names(rgs))
index2 <- index - shift
index2 <- index2[index2 > 0]
index3 <- index + shift
is_right_flank <- (index3 < length(rgs)) & (as.character(seqnames(rgs[index3])) == as.character(seqnames(rgs[index])))
# This just fixes the issue at the right flank. Couldn't this happen at the left flank as well?
if(any(!is_right_flank)){
## REFACTOR. No right flank exists. If we exclude, linkedDuplicatedRanges will fail
## assign the amplicon index
index3[!is_right_flank] <- index[!is_right_flank]
}
left_flank <- rgs[index2]
right_flank <- rgs[index3]
left <- list(first=left_flank, last=amplicons)
right <- list(first=amplicons, last=right_flank)
list(left=left, right=right)
}
flankingDuplications <- function(object, minimum_foldchange=1){
flanks <- flankingRanges(object, shift=2)
is_dupL <- isDuplication(flanks$left[[1]], minimum_foldchange)
is_dupR <- isDuplication(flanks$right[[2]], minimum_foldchange)
flanks$left <- lapply(flanks$left, "[", is_dupL)
flanks$right <- lapply(flanks$right, "[", is_dupR)
flanks
}
minimumBasepairCoverage <- function(rp, regions){
dj <- disjoin(rp)
dj <- dj[overlapsAny(dj, regions)]
cnt <- countOverlaps(dj, rp)
h <- findOverlaps(regions, dj)
cntlist <- split(cnt[subjectHits(h)], queryHits(h))
min_cnt <- sapply(cntlist, min)
min_cnt
}
linkedDuplicatedRanges <- function(object, rpsegs,
flanking_duplications,
params){
minimum_count <- params[["min_count"]]
flank <- flanking_duplications
lengths <- unlist(lapply(flank, elementNROWS))
##if(any(lengths != 1)) stop("Flanking regions must be length-one GRanges")
gapsLeft <- GRanges(seqnames(flank[["left"]]$first),
IRanges(end(flank[["left"]]$first)+1,
start(flank[["left"]]$last)))
gapsRight <- GRanges(seqnames(flank[["right"]]$first),
IRanges(end(flank[["right"]]$first)+1,
start(flank[["right"]]$last)))
cntsLeft <- minimumBasepairCoverage(rpsegs, gapsLeft)
cntsRight <- minimumBasepairCoverage(rpsegs, gapsRight)
left <- lapply(flank$left, "[", which(cntsLeft >= minimum_count))
right <- lapply(flank$right, "[", which(cntsRight >= minimum_count))
list(left=left, right=right)##flanking_duplications[cnts >= minimum_count]
}
addFlanks <- function(object, dup_granges){
## left-of-amplicon flank
flank <- dup_granges$left[[1]]
if(length(flank) > 0){
amp <- dup_granges$left[[2]]
edges <- paste(names(flank), names(amp), sep="-")
index <- match(names(flank), names(ranges(object)))
new_nodes <- names(ranges(object))[index]
object <- addNode(new_nodes, object, edges)
}
## right-of-amplicon flank
flank <- dup_granges$right[[2]]
if(length(flank) > 0){
amp <- dup_granges$right[[1]]
edgR <- paste(names(amp), names(flank), sep="-")
index <- match(names(flank), names(ranges(object)))
new_nodes <- names(ranges(object))[index]
object <- addNode(new_nodes, object, edgR)
}
object
}
#' Add focal amplicons that flank an amplicon seed
#'
#' Amplicons selected to seed a graph have a fold change of at least log2(2.75),
#' or nearly 3-fold the diploid genome. Lower-copy amplicons flanking the seed
#' amplicons are added to the graph if they have a fold change of at least
#' log2(1.75), or nearly 2-fold the diploid genome. To assess whether a low-copy
#' amplicon is 'flanking' an amplicon seed, all read pairs that aligned to the
#' amplicon seeds (see \code{get_readpairs}) are passed to this function in
#' object \code{rp}. The read pairs are converted to segments by
#' \code{readPairsAsSegments} and an assessment of whether flanking duplications
#' are linked is given by \code{linkedDuplicatedRanges}. In particular, this
#' function checks whether there are at least \code{minimum_count} fragments
#' that connect the amplicon seeds to the flanking regions. Finally, the flanks
#' are added to the graph by the \code{addFlanks} function.
#'
#' REFACTORING: overly complicated with many abstractions. Needs an
#' example to step through
#'
#'
#' @param object a \code{AmpliconGraph}
#'
#' @param rp a \code{GAlignmentPairs} object
#'
#' @param params a list of parameters for amplicon analyses
#' @seealso \code{\link{ampliconParams}}
#' @examples
#' \dontrun{
#' path <- file.path(system.file(package="trellis"), "tests/testthat")
#' ag <- readRDS(file.path(path, "addFocalDups.ffab104.rds"))
#' params <- ampliconParams()
#' ag <- addFocalDupsFlankingAmplicon(ag, rp, params)
#' }
#' @export
addFocalDupsFlankingAmplicon <- function(object, rp, params){
if(totalWidth(queryRanges(object))==0) return(object)
flanks <- flankingDuplications(object,
minimum_foldchange=params[["LOW_THR"]])
rpsegs <- readPairsAsSegments(rp)
dup_gr <- linkedDuplicatedRanges(object, rpsegs, flanks, params)
object <- addFlanks(object, dup_gr)
object
}
numberAnchored <- function(object, read_pairs){
seed <- ampliconRanges(object)
R1_anchored <- overlapsAny(first(read_pairs), seed)
R2_anchored <- overlapsAny(last(read_pairs), seed)
number_anchored <- R1_anchored+R2_anchored
zero_anchor <- read_pairs[number_anchored == 0]
single_anchor <- read_pairs[number_anchored == 1]
both_anchored <- read_pairs[number_anchored == 2]
AnchoredReadPairs(zero_end=zero_anchor,
single_end=single_anchor,
both_ends=both_anchored)
}
edgeStats <- function(edges, param){
freq <- table(names(edges))
freq <- freq[freq >= param$freq]
if(length(freq)==0) return(data.frame(freq=integer(), maxdist=integer(), bad_bin=logical()))
stats <- data.frame(freq=as.integer(freq))
rownames(stats) <- names(freq)
e2 <- edges[names(edges) %in% rownames(stats)]
index <- split(seq_len(length(e2)), names(e2))
##i <- NULL
results <- vector("list", length(index))
for(j in seq_along(index)){
i <- index[[j]]
results[[j]] <- start(first(e2))[i]
}
maxdist <- sapply(lapply(results, range), diff)
##maxdist <- sapply(lapply(foreach(i = index) %do% start(first(e2))[i], range), diff)
names(maxdist) <- names(index)
stats$maxdist <- maxdist[rownames(stats)]
stats$bad_bin <- rep(FALSE, nrow(stats))
bad_bins <- param$bad_bins
if(length(bad_bins) == 0 ) return(stats)
hits1 <- findOverlaps(first(e2), bad_bins[[1]], select="first")
hits2 <- findOverlaps(last(e2), bad_bins[[2]], select="first")
if(any(hits1 == hits2, na.rm=TRUE)){
## matches a flagged bin pair
stop("variable flag not defined")
##id <- names(e2)[flag]
##stats$bad_bin[id] <- TRUE
}
## check the reverse
hits1 <- findOverlaps(first(e2), bad_bins[[2]], select="first")
hits2 <- findOverlaps(last(e2), bad_bins[[1]], select="first")
if(any(hits1 == hits2, na.rm=TRUE)){
stop("variable flag not defined")
## id <- names(e2)[flag]
## stats$bad_bin[id] <- TRUE
}
stats
}
edgeFilters <- function(edges, param=FilterEdgeParam()){
stats <- edgeStats(edges, param)
valid_edges <- rownames(stats)[stats$maxdist >= param$minimum_maxdist & !stats$bad_bin]
valid_edges
}
singleAnchorHits <- function(object, read_pair){
if(length(read_pair) < 1) return(NULL)
a <- ampliconRanges(object)
hitfirst <- findOverlaps(first(read_pair), a, select="all", maxgap=1e3)
g <- nonampliconRanges(object)
hitlast <- findOverlaps(last(read_pair), g, select="all", maxgap=1e3)
hitlist <- hitsWithSameQuery(list(hitfirst, hitlast))
if(!identicalReadQueries(hitlist)){
hitlist <- dropDuplicatedQueryHits(hitlist)
}
hitlist <- hitsWithDifferentTargets(hitlist)
edges <- orderEdgesByIndexOfOverlap(hitlist, ranges1=a, ranges2=g)
supporting_read_pair <- read_pair[queryHits(hitlist[[1]])]
names(supporting_read_pair) <- edges
supporting_read_pair
}
addDuplications <- function(object, granges, edges){
if(length(granges) > 0){
index <- match(unique(names(granges)), names(ranges(object)))
new_nodes <- names(ranges(object))[index]
object <- addNode(new_nodes, object, edges)
}
object
}
linkFocalDups <- function(object, rp, params){
LOW_THR <- params[["LOW_THR"]]
edgeParam <- params[["edge"]]
if(totalWidth(queryRanges(object))==0) return(object)
anchors <- numberAnchored(object, rp)
se_anchors <- singleEndAnchors(anchors)
e <- singleAnchorHits(object, se_anchors)
if(is.null(e)) return(object)
keep <- edgeFilters(e, param=edgeParam)
e <- e[names(e) %in% keep]
if(length(e)==0) return(object)
candidates <- ranges(object)[node2(names(e))]
is_dup <- isDuplication(candidates, minimum_foldchange=LOW_THR)
e <- e[is_dup]
candidates <- candidates[is_dup]
object <- addDuplications(object, candidates, unique(names(e)))
object
}
orderEdgesByIndexOfOverlap <- function(hitlist, ranges1, ranges2){
if(missing(ranges2)) ranges2 <- ranges1
j <- subjectHits(hitlist[[1]])
k <- subjectHits(hitlist[[2]])
jk <- cbind(pmin(j, k), pmax(j,k))
edges <- paste(names(ranges1)[jk[,1]],
names(ranges2)[jk[,2]],
sep="-")
edges
}
hitsWithSameQuery <- function(hitlist){
qhits <- intersect(queryHits(hitlist[[1]]), queryHits(hitlist[[2]]))
hitlist <- lapply(hitlist, function(x, index) x[queryHits(x) %in% index], index=qhits)
hitlist
}
hitsWithDifferentTargets <- function(hitlist){
j <- subjectHits(hitlist[[1]])
k <- subjectHits(hitlist[[2]])
notequal <- j != k
hitlist <- lapply(hitlist, "[", i=notequal)
hitlist
}
identicalReadQueries <- function(hitlist)
identical(queryHits(hitlist[[1]]), queryHits(hitlist[[2]]))
dropDuplicatedQueryHits <- function(hitlist){
qh1 <- queryHits(hitlist[[1]])
qh2 <- queryHits(hitlist[[2]])
d1 <- duplicated(qh1)
d2 <- duplicated(qh2)
hitlist[[1]] <- hitlist[[1]][!d1, ]
hitlist[[2]] <- hitlist[[2]][!d2, ]
hitlist
}
twoAnchorHits <- function(object, read_pair){
if(length(read_pair) < 1) return(NULL)
a <- ampliconRanges(object)
hitfirst <- findOverlaps(first(read_pair), a, select="all", maxgap=1e3)
hitlast <- findOverlaps(last(read_pair), a, select="all", maxgap=1e3)
hitlist <- hitsWithSameQuery(list(hitfirst, hitlast))
if(!identicalReadQueries(hitlist)){
hitlist <- dropDuplicatedQueryHits(hitlist)
}
hitlist <- hitsWithDifferentTargets(hitlist)
edges <- orderEdgesByIndexOfOverlap(hitlist, ranges1=a)
supporting_read_pair <- read_pair[queryHits(hitlist[[1]])]
names(supporting_read_pair) <- edges
supporting_read_pair
}
linkAmplicons <- function(object, rp, edgeParam=FilterEdgeParam()){
rp <- rp[overlapsAny(rp, ampliconRanges(object), maxgap=1e3)]
if(numNodes(object)<2) return(object)
anchors_rp <- numberAnchored(object, rp)
e <- twoAnchorHits(object, bothEndAnchors(anchors_rp))
if(length(e)==0) return(object) ##there are no improper pairs linking amplicons
e <- e[names(e) %in% edgeFilters(e, param=edgeParam)]
if(length(e)==0) return(object)
tabe <- table(names(e))
keep <- names(tabe)[tabe >= edgeParam$freq]
if (length(keep) == 0) return(object)
from <- node1(keep)
to <- node2(keep)
existing <- edges(object)[from] ## is 'to' in any of the existing edges
edge_exists <- mapply(function(to, existing) to %in% existing,
to=to, existing=existing)
if(!all(edge_exists)){
graph(object) <- addEdge(node1(keep[!edge_exists]),
node2(keep[!edge_exists]), graph(object))
}
object
}
linkNearAmplicons <- function(object, maxgap=500e3){
hits <- findOverlaps(ampliconRanges(object), maxgap=maxgap)
hits <- hits[!isSelfHit(hits)]
hits <- hits[!isRedundantHit(hits)]
if(length(hits)==0) return(object)
new_edges <- paste(names(ampliconRanges(object))[queryHits(hits)],
names(ampliconRanges(object))[subjectHits(hits)],
sep="-")
from <- node1(new_edges)
to <- node2(new_edges)
existing <- edges(object)[from] ## is 'to' in any of the existing edges
edge_exists <- mapply(function(to, existing) to %in% existing,
to=to, existing=existing)
new_edges <- new_edges[!edge_exists]
if(length(new_edges)==0) return(object)
graph(object) <- addEdge(node1(new_edges),
node2(new_edges), graph(object))
object
}
germline <- function(object){
reduce(c(germlineCNV(object), outliers(object)))
}
#' Identify lower copy focal amplicons (possibly duplicatons) not already
#' annotated as amplicons and not contained within the germline filters
#'
#' Select all duplicated segments (see \code{isDuplication}) less than `maxgap`
#' kb from the seed amplicons of the graph that are not within a germline CNV or
#' outlier. Germline CNVs and outliers can be conveniently extracted from the
#' graph object as given in the examples.
#'
#' @export
#'
#' @param object A \code{AmpliconGraph}
#' @param params a list of parameters such as that generated by \code{ampliconParams}.
#' @return a reduced \code{GRanges} of lower-copy amplicons
#' @seealso \code{\link{ampliconParams}}
#' @examples
#' ## load a previously saved AmpliconGraph from a regression unit test
#' \dontrun{
#' path <- system.file("testthat", package="trellis")
#' ag <- readRDS(file.path(path, "addFocalDups.ffab104.rds"))
#' params <- ampliconParams()
#' focalAmpliconDupRanges(ag, params)
#' }
#' @export
focalAmpliconDupRanges <- function(object, params){
LOW_THR <- params[["LOW_THR"]]
MAX_SIZE <- params[["maxgap"]]
g <- ampliconRanges(object)
is_dup <- isDuplication(ranges(object), minimum_foldchange=LOW_THR)
## remove any lower copy ranges that are very large
dup_g <- ranges(object)[is_dup]
dup_g <- dup_g[width(dup_g) < MAX_SIZE]
g <- sort(c(dup_g, g))
##
## TODO: add expansion to params
##
if(length(g) > 0){
g <- expandGRanges(g, 1e3)
}
## we do not want to link germline events
germ <- germline(object)
##
## TODO: add expansion to params
##
germ <- expandGRanges(germ, 10e3)
germ <- reduce(germ)
g <- filterBy(g, germ, type="within")
reduce(g)
}
removeNode2 <- function (node, object) {
##object <- clearNode(node, object)
nN <- nodes(object)
node <- node[node %in% nN]
if(length(node) == 0) return(object)
wh <- match(node, nN)
gN <- nN[-wh]
nE <- object@edgeL
nE <- nE[gN]
object@edgeL <- nE
identical(names(nE), gN)
##nE <- object@edgeL[-wh]
nE2 <- lapply(nE, function(el) {
## Nodes that never linked to anything to begin with
if(length(el$edges) == 0) return(el)
oldN <- nN[el$edges]
result <- match(oldN, gN)
result <- as.integer(result[!is.na(result)])
el$edges <- result
el
})
object@nodes <- gN
object@edgeL <- nE2
object
}
filterSmallAmplicons <- function(object, MIN_SIZE=5e3){
ar <- reduce(ampliconRanges(object), min.gapwidth=5e3)
ar <- ar[width(ar) < 5e3]
if(length(ar) > 0){
j <- subjectHits(findOverlaps(ar, ranges(object)))
gN <- removeNode2(names(ranges(object))[j], graph(object))
graph(object) <- gN
}
object
}
updateRangesMetadata <- function(object, granges){
index <- match(names(granges), names(ranges(object)))
rg <- ranges(object)[-index]
rg2 <- sort(c(rg, granges))
ranges(object) <- rg2
object
}
#' Assign a group id for nodes that are linked
#'
#' Takes a list of amplicons (GRanges object) and an undirected graph
#' representation of the amplicons
#'
#' @export
#' @param gN a \code{AmpliconGraph} object
groupAmplicons <- function(gN){
V <- nodes(gN)
if(numEdges(gN) > 0){
kgroups <- kCliques(gN)
groups <- kgroups[[length(kgroups)]]
grouped.amplicon <- rep(NA, length(V))
for(k in seq_along(groups)){
grouped.amplicon[match(groups[[k]], V)] <- k
}
if(any(is.na(grouped.amplicon))){
grouped.amplicon[is.na(grouped.amplicon)] <- max(grouped.amplicon, na.rm=TRUE) +
seq_len(sum(is.na(grouped.amplicon)))
}
grouped.amplicons <- as.integer(factor(grouped.amplicon))
} else grouped.amplicon <- seq_len(length(V))
names(grouped.amplicon) <- V
factor(grouped.amplicon)
}
setAmpliconGroups <- function(object){
groups <- groupAmplicons(graph(object))
groups <- setNames(paste0("gr", groups), names(groups))
ar <- ampliconRanges(object)
groups <- groups[names(ar)]
ar$groups <- groups
object <- updateRangesMetadata(object, ar)
object
}
getGenes <- function(object, transcripts){
.hgnc <- setNames(rep(NA, length(object)), names(object))
hits <- findOverlaps(transcripts, object, type="within")
##genes <- hgnc(transcripts)[queryHits(hits)]
genes <- transcripts$gene_name[queryHits(hits)]
amp <- names(object)[subjectHits(hits)]
geneL <- lapply(split(genes, amp), function(x) paste(unique(x), collapse=", "))
genes <- unlist(geneL)
.hgnc[names(genes)] <- genes
object$hgnc <- as.character(.hgnc)
object
}
setGenes <- function(object, transcripts){
ar <- getGenes(ampliconRanges(object), transcripts)
object <- updateRangesMetadata(object, ar)
object
}
## Two levels of significance for genes:
## - clinical significance (synonymous with driver)
##
## - biologically significant: perhaps biologically significant but unkown
## clinical significance. This set of all clinically significant
## genes is a subset
driver_genes <- function(tx, clin_sign=FALSE){
if(clin_sign){
return(tx$gene_name[tx$cancer_connection])
}
tx$gene_name [ tx$biol_sign ]
}
getDrivers <- function(object, transcripts, clin_sign=TRUE){
known_drivers <- unique(driver_genes(transcripts, clin_sign=clin_sign))
genes <- object$hgnc
gene_list <- split(genes, object$groups)
gene_list <- lapply(gene_list, function(x){
x <- x[!is.na(x)]
if(length(x)==0) return(NA)
xx <- unlist(strsplit(x, ", "))
})
driver_in_grouped_amplicon <- lapply(gene_list, function(x){
x <- paste(x[x %in% known_drivers], collapse=", ")
if(x == "") x <- NA
x
})
driver_in_grouped_amplicon <- unlist(driver_in_grouped_amplicon)
if(all(is.na(driver_in_grouped_amplicon))) return(object)
drv <- driver_in_grouped_amplicon[!is.na(driver_in_grouped_amplicon)]
for(i in seq_along(drv)){
driver_group <- names(drv)[i]
if(clin_sign){
object$driver[object$groups == driver_group] <- drv[i]
} else {
object$biol_sign[ object$groups == driver_group ] <- drv[i]
}
}
if(clin_sign){
object$driver <- as.character(object$driver)
} else {
object$biol_sign <- as.character(object$biol_sign)
}
object
}
setDrivers <- function(object, transcripts, clin_sign=TRUE){
ar <- ampliconRanges(object)
ar <- getDrivers(ar, transcripts, clin_sign=clin_sign)
object <- updateRangesMetadata(object, ar)
object
}
#' Initialize a graph object for amplicon analyses
#'
#'
#' @param params a list of parameters
#' @param af a list of germline filters
#' @param segs a \code{GRanges} object from the segmentation of the normalized coverage
#' @seealso \code{\link{ampliconParams}}
#' @export
#' @examples
#' library(svfilters.hg19)
#' library(svbams)
#' library(Rsamtools)
#' data(germline_filters)
#' ##
#' ## read in some CNVs
#' ##
#' cv.extdata <- system.file("extdata", package="svbams")
#' segs <- readRDS(file.path(cv.extdata, "cgov44t_segments.rds"))
#' extdata <- system.file("extdata", package="svbams")
#' bview <- BamViews(bamPaths=file.path(extdata, "cgov44t_revised.bam"))
#' params <- ampliconParams()
#' germline_filters[["germline_cnv"]] <- GRanges()
#' germline_filters[["outliers"]] <- GRanges()
#' makeAGraph(segs, germline_filters, params)
makeAGraph <- function(segs, af, params){
segs$is_amplicon <- segs$seg.mean > params$AMP_THR
ag <- AmpliconGraph(ranges=segs,
filters=af,
params=params)
if (numNodes (ag) == 0) return (ag)
ag <- trimRangesOverlappingCentromere (ag, af[["centromeres"]])
ag
}
makeAGraph2 <- function(segs, af, params){
ag <- AmpliconGraph(ranges=segs,
filters=af,
params=params)
if (numNodes (ag) == 0) return (ag)
ag <- trimRangesOverlappingCentromere (ag, af[["centromeres"]])
ag
}
#' Construct an AmpliconGraph from a BamViews object
#'
#' This function constructs an \code{AmpliconGraph} from a
#' \code{BamViews} object for a single sample. By default, the seeds
#' of the graph are focal amplicons with fold-change of nearly 3
#' relative to the diploid genome (log2(2.75)). The threshold of
#' seeding amplicons can be adjusted by the \code{ampliconParams}
#' function. After seeding the graph with high-copy focal amplicons,
#' both neighboring (flanking) and distant low-copy focal amplicons
#' are added to the graph object. Next, improperly paired reads in
#' which both the first and last read align to any queryRange of the
#' graph object are parsed from the bam file ( this function will
#' throw an error if not all files in \code{bamPaths} exist). If 5 or
#' more improperly paired reads bridge a node to another node, these
#' amplicons are grouped. Further, if a low-copy amplicon is bridged
#' to an existing node, the low-copy amplicon will become a node in
#' the graph. Amplicon groups are defined by the edges between nodes,
#' where the edges represent improperly paired reads that support a
#' junction between two amplicons.
#'
#' REFACTORING: needs an example to step through. Perhaps an initial
#' graph object and then keep updating the graph object and associated
#' visualization with each step. Each of the low level functions in
#' sv_deletions should be exported to more fully document this
#' procedure.
#'
#' @seealso See \code{ampliconParams} for default parameters. The
#' wrapper \code{\link{sv_amplicon_exp}} constructs and saves an
#' \code{\linkS4class{AmpliconGraph}} for each sample in an
#' experiment.
#'
#' @export
#' @param bview a \code{BamViews} object
#'
#' @param segs a \code{GRanges} object of segments with log2
#' fold-changes consistent with amplification
#'
#' @param amplicon_filters a \code{list} of filters
#' @param params a list of parameters for the amplicon analysis
#' @param transcripts a \code{GRanges} object of transcripts
sv_amplicons <- function(bview, segs, amplicon_filters,
params, transcripts){
ag <- initialize_graph(segs, amplicon_filters, params)
REMOTE <- file.exists(bamPaths(bview))
if(!REMOTE) stop ("Path to BAM files is invalid")
##
## REFACTOR: could this step use the saved improper read pairs
##
rp <- get_readpairs(ag, bamPaths(bview))
ag <- addFocalDupsFlankingAmplicon(ag, rp, params)
queryRanges(ag) <- focalAmpliconDupRanges(ag, params)
irp <- get_improper_readpairs(ag, bamPaths(bview))
##
## At this point, focal duplications added to the graph have not
## been linked to any of the seeds
##
##paired_bin_filter <- af[["paired_bin_filter"]]
##param <- FilterEdgeParam(minimum_maxdist=50, bad_bins=paired_bin_filter)
##param <- FilterEdgeParam(minimum_maxdist=50, bad_bins=GRanges())
edge.p <- params[["edge"]]
ag <- linkFocalDups(ag, irp, params)
ag <- linkAmplicons(ag, irp, edgeParam=edge.p)
ag <- linkNearAmplicons(ag, maxgap=params[["maxgap"]])
ag <- filterSmallAmplicons (ag)
ag <- setAmpliconGroups (ag)
ag <- setGenes (ag, transcripts)
ag <- setDrivers (ag, transcripts, clin_sign=TRUE)
ag <- setDrivers (ag, transcripts, clin_sign=FALSE)
ag
}
initialize_graph <- function(segs, amplicon_filters, params){
ag <- makeAGraph(segs, amplicon_filters, params)
merged <- joinNearGRanges(ranges(ag), params)
names(merged) <- ampliconNames(merged)
## the names of the nodes no longer correspond to the range names
## stopifnot(nodes(ag) %in% names(tmp)), and so
## setAmpliconGroups fails
ranges(ag) <- merged
ag
}
ampliconFilters <- function(genome){
pkg <- paste0("svfilters.", genome)
data("germline_filters", package=pkg, envir=environment())
germline_filters <- get("germline_filters")
germline_filters
}
amplified_segments <- function(segments, params){
segments$is_amplicon <- segments$seg.mean > params$AMP_THR
segments
}
initialize_graph2 <- function(preprocess, filters, params){
ag <- makeAGraph2(preprocess$segments, filters, params)
merged <- joinNearGRanges(ranges(ag), params)
names(merged) <- ampliconNames(merged)
## the names of the nodes no longer correspond to the range names
## stopifnot(nodes(ag) %in% names(tmp)), and so
## setAmpliconGroups fails
ranges(ag) <- merged
ag
}
add_amplicons <- function(ag, bam.file, params){
proper_rp <- get_readpairs2(queryRanges(ag), bam.file)
ag <- addFocalDupsFlankingAmplicon(ag, proper_rp, params)
queryRanges(ag) <- focalAmpliconDupRanges(ag, params)
ag
}
#' Construct an AmpliconGraph from a BamViews object
#'
#' This function constructs an \code{AmpliconGraph} from a
#' \code{BamViews} object for a single sample. By default, the seeds
#' of the graph are focal amplicons with fold-change of nearly 3
#' relative to the diploid genome (log2(2.75)). The threshold of
#' seeding amplicons can be adjusted by the \code{ampliconParams}
#' function. After seeding the graph with high-copy focal amplicons,
#' both neighboring (flanking) and distant low-copy focal amplicons
#' are added to the graph object. Next, improperly paired reads in
#' which both the first and last read align to any queryRange of the
#' graph object are parsed from the bam file ( this function will
#' throw an error if not all files in \code{bamPaths} exist). If 5 or
#' more improperly paired reads bridge a node to another node, these
#' amplicons are grouped. Further, if a low-copy amplicon is bridged
#' to an existing node, the low-copy amplicon will become a node in
#' the graph. Amplicon groups are defined by the edges between nodes,
#' where the edges represent improperly paired reads that support a
#' junction between two amplicons.
#'
#'
#' @seealso See \code{ampliconParams} for default parameters. The
#' wrapper \code{\link{sv_amplicon_exp}} constructs and saves an
#' \code{\linkS4class{AmpliconGraph}} for each sample in an
#' experiment.
#'
#' @export
#' @param preprocess a list of the preprocessed data (see \code{preprocessData})
#' @param amplicon_filters a \code{GRanges} object of germline filters. If this
#' argument is not specified then a default set of germline filters will be used
#' from the \code{svfilters.hg19} package if the \code{genome} slot in \code{preprocess} is
#' set to "hg19" or from the \code{svfilters.hg18} package if it's set to "hg18".
#' @param params a list of parameters for the amplicon analysis. Default parameters
#' are given by \code{AmpliconParams()}.
#' @return an \code{AmpliconGraph} object
#' @examples
#' data(pdata)
#' pdata$bam.file <- system.file("extdata", "cgov44t_revised.bam",
#' package="svbams")
#' sv_amplicons2(pdata)
#' @export
sv_amplicons2 <- function(preprocess, amplicon_filters,
params=ampliconParams()){
if(missing(amplicon_filters)){
amplicon_filters <- ampliconFilters(preprocess$genome)
}
preprocess$segments <- amplified_segments(preprocess$segments, params)
ag <- initialize_graph2(preprocess, amplicon_filters, params)
if (length(ag) == 0) return(ag)
##
## Requires bam file -- can not work remotely
##
ag <- add_amplicons(ag, preprocess$bam.file, params)
improper_rp <- preprocess$read_pairs[["improper"]]
ag <- link_amplicons(ag, improper_rp, params)
tx <- loadTx(preprocess$genome)
ag <- annotate_amplicons(ag, tx)
ag
}
link_amplicons <- function(ag, improper_rp, params){
edge.p <- params[["edge"]]
ag <- linkFocalDups(ag, improper_rp, params)
ag <- linkAmplicons(ag, improper_rp, edgeParam=edge.p)
ag <- linkNearAmplicons(ag, maxgap=params[["maxgap"]])
ag <- filterSmallAmplicons (ag)
ag <- setAmpliconGroups (ag)
ag
}
annotate_amplicons <- function(ag, tx){
ag <- setGenes (ag, tx)
ag <- setDrivers (ag, tx, clin_sign=TRUE)
ag <- setDrivers (ag, tx, clin_sign=FALSE)
ag
}
#' Identify somatic amplicons and grouped amplicons
#'
#' This function identifies focal, somatic amplifications. As
#' amplicons are often grouped by the bridge-fusion-breakage cycles,
#' we represent the set of somatic amplicons identified for a sample
#' as a graph. The nodes of the graph are the amplicons and the edges
#' are the links between amplicons informed by paired reads. This
#' function is a wrapper for \code{sv_amplicons} that constructs an
#' \code{AmpliconGraph} for a given sample. As with the other
#' *Experiment functions, \code{sv_amplicon_exp} saves the
#' \code{AmpliconGraph} computed for each sample as in intermediate
#' file for quick recall. If the file exists, this function reads the
#' serialized R object from disk. To generate an \code{AmpliconGraph}
#' de novo, one must first delete the intermediate files (see
#' examples).
#'
#' @seealso See \code{\linkS4class{AmpliconGraph}} for methods
#' associated with the class and \code{\link{sv_amplicons}} for
#' construction of an \code{AmpliconGraph} for a single sample.
#'
#'
#' @param dirs character-vector of file paths for storing intermediate
#' files
#' @param bviews A \code{BamViews} object
#' @param grl A \code{GRangesList} of the segmented genomes (each
#' element is the \code{GRanges} for a sample)
#' @param amplicon_filters A list of germline filters and parameters. See the \code{germline_filters} object in the package \code{svfilters.<build>}.
#' @param params a list of parameters for the amplicon analysis
#' @param transcripts a \code{GRanges} object of the transcripts.
sv_amplicon_exp <- function(dirs, bviews, grl, amplicon_filters,
params=ampliconParams(),
transcripts){
ag_files <- file.path(dirs["2amplicons"],
paste0(colnames(bviews), ".rds"))
result_list <- setNames(vector("list",
length(ag_files)),
colnames(bviews))
for(i in seq_along(result_list)){
if(file.exists(ag_files[i])){
ag <- readRDS(ag_files[i])
result_list[[i]] <- ag
next()
}
ag <- sv_amplicons(bviews[, i], segs=grl[[i]],
amplicon_filters, params, transcripts)
result_list[[i]] <- ag
saveRDS(ag, file=ag_files[i])
}
result_list
}
recurrentAmplicons <- function(tx, grl, maxgap=5e3){
## tx is big. Make this smaller as a first step
tx <- subsetByOverlaps(tx, reduce(unlist(grl), min.gapwidth=maxgap))
gene_list <- split(tx, tx$gene_name)
## remove any element that has multiple chromosomes
nchroms <- sapply(gene_list, function(g) length(unique(chromosome(g))))
gene_list <- gene_list[nchroms == 1]
## A gene can have multiple entries. Try reduce
gene_list <- GRangesList(lapply(gene_list, reduce, min.gapwidth=maxgap))
##
## Add back the gene name
##
el <- elementNROWS(gene_list)
tx <- unlist(gene_list)
tx$gene_name <- rep(names(gene_list), el)
## ensure that 2 amplicons for a subject hitting a gene are only counted once
is_overlap_list <- lapply(grl, function(gr, tx, maxgap) {
overlapsAny(tx, gr, maxgap=maxgap)
}, maxgap=maxgap, tx=tx)
is_overlap <- do.call(cbind, is_overlap_list)
cnts <- rowSums(is_overlap)
tx <- tx[cnts > 1, ]
is_overlap <- is_overlap[ cnts > 1, ]
cnts <- cnts[ cnts > 1 ]
ids <- apply(is_overlap, 1, function(is_amplicon, id) {
paste(id[is_amplicon], collapse=",")
}, id=gsub(".bam", "", colnames(is_overlap)))
result <- data.frame(gene = tx$gene_name, freq=as.integer(cnts), id=ids)
##
## Gene coordinates
##
tx2 <- tx[tx$gene_name %in% result$gene]
tx2.list <- GRangesList(sapply(split(tx2, tx2$gene_name), reduce))
tx2 <- unlist(tx2.list)
tx2 <- tx2[result$gene]
stopifnot(identical(names(tx2), as.character(result$gene)))
result$chr <- chromosome(tx2)
result$start <- start(tx2)
result$end <- end(tx2)
result <- result[, c("gene", "chr", "start", "end", "freq", "id")]
rownames(result) <- NULL
result[order(result$freq, decreasing=TRUE), ]
}
#' Table of recurrent drivers
#'
#' Given a GRangesList of amplicons or deletions (each element of list is a
#' sample), tabulate the frequency of driver deletions or amplifications.
#'
#' @param grl a \code{GRangesList} organized by sample
#' @param transcripts a \code{GRanges} object of transcripts as provided by the
#' \code{svfilters.<ucsc_build>} packages
#' @param split A character string indicating the string used to separate
#' drivers. By default, we assume the drivers associated with a given deletion
#' or amplicon are separated by a comma followed by a space.
#' @return a \code{data.frame} of recurrent drivers
#' @export
recurrentDrivers <- function(grl, transcripts, split=", "){
driver_list <- sapply(grl, function(g){
dr <- as.character(g$driver)
if(length(dr) == 0) return(NULL)
dr <- dr[!is.na(dr)]
dr <- unlist(strsplit(dr, split))
unique(dr)
})
driver_list2 <- driver_list[ elementNROWS(driver_list) > 0 ]
df <- data.frame(id=rep(names(driver_list2), elementNROWS(driver_list2)),
gene=unlist(driver_list2))
dflist <- split(df, df$gene)
ids <- sapply(sapply(dflist, "[[", "id"), paste, collapse=",")
df <- data.frame(gene=names(dflist),
id=ids,
freq=elementNROWS(dflist))
tx <- transcripts[transcripts$gene_name %in% df$gene]
txlist <- split(tx, tx$gene_name)
tx <- unlist(reduce(txlist, min.gapwidth=2e3))
df$chr <- chromosome(tx)
df$start <- start(tx)
df$end <- end(tx)
df <- df[order(df$freq, decreasing=TRUE), ]
df
}
#' @include AllGenerics.R
NULL
## #' Plot amplicons as nodes and the links between amplicons as edges
## #'
## #' Linked amplicons are represented as a \code{AmpliconGraph}. With
## #' amplicons as nodes and edges representing links between amplicons
## #' from paired reads, a \code{graphNEL} object encapsulates this data.
## #' Plotting utilities in the \code{Rgraphviz} package can be used to
## #' visualize these graphs.
## #'
## #' @export
## #'
## #' @seealso See \code{\link{sv_amplicons}} for constructing
## #' an \code{AmpliconGraph} object. See
## #' \code{\link{qualitativeColors}} for a list of qualitative
## #' colors. See \code{\link[graph]{graphNEL-class}} for details on
## #' graph representation.
## #'
## #' @param ag a \code{AmpliconGraph} object
## #' @param col_list a list of color palettes
## plot_amplicons <- function(ag, col_list=qualitativeColors()){
## if(length(ag)==0){
## message("No amplicons -- nothing to plot")
## return(invisible())
## }
## dark_colors <- c("#332288", "#661100", "#882255", "#AA4499")
## ar <- nodes(ag)
## chroms <- sapply(strsplit(ar, ":"), "[", 1)
## sl <- unique(chroms)
## L <- length(sl)
## L <- ifelse(L > 12, 12, L)
## sl <- factor(chroms, levels=sl)
## color_nodes <- col_list[[L]][as.integer(sl)]
## g1 <- graph(ag)
## color_nodes <- setNames(color_nodes, nodes(g1))
## nodenames <- setNames(nodes(g1), nodes(g1))
## text_col <- setNames(rep("black", length(nodes(g1))), nodes(g1))
## text_col[color_nodes %in% dark_colors] <- "gray90"
## nodeRenderInfo(g1) <- list(label=nodenames, fill=color_nodes, textCol=text_col)
## nodeAttrs <- list(fillcolor=color_nodes)
## attrs <- list(node=list(shape="rectangle",
## fixedsize=FALSE),
## graph=list(rankdir="LR"))
## graph_object <- layoutGraph(g1,
## attrs=attrs,
## nodeAttrs=nodeAttrs)
## graph_object
## }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.