#' GRanges representing the breakend coordinates of
#' structural variants
#' #@export
#setClass("BreakpointGRanges", contains="GRanges")
#' Partner breakend for each breakend.
#'
#' @details
#' All breakends must have their partner breakend included
#' in the GRanges.
#'
#' @param gr GRanges object of SV breakends
#' @param selfPartnerSingleBreakends treat single breakends as their own partner.
#' @return A GRanges object in which each entry is the partner breakend of
#' those in the input object.
#' @examples
#' #reading in a VCF file as \code{vcf}
#' vcf.file <- system.file("extdata", "gridss.vcf", package = "StructuralVariantAnnotation")
#' vcf <- VariantAnnotation::readVcf(vcf.file, "hg19")
#' #parsing \code{vcf} to GRanges object \code{gr}
#' gr <- breakpointRanges(vcf)
#' #output partner breakend of each breakend in \code{gr}
#' partner(gr)
#'@export
partner <- function(gr, selfPartnerSingleBreakends=FALSE) {
.assertValidBreakpointGRanges(gr, allowSingleBreakends=selfPartnerSingleBreakends)
return(gr[ifelse(selfPartnerSingleBreakends & is.na(gr$partner), names(gr), gr$partner),])
}
#' Determines whether this breakend has a valid partner in this
#' GRanges
#'
#' @param gr GRanges object of SV breakends
#' @param selfPartnerSingleBreakends treat single breakends as their own partner.
#' @return True/False for each row in the breakpoint GRanges
#' @examples
#' #Subset to chromosome 6 intra-chromosomal events \code{vcf}
#' vcf.file <- system.file("extdata", "COLO829T.purple.sv.ann.vcf.gz",
#' package = "StructuralVariantAnnotation")
#' vcf <- VariantAnnotation::readVcf(vcf.file)
#' gr <- breakpointRanges(vcf)
#' gr <- gr[seqnames(gr) == "6"]
#' # We now need to filter out inter-chromosomal events to ensure
#' # our GRanges doesn't contain any breakpoints whose partner
#' # has already been filtered out and no longer exists in the GRanges.
#' gr <- gr[hasPartner(gr)]
#'@export
hasPartner <- function(gr, selfPartnerSingleBreakends=FALSE) {
return((is.na(gr$partner) & selfPartnerSingleBreakends) |
(!is.na(gr$partner) & gr$partner %in% names(gr)))
}
#' Finding overlapping breakpoints between two breakpoint sets
#'
#' @details
#' \code{findBreakpointOverlaps()} is an efficient adaptation of \code{findOverlaps-methods()}
#' for breakend ranges. It searches for overlaps between breakpoint objects, and return a
#' matrix including index of overlapping ranges as well as error stats.
#' All breakends must have their partner breakend included in the \code{partner}
#' field. A valid overlap requires that breakends on boths sides meets the overlapping
#' requirements.
#'
#' See GenomicRanges::findOverlaps-methods for details of overlap calculation.
#'
#' @param query,subject Both of the input objects should be GRanges objects.
#' Unlike \code{findOverlaps()}, \code{subject} cannot be ommitted. Each breakpoint
#' must be accompanied with a partner breakend, which is also in the GRanges, with the
#' partner's id recorded in the \code{partner} field.
#' See GenomicRanges::findOverlaps-methods for details.
#' @param maxgap,minoverlap Valid overlapping thresholds of a maximum gap and a minimum
#' overlapping positions between breakend intervals. Both should be scalar integers. maxgap
#' allows non-negative values, and minoverlap allows positive values.
#' See GenomicRanges::findOverlaps-methods for details.
#' @param ignore.strand Default value is FALSE. strand information is ignored when set to
#' TRUE.
#' See GenomicRanges::findOverlaps-methods for details.
#' @param sizemargin Error margin in allowable size to prevent matching of events
#' of different sizes, e.g. a 200bp event matching a 1bp event when maxgap is
#' set to 200.
#' @param restrictMarginToSizeMultiple Size restriction multiplier on event size.
#' The default value of 0.5 requires that the breakpoint positions can be off by
#' at maximum, half the event size. This ensures that small deletion do actually
#' overlap at least one base pair.
#' @examples
#' #reading in VCF files
#' query.file <- system.file("extdata", "gridss-na12878.vcf", package = "StructuralVariantAnnotation")
#' subject.file <- system.file("extdata", "gridss.vcf", package = "StructuralVariantAnnotation")
#' query.vcf <- VariantAnnotation::readVcf(query.file, "hg19")
#' subject.vcf <- VariantAnnotation::readVcf(subject.file, "hg19")
#' #parsing vcfs to GRanges objects
#' query.gr <- breakpointRanges(query.vcf)
#' subject.gr <- breakpointRanges(subject.vcf)
#' #find overlapping breakpoint intervals
#' findBreakpointOverlaps(query.gr, subject.gr)
#' findBreakpointOverlaps(query.gr, subject.gr, ignore.strand=TRUE)
#' findBreakpointOverlaps(query.gr, subject.gr, maxgap=100, sizemargin=0.5)
#' @return A dataframe containing index and error stats of overlapping breakpoints.
#'@export
findBreakpointOverlaps <- function(query, subject, maxgap=-1L, minoverlap=0L, ignore.strand=FALSE, sizemargin=NULL, restrictMarginToSizeMultiple=NULL) {
.assertValidBreakpointGRanges(query)
.assertValidBreakpointGRanges(subject)
pquery = partner(query)
squery = partner(subject)
localhits = findOverlaps(query, subject, maxgap=maxgap, minoverlap=minoverlap, type="any", select="all", ignore.strand=ignore.strand)
remotehits = findOverlaps(pquery, squery, maxgap=maxgap, minoverlap=minoverlap, type="any", select="all", ignore.strand=ignore.strand)
## duplicated() version:
#hits = Hits(c(S4Vectors::queryHits(localhits), S4Vectors::queryHits(remotehits)), c(S4Vectors::subjectHits(localhits), S4Vectors::subjectHits(remotehits)), nLnode=nLnode(localhits), nRnode=nRnode(localhits), sort.by.query=TRUE)
#hits = hits[duplicated(hits)]
## intersect() version:
hits = BiocGenerics::intersect(localhits, remotehits)
## dplyr() version:
#hits <- dplyr::bind_rows(
# as.data.frame(localhits, row.names=NULL),
# as.data.frame(remotehits, row.names=NULL))
#hits = hits %>% dplyr::arrange(queryHits, subjectHits) %>%
# dplyr::filter(!is.na(dplyr::lead(.$queryHits)) & !is.na(dplyr::lead(.$subjectHits)) & dplyr::lead(.$queryHits) == .$queryHits & dplyr::lead(.$subjectHits) == .$subjectHits)
## dplyr() exploiting the sorted nature of the findOverlaps():
#hits = Hits(c(S4Vectors::queryHits(localhits), S4Vectors::queryHits(remotehits)), c(S4Vectors::subjectHits(localhits), S4Vectors::subjectHits(remotehits)), nLnode=nLnode(localhits), nRnode=nRnode(localhits), sort.by.query=TRUE)
#queryLead = dplyr::lead(S4Vectors::queryHits(hits))
#querySubject = dplyr::lead(S4Vectors::queryHits(hits))
#hits = hits[
# !is.na(queryLead) &d
# !is.na(querySubject) &
# queryLead == S4Vectors::queryHits(hits) &
# querySubject == S4Vectors::subjectHits(hits)]
if (!is.null(sizemargin) && !is.na(sizemargin)) {
# take into account confidence intervals when calculating event size
callwidth <- .distance(query, pquery)
truthwidth <- .distance(subject, squery)
callsize <- callwidth + .replaceNa(query$insLen, 0)
truthsize <- truthwidth + .replaceNa(subject$insLen, 0)
sizeerror <- .distance(
IRanges::IRanges(start=callsize$min[S4Vectors::queryHits(hits)], end=callsize$max[S4Vectors::queryHits(hits)]),
IRanges::IRanges(start=truthsize$min[S4Vectors::subjectHits(hits)], end=truthsize$max[S4Vectors::subjectHits(hits)])
)$min
# event sizes must be within sizemargin
hits <- hits[sizeerror - 1 < sizemargin * pmin(callsize$max[S4Vectors::queryHits(hits)], truthsize$max[S4Vectors::subjectHits(hits)]),]
# further restrict breakpoint positions for small events
localbperror <- .distance(query[S4Vectors::queryHits(hits)], subject[S4Vectors::subjectHits(hits)])$min
remotebperror <- .distance(pquery[S4Vectors::queryHits(hits)], squery[S4Vectors::subjectHits(hits)])$min
if (!is.null(restrictMarginToSizeMultiple)) {
allowablePositionError <- (pmin(callsize$max[S4Vectors::queryHits(hits)], truthsize$max[S4Vectors::subjectHits(hits)]) * restrictMarginToSizeMultiple + 1)
hits <- hits[localbperror <= allowablePositionError & remotebperror <= allowablePositionError, ]
}
}
return(hits)
}
# TODO: new function to annotate a Hits object with sizeerror, localbperror, and remotebperror
#' @noRd
.distance <- function(r1, r2) {
return(data.frame(
min=pmax(0, pmax(start(r1), start(r2)) - pmin(end(r1), end(r2))),
max=pmax(end(r2) - start(r1), end(r1) - start(r2))))
}
#' Counting overlapping breakpoints between two breakpoint sets
#'
#' @details
#' \code{countBreakpointOverlaps()} returns the number of overlaps between breakpoint
#' objects, based on the output of \code{findBreakpointOverlaps()}.
#' See GenomicRanges::countOverlaps-methods
#' @param querygr,subjectgr,maxgap,minoverlap,ignore.strand,sizemargin,restrictMarginToSizeMultiple
#' See \code{findBreakpointOverlaps()}.
#' @param countOnlyBest Default value set to FALSE. When set to TRUE, the result count
#' each subject breakpoint as overlaping only the best overlapping query breakpoint.
#' The best breakpoint is considered to be the one with the highest QUAL score.
#' @param breakpointScoreColumn Query column defining a score for determining which query breakpoint
#' is considered the best when countOnlyBest=TRUE.
#' @examples
#' truth_vcf = VariantAnnotation::readVcf(system.file("extdata", "na12878_chr22_Sudmunt2015.vcf",
#' package = "StructuralVariantAnnotation"))
#' crest_vcf = VariantAnnotation::readVcf(system.file("extdata", "na12878_chr22_crest.vcf",
#' package = "StructuralVariantAnnotation"))
#' caller_bpgr = breakpointRanges(crest_vcf)
#' caller_bpgr$true_positive = countBreakpointOverlaps(caller_bpgr, breakpointRanges(truth_vcf),
#' maxgap=100, sizemargin=0.25, restrictMarginToSizeMultiple=0.5, countOnlyBest=TRUE)
#' @return An integer vector containing the tabulated query overlap hits.
#' @export
countBreakpointOverlaps <- function(querygr, subjectgr, countOnlyBest=FALSE,
breakpointScoreColumn = "QUAL", maxgap=-1L,
minoverlap=0L, ignore.strand=FALSE, sizemargin=NULL,
restrictMarginToSizeMultiple=NULL) {
hitscounts <- rep(0, length(querygr))
hits <- as.data.frame(findBreakpointOverlaps(querygr, subjectgr, maxgap, minoverlap, ignore.strand, sizemargin=sizemargin, restrictMarginToSizeMultiple=restrictMarginToSizeMultiple))
if (!countOnlyBest) {
hits <- hits %>%
dplyr::group_by(.data$queryHits) %>%
dplyr::summarise(n=dplyr::n())
} else {
# assign supporting evidence to the call with the highest QUAL
hits$QUAL <- S4Vectors::mcols(querygr)[[breakpointScoreColumn]][hits$queryHits]
hits <- hits %>%
dplyr::arrange(dplyr::desc(.data$QUAL), .data$queryHits) %>%
dplyr::distinct(.data$subjectHits, .keep_all=TRUE) %>%
dplyr::group_by(.data$queryHits) %>%
dplyr::summarise(n=dplyr::n())
}
hitscounts[hits$queryHits] <- hits$n
return(hitscounts)
}
#' Converts a breakpoint GRanges object to a Pairs object
#' @param bpgr breakpoint GRanges object
#' @param writeQualAsScore write the breakpoint GRanges QUAL field as the score
#' fields for compatibility with BEDPE rtracklayer export
#' @param writeName write the breakpoint GRanges QUAL field as the score
#' fields for compatibility with BEDPE rtracklayer export
#' @param bedpeName function that returns the name to use for the breakpoint.
#' Defaults to the sourceId, name column, or row names (in that priority) of
#' the first breakend of each pair.
#' @param firstInPair function that returns TRUE for breakends that are considered
#' the first in the pair, and FALSE for the second in pair breakend. By default,
#' the first in the pair is the breakend with the lower ordinal in the breakpoint
#' GRanges object.
#' @examples
#' vcf.file <- system.file("extdata", "gridss.vcf", package = "StructuralVariantAnnotation")
#' bpgr <- breakpointRanges(VariantAnnotation::readVcf(vcf.file))
#' pairgr <- breakpointgr2pairs(bpgr)
#' #rtracklayer::export(pairgr, con="example.bedpe")
#' @return Pairs GRanges object suitable for export to BEDPE by rtracklayer
#' @rdname pairs2breakpointgr
#' @export
breakpointgr2pairs <- function(
bpgr,
writeQualAsScore=TRUE,
writeName=TRUE,
bedpeName = NULL,
firstInPair = NULL) {
.assertValidBreakpointGRanges(bpgr, "Cannot convert breakpoint GRanges to Pairs: ", allowSingleBreakends=FALSE)
if (is.null(bedpeName)) {
bedpeName = function(gr) { .replaceNull(.replaceNull(gr$sourceId, gr$name), names(gr)) }
}
if (is.null(firstInPair)) {
firstInPair = function(gr) { seq_along(gr) < match(gr$partner, names(gr)) }
}
isFirst = firstInPair(bpgr)
pairgr = S4Vectors::Pairs(bpgr[isFirst], partner(bpgr)[isFirst])
if (writeName) {
S4Vectors::mcols(pairgr)$name = bedpeName(S4Vectors::first(pairgr))
}
if (writeQualAsScore) {
S4Vectors::mcols(pairgr)$score = S4Vectors::first(pairgr)$QUAL
}
return(pairgr)
}
#' @noRd
.assertValidBreakpointGRanges <- function(bpgr, friendlyErrorMessage="", allowSingleBreakends=TRUE) {
if (is.null(names(bpgr))) {
stop(paste0(friendlyErrorMessage, "Breakpoint GRanges require names"))
}
if (any(is.na(names(bpgr)))) {
stop(paste0(friendlyErrorMessage, "Breakpoint GRanges names cannot be NA"))
}
if (any(duplicated(names(bpgr)))) {
stop(paste0(friendlyErrorMessage, "Breakpoint GRanges names cannot duplicated"))
}
if (!allowSingleBreakends & any(is.na(bpgr$partner))) {
stop(paste0(friendlyErrorMessage, "Breakpoint GRanges contains single breakends"))
}
if (any(duplicated(bpgr$partner) & !is.na(bpgr$partner))) {
stop(paste0(friendlyErrorMessage,
"Multiple breakends with the sample partner identified. ",
"Breakends with multiple partners not currently supported by Breakpoint GRanges."))
}
else if (!all(is.na(bpgr$partner) | (bpgr$partner %in% names(bpgr) & names(bpgr) %in% bpgr$partner))) {
stop(paste0(friendlyErrorMessage,
"Unpartnered breakpoint found. ",
"All breakpoints must contain a partner in the breakpoint GRanges."))
}
}
#' Converts a BEDPE Pairs containing pairs of GRanges loaded using to a breakpoint GRanges object.
#' @details
#' Breakpoint-level column names will override breakend-level column names.
#' @param pairs a Pairs object consisting of two parallel genomic loci.
#' @param placeholderName prefix to use to ensure each entry has a unique ID.
#' @param firstSuffix first in pair name suffix to ensure breakend name uniqueness
#' @param secondSuffix second in pair name suffix to ensure breakend name uniqueness
#' @param nameField Fallback field for row names if the Pairs object does not contain any names.
#' BEDPE files loaded using rtracklayer use the "name" field.
#' @param renameScoreToQUAL renames the 'score' column to 'QUAL'.
#' Performing this rename results in a consistent variant quality score column
#' name for variant loaded from BEDPE and VCF.
#' @examples
#' bedpe.file <- system.file("extdata", "gridss.bedpe", package = "StructuralVariantAnnotation")
#' bedpe.pairs <- rtracklayer::import(bedpe.file)
#' bedpe.bpgr <- pairs2breakpointgr(bedpe.pairs)
#' @return Breakpoint GRanges object.
#' @export
pairs2breakpointgr <- function(
pairs,
placeholderName="bedpe",
firstSuffix="_1", secondSuffix="_2",
nameField="name",
renameScoreToQUAL=TRUE) {
n <- names(pairs)
if (is.null(n)) {
# BEDPE uses the "name" field
if (nameField %in% names(S4Vectors::mcols(pairs))) {
n <- S4Vectors::mcols(pairs)[[nameField]]
mcols(pairs)$sourceId <- n
} else {
n <- rep(NA_character_, length(pairs))
}
}
# ensure row names are unique
n <- ifelse(is.na(n) | n == "" | n =="." | duplicated(n), paste0(placeholderName, seq_along(n)), n)
#
gr <- c(S4Vectors::first(pairs), S4Vectors::second(pairs))
names(gr) <- c(paste0(n, firstSuffix), paste0(n, secondSuffix))
gr$partner <- c(paste0(n, secondSuffix), paste0(n, firstSuffix))
for (col in names(S4Vectors::mcols(pairs))) {
if (col %in% nameField) {
# drop columns we have processed
} else {
S4Vectors::mcols(gr)[[col]] <- S4Vectors::mcols(pairs)[[col]]
}
}
if (renameScoreToQUAL) {
names(mcols(gr))[which(names(mcols(gr)) == "score")] <- "QUAL"
}
return(gr)
}
#' Extracts the breakpoint sequence.
#'
#' @details
#' The sequence is the sequenced traversed from the reference anchor bases
#' to the breakpoint. For backward (-) breakpoints, this corresponds to the
#' reverse compliment of the reference sequence bases.
#'
#' @param gr breakpoint GRanges
#' @param ref Reference BSgenome
#' @param anchoredBases Number of bases leading into breakpoint to extract
#' @param remoteBases Number of bases from other side of breakpoint to extract
#' @return Breakpoint sequence around the variant position.
#' @export
extractBreakpointSequence <- function(gr, ref, anchoredBases, remoteBases=anchoredBases) {
localSeq <- extractReferenceSequence(gr, ref, anchoredBases, 0)
insSeq <- ifelse(strand(gr) == "-",
as.character(Biostrings::reverseComplement(DNAStringSet(.replaceNa(gr$insSeq,"")))),
.replaceNa(gr$insSeq, ""))
remoteSeq <- as.character(Biostrings::reverseComplement(DNAStringSet(
extractReferenceSequence(partner(gr), ref, remoteBases, 0))))
return(paste0(localSeq, insSeq, remoteSeq))
}
#' Returns the reference sequence around the breakpoint position
#'
#' @details
#' The sequence is the sequenced traversed from the reference anchor bases
#' to the breakpoint. For backward (-) breakpoints, this corresponds to the
#' reverse compliment of the reference sequence bases.
#'
#' @param gr breakpoint GRanges
#' @param ref Reference BSgenome
#' @param anchoredBases Number of bases leading into breakpoint to extract
#' @param followingBases Number of reference bases past breakpoint to extract
#' @return Reference sequence around the breakpoint position.
#' @export
extractReferenceSequence <- function(gr, ref, anchoredBases, followingBases=anchoredBases) {
assertthat::assert_that(is(gr, "GRanges"))
assertthat::assert_that(is(ref, "BSgenome"))
gr <- .constrict(gr)
seqgr <- GRanges(seqnames=GenomeInfoDb::seqnames(gr), ranges=IRanges::IRanges(
start=start(gr) - ifelse(strand(gr) == "-", followingBases, anchoredBases - 1),
end=end(gr) + ifelse(strand(gr) == "-", anchoredBases - 1, followingBases)))
startPad <- pmax(0, 1 - start(seqgr))
endPad <- pmax(0, end(seqgr) - GenomeInfoDb::seqlengths(ref)[as.character(GenomeInfoDb::seqnames(seqgr))])
GenomicRanges::ranges(seqgr) <- IRanges::IRanges(start=start(seqgr) + startPad, end=end(seqgr) - endPad)
seq <- Biostrings::getSeq(ref, seqgr)
seq <- paste0(stringr::str_pad("", startPad, pad="N"), as.character(seq), stringr::str_pad("", endPad, pad="N"))
# DNAStringSet doesn't like out of bounds subsetting
seq <- ifelse(strand(gr) == "-", as.character(Biostrings::reverseComplement(DNAStringSet(seq))), seq)
return(seq)
}
#' constrict
#' @param gr GRanges object
#' @param ref reference
#' @param position only 'middle' position is accepted.
#' @return A constricted GRanges object.
#' @noRd
.constrict <- function(gr, ref=NULL,position="middle") {
isLower <- start(gr) < start(partner(gr))
# Want to call a valid breakpoint
# 123 456
#
# => <= + -
# > <== f f
#
# => => + +
# > ==> f c
roundDown <- isLower | strand(gr) == "-"
if (position == "middle") {
pos <- (start(gr) + end(gr)) / 2
GenomicRanges::ranges(gr) <- IRanges::IRanges(
start=ifelse(roundDown,floor(pos), ceiling(pos)),
width=1, names=names(gr))
} else {
stop(paste("Unrecognised position", position))
}
if (!is.null(ref)) {
GenomicRanges::ranges(gr) <- IRanges::IRanges(start=pmin(pmax(1, start(gr)), GenomeInfoDb::seqlengths(ref)[as.character(GenomeInfoDb::seqnames(gr))]), width=1)
}
return(gr)
}
#' Calculates the length of inexact homology between the breakpoint sequence
#' and the reference
#'
#' @param gr reakpoint GRanges
#' @param ref reference BSgenome
#' @param anchorLength Number of bases to consider for homology
#' @param margin Number of additional reference bases include. This allows
#' for inexact homology to be detected even in the presence of indels.
#' @param mismatch see Biostrings::pairwiseAlignment
#' @param gapOpening see Biostrings::pairwiseAlignment
#' @param gapExtension see Biostrings::pairwiseAlignment
#' @param match see Biostrings::pairwiseAlignment
#' @return A dataframe containing the length of inexact homology between the
#' breakpoint sequence and the reference.
#' @export
calculateReferenceHomology <- function(gr, ref,
anchorLength=300,
margin=5,
match=2, mismatch=-6, gapOpening=5, gapExtension=3 # bwa
#match = 1, mismatch = -4, gapOpening = 6, gapExtension = 1, # bowtie2
) {
# shrink anchor for small events to prevent spanning alignment
aLength <- .replaceNa(pmin(anchorLength, abs(gr$svLen) + 1), anchorLength)
anchorSeq <- extractReferenceSequence(gr, ref, aLength, 0)
anchorSeq <- sub(".*N", "", anchorSeq)
# shrink anchor with Ns
aLength <- nchar(anchorSeq)
varseq <- extractBreakpointSequence(gr, ref, aLength)
varseq <- sub("N.*", "", varseq)
bpLength <- nchar(varseq) - aLength
nonbpseq <- extractReferenceSequence(gr, ref, 0, bpLength + margin)
nonbpseq <- sub("N.*", "", nonbpseq)
refseq <- paste0(anchorSeq, nonbpseq)
partnerIndex <- match(gr$partner, names(gr))
if (all(refseq=="") && all(varseq=="")) {
# Workaround of Biostrings::pairwiseAlignment bug
return(data.frame(
exacthomlen=rep(NA, length(gr)),
inexacthomlen=rep(NA, length(gr)),
inexactscore=rep(NA, length(gr))))
}
aln <- Biostrings::pairwiseAlignment(varseq, refseq, type="local",
substitutionMatrix=nucleotideSubstitutionMatrix(match, mismatch, FALSE, "DNA"),
gapOpening=gapOpening, gapExtension=gapExtension, scoreOnly=FALSE)
ihomlen <- Biostrings::nchar(aln) - aLength - deletion(nindel(aln))[,2] - insertion(nindel(aln))[,2]
ibphomlen <- ihomlen + ihomlen[partnerIndex]
ibpscore <- score(aln) + score(aln)[partnerIndex] - 2 * aLength * match
# TODO: replace this with an efficient longest common substring function
# instead of S/W with a massive mismatch/gap penalty
penalty <- anchorLength * match
matchLength <- Biostrings::pairwiseAlignment(varseq, refseq, type="local",
substitutionMatrix=nucleotideSubstitutionMatrix(match, -penalty, FALSE, "DNA"),
gapOpening=penalty, gapExtension=0, scoreOnly=TRUE) / match
ehomlen <- matchLength - aLength
ebphomlen <- ehomlen + ehomlen[partnerIndex]
ebphomlen[aLength == 0] <- NA
ibphomlen[aLength == 0] <- NA
ibpscore[aLength == 0] <- NA
return(data.frame(
exacthomlen=ebphomlen,
inexacthomlen=ibphomlen,
inexactscore=ibpscore))
}
#' Converts to breakend notation
#' @param gr GRanges object.
#' @param insSeq insert sequence of the GRanges.
#' @param ref reference sequence of the GRanges.
#' @return breakendAlt or breakpointAlt depending on whether the variant is partnered.
#' @noRd
.toVcfBreakendNotationAlt = function(gr, insSeq=gr$insSeq, ref=gr$REF) {
assertthat::assert_that(all(width(gr) == 1))
assertthat::assert_that(!is.null(insSeq))
assertthat::assert_that(all(insSeq != ""))
assertthat::assert_that(!is.null(gr$partner))
isBreakpoint = !is.na(gr$partner)
breakendAlt = ifelse(as.character(strand(gr)) == "+", paste0(gr$insSeq, "."), paste0(".", gr$insSeq))
gr$partner[isBreakpoint] = names(gr)[isBreakpoint] # self partner to prevent errors
partnergr = gr[gr$partner]
partnerDirectionChar = ifelse(strand(partnergr) == "+", "]", "[")
breakpointAlt = ifelse(as.character(strand(gr)) == "+",
paste0(ref, insSeq, partnerDirectionChar, GenomeInfoDb::seqnames(partnergr), ":", start(partnergr), partnerDirectionChar),
paste0(partnerDirectionChar, GenomeInfoDb::seqnames(partnergr), ":", start(partnergr), partnerDirectionChar, insSeq, ref))
return (ifelse(isBreakpoint, breakpointAlt, breakendAlt))
}
#' Converts the given breakpoint GRanges object to VCF format in breakend
#' notation.
#'
#' @param gr breakpoint GRanges object. Can contain both breakpoint and single
#' breakend SV records.
#' @param ... For cbind and rbind a list of VCF objects. For all other methods
#' ... are additional arguments passed to methods. See VCF class in
#' VariantAnnotation for more details.
#' @return A VCF object.
breakpointGRangesToVCF <- function(gr, ...) {
if (is.null(gr$insSeq)) {
gr$insSeq = rep("", length(gr))
}
nominalgr = GRanges(seqnames=GenomeInfoDb::seqnames(gr),
ranges=IRanges::IRanges(start=(end(gr) + start(gr)) / 2,
width=1))
if (is.null(gr$REF)) {
gr$REF = rep("N", length(gr))
}
gr$ALT[is.na(gr$ALT)] = ""
if (is.null(gr$ALT)) {
gr$ALT = rep("", length(gr))
}
gr$ALT[is.na(gr$ALT)] = ""
gr$ALT[gr$ALT == ""] = .toVcfBreakendNotationAlt(gr)[gr$ALT == ""]
ciposstart = start(gr) - start(nominalgr)
ciposend = end(gr) - end(nominalgr)
vcf = VCF(rowRanges=nominalgr, collapsed=FALSE)
fixeddf = data.frame(
ALT=gr$ALT,
REF=gr$REF,
QUAL=gr$QUAL,
FILTER=gr$FILTER)
VariantAnnotation::VCF(rowRanges = GRanges(), colData = S4Vectors::DataFrame(),
exptData = list(header = VCFHeader()), fixed = S4Vectors::DataFrame(),
info = S4Vectors::DataFrame(), geno = S4Vectors::SimpleList(), ..., collapsed=FALSE,
verbose = FALSE)
}
#' Type of simplest explanation of event. Possible types are:
#' | Type | Description |
#' | BND | Single breakend |
#' | CTX | Interchromosomal translocation |
#' | INV | Inversion. |
#' | DUP | Tandem duplication |
#' | INS | Insertion |
#' | DEL | Deletion |
#'
#' Note that both ++ and -- breakpoint will be classified as inversions regardless of whether both breakpoint that consistitute an actual inversion exists or not
#'
#' @param gr breakpoint GRanges object
#' @param insertionLengthThreshold portion of inserted bases compared to total event size to be classified as an insertion.
#' For example, a 5bp deletion with 5 inserted bases will be classified as an INS event.
#' @return Type of simplest explanation of event
#' @export
simpleEventType <- function(gr, insertionLengthThreshold=0.5) {
if (is.null(gr$partner)) {
gr$partner = rep(NA_character_, length(gr))
}
pgr = partner(gr, selfPartnerSingleBreakends=TRUE)
return(
ifelse(is.na(gr$partner), "BND",
ifelse(seqnames(gr) != seqnames(pgr), "CTX", # inter-chromosomosal
ifelse(strand(gr) == strand(pgr), "INV",
ifelse(gr$insLen >= abs(simpleEventLength(gr)) * insertionLengthThreshold, "INS", # TODO: improve classification of complex events
ifelse(xor(start(gr) < start(pgr), strand(gr) == "-"), "DEL",
"DUP"))))))
}
#' Length of event if interpreted as an isolated breakpoint.
#' @param gr breakpoint GRanges object
#' @return Length of the simplest explanation of this breakpoint/breakend.
#' @export
simpleEventLength <- function(gr) {
if (is.null(gr$partner)) {
gr$partner = rep(NA_character_, length(gr))
}
pgr = partner(gr, selfPartnerSingleBreakends=TRUE)
return(
ifelse(seqnames(gr) != seqnames(pgr) | as.logical(strand(gr) == strand(pgr) | is.na(gr$partner)), NA_integer_,
gr$insLen + 1 + ifelse(as.logical(strand(gr) == "+"), start(gr) - start(pgr), start(pgr) - start(gr))))
}
#' Finds duplication events that are reported as inserts.
#' As sequence alignment algorithms do no allow backtracking, long read-based
#' variant callers will frequently report small duplication as insertion events.
#' Whilst both the duplication and insertion representations result in the same
#' sequence, this representational difference is problematic when comparing
#' variant call sets.
#'
#' WARNING: this method does not check that the inserted sequence actually matched the duplicated sequence.
#' @param query a breakpoint GRanges object
#' @param subject a breakpoint GRanges object
#' @param maxgap maximum distance between the insertion position and the duplication
#' @param maxsizedifference maximum size difference between the duplication and insertion.
#' @return Hits object containing the ordinals of the matching breakends
#' in the query and subject
#' @export
findInsDupOverlaps <- function(query, subject, maxgap=-1L, maxsizedifference=0L) {
.assertValidBreakpointGRanges(query)
.assertValidBreakpointGRanges(subject)
query$ordinal = seq_len(length(query))
subject$ordinal = seq_len(length(subject))
query$set = simpleEventType(query)
query$sel = simpleEventLength(query)
subject$set = simpleEventType(subject)
subject$sel = simpleEventLength(subject)
pquery = partner(query)
psubject = partner(subject)
query$isLowBreakend = start(query) < start(pquery) | (start(query) == start(pquery) & query$ordinal < pquery$ordinal)
subject$isLowBreakend = start(subject) < start(psubject) | (start(subject) == start(psubject) & subject$ordinal < psubject$ordinal)
qins_to_sdup = .findOverlaps_queryIns_subjectDup(query, subject, psubject, maxgap=maxgap, maxsizedifference=maxsizedifference)
sins_to_qdup = .findOverlaps_queryIns_subjectDup(subject, query, pquery, maxgap=maxgap, maxsizedifference=maxsizedifference)
lowhits = data.frame(
qhits=c(qins_to_sdup$queryHits, sins_to_qdup$subjectHits),
shits=c(qins_to_sdup$subjectHits, sins_to_qdup$queryHits))
# add upper to upper match
bothhits = S4Vectors::Hits(
from=c(lowhits$qhits, pquery$ordinal[lowhits$qhits]),
to=c(lowhits$shits, psubject$ordinal[lowhits$shits]),
nLnode=length(query),
nRnode=length(subject))
return(bothhits)
}
#' @noRd
.findOverlaps_queryIns_subjectDup <- function(query, subject, psubject , maxgap=-1L, maxsizedifference=0L) {
subject$HighEndPosition = end(psubject)
subject = subject[subject$set == "DUP" & subject$isLowBreakend]
end(subject) = subject$HighEndPosition
# expand by one since insertion can preceed, succeed, or be in the middle of the dup
start(subject) = start(subject) - 1
query = query[query$set == "INS" & query$isLowBreakend]
hits = findOverlaps(query, subject, maxgap=maxgap, ignore.strand=TRUE)
hits = hits[abs(query$sel[S4Vectors::queryHits(hits)] - subject$sel[S4Vectors::subjectHits(hits)]) <= maxsizedifference]
# TODO: filter by
# Translate back to ordinals of what was passed in to us
return(data.frame(
queryHits=query$ordinal[S4Vectors::queryHits(hits)],
subjectHits=subject$ordinal[S4Vectors::subjectHits(hits)]))
}
#' Identifies potential transitive imprecise calls that can be explained by
#' traversing multiple breakpoints.
#'
#' Transitive calls are imprecise breakpoints or breakpoints with inserted sequence
#' that can be explained by a sequence of breakpoints.
#' That is, A-C calls in which additional sequence may be between A and C that
#' can be explained by A-B-C.
#'
#' @param transitiveGr a breakpoint GRanges object containing imprecise calls
#' @param subjectGr breakpoints to traverse
#' @param maximumImpreciseInsertSize
#' Expected number of bases to traverse imprecise calls.
#' @param minimumTraversedBreakpoints
#' Minimum number of traversed breakpoints to consider a transitive
#' @param maximumTraversedBreakpoints
#' Maximum number of breakpoints to traverse when looking for an explanation of the transitive calls
#' @param positionalMargin
#' Allowable margin of error when matching call positional overlaps.
#' A non-zero margin allows for matching of breakpoint with imperfect homology.
#' @param insertionLengthMargin
#' Allowable difference in length between the inserted sequence and the traversed
#' path length.
#' Defaults to 50bp to allow for long read indel errors.
#' @param insLen
#' Integer vector of same length as `transitiveGr` indicating the number
#' of bases inserted at the breakpoint.
#'
#' Defaults to transitiveGr$insLen which will be present if the GRanges
#' was loaded from a VCF using breakpointRanges()
#' @param impreciseTransitiveCalls
#' Boolean vector of same length as `transitiveGr` indicating which calls
#' are imprecise calls. Defaults to calls with a non-zero interval size
#' that have no homology.
#' @param impreciseSubjectCalls
#' Boolean vector of same length as `subjectGr` indicating which calls
#' are imprecise calls. Defaults to calls with a non-zero interval size
#' that have no homology.
#' @param allowImprecise Allow traversal of imprecise calls.
#' Defaults to FALSE as to prevent spurious results which skip
#' some breakpoints when traversing multiple breakpoints
#' E.g. An A-D transitive from an underlying A-B-C-D rearrangement
#' will include A-B-D and A-C-D results if allowImprecise=TRUE.
#' @return `DataFrame` containing the transitive calls traversed with the following columns:
#' | column | meaning |
#' | ------ | ------- |
#' | transitive_breakpoint_name | Name of the transitive breakpoint a path was found for |
#' | total_distance | Total length (in bp) of the path |
#' | traversed_breakpoint_names | `CharacterList` of names of breakpoint traversed in the path |
#' | distance_to_traversed_breakpoint | `IntegerList` of distances from start of path to end of traversing breakpoint |
#' @export
findTransitiveCalls <- function(
transitiveGr,
subjectGr,
maximumImpreciseInsertSize=700,
minimumTraversedBreakpoints=2,
maximumTraversedBreakpoints=6,
positionalMargin=8,
insertionLengthMargin=50,
insLen=transitiveGr$insLen,
impreciseTransitiveCalls=(transitiveGr$HOMLEN == 0 | is.null(transitiveGr$HOMLEN)) & start(transitiveGr) != end(transitiveGr),
impreciseSubjectCalls=(subjectGr$HOMLEN == 0 | is.null(subjectGr$HOMLEN)) & start(subjectGr) != end(subjectGr),
allowImprecise=FALSE) {
if (is.null(insLen)) {
stop("Missing insLen")
}
transitiveGr$.isImprecise = impreciseTransitiveCalls
transitiveGr$insLen = insLen
transitiveGr = transitiveGr[hasPartner(transitiveGr)]
transitiveGr$.isImprecise = transitiveGr$.isImprecise | partner(transitiveGr)$.isImprecise
transitiveGr$minimumTransitiveLength = ifelse(insLen > 0, pmax(0, insLen - insertionLengthMargin), 0)
transitiveGr$maximumTransitiveLength = ifelse(insLen > 0, insLen + insertionLengthMargin, ifelse(impreciseTransitiveCalls, maximumImpreciseInsertSize, 0))
transitiveGr = transitiveGr[transitiveGr$maximumTransitiveLength > 0]
transitiveGr = transitiveGr[hasPartner(transitiveGr)]
transitiveGr$ordinal = seq_len(length(transitiveGr))
transitiveGr$partnerOrdinal = partner(transitiveGr)$ordinal
if (!allowImprecise) {
subjectGr = subjectGr[!impreciseSubjectCalls]
}
subjectGr = subjectGr[hasPartner(subjectGr)]
# centre-align subject intervals to simplify the traversal logic
start(subjectGr) = (start(subjectGr) + end(subjectGr)) / 2
if (is.null(subjectGr$insLen)) {
warning("insLen field missing. Assuming all traversed breakpoints have no inserted sequence")
subjectGr$insLen = 0
}
subjectGr$insLen = .replaceNa(subjectGr$insLen, 0)
subjectGr$ordinal = seq_len(length(subjectGr))
subjectGr$partnerOrdinal = partner(subjectGr)$ordinal
# transitive breakpoint must occur within the confidence interval bounds
terminal_matches = as.data.frame(GenomicRanges::findOverlaps(transitiveGr, subjectGr, maxgap=positionalMargin, ignore.strand=FALSE)) %>%
dplyr::select(
terminalStartOrdinal=.data$queryHits,
transitiveOrdinal=.data$subjectHits) %>%
dplyr::mutate(
terminalEndOrdinal=transitiveGr$partnerOrdinal[.data$terminalStartOrdinal])
current_traversals = dplyr::inner_join(terminal_matches, terminal_matches, by=c("terminalStartOrdinal"="terminalEndOrdinal"), suffix=c(".start", ".end")) %>%
dplyr::select(
terminalStartOrdinal=.data$terminalStartOrdinal,
currentTraverseInOrdinal=.data$transitiveOrdinal.start,
endingTraverseOutOrdinal=.data$transitiveOrdinal.end,
terminalEndOrdinal=.data$terminalEndOrdinal) %>%
dplyr::mutate(
currentTraverseOutOrdinal=subjectGr$partnerOrdinal[.data$currentTraverseInOrdinal],
distance=subjectGr$insLen[.data$currentTraverseInOrdinal],
# TAB is used as a placeholder as it's a disallowed character in VCF and causes a parsing error in BEDPE
breakpointsTraversed=paste0(names(subjectGr)[.data$currentTraverseInOrdinal], " "),
traversedDistances=paste0(.data$distance, " "),
traversedBreakpoints=1,
minimumTransitiveLength=transitiveGr$minimumTransitiveLength[.data$terminalStartOrdinal],
maximumTransitiveLength=transitiveGr$maximumTransitiveLength[.data$terminalStartOrdinal])
resultdf = data.frame(
terminalStartOrdinal=integer(0),
currentTraverseInOrdinal=integer(0),
endingTraverseOutOrdinal=integer(0),
terminalEndOrdinal=integer(0),
currentTraverseOutOrdinal=integer(0),
distance=integer(0),
ordinalsTraversed=character(0),
traversedDistances=character(0),
traversedBreakpoints=integer(0))
if (nrow(current_traversals > 0)) {
traversable_segments = .traversable_segments(subjectGr, max(current_traversals$maximumTransitiveLength))
# now we traverse
while (nrow(current_traversals) > 0) {
# Terminal paths
current_traversals = current_traversals %>%
dplyr::filter(.data$distance <= .data$maximumTransitiveLength & .data$traversedBreakpoints <= maximumTraversedBreakpoints) %>%
dplyr::mutate(is_complete_path=.data$currentTraverseOutOrdinal == .data$endingTraverseOutOrdinal)
resultdf = dplyr::bind_rows(
resultdf,
current_traversals %>% dplyr::filter(
.data$is_complete_path &
.data$distance >= .data$minimumTransitiveLength &
.data$traversedBreakpoints >= minimumTraversedBreakpoints))
# traverse
current_traversals = current_traversals %>%
dplyr::filter(!.data$is_complete_path) %>%
dplyr::inner_join(traversable_segments, by=c("currentTraverseInOrdinal"="segmentStartExternalOrdinal")) %>%
dplyr::mutate(
currentTraverseInOrdinal=.data$segmentEndInternalOrdinal,
currentTraverseOutOrdinal=.data$segmentEndExternalOrdinal,
distance=.data$distance + .data$segmentLength + .data$segmentEndAdditionalLength,
traversedBreakpoints=.data$traversedBreakpoints + 1,
traversedDistances=paste0(.data$traversedDistances, .data$distance, " "),
breakpointsTraversed=paste0(.data$breakpointsTraversed, names(subjectGr)[.data$segmentEndInternalOrdinal], " "))
# How can drop columns without a "findTransitiveCalls: no visible binding for global variable 'segmentStartAdditionalLength'" NOTE in R check?
# dplyr::select(
# -segmentStartInternalOrdinal,
# -segmentLength,
# -segmentEndInternalOrdinal,
# -segmentEndExternalOrdinal,
# -segmentStartAdditionalLength,
# -segmentEndAdditionalLength)
current_traversals$segmentStartInternalOrdinal = NULL
current_traversals$segmentLength = NULL
current_traversals$segmentEndInternalOrdinal = NULL
current_traversals$segmentEndExternalOrdinal = NULL
current_traversals$segmentStartAdditionalLength = NULL
current_traversals$segmentEndAdditionalLength = NULL
}
}
# transform results into long form
resultdf = resultdf %>%
dplyr::mutate(
transitive_breakpoint_name=names(transitiveGr)[.data$terminalStartOrdinal],
total_distance=as.integer(.data$distance),
# trim trailing tab
traversed_breakpoint_names=stringr::str_sub(.data$breakpointsTraversed, end=stringr::str_length(.data$breakpointsTraversed) - 1),
distance_to_traversed_breakpoint=stringr::str_sub(.data$traversedDistances, end=stringr::str_length(.data$traversedDistances) - 1)) %>%
dplyr::select(
transitive_breakpoint_name=.data$transitive_breakpoint_name,
total_distance=.data$total_distance,
traversed_breakpoint_names=.data$traversed_breakpoint_names,
distance_to_traversed_breakpoint=.data$distance_to_traversed_breakpoint) %>%
as.data.frame() %>%
S4Vectors::DataFrame()
resultdf$traversed_breakpoint_names = IRanges::CharacterList(stringr::str_split(resultdf$traversed_breakpoint_names, stringr::fixed(" ")))
resultdf$distance_to_traversed_breakpoint = IRanges::IntegerList(stringr::str_split(resultdf$distance_to_traversed_breakpoint, stringr::fixed(" ")))
return(resultdf)
}
#' @noRd
.traversable_segments = function(gr, maxgap) {
as.data.frame(GenomicRanges::findOverlaps(gr, gr, maxgap=maxgap, ignore.strand=TRUE)) %>%
dplyr::filter(
(as.logical(strand(gr)[.data$queryHits] == "-") & as.logical(strand(gr)[.data$subjectHits] == "+") & start(gr)[.data$queryHits] <= start(gr)[.data$subjectHits]) |
(as.logical(strand(gr)[.data$queryHits] == "+") & as.logical(strand(gr)[.data$subjectHits] == "-") & start(gr)[.data$queryHits] >= start(gr)[.data$subjectHits])) %>%
dplyr::select(
segmentStartInternalOrdinal=.data$queryHits,
segmentEndInternalOrdinal=.data$subjectHits) %>%
dplyr::mutate(
segmentStartExternalOrdinal=gr$partnerOrdinal[.data$segmentStartInternalOrdinal],
segmentEndExternalOrdinal=gr$partnerOrdinal[.data$segmentEndInternalOrdinal],
segmentLength=1 + abs(start(gr)[.data$segmentStartInternalOrdinal] - start(gr)[.data$segmentEndInternalOrdinal]),
segmentStartAdditionalLength=gr$insLen[.data$segmentStartInternalOrdinal],
segmentEndAdditionalLength=gr$insLen[.data$segmentEndInternalOrdinal]) %>%
dplyr::select(
segmentStartExternalOrdinal=.data$segmentStartExternalOrdinal,
segmentStartInternalOrdinal=.data$segmentStartInternalOrdinal,
segmentStartAdditionalLength=.data$segmentStartAdditionalLength,
segmentLength=.data$segmentLength,
segmentEndAdditionalLength=.data$segmentEndAdditionalLength,
segmentEndInternalOrdinal=.data$segmentEndInternalOrdinal,
segmentEndExternalOrdinal=.data$segmentEndExternalOrdinal)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.