R/disjoinPafAlignments.R

Defines functions disjoinPafAlignments

Documented in disjoinPafAlignments

#' Disjoin overlapping PAF alignments.
#'
#' This function takes loaded PAF alignments using \code{\link{readPaf}} function and identify overlapping
#' genomic ranges either in target or query coordinates and then split PAF alignments at the positions of
#' these overlaps into a disjoined set of genomic ranges. Alternatively, user defined set of genomic ranges
#' in parameter `disjoin.gr` can be used to disjoin PAF alignments.
#'
#' @param min.overlap A minimum number of overlapping base pairs to disjoin PAF alignments.
#' @param coordinates PAF coordinates to disjoin, either 'target' or 'query'.
#' @param disjoin.gr A \code{\link{GRanges-class}} object containing genomic ranges where PAF alignments will be disjoined
#' (must be in defined `coordinates`).
#' @inheritParams breakPaf
#' @return A \code{tibble} of disjoined PAF alignments.
#' @import GenomicRanges
#' @importFrom GenomicAlignments GAlignments cigarNarrow explodeCigarOpLengths
#' @importFrom GenomeInfoDb seqnames
#' @importFrom S4Vectors subjectHits queryHits
#' @importFrom IRanges IRanges
#' @importFrom methods is
#' @importFrom tibble tibble
#' @author David Porubsky
#' @export
#' @examples
#' ## Get PAF to plot
#' paf.file <- system.file("extdata", "test1.paf", package = "SVbyEye")
#' ## Read in PAF
#' paf.table <- readPaf(paf.file = paf.file, include.paf.tags = TRUE, restrict.paf.tags = "cg")
#' ## Disjoin PAF at target coordinates
#' disj.paf.table <- disjoinPafAlignments(paf.table = paf.table, coordinates = "target")
#' ## Disjoin PAF at user defined coordinates
#' disj.gr <- GenomicRanges::GRanges(
#'     seqnames = "query.region",
#'     ranges = IRanges::IRanges(start = 16300000, end = 16350000)
#' )
#' disj.paf.table <- disjoinPafAlignments(
#'     paf.table = paf.table,
#'     coordinates = "query", disjoin.gr = disj.gr
#' )
#'
disjoinPafAlignments <- function(paf.table, min.overlap = 1000, coordinates = "target", disjoin.gr = NULL) {
    ptm <- startTimedMessage(paste0("[disjoinPafAlignments] Disjoining ", coordinates, " alignments"))

    ## Check user input ##
    ## Make sure PAF has at least 12 mandatory fields
    if (ncol(paf.table) >= 12) {
        paf <- paf.table
    } else {
        stop("Submitted PAF alignments do not contain a minimum of 12 mandatory fields, see PAF file format definition !!!")
    }

    ## Process target ranges ##
    if (coordinates == "target") {
        target.gr <- GenomicRanges::makeGRangesFromDataFrame(paf, seqnames.field = "t.name", start.field = "t.start", end.field = "t.end")

        if (methods::is(disjoin.gr, "GRanges")) {
            hits <- GenomicRanges::findOverlaps(disjoin.gr, target.gr)
            if (length(hits) > 0) {
                todisj.gr <- disjoin.gr
            } else {
                message("No genomic ranges in 'disjoin.gr' overlaps target coordinates")
            }
        } else {
            ## Get regions covered more than once
            cov.gr <- as(GenomicRanges::coverage(target.gr), "GRanges")
            cov.gr <- cov.gr[cov.gr$score > 1]
            ## Filter by minimum overlapping regions
            todisj.gr <- cov.gr[GenomicRanges::width(cov.gr) >= min.overlap, 0]
        }

        ## Get disjoined set of query and target ranges
        if (length(todisj.gr) > 0) {
            ## Remove regions to disjoin from target
            subtr.gr <- suppressWarnings(unlist(GenomicRanges::subtract(target.gr, todisj.gr, ignore.strand = TRUE)))
            ## Get intersection between regions to disjoin and target
            inters.gr <- suppressWarnings(GenomicRanges::intersect(target.gr, todisj.gr, ignore.strand = TRUE))
            target.gr <- suppressWarnings(sort(c(subtr.gr, inters.gr), ignore.strand = TRUE))
            ## Lift target ranges to query coordinates
            query.gr <- suppressMessages(liftRangesToAlignment(paf.table = paf, gr = target.gr, direction = "target2query", report.cigar.str = TRUE))
            ## Expand target ranges to match mapped query ranges
            target.gr <- target.gr[query.gr$idx]
            target.gr$alignmentsHits <- query.gr$alignmentsHits
            GenomicRanges::strand(target.gr) <- GenomicRanges::strand(query.gr)
            cigars.region <- query.gr$cg
            processCIGAR <- TRUE
        } else {
            processCIGAR <- FALSE
        }
        ## Process query ranges ##
    } else if (coordinates == "query") {
        query.gr <- GenomicRanges::makeGRangesFromDataFrame(paf, seqnames.field = "q.name", start.field = "q.start", end.field = "q.end")

        if (methods::is(disjoin.gr, "GRanges")) {
            hits <- GenomicRanges::findOverlaps(disjoin.gr, query.gr)
            if (length(hits) > 0) {
                todisj.gr <- disjoin.gr
            } else {
                message("No genomic ranges in 'disjoin.gr' overlaps target coordinates")
            }
        } else {
            ## Get regions covered more than once
            cov.gr <- as(GenomicRanges::coverage(query.gr), "GRanges")
            cov.gr <- cov.gr[cov.gr$score > 1]
            ## Filter by minimum overlapping regions
            todisj.gr <- cov.gr[GenomicRanges::width(cov.gr) >= min.overlap, 0]
        }

        ## Get disjoined set of query and target ranges
        if (length(todisj.gr) > 0) {
            ## Remove regions to disjoin from target
            subtr.gr <- suppressWarnings(unlist(GenomicRanges::subtract(query.gr, todisj.gr, ignore.strand = TRUE)))
            ## Get intersection between regions to disjoin and target
            inters.gr <- suppressWarnings(GenomicRanges::intersect(query.gr, todisj.gr, ignore.strand = TRUE))
            query.gr <- suppressWarnings(sort(c(subtr.gr, inters.gr), ignore.strand = TRUE))
            ## Lift query ranges to target coordinates
            target.gr <- suppressMessages(liftRangesToAlignment(paf.table = paf, gr = query.gr, direction = "query2target", report.cigar.str = TRUE))
            ## Expand query ranges to match mapped target ranges
            query.gr <- query.gr[target.gr$idx]
            GenomicRanges::strand(query.gr) <- GenomicRanges::strand(target.gr)
            cigars.region <- target.gr$cg
            processCIGAR <- TRUE
        } else {
            processCIGAR <- FALSE
        }
    }

    if (processCIGAR) {
        #   ## Subset the cigar string per target region ##
        # ## Define PAF alignment object
        # alignments <- GenomicAlignments::GAlignments(
        #   seqnames = paf$t.name,
        #   pos = as.integer(paf$t.start) + 1L,
        #   #pos = as.integer(paf$t.start),
        #   cigar = paf$cg,
        #   strand = GenomicRanges::strand(paf$strand),
        #   names = paf$t.name
        # )
        # ## Get overlaps between target ranges and alignments
        # #hits <- findOverlaps(target.gr, alignments, minoverlap = min.overlap)
        # hits <- GenomicRanges::findOverlaps(target.gr, alignments)
        #   end(target.gr[which.max(end(target.gr))]) <- max(end(alignments))
        #   hits <- GenomicRanges::findOverlaps(target.gr, alignments, type = 'within')
        #
        #   ## Keep only longest overlaps for each target range
        #   intersect.gr <- GenomicRanges::pintersect(target.gr[S4Vectors::queryHits(hits)], as(alignments[S4Vectors::subjectHits(hits)], 'GRanges'))
        #   overlap.width <- GenomicRanges::width(intersect.gr)
        #   max.overlap <- sapply(split(overlap.width, S4Vectors::queryHits(hits)), max)
        #   hits <- hits[overlap.width %in% max.overlap]
        #   #hits <- hits[which(unique(overlap.width) %in% max.overlap)]
        #
        #   ## Get local alignment coordinates [in parallel fashion]
        #   #lifted.gr <- GenomicAlignments::pmapToAlignments(x = target.gr, alignments = alignments[S4Vectors::subjectHits(hits)])
        #   #names(lifted.gr) <- NULL
        #
        #   ## Set target range to local alignment coordinates
        #   suppressWarnings( target.local.gr <- GenomicRanges::shift(target.gr, shift = -(start(alignments)[S4Vectors::subjectHits(hits)] - 1)) )
        #   suppressWarnings( target.local.gr <- GenomicRanges::shift(target.gr, shift = -(start(alignments)[target.gr$alignmentsHits] - 1)) )
        #   start(target.local.gr[start(target.local.gr) == 0]) <- 1
        #   #target.local.gr <- GenomicRanges::trim(target.local.gr)
        #
        #   ## Define start and end position for CIGAR extraction
        #   #starts <- pmax(GenomicRanges::start(lifted.gr), 1)
        #   #width <- GenomicRanges::width(target.gr) - 1
        #   #ends <- pmin(GenomicRanges::end(lifted.gr), GenomicRanges::width(alignments)[S4Vectors::subjectHits(hits)])
        #   starts <- GenomicRanges::start(target.local.gr)
        #   width <- GenomicRanges::width(target.local.gr)
        #
        #   cigars <- alignments@cigar[S4Vectors::subjectHits(hits)]
        #   cigars.region <- vapply(seq_along(starts), function(i) {
        #     tryCatch(
        #       {
        #         GenomicAlignments::cigarNarrow(cigar = cigars[i], start = starts[i], width = width[i])[1]
        #       },
        #       error = function(e) {
        #         return("1=")
        #       }
        #     )
        #   }, FUN.VALUE = character(1))

        ## Get alignment length from regional cigars
        aln.len <- vapply(cigars.region, function(cg) sum(GenomicAlignments::explodeCigarOpLengths(cigar = cg)[[1]]), USE.NAMES = FALSE, FUN.VALUE = numeric(1))
        ## Get matched bases from regional cigars
        n.match <- vapply(cigars.region, function(cg) sum(GenomicAlignments::explodeCigarOpLengths(cigar = cg, ops = c("=", "M"))[[1]]), USE.NAMES = FALSE, FUN.VALUE = numeric(1))
        ## Get single-base mismatches from regional cigars
        n.mismatch <- vapply(cigars.region, function(cg) sum(GenomicAlignments::explodeCigarOpLengths(cigar = cg, ops = c("X"))[[1]]), USE.NAMES = FALSE, FUN.VALUE = numeric(1))

        ## Prepare modified PAF for export ##
        # return.paf <- tibble::tibble("q.name" = as.character(GenomeInfoDb::seqnames(query.gr))
        #                           ,"q.len" = paf$q.len[S4Vectors::subjectHits(hits)]
        #                           ,"q.start" = GenomicRanges::start(query.gr)
        #                           ,"q.end" = GenomicRanges::end(query.gr)
        #                           ,"strand" = as.character( GenomicRanges::strand(query.gr) )
        #                           ,"t.name" = as.character(GenomeInfoDb::seqnames(target.gr))
        #                           ,"t.len" = paf$t.len[S4Vectors::subjectHits(hits)]
        #                           ,"t.start" = GenomicRanges::start(target.gr)
        #                           ,"t.end" = GenomicRanges::end(target.gr)
        #                           ,"n.match" = n.match
        #                           ,"aln.len" = aln.len
        #                           ,"mapq" = paf$mapq[S4Vectors::subjectHits(hits)]
        #                           ,"cg" = cigars.region)

        return.paf <- tibble::tibble(
            "q.name" = as.character(GenomeInfoDb::seqnames(query.gr)),
            "q.len" = paf$q.len[target.gr$alignmentsHits],
            "q.start" = GenomicRanges::start(query.gr),
            "q.end" = GenomicRanges::end(query.gr),
            "strand" = as.character(GenomicRanges::strand(query.gr)),
            "t.name" = as.character(GenomeInfoDb::seqnames(target.gr)),
            "t.len" = paf$t.len[target.gr$alignmentsHits],
            "t.start" = GenomicRanges::start(target.gr),
            "t.end" = GenomicRanges::end(target.gr),
            "n.match" = n.match,
            "aln.len" = aln.len,
            "mapq" = paf$mapq[target.gr$alignmentsHits],
            "cg" = cigars.region
        )
    } else {
        return.paf <- paf
        message("    No genomic ranges to disjoin were found !!!")
    }

    stopTimedMessage(ptm)
    ## Return PAF
    return(return.paf)
}
daewoooo/SVbyEye documentation built on Oct. 15, 2024, 6:12 a.m.