R/subsetPafAlignments.R

Defines functions subsetPaf subsetPafAlignments

Documented in subsetPaf subsetPafAlignments

#' Subset PAF alignments at desired genomic range.
#'
#' This function takes loaded PAF alignments using \code{\link{readPaf}} function and then subsets
#' as well as cuts PAF alignments at desired target coordinates. This function can only be applied
#' to PAF alignments containing a single query and target sequence.
#'
#' @param target.region A user defined target region either as character string ('chr:start-end') or as
#' a \code{\link{GRanges-class}} object containing a single genomic region to which PAF alignments will be
#' narrowed down.
#' @inheritParams breakPaf
#' @return A \code{tibble} of subsetted PAF alignments.
#' @importFrom GenomicRanges makeGRangesFromDataFrame resize start end width strand
#' @importFrom GenomicAlignments cigarNarrow GAlignments explodeCigarOpLengths
#' @importFrom IRanges subsetByOverlaps
#' @importFrom tibble tibble
#' @importFrom methods as
#' @importFrom dplyr bind_rows
#' @author David Porubsky
#' @export
#' @examples
#' ## Get PAF to plot
#' paf.file <- system.file("extdata", "test1.paf", package = "SVbyEye")
#' ## Read in PAF
#' paf.table <- readPaf(paf.file = paf.file, include.paf.tags = TRUE, restrict.paf.tags = "cg")
#' ## Subset PAF alignments based on desired target region
#' subsetPafAlignments(paf.table = paf.table, target.region = "target.region:19050000-19200000")
#'
subsetPafAlignments <- function(paf.table, target.region = NULL) {
    ptm <- startTimedMessage("[subsetPafAlignments] Subsetting&cutting PAF alignments")
    ## Check user input ##
    ## Make sure PAF has at least 12 mandatory fields
    if (ncol(paf.table) >= 12) {
        paf <- paf.table
    } else {
        stop("\nSubmitted PAF alignments do not contain a minimum of 12 mandatory fields, see PAF file format definition !!!")
    }
    ## Check if PAF contains expected cg field containing CIGAR string
    if (!"cg" %in% colnames(paf)) {
        stop(paste0(
            "\nExpected PAF field 'cg' containing CIGAR string is missing, ",
            "therefore alignments cannot be narrowed down exactly to the user defined 'target.region'!!!"
        ))
    }

    ## Filter alignments by target region ##
    if (!is.null(target.region)) {
        if (is.character(target.region)) {
            target.region.gr <- methods::as(target.region, "GRanges")
        } else if (is(target.region, "GRanges")) {
            target.region.gr <- target.region
        } else {
            message("\nParameter 'target.region' can either be 'GRanges' object or character string 'chr#:start-end'!!!")
        }
        ## Subset PAF by ranges overlaps
        target.gr <- GenomicRanges::makeGRangesFromDataFrame(paf, seqnames.field = "t.name", start.field = "t.start", end.field = "t.end")
        hits <- IRanges::findOverlaps(target.gr, target.region.gr) ## TODO Take only overlaps, regions itself are not shrinked!!!
    }

    ## Return NULL if there are no overlaps with the target region
    if (length(hits) > 0) {
        paf <- paf[S4Vectors::queryHits(hits), ]
        target.gr <- target.gr[S4Vectors::queryHits(hits), ]

        ## Get alignments embedded within target region
        hits <- IRanges::findOverlaps(target.gr, target.region.gr, type = "within", ignore.strand = TRUE)
        within.idx <- S4Vectors::queryHits(hits)
        ## Get alignments with target region being embedded within
        hits <- IRanges::findOverlaps(target.region.gr, target.gr, type = "within", ignore.strand = TRUE)
        embedded.idx <- S4Vectors::subjectHits(hits)
        ## Get alignments with target region being partially overlapping
        partial.idx <- setdiff(seq_along(target.gr), c(embedded.idx, within.idx))

        ## Get alignments embedded within target region
        if (length(within.idx) > 0) {
            paf.within.aln <- paf[within.idx, ]
        } else {
            paf.within.aln <- tibble::tibble()
        }

        ## Process alignments with target region being embedded within
        if (length(embedded.idx) > 0) {
            ## Create PAF alignment object
            paf.aln <- paf[embedded.idx, ]
            alignments <- GenomicAlignments::GAlignments(
                seqnames = paf.aln$t.name,
                pos = as.integer(paf.aln$t.start) + 1L,
                cigar = paf.aln$cg,
                strand = GenomicRanges::strand(paf.aln$strand),
                names = paf.aln$t.name
            )
            ## Get overlaps between target ranges and alignments
            hits <- IRanges::findOverlaps(target.region.gr, alignments)
            ## Lift target positions to query coordinates
            query.coords <- suppressMessages(liftRangesToAlignment(paf.table = paf.aln, gr = target.region.gr, direction = "target2query", report.cigar.str = TRUE))
            ## Set target range to local alignment coordinates
            suppressWarnings(target.local.gr <- GenomicRanges::shift(target.region.gr[S4Vectors::queryHits(hits)],
                shift = -(GenomicRanges::start(alignments)[S4Vectors::subjectHits(hits)] - 1)
            ))
            GenomicRanges::start(target.local.gr[GenomicRanges::start(target.local.gr) == 0]) <- 1

            ## Cut cigar string
            starts <- GenomicRanges::start(target.local.gr)
            width <- GenomicRanges::width(target.local.gr)
            cigars <- alignments@cigar[S4Vectors::subjectHits(hits)]
            new.cigar <- vapply(seq_along(starts), function(i) {
                tryCatch(
                    {
                        GenomicAlignments::cigarNarrow(cigar = cigars[i], start = starts[i], width = width[i])[1]
                    },
                    error = function(e) {
                        return("1=")
                    }
                )
            }, FUN.VALUE = character(1))
            ## Get alignment length from regional cigars
            new.aln.len <- sum(GenomicAlignments::explodeCigarOpLengths(cigar = new.cigar)[[1]])
            ## Get matched bases from regional cigars
            new.n.match <- sum(GenomicAlignments::explodeCigarOpLengths(cigar = new.cigar, ops = c("=", "M"))[[1]])

            ## Update PAF coordinates and cigar string ##
            paf.aln$q.start <- GenomicRanges::start(query.coords)
            paf.aln$q.end <- GenomicRanges::end(query.coords)
            paf.aln$t.start <- GenomicRanges::start(target.region.gr)
            paf.aln$t.end <- GenomicRanges::end(target.region.gr)
            paf.aln$n.match <- new.n.match
            paf.aln$aln.len <- new.aln.len
            paf.aln$cg <- new.cigar
            paf.embedded.aln <- paf.aln
        } else {
            paf.embedded.aln <- tibble::tibble()
        }

        ## Process alignments with target region being partially overlapping
        if (length(partial.idx) > 0) {
            paf.partial.aln <- paf[partial.idx, ]
            ## Narrow alignment at desired start position ##
            cut.start.idx <- which(paf.partial.aln$t.start < GenomicRanges::start(target.region.gr))
            if (length(cut.start.idx) != 0) {
                ## Create PAF alignment object
                paf.aln <- paf.partial.aln[cut.start.idx, ]
                alignments <- GenomicAlignments::GAlignments(
                    seqnames = paf.aln$t.name,
                    pos = as.integer(paf.aln$t.start) + 1L,
                    cigar = paf.aln$cg,
                    strand = GenomicRanges::strand(paf.aln$strand),
                    names = paf.aln$t.name
                )
                ## Get overlaps between target ranges and alignments
                hits <- GenomicRanges::findOverlaps(target.region.gr, alignments)
                ## Lift target positions to query coordinates
                target.start <- GenomicRanges::resize(target.region.gr, width = 1, fix = "start")
                query.start <- suppressMessages(liftRangesToAlignment(paf.table = paf.aln, gr = target.start, direction = "target2query", report.cigar.str = TRUE))
                ## Set target range to local alignment coordinates
                suppressWarnings(target.local.gr <- GenomicRanges::shift(target.start[S4Vectors::queryHits(hits)],
                    shift = -(start(alignments)[S4Vectors::subjectHits(hits)] - 1)
                ))
                GenomicRanges::start(target.local.gr[GenomicRanges::start(target.local.gr) == 0]) <- 1

                ## Cut cigar string
                starts <- GenomicRanges::start(target.local.gr)
                cigars <- alignments@cigar[S4Vectors::subjectHits(hits)]
                widths <- GenomicRanges::width(alignments[S4Vectors::subjectHits(hits)])
                new.cigar <- vapply(seq_along(starts), function(i) {
                    tryCatch(
                        {
                            GenomicAlignments::cigarNarrow(cigar = cigars[i], start = starts[i], end = widths[i])[1]
                        },
                        error = function(e) {
                            return("1=")
                        }
                    )
                }, FUN.VALUE = character(1))
                ## Get alignment length from regional cigars
                new.aln.len <- vapply(GenomicAlignments::explodeCigarOpLengths(cigar = new.cigar), FUN = sum, FUN.VALUE = numeric(1))
                ## Get matched bases from regional cigars
                new.n.match <- vapply(GenomicAlignments::explodeCigarOpLengths(cigar = new.cigar, ops = c("=", "M")), FUN = sum, FUN.VALUE = numeric(1))

                ## Update PAF coordinates and cigar string ##
                paf.aln$q.end[paf.aln$strand == "-"] <- GenomicRanges::start(query.start[GenomicRanges::strand(query.start) == "-"])
                paf.aln$q.start[paf.aln$strand == "+"] <- GenomicRanges::start(query.start[GenomicRanges::strand(query.start) == "+"])
                paf.aln$t.start <- GenomicRanges::start(target.region.gr)
                # paf.aln$t.end <- GenomicRanges::end(target.region.gr)
                paf.aln$n.match <- new.n.match
                paf.aln$aln.len <- new.aln.len
                paf.aln$cg <- new.cigar
                paf.partial.aln[cut.start.idx, ] <- paf.aln
            }

            ## Narrow alignment at desired end position ##
            cut.end.idx <- which(paf.partial.aln$t.end > GenomicRanges::end(target.region.gr))
            if (length(cut.end.idx) != 0) {
                ## Create PAF alignment object
                paf.aln <- paf.partial.aln[cut.end.idx, ]
                alignments <- GenomicAlignments::GAlignments(
                    seqnames = paf.aln$t.name,
                    pos = as.integer(paf.aln$t.start) + 1L,
                    cigar = paf.aln$cg,
                    strand = GenomicRanges::strand(paf.aln$strand),
                    names = paf.aln$t.name
                )
                ## Get overlaps between target ranges and alignments
                hits <- GenomicRanges::findOverlaps(target.region.gr, alignments)
                ## Lift target positions to query coordinates
                target.end <- GenomicRanges::resize(target.region.gr, width = 1, fix = "end")
                query.end <- suppressMessages(liftRangesToAlignment(paf.table = paf.aln, gr = target.end, direction = "target2query", report.cigar.str = TRUE))
                ## Set target range to local alignment coordinates
                suppressWarnings(target.local.gr <- GenomicRanges::shift(target.end[S4Vectors::queryHits(hits)],
                    shift = -(start(alignments)[S4Vectors::subjectHits(hits)] - 1)
                ))
                GenomicRanges::start(target.local.gr[GenomicRanges::start(target.local.gr) == 0]) <- 1

                ## Cut cigar string
                ends <- GenomicRanges::start(target.local.gr)
                cigars <- alignments@cigar[S4Vectors::subjectHits(hits)]
                new.cigar <- vapply(seq_along(ends), function(i) {
                    tryCatch(
                        {
                            GenomicAlignments::cigarNarrow(cigar = cigars[i], start = 1, end = ends[i])[1]
                        },
                        error = function(e) {
                            return("1=")
                        }
                    )
                }, FUN.VALUE = character(1))
                ## Get alignment length from regional cigars
                new.aln.len <- vapply(GenomicAlignments::explodeCigarOpLengths(cigar = new.cigar), FUN = sum, FUN.VALUE = numeric(1))
                ## Get matched bases from regional cigars
                new.n.match <- vapply(GenomicAlignments::explodeCigarOpLengths(cigar = new.cigar, ops = c("=", "M")), FUN = sum, FUN.VALUE = numeric(1))

                ## Update PAF coordinates and cigar string ##
                paf.aln$q.start[paf.aln$strand == "-"] <- GenomicRanges::start(query.end[GenomicRanges::strand(query.end) == "-"])
                paf.aln$q.end[paf.aln$strand == "+"] <- GenomicRanges::start(query.end[GenomicRanges::strand(query.end) == "+"])
                # paf.aln$t.start <- GenomicRanges::start(target.region.gr)
                paf.aln$t.end <- GenomicRanges::end(target.region.gr)
                paf.aln$n.match <- new.n.match
                paf.aln$aln.len <- new.aln.len
                paf.aln$cg <- new.cigar
                paf.partial.aln[cut.end.idx, ] <- paf.aln
            }
        } else {
            paf.partial.aln <- tibble::tibble()
        }
        ## Merge final PAF
        paf <- dplyr::bind_rows(paf.within.aln, paf.embedded.aln, paf.partial.aln)

        stopTimedMessage(ptm)
        return(paf)
    } else {
        warning("\nNone of the PAF ranges overlap user defined 'target.region', exiting ...")
        return(NULL)
    }
}


#' Subset PAF alignments at desired genomic range.
#'
#' This function takes loaded PAF alignments using \code{\link{readPaf}} function and then subsets
#' as well as cuts PAF alignments at desired target coordinates.
#'
#' @inheritParams breakPaf
#' @inheritParams subsetPafAlignments
#' @return A \code{tibble} of subsetted PAF alignments.
#' @importFrom GenomicRanges makeGRangesFromDataFrame strand
#' @importFrom IRanges findOverlaps
#' @importFrom GenomeInfoDb seqlevels seqnames
#' @importFrom methods as
#' @importFrom dplyr bind_rows
#' @author David Porubsky
#' @export
#' @examples
#' ## Get PAF to plot
#' paf.file <- system.file("extdata", "test_ava.paf", package = "SVbyEye")
#' ## Read in PAF
#' paf.table <- readPaf(paf.file = paf.file, include.paf.tags = TRUE, restrict.paf.tags = "cg")
#' ## Subset multiple PAF alignments based on desired target region
#' target.region <- "HG01358_2:100000-200000"
#' subsetPaf(paf.table, target.region = target.region)
#'
subsetPaf <- function(paf.table, target.region = NULL) {
    ## Check user input ##
    ## Make sure PAF has at least 12 mandatory fields
    if (ncol(paf.table) >= 12) {
        paf <- paf.table
    } else {
        stop("\nSubmitted PAF alignments do not contain a minimum of 12 mandatory fields, see PAF file format definition !!!")
    }
    ## Check if PAF contains expected cg field containing CIGAR string
    if (!"cg" %in% colnames(paf)) {
        stop(paste0(
            "\nExpected PAF field 'cg' containing CIGAR string is missing, ",
            "therefore alignments cannot be narrowed down exactly to the user defined 'target.region'!!!"
        ))
    }

    ## Filter alignments by target region ##
    if (!is.null(target.region)) {
        if (is.character(target.region)) {
            target.region.gr <- methods::as(target.region, "GRanges")
        } else if (is(target.region, "GRanges")) {
            target.region.gr <- target.region
        } else {
            message("\nParameter 'target.region' can either be 'GRanges' object or character string 'chr#:start-end'!!!")
        }
        ## Make GRanges from all target alignment coordinates
        target.gr <- GenomicRanges::makeGRangesFromDataFrame(paf, seqnames.field = "t.name", start.field = "t.start", end.field = "t.end")
        ## Make sure all sequence levels are present
        seq.lvs <- unique(c(unique(paf$t.name), unique(paf$q.name)))
        GenomeInfoDb::seqlevels(target.gr) <- seq.lvs
        GenomeInfoDb::seqlevels(target.region.gr) <- seq.lvs
        ## Check if user defined target region overlaps any target sequence in submitted paf.table
        hits <- IRanges::findOverlaps(target.gr, target.region.gr)
        if (length(hits) > 0) {
            ## Get all possible target coordinates
            all.targets <- list()
            while (length(IRanges::findOverlaps(target.region.gr, target.gr)) > 0) {
                gr <- liftRangesToAlignment(paf.table = paf, gr = target.region.gr, direction = "target2query")
                all.targets[[length(all.targets) + 1]] <- target.region.gr[, 0]
                target.region.gr <- gr
            }
            all.targets[[length(all.targets) + 1]] <- target.region.gr[, 0]
            all.targets <- do.call(c, all.targets)
            all.targets <- all.targets[GenomeInfoDb::seqnames(all.targets) != "Failed"]
            ## IF not all sequence levels present in original PAF are reported in all.targets then
            ## make sure to include any remaining ranges transferable from query to target
            if (!all(seq.lvs %in% as.character(GenomeInfoDb::seqnames(all.targets)))) {
                gr <- liftRangesToAlignment(paf.table = paf, gr = all.targets, direction = "query2target")
                gr <- gr[GenomeInfoDb::seqnames(gr) != "Failed"]
                all.targets <- c(all.targets, gr[, 0])
                GenomicRanges::strand(all.targets) <- "*"
                all.targets <- unique(all.targets)
            }

            ## Remove self-alignments
            paf <- paf[paf$q.name != paf$t.name, ]

            ## Subset alignments for all target coordinates
            paf.l <- split(paf, paf$t.name)
            target.paf <- list()
            for (i in seq_along(paf.l)) {
                sub.paf <- paf.l[[i]]
                new.paf <- subsetPafAlignments(
                    paf.table = sub.paf,
                    target.region = all.targets[as.character(GenomeInfoDb::seqnames(all.targets)) == unique(sub.paf$t.name)]
                )
                target.paf[[i]] <- new.paf
            }
            subset.paf <- dplyr::bind_rows(target.paf)
            ## Reassign new coordinates
            subset.paf$q.len <- (subset.paf$q.end - subset.paf$q.start) + 1
            subset.paf$t.len <- (subset.paf$t.end - subset.paf$t.start) + 1
            subset.paf$q.start <- 1
            subset.paf$t.start <- 1
            subset.paf$q.end <- subset.paf$q.len
            subset.paf$t.end <- subset.paf$t.len
        }
    } else {
        message("\nNone of the PAF ranges overlap user defined 'target.region', exiting ...")
        return(NULL)
    }
    return(subset.paf)
}
daewoooo/SVbyEye documentation built on Oct. 15, 2024, 6:12 a.m.