#' Plot all-versus-all alignments stored in PAf format.
#'
#' This function takes PAF output file from minimap2 reporting all-versus-all alignments of multiple FASTA sequences
#' and visualize the alignments in a miropeat style.
#'
#' @param seqnames.order A user defined order sequence names to be plotted from top to the bottom.
#' @inheritParams readPaf
#' @inheritParams plotMiro
#' @return A \code{list} of miropeat style plots.
#' @import ggplot2
#' @importFrom grid unit
#' @importFrom scales comma
#' @importFrom dplyr bind_rows group_by summarise arrange
#' @importFrom GenomicAlignments explodeCigarOpLengths
#' @author David Porubsky
#' @export
#' @examples
#' ## Get PAF to plot
#' paf.file <- system.file("extdata", "test_ava.paf", package = "SVbyEye")
#' ## Read in PAF
#' paf.table <- readPaf(paf.file = paf.file, include.paf.tags = TRUE, restrict.paf.tags = "cg")
#' ## Make a plot
#' ## Color by alignment directionality
#' plotAVA(paf.table = paf.table, color.by = "direction")
#' ## Color by fraction of matched bases in each alignment
#' plotAVA(paf.table = paf.table, color.by = "identity")
#' ## Define custom sample order
#' seqnames.order <- c("HG00438_2", "HG01358_2", "HG02630_2", "HG03453_2")
#' plotAVA(paf.table = paf.table, color.by = "direction", seqnames.order = seqnames.order)
#' ## Only samples present in custom sample order are being plotted
#' seqnames.order <- c("HG00438_2", "HG01358_2", "HG03453_2")
#' plotAVA(paf.table = paf.table, color.by = "direction", seqnames.order = seqnames.order)
#' ## Outline PAF alignments
#' plotAVA(paf.table = paf.table, outline.alignments = TRUE)
#' ## Highlight structural variants
#' plotAVA(
#' paf.table = paf.table, min.deletion.size = 1000, min.insertion.size = 1000,
#' highlight.sv = "outline"
#' )
#' ## Bin PAF alignments into user defined bin and color them by sequence identity (% of matched bases)
#' plotAVA(paf.table = paf.table, binsize = 10000)
#' ## Add annotation to all-versus-all alignments ##
#' plt <- plotAVA(paf.table = paf.table, color.by = "direction")
#' annot.file <- system.file("extdata", "test_annot_ava.RData", package = "SVbyEye")
#' annot.gr <- get(load(annot.file))
#' addAnnotation(ggplot.obj = plt, annot.gr = annot.gr, coordinate.space = "self", y.label.id = "ID")
#'
plotAVA <- function(paf.table, seqnames.order = NULL, min.deletion.size = NULL, min.insertion.size = NULL, highlight.sv = NULL, binsize = NULL, color.by = "direction", perc.identity.breaks = c(90, 95, 99, 99.5, 99.6, 99.7, 99.8, 99.9), color.palette = NULL, outline.alignments = FALSE) {
## Check user input
## Make sure submitted paf.table has at least 12 mandatory fields
if (ncol(paf.table) >= 12) {
paf <- paf.table
## Add PAF alignment IDs if it doesn't exists
if (!"aln.id" %in% colnames(paf)) {
paf$aln.id <- seq_len(nrow(paf))
}
} else {
stop("Submitted PAF alignments do not contain a minimum of 12 mandatory fields, see PAF file format definition !!!")
}
## Get desired sequence order ##
## Based on user input
seq.ids <- unique(c(paf$q.name, paf$t.name))
if (is.character(seqnames.order)) {
if (all(seq.ids %in% seqnames.order)) {
seq.ord <- seqnames.order
} else {
# seq.ord <- c(seqnames.order, setdiff(seq.ids, seqnames.order))
message("Not all query and target IDs present in user defined 'seqnames.order', subsetting !!!")
## Subset PAF to only those samples defined in seqnames.order
paf <- paf[paf$q.name %in% seqnames.order & paf$t.name %in% seqnames.order, ]
seq.ord <- seqnames.order
}
} else {
seq.ord <- NULL
}
## Break PAF at insertion/deletions defined in cigar string
if (!is.null(min.deletion.size) | !is.null(min.insertion.size)) {
paf.l <- breakPaf(paf.table = paf, min.deletion.size = min.deletion.size, min.insertion.size = min.insertion.size, collapse.mismatches = TRUE, report.sv = TRUE)
paf <- paf.l$M
paf.svs <- paf.l$SVs
} else {
# paf$aln.id <- 1:nrow(paf)
paf.svs <- NULL
if (!is.null(highlight.sv)) {
highlight.sv <- NULL
warning("Please specify 'min.deletion.size' and 'min.insertion.size' in order to make parameter 'highlight.sv' to work !!!")
}
}
## Store PAF alignments for later addition of 'geom_gene_arrow'
paf.copy <- paf
paf.copy$ID <- "M"
## Bin PAF alignments
if (!is.null(binsize)) {
if (binsize > 0) {
if (binsize < 10) {
binsize <- 10
warning("Minimum allowed bin size is 10, forced binsize=10!!!")
}
paf <- pafToBins(paf.table = paf, binsize = binsize)
## If the PAF alignments are binned only color.by = 'fraction.matches' is allowed
color.by <- "identity"
}
}
## Mark alignments ranges by 'M' (matches)
paf$ID <- "M"
## Add SVs to the alignment table
if (!is.null(paf.svs)) {
if (nrow(paf.svs) > 0) {
paf.svs$ID <- "INS"
paf.svs$ID[grep(paf.svs$cg, pattern = "D", ignore.case = TRUE)] <- "DEL"
paf <- dplyr::bind_rows(paf, paf.svs)
}
}
## If user sequence order not defined get one based on number of mismatches between all pairs of alignments ##
## Get unique alignment ID
paf$seq.pair <- paste0(paf$q.name, "__", paf$t.name)
if (is.null(seq.ord)) {
## Order alignments based on the number of mismatches
# paf$NM <- S4Vectors::sapply(paf$cg, function(cg) sum(GenomicAlignments::explodeCigarOpLengths(cigar = cg, ops = c('X'))[[1]]), USE.NAMES = FALSE)
paf$NM <- vapply(paf$cg, function(cg) sum(GenomicAlignments::explodeCigarOpLengths(cigar = cg, ops = c("X"))[[1]]), USE.NAMES = FALSE, FUN.VALUE = numeric(1))
paf.ord <- paf %>%
dplyr::group_by(seq.pair) %>%
dplyr::summarise(n.nm = sum(.data$NM)) %>%
dplyr::arrange(.data$n.nm)
# paf.ord <- paf %>% dplyr::group_by(seq.pair) %>% dplyr::summarise(identity = sum(n.match) / sum(aln.len)) %>% dplyr::arrange(identity)
paf.ord.pairs <- unlist(strsplit(paf.ord$seq.pair, "__"))
## Assign level to seq.names ordered by number of matching bases in plus an minus orientation
seq.names <- paf.ord.pairs[!duplicated(paf.ord.pairs)]
} else {
## Assign user defined assembly order
seq.names <- seq.ord
}
## Order PAF
# desired.pairs <- paste0(seq.names[-length(seq.names)], '__', seq.names[-1])
# paf <- paf %>% dplyr::arrange(match(paf$q.name, seq.names))
# paf <- paf[match(paf$q.name, seqnames.order),]
# paf <- paf[paf$seq.pair %in% desired.pairs,]
## Flip start-end if strand == '-'
paf[paf$strand == "-", c("t.start", "t.end")] <- rev(paf[paf$strand == "-", c("t.start", "t.end")])
## Assign level to seq.names
seq.ids <- length(seq.names):1
names(seq.ids) <- seq.names
paf$y1 <- seq.ids[match(paf$q.name, names(seq.ids))]
paf$y2 <- seq.ids[match(paf$t.name, names(seq.ids))]
## Keep subsequent comparisons only
paf <- paf[abs(paf$y2 - paf$y1) == 1, ]
## Flip query and target for alignments where query comes first in the defined order
flipQT <- which(paf$y1 > paf$y2)
# paf[flipQT,] <- transform(paf[flipQT,],
# 'q.name' = t.name, 'q.start' = t.start, 'q.end' = t.end,
# 't.name' = q.name, 't.start' = q.start, 't.end' = q.end,
# 'y1' = y2, 'y2' = y1)
if (length(flipQT) > 0) {
paf.sub <- paf[flipQT, ]
## Flip query and target coordinates
paf[flipQT, ] <- transform(
paf.sub,
"q.name" = paf.sub$t.name, "q.start" = paf.sub$t.start, "q.end" = paf.sub$t.end,
"t.name" = paf.sub$q.name, "t.start" = paf.sub$q.start, "t.end" = paf.sub$q.end,
"y1" = paf.sub$y2, "y2" = paf.sub$y1
)
## Switch insertion and deletion labels for flipped query and target alignments
paf$ID[flipQT] <- dplyr::recode(paf$ID[flipQT], "INS" = "DEL", "DEL" = "INS")
}
paf$seq.pair[flipQT] <- paste0(paf$q.name[flipQT], "__", paf$t.name[flipQT])
## Translate paf alignments to plotting coordinates ##
x <- c(rbind(paf$q.start, paf$t.start, paf$q.end, paf$t.end))
y <- c(rbind(paf$y1, paf$y2, paf$y1, paf$y2))
group <- rep(seq_len(nrow(paf)), each = 4)
seq.name <- c(rbind(paf$q.name, paf$t.name, paf$q.name, paf$t.name))
seq.pos <- c(rbind(paf$q.start, paf$t.start, paf$q.end, paf$t.end))
pos.shift <- 0
pos.genomic <- c(rbind(paf$q.start, paf$t.start, paf$q.end, paf$t.end))
seq.id <- c(rbind("query", "target", "query", "target"))
n.match <- rep(paf$n.match, each = 4)
aln.len <- rep(paf$aln.len, each = 4)
mapq <- rep(paf$mapq, each = 4)
# aln.id <- rep(paf$aln.id, each=4)
ID <- rep(paf$ID, each = 4)
seq.pair <- rep(paf$seq.pair, each = 4)
direction <- rep(paf$strand, each = 4)
coords <- data.frame(
x = x,
y = y,
group = group,
seq.pos = seq.pos,
pos.shift = pos.shift,
pos.genomic = pos.genomic,
direction = direction,
seq.name = seq.name,
seq.id = seq.id,
n.match = n.match,
aln.len = aln.len,
mapq = mapq,
# aln.id=aln.id,
ID = ID,
# direction.flip=direction.flip,
seq.pair = seq.pair,
stringsAsFactors = FALSE
)
## Get x-axis labels
y.labels <- unique(coords$seq.name)
y.breaks <- coords$y[match(y.labels, coords$seq.name)]
## Define color palette
if (!is.null(color.palette)) {
if (all(c("+", "-") %in% names(color.palette))) {
pal <- color.palette
} else {
pal <- c("-" = "cornflowerblue", "+" = "forestgreen")
warning("User defined 'color.palette' does not contain both '+' and '-' directions, using default values instead!!!")
}
} else {
pal <- c("-" = "cornflowerblue", "+" = "forestgreen")
}
## Plot alignments and color by direction or identity
if (color.by == "direction") {
plt <- ggplot2::ggplot(coords[coords$ID == "M", ]) +
geom_miropeats(ggplot2::aes(x, y, group = group, fill = direction), alpha = 0.5) +
ggplot2::scale_fill_manual(values = pal, name = "Alignment\ndirection")
} else if (color.by == "identity") {
coords$identity <- (coords$n.match / coords$aln.len) * 100
coords$identity[is.nan(coords$identity) | is.na(coords$identity)] <- 0
## Define color scheme
coords.l <- getColorScheme(data.table = coords, value.field = "identity", breaks = perc.identity.breaks)
coords <- coords.l$data
colors <- coords.l$colors
plt <- ggplot2::ggplot(coords[coords$ID == "M", ]) +
geom_miropeats(ggplot2::aes(x, y, group = group, fill = .data$col.levels), alpha = 0.5) +
ggplot2::scale_fill_manual(values = colors, drop = FALSE, name = "Identity")
} else if (color.by == "mapq") {
plt <- ggplot2::ggplot(coords[coords$ID == "M", ]) +
geom_miropeats(ggplot2::aes(x, y, group = group, fill = mapq), alpha = 0.5) +
ggplot2::scale_fill_gradient(low = "gray", high = "red")
} else {
plt <- ggplot2::ggplot(coords[coords$ID == "M", ]) +
geom_miropeats(ggplot2::aes(x, y, group = group), alpha = 0.5, fill = "gray")
}
## Add alignment outlines
if (outline.alignments) {
plt <- plt + geom_miropeats(data = coords[coords$ID == "M", ], ggplot2::aes(x, y, group = group), fill = NA, color = "gray", linewidth = 0.25)
}
## Add indels
if (!is.null(highlight.sv)) {
if (nrow(coords[coords$ID != "M", ]) > 0) {
## Add SVs to the plot
if (highlight.sv == "outline") {
plt <- plt + ggnewscale::new_scale_color() +
geom_miropeats(data = coords[coords$ID != "M", ], ggplot2::aes(x, y, group = group, color = ID), fill = NA, alpha = 0.5, inherit.aes = FALSE) +
ggplot2::scale_color_manual(values = c("DEL" = "firebrick3", "INS" = "dodgerblue3"), name = "SV class")
} else if (highlight.sv == "fill") {
plt <- plt + ggnewscale::new_scale_fill() +
geom_miropeats(data = coords[coords$ID != "M", ], ggplot2::aes(x, y, group = group, fill = ID), alpha = 0.5, inherit.aes = FALSE) +
ggplot2::scale_fill_manual(values = c("DEL" = "firebrick3", "INS" = "dodgerblue3"), name = "SV class")
} else {
warning("Parameter 'highlight.sv' can only take values 'outline' or 'fill', see function documentation!!!")
}
} else {
message("There are no SVs to highlight. Try to decrease 'min.deletion.size' and 'min.insertion.size' values!!!")
}
}
## Add x and y scales
suppressMessages(
plt <- plt +
ggplot2::scale_x_continuous(expand = c(0, 0), labels = scales::comma) +
ggplot2::scale_y_continuous(breaks = y.breaks, labels = y.labels) +
ggplot2::xlab("Genomic position (bp)") +
ggplot2::ylab("")
)
## Add arrows to mark start and end of each alignment
# start <- coords$x[c(T, T, F, F)]
# end <- coords$x[c(F, F, T, T)]
# y <- coords$y[c(T, T, F, F)]
# group <- coords$group[c(T, T, F, F)]
# plt.df <- data.frame(start=start, end=end, y=y, group=group)
# plt.df$direction <- ifelse(plt.df$start < plt.df$end, '+', '-')
#
# plt <- plt + ggnewscale::new_scale_fill() + ggnewscale::new_scale_color() +
# gggenes::geom_gene_arrow(data=plt.df, ggplot2::aes(xmin = start, xmax = end, y = y, color= direction, fill = direction), arrowhead_height = unit(3, 'mm')) +
# scale_fill_manual(values = c('-' = 'cornflowerblue', '+' = 'forestgreen'), name='Alignment\ndirection') +
# scale_color_manual(values = c('-' = 'cornflowerblue', '+' = 'forestgreen'), name='Alignment\ndirection')
## Add sequence length lines
seq.lines <- data.frame(y.breaks = y.breaks, y.labels = y.labels)
seq.lengths <- dplyr::bind_cols("seq.id" = c(paf.copy$q.name, paf.copy$t.name), "seq.len" = c(paf.copy$q.len, paf.copy$t.len))
seq.lengths <- seq.lengths[!duplicated(seq.lengths$seq.id), ]
seq.lines$seq.len <- seq.lengths$seq.len[match(seq.lines$y.labels, seq.lengths$seq.id)]
plt <- plt + ggplot2::geom_segment(data = seq.lines, ggplot2::aes(x = 1, xend = .data$seq.len, y = y.breaks, yend = y.breaks), linewidth = 1)
## Set the theme and scales
theme_ava <- ggplot2::theme(
panel.grid.major = ggplot2::element_blank(),
panel.grid.minor = ggplot2::element_blank(),
panel.background = ggplot2::element_blank(),
axis.line.x = ggplot2::element_line(linewidth = 1),
axis.ticks.x = ggplot2::element_line(linewidth = 1),
axis.ticks.length.x = grid::unit(2, "mm")
)
plt <- plt + theme_ava
## Return final plot
return(plt)
}
# paf2coords_test <- function(paf.table, offset.alignments=FALSE) {
# ## Check user input ##
# ## Make sure PAF has at least 12 mandatory fields
# if (ncol(paf.table) >= 12) {
# paf <- paf.table
# } else {
# stop('Submitted PAF alignments do not contain a minimum of 12 mandatory fields, see PAF file format definition !!!')
# }
#
# ## Flip start-end if strand == '-'
# paf[paf$strand == '-', c('t.start','t.end')] <- rev(paf[paf$strand == '-', c('t.start','t.end')])
# #paf[paf$strand == '-', c('q.start','q.end')] <- rev(paf[paf$strand == '-', c('q.start','q.end')])
#
# ## Get subsequent seqnames order
# seq.names <- unique(c(paf$q.name, paf$t.name))
# #seq.names <- unique(c(rbind(paf$q.name, paf$t.name)))
#
# ## Get unique alignment ID
# if (!'seq.pair' %in% colnames(paf)) {
# paf$seq.pair <- paste0(paf$q.name, '__', paf$t.name)
# }
#
# ## Sync scales between alignments [per region id]
# paf.l <- split(paf, paf$seq.pair)
# for (i in seq_along(paf.l)) {
# paf.sub <- paf.l[[i]]
# q.range <- range(c(paf.sub$q.start, paf.sub$q.end))
# t.range <- range(c(paf.sub$t.start, paf.sub$t.end))
# ## Adjust target ranges given the size difference with respect query ranges
# range.offset <- diff(q.range) - diff(t.range)
# t.range[2] <- t.range[2] + range.offset ## Make a start position as offset and change only end position
# ## Covert query to target coordinates
# paf.sub$q.start.trans <- q2t(x = paf.sub$q.start, q.range = q.range, t.range = t.range)
# paf.sub$q.end.trans <- q2t(x = paf.sub$q.end, q.range = q.range, t.range = t.range)
# # q.range <- range(c(paf$q.start, paf$q.end))
# # t.range <- range(c(paf$t.start, paf$t.end))
# # paf$q.start.trans <- q2t(x = paf$q.start, q.range = q.range, t.range = t.range)
# # paf$q.end.trans <- q2t(x = paf$q.end, q.range = q.range, t.range = t.range)
# paf.l[[i]] <- paf.sub
# }
# paf <- do.call(rbind, paf.l)
#
# ## Assign level to seq.names
# seq.ids <- length(seq.names):1
# names(seq.ids) <- seq.names
# paf$y1 <- seq.ids[match(paf$q.name, names(seq.ids))]
# paf$y2 <- seq.ids[match(paf$t.name, names(seq.ids))]
# ## Keep subsequent comparisons only
# paf <- paf[abs(paf$y2 - paf$y1) == 1,]
#
# ## Flip query and target for alignments where query comes first
# flipQT <- which(paf$y1 > paf$y2)
# paf[flipQT,] <- transform(paf[flipQT,],
# q.name = t.name, q.start = t.start, q.end = t.end,
# t.name = q.name, t.start = q.start, t.end = q.end,
# y1 = y2, y2 = y1)
# paf$seq.pair[flipQT] <- paste0(paf$q.name[flipQT], '___', paf$t.name[flipQT])
#
# ## Vectorize data transformation ##
# #x <- c(rbind(paf$q.start.trans, paf$t.start, paf$q.end.trans, paf$t.end))
# #y <- rep(c(1,2,1,2), times=nrow(paf))
# x <- c(rbind(paf$q.start, paf$t.start, paf$q.end, paf$t.end))
# y <- c(rbind(paf$y1, paf$y2, paf$y1, paf$y2))
# ## Offset target alignments
# if (offset.alignments) {
# offset <- rep(c(0,0,0,0,0,0.05,0,0.05), times=ceiling(nrow(paf) / 2))[1:length(y)]
# y <- y + offset
# }
# group <- rep(1:nrow(paf), each=4)
# seq.name <- c(rbind(paf$q.name, paf$t.name, paf$q.name, paf$t.name))
# seq.pos <- c(rbind(paf$q.start, paf$t.start, paf$q.end, paf$t.end))
# seq.id <- c(rbind('query', 'target', 'query', 'target'))
# n.match <- rep(paf$n.match, each=4)
# aln.len <- rep(paf$aln.len, each=4)
# mapq <- rep(paf$mapq, each=4)
# aln.id <- rep(paf$aln.id, each=4)
# ID <- rep(paf$ID, each=4)
# seq.pair <- rep(paf$seq.pair, each=4)
# direction <- rep(paf$strand, each=4)
#
# ## Create final data coordinate data frame
# coords <- data.frame(x=x,
# y=y,
# group=group,
# seq.pos=seq.pos,
# direction=direction,
# seq.name=seq.name,
# seq.id=seq.id,
# n.match=n.match,
# aln.len=aln.len,
# mapq=mapq,
# aln.id=aln.id,
# ID=ID,
# #direction.flip=direction.flip,
# seq.pair=seq.pair,
# stringsAsFactors = FALSE)
#
# return(coords)
# }
#
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.