#' Prepare PAF alignments for plotting.
#'
#' This function takes loaded PAF alignments using \code{\link{readPaf}} function. Such alignment could be post-processed
#' using \code{\link{filterPaf}}, \code{\link{breakPaf}} and \code{\link{flipPaf}} functions. Subsequently such alignments are
#' expanded in a set of x and y coordinates ready to be plotted by \code{\link{geom_miropeats}} function.
#'
#' @param offset.alignments Set to \code{TRUE} if subsequent target alignments should be offsetted below and above the midline.
#' @param add.col A user defined column name present in `paf.table` to be added in returned coordinates table.
#' @param sync.x.coordinates If set to \code{TRUE} query coordinates will be adjusted to the limits of target coordinates. (Default : `TRUE`)
#' @inheritParams breakPaf
#' @inheritParams q2t
#' @return A \code{tibble} of PAF alignments reported as x and y coordinate values.
#' @importFrom tibble add_column
#' @importFrom dplyr bind_cols
#' @importFrom data.table ':='
#' @author David Porubsky
#' @export
#' @examples
#' ## Get PAF to process ##
#' paf.file <- system.file("extdata", "test1.paf", package = "SVbyEye")
#' ## Read in PAF
#' paf.table <- readPaf(paf.file = paf.file, include.paf.tags = TRUE, restrict.paf.tags = "cg")
#' ## Convert PAF alignments to x and y coordinates needed for plotting
#' paf2coords(paf.table = paf.table)
#' ## Include optional an column to be exported as well (e.g. alignment specific GC content)
#' paf.table$GC.content <- round(runif(nrow(paf.table), min = 30, max = 60), digits = 2)
#' paf2coords(paf.table = paf.table, add.col = "GC.content")
#'
paf2coords <- function(paf.table, offset.alignments = FALSE, sync.x.coordinates = TRUE, q.range = NULL, t.range = NULL, add.col = NULL) {
## Check user input ##
## Make sure PAF has at least 12 mandatory fields
if (ncol(paf.table) >= 12) {
paf <- paf.table
} else {
stop("Submitted PAF alignments do not contain a minimum of 12 mandatory fields, see PAF file format definition !!!")
}
## Set required fields with default values if not defined
if (!"aln.id" %in% colnames(paf)) {
paf$aln.id <- seq_len(nrow(paf))
}
if (!"ID" %in% colnames(paf)) {
paf$ID <- "M"
}
## Flip start-end if strand == '-'
paf[paf$strand == "-", c("t.start", "t.end")] <- rev(paf[paf$strand == "-", c("t.start", "t.end")])
# paf[paf$strand == '-', c('q.start','q.end')] <- rev(paf[paf$strand == '-', c('q.start','q.end')])
## Add continuous scale to PAF to make sure multiple query and target sequences are position next to each other
paf <- paf2continuousScale(paf.table = paf)
paf$q.start.shift <- paf$q.start + paf$q.shift
paf$q.end.shift <- paf$q.end + paf$q.shift
paf$t.start.shift <- paf$t.start + paf$t.shift
paf$t.end.shift <- paf$t.end + paf$t.shift
## Get unique alignment ID
# if (!"seq.pair" %in% colnames(paf)) {
# paf$seq.pair <- paste0(paf$q.name, "__", paf$t.name)
# }
## Sync scales between alignments [per region id]
if (sync.x.coordinates) {
# paf.l <- split(paf, paf$seq.pair)
# for (i in seq_along(paf.l)) {
# paf.sub <- paf.l[[i]]
# q.range <- range(c(paf.sub$q.start, paf.sub$q.end))
# t.range <- range(c(paf.sub$t.start, paf.sub$t.end))
# ## Adjust target ranges given the size difference with respect to query ranges
# range.offset <- diff(q.range) - diff(t.range)
# t.range[2] <- t.range[2] + range.offset ## Make a start position as offset and change only end position
# ## Covert query to target coordinates
# paf.sub$q.start.trans <- q2t(x = paf.sub$q.start, q.range = q.range, t.range = t.range)
# paf.sub$q.end.trans <- q2t(x = paf.sub$q.end, q.range = q.range, t.range = t.range)
# # q.range <- range(c(paf$q.start, paf$q.end))
# # t.range <- range(c(paf$t.start, paf$t.end))
# # paf$q.start.trans <- q2t(x = paf$q.start, q.range = q.range, t.range = t.range)
# # paf$q.end.trans <- q2t(x = paf$q.end, q.range = q.range, t.range = t.range)
# paf.l[[i]] <- paf.sub
# }
# paf <- do.call(rbind, paf.l)
## Use user defined q.range if defined
if (is.null(q.range) | !is.numeric(q.range) | length(q.range) != 2) {
q.range <- range(c(paf$q.start.shift, paf$q.end.shift))
}
## Use user defined t.range if defined
if (is.null(t.range) | !is.numeric(t.range) | length(t.range) != 2) {
t.range <- range(c(paf$t.start.shift, paf$t.end.shift))
}
## Adjust target ranges given the size difference with respect to query ranges
range.offset <- diff(q.range) - diff(t.range)
t.range[2] <- t.range[2] + range.offset ## Make a start position as offset and change only end position
## Covert query to target coordinates
paf$q.start.trans <- q2t(x = paf$q.start.shift, q.range = q.range, t.range = t.range)
paf$q.end.trans <- q2t(x = paf$q.end.shift, q.range = q.range, t.range = t.range)
} else {
paf$q.start.trans <- paf$q.start
paf$q.end.trans <- paf$q.end
}
## Vectorize data transformation ##
## Define x and y coordinates
# x <- c(rbind(paf$q.start.trans, paf$t.start, paf$q.end.trans, paf$t.end))
x <- c(rbind(paf$q.start.trans, paf$t.start.shift, paf$q.end.trans, paf$t.end.shift))
y <- rep(c(1, 2, 1, 2), times = nrow(paf))
## If there are more than one query or target sequence use offset
if (length(unique(paf$q.name)) > 1) {
offset <- seq(from = 0, to = length(unique(paf$q.name)) / 10, by = 0.1)[seq_along(unique(paf$q.name))]
names(offset) <- unique(paf$q.name[order(paf$q.start.shift, decreasing = TRUE)])
# q.offset <- rep(offset, table(paf$q.name) * 2)
y[c(TRUE, FALSE, TRUE, FALSE)] <- y[c(TRUE, FALSE, TRUE, FALSE)] - rep(offset[paf$q.name], each = 2)
}
if (length(unique(paf$t.name)) > 1) {
offset <- seq(from = 0, to = length(unique(paf$t.name)) / 10, by = 0.1)[seq_along(unique(paf$t.name))]
names(offset) <- unique(paf$t.name[order(paf$t.start.shift, decreasing = TRUE)])
# t.offset <- rep(offset, table(paf$t.name) * 2)
y[c(FALSE, TRUE, FALSE, TRUE)] <- y[c(FALSE, TRUE, FALSE, TRUE)] + rep(offset[paf$t.name], each = 2)
}
## Offset overlapping TARGET alignments up&down based on start position [TODO rewrite to a helper function]
if (offset.alignments) {
if ("bin.id" %in% colnames(paf)) {
## Use an un-binned version of PAF alignments (group by bin ID if present)
to.rep <- table(paf$bin.id)
offset.l <- list()
for (i in seq_along(to.rep)) {
if (i %% 2 == 0) {
offset.l[[i]] <- rep(x = c(0, 0.05, 0, 0.05), times = to.rep[i])
} else {
offset.l[[i]] <- rep(x = c(0, 0, 0, 0), times = to.rep[i])
}
}
offset <- do.call(c, offset.l)
} else {
offset <- rep(c(0, 0, 0, 0, 0, 0.05, 0, 0.05), times = ceiling(nrow(paf) / 2))[seq_along(y)]
}
## Add offset value to y-axis positions
y <- y + offset
}
group <- rep(seq_len(nrow(paf)), each = 4)
seq.name <- c(rbind(paf$q.name, paf$t.name, paf$q.name, paf$t.name))
# seq.pos <- c(rbind(paf$q.start, paf$t.start, paf$q.end, paf$t.end))
seq.pos <- c(rbind(paf$q.start.shift, paf$t.start.shift, paf$q.end.shift, paf$t.end.shift))
pos.shift <- c(rbind(paf$q.shift, paf$t.shift, paf$q.shift, paf$t.shift))
pos.genomic <- c(rbind(paf$q.start, paf$t.start, paf$q.end, paf$t.end))
seq.id <- rep(c("query", "target", "query", "target"), times = nrow(paf))
n.match <- rep(paf$n.match, each = 4)
aln.len <- rep(paf$aln.len, each = 4)
mapq <- rep(paf$mapq, each = 4)
aln.id <- rep(paf$aln.id, each = 4)
ID <- rep(paf$ID, each = 4)
# seq.pair <- rep(paf$seq.pair, each = 4)
direction <- rep(paf$strand, each = 4)
## Create coordinates table (for plotting)
coords <- dplyr::bind_cols(
x = x,
y = y,
group = group,
seq.pos = seq.pos,
pos.shift = pos.shift,
pos.genomic = pos.genomic,
direction = direction,
seq.name = seq.name,
seq.id = seq.id,
n.match = n.match,
aln.len = aln.len,
mapq = mapq,
aln.id = aln.id,
ID = ID,
# seq.pair = seq.pair
)
## Add user defined column name if defined
if (!is.null(add.col)) {
if (nchar(add.col) > 0 & add.col %in% colnames(paf)) {
new.col <- rep(paf[, eval(add.col), drop = TRUE], each = 4)
coords <- coords %>% tibble::add_column(!!add.col := new.col)
} else {
warning("User defined column name in 'add.column' is not present in submitted 'paf.table', skipping !!!")
}
}
return(coords)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.