#' Add 'correct offset' column to a set of reads
#'
#' Given rpfs, and named vectors of starts and stops, this adds the 'correct'
#' offset to the reads to use with determining p-site offsets
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param rpfs GRanges object with RPFs
#' @param anno annotation object
#'
#' @return the rpf granges but wth a 'cor_offset' column
add_cor_offset <- function(rpfs, anno) {
cdsstarts <- anno$cdsstarts %>% unlist()
cds_prestop_st <- anno$cds_prestop_st %>% unlist()
rpfs <- rpfs %>%
methods::as("GRanges") %>%
addphase(cdsstarts) %>%
{
.$startoffset <- cdsstarts[as.vector(seqnames(.))] + .$phase - GenomicRanges::start(.)
.$endoffset <- cds_prestop_st[as.vector(seqnames(.))] + .$phase - GenomicRanges::start(.)
.$cor_offset <- ifelse(
(.$startoffset < (.$readlen - 5)) & (.$startoffset > 5),
.$startoffset,
ifelse(
(.$endoffset < (.$readlen - 5)) & (.$endoffset > 5),
.$endoffset,
NA
)
)
.
}
rpfs
}
#' Get Offsets by maximizing p-sites within the CDS
#'
#'
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param rpfs GRanges object with RPFs
#' @param anno annotation object
#'
#' @return a data frame with the best offset per read/phase
# TODO document the columns output by this function
# TODO maybe also shortedn this by removing arguments?
# TODO remove the hardcoded numbers
get_incl_max_offsets <- function(rpfs, anno) {
stopifnot("readlen" %in% colnames(mcols(rpfs)))
#
rpfs <- rpfs %>% add_cor_offset(anno)
#
rpfs$readlen <- as.numeric(rpfs$readlen)
if (!"score" %in% colnames(mcols(rpfs))) rpfs$score <- 1
# data frame with numbers of reads gained at a specific offset
gaindf <- rpfs %>%
subset((startoffset < readlen - 5) & (startoffset > 5)) %>%
mcols(.) %>%
.[, c("startoffset", "score", "readlen", "phase")] %>%
as.data.frame() %>%
group_by(.data$phase, .data$readlen, .data$startoffset) %>%
tally(wt = .data$score) %>%
arrange(.data$startoffset) %>%
group_by(.data$phase, .data$readlen) %>%
mutate(gain = cumsum(.data$n)) %>%
select(p_offset = 'startoffset', 'phase', 'gain')
# data frame with numbers of reads lost at a specific offset
lossdf <- rpfs %>%
subset((.$endoffset < (readlen - 3)) & (.$endoffset > 3)) %>%
mcols(.) %>%
.[, c("endoffset", "score", "readlen", "phase")] %>%
as.data.frame() %>%
group_by(.data$phase, .data$readlen, .data$endoffset) %>%
tally(wt = .data$score) %>%
arrange(.data$endoffset) %>%
group_by(.data$phase, .data$readlen) %>%
mutate(loss = cumsum(.data$n)) %>%
select('p_offset' = 'endoffset', 'phase', 'loss') %>%
mutate('p_offset' = .data$p_offset + 3)
# for a given offset, our gain is the start overlappers whose
# offset is that or less, minus the stop guys whose offset is
# less than that
#
netdf <- gaindf %>%
full_join(lossdf, by = c("readlen", "p_offset", "phase")) %>%
mutate(net =
replace_na(.data$gain, 0) - replace_na(.data$loss, 0)) %>%
group_by(.data$readlen) %>%
arrange(.data$readlen, .data$p_offset) %>%
filter(!is.na(.data$net))
#
readlens <- rpfs$readlen %>% unique()
#
allpos <-
lapply(readlens, function(rl) {
expand.grid(
readlen = rl,
phase = 0:2,
p_offset = 2:(rl - 3)
)
}) %>%
bind_rows() %>%
filter((.data$p_offset %% 3) == 0)
netdf <- allpos %>% left_join(netdf,
by = c("readlen", "p_offset", "phase")
)
netdf <- netdf %>%
mutate(net = replace_na(.data$net, 0)) %>%
mutate(twind = .data$net + lag(.data$net) + lead(.data$net))
netdf <- netdf %>%
group_by(.data$readlen) %>%
filter(max(.data$net) > 32)
best_offsets <- netdf %>%
filter(!is.na(.data$twind)) %>%
group_by(.data$readlen) %>%
dplyr::slice((which.max(.data$twind) - 1):(which.max(.data$twind) + 1))
#
uniqueoffsetsfound <- best_offsets %>%
group_by(.data$readlen, .data$phase) %>%
filter(n() > 1) %>%
nrow() %>%
`==`(0)
stopifnot(uniqueoffsetsfound)
#
best_offsets <- best_offsets%>%
select(-'gain',-'loss',-'net',-'twind')
best_offsets
}
#' Get Offsets by maximizing p-sites within the CDS
#'
#'
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param input GRanges object with RPFs or a bam file with RPFs
#' @param anno annotation object
#' @param method Method for p-site determination. Defaults to 'cds_incl'
#'
#' @return a dataframe with p-site offsets per readlength
#' @examples
#' cov <- get_readgr(ribobam, anno)
#' offsets_df <- get_offsets(cov, anno)
#' @export
#' @examples
#' data(chr22_anno)
#' data(rpfs)
#' data(offsets_df)
#' testoffsets_df <- get_offsets(rpfs, chr22_anno)
#'
# TODO Add reference to a-p-site paper
# input = ribobam
get_offsets <- function(input, anno, method = "cds_incl") {
if (is.character(input) & (length(input) == 1)) {
message("loading footprint data")
rpfs <- get_readgr(input, anno)
} else {
stopifnot(is(input, "GRanges"))
rpfs <- input
}
#
if (method == "cds_incl") {
best_offsets <- get_incl_max_offsets(rpfs, anno)
} else {
stop("not yet implemented")
}
best_offsets
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.