R/kl_div.R

Defines functions get_cds_codons

Documented in get_cds_codons

#' Get A Granges object with locations of each codon in the given cds
#'
#'
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param anno An annotation object
#' @param n_wind_l_ext The 5' extent of the window around the codon to return
#' Defaults to: 45
#' @param n_wind_r_ext The 3' extent of the window around the codon to return
#' Defaults to: 9
#' @param n_start_buff The number of bp at the start of each cds to exclude
#' Defaults to 60
#' @param n_end_buff The number of bp at the end of each cds to exclude
#' Defaults to 60
#'
#' @return a GRanges object with the codon identity in the names, and the
#'
#' @details Use estimates of per codon dwell time to get the mean dwell time
#' over transcrips

# TODO this function is dumb...
# TODO stop missusing the name slot, maybe move some of this out so we don't
# have so many arguments

get_cds_codons <- function(anno,
                           n_wind_l_ext = 45,
                           n_wind_r_ext = 9,
                           n_start_buff = 60,
                           n_end_buff = 60) {
  stopifnot((n_start_buff %% 3) == 0)
  stopifnot((n_end_buff %% 3) == 0)
  fafileob <- anno$fafileob
  longtrs <- anno$longtrs
  longorfs <- anno$trgiddf$orf_id[match(longtrs, anno$trgiddf$transcript_id)]
  message("getting codon positions...")
  exonsgrl <- anno$exonsgrl[longtrs]
  trspacecds <- anno$trspacecds[longorfs]
  names(trspacecds) <- longtrs
  # sequence of relevant exons
  exonseq <- exonsgrl %>%
    sort_grl_st() %>%
    GenomicFeatures::extractTranscriptSeqs(x = fafileob)
  # data frame with codon and position info
  codposdf <- get_codposdf(longorfs, anno)
  # as a gr
  codposdf <- GRanges(codposdf$orf_id, IRanges(codposdf$pos), codon = codposdf$codon)
  codongr <- codposdf %>% GenomicFeatures::mapFromTranscripts(trspacecds)
  codongr$codon <- codposdf$codon[codongr$xHits]
  codongr <- resize(codongr, 3, "start")
  # add in seqinfo
  seqlevels(codongr) <- names(exonseq)
  seqinfo(codongr) <- Seqinfo(names(exonseq), nchar(exonseq))
  strand(codongr) <- "+"
  # expand the windows around these codons
  codongrbak <- codongr
  codongr <- codongr[(GenomicRanges::start(codongr) - 1) >= n_wind_l_ext]
  enddists <- (seqlengths(codongr)[as.character(seqnames(codongr))] -
    GenomicRanges::end(codongr))
  codongr <- codongr[enddists >= n_wind_r_ext]
  codmatchwindows <- codongr %>%
    resize(n_wind_l_ext, "end") %>%
    resize(width(.) + n_wind_r_ext, "start")
  # require these to be inside our cds
  # use only matches in the inner cds
  innercds <- trspacecds %>%
    subset(width > (3 + n_start_buff + n_end_buff)) %>%
    resize(width(.) - n_start_buff, "end") %>%
    resize(width(.) - n_end_buff, "start")
  codongr <- codongr %>% subsetByOverlaps(innercds)
  codmatchwindows <- codmatchwindows[!is_out_of_bounds(codmatchwindows)]
  # define inner cds - cds but with start and end trimmed off.
  codmatchwindows <- codmatchwindows %>% subsetByOverlaps(innercds)
  # lift them to inner cds coordinates
  cds_codons <- codmatchwindows %>%
    GenomicFeatures::mapToTranscripts(innercds)
  cds_codons$codon <- codmatchwindows$codon[cds_codons$xHits]
  cds_codons$xHits <- NULL
  cds_codons$transcriptsHits <- NULL
  cds_codons
}
#' Get A Granges object with locations of each codon in the given cds
#'
#'
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param rpf_cov A list of SimpleRleLists - RPF coverage split by readlength
#' @param cds_codons - GRanges object containing windows in which to get rust scores
#'
#' @details Use estimates of per codon dwell time to get the mean dwell time over transcrips
#' @return A data frame with expected and actual rust score per position in window, codon, readlength

get_cov_rust_scores <- function(rpf_cov, cds_codons) {
  fp_profiles <- rpf_cov %>% lapply(function(cdsrlfpcov) {
    rustcdsrlfpcov <- cdsrlfpcov
    rustcdsrlfpcov <- rustcdsrlfpcov > mean(rustcdsrlfpcov)
    nz_trs <- any(rustcdsrlfpcov) %>% names(.)[.]
    #
    cds_codons_nz <- cds_codons %>% subset(seqnames %in% nz_trs)
    codtrs <- seqnames(cds_codons_nz) %>% as.character()
    cat(".")
    # calculate ro vals
    ro_cl <- rustcdsrlfpcov[cds_codons_nz] %>%
      split(cds_codons_nz$codon) %>%
      lapply(as.matrix) %>%
      lapply(colMeans)
    # also get evals
    tr_rust_evals <- rustcdsrlfpcov %>% mean()
    re_c <- tr_rust_evals[codtrs] %>%
      split(cds_codons_nz$codon) %>%
      vapply(mean, 1.0)
    ro_cl <- ro_cl %>% lapply(
      tibble::enframe,
      "position", "ro_cl"
    )
    ro_cl <- bind_rows(ro_cl, .id = "codon")
    re_c <- tibble::enframe(re_c, "codon", "re_c")
    ro_cl %>% left_join(re_c, by = "codon")
  })
  fp_profiles <- bind_rows(.id = "readlen", fp_profiles)
  fp_profiles
}

#' Get A Granges object with locations of each codon in the given cds
#'
#'
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param covgrs A list of SimpleRleLists - RPF coverage
#' @param anno An annotation object
#' @param n_wind_l_ext The 5' extent of the window around the codon to return
#'     Defaults to: 45
#'
#' @details gets profiles of read 5' end occurence around codons
#' @return A data frame with expected and actual rust score per position in
#' window, codon, readlength
#' @export

get_metacodon_profs <- function(covgrs, anno, n_wind_l_ext = 45) {
  are_psites <- covgrs %>% vapply(function(x) "orf" %in% colnames(mcols(x)), TRUE)
  stopifnot(!any(are_psites))
  cds_codons <- get_cds_codons(anno)

  stopifnot(!is.null(names(covgrs)))
  cdsfpcovlist <- lapply(covgrs, function(covgr) {
    rlsplitcov <- split(resize(covgr, 1, "start"), covgr$readlen)
    lapply(rlsplitcov, coverage)
  })
  lapplyfunc <- if('paralell'%in%installed.packages()){
    rust_roel <- parallel::mclapply(
      mc.cores = detectCores(), cdsfpcovlist, cds_codons = cds_codons,
      FUN = get_cov_rust_scores
  )
  }else{
    rust_roel <- lapply(cdsfpcovlist, cds_codons = cds_codons,
      FUN = get_cov_rust_scores
  )
  }
  
  # https://www.nature.com/articles/ncomms12915#Sec10 see equation 3
  # of RUST paper
  metacodondf <- bind_rows(rust_roel, .id = "sample")
  #
  metacodondf <- mutate(metacodondf, position = 
      .data$position - 1 - (n_wind_l_ext))
  metacodondf <- filter(metacodondf, !.data$codon %in% c("TAG", "TAA", "TGA"))
  metacodondf <- mutate(metacodondf, count = .data$ro_cl / .data$re_c)
  metacodondf$nreadlen <- metacodondf$readlen %>% as.numeric()
  metacodondf
}

#' Calculate KL-divergence from a data frame of rust expected and actual scores
#'
#'
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param metacodondf a data frame with metacodon info
#' @param anno An annotation object
#'
#' @examples
#' data(chr22_anno)
#' data(rpfs)
#' data(offsets_df)
#' covgrs <- list(sample1 = rpfs)
#' # note this doesn't work that well on a small subset
#' kl_df <- get_kl_df(metacodondf, chr22_anno)
#' @return A data frame with expected and actual rust score per position in
#' window, codon, readlength
#' @export
get_kl_df <- function(metacodondf, anno) {
  metacodondf %>%
    group_by(.data$sample, .data$nreadlen, .data$position) %>%
    mutate(ro_cl = .data$ro_cl / sum(.data$ro_cl),
      re_c = .data$re_c / sum(.data$re_c)) %>%
    summarise(KL = sum(.data$ro_cl * log2(.data$ro_cl / .data$re_c)),
       .groups = "keep")
}

#' Get the most frequent element from a numeric vector
#'
#'
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param x a vector
#'
#' @return the single most common value in the vector

most_freq <- function(x) {
  x %>%
    table() %>%
    sort() %>%
    names() %>%
    as.numeric() %>%
    tail(1)
}


#' Get a dataframe with p-site offsets from a datafram of KL divergence
#'
#'
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param kl_df  dataframe with KL divergence per readlength, position in window
#'  around codon
#' @param method means by which to select best offfset currently only possible value
#' is a_max - assumes the maximum KL divergence occurs under the A site
#'
#' @return a dataframe with p-site offsets per readlength, sample
#' @examples
#' data(chr22_anno)
#' data(rpfs)
#' data(offsets_df)
#' covgrs <- list(sample1 = rpfs)
#' # note this doesn't work that well on a small subset
#' kl_df <- get_kl_df(covgrs, chr22_anno) #'
#' kl_offsets <- select_offsets(kl_df)
#' @export


select_offsets <- function(kl_df, method = "a_max") {
  if (method == "a_max") {
    kl_offsets <- kl_df %>%
      group_by(.data$sample, .data$nreadlen, .data$position) %>%
      summarise(sumKL = sum(.data$KL), .groups = "keep") %>%
      filter(.data$position < -6) %>%
      filter(.data$position > -(.data$nreadlen - 6)) %>%
      group_by(.data$sample, .data$nreadlen) %>%
      dplyr::slice(which.max(.data$sumKL)) %>%
      mutate(p_offset = -(.data$position + 3))
  }
  kl_offsets %>%
    ungroup() %>%
    select('sample', 'nreadlen', 'p_offset')
}



#' Plot KL divergence in selected windows
#'
#'
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param kl_df  dataframe with KL divergence per readlength, position in window
#'  around codon
#' @param kl_offsets  dataframe with information on p-sites
#' @param selreadlens  integer vector of readlengths to plot
#'
#' @return a dataframe with p-site offsets per readlength
#' @importFrom ggplot2 qplot theme_bw facet_grid scale_y_continuous 
#' @importFrom ggplot2 scale_x_continuous geom_vline ggtitle
#' @export
# selreadlens=27:31
plot_kl_dv <- function(kl_df, kl_offsets, selreadlens = 27:32) {
  stopifnot(c("position", "nreadlen", "KL", "sample") %in% colnames(kl_df))
  stopifnot(c("nreadlen", "sample", "p_offset") %in% colnames(kl_offsets))
  #
  if (!is.null(selreadlens)) {
    kl_offsets <- kl_offsets %>% filter(.data$nreadlen %in% selreadlens)
  }
  #
  kl_div_plot <- kl_df %>%
    filter(.data$position < -3) %>%
    filter(.data$position > -(.data$nreadlen - 6)) %>%
    # separate(sample,c('fraction','genotype','rep'),remove=F)%>%
    filter(.data$nreadlen %in% selreadlens) %>% 
    {
      qplot(data = ., x = position, y = KL) +
        theme_bw() +
        facet_grid(nreadlen ~ sample,scales = 'free_y') +
        scale_y_continuous("RUST KL divergence") +
        scale_x_continuous("5 read position relative to codon ") +
        geom_vline(
          data = kl_offsets, aes(xintercept = -p_offset - 3),
          color = I("green"), linetype = 2
        ) +
        geom_vline(
          data = kl_offsets, aes(xintercept = -p_offset),
          color = I("blue"), linetype = 2
        ) +
        ggtitle("RUST KL divergence vs position")
    }
  kl_div_plot
}

#' Create a dataframe with a_site, p_site and e site occupancies
#'
#'
#' @keywords Ribostan
#' @author Dermot Harnett, \email{dermot.p.harnett@gmail.com}
#'
#' @param metacodondf A data frame with expected and actual rust score per position in
#' window, codon, readlength
#' @param kl_offsets A dataframe with p-site offsets per readlength, sample
#'
#' @return a dataframe occupancies for e,p and a sites for each sample
#' @examples
#' data(chr22_anno)
#' data(rpfs)
#' data(offsets_df)
#' data(metacodondf)
#' # note this doesn't work that well on a small subset
#' kl_df <- get_kl_df(metacodondf, chr22_anno)
#' kl_offsets <- select_offsets(kl_df)
#' allcodondt <- export_codon_dts(metacodondf, kl_offsets)
#' @export

export_codon_dts <- function(metacodondf, kl_offsets) {
  posseldf <- bind_rows(
    kl_offsets %>% mutate(position = -.data$p_offset + 3, site = "e_site")
      %>% select(nreadlen, position, site),
    kl_offsets %>% mutate(position = -.data$p_offset, site = "p_site")
      %>% select(nreadlen, position, site),
    kl_offsets %>% mutate(position = -.data$p_offset - 3, site = "a_site")
      %>% select(nreadlen, position, site),
    kl_offsets %>% mutate(position = -.data$p_offset - 6, site = "a_p3_site")
      %>% select(nreadlen, position, site)
  )
  #
  allcodondt <- metacodondf %>%
    inner_join(posseldf, by = c("position", "nreadlen")) %>%
    group_by(.data$sample, .data$site, .data$codon) %>%
    summarise(rust = sum(.data$ro_cl) / sum(.data$re_c)) %>%
    dplyr::rename("RUST_score" = "rust") %>%
    tidyr::pivot_wider(names_from = "site", values_from = "RUST_score")
  #
  allcodondt
}
zslastman/RiboStan documentation built on June 12, 2024, 1:59 a.m.