Nothing
#' Shift footprints by selected offsets
#'
#' Function shifts footprints (GRanges) using specified offsets for every of
#' the specified lengths. Reads that do not conform to the specified lengths
#' are filtered out and rejected. Reads are resized to single base in 5' end
#' fashion, treated as p site.
#' This function takes account for junctions in cigars of the reads. Length of
#' the footprint is saved in size' parameter of GRanges output. Footprints are
#' also sorted according to their genomic position, ready to be saved as a
#' ofst, bed or wig file.
#'
#' The two columns in the shift data.frame/data.table argument are:\cr
#' - fraction Numeric vector of lengths of footprints you select
#' for shifting.\cr
#' - offsets_start Numeric vector of shifts for corresponding
#' selected_lengths. eg. c(-10, -10) with selected_lengths of c(31, 32) means
#' length of 31 will be shifted left by 10. Footprints of length 32 will be
#' shifted right by 10.
#'
#' NOTE: It will remove softclips from valid width, the CIGAR 3S30M is qwidth
#' 33, but will remove 3S so final read width is 30 in ORFik.
#' @param footprints \code{\link{GAlignments}} object of RiboSeq reads
#' @param shifts a data.frame / data.table with minimum 2 columns,
#' fraction (selected_lengths) and selected_shifts (relative position).
#' Output from \code{\link{detectRibosomeShifts}}
#' @param sort logical, default TRUE. If False will keep original order of
#' reads, and not sort output reads in increasing genomic location per
#' chromosome and strand.
#' @return A \code{\link{GRanges}} object of shifted footprints, sorted and
#' resized to 1bp of p-site,
#' with metacolumn "size" indicating footprint size before shifting and
#' resizing, sorted in increasing order.
#' @family pshifting
#' @references https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-018-4912-6
#' @export
#' @examples
#' ## Basic run
#' # Transcriptome annotation ->
#' gtf_file <- system.file("extdata", "annotations.gtf", package = "ORFik")
#' # Ribo seq data ->
#' riboSeq_file <- system.file("extdata", "ribo-seq.bam", package = "ORFik")
#' \dontrun{
#' footprints <- readBam(riboSeq_file)
#'
#' # detect the shifts automagically
#' shifts <- detectRibosomeShifts(footprints, gtf_file)
#' # shift the RiboSeq footprints
#' shiftedReads <- shiftFootprints(footprints, shifts)
#' }
shiftFootprints <- function(footprints, shifts, sort = TRUE) {
if (!is(shifts, "data.frame")) stop("shifts must be data.frame/data.table")
if (nrow(shifts) == 0) stop("No shifts found in data.frame")
if (is.null(shifts$fraction) | is.null(shifts$offsets_start))
stop("Either fraction or offsets_start column in shifts is not set!")
selected_lengths <- shifts$fraction
selected_shifts <- shifts$offsets_start
lengthsAll <- readWidths(footprints, along.reference = TRUE)
validLengths <- lengthsAll %in% selected_lengths
lengthsAll <- lengthsAll[validLengths]
footprints <- footprints[validLengths]
shiftsAll <- -selected_shifts[match(lengthsAll,
selected_lengths)] + 1
# Shift cigar and map back to corrected GRanges.
is_pos <- strandBool(footprints)
shiftsAll[!is_pos] <- -shiftsAll[!is_pos]
shifted <- qnarrow(footprints, shiftsAll, shiftsAll)
shifted <- resize(GRanges(shifted), width = 1)
shifted$size <- lengthsAll
shifted <- sortSeqlevels(shifted)
if (sort) {
message("Sorting shifted footprints...")
shifted <- sort(shifted)
}
return(shifted)
}
#' Detect ribosome shifts
#'
#' Utilizes periodicity measurement (Fourier transform), and change point
#' analysis to detect ribosomal footprint shifts for each of the ribosomal
#' read lengths. Returns subset of read lengths and their shifts for which
#' top covered transcripts follow periodicity measure. Each shift value
#' assumes 5' anchoring of the reads, so that output offsets values will
#' shift 5' anchored footprints to be on the p-site of the ribosome. The
#' E-site will be shift + 3 and A site will be shift - 3. So update to these,
#' if you rather want those.
#'
#' Check out vignette for the examples of plotting RiboSeq metaplots over start
#' and stop codons, so that you can verify visually whether this function
#' detects correct shifts.
#'
#' For how the Fourier transform works, see: \code{\link{isPeriodic}}\cr
#' For how the changepoint analysis works, see: \code{\link{changePointAnalysis}}\cr
#'
#' NOTE: It will remove softclips from valid width, the CIGAR 3S30M is qwidth
#' 33, but will remove 3S so final read width is 30 in ORFik. This is standard
#' for ribo-seq.
#'
#' @param footprints \code{\link{GAlignments}} object of RiboSeq reads -
#' footprints, can also be path to the .bam /.ofst file. If GAlignment object
#' has a meta column called "score", this will be used as replicate numbering
#' for that read. So be careful if you have custom files with score columns,
#' with another meaning.
#' @inheritParams loadTxdb
#' @param start (logical) Whether to include predictions based on the start
#' codons. Default TRUE.
#' @param stop (logical) Whether to include predictions based on the stop
#' codons. Default FASLE. Only use if there exists 3' UTRs for the annotation.
#' If peridicity around stop codon is stronger than at the start codon, use
#' stop instead of start region for p-shifting.
#' @param top_tx (integer), default 10. Specify which % of the top covered by RiboSeq
#' reads transcripts to use for estimation of the shifts. By default we take top 10%
#' top covered transcripts as they represent less noisy dataset. This is only
#' applicable when there are more than 1000 transcripts.
#' @inheritParams filterTranscripts
#' @param firstN (integer) Represents how many bases of the transcripts
#' downstream of start codons to use for initial estimation of the
#' periodicity.
#' @param tx a GRangesList, if you do not have 5' UTRs in annotation, send
#' your own version. Example: extendLeaders(tx, 30)
#' Where 30 bases will be new "leaders". Since each original transcript was
#' either only CDS or non-coding (filtered out).
#' @param min_reads default (1000), how many reads must a read-length have to
#' be considered for periodicity.
#' @param accepted.lengths accepted readlengths, default 26:34, usually ribo-seq
#' is strongest between 27:32.
#' @inheritParams footprints.analysis
#' @param must.be.periodic logical TRUE, if FALSE will not filter on
#' periodic read lengths. (The Fourier transform filter will be skipped).
#' @return a data.table with lengths of footprints and their predicted
#' coresponding offsets
#' @family pshifting
#' @references https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-018-4912-6
#' @importFrom IRanges quantile
#' @export
#' @examples
#' ## Basic run
#' # Transcriptome annotation ->
#' gtf_file <- system.file("extdata", "annotations.gtf", package = "ORFik")
#' # Ribo seq data ->
#' riboSeq_file <- system.file("extdata", "ribo-seq.bam", package = "ORFik")
#' \dontrun{
#' footprints <- readBam(riboSeq_file)
#'
#' detectRibosomeShifts(footprints, gtf_file, stop = TRUE)
#' ## Subset bam file
#' param = ScanBamParam(flag = scanBamFlag(
#' isDuplicate = FALSE,
#' isSecondaryAlignment = FALSE))
#' footprints <- readBam(riboSeq_file, param = param)
#' detectRibosomeShifts(footprints, gtf_file, stop = TRUE)
#'
#' ## Without 5' Annotation
#' library(GenomicFeatures)
#'
#' txdb <- loadTxdb(gtf_file)
#' tx <- exonsBy(txdb, by = "tx", use.names = TRUE)
#' tx <- extendLeaders(tx, 30)
#' # Now run function, without 5' and 3' UTRs
#' detectRibosomeShifts(footprints, txdb, start = TRUE, minFiveUTR = NULL,
#' minCDS = 150L, minThreeUTR = NULL, firstN = 150L,
#' tx = tx)
#'
#' }
#'
detectRibosomeShifts <- function(footprints, txdb, start = TRUE, stop = FALSE,
top_tx = 10L, minFiveUTR = 30L, minCDS = 150L, minThreeUTR = 30L,
firstN = 150L, tx = NULL, min_reads = 1000, accepted.lengths = 26:34,
heatmap = FALSE, must.be.periodic = TRUE) {
txdb <- loadTxdb(txdb)
# Filters for cds and footprints
txNames <- filterTranscripts(txdb, minFiveUTR = minFiveUTR, minCDS = minCDS,
minThreeUTR = minThreeUTR)
cds <- loadRegion(txdb, part = "cds", names.keep = txNames)
footprints <- fimport(footprints, cds)
# reduce data-set to only matching seqlevels
seqMatch <- validSeqlevels(cds, footprints)
cds <- keepSeqlevels(cds, seqMatch, pruning.mode = "coarse")
footprints <- keepSeqlevels(footprints, seqMatch, pruning.mode = "coarse")
if (length(cds) == 0 | length(footprints) == 0) {
stop("txdb and footprints did not have any matched seqnames")
}
txNames <- txNames[txNames %in% names(cds)]
if (is.null(tx)) tx <- loadRegion(txdb)
tx <- tx[txNames]
# find periodic read lengths
footprints <- convertToOneBasedRanges(footprints, addSizeColumn = TRUE,
addScoreColumn = TRUE,
along.reference = TRUE)
# Filter if < 1000 counts read size or not in accepted.lengths
lengths <- data.table(score = footprints$score, size = footprints$size)
tab <- lengths[, .(counts = sum(score)), by = size]
tab <- tab[(counts >= min_reads) & (size %in% accepted.lengths),]
if (nrow(tab) == 0) stop("No valid read lengths found with",
"accepted.lengths and counts > min_reads")
cds <- cds[countOverlapsW(cds, footprints, "score") > 0]
top_tx <- percentage_to_ratio(top_tx, cds)
if (must.be.periodic) {
periodicity <- windowPerReadLength(cds, tx, footprints,
pShifted = FALSE, upstream = 0,
downstream = firstN - 1,
zeroPosition = 0, scoring = "periodic",
acceptedLengths = tab$size)
validLengths <- periodicity[score == TRUE,]$fraction
} else validLengths <- accepted.lengths
if (length(validLengths) == 0)
stop(paste("Library contained no periodic or accepted read-lengths,",
"check your library. Are you using the correct genome?",
"Is this Ribo-seq?"))
# find shifts
if (start) {
rw <- windowPerReadLength(cds, tx, footprints, pShifted,
upstream = 30, downstream = 29,
acceptedLengths = validLengths,
scoring = NULL)
rw[, sum.count := sum(count), by = genes]
rw <- rw[sum.count >= quantile(sum.count, top_tx), ]
rw <- coverageScorings(rw, scoring = "sum")
footprints.analysis(rw, heatmap)
offset <- rw[, .(offsets_start = changePointAnalysis(score)),
by = fraction]
}
if (stop & !is.null(minThreeUTR)) {
threeUTRs <- loadRegion(txdb, "trailers")[txNames]
rw <- windowPerReadLength(threeUTRs, tx, footprints, FALSE,
upstream = 30, downstream = 29,
acceptedLengths = validLengths,
scoring = "sum")
footprints.analysis(rw, heatmap, region = "start of 3' UTR")
if (nrow(offset) == 0) {
offset <- rw[, .(offsets_stop = changePointAnalysis(score, "stop")),
by = fraction]
} else offset$offsets_stop <- rw[, .(changePointAnalysis(score, "stop")),
by = fraction]$V1
}
return(offset)
}
#' Shift footprints of each file in experiment
#'
#' For more details, see: \code{\link{detectRibosomeShifts}}
#'
#' #' Saves files to a specified location as .ofst and .wig,
#' The .ofst file will include a score column containing read width. \cr
#' The .wig fiels, will be saved in pairs of +/- strand, and score column
#' will be replicates of reads starting at that position,
#' score = 5 means 5 reads.\cr
#' Remember that different species might have different default Ribosome
#' read lengths, for human, mouse etc, normally around 27:30.
#' @inheritParams detectRibosomeShifts
#' @param df an ORFik \code{\link{experiment}}
#' @param out.dir output directory for files,
#' default: dirname(df$filepath[1]), making a /pshifted
#' folder at that location
#' @param output_format default c("ofst", "wig"), use export.ofst or
#' wiggle format (wig) using \code{\link{export.wiggle}} ? Default is both.
#' The wig format version can be used in IGV, the score column is counts of that
#' read with that read length, the cigar reference width is lost,
#' ofst is much faster to save and load in R, and retain cigar reference width,
#' but can not be used in IGV. \cr You can also do bedoc format, bed format
#' keeping cigar: \code{\link{export.bedoc}}. bedoc is usually not used for
#' p-shifting.
#' @param BPPARAM how many cores/threads to use? default: bpparam()
#' @param log logical, default (TRUE), output a log file with parameters used.
#' @return NULL (Objects are saved to out.dir/pshited/"name_pshifted.ofst",
#' wig, bedo or .bedo)
#' @importFrom rtracklayer export.bed
#' @importFrom utils packageVersion
#' @family pshifting
#' @references https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-018-4912-6
#' @export
#' @examples
#' df <- ORFik.template.experiment()
#' df <- df[3,] #lets only p-shift RFP sample at index 3
#' # If you want to check it in IGV do:
#' shiftFootprintsByExperiment(df)
#' # Then use the .wig files that are created, which are readable in IGV.
#' # If you only need in R, do: (then you get no .wig files)
#' #shiftFootprintsByExperiment(df, output_format = "ofst")
shiftFootprintsByExperiment <- function(df,
out.dir = pasteDir(dirname(
df$filepath[1]), "/pshifted/"),
start = TRUE, stop = FALSE,
top_tx = 10L, minFiveUTR = 30L,
minCDS = 150L, minThreeUTR = 30L,
firstN = 150L, min_reads = 1000,
accepted.lengths = 26:34,
output_format = c("ofst", "wig"),
BPPARAM = bpparam(),
log = TRUE, heatmap = FALSE,
must.be.periodic = TRUE) {
path <- out.dir
dir.create(path, showWarnings = FALSE, recursive = TRUE)
if (!dir.exists(path)) stop(paste("out.dir", out.dir, "does not exist!"))
if (!any(c("bed", "bedo", "wig", "ofst") %in% output_format))
stop("output_format allowed bed, bedo, wig or ofst")
for (out.form in output_format)
message(paste("Saving", out.form, "files to:", out.dir))
message(paste("Shifting reads in experiment:", df@experiment))
txdb <- loadTxdb(df)
rfpFiles <- filepath(df, "ofst") # If ofst file not present, uses bam file
shifts <- bplapply(rfpFiles,
FUN = function(file, path, df, start, stop,
top_tx, minFiveUTR, minCDS, minThreeUTR,
firstN, min_reads, accepted.lengths,
output_format, heatmap = heatmap,
must.be.periodic = must.be.periodic
) {
message(file)
rfp <- fimport(file)
shifts <- detectRibosomeShifts(rfp, txdb = loadTxdb(df), start = start,
stop = stop, top_tx = top_tx,
minFiveUTR = minFiveUTR,
minCDS = minCDS, minThreeUTR = minThreeUTR,
firstN = firstN, min_reads = min_reads,
accepted.lengths = accepted.lengths,
heatmap = heatmap,
must.be.periodic = must.be.periodic)
shifted <- shiftFootprints(rfp, shifts)
name <- paste0(path, remove.file_ext(file, basename = TRUE))
if ("bedo" %in% output_format) {
export.bedo(convertToOneBasedRanges(shifted, addScoreColumn = TRUE,
addSizeColumn = TRUE),
paste0(name, "_pshifted.bedo"))
}
if ("ofst" %in% output_format) {
export.ofst(convertToOneBasedRanges(shifted, addScoreColumn = TRUE,
addSizeColumn = TRUE),
paste0(name, "_pshifted.ofst"))
}
if ("bed" %in% output_format) {
export.bed(convertToOneBasedRanges(shifted, addScoreColumn = TRUE,
addSizeColumn = FALSE),
paste0(name, "_pshifted.bed"))
}
if ("wig" %in% output_format) {
export.wiggle(shifted, paste0(name, "_pshifted.wig"))
}
return(shifts)
}, path = path, df = df, start = start, stop = stop,
top_tx = top_tx, minFiveUTR = minFiveUTR,
minCDS = minCDS, minThreeUTR = minThreeUTR,
firstN = firstN, min_reads = min_reads,
accepted.lengths = accepted.lengths, output_format = output_format,
heatmap = heatmap, must.be.periodic = must.be.periodic,
BPPARAM = BPPARAM)
if (log) {
fileConn<-file(paste0(path, "/pshifting_arguments.txt"), "w")
cat(paste("From ORFik version:", packageVersion("ORFik"), "\n"),
file = fileConn)
cat("All arguments not specificed below are default:\n", file = fileConn)
cat(paste(as.character(sys.call()), "\n"), file = fileConn)
close(fileConn)
# Save shifts
names(shifts) <- rfpFiles
saveRDS(shifts, file = paste0(path, "/shifting_table.rds"))
}
return(invisible(NULL))
}
#' Plot shifted heatmaps per library
#'
#' A good validation for you p-shifting, to see shifts are corresponding
#' and close to the CDS TIS.
#' @inheritParams shiftFootprintsByExperiment
#' @param output name to save file, full path. (Default NULL) No saving.
#' @param scoring which scoring scheme to use for heatmap, default
#' "transcriptNormalized". Some alternatives: "sum", "zscore".
#' @param title Title for top of plot, default "Ribo-seq".
#' A more informative name could be "Ribo-seq zebrafish Chew et al. 2013"
#' @param addFracPlot logical, default TRUE, add positional sum plot on top
#' per heatmap.
#' @importFrom gridExtra grid.arrange
#' @return a ggplot2 grob object
#' @export
#' @examples
#' df <- ORFik.template.experiment()
#' df <- df[3,] #lets only p-shift RFP sample at index 3
#' #shiftFootprintsByExperiment(df, output_format = "bedo)
#' #shiftPlots(df, title = "Ribo-seq Human ORFik et al. 2020")
shiftPlots <- function(df, output = NULL, title = "Ribo-seq",
scoring = "transcriptNormalized",
addFracPlot = TRUE,
BPPARAM = bpparam()) {
txNames <- filterTranscripts(df, 20, 21, 1)
txdb <- loadTxdb(df)
cds <- loadRegion(txdb, part = "cds", names.keep = txNames)
mrna <- loadRegion(txdb, part = "mrna", names.keep = txNames)
style <- seqlevelsStyle(cds)
plots <- bplapply(seq(nrow(df)),
function(x, cds, mrna, style, paths, df) {
miniTitle <- gsub("_", " ", bamVarName(df, skip.experiment = TRUE)[x])
hitMap <- windowPerReadLength(cds, mrna, fimport(paths[x], style),
pShifted = TRUE)
coverageHeatMap(hitMap, scoring = scoring, addFracPlot = addFracPlot,
title = miniTitle)
}, cds = cds, mrna = mrna, style = style, BPPARAM = BPPARAM,
paths = filepath(df, "pshifted"), df = df)
res <- do.call("grid.arrange", c(plots, ncol=1, top = title))
if (!is.null(output))
ggsave(output, res,
width = 225, height = (length(res) -1)*65,
units = "mm", dpi = 300)
return(res)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.