Nothing
## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
library(ORFik) # This package
library(GenomicFeatures) # For basic transcript operations
library(data.table) # For fast table operations
library(BSgenome.Hsapiens.UCSC.hg19) # Human genome
## ----eval = TRUE, echo = TRUE-------------------------------------------------
txdbFile <- system.file("extdata", "hg19_knownGene_sample.sqlite",
package = "GenomicFeatures")
## ----eval = TRUE, echo = TRUE-------------------------------------------------
txdb <- loadTxdb(txdbFile)
fiveUTRs <- loadRegion(txdb, "leaders")
fiveUTRs[1]
## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
# Extract sequences of fiveUTRs.
tx_seqs <- extractTranscriptSeqs(BSgenome.Hsapiens.UCSC.hg19::Hsapiens,
fiveUTRs)
tx_seqs[1]
## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
fiveUTR_ORFs <- findMapORFs(fiveUTRs, tx_seqs, groupByTx = FALSE)
fiveUTR_ORFs[1:2]
## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
txNames(fiveUTR_ORFs[1:2]) # <- Which transcript
## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
names(fiveUTR_ORFs[1:2]) # <- Which ORF
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# saveRDS(fiveUTR_ORFs[1:2], "save/path/uorfs.rds")
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# export.bed12(fiveUTR_ORFs[1:2], "save/path/uorfs.bed12")
## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
orf_seqs <- extractTranscriptSeqs(BSgenome.Hsapiens.UCSC.hg19::Hsapiens,
fiveUTR_ORFs[1:3])
orf_seqs
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# writeXStringSet(orf_seqs, filepath = "uorfs.fasta")
## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
orf_aa_seq <- Biostrings::translate(orf_seqs)
orf_aa_seq
## ----eval = FALSE, echo = TRUE, message = FALSE-------------------------------
# writeXStringSet(orf_aa_seq, filepath = "uorfs_AA.fasta")
## ----eval = TRUE, echo = TRUE-------------------------------------------------
unlisted_ranges <- unlistGrl(fiveUTR_ORFs)
test_ranges <- groupGRangesBy(unlisted_ranges) # <- defualt is tx grouping by names()
test_ranges[1]
## ----eval = TRUE, echo = TRUE-------------------------------------------------
unlisted_ranges <- unlistGrl(fiveUTR_ORFs)
test_ranges <- groupGRangesBy(unlisted_ranges, unlisted_ranges$names)
test_ranges[1:2]
## ----eval = TRUE, echo = TRUE-------------------------------------------------
unlisted_ranges <- unlistGrl(fiveUTR_ORFs)
ORFs <- groupGRangesBy(unlisted_ranges, unlisted_ranges$names)
length(ORFs) # length means how many ORFs left in set
## ----eval = TRUE, echo = TRUE-------------------------------------------------
ORFs <- ORFs[widthPerGroup(ORFs) >= 60]
length(ORFs)
## ----eval = TRUE, echo = TRUE-------------------------------------------------
ORFs <- ORFs[numExonsPerGroup(ORFs) > 1]
length(ORFs)
## ----eval = TRUE, echo = TRUE-------------------------------------------------
ORFs <- ORFs[strandPerGroup(ORFs) == "+"]
# all remaining ORFs where on positive strand, so no change
length(ORFs)
## ----eval = TRUE, echo = TRUE-------------------------------------------------
startSites(fiveUTR_ORFs, asGR = TRUE, keep.names = TRUE, is.sorted = TRUE)
stopSites(fiveUTR_ORFs, asGR = TRUE, keep.names = TRUE, is.sorted = TRUE)
## ----eval = TRUE, echo = TRUE-------------------------------------------------
starts <- startCodons(fiveUTR_ORFs, is.sorted = TRUE)
stops <- stopCodons(fiveUTR_ORFs, is.sorted = TRUE)
starts[1:2]
## ----eval = TRUE, echo = TRUE-------------------------------------------------
txSeqsFromFa(starts, BSgenome.Hsapiens.UCSC.hg19::Hsapiens, is.sorted = TRUE)
"Stop codons"
txSeqsFromFa(stops, BSgenome.Hsapiens.UCSC.hg19::Hsapiens, is.sorted = TRUE)
## ----eval = TRUE, echo = TRUE-------------------------------------------------
library(Biostrings)
# strand with ORFs in both directions
seqs <- DNAStringSet("ATGAAATGAAGTAAATCAAAACAT")
######################>######################< (< > is direction of ORF)
# positive strands
pos <- findORFs(seqs, startCodon = "ATG", minimumLength = 0)
# negative strands
neg <- findORFs(reverseComplement(seqs),
startCodon = "ATG", minimumLength = 0)
## ----eval = TRUE, echo = TRUE-------------------------------------------------
pos <- GRanges(pos, strand = "+")
neg <- GRanges(neg, strand = "-")
# as GRanges
res <- c(pos, neg)
# or merge together and make GRangesList
res <- split(res, seq.int(1, length(pos) + length(neg)))
res[1:2]
## ----eval = TRUE, echo = TRUE-------------------------------------------------
res[strandBool(res)] # Keep only + stranded ORFs
## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE---------------
# path to example CageSeq data from hg19 heart sample
cageData <- system.file("extdata", "cage-seq-heart.bed.bgz",
package = "ORFik")
# get new Transcription Start Sites using CageSeq dataset
newFiveUTRs <- reassignTSSbyCage(fiveUTRs, cageData)
newFiveUTRs
## ----eval = TRUE, echo = TRUE-------------------------------------------------
# Find path to a bam file
bam_file <- system.file("extdata", "ribo-seq.bam", package = "ORFik")
footprints <- readBam(bam_file)
## ----eval = TRUE, echo = TRUE-------------------------------------------------
table(readWidths(footprints))
## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE---------------
gtf_file <- system.file("extdata", "annotations.gtf", package = "ORFik")
txdb <- loadTxdb(gtf_file)
tx <- exonsBy(txdb, by = "tx", use.names = TRUE)
cds <- cdsBy(txdb, by = "tx", use.names = TRUE)
trailers <- threeUTRsByTranscript(txdb, use.names = TRUE)
cds[1]
## ----eval = FALSE, echo = TRUE, warning = FALSE, message = FALSE--------------
# loadRegions(gtf_file, parts = c("tx", "cds", "trailers"))
## ----eval = TRUE, echo = TRUE-------------------------------------------------
footprintsGR <- convertToOneBasedRanges(footprints, addSizeColumn = TRUE)
footprintsGR
## ----eval = TRUE, echo = TRUE-------------------------------------------------
hitMap <- windowPerReadLength(cds, tx, footprintsGR, pShifted = FALSE)
coverageHeatMap(hitMap, scoring = "transcriptNormalized")
## ----eval = TRUE, echo = TRUE-------------------------------------------------
footprints <- footprints[readWidths(footprints) == 29]
footprintsGR <- footprintsGR[readWidths(footprintsGR) == 29]
footprints
## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
txNames <- filterTranscripts(txdb) # <- get only transcripts that pass filter
tx <- tx[txNames]; cds <- cds[txNames]; trailers <- trailers[txNames];
windowsStart <- startRegion(cds, tx, TRUE, upstream = 30, 29)
windowsStop <- startRegion(trailers, tx, TRUE, upstream = 30, 29)
windowsStart
## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
hitMapStart <- metaWindow(footprintsGR, windowsStart, withFrames = TRUE)
hitMapStop <- metaWindow(footprintsGR, windowsStop, withFrames = TRUE)
## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
pSitePlot(hitMapStart)
## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
pSitePlot(hitMapStop, region = "stop")
## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
shifts <- detectRibosomeShifts(footprints, txdb, stop = TRUE)
shifts
## ----eval = TRUE, echo = TRUE, warning = FALSE--------------------------------
shiftedFootprints <- shiftFootprints(footprints, shifts)
shiftedFootprints
## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE---------------
fiveUTRs <- fiveUTRs[1:10]
faFile <- BSgenome.Hsapiens.UCSC.hg19::Hsapiens
tx_seqs <- extractTranscriptSeqs(faFile, fiveUTRs)
fiveUTR_ORFs <- findMapORFs(fiveUTRs, tx_seqs, groupByTx = FALSE)
## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE---------------
starts <- unlist(ORFik:::firstExonPerGroup(fiveUTR_ORFs), use.names = FALSE)
RFP <- promoters(starts, upstream = 0, downstream = 1)
score(RFP) <- rep(29, length(RFP)) # the original read widths
# set RNA-seq seq to duplicate transcripts
RNA <- unlist(exonsBy(txdb, by = "tx", use.names = TRUE), use.names = TRUE)
## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE---------------
# transcript database
txdb <- loadTxdb(txdbFile)
dt <- computeFeatures(fiveUTR_ORFs[1:4], RFP, RNA, txdb, faFile,
sequenceFeatures = TRUE)
dt
## ----eval = TRUE, echo = TRUE-------------------------------------------------
cds <- cdsBy(txdb, by = "tx", use.names = TRUE)[1:10]
tx <- exonsBy(txdb, by = "tx", use.names = TRUE)[names(cds)]
faFile <- BSgenome.Hsapiens.UCSC.hg19::Hsapiens
kozakSequenceScore(cds, tx, faFile, species = "human")
## ----eval = TRUE, echo = TRUE-------------------------------------------------
pfm <- t(matrix(as.integer(c(29,26,28,26,22,35,62,39,28,24,27,17,
21,26,24,16,28,32,5,23,35,12,42,21,
25,24,22,33,22,19,28,17,27,47,16,34,
25,24,26,25,28,14,5,21,10,17,15,28)),
ncol = 4))
kozakSequenceScore(cds, tx, faFile, species = pfm)
## ----eval = TRUE, echo = TRUE-------------------------------------------------
seqs <- startRegionString(cds, tx, faFile, upstream = 5, downstream = 5)
rate <- fpkm(cds, RFP)
ORFik:::kozakHeatmap(seqs, rate)
## ----eval = TRUE, echo = TRUE-------------------------------------------------
# First make some toy example
cds <- GRanges("chr1", IRanges(c(1, 10, 20, 30, 40, 50, 60, 70, 80),
c(5, 15, 25, 35, 45, 55, 65, 75, 85)),
"+")
names(cds) <- c(rep("tx1", 3), rep("tx2", 3), rep("tx3", 3))
cds <- groupGRangesBy(cds)
ribo <- GRanges("chr1", c(1, rep.int(23, 4), 30, 34, 34, 43, 60, 64, 71, 74),
"+")
# We could do a simplification and use the ORFik entropy function
entropy(cds, ribo) # <- spread of reads
## ----eval = TRUE, echo = TRUE-------------------------------------------------
tile <- tile1(cds, FALSE, FALSE) # tile them to 1 based positions
tails <- tails(tile, 9) # get 9 last bases per cds
stopOverlap <- countOverlaps(tails, ribo)
allOverlap <- countOverlaps(cds, ribo)
fractions <- (stopOverlap + 1) / (allOverlap + 1) # pseudocount 1
cdsToRemove <- fractions > 1 / 2 # filter with pseudocounts (1+1)/(3+1)
cdsToRemove
## ----eval = TRUE, echo = TRUE, warning = FALSE, message = FALSE---------------
# Get the annotation
txdb <- loadTxdb(gtf_file)
# Ribo-seq
bam_file <- system.file("extdata", "ribo-seq.bam", package = "ORFik")
reads <- readGAlignments(bam_file)
shiftedReads <- shiftFootprints(reads, detectRibosomeShifts(reads, txdb))
## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
# Lets take all valid transcripts, with size restrictions:
# leader > 100 bases, cds > 100 bases, trailer > 100 bases
txNames <- filterTranscripts(txdb, 100, 100, 100) # valid transcripts
loadRegions(txdb, parts = c("leaders", "cds", "trailers", "tx"),
names.keep = txNames)
# Create meta coverage per part of transcript
leaderCov <- metaWindow(shiftedReads, leaders, scoring = NULL,
feature = "leaders")
cdsCov <- metaWindow(shiftedReads, cds, scoring = NULL,
feature = "cds")
trailerCov <- metaWindow(shiftedReads, trailers, scoring = NULL,
feature = "trailers")
## ----eval = TRUE, echo = TRUE, message = FALSE--------------------------------
dt <- rbindlist(list(leaderCov, cdsCov, trailerCov))
dt[, `:=` (fraction = "Ribo-seq")] # Set info column
# zscore gives shape, a good starting plot
windowCoveragePlot(dt, scoring = "zscore", title = "Ribo-seq metaplot")
## ----eval = TRUE, echo = TRUE-------------------------------------------------
windowCoveragePlot(dt, scoring = "median", title = "Ribo-seq metaplot")
## ----eval = TRUE, echo = TRUE-------------------------------------------------
if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
# size 100 window: 50 upstream, 49 downstream of TIS
windowsStart <- startRegion(cds, tx, TRUE, upstream = 50, 49)
hitMapStart <- metaWindow(shiftedReads, windowsStart, withFrames = TRUE)
pSitePlot(hitMapStart, length = "meta coverage")
}
## ----eval = TRUE, echo = TRUE, message = FALSE, fig.height=8------------------
hitMap <- windowPerReadLength(cds, tx, shiftedReads)
coverageHeatMap(hitMap, addFracPlot = TRUE)
## ----eval = TRUE, echo = TRUE, message=FALSE----------------------------------
if (requireNamespace("BSgenome.Hsapiens.UCSC.hg19")) {
# Load more files like above (Here I make sampled data from earlier Ribo-seq)
dt2 <- copy(dt)
dt2[, `:=` (fraction = "Ribo-seq2")]
dt2$score <- dt2$score + sample(seq(-40, 40), nrow(dt2), replace = TRUE)
dtl <- rbindlist(list(dt, dt2))
windowCoveragePlot(dtl, scoring = "median", title = "Ribo-seq metaplots")
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.