Nothing
## ---- echo=FALSE--------------------------------------------------------------
library(knitr)
opts_chunk$set(cache=FALSE,
error=FALSE)
## ----message=FALSE------------------------------------------------------------
library(alpineData)
dir <- system.file("extdata",package="alpineData")
metadata <- read.csv(file.path(dir,"metadata.csv"),
stringsAsFactors=FALSE)
metadata[,c("Title","Performer","Date","Population")]
## ----message=FALSE------------------------------------------------------------
library(GenomicAlignments)
ERR188297()
## ----message=FALSE------------------------------------------------------------
library(rtracklayer)
dir <- tempdir()
for (sample.name in metadata$Title) {
# the reads are accessed with functions named
# after the sample name. the following line calls
# the function with the sample name and saves
# the reads to `gap`
gap <- match.fun(sample.name)()
file.name <- file.path(dir,paste0(sample.name,".bam"))
export(gap, con=file.name)
}
bam.files <- file.path(dir, paste0(metadata$Title, ".bam"))
names(bam.files) <- metadata$Title
stopifnot(all(file.exists(bam.files)))
## ---- eval=FALSE--------------------------------------------------------------
# library(ensembldb)
# gtf.file <- "Homo_sapiens.GRCh38.84.gtf"
# txdb <- EnsDb(gtf.file) # already an EnsDb
# txdf <- transcripts(txdb, return.type="DataFrame")
# tab <- table(txdf$gene_id)
# one.iso.genes <- names(tab)[tab == 1]
# # pre-selected genes based on medium to high counts
# # calculated using Rsubread::featureCounts
# selected.genes <- scan("selected.genes.txt", what="char")
# one.iso.txs <- txdf$tx_id[txdf$gene_id %in%
# intersect(one.iso.genes, selected.genes)]
# ebt0 <- exonsBy(txdb, by="tx")
# ebt.fit <- ebt0[one.iso.txs]
## ----message=FALSE------------------------------------------------------------
library(GenomicRanges)
## -----------------------------------------------------------------------------
library(alpine)
data(preprocessedData)
# filter small genes and long genes
min.bp <- 600
max.bp <- 7000
gene.lengths <- sum(width(ebt.fit))
summary(gene.lengths)
ebt.fit <- ebt.fit[gene.lengths > min.bp & gene.lengths < max.bp]
length(ebt.fit)
set.seed(1)
# better to use ~100 genes
ebt.fit <- ebt.fit[sample(length(ebt.fit),10)]
## -----------------------------------------------------------------------------
w <- getFragmentWidths(bam.files[1], ebt.fit[[1]])
c(summary(w), Number=length(w))
quantile(w, c(.025, .975))
## -----------------------------------------------------------------------------
getReadLength(bam.files)
## ----message=FALSE------------------------------------------------------------
library(alpine)
library(BSgenome.Hsapiens.NCBI.GRCh38)
readlength <- 75
minsize <- 125 # better 80 for this data
maxsize <- 175 # better 350 for this data
gene.names <- names(ebt.fit)
names(gene.names) <- gene.names
## ----buildFragtype------------------------------------------------------------
system.time({
fragtypes <- lapply(gene.names, function(gene.name) {
buildFragtypes(exons=ebt.fit[[gene.name]],
genome=Hsapiens,
readlength=readlength,
minsize=minsize,
maxsize=maxsize,
gc.str=FALSE)
})
})
print(object.size(fragtypes), units="auto")
## -----------------------------------------------------------------------------
head(fragtypes[[1]], 3)
## -----------------------------------------------------------------------------
models <- list(
"GC" = list(
formula = "count ~ ns(gc,knots=gc.knots,Boundary.knots=gc.bk) + ns(relpos,knots=relpos.knots,Boundary.knots=relpos.bk) + gene",
offset=c("fraglen")
),
"all" = list(
formula = "count ~ ns(gc,knots=gc.knots,Boundary.knots=gc.bk) + ns(relpos,knots=relpos.knots,Boundary.knots=relpos.bk) + gene",
offset=c("fraglen","vlmm")
)
)
## ----fitBiasModels------------------------------------------------------------
system.time({
fitpar <- lapply(bam.files, function(bf) {
fitBiasModels(genes=ebt.fit,
bam.file=bf,
fragtypes=fragtypes,
genome=Hsapiens,
models=models,
readlength=readlength,
minsize=minsize,
maxsize=maxsize)
})
})
# this object saved was 'fitpar.small' for examples in alpine man pages
# fitpar.small <- fitpar
## -----------------------------------------------------------------------------
library(RColorBrewer)
palette(brewer.pal(8,"Dark2"))
## ----fraglen------------------------------------------------------------------
perf <- as.integer(factor(metadata$Performer))
plotFragLen(fitpar, col=perf)
## ----gccurve------------------------------------------------------------------
plotGC(fitpar, model="all", col=perf)
## ----relpos-------------------------------------------------------------------
plotRelPos(fitpar, model="all", col=perf)
## ----vlmm---------------------------------------------------------------------
plotOrder0(fitpar[["ERR188297"]][["vlmm.fivep"]][["order0"]])
plotOrder0(fitpar[["ERR188297"]][["vlmm.threep"]][["order0"]])
## -----------------------------------------------------------------------------
print(head(fitpar[["ERR188297"]][["summary"]][["all"]]), row.names=FALSE)
## ---- eval=FALSE--------------------------------------------------------------
# one.iso.genes <- intersect(names(tab)[tab == 1], selected.genes)
# two.iso.genes <- intersect(names(tab)[tab == 2], selected.genes)
# three.iso.genes <- intersect(names(tab)[tab == 3], selected.genes)
# set.seed(1)
# genes.theta <- c(sample(one.iso.genes, 2),
# sample(two.iso.genes, 2),
# sample(three.iso.genes, 2))
# txdf.theta <- txdf[txdf$gene_id %in% genes.theta,]
# ebt.theta <- ebt0[txdf.theta$tx_id]
## -----------------------------------------------------------------------------
model.names <- c("null","fraglen.vlmm","GC")
## ----estimateAbundance--------------------------------------------------------
system.time({
res <- lapply(genes.theta, function(gene.name) {
txs <- txdf.theta$tx_id[txdf.theta$gene_id == gene.name]
estimateAbundance(transcripts=ebt.theta[txs],
bam.files=bam.files,
fitpar=fitpar,
genome=Hsapiens,
model.names=model.names)
})
})
## -----------------------------------------------------------------------------
res[[1]][["ERR188297"]][["GC"]]
res[[6]][["ERR188297"]][["GC"]]
## -----------------------------------------------------------------------------
mat <- extractAlpine(res, model="GC")
mat
## -----------------------------------------------------------------------------
se <- extractAlpine(res, model="GC", transcripts=ebt.theta)
se
## ---- eval=FALSE--------------------------------------------------------------
# norm.mat <- normalizeDESeq(mat, cutoff=0.1)
## -----------------------------------------------------------------------------
data(preprocessedData)
prob.mat <- plotGC(fitpar, "all", return.type=2)
head(prob.mat)
## -----------------------------------------------------------------------------
model.names <- c("fraglen","fraglen.vlmm","GC","all")
## -----------------------------------------------------------------------------
fitpar[[1]][["model.params"]][c("minsize","maxsize")]
## ----predictCoverage----------------------------------------------------------
system.time({
pred.cov <- predictCoverage(gene=ebt.fit[["ENST00000245479"]],
bam.files=bam.files["ERR188204"],
fitpar=fitpar,
genome=Hsapiens,
model.names=model.names)
})
## -----------------------------------------------------------------------------
palette(brewer.pal(9, "Set1"))
frag.cov <- pred.cov[["ERR188204"]][["frag.cov"]]
plot(frag.cov, type="l", lwd=3, ylim=c(0,max(frag.cov)*1.5))
for (i in seq_along(model.names)) {
m <- model.names[i]
pred <- pred.cov[["ERR188204"]][["pred.cov"]][[m]]
lines(pred, col=i, lwd=3)
}
legend("topright", legend=c("observed",model.names),
col=c("black",seq_along(model.names)), lwd=3)
## -----------------------------------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.