Nothing
## ----githubInstall, eval=FALSE------------------------------------------------
# ## Install SPsimSeq
# library(devtools)
# install_github("CenterForStatistics-UGent/SPsimSeq")
## ----BiocInstall, eval=FALSE--------------------------------------------------
# ## Install SPsimSeq
# library(BiocManager)
# BiocManager::install("SPsimSeq")
## ----loadSPsimSeq-------------------------------------------------------------
# load package
library(SPsimSeq)
## ----loadData, eval=TRUE------------------------------------------------------
# load the Zhang bulk RNA-seq data (availabl with the package)
data("zhang.data.sub")
# filter genes with sufficient expression (important step to avoid bugs)
zhang.counts <- zhang.data.sub$counts
MYCN.status <- zhang.data.sub$MYCN.status #The grouping variable
## ----simulateData-------------------------------------------------------------
set.seed(6452) #Set seed for reproducibility
# simulate data
sim.data.bulk <- SPsimSeq(n.sim = 1, s.data = zhang.counts,
group = MYCN.status, n.genes = 3000, batch.config = 1,
group.config = c(0.5, 0.5), tot.samples = ncol(zhang.counts),
pDE = 0.1, lfc.thrld = 0.5, result.format = "list", return.details = TRUE)
## ----dataExploration----------------------------------------------------------
sim.data.bulk1 <- sim.data.bulk$sim.data.list[[1]]
head(sim.data.bulk1$counts[, seq_len(5)]) # count data
head(sim.data.bulk1$colData) # sample info
head(sim.data.bulk1$rowData) # gene info
## ----evaluateDensities--------------------------------------------------------
geneDens = evaluateDensities(sim.data.bulk, newData = rownames(zhang.counts)[1])
#This returns for every sample, the midpoints (mids) and associated densities (gy)
## ----comparison, warning=FALSE, fig.width=8, fig.height=4---------------------
# compare the distributions of the mean expressions, variability,
# and fraction of zero counts per gene
library(LSD) # for generating heatmap plots
# normalize counts for comparison
Y0.log.cpm <- log2(edgeR::cpm(zhang.counts)+1)
Y1.log.cpm <- log2(edgeR::cpm(sim.data.bulk1$counts)+1)
Y0.log.cpm <- Y0.log.cpm[rowMeans(Y0.log.cpm>0)>=0.1, ]
Y1.log.cpm <- Y1.log.cpm[rowMeans(Y1.log.cpm>0)>=0.1, ]
rowVars <- function(X){apply(X, 1, var, na.rm=TRUE)}
rowCVs <- function(X){apply(X, 1, function(x) sd(x, na.rm=TRUE)/mean(x, na.rm=TRUE))}
par(mfrow=c(1, 3))
boxplot(list(real.data=log(colSums(zhang.counts)),
simulated.data=log(sim.data.bulk1$colData$sim.Lib.Size)),
main="library size")
boxplot(list(real.data=rowMeans(Y0.log.cpm),
simulated.data=rowMeans(Y1.log.cpm)),
main="mean expression of genes")
boxplot(list(real.data=rowVars(Y0.log.cpm),
simulated.data=rowVars(Y1.log.cpm)),
main="variance of gene expressions")
## ----meanvariancetrend--------------------------------------------------------
# compare the relationship between the mean and variability
par(mfrow=c(1,3), mar=c(4,4,4,1))
heatscatter(rowMeans(Y0.log.cpm), rowCVs(Y0.log.cpm), ylim=c(0, 6), xlim=c(0, 16),
colpal="bl2gr2rd", main="real data", xlab="mean log2-CPM",
ylab="coefficients of variation", cexplot=0.5, alpha = 60, cex.lab=1.25)
heatscatter(rowMeans(Y1.log.cpm), rowCVs(Y1.log.cpm), ylim=c(0, 6), xlim=c(0, 16),
main="SPsimSeq", xlab="mean log2-CPM", ylab="coefficients of variation",
cexplot=0.5, alpha = 60, colpal="bl2gr2rd", cex.lab=1.25)
n.gride <- 1000
min.g <- seq(0, 20, length.out = n.gride+1)[-n.gride]
max.g <- seq(0, 20, length.out = n.gride+1)[-1]
mid.g <- (min.g+max.g)/2
f.real <- vapply(seq_len(n.gride), FUN.VALUE = double(1), function(r){
x <- Y0.log.cpm[rowMeans(Y0.log.cpm)<=max.g[r] & rowMeans(Y0.log.cpm)>min.g[r],]
y <- ifelse(!is.null(dim(x)), mean(rowCVs(x)), mean(sd(x)/mean(x)))
y
})
f.SPsim <- vapply(seq_len(n.gride), FUN.VALUE = double(1), function(r){
x <- Y1.log.cpm[rowMeans(Y1.log.cpm)<=max.g[r] & rowMeans(Y1.log.cpm)>min.g[r],]
y <- ifelse(!is.null(dim(x)), mean(rowCVs(x)), mean(sd(x)/mean(x)))
y
})
sm1 <- loess(I(f.SPsim-f.real)~mid.g)
plot(mid.g, f.SPsim-f.real, xlim=c(0, 14), col="lightskyblue", pch=20, cex.lab=1.25,
cex.main=1.4, main="SPsimSeq - real data", ylab="difference", xlab="mean log2-CPM")
lines(mid.g,predict(sm1, newdata = mid.g), col="blue", lwd=3)
## ----correlation--------------------------------------------------------------
# compare the correlation between genes and samples
cor.mat.Y0 <- cor(t(Y0.log.cpm))
cor.mat.Y1 <- cor(t(Y1.log.cpm))
cor.vec.Y0 <- cor.mat.Y0[upper.tri(cor.mat.Y0)]
cor.vec.Y1 <- cor.mat.Y1[upper.tri(cor.mat.Y1)]
par(mfrow=c(1,3), mar=c(4,4,3.5,1))
hist(cor.vec.Y0, nclass = 30, probability = TRUE,
border="gray", col="steelblue1", main="real data", xlab="Genewise correlations",
ylim=c(0, 3.5), xlim=c(-1, 1), cex.lab=1.25)
hist(cor.vec.Y1, nclass = 30, probability = TRUE, border="gray",
col="steelblue1", main="SPsimSeq", xlab="Genewise correlations",
ylim=c(0, 3.5), xlim=c(-1, 1), cex.lab=1.25)
plot(seq(-1, 1, 0.1), seq(-1, 1, 0.1), type="n", xlab="quantile (real data)",
ylab="quantile (simulated data)", main="correlation quantile-quantile plot")
abline(0, 1, col="gray")
points(quantile(cor.vec.Y0, seq(0, 1, 0.001)), quantile(cor.vec.Y1, seq(0, 1, 0.001)),
col="blue", pch=20, cex=1.5, cex.lab=1.25)
## ----scRNA, eval=TRUE, warning=FALSE, fig.width=8, fig.height=4---------------
library(SingleCellExperiment)
# load the NGP nutlin data (availabl with the package, processed with SMARTer/C1 protocol, and contains read-counts)
data("scNGP.data")
set.seed(654321)
# simulate data (we simulate here only a single data, n.sim = 1)
sim.data.sc <- SPsimSeq(n.sim = 1, s.data = scNGP.data,
group = scNGP.data$characteristics..treatment,
n.genes = 4000, batch.config = 1,
group.config = c(0.5, 0.5), tot.samples = 100,
pDE = 0.1, lfc.thrld = 0.5, model.zero.prob = TRUE,
result.format = "SCE")
## ----scRNAhead----------------------------------------------------------------
sim.data.sc1 <- sim.data.sc[[1]]
class(sim.data.sc1)
head(counts(sim.data.sc1)[, seq_len(5)])
colData(sim.data.sc1)
rowData(sim.data.sc1)
## ----basicSCrna---------------------------------------------------------------
# normalize counts for comparison
Y0.log.cpm <- log2(edgeR::cpm(counts(scNGP.data))+1)
Y1.log.cpm <- log2(edgeR::cpm(counts(sim.data.sc1))+1)
Y0.log.cpm <- Y0.log.cpm[rowMeans(Y0.log.cpm>0)>=0.1, ]
Y1.log.cpm <- Y1.log.cpm[rowMeans(Y1.log.cpm>0)>=0.1, ]
rowVars <- function(X){apply(X, 1, var, na.rm=TRUE)}
rowCVs <- function(X){apply(X, 1, function(x) sd(x, na.rm=TRUE)/mean(x, na.rm=TRUE))}
rowZeroFrac <- function(X){apply(X, 1, function(x) mean(x==0, na.rm=TRUE))}
par(mfrow=c(1, 3))
boxplot(list(real.data=colSums(counts(scNGP.data)),
simulated.data=colData(sim.data.sc1)$sim.Lib.Size),
main="library size")
boxplot(list(real.data=rowMeans(Y0.log.cpm),
simulated.data=rowMeans(Y1.log.cpm)),
main="mean expression of genes")
boxplot(list(real.data=rowVars(Y0.log.cpm),
simulated.data=rowVars(Y1.log.cpm)),
main="variance of gene expressions")
## ----meanVarSCrna-------------------------------------------------------------
# compare the relationship between the mean and variability
par(mfrow=c(1,3), mar=c(4,4,4,1))
heatscatter(rowMeans(Y0.log.cpm), rowCVs(Y0.log.cpm), ylim=c(0, 6), xlim=c(0, 16),
colpal="bl2gr2rd", main="real data", xlab="mean log2-CPM",
ylab="coefficients of variation", cexplot=0.5, alpha = 60, cex.lab=1.25)
heatscatter(rowMeans(Y1.log.cpm), rowCVs(Y1.log.cpm), ylim=c(0, 6), xlim=c(0, 16),
main="SPsimSeq", xlab="mean log2-CPM", ylab="coefficients of variation",
cexplot=0.5, alpha = 60, colpal="bl2gr2rd", cex.lab=1.25)
n.gride <- 1000
min.g <- seq(0, 20, length.out = n.gride+1)[-n.gride]
max.g <- seq(0, 20, length.out = n.gride+1)[-1]
mid.g <- (min.g+max.g)/2
f.real <- vapply(seq_len(n.gride), FUN.VALUE = double(1), function(r){
x <- Y0.log.cpm[rowMeans(Y0.log.cpm)<=max.g[r] & rowMeans(Y0.log.cpm)>min.g[r],]
y <- ifelse(!is.null(dim(x)), mean(rowCVs(x)), mean(sd(x)/mean(x)))
y
})
f.SPsim <- vapply(seq_len(n.gride), FUN.VALUE = double(1), function(r){
x <- Y1.log.cpm[rowMeans(Y1.log.cpm)<=max.g[r] & rowMeans(Y1.log.cpm)>min.g[r],]
y <- ifelse(!is.null(dim(x)), mean(rowCVs(x)), mean(sd(x)/mean(x)))
y
})
sm1 <- loess(I(f.SPsim-f.real)~mid.g)
plot(mid.g, f.SPsim-f.real, xlim=c(0, 14), col="lightskyblue", pch=20, cex.lab=1.25,
cex.main=1.4, main="SPsimSeq - real data", ylab="difference", xlab="mean log2-CPM")
lines(mid.g,predict(sm1, newdata = mid.g), col="blue", lwd=3)
## ----scRNAmeanzeroes----------------------------------------------------------
# compare the relationship between the mean and fraction of zeros
par(mfrow=c(1,3), mar=c(4,4,4,1))
heatscatter(rowMeans(Y0.log.cpm), rowZeroFrac(Y0.log.cpm), ylim=c(0, 1),
xlim=c(0, 16), colpal="bl2gr2rd", main="real data", xlab="mean log2-CPM",
ylab="fraction of zero counts", cexplot=0.5, alpha = 60, cex.lab=1.25)
heatscatter(rowMeans(Y1.log.cpm), rowZeroFrac(Y1.log.cpm), ylim=c(0, 1),
xlim=c(0, 16), main="SPsimSeq", xlab="mean log2-CPM",
ylab="fraction of zero counts", cexplot=0.5, alpha = 60,
colpal="bl2gr2rd", cex.lab=1.25)
n.gride <- 1000
min.g <- seq(0, 20, length.out = n.gride+1)[-n.gride]
max.g <- seq(0, 20, length.out = n.gride+1)[-1]
mid.g <- (min.g+max.g)/2
f.real <- vapply(seq_len(n.gride), FUN.VALUE = double(1), function(r){
x <- Y0.log.cpm[rowMeans(Y0.log.cpm)<=max.g[r] & rowMeans(Y0.log.cpm)>min.g[r],]
y <- ifelse(!is.null(dim(x)), mean(rowZeroFrac(x)), mean(x==0))
y
})
f.SPsim <- vapply(seq_len(n.gride), FUN.VALUE = double(1), function(r){
x <- Y1.log.cpm[rowMeans(Y1.log.cpm)<=max.g[r] & rowMeans(Y1.log.cpm)>min.g[r],]
y <- ifelse(!is.null(dim(x)), mean(rowZeroFrac(x)), mean(x==0))
y
})
sm1 <- loess(I(f.SPsim-f.real)~mid.g)
plot(mid.g, f.SPsim-f.real, xlim=c(0, 14), col="lightskyblue", pch=20, cex.lab=1.25,
cex.main=1.4, main="SPsimSeq - real data", ylab="difference", xlab="mean log2-CPM")
lines(mid.g,predict(sm1, newdata = mid.g), col="blue", lwd=3)
## ----corSCrna-----------------------------------------------------------------
# compare the correlation between genes and samples
Y0.log.cpm2 <- Y0.log.cpm[rowMeans(Y0.log.cpm>0)>0.25, ]
Y1.log.cpm2 <- Y1.log.cpm[rowMeans(Y1.log.cpm>0)>0.25, ]
cor.mat.Y0 <- cor(t(Y0.log.cpm2))
cor.mat.Y1 <- cor(t(Y1.log.cpm2))
cor.vec.Y0 <- cor.mat.Y0[upper.tri(cor.mat.Y0)]
cor.vec.Y1 <- cor.mat.Y1[upper.tri(cor.mat.Y1)]
par(mfrow=c(1,3), mar=c(4,4,3.5,1))
hist(cor.vec.Y0, nclass = 30, probability = TRUE,
border="gray", col="steelblue1", main="real data", xlab="pairwise correlation between genes",
ylim=c(0, 3.5), xlim=c(-1, 1), cex.lab=1.25)
hist(cor.vec.Y1, nclass = 30, probability = TRUE, border="gray",
col="steelblue1", main="SPsimSeq", xlab="pairwise correlation between genes",
ylim=c(0, 3.5), xlim=c(-1, 1), cex.lab=1.25)
plot(seq(-1, 1, 0.1), seq(-1, 1, 0.1), type="n", xlab="quantile (real data)",
ylab="quantile (simulated data)", main="correlation quantile-quantile plot")
abline(0, 1, col="gray")
points(quantile(cor.vec.Y0, seq(0, 1, 0.001)), quantile(cor.vec.Y1, seq(0, 1, 0.001)),
col="blue", pch=20, cex=1.5, cex.lab=1.25)
## -----------------------------------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.