Nothing
## ----knitr, echo=FALSE, results='hide'----------------------------------------
library("knitr")
opts_chunk$set(tidy=FALSE,dev="pdf",fig.show="hide",
fig.width=4,fig.height=4.5,
message=FALSE, warning=FALSE)
## ----style, eval=TRUE, echo=FALSE, results="asis"---------------------------------------
BiocStyle::latex(width=90)
## ----options, results="hide", echo=FALSE--------------------------------------
options(digits=3, width=80, prompt=" ", continue=" ")
## ----simResult, cache=TRUE, results='hide', fig.height=4, fig.width=4---------
# load library
library('variancePartition')
# load simulated data:
# geneExpr: matrix of gene expression values
# info: information/metadata about each sample
data(varPartData)
# Specify variables to consider
# Age is continuous so model it as a fixed effect
# Individual and Tissue are both categorical,
# so model them as random effects
# Note the syntax used to specify random effects
form <- ~ Age + (1|Individual) + (1|Tissue) + (1|Batch)
# Fit model and extract results
# 1) fit linear mixed model on gene expression
# If categorical variables are specified,
# a linear mixed model is used
# If all variables are modeled as fixed effects,
# a linear model is used
# each entry in results is a regression model fit on a single gene
# 2) extract variance fractions from each model fit
# for each gene, returns fraction of variation attributable
# to each variable
# Interpretation: the variance explained by each variables
# after correcting for all other variables
# Note that geneExpr can either be a matrix,
# and EList output by voom() in the limma package,
# or an ExpressionSet
varPart <- fitExtractVarPartModel( geneExpr, form, info )
# sort variables (i.e. columns) by median fraction
# of variance explained
vp <- sortCols( varPart )
# Figure 1a
# Bar plot of variance fractions for the first 10 genes
plotPercentBars( vp[1:10,] )
#
# Figure 1b
# violin plot of contribution of each variable to total variance
plotVarPart( vp )
## ----accessResults, cache=TRUE, warning=FALSE---------------------------------
# Access first entries
head(varPart)
# Access first entries for Individual
head(varPart$Individual)
# sort genes based on variance explained by Individual
head(varPart[order(varPart$Individual, decreasing=TRUE),])
## ----savePlot, cache=TRUE, eval=FALSE-----------------------------------------
# fig <- plotVarPart( vp )
# ggsave(file, fig)
## ----plotStratify, cache=TRUE, warning=FALSE, fig.height=4, fig.width=4-------
# get gene with the highest variation across Tissues
# create data.frame with expression of gene i and Tissue
# type for each sample
i <- which.max( varPart$Tissue )
GE <- data.frame( Expression = geneExpr[i,], Tissue = info$Tissue)
# Figure 2a
# plot expression stratified by Tissue
plotStratify( Expression ~ Tissue, GE, main=rownames(geneExpr)[i])
#
# get gene with the highest variation across Individuals
# create data.frame with expression of gene i and Tissue
# type for each sample
i <- which.max( varPart$Individual )
GE <- data.frame( Expression = geneExpr[i,],
Individual = info$Individual)
# Figure 2b
# plot expression stratified by Tissue
label <- paste("Individual:", format(varPart$Individual[i]*100,
digits=3), "%")
main <- rownames(geneExpr)[i]
plotStratify( Expression ~ Individual, GE, colorBy=NULL,
text=label, main=main)
## ----cache=TRUE---------------------------------------------------------------
library('lme4')
# fit regression model for the first gene
form_test <- geneExpr[1,] ~ Age + (1|Individual) + (1|Tissue)
fit <- lmer(form_test, info, REML=FALSE )
# extract variance statistics
calcVarPart(fit)
## ----canCorPairs, cache=TRUE, results='hide', fig.width=5, fig.height=5-------
form <- ~ Individual + Tissue + Batch + Age + Height
# Compute Canonical Correlation Analysis (CCA)
# between all pairs of variables
# returns absolute correlation value
C = canCorPairs( form, info)
# Plot correlation matrix
plotCorrMatrix( C )
## ----simResult-omit, cache=TRUE, results='hide'-------------------------------
form <- ~ Age + (1|Individual) + (1|Tissue) + (1|Batch)
# Fit model
results <- fitVarPartModel( geneExpr, form, info )
# Extract results
varPart <- extractVarPart( results )
## ----simResult-fast-two-step, cache=TRUE, results='hide'----------------------
# Fit model and run summary() function on each model fit
vpSummaries <- fitVarPartModel( geneExpr, form, info, fxn=summary )
## ----simResult-fast-two-step1, cache=TRUE-------------------------------------
# Show results of summary() for the first gene
vpSummaries[[1]]
## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
form <- ~ (1|Tissue) + (1|Individual) + (1|Batch) + Age
varPart <- fitExtractVarPartModel( geneExpr, form, info )
## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
library('limma')
# subtract out effect of Batch
fit <- lmFit( geneExpr, model.matrix(~ Batch, info))
res <- residuals( fit, geneExpr)
# fit model on residuals
form <- ~ (1|Tissue) + (1|Individual) + Age
varPartResid <- fitExtractVarPartModel( res, form, info )
## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
# subtract out effect of Batch with linear mixed model
modelFit <- fitVarPartModel( geneExpr, ~ (1|Batch), info )
res <- residuals( modelFit )
# fit model on residuals
form <- ~ (1|Tissue) + (1|Individual) + Age
varPartResid <- fitExtractVarPartModel( res, form, info )
## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
# extract residuals directly without storing intermediate results
residList <- fitVarPartModel( geneExpr, ~ (1|Batch), info,
fxn=residuals )
# convert list to matrix
residMatrix = do.call(rbind, residList)
## ----withinTissue, echo=TRUE, cache=TRUE, results='hide', fig.height=5, fig.width=4----
# specify formula to model within/between individual variance
# separately for each tissue
# Note that including +0 ensures each tissue is modeled explicitly
# Otherwise, the first tissue would be used as baseline
form <- ~ (Tissue+0|Individual) + Age + (1|Tissue) + (1|Batch)
# fit model and extract variance percents
varPart <- fitExtractVarPartModel( geneExpr, form, info, showWarnings=FALSE )
# violin plot
plotVarPart( sortCols(varPart), label.angle=60 )
## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
form <- ~ (1|Individual) + (1|Tissue) + Age + Height
# fit model
res <- fitVarPartModel( geneExpr[1:4,], form, info )
## ----echo=TRUE, cache=TRUE----------------------------------------------------
# evaluate the collinearity score on the first model fit
# this reports the correlation matrix between coefficient estimates
# for fixed effects
# the collinearity score is the maximum absolute correlation value
# If the collinearity score > .99 then the variance partition
# estimates may be problematic
# In that case, a least one variable should be omitted
colinearityScore( res[[1]] )
## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
form <- ~ (1|Individual) + (1|Tissue) + Age + Height
# Specify custom weights
# In this example the weights are simulated from a
# uniform distribution and are not meaningful.
weights <- matrix(runif(length(geneExpr)), nrow=nrow(geneExpr))
# Specify custom weights
res <- fitExtractVarPartModel( geneExpr[1:4,], form, info,
weightsMatrix=weights[1:4,] )
## ----vpInteraction, echo=TRUE, cache=TRUE, results='hide', fig.width=4, fig.height=4----
form <- ~ (1|Individual) + Age + Height + (1|Tissue) + (1|Batch) +
(1|Batch:Tissue)
# fit model
vpInteraction <- fitExtractVarPartModel( geneExpr, form, info )
plotVarPart( sortCols( vpInteraction ) )
## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
library('limma')
library('edgeR')
# identify genes that pass expression cutoff
isexpr <- rowSums(cpm(geneCounts)>1) >= 0.5 * ncol(geneCounts)
# create data structure with only expressed genes
gExpr <- DGEList(counts=geneCounts[isexpr,])
# Perform TMM normalization
gExpr <- calcNormFactors(gExpr)
# Specify variables to be included in the voom() estimates of
# uncertainty.
# Recommend including variables with a small number of categories
# that explain a substantial amount of variation
design <- model.matrix( ~ Batch, info)
# Estimate precision weights for each gene and sample
# This models uncertainty in expression measurements
vobjGenes <- voom(gExpr, design )
# Define formula
form <- ~ (1|Individual) + (1|Tissue) + (1|Batch) + Age
# variancePartition seamlessly deals with the result of voom()
# by default, it seamlessly models the precision weights
# This can be turned off with useWeights=FALSE
varPart <- fitExtractVarPartModel( vobjGenes, form, info )
## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
library('DESeq2')
# create DESeq2 object from gene-level counts and metadata
dds <- DESeqDataSetFromMatrix(countData = geneCounts,
colData = info,
design = ~ 1)
# Estimate library size correction scaling factors
dds <- estimateSizeFactors(dds)
# identify genes that pass expression cutoff
isexpr <- rowSums(fpm(dds)>1) >= 0.5 * ncol(dds)
# compute log2 Fragments Per Million
# Alternatively, fpkm(), vst() or rlog() could be used
quantLog <- log2( fpm( dds )[isexpr,] + 1)
# Define formula
form <- ~ (1|Individual) + (1|Tissue) + (1|Batch) + Age
# Run variancePartition analysis
varPart <- fitExtractVarPartModel( quantLog, form, info)
## ----echo=TRUE, cache=TRUE, results='hide', eval=FALSE------------------------
# library('tximportData')
# library('tximport')
# library('readr')
#
# # Get data from folder where tximportData is installed
# dir <- system.file("extdata", package = "tximportData")
# samples <- read.table(file.path(dir, "samples.txt"), header = TRUE)
# files <- file.path(dir, "kallisto", samples$run, "abundance.tsv")
# names(files) <- paste0("sample", 1:6)
#
# tx2gene <- read.csv(file.path(dir, "tx2gene.csv"))
#
# # reads results from kallisto
# txi <- tximport(files, type = "kallisto", tx2gene = tx2gene,
# countsFromAbundance = "lengthScaledTPM")
#
# # define metadata (usually read from external source)
# info_tximport <- data.frame( Sample = sprintf("sample%d", 1:6),
# Disease=c("case", "control")[c(rep(1, 3), rep(2, 3) )] )
#
# # Extract counts from kallisto
# y <- DGEList( txi$counts )
#
# # compute library size normalization
# y <- calcNormFactors(y)
#
# # apply voom to estimate precision weights
# design <- model.matrix( ~ Disease, data = info_tximport)
# vobj <- voom(y, design)
#
# # define formula
# form <- ~ (1|Disease)
#
# # Run variancePartition analysis (on only 10 genes)
# varPart_tx <- fitExtractVarPartModel( vobj[1:10,], form,
# info_tximport)
#
## ----echo=TRUE, cache=TRUE, results='hide'------------------------------------
library('ballgown')
# Get data from folder where ballgown is installed
data_directory <- system.file('extdata', package='ballgown')
# Load results of Cufflinks/Tablemaker
bg <- ballgown(dataDir=data_directory, samplePattern='sample',
meas='all')
# extract gene-level FPKM quantification
# Expression can be convert to log2-scale if desired
gene_expression <- gexpr(bg)
# extract transcript-level FPKM quantification
# Expression can be convert to log2-scale if desired
transcript_fpkm <- texpr(bg, 'FPKM')
# define metadata (usually read from external source)
info_ballgown <- data.frame( Sample = sprintf("sample%02d", 1:20),
Batch = rep(letters[1:4], 5),
Disease=c("case", "control")[c(rep(1, 10), rep(2, 10) )] )
# define formula
form <- ~ (1|Batch) + (1|Disease)
# Run variancePartition analysis
# Gene-level analysis
varPart_gene <- fitExtractVarPartModel( gene_expression, form,
info_ballgown)
# Transcript-level analysis
varPart_transcript <- fitExtractVarPartModel( transcript_fpkm, form,
info_ballgown)
## ----echo=FALSE, cache=TRUE, results='hide'-----------------------------------
library('variancePartition')
sim_data = function( n, p, var_indiv, var_site){
info = data.frame(ID = paste("s", 1:n, sep=''),
Individual = factor(rep( round(seq(1, n/6, length.out=n/6)),6)),
Tissue = factor(sort(rep(toupper(letters[1:3]), n/3))),
Age = rpois(n, 50))
geneExpr = matrix(0, nrow=p, ncol=n)
for( i in 1:p){
beta_indiv = rnorm(nlevels(info$Individual), 0, sqrt(var_indiv))
beta_site = rnorm(nlevels(info$Tissue), 0, sqrt(var_site))
eta = model.matrix( ~ Individual, info) %*% beta_indiv + model.matrix( ~ Tissue+0, info) %*% beta_site
noise = rbeta(1, 10, 100)
errVar = var(eta) * (noise) / (1-noise)
geneExpr[i,] = eta + rnorm(n, 0, sqrt(errVar))
}
colnames(geneExpr) = paste("s", 1:ncol(geneExpr), sep='')
rownames(geneExpr) = paste("gene", 1:nrow(geneExpr), sep='')
return( list(info=info, geneExpr=geneExpr))
}
plotVar = function( geneExpr, info ){
form = ~ (1|Individual) + (1|Tissue)
varPart = fitExtractVarPartModel( geneExpr, form, info )
plotVarPart( varPart, col=rainbow(8)[1:3] )
}
plotVarCross = function( geneExpr, info, label.angle=30 ){
form = ~ (Tissue+0|Individual) + (1|Tissue)
#res = fitVarPartModel( geneExpr, form, info )
#varPart = extractVarPart( res )
varPart = fitExtractVarPartModel( geneExpr, form, info, showWarnings=FALSE )
plotVarPart( varPart, label.angle=label.angle )
}
plotPCA = function(geneExpr, col){
dcmp = prcomp(t(geneExpr))
par(mar = c(4, 4, 1, 1) + 0.1)
plot(dcmp$x[,1:2], col=col)
}
plotTree = function(geneExpr, col){
hc = hclust(dist(t(geneExpr)))
library('dendextend')
hcd = as.dendrogram(hc)
labels_colors(hcd) <- col[match(labels(hcd), names(col))]
par(mar = c(4, 4, 1, 1) + 0.1, cex=.6)
plot( hcd, yaxt='n', horiz=TRUE )
}
## ----siteDominant, echo=FALSE, cache=TRUE, results='hide', fig.height=5, fig.width=5----
set.seed(1)
n = 60
p = 200
data = sim_data( n, p, 1, 4)
colTissue = rainbow(nlevels(data$info$Tissue))[data$info$Tissue]
names(colTissue) = data$info$ID
colIndiv = rainbow(nlevels(data$info$Individual))[data$info$Individual]
names(colIndiv) = data$info$ID
plotPCA( data$geneExpr, colTissue)
plotPCA( data$geneExpr, colIndiv)
plotTree( data$geneExpr, colTissue)
legend("topleft", legend=levels(data$info$Tissue), fill=rainbow(nlevels(data$info$Tissue))[1:nlevels(data$info$Tissue)], title="Tissue")
plotTree( data$geneExpr, colIndiv)
legend("topleft", legend=levels(data$info$Individual), fill=rainbow(nlevels(data$info$Individual))[1:nlevels(data$info$Individual)], title="Individual")
plotVar( data$geneExpr, data$info )
plotVarCross( data$geneExpr, data$info, label.angle=60 )
## ----IndivDominant, echo=FALSE, cache=TRUE, results='hide', fig.height=5, fig.width=5----
set.seed(1)
n = 60
p = 200
data = sim_data( n, p, 3, 1)
colTissue = rainbow(nlevels(data$info$Tissue))[data$info$Tissue]
names(colTissue) = data$info$ID
colIndiv = rainbow(nlevels(data$info$Individual))[data$info$Individual]
names(colIndiv) = data$info$ID
plotPCA( data$geneExpr, colTissue)
plotPCA( data$geneExpr, colIndiv)
plotTree( data$geneExpr, colTissue)
legend("topleft", legend=levels(data$info$Tissue), fill=rainbow(nlevels(data$info$Tissue))[1:nlevels(data$info$Tissue)], title="Tissue")
plotTree( data$geneExpr, colIndiv)
legend("topleft", legend=levels(data$info$Individual), fill=rainbow(nlevels(data$info$Individual))[1:nlevels(data$info$Individual)], title="Individual")
plotVar( data$geneExpr, data$info )
plotVarCross( data$geneExpr, data$info, label.angle=60 )
## ----echo=FALSE, cache=FALSE--------------------------------------------------
# if( "cluster" %in% class(cl) ){
if( exists("cl") ){
library('doParallel')
# stop cluster and catch warning if invalid
res = tryCatch( {stopCluster(cl)}, warning = function(x) {
}, error = function(x) {
}, finally={
})
# warning("STOP CLUSTER!!!!!!!!!!!!!!!!!\n")
}
## ----DE, echo=FALSE, cache=TRUE, fig.height=5, fig.width=5--------------------
set.seed(1)
library('variancePartition')
n = 500
data = data.frame(Sex = factor(c(rep('F', n), rep('M', n))))
data$expression = rnorm(2*n, (as.integer(data$Sex)-1)*2, .3)
data$Sex = factor(data$Sex)
fit = lm(expression ~ Sex, data)
# calcVarPart(fit)
# coef(fit)[2]
plotStratify( expression ~ Sex, data, ylim=c(-6, 9)) +
annotate("text", x = 0.5, y = 9, hjust=0, label = paste("fold change:", format(coef(fit)[2], digits=3)), size=5.5) +
annotate("text", x = 0.5, y = 8.2, hjust=0, label = paste("% variance of expression:", format(calcVarPart(fit)[1]*100, digits=3), "%"), size=5.5)
n = 500
data = data.frame(Sex = factor(c(rep('F', n), rep('M', n))))
data$expression = rnorm(2*n, (as.integer(data$Sex)-1)*2, 2.01)
data$Sex = factor(data$Sex)
fit = lm(expression ~ Sex, data)
# calcVarPart(fit)
# coef(fit)[2]
plotStratify( expression ~ Sex, data, ylim=c(-6, 9)) +
annotate("text", x = 0.5, y = 9, hjust=0, label = paste("fold change:", format(coef(fit)[2], digits=3)), size=5.5) +
annotate("text", x = 0.5, y = 8.2, hjust=0, label = paste("% variance of expression:", format(calcVarPart(fit)[1]*100, digits=3), "%"), size=5.5)
## ----sessInfo, results="asis", echo=FALSE-------------------------------------
toLatex(sessionInfo())
## ----resetOptions, results="hide", echo=FALSE---------------------------------
options(prompt="> ", continue="+ ")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.