library(knitr) library(Cairo) opts_chunk$set(warning=FALSE, fig.width=6, fig.height=6, dev="CairoPNG", stop=TRUE) library(GEOquery)
gdsfmt
for very large datasetspath <- download.450k.lead.dataset()
Create samplesheet
library(meffil) options(mc.cores=10) samplesheet <- meffil.create.samplesheet(path) samples <- read.csv(file.path(path, "samples.csv"), check.names=F, row.names=1) samplesheet <- data.frame( samplesheet, samples[match(samplesheet$Sample_Name, rownames(samples)),], stringsAsFactors=F, check.names=F) samplesheet <- samplesheet[which(samplesheet[["sample type"]] == "HM450K"),]
Parameters.
qc.file <- "gds/qc-report.html" author <- "Sen, et al." study <- "Cord blood DNA methylation and lead exposure (GSE69633)" norm.file <- "gds/normalization-report.html" cell.type.reference <- "gervin and lyle cord blood"
Generate QC objects for each sample and QC report.
qc.objects <- meffil.qc( samplesheet, cell.type.reference=cell.type.reference, verbose=T) qc.summary <- meffil.qc.summary(qc.objects, verbose=T) meffil.qc.report( qc.summary, output.file=qc.file, author=author, study=study)
Remove any low quality samples.
if (nrow(qc.summary$bad.samples) > 0) qc.objects <- meffil.remove.samples( qc.objects, qc.summary$bad.samples$sample.name) samplesheet <- samplesheet[match(names(qc.objects), rownames(samplesheet)),]
Check how many principal components to include.
print(meffil.plot.pc.fit(qc.objects, n.cross=3)$plot)
Ten seems about right.
number.pcs <- 10
Normalize the dataset.
norm.objects <- meffil.normalize.quantiles( qc.objects, number.pcs=number.pcs, verbose=T) beta <- meffil.normalize.samples( norm.objects, just.beta=T, remove.poor.signal=T, cpglist.remove=qc.summary$bad.cpgs$name, verbose=T)
Normalize while saving to a GDS file.
dir.create("gds") gds.filename <- "gds/beta.gds" if (!file.exists(gds.filename)) meffil.normalize.samples( norm.objects, just.beta=T, remove.poor.signal=T, cpglist.remove=qc.summary$bad.cpgs$name, gds.filename=gds.filename, verbose=T)
Load the matrix in the GDS file for comparison.
beta.gds <- meffil.gds.methylation(gds.filename)
It should be the same as the one generated the standard way.
identical(beta.gds, beta) identical(colnames(beta.gds), colnames(beta)) identical(rownames(beta.gds), rownames(beta))
Save detection p-values to a GDS file.
detp.filename <- "gds/detp.gds" if (!file.exists(detp.filename)) meffil.save.detection.pvalues(qc.objects, detp.filename, verbose=T)
Compare detection p-values to loading them to a matrix from QC objects.
detp.gds <- meffil.gds.detection.pvalues(detp.filename) detp <- meffil.load.detection.pvalues(qc.objects) identical(detp, detp.gds)
Verify that correct signals have been set to missing in the methylation matrix.
idx <- which( detp[rownames(beta.gds), colnames(beta.gds)] > qc.objects[[1]]$bad.probes.detectionp.threshold) all(is.na(beta.gds[idx]))
We can load a subset from the GDS file as well:
bw.sites <- c( "cg20076442", "cg25953130", "cg04521626", "cg14097568", "cg17133774", "cg00654448", "cg00442282", "cg13696490", "cg12044213", "cg08817867", "cg00382138", "cg06870470", "cg25557739", "cg24324628", "cg15783941", "cg14597739", "cg22962123", "cg05851442", "cg23387597", "cg24973755", "cg16219283", "cg25799241", "cg06658067") ## PMID: 25869828 (bonferroni p < 0.05) samples <- colnames(beta)[c(1,3,5,7)] sub.gds <- meffil.gds.methylation(gds.filename, bw.sites, samples)
It should be the same as subsetting the matrix.
quantile(beta[bw.sites,samples] - sub.gds, na.rm=T) identical(colnames(sub.gds), samples) identical(rownames(sub.gds), bw.sites)
Compute principal components for the normalization report.
pcs <- meffil.methylation.pcs(beta, winsorize.pct=NA, outlier.iqr.factor=3) pcs.gds <- meffil.methylation.pcs(gds.filename, winsorize.pct=NA, outlier.iqr.factor=3)
The resulting components are exactly the same.
identical(pcs.gds, pcs)
Run an EWAS from beta matrix or from GDS file:
variable.name <- "birth weight" variable <- samplesheet[[variable.name]] covariates <- data.frame( t(meffil.cell.count.estimates(qc.objects)), samplesheet[,c("socioeconomic score","gender","smoke ever", "gestational age","pbconc (ng/dl)")], stringsAsFactors=F) ## restrict to autosomal CpG sites as both males and females in dataset auto.sites <- meffil.get.autosomal.sites("450k") ## from beta matrix ewas.ret <- meffil.ewas( beta, variable=variable, covariates=covariates, sites=auto.sites, isva=F, sva=F, smartsva=T, n.sv=10, winsorize.pct=NA, outlier.iqr.factor=3, verbose=T) ## from GDS file ewas.gds <- meffil.ewas( gds.filename, variable=variable, covariates=covariates, sites=auto.sites, isva=F, sva=F, smartsva=T, n.sv=10, winsorize.pct=NA, outlier.iqr.factor=3, verbose=T)
Verify that we evaluated only autosomal CpG sites.
length(setdiff(rownames(ewas.gds$analyses$smartsva$table), auto.sites))==0
We obtain similar but not identical results
because limma
is used to test associations using the beta matrix
and, because limma
requires access to the entire matrix,
glm
is used to test associations using the GDS file.
sapply(names(ewas.ret$analyses), function(model) { cor(ewas.ret$analyses[[model]]$table[,"coefficient"], ewas.gds$analyses[[model]]$table[,"coefficient"], use="p") }) sapply(names(ewas.ret$analyses), function(model) { table(matrix=ewas.ret$analyses[[model]]$table[,"p.value"] < 1e-5, gds=ewas.gds$analyses[[model]]$table[,"p.value"] < 1e-5) }, simplify=F)
Is there some evidence of replicating previously
identified associations with r variable.name
?
sapply(ewas.ret$analyses, function(obj) quantile(obj$table[bw.sites,"p.value"])) sapply(ewas.gds$analyses, function(obj) quantile(obj$table[bw.sites,"p.value"]))
Generate the EWAS report using either the beta matrix or the GDS file.
ewas.parameters <- meffil.ewas.parameters(max.plots=10) ## using the beta matrix summary.ret <- meffil.ewas.summary( ewas.ret, beta, selected.cpg.sites=bw.sites, parameters=ewas.parameters, verbose=T) meffil.ewas.report(summary.ret, output.file="gds/ewas.html", author="Me", study=paste(variable.name, "in cord blood DNA methylation (GEO:GSE69633)")) ## or using the GDS file summary.gds <- meffil.ewas.summary( ewas.gds, gds.filename, selected.cpg.sites=bw.sites, parameters=ewas.parameters, verbose=T) meffil.ewas.report(summary.gds, output.file="gds/ewas-gds.html", author="Me", study=paste(variable.name, "in cord blood DNA methylation (GEO:GSE69633)"))
Robust linear regression in EWAS is also possible.
When analysing the beta matrix, we test associations
using limma::lmFit
with the 'robust' option
which applies MASS::rlm
to each CpG site.
When analysing the GDS file, we similarly
test associations using MASS::rlm
and then
calculate statistical significance using lmtest::coeftest
with vcov=sandwich::vcovHC(fit, type="HC0")
.
## from matrix robust.ret <- meffil.ewas( beta, variable=variable, covariates=covariates, sites=auto.sites, isva=F, sva=F, smartsva=T, n.sv=10, rlm=T, winsorize.pct=NA, outlier.iqr.factor=3, verbose=T) ## from GDS file robust.gds <- meffil.ewas( gds.filename, variable=variable, covariates=covariates, sites=auto.sites, isva=F, sva=F, smartsva=T, n.sv=10, rlm=T, winsorize.pct=NA, outlier.iqr.factor=3, verbose=T)
As expected, results somewhat different for robust regression.
sapply(names(robust.ret$analyses), function(model) { cor(ewas.ret$analyses[[model]]$table[,"coefficient"], robust.ret$analyses[[model]]$table[,"coefficient"], use="p") }) sapply(names(robust.ret$analyses), function(model) { table(ewas=ewas.ret$analyses[[model]]$table[,"p.value"] < 1e-5, robust=robust.ret$analyses[[model]]$table[,"p.value"] < 1e-5) },simplify=F)
Between the robust matrix and GDS analyses, coefficients identical but p-values different because statistical handled differently in 'limma'.
sapply(names(robust.ret$analyses), function(model) { cor(robust.ret$analyses[[model]]$table[,"coefficient"], robust.gds$analyses[[model]]$table[,"coefficient"], use="p") }) sapply(names(robust.ret$analyses), function(model) { table(matrix=robust.ret$analyses[[model]]$table[,"p.value"] < 1e-5, gds=robust.gds$analyses[[model]]$table[,"p.value"] < 1e-5) }, simplify=F)
Finally, it is possible to perform EWAS on subsamples of the dataset. We'll remove some potential lead exposure outliers from analysis.
pb <- samplesheet[["pbconc (ng/dl)"]] samples <- samplesheet[["Sample_Name"]][pb < 8]
## from matrix sub.ret <- meffil.ewas( beta, variable=variable, covariates=covariates, sites=auto.sites, samples=samples, isva=F, sva=F, smartsva=T, n.sv=10, winsorize.pct=NA, outlier.iqr.factor=3, verbose=T) ## from GDS file sub.gds <- meffil.ewas( gds.filename, variable=variable, covariates=covariates, sites=auto.sites, samples=samples, isva=F, sva=F, smartsva=T, n.sv=10, winsorize.pct=NA, outlier.iqr.factor=3, verbose=T)
Check that the model used the correct samples.
identical(samples, rownames(sub.gds$analyses$none$design))
Again, coefficients are identical but statistical significances slightly different.
sapply(names(sub.ret$analyses), function(model) { cor(sub.ret$analyses[[model]]$table[,"coefficient"], sub.gds$analyses[[model]]$table[,"coefficient"], use="p") }) sapply(names(sub.ret$analyses), function(model) { table(matrix=sub.ret$analyses[[model]]$table[,"p.value"] < 1e-5, gds=sub.gds$analyses[[model]]$table[,"p.value"] < 1e-5) }, simplify=F)
There are only slight differences from the full dataset analysis.
sapply(names(sub.gds$analyses), function(model) { cor(sub.gds$analyses[[model]]$table[,"coefficient"], ewas.gds$analyses[[model]]$table[,"coefficient"], use="p") }) sapply(names(sub.gds$analyses), function(model) { table(sub=sub.gds$analyses[[model]]$table[,"p.value"] < 1e-5, full=ewas.gds$analyses[[model]]$table[,"p.value"] < 1e-5) }, simplify=F)
It is also possible to apply a function to each CpG site or each sample in the dataset.
v.gds <- meffil:::meffil.gds.apply( gds.filename, bysite=T, type="double", FUN=var, na.rm=T) v <- rowVars(beta, na.rm=T) all(abs(v-v.gds) < 2e-16)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.