library(knitr) library(Cairo) opts_chunk$set(warning=FALSE, fig.width=6, fig.height=6, dev="CairoPNG", stop=TRUE) library(penalized) library(GEOquery)
path <- download.450k.lead.dataset()
Create samplesheet
library(meffil) options(mc.cores=10) samplesheet <- meffil.create.samplesheet(path) samples <- read.csv(file.path(path, "samples.csv"), check.names=F, row.names=1) samplesheet <- data.frame(samplesheet, samples[match(samplesheet$Sample_Name, rownames(samples)),], stringsAsFactors=F, check.names=F) samplesheet <- samplesheet[which(samplesheet[["sample type"]] == "HM450K"),]
Parameters.
qc.file <- "ewas/qc-report.html" author <- "Sen, et al." study <- "Cord blood DNA methylation and lead exposure (GSE69633)" norm.file <- "ewas/normalization-report.html" cell.type.reference <- "gervin and lyle cord blood"
Generate QC objects for each sample and QC report.
qc.objects <- meffil.qc(samplesheet, cell.type.reference=cell.type.reference, verbose=T) qc.summary <- meffil.qc.summary(qc.objects, verbose=T) meffil.qc.report(qc.summary, output.file=qc.file, author=author, study=study)
Remove any low quality samples.
if (nrow(qc.summary$bad.samples) > 0) qc.objects <- meffil.remove.samples(qc.objects, qc.summary$bad.samples$sample.name) samplesheet <- samplesheet[match(names(qc.objects), rownames(samplesheet)),]
Check how many principal components to include.
print(meffil.plot.pc.fit(qc.objects, n.cross=3)$plot)
Ten seems about right.
number.pcs <- 10
Normalize dataset and generate normalization report.
norm.objects <- meffil.normalize.quantiles(qc.objects, number.pcs=number.pcs, verbose=T) norm.dataset <- meffil.normalize.samples(norm.objects, just.beta=F, cpglist.remove=qc.summary$bad.cpgs$name, verbose=T) beta <- meffil.get.beta(norm.dataset$M, norm.dataset$U) pcs <- meffil.methylation.pcs(beta, sites=meffil.get.autosomal.sites("450k"), verbose=T) parameters <- meffil.normalization.parameters(norm.objects) norm.summary <- meffil.normalization.summary(norm.objects=norm.objects, pcs=pcs, parameters=parameters, verbose=T) meffil.normalization.report(norm.summary, output.file=norm.file, author=author, study=study)
Parameters.
ewas.variable.name <- "gestational age" ewas.covariate.names <- c("socioeconomic score","gender","smoke ever","birth weight") ewas.output.file <- "ewas/ewas-gestational-age-report.html" ewas.author <- "Me" ewas.study <- "Gestational age in cord blood DNA methylation (GEO:GSE69633)" ewas.cpg.sites <- c("cg08943494", "cg11932158","cg16725984", "cg20334115", "cg18623216") ## PMID: 25869828 (top 5)
Perform EWAS.
variable <- samplesheet[[ewas.variable.name]] covariates <- data.frame(t(meffil.cell.count.estimates(qc.objects)), samplesheet[,ewas.covariate.names], stringsAsFactors=F) ewas.ret <- meffil.ewas(beta, variable=variable, covariates=covariates, isva=F, sva=F, smartsva=T, most.variable=50000, verbose=T)
Generate EWAS report.
ewas.parameters <- meffil.ewas.parameters(sig.threshold=1e-5,max.plots=10) ewas.summary <- meffil.ewas.summary(ewas.ret, beta, selected.cpg.sites=ewas.cpg.sites, parameters=ewas.parameters, verbose=T) meffil.ewas.report(ewas.summary, output.file=ewas.output.file, author=ewas.author, study=ewas.study)
idx <- sample(order(ewas.ret$analyses$smartsva$table$p.value)[1:1000],100) sub.sites <- rownames(beta)[idx] sub.covariates <- cbind(covariates, with(ewas.ret$analyses$smartsva, design[,grep("^X[0-9]+$", colnames(design))])) ewas.sub.ret <- meffil.ewas(beta, variable=variable, sites=sub.sites, covariates=sub.covariates, sva=F, isva=F, smartsva=F, verbose=T) ewas.parameters <- meffil.ewas.parameters(sig.threshold=0.05/length(sub.sites),max.plots=10) ewas.summary <- meffil.ewas.summary(ewas.sub.ret, beta, selected.cpg.sites=sub.sites, parameters=ewas.parameters, verbose=T) meffil.ewas.report(ewas.summary, output.file="ewas/ewas-gestational-age-report-100.html", author=ewas.author, study=ewas.study)
How well does SVA capture batch effects that are not explicitly included in the model?
slide <- model.matrix(~ 0 + Slide, samplesheet) row <- model.matrix(~ 0 + sentrix_row, samplesheet) sv <- ewas.ret$analyses$smartsva$design sv <- sv[,grep("X", colnames(sv))] library(penalized) prediction.accuracy <- function(v, p) { fit <- optL1(response=v, penalized=p, fold=10, standardize=T, maxiter=1000, model="logistic") sum((fit$predictions > 0.5) == (v == 1))/length(v) } apply(row, 2, prediction.accuracy, sv) apply(slide, 2, prediction.accuracy, sv)
How much do the different EWAS models agree?
sapply(ewas.ret$analyses, function(a) sapply(ewas.ret$analyses, function(b) length(intersect(rownames(a$table)[which(a$table$p.value < 0.0001)], rownames(b$table)[which(b$table$p.value < 0.0001)])))) sapply(ewas.ret$analyses, function(a) sapply(ewas.ret$analyses, function(b) { a <- rownames(a$table)[which(a$table$p.value < 0.0001)] b <- rownames(b$table)[which(b$table$p.value < 0.0001)] length(intersect(a,b))/length(union(a,b)) }))
sapply(ewas.ret$analyses, function(a) sapply(ewas.ret$analyses, function(b) { idx <- which(a$table$p.value < 0.01) cor(a$table$coefficient[idx], b$table$coefficient[idx]) }))
Compare meffil EWAS to EWAS 'done by hand'.
## EWAS by meffil test.ret <- meffil.ewas(beta, variable=variable, covariates=covariates, isva=F,sva=T,smartsva=F, most.variable=50000, random.seed=20161123, verbose=T) test.ret$sv <- test.ret$analyses$sva$design test.ret$sv <- test.ret$sv[,grep("X", colnames(test.ret$sv))] ## EWAS by hand beta.win <- meffil:::winsorize(beta) beta.sva <- beta.win beta.sva <- beta.sva[which(rownames(beta.sva) %in% meffil.get.autosomal.sites()),] beta.sva <- beta.sva[order(rowVars(beta.sva), decreasing=T)[1:50000],] mod0 <- model.matrix(~ ., covariates) mod <- cbind(mod0, variable) set.seed(20161123) sva.ret <- sva(dat=beta.sva, mod=mod, mod0=mod0) apply(abs(cor(test.ret$sv, sva.ret$sv)), 2, max) mod <- cbind(mod, sva.ret$sv) fit <- lm.fit(mod, t(beta.win)) cor(fit$coefficients["variable",], test.ret$analyses$sva$table$coefficient)
View EWAS effect estimates next to genes and other genomic landmarks by generating genome browser tracks and uploading them to https://genome.ucsc.edu/cgi-bin/hgCustom.
meffil.ewas.bedgraph(ewas.ret, "ewas/sva.bed", "smartsva", name="ga", description=ewas.study) header <- paste0("track type=bedGraph name='ga-custom' description='", ewas.study, "' visibility=full color=200,100,0 altColor=0,100,200 priority=20") meffil.ewas.bedgraph(ewas.ret, "ewas/sva-custom.bed", "smartsva", header=header) meffil.ewas.bedgraph(ewas.ret, "ewas/sva-minimal.bed", "smartsva")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.