We'll use a couple of small cord blood DNA methylation datasets available on GEO: GSE79056, GSE62924.
Loading each dataset takes about 1 minute.
accessions <- c("GSE79056", "GSE62924") library(geograbi) ## https://github.com/yousefi138 samples <- sapply(accessions, geograbi.get.samples, simplify=F) vars <- sapply(samples, geograbi.extract.characteristics, simplify=F) system.time(methylation <- sapply(accessions, geograbi.get.data, simplify=F))
Remove non-CpG probes from the datasets.
for (acc in accessions) methylation[[acc]] <- methylation[[acc]][grepl("^c", rownames(methylation[[acc]])),]
Extract the variable of interest from each dataset.
variable <- sapply(accessions, function(acc) { idx <- grep("(ga weeks|gestational_age|gestational age)", colnames(vars[[acc]])) stopifnot(length(idx) == 1) as.numeric(vars[[acc]][,idx]) }, simplify=F)
Remove samples with incomplete data.
for (acc in accessions) { idx <- which(!is.na(variable[[acc]])) samples[[acc]] <- samples[[acc]][idx,] vars[[acc]] <- vars[[acc]][idx,] variable[[acc]] <- variable[[acc]][idx] methylation[[acc]] <- methylation[[acc]][,idx] }
SVA takes a few seconds for each dataset.
library(sva, quietly=T) set.seed(20191029) covariates <- sapply(accessions, function(acc) { mod <- model.matrix(~ var, data.frame(var=variable[[acc]])) mod0 <- mod[,1,drop=F] random.idx <- sample(1:nrow(methylation[[acc]]), 5000) meth.sva <- methylation[[acc]][random.idx,] meth.mean <- rowMeans(meth.sva, na.rm=T) idx <- which(is.na(meth.sva), arr.ind=T) if (nrow(idx) > 0) meth.sva[idx] <- meth.mean[idx[,1]] sva(meth.sva, mod=mod, mod0=mod0)$sv }, simplify=F)
Testing associations takes a few seconds for each dataset.
library(limma) stats <- sapply(accessions, function(acc) { mod <- model.matrix(~ ., data.frame(ga=variable[[acc]], covariates[[acc]])) fit <- lmFit(methylation[[acc]], mod) fit <- eBayes(fit) data.frame(estimate=fit$coefficients[,"ga"], se=sqrt(fit$s2.post) * fit$stdev.unscaled[,"ga"], p.value=fit$p.value[,"ga"]) },simplify=F)
Now we check the agreement between the datasets.
kable(sapply(stats, function(a) sapply(stats, function(b) { sites <- intersect(rownames(a), rownames(b)) sites <- sites[order(a[sites,"p.value"])[1:100]] cor(a[sites,"estimate"], b[sites,"estimate"]) })))
A previous study reported thousands of associations with gestational age:
Bohlin J, et al. Prediction of gestational age based on genome-wide differentially methylated regions. Genome Biol. 2016;17(1):207.
The effect estimates are strongly associated.
bohlin <- read.csv("bohlin.csv",stringsAsFactors=F,row.names=1) kable(sapply(stats, function(stats) { cor(bohlin$estimate, stats[rownames(bohlin),"estimate"], use="p") }))
If your data was generated using one of the Illumina BeadChips, then it is possible to annotate your EWAS summary statistics using the appropriate Bioconductor annotation package.
For the Illumina 450K microaray, we use the
IlluminaHumanMethylation450kanno.ilmn12.hg19
package.
if (!require(IlluminaHumanMethylation450kanno.ilmn12.hg19, quietly=T)) { install.packages("BiocManager") BiocManager::install("IlluminaHumanMethylation450kanno.ilmn12.hg19") } library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
If we were using the more recent Illumina MethylationEPIC BeadChip,
then we would load the IlluminaHumanMethylationEPICanno.ilm10b2.hg19
package instead.
We construct an annotation data frame as follows:
data(list="IlluminaHumanMethylation450kanno.ilmn12.hg19") data(Locations) data(Other) annotation <- cbind(as.data.frame(Locations), as.data.frame(Other))
We then add the annotation to the stats
object
(Warning: we assume that the row names of the stats
object
are the Illumina CpG site identifiers, e.g. cg05775921).
stats <- sapply(stats, function(stats) { annotation <- annotation[match(rownames(stats), rownames(annotation)),] cbind(stats, annotation) }, simplify=F)
Takes about 2 minutes per dataset to calculate CpG site correlations.
library(dmrff) pre <- sapply(accessions, function(acc) { cat(date(), "pre", acc, "\n") with(stats[[acc]], dmrff.pre(estimate, se, methylation[[acc]], chr, pos)) }, simplify=F)
There are hundreds of DMRs for this phenotype, so the final meta-analysis takes about 15 minutes with a single processor.
options(mc.cores=20) ## if you have 20 processors available meta <- dmrff.meta(pre)
The output contains a data frame of the meta-analysed regions and the EWAS meta-analysis on which it was based.
kable(meta$dmrs[1:2,])
kable(meta$ewas[1:2,])
Here we add the CpG sites and gene names to the EWAS sites:
idx <- match(with(meta$ewas, paste(chr,pos)), with(annotation, paste(chr,pos))) meta$ewas$cpg <- rownames(annotation)[idx] meta$ewas$gene <- annotation$UCSC_RefGene_Name[idx]
dmrs <- meta$dmrs[which(meta$dmrs$p.adjust < 0.05 & meta$dmrs$n >= 2), ]
We've identified r nrow(dmrs)
DMRs with Bonferroni adjusted p < 0.05.
For convenience, we'll just look at DMRs on chromosome 6 with at least 5 CpG sites.
dmrs6 <- meta$dmrs[which(meta$dmrs$p.adjust < 0.05 & meta$dmrs$n >= 5 & meta$dmrs$chr == "chr6"), ] dmrs6 <- dmrs6[order(dmrs6$start),] kable(dmrs6[,c("chr","start","end","n","estimate","se","p.value","p.adjust")])
Below we use the dmrff.sites
function to show the CpG sites in each DMR and their
meta-analysed summary statistics.
sites <- dmrff.sites(dmrs6, meta$ewas$chr, meta$ewas$pos) sites <- cbind(sites[,c("region","chr","pos")], meta$ewas[sites$site,c("cpg", "estimate","se","p.value")]) kable(sites,row.names=F)
best6 <- dmrs6[which.min(dmrs6$p.value),] dmrff.plot( best6$chr, best6$start, best6$end, meta$ewas$estimate, meta$ewas$se, meta$ewas$chr, meta$ewas$pos)
We can use the same function to show the summary statistics for these sites from one of the original studies.
acc <- "GSE79056" sites <- dmrff.sites(dmrs6, stats[[acc]]$chr, stats[[acc]]$pos) sites <- cbind(sites[,c("region","chr","pos")], stats[[acc]][sites$site,c("estimate","se","p.value")]) kable(sites,row.names=F)
best6 <- dmrs6[which.min(dmrs6$p.value),] dmrff.plot( best6$chr, best6$start, best6$end, stats[[acc]]$estimate, stats[[acc]]$se, stats[[acc]]$chr, stats[[acc]]$pos)
Going back to full set of DMRs, we can ask which regions are novel, i.e. contain no CpG sites identified by Bohlin et al.
## consider only autosomal regions dmrs <- dmrs[which(dmrs$chr %in% paste0("chr", 1:22)),] ## collect CpG sites in the regions sites <- dmrff.sites(dmrs, meta$ewas$chr, meta$ewas$pos) sites <- cbind(sites, meta$ewas[sites$site,c("cpg","estimate","se","p.value")]) ## identify sites with associations according to Bohlin et al. sites$bohlin <- sites$cpg %in% rownames(bohlin) ## novel regions do not contain any of the Bohlin sites dmrs$novel <- !(1:nrow(dmrs) %in% sites$region[sites$bohlin])
Of r nrow(dmrs)
DMRs identified,
r round(100*sum(dmrs$novel)/nrow(dmrs),2)
%
are novel.
They appear on the following chromosomes:
freq.novel <- table(dmrs$chr[dmrs$novel]) freq <- table(dmrs$chr) prop <- 100*freq.novel/freq prop <- as.data.frame(prop) colnames(prop) <- c("chr","pct") prop$novel <- freq.novel prop$total <- freq prop <- prop[order(prop$pct),] kable(prop,row.names=F,digits=2)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.