#' Epigenome-wide association study
#'
#' Test association with each CpG site.
#'
#' @param beta Methylation levels matrix,
#' one row per CpG site, one column per sample
#' or the filename of GDS (Genomic Data Structure) output from
#' \code{\link{meffil.normalize.samples}}.
#' @param variable Independent variable vector.
#' @param covariates Covariates data frame to include in regression model,
#' one row per sample, one column per covariate (Default: NULL).
#' @param batch Batch vector to be included as a random effect (Default: NULL). Ignored if \code{beta} is a GDS filename.
#' @param weights Non-negative observation weights.
#' Can be a numeric matrix of individual weights of same dimension as \code{beta},
#' or a numeric vector of weights with length \code{ncol(beta)},
#' or a numeric vector of weights with length \code{nrow(beta)}.
#' @param sites Restrict the EWAS to the given CpG sites -- must match row names of \code{beta} (Default: NULL).
#' @param samples Restrict the EWAS to the given samples -- must match column names of \code{beta} (Default: NULL).
#' @param cell.counts Proportion of cell counts for one cell type in cases
#' where the samples are mainly composed of two cell types (e.g. saliva) (Default: NULL). Ignored if \code{beta} is a GDS filename.
#' @param isva Apply Independent Surrogate Variable Analysis (ISVA) to the
#' methylation levels and include the resulting variables as covariates in a
#' regression model (Default: FALSE).
#' @param sva Apply Surrogate Variable Analysis (SVA) to the
#' methylation levels and covariates and include
#' the resulting variables as covariates in a regression model (Default: TRUE).
#' @param smartsva Apply the SmartSVA algorithm to the
#' methylation levels and include the resulting variables as covariates in a
#' regression model (Default: FALSE).
#' @param smartsva.alpha alpha argument to SmartSVA providing the
#' initial point for optimization. Smaller values
#' reduce the number of iterations needed to reach convergence.
#' Setting this 1 will produce exactly the outputs as SVA. (Default: 0.5).
#' @param n.sv Number of surrogate variables to calculate (Default: NULL).
#' @param winsorize.pct Apply all regression models to methylation levels
#' winsorized to the given level. Set to NA to avoid winsorizing (Default: 0.05).
#' @param robust Test associations with the 'robust' option when \code{\link{limma::eBayes}}
#' is called (Default: TRUE). Ignored if \code{beta} is a GDS filename.
#' @param rlm If \code{beta} is a matrix, then test associations with
#' the 'robust' option when \code{\link{limma:lmFit}} is called.
#' If \code{beta} is a GDS filename, then test associations
#' using robust regression using \code{\link{MASS::rlm}}
#' and calculate statistical significance using \code{\link{lmtest::coeftest}}
#' with \code{vcov=\link{sandwich::vcovHC}(fit, type="HC0")}
#' (Default: FALSE).
#' @param outlier.iqr.factor For each CpG site, prior to fitting regression models,
#' set methylation levels less than
#' \code{Q1 - outlier.iqr.factor * IQR} or more than
#' \code{Q3 + outlier.iqr.factor * IQR} to NA. Here IQR is the inter-quartile
#' range of the methylation levels at the CpG site, i.e. Q3-Q1.
#' Set to NA to skip this step (Default: NA).
#' @param most.variable Apply (Independent) Surrogate Variable Analysis to the
#' given most variable CpG sites (Default: 50000).
#' @param featureset No longer used (Default: NA).
#' @param verbose Set to TRUE if status updates to be printed (Default: FALSE).
#'
#' @export
meffil.ewas <- function(beta, variable,
covariates=NULL, batch=NULL, weights=NULL,
sites=NULL, samples=NULL,
cell.counts=NULL,
isva=F, sva=T, smartsva=F, ## cate?
smartsva.alpha=0.5,
n.sv=NULL,
winsorize.pct=0.05,
robust=FALSE,
rlm=FALSE,
outlier.iqr.factor=NA, ## typical value = 3
most.variable=50000,
featureset=NA,
random.seed=20161123,
lmfit.safer=F,
verbose=F) {
if (is.matrix(beta)) {
all.sites <- rownames(beta)
all.samples <- colnames(beta)
}
else {
beta.dims <- meffil:::retrieve.gds.dims(beta)
all.sites <- beta.dims[[1]]
all.samples <- beta.dims[[2]]
if (!is.null(batch))
stop("'batch' is ignored when 'beta' is a GDS filename")
if (!is.null(cell.counts))
stop("'cell.counts' is ignored when 'beta' is a GDS filename")
if (robust)
warning("'robust' is ignored when 'beta' is a GDS filename")
}
features <- meffil.all.features()
if (!is.null(sites))
sites <- intersect(all.sites, sites)
else
sites <- all.sites
stopifnot(length(sites) > 1 && all(sites %in% features$name))
if (is.null(samples))
samples <- all.samples
stopifnot(length(samples) > 1 && all(samples %in% all.samples))
stopifnot(length(all.samples) == length(variable))
stopifnot(is.null(covariates) || is.data.frame(covariates) && nrow(covariates) == length(all.samples))
stopifnot(is.null(weights)
|| (is.numeric(weights)
&& is.vector(weights)
&& length(weights) == length(all.samples)))
stopifnot(is.null(batch) || length(batch) == length(all.samples))
stopifnot(most.variable > 1)
stopifnot(!is.numeric(winsorize.pct) || winsorize.pct > 0 && winsorize.pct < 0.5)
original.variable <- variable
original.covariates <- covariates
if (is.character(variable))
variable <- as.factor(variable)
stopifnot(!is.factor(variable) || is.ordered(variable) || length(levels(variable)) == 2)
meffil:::msg("Simplifying any categorical variables.", verbose=verbose)
variable <- meffil:::simplify.variable(variable)
if (!is.null(covariates))
covariates <- do.call(cbind, lapply(covariates, meffil:::simplify.variable))
sample.idx <- which(!is.na(variable) & all.samples %in% samples)
if (!is.null(covariates))
sample.idx <- intersect(sample.idx, which(apply(!is.na(covariates), 1, all)))
meffil:::msg("Removing", length(samples) - length(sample.idx), "missing case(s).", verbose=verbose)
weights <- weights[sample.idx]
samples <- all.samples[sample.idx]
variable <- variable[sample.idx]
original.variable <- original.variable[sample.idx]
original.covariates <- original.covariates[sample.idx,,drop=F]
if (!is.null(covariates))
covariates <- covariates[sample.idx,,drop=F]
if (!is.null(batch))
batch <- batch[sample.idx]
if (!is.null(cell.counts))
cell.counts <- cell.counts[sample.idx]
if (!is.null(covariates)) {
pos.var.idx <- which(apply(covariates, 2, var, na.rm=T) > 0)
meffil:::msg("Removing", ncol(covariates) - length(pos.var.idx), "covariates with no variance.",
verbose=verbose)
covariates <- covariates[,pos.var.idx, drop=F]
}
covariate.sets <- list(none=NULL)
if (!is.null(covariates))
covariate.sets$all <- covariates
surrogates.ret <- NULL
if (isva || sva || smartsva) {
meffil:::msg("Computing surrogate variables ...", verbose=verbose)
sva.sites <- meffil.autosomal.subset(sites)
sva.sets <- add.surrogate.variables(
beta, sva.sites, samples,
most.variable,
variable, covariates,
winsorize.pct, outlier.iqr.factor,
random.seed, isva, sva, smartsva, n.sv,
smartsva.alpha, verbose)
surrogates.ret <- sva.sets$last
covariate.sets <- c(covariate.sets, sva.sets$sets)
}
msg("Starting EWAS ...", verbose=verbose)
if (is.matrix(beta)) {
analyses <- meffil:::ewas.by.matrix(
variable, beta, sites, samples,
covariate.sets,
batch, weights, cell.counts,
winsorize.pct, outlier.iqr.factor,
robust, rlm, lmfit.safer, verbose)
}
else {
analyses <- meffil:::ewas.by.gds(
variable, beta, sites, samples,
covariate.sets, weights,
winsorize.pct, outlier.iqr.factor,
rlm, verbose)
}
msg("Finished EWAS.", verbose=verbose)
p.values <- sapply(analyses, function(analysis) analysis$table$p.value)
coefficients <- sapply(analyses, function(analysis) analysis$table$coefficient)
rownames(p.values) <- rownames(coefficients) <- rownames(analyses[[1]]$table)
for (name in names(analyses)) {
idx <- match(rownames(analyses[[name]]$table), features$name)
analyses[[name]]$table$chromosome <- features$chromosome[idx]
analyses[[name]]$table$position <- features$position[idx]
}
list(class="ewas",
version=packageVersion("meffil"),
samples=samples,
variable=original.variable,
covariates=original.covariates,
winsorize.pct=winsorize.pct,
batch=batch,
robust=robust,
rlm=rlm,
weights=weights,
outlier.iqr.factor=outlier.iqr.factor,
most.variable=most.variable,
p.value=p.values,
coefficient=coefficients,
analyses=analyses,
random.seed=random.seed,
sva.ret=surrogates.ret)
}
is.ewas.object <- function(object)
is.list(object) && all(c("class","version","samples","variable","covariates",
"winsorize.pct","outlier.iqr.factor",
"analyses","random.seed")
%in% names(object))
ewas.by.matrix <- function(variable, beta, sites, samples,
covariate.sets, batch, weights, cell.counts,
winsorize.pct, outlier.iqr.factor,
robust, rlm, lmfit.safer, verbose) {
stopifnot(all(!is.na(variable)))
stopifnot(length(variable) == length(samples))
stopifnot(is.null(batch) || length(batch) == length(samples))
stopifnot(is.null(weights) || length(weights) == length(variable))
stopifnot(is.null(cell.counts)
|| length(cell.counts) == length(samples)
&& all(cell.counts >= 0 & cell.counts <= 1))
stopifnot(length(sites) > 1 && all(sites %in% rownames(beta)))
stopifnot(length(samples) > 1 && all(samples %in% colnames(beta)))
stopifnot(all(sapply(covariate.sets, function(set) is.null(set) || nrow(set) == length(samples))))
stopifnot(is.logical(robust))
stopifnot(is.logical(rlm))
stopifnot(is.logical(lmfit.safer))
beta <- beta[sites,samples]
if (is.numeric(winsorize.pct) || is.numeric(outlier.iqr.factor)) {
meffil:::msg("Handling methylation outliers", verbose=verbose)
beta <- meffil.handle.outliers(
beta,
winsorize.pct,
outlier.iqr.factor)
}
sapply(names(covariate.sets), function(name) {
meffil:::msg("EWAS for covariate set", name, verbose=verbose)
covariates <- covariate.sets[[name]]
meffil:::ewas.limma(
variable,
beta=beta,
covariates=covariates,
batch=batch,
weights=weights,
cell.counts=cell.counts,
winsorize.pct=winsorize.pct,
robust=robust,
rlm=rlm,
lmfit.safer=lmfit.safer,
verbose=verbose)
}, simplify=F)
}
# Test associations between \code{variable} and each row of \code{beta}
# while adjusting for \code{covariates} (fixed effects) and \code{batch} (random effect).
# If \code{cell.counts} is not \code{NULL}, then it is assumed that
# the methylation data is derived from samples with two cell types.
# \code{cell.counts} should then be a vector of numbers
# between 0 and 1 of length equal to \code{variable} corresponding
# to the proportions of cell of a selected cell type in each sample.
# The regression model is then modified in order to identify
# associations specifically in the selected cell type (PMID: 24000956).
ewas.limma <- function(variable, beta, covariates,
batch, weights, cell.counts,
winsorize.pct,
robust, rlm, lmfit.safer, verbose) {
stopifnot(all(!is.na(variable)))
stopifnot(length(variable) == ncol(beta))
stopifnot(is.null(covariates) || nrow(covariates) == ncol(beta))
stopifnot(is.null(batch) || length(batch) == ncol(beta))
stopifnot(is.null(cell.counts)
|| length(cell.counts) == ncol(beta)
&& all(cell.counts >= 0 & cell.counts <= 1))
method <- "ls"
if (rlm) method <- "robust"
if (is.null(covariates))
design <- data.frame(intercept=1, variable=variable)
else
design <- data.frame(intercept=1, variable=variable, covariates)
rownames(design) <- colnames(beta)
if (!is.null(cell.counts)) {
## Irizarray method: Measuring cell-type specific differential methylation
## in human brain tissue
## Mi is methylation level for sample i
## Xi is the value of the variable of interest for sample i
## pi is the proportion of the target cell type for sample i
## thus: Mi ~ target cell methylation * pi + other cell methylation * (1-pi)
## and target cell methylation = base target methylation + effect of Xi value
## and other cell methylation = base other methylation + effect of Xi value
## thus:
## Mi = (T + U Xi)pi + (O + P Xi)(1-pi) + e
## = C + A Xi pi + B Xi (1-pi) + e
design <- design[,-which(colnames(design) == "intercept")]
design <- cbind(design * cell.counts,
typeB=design * (1-cell.counts))
}
meffil:::msg("Linear regression with limma::lmFit", verbose=verbose)
fit <- NULL
batch.cor <- NULL
if (!is.null(batch)) {
meffil:::msg("Adjusting for batch effect", verbose=verbose)
corfit <- duplicateCorrelation(beta, design, block=batch, ndups=1)
batch.cor <- corfit$consensus
meffil:::msg("Linear regression with batch as random effect", verbose=verbose)
tryCatch({
fit <- lmFit(
beta, design, method=method,
block=batch, cor=batch.cor, weights=weights)
}, error=function(e) {
print(e)
meffil:::msg("lmFit failed with random effect batch variable, omitting", verbose=verbose)
})
}
if (is.null(fit)) {
meffil:::msg("Linear regression with only fixed effects", verbose=verbose)
batch <- NULL
if (!lmfit.safer)
fit <- lmFit(beta, design, method=method, weights=weights)
else
fit <- lmfit.safer(beta, design, method=method, weights=weights, verbose=verbose)
}
meffil:::msg("Empirical Bayes", verbose=verbose)
if (is.numeric(winsorize.pct) && robust) {
fit.ebayes <- eBayes(fit, robust=T, winsor.tail.p=c(winsorize.pct, winsorize.pct))
} else {
fit.ebayes <- eBayes(fit, robust=robust)
}
alpha <- 0.975
std.error <- (sqrt(fit.ebayes$s2.post) * fit.ebayes$stdev.unscaled[,"variable"])
margin.error <- (std.error * qt(alpha, df=fit.ebayes$df.total))
n <- apply(beta, 1, function(v) sum(!is.na(v)))
list(design=design,
batch=batch,
batch.cor=batch.cor,
cell.counts=cell.counts,
table=data.frame(p.value=fit.ebayes$p.value[,"variable"],
fdr=p.adjust(fit.ebayes$p.value[,"variable"], "fdr"),
p.holm=p.adjust(fit.ebayes$p.value[,"variable"], "holm"),
t.statistic=fit.ebayes$t[,"variable"],
coefficient=fit.ebayes$coefficient[,"variable"],
coefficient.ci.high=fit.ebayes$coefficient[,"variable"] + margin.error,
coefficient.ci.low=fit.ebayes$coefficient[,"variable"] - margin.error,
coefficient.se=std.error,
n=n))
}
## apply lmFit to subsets of the methylation matrix to hopefully avoid
## out-of-memory errors (mainly for really large datasets or low memory servers)
lmfit.safer <- function(beta, design, method, weights, n.partitions=8, verbose=F) {
partitions <- sample(1:n.partitions, nrow(beta), replace=T)
fits <- lapply(1:n.partitions, function(part) {
meffil:::msg("Applying limma::lmFit to partition ", part, " of ", n.partitions, ".", verbose=verbose)
idx <- which(partitions == part)
lmFit(beta[idx,], design, method=method, weights=weights)
})
idx <- unlist(lapply(1:n.partitions, function(part) which(partitions==part)))
idx <- match(1:nrow(beta), idx)
fit <- fits[[1]]
for (item in c("coefficients", "stdev.unscaled")) {
fit[[item]] <- do.call(rbind, lapply(fits, function(fit) fit[[item]]))[idx,]
}
for (item in c("df.residual", "sigma", "Amean")) {
fit[[item]] <- do.call(c, lapply(fits, function(fit) fit[[item]]))[idx]
}
fit
}
# Test associations between \code{variable} and each row of \code{beta}
# while adjusting for \code{covariates}.
ewas.by.gds <- function(variable, beta, sites, samples,
covariate.sets, weights,
winsorize.pct, outlier.iqr.factor,
rlm, verbose) {
stopifnot(all(!is.na(variable)))
stopifnot(length(variable) == length(samples))
stopifnot(all(sapply(covariate.sets, function(set) is.null(set) || nrow(set) == length(samples))))
stopifnot(is.null(weights) || length(weights) == length(variable))
designs <- lapply(covariate.sets, function(covariates) {
if (is.null(covariates))
design <- data.frame(intercept=1, variable=variable)
else
design <- data.frame(intercept=1, variable=variable, covariates)
rownames(design) <- samples
design
})
stats <- meffil:::lapply.gds(
beta, margin=1, sites=sites, samples=samples,
type="list",
FUN=ewas.test.models,
AFUN=ifelse(rlm,meffil:::ewas.test.rlm,meffil:::ewas.test.glm),
designs=designs,
weights=weights,
winsorize.pct=winsorize.pct,
outlier.iqr.factor=outlier.iqr.factor)
stats <- do.call(rbind, stats)
analyses <- lapply(1:length(designs), function(i) {
idx <- seq(i,nrow(stats),length(designs))
stats <- data.frame(stats[idx,])
rownames(stats) <- sites
stats$fdr <- p.adjust(stats$p.value, "fdr")
stats$p.holm <- p.adjust(stats$p.value, "holm")
list(design=designs[[i]],
table=stats)
})
names(analyses) <- names(covariate.sets)
analyses
}
ewas.test.models <- function(cpg,AFUN,designs,weights,winsorize.pct,outlier.iqr.factor,...) {
cpg <- meffil::meffil.handle.outliers(cpg,winsorize.pct, outlier.iqr.factor)
idx <- which(!is.na(cpg))
cpg <- cpg[idx]
if (!is.null(weights)) weights <- weights[idx]
t(sapply(designs, function(design) {
AFUN(cpg,design[idx,],weights,...)
}))
}
ewas.test.glm <- function(cpg,design,weights,...) {
fit <- glm.fit(x=design,y=cpg,weights=weights,...)
ret <- coef(summary.glm(fit))["variable",]
names(ret) <- c("coefficient","coefficient.se","t.statistic","p.value")
c(ret,
coefficient.ci.low=ret[["coefficient"]]-1.96*ret[["coefficient.se"]],
coefficient.ci.high=ret[["coefficient"]]+1.96*ret[["coefficient.se"]],
n=sum(!is.na(cpg)))
}
ewas.test.rlm <- function(cpg,design,weights=NULL,...) {
tryCatch({
fit <- MASS::rlm(x=design,y=cpg,...)
ret <- lmtest::coeftest(fit, vcov=sandwich::vcovHC(fit, type="HC0"))
ci <- confint(ret)["variable",]
ret <- ret["variable",]
names(ret) <- c("coefficient","coefficient.se","t.statistic","p.value")
c(ret,
coefficient.ci.low=ci[["2.5 %"]],
coefficient.ci.high=ci[["97.5 %"]],
n=sum(!is.na(cpg)))
}, error=function(e) {
c("coefficient"=NA,"coefficient.se"=NA,"t.statistic"=NA,"p.value"=NA,
"coefficient.ci.low"=NA,"coefficient.ci.high"=NA,n=sum(!is.na(cpg)))
})
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.