Nothing
#' Test differential methylation on each locus
#'
#' The function takes a beta value matrix with probes on the rows and
#' samples on the columns. It also takes a sample information data frame
#' (sample.data) and formula for testing. The function outputs a list of
#' coefficient tables for each factor tested.
#'
#' @param betas beta values
#' @param sample.data data frame for sample information, column names
#' are predictor variables (e.g., sex, age, treatment, tumor/normal etc)
#' and are referenced in formula. Rows are samples.
#' @param formula formula
#' @param se.lb lower bound to standard error of slope, lower this to get
#' more difference of small effect size.
#' @param balanced whether design is balanced or not. default to FALSE, when
#' unbalanced will use Welch's method to estimate standard error.
#' balance=TRUE is faster.
#' @param cf.test factors to test (default to all factors in formula except
#' intercept). Use "all" for all factors.
#' @return cf - a list of coefficient tables for each factor
#' @import stats
#' @examples
#' data <- sesameDataGet('HM450.76.TCGA.matched')
#' cf <- DML(data$betas, data$sampleInfo, ~type)
#' @export
DML <- function(
betas, sample.data, formula, se.lb=0.06, balanced=FALSE, cf.test=NULL) {
design <- model.matrix(formula, sample.data)
## convert to factor for faster processing
design.fac <- data.frame(lapply(as.data.frame(design), as.factor))
rdf0 <- unlist(
lapply(design.fac, function(x) max(min(tabulate(x)),1)),
recursive = FALSE, use.names=FALSE)
n.cpg <- dim(betas)[1]
n.cf <- dim(design)[2]
## cf.test specify the factors to be reported
if (is.null(cf.test)) {
cf.test <- colnames(design)
cf.test <- cf.test[cf.test != '(Intercept)']
} else if (cf.test == 'all') {
cf.test <- colnames(design)
}
## preprare output
cf <- lapply(cf.test, function(cfi) matrix(
data=NA, nrow=n.cpg, ncol=5,
dimnames=list(rownames(betas),
c('Estimate', 'Std. Error', 't-stat', 'Pr(>|t|)', 'Effect size'))))
names(cf) <- cf.test
message('Testing differential methylation on each locus:')
for (i in seq_len(n.cpg)) {
if (i%%ceiling(n.cpg/80)==0) message('.', appendLF=FALSE);
## filter NA
sample.is.na <- is.na(betas[i,])
if (all(sample.is.na)) next;
if (any(sample.is.na)) { # this "if" improves performance
design1 <- design[!sample.is.na,,drop=FALSE]
design1.fac <- design.fac[!sample.is.na,]
betas1 <- betas[i,!sample.is.na]
if (sum(apply(
design1[,2:n.cf,drop=FALSE], 2,
function(x) length(unique(x)))==1) > 0) next;
rdf <- unlist(lapply(design1.fac, function(x) max(min(
tabulate(x)),1)), recursive = FALSE, use.names=FALSE)
} else {
design1 <- design
design1.fac <- design.fac
betas1 <- betas[i,]
rdf <- rdf0
}
## sigma is padded, boundary adjusted, and log2
## damped (should use beta distribution)
## wts <- rep(1, length(betas1))
pad <- 0.01
betas1.padded <- pmin(pmax(betas1, pad), 1-pad)
wts <- 1/(betas1.padded*(1-betas1.padded)) # 1/var
## QR-solve weighted least square
z <- .lm.fit(design1*wts, betas1*wts)
names(z$coefficients) <- colnames(design1)
p1 <- seq_len(z$rank)
coefs <- z$coefficients[z$pivot[p1]]
residuals <- z$residuals / wts
if (balanced) {
se <- sum(residuals^2)/(length(residuals)-1)
} else {
## Welch-Satterthwaite se
rs <- residuals^2
se <- unlist(
lapply(design1.fac, function(group1) {sqrt(sum(
vapply(split(
rs, group1), mean, numeric(1)), na.rm=TRUE))}),
recursive = FALSE, use.names=FALSE)
}
## lower bound coefficient se, this selects against
## high effect size differences
se <- pmax(se, se.lb)
## t-statistics
t.stat <- coefs / se
pval <- 2*pt(abs(t.stat), rdf, lower.tail=FALSE)
stopifnot(!any(is.na(pval)))
## output
fitted.rg <- range(betas1 - residuals)
eff <- fitted.rg[2] - fitted.rg[1] # effect size
ans <- cbind(coefs, se, t.stat, pval, eff)
for (cfi in cf.test) {
if (cfi %in% rownames(ans))
cf[[cfi]][i,] <- ans[cfi,]
}
## betas.fitted[i,!sample.is.na] <- betas1 - residuals
}
message('.\n', appendLF=FALSE)
## CpGs are correlated, should apply p-value adjustment on segments
## cf <- lapply(cf, function(cf1) cbind(cf1,
## P.adjusted=p.adjust(cf1[,'Pr(>|t|)'],method='BH')))
message('Significant loci (p<0.05):')
sigcnts <- lapply(cf, function(x) sum(x[,'Pr(>|t|)'] < 0.05, na.rm=TRUE))
sigmsg <- lapply(seq_along(sigcnts), function(i) sprintf(
' - %s: %d significant loci.', names(sigcnts[i]), sigcnts[i][[1]]))
message(do.call(paste0, list(sigmsg, collapse='\n')))
cf
}
#' Find Differentially Methylated Region (DMR)
#'
#' This subroutine uses Euclidean distance to group CpGs and
#' then combine p-values for each segment. The function performs DML test first
#' if cf is NULL. It groups the probe testing results into differential
#' methylated regions in a coefficient table with additional columns
#' designating the segment ID and statistical significance (P-value) testing
#' the segment.
#'
#' @param betas beta values for distance calculation
#' @param sample.data data frame for sample information, column names
#' are predictor variables (e.g., sex, age, treatment, tumor/normal etc)
#' and are referenced in formula. Rows are samples.
#' @param formula formula
#' @param cf coefficient table from diffMeth, when NULL will be computed from
#' beta. If cf is given, sample.data and formula are ignored.
#' @param dist.cutoff distance cutoff (default to use dist.cutoff.quantile)
#' @param seg.per.locus number of segments per locus
#' higher value leads to more segments
#' @param platform EPIC or HM450
#' @param refversion hg38 or hg19
#' @param ... additional parameters to DML
#' @return coefficient table with segment ID and segment P-value
#' @examples
#' data <- sesameDataGet('HM450.76.TCGA.matched')
#' cf <- DMR(data$betas, data$sampleInfo, ~type)
#'
#' @export
DMR <- function(
betas, sample.data = NULL,
formula = NULL, cf = NULL, dist.cutoff = NULL,
seg.per.locus = 0.5, platform = c('EPIC','HM450'),
refversion = c('hg38','hg19'), ...) {
pkgTest('GenomicRanges')
platform <- match.arg(platform)
refversion <- match.arg(refversion)
if (is.null(cf)) {
if (is.null(sample.data) || is.null(formula))
stop('Need either cf or sample data and formula to run DML.')
cf <- DML(betas, sample.data, formula, ...)
}
## filter NA
betas.noNA <- betas[!apply(betas, 1, function(x) all(is.na(x))),]
## sort by coordinates
probe.coords <- sesameDataGet(paste0(
platform, '.probeInfo'))[[paste0('mapped.probes.', refversion)]]
cpg.ids <- intersect(rownames(betas.noNA), names(probe.coords))
probe.coords <- GenomicRanges::sort(probe.coords[cpg.ids])
betas.coord.srt <- betas.noNA[names(probe.coords),]
message("Merging correlated CpGs ... ", appendLF=FALSE)
cpg.ids <- rownames(betas.coord.srt)
cpg.coords <- probe.coords[cpg.ids]
cpg.chrm <- as.vector(GenomicRanges::seqnames(cpg.coords))
cpg.start <- GenomicRanges::start(cpg.coords)
cpg.end <- GenomicRanges::end(cpg.coords)
n.cpg <- length(cpg.ids)
## euclidean distance suffcies
beta.dist <- vapply(seq_len(n.cpg-1), function(i) sum(
(betas.coord.srt[i,] - betas.coord.srt[i+1,])^2, na.rm=TRUE), 1)
## 1-correlation coefficient
## beta.dist <- sapply(seq_len(n.cpg-1), function(i) {
## x <- cor(betas.coord.srt[i,],betas.coord.srt[i+1,],
## use='na.or.complete',method='spearman')
## if (is.na(x)) x <- 0
## 1-x
## })
chrm.changed <- (cpg.chrm[-1] != cpg.chrm[-n.cpg])
## empirical cutoff based on quantiles
if (is.null(dist.cutoff))
dist.cutoff <- quantile(beta.dist, 1-seg.per.locus)
change.points <- (beta.dist > dist.cutoff | chrm.changed)
seg.ids <- cumsum(c(TRUE,change.points))
message("Done.")
## add back unmapped
all.cpg.ids <- rownames(betas)
unmapped <- all.cpg.ids[!(all.cpg.ids %in% cpg.ids)]
cpg.ids <- c(cpg.ids, unmapped)
cpg.chrm <- c(cpg.chrm, rep('*', length(unmapped)))
cpg.start <- c(cpg.start, rep(NA, length(unmapped)))
cpg.end <- c(cpg.end, rep(NA, length(unmapped)))
## segment coordinates
seg.ids <- c(
seg.ids, seq.int(
from=seg.ids[length(seg.ids)]+1, length.out=length(unmapped)))
seg.chrm <- as.vector(tapply(cpg.chrm, seg.ids, function(x) x[1]))
seg.start <- as.vector(tapply(cpg.start, seg.ids, function(x) x[1]))
seg.end <- as.vector(tapply(cpg.end, seg.ids, function(x) x[length(x)]))
message(sprintf('Generated %d segments.', seg.ids[length(seg.ids)]))
## combine p-value
message("Combine p-values ... ")
cfnames <- names(cf)
cf <- lapply(seq_along(cf), function(i) {
cf1 <- cf[[i]]
## Stouffer's Z-score method
seg.pval <- as.vector(tapply(
cf1[cpg.ids, 'Pr(>|t|)'], seg.ids,
function(x) pnorm(sum(qnorm(x))/sqrt(length(x)))))
seg.pval.adj <- p.adjust(seg.pval, method="BH")
seg.ids.cf <- match(rownames(cf1), cpg.ids)
cf1 <- as.data.frame(cf1)
cf1$Seg.ID <- seg.ids[seg.ids.cf]
cf1$Seg.chrm <- seg.chrm[cf1$Seg.ID]
cf1$Seg.start <- seg.start[cf1$Seg.ID]
cf1$Seg.end <- seg.end[cf1$Seg.ID]
cf1$Seg.Pval <- seg.pval[cf1$Seg.ID]
cf1$Seg.Pval.adj <- seg.pval.adj[cf1$Seg.ID]
message(sprintf(
' - %s: %d significant segments.',
cfnames[i], sum(seg.pval<0.05, na.rm=TRUE)))
message(sprintf(
' - %s: %d significant segments (after BH).',
cfnames[i], sum(seg.pval.adj<0.05, na.rm=TRUE)))
cf1
})
names(cf) <- cfnames
message("Done.")
cf
}
#' Top segments in differential methylation
#'
#' This is a utility function to show top differential methylated segments.
#' The function takes a coefficient table as input and output the same
#' table ordered by the sigificance of the segments.
#'
#' @param cf1 coefficient table of one factor from DMR
#' @return coefficient table ordered by adjusted p-value of segments
#' @examples
#' data <- sesameDataGet('HM450.76.TCGA.matched')
#' cf <- DMR(data$betas, data$sampleInfo, ~type)
#' topSegments(cf[[1]])
#' @export
topSegments <- function(cf1) {
x <- unique(cf1[order(cf1[,'Seg.Pval']), c(
'Seg.ID','Seg.chrm','Seg.start','Seg.end','Seg.Pval','Seg.Pval.adj')])
rownames(x) <- x[['Seg.ID']]
x
}
#' Top loci in differential methylation
#'
#' This is a convenience function to show top differential methylated segments.
#' The function takes a coefficient table as input and output the same
#' table ordered by the sigificance of the locus.
#'
#' @param cf1 coefficient table of one factor from diffMeth
#' @return coefficient table ordered by p-value of each locus
#' @examples
#' data <- sesameDataGet('HM450.76.TCGA.matched')
#' cf <- DMR(data$betas, data$sampleInfo, ~type)
#' topLoci(cf[[1]])
#' @export
topLoci <- function(cf1) {
cf1[order(cf1[,'Pr(>|t|)']),]
}
#' Select segment from coefficient table
#'
#' This function takes a coefficient table and returns a subset of the table
#' targeting only the specified segment using segment ID.
#'
#' @param cf1 coefficient table of one factor from DMR
#' @param seg.id segment ID
#' @return coefficient table from given segment
#' @examples
#' data <- sesameDataGet('HM450.76.TCGA.matched')
#' cf <- DMR(data$betas, data$sampleInfo, ~type)
#' getSegment(cf[[1]], cf[[1]][['Seg.ID']][1])
#' @export
getSegment <- function(cf1, seg.id) {
cf1[cf1[,'Seg.ID']==seg.id,]
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.