# == title
# Correlated regions in extended gene model
#
# == param
# -site positions of CpG sites, one per row of ``meth``
# -meth methylation matrix corresponding to ``site`` (rows are CpG sites, columns match ``expr``)
# -cov coverage matrix, indexed the same way as ``meth``
# -expr expression for current gene, one value per column of ``meth``
# -chr chromosome
# -cov_cutoff cutoff for coverage, methylation values with coverage below it are ignored
# -min_dp minimal number of non-NA values for calculating correlations
# -cor_method method for calculating correlations
# -window_size how many CpG sites in a window
# -factor subtype
# -max_width maximum width of a window (accepted but not used by this function)
#
# == details
# CpG sites are cut into consecutive, non-overlapping windows of ``window_size`` sites.
# Window starts are ``1, 1 + window_size, ...`` and a window is only kept when another
# window start follows it, so the last group of sites is always dropped and at least
# ``window_size + 1`` sites are needed. For every window the mean methylation per sample
# is calculated and correlated to ``expr``.
#
# == value
# A ``GRanges`` object with one range per window and meta columns ``n``, ``mean_meth_*``,
# ``corr``, ``corr_p`` and ``meth_IQR``. If ``factor`` has more than one unique value,
# ``meth_anova`` and ``meth_diameter`` are added as well.
#
correlated_regions_per_gene = function(site, meth, cov, expr, chr, cov_cutoff = 3, min_dp = 4,
    cor_method = "spearman", window_size = 5, factor = NULL, max_width = 10000) {

    if(ncol(meth) != length(expr)) {
        stop("number of columsn of `meth` should be same as length of `expr`.\n")
    }

    # start index of every window; only windows followed by another start are used
    index = seq(1, length(site), by = window_size)
    i = seq_len(length(index) - 1)
    if(length(i) == 0) {
        # without this check the failure would surface later as a cryptic `colnames<-` error
        stop("too few CpG sites (", length(site), ") to form a window of ", window_size,
            " sites, more than `window_size` sites are needed.")
    }
    ir = IRanges(site[index[i]], site[index[i+1]-1])

    # mean methylation per window (rows) and per sample (columns), only using
    # CpG sites whose coverage reaches `cov_cutoff` in that sample
    m = lapply(index[i], function(x) {
        ind = x+0:(window_size-1)
        meth_m = meth[ind, , drop = FALSE]
        cov_m = cov[ind, , drop = FALSE]
        vapply(seq_len(ncol(meth_m)), function(k) {
            xm = meth_m[, k]
            ym = cov_m[, k]
            mean(xm[ym >= cov_cutoff], na.rm = TRUE)
        }, numeric(1))
    })
    m = do.call('rbind', m)
    colnames(m) = paste0("mean_meth_", colnames(meth))

    # correlation between window methylation and expression, NA if fewer than
    # `min_dp` samples have a methylation value
    corr = apply(m, 1, function(x) {
        l = !is.na(x)
        if(sum(l) < min_dp) return(NA)
        cor(x[l], expr[l], method = cor_method)
    })
    corr_p = suppressWarnings(apply(m, 1, function(x) {
        l = !is.na(x)
        if(sum(l) < min_dp) return(NA)
        cor.test(x[l], expr[l], method = cor_method)$p.value
    }))

    # a single subtype carries no grouping information
    if(!is.null(factor) && length(unique(factor)) == 1) {
        factor = NULL
    }

    if(!is.null(factor)) {
        factor = as.vector(factor)
        # one-way ANOVA of methylation between subtypes, needs at least two
        # subtypes with at least two values each
        meth_anova = apply(m, 1, function(x) {
            l = !is.na(x)
            data = data.frame(value = x[l], class = factor[l], stringsAsFactors = FALSE)
            if(length(unique(data$class)) < 2) return(NA)
            if(any(table(data$class) < 2)) return(NA)
            oneway.test(value ~ class, data = data)$p.value
        })
        # `diameter()` (defined elsewhere) applied on the mean methylation per subtype
        meth_diameter = apply(m, 1, function(x) {
            l = !is.na(x)
            if(any(table(factor[l]) < 2)) return(NA)
            diameter(as.vector(tapply(x[l], factor[l], mean)))
        })
    }

    # NA values are removed in both branches so that genes with a single window
    # are treated the same way as genes with several windows
    # NOTE(review): `iqr` is presumably matrixStats::iqr - confirm
    if(nrow(m) == 1) {
        meth_IQR = iqr(m, na.rm = TRUE)
    } else {
        meth_IQR = rowIQRs(m, na.rm = TRUE)
    }

    gr = GRanges(seqnames = rep(chr, length(ir)),
        ranges = ir)

    df = DataFrame(n = window_size,
        m,   # mean methylation
        corr = corr,
        corr_p = corr_p,
        meth_IQR = meth_IQR)
    if(!is.null(factor)) {
        df$meth_anova = meth_anova
        df$meth_diameter = meth_diameter
    }
    mcols(gr) = df

    return(gr)
}
# == title
# Correlated regions
#
# == param
# -sample_id sample id, used to select columns of ``expr`` and of the methylation data
# -expr expression matrix, row names are gene ids
# -txdb ``TranscriptDb`` object
# -chr chromosome
# -extend extension of gene model, both upstream and downstream
# -cov_filter function to filter on coverage, applied to the coverage of every CpG site and should return ``TRUE`` or ``FALSE``; ``NULL`` disables the filter
# -cor_method method to calculate correlation
# -factor subtype
# -window_size how many CpGs in a window
# -max_width maximum width of a window
# -raw_meth whether use raw methylation value (unsmoothed)
# -cov_cutoff cutoff for coverage, it is reset to 0 if ``raw_meth`` is ``FALSE``
# -min_dp minimal non-NA values for calculating correlations
# -col color for subtypes, only stored as an attribute of the returned object
#
# == detail
# based on `correlated_regions_per_gene`
#
# == value
# A ``GRanges`` object which contains the regions of all genes, with additional meta columns
# ``gene_id``, ``gene_tss_dist``, ``tx_tss_dist`` and ``nearest_tx_tss``. The settings
# (``factor``, ``col``, ``cor_method``, ``extend``, ``window_size``, ``sample_id``, ``cov_filter``,
# ``cov_cutoff``, ``raw_meth``, ``min_dp``) are attached as attributes.
correlated_regions = function(sample_id, expr, txdb, chr, extend = 50000,
    cov_filter = function(x) sum(x > 0, na.rm = TRUE) > length(x)/2,
    cor_method = "spearman", factor = NULL, window_size = 5, max_width = 10000,
    raw_meth = FALSE, cov_cutoff = 3, min_dp = 4, col = NULL) {

    qqcat("extracting gene model (extend = @{extend}, chr = @{chr})...\n")
    gene = genes(txdb)
    tx_list = transcriptsBy(txdb, by = "gene")

    # only genes on `chr` which also have expression values
    gene = gene[seqnames(gene) == chr]
    g = intersect(rownames(expr), names(gene))
    expr = expr[g, , drop = FALSE]
    gene = gene[g]
    tx_list = tx_list[g]

    # extended gene model, start positions are not allowed to be smaller than 1
    genemodel = gene
    start(genemodel) = start(genemodel) - extend
    end(genemodel) = end(genemodel) + extend
    s = start(genemodel)
    start(genemodel) = ifelse(s > 0, s, 1)

    expr = expr[, sample_id, drop = FALSE]
    all_gi = rownames(expr)
    n_gene = length(all_gi)

    methylation_hooks$set(chr)
    site = methylation_hooks$site()
    if(raw_meth) {
        meth = methylation_hooks$raw(col_index = sample_id)
    } else {
        meth = methylation_hooks$meth(col_index = sample_id)
    }
    cov = methylation_hooks$coverage(col_index = sample_id)

    # remove CpG sites which do not pass the coverage filter
    if(!is.null(cov_filter)) {
        l = apply(cov, 1, cov_filter)
        if(any(is.na(l))) {
            stop("`cov_filter` generates `NA`, check it.")
        }
        site = site[l]
        meth = meth[l, , drop = FALSE]
        cov = cov[l, , drop = FALSE]
    }

    # the message prefix is changed for every gene, make sure the original prefix
    # is restored even if an error happens in the middle of the loop
    op = qq.options("cat_prefix")
    on.exit(qq.options(cat_prefix = op), add = TRUE)

    # the coverage cutoff is only applied when raw methylation is used
    if(!raw_meth) cov_cutoff = 0

    res = GRanges()
    # NOTE: `op`, `chr`, `gi`, `i` and `n_gene` are referenced by name in the `qq()` templates
    for(i in seq_len(n_gene)) {

        # current gene name
        gi = all_gi[i]

        # expression of current gene
        e = expr[gi, ]

        if(is.function(op)) {
            qq.options(cat_prefix = function(x) {
                qq("@{op()} [@{chr}:@{gi}, @{i}/@{n_gene}]")
            })
        } else {
            qq.options(cat_prefix = qq("@{op} [@{chr}:@{gi}, @{i}/@{n_gene}]"))
        }

        # skip genes without expression or with constant expression
        if(all(e == 0) || sd(e) == 0) {
            qqcat("@{gi} has zero expression in all samples, skip\n")
            next
        }

        # CpG sites in the extended gene model
        start = start(genemodel[gi])
        end = end(genemodel[gi])
        gm_site_index = extract_sites(start, end, site, index = TRUE)
        gm_site = site[gm_site_index]
        gm_meth = meth[gm_site_index, sample_id, drop = FALSE]
        gm_cov = cov[gm_site_index, sample_id, drop = FALSE]

        if(length(gm_site) < 10) {
            qqcat("@{gi} has too few cpg sites, skip\n")
            next
        }

        qqcat("...\n")
        gr = correlated_regions_per_gene(gm_site, gm_meth, gm_cov, e, cov_cutoff = cov_cutoff, chr = chr,
            factor = factor, cor_method = cor_method, window_size = window_size, min_dp = min_dp,
            max_width = max_width)
        gr$gene_id = rep(gi, length(gr))

        ## distance to gene tss, the sign is flipped for regions located before
        ## the tss with respect to the strand of the gene
        tss = promoters(gene[gi], upstream = 1, downstream = 0)
        gene_tss_dist = as.data.frame(distanceToNearest(gr, tss))[, 3]
        if(as.vector(strand(gene[gi])) == "+") {
            gene_tss_dist = ifelse(end(gr) < start(tss), -gene_tss_dist, gene_tss_dist)
        } else if(as.vector(strand(gene[gi])) == "-") {
            gene_tss_dist = ifelse(start(gr) > end(tss), -gene_tss_dist, gene_tss_dist)
        }
        gr$gene_tss_dist = gene_tss_dist

        ## distance to tx tss, same sign convention, measured to the nearest transcript
        tx = tx_list[[gi]]
        tx = tx[tx$tx_name != gi]
        tss = promoters(tx, upstream = 1, downstream = 0)
        dist = as.data.frame(distanceToNearest(gr, tss))
        tx_tss_dist = dist[, 3]
        if(as.vector(strand(gene[gi])) == "+") {
            tx_tss_dist = ifelse(end(gr) < start(tss[dist[,2]]), -tx_tss_dist, tx_tss_dist)
        } else if(as.vector(strand(gene[gi])) == "-") {
            tx_tss_dist = ifelse(start(gr) > end(tss[dist[,2]]), -tx_tss_dist, tx_tss_dist)
        }
        gr$tx_tss_dist = tx_tss_dist
        gr$nearest_tx_tss = tss[dist[,2]]$tx_name

        res = c(res, gr)
    }

    # keep the settings together with the result, they are read by the downstream functions
    attr(res, "factor") = factor
    attr(res, "col") = col
    attr(res, "cor_method") = cor_method
    attr(res, "extend") = extend
    attr(res, "window_size") = window_size
    attr(res, "sample_id") = sample_id
    attr(res, "cov_filter") = cov_filter
    # the cutoff which was really applied (0 if smoothed methylation is used)
    attr(res, "cov_cutoff") = cov_cutoff
    attr(res, "raw_meth") = raw_meth
    attr(res, "min_dp") = min_dp

    return(res)
}
# == title
# Filter correlated regions
#
# == param
# -chromosome chromosomes
# -template template to find cr files, it is interpolated by ``qq()`` with the variable ``chr`` and the file is read by ``readRDS()``
# -cutoff cutoff of adjusted p-values of the correlation. If two values are given, the first one is used for positive correlations and the second one for negative correlations
# -adj_method method for calculating adjusted p-values
# -meth_diameter_cutoff cutoff for diameters
# -meth_IQR_cutoff cutoff for IQR, if there is no subtype information, IQR is used to remove less variable methylation
# -anova_cutoff cutoff for the adjusted p-values of the ANOVA test
#
# == details
# The cr files are read twice: first to pool the statistics of all chromosomes and to calculate
# adjusted p-values, second to select the regions which pass the cutoffs. Regions with ``NA``
# in ``corr`` or ``corr_p`` (and in ``meth_anova`` or ``meth_diameter`` if they exist) are removed
# before the adjustment.
#
# == value
# A ``GRanges`` object with the regions that pass the filters, with an additional ``corr_fdr``
# column (and ``meth_anova_fdr`` if subtype information exists). Attributes are copied from
# the cr object of the last chromosome.
#
filter_correlated_regions = function(chromosome = paste0("chr", 1:22), template,
    cutoff = 0.05, adj_method = "BH", meth_diameter_cutoff = 0.25, meth_IQR_cutoff = 0.25,
    anova_cutoff = 0.05) {

    if(length(chromosome) == 0) {
        stop("`chromosome` should contain at least one chromosome.")
    }

    if(length(cutoff) == 1) cutoff = rep(cutoff, 2)

    # remove regions for which the statistics are not available
    remove_na_regions = function(cr, has_anova) {
        l = (!is.na(cr$corr)) & (!is.na(cr$corr_p))
        if(has_anova) {
            l = l & (!is.na(cr$meth_anova)) & (!is.na(cr$meth_diameter))
        }
        cr[l]
    }

    cat("calculate fdr...\n")
    corr_p = NULL
    meth_anova = NULL
    meth_diameter = NULL
    meth_IQR = NULL
    chr_name = NULL
    corr = NULL
    has_anova = NULL

    # NOTE: the loop variable must be called `chr` because `template` refers to it
    for(chr in chromosome) {
        qqcat("reading cr for @{chr}\n")
        cr = readRDS(qq(template))

        # all files should be generated with the same settings, or else the pooled
        # vectors would not be aligned
        cr_has_anova = "meth_anova" %in% colnames(mcols(cr))
        if(is.null(has_anova)) {
            has_anova = cr_has_anova
        } else if(!identical(has_anova, cr_has_anova)) {
            stop("cr files are not consistent, `meth_anova` column only exists in some of them.")
        }

        cr = remove_na_regions(cr, has_anova)

        corr_p = c(corr_p, cr$corr_p)
        corr = c(corr, cr$corr)
        if(has_anova) {
            meth_anova = c(meth_anova, cr$meth_anova)
            meth_diameter = c(meth_diameter, cr$meth_diameter)
        } else {
            # IQR is calculated from the mean methylation columns instead of using `cr$meth_IQR`
            meth_mat = as.matrix(mcols(cr)[, grep("^mean_meth", colnames(mcols(cr)))])
            meth_IQR = c(meth_IQR, rowIQRs(meth_mat, na.rm = TRUE))
        }
        chr_name = c(chr_name, rep(chr, length(cr)))
    }

    if(has_anova) {
        # first filter by differential methylation, then adjust correlation p-values
        # only for the regions which are kept
        anova_fdr = p.adjust(meth_anova, method = adj_method)
        l = anova_fdr <= anova_cutoff & meth_diameter >= meth_diameter_cutoff
        qqcat("filter out @{sum(!l)}/@{length(l)} by differential methylation.\n")
        corr_fdr = rep(Inf, length(corr_p))
        corr_fdr[l] = p.adjust(corr_p[l], method = adj_method)
        l = l & ifelse(corr > 0, corr_fdr <= cutoff[1], corr_fdr <= cutoff[2])
    } else {
        corr_fdr = p.adjust(corr_p, method = adj_method)
        l = ifelse(corr > 0, corr_fdr <= cutoff[1], corr_fdr <= cutoff[2]) & meth_IQR >= meth_IQR_cutoff & !is.na(meth_IQR)
    }
    l = l & !is.na(corr)

    cat("filter by fdr...\n")
    cr2 = GRanges()
    for(chr in chromosome) {
        qqcat("reading cr for @{chr}\n")
        cr = readRDS(qq(template))
        # same filtering as in the first pass so that `lo` matches the regions in `cr`
        cr = remove_na_regions(cr, has_anova)

        lo = chr_name == chr
        cr$corr_fdr = corr_fdr[lo]
        if(has_anova) {
            cr$meth_anova_fdr = anova_fdr[lo]
        }
        if(sum(l[lo]) > 0) {
            cr2 = suppressWarnings(c(cr2, cr[l[lo]]))
        }
    }

    # settings are taken from the cr object of the last chromosome
    attr(cr2, "factor") = attr(cr, "factor")
    attr(cr2, "col") = attr(cr, "col")
    attr(cr2, "cor_method") = attr(cr, "cor_method")
    attr(cr2, "extend") = attr(cr, "extend")
    attr(cr2, "window_size") = attr(cr, "window_size")
    attr(cr2, "sample_id") = attr(cr, "sample_id")
    attr(cr2, "cov_filter") = attr(cr, "cov_filter")
    attr(cr2, "cov_cutoff") = attr(cr, "cov_cutoff")
    attr(cr2, "raw_meth") = attr(cr, "raw_meth")
    attr(cr2, "min_dp") = attr(cr, "min_dp")

    cr2
}
# == title
# Plot that helps to choose a gap
#
# == param
# -cr correlated regions, ``corr`` and ``gene_id`` columns are used
#
# == details
# Negatively and positively correlated regions are handled separately. For each of them,
# regions are split by gene, ``rainfallTransform()`` is applied per gene and the density of
# the log10-transformed ``dist`` column is plotted for distances not smaller than
# 10, 100, 500, 1000 and 2000 bp. Each of the two groups is drawn on its own 2 x 3 page.
# The ``mfrow`` setting which was active before the call is restored afterwards.
reduce_cr_gap_test = function(cr) {

    # do not leave the 2 x 3 layout behind on the graphics device
    old_mfrow = par("mfrow")
    on.exit(par(mfrow = old_mfrow), add = TRUE)

    # one page of density plots for the regions in `gr`, `label` is used in the titles
    plot_dist_density = function(gr, label) {
        gr_list = split(as.data.frame(gr), gr$gene_id)
        rainfall = do.call("rbind", lapply(gr_list, rainfallTransform))
        x = rainfall$dist

        par(mfrow = c(2, 3))
        for(i in c(10, 100, 500, 1000, 2000)) {
            main = paste0(label, ", dist >= ", i, "bp")
            # `which()` also drops missing distances
            d = x[which(x >= i)]
            if(length(d) < 2) {
                # density() needs at least two values, draw an empty panel so that
                # the remaining panels stay in place
                plot.new()
                title(main = main)
                box()
                next
            }
            plot(density(log10(d)), axes = FALSE, main = main)
            axis(side = 1, at = 1:10, labels = 10^(1:10))
            axis(side = 2)
            box()
        }
    }

    plot_dist_density(cr[cr$corr < 0], "neg_cr")
    plot_dist_density(cr[cr$corr > 0], "pos_cr")
}
# == title
# Reduce cr regions
#
# == param
# -cr cr, attributes ``sample_id``, ``cor_method`` and ``raw_meth`` are used
# -expr expression matrix, indexed by gene id (rows) and sample id (columns)
# -txdb txdb
# -max_gap maximum gap, passed to ``reduce2()``
# -gap gap, passed to ``reduce2()``
# -mc.cores number of cores, passed to ``mclapply()``
#
# == detail
# pos_CR and neg_CR are reduced separately for every gene. Mean methylation of the merged
# regions is calculated as the mean of the original windows weighted by the number of CpG
# sites, and the correlation to expression is calculated again on the merged regions.
#
# == value
# A ``GRanges`` object with meta columns ``n``, ``mean_meth_*``, ``corr``, ``gene_id``,
# ``gene_tss_dist``, ``tx_tss_dist`` and ``nearest_tx_tss``. Attributes are copied from ``cr``.
reduce_cr = function(cr, expr, txdb, max_gap = 1000, gap = 1.0, mc.cores = 1) {

    sample_id = attr(cr, "sample_id")
    cor_method = attr(cr, "cor_method")
    raw_meth = attr(cr, "raw_meth")
    # `isTRUE()` because the attribute may be missing
    if(isTRUE(raw_meth)) warning("only smoothed meth is supported")

    qqcat("extracting gene and tx models.\n")
    gene = genes(txdb)
    tx_list = transcriptsBy(txdb, by = "gene")

    # merge regions of one gene (all with the same direction of correlation),
    # `e` is the expression of the gene
    reduce_cr_by_gene = function(cr, e, max_gap = 1000) {
        if(length(cr) == 0) return(GRanges())

        # mean methylation is converted to the sum over CpG sites so that
        # merged regions can be converted back to a mean by dividing by `n`
        n = cr$n
        meth_mat = as.matrix(mcols(cr)[, paste0("mean_meth_", sample_id)]) * n
        mcols(cr) = cbind(n = n, meth_mat)
        # NOTE(review): assumes `reduce2()` sums up the meta columns of merged regions - confirm
        gr = reduce2(cr, max_gap = max_gap, gap = gap)
        n = gr$n
        meth_mat = as.matrix(mcols(gr)[, paste0("mean_meth_", sample_id)]) / n
        corr = apply(meth_mat, 1, function(x) cor(x, e, method = cor_method))
        mcols(gr) = cbind(n = n, meth_mat, corr = corr)
        return(gr)
    }

    cr_list = split(cr, cr$gene_id)
    all_gi = names(cr_list)
    n_gene = length(all_gi)

    # iterate over the index so that the progress message is also correct in
    # forked workers, where a shared counter cannot be updated
    res = mclapply(seq_along(all_gi), function(k) {
        gi = all_gi[k]
        qqcat("reducing cr on @{gi}, @{k}/@{n_gene}...\n")
        gene_cr = cr_list[[gi]]
        neg_cr = gene_cr[gene_cr$corr < 0]
        pos_cr = gene_cr[gene_cr$corr > 0]
        gr = c(reduce_cr_by_gene(neg_cr, expr[gi, sample_id], max_gap = max_gap),
               reduce_cr_by_gene(pos_cr, expr[gi, sample_id], max_gap = max_gap))
        gr = sort(gr)
        gr$gene_id = rep(gi, length(gr))

        ## distance to gene tss, the sign is flipped for regions located before
        ## the tss with respect to the strand of the gene
        tss = promoters(gene[gi], upstream = 1, downstream = 0)
        gene_tss_dist = as.data.frame(distanceToNearest(gr, tss))[, 3]
        if(as.vector(strand(gene[gi])) == "+") {
            gene_tss_dist = ifelse(end(gr) < start(tss), -gene_tss_dist, gene_tss_dist)
        } else if(as.vector(strand(gene[gi])) == "-") {
            gene_tss_dist = ifelse(start(gr) > end(tss), -gene_tss_dist, gene_tss_dist)
        }
        gr$gene_tss_dist = gene_tss_dist

        ## distance to tx tss, same sign convention, measured to the nearest transcript
        tx = tx_list[[gi]]
        tx = tx[tx$tx_name != gi]
        tss = promoters(tx, upstream = 1, downstream = 0)
        dist = as.data.frame(distanceToNearest(gr, tss))
        tx_tss_dist = dist[, 3]
        if(as.vector(strand(gene[gi])) == "+") {
            tx_tss_dist = ifelse(end(gr) < start(tss[dist[,2]]), -tx_tss_dist, tx_tss_dist)
        } else if(as.vector(strand(gene[gi])) == "-") {
            tx_tss_dist = ifelse(start(gr) > end(tss[dist[,2]]), -tx_tss_dist, tx_tss_dist)
        }
        gr$tx_tss_dist = tx_tss_dist
        gr$nearest_tx_tss = tss[dist[,2]]$tx_name

        gr
    }, mc.cores = mc.cores)

    # `do.call("c", list())` would give NULL
    if(length(res) == 0) {
        cr2 = GRanges()
    } else {
        cr2 = do.call("c", res)
    }

    # carry over the settings attached to the input
    for(nm in c("factor", "col", "cor_method", "extend", "window_size", "sample_id",
        "cov_filter", "cov_cutoff", "raw_meth", "min_dp")) {
        attr(cr2, nm) = attr(cr, nm)
    }

    cr2
}
# == title
# Add subtype specificity columns
#
# == param
# -cr cr, attributes ``factor`` and ``sample_id`` are used
# -cutoff cutoff for the p-values of the pairwise t-tests
# -suffix suffix which is appended to the subtype names to form the names of the new columns
#
# == details
# For every region, methylation is compared between every pair of subtypes by t-test.
#
# 1 is defined as the methylation is higher than all other subtypes and the difference is significant.
# -1 is defined as the methylation is lower than all other subtypes and the difference is significant.
# All the others are defined as 0, including subtypes for which one of the tests cannot
# be performed (e.g. too few or constant values).
#
# == value
# ``cr`` with one additional meta column per subtype. If there are fewer than two subtypes,
# ``cr`` is returned unchanged with a warning.
add_subtype_specificity = function(cr, cutoff = 0.05, suffix = "_ss") {

    factor = attr(cr, "factor")
    if(length(unique(factor)) <= 1) {
        warning("no grouping settings.")
        return(cr)
    }

    level = unique(factor)
    n_level = length(level)
    sample_id = attr(cr, "sample_id")

    subtype_ss = matrix(nrow = length(cr), ncol = n_level)
    colnames(subtype_ss) = level

    meth_mat = mcols(cr)
    meth_mat = meth_mat[, grep("^mean_meth_", colnames(meth_mat))]
    meth_mat = as.matrix(meth_mat)
    meth_col = paste0("mean_meth_", sample_id)

    # which samples belong to which subtype, it is the same for all regions
    in_level = lapply(seq_len(n_level), function(k) factor == level[k])

    counter = set_counter(length(cr))
    for(i in seq_len(length(cr))) {
        x = meth_mat[i, meth_col]

        # pairwise t-test, t-value and p-value; entries stay NA if a test fails
        mat_t = matrix(NA_real_, nrow = n_level, ncol = n_level)
        mat_p = mat_t
        for(i1 in 2:n_level) {
            for(i2 in seq_len(i1 - 1)) {
                test = tryCatch(t.test(x[in_level[[i1]]], x[in_level[[i2]]]),
                    error = function(e) NULL)
                if(is.null(test)) next
                mat_t[i1, i2] = test$statistic
                mat_p[i1, i2] = test$p.value
                mat_t[i2, i1] = -mat_t[i1, i2]
                mat_p[i2, i1] = mat_p[i1, i2]
            }
        }

        subtype_ss[i, ] = vapply(seq_len(n_level), function(k) {
            # comparisons of subtype k to all other subtypes
            t_value = mat_t[k, -k]
            p_value = mat_p[k, -k]
            # specificity needs every comparison; a missing one must not count
            # as "higher/lower than all others"
            if(anyNA(t_value) || anyNA(p_value)) return(0)
            if(!all(p_value < cutoff)) return(0)
            if(all(t_value > 0)) {
                1
            } else if(all(t_value < 0)) {
                -1
            } else {
                0
            }
        }, numeric(1))

        counter()
    }

    colnames(subtype_ss) = paste0(colnames(subtype_ss), suffix)
    mcols(cr) = cbind(as.data.frame(mcols(cr)), subtype_ss)
    return(cr)
}
# (web page boilerplate, not part of the R source) Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.