pipeline/global_methylation_info.R

 
suppressPackageStartupMessages(library(GetoptLong))

head = "~/project/development/cotools/pipeline/head/head.R"
GetoptLong(c("head=s", "head R script"))

source("~/project/development/cotools/script/load_all.R")
source(head)

# chromosome = c("chr21", "chr22")

#setwd(output_dir)

ha = HeatmapAnnotation(subtype = SAMPLE$type, age = SAMPLE$age, col = list(subtype = SAMPLE_COLOR, age = AGE_COL_FUN))


pdf(qq("@{output_dir}/general_methylation_distribution.pdf"), width = 10, height = 10)
global_methylation_distribution(sample_id = SAMPLE$id, annotation = SAMPLE$type, 
  annotation_color = SAMPLE_COLOR, chromosome = chromosome, ha = ha)
global_methylation_distribution(sample_id = SAMPLE$id, annotation = SAMPLE$type, 
  annotation_color = SAMPLE_COLOR, chromosome = chromosome, by_chr = TRUE, ha = ha)
dev.off()

pdf(qq("@{output_dir}/general_methylation_distribution_cgi.pdf"), width = 10, height = 10)
global_methylation_distribution(sample_id = SAMPLE$id, annotation = SAMPLE$type, 
  annotation_color = SAMPLE_COLOR, chromosome = chromosome, background = GENOMIC_FEATURE_LIST$cgi, p = 0.01, ha = ha)
global_methylation_distribution(sample_id = SAMPLE$id, annotation = SAMPLE$type, 
  annotation_color = SAMPLE_COLOR, chromosome = chromosome, by_chr = TRUE, background = GENOMIC_FEATURE_LIST$cgi, p = 0.01, ha = ha)
dev.off()


pdf(qq("@{output_dir}/general_methylation_distribution_cgi_shores.pdf"), width = 10, height = 10)
extended_cgi = GENOMIC_FEATURE_LIST$cgi
start(extended_cgi) = start(extended_cgi) - 2000
end(extended_cgi) = end(extended_cgi) + 2000
shore = setdiff(extended_cgi, GENOMIC_FEATURE_LIST$cgi)
global_methylation_distribution(sample_id = SAMPLE$id, annotation = SAMPLE$type, 
  annotation_color = SAMPLE_COLOR, chromosome = chromosome, background = shore, p = 0.01, ha = ha)
global_methylation_distribution(sample_id = SAMPLE$id, annotation = SAMPLE$type, 
  annotation_color = SAMPLE_COLOR, chromosome = chromosome, by_chr = TRUE, background = shore, p = 0.01, ha = ha)
dev.off()

pdf(qq("@{output_dir}/general_methylation_distribution_neither_cgi_nor_shores.pdf"), width = 10, height = 10)
chromInfo = getChromInfoFromUCSC(genome)
chromInfo = chromInfo[chromInfo$chrom %in% chromosome, ]
chromGr = GRanges(chromInfo$chrom, ranges = IRanges(rep(1, nrow(chromInfo)), chromInfo$length))
complement = setdiff(chromGr, extended_cgi)
global_methylation_distribution(sample_id = SAMPLE$id, annotation = SAMPLE$type, 
  annotation_color = SAMPLE_COLOR, chromosome = chromosome, background = complement, ha = ha)
global_methylation_distribution(sample_id = SAMPLE$id, annotation = SAMPLE$type, 
  annotation_color = SAMPLE_COLOR, chromosome = chromosome, by_chr = TRUE, background = complement, ha = ha)
dev.off()


###########################################################
# differnetial methylation in different genomic features
gr_list = get_mean_methylation_in_genomic_features(SAMPLE$id, chromosome = chromosome, gf_list = GENOMIC_FEATURE_LIST)

pdf(qq("@{output_dir}/heatmap_diff_methylation_in_genomic_features.pdf"), width = 16, height = 16)
for(i in seq_along(gr_list)) {
  heatmap_diff_methylation_in_genomic_features(gr_list[[i]], annotation = SAMPLE$type, 
    annotation_color = SAMPLE_COLOR, title = names(gr_list)[i], txdb = txdb, gf_list = GENOMIC_FEATURE_LIST, ha = ha)
}
dev.off()


####################
## split genome into 1kb window
chromInfo = getChromInfoFromUCSC(genome)
chromInfo = chromInfo[chromInfo$chrom %in% chromosome, ]
chromGr = GRanges(chromInfo$chrom, ranges = IRanges(rep(1, nrow(chromInfo)), chromInfo$length))

genome_1kb_window = makeWindows(chromGr, w = 1000)
gr_list_genome = get_mean_methylation_in_genomic_features(SAMPLE$id, chromosome = chromosome, gf_list = list(genome_1kb = genome_1kb_window))


# res1 = genomic_regions_correlation(gr_genome, GENOMIC_FEATURE_LIST, chromosome = chromosome, 
#   background = gr_list_genome[[1]], nperm = 50, stat_fun = genomicCorr.sintersect)


####################
## overlap to CGI
cgi_1kb_window = makeWindows(GENOMIC_FEATURE_LIST$cgi, w = 1000, short.keep = TRUE)
cgi_1kb_window = cgi_1kb_window[width(cgi_1kb_window) > 500]
gr_list_cgi = get_mean_methylation_in_genomic_features(SAMPLE$id, chromosome = chromosome, gf_list = list(cgi_1kb_window = cgi_1kb_window))

shore_1kb_window = makeWindows(shore, w = 1000, short.keep = TRUE)
shore_1kb_window = shore_1kb_window[width(shore_1kb_window) > 500]
gr_list_shore = get_mean_methylation_in_genomic_features(SAMPLE$id, chromosome = chromosome, gf_list = list(shore_1kb_window = shore_1kb_window))

# res2 = genomic_regions_correlation(gr_cgi, GENOMIC_FEATURE_LIST, chromosome = chromosome, 
#   background = gr_list_cgi[[1]], nperm = 50, stat_fun = genomicCorr.sintersect)

genome_1kb_window = makeWindows(setdiff(chromGr, extended_cgi), w = 1000)
genome_1kb_window = genome_1kb_window[width(genome_1kb_window) > 500]
gr_list_genome = get_mean_methylation_in_genomic_features(SAMPLE$id, chromosome = chromosome, gf_list = list(genome_1kb = genome_1kb_window))


save(gr_list_cgi, gr_list_shore, gr_list_genome, file = "/icgc/dkfzlsdf/analysis/hipo/hipo_016/analysis/WGBS_final/results/genome_windows.RData")

pdf(qq("@{output_dir}/heatmap_diff_methylation_1kb_window.pdf"), width = 14, height = 14)
gr_diff_genome = heatmap_diff_methylation_in_genomic_features(gr_list_genome[[1]], annotation = SAMPLE$type, 
    annotation_color = SAMPLE_COLOR, title = "genome 1kb window", ha = ha)
gr_diff_cgi = heatmap_diff_methylation_in_genomic_features(gr_list_cgi[[1]], annotation = SAMPLE$type, 
    annotation_color = SAMPLE_COLOR, title = "cgi 1kb window", ha = ha)
gr_diff_shore = heatmap_diff_methylation_in_genomic_features(gr_list_shore[[1]], annotation = SAMPLE$type, 
    annotation_color = SAMPLE_COLOR, title = "shore 1kb window", ha = ha)
dev.off()


MR_list = list(gr_diff_genome = gr_diff_genome,
             gr_diff_cgi = gr_diff_cgi,
             gr_diff_shore = gr_diff_shore)
res = genomic_regions_correlation(MR_list, GENOMIC_FEATURE_LIST[c("gene", "intron", "intergenic", "repeats_SINE", "repeats_LINE", "dnase", "exon", "tfbs", "tss_2k", "enhancer")], chromosome = chromosome, nperm = 2)
pdf(qq("@{output_dir}/genome_diff_1kb_window_correlation.pdf"), width = 6, height = 6)
ht = Heatmap(res$stat, name = "jaccard", column_title = qq("jaccard coefficient"), cluster_columns = FALSE)
draw(ht)
dev.off()


################################ only for testing ########################
## for the genomic 1kb window, just use the top 5000 most variable windows

# mat = as.matrix(mcols(gr_list_genome[[1]]))
# od = order(rowVars(mat), decreasing = TRUE)[1:round(nrow(mat)*0.05)]
# mat = mat[od, ]
# gr = gr_list_genome[[1]][od]

# ct = cor_cols(t(mat), abs_cutoff = seq(0.5, 0.9, by = 0.025), mc = 2)

# nn = do.call("rbind", lapply(seq(1000, 10000, by = 1000), function(x) apply(ct, 2, function(y) sum(y > x))))
# matplot(y = nn, x = seq(1000, 10000, by = 1000), type = "l")


# ind = which(ct[, "0.8"] >= 1000)
# mcols(gr) = NULL

# ind = sample(ind, 1000)
# mat = mat[ind, ]
# gr = gr[ind, ]

# gr = annotate_to_genomic_features(gr, list(GENOMIC_FEATURE_LIST$cgi, extended_cgi), name = c("cgi", "shore"))

# anno = ifelse(gr$overlap_to_shore > 0, ifelse(gr$overlap_to_cgi > 1 - gr$overlap_to_shore, "CGI", "Shore"), "Others")

# ha = HeatmapAnnotation(subtype = SAMPLE$type, col = list("subtype" = SAMPLE_COLOR))
# ht = Heatmap(mat, name = "methylation", col = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red")), top_annotation = ha, split = anno) + 
#   Heatmap(anno, name = "anno", col = c("CGI" = "orange", "Shore" = "green", "Others" = "blue"))


# pdf("methylation_classification_wgbs.pdf", width = 14, height = 14)
# draw(ht)
# dev.off()



# ## for the genomic 1kb window, just use the top 5000 most variable windows

# mat = as.matrix(mcols(gr_list_cgi[[1]])); rownames(mat) = seq_len(nrow(mat))
# mat = mat[, colnames(mat) != "ncpg"]
# od = order(rowVars(mat), decreasing = TRUE)[1:round(nrow(mat)*0.2)]
# mat = mat[od, ]

# ct_cgi = cor_cols(t(mat), abs_cutoff = seq(0.5, 0.9, by = 0.025), mc = 2)

# ind = which(ct_cgi[, "0.6"] >= 100)
# mcols(gr) = NULL
# mat_cgi = mat[ind, ]
# gr_cgi = gr[ind, ]

# # genome
# mat = as.matrix(mcols(gr_list_genome[[1]]))
# mat = mat[, colnames(mat) != "ncpg"]
# od = order(rowVars(mat), decreasing = TRUE)[1:round(nrow(mat)*0.02)]
# mat = mat[od, ]
# gr = gr_list_genome[[1]][od]

# ct_genome = cor_cols(t(mat), abs_cutoff = seq(0.5, 0.9, by = 0.025), mc = 2)

# ind = which(ct_genome[, "0.6"] >= 100)
# mcols(gr) = NULL
# mat_genome = mat[ind, ]
# gr_genome = gr[ind, ]






# nn = do.call("rbind", lapply(seq(1000, 10000, by = 1000), function(x) apply(ct, 2, function(y) sum(y > x))))
# matplot(y = nn, x = seq(1000, 10000, by = 1000), type = "l")






# ind = sample(seq_len(nrow(mat_cgi)), 5000)

# mat2 = mat_cgi[ind, ]
# gr2 = gr_cgi[ind]

##############################################################
## let's start from here

# cgi_1kb_window = makeWindows(extended_cgi, w = 1000, short.keep = TRUE)
# cgi_1kb_window = cgi_1kb_window[width(cgi_1kb_window) > 500]
# gr_list_cgi = get_mean_methylation_in_genomic_features(SAMPLE$id, chromosome = chromosome, gf_list = list(cgi_1kb_window = cgi_1kb_window))


# library(ConsensusClusterPlus)
# mat = as.matrix(mcols(gr_list_cgi[[1]])); rownames(mat) = seq_len(nrow(mat))
# mat = mat[, colnames(mat) != "ncpg"]

# set.seed(12345)
# get_class = function(mat) {
#   od = order(rowVars(mat), decreasing = TRUE)[1:round(nrow(mat)*0.2)]
#   mat2 = mat[od, ]

#   ct_cgi = cor_cols(t(mat2), abs_cutoff = 0.6, mc = 2)

#   ind = which(ct_cgi[, "0.6"] >= 500)
#   mat2 = mat2[ind, ]
#   if(nrow(mat2) > 5000) {
#     l = sample(seq_len(nrow(mat2)), 5000)
#   } else {
#     l = rep(TRUE, nrow(mat2))
#   }
#   res = ConsensusClusterPlus(mat2[l, ], maxK = 4, 
#     clusterAlg = "km", distance = "euclidean", reps = 1000, verbose = TRUE)

#   list(class = lapply(res[-1], function(x) x$consensusClass), row_index = rownames(mat2))
# }

# gr = gr_list_cgi[[1]]
# gr2 = annotate_to_genomic_features(gr, list(GENOMIC_FEATURE_LIST$cgi, extended_cgi), name = c("cgi", "shore"))
# anno = ifelse(gr2$overlap_to_shore > 0, ifelse(gr2$overlap_to_cgi > 1 - gr2$overlap_to_shore, "CGI", "Shore"), "Others")

# l = anno == "CGI"
# cl = get_class(mat[l, ])
# class = cl$class[[1]]
# row_index = cl$row_index

# tb = table(class); i = as.numeric(names(tb)[which.max(tb)])
# cl = get_class(mat[l, class == i])
# class[class == i] = cl$class[[1]] + 2
# row_index = union(row_index, cl$row_index)

# tb = table(class); i = as.numeric(names(tb)[which.max(tb)])
# cl = get_class(mat[l, class == i])
# class[class == i] = cl$class[[1]] + 4
# row_index = union(row_index, cl$row_index)

# n = length(row_index)
# if(length(row_index) > 5000) row_index = sample(row_index, 5000)

# gr = gr_list_cgi[[1]]
# gr2 = annotate_to_genomic_features(gr[as.numeric(row_index)], list(GENOMIC_FEATURE_LIST$cgi, extended_cgi), name = c("cgi", "shore"))
# anno = ifelse(gr2$overlap_to_shore > 0, ifelse(gr2$overlap_to_cgi > 1 - gr2$overlap_to_shore, "CGI", "Shore"), "Others")


# m = NULL
# type = NULL
# age = NULL
# class2 = NULL
# for(i in unique(class)) {
#   dend = as.dendrogram(hclust(dist(t(mat[row_index, class == i]))))
#   dend = stats:::reorder.dendrogram(dend, colMeans(mat[row_index, class == i]))
#   col_order = order.dendrogram(dend)
#   m = cbind(m, mat[row_index, class == i][, col_order])
#   type = c(type, SAMPLE$type[class == i][col_order])
#   age = c(age, SAMPLE$age[class == i][col_order])
#   class2 = c(class2, class[class == i][col_order])
# }
# ha = HeatmapAnnotation(subtype = type, age = age, consensusCL = class2, 
#   col = list("subtype" = SAMPLE_COLOR, age = AGE_COL_FUN, consensusCL = structure(2:5, names = unique(class2))))
# ht = Heatmap(m, name = "methylation", col = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red")), show_row_names = FALSE,
#   top_annotation = ha, split = anno, cluster_columns = FALSE, column_title = qq("@{n} 1kb windows")) + 
#   Heatmap(anno, name = "anno", col = c("CGI" = "orange", "Shore" = "green", "Others" = "blue"))

# pdf("methylation_classification_wgbs_cgi.pdf", width = 14, height = 14)
# draw(ht)
# for(an in sapply(ha@anno_list, function(x) x@name)) {
#     decorate_annotation(an, {
#       grid.text(an, x = unit(-2, "mm"), just = "right")
#     })
#   }
# dev.off()


#######################################################
## show simply use top 5K most variable rows may contain rows which only contains random noise
mat = as.matrix(mcols(gr_list_genome[[1]]))
ind = order(rowVars(mat), decreasing = TRUE)[1:5000]
ha = HeatmapAnnotation(subtype = SAMPLE$type, age = SAMPLE$age,
  col = list("subtype" = SAMPLE_COLOR, age = AGE_COL_FUN))
ht = Heatmap(mat[ind, ], name = "methylation", col = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red")), show_row_names = FALSE,
  top_annotation = ha, cluster_columns = TRUE)

pdf(qq("@{output_dir}/methylation_classification_wgbs_top_5k_var.pdf"), width = 14, height = 14)
draw(ht)
for(an in sapply(ha@anno_list, function(x) x@name)) {
    decorate_annotation(an, {
      grid.text(an, x = unit(-2, "mm"), just = "right")
    })
  }
dev.off()



############################
## classify by methylation


methylation_subtype_classfication = function(gr, p, corr, k) {
  gr_1kb_window = makeWindows(gr, w = 1000, short.keep = TRUE)
  gr_1kb_window = gr_1kb_window[width(gr_1kb_window) > 500]
  gr_list_cgi = get_mean_methylation_in_genomic_features(SAMPLE$id, chromosome = chromosome, gf_list = list(gr_1kb_window = gr_1kb_window))

  mat = as.matrix(mcols(gr_list_cgi[[1]])); rownames(mat) = seq_len(nrow(mat))
  mat = mat[, colnames(mat) != "ncpg"]

  set.seed(12345)
  get_class = function(mat, p, corr, k) {
    od = order(rowVars(mat), decreasing = TRUE)[1:round(nrow(mat)*p)]
    mat2 = mat[od, ]

    ct_cgi = cor_cols(t(mat2), abs_cutoff = corr, mc = 2)

    ind = which(ct_cgi[, as.character(corr)] >= k)
    mat2 = mat2[ind, ]
    if(nrow(mat2) > 5000) {
      l = sample(seq_len(nrow(mat2)), 5000)
    } else {
      l = rep(TRUE, nrow(mat2))
    }
    res = ConsensusClusterPlus(mat2[l, ], maxK = 4, 
      clusterAlg = "km", distance = "euclidean", reps = 1000, verbose = TRUE)
    dev.off()
    list(class = lapply(res[-1], function(x) x$consensusClass), row_index = rownames(mat2))
  }

  gr = gr_list_cgi[[1]]
  gr2 = annotate_to_genomic_features(gr, list(GENOMIC_FEATURE_LIST$cgi, extended_cgi), name = c("cgi", "shore"))
  anno = ifelse(gr2$overlap_to_shore > 0, ifelse(gr2$overlap_to_cgi > 1 - gr2$overlap_to_shore, "CGI", "Shore"), "Others")

  cl = get_class(mat, p = p, corr = corr, k = k)
  class = cl$class[[1]]
  row_index = cl$row_index

  tb = table(class); i = as.numeric(names(tb)[which.max(tb)])
  cl = get_class(mat[, class == i], p = p, corr = corr, k = k)
  class[class == i] = cl$class[[1]] + 2
  row_index = union(row_index, cl$row_index)

  tb = table(class); i = as.numeric(names(tb)[which.max(tb)])
  cl = get_class(mat[, class == i], p = p, corr = corr, k = k)
  class[class == i] = cl$class[[1]] + 4
  row_index = union(row_index, cl$row_index)

  n = length(row_index)
  if(length(row_index) > 5000) row_index = sample(row_index, 5000)

  gr = gr_list_cgi[[1]]
  gr2 = annotate_to_genomic_features(gr[as.numeric(row_index)], list(GENOMIC_FEATURE_LIST$cgi, extended_cgi), name = c("cgi", "shore"))
  anno = ifelse(gr2$overlap_to_shore > 0, ifelse(gr2$overlap_to_cgi > 1 - gr2$overlap_to_shore, "CGI", "Shore"), "Others")


  m = NULL
  type = NULL
  age = NULL
  class2 = NULL
  for(i in unique(class)) {
    dend = as.dendrogram(hclust(dist(t(mat[row_index, class == i]))))
    dend = stats:::reorder.dendrogram(dend, colMeans(mat[row_index, class == i]))
    col_order = order.dendrogram(dend)
    m = cbind(m, mat[row_index, class == i][, col_order])
    type = c(type, SAMPLE$type[class == i][col_order])
    age = c(age, SAMPLE$age[class == i][col_order])
    class2 = c(class2, class[class == i][col_order])
  }
  ha = HeatmapAnnotation(subtype = type, age = age, consensusCL = class2, 
    col = list("subtype" = SAMPLE_COLOR, age = AGE_COL_FUN, consensusCL = structure(2:5, names = unique(class2))))
  ht = Heatmap(m, name = "methylation", col = colorRamp2(c(0, 0.5, 1), c("blue", "white", "red")), show_row_names = FALSE,
    top_annotation = ha, split = anno, cluster_columns = FALSE, column_title = qq("@{n} 1kb windows")) + 
    Heatmap(anno, name = "anno", col = c("CGI" = "orange", "Shore" = "green", "Others" = "blue"))

  draw(ht)
  for(an in sapply(ha@anno_list, function(x) x@name)) {
    decorate_annotation(an, {
      grid.text(an, x = unit(-2, "mm"), just = "right")
    })
  }
}

ha = HeatmapAnnotation(subtype = type, age = age, 
    col = list("subtype" = SAMPLE_COLOR, age = AGE_COL_FUN))

pdf(qq("@{output_dir}/methylation_classification_wgbs_cgi.pdf"), width = 14, height = 14)
methylation_subtype_classfication(GENOMIC_FEATURE_LIST$cgi, p = 0.2, corr = 0.6, k = 500)
dev.off()

pdf(qq("@{output_dir}/methylation_classification_wgbs_cgi_shore.pdf"), width = 14, height = 14)
methylation_subtype_classfication(setdiff(extended_cgi, GENOMIC_FEATURE_LIST$cgi), p = 0.2, corr = 0.6, k = 500)
dev.of()

pdf(qq("@{output_dir}/methylation_classification_wgbs_neither_cgi_nor_shores.pdf"), width = 14, height = 14)
methylation_subtype_classfication(setdiff(chromGr, extended_cgi), p = 0.02, corr = 0.8, k = 1000)
dev.of()
jokergoo/cotools documentation built on May 19, 2019, 6:24 p.m.