suppressPackageStartupMessages(library(BPRMeth))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(grid))
suppressPackageStartupMessages(library(ROCR))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(ggfortify))
suppressPackageStartupMessages(library(stringi))
# suppressPackageStartupMessages(library(proxy))
suppressPackageStartupMessages(library(RColorBrewer))
# Data
io            <- list()
io$script_dir <- "../"
io$dataset    <- "mt-seq/"
io$sub_dir    <- "/subsampled/"
io$data_dir   <- paste0("../local-data/melissa/", io$dataset, "/imputation/")
io$data_dir   <- paste0("~/datasets/melissa/", io$dataset, "/imputation/")
io$K          <- 6
io$M          <- c(541, 1468, 2666, 745, 231, 384)
io$M_deepcpg  <- c(435, 1182, 2113, 605, 183, 283)
io$cov        <- c(10, 10, 15, 10, 10, 10)
#io$basis      <- c(7, 11, 9, 7, 7, 7)
io$basis      <- c(9, 11, 13, 11, 9, 11)
io$cpg_prcg   <- 0.5
io$data_prcg  <- 0.4
io$reg_prcg   <- 0.95
io$filter     <- 0.5
R.utils::sourceDirectory(paste0(io$script_dir, "lib/"), modifiedOnly = FALSE)
# Different CpG coverages
io$regions <- c("prom3k", "prom5k", "prom10k", "active_enhancers", "Nanog", "super_enhancers")

dt_analysis <- data.table(region = character(), auc_melissa = numeric(), auc_melissa_rate = numeric(), 
                          auc_indep_prof = numeric(), auc_indep_rate = numeric(), auc_rf = numeric(),
                          auc_deepcpg = numeric(), auc_deepcpg_sub = numeric(),
                          f_melissa = numeric(), f_melissa_rate = numeric(), 
                          f_indep_prof = numeric(), f_indep_rate = numeric(), f_rf = numeric(),
                          f_deepcpg = numeric(), f_deepcpg_sub = numeric(),
                          tpr_fpr_melissa = list(), tpr_fpr_melissa_rate = list(), 
                          tpr_fpr_indep_prof = list(), tpr_fpr_indep_rate = list(), tpr_fpr_rf = list(),
                          tpr_fpr_deepcpg = numeric(), tpr_fpr_deepcpg_sub = numeric(),
                          pr_melissa = list(), pr_melissa_rate = list(), 
                          pr_indep_prof = list(), pr_indep_rate = list(), pr_rf = list(),
                          pr_deepcpg = numeric(), pr_deepcpg_sub = numeric()
                          )

model_analysis <- data.table(region = character(), melissa = numeric(), melissa_rate = numeric())
iter <- 1
for (region in io$regions) {
   # Load joint analysis results
   dt_melissa <- readRDS(paste0(io$data_dir, "diffuse_melissa_sim10_", region, 
                              "_cov", io$cov[iter], "_sd0.2_K", io$K, "_M", io$M[iter], 
                              "_basis", io$basis[iter], "_dataPrcg", io$data_prcg, 
                              "_regionPrcg", io$reg_prcg, "_cpgPrcg", io$cpg_prcg,
                              "_filter", io$filter, "_gene_var5", ".rds"))

   # Load independent analysis results
   dt_indep <- readRDS(paste0(io$data_dir, "indep_sim10_", region,
                          "_cov", io$cov[iter], "_sd0.2_M", io$M[iter],
                          "_basis", io$basis[iter], "_dataPrcg", io$data_prcg,
                          "_regionPrcg", io$reg_prcg, "_cpgPrcg", io$cpg_prcg,
                          "_filter", io$filter, "_gene_var5", ".rds"))

   # Load RF analysis results
   dt_rf <- readRDS(paste0(io$data_dir, "rf_indep_sim10_", region,
                              "_cov", io$cov[iter], "_sd0.2_M", io$M[iter],
                              "_dataPrcg", io$data_prcg, "_regionPrcg", io$reg_prcg,
                              "_cpgPrcg", io$cpg_prcg, "_filter", io$filter,
                              "_gene_var5", ".rds"))

   # Load RF analysis results
   dt_deepcpg <- readRDS(paste0(io$data_dir, "deepcpg/deepcpg_sim10_", region, 
                              "_cov", io$cov[iter], "_sd0.2_M", io$M_deepcpg[iter],
                              "_dataPrcg", io$data_prcg, "_regionPrcg", io$reg_prcg, 
                              "_cpgPrcg", io$cpg_prcg, "_filter", io$filter, 
                              "_gene_var5", ".rds"))

   # Load RF analysis results
   dt_deepcpg_sub <- readRDS(paste0(io$data_dir, "deepcpg/subsampled/deepcpg_sim10_", region, 
                              "_cov", io$cov[iter], "_sd0.2_M", io$M_deepcpg[iter],
                              "_dataPrcg", io$data_prcg, "_regionPrcg", io$reg_prcg, 
                              "_cpgPrcg", io$cpg_prcg, "_filter", io$filter, 
                              "_gene_var5", ".rds"))

   for (i in 1:length(dt_melissa$model)) {
       # Create prediction objects
       melissa_pred <- prediction(round(dt_melissa$model[[i]]$eval_perf$eval_prof$pred_obs, 2),
                                  dt_melissa$model[[i]]$eval_perf$eval_prof$act_obs)
       melissa_rate_pred <- prediction(round(dt_melissa$model[[i]]$eval_perf$eval_mean$pred_obs, 2),
                                       dt_melissa$model[[i]]$eval_perf$eval_mean$act_obs)
       indep_prof_pred <- prediction(round(dt_indep$model[[i]]$eval_perf$eval_prof$pred_obs, 2),
                                     dt_indep$model[[i]]$eval_perf$eval_prof$act_obs)
       indep_rate_pred <- prediction(round(dt_indep$model[[i]]$eval_perf$eval_mean$pred_obs, 2),
                                     dt_indep$model[[i]]$eval_perf$eval_mean$act_obs)
       rf_pred <- prediction(round(dt_rf$model[[i]]$eval_perf$pred_obs, 2), dt_rf$model[[i]]$eval_perf$act_obs)
       deepcpg_pred <- prediction(round(dt_deepcpg$model[[i]]$eval_perf$pred_obs, 2),
                                  dt_deepcpg$model[[i]]$eval_perf$act_obs)
       deepcpg_sub_pred <- prediction(round(dt_deepcpg_sub$model[[i]]$eval_perf$pred_obs, 2),
                                  dt_deepcpg_sub$model[[i]]$eval_perf$act_obs)

       # F-measure performance
       f_melissa <- performance(melissa_pred, "f")
       f_melissa_rate <- performance(melissa_rate_pred, "f")
       f_indep_prof <- performance(indep_prof_pred, "f")
       f_indep_rate <- performance(indep_rate_pred, "f")
       f_rf <- performance(rf_pred, "f")
       f_deepcpg <- performance(deepcpg_pred, "f")
       f_deepcpg_sub <- performance(deepcpg_sub_pred, "f")

       dt <- data.table(region = region,
                     auc_melissa = performance(melissa_pred, "auc")@y.values[[1]],
                     auc_melissa_rate = performance(melissa_rate_pred, "auc")@y.values[[1]],
                     auc_indep_prof = performance(indep_prof_pred, "auc")@y.values[[1]],
                     auc_indep_rate = performance(indep_rate_pred, "auc")@y.values[[1]],
                     auc_rf = performance(rf_pred, "auc")@y.values[[1]],
                     auc_deepcpg = performance(deepcpg_pred, "auc")@y.values[[1]],
                     auc_deepcpg_sub = performance(deepcpg_sub_pred, "auc")@y.values[[1]],

                     f_melissa = f_melissa@y.values[[1]][min(which(f_melissa@x.values[[1]] <= 0.5))],
                     f_melissa_rate = f_melissa_rate@y.values[[1]][min(which(f_melissa_rate@x.values[[1]] <= 0.5))],
                     f_indep_prof = f_indep_prof@y.values[[1]][min(which(f_indep_prof@x.values[[1]] <= 0.5))],
                     f_indep_rate = f_indep_rate@y.values[[1]][min(which(f_indep_rate@x.values[[1]] <= 0.5))],
                     f_rf = f_rf@y.values[[1]][min(which(f_rf@x.values[[1]] <= 0.5))],
                     f_deepcpg = f_deepcpg@y.values[[1]][min(which(f_deepcpg@x.values[[1]] <= 0.5))],
                     f_deepcpg_sub = f_deepcpg_sub@y.values[[1]][min(which(f_deepcpg_sub@x.values[[1]] <= 0.5))],

                     tpr_fpr_melissa = list(tpr_fpr_melissa = performance(melissa_pred, "tpr", "fpr")),
                     tpr_fpr_melissa_rate = list(tpr_fpr_melissa_rate = performance(melissa_rate_pred, "tpr", "fpr")),
                     tpr_fpr_indep_prof = list(tpr_fpr_indep_prof = performance(indep_prof_pred, "tpr", "fpr")),
                     tpr_fpr_indep_rate = list(tpr_fpr_indep_rate = performance(indep_rate_pred, "tpr", "fpr")),
                     tpr_fpr_rf = list(tpr_fpr_rf = performance(rf_pred, "tpr", "fpr")),
                     tpr_fpr_deepcpg = list(tpr_fpr_deepcpg = performance(deepcpg_pred, "tpr", "fpr")),
                     tpr_fpr_deepcpg_sub = list(tpr_fpr_deepcpg_sub = performance(deepcpg_sub_pred, "tpr", "fpr")),

                     pr_melissa = list(pr_melissa = performance(melissa_pred, "prec", "rec")),
                     pr_melissa_rate = list(pr_melissa_rate = performance(melissa_rate_pred, "prec", "rec")),
                     pr_indep_prof = list(pr_indep_prof = performance(indep_prof_pred, "prec", "rec")),
                     pr_indep_rate = list(pr_indep_rate = performance(indep_rate_pred, "prec", "rec")),
                     pr_rf = list(pr_rf = performance(rf_pred, "prec", "rec")),
                     pr_deepcpg = list(pr_deepcpg = performance(deepcpg_pred, "prec", "rec")),
                     pr_deepcpg_sub = list(pr_deepcpg_sub = performance(deepcpg_sub_pred, "prec", "rec"))
                     )
    # Add results to final data.table
    dt_analysis <- rbind(dt_analysis, dt)

    dt <- data.table(region = region,
                     melissa = length(which(dt_melissa$model[[i]]$melissa_prof$delta > 4)),
                     melissa_rate = length(which(dt_melissa$model[[i]]$melissa_rate$delta > 4)))

    model_analysis <- rbind(model_analysis, dt)
   }
   iter <- iter + 1
}
rm(iter, dt, i, dt_rf, dt_indep, dt_melissa, melissa_pred, melissa_rate_pred, indep_prof_pred, indep_rate_pred, rf_pred,
   f_melissa, f_melissa_rate, f_indep_prof, f_indep_rate, f_rf)

AUC performance

set.seed(17)
dt_boxplot <- copy(dt_analysis)
s2 <- 0.004

dt_boxplot <- dt_boxplot[, c("auc_melissa", "auc_melissa_rate", "auc_indep_prof", "auc_indep_rate", "auc_rf", "auc_deepcpg", "auc_deepcpg_sub") := 
                             list(auc_melissa + rnorm(.N, 0, s2), auc_melissa_rate + rnorm(.N, 0, s2),
                                  auc_indep_prof + rnorm(.N, 0, s2), auc_indep_rate + rnorm(.N, 0, s2),
                                  auc_rf + rnorm(.N, 0, s2), auc_deepcpg + rnorm(.N, 0, s2), auc_deepcpg_sub + rnorm(.N, 0, s2))]
dt_boxplot <- dt_boxplot[, c("region", "auc_melissa", "auc_melissa_rate", "auc_indep_prof", "auc_indep_rate", "auc_rf", "auc_deepcpg", "auc_deepcpg_sub")]

dt_boxplot <- dt_boxplot %>% setnames(c("region", "auc_melissa", "auc_melissa_rate", "auc_indep_prof", "auc_indep_rate", "auc_rf", "auc_deepcpg", "auc_deepcpg_sub"), c("x", "Melissa", "Melissa Rate", "BPRMeth", "Indep Rate", "RF", "DeepCpG", "DeepCpG Sub")) %>% .[, x := as.factor(x)] %>% melt(variable.name = "Model", value.name = "y")  %>% .[, x := factor(x, levels = c("prom10k", "prom5k", "prom3k", "Nanog", "super_enhancers", "active_enhancers"))] %>% 
    .[, Model := factor(Model, levels = c("Melissa", "DeepCpG", "DeepCpG Sub", "BPRMeth", "RF", "Melissa Rate", "Indep Rate"))]

p_auc_box <- ggplot(dt_boxplot, aes(x = x, y = y, fill = Model)) +
  geom_boxplot(alpha = 0.8, outlier.shape = NA) +
  scale_fill_manual(values = c("red3", "darkgreen", "palegreen3", "chocolate2", "dodgerblue4", "mediumorchid4", "mistyrose4")) + 
  scale_x_discrete(labels = c("prom3k" = "Promoter 3kb", "prom5k" = "Promoter 5kb", "prom10k" = "Promoter 10kb", "active_enhancers" = "Active enhancers", "Nanog" = "Nanog", "super_enhancers" = "Super enhancers")) +
  scale_y_continuous(breaks = pretty_breaks(n = 6)) +
  labs(title = "", x = NULL, y = "AUC") +
  boxplot_theme()
print(p_auc_box)

pdf(file = paste0("out/", io$dataset, io$sub_dir, "/auc-mt-seq.pdf"), width = 15, height = 8, useDingbats = FALSE)
p_auc_box
dev.off()

rm(dt_boxplot, s2)

AUC performance presentation plot

set.seed(17)
dt_boxplot <- copy(dt_analysis)
s2 <- 0.005

dt_boxplot <- dt_boxplot[, c("auc_melissa", "auc_melissa_rate", "auc_indep_prof", "auc_indep_rate", "auc_rf", "auc_deepcpg") := 
                             list(auc_melissa + rnorm(.N, 0, s2), auc_melissa_rate + rnorm(.N, 0, s2),
                                  auc_indep_prof + rnorm(.N, 0, s2), auc_indep_rate + rnorm(.N, 0, s2),
                                  auc_rf + rnorm(.N, 0, s2), auc_deepcpg + rnorm(.N, 0, s2))]
dt_boxplot <- dt_boxplot[, c("region", "auc_melissa", "auc_melissa_rate", "auc_indep_prof", "auc_indep_rate", "auc_rf", "auc_deepcpg")]

dt_boxplot <- dt_boxplot %>% setnames(c("region", "auc_melissa", "auc_melissa_rate", "auc_indep_prof", "auc_indep_rate", "auc_rf", "auc_deepcpg"), c("x", "Melissa", "Melissa Rate", "BPRMeth", "Rate", "RF", "DeepCpG")) %>% .[, x := as.factor(x)] %>% melt(variable.name = "Model", value.name = "y")  %>% .[, x := factor(x, levels = c("prom10k", "prom5k", "prom3k", "Nanog", "super_enhancers", "active_enhancers"))] %>% 
    .[, Model := factor(Model, levels = c("Melissa", "DeepCpG", "BPRMeth", "RF", "Melissa Rate", "Rate"))]

p_auc_box_pres <- ggplot(dt_boxplot, aes(x = x, y = y, fill = Model)) +
  geom_boxplot(alpha = 0.8, outlier.shape = NA) +
  #scale_fill_manual(values = c("red3", "mediumorchid4", "chocolate2", "dodgerblue4", "cornflowerblue", "green4")) + 
  scale_fill_manual(values = c("red3", "darkgreen", "chocolate2", "dodgerblue4", "mediumorchid4", "mistyrose4")) + 
  scale_x_discrete(labels = c("prom3k" = "Promoter 3kb", "prom5k" = "Promoter 5kb", "prom10k" = "Promoter 10kb", "active_enhancers" = "Active enhancers", "Nanog" = "Nanog", "super_enhancers" = "Super enhancers")) +
  scale_y_continuous(breaks = pretty_breaks(n = 6)) +
  labs(title = "", x = NULL, y = "AUC") +
  boxplot_theme() + theme(legend.position = "top")
print(p_auc_box_pres)

pdf(file = paste0("out/", io$dataset, io$sub_dir, "/auc-pres.pdf"), width = 13, height = 9, useDingbats = FALSE)
p_auc_box_pres
dev.off()
rm(dt_boxplot, s2)

F1 performance

set.seed(17)
dt_boxplot <- copy(dt_analysis)
s2 <- 0.005

dt_boxplot <- dt_boxplot[, c("f_melissa", "f_melissa_rate", "f_indep_prof", "f_indep_rate", "f_rf", "f_deepcpg", "f_deepcpg_sub") := 
                             list(f_melissa + rnorm(.N, 0, s2), f_melissa_rate + rnorm(.N, 0, s2),
                                  f_indep_prof + rnorm(.N, 0, s2), f_indep_rate + rnorm(.N, 0, s2),
                                  f_rf + rnorm(.N, 0, s2), f_deepcpg + rnorm(.N, 0, s2), f_deepcpg_sub + rnorm(.N, 0, s2))]
dt_boxplot <- dt_boxplot[, c("region", "f_melissa", "f_melissa_rate", "f_indep_prof", "f_indep_rate", "f_rf", "f_deepcpg", "f_deepcpg_sub")]

dt_boxplot <- dt_boxplot %>% setnames(c("region", "f_melissa", "f_melissa_rate", "f_indep_prof", "f_indep_rate", "f_rf", "f_deepcpg", "f_deepcpg_sub"), c("x", "Melissa", "Melissa Rate", "BPRMeth", "Indep Rate", "RF", "DeepCpG", "DeepCpG Sub")) %>% .[, x := as.factor(x)] %>% melt(variable.name = "Model", value.name = "y")  %>% .[, x := factor(x, levels = c("prom10k", "prom5k", "prom3k", "Nanog", "super_enhancers", "active_enhancers"))] %>% 
    .[, Model := factor(Model, levels = c("Melissa", "DeepCpG", "DeepCpG Sub", "BPRMeth", "RF", "Melissa Rate", "Indep Rate"))]

p_f_box <- ggplot(dt_boxplot, aes(x = x, y = y, fill = Model)) +
  geom_boxplot(alpha = 0.8, outlier.shape = NA) +
  scale_fill_manual(values = c("red3", "darkgreen", "palegreen3", "chocolate2", "dodgerblue4", "mediumorchid4", "mistyrose4")) +
  scale_x_discrete(labels = c("prom3k" = "Promoter 3kb", "prom5k" = "Promoter 5kb", "prom10k" = "Promoter 10kb", "active_enhancers" = "Active enhancers", "Nanog" = "Nanog", "super_enhancers" = "Super enhancers")) +
  scale_y_continuous(breaks = pretty_breaks(n = 6)) +
  labs(title = "", x = NULL, y = "F-measure") +
  boxplot_theme()
print(p_f_box)

pdf(file = paste0("out/", io$dataset, io$sub_dir, "/f-mt-seq.pdf"), width = 15, height = 8, useDingbats = FALSE)
p_f_box
dev.off()
rm(dt_boxplot, s2)

TPR / FPR performance

set.seed(17)
dt_boxplot <- copy(dt_analysis)
# Keep only required columns
dt_boxplot <- dt_boxplot[, c("region", "tpr_fpr_melissa", "tpr_fpr_melissa_rate", "tpr_fpr_indep_prof", "tpr_fpr_indep_rate", "tpr_fpr_rf", "tpr_fpr_deepcpg", "tpr_fpr_deepcpg_sub")]
# Keep one simulation per region
rows <- c(3, 16, 24, 38, 42, 54)
dt_boxplot <- dt_boxplot[rows, ]
# dt_boxplot <- dt_boxplot[!duplicated(dt_boxplot$region), ]

# Change Model and Region names
colnames(dt_boxplot) <- c("Region", "Melissa", "Melissa Rate", "BPRMeth", "Indep Rate", "RF", "DeepCpG", "DeepCpG Sub")
model_names <- colnames(dt_boxplot)[2:NCOL(dt_boxplot)]
region_names <- c("Promoter 3kb", "Promoter 5kb", "Promoter 10kb", "Active enhancers", "Nanog", "Super enhancers")
dt_boxplot$Region <- region_names

# Extract FPR and TPR
dt_tpr_fpr <- data.table(fpr = numeric(), tpr = numeric(), Region = character(), Model = character())
for (i in 1:NROW(dt_boxplot)) {
    for (k in model_names) {
        perform <- dt_boxplot[, k, with = FALSE]
        dt <- data.table(fpr = perform[[1]][[i]]@x.values[[1]], tpr = perform[[1]][[i]]@y.values[[1]], 
                         Region = dt_boxplot$Region[i], Model = k)
        dt_tpr_fpr <- rbind(dt_tpr_fpr, dt)
    }
}

# Rename and refactor
dt_tpr_fpr <- dt_tpr_fpr %>% .[, c("Region", "Model") := list(as.factor(Region), as.factor(Model))] %>% .[, Region := factor(Region, levels = c("Promoter 10kb", "Promoter 5kb", "Promoter 3kb", "Nanog", "Super enhancers", "Active enhancers"))] %>% .[, Model := factor(Model, levels = c("Melissa", "Melissa Rate", "BPRMeth", "Indep Rate", "RF", "DeepCpG", "DeepCpG Sub"))] %>% 
    .[, Model := factor(Model, levels = c("Melissa", "DeepCpG", "DeepCpG Sub", "BPRMeth", "RF", "Melissa Rate", "Indep Rate"))]


p_tpr_fpr <- ggplot(dt_tpr_fpr, aes(x = fpr, y = tpr, group = Model)) +
    geom_line(aes(color = Model), size = 2) +
    facet_wrap( ~ Region) +
    scale_color_manual(values = c("red3", "darkgreen", "palegreen3", "chocolate2", "dodgerblue4", "mediumorchid4", "mistyrose4")) +
    scale_x_continuous(breaks = pretty_breaks(n = 6)) + 
    scale_y_continuous(breaks = pretty_breaks(n = 6)) +
    labs(title = "", x = "False positive rate", y = "True positive rate") +
    line_theme()
print(p_tpr_fpr)


pdf(file = paste0("out/", io$dataset, io$sub_dir, "/tpr-fpr-mt-seq.pdf"), width = 15, height = 12, useDingbats = FALSE)
p_tpr_fpr
dev.off()
rm(dt_boxplot, dt_tpr_fpr, region_names, model_names, perform, dt)

Precision / Recall performance

set.seed(17)
dt_boxplot <- copy(dt_analysis)
# Keep only required columns
dt_boxplot <- dt_boxplot[, c("region", "pr_melissa", "pr_melissa_rate", "pr_indep_prof", "pr_indep_rate", "pr_rf", "pr_deepcpg", "pr_deepcpg_sub")]
# Keep one simulation per region
rows <- c(3, 16, 24, 38, 42, 54)
dt_boxplot <- dt_boxplot[rows, ]

# Change Model and Region names
colnames(dt_boxplot) <- c("Region", "Melissa", "Melissa Rate", "BPRMeth", "Indep Rate", "RF", "DeepCpG", "DeepCpG Sub")
model_names <- colnames(dt_boxplot)[2:NCOL(dt_boxplot)]
region_names <- c("Promoter 3kb", "Promoter 5kb", "Promoter 10kb", "Active enhancers", "Nanog", "Super enhancers")
dt_boxplot$Region <- region_names

# Extract FPR and TPR
dt_pr <- data.table(x = numeric(), y = numeric(), Region = character(), Model = character())
for (i in 1:NROW(dt_boxplot)) {
    for (k in model_names) {
        perform <- dt_boxplot[, k, with = FALSE]
        len <- round(length(perform[[1]][[i]]@x.values[[1]]))
        dt <- data.table(x = perform[[1]][[i]]@x.values[[1]][5:len], 
                         y = perform[[1]][[i]]@y.values[[1]][5:len], 
                         Region = dt_boxplot$Region[i], Model = k)
        dt_pr <- rbind(dt_pr, dt)
    }
}

# Rename and refactor
dt_pr <- dt_pr %>% .[, c("Region", "Model") := list(as.factor(Region), as.factor(Model))] %>% .[, Region := factor(Region, levels = c("Promoter 10kb", "Promoter 5kb", "Promoter 3kb", "Nanog", "Super enhancers", "Active enhancers"))] %>% .[, Model := factor(Model, levels = c("Melissa", "Melissa Rate", "BPRMeth", "Indep Rate", "RF", "DeepCpG", "DeepCpG Sub"))] %>% 
    .[, Model := factor(Model, levels = c("Melissa", "DeepCpG", "DeepCpG Sub", "BPRMeth", "RF", "Melissa Rate", "Indep Rate"))]


p_pr <- ggplot(dt_pr, aes(x = x, y = y, group = Model)) +
    geom_line(aes(color = Model), size = 2) +
    facet_wrap( ~ Region) +
    scale_color_manual(values = c("red3", "darkgreen", "palegreen3", "chocolate2", "dodgerblue4", "mediumorchid4", "mistyrose4")) +
    scale_x_continuous(breaks = pretty_breaks(n = 4)) + 
    scale_y_continuous(limits = c(0.6, 0.99), breaks = pretty_breaks(n = 4)) +
    labs(title = "", x = "Recall", y = "Precision") +
    line_theme()
print(p_pr)

pdf(file = paste0("out/", io$dataset, io$sub_dir, "/pr-mt-seq.pdf"), width = 15, height = 12, useDingbats = FALSE)
p_pr
dev.off()

rm(dt_boxplot, dt_pr, region_names, model_names, perform, dt)

Example of clustered methylation profiles

context <- "prom3k"
iter <- 1
met_file <- paste0("~/datasets/melissa/met/filtered_met/", io$dataset, "/", context, "_cov", 
                   io$cov[iter], "_sd0.2_gene_var5.rds")
met_dt <- readRDS(met_file)
opts <- list(N = length(met_dt$met), M = length(met_dt$met[[1]]), filt_region_cov = 0.5)
# Filtering low covered regions across cells
met_dt <- filter_regions_across_cells(dt = met_dt, opts = opts)
met <- met_dt$met
opts$M <- length(met[[1]])  # Number of genomic regions
print(opts$M)

# Load Melissa model
melissa_obj <- readRDS(paste0(io$data_dir, "diffuse_melissa_sim10_", context, 
                              "_cov", io$cov[iter], "_sd0.2_K", io$K, "_M", io$M[iter], 
                              "_basis", io$basis[iter], "_dataPrcg", io$data_prcg, 
                              "_regionPrcg", io$reg_prcg, "_cpgPrcg", io$cpg_prcg,
                              "_filter", io$filter, "_gene_var5", ".rds"))

# Profiles for each region and cell from 1st simulation study
simulation <- 1
melissa <- melissa_obj$model[[simulation]]$melissa_prof
w_prof <- melissa$W
# Extract most didferent genomic regions from the weights of each cluster
dists <- vector("numeric", length = NROW(w_prof))
for (m in 1:NROW(w_prof)) {
  dists[m] <- sum(dist(t(pnorm(w_prof[m, ,])), method = "euclidean"))
}
Order <- order(dists, decreasing = TRUE)

##-----------------------------------------
# Create objects for plotting specific set of regions
#regions <- c(21, 152, 63)
regions <- c(236, 131, 136)
#regions <- seq(1, 20, 1)
# regions <- c(20, 4, 30)
cluster_assinments <- unique(melissa$labels)
region_melissa_obj <- list()
for (t in 1:length(regions)) {
    r <- Order[regions[t]]
    W_Sigma <- list()
    for (cl in 1:length(cluster_assinments)) {
        W_Sigma[[cl]] <- melissa$W_Sigma[[cluster_assinments[cl]]][[r]]
    }
    region_melissa_obj[[t]] <- list(W = w_prof[r,,cluster_assinments], 
                                    W_Sigma = W_Sigma,  #melissa$W_Sigma[cluster_assinments][[r]],
                                    basis = melissa$basis)
    class(region_melissa_obj[[t]]) <- c("cluster_profiles_vb_bernoulli", "cluster_profiles_vb")
    p_prof <- plot_cluster_profiles(cluster_obj = region_melissa_obj[[t]], 
                                    title = paste0("Gene ", regions[t], " ", met_dt$annos$gene_name[r]), 
                                    x_labels = c("-3Kb","", "TSS", "", "+3Kb")) + theme(legend.position = "left")
    print(p_prof)
}

x_lab <- c("-1.5Kb","", "TSS", "", "+1.5Kb")
reg1_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[1]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", met_dt$annos$gene_name[Order[regions[1]]])) + theme(legend.position = "left")
reg2_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[2]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", met_dt$annos$gene_name[Order[regions[2]]]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank())
reg3_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[3]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", met_dt$annos$gene_name[Order[regions[3]]]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank())

Final plot

top_row <- plot_grid(p_auc_box, labels = c("a"), label_size = 25, ncol = 1, nrow = 1, rel_widths = c(1))
p_f_box <- p_f_box + theme(legend.position = c(0.75, 0.24))
middle_row <- plot_grid(p_f_box, labels = c("a"), label_size = 25, ncol = 1, nrow = 1, rel_widths = c(1))
bottom_row <- plot_grid(reg1_plot, reg2_plot, reg3_plot, labels = c("b", "", ""), 
                        label_size = 25, ncol = 3, nrow = 1, rel_widths = c(1.5, 1, 1), align = "hv", scale = 0.94)
final_fig <- plot_grid(middle_row, bottom_row, labels = c("", ""), label_size = 25, ncol = 1, nrow = 2, 
                       rel_heights = c(2, 1), rel_widths = c(1, 1))
print(final_fig)
pdf(file = paste0("out/", io$dataset, io$sub_dir, "/mt-seq-f.pdf"), width = 14.5, height = 12, useDingbats = FALSE)
final_fig
dev.off()

final_fig <- plot_grid(top_row, bottom_row, labels = c("", ""), label_size = 25, ncol = 1, nrow = 2, 
                       rel_heights = c(2.5, 1), rel_widths = c(1, 1))
print(final_fig)
pdf(file = paste0("out/", io$dataset, io$sub_dir, "/mt-seq-auc.pdf"), width = 14.5, height = 12, useDingbats = FALSE)
final_fig
dev.off()

Example of pluripotent gene profiles

context <- "prom10k"
iter <- 3
met_file <- paste0("~/datasets/melissa/met/filtered_met/", io$dataset, "/", context, "_cov", 
                   io$cov[iter], "_sd0.2_gene_var5.rds")
met_dt <- readRDS(met_file)
opts <- list(N = length(met_dt$met), M = length(met_dt$met[[1]]), filt_region_cov = 0.5)
# Filtering low covered regions across cells
met_dt <- filter_regions_across_cells(dt = met_dt, opts = opts)
met <- met_dt$met
opts$M <- length(met[[1]])  # Number of genomic regions
print(opts$M)
# Obtain gene set
geneset <- fread(paste0("~/datasets/melissa/genesets/pluripotency_extended.tsv"), header = FALSE)
# Indeces of genesets with observed data
geneset_ind <- which(met_dt$annos$id %in% geneset$V1)

# Load Melissa model
melissa_obj <- readRDS(paste0(io$data_dir, "diffuse_melissa_sim10_", context, 
                              "_cov", io$cov[iter], "_sd0.2_K", io$K, "_M", io$M[iter], 
                              "_basis", io$basis[iter], "_dataPrcg", io$data_prcg, 
                              "_regionPrcg", io$reg_prcg, "_cpgPrcg", io$cpg_prcg,
                              "_filter", io$filter, "_gene_var5", ".rds"))

# Profiles for each region and cell from 1st simulation study
simulation <- 1
melissa <- melissa_obj$model[[simulation]]$melissa_prof
w_prof <- melissa$W
# Extract most didferent genomic regions from the weights of each cluster
dists <- vector("numeric", length = NROW(w_prof))
for (m in 1:NROW(w_prof)) {
  dists[m] <- sum(dist(t(pnorm(w_prof[m, ,])), method = "euclidean"))
}
Order <- order(dists, decreasing = TRUE)

##-----------------------------------------
# Create objects for plotting specific set of regions
# Supplementary
regions <- c(57, 56, 6, 39, 50, 2, 13) # 24
# Main figure
regions <- c(28, 14, 10)
cluster_assinments <- unique(melissa$labels)
region_melissa_obj <- list()
iter <- 1
for (t in regions) {
    r <- geneset_ind[t]
    W_Sigma <- list()
    for (cl in 1:length(cluster_assinments)) {
        W_Sigma[[cl]] <- melissa$W_Sigma[[cluster_assinments[cl]]][[r]]
    }
    region_melissa_obj[[iter]] <- list(W = w_prof[r,,cluster_assinments], 
                                    W_Sigma = W_Sigma,  #melissa$W_Sigma[cluster_assinments][[r]],
                                    basis = melissa$basis)
    class(region_melissa_obj[[iter]]) <- c("cluster_profiles_vb_bernoulli", "cluster_profiles_vb")
    p_prof <- plot_cluster_profiles(cluster_obj = region_melissa_obj[[iter]], 
                                    title = paste0("Gene ", t, " ", geneset[V1 == met_dt$annos$id[r]][,2]),
                                    x_labels = c("-3Kb","", "TSS", "", "+3Kb")) + theme(legend.position = "left")
    print(p_prof)
    iter <- iter + 1
}

x_lab <- c("-5Kb","", "TSS", "", "+5Kb")
reg1_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[1]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[1]]]][,2])) + theme(legend.position = "left")
reg2_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[2]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[2]]]][,2]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank())
reg3_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[3]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[3]]]][,2]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank())
reg4_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[4]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[4]]]][,2])) + theme(legend.position = "left")
reg5_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[5]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[5]]]][,2]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank())
reg6_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[6]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[6]]]][,2]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank())

Pluripotency plot

prom_suppl_row <- plot_grid(reg1_plot, reg2_plot, reg3_plot, reg4_plot, reg5_plot, reg6_plot,#labels = c("", "", ""), 
                        label_size = 25, ncol = 3, nrow = 2, rel_widths = c(1.5, 1, 1), 
                        rel_heights = c(1, 1), align = "hv", scale = 0.94)

pdf(file = paste0("out/", io$dataset, io$sub_dir, "/suppl-prom-pluripotency-mt-seq.pdf"), width = 22.5, height = 8, useDingbats = FALSE)
prom_suppl_row
dev.off()


top_row <- plot_grid(p_auc_box, labels = c("a"), label_size = 25, ncol = 1, nrow = 1, rel_widths = c(1))
bottom_row <- plot_grid(reg1_plot, reg2_plot, reg3_plot, labels = c("b", "", ""), 
                        label_size = 25, ncol = 3, nrow = 1, rel_widths = c(1.5, 1, 1), align = "hv", scale = 0.94)
final_fig <- plot_grid(top_row, bottom_row, labels = c("", ""), label_size = 25, ncol = 1, nrow = 2, 
                       rel_heights = c(2.5, 1), rel_widths = c(1, 1))
print(final_fig)
pdf(file = paste0("out/", io$dataset, io$sub_dir, "/mt-seq-auc-pluripotency-prom.pdf"), width = 14.5, height = 12, useDingbats = FALSE)
final_fig
dev.off()


# enh_bottom_row <- plot_grid(enh_reg1_plot, enh_reg2_plot, enh_reg3_plot, #labels = c("", "", ""), 
#                         label_size = 25, ncol = 3, nrow = 1, rel_widths = c(1.5, 1, 1), align = "hv", scale = 0.94)
# final_fig <- plot_grid(top_row, prom_bottom_row, labels = c("", ""), label_size = 25, ncol = 1, nrow = 2, 
#                        rel_heights = c(1.35, 1), rel_widths = c(1, 1))
# print(final_fig)
# pdf(file = paste0("out/", io$dataset, io$sub_dir, "/smallwood.pdf"), width = 14.5, height = 12, useDingbats = FALSE)
# final_fig
# dev.off()
# 
# pdf(file = paste0("out/", io$dataset, io$sub_dir, "/prof-enh-smallwood.pdf"), width = 16.5, height = 4, useDingbats = FALSE)
# enh_bottom_row
# dev.off()

Examples of enhancer regions with 4 clusters

context <- "active_enhancers"
iter <- 4
met_file <- paste0("~/datasets/melissa/met/filtered_met/", io$dataset, "/", context, "_cov", 
                   io$cov[iter], "_sd0.2_gene_var5.rds")
met_dt <- readRDS(met_file)
opts <- list(N = length(met_dt$met), M = length(met_dt$met[[1]]), filt_region_cov = 0.5)
# Filtering low covered regions across cells
met_dt <- filter_regions_across_cells(dt = met_dt, opts = opts)
met <- met_dt$met
opts$M <- length(met[[1]])  # Number of genomic regions
print(opts$M)

# Load Melissa model
melissa_obj <- readRDS(paste0(io$data_dir, "diffuse_melissa_sim10_", context, 
                              "_cov", io$cov[iter], "_sd0.2_K", io$K, "_M", io$M[iter], 
                              "_basis", io$basis[iter], "_dataPrcg", io$data_prcg, 
                              "_regionPrcg", io$reg_prcg, "_cpgPrcg", io$cpg_prcg,
                              "_filter", io$filter, "_gene_var5", ".rds"))

# Profiles for each region and cell from 1st simulation study
simulation <- 1
melissa <- melissa_obj$model[[simulation]]$melissa_prof
w_prof <- melissa$W
# Extract most didferent genomic regions from the weights of each cluster
dists <- vector("numeric", length = NROW(w_prof))
for (m in 1:NROW(w_prof)) {
  dists[m] <- sum(dist(t(pnorm(w_prof[m, ,])), method = "euclidean"))
}
Order <- order(dists, decreasing = TRUE)

##-----------------------------------------
# Create objects for plotting specific set of regions
# Supplementary
regions <- c(57, 56, 6, 39, 50, 2, 13) # 24
# Main figure
#regions <- c(28, 14, 10)
cluster_assinments <- unique(melissa$labels)
region_melissa_obj <- list()
iter <- 1
for (t in 1:length(regions)) {
    r <- Order[regions[t]]
    r <- geneset_ind[t]
    W_Sigma <- list()
    for (cl in 1:length(cluster_assinments)) {
        W_Sigma[[cl]] <- melissa$W_Sigma[[cluster_assinments[cl]]][[r]]
    }
    region_melissa_obj[[iter]] <- list(W = w_prof[r,,cluster_assinments], 
                                    W_Sigma = W_Sigma,  #melissa$W_Sigma[cluster_assinments][[r]],
                                    basis = melissa$basis)
    class(region_melissa_obj[[iter]]) <- c("cluster_profiles_vb_bernoulli", "cluster_profiles_vb")
    p_prof <- plot_cluster_profiles(cluster_obj = region_melissa_obj[[iter]], 
                                    title = paste0("Gene ", t, " ", geneset[V1 == met_dt$annos$id[r]][,2]),
                                    x_labels = c("-3Kb","", "TSS", "", "+3Kb")) + theme(legend.position = "left")
    print(p_prof)
    iter <- iter + 1
}

x_lab <- c("-5Kb","", "TSS", "", "+5Kb")
reg1_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[1]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[1]]]][,2])) + theme(legend.position = "left")
reg2_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[2]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[2]]]][,2]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank())
reg3_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[3]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[3]]]][,2]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank())
reg4_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[4]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[4]]]][,2])) + theme(legend.position = "left")
reg5_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[5]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[5]]]][,2]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank())
reg6_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[6]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[6]]]][,2]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank())


andreaskapou/Melissa documentation built on June 12, 2020, 5:54 p.m.