suppressPackageStartupMessages(library(BPRMeth)) suppressPackageStartupMessages(library(purrr)) suppressPackageStartupMessages(library(ggplot2)) suppressPackageStartupMessages(library(cowplot)) suppressPackageStartupMessages(library(gridExtra)) suppressPackageStartupMessages(library(reshape2)) suppressPackageStartupMessages(library(grid)) suppressPackageStartupMessages(library(ROCR)) suppressPackageStartupMessages(library(scales)) suppressPackageStartupMessages(library(data.table)) suppressPackageStartupMessages(library(ggfortify)) suppressPackageStartupMessages(library(stringi)) # suppressPackageStartupMessages(library(proxy)) suppressPackageStartupMessages(library(RColorBrewer))
# Data io <- list() io$script_dir <- "../" io$dataset <- "mt-seq/" io$sub_dir <- "/subsampled/" io$data_dir <- paste0("../local-data/melissa/", io$dataset, "/imputation/") io$data_dir <- paste0("~/datasets/melissa/", io$dataset, "/imputation/") io$K <- 6 io$M <- c(541, 1468, 2666, 745, 231, 384) io$M_deepcpg <- c(435, 1182, 2113, 605, 183, 283) io$cov <- c(10, 10, 15, 10, 10, 10) #io$basis <- c(7, 11, 9, 7, 7, 7) io$basis <- c(9, 11, 13, 11, 9, 11) io$cpg_prcg <- 0.5 io$data_prcg <- 0.4 io$reg_prcg <- 0.95 io$filter <- 0.5 R.utils::sourceDirectory(paste0(io$script_dir, "lib/"), modifiedOnly = FALSE)
# Different CpG coverages io$regions <- c("prom3k", "prom5k", "prom10k", "active_enhancers", "Nanog", "super_enhancers") dt_analysis <- data.table(region = character(), auc_melissa = numeric(), auc_melissa_rate = numeric(), auc_indep_prof = numeric(), auc_indep_rate = numeric(), auc_rf = numeric(), auc_deepcpg = numeric(), auc_deepcpg_sub = numeric(), f_melissa = numeric(), f_melissa_rate = numeric(), f_indep_prof = numeric(), f_indep_rate = numeric(), f_rf = numeric(), f_deepcpg = numeric(), f_deepcpg_sub = numeric(), tpr_fpr_melissa = list(), tpr_fpr_melissa_rate = list(), tpr_fpr_indep_prof = list(), tpr_fpr_indep_rate = list(), tpr_fpr_rf = list(), tpr_fpr_deepcpg = numeric(), tpr_fpr_deepcpg_sub = numeric(), pr_melissa = list(), pr_melissa_rate = list(), pr_indep_prof = list(), pr_indep_rate = list(), pr_rf = list(), pr_deepcpg = numeric(), pr_deepcpg_sub = numeric() ) model_analysis <- data.table(region = character(), melissa = numeric(), melissa_rate = numeric()) iter <- 1 for (region in io$regions) { # Load joint analysis results dt_melissa <- readRDS(paste0(io$data_dir, "diffuse_melissa_sim10_", region, "_cov", io$cov[iter], "_sd0.2_K", io$K, "_M", io$M[iter], "_basis", io$basis[iter], "_dataPrcg", io$data_prcg, "_regionPrcg", io$reg_prcg, "_cpgPrcg", io$cpg_prcg, "_filter", io$filter, "_gene_var5", ".rds")) # Load independent analysis results dt_indep <- readRDS(paste0(io$data_dir, "indep_sim10_", region, "_cov", io$cov[iter], "_sd0.2_M", io$M[iter], "_basis", io$basis[iter], "_dataPrcg", io$data_prcg, "_regionPrcg", io$reg_prcg, "_cpgPrcg", io$cpg_prcg, "_filter", io$filter, "_gene_var5", ".rds")) # Load RF analysis results dt_rf <- readRDS(paste0(io$data_dir, "rf_indep_sim10_", region, "_cov", io$cov[iter], "_sd0.2_M", io$M[iter], "_dataPrcg", io$data_prcg, "_regionPrcg", io$reg_prcg, "_cpgPrcg", io$cpg_prcg, "_filter", io$filter, "_gene_var5", ".rds")) # Load RF analysis results dt_deepcpg <- readRDS(paste0(io$data_dir, "deepcpg/deepcpg_sim10_", region, "_cov", io$cov[iter], "_sd0.2_M", io$M_deepcpg[iter], "_dataPrcg", io$data_prcg, "_regionPrcg", io$reg_prcg, "_cpgPrcg", io$cpg_prcg, "_filter", io$filter, "_gene_var5", ".rds")) # Load RF analysis results dt_deepcpg_sub <- readRDS(paste0(io$data_dir, "deepcpg/subsampled/deepcpg_sim10_", region, "_cov", io$cov[iter], "_sd0.2_M", io$M_deepcpg[iter], "_dataPrcg", io$data_prcg, "_regionPrcg", io$reg_prcg, "_cpgPrcg", io$cpg_prcg, "_filter", io$filter, "_gene_var5", ".rds")) for (i in 1:length(dt_melissa$model)) { # Create prediction objects melissa_pred <- prediction(round(dt_melissa$model[[i]]$eval_perf$eval_prof$pred_obs, 2), dt_melissa$model[[i]]$eval_perf$eval_prof$act_obs) melissa_rate_pred <- prediction(round(dt_melissa$model[[i]]$eval_perf$eval_mean$pred_obs, 2), dt_melissa$model[[i]]$eval_perf$eval_mean$act_obs) indep_prof_pred <- prediction(round(dt_indep$model[[i]]$eval_perf$eval_prof$pred_obs, 2), dt_indep$model[[i]]$eval_perf$eval_prof$act_obs) indep_rate_pred <- prediction(round(dt_indep$model[[i]]$eval_perf$eval_mean$pred_obs, 2), dt_indep$model[[i]]$eval_perf$eval_mean$act_obs) rf_pred <- prediction(round(dt_rf$model[[i]]$eval_perf$pred_obs, 2), dt_rf$model[[i]]$eval_perf$act_obs) deepcpg_pred <- prediction(round(dt_deepcpg$model[[i]]$eval_perf$pred_obs, 2), dt_deepcpg$model[[i]]$eval_perf$act_obs) deepcpg_sub_pred <- prediction(round(dt_deepcpg_sub$model[[i]]$eval_perf$pred_obs, 2), dt_deepcpg_sub$model[[i]]$eval_perf$act_obs) # F-measure performance f_melissa <- performance(melissa_pred, "f") f_melissa_rate <- performance(melissa_rate_pred, "f") f_indep_prof <- performance(indep_prof_pred, "f") f_indep_rate <- performance(indep_rate_pred, "f") f_rf <- performance(rf_pred, "f") f_deepcpg <- performance(deepcpg_pred, "f") f_deepcpg_sub <- performance(deepcpg_sub_pred, "f") dt <- data.table(region = region, auc_melissa = performance(melissa_pred, "auc")@y.values[[1]], auc_melissa_rate = performance(melissa_rate_pred, "auc")@y.values[[1]], auc_indep_prof = performance(indep_prof_pred, "auc")@y.values[[1]], auc_indep_rate = performance(indep_rate_pred, "auc")@y.values[[1]], auc_rf = performance(rf_pred, "auc")@y.values[[1]], auc_deepcpg = performance(deepcpg_pred, "auc")@y.values[[1]], auc_deepcpg_sub = performance(deepcpg_sub_pred, "auc")@y.values[[1]], f_melissa = f_melissa@y.values[[1]][min(which(f_melissa@x.values[[1]] <= 0.5))], f_melissa_rate = f_melissa_rate@y.values[[1]][min(which(f_melissa_rate@x.values[[1]] <= 0.5))], f_indep_prof = f_indep_prof@y.values[[1]][min(which(f_indep_prof@x.values[[1]] <= 0.5))], f_indep_rate = f_indep_rate@y.values[[1]][min(which(f_indep_rate@x.values[[1]] <= 0.5))], f_rf = f_rf@y.values[[1]][min(which(f_rf@x.values[[1]] <= 0.5))], f_deepcpg = f_deepcpg@y.values[[1]][min(which(f_deepcpg@x.values[[1]] <= 0.5))], f_deepcpg_sub = f_deepcpg_sub@y.values[[1]][min(which(f_deepcpg_sub@x.values[[1]] <= 0.5))], tpr_fpr_melissa = list(tpr_fpr_melissa = performance(melissa_pred, "tpr", "fpr")), tpr_fpr_melissa_rate = list(tpr_fpr_melissa_rate = performance(melissa_rate_pred, "tpr", "fpr")), tpr_fpr_indep_prof = list(tpr_fpr_indep_prof = performance(indep_prof_pred, "tpr", "fpr")), tpr_fpr_indep_rate = list(tpr_fpr_indep_rate = performance(indep_rate_pred, "tpr", "fpr")), tpr_fpr_rf = list(tpr_fpr_rf = performance(rf_pred, "tpr", "fpr")), tpr_fpr_deepcpg = list(tpr_fpr_deepcpg = performance(deepcpg_pred, "tpr", "fpr")), tpr_fpr_deepcpg_sub = list(tpr_fpr_deepcpg_sub = performance(deepcpg_sub_pred, "tpr", "fpr")), pr_melissa = list(pr_melissa = performance(melissa_pred, "prec", "rec")), pr_melissa_rate = list(pr_melissa_rate = performance(melissa_rate_pred, "prec", "rec")), pr_indep_prof = list(pr_indep_prof = performance(indep_prof_pred, "prec", "rec")), pr_indep_rate = list(pr_indep_rate = performance(indep_rate_pred, "prec", "rec")), pr_rf = list(pr_rf = performance(rf_pred, "prec", "rec")), pr_deepcpg = list(pr_deepcpg = performance(deepcpg_pred, "prec", "rec")), pr_deepcpg_sub = list(pr_deepcpg_sub = performance(deepcpg_sub_pred, "prec", "rec")) ) # Add results to final data.table dt_analysis <- rbind(dt_analysis, dt) dt <- data.table(region = region, melissa = length(which(dt_melissa$model[[i]]$melissa_prof$delta > 4)), melissa_rate = length(which(dt_melissa$model[[i]]$melissa_rate$delta > 4))) model_analysis <- rbind(model_analysis, dt) } iter <- iter + 1 } rm(iter, dt, i, dt_rf, dt_indep, dt_melissa, melissa_pred, melissa_rate_pred, indep_prof_pred, indep_rate_pred, rf_pred, f_melissa, f_melissa_rate, f_indep_prof, f_indep_rate, f_rf)
set.seed(17) dt_boxplot <- copy(dt_analysis) s2 <- 0.004 dt_boxplot <- dt_boxplot[, c("auc_melissa", "auc_melissa_rate", "auc_indep_prof", "auc_indep_rate", "auc_rf", "auc_deepcpg", "auc_deepcpg_sub") := list(auc_melissa + rnorm(.N, 0, s2), auc_melissa_rate + rnorm(.N, 0, s2), auc_indep_prof + rnorm(.N, 0, s2), auc_indep_rate + rnorm(.N, 0, s2), auc_rf + rnorm(.N, 0, s2), auc_deepcpg + rnorm(.N, 0, s2), auc_deepcpg_sub + rnorm(.N, 0, s2))] dt_boxplot <- dt_boxplot[, c("region", "auc_melissa", "auc_melissa_rate", "auc_indep_prof", "auc_indep_rate", "auc_rf", "auc_deepcpg", "auc_deepcpg_sub")] dt_boxplot <- dt_boxplot %>% setnames(c("region", "auc_melissa", "auc_melissa_rate", "auc_indep_prof", "auc_indep_rate", "auc_rf", "auc_deepcpg", "auc_deepcpg_sub"), c("x", "Melissa", "Melissa Rate", "BPRMeth", "Indep Rate", "RF", "DeepCpG", "DeepCpG Sub")) %>% .[, x := as.factor(x)] %>% melt(variable.name = "Model", value.name = "y") %>% .[, x := factor(x, levels = c("prom10k", "prom5k", "prom3k", "Nanog", "super_enhancers", "active_enhancers"))] %>% .[, Model := factor(Model, levels = c("Melissa", "DeepCpG", "DeepCpG Sub", "BPRMeth", "RF", "Melissa Rate", "Indep Rate"))] p_auc_box <- ggplot(dt_boxplot, aes(x = x, y = y, fill = Model)) + geom_boxplot(alpha = 0.8, outlier.shape = NA) + scale_fill_manual(values = c("red3", "darkgreen", "palegreen3", "chocolate2", "dodgerblue4", "mediumorchid4", "mistyrose4")) + scale_x_discrete(labels = c("prom3k" = "Promoter 3kb", "prom5k" = "Promoter 5kb", "prom10k" = "Promoter 10kb", "active_enhancers" = "Active enhancers", "Nanog" = "Nanog", "super_enhancers" = "Super enhancers")) + scale_y_continuous(breaks = pretty_breaks(n = 6)) + labs(title = "", x = NULL, y = "AUC") + boxplot_theme() print(p_auc_box) pdf(file = paste0("out/", io$dataset, io$sub_dir, "/auc-mt-seq.pdf"), width = 15, height = 8, useDingbats = FALSE) p_auc_box dev.off() rm(dt_boxplot, s2)
set.seed(17) dt_boxplot <- copy(dt_analysis) s2 <- 0.005 dt_boxplot <- dt_boxplot[, c("auc_melissa", "auc_melissa_rate", "auc_indep_prof", "auc_indep_rate", "auc_rf", "auc_deepcpg") := list(auc_melissa + rnorm(.N, 0, s2), auc_melissa_rate + rnorm(.N, 0, s2), auc_indep_prof + rnorm(.N, 0, s2), auc_indep_rate + rnorm(.N, 0, s2), auc_rf + rnorm(.N, 0, s2), auc_deepcpg + rnorm(.N, 0, s2))] dt_boxplot <- dt_boxplot[, c("region", "auc_melissa", "auc_melissa_rate", "auc_indep_prof", "auc_indep_rate", "auc_rf", "auc_deepcpg")] dt_boxplot <- dt_boxplot %>% setnames(c("region", "auc_melissa", "auc_melissa_rate", "auc_indep_prof", "auc_indep_rate", "auc_rf", "auc_deepcpg"), c("x", "Melissa", "Melissa Rate", "BPRMeth", "Rate", "RF", "DeepCpG")) %>% .[, x := as.factor(x)] %>% melt(variable.name = "Model", value.name = "y") %>% .[, x := factor(x, levels = c("prom10k", "prom5k", "prom3k", "Nanog", "super_enhancers", "active_enhancers"))] %>% .[, Model := factor(Model, levels = c("Melissa", "DeepCpG", "BPRMeth", "RF", "Melissa Rate", "Rate"))] p_auc_box_pres <- ggplot(dt_boxplot, aes(x = x, y = y, fill = Model)) + geom_boxplot(alpha = 0.8, outlier.shape = NA) + #scale_fill_manual(values = c("red3", "mediumorchid4", "chocolate2", "dodgerblue4", "cornflowerblue", "green4")) + scale_fill_manual(values = c("red3", "darkgreen", "chocolate2", "dodgerblue4", "mediumorchid4", "mistyrose4")) + scale_x_discrete(labels = c("prom3k" = "Promoter 3kb", "prom5k" = "Promoter 5kb", "prom10k" = "Promoter 10kb", "active_enhancers" = "Active enhancers", "Nanog" = "Nanog", "super_enhancers" = "Super enhancers")) + scale_y_continuous(breaks = pretty_breaks(n = 6)) + labs(title = "", x = NULL, y = "AUC") + boxplot_theme() + theme(legend.position = "top") print(p_auc_box_pres) pdf(file = paste0("out/", io$dataset, io$sub_dir, "/auc-pres.pdf"), width = 13, height = 9, useDingbats = FALSE) p_auc_box_pres dev.off() rm(dt_boxplot, s2)
set.seed(17) dt_boxplot <- copy(dt_analysis) s2 <- 0.005 dt_boxplot <- dt_boxplot[, c("f_melissa", "f_melissa_rate", "f_indep_prof", "f_indep_rate", "f_rf", "f_deepcpg", "f_deepcpg_sub") := list(f_melissa + rnorm(.N, 0, s2), f_melissa_rate + rnorm(.N, 0, s2), f_indep_prof + rnorm(.N, 0, s2), f_indep_rate + rnorm(.N, 0, s2), f_rf + rnorm(.N, 0, s2), f_deepcpg + rnorm(.N, 0, s2), f_deepcpg_sub + rnorm(.N, 0, s2))] dt_boxplot <- dt_boxplot[, c("region", "f_melissa", "f_melissa_rate", "f_indep_prof", "f_indep_rate", "f_rf", "f_deepcpg", "f_deepcpg_sub")] dt_boxplot <- dt_boxplot %>% setnames(c("region", "f_melissa", "f_melissa_rate", "f_indep_prof", "f_indep_rate", "f_rf", "f_deepcpg", "f_deepcpg_sub"), c("x", "Melissa", "Melissa Rate", "BPRMeth", "Indep Rate", "RF", "DeepCpG", "DeepCpG Sub")) %>% .[, x := as.factor(x)] %>% melt(variable.name = "Model", value.name = "y") %>% .[, x := factor(x, levels = c("prom10k", "prom5k", "prom3k", "Nanog", "super_enhancers", "active_enhancers"))] %>% .[, Model := factor(Model, levels = c("Melissa", "DeepCpG", "DeepCpG Sub", "BPRMeth", "RF", "Melissa Rate", "Indep Rate"))] p_f_box <- ggplot(dt_boxplot, aes(x = x, y = y, fill = Model)) + geom_boxplot(alpha = 0.8, outlier.shape = NA) + scale_fill_manual(values = c("red3", "darkgreen", "palegreen3", "chocolate2", "dodgerblue4", "mediumorchid4", "mistyrose4")) + scale_x_discrete(labels = c("prom3k" = "Promoter 3kb", "prom5k" = "Promoter 5kb", "prom10k" = "Promoter 10kb", "active_enhancers" = "Active enhancers", "Nanog" = "Nanog", "super_enhancers" = "Super enhancers")) + scale_y_continuous(breaks = pretty_breaks(n = 6)) + labs(title = "", x = NULL, y = "F-measure") + boxplot_theme() print(p_f_box) pdf(file = paste0("out/", io$dataset, io$sub_dir, "/f-mt-seq.pdf"), width = 15, height = 8, useDingbats = FALSE) p_f_box dev.off() rm(dt_boxplot, s2)
set.seed(17) dt_boxplot <- copy(dt_analysis) # Keep only required columns dt_boxplot <- dt_boxplot[, c("region", "tpr_fpr_melissa", "tpr_fpr_melissa_rate", "tpr_fpr_indep_prof", "tpr_fpr_indep_rate", "tpr_fpr_rf", "tpr_fpr_deepcpg", "tpr_fpr_deepcpg_sub")] # Keep one simulation per region rows <- c(3, 16, 24, 38, 42, 54) dt_boxplot <- dt_boxplot[rows, ] # dt_boxplot <- dt_boxplot[!duplicated(dt_boxplot$region), ] # Change Model and Region names colnames(dt_boxplot) <- c("Region", "Melissa", "Melissa Rate", "BPRMeth", "Indep Rate", "RF", "DeepCpG", "DeepCpG Sub") model_names <- colnames(dt_boxplot)[2:NCOL(dt_boxplot)] region_names <- c("Promoter 3kb", "Promoter 5kb", "Promoter 10kb", "Active enhancers", "Nanog", "Super enhancers") dt_boxplot$Region <- region_names # Extract FPR and TPR dt_tpr_fpr <- data.table(fpr = numeric(), tpr = numeric(), Region = character(), Model = character()) for (i in 1:NROW(dt_boxplot)) { for (k in model_names) { perform <- dt_boxplot[, k, with = FALSE] dt <- data.table(fpr = perform[[1]][[i]]@x.values[[1]], tpr = perform[[1]][[i]]@y.values[[1]], Region = dt_boxplot$Region[i], Model = k) dt_tpr_fpr <- rbind(dt_tpr_fpr, dt) } } # Rename and refactor dt_tpr_fpr <- dt_tpr_fpr %>% .[, c("Region", "Model") := list(as.factor(Region), as.factor(Model))] %>% .[, Region := factor(Region, levels = c("Promoter 10kb", "Promoter 5kb", "Promoter 3kb", "Nanog", "Super enhancers", "Active enhancers"))] %>% .[, Model := factor(Model, levels = c("Melissa", "Melissa Rate", "BPRMeth", "Indep Rate", "RF", "DeepCpG", "DeepCpG Sub"))] %>% .[, Model := factor(Model, levels = c("Melissa", "DeepCpG", "DeepCpG Sub", "BPRMeth", "RF", "Melissa Rate", "Indep Rate"))] p_tpr_fpr <- ggplot(dt_tpr_fpr, aes(x = fpr, y = tpr, group = Model)) + geom_line(aes(color = Model), size = 2) + facet_wrap( ~ Region) + scale_color_manual(values = c("red3", "darkgreen", "palegreen3", "chocolate2", "dodgerblue4", "mediumorchid4", "mistyrose4")) + scale_x_continuous(breaks = pretty_breaks(n = 6)) + scale_y_continuous(breaks = pretty_breaks(n = 6)) + labs(title = "", x = "False positive rate", y = "True positive rate") + line_theme() print(p_tpr_fpr) pdf(file = paste0("out/", io$dataset, io$sub_dir, "/tpr-fpr-mt-seq.pdf"), width = 15, height = 12, useDingbats = FALSE) p_tpr_fpr dev.off() rm(dt_boxplot, dt_tpr_fpr, region_names, model_names, perform, dt)
set.seed(17) dt_boxplot <- copy(dt_analysis) # Keep only required columns dt_boxplot <- dt_boxplot[, c("region", "pr_melissa", "pr_melissa_rate", "pr_indep_prof", "pr_indep_rate", "pr_rf", "pr_deepcpg", "pr_deepcpg_sub")] # Keep one simulation per region rows <- c(3, 16, 24, 38, 42, 54) dt_boxplot <- dt_boxplot[rows, ] # Change Model and Region names colnames(dt_boxplot) <- c("Region", "Melissa", "Melissa Rate", "BPRMeth", "Indep Rate", "RF", "DeepCpG", "DeepCpG Sub") model_names <- colnames(dt_boxplot)[2:NCOL(dt_boxplot)] region_names <- c("Promoter 3kb", "Promoter 5kb", "Promoter 10kb", "Active enhancers", "Nanog", "Super enhancers") dt_boxplot$Region <- region_names # Extract FPR and TPR dt_pr <- data.table(x = numeric(), y = numeric(), Region = character(), Model = character()) for (i in 1:NROW(dt_boxplot)) { for (k in model_names) { perform <- dt_boxplot[, k, with = FALSE] len <- round(length(perform[[1]][[i]]@x.values[[1]])) dt <- data.table(x = perform[[1]][[i]]@x.values[[1]][5:len], y = perform[[1]][[i]]@y.values[[1]][5:len], Region = dt_boxplot$Region[i], Model = k) dt_pr <- rbind(dt_pr, dt) } } # Rename and refactor dt_pr <- dt_pr %>% .[, c("Region", "Model") := list(as.factor(Region), as.factor(Model))] %>% .[, Region := factor(Region, levels = c("Promoter 10kb", "Promoter 5kb", "Promoter 3kb", "Nanog", "Super enhancers", "Active enhancers"))] %>% .[, Model := factor(Model, levels = c("Melissa", "Melissa Rate", "BPRMeth", "Indep Rate", "RF", "DeepCpG", "DeepCpG Sub"))] %>% .[, Model := factor(Model, levels = c("Melissa", "DeepCpG", "DeepCpG Sub", "BPRMeth", "RF", "Melissa Rate", "Indep Rate"))] p_pr <- ggplot(dt_pr, aes(x = x, y = y, group = Model)) + geom_line(aes(color = Model), size = 2) + facet_wrap( ~ Region) + scale_color_manual(values = c("red3", "darkgreen", "palegreen3", "chocolate2", "dodgerblue4", "mediumorchid4", "mistyrose4")) + scale_x_continuous(breaks = pretty_breaks(n = 4)) + scale_y_continuous(limits = c(0.6, 0.99), breaks = pretty_breaks(n = 4)) + labs(title = "", x = "Recall", y = "Precision") + line_theme() print(p_pr) pdf(file = paste0("out/", io$dataset, io$sub_dir, "/pr-mt-seq.pdf"), width = 15, height = 12, useDingbats = FALSE) p_pr dev.off() rm(dt_boxplot, dt_pr, region_names, model_names, perform, dt)
context <- "prom3k" iter <- 1 met_file <- paste0("~/datasets/melissa/met/filtered_met/", io$dataset, "/", context, "_cov", io$cov[iter], "_sd0.2_gene_var5.rds") met_dt <- readRDS(met_file) opts <- list(N = length(met_dt$met), M = length(met_dt$met[[1]]), filt_region_cov = 0.5) # Filtering low covered regions across cells met_dt <- filter_regions_across_cells(dt = met_dt, opts = opts) met <- met_dt$met opts$M <- length(met[[1]]) # Number of genomic regions print(opts$M) # Load Melissa model melissa_obj <- readRDS(paste0(io$data_dir, "diffuse_melissa_sim10_", context, "_cov", io$cov[iter], "_sd0.2_K", io$K, "_M", io$M[iter], "_basis", io$basis[iter], "_dataPrcg", io$data_prcg, "_regionPrcg", io$reg_prcg, "_cpgPrcg", io$cpg_prcg, "_filter", io$filter, "_gene_var5", ".rds")) # Profiles for each region and cell from 1st simulation study simulation <- 1 melissa <- melissa_obj$model[[simulation]]$melissa_prof w_prof <- melissa$W # Extract most didferent genomic regions from the weights of each cluster dists <- vector("numeric", length = NROW(w_prof)) for (m in 1:NROW(w_prof)) { dists[m] <- sum(dist(t(pnorm(w_prof[m, ,])), method = "euclidean")) } Order <- order(dists, decreasing = TRUE) ##----------------------------------------- # Create objects for plotting specific set of regions #regions <- c(21, 152, 63) regions <- c(236, 131, 136) #regions <- seq(1, 20, 1) # regions <- c(20, 4, 30) cluster_assinments <- unique(melissa$labels) region_melissa_obj <- list() for (t in 1:length(regions)) { r <- Order[regions[t]] W_Sigma <- list() for (cl in 1:length(cluster_assinments)) { W_Sigma[[cl]] <- melissa$W_Sigma[[cluster_assinments[cl]]][[r]] } region_melissa_obj[[t]] <- list(W = w_prof[r,,cluster_assinments], W_Sigma = W_Sigma, #melissa$W_Sigma[cluster_assinments][[r]], basis = melissa$basis) class(region_melissa_obj[[t]]) <- c("cluster_profiles_vb_bernoulli", "cluster_profiles_vb") p_prof <- plot_cluster_profiles(cluster_obj = region_melissa_obj[[t]], title = paste0("Gene ", regions[t], " ", met_dt$annos$gene_name[r]), x_labels = c("-3Kb","", "TSS", "", "+3Kb")) + theme(legend.position = "left") print(p_prof) } x_lab <- c("-1.5Kb","", "TSS", "", "+1.5Kb") reg1_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[1]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", met_dt$annos$gene_name[Order[regions[1]]])) + theme(legend.position = "left") reg2_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[2]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", met_dt$annos$gene_name[Order[regions[2]]]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank()) reg3_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[3]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", met_dt$annos$gene_name[Order[regions[3]]]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank())
top_row <- plot_grid(p_auc_box, labels = c("a"), label_size = 25, ncol = 1, nrow = 1, rel_widths = c(1)) p_f_box <- p_f_box + theme(legend.position = c(0.75, 0.24)) middle_row <- plot_grid(p_f_box, labels = c("a"), label_size = 25, ncol = 1, nrow = 1, rel_widths = c(1)) bottom_row <- plot_grid(reg1_plot, reg2_plot, reg3_plot, labels = c("b", "", ""), label_size = 25, ncol = 3, nrow = 1, rel_widths = c(1.5, 1, 1), align = "hv", scale = 0.94) final_fig <- plot_grid(middle_row, bottom_row, labels = c("", ""), label_size = 25, ncol = 1, nrow = 2, rel_heights = c(2, 1), rel_widths = c(1, 1)) print(final_fig) pdf(file = paste0("out/", io$dataset, io$sub_dir, "/mt-seq-f.pdf"), width = 14.5, height = 12, useDingbats = FALSE) final_fig dev.off() final_fig <- plot_grid(top_row, bottom_row, labels = c("", ""), label_size = 25, ncol = 1, nrow = 2, rel_heights = c(2.5, 1), rel_widths = c(1, 1)) print(final_fig) pdf(file = paste0("out/", io$dataset, io$sub_dir, "/mt-seq-auc.pdf"), width = 14.5, height = 12, useDingbats = FALSE) final_fig dev.off()
context <- "prom10k" iter <- 3 met_file <- paste0("~/datasets/melissa/met/filtered_met/", io$dataset, "/", context, "_cov", io$cov[iter], "_sd0.2_gene_var5.rds") met_dt <- readRDS(met_file) opts <- list(N = length(met_dt$met), M = length(met_dt$met[[1]]), filt_region_cov = 0.5) # Filtering low covered regions across cells met_dt <- filter_regions_across_cells(dt = met_dt, opts = opts) met <- met_dt$met opts$M <- length(met[[1]]) # Number of genomic regions print(opts$M) # Obtain gene set geneset <- fread(paste0("~/datasets/melissa/genesets/pluripotency_extended.tsv"), header = FALSE) # Indeces of genesets with observed data geneset_ind <- which(met_dt$annos$id %in% geneset$V1) # Load Melissa model melissa_obj <- readRDS(paste0(io$data_dir, "diffuse_melissa_sim10_", context, "_cov", io$cov[iter], "_sd0.2_K", io$K, "_M", io$M[iter], "_basis", io$basis[iter], "_dataPrcg", io$data_prcg, "_regionPrcg", io$reg_prcg, "_cpgPrcg", io$cpg_prcg, "_filter", io$filter, "_gene_var5", ".rds")) # Profiles for each region and cell from 1st simulation study simulation <- 1 melissa <- melissa_obj$model[[simulation]]$melissa_prof w_prof <- melissa$W # Extract most didferent genomic regions from the weights of each cluster dists <- vector("numeric", length = NROW(w_prof)) for (m in 1:NROW(w_prof)) { dists[m] <- sum(dist(t(pnorm(w_prof[m, ,])), method = "euclidean")) } Order <- order(dists, decreasing = TRUE) ##----------------------------------------- # Create objects for plotting specific set of regions # Supplementary regions <- c(57, 56, 6, 39, 50, 2, 13) # 24 # Main figure regions <- c(28, 14, 10) cluster_assinments <- unique(melissa$labels) region_melissa_obj <- list() iter <- 1 for (t in regions) { r <- geneset_ind[t] W_Sigma <- list() for (cl in 1:length(cluster_assinments)) { W_Sigma[[cl]] <- melissa$W_Sigma[[cluster_assinments[cl]]][[r]] } region_melissa_obj[[iter]] <- list(W = w_prof[r,,cluster_assinments], W_Sigma = W_Sigma, #melissa$W_Sigma[cluster_assinments][[r]], basis = melissa$basis) class(region_melissa_obj[[iter]]) <- c("cluster_profiles_vb_bernoulli", "cluster_profiles_vb") p_prof <- plot_cluster_profiles(cluster_obj = region_melissa_obj[[iter]], title = paste0("Gene ", t, " ", geneset[V1 == met_dt$annos$id[r]][,2]), x_labels = c("-3Kb","", "TSS", "", "+3Kb")) + theme(legend.position = "left") print(p_prof) iter <- iter + 1 } x_lab <- c("-5Kb","", "TSS", "", "+5Kb") reg1_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[1]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[1]]]][,2])) + theme(legend.position = "left") reg2_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[2]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[2]]]][,2]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank()) reg3_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[3]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[3]]]][,2]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank()) reg4_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[4]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[4]]]][,2])) + theme(legend.position = "left") reg5_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[5]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[5]]]][,2]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank()) reg6_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[6]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[6]]]][,2]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank())
prom_suppl_row <- plot_grid(reg1_plot, reg2_plot, reg3_plot, reg4_plot, reg5_plot, reg6_plot,#labels = c("", "", ""), label_size = 25, ncol = 3, nrow = 2, rel_widths = c(1.5, 1, 1), rel_heights = c(1, 1), align = "hv", scale = 0.94) pdf(file = paste0("out/", io$dataset, io$sub_dir, "/suppl-prom-pluripotency-mt-seq.pdf"), width = 22.5, height = 8, useDingbats = FALSE) prom_suppl_row dev.off() top_row <- plot_grid(p_auc_box, labels = c("a"), label_size = 25, ncol = 1, nrow = 1, rel_widths = c(1)) bottom_row <- plot_grid(reg1_plot, reg2_plot, reg3_plot, labels = c("b", "", ""), label_size = 25, ncol = 3, nrow = 1, rel_widths = c(1.5, 1, 1), align = "hv", scale = 0.94) final_fig <- plot_grid(top_row, bottom_row, labels = c("", ""), label_size = 25, ncol = 1, nrow = 2, rel_heights = c(2.5, 1), rel_widths = c(1, 1)) print(final_fig) pdf(file = paste0("out/", io$dataset, io$sub_dir, "/mt-seq-auc-pluripotency-prom.pdf"), width = 14.5, height = 12, useDingbats = FALSE) final_fig dev.off() # enh_bottom_row <- plot_grid(enh_reg1_plot, enh_reg2_plot, enh_reg3_plot, #labels = c("", "", ""), # label_size = 25, ncol = 3, nrow = 1, rel_widths = c(1.5, 1, 1), align = "hv", scale = 0.94) # final_fig <- plot_grid(top_row, prom_bottom_row, labels = c("", ""), label_size = 25, ncol = 1, nrow = 2, # rel_heights = c(1.35, 1), rel_widths = c(1, 1)) # print(final_fig) # pdf(file = paste0("out/", io$dataset, io$sub_dir, "/smallwood.pdf"), width = 14.5, height = 12, useDingbats = FALSE) # final_fig # dev.off() # # pdf(file = paste0("out/", io$dataset, io$sub_dir, "/prof-enh-smallwood.pdf"), width = 16.5, height = 4, useDingbats = FALSE) # enh_bottom_row # dev.off()
context <- "active_enhancers" iter <- 4 met_file <- paste0("~/datasets/melissa/met/filtered_met/", io$dataset, "/", context, "_cov", io$cov[iter], "_sd0.2_gene_var5.rds") met_dt <- readRDS(met_file) opts <- list(N = length(met_dt$met), M = length(met_dt$met[[1]]), filt_region_cov = 0.5) # Filtering low covered regions across cells met_dt <- filter_regions_across_cells(dt = met_dt, opts = opts) met <- met_dt$met opts$M <- length(met[[1]]) # Number of genomic regions print(opts$M) # Load Melissa model melissa_obj <- readRDS(paste0(io$data_dir, "diffuse_melissa_sim10_", context, "_cov", io$cov[iter], "_sd0.2_K", io$K, "_M", io$M[iter], "_basis", io$basis[iter], "_dataPrcg", io$data_prcg, "_regionPrcg", io$reg_prcg, "_cpgPrcg", io$cpg_prcg, "_filter", io$filter, "_gene_var5", ".rds")) # Profiles for each region and cell from 1st simulation study simulation <- 1 melissa <- melissa_obj$model[[simulation]]$melissa_prof w_prof <- melissa$W # Extract most didferent genomic regions from the weights of each cluster dists <- vector("numeric", length = NROW(w_prof)) for (m in 1:NROW(w_prof)) { dists[m] <- sum(dist(t(pnorm(w_prof[m, ,])), method = "euclidean")) } Order <- order(dists, decreasing = TRUE) ##----------------------------------------- # Create objects for plotting specific set of regions # Supplementary regions <- c(57, 56, 6, 39, 50, 2, 13) # 24 # Main figure #regions <- c(28, 14, 10) cluster_assinments <- unique(melissa$labels) region_melissa_obj <- list() iter <- 1 for (t in 1:length(regions)) { r <- Order[regions[t]] r <- geneset_ind[t] W_Sigma <- list() for (cl in 1:length(cluster_assinments)) { W_Sigma[[cl]] <- melissa$W_Sigma[[cluster_assinments[cl]]][[r]] } region_melissa_obj[[iter]] <- list(W = w_prof[r,,cluster_assinments], W_Sigma = W_Sigma, #melissa$W_Sigma[cluster_assinments][[r]], basis = melissa$basis) class(region_melissa_obj[[iter]]) <- c("cluster_profiles_vb_bernoulli", "cluster_profiles_vb") p_prof <- plot_cluster_profiles(cluster_obj = region_melissa_obj[[iter]], title = paste0("Gene ", t, " ", geneset[V1 == met_dt$annos$id[r]][,2]), x_labels = c("-3Kb","", "TSS", "", "+3Kb")) + theme(legend.position = "left") print(p_prof) iter <- iter + 1 } x_lab <- c("-5Kb","", "TSS", "", "+5Kb") reg1_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[1]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[1]]]][,2])) + theme(legend.position = "left") reg2_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[2]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[2]]]][,2]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank()) reg3_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[3]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[3]]]][,2]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank()) reg4_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[4]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[4]]]][,2])) + theme(legend.position = "left") reg5_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[5]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[5]]]][,2]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank()) reg6_plot <- ggplot_cluster_profiles(cluster_obj = region_melissa_obj[[6]], title = NULL, x_labels = x_lab, x_axis = paste0("Gene ", geneset[V1 == met_dt$annos$id[geneset_ind[regions[6]]]][,2]), y_axis = NULL) + theme(legend.position = "none", axis.text.y = element_blank())
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.