knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width=7, fig.height=5 )
library(MitoHEAR)
In the first part of the vignette it is shown the analysis performed on the dataset with GEO accession number GSE115210 from Ludwig et al, 2019 used in fig. 2/S2. Individually sorted cells from clonally derived TF1 clones (C9, D6, and G10) were processed with single cell RNA-seq (Smart-seq2)
load(system.file("extdata", "name_cells_fig_2.Rda", package = "MitoHEAR")) load(system.file("extdata", "name_cells_fig_2_analysis.Rda", package = "MitoHEAR"))
We don't execute the function get_raw_counts_allele here and we directly load his output. A command line implementation of the function get_raw_counts_allele is also available (see github README file for info).
load(system.file("extdata", "output_SNP_mt_fig_2_new.Rda", package = "MitoHEAR")) output_SNP_mt_fig_2 <- result matrix_allele_counts <- output_SNP_mt_fig_2[[1]] name_position_allele <- output_SNP_mt_fig_2[[2]] name_position <- output_SNP_mt_fig_2[[3]]
my.clusters <- rep(0, length(name_cells_fig_2)) my.clusters[grep("_G10_", name_cells_fig_2)] <- "G10" my.clusters[grep("_D6_", name_cells_fig_2)] <- "D6" my.clusters[grep("_C9_", name_cells_fig_2)] <- "C9" tfi_fig_2 <- get_heteroplasmy(matrix_allele_counts, name_position_allele, name_position, number_reads = 50, number_positions = 2000, filtering = 1, my.clusters)
sum_matrix <- tfi_fig_2[[1]] sum_matrix_qc <- tfi_fig_2[[2]] heteroplasmy_matrix_ci <- tfi_fig_2[[3]] allele_matrix_ci<- tfi_fig_2[[4]] cluster_ci <- my.clusters[row.names(sum_matrix)%in%row.names(sum_matrix_qc)] index_ci <- tfi_fig_2[[5]]
name_position_allele_qc <- name_position_allele[name_position%in%colnames(sum_matrix_qc)] name_position_qc <- name_position[name_position%in%colnames(sum_matrix_qc)]
relevant_bases <- filter_bases(heteroplasmy_matrix_ci, min_heteroplasmy = 0.01, min_cells = 10, index_ci)
p_value_wilcox_test_1 <- get_wilcox_test(heteroplasmy_matrix_ci[, relevant_bases], cluster_ci, "C9", "D6" , index_ci) p_value_wilcox_test_2 <- get_wilcox_test(heteroplasmy_matrix_ci[, relevant_bases], cluster_ci, "C9", "G10" , index_ci) p_value_wilcox_test_3 <- get_wilcox_test(heteroplasmy_matrix_ci[, relevant_bases], cluster_ci, "D6", "G10" , index_ci)
p_value_wilcox_test_sort_1 <- sort(p_value_wilcox_test_1, decreasing = F) p_value_wilcox_test_sort_2 <- sort(p_value_wilcox_test_2, decreasing = F) p_value_wilcox_test_sort_3 <- sort(p_value_wilcox_test_3, decreasing = F) p_value_wilcox_test_sort_small_1 <- p_value_wilcox_test_sort_1[p_value_wilcox_test_sort_1<0.05] p_value_wilcox_test_sort_small_2 <- p_value_wilcox_test_sort_2[p_value_wilcox_test_sort_2<0.05] p_value_wilcox_test_sort_small_3 <- p_value_wilcox_test_sort_3[p_value_wilcox_test_sort_3<0.05]
q <- list() for ( i in 1:length(p_value_wilcox_test_sort_small_3[1:2])){ p <- plot_heteroplasmy(names(p_value_wilcox_test_sort_small_3)[i], heteroplasmy_matrix_ci, cluster_ci, index_ci)+ggplot2::ggtitle(paste(names(p_value_wilcox_test_sort_small_3)[i], round(p_value_wilcox_test_sort_small_3[i], 4), sep = "-")) q <- list(q, p) } q q <- list() for ( i in names(p_value_wilcox_test_sort_small_3[1:2])){ p <- plot_allele_frequency(i, heteroplasmy_matrix_ci, allele_matrix_ci, cluster_ci, name_position_qc, name_position_allele_qc, 5, index_ci) q <- list(q, p) } q
features_cluster <- c(names(p_value_wilcox_test_sort_small_1), names(p_value_wilcox_test_sort_small_2), names(p_value_wilcox_test_sort_small_3)) features_cluster <- unique(features_cluster) result_clustering_sc <- clustering_angular_distance(heteroplasmy_matrix_ci, allele_matrix_ci, cluster_ci, length(colnames(heteroplasmy_matrix_ci)), deepSplit_param = 1, minClusterSize_param = 15, 0.2, min_value = 0.001, index = index_ci, relevant_bases = features_cluster) old_new_classification <-result_clustering_sc[[1]] dist_matrix_sc <- result_clustering_sc[[2]] top_dist <- result_clustering_sc[[3]] common_idx <- result_clustering_sc[[4]] old_classification <- as.vector(old_new_classification[, 1]) new_classification <- as.vector(old_new_classification[, 2])
Comparison between the ground truth and the new partition obtained with supervised cluster analysis
plot_heatmap( new_classification, old_classification, (dist_matrix_sc), cluster_columns = F, cluster_rows = F, "Euclidean distance")
result_clustering_sc <- clustering_angular_distance(heteroplasmy_matrix_ci, allele_matrix_ci, cluster_ci, length(colnames(heteroplasmy_matrix_ci)), deepSplit_param = 1, minClusterSize_param = 15, 0.2, min_value = 0.001, index = index_ci, relevant_bases = NULL) old_new_classification <- result_clustering_sc[[1]] dist_matrix_sc <- result_clustering_sc[[2]] top_dist <- result_clustering_sc[[3]] common_idx <- result_clustering_sc[[4]] old_classification <- as.vector(old_new_classification[, 1]) new_classification <- as.vector(old_new_classification[, 2])
Comparison between the ground truth and the new partition obtained with unsupervised cluster analysis
plot_heatmap(new_classification, old_classification, (dist_matrix_sc), cluster_columns = F, cluster_rows = F, "Euclidean distance")
Below the bases selected for the unsupervised cluster analysis
q <- list() for ( i in 1:length(top_dist)){ p <- plot_heteroplasmy(top_dist[i], heteroplasmy_matrix_ci, cluster_ci, index_ci) q <- list(q, p) } q
In the second part of the vignette it is shown the analysis performed on the dataset with GEO accession number GSE115214 from Ludwig et al, 2019 used in fig. 5. Primary human cells( CD34+ hematopoietic stem and progenitor cells HSPCs) from two independent donors were processed with single cell RNA-seq (Smart-seq2)
load(system.file("extdata", "name_cells_fig_5_all.Rda", package = "MitoHEAR")) load(system.file("extdata", "name_cells_fig_5_all_analysis.Rda", package = "MitoHEAR"))
path_meta_data <- system.file("extdata", "CD34_colonies_table.txt", package = "MitoHEAR") cell_convert <- read.table(path_meta_data, header = T) cell_old <- cell_convert$ID1 cell_new <- cell_convert$ID2 cell_old_summary <- rep(0, length(name_cells_fig_5_all)) for (i in 1:length(cell_old_summary)){ cell_old_summary[i] <- c(paste(strsplit(name_cells_fig_5_all, "_")[[i]][3:6], collapse = "_")) } change <- cell_old_summary[grep("_colonies", cell_old_summary)] cell_old_summary[grep("_colonies", cell_old_summary)] <- substr(change, 1, nchar(change)-9)
We don't execute the function get_raw_counts_allele here and we directly load his output. A command line implementation of the function get_raw_counts_allele is also available (see github README file for info).
path_meta_data <- load(system.file("extdata", "output_SNP_mt_fig_5_new.Rda", package = "MitoHEAR")) output_SNP_mt_fig_5 <- result matrix_allele_counts <- output_SNP_mt_fig_5[[1]] name_position_allele <- output_SNP_mt_fig_5[[2]] name_position <- output_SNP_mt_fig_5[[3]]
common_old <- intersect(cell_old, cell_old_summary) row.names(matrix_allele_counts) <- cell_old_summary matrix_allele_counts <- matrix_allele_counts[common_old, ] meta_old <- data.frame(cell_old, cell_new) row.names(meta_old) <- cell_old meta_old <- meta_old[common_old, ] new_small <- meta_old[, 2] row.names(matrix_allele_counts) <- new_small
donor_all <- rep(0, length(row.names(matrix_allele_counts))) donor_all[grep("Donor1_", row.names(matrix_allele_counts))] <- "Donor_1" donor_all[grep("Donor2_", row.names(matrix_allele_counts))] <- "Donor_2" donor_1_2 <- get_heteroplasmy(matrix_allele_counts, name_position_allele, name_position, number_reads = 50, number_positions = 2000, filtering = 2, donor_all)
sum_matrix <- donor_1_2[[1]] sum_matrix_qc <- donor_1_2[[2]] heteroplasmy_matrix_ci <- donor_1_2[[3]] allele_matrix_ci <- donor_1_2[[4]] cluster_ci <- donor_all[row.names(sum_matrix)%in%row.names(sum_matrix_qc)] index_ci <- donor_1_2[[5]]
name_position_allele_qc <- name_position_allele[name_position%in%colnames(sum_matrix_qc)] name_position_qc <- name_position[name_position%in%colnames(sum_matrix_qc)]
relevant_bases <- filter_bases(heteroplasmy_matrix_ci, min_heteroplasmy = 0.01, min_cells = 50, index_ci)
p_value_wilcox_test_1 <- get_wilcox_test(heteroplasmy_matrix_ci[, relevant_bases], cluster_ci, "Donor_1", "Donor_2" , index_ci)
p_value_wilcox_test_sort_1 <- sort(p_value_wilcox_test_1, decreasing = F) p_value_wilcox_test_sort_small_1 <- p_value_wilcox_test_sort_1[p_value_wilcox_test_sort_1<0.05] p_value_wilcox_test_sort_top <- p_value_wilcox_test_sort_small_1[1:5]
q <- list() for ( i in 1:length(p_value_wilcox_test_sort_top[1:2])){ p <- plot_heteroplasmy(names(p_value_wilcox_test_sort_top)[i], heteroplasmy_matrix_ci, cluster_ci, index_ci)+ggplot2::ggtitle(paste(names(p_value_wilcox_test_sort_top)[i], round(p_value_wilcox_test_sort_small_1[i], 4), sep = "-")) q <- list(q, p) } q q <- list() for ( i in names(p_value_wilcox_test_sort_top[1:2])){ p <- plot_allele_frequency(i, heteroplasmy_matrix_ci, allele_matrix_ci, cluster_ci, name_position_qc, name_position_allele_qc, 5, index_ci) q <- list(q, p) } q
heteroplasmy_matrix_ci_small <- heteroplasmy_matrix_ci[, relevant_bases] allele_matrix_ci_small <- allele_matrix_ci[, name_position_qc%in%relevant_bases]
result_clustering_sc <- clustering_angular_distance(heteroplasmy_matrix_ci_small, allele_matrix_ci_small, cluster_ci, length(colnames(heteroplasmy_matrix_ci_small)), deepSplit_param = 0, minClusterSize_param = 100, 0.2, min_value = 0.001, index = index_ci, relevant_bases = NULL) old_new_classification <- result_clustering_sc[[1]] dist_matrix_sc <- result_clustering_sc[[2]] top_dist <- result_clustering_sc[[3]] common_idx <- result_clustering_sc[[4]] old_classification <- as.vector(old_new_classification[, 1]) new_classification <- as.vector(old_new_classification[, 2])
The unsupervised cluster analysis divides cells in two groups that perfectly coincide with donor 1 and donor 2
plot_heatmap(new_classification, old_classification, (dist_matrix_sc), cluster_columns = F, cluster_rows = F, "Euclidean distance")
The final number of clusters obtained does not change with the number of top bases used (determined with the parameter min_value in the function choose_features_clustering. The number in the clustree plot (1, 2, 3, 4) refers to the index of the vector min_frac. So in this case 1 refers to 0.1, 2 to 0.01 and so on.
min_frac <- c(0.1, 0.01, 0.001, 0.0001) choose_features_clustering(heteroplasmy_matrix_ci_small, allele_matrix_ci_small, cluster_ci, top_pos = length(colnames(heteroplasmy_matrix_ci_small)), deepSplit_param = 0, minClusterSize_param = 100, min_frac, 0.2, index = index_ci)
In the second part of the vignette it is shown the analysis performed on the dataset with GEO accession number GSE115214 Ludwig et al, 2019 used in fig. 5. Primary human cells( CD34+ hematopoietic stem and progenitor cells HSPCs) from two independent donors were processed with single cell RNA-seq (Smart-seq2). Below we focus only on some colonies from donor 1.
donor_1 <- donor_all[donor_all == "Donor_1"]
colony_1 <- rep(0, length(seq(101, 135))) for (i in 1:length(seq(101, 135))){ colony_1[i] <- paste0("C", seq(101, 135)[i], collapse = "") } colony_1_all <- rep(0, length(row.names(matrix_allele_counts))) for (i in colony_1 ){ colony_1_all[grep(i, row.names(matrix_allele_counts))] <- i } cluster_ci <- colony_1_all[grep("Donor1_", row.names(matrix_allele_counts))] only_col <- c("C101", "C103", "C107", "C109", "C112", "C114", "C116", "C118", "C120", "C122", "C124", "C132", "C135")
matrix_allele_counts_1 <- matrix_allele_counts[grep("Donor1_", row.names(matrix_allele_counts)), ] donor_1 <- get_heteroplasmy(matrix_allele_counts_1[cluster_ci%in%only_col, ], name_position_allele, name_position, number_reads = 50, number_positions = 2000, filtering = 2, donor_1[cluster_ci%in%only_col])
sum_matrix <- donor_1[[1]] sum_matrix_qc <- donor_1[[2]] heteroplasmy_matrix_ci <- donor_1[[3]] allele_matrix_ci <- donor_1[[4]] cluster_ci <- cluster_ci[cluster_ci%in%only_col] cluster_ci <- cluster_ci[row.names(sum_matrix)%in%row.names(sum_matrix_qc)] index_ci <- donor_1[[5]]
name_position_allele_qc <- name_position_allele[name_position%in%colnames(sum_matrix_qc)] name_position_qc <- name_position[name_position%in%colnames(sum_matrix_qc)]
relevant_bases <- filter_bases(heteroplasmy_matrix_ci, min_heteroplasmy = 0.01, min_cells = 10, index_ci)
heteroplasmy_matrix_ci_small <- heteroplasmy_matrix_ci[, relevant_bases] allele_matrix_ci_small <- allele_matrix_ci[, name_position_qc%in%relevant_bases]
result_clustering_sc <- clustering_angular_distance(heteroplasmy_matrix_ci_small, allele_matrix_ci_small, cluster_ci, length(row.names(heteroplasmy_matrix_ci_small)), deepSplit_param = 0, minClusterSize_param = 10, 0.2, min_value = 0.001, index = index_ci, relevant_bases = NULL) old_new_classification <- result_clustering_sc[[1]] dist_matrix_sc <- result_clustering_sc[[2]] top_dist <- result_clustering_sc[[3]] common_idx <- result_clustering_sc[[4]] old_classification <- as.vector(old_new_classification[, 1]) new_classification <- as.vector(old_new_classification[, 2])
Comparison between the ground truth and the new partition obtained with unsupervised cluster analysis
plot_heatmap(new_classification, old_classification, (dist_matrix_sc), cluster_columns = F, cluster_rows = F, "Euclidean distance")
Below the top 4 bases selected for the unsupervised cluster analysis.
q <- list() for ( i in 1:length(top_dist[1:4])){ p <- plot_heteroplasmy(top_dist[i], heteroplasmy_matrix_ci, cluster_ci, index_ci) q <- list(q, p) } q
utils::sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.