# This code doing:
# 1) Extract SNNGraph clusters (by scran) of the whole 7k cells or a subset of 1362 cells focusing on HEP processes (i.e., cells of C3, C13, C10, C6, C15, C7).
# For the 7k cells, extract the soft-thresholding clusters with or without the transition cells (by QUANTC),
## consensus clusters with two parameters (by SC3),
## and Leiden clusters with 4 resolution settings (by Seurat4).
# 2) Save these cluster IDs into the SingleCellExperimentobject.
## and plot umap for each clustering methods
#
# last update 1/31/2022
# by Holly yang
setwd('F:/projects/scRNA/results/GSE87038_gastrulation/uncorrected/')
subDir = 'E8.25_mesoderm_robustness'
# normalized data were downloaded and reanalyzed
# refer to GSE87038_E8.25_normalize.R
library(dplyr)
library(scater)
library(scran)
library(SC3)
library(Seurat) #Seurat version 4.0.6
#library(leiden)
#library(monocle3)
parameters = list()
parameters$k = 10 # An Integer number of nearest neighbors to use when creating the k nearest neighbor graph for Louvain/Leiden/SNNGraph clustering.
######################################################
## 0) load the R object ##
## refer to 2_QuanTC_simulation_cluster_notused.R ##
## focusing on 1.3k cells analyzed by QuanTC ##
## refer to SC3_GSE87038_E8.25_HEP.R ##
######################################################
load('E8.25_mesoderm/sce_E8.25_uncorrected.RData')
sce
# class: SingleCellExperiment
# dim: 10938 7240
# metadata(0):
# assays(2): counts logcounts
# rownames(10938): Sox17 Mrpl15 ... AC149090.1 CAAA01147332.1
# rowData names(2): ENSEMBL SYMBOL
# colnames(7240): cell_63220 cell_63221 ... cell_95723 cell_95725
# colData names(19): cell barcode ... batch label
# reducedDimNames(5): pca.corrected umap TSNE UMAP force
# altExpNames(0):
#
sce <- sce[,which(colLabels(sce) %in% c(3, 13, 10,6,15, 7))]
sce
# class: SingleCellExperiment
# dim: 10938 1362
# metadata(0):
# assays(2): counts logcounts
# rownames(10938): Sox17 Mrpl15 ... AC149090.1 CAAA01147332.1
# rowData names(2): ENSEMBL SYMBOL
# colnames(1362): cell_63240 cell_63242 ... cell_95715 cell_95717
# colData names(19): cell barcode ... batch label
# reducedDimNames(5): pca.corrected umap TSNE UMAP force
# altExpNames(0):
colData(sce)$label <- factor(sce$label, levels=c('3', '13', '10','6','15', '7'))
################################################################
# 1.1) # extract SNNGraph clusters
# The parameter k indicates the number of nearest neighbors to consider during graph construction, whicc
# we set to 5 for small number of cells (e.g., <500) and 10 to large number of sequenced cells (e.g., 4k).
# In this example, 'pca.corrected' is the author published PCA estimation, based on which
# we have clusered all 7240 cells into 19 clusters using SNNGraph method that considers 10 nearest neighbors during graph construction.
# Here, to compare different clustering methods, we focused on the downsampled 1363 cells of the original 6 clusters,
# For simplification, we did NOT rerun the SNNGraph method while also considering 10 nearest neighbors during graph construction.
##################################################################
# # set.seed(0010101)
# # dec.pois <- modelGeneVarByPoisson(sce, block=sce$batch)
# # save(dec.pois, file='../hvg.dev.RData' )
# # load('../hvg.dev.RData')
# # dim(dec.pois)
# # #[1] 29452 7
#
# # k: An integer scalar specifying the number of nearest neighbors to consider during graph construction.
# SNNGraph.ID <- scran::buildSNNGraph(sce, k= parameters$k, use.dimred = 'pca.corrected')
# SNNGraph.ID <- igraph::cluster_walktrap(SNNGraph.ID)$membership
#
# # check the agreeement between new and original clusters using the SNNGraph method
# table(as.vector(colLabels(sce)), SNNGraph.ID)
# # SNNGraph.ID
# # 1 2 3 4 5 6 7 8
# # 10 0 0 134 144 0 0 0 0
# # 13 0 222 0 1 0 0 0 0
# # 15 0 0 0 2 58 0 0 0
# # 3 0 0 0 0 0 276 105 0
# # 6 0 0 174 0 0 0 0 109
# # 7 137 0 0 0 0 0 0 0
#
# colData(sce)$C_SNNGraph = SNNGraph.ID
#save(sce, file=paste0(subDir,'/sce_E8.25_HEP.RData'), compress=TRUE) # !!!!!!!!!!!!!!!!!
################################################################
# 1.2.1 ) # extract the QUANTC-assigned 4 clusters for 1362 cells
# refer to QuanTC_soft.thresholding_clusters.m
##################################################################
C_TC <- read.table('F:/projects/scRNA/results/GSE87038_gastrulation/QuanTC_Output/QuanTC_HEP/k4/C_TC.txt')
C_TC <- C_TC[,1]
index_TC <- read.table('F:/projects/scRNA/results/GSE87038_gastrulation/QuanTC_Output/QuanTC_HEP/k4/index_TC.txt')
index_TC <- index_TC[,1]
unique(C_TC[index_TC]) # 5 verified the C_TC is the cluster ID generated by QuanTC
colData(sce)$C_Soft_k4 <- C_TC
table(sce$label, sce$C_Soft_k4)
# 1 2 3 4 5 # rename to be same as that in the SFig !
# 3 0 379 0 0 2
# 13 1 0 218 0 4
# 10 241 0 22 0 15
# 6 272 0 0 0 11
# 15 3 0 0 24 33
# 7 0 0 0 137 0
## replace the QuanTC.cluster IDs
tmp <- data.frame(colData(sce))
tmp <- tmp %>% mutate(C_Soft_k4 = replace(C_Soft_k4, C_Soft_k4 == 1, 'C3'))
tmp <- tmp %>% mutate(C_Soft_k4 = replace(C_Soft_k4, C_Soft_k4 == 2, 'C4'))
tmp <- tmp %>% mutate(C_Soft_k4 = replace(C_Soft_k4, C_Soft_k4 == 3, 'C1'))
tmp <- tmp %>% mutate(C_Soft_k4 = replace(C_Soft_k4, C_Soft_k4 == 4, 'C2'))
tmp <- tmp %>% mutate(C_Soft_k4 = replace(C_Soft_k4, C_Soft_k4 == 5, 'TC'))
colData(sce) = DataFrame(tmp)
################################################################
# 1.2.2 ) # extract the QUANTC-assigned 6 clusters for 1362 cells
# refer to QuanTC_soft.thresholding_clusters.m
##################################################################
C_TC <- read.table('F:/projects/scRNA/results/GSE87038_gastrulation/QuanTC_Output/QuanTC_HEP/k6/C_TC.txt')
C_TC <- C_TC[,1]
index_TC <- read.table('F:/projects/scRNA/results/GSE87038_gastrulation/QuanTC_Output/QuanTC_HEP/k6/index_TC.txt')
index_TC <- index_TC[,1]
unique(C_TC[index_TC]) # 7 verified the C_TC is the cluster ID generated by QuanTC
colData(sce)$C_Soft_k6 <- C_TC
table(sce$label, sce$C_Soft_k6)
# 1 2 3 4 5 6 7
# 3 0 254 0 0 0 111 16
# 13 0 0 216 0 0 0 7
# 10 160 0 19 19 0 0 80
# 6 21 0 0 153 0 0 109
# 15 0 0 0 2 24 0 34
# 7 0 0 0 0 137 0 0
## replace the QuanTC.cluster IDs
tmp <- data.frame(colData(sce))
tmp <- tmp %>% mutate(C_Soft_k6 = replace(C_Soft_k6, C_Soft_k6 == 1, 'C5'))
tmp <- tmp %>% mutate(C_Soft_k6 = replace(C_Soft_k6, C_Soft_k6 == 2, 'C4'))
tmp <- tmp %>% mutate(C_Soft_k6 = replace(C_Soft_k6, C_Soft_k6 == 3, 'C3'))
tmp <- tmp %>% mutate(C_Soft_k6 = replace(C_Soft_k6, C_Soft_k6 == 4, 'C6'))
tmp <- tmp %>% mutate(C_Soft_k6 = replace(C_Soft_k6, C_Soft_k6 == 5, 'C1'))
tmp <- tmp %>% mutate(C_Soft_k6 = replace(C_Soft_k6, C_Soft_k6 == 6, 'C2'))
tmp <- tmp %>% mutate(C_Soft_k6 = replace(C_Soft_k6, C_Soft_k6 == 7, 'TC'))
colData(sce) = DataFrame(tmp)
#save(sce, file=paste0(subDir,'/sce_E8.25_HEP.RData'), compress=TRUE) # !!!!!!!!!!!!!!!!!
################################################################
# 1.3) # extract consensus clusters
# refer to http://127.0.0.1:11637/library/SC3/doc/SC3.html
# needs to customize ks accordingily per dataset, the larger range the longer running time
# then, manually pick the optimal matches to follow up,
# In this case, ks=4,5,6,7,8,9 are tested,
# and for the related soft-thresholding clustering method (QuanTC),
# we had take average of the Consensus.Cluster-agreeable clustering results of k=5,6,9,10 to get cell-cell similarity matrix M
##################################################################
# # BEGIN NOT repeat, runs 1 hour !!!!
# refer to F:\projects\BioTIP\source\GSE87038_gastrulation\SC3_GSE87038_E8.25_HEP.R
# define feature names in feature_symbol column
sce.sc3 = sce
rowData(sce.sc3)$feature_symbol <- rownames(sce.sc3)
# remove features with duplicated names
any(duplicated(rowData(sce.sc3)$feature_symbol)) # F
range(counts(sce.sc3))
# [1] 0 515
### to run SC3 successfully, transform sparsematrix to matrix !!!!!!!!!!!!!
counts(sce.sc3) <- as.matrix(counts(sce.sc3))
logcounts(sce.sc3) <- as.matrix(logcounts(sce.sc3))
sum(logcounts(sce.sc3)<1e-16,2)/nrow(logcounts(sce.sc3))>0.95 # TRUE tehrefore no cell being filtered
# NOT repeat, runs 3 hours !!!!
# # biology: boolean parameter, defines whether to compute differentially expressed genes, marker genes and cell outliers.
set.seed(2020)
sce.sc3 <- sc3(sce.sc3, ks = 3:10, biology = FALSE, svm_max = 8000) # default is 5000!!!
# Setting SC3 parameters...
# Your dataset contains more than 2000 cells. Adjusting the nstart parameter of kmeans to 50 for faster performance...
# Calculating distances between the cells...
# Performing transformations and calculating eigenvectors...
# Performing k-means clustering...
# Calculating consensus matrix...
traceback()
# When the sce.sc3 object is prepared for clustering, SC3 can also estimate the optimal number of clusters k in the dataset
# NOT repeat, runs 10 mins !!!!
sce.sc3 <- sc3_estimate_k(sce.sc3)
metadata(sce.sc3)$sc3$k_estimation
# $ k_estimation : num 5
# to save space, transform back matrix to sparse matrix
counts(sce.sc3) <- as(counts(sce.sc3), 'dgCMatrix')
logcounts(sce.sc3) <- as(logcounts(sce.sc3), 'dgCMatrix')
#
# sce <- sce.sc3
# save(sce, file='F:/projects/scRNA/results/GSE87038_gastrulation/QuanTC/QuanTC_HEP/sce_E8.25HEP_uncorrected_SC3.RData') ##!!!!!!!!!!!!!!!!!!!
# gc()
# END DO NOT REPET !!!!!!!!!!!!!
#load('F:/projects/scRNA/results/GSE87038_gastrulation/QuanTC/QuanTC_HEP/sce_E8.25HEP_uncorrected_SC3.RData')
# sce.sc3 <- sce
# load(file=paste0(subDir,'/sce_E8.25_HEP.RData'))
# colData(sce) = colData(sce)[,1:22]
## manually check
table(colData(sce.sc3)$label, colData(sce.sc3)$sc3_6_clusters)[c(3, 13, 10,6,15, 7),]
# 1 2 3 4 5 6
# 3 266 0 1 0 0 114
# 13 0 1 0 222 0 0 good
# 10 0 218 33 27 0 0 good
# 6 0 108 170 5 0 0
# 15 0 1 34 0 25 0
# 7 0 0 0 0 137 0 good
# load(file='sce_E8.25_HEP.RData')
colData(sce)$C_consensus_ks4 = colData(sce.sc3)$sc3_4_clusters
colData(sce)$C_consensus_ks5 = colData(sce.sc3)$sc3_5_clusters
colData(sce)$C_consensus_ks6 = colData(sce.sc3)$sc3_6_clusters
colData(sce)$C_consensus_ks7 = colData(sce.sc3)$sc3_7_clusters
colData(sce)$C_consensus_ks8 = colData(sce.sc3)$sc3_8_clusters
colData(sce)$C_consensus_ks9 = colData(sce.sc3)$sc3_9_clusters
colData(sce)$C_consensus_ks10 = colData(sce.sc3)$sc3__clusters
rm(sce.sc3)
# save(sce, file=paste0(subDir,'/sce_E8.25_HEP.RData'), compress=TRUE) # !!!!!!!!!!!!!!!!!
################################################################
# 1.4.1) # extract Leiden clustering (using Seurat)
# https://satijalab.org/seurat/articles/get_started.html
# Leiden requires the leidenalg python.
# We apply the Seurat packge, by setting the algorithm =4 in the function FindClusters()
# This parameter decides the algorithm for modularity optimization
# (1 = original Louvain algorithm; 2 = Louvain algorithm with multilevel refinement;
# 3 = SLM algorithm; 4 = Leiden algorithm).
# The resolution parameter: use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities.
###################################################################
# convert from SingleCellExperiment
sce.seurat <- as.Seurat(sce)
# Warning: Feature names cannot have underscores ('_'), replacing with dashes ('-')
sce.seurat
# An object of class Seurat
# 10938 features across 1362 samples within 1 assay
# Active assay: originalexp (10938 features, 0 variable features)
# 5 dimensional reductions calculated: pca.corrected, umap, TSNE, UMAP, force
# Computes the k.param nearest neighbors for a given dataset
sce.seurat <- FindNeighbors(sce.seurat, reduction = "pca.corrected", k.param = parameters$k)
sce.seurat <- FindClusters(sce.seurat, resolution = 0.8, algorithm = 4) # smaller number of communities
table(Idents(sce.seurat), colData(sce)$label) # the cluster 2/8/9 are mixture of SNNGraph clusters
# 3 13 10 6 15 7
# 1 239 0 3 0 0 0
# 2 0 0 79 86 0 0
# 3 0 1 152 1 0 0
# 4 3 125 15 0 1 0
# 5 139 0 0 0 0 0
# 6 0 0 0 138 0 0
# 7 0 0 0 0 0 135
# 8 0 94 11 0 1 0
# 9 0 3 18 58 4 0
# 10 0 0 0 0 54 2
colData(sce)$C_Leiden_0.8 = Idents(sce.seurat)
sce.seurat <- FindClusters(sce.seurat, resolution = 0.4, algorithm = 4) # smaller number of communities
table(Idents(sce.seurat), colData(sce)$label) # the cluster 3/8 are mixture of SNNGraph clusters
# 3 13 10 6 15 7
# 1 378 0 3 0 0 0
# 2 0 3 18 198 4 0
# 3 0 0 82 84 0 0
# 4 0 1 150 1 0 0
# 5 3 125 14 0 1 0
# 6 0 0 0 0 0 135
# 7 0 94 11 0 1 0
# 8 0 0 0 0 54 2
colData(sce)$C_Leiden_0.4 = Idents(sce.seurat)
sce.seurat <- FindClusters(sce.seurat, resolution = 1.2, algorithm = 4) # smaller number of communities
table(Idents(sce.seurat), colData(sce)$label) # the cluster 1/5/11 are mixture of SNNGraph clusters
# 3 13 10 6 15 7
# 1 0 0 75 88 0 0
# 2 138 0 0 0 0 0
# 3 0 0 0 136 0 0
# 4 135 0 1 0 0 0
# 5 0 94 12 0 1 0
# 6 0 0 105 1 0 0
# 7 0 0 0 0 0 104
# 8 2 91 9 0 1 0
# 9 101 0 0 0 0 0
# 10 0 0 0 0 54 33
# 11 0 3 18 58 4 0
# 12 0 1 51 0 0 0
# 13 5 34 7 0 0 0
colData(sce)$C_Leiden_1.2 = Idents(sce.seurat)
#save(sce, file=paste0(subDir,'/sce_E8.25_HEP.RData'), compress=TRUE) # !!!!!!!!!!!!!!!!!
# ################################################################
# # 1.4.2) # extract Leiden (default paramters with monocle3) clustering
# # by running parts of the code of timonocle3
# # https://github.com/dynverse/ti_monocle3/blob/master/package/R/ti_monocle3.R
# # https://cole-trapnell-lab.github.io/monocle3/docs/clustering/
# # cluster_method = c("leiden", "louvain")
# # resolution = NULL (Default), the parameter is determined automatically.
# # k: Integer number of nearest neighbors to use when creating the k nearest neighbor graph for Louvain/Leiden clustering.
# ###################################################################
#
# feature_info <- rowData(sce)
# if (!"gene_short_name" %in% colnames(feature_info)) {
# if ("SYMBOL" %in% colnames(feature_info)) {
# colnames(feature_info)[which(colnames(feature_info)=="SYMBOL")] = "gene_short_name"
# } else {
# feature_info$gene_short_name <- rownames(feature_info)
# }
# }
#
# # create cds object
# cds <- monocle3::new_cell_data_set(
# expression_data = logcounts(sce), #exprs(sce),
# cell_metadata = colData(sce),
# gene_metadata = feature_info
# )
#
# # perform data preprocessing, required by clustering analysis in Monocle3
# cds <- monocle3::preprocess_cds(
# cds,
# num_dim = 2,
# norm_method = "none" # because in this case, the data already be log-transformed and normalized
# )
#
# # perform dimensionality reduction
# cds <- monocle3::reduce_dimension(
# cds,
# reduction_method = 'UMAP',
# preprocess_method = 'PCA'
# )
#
# # perform clustering
# cds <- monocle3::cluster_cells(
# cds,
# k = 20, #parameters$k, #for Louvain/Leiden clustering. Default is 20 in timonocle3.
# cluster_method = "leiden"
# )
#
# table(cds@clusters[['UMAP']]$clusters,colData(sce)$label)
# # 3 13 10 6 15 7
# # 1 1 75 41 0 2 83
# # 2 0 1 53 107 0 0
# # 3 0 1 13 106 0 0
# # 4 0 56 13 0 30 6
# # 5 90 0 5 0 0 0
# # 6 92 0 2 0 0 0
# # 7 0 16 4 0 5 48
# # 8 0 46 11 1 15 0
# # 9 4 3 39 22 0 0
# # 10 65 0 1 0 0 0
# # 11 0 4 34 22 0 0
# # 12 54 0 0 0 0 0
# # 13 47 0 0 0 0 0
# # 14 28 6 11 0 0 0
# # 15 0 0 20 15 0 0
# # 16 0 6 15 10 3 0
# # 17 0 9 16 0 5 0
#
#
# colData(sce)$C_monocle = cds@clusters[['UMAP']]$clusters
#
# #monocle3::plot_cells(cds, color_cells_by="cluster", label_groups_by_cluster=TRUE)
####################################################################
## 2) Save these cluster IDs into the SingleCellExperimentobject. ##
## plot different clustering restuls ##
####################################################################
save(sce, file=paste0(subDir,'/sce_E8.25_HEP.RData'), compress=TRUE) # !!!!!!!!!!!!!!!!!
rm(cds)
###########################################################################
## 3) Annotate cell identities by comparing to the original cluster IDs. ##
###########################################################################
load(file=paste0(subDir,'/sce_E8.25_HEP.RData'))
colnames(colData(sce))
# [1] "cell" "barcode" "sample" "pool" "stage"
# [6] "sequencing.batch" "theiler" "doub.density" "doublet" "cluster"
# [11] "cluster.sub" "cluster.stage" "cluster.theiler" "stripped" "celltype"
# [16] "colour" "sizeFactor" "batch" "label" "C_Soft_k4"
# [21] "C_Soft_k6" "C_consensus_ks4" "C_consensus_ks5" "C_consensus_ks6" "C_consensus_ks7"
# [26] "C_consensus_ks8" "C_consensus_ks9" "C_consensus_ks10" "C_Leiden_0.4" "C_Leiden_0.8"
# [31] "C_Leiden_1.2"
x <- grep('C_', colnames(colData(sce)))
(n=length(x)) # 12
pdf(file=paste0(subDir,"/umap.",subDir,"_clustering_methods.pdf"), width=10, height=9)
gridExtra::grid.arrange(
plotReducedDim(sce, dimred='umap', colour_by='label', #add_legend=FALSE,
text_by='label', text_size = 4, text_colour='black', point_size=0.5) +
ggtitle('SNNGraph') + ylim(5,20)
,plotReducedDim(sce, dimred='umap', colour_by='C_Soft_k4', #add_legend=FALSE,
text_by='C_Soft_k4', text_size = 4, text_colour='black', point_size=0.5) +
ggtitle('QuanTC k4') + ylim(5,20)
,plotReducedDim(sce, dimred='umap',colour_by='C_Soft_k6', #add_legend=FALSE,
text_by='C_Soft_k6', text_size = 4, text_colour='black', point_size=0.5) +
ggtitle('QuanTC k6') + ylim(5,20)
,plotReducedDim(sce, dimred='umap',colour_by='C_Leiden_0.4', #add_legend=FALSE,
text_by='C_Leiden_0.4', text_size = 4, text_colour='black', point_size=0.5) +
ggtitle('Leiden_0.4') + ylim(5,20)
,plotReducedDim(sce, dimred='umap',colour_by='C_Leiden_0.8', #add_legend=FALSE,
text_by='C_Leiden_0.8', text_size = 4, text_colour='black', point_size=0.5) +
ggtitle('Leiden_0.8') + ylim(5,20)
,plotReducedDim(sce, dimred='umap',colour_by='C_Leiden_1.2', #add_legend=FALSE,
text_by='C_Leiden_1.2', text_size = 4, text_colour='black', point_size=0.5) +
ggtitle('Leiden_1.2') + ylim(5,20)
,plotReducedDim(sce, dimred='umap',colour_by='C_consensus_ks4', #add_legend=FALSE,
text_by='C_consensus_ks4', text_size = 4, text_colour='black', point_size=0.5) +
ggtitle('consensus_ks4') + ylim(5,20)
,plotReducedDim(sce, dimred='umap',colour_by='C_consensus_ks5', #add_legend=FALSE,
text_by='C_consensus_ks5', text_size = 4, text_colour='black', point_size=0.5) +
ggtitle('consensus_ks5') + ylim(5,20)
,plotReducedDim(sce, dimred='umap',colour_by='C_consensus_ks6', #add_legend=FALSE,
text_by='C_consensus_ks6', text_size = 4, text_colour='black', point_size=0.5) +
ggtitle('consensus_ks6') + ylim(5,20)
,ncol=3
)
gridExtra::grid.arrange(
plotReducedDim(sce, dimred='umap',colour_by='C_consensus_ks7', #add_legend=FALSE,
text_by='C_consensus_ks7', text_size = 4, text_colour='black', point_size=0.5) +
ggtitle('consensus_ks7') + ylim(5,20)
,plotReducedDim(sce, dimred='umap',colour_by='C_consensus_ks8', #add_legend=FALSE,
text_by='C_consensus_ks8', text_size = 4, text_colour='black', point_size=0.5) +
ggtitle('consensus_ks8') + ylim(5,20)
,plotReducedDim(sce, dimred='umap',colour_by='C_consensus_ks9', #add_legend=FALSE,
text_by='C_consensus_ks9', text_size = 4, text_colour='black', point_size=0.5) +
ggtitle('consensus_ks9')+ ylim(5,20)
,plotReducedDim(sce, dimred='umap',colour_by='C_consensus_ks10', #add_legend=FALSE,
text_by='C_consensus_ks10', text_size = 4, text_colour='black', point_size=0.5) +
ggtitle('consensus_ks10') + ylim(5,20)
,ncol=3, nrow=3
)
dev.off()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.