In shanlu01/mlg: MLG Clustering

suppressPackageStartupMessages({
  library(ggplot2)
  library(cowplot)
  library(knitr)
  library(Seurat)
  library(liger)
  library(mlg)
  library(mclust)
})
data("kowalczyk_1", package = "mlg")
theme_set(theme_classic())

Dataset description

Three variables are stored in file kowalczyk_1.rda: gene_expression_matrix, cell_info, cnmf_factor.
"gene_expression_matrix" stores the logTPM of 1058 cells.
"cell_info" stores the true cell type labels (LT-HSC, MPP, ST-HSC) and condition labels (old and young) of the cells.
"cnmf_factor" stores the coordinates from cNMF decomposition. We include cNMF factors here, because cNMF is implemented in python.

Clustering analysis

Perform PCA, tSNE and Louvain clustering with PCA factors

# number of component to be kept for each dimension reduction method
num_component <- 15

# generate Seurat object
seurat.obj <- CreateSeuratObject(counts = gene_expression_matrix, 
                                       project = "seurat.obj", 
                                       assay = "RNA",
                                       meta.data = cell_info)

seurat.obj <- FindVariableFeatures(seurat.obj, selection.method = "vst", nfeatures = 3000, verbose = F)
seurat.obj <- ScaleData(seurat.obj, vars.to.regress = 'condition_label', verbose = F)
seurat.obj <- RunPCA(seurat.obj, npcs=num_component, verbose = F)
seurat.obj <- RunTSNE(seurat.obj, dims = 1:num_component, verbose = F)
seurat.obj <- FindNeighbors(seurat.obj, dims = 1:num_component, verbose = F)
seurat.obj <- FindClusters(seurat.obj, resolution = 0.3, algorithm =1, verbose = F)
tSNE_coord_PCA <- data.frame(seurat.obj@reductions$tsne@cell.embeddings)
# Visualization of the dataset on tSNE coordinates
plot_grid(ggplot(tSNE_coord_PCA, aes(tSNE_1, tSNE_2, color = cell_info$cell_type_label)) +
            geom_point() + labs(color = "cell type") + theme(legend.position = "bottom"),
            ggplot(tSNE_coord_PCA, aes(tSNE_1, tSNE_2, color = seurat.obj$seurat_clusters)) +
            labs(color = "cluster") + theme(legend.position = "bottom") + geom_point())
# The Adjusted Rand Index(ARI) of the Louvain-PCA clustering
mclust::adjustedRandIndex(seurat.obj$seurat_clusters, cell_info$cell_type_label)

Perform Seurat integration, tSNE and Louvain clustering with PCA components based on Seurat integrated data.

# Seurat integration
condition.list <- SplitObject(seurat.obj, split.by = "condition_label")
int.anchors <- FindIntegrationAnchors(object.list = condition.list, anchor.features = 3000)
kowalczyk.integrated <- IntegrateData(anchorset = int.anchors, verbose = F)
DefaultAssay(kowalczyk.integrated) <- "integrated"
# Reorder the cells, so that the cell order in object kowalczyk.integrated is the same as in original dataset.
kowalczyk.integrated@assays$integrated@data <- kowalczyk.integrated@assays$integrated@data[, rownames(cell_info)]
kowalczyk.integrated@meta.data <- kowalczyk.integrated@meta.data[rownames(cell_info), ]
kowalczyk.integrated <- ScaleData(kowalczyk.integrated, verbose = F)
kowalczyk.integrated <- RunPCA(kowalczyk.integrated, npcs = 30, verbose = F)
kowalczyk.integrated <- RunTSNE(kowalczyk.integrated, verbose = F)
kowalczyk.integrated <- FindNeighbors(kowalczyk.integrated,dims = 1:num_component, k.param = 20, verbose = F)
kowalczyk.integrated <- FindClusters(kowalczyk.integrated, resolution = 0.22, algorithm = 1, verbose = F)
tSNE_coord_Seurat <- data.frame(kowalczyk.integrated@reductions$tsne@cell.embeddings)
# Visualization of the dataset on tSNE coordinates
plot_grid(ggplot(tSNE_coord_Seurat, aes(tSNE_1, tSNE_2, color = cell_info$cell_type_label)) +
          geom_point() + labs(color = "cell type") + theme(legend.position = "bottom"),
          ggplot(tSNE_coord_Seurat, aes(tSNE_1, tSNE_2, color = kowalczyk.integrated$seurat_clusters)) +
          labs(color = "cluster") + theme(legend.position = "bottom") + geom_point())
# The Adjusted Rand Index(ARI) of the Seurat clustering
mclust::adjustedRandIndex(kowalczyk.integrated$seurat_clusters, cell_info$cell_type_label)

Perform Liger integration, tSNE and Louvain clustering with Liger factors.

liger.obj <- createLiger(list(young = condition.list[[1]]@assays$RNA@counts, 
                           old = condition.list[[2]]@assays$RNA@counts)) 
liger.obj <- liger::normalize(liger.obj)
liger.obj <- selectGenes(liger.obj, var.thresh = .7281) # keep 3000 genes, check in length(liger.obj@var.genes) 
liger.obj <- scaleNotCenter(liger.obj)

liger.obj <- optimizeALS(liger.obj, k = num_component)

liger.obj <- quantile_norm(liger.obj)
liger.obj <- louvainCluster(liger.obj, resolution = 0.2)

liger.obj <- runTSNE(liger.obj)

## prepare liger factors for MLG clustering
# reordering and scaling of raw liger factors
liger_factor <- rbind(liger.obj@H$young,liger.obj@H$old)
# reorder the cells, so that the cell order are the same as in original data
liger_factor <- liger_factor[rownames(cell_info), ]
# scale the penalized NMF score for each cell, so that they sums to 1
liger_factor <- t(apply(liger_factor, 1, function(i){i/sum(i)}))
##

tSNE_coord_Liger <- data.frame(liger.obj@tsne.coords)
colnames(tSNE_coord_Liger) <- c("tSNE_1", "tSNE_2")
# Visualization of the dataset on tSNE coordinates
plot_grid(ggplot(tSNE_coord_Liger, aes(tSNE_1, tSNE_2, color = cell_info$cell_type_label))+
          geom_point() + labs(color = "cell type") + theme(legend.position = "bottom"),
          ggplot(tSNE_coord_Liger, aes(tSNE_1, tSNE_2, color = liger.obj@clusters)) +
          labs(color = "cluster") + theme(legend.position = "bottom") + geom_point())
# The Adjusted Rand Index(ARI) of the Liger clustering
mclust::adjustedRandIndex(liger.obj@clusters, cell_info$cell_type_label)

Perform cNMF decomposition, tSNE and Louvain clustering with cNMF factors.

cNMF is applied through the code provided by Kotliar et al (2019), available at Github https://github.com/dylkot/cNMF. cNMF takes count matrix as input. Output factors are stored in "GEP usages" files. Since nonnegative matrix factorzation solutions are invariant to scales, cNMF factors need to be scaled to have row-sum 1, i.e. GEP usages for each cell sums to 1.

cnmf_graph <- FindNeighbors(cnmf_factor, verbose = F)
cnmf_cluster <- FindClusters(cnmf_graph$snn, resolution = 0.15, algorithm = 1, verbose = F)[[1]]
tSNE_cNMF <- RunTSNE(cnmf_factor, verbose = F)
tSNE_coord_cNMF <- data.frame(tSNE_cNMF@cell.embeddings)
# Visualization of the dataset on tSNE coordinates
plot_grid(ggplot(tSNE_coord_cNMF, aes(tSNE_1, tSNE_2, color = cell_info$cell_type_label)) +
          geom_point() + labs(color = "cell type") + theme(legend.position = "bottom"),
          ggplot(tSNE_coord_cNMF, aes(tSNE_1, tSNE_2, color = cnmf_cluster)) +
          labs(color = "cluster") + theme(legend.position = "bottom") + geom_point())
# The Adjusted Rand Index(ARI) of the Louvain-cNMF clustering
mclust::adjustedRandIndex(cnmf_cluster, cell_info$cell_type_label)

MLG clustering

factor_list <- list(PCA = seurat.obj@reductions$pca@cell.embeddings[,1:num_component],
                   cNMF = cnmf_factor,
                   Seurat = kowalczyk.integrated@reductions$pca@cell.embeddings[,1:num_component],
                   Liger = liger_factor)
# perform MLG clustering 
MLG <- mlg_cluster(factor.list = factor_list, cluster.resolution = .5)

# The Adjusted Rand Index(ARI) of the MLG clustering
mclust::adjustedRandIndex(MLG, cell_info$cell_type_label)

# Visualization of the overlapping (which characterizes dependences) between different SNN layers.
p <- prop.overlap.edges(factor_list)
p

# Visualization of MLG by force-directed layout.
plot_grid(mlg_visualization(factor.list = factor_list, label=cell_info$cell_type_label, label_title = "cell type"),
          mlg_visualization(factor.list = factor_list, label=MLG, label_title = "MLG cluster"))

# Compute graph signal to noice ratio of the SNN graph constructed from PCA, cNMF, Seurat, Liger factors and MLG.
graph_signal_noise_ratio(factor_list, knn.param = 20, prune.param = 1/5, cell_label = cell_info$cell_type_label)

# Compare clustering ARI of the 5 methods.
clustering_acc <- c(mclust::adjustedRandIndex(seurat.obj$seurat_clusters, cell_info$cell_type_label),
  mclust::adjustedRandIndex(cnmf_cluster, cell_info$cell_type_label),
  mclust::adjustedRandIndex(kowalczyk.integrated$seurat_clusters, cell_info$cell_type_label),
  mclust::adjustedRandIndex(liger.obj@clusters, cell_info$cell_type_label),
  mclust::adjustedRandIndex(MLG, cell_info$cell_type_label))
names(clustering_acc) <- c("PCA", "cNMF", "Seurat", "Liger", "MLG")
print(clustering_acc)