The goal of CrossSCC is to classify Single-Cell data(as expression matrix of scRNA-seq data) Crossing batch into Clusters using Gaussian Mixture Model, and to find out the mapping relationship in clusters.
You can install the latest version of CrossSCC with:
install.packages("remotes")
remotes::install_github("bioinformatist/CrossSCC")
Besides, you also need a database org.HsSimple.eg.db provided by us: First download the latest release tarball, then run:
install.packages("./org.HsSimple.eg.db", repos=NULL)
This dataset is part of GSE81861 from this paper. First, the FPKM matrix file of all cells was downloaded:
aria2c ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE81nnn/GSE81861/suppl/GSE81861_Cell_Line_FPKM.csv.gz
pigz -d GSE81861_Cell_Line_FPKM.csv.gz
Then pre-process dataset in R:
library(data.table)
library(tidyverse)
library(org.Hs.eg.db)
library(usethis)
library(genefilter)
library(Biobase)
cl <- fread('GSE81861_Cell_Line_FPKM.csv')
names(cl)[1] <- 'Gene'
cl[, Ensembl := str_match(Gene, ".+_(.+)\\.\\d+$")[, 2]]
symbols <- mapIds(org.Hs.eg.db, keys = cl$Ensembl, keytype = "ENSEMBL", column="ENTREZID", multiVals = 'first')
cl[, Entrez := symbols[Ensembl]]
cl[, c('Gene', 'Ensembl'):= NULL]
cl <- cbind(cl[, .(Entrez)], cl[, .SD, .SDcols = !(names(cl) %like% "_B2_")])
cl[, var := rowVars(.SD), .SDcols = -c('Entrez')]
cl <- cl[, max.var := max(var), by = 'Entrez'][var != 0 & max.var == var & !is.na(Entrez), ]
cl[, grep("var", colnames(cl)) := NULL]
cl <- as.data.frame(cl) %>% remove_rownames %>% column_to_rownames(var = "Entrez")
cl <- cl[, !(names(cl) %in% 'Entrez')]
cl.7 <- new("ExpressionSet", exprs=as.matrix(cl), annotation = 'org.Hs.eg.db')
# setwd() back to package root directory
use_data(cl.7, compress = 'xz')
From this research article coming with GSE140228:
aria2c ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE140nnn/GSE140228/suppl/GSE140228_read_counts_Smartseq2.csv.gz
pigz -d GSE140228_read_counts_Smartseq2.csv.gz
aria2c ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE140nnn/GSE140228/suppl/GSE140228_cell_info_Smartseq2.tsv.gz
pigz -d GSE140228_cell_info_Smartseq2.tsv.gz
Then pre-process dataset in R:
library(data.table)
library(org.Hs.eg.db)
library(tidyverse)
library(usethis)
rowVars <- function(x, ...) {
rowSums((x - rowMeans(x, ...))^2, ...)/(dim(x)[2] - 1)
}
immu <- fread('GSE140228_read_counts_Smartseq2.csv')
info <- fread('GSE140228_cell_info_Smartseq2.tsv')
setnames(immu, info$Barcode, paste0(info$celltype_sub, '_', info$Barcode))
symbols <- mapIds(org.Hs.eg.db, keys = immu$gene, keytype = "SYMBOL", column="ENTREZID", multiVals = 'first')
immu[, Entrez := symbols[gene]]
immu[, gene := NULL]
immu[, var := rowVars(.SD), .SDcols = -c('Entrez')]
immu <- immu[, max.var := max(var), by = 'Entrez'][var != 0 & max.var == var & !is.na(Entrez), ]
immu[, grep("var", colnames(immu)) := NULL]
immu <- as.data.frame(immu) %>% remove_rownames %>% column_to_rownames(var = "Entrez")
immu <- immu[, !(names(immu) %in% 'Entrez')]
immu <- new("ExpressionSet", exprs=as.matrix(immu), annotation = 'org.Hs.eg.db')
use_data(immu, compress = 'xz')
library(CrossSCC)
data("cl.7")
(handsome.zuo <- CrossSCC(cl.7, ncores = 16, mean.posterior.cutoff = 0.2814, ovl.cutoff = 0.1265, mean.posterior.weight = 1.0000, ovl.weight = 0.3586, lambda.cutoff = 0.9277, verbose = FALSE))
library(data.tree)
handsome.zuo$Get('sampleNames', filterFun = isLeaf, simplify = FALSE)
library(stringr)
cl1 <- factor(str_match(colnames(cl.7), '__(.+?)_')[, 2])
levels(cl1) <- seq_len(7)
cl1 <- as.character(cl1)
cl2 <- unname(lapply(rapply(handsome.zuo$Get('sampleNames', filterFun = isLeaf), enquote, how = 'unlist'), eval))
cl2 <- vapply(colnames(cl.7), function(y) which(vapply(cl2, function(x) y %in% x,
logical(1))), 2333, USE.NAMES = FALSE)
library(clues)
adjustedRand(cl1, cl2, 'Rand')
The parameters have NOT optimized now for this part.
data("immu")
(handsome.zuo <- CrossSCC(immu, ncores = 16, mean.posterior.cutoff = 0.4459, ovl.cutoff = 0.4243, mean.posterior.weight = 0.2988, ovl.weight = 0.3771, lambda.cutoff = 0.6736, verbose = FALSE))
handsome.zuo$Get('sampleNames', filterFun = isLeaf, simplify = FALSE)
cl1 <- sort(factor(str_match(colnames(immu), '(.+?)_.+')[, 2]))
levels(cl1) <- seq_len(40)
cl1 <- as.character(cl1)
cl2 <- unname(lapply(rapply(handsome.zuo$Get('sampleNames', filterFun = isLeaf), enquote, how = 'unlist'), eval))
cl2 <- vapply(colnames(immu), function(y) which(vapply(cl2, function(x) y %in% x,
logical(1))), 2333, USE.NAMES = FALSE)
library(clues)
adjustedRand(cl1, cl2, 'Rand')
plot_CrossSCC(handsome.zuo)
library(rBayesianOptimization)
library(CrossSCC)
library(data.tree)
library(stringr)
library(clues)
data("cl.7")
test.CrossSCC <- function(mean.posterior.cutoff, ovl.cutoff, mean.posterior.weight, ovl.weight, lambda.cutoff) {
tryCatch(
{handsome.zuo <- CrossSCC(cl.7, ncores = 16, verbose = FALSE,
mean.posterior.cutoff = mean.posterior.cutoff,
ovl.cutoff = ovl.cutoff, mean.posterior.weight = mean.posterior.weight,
ovl.weight = ovl.weight, lambda.cutoff = lambda.cutoff)
cl1 <- factor(str_match(colnames(cl.7), '__(.+?)_')[, 2])
levels(cl1) <- seq_len(7)
cl1 <- as.character(cl1)
cl2 <- unname(lapply(rapply(handsome.zuo$Get('sampleNames', filterFun = isLeaf), enquote, how = 'unlist'), eval))
cl2 <- vapply(colnames(cl.7), function(y) which(vapply(cl2, function(x) y %in% x,
logical(1))), 2333, USE.NAMES = FALSE)
list(Score = adjustedRand(cl1, cl2, 'Rand'), Pred = 0)}, error=function(e) list(Score = 0, Pred = 0)
)
}
opt.res <- BayesianOptimization(test.CrossSCC,
bounds = list(mean.posterior.cutoff = c(0, 0.5), ovl.cutoff = c(0, 0.5),
mean.posterior.weight = c(0, 1), ovl.weight = c(0, 1),
lambda.cutoff = c(0.5, 1)),
init_points = 20, n_iter = 100)
library(rBayesianOptimization)
library(CrossSCC)
library(data.tree)
library(stringr)
library(clues)
data("immu")
test.CrossSCC <- function(mean.posterior.cutoff, ovl.cutoff, mean.posterior.weight, ovl.weight, lambda.cutoff) {
tryCatch(
# For one type contains only 11 samples, the minimum group size filter should be turned off
{handsome.zuo <- CrossSCC(immu, ncores = 16, verbose = FALSE,
mean.posterior.cutoff = mean.posterior.cutoff,
ovl.cutoff = ovl.cutoff, mean.posterior.weight = mean.posterior.weight,
ovl.weight = ovl.weight, lambda.cutoff = lambda.cutoff, min.group.size = 11)
# Original sample is disordered
cl1 <- sort(factor(str_match(colnames(immu), '(.+?)_.+')[, 2]))
levels(cl1) <- seq_len(40)
cl1 <- as.character(cl1)
# table(cl1)
cl2 <- unname(lapply(rapply(handsome.zuo$Get('sampleNames', filterFun = isLeaf), enquote, how = 'unlist'), eval))
cl2 <- vapply(colnames(immu), function(y) which(vapply(cl2, function(x) y %in% x,
logical(1))), 2333, USE.NAMES = FALSE)
list(Score = adjustedRand(cl1, cl2, 'Rand'), Pred = 0)}, error=function(e) list(Score = 0, Pred = 0)
)
}
opt.res <- BayesianOptimization(test.CrossSCC,
bounds = list(mean.posterior.cutoff = c(0, 0.5), ovl.cutoff = c(0, 0.5),
mean.posterior.weight = c(0, 1), ovl.weight = c(0, 1),
lambda.cutoff = c(0.5, 1)),
init_points = 20, n_iter = 100)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.