Poonia Sarita 2022-09-01
unCTC employs pathway-based unsupervised clustering of single-cell RNA-Seq data to identify circulating tumour cells (CTCs) among white blood cells (WBCs). It accepts a list of raw Countdata/TPM single-cell RNA-Seq expression matrices. In the expression matrix, genes must be aligned in rows, while cells must be in columns. unCTC integrates all matrices based on common genes, removes low-expression genes and cells, eliminates batch effects, and normalises the integrated matrix. For unsupervised clustering of circulating tumour cells (CTCs) and white blood cells (WBCs), a normalised matrix is translated to pathway space and deep dictionary learning with k-means clustering is implemented. unCTC also computes copy number variations (CNVs) , revealing the frequency of CNVs and the position of the p/q arm variation. Using Stouffer’s Z-score, unCTC enables the detection of numerous canonical markers indicating malignant/epithelial/immune origins (Stouffer et al., 1949). The expression of other canonical markers confirms the lineage of circulating tumour cells (CTCs).
You can install the released version of unCTC from ….
#First, you need to install the devtools package.
install.packages("devtools")
#Load the devtools package.
library(devtools)
#Install the unCTC package
install_github("SaritaPoonia/unCTC")
#Load unCTC
library(unCTC)
library(unCTC)
library(pheatmap)
library(viridis)
library(ggplot2)
library(cowplot)
unCTC requires: * List of expression data matrices. * List of expression data matrices’ name in the same order. * Gene list: List of specific genes or marker genes * Genesets: list of pathways * A gene/chromosome positions file
Two data matrices 1) Poonia_et_al._TPMData and 2) Ding_et_al._WBC1_TPMData, and their meta data are given with this package.
Poonia_et_al._TPMData = unCTC::Poonia_et_al._TPMData
Ding_et_al._WBC1_TPMData = unCTC::Ding_et_al._WBC1_TPMData
Poonia_et_al._metaData = unCTC::Poonia_et_al._metaData
Ding_et_al._WBC1_metaData = unCTC::Ding_et_al._WBC1_metaData
Here we are using two other dataset Ding_et_al._WBC2_TPMData and Ebright_et_al._TPMData.
load("/path/of/downloaded/folder/Ebright_et_al._TPMData.RData")
Ebright_et_al._TPMData = Ebright_et_al._TPMData
load("/path/of/downloaded/folder/Ding_et_al._WBC2_TPMData.RData")
Ding_et_al._WBC2_TPMData = Ding_et_al._WBC2_TPMData
load("/path/of/downloaded/folder/Ebright_et_al._metaData.RData")
Ebright_et_al._metaData = Ebright_et_al._metaData
load("/path/of/downloaded/folder/Ding_et_al._WBC2_metaData.RData")
Ding_et_al._WBC2_metaData = Ding_et_al._WBC2_metaData
This package includes one geneset, which is taken from molecular signature database.
c2.all.v7.2.symbols = unCTC::c2.all.v7.2.symbols
#Create Expression data list
dataList = list(Poonia_et_al._TPMData,Ebright_et_al._TPMData,
Ding_et_al._WBC1_TPMData,Ding_et_al._WBC2_TPMData)
#Create Data Id's list
dataId = list("Poonia_et_al._TPMData","Ebright_et_al._TPMData",
"Ding_et_al._WBC1_TPMData","Ding_et_al._WBC2_TPMData")
#Create Meta data list
MetaData = list(Poonia_et_al._metaData, Ebright_et_al._metaData,
Ding_et_al._WBC1_metaData, Ding_et_al._WBC2_metaData )
#Genesets given with this package
genesets = c2.all.v7.2.symbols
PathwayEnrichmentScore uses the following steps:
PathwayEnrichmentScore requires following inputs: * data_list: List of expression data matrices * data_id: List of expression data matrices’ name in the same order. * Genesets: List of pathways * min.size: Minimum size of genes in pathways/Genesets, Default is 10 * max.size: Maximum size of genes in pathways/Genesets, Default is 500 * min_Sample: filter out genes which are not expressedin at least min_Sample cells, Default is 5. * min_Gene: Filter out those cells which do not express at least min_Gene genes, Default is 1500. * Parallel_threads : Number of threads in parallel to execute process
#Call PathwayEnrichmentScore
Pathway_score = unCTC::PathwayEnrichmentScore(data_list =dataList,
data_id = dataId,
Genesets = genesets,
min.size=10,
max.size = 500,
min_Sample = 5,
min_Gene = 1500,
Parallel_threads=8L)
For the above pathway enrichment score matrix, we calculate the number of clusters using the Elbow method.
library(factoextra)
library(NbClust)
fviz_nbclust(Pathway_score$Pathway_score, kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2)+
labs(subtitle = "Elbow method")
DDLk_Clust need the following inputs
# DDLK_Clust use python environment so set python environment before running DDLK_Clust()
# Set Python3 Path: Example: Sys.setenv(RETICULATE_PYTHON = "/usr/bin/python3")
Sys.setenv(RETICULATE_PYTHON = "/path/to/python3")
library(reticulate)
#Retrive information about the version of python being used by reticulate
reticulate::py_config()
#If version is different from the the given path then restart session and
#give path again can change path
DDLK_Clusters = unCTC::DDLK_Clust(PathwayScore = Pathway_score$Pathway_score,
PathwayMetaData = Pathway_score$Pathway_metadata,
n = 4,
out.dir = getwd(),
MetaData = MetaData
)
unCTC_plots Plots principal components of pathway enrichment score.
Required input for unCTC_plots method is:
plots = unCTC::unCTC_plots(Pathway_score = DDLK_Clusters$Pathway_score,
Pathway_metadata = DDLK_Clusters$PathwayDDLK_clust,
colorby = "Data_id",
Color_cluster = "Clusters",
pairsplotLegend = "none")
plot_grid(plots$group_by_Class_PCA,plots$group_by_Cluster_PCA)
library(ggplot2)
ggplot(DDLK_Clusters$PathwayDDLK_clust, aes(x=Clusters, fill = Cell_type))+
theme_classic()+
geom_bar(stat="count")+
scale_color_manual()+
scale_fill_manual(
values = c("deepskyblue3","darkred",
"darkgreen","dark turquoise"))
Provide differential genes between given groups.
Differential_genes require following inputs:
Diff_matrix = unCTC::Differential_genes(data_list=dataList,
min_Sample = 5,
min_Gene = 1500,
DDLK_Clusters,
Genesets = genesets,
data_id=dataId,
data_type = "Normalised",
DifferentiateBy = "Clusters",
up_gene_number = 200)
library(pheatmap)
library(viridis)
annotation = Diff_matrix$annotations
annotation$Data_id <- NULL
annotation$GroupID <- NULL
annotation$Cell_type <- NULL
ann = annotation[,c("HormoneStatus","Class","Clusters")]
pheatmap(t(scale(t(Diff_matrix$DiffMat))),cluster_cols = FALSE,
show_colnames = FALSE,cluster_rows = FALSE, show_rownames = FALSE,
color = viridis(1000),annotation = ann)
Provide differential pathways between given groups.
Differential_pathways require following inputs:
Diff_path = unCTC::Differential_pathways(Pathway_score,
DDLK_Clusters = DDLK_Clusters,
DifferentiateBy = "Clusters",
up_pathways_number = 100
)
annotation = Diff_path$annotations
annotation$Data_id <- NULL
annotation$GroupID <- NULL
annotation$Cell_type <- NULL
ann = annotation[,c("HormoneStatus","Class","Clusters")]
Specific_pathways = c("BIOCARTA_THELPER_PATHWAY","BIOCARTA_TCYTOTOXIC_PATHWAY","BIOCARTA_CTL_PATHWAY","BIOCARTA_IL17_PATHWAY","BIOCARTA_CTLA4_PATHWAY","LEE_DIFFERENTIATING_T_LYMPHOCYTE","BIOCARTA_MONOCYTE_PATHWAY",
"ZHENG_FOXP3_TARGETS_IN_T_LYMPHOCYTE_DN","SPIELMAN_LYMPHOBLAST_EUROPEAN_VS_ASIAN_2FC_DN","GUTIERREZ_CHRONIC_LYMPHOCYTIC_LEUKEMIA_UP","GOERING_BLOOD_HDL_CHOLESTEROL_QTL_CIS",
"FARMER_BREAST_CANCER_CLUSTER_6","TURASHVILI_BREAST_NORMAL_DUCTAL_VS_LOBULAR_UP","YANG_BREAST_CANCER_ESR1_BULK_UP","NIKOLSKY_BREAST_CANCER_19Q13.1_AMPLICON",
"GINESTIER_BREAST_CANCER_ZNF217_AMPLIFIED_UP","HOLLERN_SOLID_NODULAR_BREAST_TUMOR_UP","FINETTI_BREAST_CANCER_KINOME_RED","NIKOLSKY_BREAST_CANCER_7Q21_Q22_AMPLICON",
"YANG_BREAST_CANCER_ESR1_UP","GINESTIER_BREAST_CANCER_ZNF217_AMPLIFIED_UP","WP_MAMMARY_GLAND_DEVELOPMENT_PATHWAY_INVOLUTION_STAGE_4_OF_4","MACLACHLAN_BRCA1_TARGETS_UP",
"REACTOME_ERBB2_ACTIVATES_PTK6_SIGNALING","REACTOME_PI3K_EVENTS_IN_ERBB2_SIGNALING","REACTOME_PI3K_EVENTS_IN_ERBB4_SIGNALING","REACTOME_SHC1_EVENTS_IN_ERBB2_SIGNALING","REACTOME_GRB2_EVENTS_IN_ERBB2_SIGNALING")
mat = Diff_path$DiffMatpathway[Specific_pathways,]
pheatmap(t(scale(t(mat))),cluster_cols = FALSE,
show_colnames = FALSE,cluster_rows = FALSE, show_rownames = TRUE, fontsize_row =7,fontsize = 7,
color = viridis(1000),annotation = ann)
Stouffer_score method uses the following steps:
The followings input are required to calculate Stouffer’s score:
With this package, we have given two types of gene list:
Breast_elevated_genes = unCTC::Breast_elevated_genes
Immune_signature_genes = unCTC::Blood_specific_gene
#Calculate Stouffer's score for Blood gene
S_WBC = unCTC::Stouffer_score(data_list = dataList,
min_Sample = 5,
min_Gene = 1500,
gene_list =Immune_signature_genes,
data_id = dataId,
Groupby = "Clusters",
DDLKCluster_data = DDLK_Clusters)
#Calculate Stouffer's score for Breast elevated genes
S_Breast = unCTC::Stouffer_score(data_list = dataList,
min_Sample = 5,
min_Gene = 1500,
gene_list = Breast_elevated_genes,
data_id = dataId,
Groupby = "Clusters",
DDLKCluster_data = DDLK_Clusters)
For better colour visualization we are using following color key:
library(ggplot2)
library(ggpubr)
ColorKey = c("darkred","deepskyblue3","darkolivegreen4",
"dark turquoise","pale violet red",
"steelblue","forestgreen","gray2",
"gray50","hotpink","lightslateblue",
"tan4","yellow3","sienna4","orchid4")
ggplot(S_WBC$Stouffer_score,aes(x=Clusters,y= Stouffer_score,fill=Clusters))+
geom_boxplot(outlier.shape = NA) + theme_classic() +
scale_fill_manual(values = ColorKey) +
ggtitle("Immune gene signature")+
stat_compare_means(comparisons = S_WBC$comparisons,
label = "p.signif", method = "t.test",ref.group = ".all.")
ggplot(S_Breast$Stouffer_score,aes(x=Clusters,y= Stouffer_score,fill=Clusters))+
geom_boxplot(outlier.shape = NA) + theme_classic() +
scale_fill_manual(values = ColorKey)+
ggtitle("Breast elevated gene signature")+
stat_compare_means(comparisons = S_Breast$comparisons,
label = "p.signif", method = "t.test",ref.group = ".all.")
inferCNV R package is used for analysing copy number variation for raw Count/TPM data. Along with all analysis of inferCNV, unCTC::CNV_alterations calculate addition and deletion position in p and q arms in test/cancerous/ diseased data as compared to reference/normal/healthy. To calculate p and q arm location from inferCNV events, we used GRCh37 cytoband information.
CNV_alterations require the following inputs:
gencode_v19_gene_pos =unCTC::gencode_v19_gene_pos
ref_data = unCTC::Poonia_et_al._PBMC_CountData
ref_metadata = unCTC::Poonia_et_al._PBMC_metaData
obs_data = unCTC::Poonia_et_al._CountData
obs_metadata = unCTC::Poonia_et_al._metaData
dataList1 = list(ref_data,obs_data)
dataId1 = list("WBC","CTC")
MetaData1= list(ref_metadata,obs_metadata)
CNV_Alterations1 = unCTC::CNV_alterations(
data_list= dataList1,
data_id= dataId1,
min_Sample = 5,
min_Gene = 1500,
path= getwd(),
GenePositionFile= gencode_v19_gene_pos,
threads_no=8,
MetaData = MetaData1,
Groupby = "GroupID",
Reference_name = c("WBC"),
obs.title ="Observations",
ref.title = "References",
cluster_by_groups = TRUE,
out.Filename = "inferCNV"
)
#> [1] "HMM state (1 = 2 copies loss, 2 = 1 copy loss, 3 = neutral,4 = 1 copy gain, 5 = 2 copies gain, 6 = 3+ copies gain),"
head(CNV_Alterations1)
load("/home/saritap/unCTC_datasets/GSE181279_Countdata.RData")
ref_data = GSE181279_Countdata
load("/home/saritap/unCTC_datasets/Ebright_et_al._CountData.RData")
obs_data = Ebright_et_al._CountData
dataList = list(ref_data,obs_data)
dataId = list("WBC","CTC")
CNV_Alterations2 = unCTC::CNV_alterations(
data_list= dataList,
data_id= dataId,
min_Sample = 5,
min_Gene = 1500,
path= "/home/saritap/u1",
GenePositionFile= gencode_v19_gene_pos,
threads_no=8,
Groupby = "Data_id",
Reference_name = c("WBC"), # WBC data as reference
obs.title ="Observations",
ref.title = "References",
out.Filename = "inferCNV"
)
#> [1] "HMM state (1 = 2 copies loss, 2 = 1 copy loss, 3 = neutral,4 = 1 copy gain, 5 = 2 copies gain, 6 = 3+ copies gain),"
head(CNV_Alterations2)
Give violin plot for a given Canonical marker expression.
Gene_Violin_plots require input: * data_list: List of expression data matrices * data_id: List of expression data matrices name in the same order. * min_Sample: filter out genes which are not expressedin at least min_Sample cells, Default is 5. * min_Gene: Filter out those cells which do not express at least min_Gene genes, Default is 1500. * gene_symbol: Specific gene for which we want to see expression. * MetaData: Optional, list of metadata of expression matrices. If given then columns of all metadata in the list must be identical. * Groupby: Any column name from MetaData, which we want to use to see differential expression of the gene. Default is “data_id”.
# Gene Violin plot
PTPRC = Gene_Violin_plots(data_list =dataList,
data_id = dataId,
min_Sample = 5,
min_Gene = 1500,
gene_symbol = "PTPRC",
DDLKCluster_data = DDLK_Clusters$PathwayDDLK_clust,
Groupby = "Clusters")
NKG7 = Gene_Violin_plots(data_list =dataList,
data_id = dataId,
min_Sample = 5,
min_Gene = 1500,
gene_symbol = "NKG7",
DDLKCluster_data = DDLK_Clusters$PathwayDDLK_clust,
Groupby = "Clusters")
EPCAM = Gene_Violin_plots(data_list =dataList,
data_id = dataId,
min_Sample = 5,
min_Gene = 1500,
gene_symbol = "EPCAM",
DDLKCluster_data = DDLK_Clusters$PathwayDDLK_clust,
Groupby = "Clusters")
KRT18 = Gene_Violin_plots(data_list =dataList,
data_id = dataId,
min_Sample = 5,
min_Gene = 1500,
gene_symbol = "KRT18",
DDLKCluster_data = DDLK_Clusters$PathwayDDLK_clust,
Groupby = "Clusters")
library(cowplot)
plot_grid(PTPRC$Violin_plot,NKG7$Violin_plot,
EPCAM$Violin_plot,KRT18$Violin_plot)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.