examples/code/hESC_Bargaje2017/1.2_data_matrix_timepoint.R

# R4.0.2
 setwd('F:/projects/BioTIP/result/Bargaje2017_EOMES/timepoint/')

 library(RColorBrewer)
 library(SingleCellExperiment)
 library(org.Mm.eg.db)
 library(scater)
 library(scran)
 library(pheatmap)
 library(limma)
 library(edgeR)
 library(dynamicTreeCut)
 library(cluster)
 library(bluster)
 
 library(MouseGastrulationData)
 head(EmbryoCelltypeColours)
 length(EmbryoCelltypeColours) # 37
 
###################################################
## 1) read the published data matrix of 96 genes ##
###################################################
# Dataset S2: This file contains all necessary information to re-analyze the data. 
# We provide the metadata and log2Ex values for all cells 
# (after removing 38 cells for quality control) 
# (processed as described in Materials and Methods). 
# Phenotypes.FACS = phenotypic state based on flow cytometry measurements (Fig. S1)																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																																											
 dat <-  read.table('F:/projects/BioTIP/ref/development/Bargaje2017PNAS_SData_logExp.txt',sep="\t",header=T)
 cli <- t(data.frame(dat[1:3,]))
 cli <- cli[-1,]
 dim(cli)  #[1] 1896    3
 head(cli)
 #      CollectionTime Consensus Cluster Phenotype.FACS
 # Cell1   "Day0"         "1"               "d0.cKIT-h"   
 # Cell109 "Day0"         "1"               "d0.meso-"    
 # Cell11  "Day0"         "1"               "d0.cKIT-h"   
 # Cell111 "Day0"         "1"               "d0.meso-"    
 # Cell112 "Day0"         "1"               "d0.meso-"    
 # Cell114 "Day0"         "1"               "d0.meso-"
 colnames(cli)[2] <- 'Consensus.Cluster'
 cli <- as.data.frame(cli)
 cli$CollectionTime <- as.factor(cli$CollectionTime)
 
 table(cli[,1])
 # Day0   Day1 Day1.5   Day2 Day2.5   Day3   Day4   Day5 
 # 231    166     93    211    122    258    456    359 
 table(cli[,2])
 #  1  10  11  12  13  14  15  16  17  18   2   3   4   5   6   7   8   9 
 # 77 107 130 299 157  64  70  81  67  77 161  86  86  80  37 173  95  49 
table(cli[,3])
 # d0.cd13+     d0.cKIT-h     d0.cKIT-l     d0.cKIT-m      d0.meso- 
 #   35            30            20            22            39 
 # d0.Tra1-60-   d0.Tra1-60+    d1.5.cd13+   d1.5.cKIT-h   d1.5.cKIT-l 
 # 41            44            17            25             9 
 # d1.5.cKIT-m    d1.5.meso-    d1.5.meso+      d1.cd13+     d1.cKIT-h 
 # 30             7             5            27            21 
 # d1.cKIT-l     d1.cKIT-m      d1.meso-   d1.Tra1-60-   d1.Tra1-60+ 
 #   11            23            30            24            30 
 # d2.5.cd13+   d2.5.cKIT-h   d2.5.cKIT-l   d2.5.cKIT-m    d2.5.meso- 
 #   11            32            17            41            10 
 # d2.5.meso+     d2.cKIT-h     d2.cKIT-l     d2.cKIT-m       d2.KDR- 
 #   11            25            21            26            36 
 # d2.KDR+      d2.meso+      d2.ror2+  d3.cd13.kdr+ d3.cd13h.kdr- 
 #   37            38            28            92            77 
 # d3.cd13l.kdr-     d4.Hybrid       d4.kdr- d4.kdr-cxcr4+ d4.kdr+cxcr4- 
 #   89           244            74            64            74 
 # d5.Hybrid d5.kdr.cxcr4- d5.kdr.cxcr4+ d5.kdr+cxcr4- 
 #   146            76            73            64 

dat <- dat[-1:4,]
rownames(dat) <- dat[,1]
dat <- dat[,-1]
sce <- SingleCellExperiment(assays=SimpleList(logcounts=dat),
                            colData=cli)
colData(sce) <- cli
dim(sce) # 96 1896
rm(cli, dat)



###########################################
## 2) reduce dimention and visualization ##
###########################################
set.seed(100) # See below.
sce <- runPCA(sce) 
# You're computing too large a percentage of total singular values, use a standard svd instead.
sce <- runTSNE(sce, dimred="PCA")
#plotReducedDim(sce, dimred="TSNE", colour_by=clust)
sce <- runUMAP(sce, dimred="PCA" )
#plotReducedDim(sce, dimred="UMAP", colour_by=clust)


pdf(file='ReduceDim.pdf', height=4.5, width=10) #!!!!!!!!!!!!!!!!!

MyEmbryoCelltypeColours <- EmbryoCelltypeColours[1:length(unique(sce$Consensus.Cluster))]
names(MyEmbryoCelltypeColours) <- 1:length(unique(sce$Consensus.Cluster))

# Performing the calculations on the PC coordinates, like before.
sil.approx <- approxSilhouette(reducedDim(sce, "PCA"), clusters=sce$Consensus.Cluster)
sil.data <- as.data.frame(sil.approx)
sil.data$closest <- factor(ifelse(sil.data$width > 0, sce$Consensus.Cluster, sil.data$other))
sil.data$cluster <- factor(sce$Consensus.Cluster)

gridExtra::grid.arrange(
  plotReducedDim(sce, dimred="PCA", colour_by='CollectionTime', text_by='Consensus.Cluster',point_size=0.5),# +
    #scale_color_manual(values=MyEmbryoCelltypeColours),
  ggplot(sil.data, aes(x=cluster, y=width, colour=closest)) + 
    ggbeeswarm::geom_quasirandom(method="smiley")+
    scale_color_manual(values=MyEmbryoCelltypeColours),
  ncol=2
)

sil.approx <- approxSilhouette(reducedDim(sce, "TSNE"), clusters=sce$Consensus.Cluster)
sil.data <- as.data.frame(sil.approx)
sil.data$closest <- factor(ifelse(sil.data$width > 0, sce$Consensus.Cluster, sil.data$other))
sil.data$cluster <- factor(sce$Consensus.Cluster)

gridExtra::grid.arrange(
  plotReducedDim(sce, dimred="TSNE", colour_by='CollectionTime', text_by='Consensus.Cluster',point_size=0.5), #+
    #scale_color_manual(values=MyEmbryoCelltypeColours),
  ggplot(sil.data, aes(x=cluster, y=width, colour=closest)) + 
    ggbeeswarm::geom_quasirandom(method="smiley")+
    scale_color_manual(values=MyEmbryoCelltypeColours),
  ncol=2
)

sil.approx <- approxSilhouette(reducedDim(sce, "UMAP"), clusters=sce$Consensus.Cluster)
sil.data <- as.data.frame(sil.approx)
sil.data$closest <- factor(ifelse(sil.data$width > 0, sce$Consensus.Cluster, sil.data$other))
sil.data$cluster <- factor(sce$Consensus.Cluster)

gridExtra::grid.arrange(
  plotReducedDim(sce, dimred="UMAP", colour_by='CollectionTime', text_by='Consensus.Cluster',point_size=0.5),# +
   # scale_color_manual(values=MyEmbryoCelltypeColours),
  ggplot(sil.data, aes(x=cluster, y=width, colour=closest)) + 
    ggbeeswarm::geom_quasirandom(method="smiley")+
    scale_color_manual(values=MyEmbryoCelltypeColours),
  ncol=2
)

dev.off()

colData(sce)$Consensus.Cluster <- factor(colData(sce)$Consensus.Cluster,
                                         levels = c(1:10,11:18))

################################################################################
## 3) defining cell identity by the expression levels of known lineage-marker ##
################################################################################
newmarkers = c("NANOG","TBX2", "GATA4","MESP1","GSC","KDR","HAND1","SOX17","EOMES","T")

plotExpression(sce, 
               features=newmarkers, 
               x="Consensus.Cluster", colour_by="Consensus.Cluster",
               point_size=0.05)
dev.copy2pdf(file='../Marker_expression.pdf')


###################################################
## 4) trajectory analysis, Disagree with biology ##
###################################################

## Minimum spanning tree constructed trajectory

library(scater)

## by clusters , statistics == mean or mdian didn't change patter !------------------
by.cluster <- aggregateAcrossCells(sce, ids=colData(sce)$Consensus.Cluster,
                                   use.assay.type='logcounts', statistics = "sum")
 
centroids <- reducedDim(by.cluster, "PCA")
dmat <- dist(centroids)
dmat <- as.matrix(dmat)
set.seed(1000)
g <- igraph::graph.adjacency(dmat, mode = "undirected", weighted = TRUE)
mst <- igraph::minimum.spanning.tree(g)


pdf(file="trajectory_scater.pdf")
plot(mst)

pairs <- Matrix::which(mst[] > 0, arr.ind=TRUE)
coords <- reducedDim(by.cluster, "PCA")
group <- rep(seq_len(nrow(pairs)), 2)
stuff <- data.frame(rbind(coords[pairs[,1],], coords[pairs[,2],]), group)
plotReducedDim(sce, 'PCA', colour_by="Consensus.Cluster", 
               text_by="Consensus.Cluster", text_size = 1) + 
geom_line(data=stuff, mapping=aes(x=PC1, y=PC2, group=group))

pairs <- Matrix::which(mst[] > 0, arr.ind=TRUE)
coords <- reducedDim(by.cluster, "TSNE")
group <- rep(seq_len(nrow(pairs)), 2)
stuff <- data.frame(rbind(coords[pairs[,1],], coords[pairs[,2],]), group)
plotReducedDim(sce, 'TSNE', colour_by="Consensus.Cluster", 
               text_by="Consensus.Cluster", text_size = 1) + 
geom_line(data=stuff, mapping=aes(x=X1, y=X2, group=group))

pairs <- Matrix::which(mst[] > 0, arr.ind=TRUE)
coords <- reducedDim(by.cluster, "UMAP")
group <- rep(seq_len(nrow(pairs)), 2)
stuff <- data.frame(rbind(coords[pairs[,1],], coords[pairs[,2],]), group)
plotReducedDim(sce, 'UMAP', colour_by="Consensus.Cluster", 
               text_by="Consensus.Cluster", text_size = 1) + 
  geom_line(data=stuff, mapping=aes(x=X1, y=X2, group=group))

dev.off()

plotReducedDim(sce, dimred='TSNE', colour_by='BMP2', swap_rownames=NULL, ncomponents=c(1,2),
               text_by='Consensus.Cluster', text_size = 2, text_colour='red', point_size=1, 
               by_exprs_values='logcounts') 
 


## begin [if rerun] !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
load('../sce.RData')

## end of [if rerun]
myplot<- function(title ='', dimred.sce, dimred = 'TSNE', 
                  newmarkers ='Kdr', 
                  ncomponents = c(1,2), 
                  text_by ='label',
                  swap_rownames ='Symbol', 
                  by_exprs_values = "logcounts", 
                  n = 10, 
                  text_size = 5)
{
   if(length(newmarkers) < n*3) newmarkers <- c(newmarkers, rep(newmarkers[1], n*3-length(newmarkers)))
   b <- ceiling(length(newmarkers)/n)-1
   for(x in 0:b)
   {
    gridExtra::grid.arrange(
      plotReducedDim(dimred.sce, dimred=dimred, colour_by=newmarkers[1+n*x], swap_rownames=swap_rownames,ncomponents=ncomponents,
                     text_by=text_by, text_size = text_size, text_colour='red',point_size=1, by_exprs_values=by_exprs_values) + 
        ggtitle(paste(title,newmarkers[1+n*x])) # + geom_line(data=stuff, mapping=aes(x=X1, y=X2, group=group))
      ,plotReducedDim(dimred.sce, dimred=dimred,colour_by=newmarkers[2+n*x], swap_rownames=swap_rownames,ncomponents=ncomponents,
                      text_by=text_by, text_size = text_size, text_colour='red',point_size=1, by_exprs_values=by_exprs_values) + 
        ggtitle(paste(title,newmarkers[2+n*x]))#  geom_line(data=stuff, mapping=aes(x=X1, y=X2, group=group))
      ,plotReducedDim(dimred.sce, dimred=dimred,colour_by=newmarkers[3+n*x], swap_rownames=swap_rownames,ncomponents=ncomponents,
                      text_by=text_by, text_size = text_size, text_colour='red',point_size=1, by_exprs_values=by_exprs_values) + 
        ggtitle(paste(title,newmarkers[3+n*x]))#  geom_line(data=stuff, mapping=aes(x=X1, y=X2, group=group))
      ,plotReducedDim(dimred.sce, dimred=dimred,colour_by=newmarkers[4+n*x], swap_rownames=swap_rownames,ncomponents=ncomponents,
                      text_by=text_by, text_size = text_size, text_colour='red',point_size=1, by_exprs_values=by_exprs_values) + 
        ggtitle(paste(title,newmarkers[4+n*x]))#  geom_line(data=stuff, mapping=aes(x=X1, y=X2, group=group))
      ,plotReducedDim(dimred.sce, dimred=dimred, colour_by=newmarkers[5+n*x], swap_rownames=swap_rownames,ncomponents=ncomponents,
                     text_by=text_by, text_size = text_size, text_colour='red',point_size=1, by_exprs_values=by_exprs_values) + 
        ggtitle(paste(title,newmarkers[5+n*x])) # + geom_line(data=stuff, mapping=aes(x=X1, y=X2, group=group))
      ,plotReducedDim(dimred.sce, dimred=dimred,colour_by=newmarkers[6+n*x], swap_rownames=swap_rownames,ncomponents=ncomponents,
                      text_by=text_by, text_size = text_size, text_colour='red',point_size=1, by_exprs_values=by_exprs_values) + 
        ggtitle(paste(title,newmarkers[6+n*x]))#  geom_line(data=stuff, mapping=aes(x=X1, y=X2, group=group))
      ,plotReducedDim(dimred.sce, dimred=dimred,colour_by=newmarkers[7+n*x], swap_rownames=swap_rownames,ncomponents=ncomponents,
                      text_by=text_by, text_size = text_size, text_colour='red',point_size=1, by_exprs_values=by_exprs_values) + 
        ggtitle(paste(title,newmarkers[7+n*x]))#  geom_line(data=stuff, mapping=aes(x=X1, y=X2, group=group))
      ,plotReducedDim(dimred.sce, dimred=dimred,colour_by=newmarkers[8+n*x], swap_rownames=swap_rownames,ncomponents=ncomponents,
                      text_by=text_by, text_size = text_size, text_colour='red',point_size=1, by_exprs_values=by_exprs_values) + 
        ggtitle(paste(title,newmarkers[8+n*x]))#  geom_line(data=stuff, mapping=aes(x=X1, y=X2, group=group))
      ,plotReducedDim(dimred.sce, dimred=dimred,colour_by=newmarkers[9+n*x], swap_rownames=swap_rownames,ncomponents=ncomponents,
                      text_by=text_by, text_size = text_size, text_colour='red',point_size=1, by_exprs_values=by_exprs_values) + 
        ggtitle(paste(title,newmarkers[9+n*x]))#  geom_line(data=stuff, mapping=aes(x=X1, y=X2, group=group))
      ,plotReducedDim(dimred.sce, dimred=dimred,colour_by=newmarkers[10+n*x], swap_rownames=swap_rownames,ncomponents=ncomponents,
                      text_by=text_by, text_size = text_size, text_colour='red',point_size=1, by_exprs_values=by_exprs_values) + 
                      ggtitle(paste(title,newmarkers[10+n*x]))  
      ,ncol=5
    )
   }
}
newmarkers = unique(c("NANOG","EOMES","T","GSC","MESP1","GATA4","KDR","HAND1","TBX2", "SOX17",
                      "BMP4","FGF8",  # , "GBX2", "OTC2", "TP73L", "PAX2", "PAX6", "SOX1") #Early Ectodermal Lineage Markers
                      "DKK1",  "TNNT2","WNT3A", "KIT", "TBX5","PTX2","ISL1", 
                      'BMP2',"WNT5A",'FGF8','HAND2','HRT2', 'MESP2','DLL1','DLL3','SFRP1')) # 'HRT2'=='HEY2' 
length(newmarkers)
#[1] 27
setdiff(newmarkers, rownames(sce))
#character(0)
pdf(file="../tSNE.iPSC_marker.pdf", width=14)
myplot(title='', dimred.sce=sce, newmarkers=newmarkers, 
       text_by='Consensus.Cluster', swap_rownames=NULL, by_exprs_values='logcounts')
dev.off()


## MEgan's differentiated markers 
newmarkers = unique(c("BMP4","EOMES",  "WNT3A","TBX20","TBX5",
                      "HAND1","TNNT2","MYL4","NKX2.5","HRT2",
                      "WNT5A", "HAND2","MESP2","SFRP1","DLL1","DLL3",
                      "BMP2", "FGF8", "GATA4", "TBX2", "KDR","SHH",
                      ## following are CTS_C10_19g
                      "BMP2",  "DLL1" , "DLL3",  "EVX1",  "FGF12", "FGF8",  "FOXC1",
                      "FZD7",  "GATA4", "HAND2", "HRT2",  "ISL1",  "MESP1", "MESP2",
                      "MIXL1", "SFRP1", "TBX2",  "WNT4",  "WNT5A"
)) # 'HRT2'=='HEY2' 
pdf(file="../tSNE.iPSC_marker_differentated.pdf", width=14)
myplot(title='', dimred.sce=sce, newmarkers=newmarkers, 
       text_by='Consensus.Cluster', swap_rownames=NULL, by_exprs_values='logcounts')
dev.off()



newmarkers = unique(c('DLL3', 'FZD1', 'FZD2', 'SFRP1',
                        'ALCAM', 'BAMBI', 'EPCAM', 'HRT2', 
                        'NANOG', 'PDGFA', 'PDGFB', 'PDGFRB')
) # 'HRT2'=='HEY2' 
plotExpression(sce, 
               features=newmarkers, 
               x="CollectionTime", colour_by="CollectionTime",
               point_size=0.05)
dev.copy2pdf(file='../Marker_expression_timpoint.pdf')

newmarkers = unique(c('DLL3', 'FZD1', 'FZD2', 'SFRP1',
                      'ALCAM', 'BAMBI', 'EPCAM', 'HRT2', 
                      'NANOG', 'PDGFA', 'PDGFB', 'PDGFRB')
) # 'HRT2'=='HEY2' 
plotExpression(sce[,which(sce$mesodermPath)], 
               features=newmarkers, 
               x="CollectionTime", colour_by="CollectionTime",
               point_size=0.05)
dev.copy2pdf(file='../Marker_expression_timpoint_929cells.pdf')




# library(cowplot)
# library(dplyr)
# library(scater)
# library(Seurat)
# library(cowplot)
# 
# sce.seurat <- as.Seurat(sce, counts = "counts", data = "logcounts")
# # Error: No data in provided assay - counts
# 
# newmarkers = c("EOMES","BMP4","WNT3A",'HAND1')
# # Plot the expression of each of the genes of interest on the tSNE
# FeaturePlot(sce, features = newmarkers)


############################
## 4) BioTIP application  ##
############################
library(BioTIP)
packageVersion('BioTIP') # '1.5.0'

# Ic, we generated time point-specific Log2Ex matrices taht were downloaded from teh publication 
# For each of them (days 0, 1, 1.5, 2, 2.5, and 3/M-only mesoderm-specific cells)
load('../sce.RData')
tmp <- subset(colData(sce), CollectionTime=='Day3' )
table(tmp$Consensus.Cluster)
# 1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
# 0   0   0   0   0  22   0   0   0 106 130   0   0   0   0   0   0   0 

int <- which(colData(sce)$CollectionTime %in% c('Day0', 'Day1', 'Day1.5', 'Day2', 'Day2.5') |
               (colData(sce)$CollectionTime =='Day3' & colData(sce)$Consensus.Cluster ==10))
length(int)  #[1] 929

table(colData(sce)[int,]$CollectionTime)
#  Day0   Day1 Day1.5   Day2 Day2.5   Day3   Day4   Day5 
#  231    166     93    211    122    106      0      0 

colData(sce)$mesodermPath = FALSE
colData(sce)$mesodermPath[int] = TRUE
all(which(colData(sce)$mesodermPath) == int)  #TRUE
save(sce, file='../sce.RData')  #!!!!!!!!!!!!!!!!!!!!!!!



sce <- sce[,int]
sce
# class: SingleCellExperiment 
# dim: 96 929 
# metadata(0):
#   assays(1): logcounts
# rownames(96): ACVR1B ACVR2A ... WNT5A WNT5B
# rowData names(0):
#   colnames(929): Cell1 Cell109 ... Cell1421 Cell1429
# colData names(3): CollectionTime Consensus.Cluster Phenotype.FACS
# reducedDimNames(3): PCA TSNE UMAP
# altExpNames(0):
  

# 1) module construction

# Pre-selection Transcript
cut.preselect = 0.8  # out of 96 genes
cut.fdr = 0.2   
cut.minsize = 10  

samplesL <- split(rownames(colData(sce)),f = colData(sce)$CollectionTime)
lengths(samplesL)
# Day0   Day1 Day1.5   Day2 Day2.5   Day3   Day4   Day5 
# 231    166     93    211    122    106      0      0 
samplesL <- samplesL[-which(lengths(samplesL)<10)]
lengths(samplesL)
# Day0   Day1 Day1.5   Day2 Day2.5   Day3 
# 231    166     93    211    122    106 

logmat <- as.matrix(logcounts(sce))
dim(logmat) # [1] 96 929

#this optimizes sd selection
set.seed(2020)
testres <- optimize.sd_selection(logmat, samplesL, B=100, cutoff=cut.preselect,
                                 times=.75, percent=0.8)
class(testres) #[1] "list"
class(testres[[1]])  #[1] "matrix" "array" 
lapply(testres, dim)
# $Day0
# [1]  76 231
# 
# $Day1
# [1]  76 166
# 
# $Day1.5
# [1] 72 93
# 
# $Day2
# [1]  72 211
# 
# $Day2.5
# [1]  75 122
# 
# $Day3
# [1]  75 106


# Network Partition
igraphL <- getNetwork(testres, fdr = cut.fdr)
# Day0:76 nodes
# Day1:76 nodes
# Day1.5:72 nodes
# Day2:72 nodes
# Day2.5:75 nodes
# Day3:73 nodes

cluster <- getCluster_methods(igraphL)

##### plot network #################
####################################
names(igraphL)
#[1]"Day0"   "Day1"   "Day1.5" "Day2"   "Day2.5" "Day3" 

library('igraph')
tmp = igraphL[["Day3"]]
E(tmp)$width <- E(tmp)$weight*3
V(tmp)$community= cluster[["Day3"]]$membership
mark.groups = table(cluster[["Day3"]]$membership)
#mark.groups = mark.groups[which(mark.groups>=10)]
colrs = rainbow(length(mark.groups), alpha = 0.3)
V(tmp)$label <- NA
plot(tmp, vertex.color=colrs[V(tmp)$community], vertex.size = 5,
     mark.groups=cluster[["Day3"]])
legend(1,1, paste0(names(mark.groups),sep=":",mark.groups), text.col=rainbow(length(mark.groups)))

dev.copy2pdf(file='Network.view_Day3.pdf')


# 2.1) Identifying CTS, new MCI score #########
membersL <- getMCI(cluster,testres, adjust.size = FALSE, fun='BioTIP')
names(membersL)
#[1] "members" "MCI"     "sd"      "PCC"     "PCCo"  
save(membersL, file="membersL.RData")

pdf(file=paste0("MCIBar_", cut.preselect,
                "_fdr",cut.fdr,"_minsize",cut.minsize,".pdf"),
    width=10, height=5)
plotBar_MCI(membersL, ylim=c(0,10), minsize = 30)
plotBar_MCI(membersL, ylim=c(0,10), minsize = 20)
plotBar_MCI(membersL, ylim=c(0,10), minsize = cut.minsize)
plotBar_MCI(membersL, ylim=c(0,10), minsize = 5)

dev.off()
#the numbers in the parenthesis: they are total number of clusters, no control of the cell (sample) size per  cluster


# get the statistics using the MCI system
n.state.candidate = 4
topMCI = getTopMCI(membersL[["members"]], membersL[["MCI"]], membersL[["MCI"]], 
                   min=cut.minsize, n=n.state.candidate)
names(topMCI)
#[1] "Day2.5" "Day2"   "Day1.5" "Day1" 

# get the state ID and MCI statistics for the two leading MCI scores per state
maxMCIms <- getMaxMCImember(membersL[["members"]],
                            membersL[["MCI"]],min =cut.minsize, n=3)
names(maxMCIms)
#[1] "idx"             "members"         "2topest.members" "3topest.members"

maxMCI = getMaxStats(membersL[['MCI']], maxMCIms[['idx']])
unlist(maxMCI)
#    Day0     Day1   Day1.5     Day2   Day2.5     Day3 
# 2.876082 3.668823 3.693530 3.997960 4.229456 3.666241 

names(maxMCIms[["members"]][names(topMCI)])
#[1] [1] "Day2.5" "Day2"   "Day1.5" "Day1" 

CTS = getCTS(maxMCI[names(topMCI)], maxMCIms[["members"]][names(topMCI)])
# Length: 18
# Length: 23
# Length: 37
# Length: 43

## tmp calculates the number of bars within each named state
(tmp = unlist(lapply(maxMCIms[['idx']][names(topMCI)], length)))
# Day2.5   Day2 Day1.5   Day1 
# 3      2      3      2

## here returns all the groups with exactly 2 bars
(whoistop2nd = names(tmp[tmp==2]))
#[1] "Day2" "Day1"

## here returns all the groups with exactly 3 bars
(whoistop3rd = names(tmp[tmp==3]))
#[1] "Day2.5" "Day1.5"

## add the gene members of the 2nd toppest biomodue in the states with exactly 2 bars
if(length(whoistop2nd)>0)  CTS = append(CTS, maxMCIms[["2topest.members"]][whoistop2nd])
names(CTS)
#[1] "Day2.5" "Day2"   "Day1.5" "Day1"   "Day2"   "Day1" 

## add the gene members of the 2nd toppest biomodue in the states with exactly 3 bars
if(length(whoistop3rd)>0)  CTS = append(CTS, maxMCIms[["2topest.members"]][whoistop3rd])  
names(CTS)
#[1] "Day2.5" "Day2"   "Day1.5" "Day1"   "Day2"   "Day1"   "Day2.5" "Day1.5"

## add the gene members of the 3rd toppest biomodue in the states with exactly 3 bars
if(length(whoistop3rd)>0)  CTS = append(CTS, maxMCIms[["3topest.members"]][whoistop3rd])  
names(CTS)
#[1] "Day2.5" "Day2"   "Day1.5" "Day1"   "Day2"   "Day1"   "Day2.5" "Day1.5" "Day2.5" "Day1.5"


CTS[1:4]
# $Day2.5
# [1] "ALCAM"   "BAMBI"   "EMILIN2" "EPCAM"   "EVX1"    "FGF10"   "HRT2"    "ISL1"    "MSX2"   
# [10] "NANOG"   "PDGFA"   "PDGFB"   "PDGFRB"  "T"       "TBX5"    "TNNT2"   "WNT5B"   "WNT4"   
# 
# $Day2
# [1] "ACVRL1" "BMP2"   "DKK1"   "DLL1"   "DLL3"   "FOXC1"  "FZD2"   "GATA4"  "HAND1"  "LEFTY1"
# [11] "MESP1"  "MESP2"  "MSX1"   "MYL3"   "MYOCD"  "NANOG"  "PDGFB"  "PDGFRA" "SFRP1"  "SOX17" 
# [21] "T"      "TBX20"  "WNT5A" 
# 
# $Day1.5
# [1] "ACVR2A"  "ALCAM"   "BAMBI"   "BMP4"    "BMPR1A"  "BMPR2"   "EMILIN2" "ENG"     "FGF12"  
# [10] "FGF8"    "FGFR1"   "FGFR2"   "FOXH1"   "FSTL1"   "FZD4"    "FZD6"    "HEY1"    "HRT2"   
# [19] "LTBP1"   "MYOCD"   "NANOG"   "NOTCH2"  "NOTCH3"  "NUMB"    "PARD3"   "PDGFB"   "PDGFRB" 
# [28] "PTCH1"   "RCOR2"   "RPL35A"  "SIRPA"   "TGFB2"   "TGFBR1"  "TGFBR2"  "TNNT2"   "TUBB"   
# [37] "VEGFA"  
# 
# $Day1
# [1] "ACVR1B" "ACVR2A" "ALCAM"  "ANF"    "BAMBI"  "BMP4"   "BMPR1A" "BMPR2"  "DLL3"   "ENG"   
# [11] "EOMES"  "EPCAM"  "FSTL1"  "FZD1"   "FZD2"   "FZD4"   "FZD6"   "FZD7"   "HEY1"   "HHIP"  
# [21] "HRT2"   "IRX4"   "KDR"    "KIT"    "LTBP1"  "NANOG"  "NOTCH2" "NOTCH3" "NUMB"   "PARD3" 
# [31] "PDGFA"  "PDGFB"  "PDGFRB" "PTX2"   "RCOR2"  "RPL35A" "SERCA"  "SFRP1"  "SIRPA"  "TGFB1" 
# [41] "TGFBR1" "TUBB"   "VEGFA" 

#### extract CTS scores for each biomodule candidate in the following steps: ####
## first to record the max MCI for the n.state.candidate 
maxMCI <- maxMCI[names(CTS)[1:n.state.candidate]]
maxMCI
# Day2.5     Day2   Day1.5     Day1 
# 4.229456 3.997960 3.693530 3.668823 

## then applendix the 2nd highest MCI score (if existing) for the states with exactly 2 bars
if(length(whoistop2nd)>0) maxMCI <- c(maxMCI, getNextMaxStats(membersL[['MCI']], idL=maxMCIms[['idx']], whoistop2nd))
names(maxMCI)
# [1] "Day2.5" "Day2"   "Day1.5" "Day1"   "Day2"   "Day1"  

## applendix the 2nd highest MCI score (if existing) for the states with exactly 3 bars
if(length(whoistop3rd)>0) maxMCI <- c(maxMCI, getNextMaxStats(membersL[['MCI']], idL=maxMCIms[['idx']], whoistop3rd))
names(maxMCI)
# [1] "Day2.5" "Day2"   "Day1.5" "Day1"   "Day2"   "Day1"   "Day2.5" "Day1.5" 

## then applendix the 3rd highest MCI score (if existing) for the states with exactly 3 bars
if(length(whoistop3rd)>0) maxMCI <- c(maxMCI, getNextMaxStats(membersL[['MCI']], idL=maxMCIms[['idx']], whoistop3rd, which.next=3))
maxMCI
#   Day2.5      Day2    Day1.5      Day1      Day2      Day1    Day2.5    Day1.5    Day2.5    Day1.5 
# 4.2294562 3.9979599 3.6935297 3.6688229 1.9068669 0.7693816 3.5453475 3.0876365 3.4842829 1.0100347 


## to ensure the same order between maxMCI  and CTS
all(names(CTS) == names(maxMCI)) # TRUE


save(CTS,  maxMCI, whoistop2nd, whoistop3rd, file=paste0("CTS_maxMCI.RData")) # !!!

## estimate significance NOT repeat (takes a while)
C = 1000

simuMCI <- list()
for(i in 1:length(CTS) ){
  n <- length(CTS[[i]])
  simuMCI[[i]] <- simulationMCI(n, samplesL, logmat,  B=C, fun="BioTIP")
}
names(simuMCI) <- names(CTS)
save(simuMCI, P, file=paste0("GenePermutation_",C,"CTS_timepoint", cut.preselect, "_fdr",cut.fdr,"_minsize",cut.minsize,".RData"))


## the sig CTS candidates were c(1:4,7,9)
load(file=paste0("CTS_maxMCI.RData"))
load(file=paste0("GenePermutation_",C,"CTS_timepoint", cut.preselect, "_fdr",
                 cut.fdr,"_minsize",cut.minsize,".RData"))
P <- maxMCI * NA
par(mfrow=c(2,5))
for(i in 1:length(CTS) ){
  n <- length(CTS[[i]])
  P[i] <- length(which(maxMCI[i] <= simuMCI[[i]][names(CTS)[i],]))/C
  plot_MCI_Simulation(maxMCI[i], simuMCI[[i]], las=2, ylim=c(0,4),
                    main=paste(names(CTS)[i], n, "genes",
                               "\n","vs. ",C, "times of gene-permutation"), which2point=names(CTS)[i])
}
dev.copy2pdf(file=paste0("MCI_GenePermutation_1000_CTS_timepoint", cut.preselect, "_fdr",cut.fdr,"_minsize",cut.minsize,".pdf"))

P
 # [1] 0.000 0.000 0.000 0.000 0.268 1.000 0.000 0.116 0.000 1.000
(sig <- which(P<0.01) ) 
#[1] 1 2 3 4 7 9



####### Verifying Tipping Point using IC* score  #################
######  BioTIP score, permutating genes ##############
C= 1000

set.seed(2021)
IC_new <- simuBioTIP_g <- list()  
for(i in 1:length(sig)){
  n <- length(CTS[sig][[i]])
  IC_new[[i]] <- getIc(logmat, samplesL, CTS[sig][[i]], fun="BioTIP", shrink=TRUE)

  simuBioTIP_g[[i]]  <- simulation_Ic(n, samplesL, logmat, B=C,
                                 fun="BioTIP", shrink=TRUE)
}
names(simuBioTIP_g) <- paste0(names(CTS)[sig],'_',lengths(CTS)[sig],'g')
names(IC_new) <- paste0(names(CTS)[sig],'_',lengths(CTS)[sig],'g')

save(IC_new, simuBioTIP_g,   file="simuBioTIP_g.RData", compress=TRUE)

names(CTS)[sig]
#[1] "Day2.5" "Day2"   "Day1.5" "Day1"   "Day2.5" "Day2.5"  

load(file="simuBioTIP_g.RData")
pdf(file="BioTIP_vsSimulation_6CTS.pdf", height=10)
par(mfrow=c(3,4))
for(i in 1:length(IC_new)){
  plot_Ic_Simulation(IC_new[[i]], simuBioTIP_g[[i]], las = 2, ylab="Ic*",
                     main= names(IC_new)[i],
                     fun="boxplot", which2point= names(CTS)[sig][i]) # matplot
  plot_SS_Simulation(IC_new[[i]], simuBioTIP_g[[i]], 
                     main = paste("Delta Ic*"), 
                     ylab=NULL)
  
 }
dev.off()

  
# ######  old IC score, shulffing genes ##############
# set.seed(2021)
# IC_old <- simuIc_g <- list()  
# for(i in 1:length(sig)){
#   n <- length(CTS[sig][[i]])
#   IC_old[[i]] <- getIc(logmat, samplesL, CTS[sig][[i]], fun="cor", shrink=TRUE)
#   
#   simuIc_g[[i]]  <- simulation_Ic(n, samplesL, logmat, B=C,
#                                       fun="cor", shrink=TRUE)
# }
# names(simuIc_g) <- paste0(names(CTS)[sig],'_',lengths(CTS)[sig],'g')
# names(IC_old) <- paste0(names(CTS)[sig],'_',lengths(CTS)[sig],'g')
# 
# save(IC_old, simuIc_g,   file="simuIC_g.RData", compress=TRUE)
# 
# names(CTS)[sig]
# #[1] "Day2.5" "Day2"   "Day1.5" "Day1"   "Day2.5" "Day2.5"  
# 
# pdf(file="IC_vsSimulation_6CTS.pdf", height=10)
# par(mfrow=c(3,4))
# for(i in 1:length(IC_old)){
#   plot_Ic_Simulation(IC_old[[i]], simuIc_g[[i]], las = 2, ylab="old Ic",
#                      main= names(IC_new)[i],
#                      fun="boxplot", which2point= names(CTS)[sig][i]) # matplot
#   plot_SS_Simulation(IC_old[[i]], simuIc_g[[i]], 
#                      main = paste("Delta Ic"), 
#                      ylab=NULL)
#   
# }
# dev.off()

####### Verifying Tipping Point and using IC*  score  #################
######  BioTIP score, Shuffling label ##############
C= 1000
lengths(samplesL)
# Day0   Day1 Day1.5   Day2 Day2.5   Day3 
# 231    166     93    211    122    106 

set.seed(2021)
IC_new <- simuBioTIP_s <- list()  

for(i in 1:length(sig)){
  IC_new[[i]] <- getIc(logmat, samplesL, CTS[sig][[i]], fun="BioTIP", shrink=TRUE)
  simuBioTIP_s[[i]] <- matrix(nrow=length(samplesL), ncol=C)
 # rownames(simuBioTIP_s[[i]]) = names(CTS)[sig][i]
  rownames(simuBioTIP_s[[i]]) = names(samplesL)
  for(j in 1:length(samplesL)) { 
    #ns <- length(samplesL[names(CTS[sig])][[i]])  # for this state only
    ns <- length(samplesL[[j]])  # for each state rewpectively 
    simuBioTIP_s[[i]][j,]  <- simulation_Ic_sample(logmat, ns, Ic=IC_new[[i]],
                                             genes=CTS[sig][[i]], B=C,
                                             fun="BioTIP", shrink=TRUE)
  }
}
names(simuBioTIP_s) <- paste0(names(CTS)[sig],'_',lengths(CTS)[sig],'g')
names(IC_new) <- paste0(names(CTS)[sig],'_',lengths(CTS)[sig],'g')

save(simuBioTIP_s,   file="simuBioTIP_s.RData", compress=TRUE)

names(CTS)[sig]
#[1] "Day2.5" "Day2"   "Day1.5" "Day1"   "Day2.5" "Day2.5"  
pdf(file="BioTIP_vs_Shufflinglabel_6CTS.pdf", height=10)
par(mfrow=c(3,4))
for(i in 1:length(IC_new)){
  plot_Ic_Simulation(IC_new[[i]], simuBioTIP_s[[i]], las = 2, ylab="Ic*",
                     main= names(IC_new)[i],
                     fun="boxplot", which2point= names(CTS)[sig][i]) # matplot
  plot_SS_Simulation(IC_new[[i]], simuBioTIP_s[[i]], 
                     main = paste("Delta Ic*"), 
                     ylab=NULL)
  
}
dev.off()


# ######  old IC score, shulffing labels ##############
# set.seed(2021)
# IC_old <- simuIc_s <- list() 
# 
# for(i in 1:length(sig)){
#   IC_old[[i]] <- getIc(logmat, samplesL, CTS[sig][[i]], fun="cor", shrink=TRUE)
#   simuIc_s[[i]] <- matrix(nrow=length(samplesL), ncol=C)
#   rownames(simuIc_s[[i]]) = names(samplesL)
#   for(j in 1:length(samplesL)) { 
#     ns <- length(samplesL[[j]])  # for each state rewpectively 
#     simuIc_s[[i]][j,]  <- simulation_Ic_sample(logmat, ns, Ic=IC_new[[i]],
#                                                  genes=CTS[sig][[i]], B=C,
#                                                  fun="cor", shrink=TRUE)
#  }
# }
# names(simuIc_s) <- paste0(names(CTS)[sig],'_',lengths(CTS)[sig],'g')
# names(IC_old) <- paste0(names(CTS)[sig],'_',lengths(CTS)[sig],'g')
# 
# save(IC_old, simuIc_s,   file="simuIc_s.RData", compress=TRUE)
# 
# names(CTS)[sig]
# #[1] "Day2.5" "Day2"   "Day1.5" "Day1"   "Day2.5" "Day2.5"  
# pdf(file="IC_vs_Shufflinglabel_6CTS.pdf", height=10)
# par(mfrow=c(3,4))
# for(i in 1:length(IC_old)){
#   plot_Ic_Simulation(IC_old[[i]], simuIc_s[[i]], las = 2, ylab="old Ic",
#                      main= names(IC_new)[i],
#                      fun="boxplot", which2point= names(CTS)[sig][i]) # matplot
#   plot_SS_Simulation(IC_old[[i]], simuIc_s[[i]], 
#                      main = paste("Delta Ic"), 
#                      ylab=NULL)
#   
# }
# dev.off()
xyang2uchicago/BioTIP documentation built on June 30, 2024, 10:14 p.m.