Nothing
## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## ---- echo=FALSE, fig.cap="Fig 1. BioTIP workflow with five key analytical steps. RTF: relative transcript fluctuation; PCC: Pearson correlation coefficient; MCI: Module-Criticality Index; Ic: index of critical transition.", fig.align='center', out.width = '60%'----
knitr::include_graphics("FigS1_BioTIP_pipeline_detailed_v7.jpg")
## ---- echo=FALSE, fig.cap="Table 1. The key R functions and customaized parameters for the five analytical steps.", fig.align='center', out.width = '60%'----
knitr::include_graphics("Fig_Table1_xy.jpg")
## ---- warning=FALSE, message=FALSE--------------------------------------------
# load BioTIP and dependent packages
library(BioTIP)
library(cluster)
library(GenomicRanges)
library(Hmisc)
library(MASS)
require(stringr)
require(psych)
require(igraph)
## -----------------------------------------------------------------------------
data(GSE6136_matrix)
dim(GSE6136_matrix)
row.names(GSE6136_matrix) = GSE6136_matrix$ID_REF
# remove the downloaded first row after assigning it to be column-name of the final numeric data matrix
GSE6136 = GSE6136_matrix[,-1]
dim(GSE6136)
## -----------------------------------------------------------------------------
data(GSE6136_cli)
##dim(GSE6136_cli) optional: check dimension
cli = t(GSE6136_cli)
colnames(cli) = str_split_fixed(cli[1,],'_',2)[,2]
cli = cli[-1,]
cli = data.frame(cli)
cli[,"cell-type:ch1"] = str_split_fixed(cli$characteristics_ch1.1,": ",2)[,2]
cli[,"Ig clonality:ch1"] = str_split_fixed(cli$characteristics_ch1.3,": ",2)[,2]
colnames(cli)[colnames(cli) == "cell-type:ch1"] = "group"
cli$Row.names = cli[,1]
head(cli[,1:3])
## -----------------------------------------------------------------------------
dat <- GSE6136
df <- log2(dat+1)
head(df[,1:6])
## ----warning=FALSE------------------------------------------------------------
cli$group = factor(cli$group,
levels = c('resting','activated','lymphoma_marginal','lymphoma_transitional','lymphoma_aggressive'))
samplesL <- split(cli[,"geo_accession"],f = cli$group)
lapply(samplesL, length)
test <- sd_selection(df, samplesL,0.01)
head(test[["activated"]])
## ---- warning=FALSE-----------------------------------------------------------
igraphL <- getNetwork(test, fdr = 1)
cluster <- getCluster_methods(igraphL)
## ----echo=TRUE, warning=FALSE-------------------------------------------------
names(cluster)
## ----echo=TRUE, warning=FALSE-------------------------------------------------
membersL_noweight <- getMCI(cluster,test)
plotBar_MCI(membersL_noweight,ylim = c(0,6))
## ----echo=TRUE, warning=FALSE-------------------------------------------------
maxMCIms <- getMaxMCImember(membersL_noweight[[1]],membersL_noweight[[2]],min =10)
names(maxMCIms)
names(maxMCIms[[1]])
names(maxMCIms[[2]])
## ----echo=TRUE, warning=FALSE-------------------------------------------------
head(maxMCIms[['idx']])
head(maxMCIms[['members']][['lymphoma_aggressive']])
## -----------------------------------------------------------------------------
biomodules = getMaxStats(membersL_noweight[['members']],maxMCIms[[1]])
maxMCI = getMaxStats(membersL_noweight[['MCI']],maxMCIms[[1]])
maxMCI = maxMCI[order(maxMCI,decreasing=TRUE)]
head(maxMCI)
topMCI = getTopMCI(membersL_noweight[[1]],membersL_noweight[[2]],membersL_noweight[['MCI']],min =10)
head(topMCI)
## -----------------------------------------------------------------------------
maxSD = getMaxStats(membersL_noweight[['sd']],maxMCIms[[1]])
head(maxSD)
## -----------------------------------------------------------------------------
CTS = getCTS(topMCI, maxMCIms[[2]])
## ----echo=TRUE, warning=FALSE-------------------------------------------------
par(mar = c(10,5,0,2))
plotMaxMCI(maxMCIms,membersL_noweight[[2]],states = names(samplesL),las = 2)
## ---- warning=FALSE, results=FALSE--------------------------------------------
simuMCI <- simulationMCI(3,samplesL,df, B=100)
plot_MCI_Simulation(topMCI[1],simuMCI,las=2)
## -----------------------------------------------------------------------------
IC <- getIc(df,samplesL,CTS[[1]])
par(mar = c(10,5,0,2))
plotIc(IC,las = 2)
## ----warning=FALSE, results=FALSE---------------------------------------------
simuIC <- simulation_Ic(length(CTS[[1]]),samplesL,df,B=100)
par(mar = c(10,5,0,2))
plot_Ic_Simulation(IC,simuIC,las = 2)
## -----------------------------------------------------------------------------
sample_Ic = simulation_Ic_sample(df, sampleNo=3, genes=CTS[[1]], plot=TRUE)
##simulated_Ic = plot_simulation_sample(df,length(samplesL[['lymphoma_aggressive']]),IC[['lym#phoma_aggressive']],CTS)
## ---- echo=FALSE, out.height='60%', out.width='60%'---------------------------
knitr::include_graphics("subway_map.jpg")
## ---- warning=FALSE, message=FALSE, results=FALSE-----------------------------
library(BioTIP)
## ---- warning=FALSE, message=FALSE--------------------------------------------
#Load stringr library
library(stringr)
#BioTIP dependent libraries
library(cluster)
library(psych)
library(stringr)
library(GenomicRanges)
library(Hmisc)
library(MASS)
library(igraph)
library(RCurl)
## -----------------------------------------------------------------------------
## familiarize yourself with data
cell_info = read.delim("cell_info.tsv", head = T, sep = '\t', row.names=1)
dim(cell_info)
## in case R automatically reformated characters in cell labels, match cell labels in the data and in the infor table
rownames(cell_info) = sub('LT-HSC', 'LT.HSC', rownames(cell_info))
## focus on one branch, e.g., S4-S3-S0-S1
## no filter because it was already 4k genes, 10% of the originally measured genes in the single cell experiment
cell_info <- subset(cell_info, branch_id_alias %in% c("('S4', 'S3')", "('S3', 'S0')", "('S1', 'S0')" ))
## check cell sub-population sizes along the STREAM outputs
(t = table(cell_info$label, cell_info$branch_id_alias))
## focus on sub-populations of cells with more than 10 cells
( idx = apply(t, 2, function(x) names(which(x>10))) )
## generate a list of samples (cells) of interest
samplesL = sapply(names(idx),
function(x) sapply(idx[[x]], function(y)
rownames(subset(cell_info, branch_id_alias == x & label == y))))
## check the number of sub-populations along the pseudotime trajectory (generated by STREAM for example)
## recommend to focus on each branch, respectively
names(samplesL[[1]])
## merge sub-populations at each pseudo-time 'state'
samplesL = do.call(c, samplesL)
lengths(samplesL)
## load normalized gene expression matrix, refer to the example below if not yet normalized
## e.g, here is a STREAM-published data matrix of normalized gene expression matrix
## when analyzing SingleCellExperiment object SCE, translate to matrix using the following code
# counts <- logcounts(SCE)
## data matrix imported from our github repository due to large size
githuburl = "https://github.com/ysun98/BioTIPBigData/blob/master/data_Nestorowa.tsv?raw=true"
counts = read.table(url(githuburl), head=T, sep="\t", row.names=1)
dim(counts)
if (class(counts)=="data.frame") counts = as.matrix(counts)
all(colnames(counts) %in% as.character(rownames(cell_info)))
## log2 transformation example
if(max(counts)>20) counts = log2(counts)
## Check the overall distribution using histogram
# hist (counts, 100)
## -----------------------------------------------------------------------------
## setup parameters for gene preselection
B=10 ##for demonstration purposes use 10, when running dataset we recommend at least 100
##optimize and one nonoptimize for single cell use optimize with B 100 or higher for demo purpose only we run
opt.cutoff = 0.2
opt.per = 0.8
## Commented out because of calculation expense, recommend to save the calculated data file after one calculation for future use
## set.seed(100)
## subcounts = optimize.sd_selection(counts, samplesL, cutoff = opt.cutoff, percent=opt.per, B)
## save( subcounts, file='BioTIP_Output_xy/subcounts.optimize_80per.rData')
data("subcounts.optimize_80per")
## ---- warning=FALSE-----------------------------------------------------------
## use 0.01-0.2 to ensure gene-gene co-expression is significant
## increase fdr cutoff to obtain larger gene sets
network.fdr = 0.1
min.size = 50
igraphL = getNetwork(subcounts, fdr=network.fdr)
names(igraphL)
## Network partition using random walk
cluster = getCluster_methods(igraphL)
tmp = igraphL[["('S3', 'S0').CMP"]]
E(tmp)$width <- E(tmp)$weight*3
V(tmp)$community= cluster[["('S3', 'S0').CMP"]]$membership
mark.groups = table(cluster[["('S3', 'S0').CMP"]]$membership)
colrs = rainbow(length(mark.groups), alpha = 0.3)
V(tmp)$label <- NA
plot(tmp, vertex.color=colrs[V(tmp)$community],vertex.size = 5,
mark.groups=cluster[["('S3', 'S0').CMP"]])
## ---- warning=FALSE-----------------------------------------------------------
fun = 'BioTIP'
## Commented out because of calculation expense, recommend to save the calculated data file after one calculation for future use
## membersL = getMCI(cluster, subcounts, adjust.size = F, fun)
## save(membersL, file="membersL.RData", compress=TRUE)
data("membersL")
par(mar=c(1,1,2,1))
plotBar_MCI(membersL, ylim=c(0,30), min=50)
## Decide how many states of interest, here is 4
n.state.candidate <- 4
topMCI = getTopMCI(membersL[["members"]], membersL[["MCI"]], membersL[["MCI"]],
min=min.size,
n=n.state.candidate)
names(topMCI)
## Obtain state ID and MCI statistics for the n=3 leading MCI scores per state
maxMCIms = getMaxMCImember(membersL[["members"]], membersL[["MCI"]],
min =min.size,
n=3)
names(maxMCIms)
## list the maximum MCI score per state, for all states
maxMCI = getMaxStats(membersL[['MCI']], maxMCIms[['idx']])
unlist(maxMCI)
#### extract biomodule candidates in the following steps: ####
## record the gene members per toppest module for each of these states of interest
CTS = getCTS(maxMCI[names(topMCI)], maxMCIms[["members"]][names(topMCI)])
## tmp calculates the number of bars within each named state
tmp = unlist(lapply(maxMCIms[['idx']][names(topMCI)], length))
## here returns all the groups with exactly 2 bars
(whoistop2nd = names(tmp[tmp==2]))
## here returns all the groups with exactly 3 bars
(whoistop3rd = names(tmp[tmp==3]))
## add the gene members of the 2nd toppest biomodue in the states with exactly 2 bars
if(length(whoistop2nd)>0) CTS = append(CTS, maxMCIms[["2topest.members"]][whoistop2nd])
names(CTS)
## add the gene members of the 2nd toppest biomodue in the states with exactly 3 bars
if(length(whoistop3rd)>0) CTS = append(CTS, maxMCIms[["2topest.members"]][whoistop3rd])
names(CTS)
## add the gene members of the 3rd toppest biomodue in the states with exactly 3 bars
if(length(whoistop3rd)>0) CTS = append(CTS, maxMCIms[["3topest.members"]][whoistop3rd])
names(CTS)
## ---- results=FALSE-----------------------------------------------------------
#### extract CTS scores for each biomodule candidate in the following steps: ####
## first to record the max MCI for the n.state.candidate
maxMCI <- maxMCI[names(CTS)[1:n.state.candidate]]
maxMCI
## then applendix the 2nd highest MCI score (if existing) for the states with exactly 2 bars
if(length(whoistop2nd)>0) maxMCI <- c(maxMCI, getNextMaxStats(membersL[['MCI']], idL=maxMCIms[['idx']], whoistop2nd))
names(maxMCI)
## applendix the 2nd highest MCI score (if existing) for the states with exactly 3 bars
if(length(whoistop3rd)>0) maxMCI <- c(maxMCI, getNextMaxStats(membersL[['MCI']], idL=maxMCIms[['idx']], whoistop3rd))
names(maxMCI)
## then applendix the 3rd highest MCI score (if existing) for the states with exactly 3 bars
if(length(whoistop3rd)>0) maxMCI <- c(maxMCI, getNextMaxStats(membersL[['MCI']], idL=maxMCIms[['idx']], whoistop3rd, which.next=3))
maxMCI
## to ensure the same order between maxMCI and CTS
all(names(CTS) == names(maxMCI))
## estimate empritical significance from the MCI scores
## M is precalculated correlation matrix for large dataset (>2k genes), will be reused in the downstream simulation analysis
#counts = read.table("data_Nestorowa.tsv")
#if (class(counts)=="data.frame") counts = as.matrix(counts)
#M <- cor.shrink(counts, Y = NULL, MARGIN = 1, shrink = TRUE)
#save(M, file="cor.shrink_M.RData", compress=TRUE)
#dim(M)
## C is the runs of permutations to estimate random scores
#C = 10 # for real data analysis C = at least 200
#RandomMCI = list()
#n <- length(CTS) # number of CTS candidates
#set.seed(2020)
#for (i in 1:n) #
# i=1; par(mfrow=c(1,1))
# x <- length(CTS[[i]])
# RandomMCI[[i]] <- simulationMCI(x, samplesL, counts, B=C, fun="BioTIP", M=M)
# dim(RandomMCI)
# plot_MCI_Simulation(maxMCI[i], RandomMCI[[i]], las=2,
# ylim=c(0, max(maxMCI[i], 2*RandomMCI[[i]])),
# main=paste(names(maxMCI)[i], length(CTS[[i]]), "genes",
# "\n","vs. ",C, "times of gene-permutation"),
# which2point=names(maxMCI)[i])
######## Finding Tipping Point #################
newIc_score = lapply(CTS, function(x) getIc(counts, samplesL, x, fun="BioTIP"))
names(newIc_score) <- names(CTS)
## ---- echo=FALSE, fig.align='center', out.width = '60%'-----------------------
knitr::include_graphics("MCI_Sim_M.jpg")
## ---- results=FALSE-----------------------------------------------------------
######## verify using IC score #################
## First, estimate the random Ic-scores by permuating the expresion values of genes
RandomIc_g = list()
set.seed(2020)
C= 10 # for real data analysis C = at least 500
# for(i in 1:length(CTS)){ Not to run the full loop B# }
i = 1
CTS <- CTS[[i]]
n <- length(CTS)
RandomIc_g[[i]] <- simulation_Ic(n, samplesL, counts,
B=C,
fun="BioTIP")
names(RandomIc_g)[i] = names(CTS)[i]
# par(mfrow=c(1,length(int)))
# for(i in 1:length(newIc_score)){ Not to run the full loop B plotting
par(mfrow = c(1,1))
n = length(CTS[[i]])
plot_Ic_Simulation(newIc_score[[i]], RandomIc_g[[i]], las = 2, ylab="BioTIP",
main= paste(names(newIc_score)[i],"(",n," transcripts)"),
#fun="matplot", which2point=names(newIc_score)[i])
fun="boxplot", which2point=names(newIc_score)[i], ylim=c(0,0.5))
interesting = which(names(samplesL) == names(newIc_score[i]))
p = length(which(RandomIc_g[[i]][interesting,] >= newIc_score[[i]][names(newIc_score)[i]]))
p = p/ncol(RandomIc_g[[i]])
# first p value (p1) calculated for exactly at tipping point
p2 = length(which(RandomIc_g[[i]] >= newIc_score[[i]][names(newIc_score)[i]]))
p2 = p2/ncol(RandomIc_g[[i]])
p2 = p2/nrow(RandomIc_g[[i]])
# second p value (p2) calculated across all statuses
## local Kernel Density Plot
d <- density(RandomIc_g[[i]]) # returns the density data
plot(d, xlim=range(c(newIc_score[[i]],RandomIc_g[[i]])),
main=paste("Random genes: p.Local=",p)) # plots the results
abline(v=newIc_score[[i]][names(newIc_score)[i]], col="green")
## global Kernel Density Plot
d <- density(unlist(RandomIc_g)) # returns the density data
plot(d, xlim=range(c(newIc_score[[i]],unlist(RandomIc_g))),
main=paste("Random genes: p.Global=",p2)) # plots the results
abline(v=newIc_score[[i]][names(newIc_score)[i]], col="green")
# } Not to run the full loop B plotting
## Second, estimate the random Ic-scores by randomly shulffing cell labels
RandomIc_s = list()
set.seed(2020)
# for(i in 1:length(CTS)){ Not to run the full loop C
i = 1
RandomIc_s[[i]] <- matrix(nrow=length(samplesL), ncol=C)
rownames(RandomIc_s[[i]]) = names(samplesL)
for(j in 1:length(samplesL)) {
ns <- length(samplesL[[j]]) # of cells at the state of interest
RandomIc_s[[i]][j,] <- simulation_Ic_sample(counts, ns,
Ic=BioTIP_score[x],
genes=CTS,
B=C,
fun="BioTIP")
}
names(RandomIc_s)[i] = names(CTS)[i]
# } Not to run the full loop C
# par(mfrow=c(1,length(int)))
# for(i in 1:length(newIc_score)){ Not to run the full loop C plotting
n = length(CTS[[i]])
plot_Ic_Simulation(newIc_score[[i]], RandomIc_s[[i]], las = 2, ylab="BioTIP",
main= paste(names(newIc_score)[i],"(",n," transcripts)"),
fun="boxplot", which2point=names(newIc_score)[i])
interesting = which(names(samplesL) == names(newIc_score[i]))
p = length(which(RandomIc_s[[i]][interesting,] >= newIc_score[[i]][names(newIc_score)[i]]))
p = p/ncol(RandomIc_s[[i]])
# first p value (p1) calculated for exactly at tipping point
p2 = length(which(RandomIc_s[[i]] >= newIc_score[[i]][names(newIc_score)[i]]))
p2 = p2/ncol(RandomIc_s[[i]])
p2 = p2/nrow(RandomIc_s[[i]])
p
p2
# } Not to run the full loop C plotting
## Alternatively, p values is estimated from delta scores
P3 = plot_SS_Simulation(newIc_score[[i]], RandomIc_s[[i]],
xlim=c(0, max(newIc_score[[i]], RandomIc_s[[i]])),
las=2,
main=paste(names(CTS)[i], length(CTS[[i]]), "genes, ", "\n", C, "Shuffling labels"))
## -----------------------------------------------------------------------------
P3
## ---- echo=FALSE, fig.align='center', out.width = '60%'-----------------------
knitr::include_graphics("CTS_Id.jpg")
knitr::include_graphics("delta.jpg")
## ---- echo=FALSE, fig.align='center', out.width = '65%'-----------------------
knitr::include_graphics("Fig2.jpg")
## -----------------------------------------------------------------------------
library(BioTIP)
data(gencode)
head(gencode)
## ----"python code", eval = FALSE----------------------------------------------
# gtf = ("Your/PATH/TO/THE/FILE")
# outF = open("gtf_summary_transbiotype.txt","w")
#
# def getquote(str,f,target):
# targetLen = len(target)+2
# strInd = str.find(target)
# st = strInd + len(target)+2
# ed = st + str[st:].find("";")
# #print(st,ed)
# f.write("\t"+str[st:ed]) if strInd!= -1 else f.write("\t"+"NA.")
#
# with open(gtf, "r") as f:
# for line in f:
# if line[0] != "#":
# chromosome = line.split("\t")[0]
# st = line.split("\t")[3]
# ed = line.split("\t")[4]
# strand = line.split("\t")[6]
# type = line.split("\t")[2]
# outF.write(chromosome+"\t"+st+"\t"+ed+"\t"+strand+"\t"+type)
# c = "transcript_id"
# g = "gene_name"
# t = "transcript_type"
# getquote(line,outF,c)
# getquote(line,outF,g)
# getquote(line,outF,t)
# outF.write("\n")
# outF.close()
## -----------------------------------------------------------------------------
library(BioTIP)
library(GenomicRanges)
data(gencode)
head(gencode)
## -----------------------------------------------------------------------------
query <- GRanges(c("chr1:2-10:+","chr1:6-10:-"), Row.names = c("trans1","trans2"), score = c(1,2))
head(query)
## -----------------------------------------------------------------------------
library(BioTIP)
gr <- GRanges(c("chr1:1-5:+","chr1:2-3:+"),biotype = c("lincRNA","CPC"))
head(gr)
## -----------------------------------------------------------------------------
intron <- GRanges("chr1:6-8:+")
head(intron)
## -----------------------------------------------------------------------------
intron_trncp <- getBiotypes(query, gr, intron)
intron_trncp
## -----------------------------------------------------------------------------
library(BioTIP)
data("intron")
data("ILEF")
data("gencode")
gencode_gr = GRanges(gencode)
ILEF_gr = GRanges(ILEF)
cod_gr = GRanges(cod)
intron_gr = GRanges(intron)
non_coding <- getBiotypes(ILEF_gr, gencode_gr, intron_gr)
dim(non_coding)
head(non_coding[,1:3])
## -----------------------------------------------------------------------------
coding <- getBiotypes(ILEF_gr, gencode_gr)
dim(coding)
head(coding[,1:3])
## -----------------------------------------------------------------------------
library(BioTIP)
data(ILEF)
data(cod)
ILEF_gr = GRanges(ILEF)
cod_gr = GRanges(cod)
rdthrough <- getReadthrough(ILEF_gr, cod_gr)
head(rdthrough)
## ----SessionInfo--------------------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.