Nothing
## ----style, echo = FALSE, results = 'asis'------------------------------------
BiocStyle::markdown()
## ----global_options, include=FALSE-------------------------------------------------------------------------------------------------
knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, fig.width=8,
fig.height=8)
options(width=133)
## ----eval=FALSE--------------------------------------------------------------------------------------------------------------------
# ## Do not execute if you have already installed BioMM.
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# # The following initializes usage of Bioc devel
# BiocManager::install(version='devel')
# BiocManager::install("BioMM")
## ----eval=FALSE--------------------------------------------------------------------------------------------------------------------
# install.packages("devtools")
# library("devtools")
# install_github("transbioZI/BioMM", build_vignettes=TRUE)
## ----loadPkg, eval=TRUE, results="hide"--------------------------------------------------------------------------------------------
library(BioMM)
library(BiocParallel)
library(ranger)
library(rms)
library(glmnet)
library(e1071)
library(precrec)
library(vioplot)
library(CMplot)
library(imager)
library(topGO)
library(xlsx)
## ----studyData, eval=TRUE----------------------------------------------------------------------------------------------------------
## DNA methylation data
methylData <- readRDS(system.file("extdata", "/methylData.rds", package="BioMM"))
# The first column is the label, and the rest are the features (e.g., CpGs)
head(methylData[,1:4])
# 0: control and 1: patient
table(methylData[,1])
dim(methylData)
## Gene expression data
expData <- readRDS(system.file("extdata", "/expData.rds", package="BioMM"))
# The first column is the label, and the rest are the features (e.g., genes)
head(expData[,1:4])
# 0: control and 1: patient
table(expData[,1])
dim(expData)
## ----annotationFile, eval=TRUE-----------------------------------------------------------------------------------------------------
## Load feature annotation data
featureAnno <- readRDS(system.file("extdata", "cpgAnno.rds", package="BioMM"))
# The mapping between CpGs and genes (i.e. entrezID or gene symbol)
head(featureAnno)
# total number of CpGs under investigation
str(unique(featureAnno[,1]))
## Reprocessed Gene ontological pathway annotation with 10 and 200 genes for each pathway
golist <- readRDS(system.file("extdata", "goDB.rds", package="BioMM"))
## Number of investigated biological processes
length(golist)
str(golist[1:3])
## Reprocessed KEGG pathway annotation with 10 and 200 genes for each pathway
kegglist <- readRDS(system.file("extdata", "keggDB.rds", package="BioMM"))
## Number of investigated KEGG pathways
length(kegglist)
str(kegglist[1:3])
## ----pathlist, eval=TRUE-----------------------------------------------------------------------------------------------------------
## Feature annotation to pathways
## Use 100 pathways to reduce the runtime for the downstream analysis. But if possible, please make sure to use all.
numPath <- 100
# GO pathway mapping using DNA methylation data
golistSub <- golist[seq_len(numPath)]
methylGOlist <- omics2pathlist(data=methylData, pathlistDB=golistSub,
featureAnno=featureAnno,
restrictUp=200, restrictDown=10, minPathSize=10)
# KEGG pathway mapping using DNA methylation data
kegglistSub <- kegglist[seq_len(numPath)]
methylKEGGlist <- omics2pathlist(data=methylData, pathlistDB=kegglistSub,
featureAnno=featureAnno,
restrictUp=200, restrictDown=10, minPathSize=10)
# GO pathway mapping using gene expression data
golistSub <- golist[seq_len(numPath)]
expGOlist <- omics2pathlist(data=expData, pathlistDB=golistSub,
featureAnno=NULL,
restrictUp=200, restrictDown=10, minPathSize=10)
# KEGG pathway mapping using gene expression data
kegglistSub <- kegglist[seq_len(numPath)]
expKEGGlist <- omics2pathlist(data=expData, pathlistDB=kegglistSub,
featureAnno=NULL,
restrictUp=200, restrictDown=10, minPathSize=10)
## ----BioMMrandForest4methylGO, eval=TRUE-------------------------------------------------------------------------------------------
## To reduce the runtime, only use a subset of DNA methylation data
## However, if possible, subsetting the data is not suggested.
trainData <- methylData[,1:3000]
trainDataY <- trainData[,1]
testData <- NULL
## Model parameters
supervisedStage1=TRUE
classifier <- "randForest"
predMode <- "probability"
paramlist <- list(ntree=100, nthreads=20)
core <- MulticoreParam(workers = 10)
set.seed(123)
result <- BioMM(trainData=trainData, testData=NULL,
pathlistDB=golistSub, featureAnno,
restrictUp=200, restrictDown=10, minPathSize=10,
supervisedStage1, typePCA="regular",
resample1="BS", resample2="CV", dataMode="allTrain",
repeatA1=50, repeatA2=1, repeatB1=10, repeatB2=1,
nfolds=10, FSmethod1=NULL, FSmethod2=NULL,
cutP1=0.05, cutP2=0.05, fdr2=NULL,
FScore=MulticoreParam(workers = 1),
classifier, predMode, paramlist, innerCore=core)
if (is.null(testData)) {
metricCV <- getMetrics(dataY = trainDataY, predY = result)
message("Cross-validation prediction performance:")
print(metricCV)
} else if (!is.null(testData)){
testDataY <- testData[,1]
metricCV <- getMetrics(dataY = trainDataY, cvYscore = result[[1]])
metricTest <- getMetrics(dataY = testDataY, testYscore = result[[2]])
message("Cross-validation performance:")
print(metricCV)
message("Test set prediction performance:")
print(metricTest)
}
## ----BioMMrandForest4expKEGG, eval=TRUE--------------------------------------------------------------------------------------------
## to reduce the runtime, only use a subset of gene expression data
## However, if possible, subsetting the data is not suggested.
trainData <- expData[,1:3000]
trainDataY <- trainData[,1]
testData <- NULL
## Only for gene expression data
featureAnno=NULL
## Model parameters
supervisedStage1=TRUE
classifier <- "randForest"
predMode <- "probability"
paramlist <- list(ntree=100, nthreads=20)
core <- MulticoreParam(workers = 10)
set.seed(123)
result <- BioMM(trainData=trainData, testData=NULL,
pathlistDB=kegglistSub, featureAnno,
restrictUp=200, restrictDown=10, minPathSize=10,
supervisedStage1, typePCA="regular",
resample1="BS", resample2="CV", dataMode="allTrain",
repeatA1=50, repeatA2=1, repeatB1=10, repeatB2=1,
nfolds=10, FSmethod1=NULL, FSmethod2=NULL,
cutP1=0.05, cutP2=0.05, fdr2=NULL,
FScore=MulticoreParam(workers = 1),
classifier, predMode, paramlist, innerCore=core)
## Cross-validation is applied on the training data, therefore 'result' only returns the CV predicted score.
metricCV <- getMetrics(dataY = trainDataY, predY = result)
message("Cross-validation prediction performance:")
print(metricCV)
## ----stage2dataAprep, eval=TRUE----------------------------------------------------------------------------------------------------
## Define the omics type
# omicsType <- "methylation"
omicsType <- "expression"
pathType <- "GO"
pathType <- "KEGG"
if (omicsType == "methylation" & pathType == "GO"){
studylist <- methylGOlist
} else if (omicsType == "methylation" & pathType == "KEGG"){
studylist <- methylKEGGlist
} else if (omicsType == "expression" & pathType == "GO"){
studylist <- expGOlist
} else if (omicsType == "expression" & pathType == "KEGG"){
studylist <- expKEGGlist
} else {
stop("Wrong specified omicsType and pathType!")
}
length(studylist)
## Model parameters
classifier <- "randForest"
predMode <- "probability"
paramlist <- list(ntree=100, nthreads=20)
core <- MulticoreParam(workers = 10)
set.seed(123)
stage2dataA <- reconBySupervised(trainDataList=studylist,
testDataList=NULL,
resample="BS", dataMode="allTrain",
repeatA=50, repeatB=1, nfolds=10,
FSmethod=NULL, cutP=0.05, fdr=NULL,
FScore=MulticoreParam(workers = 1),
classifier, predMode, paramlist,
innerCore=core, outFileA=NULL, outFileB=NULL)
## Check stage-2 data
dim(stage2dataA)
print(table(stage2dataA[,1]))
head(stage2dataA[,1:4])
## ----stage2dataViz, eval=TRUE------------------------------------------------------------------------------------------------------
core <- MulticoreParam(workers = 10)
fileName <- paste0(omicsType,"_", pathType, "_featuresVarExplained.png")
plotVarExplained(data=stage2dataA, posF=TRUE, binarize=FALSE, core=core,
pathTitle=paste0(pathType, " pathways"), fileName)
plot(load.image(fileName))
## ----topPathFeatures, eval=TRUE, fig.show="hold"-----------------------------------------------------------------------------------
core <- MulticoreParam(workers = 1)
rankMetric <- "negPlogit"
filePrefix <- paste0(omicsType, "_", pathType, "_topPath_", rankMetric)
topPath <- plotRankedFeature(data=stage2dataA,
posF=TRUE, topF=10, binarize=FALSE,
blocklist=studylist,
rankMetric=rankMetric,
colorMetric="size", core,
pathTitle=paste0(pathType, " pathways"),
fileName=paste0(filePrefix, ".png"))
plot(load.image(paste0(filePrefix, ".png")))
## ----reportTopPath, eval=TRUE------------------------------------------------------------------------------------------------------
## Report the top pathways
if (pathType == "GO"){
goterms = unlist(Term(GOTERM))
topGOID = gsub("\\.", ":", rownames(topPath))
subTerm = goterms[is.element(names(goterms), topGOID)]
topIDmatch = subTerm[match(topGOID, names(subTerm))]
topPath <- data.frame(topPath, Description=topIDmatch)
} else if (pathType == "KEGG"){
## A matching list between KEGG ID and names. Data freezes on Aug 2020
keggID2name <- readRDS(system.file("extdata", "/keggID2name202008.rds",
package="BioMM"))
keggSub <- keggID2name[is.element(keggID2name[,"ID"], rownames(topPath)),]
topIDmatch <- keggSub[match(rownames(topPath), keggSub[,"ID"]),]
topPath <- data.frame(topPath, Description=topIDmatch[,"name"])
}
print(topPath)
write.xlsx(topPath,file=paste0(filePrefix, ".xlsx"))
# write.table(topPath,file=paste0(filePrefix, ".txt"), sep="\t")
## ----cirPlot, eval=TRUE, fig.show="hold"-------------------------------------------------------------------------------------------
core <- MulticoreParam(workers = 10)
pathID <- gsub("\\.", ":", rownames(topPath))
## The number of top pathways must be bigger than overall investigated pathways
pathSet <- studylist[is.element(names(studylist), pathID)]
pathMatch <- pathSet[match(pathID, names(pathSet))]
fileName <- paste0(omicsType, "_", pathType, "_SigRankBy_", rankMetric)
cirPlot4pathway(datalist=pathMatch, topPathID=names(pathMatch), core, fileName)
## ----sessioninfo, eval=TRUE--------------------------------------------------------------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.