Nothing
params <-
list(seed = 20188102)
## ---- eval=FALSE--------------------------------------------------------------
# if (!require("BiocManager"))
# install.packages("BiocManager")
# BiocManager::install("glmSparseNet")
## ----packages, message=FALSE, warning=FALSE-----------------------------------
library(dplyr)
library(Matrix)
library(ggplot2)
library(forcats)
library(parallel)
library(STRINGdb)
library(reshape2)
library(survival)
library(VennDiagram)
library(loose.rock)
library(futile.logger)
library(curatedTCGAData)
library(TCGAutils)
library(glmSparseNet)
#
# Some general options for futile.logger the debugging package
.Last.value <- flog.layout(layout.format('[~l] ~m'))
.Last.value <- loose.rock::show.message(FALSE)
# Setting ggplot2 default theme as minimal
theme_set(ggplot2::theme_minimal())
## ---- eval=FALSE--------------------------------------------------------------
# # Not evaluated in vignette as it takes too long to download and process
# all.interactions.700 <- stringDBhomoSapiens(score_threshold = 700)
# string.network <- buildStringNetwork(all.interactions.700,
# use.names = 'external')
## ---- include=FALSE-----------------------------------------------------------
data('string.network.700.cache', package = 'glmSparseNet')
string.network <- string.network.700.cache
## -----------------------------------------------------------------------------
string.network.undirected <- string.network + Matrix::t(string.network)
string.network.undirected <- (string.network.undirected != 0) * 1
## ---- echo=FALSE, collapse=TRUE-----------------------------------------------
flog.info('Directed graph (score_threshold = %d)', 700)
flog.info(' * total edges: %d', sum(string.network != 0))
flog.info(' * unique protein: %d', nrow(string.network))
flog.info(' * edges per protein: %f',
sum(string.network != 0) / nrow(string.network) )
flog.info('')
flog.info('Undirected graph (score_threshold = %d)', 700)
flog.info(' * total edges: %d', sum(string.network.undirected != 0) / 2)
flog.info(' * unique protein: %d', nrow(string.network.undirected))
flog.info(' * edges per protein: %f',
sum(string.network.undirected != 0)/2/nrow(string.network.undirected))
## ---- echo=FALSE--------------------------------------------------------------
string.network.binary <- (string.network.undirected != 0) * 1
degree.network <- (Matrix::rowSums(string.network.binary) +
Matrix::colSums(string.network.binary)) / 2
flog.info('Summary of degree:', summary(degree.network), capture = TRUE)
## ---- warning=FALSE-----------------------------------------------------------
qplot(degree.network[degree.network <= quantile(degree.network,
probs = .99999)],
geom = 'histogram', fill = my.colors(2), bins = 100) +
theme(legend.position = 'none') + xlab('Degree (up until 99.999% quantile)')
## ---- include=FALSE-----------------------------------------------------------
# chunk not included as it produces to many unnecessary messages
brca <- curatedTCGAData(diseaseCode = "BRCA", assays = "RNASeq2GeneNorm", FALSE)
## ---- eval=FALSE--------------------------------------------------------------
# brca <- curatedTCGAData(diseaseCode = "BRCA", assays = "RNASeq2GeneNorm", FALSE)
## ----data.show, warning=FALSE, error=FALSE------------------------------------
# keep only solid tumour (code: 01)
brca.primary.solid.tumor <- TCGAutils::splitAssays(brca, '01')
xdata.raw <- t(assay(brca.primary.solid.tumor[[1]]))
# Get survival information
ydata.raw <- colData(brca.primary.solid.tumor) %>% as.data.frame %>%
# Convert days to integer
mutate(Days.to.date.of.Death = as.integer(Days.to.date.of.Death),
Days.to.Last.Contact = as.integer(Days.to.Date.of.Last.Contact)) %>%
# Find max time between all days (ignoring missings)
rowwise %>%
mutate(time = max(days_to_last_followup, Days.to.date.of.Death,
Days.to.Last.Contact, days_to_death, na.rm = TRUE)) %>%
# Keep only survival variables and codes
select(patientID, status = vital_status, time) %>%
# Discard individuals with survival time less or equal to 0
filter(!is.na(time) & time > 0) %>% as.data.frame
# Set index as the patientID
rownames(ydata.raw) <- ydata.raw$patientID
# keep only features that are in degree.network and have standard deviation > 0
valid.features <- colnames(xdata.raw)[colnames(xdata.raw) %in%
names(degree.network[degree.network > 0])]
xdata.raw <- xdata.raw[TCGAbarcode(rownames(xdata.raw)) %in%
rownames(ydata.raw), valid.features]
xdata.raw <- scale(xdata.raw)
# Order ydata the same as assay
ydata.raw <- ydata.raw[TCGAbarcode(rownames(xdata.raw)), ]
# Using only a subset of genes previously selected to keep this short example.
set.seed(params$seed)
small.subset <- c('AAK1', 'ADRB1', 'AK7', 'ALK', 'APOBEC3F', 'ARID1B', 'BAMBI',
'BRAF', 'BTG1', 'CACNG8', 'CASP12', 'CD5', 'CDA', 'CEP72',
'CPD', 'CSF2RB', 'CSN3', 'DCT', 'DLG3', 'DLL3', 'DPP4',
'DSG1', 'EDA2R', 'ERP27', 'EXD1', 'GABBR2', 'GADD45A',
'GBP1', 'HTR1F', 'IFNK', 'IRF2', 'IYD', 'KCNJ11', 'KRTAP5-6',
'MAFA', 'MAGEB4', 'MAP2K6', 'MCTS1', 'MMP15', 'MMP9',
'NFKBIA', 'NLRC4', 'NT5C1A', 'OPN4', 'OR13C5', 'OR13C8',
'OR2T6', 'OR4K2', 'OR52E6', 'OR5D14', 'OR5H1', 'OR6C4',
'OR7A17', 'OR8J3', 'OSBPL1A', 'PAK6', 'PDE11A', 'PELO',
'PGK1', 'PIK3CB', 'PMAIP1', 'POLR2B', 'POP1', 'PPFIA3',
'PSME1', 'PSME2', 'PTEN', 'PTGES3', 'QARS', 'RABGAP1',
'RBM3', 'RFC3', 'RGPD8', 'RPGRIP1L', 'SAV1', 'SDC1', 'SDC3',
'SEC16B', 'SFPQ', 'SFRP5', 'SIPA1L1', 'SLC2A14', 'SLC6A9',
'SPATA5L1', 'SPINT1', 'STAR', 'STXBP5', 'SUN3', 'TACC2',
'TACR1', 'TAGLN2', 'THPO', 'TNIP1', 'TP53', 'TRMT2B', 'TUBB1',
'VDAC1', 'VSIG8', 'WNT3A', 'WWOX', 'XRCC4', 'YME1L1',
'ZBTB11', 'ZSCAN21') %>%
sample(size = 50) %>% sort
# make sure we have 100 genes
small.subset <- c(small.subset, sample(colnames(xdata.raw), 51)) %>%
unique %>%
sort
xdata <- xdata.raw[, small.subset[small.subset %in% colnames(xdata.raw)]]
ydata <- ydata.raw %>% select(time, status) %>% filter(!is.na(time) | time < 0)
## -----------------------------------------------------------------------------
#
# Add degree 0 to genes not in STRING network
my.degree <- degree.network[small.subset]
my.string <- string.network.binary[small.subset, small.subset]
## ---- echo=FALSE, warning=FALSE-----------------------------------------------
qplot(my.degree, bins = 100, fill = my.colors(3)) +
theme(legend.position = 'none')
## -----------------------------------------------------------------------------
set.seed(params$seed)
foldid <- balanced.cv.folds(!!ydata$status)$output
## ---- include=FALSE-----------------------------------------------------------
# List that will store all selected genes
selected.genes <- list()
## ---- warning=FALSE, error=FALSE----------------------------------------------
cv.hub <- cv.glmHub(xdata,
Surv(ydata$time, ydata$status),
family = 'cox',
foldid = foldid,
network = my.string,
network.options = networkOptions(min.degree = 0.2))
## ---- echo=FALSE--------------------------------------------------------------
glmSparseNet::separate2GroupsCox(as.vector(coef(cv.hub, s = 'lambda.min')[,1]),
xdata, ydata,
plot.title = 'Full dataset',
legend.outside = FALSE)
selected.genes[['Hub']] <- coef(cv.hub, s = 'lambda.min')[,1] %>%
{ .[. != 0] } %>%
names %>%
geneNames %>%
{ .[['external_gene_name']]}
## ---- warning=FALSE, error=FALSE----------------------------------------------
cv.orphan <- cv.glmOrphan(xdata,
Surv(ydata$time, ydata$status),
family = 'cox',
foldid = foldid,
network = my.string,
network.options = networkOptions(min.degree = 0.2))
## ---- echo=FALSE--------------------------------------------------------------
glmSparseNet::separate2GroupsCox(as.vector(coef(cv.orphan,
s = 'lambda.min')[,1]),
xdata, ydata,
plot.title = 'Full dataset',
legend.outside = FALSE)
selected.genes[['Orphan']] <- coef(cv.orphan, s = 'lambda.min')[,1] %>%
{ .[. != 0] } %>%
names %>%
geneNames %>%
{ .[['external_gene_name']]}
## ---- warning=FALSE, error=FALSE----------------------------------------------
cv.glm <- cv.glmnet(xdata,
Surv(ydata$time, ydata$status),
family = 'cox',
foldid = foldid)
## ---- echo=FALSE--------------------------------------------------------------
glmSparseNet::separate2GroupsCox(as.vector(coef(cv.glm, s = 'lambda.min')[,1]),
xdata, ydata,
plot.title = 'Full dataset',
legend.outside = FALSE)
selected.genes[['GLMnet']] <- coef(cv.glm, s = 'lambda.min')[,1] %>%
{ .[. != 0] } %>%
names %>%
geneNames %>%
{ .[['external_gene_name']]}
## ---- echo=FALSE, warning=FALSE-----------------------------------------------
venn.plot <- venn.diagram(selected.genes ,
NULL,
fill = c(my.colors(5), my.colors(3),
my.colors(4)),
alpha = c(0.3,0.3, .3),
cex = 2,
cat.fontface = 4,
category.names = names(selected.genes))
grid.draw(venn.plot)
## ---- echo=FALSE, warning=FALSE-----------------------------------------------
melt(selected.genes) %>%
mutate(Degree = my.degree[value],
value = factor(value),
L1 = factor(L1)) %>%
mutate(value = fct_reorder(value, Degree)) %>%
as.data.frame %>% ggplot() +
geom_point(aes(value, L1, size = Degree), shape = my.symbols(3),
color = my.colors(3)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0),
legend.position = 'top') +
ylab('Model') + xlab('Gene') +
scale_size_continuous(trans = 'log10')
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.