pipe <- as.character(commandArgs(TRUE)[1])
type <- as.character(commandArgs(TRUE)[2])
bsruns <- as.numeric(commandArgs(TRUE)[3]) # bsruns <- 10
startrun <- as.numeric(commandArgs(TRUE)[4]) # startrun <- 10
if (type %in% c("rf", "svm", "nn")) {
options(expressions = 500000)
}
safeSave <- function(..., list = character(), file = stop("'file' must be specified"),
ascii = FALSE, version = NULL, envir = parent.frame(), compress = isTRUE(!ascii),
compression_level, eval.promises = TRUE, precheck = TRUE) {
i <- 1
while(file.exists(paste0(file, "_", i))) {
i <- i + 1
}
save(..., list = list, file = paste0(file, "_", i),
ascii = ascii, version = version, envir = envir, compress = compress,
compression_level = compression_level, eval.promises = eval.promises, precheck = precheck)
}
aucs <- function(a,b) {
auc <- roc <- 0
ppv <- rec <- spec <- NULL
for (cut in c(2,seq(1,0, length.out = 100),-1)) {
tmp <- a*0
tmp[which(a > cut)] <- 1
tp <- sum(tmp == 1 & b == 1)
fp <- sum(tmp == 1 & b == 0)
tn <- sum(tmp == 0 & b == 0)
fn <- sum(tmp == 0 & b == 1)
ppvtmp <- tp/(tp+fp)
if (is.na(ppvtmp)) { ppvtmp <- 0.5 }
rectmp <- tp/(tp+fn)
spectmp <- 1-tn/(tn+fp)
if (length(ppv) > 0) {
auc <- auc + (rectmp-rec[length(rec)])*(ppvtmp+ppv[length(ppv)])/2
roc <- roc + (spectmp-spec[length(spec)])*(rectmp+rec[length(rec)])/2
}
ppv <- c(ppv, ppvtmp)
rec <- c(rec, rectmp)
spec <- c(spec, spectmp)
}
return(list(auc=auc,roc=roc,ppv=ppv,rec=rec,spec=spec))
}
path <- "/cluster/work/bewi/members/mpirkl/"
if (pipe %in% "loo") {
library(naturalsort)
library(nem)
library(cluster)
library(Rcpp)
library(Rgraphviz)
library(randomForest)
library(e1071)
source("mnems.r")
source("mnems_low.r")
sourceCpp(code=readChar("mm.cpp", file.info("mm.cpp")$size))
source("nempi_main.r")
source("nempi_low.r")
load("TCGA-BRCA_nempi.rda")
## sourceCpp("~/Documents/mnem/src/mm.cpp"); source("~/Documents/mnem/R/mnems.r"); source("~/Documents/mnem/R/mnems_low.r"); source("~/Documents/nempi/R/nempi_main.r"); source("~/Documents/nempi/R/nempi_low.r")
## load("nempi_backup/backup3/TCGA-BRCA_nempi.rda")
P <- Pmut
P <- P[which(rownames(P) %in% rownames(Rho)), which(colnames(P) %in% colnames(Rho))]
P <- P[order(rownames(P)), order(colnames(P))]
P[which(P > 1)] <- 1
Ptmp <- cbind(P, matrix(0, nrow(P), sum(!(colnames(Rho) %in% colnames(P)))))
colnames(Ptmp) <- c(colnames(P), colnames(Rho)[which(!(colnames(Rho) %in% colnames(P)))])
P <- Ptmp[, colnames(Rho)]
D4 <- D2
colnames(D4) <- apply(Rho, 2, function(x) {
Sgenes <- paste(sort(rownames(Rho)[which(x > 0)]), collapse = "_")
return(Sgenes)
})
D5 <- D4[, which(apply(P, 2, sum) != 0)]
genes.med <- apply(D5, 1, median)
D5 <- D5[which(genes.med != 0), ]
PM <- P[, which(apply(P, 2, sum) != 0)]
tp <- fp <- fn <- tn <- prauc <- numeric(ncol(D5))
if (type %in% "nempi") {
converged <- 1
for (i in startrun:(startrun+bsruns-1)) {
if (i > ncol(D5)) { break() }
D6 <- D5[, -i]
Rho2 <- PM[, -i]
Rho3 <- PM
Rho3[, i] <- 0
nempitmp <- nempi(D6, Gamma = Rho2, full = TRUE, converged = converged)
Rho4 <- Rho3
Rho4[, -i] <- nempitmp$Gamma
nempitmp <- nempi(D5, Gamma = Rho4, full = FALSE, converged = converged)
Rho5 <- t(mytc(nempitmp$res$adj))%*%nempitmp$Gamma
tp[i] <- sum(PM[, i] == 1 & Rho5[, i] >= 1/8)
fp[i] <- sum(PM[, i] == 0 & Rho5[, i] >= 1/8)
fn[i] <- sum(PM[, i] == 1 & Rho5[, i] < 1/8)
tn[i] <- sum(PM[, i] == 0 & Rho5[, i] < 1/8)
prauc[i] <- aucs(Rho5[, i],PM[, i])$auc
}
safeSave(prauc, tp, fp, fn, tn, file = paste0(path, "nempi_bs/nempi_loo.rda"))
} else if (type %in% "svm") {
for (i in startrun:(startrun+bsruns-1)) {
if (i > ncol(D5)) { break() }
D6 <- D5[, -i]
Rho2 <- PM[, -i]
Rho3 <- PM
Rho3[, i] <- 0
svmres <- classpi(D6, full = TRUE, method = "svm")
Rho4 <- Rho3
Rho4[, -i] <- svmres$Gamma
D7 <- D5
colnames(D7)[i] <- ""
svmres <- classpi(D7, full = FALSE, method = "svm")
Rho5 <- svmres$Gamma
tp[i] <- sum(PM[, i] == 1 & Rho5[, i] >= 1/8)
fp[i] <- sum(PM[, i] == 0 & Rho5[, i] >= 1/8)
fn[i] <- sum(PM[, i] == 1 & Rho5[, i] < 1/8)
tn[i] <- sum(PM[, i] == 0 & Rho5[, i] < 1/8)
prauc[i] <- aucs(Rho5[, i],PM[, i])$auc
}
safeSave(prauc, tp, fp, fn, tn, file = paste0(path, "nempi_bs/svm_loo.rda"))
} else if (type %in% "rf") {
for (i in startrun:(startrun+bsruns-1)) {
if (i > ncol(D5)) { break() }
D6 <- D5[, -i]
Rho2 <- PM[, -i]
Rho3 <- PM
Rho3[, i] <- 0
svmres <- classpi(D6, full = TRUE, method = "randomForest")
Rho4 <- Rho3
Rho4[, -i] <- svmres$Gamma
D7 <- D5
colnames(D7)[i] <- ""
svmres <- classpi(D7, full = FALSE, method = "randomForest")
Rho5 <- svmres$Gamma
tp[i] <- sum(PM[, i] == 1 & Rho5[, i] >= 1/8)
fp[i] <- sum(PM[, i] == 0 & Rho5[, i] >= 1/8)
fn[i] <- sum(PM[, i] == 1 & Rho5[, i] < 1/8)
tn[i] <- sum(PM[, i] == 0 & Rho5[, i] < 1/8)
prauc[i] <- aucs(Rho5[, i],PM[, i])$auc
}
safeSave(prauc, tp, fp, fn, tn, file = paste0(path, "nempi_bs/rf_loo.rda"))
} else if (type %in% "nn") {
for (i in startrun:(startrun+bsruns-1)) {
if (i > ncol(D5)) { break() }
D6 <- D5[, -i]
Rho2 <- PM[, -i]
Rho3 <- PM
Rho3[, i] <- 0
library(nnet)
svmres <- classpi(D6, full = TRUE, method = "nnet", MaxNWts = 50000, size = 1)
Rho4 <- Rho3
Rho4[, -i] <- svmres$Gamma
D7 <- D5
colnames(D7)[i] <- ""
svmres <- classpi(D7, full = FALSE, method = "nnet", MaxNWts = 50000, size = 1)
Rho5 <- svmres$Gamma
tp[i] <- sum(PM[, i] == 1 & Rho5[, i] >= 1/8)
fp[i] <- sum(PM[, i] == 0 & Rho5[, i] >= 1/8)
fn[i] <- sum(PM[, i] == 1 & Rho5[, i] < 1/8)
tn[i] <- sum(PM[, i] == 0 & Rho5[, i] < 1/8)
prauc[i] <- aucs(Rho5[, i],PM[, i])$auc
}
safeSave(prauc, tp, fp, fn, tn, file = paste0(path, "nempi_bs/nn_loo.rda"))
} else if (type %in% "mf") {
for (i in startrun:(startrun+bsruns-1)) {
if (i > ncol(D5)) { break() }
D6 <- D5
colnames(D6)[i] <- ""
Rho2 <- PM
Rho2[, i] <- 0
Rho3 <- PM
Rho3[, i] <- 0
mfdata <- cbind(as.data.frame(t(D6)), factor(colnames(D6)))
mfdata[which(mfdata == "", arr.ind = TRUE)] <- NA
library(missForest)
mfimp <- missForest(mfdata)
colnames(D6) <- mfimp$ximp[, ncol(mfimp$ximp)]
tmp <- nem(D6)
ures <- list()
ures$Gamma <- apply(getGamma(D6), 2, function(x) return(x/sum(x)))
ures$res <- list()
ures$res$adj <- tmp$adj
ures$null <- TRUE
ures$combi <- 1
Rho5 <- ures$Gamma
tp[i] <- sum(PM[, i] == 1 & Rho5[, i] >= 1/8)
fp[i] <- sum(PM[, i] == 0 & Rho5[, i] >= 1/8)
fn[i] <- sum(PM[, i] == 1 & Rho5[, i] < 1/8)
tn[i] <- sum(PM[, i] == 0 & Rho5[, i] < 1/8)
prauc[i] <- aucs(Rho5[, i],PM[, i])$auc
}
safeSave(prauc, tp, fp, fn, tn, file = paste0(path, "nempi_bs/mf_loo.rda"))
} else if (type %in% "knn") {
for (i in startrun:(startrun+bsruns-1)) {
if (i > ncol(D5)) { break() }
D6 <- D5
colnames(D6)[i] <- ""
Rho2 <- PM
Rho2[, i] <- 0
Rho3 <- PM
Rho3[, i] <- 0
library(class)
train <- t(D6[, which(colnames(D6) != "")])
test <- t(D6[, which(colnames(D6) == "")])
knn0 <- 0
cl <- colnames(D6)[which(colnames(D6) != "")]
knnres <- knn(train, test, cl, prob = TRUE)
D3 <- D6
colnames(D3)[which(colnames(D3) %in% "")] <- as.character(knnres)
tmp <- nem(D3)
ures <- list()
ures$Gamma <- apply(getGamma(D3), 2, function(x) return(x/sum(x)))
ures$res <- list()
ures$res$adj <- tmp$adj
ures$null <- TRUE
ures$combi <- 1
Rho5 <- ures$Gamma
tp[i] <- sum(PM[, i] == 1 & Rho5[, i] >= 1/8)
fp[i] <- sum(PM[, i] == 0 & Rho5[, i] >= 1/8)
fn[i] <- sum(PM[, i] == 1 & Rho5[, i] < 1/8)
tn[i] <- sum(PM[, i] == 0 & Rho5[, i] < 1/8)
prauc[i] <- aucs(Rho5[, i],PM[, i])$auc
}
safeSave(prauc, tp, fp, fn, tn, file = paste0(path, "nempi_bs/knn_loo.rda"))
} else if (type %in% "rand") {
for (i in startrun:(startrun+bsruns-1)) {
if (i > ncol(D5)) { break() }
Rho5 <- PM
Rho5[,i] <- runif(nrow(Rho5))
tp[i] <- sum(PM[, i] == 1 & Rho5[, i] >= 1/8)
fp[i] <- sum(PM[, i] == 0 & Rho5[, i] >= 1/8)
fn[i] <- sum(PM[, i] == 1 & Rho5[, i] < 1/8)
tn[i] <- sum(PM[, i] == 0 & Rho5[, i] < 1/8)
prauc[i] <- aucs(Rho5[, i],PM[, i])$auc
}
safeSave(prauc, tp, fp, fn, tn, file = paste0(path, "nempi_bs/rand_loo.rda"))
}
stop("successfull loo")
} else {
library(snowfall)
library(TCGAbiolinks)
if (is.null(types)) {
types <- TCGAbiolinks:::getGDCprojects()$project_id
## special "FM-AD"
dont <- paste0(unique(gsub("-.*", "", types)), "-")
dont <- dont[-grep("TCGA", dont)]
samplenr <- matrix(NA, length(types), 2)
rownames(samplenr) <- types
donr <- TRUE
snrcount <- 0
} else {
dont <- "AGCT"
donr <- FALSE
}
sizemat <- matrix(0, 1, 2)
colnames(sizemat) <- c("Tumor", "Normal")
rownames(sizemat) <- ""
path <- "mutclust/"
for (type in types) {
if (donr) {
snrcount <- snrcount + 1
}
if (length(grep(paste(dont, collapse = "|"), type)) > 0) { next() }
print(type)
if (file.exists(paste0(path, type, "_final.rda")) & !newmut & !newllr & !newsave) {
load(paste0(path, type, "_final.rda"))
} else {
summ <- TCGAbiolinks:::getProjectSummary(type)
library(SummarizedExperiment)
## get methylation:
if (file.exists(paste0(path, type, "_meth.rda"))) {
load(paste0(path, type, "_meth.rda"))
} else {
data <- GDCquery(project = paste(type, sep = ""),
sample.type = "Primary solid Tumor",
data.category = "DNA methylation",
data.type = "Methylation beta value",
legacy = TRUE
)
GDCdownload(data)
data <- GDCprepare(data, summarizedExperiment = 0)
## map methy sites to genes:
library(methyAnalysis)
if (is.data.frame(data)) {
meth2 <- as.matrix(data[, -(1:3)])
rownames(meth2) <- gsub(";.*", "", data[, 1])
} else {
meth <- data@rowRanges
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
probe2gene <- annotateDMRInfo(meth, 'TxDb.Hsapiens.UCSC.hg19.knownGene')
meth2 <- assay(data)
rownames(meth2) <- probe2gene$sigDMRInfo@elementMetadata@listData$GeneSymbol
}
meth <- meth2
meth <- meth[which(apply(meth, 1, function(x) return(any(is.na(x)))) == FALSE), ]
methm <- meth[which(duplicated(rownames(meth)) == FALSE), ]
count <- 0
for (i in which(duplicated(rownames(meth)) == FALSE)) {
count <- count + 1
methm[count, ] <- apply(meth[which(rownames(meth) %in% rownames(methm)[i]), , drop = FALSE], 2, median)
}
meth <- methm
meth <- meth[order(rownames(meth)), order(colnames(meth))]
colnames(meth) <- gsub("A$", "", lapply(strsplit(colnames(meth), "-"), function(x) { y <- x[1:4]; y <- paste(y, collapse = "-"); return(y) }))
methm <- meth[, which(duplicated(colnames(meth)) == FALSE)]
for (i in which(duplicated(colnames(meth)) == TRUE)) {
j <- which(colnames(methm) == colnames(meth)[i])
methm[, j] <- apply(meth[, which(colnames(meth) %in% colnames(methm)[i]), drop = FALSE], 2, median)
}
meth <- methm
meth <- meth[order(rownames(meth)), order(colnames(meth))]
save(meth, file = paste0(path, type, "_meth.rda"))
}
print("meth done")
## get copy number variation:
if (file.exists(paste0(path, type, "_cnv.rda"))) {
load(paste0(path, type, "_cnv.rda"))
} else {
data <- getGistic(gsub("TCGA-", "", type), type = "thresholded")
## data <- GDCquery(project = paste(type, sep = ""),
## sample.type = "Primary solid Tumor",
## data.category = "Copy Number Variation",
## data.type = "Copy Number Segment",
## )
## GDCdownload(data)
## data <- GDCprepare(data)
cnv <- data[, -(1:3)]
cnv <- apply(cnv, c(1,2), as.numeric)
rownames(cnv) <- data[, 1]
colnames(cnv) <- gsub("A$", "", lapply(strsplit(colnames(cnv), "-"), function(x) { y <- x[1:4]; y <- paste(y, collapse = "-"); return(y) }))
cnv <- cnv[order(rownames(cnv)), order(colnames(cnv))]
save(cnv, file = paste0(path, type, "_cnv.rda"))
}
print("cnv done")
## get expression
if (file.exists(paste0(path, type, "_query.rda"))) {
load(paste0(path, type, "_query.rda"))
} else {
if (length(grep("-AML$|-LAML$", type)) > 0) {
sampletype <- c("Primary Blood Derived Cancer - Peripheral Blood", "Solid Tissue Normal")
} else {
sampletype <- c("Primary Tumor", "Solid Tissue Normal")
}
data <- GDCquery(project = paste(type, sep = ""),
sample.type = sampletype,
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - Counts"
)
GDCdownload(data)
if (is.null(data)) {
print("is null")
next()
}
data <- GDCprepare(data)
save(data, file = paste0(path, type, "_query.rda"))
}
## process expression data:
D <- assay(data)
class <- data@colData@listData$definition
print("gene expression done")
## get mutation
if (file.exists(paste0(path, type, "_mut.rda"))) {
load(paste0(path, type, "_mut.rda"))
} else {
type2 <- gsub(paste(paste0(unique(gsub("-.*", "", types)), "-"), collapse = "|"), "", type)
library(data.table)
GDCquery_Maf(tumor = type2, save.csv = TRUE, pipeline = "varscan2") # mutation file
GDCquery_Maf(tumor = type2, save.csv = TRUE, pipeline = "muse") # mutation file
GDCquery_Maf(tumor = type2, save.csv = TRUE, pipeline = "somaticsniper") # mutation file
GDCquery_Maf(tumor = type2, save.csv = TRUE, pipeline = "mutect2") # mutation file
mut <- list()
count <- 1
type3 <- gsub("-", "\\.", type)
for (i in list.files("GDCdata")) {
if (length(grep(type3, i)) > 0 & length(grep("csv", i)) > 0) {
mut[[count]] <- fread(paste0("GDCdata/", i))
count <- count + 1
}
}
save(mut, file = paste0(path, type, "_mut.rda"))
}
## clinical
if (file.exists(paste0(path, type, "_clin.rda"))) {
load(paste0(path, type, "_clin.rda"))
} else {
clinical <- GDCquery_clinic(project = type, type = "clinical")
save(clinical, file = paste0(path, type, "_clin.rda"))
}
## process mutation data: (https://cancer.sanger.ac.uk/cosmic/help/mutation/overview)
if (file.exists(paste0(path, type, "_mut0.rda")) & !newmut) {
load(paste0(path, type, "_mut0.rda"))
} else {
n <- nmut # try something to get all patients with at least one mutation
library(snowfall)
countSamples <- function(i, mut.org, genes, mut.mat, coln) {
i <- which(rownames(mut.mat) %in% genes[i])
tmp <- mut.org[which(mut.org$Hugo_Symbol %in% rownames(mut.mat)[i]), coln]
tmp2 <- mut.mat[i, ]
tmp2[which(colnames(mut.mat) %in% tmp)] <- 1
return(tmp2)
}
typeSamples <- function(i, mut.org, genes, type.mat, coln, coln2) {
i <- which(rownames(type.mat) %in% genes[i])
tmp <- mut.org[which(mut.org$Hugo_Symbol %in% rownames(type.mat)[i]), coln]
tmp3 <- mut.org[which(mut.org$Hugo_Symbol %in% rownames(type.mat)[i]), coln2]
tmp2 <- type.mat[i, ]
tmp2[which(colnames(type.mat) %in% tmp)] <- tmp3
return(tmp2)
}
biggl <- list()
for (i in length(mut)) {
mutation <- mut[[i]]
biggl[[i]] <- mutation$Hugo_Symbol
}
freq <- sort(table(unlist(biggl)), decreasing = TRUE)
if (n == 0) {
allsub <- names(freq)
} else {
allsub <- names(freq)[1:n]
}
M <- Mtype <- list()
for (i in 1:length(mut)) {
mutation <- mut[[i]]
mut.mat <- matrix(0, length(allsub), length(unique(mutation$Tumor_Sample_Barcode)))
type.mat <- matrix("", length(allsub), length(unique(mutation$Tumor_Sample_Barcode)))
colnames(type.mat) <- colnames(mut.mat) <- sort(unique(mutation$Tumor_Sample_Barcode))
rownames(type.mat) <- rownames(mut.mat) <- allsub
coln <- which(colnames(mutation) %in% "Tumor_Sample_Barcode")
coln2 <- which(colnames(mutation) %in% "Variant_Classification")
mut.org <- mutation[which(mutation$Hugo_Symbol %in% allsub), ]
sfInit(parallel = T, cpus = 4)
sfExport("mut.mat", "coln")
tmp <- sfLapply(as.list(1:length(allsub)), countSamples, mut.org, allsub, mut.mat, coln)
sfStop()
tmp <- do.call("rbind", tmp)
rownames(tmp) <- allsub
colnames(tmp) <- colnames(mut.mat)
M[[i]] <- tmp
sfInit(parallel = T, cpus = 4)
sfExport("type.mat", "coln2")
tmp <- sfLapply(as.list(1:length(allsub)), typeSamples, mut.org, allsub, type.mat, coln, coln2)
sfStop()
tmp <- do.call("rbind", tmp)
rownames(tmp) <- allsub
colnames(tmp) <- colnames(mut.mat)
Mtype[[i]] <- tmp
}
samples <- intersect(intersect(intersect(colnames(M[[1]]), colnames(M[[2]])), colnames(M[[3]])), colnames(M[[4]]))
M0 <- M[[1]][, which(colnames(M[[1]]) %in% samples)]
Mtype0 <- Mtype[[1]][, which(colnames(Mtype[[1]]) %in% samples)]
for (i in 2:length(M)) {
M0 <- M0 + M[[i]][, which(colnames(M[[i]]) %in% samples)]
Mtype0 <- matrix(paste(Mtype0, Mtype[[i]][, which(colnames(Mtype[[i]]) %in% samples)]), nrow(Mtype0))
}
rownames(Mtype0) <- rownames(M0)
colnames(Mtype0) <- colnames(M0)
save(M0, Mtype0, file = paste0(path, type, "_mut0.rda"))
}
## process expression data:
D <- assay(data)
class <- data@colData@listData$definition
M <- M0
Mtype <- Mtype0
colnames(M) <- lapply(colnames(M), function(x) {
y <- unlist(strsplit(x, "-"))
y <- paste(y[1:4], collapse = "-")
y <- unlist(strsplit(y, ""))
y <- paste(y[1:(length(y)-1)], collapse = "")
return(y)
})
colnames(D) <- lapply(colnames(D), function(x) {
y <- unlist(strsplit(x, "-"))
y <- paste(y[1:4], collapse = "-")
y <- unlist(strsplit(y, ""))
y <- paste(y[1:(length(y)-1)], collapse = "")
return(y)
})
colnames(M) <- gsub("A$", "", lapply(strsplit(colnames(M), "-"), function(x) { y <- x[1:4]; y <- paste(y, collapse = "-"); return(y) }))
M <- M[order(rownames(M)), order(colnames(M))]
Mtype <- Mtype[order(rownames(Mtype)), order(colnames(Mtype))]
print("mutation done")
## log odds:
if (file.exists(paste0(path, type, "_llr.rda")) & !newllr) {
load(paste0(path, type, "_llr.rda"))
} else {
library(edgeR)
if (sum(class %in% "Solid Tissue Normal") < 10) {
distrParNC <- function(i, data) {
data[i, ] <- data[i, ] - median(data[, i])
llrcol <- numeric(ncol(data))
div0 <- quantile(data[i, ], 0.25)
div1 <- quantile(data[i, ], 0.75)
sigma <- sd(data[i, ])
for (j in 1:ncol(data)) {
if (data[i, j] <= 0) {
llrcol[j] <- log2(div0/data[i, j])
} else {
llrcol[j] <- log2(data[i, j]/div1)
}
}
return(llrcol)
}
highcounts <- which(apply(D, 1, median) >= 10)
DC <- D[highcounts, ]
genenames <- rownames(D)[highcounts, ]
nf <- calcNormFactors(DC)
DC <- t(t(DC)/nf) # DC <- DC2
## this is very adventurous:
## DC <- t(scale(t(DC)))
## DC <- abs(DC)
## pc <- 1-min(DC)
## tmp <- log2(DC+pc)
## hist(tmp)
sfInit(parallel = T, cpus = 4)
sfExport("DC")
tmp <- do.call("rbind", sfLapply(1:nrow(DC),
distrParNC, DC))
colnames(tmp) <- colnames(DC)
rownames(tmp) <- genenames
sfStop()
DF <- DC
} else {
library(ks)
mykcde <- function(x, k) {
a <- which.min(abs(k$eval.points - x))
b <- k$estimate[a]
b <- min(b, 1-b)
return(b)
}
distrParKs <- function(i, data, C) {
llrcol <- numeric(ncol(data))
ddistr <- list()
dogenes <- unique(colnames(data))[which(!(unique(colnames(data)) %in% ""))]
for (j in dogenes) {
D <- which(colnames(data) %in% j)
ddistr[[j]] <- kcde(data[i, D])
}
cdistr <- kcde(data[i, C])
for (j in which(!(colnames(data) %in% ""))) {
gene <- colnames(data)[j]
llrcol[j] <- log2(mykcde(data[i, j], ddistr[[gene]])/mykcde(data[i,j], cdistr))
}
llrcol <- llrcol[-C]
return(llrcol)
}
DN <- D[, which(class %in% "Solid Tissue Normal")]
DT <- D[, which(class %in% "Primary solid Tumor")]
DF <- cbind(DT, DN)
C <- (ncol(DT)+1):ncol(DF)
highcounts <- which(apply(DF, 1, median) >= 10)
DF <- DF[highcounts, ]
genenames <- rownames(DF)
colnames(DF)[1:ncol(DT)] <- "P" # not knock-down specific!
colnames(DF)[grep(paste(unique(gsub("-.*", "", types)), collapse = "|"), colnames(DF))] <- ""
nf <- calcNormFactors(DF)
DF <- t(t(DF)/nf)
sfInit(parallel = T, cpus = 4)
sfExport("DF", "C", "mykcde")
sfLibrary(ks)
tmp <- do.call("rbind", sfLapply(1:nrow(DF),
distrParKs, DF, C))
sfStop()
colnames(tmp) <- colnames(DT)
}
save(tmp, DF, file = paste0(path, type, "_llr.rda"))
}
rownames(tmp) <- rownames(DF)
par(mfrow=c(1,3))
hist(tmp)
tmp[which(is.na(tmp) | is.infinite(tmp))] <- 0
hist(tmp)
D <- tmp
print("expression done")
## sd.glob <- sd(tmp)
## tmp <- tmp[-which(apply(tmp, 1, sd) < sd.glob), ]
## hist(tmp)
## prep clinical data:
clinical[which(clinical$vital_status%in% "dead"), which(colnames(clinical) %in% "vital_status")] <- 1
clinical[which(clinical$vital_status%in% "alive"), which(colnames(clinical) %in% "vital_status")] <- 0
count <- 0
for (stage in sort(unique(clinical$tumor_stage))) {
clinical[which(clinical$tumor_stage%in% stage), which(colnames(clinical) %in% "tumor_stage")] <- count
count <- count + 1
}
clinical$tumor_stage <- as.numeric(clinical$tumor_stage)
clinical[which(is.na(clinical$days_to_death)), which(colnames(clinical) %in% "days_to_death")] <- clinical[which(is.na(clinical$days_to_death)), which(colnames(clinical) %in% "days_to_last_follow_up")]
clinical$vital_status<- as.numeric(clinical$vital_status)
print("clinical done")
save(clinical, D, M, Mtype, DF, class, meth, cnv, file = paste0(path, type, "_final.rda"))
}
print(table(class))
sizemat <- rbind(sizemat, table(class))
rownames(sizemat)[nrow(sizemat)] <- type
if (donr) {
samplenr[snrcount, 1] <- sum(class %in% "Primary solid Tumor")
samplenr[snrcount, 2] <- sum(class %in% "Solid Tissue Normal")
}
}
stop("tcga done")
}
## commands for hpc:
system("scp ~/Documents/mnem/src/mm.cpp euler:")
system("scp ~/Documents/mnem/R/mnems.r euler:")
system("scp ~/Documents/mnem/R/mnems_low.r euler:")
system("scp ~/Documents/nempi/R/nempi_main.r euler:")
system("scp ~/Documents/nempi/R/nempi_low.r euler:")
system("scp testing/nempi/other/nempi_loo.r euler:")
do="loo"
rm error.txt
rm output.txt
ram=8000 # nempi 8000, rf 64000, svm 20000, mf 8000, knn 8000, nn 20000
queue=4 # nempi 2min, rf 90min, svm 10min, mf 2min, knn 1min, nn 40min
bsruns=10
startrun=1
R="R/bin/R"
type="nn" # nempi, svm, rf, nn, knn, mf
if [ ${type} == 'rf' ] || [ ${type} == 'svm' ] || [ ${type} == 'nn' ]
then
maxpp=500000
else
maxpp=50000
fi
if [ ${type} == 'rf' ]
then
ram=64000
bsruns=1
fi
if [ ${type} == 'svm' ]
then
ram=20000
fi
if [ ${type} == 'nn' ]
then
ram=20000
bsruns=1
fi
bsub -M ${ram} -q normal.${queue}h -n 1 -e error.txt -o output.txt -R "rusage[mem=${ram}]" "${R} --vanilla --max-ppsize ${maxpp} --silent --no-save --args '${do}' '${type}' '${bsruns}' '${startrun}' < nempi_loo.r"
end=$(expr 91 + $bsruns )
end=$(expr $end / $bsruns )
## end=85
for (( i=2; i<=$end; i++ )); do
a=$(expr ${i} - 1 )
b=$(expr ${a} \* ${bsruns} )
startrun=$(expr ${b} + 1 )
echo $startrun
bsub -M ${ram} -q normal.${queue}h -n 1 -e error.txt -o output.txt -R "rusage[mem=${ram}]" "${R} --vanilla --max-ppsize ${maxpp} --silent --no-save --args '${do}' '${type}' '${bsruns}' '${startrun}' < nempi_loo.r"
done
for i in {2..10}; do
bsub -M ${ram} -q normal.${queue}h -n 1 -e error.txt -o output.txt -R "rusage[mem=${ram}]" "${R} --vanilla --max-ppsize ${maxpp} --silent --no-save --args '${do}' '${type}' '${bsruns}' '${startrun}' < nempi_loo.r"
done
## read in loo results:
methods <- c("nempi", "svm", "nn", "rf", "mf", "knn", "rand")
methlist <- list()
comparePre <- compareRec <- comparePrauc <- NULL
for (method in methods) {
print(method)
methlist[[method]] <- list()
tp2 <- tn2 <- fp2 <- fn2 <- prauc2 <- numeric(91)
for (file in list.files("~/Mount/Eulershare/nempi_bs")) {
if (length(grep(paste0("^", method, "_"), file)) > 0 &
length(grep(paste0("_loo\\."), file)) > 0) {
load(paste0("~/Mount/Eulershare/nempi_bs/", file))
tp2 <- tp2 + tp
tn2 <- tn2 + tn
fp2 <- fp2 + fp
fn2 <- fn2 + fn
prauc2 <- prauc2 + prauc
}
}
methlist[[method]][["tp"]] <- tp2
methlist[[method]][["tn"]] <- tn2
methlist[[method]][["fp"]] <- fp2
methlist[[method]][["fn"]] <- fn2
methlist[[method]][["pre"]] <- tp2/(tp2+fp2)
methlist[[method]][["rec"]] <- tp2/(tp2+fn2)
methlist[[method]][["rec"]][which(is.na(methlist[[method]][["rec"]]))] <- 0
methlist[[method]][["prauc"]] <- prauc2
comparePre <- cbind(comparePre, methlist[[method]][["pre"]])
compareRec <- cbind(compareRec, methlist[[method]][["rec"]])
comparePrauc <- cbind(comparePrauc, methlist[[method]][["prauc"]])
}
setEPS()
postscript("temp.eps", height = 7, width = 10)
layout.mat <- matrix(c(rep(c(1,4,7),each=2),10,rep(c(2,5,8),each=2),11,rep(c(3,6,9),each=2),12),7)
par(mfrow=c(1,2))
col <- c("red", "blue", "darkgreen", "brown", "orange", "turquoise","grey")
mnem:::myboxplot(comparePre, dens = 0, col = col, xaxt = "n", main = "precision of mutation calls", ylab = "precision",box=1,border = col,medcol="black")
axis(1, 1:7, round(apply(comparePre,2,mean), 2))
mnem:::myboxplot(compareRec, dens = 0, col = col, xaxt = "n", main = "recall of mutation calls", ylab = "recall",box=1,border = col,medcol="black")
axis(1, 1:7, round(apply(compareRec,2,mean), 2))
dev.off()
setEPS()
postscript("temp.eps", height = 5, width = 5)
layout.mat <- matrix(c(rep(rep(1,each=3),3),2:4),4,byrow=1)
layout(layout.mat)
wtest <- numeric(ncol(comparePrauc))
for (i in 1:length(wtest)) {
wtest[i] <- wilcox.test(comparePrauc[,1],comparePrauc[,i],alternative="greater")$p.value
}
col <- c("red", "blue", "darkgreen", "brown", "orange", "turquoise","grey")
mnem:::myboxplot(comparePrauc, dens = 0, col = col, xaxt = "n", main = "area under the precision-recall curve of mutation calls", ylab = "area under the precision-recall curve",box=1,border = col,medcol="black")
axis(1, 1:7, round(apply(comparePrauc,2,mean), 2))
par(mar=rep(0, 4))
plot.new()
legend("topleft",legend=c(expression(NEM~pi), "svm", "neural net"),col=c("red", "blue", "darkgreen"),fill=c("red", "blue", "darkgreen"),box.col = "transparent",cex = cex.leg,ncol = 1,border="transparent")
plot.new()
legend("topleft",legend=c("random forest", "missForest", "knn"),col=c("brown", "orange", "turquoise"),fill=c("brown", "orange", "turquoise"),box.col = "transparent",cex = cex.leg,ncol = 1,border="transparent")
plot.new()
legend("topleft",legend=c("random"),col=c("grey"),fill=c("grey"),box.col = "transparent",cex = cex.leg,ncol = 1,border="transparent")
dev.off()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.