Nothing
# Simple helper script that benchmarks predictSomatic using a tumor/normal VCF
suppressPackageStartupMessages(library(optparse))
suppressPackageStartupMessages(library(futile.logger))
suppressPackageStartupMessages(library(pROC))
### Parsing command line ------------------------------------------------------
option_list <- list(
make_option(c("--rds"), action="store", type="character", default=NULL,
help="Input: PureCN output RDS file"),
make_option(c("--vcf"), action = "store", type = "character", default = NULL,
help = "Input: VCF file containing SOMATIC calls"),
make_option(c("--callable"), action="store", type="character", default=NULL,
help="TMB: File parsable by rtracklayer specifying all callable regions, e.g. generated by GATK CallableLoci"),
make_option(c("--keepflagged"), action = "store_true", default=FALSE,
help = "Keep variants flagged by predictSomatic [default %default]"),
make_option(c("--keepdb"), action = "store_true", default=FALSE,
help = "Keep variants in dbSNP [default %default]"),
make_option(c("--callcutoff"), action="store", type="double", default=0.8,
help="Posterior probability cutoff to include variants [default %default]"),
make_option(c("--out"), action="store", type="character", default=NULL,
help="File name prefix to which results should be written"),
make_option(c("-v", "--version"), action="store_true", default=FALSE,
help="Print PureCN version"),
make_option(c("-f", "--force"), action="store_true", default=FALSE,
help="Overwrite existing files")
)
opt <- parse_args(OptionParser(option_list=option_list))
if (opt$version) {
message(as.character(packageVersion("PureCN")))
q(status=1)
}
# Parse input rds
infileRds <- opt$rds
if (is.null(infileRds)) stop("Need --rds")
infileRds <- normalizePath(infileRds, mustWork=TRUE)
# Parse both BED files restricting covered region
callableFile <- opt$callable
callable <- NULL
if (!is.null(callableFile)) {
suppressPackageStartupMessages(library(rtracklayer))
callableFile <- normalizePath(callableFile, mustWork=TRUE)
flog.info("Reading %s...", callableFile)
callable <- GRanges(import(callableFile))
}
### Run PureCN ----------------------------------------------------------------
flog.info("Loading PureCN...")
suppressPackageStartupMessages(library(PureCN))
res <- readCurationFile(infileRds)
sampleid <- res$input$sampleid
.getOutPrefix <- function(opt, infile, sampleid) {
out <- opt[["out"]]
if (is.null(out)) {
if (!is.null(infile) && file.exists(infile)) {
outdir <- dirname(infile)
prefix <- sampleid
} else {
stop("Need --out")
}
} else {
outdir <- dirname(out)
prefix <- basename(out)
}
outdir <- normalizePath(outdir, mustWork=TRUE)
outPrefix <- file.path(outdir, prefix)
}
outPrefix <- .getOutPrefix(opt, infileRds, sampleid)
outfile <- paste0(outPrefix, "_tumor_only_benchmark.csv")
if (!opt$force && file.exists(outfile)) {
stop(outfile, " exists. Use --force to overwrite.")
}
flog.info("Reading %s...", basename(opt$vcf))
vcf <- PureCN:::.readAndCheckVcf(opt$vcf, genome = genome(res$input$vcf)[1])
flog.info("Classifying germline vs. somatic...")
p <- GRanges(predictSomatic(res))
if (!is.null(callable)) {
p <- p[overlapsAny(p, callable)]
vcf <- vcf[overlapsAny(vcf, callable)]
}
p$SOMATIC <- NA
ov <- findOverlaps(p, vcf)
p$SOMATIC[queryHits(ov)] <- info(vcf)$SOMATIC[subjectHits(ov)]
if (!opt$keepflagged) {
p <- p[!p$FLAGGED]
}
if (!opt$keepdb) {
p <- p[p$prior.somatic > 0.1]
}
idxCalled <- p$POSTERIOR.SOMATIC <= 1-opt$callcutoff | p$POSTERIOR.SOMATIC >= opt$callcutoff
callRate <- sum(idxCalled)/length(idxCalled)
callsSomatic <- table(p$POSTERIOR.SOMATIC[idxCalled] >= opt$callcutoff & p$SOMATIC[idxCalled] | !p$SOMATIC[idxCalled])
callsGermline <- table(p$POSTERIOR.SOMATIC[idxCalled] <= 1 - opt$callcutoff & !p$SOMATIC[idxCalled] | p$SOMATIC[idxCalled])
callsSomatic <- callsSomatic[["TRUE"]]/sum(callsSomatic)
callsGermline <- callsGermline[["TRUE"]]/sum(callsGermline)
x <- data.frame(
Sampleid = sampleid,
Purity = res$results[[1]]$purity,
Ploidy = res$results[[1]]$ploidy,
Num.Variants = length(p),
Median.Coverage = median(p$depth, na.rm = TRUE),
AUC.AR = auc(p$SOMATIC, p$AR),
AUC.POSTERIOR.SOMATIC = auc(p$SOMATIC, p$POSTERIOR.SOMATIC),
Callrate = callRate,
Accuracy.Somatic = callsSomatic,
Accuracy.Germline = callsGermline
)
write.csv(x, file = outfile, row.names = FALSE, quote = FALSE)
x <- p[idxCalled & p$SOMATIC != p$ML.SOMATIC]
loh <- GRanges(callLOH(res))
mcols(x) <- cbind(mcols(x), mcols(loh[findOverlaps(x, loh, select="first")]))
x$C <- NULL
x$M <- NULL
x$M.flagged <- NULL
outfile2 <- paste0(outPrefix, "_tumor_only_benchmark_wrong_calls.csv")
write.csv(as.data.frame(x), file = outfile2, row.names = FALSE, quote = FALSE)
x <- p[idxCalled & p$SOMATIC == p$ML.SOMATIC]
loh <- GRanges(callLOH(res))
mcols(x) <- cbind(mcols(x), mcols(loh[findOverlaps(x, loh, select="first")]))
x$C <- NULL
x$M <- NULL
x$M.flagged <- NULL
outfile2 <- paste0(outPrefix, "_tumor_only_benchmark_correct_calls.csv")
write.csv(as.data.frame(x), file = outfile2, row.names = FALSE, quote = FALSE)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.