inst/extdata/PureCN.R

suppressPackageStartupMessages(library(optparse))
suppressPackageStartupMessages(library(futile.logger))

### Parsing command line ------------------------------------------------------

option_list <- list(
    make_option(c("-i", "--sampleid"), action = "store", type = "character",
                default = NULL, help = "Sample id"),
    make_option(c("--normal"), action = "store", type = "character", default = NULL,
        help = "Input: normal coverage, GC-normalized. Optional if normaldb or segfile is provided."),
    make_option(c("--tumor"), action = "store", type = "character",
                default = NULL, help = "Input: tumor coverage, GC-normalized"),
    make_option(c("--vcf"), action = "store", type = "character", default = NULL,
                help = "Input: VCF file"),
    make_option(c("--rds"), action = "store", type = "character", default = NULL,
        help = "Input: PureCN output RDS file, used to regenerate plots and files after manual curation"),
    make_option(c("--mappingbiasfile"), action = "store", type = "character", default = NULL,
        help = "Input: Mapping bias RDS file, generated by NormalDB.R (or calculateMappingBiasVcf)"),
    make_option(c("--normal_panel"), action = "store", type = "character", default = NULL,
        help = "Input: Deprecated, use --mappingbiasfile instead"),
    make_option(c("--normaldb"), action = "store", type = "character", default = NULL,
        help = "Input: NormalDB.rds file. Generated by NormalDB.R."),
    make_option(c("--segfile"), action = "store", type = "character",
                default = NULL, help = "Input: Segmentation file"),
    make_option(c("--logratiofile"), action = "store", type = "character",
                default = NULL, help = "Input: Log2 copy number ratio file"),
    make_option(c("--additionaltumors"), action = "store", type = "character",
                default = NULL, help = "Input: tumor coverages from additional biopsies from the SAME patient, GC-normalized"),
    make_option(c("--sex"), action = "store", type = "character",
        default = formals(PureCN::runAbsoluteCN)$sex[[2]],
        help = "Input: Sex of sample. ? (detect), diplod (non-diploid chromosomes removed), F or M [default %default]"),
    make_option(c("--genome"), action = "store", type = "character",
                default = NULL, help = "Assay: Genome version. Use hg18, hg19, or hg38 for human [default %default]"),
    make_option(c("--intervals"), action = "store", type = "character", default = NULL,
        help = "Assay: Interval file as generated by IntervalFile.R"),
    make_option(c("--statsfile"), action = "store", type = "character", default = NULL,
        help = "VCF Filter: MuTect stats file, used to filter artifacts"),
    make_option(c("--minaf"), action = "store", type = "double", default = 0.03,
        help = "VCF Filter: minimum allelic fraction [default %default]"),
    make_option(c("--snpblacklist"), action = "store", type = "character", default = NULL,
        help = "VCF Filter: File parsable by rtracklayer that defines blacklisted regions"),
    make_option(c("--error"), action = "store", type = "double",
        default = formals(PureCN::runAbsoluteCN)$error,
        help = "VCF Filter: Estimated sequencing error rate for artifact filtering [default %default]"),
    make_option(c("--dbinfoflag"), action = "store", type = "character",
        default = formals(PureCN::runAbsoluteCN)$DB.info.flag,
        help = "VCF Filter: VCF INFO flag indicating presence in common germline databases [default %default]"),
    make_option(c("--popafinfofield"), action = "store", type = "character",
        default = formals(PureCN::runAbsoluteCN)$POPAF.info.field,
        help = "VCF Filter: VCF INFO field providing population allele frequency [default %default]"),
    make_option(c("--mincosmiccnt"), action = "store", type = "integer",
        default = formals(PureCN::setPriorVcf)$min.cosmic.cnt,
        help = "VCF Filter: Min number of COSMIC hits [default %default]"),
    make_option(c("--padding"), action = "store", type = "integer",
        default = formals(PureCN::filterVcfBasic)$interval.padding,
        help = "VCF Filter: Keep variants in the flanking region of specified size [default %default]"),
    make_option(c("--funsegmentation"), action = "store", type = "character", default = "CBS",
        help = "Segmentation: Algorithm. CBS, PSCBS, Hclust, or none [default %default]"),
    make_option(c("--alpha"), action = "store", type = "double",
        default = formals(PureCN::segmentationCBS)$alpha,
        help = "Segmentation: significance of breakpoints [default %default]"),
    make_option(c("--undosd"), action = "store", type = "double",
        default = formals(PureCN::segmentationCBS)$undo.SD,
        help = "Segmentation: DNAcopy undo.SD argument. If NULL, tries to find a sensible default [default %default]"),
    make_option(c("--maxsegments"), action = "store", type = "double",
        default = formals(PureCN::runAbsoluteCN)$max.segments,
        help = "Segmentation: Flag noisy samples with many segments [default %default]"),
    make_option(c("--intervalweightfile"), action = "store", type = "character", default = NULL,
        help = "Segmentation: Weights of intervals. Deprecated. Recreate --normaldb with NormalDB.R from PureCN 1.16.0 or higher."),
    make_option(c("--minpurity"), action = "store", type = "double",
        default = formals(PureCN::runAbsoluteCN)$test.purity[[2]],
        help = "Minimum considered purity [default %default]"),
    make_option(c("--maxpurity"), action = "store", type = "double",
        default = formals(PureCN::runAbsoluteCN)$test.purity[[3]],
        help = "Maximum considered purity [default %default]"),
    make_option(c("--minploidy"), action = "store", type = "double", 
        default = formals(PureCN::runAbsoluteCN)$min.ploidy,
        help = "Minimum considered ploidy [default %default]"),
    make_option(c("--maxploidy"), action = "store", type = "double", 
        default = formals(PureCN::runAbsoluteCN)$max.ploidy,
        help = "Maximum considered ploidy [default %default]"),
    make_option(c("--maxcopynumber"), action = "store", type = "double", 
        default =  max(eval(formals(PureCN::runAbsoluteCN)$test.num.copy)),
        help = "Maximum allele-specific integer copy number [default %default]"),
    make_option(c("--postoptimize"), action = "store_true", default = FALSE,
        help = "Post-optimization [default %default]"),
    make_option(c("--bootstrapn"), action = "store", type = "integer", default = 0,
        help = "Number of bootstrap replicates [default %default]"),
    make_option(c("--speedupheuristics"), action = "store", type = "integer", 
        default =  max(eval(formals(PureCN::runAbsoluteCN)$speedup.heuristics)),
        help = "Tries to avoid spending computation time on unlikely local optima [default %default]"),
    make_option(c("--modelhomozygous"), action = "store_true", default = FALSE,
        help = "Model homozygous variants in very pure samples [default %default]"),
    make_option(c("--model"), action = "store", type = "character",
        default = formals(PureCN::runAbsoluteCN)$model[[2]],
        help = "Model used to fit variants. Either beta or betabin [default %default]."),
    make_option(c("--logratiocalibration"), action = "store", type = "double",
        default = formals(PureCN::runAbsoluteCN)$log.ratio.calibration,
        help = "Parameter defining the extend to which log-ratios might be miscalibrated [default %default]"),
    make_option(c("--maxnonclonal"), action = "store", type = "double",
        default = formals(PureCN::runAbsoluteCN)$max.non.clonal,
        help = "Maximum genomic fraction assigned to a subclonal copy number state [default %default]"),
    make_option(c("--maxhomozygousloss"), action = "store", type = "character",
        default = paste(formals(PureCN::runAbsoluteCN)$max.homozygous.loss[2:3], collapse=","),
        help = "Maximum genomic fraction assigned to a complete loss and maximum size of a loss in bp [default %default]"),
    make_option(c("--outvcf"), action = "store_true", default = FALSE,
        help = "Output: Annotate input VCF with posterior probabilities. Otherwise only produce CSV file."),
    make_option(c("--out"), action = "store", type = "character", default = NULL,
        help = paste("Output: File name prefix to which results should be written.",
        "If out is a directory, will use out/sampleid.")),
    make_option(c("--seed"), action = "store", type = "integer", default = NULL,
        help = "Seed for random number generator [default %default]"),
    make_option(c("--parallel"), action = "store_true", default = FALSE,
        help = "Use BiocParallel to fit local optima in parallel."),
    make_option(c("--cores"), action = "store", type = "integer", default = NULL,
        help = "Use BiocParallel MulticoreParam backend with the specified number of worker cores (--parallel uses the default BiocParallel backend instead)."),
    make_option(c("-v", "--version"), action = "store_true", default = FALSE,
        help = "Print PureCN version"),
    make_option(c("-f", "--force"), action = "store_true", default = FALSE,
        help = "Overwrite existing files")
)

opt <- parse_args(OptionParser(option_list = option_list))

if (opt$version) {
    message(as.character(packageVersion("PureCN")))
    q(status = 1)
}

if (!is.null(opt$seed)) {
    set.seed(opt$seed)
}

# TODO: remove in 1.20, defunct in 1.18
if (!is.null(opt$normal_panel) && is.null(opt$mappingbiasfile)) {
    flog.warn("--normal_panel is deprecated, use --mappingbiasfile instead.")
    opt$mappingbiasfile <- opt$normal_panel
}

tumor.coverage.file <- opt$tumor
normal.coverage.file <- opt[["normal"]]
snp.blacklist <- opt$snpblacklist

if (!is.null(snp.blacklist)) {
    snp.blacklist <- strsplit(snp.blacklist, ",")[[1]]
}

seg.file <- opt$segfile
log.ratio <- opt$logratiofile
sampleid <- opt$sampleid
out <- opt[["out"]]
file.rds <- opt$rds
normalDB <- NULL
BPPARAM <- NULL

.getFilePrefix <- function(out, sampleid) {
    isDir <- file.info(out)$isdir
    if (!is.na(isDir) && isDir) return(file.path(out, sampleid))
    out
}

if (!is.null(file.rds) && file.exists(file.rds)) {
    if (is.null(out)) out <- sub(".rds$", "", file.rds)
} else {
    if (is.null(sampleid)) stop("Need --sampleid.")
    if (is.null(opt$genome)) stop("Need --genome.")
    out <- .getFilePrefix(out, sampleid)
    file.rds <- paste0(out, ".rds")
    if (is.null(seg.file)) {
        if (is.null(tumor.coverage.file)) stop("Need either --tumor or --segfile.")
        tumor.coverage.file <- normalizePath(tumor.coverage.file,
            mustWork = TRUE)
    }
}
    
normalizePath(dirname(out), mustWork = TRUE)
if (file.access(dirname(out), 2) < 0) stop("Permission denied to write in --out.")

flog.info("Loading PureCN %s...", Biobase::package.version("PureCN"))
suppressPackageStartupMessages(library(PureCN))
library(futile.logger)

debug <- FALSE
if (Sys.getenv("PURECN_DEBUG") != "") {
    flog.threshold("DEBUG")
    debug <- TRUE
}

.checkFileList <- function(file) {
    files <- read.delim(file, as.is = TRUE, header = FALSE)[,1]
    numExists <- sum(file.exists(files), na.rm = TRUE)
    if (numExists < length(files)) { 
        stop("File not exists in file ", file)
    }
    files
}


### Run PureCN ----------------------------------------------------------------

if (file.exists(file.rds) && !opt$force) {
    flog.info("%s already exists. Skipping... (--force will overwrite)",
        file.rds)
    ret <- readCurationFile(file.rds)
    if (is.null(sampleid)) sampleid <- ret$input$sampleid
} else {
    if (!is.null(opt$normaldb)) {
        normalDB <- readRDS(opt$normaldb)
        if (!is.null(normal.coverage.file)) {
            flog.warn("Both --normal and --normalDB provided. normalDB will NOT be used for coverage denoising. You probably do not want this.")
        }    
    }

    .getNormalCoverage <- function(normal.coverage.file) {
        if (!is.null(normalDB)) {
            if (is.null(normal.coverage.file)) {
                normal.coverage.file <- calculateTangentNormal(tumor.coverage.file,
                    normalDB)
            }
        } else if (is.null(normal.coverage.file) && is.null(seg.file) &&
                   is.null(log.ratio)) {
            stop("Need either normalDB or normal.coverage.file")
        }
        normal.coverage.file
    }
    normal.coverage.file <- .getNormalCoverage(normal.coverage.file)
    file.log <- paste0(out, ".log")

    pdf(paste0(out, "_segmentation.pdf"), width = 10, height = 11)
    if (!is.null(opt$additionaltumors)) {
        if (!is.null(seg.file)) {
            stop("--additionaltumors overwrites --segfile")
        }
        seg.file <- paste0(out, "_multisample.seg")
        if (grepl(".list$", opt$additionaltumors)) {
            additional.tumors <- .checkFileList(opt$additionaltumors)
        } else {
            additional.tumors <- opt$additionaltumors 
        }
        multi.seg <- processMultipleSamples(
            c(list(tumor.coverage.file), as.list(additional.tumors)),
            sampleids = c(sampleid, paste(sampleid,
                seq_along(additional.tumors) + 1, sep = "_")),
            normalDB = normalDB, genome = opt$genome, verbose = debug)
        write.table(multi.seg, seg.file, row.names = FALSE, sep = "\t")
    }    
    af.range <- c(opt$minaf, 1 - opt$minaf)
    test.purity <- seq(opt$minpurity, opt$maxpurity, by = 0.01)

    uses.recommended.fun <- FALSE
    recommended.fun <- if (is.null(seg.file)) "PSCBS" else "Hclust"
    fun.segmentation <- segmentationCBS
    if (opt$funsegmentation != "CBS") {
        if (opt$funsegmentation == "PSCBS") {
            fun.segmentation <- segmentationPSCBS
            if (is.null(seg.file)) uses.recommended.fun <- TRUE
        } else if (opt$funsegmentation == "Hclust") {
            fun.segmentation <- segmentationHclust
            if (!is.null(seg.file)) uses.recommended.fun <- TRUE
        } else if (opt$funsegmentation == "none") {
            fun.segmentation <- function(seg, ...) seg
        } else {
            stop("Unknown segmentation function")
        }
    }
    if (!uses.recommended.fun) {
        flog.warn("Recommended to provide --funsegmentation %s.", recommended.fun)
    }

    mutect.ignore <- eval(formals(PureCN::filterVcfMuTect)$ignore)
    if (opt$error < formals(PureCN::runAbsoluteCN)$error && !is.null(opt$statsfile)) {
        flog.info("Low specified error, will keep fstar_tumor_lod flagged variants")
        mutect.ignore <- mutect.ignore[-match("fstar_tumor_lod", mutect.ignore)]
    }    
    if (!is.null(opt$cores) && opt$cores > 1) {
        suppressPackageStartupMessages(library(BiocParallel))
        BPPARAM <- MulticoreParam(workers = opt$cores)
        flog.info("Using BiocParallel MulticoreParam backend with %s cores.", opt$cores)
    } else if (opt$parallel) {
        suppressPackageStartupMessages(library(BiocParallel))
        BPPARAM <- bpparam()
        flog.info("Using default BiocParallel backend. You can change the default in your ~/.Rprofile file.") 
    }
    if (!is.null(log.ratio)) {
        flog.info("Reading %s...", log.ratio)
        log.ratio <- readLogRatioFile(log.ratio)
        if (!is.null(tumor.coverage.file)) {
            flog.info("Reading %s...", tumor.coverage.file)
            tumor.coverage.file <- readCoverageFile(tumor.coverage.file)
            # GATK4 log-ratio files are filtered for quality, so make sure that
            # coverage files align
            tumor.coverage.file <- subsetByOverlaps(tumor.coverage.file, log.ratio)
        }
        log.ratio <- log.ratio$log.ratio
    }    
    
    ret <- runAbsoluteCN(normal.coverage.file = normal.coverage.file,
            tumor.coverage.file = tumor.coverage.file, vcf.file = opt$vcf,
            sampleid = sampleid, plot.cnv = TRUE,
            interval.file = opt$intervals,
            genome = opt$genome, seg.file = seg.file,
            log.ratio = log.ratio,
            test.purity = test.purity,
            test.num.copy = seq(0, opt$maxcopynumber),
            sex = opt$sex,
            args.filterVcf = list(snp.blacklist = snp.blacklist,
                af.range = af.range, stats.file = opt$statsfile,
                ignore = mutect.ignore,
                interval.padding = opt$padding),
            fun.segmentation = fun.segmentation,
            args.segmentation = list(
                alpha = opt$alpha, undo.SD = opt$undosd),
            args.setMappingBiasVcf =
                list(mapping.bias.file = opt$mappingbiasfile),
            args.setPriorVcf = list(min.cosmic.cnt = opt$mincosmiccnt),
            normalDB = normalDB, model.homozygous = opt$modelhomozygous,
            min.ploidy = opt$minploidy, max.ploidy = opt$maxploidy,
            model = opt[["model"]], log.file = file.log,
            max.segments = opt$maxsegments,
            error = opt$error, DB.info.flag = opt$dbinfoflag, 
            POPAF.info.field = opt$popafinfofield,
            log.ratio.calibration = opt$logratiocalibration,
            max.non.clonal = opt$maxnonclonal,
            max.homozygous.loss = as.numeric(strsplit(opt$maxhomozygousloss,",")[[1]]),
            post.optimize = opt$postoptimize,
            speedup.heuristics = opt$speedupheuristics,
            vcf.field.prefix = "PureCN.",
            BPPARAM = BPPARAM)
    invisible(dev.off())
    if (opt$bootstrapn > 0) {
        ret <- bootstrapResults(ret, n = opt$bootstrapn) 
    }
    saveRDS(ret, file = file.rds)
}

### Create output files -------------------------------------------------------

curationFile <- createCurationFile(file.rds)
if (debug) {
    curationFile$log.ratio.offset <- mean(ret$results[[1]]$log.ratio.offset)
    curationFile$log.ratio.sdev <- ret$input$log.ratio.sdev
    curationFile$num.segments <- nrow(ret$results[[1]]$seg)
    write.csv(curationFile, file = paste0(out, "_debug.csv"),
        row.names = FALSE)
}

flog.info("Generating output files...")
file.pdf <- paste0(out, ".pdf")
pdf(file.pdf, width = 10, height = 11)
plotAbs(ret, type = "all")
invisible(dev.off())

file.pdf <- paste0(out, "_local_optima.pdf")
pdf(file.pdf, width = 5, height = 5)
plotAbs(ret, 1, type = "overview")
invisible(dev.off())

file.seg <- paste0(out, "_dnacopy.seg")
seg <- ret$results[[1]]$seg
seg <- seg[, c(1:6, match("C", colnames(seg)))]
write.table(seg, file = file.seg, sep = "\t", quote = FALSE,
    row.names = FALSE)

if (is(ret$results[[1]]$gene.calls, "data.frame")) {
    file.genes <- paste0(out, "_genes.csv")
    allAlterations <- callAlterations(ret, all.genes = TRUE)

    write.csv(cbind(Sampleid = sampleid, gene.symbol = rownames(allAlterations),
        allAlterations), row.names = FALSE, file = file.genes, quote = FALSE)
    if (!is.null(opt$normaldb)) {
        if (is.null(normalDB)) normalDB <- readRDS(opt$normaldb)
        if (normalDB$version >= 8) {
            file.amps <- paste0(out, "_amplification_pvalues.csv")
            allAmplifications <- callAmplificationsInLowPurity(ret, normalDB,
                all.genes = TRUE, BPPARAM = BPPARAM)

            write.csv(cbind(Sampleid = sampleid, gene.symbol = rownames(allAmplifications),
                allAmplifications), row.names = FALSE, file = file.amps, quote = FALSE)
        }
    }    
} else {
    flog.warn("--intervals does not contain gene symbols. Not generating gene-level calls.")
}
    
if (!is.null(ret$input$vcf)) {
    if (opt$outvcf) {
        file.vcf <- paste0(out, ".vcf")
        vcfanno <- predictSomatic(ret, return.vcf = TRUE)
        writeVcf(vcfanno, file = file.vcf)
        bgzip(file.vcf, paste0(file.vcf, ".gz"), overwrite = TRUE)
        indexTabix(paste0(file.vcf, ".gz"), format = "vcf")
    }
    file.csv <- paste0(out, "_variants.csv")
    write.csv(cbind(Sampleid = sampleid, predictSomatic(ret)), file = file.csv,
        row.names = FALSE, quote = FALSE)

    file.loh <- paste0(out, "_loh.csv")
    write.csv(cbind(Sampleid = sampleid, PureCN::callLOH(ret)), file = file.loh,
        row.names = FALSE, quote = FALSE)

    file.pdf <- paste0(out, "_chromosomes.pdf")
    pdf(file.pdf, width = 9, height = 10)
    vcf <- ret$input$vcf[ret$results[[1]]$SNV.posterior$vcf.ids]
    chromosomes <- seqlevelsInUse(vcf)
    chromosomes <- chromosomes[orderSeqlevels(chromosomes)]
    for (chrom in chromosomes) {
        plotAbs(ret, 1, type = "BAF", chr = chrom)
    }
    invisible(dev.off())
}

Try the PureCN package in your browser

Any scripts or data that you put into this service are public.

PureCN documentation built on Nov. 8, 2020, 5:37 p.m.