inst/extdata/IntervalFile.R

suppressPackageStartupMessages(library(optparse))
suppressPackageStartupMessages(library(futile.logger))

### Parsing command line ------------------------------------------------------

option_list <- list(
    make_option(c("--fasta"), action = "store", type = "character", 
        default = NULL, help = "Reference Fasta file"),
    make_option(c("--infile"), action = "store", type = "character", 
        default = NULL,
        help = "Infile specifying target (baits) intervals. Needs to be parsable by rtracklayer."),
    make_option(c("--offtarget"), action = "store_true",
        default = formals(PureCN::preprocessIntervals)$off.target, 
        help = "Include off-target regions [default %default]"),
    make_option(c("--targetwidth"), action = "store", type = "integer",
        default = formals(PureCN::preprocessIntervals)$average.target.width, 
        help = "Split large targets to approximately that size [default %default]"),
    make_option(c("--mintargetwidth"), action = "store", type = "integer",
        default = formals(PureCN::preprocessIntervals)$min.target.width, 
        help = "Either resize or drop targets smaller than specified [default %default]"),
    make_option(c("--smalltargets"), action = "store", type = "character",
        default = formals(PureCN::preprocessIntervals)$small.targets[[2]], 
        help = "Either 'resize' or 'drop' small targets [default %default]"),
    make_option(c("--offtargetwidth"), action = "store", type = "integer",
        default = formals(PureCN::preprocessIntervals)$average.off.target.width, 
        help = "Bin off-target regions to approximately that size [default %default]"),
    make_option(c("--offtargetseqlevels"), action = "store", type = "character",
        default = formals(PureCN::preprocessIntervals)$off.target.seqlevels[[2]], 
        help = "Controls how to deal with chromosomes/contigs not found in infile. One of targeted, all [default %default]"),
    make_option(c("--mappability"), action = "store", type = "character", 
        help = "File parsable by rtracklayer specifying mappability scores of genomic regions."),
    make_option(c("--minmappability"), action = "store", type = "character", 
        default = paste(eval(formals(PureCN::preprocessIntervals)$min.mappability), collapse=","),
        help = "Minimum mappability for on-target, off-target and chrY regions [default %default]"),
    make_option(c("--reptiming"), action = "store", type = "character", default = NULL,
        help = "File parsable by rtracklayer specifying replication timing scores of genomic regions."),
    make_option(c("--reptimingwidth"), action = "store", type = "integer", default = 100000,
        help = "Average the replication timing data into bins of the specified size [default %default]"),
    make_option(c("--genome"), action = "store", type = "character", 
        default = NULL,
        help = "Genome version. If one of hg18, hg19, hg38, mm9, mm10, rn4, rn5, rn6, canFam3 will annotate intervals with gene symbols"),
    make_option(c("--outfile"), action = "store", type = "character", 
        default = NULL,
        help = "Outfile of annotated targets optimized for copy number calling."),
    make_option(c("--export"), action = "store", type = "character", default = NULL,
        help = "Export optimized intervals using rtracklayer. The file extension specifies the format."),
    make_option(c("--exclude"), action = "store", type = "character", 
        help = "File parsable by rtracklayer specifying baits that should be excluded from --infile."),
    make_option(c("-v", "--version"), action = "store_true", default = FALSE,
        help = "Print PureCN version"),
    make_option(c("-f", "--force"), action = "store_true", default = FALSE,
        help = "Overwrite existing files")
)

opt <- parse_args(OptionParser(option_list = option_list))

if (opt$version) {
    message(as.character(packageVersion("PureCN")))
    q(status = 1)
}

outfile <- opt$outfile

if (is.null(opt$infile)) stop("Need --infile.")
if (is.null(opt$fasta)) stop("Need --fasta.")
if (is.null(opt$outfile)) stop("Need --outfile.")

if (!opt$force && file.exists(outfile)) {
    stop(outfile, " exists. Use --force to overwrite.")
}

.checkOutputDir <-function(filename) {
    if (!is.null(filename)) {
        dname <- dirname(filename)
        if (!file.exists(dname)) {
            flog.fatal("Output directory %s does not exist.", dname)
            q(status = 1)
        }
    }
}

.checkOutputDir(opt$outfile)
.checkOutputDir(opt$export)
.checkOutputDir(opt$exclude)

in.file <- normalizePath(opt$infile, mustWork = TRUE)
reference.file <- normalizePath(opt$fasta, mustWork = TRUE)

suppressPackageStartupMessages(library(rtracklayer))

intervals <- try(import(in.file), silent = TRUE)
if (is(intervals, "try-error")) { 
    flog.warn("Could not parse --infile with rtracklayer:\n\n%s\nTrying GATK3 parser that will probably fail...", intervals)
    intervals <- in.file
} else {
    if (sum(c("MT", "chrM", "chMT") %in% seqlevels(intervals))) {
        flog.warn("--infile contains mitochondrion sequence. It is highly recommended to exclude those baits.")
    }
}    
mappability <- opt$mappability

if (!is.null(mappability)) {
    mappability <- normalizePath(mappability, mustWork = TRUE)
    flog.info("Loading %s...", mappability)
    mappability <- import(mappability)
}
exclude <- opt$exclude
if (!is.null(exclude)) {
    exclude <- normalizePath(exclude, mustWork = TRUE)
    flog.info("Loading %s...", exclude)
    exclude <- import(exclude)
}

flog.info("Loading PureCN %s...", Biobase::package.version("PureCN"))
suppressPackageStartupMessages(library(PureCN))
flog.info("Processing %s...", in.file)

seqinfoRef <- seqinfo(scanFaIndex(reference.file))

reptiming <- opt[["reptiming"]]
if (!is.null(reptiming)) {
    reptiming <- normalizePath(reptiming, mustWork = TRUE)
    flog.info("Loading %s...", reptiming)
    reptiming <- import(reptiming)
}

if (!opt$offtarget) {
    flog.info("Will not add off-target regions. This is only recommended for%s",
     " Amplicon data. Add --offtarget to include them.")
}

min.mappability <- as.numeric(strsplit(opt$minmappability, ",")[[1]])

outGC <- preprocessIntervals(intervals, reference.file,
    output.file = outfile, off.target = opt$offtarget,
    mappability = mappability, min.mappability = min.mappability,
    min.target.width = opt$mintargetwidth,
    small.targets = opt$smalltargets,
    average.off.target.width = opt$offtargetwidth,
    reptiming = reptiming, average.reptiming.width = opt$reptimingwidth,
    off.target.seqlevels = opt$offtargetseqlevels,
    exclude = exclude, average.target.width = opt$targetwidth)

knownGenome <- list(
    hg18 = "TxDb.Hsapiens.UCSC.hg18.knownGene",
    hg19 = "TxDb.Hsapiens.UCSC.hg19.knownGene",
    hg38 = "TxDb.Hsapiens.UCSC.hg38.knownGene",
    mm9 = "TxDb.Mmusculus.UCSC.mm9.knownGene",
    mm10 = "TxDb.Mmusculus.UCSC.mm10.knownGene",
    rn4 = "TxDb.Rnorvegicus.UCSC.rn4.ensGene",
    rn5 = "TxDb.Rnorvegicus.UCSC.rn5.ensGene",
    rn6 = "TxDb.Rnorvegicus.UCSC.rn6.ensGene",
    canFam3 = "TxDb.Cfamiliaris.UCSC.canFam3.refGene"
)

knownOrg <- list(
    hg18 = "org.Hs.eg.db",
    hg19 = "org.Hs.eg.db",
    hg38 = "org.Hs.eg.db", 
    mm9 = "org.Mm.eg.db",
    mm10 = "org.Mm.eg.db",
    rn4 = "org.Rn.eg.db",
    rn5 = "org.Rn.eg.db",
    rn6 = "org.Rn.eg.db",
    canFam3 = "org.Cf.eg.db"
)

.writeGc <- function(interval.gr, output.file) {
    tmp <- data.frame(
        Target = as.character(interval.gr),
        gc_bias = interval.gr$gc_bias,
        mappability = interval.gr$mappability,
        reptiming = interval.gr$reptiming,
        Gene = interval.gr$Gene,
        on_target = interval.gr$on.target
    )    
    write.table(tmp, file = output.file, row.names = FALSE, quote = FALSE, 
                sep = "\t")
}

if (!is.null(opt$genome) ) {
    if (is.null(knownGenome[[opt$genome]])) {
        flog.warn("%s genome not known. %s Known genomes: %s", opt$genome, 
        "Will not annotate targets with gene symbols.", 
        paste(names(knownGenome), collapse=", "))
    } else if (!require(knownGenome[[opt$genome]], character.only = TRUE)) {
        flog.warn("Install %s to get gene symbol annotation.", 
            knownGenome[[opt$genome]])
    } else if (!require(knownOrg[[opt$genome]], character.only = TRUE)) {
        flog.warn("Install %s to get gene symbol annotation.",
            knownOrg[[opt$genome]])
    } else {
        outGC <- suppressMessages(annotateTargets(outGC,
            get(knownGenome[[opt$genome]]), get(knownOrg[[opt$genome]])))
        .writeGc(outGC, outfile)
    }
} else {
    flog.warn("Specify --genome to get gene symbol annotation.")
}

if (!is.null(opt$export)) {
    export(outGC, opt$export)
}

Try the PureCN package in your browser

Any scripts or data that you put into this service are public.

PureCN documentation built on Nov. 8, 2020, 5:37 p.m.