PureCN: Copy number calling and SNV classification using targeted short read sequencing

# Make CMD check happy
globalVariables(names=c("Gene", "LR", "chrom", "seg.id", "seg.length",
"seg.mean", "C.flagged", "weights"))

# calculates the log-likelihood of a segment, given log-ratios, the
# log ratio standard deviation, purity, copy number and ploidy.
# for sub-clonal alterations, it uses a uniform distribution, otherwise
# multiple gaussians for all tested copy numbers. 
.calcLlikSegment <- function(subclonal, lr, sd.seg, p, Ci, total.ploidy, 
max.exon.ratio) {
    if (subclonal) {
        return(.calcLlikSegmentSubClonal(lr, max.exon.ratio))
    }

    .calcLlikSegmentClonal(lr, sd.seg, p, Ci, total.ploidy)
}
.calcLlikSegmentClonal <- function(lr, sd.seg, p, Ci, total.ploidy) {
    sum(dnorm(lr, mean = log2((p * Ci + (1 - p) * 2)/total.ploidy), 
        sd = sd.seg, log = TRUE))
}
.calcLlikSegmentSubClonal <- function(lr, max.exon.ratio) {
    sum(dunif(
        x =  vapply(2^lr, function(y) min(y, max.exon.ratio), double(1)), 
        min = 0, max = max.exon.ratio, log = TRUE))
}
# Previously calculated likelihood scores do not take the segmentation into
# account. This will find the most likely segment minor copy number
.calcMsSegmentC <- function(yy, test.num.copy, Ci, prior.K, mapping.bias.ok, 
    seg.id, min.variants.segment) {
    prior.M <- list(0.2,0.15,c(0.1,0.25),c(0.1,0.3),c(0.1,0.2,0.55))
    prior.M <- c(list(1), lapply(prior.M, function(x) c(x, 1-sum(x))))
    max.M <- floor(Ci / 2)
    idx.germline <- test.num.copy + length(test.num.copy) + 1
    idx.somatic <- test.num.copy + 1
    variant.ok <- mapping.bias.ok & is.finite(apply(yy, 1, max))
    yys <- lapply(0:max.M, function(Mi) {
        for (i in test.num.copy) {
            n.cases.germ <- ifelse(Mi==Ci-Mi,1,2)
            tmp <- unique(c(0, 1,Mi, Ci-Mi))
            n.cases.somatic <- length(tmp)
            Cx <- max(i,Ci)

            if (i!=Mi && i!=Ci - Mi) {
                yy[,idx.germline[i+1]] <- yy[,idx.germline[i+1]] + log(1-prior.K) - log(Cx + 1 - n.cases.germ)

                # allow somatic mutations always have M=1
                if (i <= 1) {
                    yy[,idx.somatic[i+1]] <- yy[,idx.somatic[i+1]] + log(prior.K) - log(n.cases.somatic)
                } else {
                    yy[,idx.somatic[i+1]] <- yy[,idx.somatic[i+1]] +
                        log(1-prior.K) - log(Cx + 1 - n.cases.somatic)
                } 
            } else {
                yy[, idx.germline[i + 1]] <- yy[, idx.germline[i + 1]] + log(prior.K) - log(n.cases.germ)
                yy[, idx.somatic[i + 1]] <- yy[, idx.somatic[i + 1]] + log(prior.K) - log(n.cases.somatic)
            }
        }
        yy
    })
    # if not enough variants in segment, flag
    flag <- sum(variant.ok) < min.variants.segment
    if (!sum(variant.ok)) {
        # still use them to avoid erroring out, but we will ignore the output because
        # of flagging
        variant.ok <- rep(TRUE, length(variant.ok))
    }

    likelihoodScores <- vapply(yys, function(x) {
        sum(apply(x[variant.ok, , drop = FALSE], 1, max))
    }, double(1))

    best <- which.max(likelihoodScores)
    # this should not happen...
    if (!length(best)) {
        flag <- TRUE
        best <- 1
    }
    # transform and scale
    likelihoodScores <- exp(likelihoodScores - likelihoodScores[best])
    posterior <- likelihoodScores[best] / sum(likelihoodScores, na.rm = TRUE)
    if (is.na(posterior) || posterior < 0.5) flag <- TRUE

    list(best = yys[[best]], M = test.num.copy[best], flag = flag, posterior = posterior)
}

# calculate likelihood scores for all possible minor copy numbers
.calcMsSegment <- function(xxi, test.num.copy, opt.C, prior.K, mapping.bias.ok, 
    seg.id, min.variants.segment) {
    lapply(seq_along(xxi), function(i).calcMsSegmentC(xxi[[i]], test.num.copy,
c(test.num.copy, round(opt.C))[i], prior.K, mapping.bias.ok, seg.id, min.variants.segment))
}    

.calcSNVLLik <- function(vcf, tumor.id.in.vcf, ov, p, test.num.copy, 
    C.likelihood, C, opt.C, median.C, snv.model, snv.lr, 
    sampleid = NULL, cont.rate = 0.01, prior.K, max.coverage.vcf, non.clonal.M,
    model.homozygous = FALSE, error = 0.001, max.mapping.bias = 0.8, max.pon,
    min.variants.segment) {
    
    .dbeta <- function(x, shape1, shape2, log, size, rho) dbeta(x=x, 
        shape1 = shape1, shape2 = shape2, log = log)
    if (snv.model == "betabin") {
        .dbeta <- function(x, shape1, shape2, log, size, rho) {
            prob <- ((shape1-1)/(shape1+shape2-2))
            dbetabinom(x = round(x * size),prob = prob, 
                size = size, rho = rho, log = TRUE)
         }
    }
    prefix <- .getPureCNPrefixVcf(vcf)
    prior.somatic <- info(vcf)[[paste0(prefix, "PR")]]
    prior.cont <- ifelse(prior.somatic < 0.1, cont.rate, 0)
    prior.somatic <- prior.somatic - (prior.cont*prior.somatic)
    priorHom <- if (model.homozygous) -log(3) else log(0)

    haploid.penalty <- 0
    
    if (median.C < 1.1) {
        haploid.penalty <- 2
    }
    
    subclonal <- apply(C.likelihood[queryHits(ov), ], 1, which.max) == ncol(C.likelihood)
    
    seg.idx <- which(seq_len(nrow(C.likelihood)) %in% queryHits(ov))

    ar_all <- unlist(geno(vcf)$FA[, tumor.id.in.vcf])
    bias_all <- info(vcf)[[paste0(prefix, "MBB")]]
    dp_all <- unlist(geno(vcf)$DP[, tumor.id.in.vcf])
    impute <- .imputeBetaBin(dp_all, bias_all, 
        mu = info(vcf)[[paste0(prefix, "MBMU")]],
        rho = info(vcf)[[paste0(prefix, "MBRHO")]])
    rho_all <- impute$rho
    ar_all <- ar_all / (impute$mu * 2)
    ar_all[ar_all > 1] <- 1
    
    if (snv.model!="betabin") dp_all[dp_all>max.coverage.vcf] <- max.coverage.vcf

    # Fit variants in all segments
    xx <- lapply(seg.idx, function(i) {
        # classify germline vs somatic
        idx <- subjectHits(ov)[queryHits(ov) == i]
        
        shape1 <- ar_all[idx] * dp_all[idx] + 1
        shape2 <- (1 - ar_all[idx]) * dp_all[idx] + 1
        mInf_all <- log(double(length(shape1)))

        list(vcf.ids=idx, likelihoods=lapply(seq(ncol(C.likelihood)), function(k) {
            Ci <- c(test.num.copy, opt.C[i])[k]
            priorM <- log(max(Ci,1) + 1 + haploid.penalty)
            
            skip <- test.num.copy > Ci | C.likelihood[i, k] <= 0

            p.ar <- lapply(c(0, 1), function(g) {
                cns <- test.num.copy
                if (!g) cns[1] <- non.clonal.M
                dbetax <- (p * cns + g * (1-p)) / (p * Ci + 2 * (1-p))
                l <- lapply(seq_along(test.num.copy), function(j) {
                    if (skip[j]) return(mInf_all)
                    .dbeta(x = dbetax[j],
                    shape1 = shape1,
                    shape2 = shape2, log = TRUE, size = dp_all[idx],
                    rho = rho_all[idx]) - priorM
                })
                do.call(cbind,l)
            })
            
            p.ar.cont.1 <- .dbeta(x = (p * Ci + 2 * (1 - p - cont.rate))/
                  (p * Ci + 2 * (1 - p)),shape1=shape1, shape2=shape2, 
                  log=TRUE, size=dp_all[idx], rho = rho_all[idx]) - priorM

            p.ar.cont.2 <- .dbeta(x = cont.rate / (p * Ci + 2 * (1 - p)), 
                  shape1 = shape1, shape2 = shape2, log = TRUE, 
                  size = dp_all[idx], rho = rho_all[idx]) - priorM

            # add prior probabilities for somatic vs germline
            p.ar[[1]] <- p.ar[[1]] + log(prior.somatic[idx])

            p.ar[[2]] <- p.ar[[2]] + log(1 - prior.somatic[idx])

            # contamination (either homozygous germline, or germline from 
            # other sample)

            p.ar[[3]] <- p.ar.cont.1 + log(prior.cont[idx])
            p.ar[[4]] <- p.ar.cont.2 + log(prior.cont[idx])

            # homozygous state
            p.ar[[5]] <- dbinom(round((1 - ar_all[idx]) * dp_all[idx]),
                size = round(dp_all[idx]), prob = error / 3, log = TRUE) +
                priorHom + log(1 - prior.somatic[idx])
            do.call(cbind, p.ar)
        }))
    })
    
    tmp <- lapply(seq_along(xx),function(i) .calcMsSegment(xx[[i]]$likelihoods, 
               test.num.copy, opt.C[seg.idx[i]], prior.K, 
               mapping.bias.ok = info(vcf[xx[[i]]$vcf.ids])[[paste0(prefix, "MBB")]] >= max.mapping.bias &
                                 info(vcf[xx[[i]]$vcf.ids])[[paste0(prefix, "MBB")]] <= (2 - max.mapping.bias),
               seg.id = seg.idx[i], min.variants.segment))

    xx <- lapply(tmp, lapply, function(x) x$best)
    
    .extractValues <- function(tmp, field) {
        segmentValue <- sapply(seq_along(tmp), function(i) 
            tmp[[i]][[min(C[seg.idx[i]], max(test.num.copy))+1]][[field]])
        segmentValue <- unlist(sapply(seq_along(seg.idx), function(i) 
            rep(segmentValue[i], sum(seg.idx[i] == queryHits(ov)))))
    }
    # Get segment M's for each SNV
    segment.M <- .extractValues(tmp, "M")
    segment.Flag <- .extractValues(tmp, "flag")
    segment.Posterior <- .extractValues(tmp, "posterior")

    likelihoods <- do.call(rbind, 
        lapply(seq_along(xx), function(i) Reduce("+", 
            lapply(seq(ncol(C.likelihood)), function(j) 
                exp(xx[[i]][[j]]) * C.likelihood[seg.idx[i], j]))))

    colnames(likelihoods) <- c(paste("SOMATIC.M", test.num.copy, sep = ""), 
        paste("GERMLINE.M", test.num.copy, sep = ""), "GERMLINE.CONTHIGH", 
        "GERMLINE.CONTLOW", "GERMLINE.HOMOZYGOUS")
    
    vcf.ids <- do.call(c, lapply(seg.idx, function(i) 
        subjectHits(ov)[queryHits(ov) == i]))
    rownames(likelihoods) <- vcf.ids
    
    # for very high-level amplifications, all posteriors can be 0, so make sure
    # we get valid values here and flag those later.
    posteriors <- likelihoods/pmax(rowSums(likelihoods),.Machine$double.xmin)
    # this just adds a lot of helpful info to the SNV posteriors
    xx <- .extractMLSNVState(posteriors)
    
    posteriors <- cbind(
        as.data.frame(rowRanges(vcf[vcf.ids]), row.names=NULL)[, 1:3],
        ID = names(vcf[vcf.ids]),
        REF = as.character(ref(vcf[vcf.ids])),
        ALT = sapply(alt(vcf[vcf.ids]), function(x) 
                     paste(as.character(x), collapse=";")),
        posteriors,
        xx, 
        ML.C = C[queryHits(ov)],
        ML.M.SEGMENT = segment.M,
        M.SEGMENT.POSTERIOR = segment.Posterior,
        M.SEGMENT.FLAGGED = segment.Flag,
        row.names = NULL
    )
    
    posteriors$ML.AR <- (p * posteriors$ML.M + 
        ifelse(posteriors$ML.SOMATIC, 0, 1) * 
        (1 - p)) / (p * posteriors$ML.C + 2 * (1 - p))
    posteriors$ML.AR[posteriors$ML.AR > 1] <- 1 

    posteriors$AR <- unlist(geno(vcf[vcf.ids])$FA[, tumor.id.in.vcf])
    posteriors$AR.ADJUSTED <- NA
    posteriors$MAPPING.BIAS <- info(vcf[vcf.ids])[[paste0(prefix, "MBB")]]
    posteriors$AR.ADJUSTED <- pmin(1, posteriors$AR / posteriors$MAPPING.BIAS)
    # Extract LOH
    posteriors$ML.LOH <- (posteriors$ML.M == posteriors$ML.C | 
        posteriors$ML.M == 0 | posteriors$ML.C == 1)
    
    posteriors$CN.SUBCLONAL <- subclonal
    depth <-as.numeric(geno(vcf[vcf.ids])$DP[, tumor.id.in.vcf])
    ar <- posteriors$AR.ADJUSTED
    ar[!posteriors$ML.SOMATIC] <- NA

    m <-  t(apply(cbind(ar, depth,  posteriors$ML.C), 1, function(x) 
        .calculate_ccf(vaf = x[1], depth = x[2], purity = p, C = x[3])))

    posteriors$CELLFRACTION <- as.numeric(m[,1])
    posteriors$CELLFRACTION.95.LOWER <- as.numeric(m[,2])
    posteriors$CELLFRACTION.95.UPPER <- as.numeric(m[,3])
    ar <- posteriors$AR.ADJUSTED
    posteriors$ALLELIC.IMBALANCE <- .calculate_allelic_imbalance(
        vaf = posteriors$AR,
        depth = depth,
        max.coverage.vcf = max.coverage.vcf, 
        bias = posteriors$MAPPING.BIAS,
        mu = info(vcf[vcf.ids])[[paste0(prefix, "MBMU")]],
        rho = info(vcf[vcf.ids])[[paste0(prefix, "MBRHO")]])

    rm.snv.posteriors <- apply(likelihoods, 1, max)
    idx.ignore <- rm.snv.posteriors == 0 | is.na(rm.snv.posteriors) |
        posteriors$MAPPING.BIAS < max.mapping.bias |
        posteriors$MAPPING.BIAS > (2 - max.mapping.bias) |
        posteriors$start != posteriors$end

    posteriors$FLAGGED <- idx.ignore

    posteriors$log.ratio <- snv.lr[vcf.ids]
    posteriors$depth <- depth
    posteriors$prior.somatic <- prior.somatic[vcf.ids]
    posteriors$prior.contamination <- prior.cont[vcf.ids]
    posteriors$on.target <- info(vcf[vcf.ids])[[paste0(prefix, "OnTarget")]]
    posteriors$seg.id <- queryHits(ov)
    
    posteriors$pon.count <- info(vcf[vcf.ids])[[paste0(prefix, "MBPON")]]
    if (!is.null(posteriors$pon.count)) {
        idx.ignore <- idx.ignore | 
            (posteriors$pon.count > max.pon & posteriors$prior.somatic > 0.1)
        posteriors$FLAGGED <- idx.ignore
    }
    # change seqnames to chr
    colnames(posteriors)[1] <- "chr"    

    ret <- list(
        llik = sum(log(rm.snv.posteriors[!idx.ignore])) - sum(idx.ignore), 
        likelihoods = likelihoods, 
        posteriors = posteriors, 
        vcf.ids = vcf.ids, 
        posterior.contamination = 0)

    ret
}

.extractMLSNVState <- function(snv.posteriors) {
    # should not happen, but to avoid failure of which.max
    snv.posteriors[is.nan(snv.posteriors)] <- 0
    l1 <- apply(snv.posteriors, 1, which.max)
    xx <- do.call(rbind, strsplit(colnames(snv.posteriors)[l1], "\\."))
    xx[, 1] <- ifelse(xx[, 1] == "GERMLINE", FALSE, TRUE)
    xx[, 2] <- gsub("^M", "", xx[, 2])
    xx <- as.data.frame(xx, stringsAsFactors = FALSE)
    xx[, 1] <- as.logical(xx[, 1])
    xx[, 2] <- suppressWarnings(as.numeric(xx[, 2]))
    colnames(xx) <- c("ML.SOMATIC", "ML.M")
    # set sub-clonal (ML.M=0) to 1
    xx$ML.M[xx$ML.SOMATIC & !xx$ML.M] <- 1
    xx$POSTERIOR.SOMATIC <- apply(snv.posteriors[, grep("SOMATIC.M", 
        colnames(snv.posteriors))], 1, sum, na.rm=TRUE)
    xx[, c(1,3,2)]
}

.checkFraction <- function(x, name) {
    if (!is.numeric(x) || length(x) !=1 || 
        x < 0 || x > 1) {
        .stopUserError(name, " not within expected range or format.")
    }
}

.checkParameters <- function(test.purity, min.ploidy, max.ploidy, 
    max.non.clonal, max.homozygous.loss, sampleid, prior.K, 
    prior.contamination, prior.purity, iterations, min.gof, model.homozygous, 
    interval.file, log.ratio.calibration, test.num.copy, max.mapping.bias) {
    if (min(test.purity) <= 0 || max(test.purity) > 0.99) 
        .stopUserError("test.purity not within expected range.")
    if (min.ploidy <= 0 || max.ploidy <= 2) 
        .stopUserError("min.ploidy or max.ploidy not within expected range.")

    if (min(test.num.copy) < 0) 
        .stopUserError("test.num.copy not within expected range.")
    
    if (min(test.num.copy) > 0 || max(test.num.copy)>8) 
        flog.warn("test.num.copy outside recommended range.")
                   
    .checkFraction(max.non.clonal, "max.non.clonal")
    .checkFraction(max.homozygous.loss[1], "max.homozygous.loss")
    .checkFraction(prior.K, "prior.K")
    .checkFraction(prior.contamination, "prior.contamination")
    .checkFraction(min.gof, "min.gof")
    .checkFraction(max.mapping.bias, "max.mapping.bias")

    tmp <- sapply(prior.purity, .checkFraction, "prior.purity")

    if (!is.null(sampleid) && (!is(sampleid, "character") ||
        length(sampleid) != 1)) {
        .stopUserError("sampleid not a character string.")
    }
    if (abs(1-sum(prior.purity)) > 0.02) {
        .stopUserError("prior.purity must add to 1. Sum is ", sum(prior.purity))
    }    
    if (length(prior.purity) != length(test.purity)) {
        .stopUserError("prior.purity must have the same length as ",
            "test.purity.")
    }    
    if (!is.null(interval.file) && !file.exists(interval.file)) {
        .stopUserError("interval.file ", interval.file, " not found.")
    }

    stopifnot(is.numeric(min.ploidy))
    stopifnot(is.numeric(max.ploidy))
    stopifnot(is.numeric(test.purity))
    stopifnot(is.numeric(iterations))
    stopifnot(is.numeric(log.ratio.calibration))
    stopifnot(is.logical(model.homozygous))

    if (iterations < 10 || iterations > 250) {
        .stopUserError("Iterations not in the expected range from 10 to 250.")
    }    
}

.failedNonAberrant <- function(result, cutoffs = c(0.01, 0.005)) {
    xx <- split(result$seg, result$seg$C)
    if (length(xx) < 3) 
        return(TRUE)
    xx.sum <- sort(vapply(xx, function(x) sum(x$size), double(1)), 
        decreasing = TRUE)
    xx.sum <- xx.sum/sum(xx.sum)
    if (xx.sum[2] <= cutoffs[1] && xx.sum[3] <= cutoffs[2]) 
        return(TRUE)
    FALSE
}
.filterDuplicatedResults <- function(results, purity.cutoff = 0.1) {
    if (length(results) < 2) 
        return(results)
    idx.duplicated <- rep(FALSE, length(results))

    for (i in seq_len(length(results)-1)) {
        for (j in seq(i+1,length(results))) {
            if (abs(results[[i]]$purity - results[[j]]$purity) < purity.cutoff &&
                abs(results[[i]]$ploidy - results[[j]]$ploidy) /
                results[[i]]$ploidy < 0.1) {
                idx.duplicated[j] <- TRUE
            } 
        }
    }
    results[!idx.duplicated]
}
.filterDuplicatedCandidates <- function(candidates) {
    if (nrow(candidates) < 2) 
        return(candidates)
    idx.duplicated <- rep(FALSE, nrow(candidates))

    for (i in seq_len(nrow(candidates)-1)) {
        for (j in seq(i+1,nrow(candidates))) {
            if (abs(candidates$purity[i] - candidates$purity[j]) < 0.1 &&
                abs(candidates$tumor.ploidy[i] - candidates$tumor.ploidy[j]) / 
                candidates$tumor.ploidy[i] < 0.1) {
                idx.duplicated[j] <- TRUE
            } 
        }
    }
    candidates[!idx.duplicated, ]
}
.findLocalMinima <- function(m) {
    loc.min <- matrix(nrow = 0, ncol = 2)
    for (i in seq_len(nrow(m))) {
        for (j in seq_len(ncol(m))) {
            x <- seq(i - 1, i + 1)
            x <- x[x >= 1 & x <= nrow(m)]
            y <- seq(j - 1, j + 1)
            y <- y[y >= 1 & y <= ncol(m)]
            if (m[i, j] == max(m[x, y]) && !is.infinite(m[i, j])) 
                loc.min <- rbind(loc.min, c(row = i, col = j))
        }
    }
    loc.min
}
.appendComment <- function(a, b) {
    if (is.na(a)) 
        return(b)
    paste(a, b, sep = ";")
}
.isRareKaryotype <- function(ploidy) {
    ploidy > 4.5 || ploidy < 1.5
}
    
.flagResult <- function(result, max.non.clonal = 0.2, min.gof, 
    use.somatic.status, model.homozygous) {
    result$flag_comment <- NA
    result$flag <- .failedNonAberrant(result)
    if (result$flag) {
        result$flag_comment <- .appendComment(result$flag_comment, 
            "NON-ABERRANT")
    }
    if (result$fraction.subclonal > max.non.clonal*0.75) {
        result$flag <- TRUE
        result$flag_comment <- .appendComment(result$flag_comment, 
            "POLYGENOMIC")
    }
    if (result$fraction.homozygous.loss > 0.01) {
        result$flag <- TRUE
        result$flag_comment <- .appendComment(result$flag_comment, 
            "EXCESSIVE LOSSES")
    }
    if (.isRareKaryotype(result$ploidy)) {
        result$flag <- TRUE
        result$flag_comment <- .appendComment(result$flag_comment, 
            "RARE KARYOTYPE")
    }
    if (result$purity < 0.3) {
        result$flag <- TRUE
        result$flag_comment <- .appendComment(result$flag_comment, 
            "LOW PURITY")
    }
    if (result$purity > 0.9 && !model.homozygous && 
        (!is.null(use.somatic.status) && !use.somatic.status)) {
        result$flag <- TRUE
        result$flag_comment <- .appendComment(result$flag_comment, 
            "HIGH PURITY AND model.homozygous=FALSE")
    }
    result$GoF <- .getGoF(result)

    if (!is.null(result$GoF) && result$GoF < min.gof) {
        result$flag <- TRUE
        result$flag_comment <- .appendComment(result$flag_comment, 
            paste0("POOR GOF (", round(result$GoF*100,digits=1),"%)"))
    }

    fraction.loh <- .getFractionLoh(result)
    # Assume that everything below 2.6 did not undergo genome duplication, which can
    # result in lots of LOH
    if (result$ploidy < 2.6 && fraction.loh > 0.5) {
        result$flag <- TRUE
        result$flag_comment <- .appendComment(result$flag_comment, "EXCESSIVE LOH")
    }
    return(result)
}

.getFractionLoh <- function(result) {
    if (is.null(result$SNV.posterior)) return(0)
    pp <- result$SNV.posterior$posteriors
    x1 <- unique(pp$seg.id[pp$ML.M.SEGMENT==0])
    sum(result$seg$size[x1])/sum(result$seg$size)
}
.getGoF <- function(result) {
    if (is.null(result$SNV.posterior)) return(0)
    r <- result$SNV.posterior$posteriors
    e <- (r$ML.AR-r$AR.ADJUSTED)^2
    maxDist <- 0.2^2
    r2 <- max(1-mean(e,na.rm=TRUE)/maxDist,0)
    return(r2)
}    

.flagResults <- function(results, max.non.clonal = 0.2, max.logr.sdev, 
    logr.sdev, max.segments, min.gof, flag = NA, flag_comment = NA, 
    dropout=FALSE, use.somatic.status=TRUE, model.homozygous=FALSE) {
    if (length(results) < 1) return(results)

    results <- lapply(results, .flagResult, max.non.clonal=max.non.clonal, 
        min.gof=min.gof, use.somatic.status=use.somatic.status, 
        model.homozygous=model.homozygous)

    number.segments <- nrow(results[[1]]$seg)
    
    # some global flags
    if (logr.sdev > max.logr.sdev) {
        for (i in seq_along(results)) {
            results[[i]]$flag <- TRUE
            results[[i]]$flag_comment <- .appendComment(results[[i]]$flag_comment, 
                "NOISY LOG-RATIO")
        }
    }

    if (number.segments > max.segments) {
        for (i in seq_along(results)) {
            results[[i]]$flag <- TRUE
            results[[i]]$flag_comment <- .appendComment(results[[i]]$flag_comment, 
                "NOISY SEGMENTATION")
        }
    }

    if (dropout) {
        for (i in seq_along(results)) {
            results[[i]]$flag <- TRUE
            results[[i]]$flag_comment <- .appendComment(results[[i]]$flag_comment, 
                "HIGH AT- OR GC-DROPOUT")
        }
    }
    
    if (!is.na(flag) && flag) {
        for (i in seq_along(results)) {
            results[[i]]$flag <- TRUE
            results[[i]]$flag_comment <- .appendComment(results[[i]]$flag_comment, 
                flag_comment)
        }
    }
    results
}
.getGeneCalls <- function(seg.adjusted, tumor, log.ratio, fun.focal, 
    args.focal, chr.hash) {
    args.focal <- c(list(seg = seg.adjusted), args.focal)
    focal <- do.call(fun.focal, args.focal)
    abs.gc <- GRanges(seqnames = .add.chr.name(seg.adjusted$chrom, chr.hash), IRanges(start = seg.adjusted$loc.start, 
        end = seg.adjusted$loc.end))

    # that will otherwise mess up the log-ratio means, mins and maxs
    idx <- which(!is.nan(log.ratio) & is.finite(log.ratio) & tumor$Gene != ".")
    if (!length(idx)) return(NA)
    tumor <- tumor[idx]
    log.ratio <- log.ratio[idx]
    if (is.null(tumor$weights)) tumor$weights <- 1
    
    ov <- findOverlaps(tumor, abs.gc)
    if (is.null(seg.adjusted$weight.flagged)) seg.adjusted$weight.flagged <- NA
    # use funky data.table to calculate means etc. in two lines of code.
    dt <- data.table(as.data.frame(tumor[queryHits(ov)]), C = seg.adjusted[subjectHits(ov), "C"], 
        C.flagged = seg.adjusted$weight.flagged[subjectHits(ov)],
        seg.mean = seg.adjusted[subjectHits(ov), "seg.mean"], LR = log.ratio[queryHits(ov)], 
        seg.id = subjectHits(ov), seg.length = seg.adjusted$size[subjectHits(ov)], 
        focal = focal[subjectHits(ov)]
        )
    # some targets have multipe genes assigned?
    if (sum(grepl(",", dt$Gene))) {
        dt <- dt[, list(Gene = unlist(strsplit(as.character(Gene), ",", fixed=TRUE))), 
            by = list(seqnames, start, end, C, C.flagged, seg.mean, seg.id,
                      seg.length, LR, focal, weights)]
    }
    multi_chrom_symbols <- names(which(sapply(lapply(split(as.character(dt$seqnames), dt$Gene), unique), length) > 1))
    if (length(multi_chrom_symbols)) {
        flog.warn("Some gene symbols found on multiple chromosomes. Use of approved symbols suggested.")
        idx <- which(dt$Gene %in% multi_chrom_symbols)
        dt$Gene[idx] <- paste(dt$Gene[idx], dt$seqnames[idx], sep = "__")
    }    
    gene.calls <- data.frame(dt[, list(chr = seqnames[1], start = min(start), 
        end = max(end), 
        C = as.double(C[which.min(seg.length)]), 
        C.flagged = any(C.flagged),
        seg.mean = seg.mean[which.min(seg.length)],
        seg.id = seg.id[which.min(seg.length)], 
        .min.segid=min(seg.id), .max.segid=max(seg.id),
        number.targets = length(start), 
        gene.mean = weighted.mean(LR, weights, na.rm = TRUE), 
        gene.min = min(LR, na.rm = TRUE), gene.max = max(LR, na.rm = TRUE), 
        focal = focal[which.min(seg.length)]), by = Gene], row.names = 1)
    breakpoints <- gene.calls$.max.segid - gene.calls$.min.segid
    gene.calls$breakpoints <- breakpoints
    gene.calls    
}


.extractCountMatrix <- function(coverages) {
    useCounts <- .coverageHasCounts(coverages)
    if (useCounts) {
        return(do.call(cbind, 
            lapply(coverages, function(x) x$counts)))
    }
    # TODO: remove spring 2018 release
    flog.info("Coverage file does not contain read count information, %s", 
        "using total coverage for calculating log-ratios.")
    do.call(cbind, 
      lapply(coverages, function(x) x$coverage))
}
.coverageHasCounts <- function(coverages) {
    for (i in seq_along(coverages)) 
        if (sum(!is.na(coverages[[i]]$counts))==0) return(FALSE)
    return(TRUE)        
}
.checkSymbolsChromosome <- function(tumor, blank=c("", ".")) {
    if (is.null(tumor$Gene)) {
        tumor$Gene <- "."
        return(tumor)
    }    
    chrsPerSymbol <- lapply(split(seqnames(tumor), tumor$Gene), unique)
    nonUniqueSymbols <- names(chrsPerSymbol[sapply(chrsPerSymbol, length)>1])
    idx <- tumor$Gene %in% nonUniqueSymbols
    idxBlank <- tumor$Gene %in% blank
    tumor$Gene <- as.character(tumor$Gene)
    tumor$Gene[idx] <- paste(tumor$Gene, tumor$chr, sep=".")[idx]
    tumor$Gene[idxBlank] <- "."
    tumor
}
    
.get2DPurityGrid <- function(test.purity, by=1/30) {
    startPurity <- max(0.1, min(test.purity))
    endPurity <- min(0.99, max(test.purity))
    grid <- seq(startPurity, endPurity, by=by)   
    if (startPurity < 0.34 && endPurity > 0.35) {
        grid <- c(seq(startPurity, 0.34, by=1/50), 
        seq(0.35, endPurity, by=by))
    } 
    grid       
}
    
.optimizeGrid <- function(test.purity, min.ploidy, max.ploidy, test.num.copy = 0:7, 
    exon.lrs, seg, sd.seg, li, max.exon.ratio, max.non.clonal, BPPARAM) {
    ploidy.grid <- seq(min.ploidy, max.ploidy, by = 0.2)
    if (min.ploidy < 1.8 && max.ploidy > 2.2) {
        ploidy.grid <- c(seq(min.ploidy, 1.8, by = 0.2), 1.9, 2, 2.1, seq(2.2, max.ploidy, 
            by = 0.2))
    }

    .optimizeGridPurity <- function(p) {
        b <- 2 * (1 - p)
        log.ratio.offset <- 0
        lapply(ploidy.grid, function(D) {
            dt <- p/D
            llik.all <- lapply(seq_along(exon.lrs), function(i) .calcLlikSegmentExonLrs(exon.lrs[[i]], 
                log.ratio.offset, max.exon.ratio, sd.seg, dt, b, D, test.num.copy))
            subclonal <- vapply(llik.all, which.max, double(1)) == 1
            subclonal.f <- length(unlist(exon.lrs[subclonal]))/length(unlist(exon.lrs))
            if (subclonal.f > max.non.clonal) return(-Inf)
            sum(vapply(llik.all, max, double(1)))
        })
    }
    if (is.null(BPPARAM)) {
        mm <- lapply(test.purity, .optimizeGridPurity)
    } else {
        mm <- BiocParallel::bplapply(test.purity, .optimizeGridPurity, BPPARAM = BPPARAM)
    }
    mm <- sapply(mm, function(x) unlist(x))
    colnames(mm) <- test.purity
    rownames(mm) <- ploidy.grid

    if (!sum(as.vector(is.finite(mm)))) {
        .stopUserError("Cannot find valid purity/ploidy solution. ", 
            "This happens when input segmentations are garbage, most likely ",
            "due to a catastrophic sample QC failure. Re-check standard QC ",
            "metrics for this sample.")
    }

    ai <- .findLocalMinima(mm)
    candidates <- data.frame(ploidy = as.numeric(rownames(mm)[ai[, 1]]), purity = as.numeric(colnames(mm)[ai[, 
        2]]), llik = mm[ai])
    candidates$tumor.ploidy <- (candidates$ploidy - 2 * (1 - candidates$purity))/candidates$purity
    
    # add diploid candidate solutions in the following purity grid in 
    # case there are none.
    grid <- seq(0,1,by=1/4)
    for (i in seq_along(grid)[-length(grid)]) {
   
        t1 <- which.min(abs(as.numeric(colnames(mm)) - grid[i]))
        t2 <- which.min(abs(as.numeric(colnames(mm)) - grid[i+1]))
        if (t2-t1 < 2) next
        
        if (sum(candidates$purity>grid[i] & 
            candidates$purity< grid[i+1], na.rm=TRUE) < 1) next

        # Nothing close to diplod in this range? Then add.
        if (min(abs(2 - candidates$tumor.ploidy[candidates$purity>grid[i] & 
            candidates$purity< grid[i+1]])) > 0.3) {

            mm.05 <- mm[, seq(t1+1, t2), drop=FALSE]
            
            # Find row most similar to normal diploid
            diploidRowId <- which.min(abs(2-as.numeric(row.names(mm.05))))
            # assert that rownames are still what they should be
            if (diploidRowId != which.min(abs(2-ploidy.grid))) {
                .stopRuntimeError("Cannot find diploid row in grid search.")
            }
            candidates <- rbind(candidates, 
                c(2, as.numeric(names(which.max(mm.05[diploidRowId, ]))), 
                max(mm.05[diploidRowId, ]), 2))

            # Remove again if too similar with existing candidate
            if (nrow(candidates) > 2 && 
                abs(Reduce("-",tail(candidates$ploidy,2))) < 0.001 && 
                abs(Reduce("-",tail(candidates$purity,2))) < 0.1) {
                candidates <- candidates[- (nrow(candidates) - 2 + 
                    which.min(tail(candidates$llik,2))),]
            }    
        }
    }
    
    candidates <- candidates[candidates$tumor.ploidy >= 0.5, ]
    candidates <- .filterDuplicatedCandidates(candidates)
    
    list(all = mm, candidates = candidates[order(candidates$llik, decreasing = TRUE), 
        ])
}
.calcLlikSegmentExonLrs <- function(exon.lrs, log.ratio.offset, max.exon.ratio, sd.seg, 
    dt, b, D, test.num.copy) {
    c(.calcLlikSegmentSubClonal(exon.lrs + log.ratio.offset, max.exon.ratio), vapply(test.num.copy, 
        function(Ci) sum(dnorm(exon.lrs + log.ratio.offset, mean = log2(dt * Ci + 
            b/D), sd = sd.seg, log = TRUE)),double(1)))
}

# This function is used to punish more complex copy number models
# a little bit. Based on the BIC. This just counts the number of utilized 
# copy number states, excluding normal 2. Then multiplies by 
# log(number exons)
.calcComplexityCopyNumber <- function(results) {
    if (length(results) < 2) return(0)
    cs <- sapply((0:7)[-3], function(i) sapply(results, function(y)
                    sum(y$seg$size[y$seg$C == i])/sum(y$seg$size)))
    complexity <- apply(cs,1, function(z) sum(z>0.001))
    n <- sum(results[[1]]$seg$num.mark, na.rm=TRUE)
    -complexity*log(n)
}

.rankResults <- function(results) {
    if (length(results) < 1) return(results)  
    complexity <- .calcComplexityCopyNumber(results) 
    for (i in seq_along(results)) {
        if (is.null(results[[i]]$SNV.posterior)) {
            results[[i]]$total.log.likelihood <- results[[i]]$log.likelihood
        } else {
            results[[i]]$total.log.likelihood <- results[[i]]$log.likelihood/2 + results[[i]]$SNV.posterior$llik
        }
        results[[i]]$total.log.likelihood <- results[[i]]$total.log.likelihood + complexity[i]
    }
    idx.opt <- order(sapply(results, function(z) z$total.log.likelihood), decreasing = TRUE)
    
    results <- results[idx.opt]
    
    # remove solutions with -inf likelihood score
    results[!is.infinite(sapply(results, function(z) z$total.log.likelihood))]
}

.sampleOffsetFast <- function(test.num.copy, seg, exon.lrs, sd.seg, p, C, total.ploidy, 
    max.exon.ratio, simulated.annealing, log.ratio.calibration) {
    # Gibbs sample offset
    test.offset <- seq(sd.seg * -log.ratio.calibration, sd.seg * log.ratio.calibration, 
        by = 0.01)
    test.offset <- seq(p * -log.ratio.calibration, p * log.ratio.calibration * 0.2, length.out=12)

    seg.ids.by.chr <- list(seq_len(nrow(seg)))
    
    lr <- lapply(seq_along(seg.ids.by.chr), function(j) {
        px.offset <- lapply(test.offset, function(px) vapply(seg.ids.by.chr[[j]], 
            function(i) {
                b <- 2 * (1 - p)
                D <- total.ploidy
                dt <- p/D
                llik.all <- .calcLlikSegmentExonLrs(exon.lrs[[i]], px, max.exon.ratio, 
                  sd.seg, dt, b, D, test.num.copy)
                vapply(llik.all, max, double(1))
            }, double(1+length(test.num.copy))))
        px.offset.s <- vapply(lapply(px.offset, apply, 2, max), sum, double(1))
        
        px.offset.s <- exp(px.offset.s - max(px.offset.s))
        log.ratio.offset <- test.offset[min(which(runif(n = 1, min = 0, max = sum(px.offset.s)) <= 
            cumsum(px.offset.s)))]
    })
    do.call(c, lapply(seq_along(lr), function(i) rep(lr[[i]], length(seg.ids.by.chr[[i]]))))
}
.sampleError <- function(subclonal, seg, exon.lrs, sd.seg, p, C, total.ploidy, max.exon.ratio, 
    simulated.annealing, iter, log.ratio.calibration, log.ratio.offset) {
    # Gibbs sample error
    test.error <- seq(sd.seg, sd.seg + sd.seg * 0.2, length.out = 5)

    seg.ids <- seq_len(nrow(seg))
    
    px.error <- lapply(test.error, function(error) vapply(seg.ids, 
        function(i) .calcLlikSegment(subclonal[i], exon.lrs[[i]] + log.ratio.offset[i], error, 
            p, C[i], total.ploidy, max.exon.ratio), double(1))
    )
    px.error.s <- sapply(px.error, sum, na.rm = TRUE)
    if (simulated.annealing) 
        px.error.s <- px.error.s * exp(iter/4)
    px.error.s <- exp(px.error.s - max(px.error.s))
    error <- test.error[min(which(runif(n = 1, min = 0, max = sum(px.error.s)) <= 
        cumsum(px.error.s)))]
}

.sampleOffset <- function(subclonal, seg, exon.lrs, sd.seg, p, C, total.ploidy, max.exon.ratio, 
    simulated.annealing, iter, log.ratio.calibration = 0.25) {
    # Gibbs sample offset
    test.offset <- seq(sd.seg * -log.ratio.calibration, sd.seg * log.ratio.calibration, 
        by = 0.01)
    test.offset <- seq(p * -log.ratio.calibration, p * log.ratio.calibration * 0.2, length.out=12)
    seg.ids.by.chr <- list(seq_len(nrow(seg)))
    
    lr <- lapply(seq_along(seg.ids.by.chr), function(j) {
        px.offset <- lapply(test.offset, function(px) vapply(seg.ids.by.chr[[j]], 
            function(i) .calcLlikSegment(subclonal[i], exon.lrs[[i]] + px, sd.seg, 
                p, C[i], total.ploidy, max.exon.ratio), double(1))
        )
        
        px.offset.s <- sapply(px.offset, sum, na.rm = TRUE)
        if (simulated.annealing) 
            px.offset.s <- px.offset.s * exp(iter/4)
        px.offset.s <- exp(px.offset.s - max(px.offset.s))
        log.ratio.offset <- test.offset[min(which(runif(n = 1, min = 0, max = sum(px.offset.s)) <= 
            cumsum(px.offset.s)))]
    })
    do.call(c, lapply(seq_along(lr), function(i) rep(lr[[i]], length(seg.ids.by.chr[[i]]))))
}
.removeOutliers <- function(x, na.rm=TRUE,...) {
    if (length(x) < 5) 
        return(x)
    qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm, ...)
    H <- 1.5 * IQR(x, na.rm = na.rm)
    if (is.na(H) || H < 0.001) return(x)
    # find points outside the 'boxplot' range
    idx <- x <= (qnt[1] - H) | x >= (qnt[2] + H)
    x[!idx]
}    
.calcPuritySomaticVariants <- function(vcf, tumor.id.in.vcf) {
    idx <- info(vcf)[[paste0(.getPureCNPrefixVcf(vcf), "PR")]] > 0.5
    median(unlist(geno(vcf[idx])$FA[, tumor.id.in.vcf]), na.rm = TRUE)/0.48
}
.robustSd <- function(d, size = 25) median(
sapply(split(d, ceiling(seq_along(d) / size)), sd, na.rm = TRUE), 
na.rm = TRUE)
.createFakeLogRatios <- function(tumor, seg.file, sampleid, chr.hash, 
    model.homozygous=FALSE, max.logr.sdev) {
    if (!is.null(tumor$log.ratio)) {
         # calculate log.ratio sd in chunks of size 25 to estimate the 
         # segmented sd

        if (.robustSd(tumor$log.ratio) < max.logr.sdev) {
            flog.info("Found log2-ratio in tumor coverage data.")
            idx <- tumor$log.ratio < -8
            if (any(idx)) {
                flog.warn("log2-ratio contains outliers < -8, ignoring them...")
                tumor$log.ratio[idx] <- NA
            }    
            return(tumor$log.ratio)
        } else {
            flog.info("Provided log2-ratio looks too noisy, using segmentation only.")
        }
    }
    if (is(seg.file, "character")) {
        seg <- readSegmentationFile(seg.file, sampleid, model.homozygous = model.homozygous, verbose=FALSE)    
    } else {
        seg <-.checkSeg(seg.file, sampleid, model.homozygous, verbose=FALSE)
    }    
    seg.gr <- GRanges(seqnames = .add.chr.name(seg$chrom, chr.hash), 
                IRanges(start = round(seg$loc.start), end = seg$loc.end))

    ov <- findOverlaps(tumor, seg.gr)
    log.ratio <- seg$seg.mean[subjectHits(ov)]
    # sanity check, so that every exon has exactly one segment log-ratio
    log.ratio <- log.ratio[match(seq_len(length(tumor)), queryHits(ov))]
    mean.log.ratio <- mean(subset(log.ratio, !is.infinite(log.ratio)), 
        na.rm = TRUE)
    # calibrate
    log.ratio <- log.ratio - mean.log.ratio
    log.ratio
}
.strip.chr.name <- function(ls, chr.hash) {
    x <- chr.hash[as.character(ls), 2]
    x[is.na(x)] <- as.numeric(ls[is.na(x)])
    x
}
.add.chr.name <- function(ls, chr.hash) {
    x <- as.character(chr.hash$chr[match(ls, chr.hash$number)])
    x[is.na(x)] <- ls[is.na(x)]
    x
}
.getChrHash <- function(ls) {
    ls <- unique(ls)
    chr.hash <- genomeStyles(species="Homo_sapiens")
    chr.hash <- chr.hash[!chr.hash$circular,]
    id <- which.max(apply(chr.hash[,-(1:3)],2,function(x) sum(ls %in%x)))+3
    chr.hash <- data.frame(chr=chr.hash[,id], number=seq_len(nrow(chr.hash)),
        row.names=chr.hash[,id])
    if (sum(!ls %in% chr.hash[,1]) == 0) return(chr.hash)
    data.frame(chr=as.factor(ls), number=seq_along(ls), row.names=ls)
}
.stopUserError <- function(...) {
    msg <- paste(c(...), collapse="")
    msg <- paste0(msg, "\n\nThis is most likely a user error due to",
        " invalid input data or parameters (PureCN ", 
        packageVersion("PureCN"), ").")
    flog.fatal(paste(strwrap(msg),"\n"))
    stop(paste(strwrap(msg),collapse = "\n"), call.= FALSE)
}
.stopRuntimeError <- function(...) {
    msg <- paste(c(...), collapse="")
    msg <- paste0(msg, "\n\nThis runtime error might be caused by",
        " invalid input data or parameters. Please report bug (PureCN ", 
        packageVersion("PureCN"), ").")
    flog.fatal(paste(strwrap(msg),"\n"))
    stop(paste(strwrap(msg),collapse = "\n"), call.= FALSE)
}
.logHeader <- function(l) {
    flog.info(strrep("-", 60))
    flog.info("PureCN %s", as.character(packageVersion("PureCN")))
    flog.info(strrep("-", 60))
    # which arguments are printable in a single line?
    idxSupported <- sapply(l, function(x) class(eval.parent(x))) %in% 
        c("character", "numeric", "NULL", "list", "logical") &
        sapply(l, function(x) object.size(eval.parent(x))) < 1024
    idxSmall <-     
        sapply(l[idxSupported], function(x) length(unlist(eval.parent(x)))) < 12
    idxSupported[idxSupported] <- idxSmall    

    l <- c(l[idxSupported],lapply(l[!idxSupported], function(x) "<data>"))
    argsStrings <- paste(sapply(seq_along(l), function(i) paste0("-", 
        names(l)[i], " ", paste(eval.parent(l[[i]]),collapse=","))),
        collapse=" ")
    flog.info("Arguments: %s", argsStrings)
}
.logFooter <- function() {
    flog.info("Done.")
    flog.info(strrep("-", 60))
}    
.calcGCmetric <- function(gc_bias, coverage, on.target, field = "average.coverage") {
    idx <- which(coverage$on.target==on.target)
    if (!length(idx)) return(NA)
    gcbins <- split(mcols(coverage[idx])[,field], gc_bias[idx] < 0.5)
    mean(gcbins[[1]], na.rm=TRUE) / mean(gcbins[[2]], na.rm=TRUE) 
}
.checkGCBias <- function(normal, tumor, log.ratio, max.dropout, on.target = TRUE) {
    # vector instead of tumor meta column provided
   
    if (is(log.ratio, "numeric") && length(log.ratio) == length(tumor)) {
        tumor$ratio <- 2^(log.ratio)
    } else {
        .stopRuntimeError("tumor and log.ratio do not align in .checkGCBias")
    }    

    gcMetricNormal <- .calcGCmetric(tumor$gc_bias, normal, on.target)
    gcMetricTumor <- .calcGCmetric(tumor$gc_bias, tumor, on.target)
    gcMetricLogRatio <- .calcGCmetric(tumor$gc_bias, tumor, on.target, "ratio")

    if (is.na(gcMetricTumor)) return(FALSE)

    flog.info("AT/GC dropout%s: %.2f (tumor), %.2f (normal), %.2f (coverage log-ratio). ", 
        ifelse(on.target,""," (off-target regions)"), gcMetricTumor,
        gcMetricNormal, gcMetricLogRatio)
    # throw warning only when the gc bias extends to normalzed log-ratio
    max.dropout.half <- sapply(max.dropout, function(x) x + (1 - x)/2)
    if (gcMetricLogRatio < max.dropout.half[1] || 
        gcMetricLogRatio > max.dropout.half[2]) {
        flog.warn("High GC-bias in normalized tumor vs normal log2 ratio.")
        return(TRUE)
    } else if (gcMetricNormal < max.dropout[1] || 
        gcMetricNormal > max.dropout[2] ||
        gcMetricTumor  < max.dropout[1] ||
        gcMetricTumor  > max.dropout[2]) {
        flog.info("High GC-bias in normal and/or tumor coverage.")
    }
    return(FALSE)
}

.gcGeneToCoverage <- function(interval.file, min.coverage, min.total.counts) {
    gc.data <- readIntervalFile(interval.file)
    gc.data$average.coverage <- min.coverage
    gc.data$coverage <- min.coverage * width(gc.data)
    gc.data$counts <- Inf
    gc.data
}

.getCentromerePositions <- function(centromeres, genome, style=NULL) {
    if (is.null(centromeres)) {
        data(centromeres, envir = environment())
        if (genome %in% names(centromeres)) {
            centromeres <- centromeres[[genome]]
            if (!is.null(style)) seqlevelsStyle(centromeres) <- style[1]
        } else {
            centromeres <- NULL
        }
    }
    centromeres
}
.checkArgs <- function(args, fname) {
    dups <- duplicated(names(args)) 
    if (sum(dups)) {
        args <- args[!dups]
        flog.warn("Duplicated arguments in %s", fname)
    }
    args
}
.calcFractionBalanced <- function(p) {
    sum(p$ML.C - p$ML.M.SEGMENT == p$ML.M.SEGMENT, na.rm=TRUE)/nrow(p)
}

# function to adjust log-ratios to segment mean
.postprocessLogRatios <- function(exon.lrs, seg.mean) {

    exon.lrs <- lapply(seq_along(exon.lrs), function(i) {
        if (length(exon.lrs[[i]]) > 3) return(exon.lrs[[i]])
        exon.lrs[[i]] - mean(exon.lrs[[i]]) + seg.mean[i]
    })

    return(exon.lrs)
}

.estimateContamination <- function(pp,  max.mapping.bias = NULL, min.fraction.chromosomes = 0.8) {
    if (is.null(max.mapping.bias)) max.mapping.bias <- 0

    idx <- pp$GERMLINE.CONTHIGH + pp$GERMLINE.CONTLOW > 0.5 & 
        pp$MAPPING.BIAS >= max.mapping.bias &
        pp$MAPPING.BIAS <= (2 - max.mapping.bias)

    if (!length(which(idx))) return(0)
    df <- data.frame(
        chr=pp$chr[idx], 
        AR=sapply(pp$AR.ADJUSTED[idx], function(x) ifelse(x>0.5, 1-x,x)),
        HIGHLOW=ifelse(pp$GERMLINE.CONTHIGH>pp$GERMLINE.CONTLOW, 
            "HIGH", "LOW")[idx]
    )
    # take the chromosome median and then average. the low count
    # might be biased in case contamination rate is < AR cutoff
    estimatedRate <- weighted.mean( 
        sapply(split(df$AR, df$HIGHLOW), median), 
        sapply(split(df$AR, df$HIGHLOW), length)
    )
    fractionChrs <- sum(unique(pp$chr) %in% df$chr)/length(unique(pp$chr))
    estimatedRate <- if (fractionChrs >= min.fraction.chromosomes) estimatedRate else 0
    estimatedRate
}

.calculate_ccf <- function(vaf, depth, purity, C){
    # see DOI: 10.1126/scitranslmed.aaa1408
    if (is.na(vaf)) return(c(NA, NA, NA))
    possible_ccfs <- seq(0.01, 1, 0.01)
    possible_vafs <- (purity * possible_ccfs)/
        ((2 * (1 - purity)) + (purity * C)) #Expected VAF for each CCF
    possible_vafs <- pmax(pmin(possible_vafs, 1), 0)  
    probs <- dbinom(x=round(vaf*depth), size = depth, prob = possible_vafs) #Prob of observed VAF
    names(probs) <- possible_ccfs
    if (!sum(probs)) {
        if (vaf > max(possible_vafs)) return(c(1, 1, 1))
        return(c(NA, NA, NA))
    }
    probs_norm <- probs / sum(probs) #Normalise to get posterior distribution
    probs_sort <- sort(probs_norm, decreasing = TRUE)
    probs_cum <- cumsum(probs_sort)
    n <- sum(probs_cum < 0.95) + 1 #Get 95% confidence interval (95% of probability)
    threshold <- probs_sort[n]
    cint  <- probs[probs_norm >= threshold]
    ccf_point <- as.numeric(names(which.max(probs_norm)))
    ccf_lower <- as.numeric(names(cint)[1])
    ccf_upper <- as.numeric(names(cint)[length(cint)])
    return(c(ccf_point, ccf_lower, ccf_upper))
}

.imputeBetaBin <- function(depth, bias, mu, rho) {
    if (is.null(mu)) { 
        mu <- bias / 2
    } else {    
        idx <- is.na(mu)
        mu[idx] <- (bias/2)[idx]
    }
    if (is.null(rho) || all(is.na(rho))) { 
        rho <- 1 / (1 + depth)
    } else {    
        idx <- is.na(rho)
        rho[idx] <- mean(rho, na.rm = TRUE)
    }
    list(mu = mu, rho = rho)
}    
.calculate_allelic_imbalance <- function(vaf, depth, max.coverage.vcf, bias, mu, rho) {
    impute <- .imputeBetaBin(depth, bias, mu, rho)
    dbetabinom(x = round(vaf * depth), depth, prob = impute$mu, rho = impute$rho, log = TRUE) 
}    

.getExonLrs <- function(seg.gr, tumor, log.ratio, idx = NULL) {
    if (!is.null(idx)) {
        tumor <- tumor[idx]
        log.ratio <- log.ratio[idx]
    }    
    ov.se <- findOverlaps(seg.gr, tumor)
    exon.lrs <- lapply(seq_len(length(seg.gr)), function(i) log.ratio[subjectHits(ov.se)[queryHits(ov.se) == 
        i]])
    exon.lrs <- lapply(exon.lrs, function(x) subset(x, !is.na(x) & !is.infinite(x)))
    # estimate stand. dev. for target logR within targets. this will be used as proxy
    # for sample error.
    targetsPerSegment <- vapply(exon.lrs, length, integer(1))
    if (!sum(targetsPerSegment > 50, na.rm = TRUE) && !is.null(idx)) {
        .stopRuntimeError("Only tiny segments.")
    }
    exon.lrs
}

# there are way too many bugs with this, so let's have one function we can more easily
# debug
.getSeqlevelsStyle <- function(x) {
    if (is(x, "character")) {
        sl <- x
    } else {    
        sl <- try(seqlevels(x))
    }    
    if (is(sl, "character")) {
        style <- seqlevelsStyle(sl)
        if ("Ensembl" %in% style) style <- "Ensembl"
        return(style[1])    
    }
    seqlevelsStyle(x)
}
lima1/PureCN documentation built on Sept. 17, 2024, 5:48 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
lima1/PureCN
Copy number calling and SNV classification using targeted short read sequencing

R/PureCN-internal.R
In lima1/PureCN: Copy number calling and SNV classification using targeted short read sequencing

R Package Documentation

Browse R Packages

We want your feedback!

lima1/PureCN Copy number calling and SNV classification using targeted short read sequencing

R/PureCN-internal.R In lima1/PureCN: Copy number calling and SNV classification using targeted short read sequencing

R Package Documentation

Browse R Packages

We want your feedback!

lima1/PureCN
Copy number calling and SNV classification using targeted short read sequencing

R/PureCN-internal.R
In lima1/PureCN: Copy number calling and SNV classification using targeted short read sequencing