R/callAmplificationsInLowPurity.R

Defines functions .calculate_tumor_normal_noise_ratio callAmplificationsInLowPurity

Documented in callAmplificationsInLowPurity

#' Calling of amplifications in low purity samples
#' 
#' Function to extract amplification from a
#' \code{\link{runAbsoluteCN}} return object in samples of too low purity
#' for the standard \code{\link{callAlterations}}.
#' 
#' 
#' @param res Return object of the \code{\link{runAbsoluteCN}} function.
#' @param normalDB Normal database, created with
#' \code{\link{createNormalDatabase}}. 
#' @param pvalue.cutoff Copy numbers log-ratio cutoffs to call
#' amplifications as calculating using the log-ratios observed in
#' \code{normalDB}
#' @param percentile.cutoff Only report genes with log2-ratio mean
#' exceeding this sample-wise cutoff.
#' @param min.width Minimum number of targets
#' @param all.genes If \code{FALSE}, then only return amplifications
#' passing the thresholds. 
#' @param purity If not \code{NULL}, then scale log2-ratios to the
#' corresponding integer copy number. Useful when accurate ctDNA
#' fractions (between 4-10 percent) are available.
#' @param BPPARAM \code{BiocParallelParam} object. If \code{NULL}, does not
#' use parallelization for fitting local optima.
#' @return A \code{data.frame} with gene-level amplification calls.
#' @author Markus Riester
#' @seealso \code{\link{runAbsoluteCN}} \code{\link{callAlterations}}
#' @examples
#' 
#' data(purecn.example.output)
#' normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 
#'     package = "PureCN")
#' normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 
#'     package = "PureCN")
#' normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file)
#' normalDB <- createNormalDatabase(normal.coverage.files)
#' callAmplificationsInLowPurity(purecn.example.output, normalDB)["EIF2A", ]
#' 
#' @importFrom stats ecdf pnorm na.omit
#' @export callAmplificationsInLowPurity
callAmplificationsInLowPurity <- function(res, normalDB,
pvalue.cutoff = 0.001, percentile.cutoff = 90,
min.width = 3, all.genes = FALSE, purity = NULL, BPPARAM = NULL) {

    if (percentile.cutoff < 0 || percentile.cutoff > 100) {
        .stopUserError("percentile.cutoff not in expected range (0 to 100).")
    }    
    .checkFraction(pvalue.cutoff, "pvalue.cutoff")
    if (!is.null(purity)) .checkFraction(purity, "purity")
    if (!is(res$results[[1]]$gene.calls, "data.frame")) {
        .stopUserError("This function requires gene-level calls.\n",
            "Please add a column 'Gene' containing gene symbols to the ",
            "interval.file.")
    }
    if (is.null(normalDB$sd)) {
        .stopUserError("This function requires a normalDB object generated by PureCN 1.16 or later.")
    }    
    
    pon_lrs_gr <- normalDB$sd$log.ratios  
    mcols(pon_lrs_gr) <- apply(mcols(pon_lrs_gr), 2, .calibrate_log_ratio, normalDB$sd$log.ratios)
    pon_lrs_gr <- subsetByOverlaps(pon_lrs_gr, res$input$log.ratio)
    pon_w_gr <- subsetByOverlaps(normalDB$sd$weights, res$input$log.ratio)
    tumor_normal_noise_ratio <- .calculate_tumor_normal_noise_ratio(res$input$log.ratio, pon_lrs_gr)

    flog.info("Tumor/normal noise ratio: %.3f", tumor_normal_noise_ratio)
    if (tumor_normal_noise_ratio > 5) {
        flog.warn("Extensive noise in tumor compared to normals.")
        tumor_normal_noise_ratio <- 5
    }    
    tumor_normal_noise_ratio <- max(1, tumor_normal_noise_ratio)
        
    calls <- res$results[[1]]$gene.calls
    calls$chr <- as.character(calls$chr)
    calls$p.value <- NA
    calls$percentile.genome <- ecdf(calls$gene.mean)(calls$gene.mean) * 100
    calls$percentile.chromosome <- NA 
    calls_gr <- GRanges(calls)

    .get_gene_pv <- function(g) {
        g_gr <- calls_gr[g]
        # recalculate gene mean to make sure the same intervals are
        # used in tumor and pon
        pon_gene_w <- subsetByOverlaps(pon_w_gr, g_gr)$weights
        tumor_gene <- weighted.mean(
            subsetByOverlaps(res$input$log.ratio, g_gr)$log.ratio,
            pon_gene_w)
        # speed-up
        if (tumor_gene < 0 || length(pon_gene_w) < min.width ||
            (!all.genes && g_gr$percentile.genome < percentile.cutoff)) {
            return(list(gene.mean = tumor_gene, p.value = 0.5))
        }
        pon_lrs <- mcols(subsetByOverlaps(pon_lrs_gr, g_gr))
        pon_gene_lrs <- apply(pon_lrs, 2, weighted.mean, pon_gene_w, na.rm = TRUE)
        
        list(gene.mean = tumor_gene,
             p.value = pnorm(g_gr$gene.mean, mean = mean(pon_gene_lrs), 
                sd = sd(pon_gene_lrs) * tumor_normal_noise_ratio, lower.tail = FALSE),
             pon.sd = sd(pon_gene_lrs))
    }
    if (!is.null(BPPARAM) && !requireNamespace("BiocParallel", quietly = TRUE)) {
        flog.warn("Install BiocParallel for parallel optimization.")
        BPPARAM <- NULL
    } else if (!is.null(BPPARAM)) {
        flog.info("Using BiocParallel for parallel optimization.")
    }

    if (is.null(BPPARAM)) {
        gene_level_summary <- lapply(rownames(calls), .get_gene_pv)
    } else {
        gene_level_summary <- BiocParallel::bplapply(rownames(calls),
                                          .get_gene_pv, BPPARAM = BPPARAM)
    }    

    calls$p.value <- sapply(gene_level_summary, function(x) x$p.value)
    # sanity check that there is no difference in tumor and pon gene.mean
    # calculation due to overlapping probes etc. 
    calls$gene.mean <-  sapply(gene_level_summary, function(x) x$gene.mean)
    calls$percentile.genome <- ecdf(calls$gene.mean)(calls$gene.mean) * 100
    ecdf_chr <- sapply(seqlevelsInUse(calls_gr), function(i)
        ecdf(calls$gene.mean[calls$chr == i]))
    idx <- seq(nrow(calls))
    idx <- idx[calls$chr %in% names(ecdf_chr)]
    if (length(idx) != nrow(calls)) {
        flog.warn("Unable to calculate chromosome percentile for %s.",
            paste(sort(unique(calls$chr[!calls$chr %in% names(ecdf_chr)])), collapse=","))
    } 
    calls$percentile.chromosome[idx] <- sapply(idx, function(i)
        ecdf_chr[[calls$chr[i]]](calls$gene.mean[i])) * 100

    calls$type <- NA
    amp.ids <- calls$p.value <= pvalue.cutoff & calls$percentile.genome >= percentile.cutoff
    calls$type[amp.ids] <- "AMPLIFICATION"

    # delete columns not relevant for this algorithm
    calls <- calls[, !grepl("^\\.",colnames(calls))]
    calls$focal <- NULL
    calls$seg.mean <- NULL
    calls$seg.id <- NULL
    # support scaling of C in the future
    calls$C <- NA
    if (!is.null(purity)) {
        calls$C <- (2^calls$gene.mean * 2) / purity -  ((2 * (1 - purity)) / purity) 
    }   
    if (!all.genes) {
        return(calls[!is.na(calls$type),])
    }
    calls
}

.calculate_tumor_normal_noise_ratio <- function(tumor, normals) {
    tumor <- tumor[tumor$on.target]
    normals <- subsetByOverlaps(normals, tumor)
    noise_tumor <- sd(diff(na.omit(tumor$log.ratio)))
    noise_normals <- apply(mcols(normals), 2, function(x) sd(diff(na.omit(x))))
    flog.debug("Noise tumor: %f; Noise normals: %s",
               noise_tumor, paste(noise_normals, collapse = ","))
    ratio <- noise_tumor / mean(noise_normals)    
    if (is.infinite(ratio)) {
        .stopRuntimeError("Tumor/normal noise infinite.")
    }
    return(ratio)
}
lima1/PureCN documentation built on Sept. 17, 2024, 5:48 a.m.