# Make CMD check happy
globalVariables(names=c("Gene", "LR", "chrom", "seg.id", "seg.length",
"seg.mean", "C.flagged", "weights"))
# calculates the log-likelihood of a segment, given log-ratios, the
# log ratio standard deviation, purity, copy number and ploidy.
# for sub-clonal alterations, it uses a uniform distribution, otherwise
# multiple gaussians for all tested copy numbers.
.calcLlikSegment <- function(subclonal, lr, sd.seg, p, Ci, total.ploidy,
max.exon.ratio) {
if (subclonal) {
return(.calcLlikSegmentSubClonal(lr, max.exon.ratio))
}
.calcLlikSegmentClonal(lr, sd.seg, p, Ci, total.ploidy)
}
.calcLlikSegmentClonal <- function(lr, sd.seg, p, Ci, total.ploidy) {
sum(dnorm(lr, mean = log2((p * Ci + (1 - p) * 2)/total.ploidy),
sd = sd.seg, log = TRUE))
}
.calcLlikSegmentSubClonal <- function(lr, max.exon.ratio) {
sum(dunif(
x = vapply(2^lr, function(y) min(y, max.exon.ratio), double(1)),
min = 0, max = max.exon.ratio, log = TRUE))
}
# Previously calculated likelihood scores do not take the segmentation into
# account. This will find the most likely segment minor copy number
.calcMsSegmentC <- function(yy, test.num.copy, Ci, prior.K, mapping.bias.ok,
seg.id, min.variants.segment) {
prior.M <- list(0.2,0.15,c(0.1,0.25),c(0.1,0.3),c(0.1,0.2,0.55))
prior.M <- c(list(1), lapply(prior.M, function(x) c(x, 1-sum(x))))
max.M <- floor(Ci / 2)
idx.germline <- test.num.copy + length(test.num.copy) + 1
idx.somatic <- test.num.copy + 1
variant.ok <- mapping.bias.ok & is.finite(apply(yy, 1, max))
yys <- lapply(0:max.M, function(Mi) {
for (i in test.num.copy) {
n.cases.germ <- ifelse(Mi==Ci-Mi,1,2)
tmp <- unique(c(0, 1,Mi, Ci-Mi))
n.cases.somatic <- length(tmp)
Cx <- max(i,Ci)
if (i!=Mi && i!=Ci - Mi) {
yy[,idx.germline[i+1]] <- yy[,idx.germline[i+1]] + log(1-prior.K) - log(Cx + 1 - n.cases.germ)
# allow somatic mutations always have M=1
if (i <= 1) {
yy[,idx.somatic[i+1]] <- yy[,idx.somatic[i+1]] + log(prior.K) - log(n.cases.somatic)
} else {
yy[,idx.somatic[i+1]] <- yy[,idx.somatic[i+1]] +
log(1-prior.K) - log(Cx + 1 - n.cases.somatic)
}
} else {
yy[, idx.germline[i + 1]] <- yy[, idx.germline[i + 1]] + log(prior.K) - log(n.cases.germ)
yy[, idx.somatic[i + 1]] <- yy[, idx.somatic[i + 1]] + log(prior.K) - log(n.cases.somatic)
}
}
yy
})
# if not enough variants in segment, flag
flag <- sum(variant.ok) < min.variants.segment
if (!sum(variant.ok)) {
# still use them to avoid erroring out, but we will ignore the output because
# of flagging
variant.ok <- rep(TRUE, length(variant.ok))
}
likelihoodScores <- vapply(yys, function(x) {
sum(apply(x[variant.ok, , drop = FALSE], 1, max))
}, double(1))
best <- which.max(likelihoodScores)
# this should not happen...
if (!length(best)) {
flag <- TRUE
best <- 1
}
# transform and scale
likelihoodScores <- exp(likelihoodScores - likelihoodScores[best])
posterior <- likelihoodScores[best] / sum(likelihoodScores, na.rm = TRUE)
if (is.na(posterior) || posterior < 0.5) flag <- TRUE
list(best = yys[[best]], M = test.num.copy[best], flag = flag, posterior = posterior)
}
# calculate likelihood scores for all possible minor copy numbers
.calcMsSegment <- function(xxi, test.num.copy, opt.C, prior.K, mapping.bias.ok,
seg.id, min.variants.segment) {
lapply(seq_along(xxi), function(i).calcMsSegmentC(xxi[[i]], test.num.copy,
c(test.num.copy, round(opt.C))[i], prior.K, mapping.bias.ok, seg.id, min.variants.segment))
}
.calcSNVLLik <- function(vcf, tumor.id.in.vcf, ov, p, test.num.copy,
C.likelihood, C, opt.C, median.C, snv.model, snv.lr,
sampleid = NULL, cont.rate = 0.01, prior.K, max.coverage.vcf, non.clonal.M,
model.homozygous = FALSE, error = 0.001, max.mapping.bias = 0.8, max.pon,
min.variants.segment) {
.dbeta <- function(x, shape1, shape2, log, size, rho) dbeta(x=x,
shape1 = shape1, shape2 = shape2, log = log)
if (snv.model == "betabin") {
.dbeta <- function(x, shape1, shape2, log, size, rho) {
prob <- ((shape1-1)/(shape1+shape2-2))
dbetabinom(x = round(x * size),prob = prob,
size = size, rho = rho, log = TRUE)
}
}
prefix <- .getPureCNPrefixVcf(vcf)
prior.somatic <- info(vcf)[[paste0(prefix, "PR")]]
prior.cont <- ifelse(prior.somatic < 0.1, cont.rate, 0)
prior.somatic <- prior.somatic - (prior.cont*prior.somatic)
priorHom <- if (model.homozygous) -log(3) else log(0)
haploid.penalty <- 0
if (median.C < 1.1) {
haploid.penalty <- 2
}
subclonal <- apply(C.likelihood[queryHits(ov), ], 1, which.max) == ncol(C.likelihood)
seg.idx <- which(seq_len(nrow(C.likelihood)) %in% queryHits(ov))
ar_all <- unlist(geno(vcf)$FA[, tumor.id.in.vcf])
bias_all <- info(vcf)[[paste0(prefix, "MBB")]]
dp_all <- unlist(geno(vcf)$DP[, tumor.id.in.vcf])
impute <- .imputeBetaBin(dp_all, bias_all,
mu = info(vcf)[[paste0(prefix, "MBMU")]],
rho = info(vcf)[[paste0(prefix, "MBRHO")]])
rho_all <- impute$rho
ar_all <- ar_all / (impute$mu * 2)
ar_all[ar_all > 1] <- 1
if (snv.model!="betabin") dp_all[dp_all>max.coverage.vcf] <- max.coverage.vcf
# Fit variants in all segments
xx <- lapply(seg.idx, function(i) {
# classify germline vs somatic
idx <- subjectHits(ov)[queryHits(ov) == i]
shape1 <- ar_all[idx] * dp_all[idx] + 1
shape2 <- (1 - ar_all[idx]) * dp_all[idx] + 1
mInf_all <- log(double(length(shape1)))
list(vcf.ids=idx, likelihoods=lapply(seq(ncol(C.likelihood)), function(k) {
Ci <- c(test.num.copy, opt.C[i])[k]
priorM <- log(max(Ci,1) + 1 + haploid.penalty)
skip <- test.num.copy > Ci | C.likelihood[i, k] <= 0
p.ar <- lapply(c(0, 1), function(g) {
cns <- test.num.copy
if (!g) cns[1] <- non.clonal.M
dbetax <- (p * cns + g * (1-p)) / (p * Ci + 2 * (1-p))
l <- lapply(seq_along(test.num.copy), function(j) {
if (skip[j]) return(mInf_all)
.dbeta(x = dbetax[j],
shape1 = shape1,
shape2 = shape2, log = TRUE, size = dp_all[idx],
rho = rho_all[idx]) - priorM
})
do.call(cbind,l)
})
p.ar.cont.1 <- .dbeta(x = (p * Ci + 2 * (1 - p - cont.rate))/
(p * Ci + 2 * (1 - p)),shape1=shape1, shape2=shape2,
log=TRUE, size=dp_all[idx], rho = rho_all[idx]) - priorM
p.ar.cont.2 <- .dbeta(x = cont.rate / (p * Ci + 2 * (1 - p)),
shape1 = shape1, shape2 = shape2, log = TRUE,
size = dp_all[idx], rho = rho_all[idx]) - priorM
# add prior probabilities for somatic vs germline
p.ar[[1]] <- p.ar[[1]] + log(prior.somatic[idx])
p.ar[[2]] <- p.ar[[2]] + log(1 - prior.somatic[idx])
# contamination (either homozygous germline, or germline from
# other sample)
p.ar[[3]] <- p.ar.cont.1 + log(prior.cont[idx])
p.ar[[4]] <- p.ar.cont.2 + log(prior.cont[idx])
# homozygous state
p.ar[[5]] <- dbinom(round((1 - ar_all[idx]) * dp_all[idx]),
size = round(dp_all[idx]), prob = error / 3, log = TRUE) +
priorHom + log(1 - prior.somatic[idx])
do.call(cbind, p.ar)
}))
})
tmp <- lapply(seq_along(xx),function(i) .calcMsSegment(xx[[i]]$likelihoods,
test.num.copy, opt.C[seg.idx[i]], prior.K,
mapping.bias.ok = info(vcf[xx[[i]]$vcf.ids])[[paste0(prefix, "MBB")]] >= max.mapping.bias &
info(vcf[xx[[i]]$vcf.ids])[[paste0(prefix, "MBB")]] <= (2 - max.mapping.bias),
seg.id = seg.idx[i], min.variants.segment))
xx <- lapply(tmp, lapply, function(x) x$best)
.extractValues <- function(tmp, field) {
segmentValue <- sapply(seq_along(tmp), function(i)
tmp[[i]][[min(C[seg.idx[i]], max(test.num.copy))+1]][[field]])
segmentValue <- unlist(sapply(seq_along(seg.idx), function(i)
rep(segmentValue[i], sum(seg.idx[i] == queryHits(ov)))))
}
# Get segment M's for each SNV
segment.M <- .extractValues(tmp, "M")
segment.Flag <- .extractValues(tmp, "flag")
segment.Posterior <- .extractValues(tmp, "posterior")
likelihoods <- do.call(rbind,
lapply(seq_along(xx), function(i) Reduce("+",
lapply(seq(ncol(C.likelihood)), function(j)
exp(xx[[i]][[j]]) * C.likelihood[seg.idx[i], j]))))
colnames(likelihoods) <- c(paste("SOMATIC.M", test.num.copy, sep = ""),
paste("GERMLINE.M", test.num.copy, sep = ""), "GERMLINE.CONTHIGH",
"GERMLINE.CONTLOW", "GERMLINE.HOMOZYGOUS")
vcf.ids <- do.call(c, lapply(seg.idx, function(i)
subjectHits(ov)[queryHits(ov) == i]))
rownames(likelihoods) <- vcf.ids
# for very high-level amplifications, all posteriors can be 0, so make sure
# we get valid values here and flag those later.
posteriors <- likelihoods/pmax(rowSums(likelihoods),.Machine$double.xmin)
# this just adds a lot of helpful info to the SNV posteriors
xx <- .extractMLSNVState(posteriors)
posteriors <- cbind(
as.data.frame(rowRanges(vcf[vcf.ids]), row.names=NULL)[, 1:3],
ID = names(vcf[vcf.ids]),
REF = as.character(ref(vcf[vcf.ids])),
ALT = sapply(alt(vcf[vcf.ids]), function(x)
paste(as.character(x), collapse=";")),
posteriors,
xx,
ML.C = C[queryHits(ov)],
ML.M.SEGMENT = segment.M,
M.SEGMENT.POSTERIOR = segment.Posterior,
M.SEGMENT.FLAGGED = segment.Flag,
row.names = NULL
)
posteriors$ML.AR <- (p * posteriors$ML.M +
ifelse(posteriors$ML.SOMATIC, 0, 1) *
(1 - p)) / (p * posteriors$ML.C + 2 * (1 - p))
posteriors$ML.AR[posteriors$ML.AR > 1] <- 1
posteriors$AR <- unlist(geno(vcf[vcf.ids])$FA[, tumor.id.in.vcf])
posteriors$AR.ADJUSTED <- NA
posteriors$MAPPING.BIAS <- info(vcf[vcf.ids])[[paste0(prefix, "MBB")]]
posteriors$AR.ADJUSTED <- pmin(1, posteriors$AR / posteriors$MAPPING.BIAS)
# Extract LOH
posteriors$ML.LOH <- (posteriors$ML.M == posteriors$ML.C |
posteriors$ML.M == 0 | posteriors$ML.C == 1)
posteriors$CN.SUBCLONAL <- subclonal
depth <-as.numeric(geno(vcf[vcf.ids])$DP[, tumor.id.in.vcf])
ar <- posteriors$AR.ADJUSTED
ar[!posteriors$ML.SOMATIC] <- NA
m <- t(apply(cbind(ar, depth, posteriors$ML.C), 1, function(x)
.calculate_ccf(vaf = x[1], depth = x[2], purity = p, C = x[3])))
posteriors$CELLFRACTION <- as.numeric(m[,1])
posteriors$CELLFRACTION.95.LOWER <- as.numeric(m[,2])
posteriors$CELLFRACTION.95.UPPER <- as.numeric(m[,3])
ar <- posteriors$AR.ADJUSTED
posteriors$ALLELIC.IMBALANCE <- .calculate_allelic_imbalance(
vaf = posteriors$AR,
depth = depth,
max.coverage.vcf = max.coverage.vcf,
bias = posteriors$MAPPING.BIAS,
mu = info(vcf[vcf.ids])[[paste0(prefix, "MBMU")]],
rho = info(vcf[vcf.ids])[[paste0(prefix, "MBRHO")]])
rm.snv.posteriors <- apply(likelihoods, 1, max)
idx.ignore <- rm.snv.posteriors == 0 | is.na(rm.snv.posteriors) |
posteriors$MAPPING.BIAS < max.mapping.bias |
posteriors$MAPPING.BIAS > (2 - max.mapping.bias) |
posteriors$start != posteriors$end
posteriors$FLAGGED <- idx.ignore
posteriors$log.ratio <- snv.lr[vcf.ids]
posteriors$depth <- depth
posteriors$prior.somatic <- prior.somatic[vcf.ids]
posteriors$prior.contamination <- prior.cont[vcf.ids]
posteriors$on.target <- info(vcf[vcf.ids])[[paste0(prefix, "OnTarget")]]
posteriors$seg.id <- queryHits(ov)
posteriors$pon.count <- info(vcf[vcf.ids])[[paste0(prefix, "MBPON")]]
if (!is.null(posteriors$pon.count)) {
idx.ignore <- idx.ignore |
(posteriors$pon.count > max.pon & posteriors$prior.somatic > 0.1)
posteriors$FLAGGED <- idx.ignore
}
# change seqnames to chr
colnames(posteriors)[1] <- "chr"
ret <- list(
llik = sum(log(rm.snv.posteriors[!idx.ignore])) - sum(idx.ignore),
likelihoods = likelihoods,
posteriors = posteriors,
vcf.ids = vcf.ids,
posterior.contamination = 0)
ret
}
.extractMLSNVState <- function(snv.posteriors) {
# should not happen, but to avoid failure of which.max
snv.posteriors[is.nan(snv.posteriors)] <- 0
l1 <- apply(snv.posteriors, 1, which.max)
xx <- do.call(rbind, strsplit(colnames(snv.posteriors)[l1], "\\."))
xx[, 1] <- ifelse(xx[, 1] == "GERMLINE", FALSE, TRUE)
xx[, 2] <- gsub("^M", "", xx[, 2])
xx <- as.data.frame(xx, stringsAsFactors = FALSE)
xx[, 1] <- as.logical(xx[, 1])
xx[, 2] <- suppressWarnings(as.numeric(xx[, 2]))
colnames(xx) <- c("ML.SOMATIC", "ML.M")
# set sub-clonal (ML.M=0) to 1
xx$ML.M[xx$ML.SOMATIC & !xx$ML.M] <- 1
xx$POSTERIOR.SOMATIC <- apply(snv.posteriors[, grep("SOMATIC.M",
colnames(snv.posteriors))], 1, sum, na.rm=TRUE)
xx[, c(1,3,2)]
}
.checkFraction <- function(x, name) {
if (!is.numeric(x) || length(x) !=1 ||
x < 0 || x > 1) {
.stopUserError(name, " not within expected range or format.")
}
}
.checkParameters <- function(test.purity, min.ploidy, max.ploidy,
max.non.clonal, max.homozygous.loss, sampleid, prior.K,
prior.contamination, prior.purity, iterations, min.gof, model.homozygous,
interval.file, log.ratio.calibration, test.num.copy, max.mapping.bias) {
if (min(test.purity) <= 0 || max(test.purity) > 0.99)
.stopUserError("test.purity not within expected range.")
if (min.ploidy <= 0 || max.ploidy <= 2)
.stopUserError("min.ploidy or max.ploidy not within expected range.")
if (min(test.num.copy) < 0)
.stopUserError("test.num.copy not within expected range.")
if (min(test.num.copy) > 0 || max(test.num.copy)>8)
flog.warn("test.num.copy outside recommended range.")
.checkFraction(max.non.clonal, "max.non.clonal")
.checkFraction(max.homozygous.loss[1], "max.homozygous.loss")
.checkFraction(prior.K, "prior.K")
.checkFraction(prior.contamination, "prior.contamination")
.checkFraction(min.gof, "min.gof")
.checkFraction(max.mapping.bias, "max.mapping.bias")
tmp <- sapply(prior.purity, .checkFraction, "prior.purity")
if (!is.null(sampleid) && (!is(sampleid, "character") ||
length(sampleid) != 1)) {
.stopUserError("sampleid not a character string.")
}
if (abs(1-sum(prior.purity)) > 0.02) {
.stopUserError("prior.purity must add to 1. Sum is ", sum(prior.purity))
}
if (length(prior.purity) != length(test.purity)) {
.stopUserError("prior.purity must have the same length as ",
"test.purity.")
}
if (!is.null(interval.file) && !file.exists(interval.file)) {
.stopUserError("interval.file ", interval.file, " not found.")
}
stopifnot(is.numeric(min.ploidy))
stopifnot(is.numeric(max.ploidy))
stopifnot(is.numeric(test.purity))
stopifnot(is.numeric(iterations))
stopifnot(is.numeric(log.ratio.calibration))
stopifnot(is.logical(model.homozygous))
if (iterations < 10 || iterations > 250) {
.stopUserError("Iterations not in the expected range from 10 to 250.")
}
}
.failedNonAberrant <- function(result, cutoffs = c(0.01, 0.005)) {
xx <- split(result$seg, result$seg$C)
if (length(xx) < 3)
return(TRUE)
xx.sum <- sort(vapply(xx, function(x) sum(x$size), double(1)),
decreasing = TRUE)
xx.sum <- xx.sum/sum(xx.sum)
if (xx.sum[2] <= cutoffs[1] && xx.sum[3] <= cutoffs[2])
return(TRUE)
FALSE
}
.filterDuplicatedResults <- function(results, purity.cutoff = 0.1) {
if (length(results) < 2)
return(results)
idx.duplicated <- rep(FALSE, length(results))
for (i in seq_len(length(results)-1)) {
for (j in seq(i+1,length(results))) {
if (abs(results[[i]]$purity - results[[j]]$purity) < purity.cutoff &&
abs(results[[i]]$ploidy - results[[j]]$ploidy) /
results[[i]]$ploidy < 0.1) {
idx.duplicated[j] <- TRUE
}
}
}
results[!idx.duplicated]
}
.filterDuplicatedCandidates <- function(candidates) {
if (nrow(candidates) < 2)
return(candidates)
idx.duplicated <- rep(FALSE, nrow(candidates))
for (i in seq_len(nrow(candidates)-1)) {
for (j in seq(i+1,nrow(candidates))) {
if (abs(candidates$purity[i] - candidates$purity[j]) < 0.1 &&
abs(candidates$tumor.ploidy[i] - candidates$tumor.ploidy[j]) /
candidates$tumor.ploidy[i] < 0.1) {
idx.duplicated[j] <- TRUE
}
}
}
candidates[!idx.duplicated, ]
}
.findLocalMinima <- function(m) {
loc.min <- matrix(nrow = 0, ncol = 2)
for (i in seq_len(nrow(m))) {
for (j in seq_len(ncol(m))) {
x <- seq(i - 1, i + 1)
x <- x[x >= 1 & x <= nrow(m)]
y <- seq(j - 1, j + 1)
y <- y[y >= 1 & y <= ncol(m)]
if (m[i, j] == max(m[x, y]) && !is.infinite(m[i, j]))
loc.min <- rbind(loc.min, c(row = i, col = j))
}
}
loc.min
}
.appendComment <- function(a, b) {
if (is.na(a))
return(b)
paste(a, b, sep = ";")
}
.isRareKaryotype <- function(ploidy) {
ploidy > 4.5 || ploidy < 1.5
}
.flagResult <- function(result, max.non.clonal = 0.2, min.gof,
use.somatic.status, model.homozygous) {
result$flag_comment <- NA
result$flag <- .failedNonAberrant(result)
if (result$flag) {
result$flag_comment <- .appendComment(result$flag_comment,
"NON-ABERRANT")
}
if (result$fraction.subclonal > max.non.clonal*0.75) {
result$flag <- TRUE
result$flag_comment <- .appendComment(result$flag_comment,
"POLYGENOMIC")
}
if (result$fraction.homozygous.loss > 0.01) {
result$flag <- TRUE
result$flag_comment <- .appendComment(result$flag_comment,
"EXCESSIVE LOSSES")
}
if (.isRareKaryotype(result$ploidy)) {
result$flag <- TRUE
result$flag_comment <- .appendComment(result$flag_comment,
"RARE KARYOTYPE")
}
if (result$purity < 0.3) {
result$flag <- TRUE
result$flag_comment <- .appendComment(result$flag_comment,
"LOW PURITY")
}
if (result$purity > 0.9 && !model.homozygous &&
(!is.null(use.somatic.status) && !use.somatic.status)) {
result$flag <- TRUE
result$flag_comment <- .appendComment(result$flag_comment,
"HIGH PURITY AND model.homozygous=FALSE")
}
result$GoF <- .getGoF(result)
if (!is.null(result$GoF) && result$GoF < min.gof) {
result$flag <- TRUE
result$flag_comment <- .appendComment(result$flag_comment,
paste0("POOR GOF (", round(result$GoF*100,digits=1),"%)"))
}
fraction.loh <- .getFractionLoh(result)
# Assume that everything below 2.6 did not undergo genome duplication, which can
# result in lots of LOH
if (result$ploidy < 2.6 && fraction.loh > 0.5) {
result$flag <- TRUE
result$flag_comment <- .appendComment(result$flag_comment, "EXCESSIVE LOH")
}
return(result)
}
.getFractionLoh <- function(result) {
if (is.null(result$SNV.posterior)) return(0)
pp <- result$SNV.posterior$posteriors
x1 <- unique(pp$seg.id[pp$ML.M.SEGMENT==0])
sum(result$seg$size[x1])/sum(result$seg$size)
}
.getGoF <- function(result) {
if (is.null(result$SNV.posterior)) return(0)
r <- result$SNV.posterior$posteriors
e <- (r$ML.AR-r$AR.ADJUSTED)^2
maxDist <- 0.2^2
r2 <- max(1-mean(e,na.rm=TRUE)/maxDist,0)
return(r2)
}
.flagResults <- function(results, max.non.clonal = 0.2, max.logr.sdev,
logr.sdev, max.segments, min.gof, flag = NA, flag_comment = NA,
dropout=FALSE, use.somatic.status=TRUE, model.homozygous=FALSE) {
if (length(results) < 1) return(results)
results <- lapply(results, .flagResult, max.non.clonal=max.non.clonal,
min.gof=min.gof, use.somatic.status=use.somatic.status,
model.homozygous=model.homozygous)
number.segments <- nrow(results[[1]]$seg)
# some global flags
if (logr.sdev > max.logr.sdev) {
for (i in seq_along(results)) {
results[[i]]$flag <- TRUE
results[[i]]$flag_comment <- .appendComment(results[[i]]$flag_comment,
"NOISY LOG-RATIO")
}
}
if (number.segments > max.segments) {
for (i in seq_along(results)) {
results[[i]]$flag <- TRUE
results[[i]]$flag_comment <- .appendComment(results[[i]]$flag_comment,
"NOISY SEGMENTATION")
}
}
if (dropout) {
for (i in seq_along(results)) {
results[[i]]$flag <- TRUE
results[[i]]$flag_comment <- .appendComment(results[[i]]$flag_comment,
"HIGH AT- OR GC-DROPOUT")
}
}
if (!is.na(flag) && flag) {
for (i in seq_along(results)) {
results[[i]]$flag <- TRUE
results[[i]]$flag_comment <- .appendComment(results[[i]]$flag_comment,
flag_comment)
}
}
results
}
.getGeneCalls <- function(seg.adjusted, tumor, log.ratio, fun.focal,
args.focal, chr.hash) {
args.focal <- c(list(seg = seg.adjusted), args.focal)
focal <- do.call(fun.focal, args.focal)
abs.gc <- GRanges(seqnames = .add.chr.name(seg.adjusted$chrom, chr.hash), IRanges(start = seg.adjusted$loc.start,
end = seg.adjusted$loc.end))
# that will otherwise mess up the log-ratio means, mins and maxs
idx <- which(!is.nan(log.ratio) & is.finite(log.ratio) & tumor$Gene != ".")
if (!length(idx)) return(NA)
tumor <- tumor[idx]
log.ratio <- log.ratio[idx]
if (is.null(tumor$weights)) tumor$weights <- 1
ov <- findOverlaps(tumor, abs.gc)
if (is.null(seg.adjusted$weight.flagged)) seg.adjusted$weight.flagged <- NA
# use funky data.table to calculate means etc. in two lines of code.
dt <- data.table(as.data.frame(tumor[queryHits(ov)]), C = seg.adjusted[subjectHits(ov), "C"],
C.flagged = seg.adjusted$weight.flagged[subjectHits(ov)],
seg.mean = seg.adjusted[subjectHits(ov), "seg.mean"], LR = log.ratio[queryHits(ov)],
seg.id = subjectHits(ov), seg.length = seg.adjusted$size[subjectHits(ov)],
focal = focal[subjectHits(ov)]
)
# some targets have multipe genes assigned?
if (sum(grepl(",", dt$Gene))) {
dt <- dt[, list(Gene = unlist(strsplit(as.character(Gene), ",", fixed=TRUE))),
by = list(seqnames, start, end, C, C.flagged, seg.mean, seg.id,
seg.length, LR, focal, weights)]
}
multi_chrom_symbols <- names(which(sapply(lapply(split(as.character(dt$seqnames), dt$Gene), unique), length) > 1))
if (length(multi_chrom_symbols)) {
flog.warn("Some gene symbols found on multiple chromosomes. Use of approved symbols suggested.")
idx <- which(dt$Gene %in% multi_chrom_symbols)
dt$Gene[idx] <- paste(dt$Gene[idx], dt$seqnames[idx], sep = "__")
}
gene.calls <- data.frame(dt[, list(chr = seqnames[1], start = min(start),
end = max(end),
C = as.double(C[which.min(seg.length)]),
C.flagged = any(C.flagged),
seg.mean = seg.mean[which.min(seg.length)],
seg.id = seg.id[which.min(seg.length)],
.min.segid=min(seg.id), .max.segid=max(seg.id),
number.targets = length(start),
gene.mean = weighted.mean(LR, weights, na.rm = TRUE),
gene.min = min(LR, na.rm = TRUE), gene.max = max(LR, na.rm = TRUE),
focal = focal[which.min(seg.length)]), by = Gene], row.names = 1)
breakpoints <- gene.calls$.max.segid - gene.calls$.min.segid
gene.calls$breakpoints <- breakpoints
gene.calls
}
.extractCountMatrix <- function(coverages) {
useCounts <- .coverageHasCounts(coverages)
if (useCounts) {
return(do.call(cbind,
lapply(coverages, function(x) x$counts)))
}
# TODO: remove spring 2018 release
flog.info("Coverage file does not contain read count information, %s",
"using total coverage for calculating log-ratios.")
do.call(cbind,
lapply(coverages, function(x) x$coverage))
}
.coverageHasCounts <- function(coverages) {
for (i in seq_along(coverages))
if (sum(!is.na(coverages[[i]]$counts))==0) return(FALSE)
return(TRUE)
}
.checkSymbolsChromosome <- function(tumor, blank=c("", ".")) {
if (is.null(tumor$Gene)) {
tumor$Gene <- "."
return(tumor)
}
chrsPerSymbol <- lapply(split(seqnames(tumor), tumor$Gene), unique)
nonUniqueSymbols <- names(chrsPerSymbol[sapply(chrsPerSymbol, length)>1])
idx <- tumor$Gene %in% nonUniqueSymbols
idxBlank <- tumor$Gene %in% blank
tumor$Gene <- as.character(tumor$Gene)
tumor$Gene[idx] <- paste(tumor$Gene, tumor$chr, sep=".")[idx]
tumor$Gene[idxBlank] <- "."
tumor
}
.get2DPurityGrid <- function(test.purity, by=1/30) {
startPurity <- max(0.1, min(test.purity))
endPurity <- min(0.99, max(test.purity))
grid <- seq(startPurity, endPurity, by=by)
if (startPurity < 0.34 && endPurity > 0.35) {
grid <- c(seq(startPurity, 0.34, by=1/50),
seq(0.35, endPurity, by=by))
}
grid
}
.optimizeGrid <- function(test.purity, min.ploidy, max.ploidy, test.num.copy = 0:7,
exon.lrs, seg, sd.seg, li, max.exon.ratio, max.non.clonal, BPPARAM) {
ploidy.grid <- seq(min.ploidy, max.ploidy, by = 0.2)
if (min.ploidy < 1.8 && max.ploidy > 2.2) {
ploidy.grid <- c(seq(min.ploidy, 1.8, by = 0.2), 1.9, 2, 2.1, seq(2.2, max.ploidy,
by = 0.2))
}
.optimizeGridPurity <- function(p) {
b <- 2 * (1 - p)
log.ratio.offset <- 0
lapply(ploidy.grid, function(D) {
dt <- p/D
llik.all <- lapply(seq_along(exon.lrs), function(i) .calcLlikSegmentExonLrs(exon.lrs[[i]],
log.ratio.offset, max.exon.ratio, sd.seg, dt, b, D, test.num.copy))
subclonal <- vapply(llik.all, which.max, double(1)) == 1
subclonal.f <- length(unlist(exon.lrs[subclonal]))/length(unlist(exon.lrs))
if (subclonal.f > max.non.clonal) return(-Inf)
sum(vapply(llik.all, max, double(1)))
})
}
if (is.null(BPPARAM)) {
mm <- lapply(test.purity, .optimizeGridPurity)
} else {
mm <- BiocParallel::bplapply(test.purity, .optimizeGridPurity, BPPARAM = BPPARAM)
}
mm <- sapply(mm, function(x) unlist(x))
colnames(mm) <- test.purity
rownames(mm) <- ploidy.grid
if (!sum(as.vector(is.finite(mm)))) {
.stopUserError("Cannot find valid purity/ploidy solution. ",
"This happens when input segmentations are garbage, most likely ",
"due to a catastrophic sample QC failure. Re-check standard QC ",
"metrics for this sample.")
}
ai <- .findLocalMinima(mm)
candidates <- data.frame(ploidy = as.numeric(rownames(mm)[ai[, 1]]), purity = as.numeric(colnames(mm)[ai[,
2]]), llik = mm[ai])
candidates$tumor.ploidy <- (candidates$ploidy - 2 * (1 - candidates$purity))/candidates$purity
# add diploid candidate solutions in the following purity grid in
# case there are none.
grid <- seq(0,1,by=1/4)
for (i in seq_along(grid)[-length(grid)]) {
t1 <- which.min(abs(as.numeric(colnames(mm)) - grid[i]))
t2 <- which.min(abs(as.numeric(colnames(mm)) - grid[i+1]))
if (t2-t1 < 2) next
if (sum(candidates$purity>grid[i] &
candidates$purity< grid[i+1], na.rm=TRUE) < 1) next
# Nothing close to diplod in this range? Then add.
if (min(abs(2 - candidates$tumor.ploidy[candidates$purity>grid[i] &
candidates$purity< grid[i+1]])) > 0.3) {
mm.05 <- mm[, seq(t1+1, t2), drop=FALSE]
# Find row most similar to normal diploid
diploidRowId <- which.min(abs(2-as.numeric(row.names(mm.05))))
# assert that rownames are still what they should be
if (diploidRowId != which.min(abs(2-ploidy.grid))) {
.stopRuntimeError("Cannot find diploid row in grid search.")
}
candidates <- rbind(candidates,
c(2, as.numeric(names(which.max(mm.05[diploidRowId, ]))),
max(mm.05[diploidRowId, ]), 2))
# Remove again if too similar with existing candidate
if (nrow(candidates) > 2 &&
abs(Reduce("-",tail(candidates$ploidy,2))) < 0.001 &&
abs(Reduce("-",tail(candidates$purity,2))) < 0.1) {
candidates <- candidates[- (nrow(candidates) - 2 +
which.min(tail(candidates$llik,2))),]
}
}
}
candidates <- candidates[candidates$tumor.ploidy >= 0.5, ]
candidates <- .filterDuplicatedCandidates(candidates)
list(all = mm, candidates = candidates[order(candidates$llik, decreasing = TRUE),
])
}
.calcLlikSegmentExonLrs <- function(exon.lrs, log.ratio.offset, max.exon.ratio, sd.seg,
dt, b, D, test.num.copy) {
c(.calcLlikSegmentSubClonal(exon.lrs + log.ratio.offset, max.exon.ratio), vapply(test.num.copy,
function(Ci) sum(dnorm(exon.lrs + log.ratio.offset, mean = log2(dt * Ci +
b/D), sd = sd.seg, log = TRUE)),double(1)))
}
# This function is used to punish more complex copy number models
# a little bit. Based on the BIC. This just counts the number of utilized
# copy number states, excluding normal 2. Then multiplies by
# log(number exons)
.calcComplexityCopyNumber <- function(results) {
if (length(results) < 2) return(0)
cs <- sapply((0:7)[-3], function(i) sapply(results, function(y)
sum(y$seg$size[y$seg$C == i])/sum(y$seg$size)))
complexity <- apply(cs,1, function(z) sum(z>0.001))
n <- sum(results[[1]]$seg$num.mark, na.rm=TRUE)
-complexity*log(n)
}
.rankResults <- function(results) {
if (length(results) < 1) return(results)
complexity <- .calcComplexityCopyNumber(results)
for (i in seq_along(results)) {
if (is.null(results[[i]]$SNV.posterior)) {
results[[i]]$total.log.likelihood <- results[[i]]$log.likelihood
} else {
results[[i]]$total.log.likelihood <- results[[i]]$log.likelihood/2 + results[[i]]$SNV.posterior$llik
}
results[[i]]$total.log.likelihood <- results[[i]]$total.log.likelihood + complexity[i]
}
idx.opt <- order(sapply(results, function(z) z$total.log.likelihood), decreasing = TRUE)
results <- results[idx.opt]
# remove solutions with -inf likelihood score
results[!is.infinite(sapply(results, function(z) z$total.log.likelihood))]
}
.sampleOffsetFast <- function(test.num.copy, seg, exon.lrs, sd.seg, p, C, total.ploidy,
max.exon.ratio, simulated.annealing, log.ratio.calibration) {
# Gibbs sample offset
test.offset <- seq(sd.seg * -log.ratio.calibration, sd.seg * log.ratio.calibration,
by = 0.01)
test.offset <- seq(p * -log.ratio.calibration, p * log.ratio.calibration * 0.2, length.out=12)
seg.ids.by.chr <- list(seq_len(nrow(seg)))
lr <- lapply(seq_along(seg.ids.by.chr), function(j) {
px.offset <- lapply(test.offset, function(px) vapply(seg.ids.by.chr[[j]],
function(i) {
b <- 2 * (1 - p)
D <- total.ploidy
dt <- p/D
llik.all <- .calcLlikSegmentExonLrs(exon.lrs[[i]], px, max.exon.ratio,
sd.seg, dt, b, D, test.num.copy)
vapply(llik.all, max, double(1))
}, double(1+length(test.num.copy))))
px.offset.s <- vapply(lapply(px.offset, apply, 2, max), sum, double(1))
px.offset.s <- exp(px.offset.s - max(px.offset.s))
log.ratio.offset <- test.offset[min(which(runif(n = 1, min = 0, max = sum(px.offset.s)) <=
cumsum(px.offset.s)))]
})
do.call(c, lapply(seq_along(lr), function(i) rep(lr[[i]], length(seg.ids.by.chr[[i]]))))
}
.sampleError <- function(subclonal, seg, exon.lrs, sd.seg, p, C, total.ploidy, max.exon.ratio,
simulated.annealing, iter, log.ratio.calibration, log.ratio.offset) {
# Gibbs sample error
test.error <- seq(sd.seg, sd.seg + sd.seg * 0.2, length.out = 5)
seg.ids <- seq_len(nrow(seg))
px.error <- lapply(test.error, function(error) vapply(seg.ids,
function(i) .calcLlikSegment(subclonal[i], exon.lrs[[i]] + log.ratio.offset[i], error,
p, C[i], total.ploidy, max.exon.ratio), double(1))
)
px.error.s <- sapply(px.error, sum, na.rm = TRUE)
if (simulated.annealing)
px.error.s <- px.error.s * exp(iter/4)
px.error.s <- exp(px.error.s - max(px.error.s))
error <- test.error[min(which(runif(n = 1, min = 0, max = sum(px.error.s)) <=
cumsum(px.error.s)))]
}
.sampleOffset <- function(subclonal, seg, exon.lrs, sd.seg, p, C, total.ploidy, max.exon.ratio,
simulated.annealing, iter, log.ratio.calibration = 0.25) {
# Gibbs sample offset
test.offset <- seq(sd.seg * -log.ratio.calibration, sd.seg * log.ratio.calibration,
by = 0.01)
test.offset <- seq(p * -log.ratio.calibration, p * log.ratio.calibration * 0.2, length.out=12)
seg.ids.by.chr <- list(seq_len(nrow(seg)))
lr <- lapply(seq_along(seg.ids.by.chr), function(j) {
px.offset <- lapply(test.offset, function(px) vapply(seg.ids.by.chr[[j]],
function(i) .calcLlikSegment(subclonal[i], exon.lrs[[i]] + px, sd.seg,
p, C[i], total.ploidy, max.exon.ratio), double(1))
)
px.offset.s <- sapply(px.offset, sum, na.rm = TRUE)
if (simulated.annealing)
px.offset.s <- px.offset.s * exp(iter/4)
px.offset.s <- exp(px.offset.s - max(px.offset.s))
log.ratio.offset <- test.offset[min(which(runif(n = 1, min = 0, max = sum(px.offset.s)) <=
cumsum(px.offset.s)))]
})
do.call(c, lapply(seq_along(lr), function(i) rep(lr[[i]], length(seg.ids.by.chr[[i]]))))
}
.removeOutliers <- function(x, na.rm=TRUE,...) {
if (length(x) < 5)
return(x)
qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm, ...)
H <- 1.5 * IQR(x, na.rm = na.rm)
if (is.na(H) || H < 0.001) return(x)
# find points outside the 'boxplot' range
idx <- x <= (qnt[1] - H) | x >= (qnt[2] + H)
x[!idx]
}
.calcPuritySomaticVariants <- function(vcf, tumor.id.in.vcf) {
idx <- info(vcf)[[paste0(.getPureCNPrefixVcf(vcf), "PR")]] > 0.5
median(unlist(geno(vcf[idx])$FA[, tumor.id.in.vcf]), na.rm = TRUE)/0.48
}
.robustSd <- function(d, size = 25) median(
sapply(split(d, ceiling(seq_along(d) / size)), sd, na.rm = TRUE),
na.rm = TRUE)
.createFakeLogRatios <- function(tumor, seg.file, sampleid, chr.hash,
model.homozygous=FALSE, max.logr.sdev) {
if (!is.null(tumor$log.ratio)) {
# calculate log.ratio sd in chunks of size 25 to estimate the
# segmented sd
if (.robustSd(tumor$log.ratio) < max.logr.sdev) {
flog.info("Found log2-ratio in tumor coverage data.")
idx <- tumor$log.ratio < -8
if (any(idx)) {
flog.warn("log2-ratio contains outliers < -8, ignoring them...")
tumor$log.ratio[idx] <- NA
}
return(tumor$log.ratio)
} else {
flog.info("Provided log2-ratio looks too noisy, using segmentation only.")
}
}
if (is(seg.file, "character")) {
seg <- readSegmentationFile(seg.file, sampleid, model.homozygous = model.homozygous, verbose=FALSE)
} else {
seg <-.checkSeg(seg.file, sampleid, model.homozygous, verbose=FALSE)
}
seg.gr <- GRanges(seqnames = .add.chr.name(seg$chrom, chr.hash),
IRanges(start = round(seg$loc.start), end = seg$loc.end))
ov <- findOverlaps(tumor, seg.gr)
log.ratio <- seg$seg.mean[subjectHits(ov)]
# sanity check, so that every exon has exactly one segment log-ratio
log.ratio <- log.ratio[match(seq_len(length(tumor)), queryHits(ov))]
mean.log.ratio <- mean(subset(log.ratio, !is.infinite(log.ratio)),
na.rm = TRUE)
# calibrate
log.ratio <- log.ratio - mean.log.ratio
log.ratio
}
.strip.chr.name <- function(ls, chr.hash) {
x <- chr.hash[as.character(ls), 2]
x[is.na(x)] <- as.numeric(ls[is.na(x)])
x
}
.add.chr.name <- function(ls, chr.hash) {
x <- as.character(chr.hash$chr[match(ls, chr.hash$number)])
x[is.na(x)] <- ls[is.na(x)]
x
}
.getChrHash <- function(ls) {
ls <- unique(ls)
chr.hash <- genomeStyles(species="Homo_sapiens")
chr.hash <- chr.hash[!chr.hash$circular,]
id <- which.max(apply(chr.hash[,-(1:3)],2,function(x) sum(ls %in%x)))+3
chr.hash <- data.frame(chr=chr.hash[,id], number=seq_len(nrow(chr.hash)),
row.names=chr.hash[,id])
if (sum(!ls %in% chr.hash[,1]) == 0) return(chr.hash)
data.frame(chr=as.factor(ls), number=seq_along(ls), row.names=ls)
}
.stopUserError <- function(...) {
msg <- paste(c(...), collapse="")
msg <- paste0(msg, "\n\nThis is most likely a user error due to",
" invalid input data or parameters (PureCN ",
packageVersion("PureCN"), ").")
flog.fatal(paste(strwrap(msg),"\n"))
stop(paste(strwrap(msg),collapse = "\n"), call.= FALSE)
}
.stopRuntimeError <- function(...) {
msg <- paste(c(...), collapse="")
msg <- paste0(msg, "\n\nThis runtime error might be caused by",
" invalid input data or parameters. Please report bug (PureCN ",
packageVersion("PureCN"), ").")
flog.fatal(paste(strwrap(msg),"\n"))
stop(paste(strwrap(msg),collapse = "\n"), call.= FALSE)
}
.logHeader <- function(l) {
flog.info(strrep("-", 60))
flog.info("PureCN %s", as.character(packageVersion("PureCN")))
flog.info(strrep("-", 60))
# which arguments are printable in a single line?
idxSupported <- sapply(l, function(x) class(eval.parent(x))) %in%
c("character", "numeric", "NULL", "list", "logical") &
sapply(l, function(x) object.size(eval.parent(x))) < 1024
idxSmall <-
sapply(l[idxSupported], function(x) length(unlist(eval.parent(x)))) < 12
idxSupported[idxSupported] <- idxSmall
l <- c(l[idxSupported],lapply(l[!idxSupported], function(x) "<data>"))
argsStrings <- paste(sapply(seq_along(l), function(i) paste0("-",
names(l)[i], " ", paste(eval.parent(l[[i]]),collapse=","))),
collapse=" ")
flog.info("Arguments: %s", argsStrings)
}
.logFooter <- function() {
flog.info("Done.")
flog.info(strrep("-", 60))
}
.calcGCmetric <- function(gc_bias, coverage, on.target, field = "average.coverage") {
idx <- which(coverage$on.target==on.target)
if (!length(idx)) return(NA)
gcbins <- split(mcols(coverage[idx])[,field], gc_bias[idx] < 0.5)
mean(gcbins[[1]], na.rm=TRUE) / mean(gcbins[[2]], na.rm=TRUE)
}
.checkGCBias <- function(normal, tumor, log.ratio, max.dropout, on.target = TRUE) {
# vector instead of tumor meta column provided
if (is(log.ratio, "numeric") && length(log.ratio) == length(tumor)) {
tumor$ratio <- 2^(log.ratio)
} else {
.stopRuntimeError("tumor and log.ratio do not align in .checkGCBias")
}
gcMetricNormal <- .calcGCmetric(tumor$gc_bias, normal, on.target)
gcMetricTumor <- .calcGCmetric(tumor$gc_bias, tumor, on.target)
gcMetricLogRatio <- .calcGCmetric(tumor$gc_bias, tumor, on.target, "ratio")
if (is.na(gcMetricTumor)) return(FALSE)
flog.info("AT/GC dropout%s: %.2f (tumor), %.2f (normal), %.2f (coverage log-ratio). ",
ifelse(on.target,""," (off-target regions)"), gcMetricTumor,
gcMetricNormal, gcMetricLogRatio)
# throw warning only when the gc bias extends to normalzed log-ratio
max.dropout.half <- sapply(max.dropout, function(x) x + (1 - x)/2)
if (gcMetricLogRatio < max.dropout.half[1] ||
gcMetricLogRatio > max.dropout.half[2]) {
flog.warn("High GC-bias in normalized tumor vs normal log2 ratio.")
return(TRUE)
} else if (gcMetricNormal < max.dropout[1] ||
gcMetricNormal > max.dropout[2] ||
gcMetricTumor < max.dropout[1] ||
gcMetricTumor > max.dropout[2]) {
flog.info("High GC-bias in normal and/or tumor coverage.")
}
return(FALSE)
}
.gcGeneToCoverage <- function(interval.file, min.coverage, min.total.counts) {
gc.data <- readIntervalFile(interval.file)
gc.data$average.coverage <- min.coverage
gc.data$coverage <- min.coverage * width(gc.data)
gc.data$counts <- Inf
gc.data
}
.getCentromerePositions <- function(centromeres, genome, style=NULL) {
if (is.null(centromeres)) {
data(centromeres, envir = environment())
if (genome %in% names(centromeres)) {
centromeres <- centromeres[[genome]]
if (!is.null(style)) seqlevelsStyle(centromeres) <- style[1]
} else {
centromeres <- NULL
}
}
centromeres
}
.checkArgs <- function(args, fname) {
dups <- duplicated(names(args))
if (sum(dups)) {
args <- args[!dups]
flog.warn("Duplicated arguments in %s", fname)
}
args
}
.calcFractionBalanced <- function(p) {
sum(p$ML.C - p$ML.M.SEGMENT == p$ML.M.SEGMENT, na.rm=TRUE)/nrow(p)
}
# function to adjust log-ratios to segment mean
.postprocessLogRatios <- function(exon.lrs, seg.mean) {
exon.lrs <- lapply(seq_along(exon.lrs), function(i) {
if (length(exon.lrs[[i]]) > 3) return(exon.lrs[[i]])
exon.lrs[[i]] - mean(exon.lrs[[i]]) + seg.mean[i]
})
return(exon.lrs)
}
.estimateContamination <- function(pp, max.mapping.bias = NULL, min.fraction.chromosomes = 0.8) {
if (is.null(max.mapping.bias)) max.mapping.bias <- 0
idx <- pp$GERMLINE.CONTHIGH + pp$GERMLINE.CONTLOW > 0.5 &
pp$MAPPING.BIAS >= max.mapping.bias &
pp$MAPPING.BIAS <= (2 - max.mapping.bias)
if (!length(which(idx))) return(0)
df <- data.frame(
chr=pp$chr[idx],
AR=sapply(pp$AR.ADJUSTED[idx], function(x) ifelse(x>0.5, 1-x,x)),
HIGHLOW=ifelse(pp$GERMLINE.CONTHIGH>pp$GERMLINE.CONTLOW,
"HIGH", "LOW")[idx]
)
# take the chromosome median and then average. the low count
# might be biased in case contamination rate is < AR cutoff
estimatedRate <- weighted.mean(
sapply(split(df$AR, df$HIGHLOW), median),
sapply(split(df$AR, df$HIGHLOW), length)
)
fractionChrs <- sum(unique(pp$chr) %in% df$chr)/length(unique(pp$chr))
estimatedRate <- if (fractionChrs >= min.fraction.chromosomes) estimatedRate else 0
estimatedRate
}
.calculate_ccf <- function(vaf, depth, purity, C){
# see DOI: 10.1126/scitranslmed.aaa1408
if (is.na(vaf)) return(c(NA, NA, NA))
possible_ccfs <- seq(0.01, 1, 0.01)
possible_vafs <- (purity * possible_ccfs)/
((2 * (1 - purity)) + (purity * C)) #Expected VAF for each CCF
possible_vafs <- pmax(pmin(possible_vafs, 1), 0)
probs <- dbinom(x=round(vaf*depth), size = depth, prob = possible_vafs) #Prob of observed VAF
names(probs) <- possible_ccfs
if (!sum(probs)) {
if (vaf > max(possible_vafs)) return(c(1, 1, 1))
return(c(NA, NA, NA))
}
probs_norm <- probs / sum(probs) #Normalise to get posterior distribution
probs_sort <- sort(probs_norm, decreasing = TRUE)
probs_cum <- cumsum(probs_sort)
n <- sum(probs_cum < 0.95) + 1 #Get 95% confidence interval (95% of probability)
threshold <- probs_sort[n]
cint <- probs[probs_norm >= threshold]
ccf_point <- as.numeric(names(which.max(probs_norm)))
ccf_lower <- as.numeric(names(cint)[1])
ccf_upper <- as.numeric(names(cint)[length(cint)])
return(c(ccf_point, ccf_lower, ccf_upper))
}
.imputeBetaBin <- function(depth, bias, mu, rho) {
if (is.null(mu)) {
mu <- bias / 2
} else {
idx <- is.na(mu)
mu[idx] <- (bias/2)[idx]
}
if (is.null(rho) || all(is.na(rho))) {
rho <- 1 / (1 + depth)
} else {
idx <- is.na(rho)
rho[idx] <- mean(rho, na.rm = TRUE)
}
list(mu = mu, rho = rho)
}
.calculate_allelic_imbalance <- function(vaf, depth, max.coverage.vcf, bias, mu, rho) {
impute <- .imputeBetaBin(depth, bias, mu, rho)
dbetabinom(x = round(vaf * depth), depth, prob = impute$mu, rho = impute$rho, log = TRUE)
}
.getExonLrs <- function(seg.gr, tumor, log.ratio, idx = NULL) {
if (!is.null(idx)) {
tumor <- tumor[idx]
log.ratio <- log.ratio[idx]
}
ov.se <- findOverlaps(seg.gr, tumor)
exon.lrs <- lapply(seq_len(length(seg.gr)), function(i) log.ratio[subjectHits(ov.se)[queryHits(ov.se) ==
i]])
exon.lrs <- lapply(exon.lrs, function(x) subset(x, !is.na(x) & !is.infinite(x)))
# estimate stand. dev. for target logR within targets. this will be used as proxy
# for sample error.
targetsPerSegment <- vapply(exon.lrs, length, integer(1))
if (!sum(targetsPerSegment > 50, na.rm = TRUE) && !is.null(idx)) {
.stopRuntimeError("Only tiny segments.")
}
exon.lrs
}
# there are way too many bugs with this, so let's have one function we can more easily
# debug
.getSeqlevelsStyle <- function(x) {
if (is(x, "character")) {
sl <- x
} else {
sl <- try(seqlevels(x))
}
if (is(sl, "character")) {
style <- seqlevelsStyle(sl)
if ("Ensembl" %in% style) style <- "Ensembl"
return(style[1])
}
seqlevelsStyle(x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.