#' ShearwaterML
#'
#' Maximum likelihood version of Shearwater producing p-values instead of Bayes factors.
#'
#' @param counts The array of counts typically generated by loadAllData.
#' @param rho Use this variable to fix the dispersion parameter to a value of interest. Default: NULL, rho will be estimated from the data.
#' @param truncate Samples with variant allele frequencies higher than "truncate" will be excluded from the background error model.
#' @param rho.min If rho=NULL, rho will be estimated from the data in the interval [rho.min,rho.max].
#' @param rho.max If rho=NULL, rho will be estimated from the data in the interval [rho.min,rho.max].
#' @param maxvaf Sites with an average rate of mimatches higher than maxvaf will not be considered (e.g. SNPs or reference sites).
#' @param mindepth Minimum coverage required to test a site.
#' @param maxtruncate Maximum number of samples that can be excluded from the background error model by truncate for a site to be tested.
#' @return A list with two arrays for P- and Q-values.
#' @examples
#' # code to be added
#' @export
#' @author Inigo Martincorena and Moritz Gerstung
#' @references Martincorena I, Roshan A, Gerstung M, et al. (2015). High burden and pervasive positive selection of somatic mutations in normal human skin. _Science_ (Under consideration).
betabinLRT = function (counts, rho = NULL, truncate = 0.05, rho.min = 1e-04, rho.max = 0.8, maxvaf = 0.3, mindepth = 10, maxtruncate = 0.5) {
# deepSNV functions
estimateRho = deepSNV:::estimateRho
logbb = deepSNV:::logbb
pseudo = .Machine$double.eps
ncol = dim(counts)[3]/2
x.fw = counts[, , 1:ncol]
x.bw = counts[, , 1:ncol + ncol]
n.fw = rep(rowSums(x.fw, dims = 2), dim(x.fw)[3])
n.bw = rep(rowSums(x.bw, dims = 2), dim(x.bw)[3])
dim(n.fw) = dim(x.fw)
dim(n.bw) = dim(x.bw)
x = x.fw + x.bw
n = n.fw + n.bw
mu = pmax(x,pseudo)/pmax(n,pseudo)
ix = (mu < truncate)
# Calculation of rho
if (is.null(rho)) {
rho = estimateRho(x, mu, ix)
rho = pmin(pmax(rho, rho.min), rho.max)
rho[is.na(rho)] = rho.min
}
disp = (1 - rho)/rho
rdisp = rep(disp, each = nrow(counts))
mu = mu * rdisp
tr.fw = x.fw * ix
tr.bw = x.bw * ix
X.fw = rep(colSums(tr.fw, dims = 1), each = nrow(counts)) - tr.fw
N.fw = rep(colSums(n.fw * ix), each = nrow(counts)) - n.fw * ix
X.bw = rep(colSums(tr.bw, dims = 1), each = nrow(counts)) - tr.bw
N.bw = rep(colSums(n.bw * ix), each = nrow(counts)) - n.bw * ix
prob0.fw = (X.fw + x.fw)/(N.fw + n.fw); prob0.fw[prob0.fw==0] = pseudo
prob1s.fw = x.fw/(n.fw+pseudo); prob1s.fw[prob1s.fw==0] = pseudo
prob1c.fw = X.fw/(N.fw+pseudo); prob1c.fw[prob1c.fw==0] = pseudo
prob1s.fw = pmax(prob1s.fw,prob1c.fw) # Min error rate is that of the population (one-sided test)
nu0.fw = prob0.fw * rdisp; nu1s.fw = prob1s.fw * rdisp; nu1c.fw = prob1c.fw * rdisp;
prob0.bw = (X.bw + x.bw)/(N.bw + n.bw); prob0.bw[prob0.bw==0] = pseudo
prob1s.bw = x.bw/(n.bw+pseudo); prob1s.bw[prob1s.bw==0] = pseudo
prob1c.bw = X.bw/(N.bw+pseudo); prob1c.bw[prob1c.bw==0] = pseudo
prob1s.bw = pmax(prob1s.bw,prob1c.bw) # Min error rate is that of the population (one-sided test)
nu0.bw = prob0.bw * rdisp; nu1s.bw = prob1s.bw * rdisp; nu1c.bw = prob1c.bw * rdisp;
# Beta-Binomial LRT
prob1s.both = (x.fw+x.bw)/(n.fw+n.bw+pseudo); prob1s.both[prob1s.both==0] = pseudo
prob1s.both = pmax(prob1s.both,pmax(prob1c.fw,prob1c.bw)) # Min error rate is that of the population (one-sided test)
nu1s.both = prob1s.both * rdisp
LL.both = logbb(x.fw, n.fw, nu0.fw, rdisp) + logbb(X.fw, N.fw, nu0.fw, rdisp) + logbb(x.bw, n.bw, nu0.bw, rdisp) + logbb(X.bw, N.bw, nu0.bw, rdisp) - logbb(x.fw, n.fw, nu1s.both, rdisp) - logbb(X.fw, N.fw, nu1c.fw, rdisp) - logbb(x.bw, n.bw, nu1s.both, rdisp) - logbb(X.bw, N.bw, nu1c.bw, rdisp)
pvals_both = pchisq(-2*LL.both, df=2, lower.tail=F)/2 # We divide by 2 as we are performing a 1-sided test
# Masking out undesired tests (excluding sites with too low coverage can boost power by reducing multiple testing)
pvals_both[(n.fw+n.bw)<mindepth] = NA
pvals_both[(prob0.bw>maxvaf) & (prob0.fw>maxvaf)] = NA
truncfilt = rep(apply(ix,c(2,3),mean)<maxtruncate, each=dim(ix)[1])
dim(truncfilt) = dim(ix)
pvals_both[truncfilt] = NA
# False discovery rate (it should be applied to the entire collection of sites tested)
qvals = p.adjust(pvals_both, method="BH")
dim(qvals) = dim(pvals_both)
return(list(pvals=pvals_both, qvals=qvals))
}
#' Function to create a \code{\link{VCF}} object with variant calls from an array of q-values.
#'
#' This function thresholds the q-values computed by the shearwater algorithm and creates a \code{\link{VCF}} object as output.
#' @param qvals array of q-values from \code{\link{betabinLRT}}.
#' @param counts array of counts from \code{\link{loadAllData}}.
#' @param regions \code{\link{GRanges}} with the regions corresponding to counts and qvals.
#' @param samples vector of samples names.
#' @param cutoff Cutoff for the q-values below which a variant is considered to be true (default = 0.05)
#' @param mvcf boolean flag, if TRUE compute a large VCF with as many genotype columns as samples. Default TRUE. Otherwise use duplicate rows and only one genotype column. The sample is then provided by the info:PD field. Can be inefficient for large sample sizes.
#' @param err Optional matrix of error rates, otherwise recomputed from counts.
#' @param mu Optional matrix of relative frequencies, otherwise recomputed from counts.
#' @return A \code{\link{VCF}} object
#'
#' @author mg14
#' @note Experimental code, subject to changes
#' @export
## TODO: check for cases with zero result
qvals2Vcf <- function(qvals, counts, regions, samples = 1:nrow(counts), err = NULL, mu = NULL, cutoff = 0.05, mvcf=TRUE){
coordinates <- regions2Coordinates(regions)
w = which(qvals < cutoff, arr.ind=TRUE)
w = w[order(w[,2], w[,3], w[,1]),,drop=FALSE]
## If no variants found, select first possible and set to NULL later (avoids a lot of errors in the following)
if(dim(w)[1]==0) {
w = matrix(1,ncol = 3)
isNull = TRUE
}else{
isNull = FALSE
}
totCounts = counts[,,1:5] + counts[,,6:10]
if(is.null(err)){
err = colSums(totCounts)
err = err / (rowSums(err) + .Machine$double.eps)
}
if(is.null(mu))
mu = (totCounts)/rep(rowSums(counts, dims=2)+.Machine$double.eps, 5)
#coordinates = regions2Coordinates(regions.GR)
ref = factor(apply(err,1,which.max), levels = 1:5, labels=c("A","T","C","G","-"))
# gr = GRanges(paste(w[,1], w[,3]==5, sep="."), IRanges(w[,2], width=1), alt=w[,3], AF = select(w,subcl))
# o = order(gr)
# rd = reduce(gr)
# m = match(gr[o], rd)
# alt = lapply(split(c("A","T","C","G","")[w[o,3]], m), paste, collapse="")
if(!mvcf){
headerTemplate = scanVcfHeader(system.file("extdata", "shearwater.vcf", package="deepSNV"))
v = VCF(
rowRanges=GRanges(coordinates$chr[w[,2]],
IRanges(coordinates$pos[w[,2]] - (w[,3]==5), width=1 + (w[,3]==5)), ## If del make one longer..
),
fixed = DataFrame(
REF = DNAStringSet(paste(ifelse(w[,3]==5,as.character(ref[w[,2]-1]),""), ref[w[,2]], sep="")),
ALT = do.call(DNAStringSetList,as.list(paste(ifelse(w[,3]==5,as.character(ref[w[,2]-1]),""), c("A","T","C","G","")[w[,3]], sep=""))),
#ALT = DNAStringSet(paste(ifelse(w[,3]==5,as.character(ref[w[,2]-1]),""), c("A","T","C","G","")[w[u,3]], sep="")),
QUAL = round(-10*log10(select(w,qvals))),
FILTER = "PASS"
),
info = DataFrame(
#paramRangeID=NA,
PD = samples[w[,1]],
AF = select(w, mu),
ER = select(w[,-1], err),
FW = select(w, counts[,,1:5]),
BW = select(w, counts[,,6:10]),
DP = select(w[,-3], rowSums(totCounts, dims=2)),
QV = select(w, qvals),
LEN = 1),
exptData = list(header = VCFHeader(
reference = reference(headerTemplate),
samples = as.character(samples),
header = append(header(headerTemplate), .makeVCFheader("date", paste(Sys.time()))))),
collapsed = FALSE
)}else{
u = !duplicated(w[,-1, drop=FALSE])
wu = w[u,,drop=FALSE]
pp = mapply(function(i,j){ qvals[,i,j]}, wu[,2],wu[,3])
geno = SimpleList(
GT = t(pp < cutoff)+0,
GQ = t(round(-10*log10(pp))),
QV = signif(t(mapply(function(i,j){ qvals[,i,j]}, wu[,2],wu[,3])),3),
VF = signif(t(mapply(function(i,j){ mu[,i,j]}, wu[,2],wu[,3])),3),
FW = t(mapply(function(i,j){ counts[,i,j]}, wu[,2],wu[,3])),
BW = t(mapply(function(i,j){ counts[,i,j+5]}, wu[,2],wu[,3])),
# DP = t(mapply(function(i,j){ rowSums(totCounts[,i,])}, wu[,2],wu[,3]))
FD = t(mapply(function(i,j){ rowSums(counts[,i,1:5])}, wu[,2],wu[,3])),
BD = t(mapply(function(i,j){ rowSums(counts[,i,6:10])}, wu[,2],wu[,3]))
)
headerTemplate = scanVcfHeader(system.file("extdata", "shearwaterML.vcf", package="deepSNV"))
#rownames(w) = samples[w[,1]]
v = VCF(
rowRanges=GRanges(coordinates$chr[wu[,2]],
IRanges(coordinates$pos[wu[,2]] - (wu[,3]==5), width=1 + (wu[,3]==5)),
),
fixed = DataFrame(
REF = DNAStringSet(paste(ifelse(wu[,3]==5,as.character(ref[wu[,2]-1]),""), ref[wu[,2]], sep="")), ##TODO: Warning if w[u,3][1]==5 & w[u,2]==1...
ALT = do.call(DNAStringSetList,as.list(paste(ifelse(wu[,3]==5,as.character(ref[wu[,2]-1]),""), c("A","T","C","G","")[wu[,3]], sep=""))),
#ALT = DNAStringSet(paste(ifelse(w[u,3]==5,as.character(ref[w[u,2]-1]),""), c("A","T","C","G","")[w[u,3]], sep="")),
QUAL = 1,
FILTER = "PASS"
),
info = DataFrame(
#paramRangeID=NA,
ER = select(wu[,-1], err),
AF = rowMeans(geno$GT),
LEN = 1),
geno = geno,
exptData = list(header = VCFHeader(
reference = reference(headerTemplate),
samples = as.character(samples),
header = append(header(headerTemplate), .makeVCFheader("date", paste(Sys.time()))))),
colData = DataFrame(samples=1:length(samples), row.names=samples),
collapsed = TRUE
)
colnames(v) = samples
}
## If no variants found set to zero..
if(isNull)
v <- v[NULL]
return(v)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.