#' doubletThresholding
#'
#' Sets the doublet scores threshold; typically called by
#' \code{\link[scDblFinder]{scDblFinder}}.
#'
#' @param d A data.frame of cell properties, with each row representing a cell, as
#' produced by `scDblFinder(..., returnType="table")`, or minimally containing a `score`
#' column.
#' @param dbr The expected (mean) doublet rate. If `d` contains a `cluster` column, the
#' doublet rate will be adjusted for homotypic doublets.
#' @param dbr.sd The standard deviation of the doublet rate, representing the
#' uncertainty in the estimate. Ignored if `method!="optim"`.
#' @param dbr.per1k The expected proportion of doublets per 1000 cells.
#' @param stringency A numeric value >0 and <1 which controls the relative weight of false
#' positives (i.e. real cells) and false negatives (artificial doublets) in setting the
#' threshold. A value of 0.5 gives equal weight to both; a higher value (e.g. 0.7) gives
#' higher weight to the false positives, and a lower to artificial doublets. Ignored if
#' `method!="optim"`.
#' @param method The thresholding method to use, either 'auto' (default, automatic
#' selection depending on the available fields), 'optim' (optimization of
#' misclassification rate and deviation from expected doublet rate), 'dbr' (strictly
#' based on the expected doublet rate), or 'griffiths' (cluster-wise number of
#' median absolute deviation in doublet score).
#' @param perSample Logical; whether to perform thresholding individually for each sample.
#' @param p The p-value threshold determining the deviation in doublet score.
#' @param returnType The type of value to return, either doublet calls (`call`) or
#' thresholds (`threshold`).
#'
#' @return A vector of doublet calls if `returnType=="call"`, or a threshold (or vector
#' of thresholds) if `returnType=="threshold"`.
#'
#' @importFrom stats pcauchy optimize ecdf lm predict dnbinom quantile
#'
#' @examples
#' sce <- mockDoubletSCE()
#' d <- scDblFinder(sce, verbose=FALSE, returnType="table")
#' th <- doubletThresholding(d, dbr=0.05)
#' th
#'
#' @importFrom stats mad qnorm setNames
#' @export
doubletThresholding <- function( d, dbr=NULL, dbr.sd=NULL, dbr.per1k=0.008,
stringency=0.5, p=0.1,
method=c("auto","optim","dbr","griffiths"),
perSample=TRUE, returnType=c("threshold","call")){
method <- match.arg(method)
returnType <- match.arg(returnType)
if(is.null(d$src)) d$src <- d$type
if(is.null(dbr.sd)) dbr.sd <- mean(0.4*.gdbr(d,dbr))
dbr <- .estimateHeterotypicDbRate(d, .checkPropArg(dbr))
if(!is.data.frame(d) && !is(d,"DFrame"))
stop("`d` should be a data.frame with minimally the 'score' column.")
conds <- list("optim"=c("type","score"),
"dbr"=c("score"),
"griffiths"=c("score"))
w <- vapply(conds, FUN.VALUE=logical(1), FUN=function(x) all(x %in% colnames(d)))
if(method=="auto"){
if(length(w)==0) stop("`d` misses the necessary columns.")
method <- names(conds)[which(w)[1]]
}else{
if(!w[[method]]) stop("`d` misses the necessary columns.")
}
if(method=="optim"){
if(is.null(d$cluster)) d$cluster <- 1L
if(!all(sort(as.character(unique(d$type)))==c("doublet","real")))
stop("`type` should be either 'real' or 'doublet'.")
if(is.null(d$include.in.training)) d$include.in.training <- TRUE
if(!is.null(d$sample) && perSample){
si <- split(seq_len(nrow(d)), d$sample)
if(!is.null(dbr)){
if(length(dbr)==1) dbr <- setNames(rep(dbr, length(si)), names(si))
if(!all(names(si) %in% names(dbr)))
stop("The names of `dbr` do not correspond to samples of `d`")
}
th <- sapply(setNames(names(si),names(si)), FUN=function(s){
.optimThreshold(d[si[[s]],c("type","src","score","cluster","include.in.training")],
dbr=dbr[[s]], dbr.sd=dbr.sd, stringency=stringency)
})
ret <- as.factor(d$score > th[d$sample])
}else{
th <- .optimThreshold(d, dbr=.gdbr(d,dbr), dbr.sd=dbr.sd, stringency=stringency)
ret <- as.factor(d$score>th)
}
if(returnType=="threshold") return(th)
}else{
if(!is.null(d$src)) d <- d[d$src=="real",]
if(method=="dbr"){
th <- quantile(d$score, 1-dbr)
if(returnType=="threshold") return(th)
ret <- as.factor(d$score>th)
}else if(method=="griffiths"){
if(!("sample" %in% colnames(d))) d$sample <- "all"
i <- split(seq_len(nrow(d)), d$sample)
meds <- vapply(i, FUN.VALUE=numeric(1), FUN=function(x){
median(d$score[x],na.rm=TRUE)
})
d$dev <- d$score-meds[d$sample]
mad <- vapply(i, FUN.VALUE=numeric(1), FUN=function(x){
x <- d$dev[x]
median(x[x>0],na.rm=TRUE)
}) * formals(mad)$constant
if(returnType=="threshold"){
return(qnorm(p, mean=meds, sd=mad, lower.tail=FALSE))
}else{
d$p <- pnorm(d$score, mean=meds[d$sample], sd=mad[d$sample],
lower.tail=FALSE)
ret <- as.factor(d$p < p)
}
}else{
stop("Unknown method '",method,"'")
}
}
levels(ret) <- c("singlet","doublet")
return(ret)
}
# dbr should be already corrected for homotypy
.optimThreshold <- function(d, dbr=NULL, dbr.sd=NULL, ths=NULL, stringency=0.5){
if(!(stringency > 0) || !(stringency<1))
stop("`stringency` should be >0 and <1.")
if(is.null(dbr)) dbr <- .gdbr(d, dbr=.estimateHeterotypicDbRate(d))
if(!is.null(dbr.sd)) dbr <- c(max(0,dbr-dbr.sd), min(1,dbr+dbr.sd))
if(is.null(d$cluster)) d$cluster <- 1L
wR <- which(d$src=="real")
expected <- dbr*length(wR)
if(!is.logical(d$type)) d$type <- d$type=="real"
fdr.include <- which(d$include.in.training)
eFN <- sum(grepl("^rDbl\\.",row.names(d))) *
propHomotypic(d$cluster[d$src=="real"])
if(length(unique(d$cluster))==1) eFN <- 0
totfn <- function(x){
edev <- .prop.dev(d$type,d$score,expected,x)^2
y <- edev + 2*(1-stringency)*.FNR(d$type, d$score, x, expectedFN=eFN)
if(!is.null(fdr.include))
y <- y + .FPR(d$type[fdr.include], d$score[fdr.include], x)*2*stringency
y
}
if(is.null(ths)) return(optimize(totfn, c(0,1), maximum=FALSE)$minimum)
data.frame( threshold=ths,
FNR=vapply(ths, FUN.VALUE=numeric(1), FUN=function(x){
.FNR(d$type, d$score, x, expectedFN=eFN)
}),
FDR=vapply(ths, FUN.VALUE=numeric(1), FUN=function(x){
.FDR(d$type[fdr.include], d$score[fdr.include], x)
}),
FPR=vapply(ths, FUN.VALUE=numeric(1), FUN=function(x){
.FPR(d$type[fdr.include], d$score[fdr.include], x)
}),
dev=vapply(ths, FUN.VALUE=numeric(1), FUN=function(x){
.prop.dev(d$type,d$score,expected,x)^2
}),
cost=vapply(ths, FUN.VALUE=numeric(1), FUN=totfn)
)
}
.FNR <- function(type, score, threshold, expectedFN=0){
if(!is.logical(type)) type <- type=="real"
max(c(0, (sum(!type & score<threshold, na.rm=TRUE)-expectedFN)))/sum(!type)
}
.FDR <- function(type, score, threshold){
if(length(type)==0 || sum(score>=threshold, na.rm=TRUE)==0) return(0)
if(!is.logical(type)) type <- type=="real"
sum(type & score>=threshold, na.rm=TRUE)/sum(score>=threshold, na.rm=TRUE)
}
.FPR <- function(type, score, threshold){
if(length(type)==0) return(0)
if(!is.logical(type)) type <- type=="real"
sum(type & score>=threshold, na.rm=TRUE)/sum(type)
}
.prop.dev <- function(type, score, expected, threshold){
if(!is.logical(type)) type <- type=="real"
x <- 1+sum(score>=threshold & type)
expected <- expected + 1
if(length(expected)>1 && x>min(expected) && x<max(expected)) return(0)
min(abs(x-expected)/expected)
}
.compareToExpectedDoublets <- function(origins, expected, dbr=NULL, od=NULL){
if(length(origins)==0){
d <- data.frame(combination=names(expected),
observed=rep(0,length(expected)))
d$deviation <- d$expected <- as.numeric(expected)
}else{
n <- names(expected)
o <- as.numeric(table(origins)[names(expected)])
o[is.na(o)] <- 0
expected <- as.numeric(expected)
d <- data.frame( combination=n, observed=o, expected=expected,
deviation=abs(expected-o))
}
d$prop.deviation <- d$deviation/sum(expected)
if(!is.null(od)) d$p <- dnbinom(d$observed, mu=d$expected, size=1/od)
d
}
.getDoubletStats <- function( d, th, dbr=NULL, dbr.sd=0.015, dbr.per1k=0.008 ){
# check that we have all necessary fields:
fields <- c("cluster","src","type","score","mostLikelyOrigin", "originAmbiguous","difficulty")
if(!all(fields %in% colnames(d))) stop("Input misses some columns.")
if(!is.null(d$sample))
return(dplyr::bind_rows(lapply(split(seq_len(nrow(d)), d$sample), FUN=function(i){
.getDoubletStats(d[i,fields], th, dbr=dbr, dbr.sd=dbr.sd)
}), .id="sample"))
if(is.null(dbr)) dbr <- dbr.per1k*sum(d$src=="real",na.rm=TRUE)/1000
o <- d$mostLikelyOrigin[d$type=="real" & d$score>=th]
expected <- getExpectedDoublets(d$cluster[d$src=="real"], dbr=dbr)
stats <- .compareToExpectedDoublets(o, dbr=dbr, expected=expected)
stats$combination <- as.character(stats$combination)
stats$FNR <- vapply(split(as.data.frame(d), d$mostLikelyOrigin),
FUN.VALUE=numeric(1L),
FUN=function(x) .FNR(x$type, x$score, th) )[stats$combination]
stats$difficulty <- vapply(split(d$difficulty, d$mostLikelyOrigin),
FUN.VALUE=numeric(1L), na.rm=TRUE,
FUN=median)[stats$combination]
stats
}
#' @importFrom stats quantile
.filterUnrecognizableDoublets <- function( d, minSize=5, minMedDiff=0.1 ){
if(is.null(d$src)) d$src <- d$type
da <- d[d$src=="artificial" & grepl("+", d$mostLikelyOrigin, fixed=TRUE),]
dr <- d[d$src=="real",]
dr.med <- median(dr$score)
dr <- split(dr$score, dr$cluster)
rq <- t(vapply(dr, FUN.VALUE=numeric(2), na.rm=TRUE, probs=c(0.5, 0.9), FUN=quantile))
da <- split(da$score, droplevels(da$mostLikelyOrigin))
origs <- strsplit(names(da), "+", fixed=TRUE)
out <- vapply(names(da), FUN.VALUE=logical(1), FUN=function(x){
z <- da[[x]]
if(length(z)<minSize) return(FALSE)
z <- quantile(z, probs=c(0.1,0.5), na.rm=TRUE)
x <- origs[[x]]
any(z[1]<rq[x,2]) || (z[2]-max(dr.med, rq[x,1]))<minMedDiff
})
out <- names(out)[which(out)]
d[d$src!="artificial" | !(d$mostLikelyOrigin %in% out),]
}
.estimateHeterotypicDbRate <- function(d, dbr=NULL){
d <- as.data.frame(d)
if(!is.null(d$sample)){
sd <- split(d[,setdiff(colnames(d),"sample")],d$sample)
if(length(dbr)==1) dbr <- rep(dbr,length(sd))
if(is.null(dbr)) dbr <- unlist(lapply(sd, FUN=.gdbr))
return(unlist(mapply(d=sd, dbr=dbr, FUN=.estimateHeterotypicDbRate)))
}
dbr <- .gdbr(d, dbr=dbr)
if(is.null(d$cluster)){
th <- sum(d$src=="artificial")/nrow(d)
prop.homo <- sum(d$src=="artificial" & d$score<th)/sum(d$src=="artificial")
return(dbr * (1-prop.homo))
}
if(is.factor(d$cluster)) d$cluster <- droplevels(d$cluster)
d <- d[d$src=="real",]
sum(getExpectedDoublets(d$cluster, dbr=dbr))/nrow(d)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.