R/doubletThresholding.R
In scDblFinder: scDblFinder

Documented in doubletThresholding

#' doubletThresholding
#'
#' Sets the doublet scores threshold; typically called by 
#' \code{\link[scDblFinder]{scDblFinder}}.
#'
#' @param d A data.frame of properties of real and artificial cells, as produced
#' by `scDblFinder(returnType="table")`.
#' @param dbr The expected (mean) doublet rate.
#' @param dbr.sd The standard deviation of the doublet rate, representing the 
#' uncertainty in the estimate.
#' @param local Logical; whether to use local calibration (experimental 
#' feature!)
#'
#' @return A scaler indicating the decided threshold.
#' 
#' @importFrom stats pcauchy optimize ecdf lm predict dnbinom
#' 
#' @examples
#' sce <- mockDoubletSCE()
#' d <- scDblFinder(sce, verbose=FALSE, returnType="table")
#' th <- doubletThresholding(d, dbr=0.05)
#' th$th
#' 
#' @export
doubletThresholding <- function( d, dbr=0.025, dbr.sd=0.015, local=FALSE ){
  # check that we have all necessary fields:
  if(!all(c("cluster","type","score","mostLikelyOrigin", "originAmbiguous") 
          %in% colnames(d))) stop("Input misses some columns.")
  
  if(!all(sort(as.character(unique(d$type)))==c("doublet","real"))){
    stop("`type` should be either 'real' or 'doublet'.")
  }
  expected <- getExpectedDoublets(d$cluster[!is.na(d$cluster)], dbr=dbr)
  if(local){
    ths <- .optimThresholds(d$score, d$type, d$mostLikelyOrigin, d$difficulty, 
                          expected)
    d$score.global <- d$score
    logit <- function(p) log(p/(1-p))
    adjp <- ths[d$mostLikelyOrigin,1]
    adjp[is.na(adjp)] <- median(ths)
    d$score <- 1/(1+exp(-(logit(d$score)-logit(sqrt(adjp)))))
  }
  w <- NULL
  if(!is.null(d$include.in.training)) w <- which(!d$include.in.training)
  ee <- sum(expected,na.rm=TRUE)
  ee <- c(ee*(1-dbr.sd), ee*(1+dbr.sd))
  th <- .optimThreshold(d$score, d$type, ee, fdr.include=w)
  o <- d$mostLikelyOrigin[d$type=="real" & d$score>=th]
  stats <- .compareToExpectedDoublets(o, expected, dbr)
  stats$combination <- as.character(stats$combination)
  stats$FNR <- vapply(split(as.data.frame(d), d$mostLikelyOrigin), 
                      FUN.VALUE=numeric(1L), 
                  FUN=function(x) .FNR(x$type, x$score, th) )[stats$combination]
  stats$difficulty <- vapply(split(d$difficulty, d$mostLikelyOrigin), 
                             FUN.VALUE=numeric(1L), na.rm=TRUE, 
                             FUN=median)[stats$combination]
  if(local){
    stats$local.threshold <- ths[stats$combination,2]
    stats$local.moderated <- ths[stats$combination,1]
  }
  list(th=th, stats=stats, finalScores=d$score)
}

.optimThreshold <- function(score, type, expected, fdr.include=NULL){
  type <- type=="real"
  if(!is.null(fdr.include)) fdr.include <- seq_along(score)
  totfn <- function(x){
    .FNR(type, score, x)*2 + .FDR(type[fdr.include], score[fdr.include], x) +
          .prop.dev(type,score,expected,x)^2
  }
  optimize(totfn, c(0,1), maximum=FALSE)$minimum
}

.optimThresholds <- function(score, type, origins, difficulty, expected, 
                             moderate=TRUE){
  ll <- split(seq_along(score), origins)
  names(n) <- n <- names(ll)
  ths <- vapply(n, FUN.VALUE=numeric(1L), FUN=function(comb){
    i <- ll[[comb]]
    type <- type=="real"
    totfn <- function(x){
      .FNR(type[i], score[i], x)*2 + .FDR(type[i], score[i], x) + 
        .prop.dev(type[i],score[i],expected[comb],x)^2
    }
    optimize(totfn, c(0,1), maximum=FALSE)$minimum
  })
  if(moderate){
    diff <- vapply(split(difficulty,origins), na.rm=TRUE, FUN=median, 
                   FUN.VALUE=numeric(1))
    diff <- diff[names(ths)]
    mod <- tryCatch({ MASS::rlm(ths~diff) }, error=function(e){
        lm(ths~diff)
    })
    ths2 <- (ths+predict(mod))/2
    if(length(w <- which(ths2<0.01))>0)
      ths2[w] <- apply(cbind(ths[w],rep(0.01,length(w))),1,FUN=max)
    ths2[ths2>1] <- 1
    ths <- cbind(moderated=ths2, local=ths)
  }
  cbind(ths)
}

.FNR <- function(type, score, threshold){
  if(!is.logical(type)) type <- type=="real"
  sum(!type & score<threshold, na.rm=TRUE)/sum(!type)
}
.FDR <- function(type, score, threshold, include=FALSE){
  if(sum(score>=threshold, na.rm=TRUE)==0) return(0)
  if(!is.logical(type)) type <- type=="real"
  sum(type & score>=threshold, na.rm=TRUE)/sum(score>=threshold, na.rm=TRUE)
}

.prop.dev <- function(type, score, expected, threshold){
  if(!is.logical(type)) type <- type=="real"
  x <- 1+sum(score>=threshold & type)
  expected <- expected + 1
  if(length(expected)>1 && x>min(expected) && x<max(expected)) return(0)
  min(abs(x-expected)/expected)
}

.compareToExpectedDoublets <- function(origins, expected, dbr=NULL, od=NULL){
  if(length(origins)==0){
    d <- data.frame(combination=names(expected), 
                    observed=rep(0,length(expected)))
    d$deviation <- d$expected <- as.numeric(expected)
  }else{
    n <- names(expected)
    o <- as.numeric(table(origins)[names(expected)])
    o[is.na(o)] <- 0 
    expected <- as.numeric(expected)
    d <- data.frame( combination=n, observed=o, expected=expected,
                     deviation=abs(expected-o))
  }
  d$prop.deviation <- d$deviation/sum(expected)
  if(!is.null(od)) d$p <- dnbinom(d$observed, mu=d$expected, size=1/od)
  d
}
  
.getThresholdStats <- function(x, expected=NULL, thresholds=(0:100/100), 
                               dbr=NULL, od=0.05){
  if(is(x,"SingleCellExperiment")){
    x <- as.data.frame(colData(x))
    if(is.null(x$scDblFinder.type)) x$scDblFinder.type <- "real"
    x <- x[,grep("^scDblFinder",colnames(x))]
    colnames(x) <- gsub("^scDblFinder\\.","",colnames(x))
  }
  real <- which(x$type=="real")
  if(is.null(expected)) expected <- getExpectedDoublets(x$clusters[w], dbr)
  if(is.null(names(thresholds))) names(thresholds) <- thresholds
  a <- lapply( thresholds, FUN=function(i){
    .compareToExpectedDoublets(x$mostLikelyOrigin[which(x$score>=i & 
                                                            x$type=="real")], 
                               expected, dbr=dbr, od=od)
  })
  a <- cbind(threshold=rep(names(a),vapply(a,nrow,integer(1))),
             do.call(rbind, a))
  if(!all(x$type=="real")){
    b <- lapply( thresholds, FUN=function(i){
      y <- vapply(split(x, x$mostLikelyOrigin), FUN.VALUE=numeric(1L), 
                  FUN=function(x) .FNR(x$type, x$score, i) )
      data.frame(combination=names(y), FNR=y)
    })
    b <- cbind(threshold=rep(names(b),vapply(b,nrow,integer(1))),
               do.call(rbind, b))
    a <- merge(a,b,by=c("combination","threshold"),all.x=TRUE)
  }
  a$threshold <- as.numeric(a$threshold)
  a
}