Nothing
#' getArtificialDoublets
#'
#' Create expression profiles of random artificial doublets.
#'
#' @param x A count matrix, with features as rows and cells as columns.
#' @param n The approximate number of doublet to generate (default 3000).
#' @param clusters The optional clusters labels to use to build cross-cluster
#' doublets.
#' @param resamp Logical; whether to resample the doublets using the poisson
#' distribution. Alternatively, if a proportion between 0 and 1, the proportion
#' of doublets to resample.
#' @param halfSize Logical; whether to half the library size of doublets
#' (instead of just summing up the cells). Alternatively, a number between 0
#' and 1 can be given, determining the proportion of the doublets for which
#' to perform the size adjustment.
#' @param adjustSize Logical; whether to adjust the size of the doublets using
#' the ratio between each cluster's median library size. Alternatively, a number
#' between 0 and 1 can be given, determining the proportion of the doublets for
#' which to perform the size adjustment.
#' @param propRandom The proportion of the created doublets that are fully
#' random (default 0.1); the rest will be doublets created across clusters.
#' Ignored if `clusters` is NULL.
#' @param selMode The cell pair selection mode for inter-cluster doublet
#' generation, either 'uniform' (same number of doublets for each combination),
#' 'proportional' (proportion expected from the clusters' prevalences), or
#' 'sqrt' (roughly the square root of the expected proportion).
#' @param n.meta.cells The number of meta-cell per cluster to create. If given,
#' additional doublets will be created from cluster meta-cells. Ignored if
#' `clusters` is missing.
#' @param meta.triplets Logical; whether to create triplets from meta cells.
#' Ignored if `clusters` is missing.
#'
#' @return A list with two elements: `counts` (the count matrix of
#' the artificial doublets) and `origins` the clusters from which each
#' artificial doublets originated (NULL if `clusters` is not given).
#'
#' @examples
#' m <- t(sapply( seq(from=0, to=5, length.out=50),
#' FUN=function(x) rpois(30,x) ) )
#' doublets <- getArtificialDoublets(m, 30)
#'
#' @export
getArtificialDoublets <- function( x, n=3000, clusters=NULL,resamp=0.5,
halfSize=0.5, adjustSize=0.2, propRandom=0.1,
selMode=c("proportional","uniform","sqrt"),
n.meta.cells=2, meta.triplets=TRUE ){
selMode <- match.arg(selMode)
ls <- Matrix::colSums(x)
w <- which(ls>0 & ls>=quantile(ls,0.01) & ls<=quantile(ls,0.99))
x <- x[,w,drop=FALSE]
if(!is.null(clusters)){
if(is.list(clusters)){
clusters$k <- clusters$k[w]
clo <- clusters
clusters <- clusters$k
}else{
clusters <- clusters[w]
if(any(is.na(clusters))){
w <- which(!is.na(clusters))
clusters <- clusters[w]
x <- x[,w]
}
clo <- clusters
}
clusters <- droplevels(as.factor(clusters))
}
if(is.null(clusters) || propRandom==1){
# create random combinations
if(ncol(x)^2 <= n){
# few combinations, get them all
ad <- expand.grid(seq_len(ncol(x)),seq_len(ncol(x)))
}else{
ad <- matrix(sample.int(ncol(x), 2*n, replace=(2*n >= ncol(x))),ncol=2)
}
# remove doublets of the same cell
ad <- ad[which(ad[,1]!=ad[,2]),]
# create doublets
if(is.null(clusters)){
ad.m <- createDoublets(x, ad, adjustSize=FALSE, resamp=resamp,
halfSize=halfSize, prefix="rDbl.")
oc <- NULL
}else{
oc1 <- matrix(clusters[as.numeric(ad)],ncol=2)
ad.m <- createDoublets(x, ad, clusters=clusters, adjustSize=adjustSize,
halfSize=halfSize, resamp=resamp, prefix="rDbl.")
oc <- paste(oc1[,2],oc1[,1],sep="+")
oc[which(oc1[,2]==oc1[,1])] <- NA
}
row.names(ad.m) <- row.names(x)
return(list( counts=ad.m, origins=as.factor(oc) ))
}
if((nr <- ceiling(n*propRandom))>0){
ad.m <- getArtificialDoublets(x, n=nr, clusters=clusters,
halfSize=halfSize, resamp=resamp,
propRandom=1, adjustSize=adjustSize)
oc <- ad.m$origins
ad.m <- ad.m$counts
n <- ceiling(n*(1-propRandom))
}else{
ad.m <- x[,c(),drop=FALSE]
oc <- character()
}
if(length(unique(clusters))<3) n.meta.cells <- 0
# create doublets across clusters:
n <- ceiling(n)
ca <- getCellPairs(clo, n=ifelse(n.meta.cells>0,ceiling(n*0.9),n),
selMode=selMode)
m2 <- createDoublets(x, ca, clusters=clusters, adjustSize=adjustSize,
halfSize=halfSize, resamp=resamp)
oc <- c(oc, as.character(ca$orig.clusters))
names(oc) <- names(m2)
ad.m <- cbind(ad.m, m2)
rm(m2)
gc(verbose=FALSE)
if(n.meta.cells>0){
# create doublets from meta cells:
meta <- .getMetaCells(x, clusters, n.meta.cells=n.meta.cells,
meta.cell.size=30)
cl2 <- meta$clusters
meta <- meta$mu
ca <- getCellPairs(cl2, n=ceiling(n*0.1))
m2 <- createDoublets(meta, ca, clusters=cl2, adjustSize=FALSE,
halfSize=FALSE, resamp=TRUE, prefix="artMetaDbl.")
oc2 <- ca$orig.clusters
names(oc2) <- colnames(m2)
ad.m <- cbind(ad.m, m2)
oc <- c(oc,as.character(oc2))
}
pc10 <- length(clusters)/10
tt <- table(clusters)
if(meta.triplets && sum(tt>=pc10)>2){
# get clusters that have more than 10% of the cells
cl2 <- names(tt)[tt>=pc10]
# otherwise get the 3 largest clusters
if(length(cl2)<3) cl2 <- names(sort(tt, decreasing=TRUE))[1:3]
w <- which(clusters %in% cl2)
# create triplets from meta cells:
meta <- .getMetaCells(x[,w], clusters[w], n.meta.cells=1,
meta.cell.size=100)$mu
i <- seq_len(ncol(meta))
ca <- expand.grid(i, i, i)
ca <- ca[ca[,1]<ca[,2] & ca[,2]<ca[,3],,drop=FALSE]
m2 <- round((meta[,ca[,1],drop=FALSE]+meta[,ca[,2]]+meta[,ca[,3]])/2)
if(is(ad.m,"dgCMatrix")) m2 <- as(m2,"dgCMatrix")
oc2 <- rep(NA_character_, ncol(m2))
names(oc2) <- colnames(m2) <-
paste0("artTriplet.", ncol(ad.m)+seq_len(ncol(m2)))
ad.m <- cbind(ad.m, m2)
oc <- c(oc, oc2)
}
list( counts=ad.m, origins=as.factor(oc) )
}
#' getCellPairs
#'
#' Given a vector of cluster labels, returns pairs of cross-cluster cells
#'
#' @param x A vector of cluster labels for each cell, or a list containing
#' metacells and graph
#' @param n The number of cell pairs to obtain
#' @param ... Further arguments, for instance the `k` vector of precluster
#' labels if `x` is a metacell graph.
#'
#' @return A data.frame with the columns
#' @export
#'
#' @examples
#' # create random labels
#' x <- sample(head(LETTERS), 100, replace=TRUE)
#' getCellPairs(x, n=6)
getCellPairs <- function(x, n=1000, ...){
if(is(x,"igraph")) return(.getCellPairsFromGraph(x, n=n, ...))
if(is.list(x) && all(c("graph","k") %in% names(x)))
return(.getCellPairsFromGraph(x$graph, k=x$k, n=n, ...))
.getCellPairsFromClusters(x, n=n, ...)
}
# get cross-cluster pairs of cells
.getCellPairsFromClusters <- function(clusters, n=1000, ls=NULL, q=c(0.1,0.9),
selMode="uniform", soft.min=5, ...){
cli <- split(seq_along(clusters), clusters)
if(!is.null(ls)){
ls <- split(ls, clusters)
for(f in names(cli)){
qr <- as.numeric(quantile(ls[[f]], prob=q))
cli[[f]] <- cli[[f]][which(ls[[f]]>=qr[1] & ls[[f]]<=qr[2])]
}
}
ca <- expand.grid(seq_along(cli), seq_along(cli))
ca <- as.data.frame(ca[ca[,1]<ca[,2],])
ca$orig <- factor(paste( names(cli)[ca[,1]], names(cli)[ca[,2]], sep="+"))
if(selMode=="uniform"){
ca$n <- ceiling(n/nrow(ca))
}else{
ed <- getExpectedDoublets(clusters, only.heterotypic=TRUE)
ed <- ed*n/sum(ed)
if(selMode=="sqrt") ed <- sqrt(ed)
ed <- soft.min + ed
ed <- ceiling(ed*n/sum(ed))
ca$n <- ed[as.character(ca$orig)]
}
ca <- ca[ca$n>0,]
lvls <- levels(ca$orig)
ca <- do.call(rbind, lapply( seq_len(nrow(ca)), FUN=function(i){
cbind( cell1=sample(cli[[ca[i,1]]], size=ca[i,4],
replace=ca[i,4]>length(cli[[ca[i,1]]])),
cell2=sample(cli[[ca[i,2]]], size=ca[i,4],
replace=ca[i,4]>length(cli[[ca[i,2]]])),
orig.clusters=rep(ca[i,3],ca[i,4]) )
}))
ca <- as.data.frame(ca)
ca$orig.clusters <- factor(ca$orig.clusters, levels=seq_along(lvls), labels=lvls)
ca[!duplicated(ca),]
}
# get pairs of cells based on distances on the meta-cell graph
#' @importFrom igraph distances V
.getCellPairsFromGraph <- function(g, k=seq_len(length(V(g))), n=1000,
weightFun=NULL, ...){
cli <- split(seq_along(k), k)
ca <- expand.grid(seq_along(cli), seq_along(cli))
ca <- as.data.frame(ca[ca[,1]<ca[,2],])
d <- distances(g)
d[is.infinite(d)] <- max(d[!is.infinite(d)])
if(is.null(weightFun)) weightFun <- function(d) sqrt(d)-0.5
if(is.character(weightFun)) weightFun <- get(weightFun)
d <- weightFun(d)
d[is.nan(d) | d<0] <- 0
ca$dist <- vapply(seq_len(nrow(ca)), FUN.VALUE=numeric(1),
FUN=function(i) d[ca[i,1],ca[i,2]])
ca$n <- rpois(nrow(ca), n/sum(ca$dist)*ca$dist)
ca <- ca[ca$n>0,]
oc <- rep(factor(paste(names(cli)[ca[,1]], names(cli)[ca[,2]], sep="+")),ca$n)
ca <- do.call(rbind, lapply( seq_len(nrow(ca)), FUN=function(i){
cbind( sample(cli[[ca[i,1]]],size=ca$n[i],replace=TRUE),
sample(cli[[ca[i,2]]],size=ca$n[i],replace=TRUE) )
}))
ca <- data.frame(ca, orig.clusters=oc)
colnames(ca) <- c("cell1","cell2","orig.clusters")
ca[!duplicated(ca),]
}
# creates within-cluster meta-cells from a count matrix
.getMetaCells <- function(x, clusters, n.meta.cells=20, meta.cell.size=20){
if(is.factor(clusters)) clusters <- droplevels(clusters)
cli <- split(seq_along(clusters), clusters)
meta <- lapply(cli, FUN=function(x){
lapply(seq_len(n.meta.cells), FUN=function(y){
sample(x,min(ceiling(0.6*length(x)),meta.cell.size),replace=FALSE)
})
})
cl2 <- rep(names(cli), lengths(meta))
meta <- vapply(unlist(meta, recursive=FALSE), FUN.VALUE=double(nrow(x)),
FUN=function(y){ Matrix::rowMeans(x[,y,drop=FALSE]) })
colnames(meta) <- paste0("metacell.",seq_len(ncol(meta)))
list(mu=meta, clusters=cl2)
}
#' addDoublets
#'
#' Adds artificial doublets to an existing dataset
#'
#' @param x A count matrix of singlets, or a
#' \code{\link[SummarizedExperiment]{SummarizedExperiment-class}}
#' @param clusters A vector of cluster labels for each column of `x`
#' @param dbr The doublet rate
#' @param only.heterotypic Whether to add only heterotypic doublets.
#' @param adjustSize Whether to adjust the library sizes of the doublets.
#' @param prefix Prefix for the colnames generated.
#' @param ... Any further arguments to \code{\link{createDoublets}}.
#'
#' @return A `SingleCellExperiment` with the colData columns `cluster` and
#' `type` (indicating whether the cell is a singlet or doublet).
#'
#' @export
#' @examples
#' sce <- mockDoubletSCE(dbl.rate=0)
#' sce <- addDoublets(sce, clusters=sce$cluster)
addDoublets <- function(x, clusters, dbr=(0.01*ncol(x)/1000),
only.heterotypic=TRUE, adjustSize=FALSE,
prefix="doublet.", ...){
ed <- round(getExpectedDoublets(clusters, dbr=dbr,
only.heterotypic=only.heterotypic))
ed <- ed[ed>0]
if(length(ed)==0) return(x)
if(is(x, "SingleCellExperiment")) x <- counts(x)
ca <- getCellPairs(clusters, n=length(ed)*(30+max(ed)), ls=Matrix::colSums(x))
ca <- split(ca, ca$orig.clusters)
ca <- do.call(rbind, lapply(names(ed), FUN=function(i){
ca2 <- ca[[i]]
ca2[sample.int(nrow(ca2), ed[[i]]),,drop=FALSE]
}))
cl <- as.factor(c(as.character(clusters), as.character(ca[,3])))
m2 <- createDoublets(x, ca, clusters, adjustSize=adjustSize, prefix=prefix)
cd <- data.frame(type=rep(c("singlet","doublet"),c(ncol(x),ncol(m2))),
cluster=cl)
SingleCellExperiment(list(counts=cbind(x, m2)), colData=cd)
}
#' createDoublets
#'
#' Creates artificial doublet cells by combining given pairs of cells
#'
#' @param x A count matrix of real cells
#' @param dbl.idx A matrix or data.frame with pairs of cell indexes stored in
#' the first two columns.
#' @param clusters An optional vector of cluster labels (for each column of `x`)
#' @param resamp Logical; whether to resample the doublets using the poisson
#' distribution. Alternatively, if a proportion between 0 and 1, the proportion
#' of doublets to resample.
#' @param halfSize Logical; whether to half the library size of doublets
#' (instead of just summing up the cells). Alternatively, a number between 0
#' and 1 can be given, determining the proportion of the doublets for which
#' to perform the size adjustment. Ignored if not resampling.
#' @param adjustSize Logical; whether to adjust the size of the doublets using
#' the median sizes per cluster of the originating cells. Requires `clusters` to
#' be given. Alternatively to a logical value, a number between 0 and 1 can be
#' given, determining the proportion of the doublets for which to perform the
#' size adjustment.
#' @param prefix Prefix for the colnames generated.
#'
#' @return A matrix of artificial doublets.
#' @export
#' @examples
#' sce <- mockDoubletSCE()
#' idx <- getCellPairs(sce$cluster, n=200)
#' art.dbls <- createDoublets(sce, idx)
createDoublets <- function(x, dbl.idx, clusters=NULL, resamp=0.5,
halfSize=0.5, adjustSize=FALSE, prefix="dbl."){
adjustSize <- .checkPropArg(as.numeric(adjustSize),FALSE)
halfSize <- .checkPropArg(as.numeric(halfSize),FALSE)
resamp <- .checkPropArg(as.numeric(resamp),FALSE)
if(is(x,"SingleCellExperiment")) x <- counts(x)
if(adjustSize>1 || adjustSize<0)
stop("`adjustSize` should be a logical or a number between 0 and 1.")
if(halfSize>1 || halfSize<0)
stop("`adjustSize` should be a logical or a number between 0 and 1.")
wAd <- sample.int(nrow(dbl.idx), size=round(adjustSize*nrow(dbl.idx)))
wNad <- setdiff(seq_len(nrow(dbl.idx)),wAd)
x1 <- x[,dbl.idx[wNad,1],drop=FALSE]+x[,dbl.idx[wNad,2],drop=FALSE]
if(length(wAd)>1){
if(is.null(clusters)) stop("If `adjustSize=TRUE`, clusters must be given.")
dbl.idx <- as.data.frame(dbl.idx[wAd,,drop=FALSE])
ls <- Matrix::colSums(x)
csz <- vapply(split(ls,clusters), FUN=median, FUN.VALUE=numeric(1))
dbl.idx$ls.ratio <- ls[dbl.idx[,1]]/(ls[dbl.idx[,1]]+ls[dbl.idx[,2]])
ls1 <- csz[as.character(clusters[dbl.idx[,1]])]
ls2 <- csz[as.character(clusters[dbl.idx[,2]])]
dbl.idx$factor <- (dbl.idx$ls.ratio+ls1/(ls1+ls2))/2
dbl.idx$factor[dbl.idx$factor>0.8] <- 0.8
dbl.idx$factor[dbl.idx$factor<0.2] <- 0.2
dbl.idx$ls <- (ls[dbl.idx[,1]]+ls[dbl.idx[,2]])
x2 <- x[,dbl.idx[,1]]*dbl.idx$factor+x[,dbl.idx[,2]]*(1-dbl.idx$factor)
x2 <- tryCatch(x2 %*% diag(dbl.idx$ls/Matrix::colSums(x2)),
error=function(e) t(t(x2)/Matrix::colSums(x2)))
x1 <- cbind(x1,x2)
rm(x2)
}
x <- x1
rm(x1)
if(halfSize>0){
wAd <- sample.int(nrow(dbl.idx), size=ceiling(halfSize*nrow(dbl.idx)))
if(length(wAd)>0) x[,wAd] <- x[,wAd]/2
}
if(resamp>0){
if(resamp!=halfSize) wAd <- sample.int(ncol(x), ceiling(resamp*ncol(x)))
if(length(wAd)>0)
x[,wAd] <- matrix(as.integer(rpois(nrow(x)*length(wAd),
as.numeric(as.matrix(x[,wAd])))),
nrow=nrow(x))
}else{
x <- round(x)
}
x <- as(x,"dgCMatrix")
colnames(x) <- paste0( prefix, seq_len(ncol(x)) )
x
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.