celda: CEllular Latent Dirichlet Allocation

Documented in simulateCells

#' @title Simulate count data from the celda generative models.
#' @description This function generates a \linkS4class{SingleCellExperiment}
#'  containing a simulated counts matrix in the \code{"counts"} assay slot, as
#'  well as various parameters used in the simulation which can be
#'  useful for running celda and are stored in \code{metadata} slot. The user
#'  must provide the desired model (one of celda_C, celda_G, celda_CG) as well
#'  as any desired tuning parameters for those model's simulation functions
#'  as detailed below.
#' @param model Character. Options available in \code{celda::availableModels}.
#'  Can be one of \code{"celda_CG"}, \code{"celda_C"}, or \code{"celda_G"}.
#'  Default \code{"celda_CG"}.
#' @param S Integer. Number of samples to simulate. Default 5. Only used if
#'  \code{model} is one of \code{"celda_CG"} or \code{"celda_C"}.
#' @param CRange Integer vector. A vector of length 2 that specifies the lower
#'  and upper bounds of the number of cells to be generated in each sample.
#'  Default c(50, 100). Only used if
#'  \code{model} is one of \code{"celda_CG"} or \code{"celda_C"}.
#' @param NRange Integer vector. A vector of length 2 that specifies the lower
#'  and upper bounds of the number of counts generated for each cell. Default
#'  c(500, 1000).
#' @param C Integer. Number of cells to simulate. Default 100. Only used if
#'  \code{model} is \code{"celda_G"}.
#' @param G Integer. The total number of features to be simulated. Default 100.
#' @param K Integer. Number of cell populations. Default 5. Only used if
#'  \code{model} is one of \code{"celda_CG"} or \code{"celda_C"}.
#' @param L Integer. Number of feature modules. Default 10. Only used if
#'  \code{model} is one of \code{"celda_CG"} or \code{"celda_G"}.
#' @param alpha Numeric. Concentration parameter for Theta. Adds a pseudocount
#'  to each cell population in each sample. Default 1. Only used if
#'  \code{model} is one of \code{"celda_CG"} or \code{"celda_C"}.
#' @param beta Numeric. Concentration parameter for Phi. Adds a pseudocount to
#'  each feature module in each cell population. Default 1.
#' @param gamma Numeric. Concentration parameter for Eta. Adds a pseudocount to
#'  the number of features in each module. Default 5. Only used if
#'  \code{model} is one of \code{"celda_CG"} or \code{"celda_G"}.
#' @param delta Numeric. Concentration parameter for Psi. Adds a pseudocount to
#'  each feature in each module. Default 1. Only used if
#'  \code{model} is one of \code{"celda_CG"} or \code{"celda_G"}.
#' @param seed Integer. Passed to \link[withr]{with_seed}. For reproducibility,
#'  a default value of 12345 is used. If NULL, no calls to
#'  \link[withr]{with_seed} are made.
#' @return A \link[SingleCellExperiment]{SingleCellExperiment} object with
#'  simulated count matrix stored in the "counts" assay slot. Function
#'  parameter settings are stored in the \link{metadata} slot. For
#'  \code{"celda_CG"} and \code{"celda_C"} models,
#'  columns \code{celda_sample_label} and \code{celda_cell_cluster} in
#'  \link{colData} contain simulated sample labels and
#'  cell population clusters. For \code{"celda_CG"} and \code{"celda_G"}
#'  models, column \code{celda_feature_module} in
#'  \link{rowData} contains simulated gene modules.
#' @examples
#' sce <- simulateCells()
#' @export
simulateCells <- function(
    model = c("celda_CG", "celda_C", "celda_G"),
    S = 5,
    CRange = c(50, 100),
    NRange = c(500, 1000),
    C = 100,
    G = 100,
    K = 5,
    L = 10,
    alpha = 1,
    beta = 1,
    gamma = 5,
    delta = 1,
    seed = 12345) {

    model <- match.arg(model)

    if (model == "celda_C") {
        sce <- .simulateCellsMaincelda_C(model = model,
            S = S,
            CRange = CRange,
            NRange = NRange,
            G = G,
            K = K,
            alpha = alpha,
            beta = beta,
            seed = seed)
    } else if (model == "celda_CG") {
        sce <- .simulateCellsMaincelda_CG(
            model = model,
            S = S,
            CRange = CRange,
            NRange = NRange,
            G = G,
            K = K,
            L = L,
            alpha = alpha,
            beta = beta,
            gamma = gamma,
            delta = delta,
            seed = seed)
    } else if (model == "celda_G") {
        sce <- .simulateCellsMaincelda_G(
            model = model,
            C = C,
            L = L,
            NRange = NRange,
            G = G,
            beta = beta,
            delta = delta,
            gamma = gamma,
            seed = seed)
    } else {
        stop("'model' must be one of 'celda_C', 'celda_G', or 'celda_CG'")
    }

    return(sce)
}


.simulateCellsMaincelda_C <- function(model,
    S,
    CRange,
    NRange,
    G,
    K,
    alpha,
    beta,
    seed) {

    if (is.null(seed)) {
        res <- .simulateCellscelda_C(model = model,
            S = S,
            CRange = CRange,
            NRange = NRange,
            G = G,
            K = K,
            alpha = alpha,
            beta = beta)
    } else {
        res <- with_seed(seed,
            .simulateCellscelda_C(model = model,
                S = S,
                CRange = CRange,
                NRange = NRange,
                G = G,
                K = K,
                alpha = alpha,
                beta = beta))
    }

    sce <- .createSCEsimulateCellsCeldaC(res, seed)

    return(sce)
}


.simulateCellscelda_C <- function(model,
    S = 5,
    CRange = c(50, 100),
    NRange = c(500, 1000),
    G = 100,
    K = 5,
    alpha = 1,
    beta = 1) {

    phi <- .rdirichlet(K, rep(beta, G))
    theta <- .rdirichlet(S, rep(alpha, K))

    ## Select the number of cells per sample
    nC <- sample(seq(CRange[1], CRange[2]), size = S, replace = TRUE)
    cellSampleLabel <- rep(seq(S), nC)

    ## Select state of the cells
    z <- unlist(lapply(seq(S), function(i) {
        sample(seq(K),
            size = nC[i],
            prob = theta[i, ],
            replace = TRUE)
    }))

    ## Select number of transcripts per cell
    nN <- sample(seq(NRange[1], NRange[2]),
        size = length(cellSampleLabel),
        replace = TRUE)

    ## Select transcript distribution for each cell
    cellCounts <- vapply(seq(length(cellSampleLabel)), function(i) {
        stats::rmultinom(1, size = nN[i], prob = phi[z[i], ])
    }, integer(G))

    rownames(cellCounts) <- paste0("Gene_", seq(nrow(cellCounts)))
    colnames(cellCounts) <- paste0("Cell_", seq(ncol(cellCounts)))
    cellSampleLabel <- paste0("Sample_", seq(S))[cellSampleLabel]
    cellSampleLabel <- factor(cellSampleLabel,
        levels = paste0("Sample_", seq(S)))

    ## Peform reordering on final Z and Y assigments:
    cellCounts <- .processCounts(cellCounts)
    names <- list(row = rownames(cellCounts),
        column = colnames(cellCounts),
        sample = unique(cellSampleLabel))
    countChecksum <- .createCountChecksum(cellCounts)
    result <- methods::new("celda_C",
        clusters = list(z = z),
        params = list(K = as.integer(K),
            alpha = alpha,
            beta = beta,
            countChecksum = countChecksum),
        sampleLabel = cellSampleLabel,
        names = names)
    class(result) <- "celda_C"
    result <- .reorderCeldaC(counts = cellCounts, res = result)

    return(list(z = celdaClusters(result)$z,
        counts = cellCounts,
        sampleLabel = cellSampleLabel,
        G = G,
        K = K,
        alpha = alpha,
        beta = beta,
        CRange = CRange,
        NRange = NRange,
        S = S))
}


.createSCEsimulateCellsCeldaC <- function(simList, seed) {
    sce <- SingleCellExperiment::SingleCellExperiment(
        assays = list(counts = simList$counts))

    # add metadata
    S4Vectors::metadata(sce)[["celda_simulateCellscelda_C"]] <- list(
        model = "celda_C",
        sampleLevels = as.character(unique(simList$sampleLabel)),
        cellClusterLevels = sort(unique(simList$z)),
        S = simList$S,
        CRange = simList$CRange,
        NRange = simList$NRange,
        G = simList$G,
        K = simList$K,
        alpha = simList$alpha,
        beta = simList$beta,
        seed = seed)

    SummarizedExperiment::rowData(sce)["rownames"] <- rownames(simList$counts)
    SummarizedExperiment::colData(sce)["colnames"] <-
        colnames(simList$counts)
    SummarizedExperiment::colData(sce)["celda_sample_label"] <-
        as.factor(simList$sampleLabel)
    SummarizedExperiment::colData(sce)["celda_cell_cluster"] <-
        as.factor(simList$z)

    return(sce)
}


.simulateCellsMaincelda_CG <- function(model,
    S,
    CRange,
    NRange,
    G,
    K,
    L,
    alpha,
    beta,
    gamma,
    delta,
    seed) {

    if (is.null(seed)) {
        res <- .simulateCellscelda_CG(
            model = model,
            S = S,
            CRange = CRange,
            NRange = NRange,
            G = G,
            K = K,
            L = L,
            alpha = alpha,
            beta = beta,
            gamma = gamma,
            delta = delta)
    } else {
        with_seed(
            seed,
            res <- .simulateCellscelda_CG(
                model = model,
                S = S,
                CRange = CRange,
                NRange = NRange,
                G = G,
                K = K,
                L = L,
                alpha = alpha,
                beta = beta,
                gamma = gamma,
                delta = delta
            )
        )
    }

    sce <- .createSCEsimulateCellsCeldaCG(res, seed)

    return(sce)
}


.simulateCellscelda_CG <- function(model = model,
    S = S,
    CRange = CRange,
    NRange = NRange,
    G = G,
    K = K,
    L = L,
    alpha = alpha,
    beta = beta,
    gamma = gamma,
    delta = delta) {

    ## Number of cells per sample
    nC <- sample(seq(CRange[1], CRange[2]), size = S, replace = TRUE)
    nCSum <- sum(nC)
    cellSampleLabel <- rep(seq(S), nC)

    ## Select number of transcripts per cell
    nN <- sample(seq(NRange[1], NRange[2]),
        size = length(cellSampleLabel),
        replace = TRUE
    )

    ## Generate cell population distribution for each sample
    theta <- t(.rdirichlet(S, rep(alpha, K)))

    ## Assign cells to cellular subpopulations
    z <- unlist(lapply(seq(S), function(i) {
        sample(seq(K),
            size = nC[i],
            prob = theta[, i],
            replace = TRUE
        )
    }))

    ## Generate transcriptional state distribution for each cell subpopulation
    phi <- .rdirichlet(K, rep(beta, L))

    ## Assign genes to gene modules
    eta <- .rdirichlet(1, rep(gamma, L))
    y <- sample(seq(L),
        size = G,
        prob = eta,
        replace = TRUE
    )
    if (length(table(y)) < L) {
        warning(
            "Some gene modules did not receive any genes after sampling.",
            " Try increasing G and/or making gamma larger."
        )
        L <- length(table(y))
        y <- as.integer(as.factor(y))
    }

    psi <- matrix(0, nrow = G, ncol = L)
    for (i in seq(L)) {
        ind <- y == i
        psi[ind, i] <- .rdirichlet(1, rep(delta, sum(ind)))
    }

    ## Select transcript distribution for each cell
    cellCounts <- matrix(0, nrow = G, ncol = nCSum)
    for (i in seq(nCSum)) {
        transcriptionalStateDist <- as.integer(stats::rmultinom(1,
            size = nN[i], prob = phi[z[i], ]
        ))
        for (j in seq(L)) {
            if (transcriptionalStateDist[j] > 0) {
                cellCounts[, i] <- cellCounts[, i] + stats::rmultinom(1,
                    size = transcriptionalStateDist[j], prob = psi[, j]
                )
            }
        }
    }

    ## Ensure that there are no all-0 rows in the counts matrix, which violates
    ## a celda modeling
    ## constraint (columns are guarnteed at least one count):
    zeroRowIdx <- which(rowSums(cellCounts) == 0)
    if (length(zeroRowIdx > 0)) {
        cellCounts <- cellCounts[-zeroRowIdx, ]
        y <- y[-zeroRowIdx]
    }

    ## Assign gene/cell/sample names
    rownames(cellCounts) <- paste0("Gene_", seq(nrow(cellCounts)))
    colnames(cellCounts) <- paste0("Cell_", seq(ncol(cellCounts)))
    cellSampleLabel <- paste0("Sample_", seq(S))[cellSampleLabel]
    cellSampleLabel <- factor(cellSampleLabel,
        levels = paste0("Sample_", seq(S))
    )

    ## Peform reordering on final Z and Y assigments:
    cellCounts <- .processCounts(cellCounts)
    names <- list(
        row = rownames(cellCounts),
        column = colnames(cellCounts),
        sample = unique(cellSampleLabel)
    )
    countChecksum <- .createCountChecksum(cellCounts)
    result <- methods::new("celda_CG",
        clusters = list(z = z, y = y),
        params = list(
            K = as.integer(K),
            L = as.integer(L),
            alpha = alpha,
            beta = beta,
            delta = delta,
            gamma = gamma,
            countChecksum = countChecksum
        ),
        sampleLabel = cellSampleLabel,
        names = names
    )

    result <- .reorderCeldaCG(counts = cellCounts, res = result)

    return(list(
        z = celdaClusters(result)$z,
        y = celdaClusters(result)$y,
        counts = cellCounts,
        sampleLabel = cellSampleLabel,
        G = G,
        K = K,
        L = L,
        CRange = CRange,
        NRange = NRange,
        S = S,
        alpha = alpha,
        beta = beta,
        gamma = gamma,
        delta = delta
    ))
}


.createSCEsimulateCellsCeldaCG <- function(simList, seed) {
    sce <- SingleCellExperiment::SingleCellExperiment(
        assays = list(counts = simList$counts))

    # add metadata
    S4Vectors::metadata(sce)[["celda_simulateCellscelda_CG"]] <- list(
        model = "celda_CG",
        sampleLevels = as.character(unique(simList$sampleLabel)),
        cellClusterLevels = sort(unique(simList$z)),
        featureModuleLevels = sort(unique(simList$y)),
        S = simList$S,
        CRange = simList$CRange,
        NRange = simList$NRange,
        G = simList$G,
        K = simList$K,
        L = simList$L,
        alpha = simList$alpha,
        beta = simList$beta,
        gamma = simList$gamma,
        delta = simList$delta,
        seed = seed)

    SummarizedExperiment::rowData(sce)["rownames"] <- rownames(simList$counts)
    SummarizedExperiment::colData(sce)["colnames"] <-
        colnames(simList$counts)
    SummarizedExperiment::colData(sce)["celda_sample_label"] <-
        as.factor(simList$sampleLabel)
    SummarizedExperiment::colData(sce)["celda_cell_cluster"] <-
        as.factor(simList$z)
    SummarizedExperiment::rowData(sce)["celda_feature_module"] <-
        as.factor(simList$y)

    return(sce)
}


.simulateCellsMaincelda_G <- function(model,
    C,
    L,
    NRange,
    G,
    beta,
    delta,
    gamma,
    seed) {

    if (is.null(seed)) {
        res <- .simulateCellscelda_G(
            model = model,
            C = C,
            L = L,
            NRange = NRange,
            G = G,
            beta = beta,
            delta = delta,
            gamma = gamma)
    } else {
        with_seed(
            seed,
            res <- .simulateCellscelda_G(
                model = model,
                C = C,
                L = L,
                NRange = NRange,
                G = G,
                beta = beta,
                delta = delta,
                gamma = gamma)
        )
    }

    sce <- .createSCEsimulateCellsCeldaG(res, seed)

    return(sce)
}


.simulateCellscelda_G <- function(model,
    C = 100,
    L = 10,
    NRange = c(500, 1000),
    G = 100,
    beta = 1,
    gamma = 5,
    delta = 1,
    ...) {

    eta <- .rdirichlet(1, rep(gamma, L))

    y <- sample(seq(L),
        size = G,
        prob = eta,
        replace = TRUE
    )
    if (length(table(y)) < L) {
        stop(
            "Some states did not receive any features after sampling. Try",
            " increasing G and/or setting gamma > 1."
        )
    }

    psi <- matrix(0, nrow = G, ncol = L)
    for (i in seq(L)) {
        ind <- y == i
        psi[ind, i] <- .rdirichlet(1, rep(delta, sum(ind)))
    }

    phi <- .rdirichlet(C, rep(beta, L))

    ## Select number of transcripts per cell
    nN <- sample(seq(NRange[1], NRange[2]), size = C, replace = TRUE)

    ## Select transcript distribution for each cell
    cellCounts <- matrix(0, nrow = G, ncol = C)
    for (i in seq(C)) {
        cellDist <- stats::rmultinom(1, size = nN[i], prob = phi[i, ])
        for (j in seq(L)) {
            cellCounts[, i] <- cellCounts[, i] + stats::rmultinom(1,
                size = cellDist[j], prob = psi[, j]
            )
        }
    }

    ## Ensure that there are no all-0 rows in the counts matrix, which violates
    ## a celda modeling
    ## constraint (columns are guarnteed at least one count):
    zeroRowIdx <- which(rowSums(cellCounts) == 0)
    if (length(zeroRowIdx > 0)) {
        cellCounts <- cellCounts[-zeroRowIdx, ]
        y <- y[-zeroRowIdx]
    }

    rownames(cellCounts) <- paste0("Gene_", seq(nrow(cellCounts)))
    colnames(cellCounts) <- paste0("Cell_", seq(ncol(cellCounts)))

    ## Peform reordering on final Z and Y assigments:
    cellCounts <- .processCounts(cellCounts)
    names <- list(
        row = rownames(cellCounts),
        column = colnames(cellCounts)
    )
    countChecksum <- .createCountChecksum(cellCounts)
    result <- methods::new("celda_G",
        clusters = list(y = y),
        params = list(
            L = as.integer(L),
            beta = beta,
            delta = delta,
            gamma = gamma,
            countChecksum = countChecksum
        ),
        names = names
    )
    result <- .reorderCeldaG(counts = cellCounts, res = result)

    return(list(
        y = celdaClusters(result)$y,
        counts = cellCounts,
        C = C,
        G = G,
        L = L,
        NRange = NRange,
        beta = beta,
        delta = delta,
        gamma = gamma
    ))
}


.createSCEsimulateCellsCeldaG <- function(simList, seed) {
    sce <- SingleCellExperiment::SingleCellExperiment(
        assays = list(counts = simList$counts))

    # add metadata
    S4Vectors::metadata(sce)[["celda_simulateCellscelda_G"]] <- list(
        model = "celda_G",
        featureModuleLevels = sort(unique(simList$y)),
        NRange = simList$NRange,
        C = simList$C,
        G = simList$G,
        L = simList$L,
        beta = simList$beta,
        gamma = simList$gamma,
        delta = simList$delta,
        seed = seed)

    SummarizedExperiment::rowData(sce)["rownames"] <- rownames(simList$counts)
    SummarizedExperiment::colData(sce)["colnames"] <-
        colnames(simList$counts)
    SummarizedExperiment::rowData(sce)["celda_feature_module"] <-
        as.factor(simList$y)

    return(sce)
}