R/initialize_tau.R

Defines functions initialize_tau_single_etp initialize_tau_multi_etp initialize_tau

Documented in initialize_tau

#' initialize_tau
#' @description Initialize all tau values based on the observed replicate variance.
#'
#' @param Model an object of class gemini.model
#' @param CONSTANT a numeric indicating a constant value that shifts counts to reduce outliers (default = 32).
#' @param prior_shape shape parameter of Gamma distribution used to model the variation in the data in \code{Input}. If single numeric value, then shape parameters for all samples are assumed equal. Otherwise, a named numeric vector of shape parameters the same length as the number of samples (excluding early time point).
#' @param window numeric if window smoothing should be done on initialized tau values, otherwise NULL (default) for no window smoothing
#' @param monotonize logical specifying whether the variance should be monotonically increasing (default FALSE)
#' @param verbose default FALSE
#'
#' @return a Model object of class gemini.model including new slots for alpha and beta values
#' 
#' @importFrom stats sd
#' @importFrom magrittr set_rownames
#' @importFrom magrittr set_colnames
#' @export
#' @examples
#' data("Model", package = "gemini")
#' Model <- initialize_tau(Model, CONSTANT = 32, prior_shape = 0.5)
initialize_tau <- function(Model,
                           CONSTANT = 32,
                           prior_shape = 0.5,
                           window = NULL,
                           monotonize = FALSE,
                           verbose = FALSE) {
    # Check input
    stopifnot("gemini.model" %in% class(Model))
    
    # User message
    if (verbose)
        message("Initializing tau")
    
    Input <- Model$Input
    sample2replicate <- Input[[Model$replicate.map]]
    
    # cell line names
    sample_early = unique(sample2replicate[sample2replicate[, "TP"] == "ETP", Model$sample.column.name])
    sample_late = unique(sample2replicate[sample2replicate[, "TP"] == "LTP", Model$sample.column.name])
    
    # Pass to appropriate helper function, given the number of ETP samples
    if (length(intersect(sample_early, sample_late))==0) {
        Model <-
            initialize_tau_single_etp(
                Model,
                CONSTANT,
                prior_shape,
                window,
                monotonize,
                verbose
            )
    } else{
        Model <-
            initialize_tau_multi_etp(
                Model,
                CONSTANT,
                prior_shape,
                window,
                monotonize,
                verbose
            )
    }
    return(Model)
}

initialize_tau_multi_etp <- function(Model,
                                     CONSTANT = 32,
                                     prior_shape = 0.5,
                                     window = NULL,
                                     monotonize = FALSE,
                                     verbose = FALSE) {
    Input <- Model$Input
    sample2replicate <- Input[[Model$replicate.map]]
    
    # cell line names
    sample_early = unique(sample2replicate[sample2replicate[, "TP"] == "ETP", Model$sample.column.name])
    sample_late = unique(sample2replicate[sample2replicate[, "TP"] == "LTP", Model$sample.column.name])
    
    # prior shape as vector or constant?
    if (length(prior_shape) == 1) {
        prior_shape <- rep(prior_shape, length(sample_late))
        names(prior_shape) <- sample_late
    } else if (length(prior_shape) != length(sample_late)) {
        stop(
            "Prior shape not defined for each sample! Please ensure that length of prior shape is the same as the number of LTP samples."
        )
    }
    if(!all(sample_late %in% names(prior_shape))){
        stop("Prior shape not defined for each LTP sample! Please ensure that length of prior shape is the same as the number of LTP samples and that names of prior_shape match sample names.")
    }
    Model$prior_shape <- prior_shape
    
    
    # define mean and sd matrix
    meanmat = sdmat = matrix(
        nrow = nrow(Input$counts),
        ncol = length(sample_late),
        dimnames = list(rownames(Input$counts), sample_late)
    )
    
    meanmat_etp = sdmat_etp = matrix(
        nrow = nrow(Input$counts),
        ncol = length(sample_early),
        dimnames = list(rownames(Input$counts), sample_early)
    )
    
    # average counts across all samples
    countavr = sum(Input$counts, na.rm = TRUE) / (ncol(Input$counts))
    
    # mean and sd calculation
    for (i in sample_early) {
        col.ind = sample2replicate[sample2replicate[, Model$sample.column.name] == i &
                                       sample2replicate[, "TP"] == "ETP", 1] # replicate names for cell line i
        count_replicates = as.matrix(Input$counts[, col.ind])
        count_replicates = apply(count_replicates, 2, function(x)
            x / sum(x, na.rm = TRUE)) * countavr # normalize samples by total counts
        count_replicates_mednorm = apply(count_replicates, 2, function(x)
            .median_normalize(x, CONSTANT)) # median-normalize samples
        meanmat_etp[, i] = rowMeans(count_replicates_mednorm, na.rm = TRUE) # calculate mean across rows
        sdmat_etp[, i] = apply(count_replicates_mednorm, 1, function(x)
            sd(x, na.rm = TRUE)) # calculate sd across rows
    }
    
    for (i in sample_late) {
        col.ind = sample2replicate[sample2replicate[, Model$sample.column.name] == i &
                                       sample2replicate[, "TP"] == "LTP", 1] # replicate names for cell line i
        count_replicates = as.matrix(Input$counts[, col.ind])
        count_replicates = apply(count_replicates, 2, function(x)
            x / sum(x, na.rm = TRUE)) * countavr # normalize samples by total counts
        count_replicates_mednorm = apply(count_replicates, 2, function(x)
            .median_normalize(x, CONSTANT)) # median-normalize samples
        meanmat[, i] = rowMeans(count_replicates_mednorm, na.rm = TRUE) # calculate mean
        sdmat[, i] = apply(count_replicates_mednorm, 1, function(x)
            sd(x, na.rm = TRUE)) # calculate sd
    }
    
    # window smoothing and monotonizing
    if (!is.null(window)) {
        for (i in seq_len(length(sample_early))) {
            I = order(meanmat_etp[, i], decreasing = FALSE)
            vars = sdmat_etp[I, i] ^ 2
            # window smoothing
            vars = .window_smooth(vars, window)
            # monotonizing
            if (monotonize) {
                vars = .monotonize(vars)
            }
            sdmat_etp[I, i] = vars ^ 0.5
        }
        for (i in seq_len(length(sample_late))) {
            I = order(meanmat[, i], decreasing = FALSE)
            vars = sdmat[I, i] ^ 2
            # window smoothing
            vars = .window_smooth(vars, window)
            # monotonizing
            if (monotonize) {
                vars = .monotonize(vars)
            }
            sdmat[I, i] = vars ^ 0.5
        }
    }
    
    # sd calculation when one replicate available
    if (any(!is.na(sdmat)) | any(!is.na(sdmat_etp))) {
        # at least two replicates available for minimum one cell line
        # median uncertainty across all cell lines if no replicate available for a cell line
        sdmat[, colSums(!is.na(sdmat)) == 0] = median(cbind(sdmat, sdmat_etp), na.rm = TRUE)
        sdmat_etp[, colSums(!is.na(sdmat_etp)) == 0] = median(cbind(sdmat, sdmat_etp), na.rm = TRUE)
        
        # median uncertainty within one cell line if no replicate available for a guide
        sdmat = apply(sdmat, 2, function(x) {
            x[is.na(x)] = median(x, na.rm = TRUE)
            return(x)
        })
        sdmat_etp = apply(sdmat_etp, 2, function(x) {
            x[is.na(x)] = median(x, na.rm = TRUE)
            return(x)
        })
        
        beta <-
            matrix(
                nrow = nrow(sdmat),
                ncol = length(sample_late),
                dimnames = list(rownames(sdmat), sample_late)
            )
        for (i in sample_late) {
            beta[, i] = prior_shape[i] * sdmat[, i] ^ 2 + sdmat_etp[, sample_early == i] ^
                2 + 2 * 1e-2
        }
    } else {
        # if no replicate available for any cell line
        beta = matrix(
            0.2 * prior_shape[sample_late],
            nrow = nrow(sdmat),
            ncol = ncol(sdmat),
            dimnames = list(rownames(sdmat), sample_late),
            byrow = TRUE
        ) # assume beta = 0.2*prior_shape, leading to the variance beta/alpha = 0.2 for LFCs
    }
    
    alpha = matrix(
        prior_shape[sample_late],
        nrow = nrow(beta),
        ncol = ncol(beta),
        dimnames = list(rownames(beta), colnames(beta)),
        byrow = TRUE
    )
    
    # output
    Model$alpha = alpha
    Model$beta = Model$beta_prior = beta
    return(Model)
}

initialize_tau_single_etp <- function(Model,
                                      CONSTANT = 32,
                                      prior_shape = 0.5,
                                      window = NULL,
                                      monotonize = FALSE,
                                      verbose = FALSE) {
    Input <- Model$Input
    sample2replicate <- Input[[Model$replicate.map]]
    
    # cell line names
    sample_early = unique(sample2replicate[sample2replicate[, "TP"] == "ETP", Model$sample.column.name])
    sample = unique(sample2replicate[,Model$sample.column.name])
    
    # prior shape as vector or constant?
    if (length(prior_shape) == 1) {
        prior_shape <-
            rep(prior_shape, length(unique(sample2replicate$samplename[sample2replicate[, "TP"] == 'LTP'])))
        names(prior_shape) <- unique(sample2replicate$samplename[sample2replicate[, "TP"] == 'LTP'])
    } else if (length(prior_shape) != length(unique(sample2replicate[sample2replicate[, "TP"] ==
                                                                     "LTP", Model$sample.column.name]))) {
        stop(
            "Prior shape not defined for each LTP sample! Please ensure that length of prior shape is the same as the number of LTP samples, or that one prior shape is defined for all."
        )
    }
    if(!all(unique(sample2replicate[sample2replicate[, "TP"] == "LTP", Model$sample.column.name]) %in% names(prior_shape))){
        stop("Prior shape not defined for each LTP sample! Please ensure that length of prior shape is the same as the number of LTP samples and that names of prior_shape match sample names.")
    }
    Model$prior_shape <- prior_shape
    
    # define mean and sd matrix
    meanmat = sdmat = matrix(
        nrow = nrow(Input$counts),
        ncol = length(sample),
        dimnames = list(rownames(Input$counts), sample)
    )
    countavr = sum(Input$counts, na.rm = TRUE) / (ncol(Input$counts)) # average counts per replicate
    
    # mean and sd calculation
    for (i in sample) {
        col.ind = sample2replicate[sample2replicate[, Model$sample.column.name] == i, 1] # replicate names for sample i
        count_replicates = as.matrix(Input$counts[, col.ind])
        count_replicates = apply(count_replicates, 2, function(x)
            x / sum(x, na.rm = TRUE)) * countavr # normalize samples by total counts
        count_replicates_mednorm = apply(count_replicates, 2, function(x)
            .median_normalize(x, CONSTANT)) # median-normalize samples
        meanmat[, i] = rowMeans(count_replicates_mednorm, na.rm = TRUE) # calculate mean
        sdmat[, i] = apply(count_replicates_mednorm, 1, function(x)
            sd(x, na.rm = TRUE)) # calculate sd
    }
    
    # window smoothing and monotonizing
    if (!is.null(window)) {
        for (i in seq_len(length(sample))) {
            I = order(meanmat[, i], decreasing = FALSE)
            vars = sdmat[I, i] ^ 2
            # window smoothing
            vars = .window_smooth(vars, window)
            # monotonizing
            if (monotonize) {
                vars = .monotonize(vars)
            }
            sdmat[I, i] = vars ^ 0.5
        }
    }
    
    # sd calculation when one replicate available
    if (any(!is.na(sdmat))) {
        # at least two replicates available for minimum one cell line
        
        # median uncertainty across all cell lines if no replicate available for a cell line
        sdmat[, colSums(!is.na(sdmat)) == 0] = median(sdmat, na.rm = TRUE)
        
        # median uncertainty within one cell line if no replicate available for a guide
        sdmat = apply(sdmat, 2, function(x) {
            x[is.na(x)] = median(x, na.rm = TRUE)
            return(x)
        })
        
        beta = t(t(sdmat[,!(sample_early == colnames(sdmat))] ^ 2 + sdmat[, sample_early] ^
                       2 + 2 * 1e-2) * prior_shape[sample[!(sample_early == colnames(sdmat))]]) %>%
            as.matrix()  %>%
            magrittr::set_colnames(sample[!(sample_early == colnames(sdmat))]) %>% 
            magrittr::set_rownames(rownames(sdmat))
    } else {
        # if no replicate available for any cell line
        beta = matrix(
            0.2 * prior_shape[sample[!(sample_early == colnames(sdmat))]],
            nrow = dim(sdmat)[1],
            ncol = sum(!(sample_early == colnames(sdmat))),
            dimnames = list(rownames(sdmat), sample[!(sample_early == colnames(sdmat))]),
            byrow = TRUE
        ) # assume beta = 0.2*prior_shape, leading to the variance beta/alpha = 0.2 for LFCs
    }
    
    alpha = matrix(
        prior_shape[sample[!(sample_early == colnames(sdmat))]],
        nrow = dim(beta)[1],
        ncol = dim(beta)[2],
        dimnames = list(rownames(beta), colnames(beta)),
        byrow = TRUE
    )
    
    # output
    Model$alpha = alpha
    Model$beta = Model$beta_prior = beta
    return(Model)
}
sellerslab/gemini documentation built on Dec. 5, 2022, 8:56 a.m.