R/utils.R


#' @title Compute stable log-sum-exp
#'
#' @description \code{log_sum_exp} uses the log-sum-exp trick to avoid
#'   numerical underflow and maintain numerical stability when computing
#'   sums of very small numbers.
#'
#' @param x A vector of observations
#'
#' @return The log-sum-exp value.
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @references
#' \url{https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/}
#'
log_sum_exp <- function(x) {
  # Computes log(sum(exp(x)))
  offset <- max(x)
  return(log(sum(exp(x - offset))) + offset)
}
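
# Usage sketch (hypothetical values): a naive computation underflows,
# while log_sum_exp() recovers the correct result.
#   x <- c(-1000, -1001, -1002)
#   log(sum(exp(x)))  # -Inf, since exp(x) underflows to 0
#   log_sum_exp(x)    # approximately -999.59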

# @title Number of parallel cores
#
# @description Function for setting the number of parallel cores that will be
#   used during EM.
# @param no_cores Number of cores given as input
# @param is_parallel Logical, whether parallel computation is required
# @param max_cores Maximum number of cores allowed to be used
#
.parallel_cores <- function(no_cores = NULL, is_parallel = FALSE, max_cores = NULL){
  if (is_parallel) { # If parallel mode is ON
    # If number of cores is not given, use all but one available core
    if (is.null(no_cores)) {
      no_cores <- parallel::detectCores() - 1
    } else if (no_cores >= parallel::detectCores()) {
      no_cores <- parallel::detectCores() - 1
    }
    # detectCores() may return NA; fall back to two cores
    if (is.na(no_cores)) { no_cores <- 2 }
    # Respect the upper bound on cores, if one was given
    if (!is.null(max_cores)) {
      no_cores <- min(no_cores, max_cores)
    }
  }
  return(no_cores)
}
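
# Behaviour sketch (actual values depend on the machine's core count):
#   .parallel_cores(no_cores = 4, is_parallel = TRUE)  # 4, capped at detectCores() - 1
#   .parallel_cores(is_parallel = FALSE)               # NULL, parallel mode is off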


## A general-purpose adder:
.add_func <- function(x) Reduce(f = "+", x = x)
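# e.g. .add_func(list(1:3, 4:6)) returns c(5, 7, 9); also sums matrices of
# matching dimensions element-wise.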



#' @title Initialise design matrices
#'
#' @description Given a list of observations, initialise design matrices H for
#'   computational efficiency.
#'
#' @param basis Basis object.
#' @param X List of observations, one per genomic region.
#' @param coverage_ind Indices of regions that have CpG coverage.
#'
#' @return The list of design matrices H, with NA for regions without coverage.
#'
#' @importFrom BPRMeth design_matrix
#'
init_design_matrix <- function(basis, X, coverage_ind){
  # Design matrices for each genomic region (i.e. gene), initialised to NA
  H <- rep(list(NA), length(X))
  if (length(coverage_ind) < 1) {
    stop("No coverage across all regions. This cell should be discarded!")
  }
  H[coverage_ind] <- lapply(X = X[coverage_ind], FUN = function(y)
    design_matrix(obj = basis, obs = y[,1])$H)
  return(H)
}
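
# Usage sketch (assumes BPRMeth is available; inputs are illustrative):
#   basis <- BPRMeth::create_rbf_object(M = 3)
#   # One covered region of <location, met_state> pairs, one missing region
#   X <- list(matrix(c(-0.5, 0, 0.5, 1, 0, 1), ncol = 2), NA)
#   H <- init_design_matrix(basis, X, coverage_ind = 1)
#   dim(H[[1]])  # 3 x 4: one row per CpG, M + 1 basis columns; H[[2]] is NA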


#' @title Extract responses y
#'
#' @description Given a list of observations, extract responses y
#'
#' @param X List of observations, one per genomic region.
#' @param coverage_ind Indices of regions that have CpG coverage.
#'
#' @return The list of response vectors y, with NA for regions without coverage.
#'
extract_y <- function(X, coverage_ind){
  # Responses y for each genomic region (i.e. gene), initialised to NA
  y <- rep(list(NA), length(X))
  if (length(coverage_ind) < 1) {
    stop("No coverage across all regions. This cell should be discarded!")
  }
  y[coverage_ind] <- lapply(X = X[coverage_ind], FUN = function(x) x[,2])
  return(y)
}
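
# Usage sketch (illustrative <location, met_state> region matrices):
#   X <- list(matrix(c(-0.5, 0.5, 1, 0), ncol = 2), NA)
#   extract_y(X, coverage_ind = 1)  # list(c(1, 0), NA)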



#' @title Partition synthetic dataset to training and test set
#'
#' @param dt_obj Melissa data object
#' @param data_train_prcg Percentage of genomic regions that will be fully used
#'   for training, i.e. across the whole region we will have no CpGs missing.
#' @param region_train_prcg Fraction of genomic regions to keep for training
#'   set, i.e. some genomic regions will have no coverage at all during
#'   training.
#' @param cpg_train_prcg Fraction of CpGs in each genomic region to keep for
#'   training set.
#' @param is_synth Logical, whether we have synthetic data or not.
#'
#' @return The Melissa object with the following changes. The `met` element will
#'   now contain only the `training` data. An additional element called
#'   `met_test` will store the data that will be used during testing to evaluate
#'   the imputation performance. These data will not be seen from Melissa during
#'   inference.
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @examples
#' # Partition the synthetic data from Melissa package
#' dt <- partition_dataset(melissa_encode_dt)
#'
#' @seealso \code{\link{create_melissa_data_obj}}, \code{\link{melissa}},
#' \code{\link{filter_regions}}
#'
#' @export
#'
partition_dataset <- function(dt_obj, data_train_prcg = 0.5,
                              region_train_prcg = 0.95, cpg_train_prcg = 0.5,
                              is_synth = FALSE){
  assertthat::assert_that(data_train_prcg >= 0 & data_train_prcg <= 1)
  assertthat::assert_that(region_train_prcg >= 0 & region_train_prcg <= 1)
  assertthat::assert_that(cpg_train_prcg >= 0 & cpg_train_prcg <= 1)
  # If `met_test` element already exists, stop
  if (!is.null(dt_obj$met_test)) { stop("Stopping. Test data already exist!")}
  train = test <- dt_obj$met
  N <- length(dt_obj$met)       # Number of cells
  M <- length(dt_obj$met[[1]])  # Number of genomic regions
  for (n in seq_len(N)) {     # Iterate over each cell
    if (!is_synth) { # If we have real data
      # Keep indices of genomic regions that have CpG coverage
      cov_gen_ind <- which(!is.na(dt_obj$met[[n]]))
      # Compute the number of those regions
      N_cov <- length(cov_gen_ind)
      if (N_cov < 10) {
        message("Low genomic coverage..."); return(1)
      }
      pivot <- region_train_prcg * N_cov
      # These are the training data
      train_ind <- cov_gen_ind[sort(sample(N_cov, round(pivot)))]
    }else {
      # Genomic regions with CpG coverage
      pivot <- region_train_prcg * M
      train_ind <- sort(sample(M, round(pivot)))
    }
    train[[n]][-train_ind] <- NA
    test[[n]][train_ind] <- NA
    # Iterate over each covered genomic region
    for (m in train_ind) {
      # Will this region be fully used for training?
      is_train <- stats::rbinom(1, 1, data_train_prcg)
      if (!is_train) {
        # Get fraction of CpGs covered
        pivot <- cpg_train_prcg * NROW(dt_obj$met[[n]][[m]])
        idx <- sort(sample(NROW(dt_obj$met[[n]][[m]]), round(pivot)))
        train[[n]][[m]] <- dt_obj$met[[n]][[m]][idx,,drop = FALSE]
        test[[n]][[m]] <- dt_obj$met[[n]][[m]][-idx,,drop = FALSE]
      }
    }
  }
  # Set the training data as the `met` element
  dt_obj$met <- train
  # Set the test data as the `met_test` element
  dt_obj$met_test <- test
  return(dt_obj)
}


#' @title Impute/predict test methylation states
#'
#' @description Make predictions of missing methylation states, i.e. perform
#'   imputation using Melissa. This requires keeping a subset of data as a held
#'   out test set during Melissa inference. If you want to impute a whole
#'   directory containing cells (files) with missing methylation levels, see
#'   \code{\link{impute_met_files}}.
#' @param obj Output of Melissa inference object.
#' @param test Test data to evaluate performance.
#' @param basis Basis object, if NULL we perform imputation using Melissa,
#'   otherwise using BPRMeth.
#' @param is_predictive Logical, use predictive distribution for imputation, or
#'   choose the cluster label with the highest responsibility.
#' @param return_test Whether or not to return a list with the predictions.
#'
#' @return A list containing two vectors, the true methylation state and the
#'   predicted/imputed methylation states.
#'
#' @importFrom BPRMeth eval_probit_function
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @examples
#' # Extract synthetic data
#' dt <- melissa_synth_dt
#'
#' # Partition to train and test set
#' dt <- partition_dataset(dt)
#'
#' # Create basis object from BPRMeth package
#' basis_obj <- BPRMeth::create_rbf_object(M = 3)
#'
#' # Run Melissa
#' melissa_obj <- melissa(X = dt$met, K = 2, basis = basis_obj, vb_max_iter=10,
#'    vb_init_nstart = 1, is_parallel = FALSE, is_verbose = FALSE)
#'
#' imputation_obj <- impute_test_met(obj = melissa_obj,
#'                                    test = dt$met_test)
#'
#' @seealso \code{\link{create_melissa_data_obj}}, \code{\link{melissa}},
#'   \code{\link{filter_regions}}, \code{\link{eval_imputation_performance}},
#'   \code{\link{eval_cluster_performance}}
#'
#' @export
impute_test_met <- function(obj, test, basis = NULL,
                             is_predictive = TRUE, return_test = FALSE){
  N         <- length(test)                      # Number of cells
  test_pred <- test                              # Copy test data
  act_obs   <- vector(mode = "list", length = N) # Keep actual CpG states
  pred_obs  <- vector(mode = "list", length = N) # Keep predicted CpG states
  # Iterate over each cell...
  for (n in seq_len(N)) {
    # Regions that have CpG observations for testing
    idx <- which(!is.na(test[[n]]))
    cell_act_obs  <- vector("list", length = length(idx))
    cell_pred_obs <- vector("list", length = length(idx))
    iter <- 1
    # Iterate over each genomic region
    for (m in idx) {
      # When joint imputation method
      if (is.null(basis)) {
        K <- NCOL(obj$r_nk) # Number of clusters
        # If we use the predictive density for imputation
        if (is_predictive) {
          tmp_mixt <- vector("numeric", length = length(test[[n]][[m]][, 1]))
          for (k in seq_len(K)) {
            # Evaluate profile from the weighted predictive
            tmp_mixt <- tmp_mixt + obj$r_nk[n,k] *
              eval_probit_function(obj = obj$basis,
                                   obs = test[[n]][[m]][, 1],
                                   w = obj$W[m,,k])
          }
          # Evaluate the methylation profile
          test_pred[[n]][[m]][,2] <- tmp_mixt
        }else{
          # Get cluster assignment
          k <- which.max(obj$r_nk[n,])
          test_pred[[n]][[m]][,2] <- eval_probit_function(obj = obj$basis,
                            obs = test[[n]][[m]][, 1], w = obj$W[m,,k])
        }
      }else{
        test_pred[[n]][[m]][,2] <- eval_probit_function(obj = basis,
                            obs = test[[n]][[m]][, 1], w = obj[m,,n])
      }
      test_pred[[n]][[m]][,2] <- round(test_pred[[n]][[m]][,2], 3)
      # TODO
      # Actual CpG states
      cell_act_obs[[iter]]  <- test[[n]][[m]][, 2]
      # Predicted CpG states (i.e. function evaluations)
      cell_pred_obs[[iter]] <- test_pred[[n]][[m]][, 2]
      iter <- iter + 1
    }
    # Combine data to a big vector
    act_obs[[n]] <- do.call("c", cell_act_obs)
    pred_obs[[n]] <- do.call("c", cell_pred_obs)
  }
  if (return_test) {
    return(list(test_pred = test_pred, act_obs = do.call("c", act_obs),
                pred_obs = do.call("c", pred_obs)))
  }else{
    return(list(act_obs = do.call("c", act_obs),
                pred_obs = do.call("c", pred_obs)))
  }
}


#' @title Impute/predict methylation files
#'
#' @description Make predictions of missing methylation states, i.e. perform
#'   imputation using Melissa. Each file in the directory will be used as input
#'   and a new file will be created in \code{outdir} with an additional column
#'   containing the predicted met state (value between 0 and 1). Note that
#'   predictions will be made only on \code{annotation regions} that were used
#'   for training Melissa. Check \code{\link{impute_test_met}}, if you want to
#'   make predictions only on test data.
#' @param met_dir Directory of methylation files, each file corresponds to a
#'   single cell. It should contain three columns <chr> <pos> <met_state>
#'   (similar to the input required by \code{\link{create_melissa_data_obj}}),
#'   where \code{met_state} can be any value that denotes missing CpG
#'   information, e.g. -1. Note that files can also contain CpGs for which we
#'   have coverage information, so we can check the predictions made by
#'   Melissa; hence the value can also be 0 (unmethylated) or 1 (methylated).
#'   Predictions made by Melissa will not change the <met_state> column;
#'   Melissa will just add an additional column named <predicted>.
#' @param outdir Directory to store the output files for each cell with exactly
#'   the same name. If NULL, then a directory called `imputed` inside `met_dir`
#'   will be created by default.
#' @param obj Output of Melissa inference object.
#' @param anno_region Annotation region object. This will be the output of the
#'   \code{\link{create_melissa_data_obj}} function, e.g.
#'   melissa_data$anno_region. This is required to select those regions that
#'   were used to train Melissa.
#' @param basis Basis object, if NULL we perform imputation using Melissa,
#'   otherwise using BPRMeth (then \code{obj} should be BPRMeth output).
#' @param is_predictive Logical, use predictive distribution for imputation, or
#'   choose the cluster label with the highest responsibility.
#' @param no_cores Number of cores to be used for parallel processing of data.
#'
#' @return A new directory \code{outdir} containing files (cells) with predicted
#'   / imputed methylation states per CpG location.
#'
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @examples
#' \dontrun{
#' # Met directory
#' met_dir <- "name_of_met_dir"
#' # Annotation file name
#' anno_file <- "name_of_anno_file"
#' # Create data object
#' melissa_data <- create_melissa_data_obj(met_dir, anno_file)
#' # Run Melissa
#' melissa_obj <- melissa(X = melissa_data$met, K = 2)
#' # Annotation object
#' anno_region <- melissa_data$anno_region
#'
#' # Perform imputation
#' impute_met_dir <- "name_of_met_dir_for_imputing_cells"
#' out <- impute_met_files(met_dir = impute_met_dir, obj = melissa_obj,
#'                         anno_region = anno_region)
#' }
#'
#'
#' @seealso \code{\link{create_melissa_data_obj}}, \code{\link{melissa}},
#'   \code{\link{filter_regions}}
#'
#' @export
impute_met_files <- function(met_dir, outdir = NULL, obj, anno_region,
                             basis = NULL, is_predictive = TRUE, no_cores = NULL) {

  # Vector of filenames to perform imputation
  met_files <- setdiff(list.files(path = met_dir, full.names = FALSE),
                       list.dirs(path = met_dir, recursive = FALSE,
                                 full.names = FALSE))
  # The out directory will be inside `met_dir/imputed`
  if (is.null(outdir)) {
    outdir <- paste0(met_dir, "/imputed/")
  }
  # Create out directory if it doesn't exist
  if (!dir.exists(outdir)) { dir.create(outdir) }

  if (is.null(no_cores)) {
    test_met <- lapply(X = met_files, FUN = function(file){
      .impute_files_internal(file = file, met_dir = met_dir, outdir = outdir,
                             obj = obj,
                             anno_region = anno_region, basis = basis,
                             is_predictive = is_predictive)
    })
  } else {
    test_met <- parallel::mclapply(X = met_files, FUN = function(file) {
      .impute_files_internal(file = file, met_dir = met_dir, outdir = outdir,
                             obj = obj,
                             anno_region = anno_region, basis = basis,
                             is_predictive = is_predictive)
    }, mc.cores = no_cores)
  }
  return(1)
}


# Internal function for imputing file
.impute_files_internal <- function(file, met_dir, outdir, obj,
                                   anno_region, basis = NULL,
                                   is_predictive = TRUE) {
  # Set to NULL so R CMD check passes without NOTEs
  coordinates = pos = chr <- NULL
  # Read scBS seq data
  met_dt <- BPRMeth::read_met(file = sprintf("%s/%s", met_dir, file),
                              type = "sc_seq", strand_info = FALSE)
  # Create promoter methylation regions
  met_region <- BPRMeth::create_region_object(met_dt = met_dt,
                    anno_dt = anno_region, cov = 1, sd_thresh = -1,
                    ignore_strand = TRUE, filter_empty_region = FALSE)$met
  # Add additional column for predicted values
  met_region <- lapply(X = met_region, FUN = function(x){
    # Covered regions are matrices, uncovered regions are a single NA; check
    # only the first element so the `if` condition has length one
    if (!is.na(x[1])) {
      return(cbind(x, rep(-1, NROW(x))))
    } else {
      return(NA)
    }
  })

  # Obtain cell ID
  cell_id <- sub("\\.(gz|zip|bz2|rar|7z)$", "", file)
  # Match test with train cell ID
  n <- which(rownames(obj$r_nk) == cell_id)
  if (length(n) == 0) {
    warning("Cell ID:", cell_id, " was not used for training.\n",
            "Skipping imputation of CpG states for this file.")
    return(-1)
  }

  # Regions that have CpG observations for prediction
  idx <- which(!is.na(met_region))
  # Iterate over each genomic region
  for (m in idx) {
    # When joint imputation method
    if (is.null(basis)) {
      K <- NCOL(obj$r_nk) # Number of clusters
      # If we use the predictive density for imputation
      if (is_predictive) {
        tmp_mixt <- vector("numeric", length = length(met_region[[m]][, 1]))
        for (k in seq_len(K)) {
          # Evaluate profile from the weighted predictive
          tmp_mixt <- tmp_mixt + obj$r_nk[n,k] *
            eval_probit_function(obj = obj$basis,
                                 obs = met_region[[m]][, 1],
                                 w = obj$W[m,,k])
        }
        # Evaluate the methylation profile
        met_region[[m]][,3] <- tmp_mixt
      }else{
        # Get cluster assignment
        k <- which.max(obj$r_nk[n,])
        met_region[[m]][,3] <- eval_probit_function(obj = obj$basis,
                          obs = met_region[[m]][, 1], w = obj$W[m,,k])
      }
    }else{
      met_region[[m]][,3] <- eval_probit_function(obj = basis,
                          obs = met_region[[m]][, 1], w = obj[m,,n])
    }
    met_region[[m]][,3] <- round(met_region[[m]][,3], 3)
  }
  # Remove region names prior to merging the list
  names(met_region) <- NULL
  # Merge data and create final object
  pred_obj <- data.table::as.data.table(do.call(rbind, met_region),
                                        keep.rownames = "coordinates") %>%
    stats::na.omit() %>%
    .[, c("chr", "pos") := data.table::tstrsplit(coordinates, ":",
                                                 fixed = TRUE)] %>%
    .[, pos := as.numeric(pos)] %>%
    .[, c("chr", "pos", "V2", "V3")] %>%
    data.table::setnames(c("chr", "pos", "met_state", "predicted")) %>%
    data.table::setkey(chr, pos)

  # Store imputed file
  outfile <- sprintf("%s/%s", outdir, cell_id)
  data.table::fwrite(x = pred_obj, file = outfile, sep = "\t")

  return(1)
}




#' @title Evaluate imputation performance
#'
#' @description \code{eval_imputation_performance} is a wrapper function for
#'   computing imputation/clustering performance in terms of different metrics,
#'   such as AUC and precision recall curves.
#'
#' @param obj Output of Melissa inference object.
#' @param imputation_obj List containing two vectors of equal length,
#'   corresponding to true methylation states and predicted/imputed methylation
#'   states.
#'
#' @return The `melissa` object, with an additional slot named `imputation`,
#'   containing the AUC, F-measure, True Positive Rate (TPR) and False Positive
#'   Rate (FPR), and Precision Recall (PR) curves.
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @examples
#' # Extract a small synthetic dataset for efficiency
#' dt <- melissa_synth_dt
#'
#' # Partition to train and test set
#' dt <- partition_dataset(dt)
#'
#' # Create basis object from BPRMeth package
#' basis_obj <- BPRMeth::create_rbf_object(M = 3)
#'
#' # Run Melissa
#' melissa_obj <- melissa(X = dt$met, K = 2, basis = basis_obj, vb_max_iter = 10,
#'   vb_init_nstart = 1, is_parallel = FALSE, is_verbose = FALSE)
#'
#' imputation_obj <- impute_test_met(obj = melissa_obj, test = dt$met_test)
#'
#' melissa_obj <- eval_imputation_performance(obj = melissa_obj,
#'                                            imputation_obj = imputation_obj)
#'
#' cat("AUC: ", melissa_obj$imputation$auc)
#'
#' @seealso \code{\link{create_melissa_data_obj}}, \code{\link{melissa}},
#'   \code{\link{impute_test_met}}, \code{\link{filter_regions}},
#'   \code{\link{eval_imputation_performance}},
#'   \code{\link{eval_cluster_performance}}
#'
#' @export
#'
eval_imputation_performance <- function(obj, imputation_obj) {
  # Create object of predictions
  pred_obj <- ROCR::prediction(round(imputation_obj$pred_obs, 2),
                               imputation_obj$act_obs)
  obj$imputation <- list()
  # Store AUC
  obj$imputation$auc <- ROCR::performance(pred_obj, "auc")@y.values[[1]]
  # Store F-measure
  f <- ROCR::performance(pred_obj, "f")
  obj$imputation$f_measure <- f@y.values[[1]][min(which(f@x.values[[1]] <= 0.5))]
  # Store TPR/FPR
  obj$imputation$tpr_fpr <- ROCR::performance(pred_obj, "tpr", "fpr")
  # Store PR
  obj$imputation$pr <- ROCR::performance(pred_obj, "prec", "rec")
  return(obj)
}



#' @title Evaluate clustering performance
#'
#' @description \code{eval_cluster_performance} is a wrapper function for
#'   computing clustering performance in terms of ARI and clustering assignment
#'   error.
#'
#' @param obj Output of Melissa inference object.
#' @param C_true True cluster assignments.
#'
#' @return The `melissa` object, with an additional slot named `clustering`,
#'   containing the ARI and clustering assignment error performance.
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @examples
#' ## Extract synthetic data
#' dt <- melissa_synth_dt
#'
#' # Partition to train and test set
#' dt <- partition_dataset(dt)
#'
#' # Create basis object from BPRMeth package
#' basis_obj <- BPRMeth::create_rbf_object(M = 3)
#'
#' # Run Melissa
#' melissa_obj <- melissa(X = dt$met, K = 2, basis = basis_obj, vb_max_iter = 10,
#'   vb_init_nstart = 1, is_parallel = FALSE, is_verbose = FALSE)
#'
#' # Compute cluster performance
#' melissa_obj <- eval_cluster_performance(melissa_obj, dt$opts$C_true)
#'
#' cat("ARI: ", melissa_obj$clustering$ari)
#'
#' @seealso \code{\link{create_melissa_data_obj}}, \code{\link{melissa}},
#'   \code{\link{filter_regions}}, \code{\link{eval_imputation_performance}},
#'   \code{\link{eval_cluster_performance}}
#'
#' @export
#'
eval_cluster_performance <- function(obj, C_true) {
  # Create new element
  obj$clustering <- list()
  # Compute clustering assignment error
  obj$clustering$error <- cluster_error(C_true, obj$r_nk)
  # Compute ARI for clustering performance
  obj$clustering$ari <- cluster_ari(C_true, obj$r_nk)
  return(obj)
}


#' @title Compute clustering assignment error
#'
#' @description \code{cluster_error} computes the clustering assignment error,
#'   i.e. the average number of incorrect cluster assignments: \deqn{OE =
#'   \sum_{n=1}^{N}(I(LT_{n} \neq LP_{n})) / N}
#'
#' @param C_true True cluster assignments.
#' @param C_post Posterior mean of predicted cluster assignments.
#'
#' @return The clustering assignment error
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
cluster_error <- function(C_true, C_post){
  # Obtain the total number of objects
  N <- NROW(C_post)
  # Align cluster indices
  C_post <- .align_cluster(Z1 = C_true, Z2 = C_post, type = "mat")$Z2
  # Number of correct assignments
  C_match <- sum((C_post == C_true))
  # Compute the error
  error <- 1 - C_match / (NCOL(C_post) * N)
  return(error)
}
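
# Usage sketch (hypothetical one-hot assignments, one of four cells
# misassigned):
#   C_true <- rbind(c(1, 0), c(1, 0), c(0, 1), c(0, 1))
#   C_post <- rbind(c(1, 0), c(1, 0), c(0, 1), c(1, 0))
#   cluster_error(C_true, C_post)  # 0.25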


#' @title Compute clustering ARI
#'
#' @description \code{cluster_ari} computes the clustering performance in terms
#'   of the Adjusted Rand Index (ARI) metric.
#'
#' @param C_true True cluster assignments.
#' @param C_post Posterior responsibilities of predicted cluster assignments.
#'
#' @return The clustering ARI.
#'
#' @importFrom mclust adjustedRandIndex
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
cluster_ari <- function(C_true, C_post){
  # Obtain labels from 1-hot-encoding data
  C_true_lab <- unlist(apply(C_true, 1,
                             function(x) which(x == max(x, na.rm = TRUE))[1]))
  C_est_lab <- unlist(apply(C_post, 1,
                            function(x) which(x == max(x, na.rm = TRUE))[1]))
  # Compute the overall clustering ARI
  ari <- mclust::adjustedRandIndex(x = C_true_lab, y = C_est_lab)
  return(ari)
}
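
# Usage sketch (illustrative responsibilities; ARI is invariant to label
# permutation, so these perfectly matching partitions give 1):
#   C_true <- rbind(c(1, 0), c(1, 0), c(0, 1))
#   C_post <- rbind(c(0.1, 0.9), c(0.2, 0.8), c(0.7, 0.3))
#   cluster_ari(C_true, C_post)  # 1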


# @title Align cluster labels
#
# @description Align cluster labels when we have (soft) clusterings, i.e.
#   responsibilities from mixture models.
#
# @param Z1 True cluster assignments.
# @param Z2 Estimated cluster assignments.
# @param params Optional parameters to be returned alongside the aligned labels.
# @param type Object type of the cluster assignments, either 'array',
#   'mat_array', 'array_mat', 'mat' or 'vec'.
#
# @return The aligned labels.
#
.align_cluster <- function(Z1, Z2, params = NULL, type = "mat"){
  if (type == "array") {
    L1 <- apply(Z1, c(1,3), sum) # Source 1 cluster assignments
    Lm <- apply(Z2, c(1,3), sum) # Source m cluster assignments
  }else if (type == "mat_array") {
    L1 <- Z1
    Lm <- apply(Z2, c(1,3), sum) # Source m cluster assignments
  }else if (type == "array_mat") {
    L1 <- apply(Z1, c(1,3), sum) # Source 1 cluster assignments
    Lm <- Z2
  }else if (type == "mat") {
    L1 <- Z1
    Lm <- Z2
  }else if (type == "vec") {
    # For each cluster k in previous Cluster
    for (k in seq_along(unique(Z1))) {
      # Find Max
      Max <- sum(Z1 == k & Z2 == k)/(.01 + sum(Z1 == k) + sum(Z2 == k))
      # For each cluster k in current Cluster
      for (tempk in seq_along(unique(Z2))) {
        # Check if the proportions are higher than Max
        if ( (sum(Z1 == k & Z2 == tempk)/(.01 + sum(Z1 == k) +
                                          sum(Z2 == tempk))) > Max) {
          # Get proportion that the two cluster indices are the same
          Max <- sum(Z1 == k & Z2 == tempk)/(.01 + sum(Z1 == k) +
                                               sum(Z2 == tempk))
          dummy <- (Z2 == k)    # Keep indices that do not match
          Z2[Z2 == tempk] <- k  # Swap the incorrect indices
          Z2[dummy] <- tempk    # Swap the incorrect indices
        }
      }
    }
    return(list(Z2 = Z2, params = params))
  }else {stop("Type of alignment not implemented yet.") }

  ##-------------------------------------
  # When input is any type except "vec"
  K <- NCOL(Z1) # Number of clusters
  N <- NROW(Z1) # Number of objects
  # TODO: Check if this holds in the general case
  # Convert probs to hard clusterings
  L1_k <- max.col(L1)
  Lm_k <- max.col(Lm)

  L1 <- matrix(0, ncol = K, nrow = N)
  Lm <- matrix(0, ncol = K, nrow = N)
  for (k in seq_len(K)) {
    cl_L1 <- which(L1_k == k)
    cl_Lm <- which(Lm_k == k)
    if (length(cl_L1) > 0) { L1[cl_L1, k] <- 1 }
    if (length(cl_Lm) > 0) { Lm[cl_Lm, k] <- 1 }
  }

  for (k in seq_len(NCOL(L1))) {  # For each cluster k in L1 source
    for (tempk in seq_len(NCOL(Lm))) { # For each cluster k in Lm source
      Max <- sum(L1 == Lm)       # Matches between the cluster indices
      Lm_dummy <- Lm             # Keep the Lm indices in a dummy variable
      Lm_dummy[,k] = Lm[,tempk]  # Swap the incorrect indices
      Lm_dummy[,tempk] = Lm[,k]  # Swap the incorrect indices
      # If the swaps make a better alignment, update indices
      if (sum(L1 == Lm_dummy) > Max) {
        Lm <- Lm_dummy
        if (type == "array" || type == "mat_array") {
          tmp <- Z2
          Z2[,,k] <- tmp[,,tempk]
          Z2[,,tempk] <- tmp[,,k]
        }
      }
    }
  }
  if (type == "mat") { Z2 <- Lm}
  return(list(Z2 = Z2, params = params)) # Return the aligned parameters
}
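
# Behaviour sketch for the "vec" alignment (labels are illustrative): the same
# partition under permuted labels is mapped back onto the reference labels.
#   Z1 <- c(1, 1, 2, 2)                      # true labels
#   Z2 <- c(2, 2, 1, 1)                      # permuted labels
#   .align_cluster(Z1, Z2, type = "vec")$Z2  # c(1, 1, 2, 2)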



#' @name filter_regions
#' @rdname filter_regions
#' @aliases filter_cpgs, melissa_filter
#'
#' @title Filtering process prior to running Melissa
#'
#' @description Functions for filtering genomic regions due to (1) low CpG
#'   coverage, (2) low coverage across cells, or (3) low mean methylation
#'   variability.
#'
#' @param obj Melissa data object.
#' @param min_cpgcov Minimum CpG coverage for each genomic region.
#' @param min_cell_cov_prcg Threshold on the proportion of cells that have
#'   coverage for each region.
#' @param min_var Minimum variability of mean methylation across cells, measured
#'   in terms of standard deviation.
#' @details The (1) `filter_by_cpg_coverage` function does not actually remove
#'   regions; it only sets them to NA. The (2)
#'   `filter_by_coverage_across_cells` function keeps regions from which we can
#'   share information across cells. The (3) `filter_by_variability` function
#'   keeps variable regions which are informative for cell subtype
#'   identification.
#'
#' @return The filtered Melissa data object
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @seealso \code{\link{melissa}}, \code{\link{create_melissa_data_obj}}
#'
NULL


#' @rdname filter_regions
#'
#' @examples
#' # Run on synthetic data from Melissa package
#' filt_obj <- filter_by_cpg_coverage(melissa_encode_dt, min_cpgcov = 20)
#'
#' @export
#'
filter_by_cpg_coverage <- function(obj, min_cpgcov = 10) {
  assertthat::assert_that(min_cpgcov >= 0)
  # Consider only regions with enough CpG coverage, the rest are set to NA
  obj$met <- lapply(obj$met, function(x)
    lapply(x, function(y){
      if (NROW(y) < min_cpgcov) return(NA) else return(y)
      }))
  return(obj)
}


#' @rdname filter_regions
#'
#' @examples
#' # Run on synthetic data from Melissa package
#' filt_obj <- filter_by_coverage_across_cells(melissa_encode_dt,
#'                                             min_cell_cov_prcg = 0.7)
#'
#' @export
#'
filter_by_coverage_across_cells <- function(obj, min_cell_cov_prcg = 0.5) {
  assertthat::assert_that(min_cell_cov_prcg >= 0 & min_cell_cov_prcg <= 1)
  N <- length(obj$met)      # Number of cells
  M <- length(obj$met[[1]]) # Number of genomic regions
  non_cov_reg <- vector(mode = "numeric")
  for (m in seq_len(M)) { # Iterate over each region
    # Number of cells covered in each region
    cov_cells <- length(which(!is.na(lapply(obj$met, "[[", m))))
    # If no cell covers this region, or coverage does not pass the threshold,
    # mark the region for filtering. Note: `cov_cells` is a scalar count, so
    # compare its value rather than its length.
    if (cov_cells == 0 || cov_cells/N < min_cell_cov_prcg) {
      non_cov_reg <- c(non_cov_reg, m)
    }
  }
  if (length(non_cov_reg) != 0) {
    if (!is.null(obj$anno_region)) {
      obj$anno_region <- obj$anno_region[-non_cov_reg]  # Filter anno regions
    }
    for (n in seq_len(N)) {                             # Filter met data
      obj$met[[n]] <- unname(obj$met[[n]][-non_cov_reg])
    }
  }
  return(obj)
}


#' @rdname filter_regions
#'
#' @examples
#' # Run on synthetic data from Melissa package
#' filt_obj <- filter_by_variability(melissa_encode_dt, min_var = 0.1)
#'
#' @export
#'
filter_by_variability <- function(obj, min_var = 0.1) {
  assertthat::assert_that(min_var >= 0)
  # Number of genomic regions
  M <- length(obj$met[[1]])
  cell_region_sd <- vector("numeric", length = M)
  for (m in seq_len(M)) {
    # Extract all cell observations for specific region m
    tmp <- lapply(obj$met, "[[", m)
    # Keep only regions that have observations
    tmp <- tmp[!is.na(tmp)]
    if (length(tmp) == 0) {
      cell_region_sd[m] <- 0
    }else {# Compute the standard deviation of region across cells
      cell_region_sd[m] <- stats::sd(sapply(tmp, function(x) mean(x[,2])))
    }
  }
  # Keep only highly varying genomic regions
  ind <- which(cell_region_sd > min_var)

  # Subset according to methylation variability
  if (!is.null(obj$anno_region)) {
    obj$anno_region <- obj$anno_region[ind,]
  }
  obj$met <- lapply(obj$met, function(x) x[ind])
  return(obj)
}
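
# Typical filtering pipeline sketch, chaining the three filters above
# (thresholds are illustrative):
#   obj <- filter_by_cpg_coverage(melissa_encode_dt, min_cpgcov = 10)
#   obj <- filter_by_coverage_across_cells(obj, min_cell_cov_prcg = 0.5)
#   obj <- filter_by_variability(obj, min_var = 0.1)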