R/normalize_data.R

Defines functions normalize_data

Documented in normalize_data

#' @title normalize_data
#' @description Data normalization.
#' @author Xiaotao Shen
#' \email{shenxt1990@@outlook.com}
#' @param object A mass_dataset object.
#' @param method Normalization method, mean, median, total svr or loess,
#' default is svr. Please see the details.
#' @param keep_scale Remain scale or not. Default is TRUE.
#' @param optimization TRUE or not.
#' @param pqn_reference for pqn method.
#' @param begin 0.5
#' @param end 1
#' @param step 0.2
#' @param multiple multiple
#' @param threads 4
#' @export
#' @return A new mass_dataset object.
#' @examples
#'\dontrun{
#' data("object1", package = "demodata")
#' object1 = impute_mv(object1, method = "minimum")
#' object_mean = normalize_data(object = object1, method = "mean")
#' }

normalize_data <-
  function(object,
           method = c("svr", "total", "median", "mean", "pqn", "loess"),
           keep_scale = TRUE,
           optimization = TRUE,
           pqn_reference = c("median", "mean"),
           begin = 0.5,
           end = 1,
           step = 0.2,
           multiple = 1,
           threads = 4) {
    method <- match.arg(method)
    pqn_reference <- match.arg(pqn_reference)
    
    massdataset::check_object_class(object = object, class = "mass_dataset")
    
    if (method == "svr" | method == "loess") {
      check_result <-
        check_for_qc_normalization(object = object)
      if (length(grep("error", check_result)) > 0) {
        stop(check_result)
      }
    }
    
    expression_data <-
      as.data.frame(object@expression_data)
    
    if (sum(is.na(expression_data)) > 0) {
      stop("Please impute MV first.\n")
    }
    
    ##processing information
    ####add parameters
    process_info <- object@process_info
    
    ##sample-wise methods
    if (method == "total") {
      new_expression_data <-
        normalize_data_total(x = expression_data, keep_scale = keep_scale)
      
      object@expression_data <-
        as.data.frame(new_expression_data)
      
      parameter <- new(
        Class = "tidymass_parameter",
        pacakge_name = "masscleaner",
        function_name = "normalize_data()",
        parameter = list(method = method,
                         keep_scale = keep_scale),
        time = Sys.time()
      )
    }
    
    if (method == "mean") {
      new_expression_data <-
        normalize_data_mean(x = expression_data, keep_scale = keep_scale)
      object@expression_data <-
        as.data.frame(new_expression_data)
      
      parameter <- new(
        Class = "tidymass_parameter",
        pacakge_name = "masscleaner",
        function_name = "normalize_data()",
        parameter = list(method = method,
                         keep_scale = keep_scale),
        time = Sys.time()
      )
    }
    
    if (method == "median") {
      new_expression_data <-
        normalize_data_median(x = expression_data, keep_scale = keep_scale)
      object@expression_data <-
        as.data.frame(new_expression_data)
      
      parameter <- new(
        Class = "tidymass_parameter",
        pacakge_name = "masscleaner",
        function_name = "normalize_data()",
        parameter = list(method = method,
                         keep_scale = keep_scale),
        time = Sys.time()
      )
    }
    
    ####-----------------------------------------------------------------------
    ##pqn (Probabilistic Quotient Normalization) method
    if (method == "pqn") {
      pgn_reference_sample <-
        which(object@sample_info$class == "QC")
      if (length(pgn_reference_sample) == 0) {
        pgn_reference_sample <- NULL
      }
      new_expression_data <-
        normalize_data_pqn(
          x = expression_data,
          pqn_reference = pqn_reference,
          pgn_reference_sample = pgn_reference_sample
        )
      
      object@expression_data <- as.data.frame(new_expression_data)
      
      parameter <- new(
        Class = "tidymass_parameter",
        pacakge_name = "masscleaner",
        function_name = "normalize_data()",
        parameter = list(
          method = method,
          keep_scale = keep_scale,
          pqn_reference = pqn_reference,
          pgn_reference_sample = pgn_reference_sample
        ),
        time = Sys.time()
      )
      
    }
    
    #######loess normalization
    sample_info <-
      object@sample_info
    
    if (all(colnames(object@sample_info) != "batch")) {
      sample_info$batch <- 1
    }
    
    if (method == "loess") {
      if (!"Subject" %in% unique(sample_info$class)) {
        stop("No Subject in your sample_info column 'class'")
      }
      
      data_nor <-
        purrr::map(unique(sample_info$batch), function(batch_idx) {
          message(crayon::yellow("Batch", batch_idx, "..."))
          subject_id <-
            sample_info %>%
            dplyr::filter(class == "Subject" &
                            batch == batch_idx) %>%
            dplyr::pull(sample_id)
          
          subject_idx <- match(subject_id, sample_info$sample_id)
          
          qc_id <-
            sample_info %>%
            dplyr::filter(class == "QC" & batch == batch_idx) %>%
            dplyr::pull(sample_id)
          
          qc_idx <- match(qc_id, sample_info$sample_id)
          
          subject_data <-
            expression_data[, subject_idx, drop = FALSE]
          qc_data <- expression_data[, qc_idx, drop = FALSE]
          
          subject_order <- sample_info$injection.order[subject_idx]
          qc_order <- sample_info$injection.order[qc_idx]
          
          new_data <-
            normalize_data_loess(
              subject_data = subject_data,
              qc_data = qc_data,
              subject_order = subject_order,
              qc_order = qc_order,
              optimization = optimization,
              begin = begin,
              end = end,
              step = step,
              threads = threads
            )
          
          new_data <-
            new_data %>%
            dplyr::bind_cols()
          
          new_data
        })
      
      data_nor <-
        data_nor %>%
        dplyr::bind_cols()
      
      new_expression_data <-
        expression_data
      
      new_expression_data[, colnames(data_nor)] <-
        data_nor
      
      object@expression_data <-
        as.data.frame(new_expression_data)
      
      parameter <- new(
        Class = "tidymass_parameter",
        pacakge_name = "masscleaner",
        function_name = "normalize_data()",
        parameter = list(
          method = method,
          keep_scale = keep_scale,
          optimization = optimization,
          begin = begin,
          end = end,
          step = step,
          threads = threads
        ),
        time = Sys.time()
      )
    }
    
    if (method == "svr") {
      if (!"Subject" %in% unique(sample_info$class)) {
        stop("No Subject in your sample_info column 'class'")
      }
      
      sample_info <-
        sample_info %>%
        dplyr::mutate(batch = as.character(batch))
      
      sample_info$batch[sample_info$class != "QC" &
                          sample_info$class != "Subject"] <-
        NA
      
      sample_info$batch[is.na(sample_info$batch)] <- "NA"
      
      data_nor <-
        purrr::map(unique(sample_info$batch), function(batch_idx) {
          message(crayon::yellow("Batch", batch_idx, "..."))
          if (batch_idx == "NA") {
            new_data <-
              expression_data[, sample_info$batch == batch_idx]
            return(new_data)
          }
          subject_id <-
            sample_info %>%
            dplyr::filter(class == "Subject" &
                            batch == batch_idx) %>%
            dplyr::pull(sample_id)
          
          subject_idx <- match(subject_id, sample_info$sample_id)
          
          qc_id <-
            sample_info %>%
            dplyr::filter(class == "QC" & batch == batch_idx) %>%
            dplyr::pull(sample_id)
          
          qc_idx <- match(qc_id, sample_info$sample_id)
          
          subject_data <-
            expression_data[, subject_idx, drop = FALSE]
          qc_data <- expression_data[, qc_idx, drop = FALSE]
          
          subject_order <- sample_info$injection.order[subject_idx]
          qc_order <- sample_info$injection.order[qc_idx]
          
          new_data <-
            normalize_data_svr(
              subject_data = subject_data,
              qc_data = qc_data,
              subject_order = subject_order,
              qc_order = qc_order,
              multiple = multiple,
              threads = threads
            )
          
          new_data <-
            new_data %>%
            dplyr::bind_cols()
          
          new_data
        })
      
      data_nor <-
        data_nor %>%
        dplyr::bind_cols()
      
      new_expression_data <-
        as.data.frame(expression_data)
      
      new_expression_data[, colnames(data_nor)] <-
        data_nor
      
      object@expression_data <-
        as.data.frame(new_expression_data)
      
      parameter <- new(
        Class = "tidymass_parameter",
        pacakge_name = "masscleaner",
        function_name = "normalize_data()",
        parameter = list(
          method = method,
          keep_scale = keep_scale,
          multiple = multiple,
          threads = threads
        ),
        time = Sys.time()
      )
      
    }
    
    
    ###process_info
    if (all(names(process_info) != "normalize_data")) {
      process_info$normalize_data <- parameter
    } else{
      process_info$normalize_data <- c(process_info$normalize_data,
                                       parameter)
    }
    
    object@process_info <- process_info
    return(object)
  }
tidymass/masscleaner documentation built on Sept. 4, 2023, 3:21 a.m.