R/count_set.R

Defines functions count_set

Documented in count_set

#' Generate a count_set summerized experiment from NanoString data
#'
#' @description Generates a summerized experiment from NanoString count
#' data in an nSolver RCC Collector Tool Format or matrix and (optional)
#' sample annotations including group, batch, and sample pairing.
#'
#' @param rccexp_dir Full path to the csv file generated by nSolver RCC
#'  Collector Tool Format Export. Default = NULL
#' @param count_data a matrix of count data; genes as rows, samples as columns.
#' Contains 3 columns called "Code_Class", "Gene_Name" & "Accession". Code_class
#' contains probe annotations "Endogenous", "Positive" and "Negative". Default = NULL
#' @param group Biological groups for each lane of counts defined by a factor
#'  or vector. Default = NULL
#' @param batch Batch information such as nanostring chip ID, defined by a
#'  factor or vector of integers. Default = NULL
#' @param pair Sample pairing defined by a factor or vector. Default = NULL
#' @param samp_id Sample IDs defined by a factor or vector. This is used to
#'  build the replicate matrix. Default = NULL
#' @param output_log File path to directory where the summarized experiment
#' will be saved
#' @param count_se File path to an already generated summarized experiment
#' @param low_reps Force Ccount_set to build if a group contains less than
#'  2 replicates. Default = FALSE
#' @param verbose Verbosity ON/OFF. Default = TRUE
#'
#' @examples
#'
#' # biological groups
#' rnf5_group <- c(rep("WT", 5), rep("KO", 5))
#'
#' # sample ids
#' rnf5_sampleid <- c("GSM3638131", "GSM3638132", "GSM3638133", "GSM3638134",
#'                   "GSM3638135", "GSM3638136", "GSM3638137", "GSM3638138",
#'                   "GSM3638139", "GSM3638140")
#'
#' # build count_set
#' rnf5_count_set <- count_set(count_data = Rnf5,
#'                             group = rnf5_group,
#'                             samp_id = rnf5_sampleid)
#'
#' @return A summarized experiment
#'
#' @export count_set
#'
#' @importFrom SummarizedExperiment colData colData<- SummarizedExperiment
#' rowData rowData<- SummarizedExperiment
#' @importFrom S4Vectors metadata metadata<- SimpleList
#' @importFrom utils read.csv

count_set <- function(rccexp_dir = NULL,
                      count_data = NULL,
                      group = NULL,
                      batch = NULL,
                      pair = NULL,
                      samp_id = NULL,
                      output_log = NULL,
                      count_se = NULL,
                      low_reps = FALSE,
                      verbose = TRUE) {


  ######## Check Inputs #######

  # check data input
  if(is.null(rccexp_dir) & is.null(count_data) & is.null(count_se)){
    stop("rccexp_dir or count_data must be specified")}

  # check rccexp_dir
  if(is.null(rccexp_dir) & is.null(count_data) & is.null(count_se)){
    stop("A full path to a csv exported by nSolver RCC Collector Tool Format
         must be provided.")}

  # check group
  if(!is.null(group)) {
    group <- as.factor(group)

    if(low_reps == FALSE & (min(summary(group)))<2){
      stop("The groups provided contain groups with less than two replicates.")
    }

  }

  # check batch
  if(!is.null(batch)) {
    batch <- as.factor(batch)
  }

  # check pair
  if(!is.null(pair)) {
    pair <- as.factor(pair)

    if((min(summary(pair)))<2) warning("The pairs provided
      contain >=1 pair with less than 2 samples.", "\n", sep="")
  }

  # check output_log
  if(is.null(output_log)){
    message("### No output directory provided. The summarized experiment will
    not be saved",
            "\n", sep=" ")}

  # check low_reps
  if(!is.null(low_reps)) {
    if(low_reps != TRUE & low_reps != FALSE) stop("options for low_reps are
                                                  TRUE & FALSE")
  }

  # check samp_id
  if(!is.null(samp_id)) {
    samp_id <- as.character(samp_id)
  }

  ####### Finished Input Checks #######

  ####### Create Count Set #######

  # if rccexp_dir is provided and is correct file type

  if(!is.null(rccexp_dir) & is.null(count_se)) {
    rccexp <- read.csv(rccexp_dir, strip.white = TRUE, check.names = FALSE)
    names(rccexp)[1:3] <-  c("Code_Class", "Gene_Name", "Accession")
    nsamples <- (ncol(rccexp)-3)

    if(verbose == TRUE) {
      message(paste("###", nsamples, "samples have been loaded", "\n",
                    sep = " "))
    }

    if(!("FOV Counted" %in% rccexp[[1]]))
        stop("The file provided is not a csv exported by nSolver RCC Collector
             Tool Format.")
  }

  #if rccexp_dir is not provided and count_data is
  if(is.null(rccexp_dir) &!is.null(count_data)) {
    nsamples <- (ncol(count_data)-3)
    if(verbose) {
      message(paste("###", nsamples, "samples have been loaded", "\n",
                    sep = " "))
    }
  }

  #if rccexp_dir & count_data is not provided but count_se is
  if(!is.null(rccexp_dir) || !is.null(count_data)){

    # check group length
    if(is.null(group)) group <- rep(1, nsamples)

    if(!is.null(group) & length(group) != nsamples){
     stop("The length of group must be equal to the number of samples.")}

    # check batch length
    if(is.null(batch)) batch = rep(1, nsamples)

    if(!is.null(batch) & length(batch) != nsamples){
     stop("The length of batch must be equal to the number of samples.")}

    # check pair length
    if(is.null(pair)) pair = rep(0, nsamples)

    if(!is.null(pair) & length(pair) != nsamples){
     stop("The length of pairs must be equal to the number of samples.")}

    # check samp_id length
    if(is.null(samp_id)) samp_id = rep(0, nsamples)

    if(!is.null(samp_id) & length(samp_id) != nsamples){
      stop("The length of replicates must be equal to the number
           of samples.")}

    # make a count table

    if(!is.null(rccexp_dir)){
      count_tab <- subset(rccexp, Code_Class == "Endogenous"|
                          Code_Class == "Endogenous1"|
                          Code_Class == "Housekeeping"|
                          Code_Class == "Positive"|
                          Code_Class == "Negative"|
                          Code_Class == "Ligation"|
                          Code_Class == "SpikeIn",
                        select = 1:ncol(rccexp))

    }

    if(!is.null(count_data)){
      count_tab <- subset(count_data, Code_Class == "Endogenous"|
                          Code_Class == "Endogenous1"|
                          Code_Class == "Housekeeping"|
                          Code_Class == "Positive"|
                          Code_Class == "Negative"|
                          Code_Class == "Ligation"|
                          Code_Class == "SpikeIn",
                        select = 1:ncol(count_data))

    }

    # make annotation table and remove annotations from count table
    annot_tab <- count_tab[, c("Code_Class", "Gene_Name", "Accession")]
    annot_tab$Code_Class <- as.factor(as.character(annot_tab$Code_Class))

    rownames(count_tab) <- count_tab$Gene_Name
    rms <- c("Code_Class", "Gene_Name", "Accession")
    count_tab <- count_tab[, !names(count_tab) %in% rms]

    #ensure counts are numeric
    count_num <- apply(count_tab, 2, as.numeric)
    rownames(count_num) <- rownames(count_tab)

    # make a sample table
    samp_tab <- as.data.frame(cbind(as.character(group),
                                    (as.character(batch)),
                                    (as.character(pair))))
    samp_tab$samp_id <- samp_id
    rownames(samp_tab) <- colnames(count_num)
    colnames(samp_tab) <- c("group", "batch", "pair", "samp_id")

    # make a summerized experiment
    se_build <-
      SummarizedExperiment(assays = SimpleList(counts = as.matrix(count_num)))

    # add phenotype data to samples
    colData(se_build) <- S4Vectors::DataFrame(samp_tab)
    rowData(se_build) <- S4Vectors::DataFrame(annot_tab)

    se_out <- se_build

    #save.se file
    if(!is.null(output_log)){
      save(se_out, file = paste(output_log, "se.R", sep=""))
      if(verbose == TRUE){
        message("### summariseExperiment saved to:", paste(output_log, "se.R",
                                                           "\n", sep=""))
        }
      }
    }


  # if a summarized experiment or file path to a se.R is provided
  if(!is.null(count_se)){
    #if file path provided
    if(is.character(count_se)){
      load(count_se)
      if(verbose == TRUE){
        message("### A summarizedExperiment has been loaded from:", count_se,
                "\n", sep="")
      }
    }

      # check the file format
      if(count_se@class != "SummarizedExperiment"){
        stop(paste("summarized file provided is not a SummarizedExperiment,
                  Please produce a SummarizedExperiment using count_set.", "\n",
                   sep=""))
      }

    se_out <- count_se

    # if already a summarized file
    if(is.character(count_se) == FALSE &&
       count_se@class == "SummarizedExperiment"){
      se_out <- count_se
      if(verbose == TRUE){
        message("### an existing summarized experiment has been provided")
      }
    }
  }

  return(se_out)

}
MarthaCooper/NanoStringClustR documentation built on June 25, 2021, 9:41 p.m.