R/createDesign.R
In ENCODExplorer: A compilation of ENCODE metadata

#' Create a design for the files associated with the result of a queryEncode,
#'  fuzzySearch research or a \code{data.table} from createDesign.
#' 
#' @return is a \code{data.table} with files for all the experiments or 
#' a \code{list} of \code{data.table} with all the file per experiment when
#' the parameter split is set to \code{TRUE}
#'
#' @param input The \code{data.table} created by a queryEncode or
#' searchEncode research, or a 
#' @param df The \code{data.table} used to extract the files link.
#' Default :\code{get_encode_df()}
#' @param split Allow to the function to return a \code{list} of \code{data.table}
#' where each \code{data.table} contain the files for a single experiment
#' Default: \code{FALSE}.
#' @param fileFormat A string that correspond to the type of the files 
#' that need to be extracted.
#' Default: bam
#' @param dataset_type A string that correspong to the type of dataset that
#' will be extrated.
#' Default: experiments
#' @param format The format (long or wide) to represent the data. The 'long' format
#' will contain three columns (File, Experiment, Value). The 'wide' format
#' organize the data as an array with the experiments as columns and files as rows.
#' Default: long
#' @param output_type The type of output of the function, can be \code{data.table}
#' or a \code{data.table}
#' Default: \code{data.table}
#' @param ID A two element numeric vector, that first element is the value assign
#' to replicate and the second is the value assign to control.
#' Default: 1 and 2 
#' @examples
#' # You will need to replace get_encode_df_demo() with your own encode_df object,
#' # the get_encode_df() function or the get_encode_df_full() function.
#' fuzzy_result <- fuzzySearch(searchTerm = "brca",
#' database=get_encode_df_demo(), filterVector ="target")
#' design_result <- createDesign(input = fuzzy_result,df=get_encode_df_demo(),
#' fileFormat="fastq")
#' 
#' @import data.table
#' @import stringr 
#' @importFrom tidyr spread
#' @importFrom dplyr filter
#' @importFrom dplyr setdiff
#' 
#' @export
createDesign <- function (input=NULL, df=get_encode_df(), split=FALSE, fileFormat="bam",
                          dataset_type="experiments", format="long",
                          output_type="data.table", ID=c(1,2)){
  stopifnot(class(input) %in% c("data.table", "data.frame"))
  stopifnot(output_type %in% c("data.frame", "data.table"))
  
  if(length(ID) != 2) {
      stop("ID must be of length 2.", call.=FALSE)
  }
  stopifnot(format %in% c("wide", "long"))
  
  design <- data.table()
  
  if(is.data.frame(input)){
      input <- as.data.table(input)
  }
  
  if(!(dataset_type %in% df$dataset_type)) {
      stop("dataset_type must be a value present within df$dataset_type.", call.=FALSE)
  }
  
  if(!(fileFormat %in%  unique(df$file_format))) {
      stop("fileFormat must be a value present within df$file_format.", call.=FALSE)
  }
  
  #Filtering character type of ID
  if(is.character(ID)){
      if(sum(as.numeric((is.na(suppressWarnings(as.numeric(ID))))))>0){
          warning("Error : Invalid type of ID, must be numeric value. Default settings will be use",
                  call. = FALSE)
          ID <- c(1,2)
      }else{
        ID <- as.numeric(ID)
      }
  }
  
  #Before we start creating the design, we have to make sure the input object
  #has file for the given fileFormat
  if (nrow(dplyr::filter(input, file_format == fileFormat)) == 0) {
    msg <- c("Error : there is no ", fileFormat, " file available ")
    warning(msg, call. = FALSE)
    return(data.table(File=character(), Experiement=character(), Value=c()))
  }
  
  
  # The first step is to clean the main df to avoid having to do this later
  ff <- fileFormat
  dt <- dataset_type
  clean_df <- df[file_format %in% ff & status == "released" & dataset_type == dt,] 

  # Get experiment/controls matches.
  ## One challenge is that the same control can be used by more than one experiment.
  ## Another challenge is that some experiment don't have a matching control.
  ## Finally, there can also be more than one control per experiment (separated by "; ").
  replicates <- clean_df[accession %in% input$accession, unique(accession)]
  i <- match(replicates, clean_df$accession)
  ctrl <- clean_df[i, controls]
  ctrl <- unlist(strsplit(ctrl, "; "))
  ctrl <- ctrl[!is.na(ctrl)]
  ## The counts variable is the same length as the replicates variable and will contain the
  ## number of controls for each experiment.
  counts <- stringr::str_count(clean_df[i, controls], ";") + 1
  counts[is.na(counts)] <- 0
  ## ctrl is a data.table with all the replicates and their matching controls,
  ## with only one experiment/control combination. The trick is that if we have a count
  ## of zero, the corresponding experiment will be removed from the vector after the rep
  ## function call.
  ctrl <- data.table(replicates = rep(replicates, counts), ctrl)

  # Create the ctrl part of the design
  get_ctrl_design <- function(dt) {
      File <- clean_df[accession == dt$ctrl, basename(href)]
      if (length(File) > 0) {
          data.table(File = File, Experiment = dt$replicates,
                     Value = ID[2])
      } else {
          data.table(File = character(), Experiment = character(), Value = numeric())
      }
  }
  
  design_ctrl <- ctrl[,get_ctrl_design(.SD), by = .(replicates, ctrl),
               .SDcols = c("replicates","ctrl")]
  design_ctrl <- design_ctrl[, `:=`(replicates = NULL, ctrl = NULL)]

  # Create the replicate part of the design
  design_replicates <- clean_df[accession %in% input$accession,
                         .(File = basename(href), Experiment = accession, Value = ID[1])]

  # Merge the two parts
  design <- rbind(design_replicates, design_ctrl)
  setorder(design,"Experiment")
  # Post-processing
  if (output_type == "data.frame") {
      design <- as.data.frame(design)
  }
  if (split) {
      design <- split(design, design$Experiment)
  }
  if (format == "wide") {
    if (is.data.table(design)) {
      design <- tidyr::spread(design, key = Experiment, value = Value)
    } else {
      design <- lapply(design, tidyr::spread, key = Experiment, value = Value)
    }
  }
  
  if(output_type == "data.frame"){
     return(as.data.frame(design))
  }else{
     return(design) 
  }
  
}