R/utils.R

Defines functions get_input_params parse_pipeline_inputs write_matrix .ge_list_to_flat_file .fix_headers .write_raw_expression log_message .get_supp_files_dir

Documented in .fix_headers .ge_list_to_flat_file .get_supp_files_dir write_matrix .write_raw_expression

#' Get supplemental files directory
#'
#' Create run-specific base directory for supplementary files.
#' Important that run specific directory is created to avoid overlap with previous runs.
#' This done with unique cell_type * arm_accession string (aka cohort_type).
#' Any spaces in cell_type or arm_accession are replaced with \code{-}
#'
#' @param analysis_dir analysis directory
#' @param gef result of ISCon$getDataset("gene_expression_files") for one run.
.get_supp_files_dir <- function(analysis_dir,
                                gef) {
  supp_files_dir <- file.path(
    analysis_dir,
    "supp_files",
    paste0(
      gsub(" ", "-", unique(gef$type)),
      "_",
      unique(gef$arm_accession)
    )
  )
  if (!dir.exists(supp_files_dir)) {
    dir.create(supp_files_dir,
      recursive = TRUE
    )
  }
  return(supp_files_dir)
}

log_message <- function(...) {
  cat(sprintf("[%s] %s\n", Sys.time(), paste0(..., collapse = "")))
}

#' Write raw expression to a file
#'
#' Writes \code{ge_tbl} to a (tsv-formatted) \code{.txt} file, using controlled
#' format and file name.
#'
#' @return path to raw expression file
#'
#' @param ge_tbl object containing gene expression values (matrix, data.frame, etc)
#' @param supp_files_dir path to base directory (via \code{.get_supp_files_dir()})
#' @param study study accession eg \code{SDY269}
.write_raw_expression <- function(ge_tbl, supp_files_dir, study) {
  input_files <- file.path(supp_files_dir, paste0(study, "_raw_expression.txt"))
  fwrite(ge_tbl, file = input_files, sep = "\t", quote = FALSE, row.names = FALSE)
  return(input_files)
}

#' fix headers
#'
#' Custom fixes for individual studies
#'
#' @param ge_list list of matrices for one study
#' @param study study accession eg \code{SDY269}
.fix_headers <- function(ge_list, study) {
  if (study == "SDY224") {
    ge_list <- lapply(ge_list, function(x) {
      setnames(x, as.character(x[1, ]))
      x <- x[-(1:2), ]
      colnames(x)[[1]] <- "ID_REF"
      return(x)
    })
  } else if (study == "SDY400") {
    # Using mapping file provided by Hailong Meng at Yale, Dec 2018
    # since note in header of file is misleading due to gsm swaps made
    # later based on knowledge of switched samples.
    ge_list <- lapply(ge_list, function(x) {
      mp <- fread("/share/files/Studies/SDY400/@files/rawdata/gene_expression/SDY400_HeaderMapping.csv")
      setnames(x, colnames(x), as.character(x[2, ]))
      x <- x[-(1:2), ]
      smpls <- grep("SAMPLE", colnames(x), value = T)
      titles <- mp$Title[match(smpls, mp$Sample)]
      setnames(x, smpls, titles)
      return(x)
    })
  } else if (study == "SDY1325") {
    ge_list <- lapply(ge_list, function(x) {
      setnames(x, colnames(x), as.character(x[5, ]))
      x <- x[6:nrow(x), ]
      return(x)
    })
  } else if (study == "SDY1324") {
    # Custom header mapping provided by authors via P.Dunn Dec 2018.
    ge_list <- lapply(ge_list, function(x) {
      mp <- fread("/share/files/Studies/SDY1324/@files/rawdata/gene_expression/raw_counts/SDY1324_Header_Mapping.csv")
      accs <- grep("V1", colnames(x), invert = TRUE, value = TRUE)
      esNms <- mp$experimentAccession[match(accs, mp$AuthorGivenId)]
      setnames(x, accs[!is.na(esNms)], esNms[!is.na(esNms)])
      return(x)
    })
  } else if (study == "SDY787") {
    # Fist number is unique id
    ge_list <- lapply(ge_list, function(x) {
      # Remove first "_" and everything following
      setnames(x, colnames(x), gsub("_.*$", "", colnames(x)))
    })
  }
  return(ge_list)
}

#' matrix list to flat file
#'
#' ge_list to flat file. Used when samples are in different files on GEO
#'
#' @return path to raw, prepped input files
#'
#' @param ge_list list of gene expression tables (matrix, data.frame, etc)
#' @param supp_files_dir path to base directory (via \code{.get_supp_files_dir()})
#' @param study study accession eg \code{SDY269}
.ge_list_to_flat_file <- function(ge_list, supp_files_dir, study) {
  ge_df <- Reduce(f = function(x, y) {
    merge(x, y)
  }, ge_list)
  input_files <- .write_raw_expression(ge_df, supp_files_dir, study)
}
#######################################
###            MAPPING              ###
#######################################


#' Write matrix to file system
#'
#' @details Writes four versions of flat tsv files:
#'   1. \code{<matrix_name>.raw.tsv}: raw, background-corrected values
#'   2. \code{<matrix_name>.tsv}: normalized values
#'   3. \code{<matrix_name>.summary.tsv}: normalized values, summarized by gene symbol
#'   (based on current annotation)
#'   4. \code{<matrix_name>.summary.orig}: normalized values, summarized by gene symbol
#'   (based on original annotation)
#'
#' @param output_dir path to output directory (directory where final matrices
#' will be written)
#' @param matrix_name Name of matrix
#' @param exprs data.table of raw (background-corrected) expression (from \code{makeRawMatrix()})
#' @param norm_exprs data.table of normalized expression (from \code{normalizeMatrix()})
#' @param sum_exprs data.table of summarized expression (from \code{summarizeMatrix()})
#' @param verbose Write verbose print statements?
#'
#' @export
write_matrix <- function(output_dir,
                         matrix_name,
                         exprs,
                         norm_exprs,
                         sum_exprs,
                         verbose = FALSE) {
  .write_em <- function(df, file_name, verbose) {
    if (verbose) log_message("Writing ", file_name, "...")
    fwrite(df,
      file = file.path(
        output_dir,
        file_name
      ),
      sep = "\t",
      quote = FALSE,
      row.names = FALSE
    )
  }

  # Raw EM
  .write_em(exprs, paste0(matrix_name, ".tsv.raw"), verbose)

  # Normalized EM
  .write_em(norm_exprs, paste0(matrix_name, ".tsv"), verbose)

  # summary EM
  .write_em(sum_exprs, paste0(matrix_name, ".tsv.summary"), verbose)

  # original summary EM assuming run created with _orig FasId
  .write_em(sum_exprs, paste0(matrix_name, ".tsv.summary.orig"), verbose)
}


parse_pipeline_inputs <- function(pipeline_inputs) {
  list(
    study = gsub("/Studies/", "", pipeline_inputs$labkey.url.path),
    matrix_name = gsub("\\.tsv", "", pipeline_inputs$output.tsv),
    selected_biosamples = pipeline_inputs$selectedBiosamples,
    fas_id = pipeline_inputs$fasId,
    labkey.url.base = pipeline_inputs$labkey.url.base,
    base_dir = pipeline_inputs$pipeline.root,
    taskOutputParams = pipeline_inputs$taskOutputParams,
    verbose = TRUE
  )
}


get_input_params <- function(con = CreateConnection(""),
                             matrix_name) {
  # First make sure the matrix exists
  stopifnot(matrix_name %in% con$listGEMatrices()$name)

  # First check for inputParams
  baseUrl <- con$config$labkey.url.base
  study <- con$listGEMatrices()[name == matrix_name, folder]
  if (Rlabkey::labkey.webdav.pathExists(
    baseUrl = con$config$labkey.url.base,
    folderPath = paste0("Studies/", study),
    remoteFilePath = file.path(
      "rawdata",
      "gene_expression",
      "create-matrix",
      matrix_name,
      "create-matrix-vars.tsv"
    ),
    fileSet = "@files"
  )) {
    local_path <- tempfile()
    Rlabkey::labkey.webdav.get(
      baseUrl = con$config$labkey.url.base,
      folderPath = paste0("Studies/", study),
      remoteFilePath = file.path(
        "rawdata",
        "gene_expression",
        "create-matrix",
        matrix_name,
        "create-matrix-vars.tsv"
      ),
      localFilePath = local_path,
      fileSet = "@files"
    )
    input_params <- fread(local_path)
    return(parse_pipeline_inputs(input_params))
  } else {
    # Get biosamples and fasID
    input_samples <- Rlabkey::labkey.selectRows(
      baseUrl = con$config$labkey.url.base,
      folderPath = paste0("Studies/", study),
      schemaName = "assay.ExpressionMatrix.matrix",
      queryName = "inputSamples_computed",
      colSelect = c("Biosample", "Run/featureSet"),
      colFilter = makeFilter(c("Run/Name", "EQUAL", matrix_name)),
      colNameOpt = "fieldname"
    )
    selected_biosamples <- paste0(input_samples$Biosample, collapse = ",")
    fas_id <- unique(input_samples$`Run/featureSet`)
    return(list(
      study = study,
      matrix_name = matrix_name,
      selected_biosamples = selected_biosamples,
      fas_id = fas_id,
      labkey.url.base = baseUrl,
      base_dir = file.path("/share", "files", "Studies", study, "@files"),
      verbose = TRUE
    ))
  }
}
RGLab/HIPCMatrix documentation built on Jan. 29, 2023, 5:13 a.m.