R/import_funcs.R

Defines functions import2dDataset renameColumns .medianNormalizeRatios .checkRatioRef filterOutContaminants annotateDataList configWide2Long import2dMain TPP_importCheckConfigTable .TPP_importFct_readConfigTable .TPP_importCheckTemperatures .TPP_detectLabelColumnsInConfigTable .TPP_nonLabelColumns .TPP_checkFunctionArgs .TPP_importFct_checkConditions .TPP_importFct_checkComparisons .TPP_importFct_checkExperimentCol .TPP_importFct_rmZeroSias .TPP_replaceZeros .TPP_importFct_removeDuplicates

Documented in annotateDataList configWide2Long filterOutContaminants import2dDataset import2dMain renameColumns TPP_importCheckConfigTable

.TPP_importFct_CheckDataFormat <- function (files, dataframes, expNames){
    # internal function copied from TPP package to avoid 
    # import of non-exported package functions
    . <- NULL
    isDF <- !is.null(dataframes)
    isF <- !is.null(files)
    isBoth <- isDF & isF
    isNone <- !(isDF | isF)
    if (isBoth) {
        stop("Data import function received a",
             " filename AND a dataframe object. \n",
             "Please specify only one.")
    }
    else if (isNone) {
        stop("Data import function requires a", 
             " filename or a dataframe object. \n",
             "Please specify one.")
    }
    if (isDF) {
        isClassList <- is.list(dataframes) && !is.data.frame(dataframes)
        isClassDF <- is.data.frame(dataframes)
        if (isClassList) {
            classesInList <- dataframes %>% 
            vapply(. %>% inherits(., "data.frame"), TRUE)
            if (!all(classesInList)) {
                stop(paste("Argument 'dataframes' contains", 
                           "elements that are not of type", 
                           "'data.frame' at the following positions: "), 
                     which(!classesInList) %>% paste(collapse = ", "), ".")
            }
          }
          else if (isClassDF) {
              dataframes <- list(dataframes)
              names(dataframes) <- expNames
          }
          else {
              stop("Argument 'dataframes' must be either an object of class \n
                   'data.frame', or a list of such objects.")
          }
    }
    if (isF) {
        files <- as.character(files)
        names(files) <- expNames
    }
    return(list(files = files, dataframes = dataframes))
}

#' @importFrom utils read.delim
#' @importFrom RCurl url.exists
.TPP_importFct_readFiles <- function (files, naStrs){
  # internal function copied from TPP package to avoid 
  # import of non-exported package functions
  expNames <- names(files)
  data <- vector("list", length(files))
  names(data) <- expNames
  for (expName in expNames) {
    fTmp <- files[[expName]]
    if (file.exists(fTmp) || url.exists(fTmp)) {
      data[[expName]] <- read.delim(fTmp, as.is = TRUE, 
                                    na.strings = naStrs, quote = "")
    }
    else {
      stop("File ", fTmp, " could not be found.")
    }
  }
  return(data)
}

#' @import dplyr
.TPP_importFct_removeDuplicates <- function(inDF, refColName, 
                                           nonNAColNames, qualColName){
  # internal function copied from TPP package to avoid 
  # import of non-exported package functions
  message("Removing duplicate identifiers using quality column '", 
          qualColName, "'...")
  nonUniques <- unique(as_tibble(inDF)[duplicated(inDF[[refColName]]), 
                           refColName])
  retDF <- subset(inDF, !(get(refColName) %in% nonUniques))
  if(nrow(nonUniques)){
      for (nU in nonUniques) {
          tmpDF <- subset(inDF, get(refColName) == nU)
          nonNArows <- NULL
          for (r in seq_len(nrow(tmpDF))) {
              if (any(!is.na(tmpDF[r, nonNAColNames]))) {
                  nonNArows <- c(nonNArows, r)
              }
          }
          if (length(nonNArows) > 1) {
              if (is.null(qualColName)) {
                  useRow <- 1
              }
              else {
                  qualVals <- tmpDF[nonNArows, qualColName]
                  useRow <- match(max(qualVals), qualVals)
              }
          }
          else {
              useRow <- nonNArows[1]
          }
          retDF <- rbind(retDF, tmpDF[useRow, ])
      }
  }
  message(nrow(retDF), " out of ", nrow(inDF), 
          " rows kept for further analysis.")
  return(retDF)
}

.TPP_replaceZeros <- function(x){
  # internal function copied from TPP package to avoid 
  # import of non-exported package functions
  x[which(x == 0)] <- NA
  return(x)
}

.TPP_importFct_rmZeroSias <- function(data.list, 
                                      intensityStr){
  # internal function copied from TPP package to avoid 
  # import of non-exported package functions
  out <- lapply(names(data.list), function(l.name) {
    datTmp <- data.list[[l.name]]
    colsTmp <- colnames(datTmp)
    intensity.cols <- grep(intensityStr, colsTmp, value = TRUE)
    intensity.df <- subset(datTmp, select = intensity.cols) %>% 
      mutate_all(as.character) %>% mutate_all(as.numeric)
    new.intensity.df <- intensity.df %>% mutate_all(.TPP_replaceZeros)
    datTmp[, intensity.cols] <- new.intensity.df
    return(datTmp)
  })
  names(out) <- names(data.list)
  return(out)
}

.TPP_importFct_checkExperimentCol <- function(expCol){
  # internal function copied from TPP package to avoid 
  # import of non-exported package functions
  if (is.null(expCol)) {
    m <- paste("Config table needs an 'Experiment'", 
               "column with unique experiment IDs.")
    stop(m, "\n")
  }
  oldExpNames <- expCol
  newExpNames <- gsub("([^[:alnum:]])", "_", expCol)
  iChanged <- oldExpNames != newExpNames
  if (any(iChanged)) {
    m1 <- paste("Replaced non-alphanumeric characters", 
                "in the 'Experiment' column entries:")
    m2 <- paste("'", paste(oldExpNames[iChanged], collapse = "', '"), 
                "'\nby\n'", paste(newExpNames[iChanged], collapse = "', '"), 
                sep = "")
    message(m1, "\n", m2, "\n")
  }
  return(newExpNames)
}

.TPP_importFct_checkComparisons <- function(confgTable){
  # internal function copied from TPP package to avoid 
  # import of non-exported package functions
  expConds <- confgTable$Condition
  expNames <- confgTable$Experiment
  compCols <- grep("Comparison", colnames(confgTable), ignore.case = TRUE, 
                   value = TRUE)
  compChars <- apply(confgTable[compCols], 2, function(x) {
    length(grep("[[:alnum:]]", x, value = TRUE))
  })
  comp_unequal_two <- compChars != 2
  if (any(comp_unequal_two)) {
    warning(paste("\nThe following comparison columns could not be evaluated", 
                  "because they did not contain exactly two entries:\n\t\t"), 
            paste(compCols[comp_unequal_two], collapse = ",\n\t\t"))
  }
  validCompCols <- compCols[!comp_unequal_two]
  allCompStrs <- c()
  if (length(validCompCols)) {
    message("Comparisons will be performed between the following experiments:")
    for (colName in validCompCols) {
      current_compEntries <- confgTable[[colName]]
      current_compRows <- grep("[[:alnum:]]", current_compEntries)
      current_compExps <- expNames[current_compRows]
      compRef <- current_compExps[1]
      compTreatm <- current_compExps[2]
      if ("Condition" %in% names(confgTable)) {
        current_compConds <- expConds[current_compRows]
        if ("Vehicle" %in% current_compConds && "Treatment" %in% 
            current_compConds) {
          compRef <- current_compExps[current_compConds == 
                                        "Vehicle"]
          compTreatm <- current_compExps[current_compConds == 
                                           "Treatment"]
        }
      }
      compStr <- paste(compTreatm, "_vs_", compRef, sep = "")
      names(compStr) <- colName
      message(compStr)
      allCompStrs <- c(allCompStrs, compStr)
    }
    message("\n")
  }
  return(allCompStrs)
}

#' @importFrom stringr str_to_title
.TPP_importFct_checkConditions <- function(condInfo, 
                                          expectedLength){
  # internal function copied from TPP package to avoid 
  # import of non-exported package functions
  flagGenerateConds <- FALSE
  if (is.null(condInfo)) {
    message("No information about experimental conditions given.", 
            "Assigning NA instead.\n",
            "Reminder: recognition of Vehicle and Treatment groups", 
            "during pairwise \n",
            "comparisons is only possible when they are specified ",
            "in the config table.\n")
    condInfo <- rep(NA_character_, expectedLength)
  }
  else {
    condInfo <- as.character(condInfo) %>% 
      stringr::str_to_title()
    condLevels <- unique(condInfo)
    invalidLevels = 
      setdiff(condLevels, c("Treatment", "Vehicle"))
    if (length(invalidLevels) > 0) {
      stop("The entry '", invalidLevels, 
           paste("' in the condition column is invalid.", 
                 "Only the values 'Treatment' and", 
                 "'Vehicle' are allowed. Please correct", 
                 "this and start again."))
    }
  }
  return(condInfo)
}

.TPP_checkFunctionArgs <- 
  function(functionCall, expectedArguments){
  # internal function copied from TPP package to avoid 
  # import of non-exported package functions
  myArgs <- names(functionCall)
  lapply(expectedArguments, function(arg) {
    if (!arg %in% myArgs) {
      stop("Error in ", paste(functionCall)[1], 
           ": argument '", 
           arg, "' is missing, with no default", 
           call. = FALSE)
    }
  })
}

.TPP_nonLabelColumns <- function(){
  # internal function copied from TPP package to avoid 
  # import of non-exported package functions
  out <- data.frame(
    column = c("Experiment", "Experiment",
               "Experiment", "Path", "Path", 
               "Path", "Condition", "Replicate", 
               "Compound", "Temperature", "RefCol"), 
    type = c("TR", "CCR", "2D", "TR", "CCR", "2D", 
             "TR", "TR", "2D", "2D", "2D"), 
    obligatory = c(TRUE, TRUE, TRUE, FALSE, FALSE, 
                   FALSE, TRUE, FALSE, TRUE, TRUE, TRUE), 
    exclusive = c(FALSE, FALSE, FALSE, FALSE, FALSE, 
                  FALSE, TRUE, TRUE, TRUE, TRUE, TRUE))
  return(out)
}

.TPP_detectLabelColumnsInConfigTable <- 
  function(allColumns){
  # internal function copied from TPP package to avoid 
  # import of non-exported package functions
  .TPP_checkFunctionArgs(match.call(), c("allColumns"))
  noLabelCols <- .TPP_nonLabelColumns()$column %>% 
    as.character %>% 
    unique
  compCols <- grep("comparison", allColumns, value = TRUE, 
                   ignore.case = TRUE)
  noLabelCols <- c(noLabelCols, compCols)
  labelCols <- setdiff(allColumns, noLabelCols)
  return(labelCols)
}

.TPP_importCheckTemperatures <- function(temp){
  # internal function copied from TPP package to avoid 
  # import of non-exported package functions
  tempMatrix <- as.matrix(temp)
  rownames(tempMatrix) <- NULL
  naRows <- apply(is.na(tempMatrix), 1, all)
  if (any(naRows)) {
    stop("Row(s) ", paste(which(naRows), collapse = ", "), 
         " in the configuration table contain", 
         " only missing temperature values.")
  }
  return(tempMatrix)
}

#' @importFrom openxlsx read.xlsx
#' @importFrom utils read.table
.TPP_importFct_readConfigTable <- function(cfg){
  # internal function copied from TPP package to avoid 
  # import of non-exported package functions
  if (is.character(cfg)) {
    if (file.exists(cfg)) {
      strChunks <- strsplit(cfg, "\\.")[[1]]
      fileExtension <- strChunks[length(strChunks)]
      if (fileExtension == "txt") {
        tab <- read.table(
          file = cfg, header = TRUE, 
          check.names = FALSE, stringsAsFactors = FALSE, 
          sep = "\t")
      }
      else if (fileExtension == "csv") {
        tab <- read.table(
          file = cfg, header = TRUE, 
          check.names = FALSE, stringsAsFactors = FALSE, 
          sep = ",")
      }
      else if (fileExtension == "xlsx") {
        tab <- openxlsx::read.xlsx(cfg)
      }
      else {
        stop("Error during data import: ", cfg, 
             " does not belong to a valid configuration file.")
      }
    }
    else {
      stop("Error during data import: ", cfg, 
           " does not belong to a valid configuration file.")
    }
    cfg <- tab
  }
  return(cfg)
}

#' Import and chech configuration table
#' 
#' @param infoTable character string of a file path to
#' a config table (excel,txt or csv file) or data frame
#' containing a config table
#' @param type charater string indicating dataset type
#' default is 2D
#' 
#' @return data frame with config table
#' 
#' @examples 
#' data("config_tab")
#' TPP_importCheckConfigTable(config_tab, type = "2D")
#' @export
TPP_importCheckConfigTable <- function(infoTable, type = "2D"){
  .TPP_checkFunctionArgs(match.call(), c("infoTable", "type"))
  Experiment <- Path <- Compound <- NULL
  isValidDF <- FALSE
  if (is.data.frame(infoTable)) {
    if ((ncol(infoTable) > 1) & 
        ("Experiment" %in% colnames(infoTable))) {
      isValidDF <- TRUE
    }
  }
  if (!is.character(infoTable) & !isValidDF) {
    stop("'infoTable' must either be a data frame", 
         " with an 'Experiment' column \n",
         "and at least one isobaric label column,", 
         "or a filename pointing at a \n",
         "table that fulfills the same criteria")
  }
  isValidType <- type %in% c("2D")
  if (!isValidType) {
    stop("'type' must have this value: '2D'")
  }
  infoTable <- .TPP_importFct_readConfigTable(cfg = infoTable)
  infoTable$Experiment <- 
    .TPP_importFct_checkExperimentCol(infoTable$Experiment)
  infoTable <- subset(infoTable, Experiment != "")
  givenPaths <- NULL
  if (any("Path" %in% colnames(infoTable))) {
    if (all(infoTable$Path == "") || all(is.na(infoTable$Path))) {
      message("Removing empty 'Path' column from config table")
      infoTable <- infoTable %>% select(-Path)
    }
    else {
      givenPaths <- infoTable$Path
    }
  }
  compStrs <- NA
  infoTable$Condition <- NULL
  allCols <- colnames(infoTable)
  labelCols <- .TPP_detectLabelColumnsInConfigTable(allColumns = allCols)
  labelValues <- infoTable[, labelCols]
  labelValuesNum <- suppressWarnings(labelValues %>% apply(2, 
                                                           as.numeric))
  if (is.matrix(labelValuesNum)) {
    isInvalid <- labelValuesNum %>% apply(2, is.na) %>% apply(2, 
                                                              all)
  }
  else if (is.vector(labelValuesNum)) {
    isInvalid <- is.na(labelValuesNum)
  }
  invalidLabels <- labelCols[isInvalid]
  infoTable[, invalidLabels] <- NULL
  labelColsNew <- labelCols[!isInvalid]
  labelStr <- paste(labelColsNew, collapse = ", ")
  message("The following valid label columns were detected:\n", 
          labelStr, ".")
  if (type == "2D") {
    temperatures <- infoTable$Temperature
    if (is.null(temperatures) | length(temperatures) < 2) {
      m1 <- paste("Insufficient temperatures (<2)", 
                  "specified in config file.")
      m2 <- paste("Does your configuration table", 
                  "have the correct column names?")
      stop(m1, "\n", m2)
    }
    else if (length(which(!infoTable$RefCol %in% labelColsNew)) != 
             0) {
      stop(paste("Labels in reference column not found", 
                 "in any of the label columns."))
    }
    hasCompoundCol <- any(allCols == "Compound")
    if (!hasCompoundCol) {
      m <- paste("Config table of a 2D-TPP experiment", 
                 "needs a 'Compound' column.")
      stop(m, "\n")
    }
    else {
      infoTable <- infoTable %>% 
        mutate(Compound = 
                 gsub("([^[:alnum:]])", "_", Compound))
    }
    out <- infoTable
  }
  else {
    temperatures <- subset(infoTable, select = labelColsNew)
    tempMatrix <- .TPP_importCheckTemperatures(temp = temperatures)
    infoList <- list(
      expNames = as.character(infoTable$Experiment), 
      expCond = infoTable$Condition, files = givenPaths, 
      compStrs = compStrs, labels = labelColsNew, 
      tempMatrix = tempMatrix)
    out <- infoList
  }
  return(out)
}

#' Import 2D-TPP dataset main function
#' 
#' @param configTable character string of a file path to a config table
#' @param data possible list of datasets from different MS runs 
#' corresponding to a 2D-TPP dataset, circumvents loading datasets 
#' referencend in config table, default is NULL
#' @param idVar character string indicating which data column provides the 
#' unique identifiers for each protein.
#' @param intensityStr character string indicating which columns contain 
#' raw intensities measurements
#' @param fcStr character string indicating which columns contain the actual 
#' fold change values. Those column names containing the suffix \code{fcStr} 
#' will be regarded as containing fold change values.
#' @param naStrs character vector indicating missing values in the data table. 
#' When reading data from file, this value will be passed on to the argument 
#' \code{na.strings} in function \code{read.delim}.
#' @param addCol character string indicating additional column to import
#' @param nonZeroCols column like default qssm that should be imported and
#' requested to be non-zero in analyzed data
#' @param qualColName character string indicating which column can be used for 
#' additional quality criteria when deciding between different non-unique 
#' protein identifiers.
#' 
#' @return list of data frames containing different
#' datasets
#' 
#' @examples 
#' data("config_tab")
#' data("raw_dat_list")
#' dataList <- import2dMain(configTable = config_tab,
#'                          data = raw_dat_list,
#'                          idVar = "protein_id",
#'                          fcStr = "rel_fc_",
#'                          addCol = "gene_name",
#'                          naStrs = NA,
#'                          intensityStr = "signal_sum_",
#'                          nonZeroCols = "qusm",
#'                          qualColName = "qupm")
#' @export
import2dMain <- function(configTable, data, idVar, fcStr,
                         addCol, naStrs, intensityStr,
                         qualColName, nonZeroCols){
  
  # internal import main function, adapted from TPP package
  files <- configTable$Path
  if (!is.null(files)) {
    if (any(files == "")) {
      files <- NULL
    }
  }
  Experiment <- Compound <- Temperature <- RefCol <- NULL
  expNames <- configTable$Experiment
  argList <- .TPP_importFct_CheckDataFormat(dataframes = data, 
                                           files = files,
                                           expNames = expNames)
  data <- argList[["dataframes"]]
  files <- argList[["files"]]
  if (!is.null(files)) {
    files2 <- files[!duplicated(names(files))]
    data <- .TPP_importFct_readFiles(files = files2, 
                                    naStrs = naStrs)
  }
  iVec <- seq_len(nrow(configTable))
  dataList <- lapply(iVec, function(iTmp) {
    rowTmp <- configTable[iTmp, ]
    expTmp <- rowTmp$Experiment
    message("Importing 2D-TPP dataset: ", expTmp)
    tTmp <- rowTmp$Temperature
    dataTmp <- data[[expTmp]]
    noFCCols <- c("Compound", "Experiment", "Temperature",
                  "RefCol", "Path", "Condition")
    allCols <- colnames(rowTmp)
    labelCols <- setdiff(allCols, noFCCols)
    labelValues <- suppressWarnings(rowTmp[, labelCols] %>%
                                      as.numeric)
    labelColsNum <- labelCols[!is.na(labelValues)]
    signalCols <- paste(intensityStr, labelColsNum, sep = "")
    relevant.cols <- c(idVar, qualColName, nonZeroCols, addCol,
                       signalCols) %>% unique
    if (!is.null(fcStr)) {
      fcCols <- paste(fcStr, labelColsNum, sep = "")
      relevant.cols <- c(relevant.cols, fcCols)
      dataCols <- fcCols
    }
    else {
      dataCols <- signalCols
    }
    if (!all(relevant.cols %in% colnames(dataTmp))) {
      notFound <- paste(setdiff(relevant.cols, colnames(dataTmp)),
                        collapse = "', '")
      stop("The following columns could not be found: '",
           notFound, paste("'. Please check the suffices and the", 
                           "additional column names you have specified."))
    }
    dataFiltered <- .TPP_importFct_removeDuplicates(
      inDF = dataTmp,refColName = idVar, 
      nonNAColNames = dataCols, 
      qualColName = qualColName[1])
    idsTmp <- as.character(dataFiltered[, idVar])
    idsAnnotated <- paste(expTmp, tTmp, idsTmp, sep = "_")
    dataFinal <- dataFiltered %>% subset(select = relevant.cols) %>%
      mutate(temperature = tTmp, experiment = expTmp, unique_ID = idsAnnotated)
    return(dataFinal)
  })
  newNames <- vapply(seq(nrow(configTable)), function(iTmp) {
    rowTmp <- configTable[iTmp, ]
    tTmp <- rowTmp$Temperature
    expTmp <- rowTmp$Experiment
    newName <- paste(expTmp, tTmp, sep = "_")
    return(newName)
  }, "")
  names(dataList) <- newNames
  out <- .TPP_importFct_rmZeroSias(data.list = dataList,
                                   intensityStr = intensityStr)
  return(out)
}

#' Tranform configuration table from wide to long
#' 
#' @param configWide data frame containing a config table
#' @return data frame containing config table in long format 
#' 
#' @importFrom tidyr gather
#' @examples 
#' data("config_tab")
#' configWide2Long(configWide = config_tab)
#' 
#' @export
configWide2Long <- function(configWide){
 Path <- label <- conc <- Compound <- Experiment <- 
    Temperature <- RefCol <- NULL
  
  if(any(grepl("Path", colnames(configWide)))){
    configLong <- configWide %>%
      dplyr::select(-Path) %>%
      gather(label, conc, -Compound, 
             -Experiment, -Temperature, -RefCol) %>%
      filter(conc != "-")
  }else{
    configLong <- configWide %>%
      gather(label, conc, -Compound, 
             -Experiment, -Temperature, -RefCol) %>%
      filter(conc != "-")
  }
}

#' Annotate imported data list using a config table
#' @param dataList list of datasets from different MS runs 
#' corresponding to a 2D-TPP dataset
#' @param geneNameVar character string of the column name that describes
#' the gene name of a given protein in the raw data files
#' @param configLong long formatted data frame of a corresponding
#' config table
#' @param intensityStr character string indicating which columns contain 
#' raw intensities measurements
#' @param fcStr character string indicating which columns contain the actual 
#' fold change values. Those column names containing the suffix \code{fcStr} 
#' will be regarded as containing fold change values.
#' 
#' @return data frame containing all data annotated
#' by information supplied in the config table
#'   
#' @importFrom tidyr spread
#' 
#' @examples 
#' data("config_tab")
#' data("raw_dat_list")
#' dataList <- import2dMain(configTable = config_tab,
#'                          data = raw_dat_list,
#'                          idVar = "protein_id",
#'                          fcStr = "rel_fc_",
#'                          addCol = "gene_name",
#'                          naStrs = NA,
#'                          intensityStr = "signal_sum_",
#'                          nonZeroCols = "qusm",
#'                          qualColName = "qupm")
#' configLong <- configWide2Long(configWide = config_tab)
#' annotateDataList(dataList = dataList,
#'                  geneNameVar = "gene_name",
#'                  configLong = configLong,
#'                  intensityStr = "signal_sum_",
#'                  fcStr = "rel_fc_")
#' @export
annotateDataList <- function(dataList, geneNameVar, configLong,
                             intensityStr, fcStr){
  channel <- signal <- Temperature <- RefCol <- label <- 
    conc <- unique_ID <- spread_var <- NULL
  
  combinedTab <- bind_rows(lapply(dataList, function(dat){
    datLong <- dat %>% as_tibble() %>%
      gather(channel, signal, matches(intensityStr), matches(fcStr)) %>%
      mutate(label = gsub(fcStr, "", gsub(intensityStr, "", channel))) %>%
      left_join(configLong %>% 
                  dplyr::select(Temperature, RefCol, label, conc),
                by = c("temperature" = "Temperature", "label")) %>%
      mutate(spread_var = 
               ifelse(grepl(fcStr, channel), "rel_value", "raw_value")) %>%
      dplyr::select(-channel, -unique_ID) %>%
      spread(spread_var, signal)
  }))
  return(combinedTab)
}

#' Filter out contaminants
#' 
#' @param dataLong long format data frame of imported dataset
#' 
#' @return data frame containing full dataset filtered to 
#' contain no contaminants
#' 
#' @examples 
#' data("simulated_cell_extract_df")
#' filterOutContaminants(simulated_cell_extract_df)
#' 
#' @export
filterOutContaminants <- function(dataLong){
  # internal function to filter out contaminants
  representative <- NULL
  filter(dataLong, !grepl("##", representative))
}

.checkRatioRef <- function(dataLong, idVar, concFactor = 1e6){
  # internal function to check that protein 
  # fold changes are computed
  # relative to the correct TMT channel
  label <- RefCol <- rel_value <- raw_value <- conc <- NULL
  
  if(!all(filter(dataLong, label == RefCol)$rel_value == 1, 
          na.rm = TRUE)){
    message("Recomputing ratios!")
    dataOut <- dataLong %>%
      group_by(.dots = c(idVar, "temperature")) %>%
      mutate(rel_value = rel_value/rel_value[label == RefCol]) %>%
      ungroup %>%
      filter(!is.na(raw_value)) %>%
      mutate(conc = as.numeric(conc)) %>%
      mutate(log_conc = log10(conc/concFactor))
    
    return(dataOut)
    
  }else{
    message("Ratios were correctly computed!")
    return(dataLong %>%
             filter(!is.na(raw_value)) %>%
             mutate(conc = as.numeric(conc)) %>%
             mutate(log_conc = log10(conc/concFactor)))
  }
}

#' @importFrom stats median
.medianNormalizeRatios <- function(dataLong){
  # internal function to perform median normalization 
  # of ratios
  rel_value <- temperature <- conc <- 
    raw_rel_value <- NULL
  
  dataOut <- dataLong %>%
    rename(raw_rel_value = rel_value) %>%
    group_by(temperature, conc) %>%
    mutate(rel_value = raw_rel_value / 
             median(raw_rel_value, na.rm = TRUE)) %>%
    ungroup()
  
  return(dataOut)
}

#' Rename columns of imported data frame
#' 
#' @param dataLong long format data frame of imported dataset
#' @param idVar character string indicating which data column provides the 
#' unique identifiers for each protein.
#' @param geneNameVar character string of the column name that describes
#' the gene name of a given protein in the raw data files
#' 
#' @return data frame containing imported data with renamed
#' columns
#' 
#' @examples 
#' data("config_tab")
#' data("raw_dat_list")
#' 
#' dataList <- import2dMain(configTable = config_tab,
#'                          data = raw_dat_list,
#'                          idVar = "protein_id",
#'                          fcStr = "rel_fc_",
#'                          addCol = "gene_name",
#'                          naStrs = NA,
#'                          intensityStr = "signal_sum_",
#'                          nonZeroCols = "qusm",
#'                          qualColName = "qupm")
#' configLong <- configWide2Long(configWide = config_tab)
#' annoDat <- annotateDataList(dataList = dataList,
#'                             geneNameVar = "gene_name",
#'                             configLong = configLong,
#'                             intensityStr = "signal_sum_",
#'                             fcStr = "rel_fc_")
#' renameColumns(annoDat, 
#'               idVar = "protein_id", 
#'               geneNameVar = "gene_name")
#' @export
renameColumns <- function(dataLong, idVar, geneNameVar){
  clustername <- representative <- NULL
  
  dplyr::rename(dataLong, "representative" = idVar, 
                "clustername" = geneNameVar) %>%
    group_by(clustername) %>%
    mutate(representative =
             .paste_rmNA(unique(unlist(strsplit(representative, 
                                               split = "\\|"))), 
                        sep = "|")) %>%
    ungroup()
}

#' Import 2D-TPP dataset using a config table
#' 
#' @param configTable character string of a file path to a config table
#' @param data possible list of datasets from different MS runs 
#' corresponding to a 2D-TPP dataset, circumvents loading datasets 
#' referencend in config table, default is NULL
#' @param idVar character string indicating which data column provides the 
#'   unique identifiers for each protein.
#' @param intensityStr character string indicating which columns contain 
#'   raw intensities measurements
#' @param fcStr character string indicating which columns contain the actual 
#'   fold change values. Those column names containing the suffix \code{fcStr} 
#'   will be regarded as containing fold change values.
#' @param naStrs character vector indicating missing values in the data table. 
#'   When reading data from file, this value will be passed on to the argument 
#'   \code{na.strings} in function \code{read.delim}.
#' @param qualColName character string indicating which column can be used for 
#'   additional quality criteria when deciding between different non-unique 
#'   protein identifiers.
#' @param medianNormalizeFC perform median normalization (default: TRUE).
#' @param addCol character string indicating additional column to import
#' @param filterContaminants boolean variable indicating whether data 
#' should be filtered to exclude contaminants (default: TRUE).
#' @param nonZeroCols column like default qssm that should be imported and
#' requested to be non-zero in analyzed data
#' @param geneNameVar character string of the column name that describes
#' the gene name of a given protein in the raw data files
#' @param concFactor numeric value that indicates how concentrations need to 
#' be adjusted to yield total unit e.g. default mmol - 1e6
#' 
#' @return tidy data frame representing a 2D-TPP dataset
#' 
#' @examples 
#' data("config_tab")
#' data("raw_dat_list")
#' import_df <- import2dDataset(configTable = config_tab, 
#'                              data = raw_dat_list,
#'                              idVar = "protein_id",
#'                              intensityStr = "signal_sum_",
#'                              fcStr = "rel_fc_",
#'                              nonZeroCols = "qusm",
#'                              geneNameVar = "gene_name",
#'                              addCol = NULL,
#'                              qualColName = "qupm",
#'                              naStrs = c("NA", "n/d", "NaN"),
#'                              concFactor = 1e6,
#'                              medianNormalizeFC = TRUE,
#'                              filterContaminants = TRUE)
#' 
#' @export
import2dDataset <- function(configTable, data,
                            idVar = "representative",
                            intensityStr = "sumionarea_protein_",
                            fcStr = "rel_fc_protein_",
                            nonZeroCols = "qssm",
                            geneNameVar = "clustername",
                            addCol = NULL,
                            qualColName = "qupm",
                            naStrs = c("NA", "n/d", "NaN"),
                            concFactor = 1e6,
                            medianNormalizeFC = TRUE,
                            filterContaminants = TRUE){
  
  configWide <- TPP_importCheckConfigTable(
    infoTable = configTable, type = "2D")
  configLong <- configWide2Long(configWide = configWide)
  
  dataList <- import2dMain(configTable = configWide,
                           data = data,
                           idVar = idVar,
                           fcStr = fcStr,
                           addCol = c(geneNameVar, addCol),
                           naStrs = naStrs,
                           intensityStr = intensityStr,
                           nonZeroCols = nonZeroCols,
                           qualColName = qualColName)
  
  dataLong <- annotateDataList(dataList = dataList,
                               geneNameVar = geneNameVar,
                               configLong = configLong,
                               intensityStr = intensityStr,
                               fcStr = fcStr)
  
  dataRatioChecked <- .checkRatioRef(dataLong, idVar = idVar,
                                     concFactor = concFactor)
  
  if(medianNormalizeFC){
    message("Median normalizing fold changes...")
    dataNorm <- .medianNormalizeRatios(dataRatioChecked)
  }else{
    dataNorm <- dataRatioChecked
  }
  
  dataOut <- renameColumns(dataNorm,
                           idVar = idVar,
                           geneNameVar = geneNameVar)
  
  if(filterContaminants){
    dataOut <- filterOutContaminants(dataOut)
  }
  
  return(dataOut)
}

Try the TPP2D package in your browser

Any scripts or data that you put into this service are public.

TPP2D documentation built on Nov. 8, 2020, 4:54 p.m.