R/formatData.R

Defines functions .calling .extract.data.pheno .extract.data.feature .extract.data formatData

Documented in formatData

#' @title Format RNA-seq or Affymetrix data downloaded from Bgee.
#'
#' @description This function formats the data downloaded with the getData() function into an object of the Bioconductor "expressionSet" Class.
#'
#' @param myBgeeObject A Reference Class Bgee object, notably specifying the targeted species and data type.
#'
#' @param data A list of data frames including data from multiple experiments, or a data frame including data from a single experiment.
#'
#' @param stats A character indicating what expression values should be used in the formatted data expressionSet object matrix.
#'  \itemize{
#'    \item{"rpkm" for RNA-seq}
#'    \item{"counts" for RNA-seq}
#'    \item{"tpm" for RNA-seq (Bgee release 14 and above)}
#'    \item{"intensities" for Affymetrix microarrays}
#'  }
#'
#' @param callType A character indicating whether intensities should be displayed only for present (i.e., expressed) genes, present high quality genes, or all genes (default).
#'  \itemize{
#'    \item{"present"}
#'    \item{"present high quality"}
#'    \item{"all"}
#'  }
#'
#' @return If data was a list of data frames from multiple experiments, returns a list of ExpressionSet objects. If data was a data frame from a single experiment, returns an ExpressionSet object.
#'
#' @author Andrea Komljenovic 
#'
#' @examples{
#'   bgee <- Bgee$new(species = "Mus_musculus", dataType = "rna_seq")
#'   dataMouseGSE30617 <- getData(bgee, experimentId = "GSE30617")
#'   dataMouseGSE30617.rpkm <- formatData(bgee, dataMouseGSE30617, callType = "present", stats = "rpkm")
#' }
#'
#' @importFrom dplyr %>%
#' @importFrom tidyr spread_
#' @export

formatData = function(myBgeeObject, data, stats = NULL, callType = "all"){

  ## check that fields of Bgee object are not empty
  if (length(myBgeeObject$quantitativeData) == 0 | length(myBgeeObject$dataType) == 0){
    stop("ERROR: there seems to be a problem with the input Bgee class object, some fields are empty. Please check that the object is valid.")
  }

  ## Check that download of data is possible for targeted species and data type
  if (myBgeeObject$quantitativeData != TRUE){
    stop("ERROR: formatting quantitative data is not possible for the species and data type of the input Bgee class object.")
  }

  if (length(stats) == 0){
    if (myBgeeObject$dataType == "affymetrix"){
      cat("\nWARNING: stats parameter set to \"intensities\" for the next steps.\n")
      stats <- "intensities"
    } else if (myBgeeObject$dataType == "rna_seq"){
      stop("Please specify the stats parameters. Should be set to \"rpkm\", \"counts\" or \"tpm\"")
    }
  } else if (myBgeeObject$dataType == "affymetrix" & stats != "intensities"){
    cat("\nWARNING: For Affymetrix microarray data, stats parameter can only be set to \"intensities\". This will be used for the next steps.\n")
    stats <- "intensities"
  } else if (myBgeeObject$dataType == "rna_seq" & myBgeeObject$release == "13_2" & !(stats %in% c('rpkm', 'counts'))){
    stop("Choose whether data formatting should create a matrix of RPKMs or read counts, with stats option set as \"rpkm\" or \"counts\"")
  } else if (myBgeeObject$dataType == "rna_seq" & !(stats %in% c('rpkm', 'counts', 'tpm'))){
    stop("Choose whether data formatting should create a matrix of RPKMs, TPMs or read counts, with stats option set as \"rpkm\", \"tpm\" or \"counts\"")
  }

  if(!(callType %in% c('present','present high quality','all'))){
    stop("Choose between displaying intensities for present genes, present high quality genes or all genes, e.g., 'present', 'present high quality', 'all' ")
  }

  if(length(data) == 1) data[[1]] else data

  cat("\nExtracting expression data matrix...\n")
  if(stats  == "rpkm"){
    columns <- c("Library.ID", "Gene.ID", "RPKM")
    expr <- .extract.data(data, columns, callType)
  } else if(stats  == "tpm"){
    columns <- c("Library.ID", "Gene.ID", "TPM")
    expr <- .extract.data(data, columns, callType)
  } else if (stats == "counts"){
    columns <- c("Library.ID", "Gene.ID", "Read.count")
    expr <- .extract.data(data, columns, callType)
  } else {
    columns <- c("Chip.ID", "Probeset.ID", "Log.of.normalized.signal.intensity", "Gene.ID")
    expr <- .extract.data(data, columns, callType)
  }
  ## one data matrix
  if(is.data.frame(expr$assayData)){

    ## sort objects to have samples in the same order
    expr$assayData <- expr$assayData[, order(names(expr$assayData))]
    expr$pheno <- expr$pheno[order(sampleNames(expr$pheno))]

    ## create ExpressionSet object
    eset <- new('ExpressionSet',
                exprs=as.matrix(expr$assayData),
                phenoData=expr$pheno,
                featureData=expr$features)
  } else if(is.list(expr$assayData)){
    ## multiple data matrices
    eset <- mapply(function(x,y,z){
      ## sort objects to have samples in the same order
      x <- x[, order(names(x))]
      y <- y[order(sampleNames(y))]

      ## create ExpressionSet object
      new('ExpressionSet',
          exprs=as.matrix(x),
          phenoData=y,
          featureData=z)
    },
    expr$assayData, expr$pheno, expr$features)
  }
  return(eset)
}

## Take in unformatted data downloaded from Bgee and outputs a list of expression matrices, phenotype annotations, feature annotations, and calls.
.extract.data <- function(data, columns, callType){
  ## if multiple data frames (multiple experiments or chips)
  if(class(data) == "list"){
    calls <- lapply(data, function(x) .calling(x, callType, columns[3]))
    expr <- lapply(calls,
                   function(x) {
                     ## subset the data to keep relevant columns
                     xt <- x[, columns[1:3]]
                     ## from sample and feature columns, create a matrix with features as rows and samples as columns
                     xtt <- xt %>% spread_(columns[1], columns[3])
                     rownames(xtt) <- xtt[, columns[2]]
                     ## Remove feature column to keep only data
                     xtt[,-1, drop = FALSE]
                   }
    )
    cat("\nExtracting features information...\n")
    features <- mapply(.extract.data.feature, calls, expr, rep(list(columns), times=length(calls)))
    cat("\nExtracting samples information...\n")
    phenos <- mapply(.extract.data.pheno, calls, rep(list(columns[1]), times=length(calls)))
  } else {
    ## if only a single dataframe
    calls <- .calling(data, callType, columns[3])
    ## subset the data to keep relevant columns
    xt <- calls[, columns[1:3]]
    xtt <- xt %>% spread_(columns[1], columns[3])
    rownames(xtt) <- xtt[, columns[2]]
    ## Remove feature column to keep only data
    expr <- xtt[,-1, drop = FALSE]

    cat("\nExtracting features information...\n")
    features <- .extract.data.feature( calls, expr, columns)
    cat("\nExtracting samples information...\n")
    phenos <- .extract.data.pheno( calls, columns[1])
  }
  return(list(assayData = expr, pheno = phenos, features = features, calls = calls))
}

# Extract feature data (probesets or genes)
.extract.data.feature <- function(calls, expr, columns){
  ## RNA-seq
  if(length(columns) == 3){
    fdata <- calls[match(rownames(expr), calls[, columns[2]]), columns[2], drop = FALSE]
  }
  ## Affymetrix, 4 columns
  else if(length(columns) == 4){
    fdata <- calls[match(rownames(expr), calls[, columns[2]]), columns[c(2,4)], drop = FALSE]
  }
  rownames(fdata) <- fdata[, columns[2]]
  fdata <- as(fdata, "AnnotatedDataFrame")
  return(fdata)
}

# Extract annotation of samples
.extract.data.pheno <- function(calls, column){
  phdata <- calls[, c(column, "Anatomical.entity.ID", "Anatomical.entity.name", "Stage.ID", "Stage.name")]
  phdata <- phdata[!duplicated(phdata[, column]), ]
  rownames(phdata) <- phdata[, column]
  phdata <- as.data.frame(phdata)
  metadata <- data.frame(labelDescription=colnames(phdata),
                         row.names=colnames(phdata))
  phenodata <- new("AnnotatedDataFrame",
                   data=phdata,
                   varMetadata=metadata
  )
  return(phenodata)
}

.calling <- function(x, callType, column){
  ## check data type
  if(callType == "present"){
    cat("  Keeping only present genes.\n")
    x[(x$Detection.flag == "absent"), column] <- NA
  } else if (callType == "present high quality"){
    cat("  Keeping only present high quality genes.\n")
    x[which(x$Detection.flag == "absent" | x$Detection.quality == "poor quality"), column] <- NA
  }
  return(x)
}
wirawara/BgeeDB documentation built on Feb. 10, 2023, 4:15 a.m.