R/formatData.R
In BgeeDB: Annotation and gene expression data retrieval from Bgee database. TopAnat, an anatomical entities Enrichment Analysis tool for UBERON ontology

Documented in formatData

#' @title Format RNA-seq or Affymetrix data downloaded from Bgee.
#'
#' @description This function formats the data downloaded with the getData() function into an object of the Bioconductor "expressionSet" Class.
#'
#' @param myBgeeObject A Reference Class Bgee object, notably specifying the targeted species and data type.
#'
#' @param data A list of data frames including data from multiple experiments, or a data frame including data from a single experiment.
#'
#' @param stats A character indicating what expression values should be used in the formatted data expressionSet object matrix.
#'  \itemize{
#'    \item{"rpkm" for RNA-seq (Bgee release 13.2 and before)}
#'    \item{"fpkm" for RNA-seq (Bgee release 14 and above)}
#'    \item{"counts" for RNA-seq}
#'    \item{"tpm" for RNA-seq (Bgee release 14 and above)}
#'    \item{"intensities" for Affymetrix microarrays}
#'  }
#'
#' @param callType A character indicating whether intensities should be displayed only for present (i.e., expressed) genes, present high quality genes, or all genes (default).
#'  \itemize{
#'    \item{"present"}
#'    \item{"present high quality"}
#'    \item{"all"}
#'  }
#'
#' @return If data was a list of data frames from multiple experiments, returns a list of ExpressionSet objects. If data was a data frame from a single experiment, returns an ExpressionSet object.
#'
#' @author Andrea Komljenovic and Julien Roux.
#'
#' @examples{
#'   bgee <- Bgee$new(species = "Mus_musculus", dataType = "rna_seq")
#'   dataMouseGSE43721 <- getData(bgee, experimentId = "GSE43721")
#'   dataMouseGSE43721.fpkm <- formatData(bgee,
#'                                        dataMouseGSE43721,
#'                                        callType = "present",
#'                                        stats = "fpkm")
#' }
#'
#' @importFrom dplyr %>%
#' @importFrom tidyr spread_
#' @export

formatData = function(myBgeeObject, data, stats = NULL, callType = "all"){

  ## check that fields of Bgee object are not empty
  if (length(myBgeeObject$quantitativeData) == 0 | length(myBgeeObject$dataType) == 0){
    stop("ERROR: there seems to be a problem with the input Bgee class object, some fields are empty. Please check that the object is valid.")
  }

  ## Check that download of data is possible for targeted species and data type
  if (myBgeeObject$quantitativeData != TRUE){
    stop("ERROR: formatting quantitative data is not possible for the species and data type of the input Bgee class object.")
  }

  if (length(stats) == 0){
    if (myBgeeObject$dataType == "affymetrix"){
      cat("\nWARNING: stats parameter set to \"intensities\" for the next steps.\n")
      stats <- "intensities"
    } else if (myBgeeObject$dataType == "rna_seq"){
      stop("Please specify the stats parameters. Should be set to \"rpkm\", \"counts\" or \"tpm\"")
    }
  } else if (myBgeeObject$dataType == "affymetrix" & stats != "intensities"){
    cat("\nWARNING: For Affymetrix microarray data, stats parameter can only be set to \"intensities\". This will be used for the next steps.\n")
    stats <- "intensities"
  } else if (myBgeeObject$dataType == "rna_seq" & compareVersion(gsub("_", ".", myBgeeObject$release), "13.2") <= 0 & !(stats %in% c('rpkm', 'counts'))){
    stop("Choose whether data formatting should create a matrix of RPKMs or read counts, with stats option set as \"rpkm\" or \"counts\"")
  } else if (myBgeeObject$dataType == "rna_seq" & compareVersion(gsub("_", ".", myBgeeObject$release), "13.2") > 0 & !(stats %in% c('fpkm', 'counts', 'tpm'))){
    stop("Choose whether data formatting should create a matrix of FPKMs, TPMs or read counts, with stats option set as \"fpkm\", \"tpm\" or \"counts\"")
  }

  if(!(callType %in% c('present','present high quality','all'))){
    stop("Choose between displaying intensities for present genes, present high quality genes or all genes, e.g., 'present', 'present high quality', 'all' ")
  }

  if(length(data) == 1) data[[1]] else data

  cat("\nExtracting expression data matrix...\n")
  if(stats  == "rpkm"){
    columns <- c("Library.ID", "Gene.ID", "RPKM")
    expr <- .extract.data(data, columns, callType)
  } else if(stats  == "fpkm"){
    columns <- c("Library.ID", "Gene.ID", "FPKM")
    expr <- .extract.data(data, columns, callType)
  } else if(stats  == "tpm"){
    columns <- c("Library.ID", "Gene.ID", "TPM")
    expr <- .extract.data(data, columns, callType)
  } else if (stats == "counts"){
    columns <- c("Library.ID", "Gene.ID", "Read.count")
    expr <- .extract.data(data, columns, callType)
  } else {
    columns <- c("Chip.ID", "Probeset.ID", "Log.of.normalized.signal.intensity", "Gene.ID")
    expr <- .extract.data(data, columns, callType)
  }
  ## one data matrix
  if(is.data.frame(expr$assayData)){

    ## sort objects to have samples in the same order
    expr$assayData <- expr$assayData[, order(names(expr$assayData))]
    expr$pheno <- expr$pheno[order(sampleNames(expr$pheno))]

    ## create ExpressionSet object
    eset <- new('ExpressionSet',
                exprs=as.matrix(expr$assayData),
                phenoData=expr$pheno,
                featureData=expr$features)
  } else if(is.list(expr$assayData)){
    ## multiple data matrices
    eset <- mapply(function(x,y,z){
      ## sort objects to have samples in the same order
      x <- x[, order(names(x))]
      y <- y[order(sampleNames(y))]

      ## create ExpressionSet object
      new('ExpressionSet',
          exprs=as.matrix(x),
          phenoData=y,
          featureData=z)
    },
    expr$assayData, expr$pheno, expr$features)
  }
  return(eset)
}

## Take in unformatted data downloaded from Bgee and outputs a list of expression matrices, phenotype annotations, feature annotations, and calls.
.extract.data <- function(data, columns, callType){
  ## if multiple data frames (multiple experiments or chips)
  if(class(data) == "list"){
    calls <- lapply(data, function(x) .calling(x, callType, columns[3]))
    expr <- lapply(calls,
                   function(x) {
                     ## subset the data to keep relevant columns
                     xt <- x[, columns[1:3]]
                     ## from sample and feature columns, create a matrix with features as rows and samples as columns
                     xtt <- xt %>% spread_(columns[1], columns[3])
                     rownames(xtt) <- xtt[, columns[2]]
                     ## Remove feature column to keep only data
                     xtt[,-1, drop = FALSE]
                   }
    )
    cat("\nExtracting features information...\n")
    features <- mapply(.extract.data.feature, calls, expr, rep(list(columns), times=length(calls)))
    cat("\nExtracting samples information...\n")
    phenos <- mapply(.extract.data.pheno, calls, rep(list(columns[1]), times=length(calls)))
  } else {
    ## if only a single dataframe
    calls <- .calling(data, callType, columns[3])
    ## subset the data to keep relevant columns
    xt <- calls[, columns[1:3]]
    xtt <- xt %>% spread_(columns[1], columns[3])
    rownames(xtt) <- xtt[, columns[2]]
    ## Remove feature column to keep only data
    expr <- xtt[,-1, drop = FALSE]

    cat("\nExtracting features information...\n")
    features <- .extract.data.feature( calls, expr, columns)
    cat("\nExtracting samples information...\n")
    phenos <- .extract.data.pheno( calls, columns[1])
  }
  return(list(assayData = expr, pheno = phenos, features = features, calls = calls))
}

# Extract feature data (probesets or genes)
.extract.data.feature <- function(calls, expr, columns){
  ## RNA-seq
  if(length(columns) == 3){
    fdata <- calls[match(rownames(expr), calls[, columns[2]]), columns[2], drop = FALSE]
  }
  ## Affymetrix, 4 columns
  else if(length(columns) == 4){
    fdata <- calls[match(rownames(expr), calls[, columns[2]]), columns[c(2,4)], drop = FALSE]
  }
  rownames(fdata) <- fdata[, columns[2]]
  fdata <- as(fdata, "AnnotatedDataFrame")
  return(fdata)
}

# Extract annotation of samples
.extract.data.pheno <- function(calls, column){
  phdata <- calls[, c(column, "Anatomical.entity.ID", "Anatomical.entity.name", "Stage.ID", "Stage.name")]
  phdata <- phdata[!duplicated(phdata[, column]), ]
  rownames(phdata) <- phdata[, column]
  phdata <- as.data.frame(phdata)
  metadata <- data.frame(labelDescription=colnames(phdata),
                         row.names=colnames(phdata))
  phenodata <- new("AnnotatedDataFrame",
                   data=phdata,
                   varMetadata=metadata
  )
  return(phenodata)
}

.calling <- function(x, callType, column){
  ## check data type
  if(callType == "present"){
    cat("  Keeping only present genes.\n")
    x[(x$Detection.flag == "absent"), column] <- NA
  } else if (callType == "present high quality"){
    cat("  Keeping only present high quality genes.\n")
    x[which(x$Detection.flag == "absent" | x$Detection.quality == "poor quality"), column] <- NA
  }
  return(x)
}

Any scripts or data that you put into this service are public.

BgeeDB documentation built on Nov. 8, 2020, 8:21 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

BgeeDB
Annotation and gene expression data retrieval from Bgee database. TopAnat, an anatomical entities Enrichment Analysis tool for UBERON ontology

R/formatData.R
In BgeeDB: Annotation and gene expression data retrieval from Bgee database. TopAnat, an anatomical entities Enrichment Analysis tool for UBERON ontology

Defines functions .calling .extract.data.pheno .extract.data.feature .extract.data formatData

Documented in formatData

Try the BgeeDB package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

BgeeDB Annotation and gene expression data retrieval from Bgee database. TopAnat, an anatomical entities Enrichment Analysis tool for UBERON ontology

R/formatData.R In BgeeDB: Annotation and gene expression data retrieval from Bgee database. TopAnat, an anatomical entities Enrichment Analysis tool for UBERON ontology

Defines functions .calling .extract.data.pheno .extract.data.feature .extract.data formatData

Documented in formatData

Try the BgeeDB package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

BgeeDB
Annotation and gene expression data retrieval from Bgee database. TopAnat, an anatomical entities Enrichment Analysis tool for UBERON ontology

R/formatData.R
In BgeeDB: Annotation and gene expression data retrieval from Bgee database. TopAnat, an anatomical entities Enrichment Analysis tool for UBERON ontology