R/ae2bioc.r

Defines functions ae2bioc

Documented in ae2bioc

# A function that takes ArrayExpress MAGETAB files for a specific experiment and returns an equivalent R object representation (NChannelSet, ExpressionSet or AffyBatch) 
# 
# Author: iemam
# Maintainer: Jose Marugan
###############################################################################

ae2bioc = function(mageFiles, dataCols=NULL, drop=TRUE){
  
  dataFiles <- NULL
  sdrf <- NULL
  idf <- NULL
  adf <- NULL
  path <- NULL
  
  dataFiles = lapply(mageFiles$rawArchive, function(x){
    if (!is_empty(x)){
      return(basename(x))
    }
  })
  if (!is_empty(mageFiles$mageTabFiles)){
    sdrf = basename(mageFiles$mageTabFiles[grep(mageFiles$mageTabFiles, pattern = "sdrf.txt$")])
    idf = basename(mageFiles$mageTabFiles[grep(mageFiles$mageTabFiles, pattern = "idf.txt$")])
  }
  else{
    stop("No SDRF file found.")
  }
  
  adf = mageFiles$adf
  path = mageFiles$path
  
  
  
  notuse = grep(dataFiles, pattern = "info.txt$|idf.txt$|processed|sdrf.txt$|.log$|RData|class|txt.magetab")
  if (length(notuse) != 0) 
    dataFiles = dataFiles[-notuse]
  dataFiles = dataFiles[dataFiles != ""]
  allDataFiles = dataFiles;
  
  if(length(dataFiles)==0)
    stop("ArrayExpress: Experiment has no raw files available. Consider using processed data instead by following procedure in the vignette")
  
  
  #check for duplicates in sdrf
  sdrfData<-read.delim(paste(path,sdrf,sep='/'), check.names=FALSE)
  if(!'Array Data File' %in% colnames(sdrfData)){
    stop("ArrayExpress: 'Array Data File' column not found in the SDRF file. Please make sure the assay is an array assay")
  }
  if(any(duplicated(sdrfData[, 'Array Data File']))){
    message("Duplicates found in SDRF file")
    #remove duplicates based on Array Data Files and update SDRF
    sdrfData<-sdrfData[!duplicated(sdrfData[, 'Array Data File']), ]
    write.table(sdrfData, file=sdrf, row.names=FALSE, quote=FALSE, sep='\t')
    message("Removed duplicates in SDRF file")
  }
  
  #read sample annotations
  ph = readPhenoData(sdrf,path)
  if(inherits(ph, 'try-error')){
    ph=NULL
    stop("ArrayExpress: Parsing SDRF failed. Please make sure SDRF file ",sdrf," exists in ",path, " and is not corrupt.")
  }
  fullPhenoData = ph;
  
  #Checks
  arrayDataCol = getSDRFcolumn("ArrayDataFile",varLabels(ph))
  #	labelCol = getSDRFcolumn("label",varLabels(ph))
  
  #ArrayDesign REF in SDRF
  adr = unique(pData(ph)[,getSDRFcolumn("ArrayDesignREF",varLabels(ph))])
  adr = adr[adr != ""]
  
  if(!all(dataFiles %in% ph[[arrayDataCol]]))
    warning("Some data files in the zip archive are missing from the SDRF. The object may not be built.")
  if(length(adr)>1)
    message("ArrayExpress: Experiment uses multiple Array Designs. A separate expressionSet will be created for each")	
  if((length(adr) == 0 || is.na(adr)) && length(grep(".cel",dataFiles, ignore.case = TRUE)) != 0)
    warning("ArrayExpress: Cannot find the array design reference in the sdrf file. The object may not be built.")
  if((length(adr) == 0 || is.na(adr)) && length(grep(".cel",dataFiles, ignore.case = TRUE)) == 0)
    stop("ArrayExpress: Cannot find the array design reference in the sdrf file. The object cannot be built.")
  
  #list of return R objects
  robjs=list();
  
  for (ad in adr){
    #Subselect SDRF data for current ArrayDesign REF
    if(length(adr)>1){
      res=getPhenoDataPerAD(ad,fullPhenoData,allDataFiles)
      dataFiles = unique(res$dataFiles)
      ph = res$pheno
    }	
    
    #read data files
    green.only = isOneChannel(sdrf,path)
    rawdata= try(readAEdata(path = path,files = dataFiles,dataCols=dataCols,green.only=green.only))
    if(inherits(rawdata, "try-error"))
      stop("ArrayExpress: Unable to read assay data")
    
    #read and match array feature metadata to raw data
    if(!inherits(rawdata,"FeatureSet")){
      adfFile = adf[grep(ad,adf)]
      features= try(readFeatures(adf=adfFile,path=path))
      if(inherits(features, "try-error")){
        warning("ArrayExpress: Unable to read feature data")
        features = NULL;
      }
    }
    
    #read experiment meta data
    experimentData = try(readExperimentData(idf=idf,path=path))
    if(inherits(experimentData, "try-error")){
      warning("ArrayExpress: Unable to read experiment data");
      experimentData = new("MIAME");
    }
    
    
    #Finally build ExpressionSet
    
    #Attach pheno and feature data to oligo::FeatureSet
    if(inherits(rawdata,"FeatureSet")){
      raweset=rawdata
      phenoData(raweset) = ph[sampleNames(rawdata)]
    }
    
    
    if(class(rawdata) == "RGList" | class(rawdata) == "EListRaw"){
      #construct nchannelset
      if(class(rawdata) == "RGList"){
        assayData = if("Rb" %in% names(rawdata)) #FIXME: keep all
          with(rawdata, assayDataNew(R = R, G = G, Rb = Rb, Gb = Gb)) #will not work if datacolumns where user specified
        else 
          with(rawdata, assayDataNew(G = G, R = R))
        
        raweset = new("NChannelSet",
                      assayData = assayData,
                      experimentData = experimentData)
      }
      
      #construct expressionSet
      if(class(rawdata) == "EListRaw"){
        assayData = with(rawdata, assayDataNew(E = E, Eb = Eb))
        raweset = new("NChannelSet",
                      assayData = assayData,
                      experimentData = experimentData)
      }
      
      #Attach pheno data
      if(!is.null(ph)){ #FIXME:?
        #imagene doesnt have targets slot
        ph = ph[rawdata$targets$FileName,]
        phenoData(raweset) = ph
      }
      
      #Attach features
      if(!is.null(rawdata$genes) && rawdata$source != "ae1"){
        features2 = new("AnnotatedDataFrame",rawdata$genes)
        featureData(raweset) = features2;
      }
      else
        if(!is.null(features))
          featureData(raweset) = features;
        #consistency of order between features in featureData and assayData is established via previous sorting of feature columns
        
        if(length(featureNames(assayData(raweset))) != length(featureNames(featureData(raweset))))
          warning("Number of features in assayData and featureData are not equal. Check control features (NA) that might have been removed from either assayData or featureData.");
        
    }
    
    
    robjs[[ad]]=raweset
    
  }
  
  if(drop && length(robjs) == 1)
    robjs = robjs[[1]]
  
  return(robjs)
}
ebi-gene-expression-group/bioconductor-ArrayExpress documentation built on April 9, 2023, 6:39 a.m.