R/cbaf-obtainMultipleStudies.R

Defines functions obtainMultipleStudies

Documented in obtainMultipleStudies

#' @title Obtain the requested data for various cancer studies.
#'
#' @description This function Obtains the requested data for the given genes
#' across multiple cancer studies. It can check whether or not all genes are
#' included in cancer studies and, if not, looks for the alternative gene names.
#'
#' @details
#' \tabular{lllll}{
#' Package: \tab cbaf \cr
#' Type: \tab Package \cr
#' Version: \tab 1.20.0 \cr
#' Date: \tab 2022-10-24 \cr
#' License: \tab Artistic-2.0 \cr
#' }
#'
#'
#'
#' @importFrom cBioPortalData cBioPortal getStudies sampleLists molecularProfiles getDataByGenes
#'
#' @importFrom BiocFileCache BiocFileCache bfcnew bfcquery bfcpath
#'
#' @importFrom utils head setTxtProgressBar txtProgressBar
#'
#'
#'
#' @usage obtainMultipleStudies(genesList, submissionName, studiesNames,
#'   desiredTechnique, cancerCode = FALSE, validateGenes = TRUE)
#'
#'
#'
#' @param genesList a list that contains at least one gene group
#'
#' @param submissionName a character string containing name of interest. It is
#' used for naming the process.
#'
#' @param studiesNames a character vector or a matrix that containes desired
#' cancer names. The character vector containes standard
#' names of cancer studies that can be found on cbioportal.org, such as
#' \code{"Acute Myeloid Leukemia (TCGA, NEJM 2013)"}. Alternatively, a matrix can
#' used if users prefer user-defined cancer names. In this case, the first
#' column of matrix comprises the standard cancer names while the second column
#' must contain the desired cancer names.
#'
#' @param desiredTechnique a character string that is one of the following
#' techniques: \code{"RNA-Seq"}, \code{"RNA-SeqRTN"}, \code{"microRNA-Seq"},
#' \code{"microarray.mRNA"}, \code{"microarray.microRNA"} or
#' \code{"methylation"}.
#'
#' @param cancerCode a logical value that tells the function to use cbioportal
#' abbreviated cancer names instead of complete cancer names, if set to be
#' \code{TRUE}. For example, \code{"laml_tcga_pub"} is the abbreviated name for
#' \code{"Acute Myeloid Leukemia (TCGA, NEJM 2013)"}.
#'
#' @param validateGenes a logical value that, if set to be \code{TRUE}, function
#'  will check each cancer study to find whether or not each gene has a record.
#'  If a cancer study doesn't have a record for specific gene, it checks for
#'  alternative gene names that cbioportal might use instead of the given gene
#'  name.
#'
#'
#'
#' @return a BiocFileCach object that contains the obtained data without further
#'  processing. Name of the object is combination of \code{bfc_} and
#'  \code{submissionName}. Inside it, there is a section for the obtained data,
#'  which is stored as a list. At first level, this list is subdivided into
#'  diferent groups based on the list of genes that user has given the function,
#'   then each gene group itself contains one matrix for every cancer study.
#'   Additonally, if \code{validateGenes = TRUE}, another section that contains
#'   gene validation results will be created in the BiocFileCach object.
#'
#'
#'
#' @examples
#' genes <- list(K.demethylases = c("KDM1A", "KDM1B", "KDM2A", "KDM2B", "KDM3A",
#'  "KDM3B", "JMJD1C", "KDM4A"), K.methyltransferases = c("SUV39H1", "SUV39H2",
#'  "EHMT1", "EHMT2", "SETDB1", "SETDB2", "KMT2A", "KMT2A"))
#'
#' studies <- c("Acute Myeloid Leukemia (TCGA, Provisional)",
#' "Adrenocortical Carcinoma (TCGA, Provisional)",
#' "Bladder Urothelial Carcinoma (TCGA, Provisional)",
#' "Brain Lower Grade Glioma (TCGA, Provisional)",
#' "Breast Invasive Carcinoma (TCGA, Provisional)")
#'
#' obtainMultipleStudies(genes, "test2", studies, "RNA-Seq")
#'
#' @author Arman Shahrisa, \email{shahrisa.arman@hotmail.com} [maintainer,
#' copyright holder]
#' @author Maryam Tahmasebi Birgani, \email{tahmasebi-ma@ajums.ac.ir}
#'
#' @export



################################################################################
################################################################################
################ Obtain the requested data for multiple Cancers ################
################################################################################
################################################################################

obtainMultipleStudies <- function(

  genesList,

  submissionName,

  studiesNames,

  desiredTechnique,

  cancerCode = FALSE,

  validateGenes = TRUE

){

  ##############################################################################
  ########## Prerequisites

  # Check genes

  if(!is.list(genesList)){

    # stop("[obtainMultipleStudies] 'genes' must be a list that contains at list one group of genes")

    if(is.vector(genesList)){

      genesList <- list(anonymous = genesList)

    }

  }

  # Convert genes to Upper Level

  genesList <- lapply(genesList, toupper)



  # Check submissionName

  if(!is.character(submissionName)){

    stop("[obtainMultipleStudies] 'submissionName' must be a character string!")

  }



  # Check studiesNames

  if(!is.character(studiesNames)){

    stop("[obtainMultipleStudies] 'studiesNames' must be character vector!")

  }



  # Check desiredTechnique

  if(is.character(desiredTechnique)){

    supported.techniques <- c("RNA-Seq",

                              "RNA-SeqRTN",

                              "microRNA-Seq",

                              "Microarray.mRNA",

                              "Microarray.microRNA",

                              "methylation")

    if(!(desiredTechnique %in% supported.techniques) |

       length(desiredTechnique)!= 1){

      stop("[obtainMultipleStudies] 'desiredTechnique' must be either 'RNA-Seq', 'microRNA-Seq', 'microarray.mRNA', 'microarray.microRNA' or 'methylation' !")

    } else if(desiredTechnique %in% supported.techniques |

              length(desiredTechnique)== 1){

      if(desiredTechnique == "RNA-Seq"){

        L1.characteristics <- RNA.Seq_L1.terms

        L2.characteristics <- RNA.Seq_L2.terms

      } else if(desiredTechnique == "RNA-SeqRTN"){

        L1.characteristics <- RNA.Seq_L1.terms

        L2.characteristics <- RNA.Seq_rtn_L2.terms

      } else if(desiredTechnique == "microRNA-Seq"){

        L1.characteristics <- microRNA.Seq_L1.terms

        L2.characteristics <- microRNA.Seq_L2.terms

      } else if(desiredTechnique == "microarray.mRNA"){

        L1.characteristics <- microarray.with.mRNA_L1.terms

        L2.characteristics <- microarray.with.mRNA_L2.terms

      } else if(desiredTechnique == "microarray.microRNA"){

        L1.characteristics <- microarray.with.microRNA_L1.terms

        L2.characteristics <- microarray.with.microRNA_L2.terms

      } else if(desiredTechnique == "methylation"){

        L1.characteristics <- methylation_L1.terms

        L2.characteristics <- methylation_L2.terms

      }

    }

  } else {

    stop("[obtainMultipleStudies] 'desiredTechnique' must be a character string!")

  }



  # Check cancerCode

  if(!is.logical(cancerCode)){

    stop("[obtainMultipleStudies] 'cancerCode' must be either TRUE or FALSE!")

  }



  # Check validateGenes

  if(!is.logical(validateGenes)){

    stop("[obtainMultipleStudies] 'validateGenes' must be either TRUE or FALSE!")

  }





  ##############################################################################
  ########## Decide whether function should stops now!

  # Set cgdsr, stop if submissionName is either "test" or "test2" to
  # improving package test speed

  if(!(submissionName %in% c("test", "test2"))){

    cbio <- cBioPortal()

    #! mycgds = CGDS("http://www.cbioportal.org/")


    # Getting cancer names

    studies <- getStudies(cbio)

    supportedCancers <- studies

    #! supportedCancers_old <- getCancerStudies(mycgds)

  }


  # Check if all studiesNames are included in supportedCancers

  if(!(submissionName %in% c("test", "test2"))){

    if(any(studiesNames %in% supportedCancers$name)){

      studiesNames <- studiesNames[studiesNames %in% supportedCancers$name]

    }else{

      stop("[obtainMultipleStudies] None of the requested cancer studies is supported!")

    }

  }


  # Store the new parameteres

  newParameters <-list()

  newParameters$genesList <- genesList

  newParameters$submissionName <- submissionName

  newParameters$studyName <- studiesNames

  newParameters$desiredTechnique <- desiredTechnique

  newParameters$cancerCode <- cancerCode

  newParameters$validateGenes <- validateGenes





  # Check the database

  database <-

    system.file("extdata", submissionName, package="cbaf")



  # Remove old database

  if(dir.exists(database) & !(submissionName %in% c("test", "test2"))){

    creation.time <- file.info(database, extra_cols = FALSE)$ctime

    past.time <-

      as.numeric(difftime(Sys.time(), creation.time, units = c("days")))

    if(past.time >= 5){

      message("[obtainMultipleStudies] The downloaded data are outdated!")

      message("[obtainMultipleStudies] Removing the previous data.")

      unlink(database, recursive = TRUE)

    }

  }



  # Check wheather the requested data exists

  if(dir.exists(database)){

    bfc <- BiocFileCache(

      file.path(system.file("extdata", package = "cbaf"), submissionName),

      ask = FALSE

    )

    if(nrow(bfcquery(bfc, "Parameters for obtainMultipleStudies()")) == 1){

      oldParameters <- readRDS(

        bfcpath(

          bfc,

          bfcquery(bfc, c("Parameters for obtainMultipleStudies()"))$rid

        )

      )

      if(identical(oldParameters[-7], newParameters) |

         submissionName %in% c("test", "test2")){

        continue <- FALSE

        # Store the last parameter

        newParameters$lastRunStatus <- "skipped"

        oldParamObtainMultipleStudies <- newParameters

        saveRDS(

          oldParamObtainMultipleStudies,

          file=bfc[[bfcquery(bfc, "obtainMultipleStudies()")$rid]]

        )

        if(submissionName %in% c("test", "test2")){

          message("[obtainMultipleStudies] Please choose a name other than 'test' and 'test2'.")

        }

        message("[obtainMultipleStudies] The requested data already exist locally.")

        message("[obtainMultipleStudies] The function was haulted!")

      }else{

        continue <- TRUE

      }

    }else{

      continue <- TRUE

    }

  } else{

    continue <- TRUE

  }





  if(continue){

    ############################################################################
    ########## Set the function ready to work

    # Creating a vector for cancer names and subsequent list subset name

    if(!is.matrix(studiesNames)){

      studiesNames <- studiesNames[order(studiesNames)]

      studiesNamesMatrix <- studiesNames


      if(cancerCode){

        cancer.studies.idx <-

          which(supportedCancers$name %in% as.character(studiesNames))

        groupNames <- supportedCancers$studyId[cancer.studies.idx]

      } else if(!cancerCode){

        groupNames <- as.character(studiesNames)

      }


    } else if(is.matrix(studiesNames)){

      studiesNamesMatrix <- studiesNames[order(studiesNames[,1]),]

      studiesNames <- studiesNamesMatrix[,2]

      groupNames <- studiesNames

    }



    ############################################################################
    ########## Repetitive code section

    compatible_ProfileData <- function(

      cbio,

      mycancerstudy,

      ordered_genesNames,

      mycaselist,

      mygeneticprofile

      ){

      # split genes in groups of 250 names

      operational_genes <- split(

        ordered_genesNames, ceiling(seq_along(ordered_genesNames)/250)

      )


      # Create empty list for gene_matrices

      separated_results <- vector(

        "list", length = length(operational_genes)

      )

      for(operational in seq_along(operational_genes)){

        Unprocessed_ProfileData_list <-

          cBioPortalData::getDataByGenes(

            cbio,

            studyId = mycancerstudy,

            genes = operational_genes[[operational]],

            by = "hugoGeneSymbol",

            sampleListId = mycaselist,

            molecularProfileIds = mygeneticprofile

          )

        # Extracting data.frame from List

        Unprocessed_ProfileData <- Unprocessed_ProfileData_list[[1]]

        # Getting all gene names to later find NA genes for patients

        complete_genes <- unique(Unprocessed_ProfileData$hugoGeneSymbol)

        # Subsetting data.frame to contain the needed Columns

        Filtered_Unprocessed_ProfileData <-

          Unprocessed_ProfileData[,c("sampleId", "value", "hugoGeneSymbol")]

        # Splitting data.frame by sampleId

        patient_genes_list <- split(Filtered_Unprocessed_ProfileData,

                                    Filtered_Unprocessed_ProfileData$sampleId)

        # Making each table in the list a one column table

        for(hugo in seq_along(patient_genes_list)){


          present_gene_table <- patient_genes_list[[hugo]]

          present_gene_table_2 <-

            present_gene_table[,"value", drop = FALSE]

          # Converting to matrix

          present_gene_matrix <- as.matrix(present_gene_table_2)

          # Giving gene names and patient id to the values

          colnames(present_gene_matrix) <-

            names(patient_genes_list)[hugo]

          rownames(present_gene_matrix) <-

            present_gene_table$hugoGeneSymbol

          hugo_output <- t(present_gene_matrix)

          hugo_output <-

            hugo_output[,order(colnames(hugo_output)), drop = FALSE]

          # Accounting for missing genes (NA genes)

          returned_genes <- colnames(hugo_output)

          NA_genes <- complete_genes[! complete_genes %in% returned_genes]

          if(length(NA_genes) > 0){

            NA_matrix <- matrix(NA, ncol = 1, nrow = 1)

            rownames(NA_matrix) <- rownames(hugo_output)

            NA_list <- vector("list", length = length(NA_genes))

            for(NA_gene in seq_along(NA_genes)){

              colnames(NA_matrix) <- NA_genes[NA_gene]

              NA_list[[NA_gene]] <- NA_matrix

            }

            NA_output <- do.call(cbind, NA_list)

            hugo_output_2 <- cbind(hugo_output, NA_output)

            patient_genes_list[[hugo]] <-

              hugo_output_2[,order(colnames(hugo_output_2)), drop = FALSE]

          } else{

            patient_genes_list[[hugo]] <- hugo_output

          }

        }


        # Generating old ProfileData format by collapsing the list

        ProfileData <- do.call(rbind, patient_genes_list)

        # Sorting the ProfileData by column and row names

        ProfileData <- ProfileData[,order(colnames(ProfileData)), drop = FALSE]

        ProfileData <- ProfileData[order(rownames(ProfileData)),, drop = FALSE]

        separated_results[[operational]] <- ProfileData


      }


      # Merging data

      ProfileData <- do.call("cbind", separated_results)

      ProfileData <- ProfileData[,order(colnames(ProfileData)), drop = FALSE]

      ProfileData <- ProfileData[order(rownames(ProfileData)),, drop = FALSE]

      ProfileData

    }



    ############################################################################
    ########## Core segment

    # Report

    message("[obtainMultipleStudies] Downloading the required data.")



    # create progress bar

    obtainMultipleStudiesProgressBar <-

      txtProgressBar(min = 0, max = length(studiesNames), style = 3)



    # Create parent list for storing final results in the global environment

    rawList <- list()

    # Creating child lists

    for(nname in seq_along(genesList)){

      rawList[[nname]] <- list(); names(rawList)[nname] <-

        names(genesList)[nname]

    }

    # Creating a list for gene validation results

    if(validateGenes){

      validationResult <- list()

      for(nname in seq_along(genesList)){

        validationResult[[nname]] <- "x"; names(validationResult)[nname] <-

          names(genesList)[nname]

      }

    }


    # Create Empty List to fill with validation matrices

    validationMList <- vector("list", length(genesList)*length(studiesNames))

    # List of cancers with corrupted data

    cancersWithCorruptedData <- NULL

    # List of cancers lacking appropriate data

    cancersLackingData <- NULL

    # Accounting for missing genes from different studies

    genes_in_studies <- vector("list", length(studiesNames))





    ## Getting the required gene expresssion profile ...

    # 'for' control structure for obtaining data and calculating the parameters

    for(c in seq_along(studiesNames)){

      # Determining name for list subset of study name

      groupName <- groupNames[c]

      # Correcting possible errors of list names

      groupName <- gsub(

        groupName, pattern = "\\+ ", replacement = " possitive ",

        ignore.case = TRUE

      )

      groupName <- gsub(

        groupName, pattern = "\\- ", replacement = " negative ",

        ignore.case = TRUE

      )





      # Find cancer abbreviated name

      CancerStudies.idx <-

        which(supportedCancers$name == as.character(studiesNames[c]))

      mycancerstudy = as.character(

        supportedCancers[CancerStudies.idx, "studyId"]

      )



      # Finding the first characteristics of data in the cancer

      AvailableDataTypes <- sampleLists(cbio, mycancerstudy)

      #! AvailableDataTypes_old <- getCaseLists(mycgds, mycancerstudy)

      if(length(AvailableDataTypes) > 1){

        # Available terms and priotizing based on cbaf-constants.R

        match_index_1 <- match(L1.characteristics, AvailableDataTypes$name)

        f.condition <- AvailableDataTypes$name[match_index_1]

        f.condition <- f.condition[!is.na(f.condition)]

        if(length(f.condition) >= 1){

          f.condition <- f.condition[1]

          f.condition.idx <- which(AvailableDataTypes$name == f.condition)

          mycaselist <- as.character(

            AvailableDataTypes[f.condition.idx ,"sampleListId"]

          )


          CancerPossessData <- TRUE

        } else if(length(f.condition) == 0){

          CancerPossessData <- FALSE

        }

        CancerPossessCorruptedData <- FALSE

      } else{

        CancerPossessCorruptedData <- TRUE

        # Updating cancers list

        if(is.null(cancersWithCorruptedData)){

          cancersWithCorruptedData <- studiesNames[c]

        }else{

          cancersWithCorruptedData <-

            c(cancersWithCorruptedData, studiesNames[c])

        }

      }





      # Finding the second characteristics of data in the cancer

      if(!CancerPossessCorruptedData){

        if(CancerPossessData){

          AvailableDataFormats <- molecularProfiles(cbio, mycancerstudy)

          #! AvailableDataFormats_old <- getGeneticProfiles(mycgds, mycancerstudy)

          match_index_2 <- match(L2.characteristics, AvailableDataFormats$name)

          s.condition <- AvailableDataFormats$name[match_index_2]

          s.condition <- s.condition[!is.na(s.condition)]



          if(length(s.condition) >= 1){

            s.condition <- s.condition[1]

            s.condition.idx <- which(AvailableDataFormats$name == s.condition)

            mygeneticprofile <- as.character(

              AvailableDataFormats[s.condition.idx ,"molecularProfileId"]

            )

            CancerPossessData <- TRUE

          } else if (length(s.condition) == 0){

            CancerPossessData <- FALSE

          }

        }

      }



      # obtaining data for every genegroup

      for(group in seq_along(genesList)){

        # Chose one group of genes

        genesNames <- unique(genesList[[group]])

        numberOfGenes <- length(genesNames)

        # Merging four constitutive genes with data in case all genes are NA

        geneNames_plus_constitutive_genes <- c(genesNames, constitutive_genes)

        order_index <- order(geneNames_plus_constitutive_genes)

        ordered_genesNames <- geneNames_plus_constitutive_genes[order_index]


        # Obtaining Expression log z-scores for the requested genes

        if(!CancerPossessCorruptedData){

          if(CancerPossessData){

            ProfileData <-

              compatible_ProfileData(

                cbio,

                mycancerstudy,

                ordered_genesNames,

                mycaselist,

                mygeneticprofile
              )

          }

        }


        # Check if all requested genes are present and remove four constitutive genes

        presence_index <- colnames(ProfileData) %in% constitutive_genes

        number_of_present_constitutive_genes <- sum(presence_index)

        # Determine the first constitutive gene for gene validation

        first_constitutive_gene <- (colnames(ProfileData)[presence_index])[1]


        if(ncol(ProfileData) <= number_of_present_constitutive_genes){

          ProfileData <- NA

          CancerPossessData <- FALSE

        }else{

          ProfileData <-

            ProfileData[,!colnames(ProfileData) %in% constitutive_genes, drop = FALSE]

        }


        # Updating cancers list

        if(!CancerPossessData){

          if(is.null(cancersLackingData)){

            cancersLackingData <- studiesNames[c]

          }else{

            cancersLackingData <-

              c(cancersLackingData, studiesNames[c])

          }

        }


        # Accounting for missing genes: obtaining existing genes

        # assign genes if ProfileData is not NA

        if(! length(ProfileData) == 1){

          genes_in_studies[[c]] <- colnames(ProfileData)

        }





        if(!CancerPossessCorruptedData){

          if(CancerPossessData){

            # Assaign data to specific list member

            rawList[[group]][[c]] <- data.matrix(ProfileData)

            names(rawList[[group]])[c] <- groupName

            # For convenience

            this.segment <- rawList[[group]][[c]]



            # Find whether alternative gene names are used

            # Alter c.genes to be compatible with gene names in cBioPortal
            # output

            alteredGeneNames <-  sort(gsub("-", ".", genesNames))



            # Obtain unique name of genes that are absent in requested cancer

            presence_index_2 <-

              unique(alteredGeneNames) %in% colnames(this.segment)

            absentGenes <- alteredGeneNames[!presence_index_2]

            # For loop for determining changed genes

            if(length(absentGenes) != 0){

              alternativeGeneNames <-

                vector("character", length = length(absentGenes))

              # For loop

              for(ab in seq_along(absentGenes)){

                OneAbsentGene_OneContutiveGenes <-

                  c(absentGenes[ab], first_constitutive_gene)


                absent.gene.profile.data <-

                  compatible_ProfileData(

                    cbio,

                    mycancerstudy,

                    OneAbsentGene_OneContutiveGenes,

                    mycaselist,

                    mygeneticprofile
                  )


                absentGeneProfileData <- colnames(

                  data.matrix(absent.gene.profile.data)

                )

                absentGeneProfileData_pure <-

                  absentGeneProfileData[!absentGeneProfileData %in% constitutive_genes]


                # Check whether gene has an alternative name or missed from the

                # database

                if(length(absentGeneProfileData_pure) == 1){

                  alternativeGeneNames[ab] <- absentGeneProfileData_pure

                } else if(length(absentGeneProfileData_pure) == 0){

                  alternativeGeneNames[ab] <- "-"

                }

              }

              # Naming Alternative.gene.names

              names(alternativeGeneNames) <- absentGenes

              # Seperating genes with alternative names from those that lack

              genesLackData <- alternativeGeneNames[alternativeGeneNames == "-"]

              genesWithData <- alternativeGeneNames[alternativeGeneNames != "-"]



              # modifying gene names containing an alternative name

              for(re in seq_along(genesWithData)){

                colnames.idx <-

                  colnames(rawList[[group]][[c]]) %in% genesWithData[re]


                colnames(rawList[[group]][[c]])[colnames.idx] <-

                  paste0(genesWithData[re], " (", names(genesWithData[re]), ")")

              }


            }else{

              genesLackData <- NULL

              genesWithData <- NULL

            }





            # validateGenes

            if(validateGenes){

              # Empty validation matrix

              validationMatrix <- matrix(, ncol = numberOfGenes, nrow = 1)

              # Naming empty matrix

              if(length(genesLackData) != 0){

                dimnames(validationMatrix) <- list(

                  groupNames[c],

                  c(colnames(this.segment), names(genesLackData))

                )

              } else{

                dimnames(validationMatrix) <- list(

                  groupNames[c],

                  colnames(this.segment))

              }



              # modifying gene names containing an alternative name

              if(length(genesWithData) != 0){

                for(re in seq_along(genesWithData)){

                  colnames.idx <-

                    colnames(validationMatrix) %in% genesWithData[re]


                  colnames(validationMatrix)[colnames.idx] <-

                    paste0(genesWithData[re],
                           " (",
                           names(genesWithData[re]), ")")

                }

              }





              # Puting value for genes lacking data

              colnames.idx <-

                colnames(validationMatrix) %in% names(genesLackData)

              validationMatrix[,colnames.idx] <- "-"



              for(eval in seq_len(ncol(this.segment))){

                loop.section <- (this.segment)[,eval]

                ## Validating Genes

                # Correct those that are not found

                if(length((loop.section)[!is.nan(loop.section)]) > 0 &

                   all(!is.finite(loop.section)) &

                   is.nan(mean(as.vector(loop.section)[abs(loop.section)],

                               na.rm=TRUE))){

                  validationMatrix[1, eval] <- "-"

                } else {

                  validationMatrix[1, eval] <- "Found"

                }

              }

              # Storing the results in validationMList

              validationMatrix <-

                validationMatrix[,sort(colnames(validationMatrix)), drop=FALSE]

              idx <- ((group-1)*length(studiesNames))+c

              validationMList[[idx]] <- validationMatrix

            }

          }

        }

      }

      # Update progressbar

      setTxtProgressBar(obtainMultipleStudiesProgressBar, c)

    }

    # Closing progress bar

    close(obtainMultipleStudiesProgressBar)


    # Remove NA studies

    for(missingStudies in seq_along(rawList)){

      current_gene_group <- rawList[[missingStudies]]

      current_gene_group2 <- Filter(Negate(is.null), current_gene_group)

      rawList[[missingStudies]] <- current_gene_group2

    }


    # Accounting for missing genes in various cancers: finding unique genes

    maximum_available_genes <-

      unique(unlist(genes_in_studies, use.names = FALSE))

    # Altering rawList to have equal number of genes for all studies

    for(gene_group in seq_along(rawList)){

      for(studies in seq_along(rawList[[gene_group]])){

        current_raw_data <- rawList[[gene_group]][[studies]]

        missing_index <-

          ! maximum_available_genes %in% colnames(current_raw_data)

        if(sum(missing_index) > 0){

          missing_genes <- maximum_available_genes[missing_index]

          missing_ProfileData <- vector("list", length = length(missing_genes))

          sample_number <- nrow(current_raw_data)

          NA_for_a_missing_gene <- rep(NA, sample_number)

          dim(NA_for_a_missing_gene) <-

            c(sample_number,1); rownames(NA_for_a_missing_gene) <-

            rownames(current_raw_data)

          for(missing in seq_along(missing_genes)){

           colnames(NA_for_a_missing_gene) <- missing_genes[missing]

           missing_ProfileData[[missing]] <- NA_for_a_missing_gene

          }

          missing_ProfileData_dataframe <- do.call(cbind, missing_ProfileData)

          complete_ProfileData <-

            cbind(current_raw_data, missing_ProfileData_dataframe)

          col_index <- order(colnames(complete_ProfileData))

          complete_ProfileData <- complete_ProfileData[,col_index, drop = FALSE]

          row_index <- order(rownames(complete_ProfileData))

          complete_ProfileData <- complete_ProfileData[row_index,, drop = FALSE]

          rawList[[gene_group]][[studies]] <- complete_ProfileData

        }

      }

    }






    # Print studies with Corrupted or absent data

    if(!is.null(cancersWithCorruptedData)){

      if(length(cancersWithCorruptedData) == 1){

        message("[obtainMultipleStudies] Corrupted / under modification data:")

      } else{

        message("[obtainMultipleStudies] Corrupted / being modification data:")

      }

      print(cancersWithCorruptedData)

    }



    if(!is.null(cancersLackingData)){

      if(length(cancersLackingData) == 1){

        message("[obtainMultipleStudies] Following study lacks ", desiredTechnique, " Data:")

      } else{

        message("[obtainMultipleStudies] Following study contains corrupted Data:")

      }

      print(cancersLackingData)

    }





    # Check if any cancer has data

    StudiesWithResults_idx <-

      !(studiesNames %in% c(cancersWithCorruptedData, cancersLackingData))

    StudiesWithResults <- studiesNames[StudiesWithResults_idx]

    if( length(StudiesWithResults) < 1 ){

      stop("[obtainMultipleStudies] No cancer study exists with ", desiredTechnique, " data or data for all studies are completely corrupted!")

    }





    ## bfc object

    # create bfc object

    if(!dir.exists(database)){

      bfc <- BiocFileCache(

        file.path(system.file("extdata", package = "cbaf"), submissionName),

        ask = FALSE

      )

    }



    # Remove NA objects from List objects

    for(MainList in seq_along(rawList)){

      rawList[[MainList]] <-

        Filter(function(a) any(!is.na(a)), rawList[[MainList]])

    }


    validationMList <- Filter(function(a) any(!is.na(a)), validationMList)





    # Store the obtained Data

    number.of.rows.obtained.data <-

      nrow(bfcquery(bfc, "Obtained data for multiple studies"))

    if(number.of.rows.obtained.data == 0){

      saveRDS(

        rawList,

        file=bfcnew(bfc, "Obtained data for multiple studies", ext="RDS")

      )

    } else if(number.of.rows.obtained.data == 1){

      saveRDS(

        rawList,

        file=bfc[[bfcquery(bfc, "Obtained data for multiple studies")$rid]]

      )

    }





    # Fill the Validation Result

    for(mix in seq_along(genesList)){

      validationResult[[mix]] <- do.call(

        "rbind",

        validationMList[((mix-1)*length(StudiesWithResults))+seq_along(StudiesWithResults)]

      )

    }

    # Store the validation data

    if(validateGenes){


      number.of.rows.validaion.data <-

        nrow(bfcquery(bfc, "Validation data for multiple studies"))


      if(number.of.rows.validaion.data == 0){

        saveRDS(

          validationResult,

          file=bfcnew(bfc, "Validation data for multiple studies", ext="RDS")

        )

      } else if(number.of.rows.validaion.data == 1){

        saveRDS(

          validationResult,

          file=bfc[[bfcquery(bfc, "Validation data for multiple studies")$rid]]

        )

      }

    }

    # Store the last parameter

    newParameters$lastRunStatus <- "succeeded"

    oldParamObtainMultipleStudies <- newParameters

    CorrectCnacer_idx <-

      !(studiesNames %in% c(cancersWithCorruptedData, cancersLackingData))

    newParameters$studyName <- studiesNames[CorrectCnacer_idx]


    # Store the parameters for this run

    number.of.rows.parameters <-

      nrow(bfcquery(bfc, "Parameters for obtainMultipleStudies()"))

    if(number.of.rows.parameters == 0){

      saveRDS(

        oldParamObtainMultipleStudies,

        file=bfcnew(bfc, "Parameters for obtainMultipleStudies()", ext="RDS")

      )

    } else if(number.of.rows.parameters == 1){

      saveRDS(

        oldParamObtainMultipleStudies,

        file=bfc[[bfcquery(bfc, "Parameters for obtainMultipleStudies()")$rid]]

      )

    }

    # message("[obtainMultipleStudies] Finished.")

  }

}
armanshahrisa/cbaf documentation built on Nov. 5, 2022, 3:21 a.m.