R/getFirehoseData.R

Defines functions .getAnswer .setCache getFirehoseData .files_from_html_table .exportFiles .checkFileSize .barcodeUUID .getLinks .makeExprMat

Documented in getFirehoseData

#' @importFrom data.table fread
.makeExprMat <- function(file, normMethod, dataType, mergeSize = 1000, arrayData = FALSE)
{
  #Get selected type only
  tmpCols = read.delim(file,nrows=1,colClasses="character")
  if (!arrayData) {
    colOrder <- 1:ncol(tmpCols)
    colOrder <- colOrder[startsWith(unlist(tmpCols[1, ]), normMethod)]
  } else {
    colOrder <- 2:ncol(tmpCols)
  }
  #closeAllConnections()
  message(paste(dataType,"data will be imported! This may take a while!",sep=" "))
  message(paste0("Start: ",Sys.time()))
  tmpMat <- fread(file,header=FALSE,colClasses = "character", select=c(1,colOrder), data.table = FALSE)
  message(paste0("Done: " ,Sys.time()))
  #closeAllConnections()
  if (!arrayData) {
    colnames(tmpMat) <- tmpMat[1,]
  } else {
    colnames(tmpMat) <- c("Symbol",tmpMat[1,2:ncol(tmpMat)])
  }
  tmpMat <- tmpMat[-c(1:2),]
  removeQM <- grepl("\\?\\|",tmpMat[,1])
  tmpMat <- tmpMat[!removeQM,]
  names1 <- tmpMat[,1]
  names2 <- sapply(names1,function(x) {unlist(strsplit(x,"\\|"))[1]})
  names1 <- duplicated(names2)
  tmpMat <- tmpMat[!names1,]
  rownames(tmpMat) <- names2[!names1]
  tmpMat <- tmpMat[,-1]
  cNames <- colnames(tmpMat)
  rNames <- rownames(tmpMat)
  tmpMat <- apply(tmpMat,2,as.numeric)
  colnames(tmpMat) <- cNames
  rownames(tmpMat) <- rNames
  return(tmpMat)
}

#' @importFrom utils setTxtProgressBar
.getLinks <- function(keyWord1,keyWord2,datasetLink=NULL,doc) {
  plinks <- grep(keyWord1, doc, value = TRUE)
  search <-
    if (is.null(datasetLink)) keyWord2 else paste0("*.", datasetLink, keyWord2)
  grep(search,plinks,value=TRUE)
}

#' @importFrom utils txtProgressBar
#' @importFrom RJSONIO fromJSON
#' @importFrom RCurl getURL
.barcodeUUID <- function(object) {
  message("Converting barcodes to UUID")
  barcodes <- NULL
  if (dim(object@RNASeqGene)[1] > 0 & dim(object@RNASeqGene)[2] > 0) {
    barcodes <- c(barcodes,colnames(object@RNASeqGene))
  }
  if (dim(object@RNASeq2Gene)[1] > 0 & dim(object@RNASeq2Gene)[2] > 0) {
    barcodes <- c(barcodes,colnames(object@RNASeq2Gene))
  }
  if (dim(object@RNASeq2GeneNorm)[1] > 0 & dim(object@RNASeq2GeneNorm)[2] > 0) {
    barcodes <- c(barcodes,colnames(object@RNASeq2GeneNorm))
  }
  if (dim(object@miRNASeqGene)[1] > 0 & dim(object@miRNASeqGene)[2] > 0) {
    barcodes <- c(barcodes,colnames(object@miRNASeqGene))
  }
  if (dim(object@CNASNP)[1] > 0 & dim(object@CNASNP)[2] > 0) {
    barcodes <- c(barcodes,as.character(object@CNASNP[,1]))
  }
  if (dim(object@CNVSNP)[1] > 0 & dim(object@CNVSNP)[2] > 0) {
    barcodes <- c(barcodes,as.character(object@CNVSNP[,1]))
  }
  if (dim(object@CNASeq)[1] > 0 & dim(object@CNASeq)[2] > 0) {
    barcodes <- c(barcodes,as.character(object@CNASeq[,1]))
  }
  if (length(object@CNACGH) > 0 ) {
    barcodes <- c(barcodes,as.character(object@CNACGH[,1]))
  }
  if (length(object@Methylation) > 0 ) {
    for(i in 1:length(object@Methylation)) {
      barcodes <- c(barcodes,colnames(object@Methylation[[i]]@DataMatrix))
    }
  }
  if (length(object@mRNAArray) > 0 ) {
    for(i in 1:length(object@mRNAArray)) {
      barcodes <- c(barcodes,colnames(object@mRNAArray[[i]]@DataMatrix))
    }
  }
  if (length(object@miRNAArray) > 0 ) {
    for(i in 1:length(object@miRNAArray)) {
      barcodes <- c(barcodes,colnames(object@miRNAArray[[i]]@DataMatrix))
    }
  }
  if (length(object@RPPAArray) > 0 ) {
    for(i in 1:length(object@RPPAArray)) {
      barcodes <- c(barcodes,colnames(object@RPPAArray[[i]]@DataMatrix))
    }
  }
  if (length(object@GISTIC@Dataset) > 0) {
    barcodes <- c(barcodes,colnames(object@GISTIC@AllByGene)[-c(1:3)])
  }
  if (dim(object@Mutation)[1] > 0 & dim(object@Mutation)[2] > 0) {
    barcodes <- c(barcodes,unique(as.character(object@Mutation[,16])))
    barcodes <- c(barcodes,unique(as.character(object@Mutation[,17])))
  }
  barcodes <- unique(barcodes)
  barcodes <- toupper(barcodes)
  barcodes <- gsub(pattern = "\\.",replacement = "-",x = barcodes)

  pb <- txtProgressBar(min = 0, max = length(barcodes), style = 3)
  breakPoints <- seq(1,length(barcodes),20)
  for(i in breakPoints) {
    setTxtProgressBar(pb, i)
    urlToGo <- "https://tcga-data.nci.nih.gov/uuid/uuidws/mapping/json/barcode/batch"
    endPoint <- (i+19)
    if (endPoint > length(barcodes)) {endPoint = length(barcodes)}
    searchBarcode <- paste(barcodes[i:endPoint], collapse=",")
    uuids = fromJSON(getURL(urlToGo, customrequest="POST",
                            httpheader=c("Content-Type: text/plain"),
                            postfields=searchBarcode))$uuidMapping
    if (is.character(uuids)) {
      if (exists("convertTable")) {
        convertTable <- rbind(convertTable,c(uuids[1],uuids[2]))
      } else {
        convertTable <- data.frame(Barcode=uuids[1],UUID=uuids[2])
      }
    } else {
      if (exists("convertTable")) {
        convertTable <- rbind(convertTable,
                              data.frame(matrix(unlist(uuids), nrow=length(uuids), byrow=TRUE),stringsAsFactors=FALSE))
      } else {
        convertTable <- data.frame(matrix(unlist(uuids), nrow=length(uuids), byrow=TRUE),stringsAsFactors=FALSE)
      }
    }

    if ((endPoint %% 400) == 0) {
      message("RTCGAToolbox will wait 185 seconds due to TCGA web service limitations")
      Sys.sleep(185)
    }

  }

  shortNames <- apply(convertTable,1,function(x) {
    as.character(paste(strsplit(x[1],split = "-")[[1]][1:3],collapse = "-"))
  })
  convertTable <- cbind(convertTable,shortNames)
  colnames(convertTable) <- c("Barcode","UUID","ShortName")
  setTxtProgressBar(pb, length(barcodes))
  return(convertTable)
}

.checkFileSize <- function(dataURL, fileSizeLimit) {
  fileSize <- httr::HEAD(dataURL)
  # this should convert file size from byte to MB
  fileSize <- as.numeric(fileSize$headers$`content-length`)/1e6

  if (fileSize > fileSizeLimit) {
    message(dataURL)
    message(paste("File Size: ~"), fileSize, "MB")
    message("File above won't be downloaded due to data size, RTCGAToolbox will skip this data!")
    FALSE
  } else {
    TRUE
  }
}

#' @importFrom utils download.file untar
.exportFiles <- function(fileLink, dataset, fileExt, searchName,subSearch=FALSE,
    exportName, manifest=FALSE, destdir=destdir, forceDownload=FALSE, runDate)
{

  if (!dir.exists(destdir))
      stop("Directory does not exist")

  tcgafile <- file.path(destdir, paste0(dataset, fileExt, sep=""))
  destfile <- file.path(destdir, paste0(runDate,"-",dataset,exportName))

  if (forceDownload || !file.exists(destfile)) {
    download.file(url=fileLink, destfile=tcgafile, method="auto",
        quiet = FALSE, mode = "wb")
    fileList <- untar(tcgafile, list = TRUE)
    fullList <- paste(destdir, fileList, sep = "/")
    if (any(nchar(fullList) > 259) && identical(.Platform$OS.type, "windows"))
        warning("File path too long, make 'destdir' shorter")
    if (!subSearch) {
      fileList = fileList[grepl(searchName,fileList)]
    } else {
      if (!manifest) {
        grepSearch = paste0("*.",dataset,searchName)
        fileList = fileList[grepl(grepSearch,fileList)]
      } else {
        fileList = fileList[!grepl("MANIFEST.txt",fileList)]
      }
    }

    untar(tcgafile, files = fileList, exdir = destdir)
    file.rename(from=file.path(destdir, fileList), to=destfile)
    file.remove(tcgafile)

    message(dirname(fileList))
    unlink(file.path(destdir, dirname(fileList)), recursive = TRUE)
  } else {
    message(sprintf('Using locally cached version of %s',destfile))
  }
  return(destfile)
}


.files_from_html_table <- function(url) {
    page_table <- rvest::html_table(rvest::read_html(url))[[1L]]
    rm.cols <- vapply(page_table, function(x) !all(is.na(x)), logical(1L))
    page_table <- page_table[, rm.cols]
    doc <- page_table[
      -which(page_table[["Name"]] %in% c("", "Parent Directory")), "Name"
    ]
    unlist(doc)
}

#' Get data from Firehose portal.
#'
#' `getFirehoseData` returns `FirehoseData` object that stores TCGA data.
#'
#' This is a main client function to download data from Firehose TCGA portal.
#'
#' @details To avoid unnecessary downloads, we use
#'   `tools::R_user_dir("RTCGAToolbox", "cache")` to set the default `destdir`
#'   parameter to the cached directory. To get the actual default directory,
#'   one can run `RTCGAToolbox:::.setCache()`.
#'
#' @param dataset A cohort disease code. TCGA cancer codes can be obtained via
#'   [getFirehoseDatasets]
#' @param runDate Standard data run dates. Date list can be accessible via
#'   [getFirehoseRunningDates]
#' @param gistic2Date Analysis run date for GISTIC obtained via
#'   [getFirehoseAnalyzeDates]
#' @param RNASeqGene Logical (default FALSE) RNAseq TPM data.
#' @param clinical Logical (default TRUE) clinical data.
#' @param RNASeq2Gene Logical (default FALSE) RNAseq v2 (RSEM processed) data;
#'   see `RNAseqNorm` argument.
#' @param RNASeq2GeneNorm Logical (default FALSE) RNAseq v2 (RSEM processed)
#'   data.
#' @param miRNASeqGene Logical (default FALSE) smallRNAseq data.
#' @param miRNASeqGeneType Character (default "read_count") Indicate which type
#'   of data should be pulled from the miRNASeqGene data. Must be one of
#'   "reads_per_million_miRNA_mapped", "read_count", or "cross-mapped".
#' @param CNASNP Logical (default FALSE) somatic copy number alterations data
#'   from SNP array.
#' @param CNVSNP Logical (default FALSE) germline copy number variants data from
#'   SNP array.
#' @param CNASeq Logical (default FALSE) somatic copy number alterations data
#'   from sequencing.
#' @param CNACGH Logical (default FALSE) somatic copy number alterations data
#'   from CGH.
#' @param Methylation Logical (default FALSE) methylation data.
#' @param Mutation Logical (default FALSE) mutation data from sequencing.
#' @param mRNAArray Logical (default FALSE) mRNA expression data from
#'   microarray.
#' @param miRNAArray Logical (default FALSE) miRNA expression data from
#'   microarray.
#' @param RPPAArray Logical (default FALSE) RPPA data
#' @param RNAseqNorm RNAseq data normalization method. (Default raw_count)
#' @param RNAseq2Norm RNAseq v2 data normalization method. (Default
#'   normalized_count or one of RSEM_normalized_log2, raw_count,
#'   scaled_estimate)
#' @param GISTIC logical (default FALSE) processed copy number data
#' @param forceDownload A logic (Default FALSE) key to force download
#'   RTCGAToolbox every time. By default if you download files into your working
#'   directory once than RTCGAToolbox using local files next time.
#' @param destdir Directory in which to store the resulting downloaded file.
#'   Defaults to a cache directory given by `RTCGAToolbox:::.setCache()`.
#' @param fileSizeLimit Files that are larger than set value (megabyte) won't be
#'   downloaded (Default: 500)
#' @param getUUIDs Logical key to get UUIDs from barcode (Default: FALSE)
#' @param ... Additional arguments to pass down.
#'
#' @return A `FirehoseData` data object that stores data for selected data types.
#'
#' @seealso \link{getLinks}, \url{https://gdac.broadinstitute.org/}
#'
#' @importFrom utils write.table
#'
#' @examples
#' # Sample Dataset
#' data(accmini)
#' accmini
#' \dontrun{
#' BRCAdata <- getFirehoseData(dataset="BRCA",
#' runDate="20140416",gistic2Date="20140115",
#' RNASeqGene=TRUE,clinical=TRUE,mRNAArray=TRUE,Mutation=TRUE)
#' }
#' @export getFirehoseData
getFirehoseData <- function(dataset, runDate="20160128", gistic2Date="20160128",
    RNASeqGene=FALSE, RNASeq2Gene=FALSE, clinical=TRUE, miRNASeqGene=FALSE,
    miRNASeqGeneType =
      c("read_count", "reads_per_million_miRNA_mapped", "cross-mapped"),
    RNASeq2GeneNorm=FALSE, CNASNP=FALSE, CNVSNP=FALSE, CNASeq=FALSE,
    CNACGH=FALSE, Methylation=FALSE, Mutation=FALSE, mRNAArray=FALSE,
    miRNAArray=FALSE, RPPAArray=FALSE, GISTIC=FALSE, RNAseqNorm="raw_count",
    RNAseq2Norm = c(
      "normalized_counts", "RSEM_normalized_log2", "raw_counts", "scaled_estimate"
    ),
    forceDownload=FALSE, destdir=.setCache(),
    fileSizeLimit=500, getUUIDs=FALSE, ...) {
  #check input parameters
  if (!is.character(dataset) || is.null(dataset) || !length(dataset) == 1 || nchar(dataset) < 2) {
      stop('Please set "dataset" parameter! You should specify one dataset name. Ex: dataset="BRCA"...')
      }
  runDatasets <- getFirehoseDatasets()
  if (!any(runDatasets==dataset)) {
      stop('Please use valid dataset name! "getFirehoseDatasets" function gives you the vector of valid dataset names!')
      }
  if (!is.null(runDate)) {
    if (!is.character(runDate) || !length(runDate) == 1 || !nchar(runDate) == 8) {
        stop('Please set "runDate" parameter! You should specify one Firehose run date. Ex: runDate="20140416"...')
        }
    runDateList <- getFirehoseRunningDates()
    if (!any(runDateList==runDate)) {
        stop('Please use valid run date! "getFirehoseRunningDates" function gives you the vector of valid dates!')
        }
  }

    if (GISTIC) {
      if (!S4Vectors::isSingleString(gistic2Date) &&
          !gistic2Date %in% getFirehoseAnalyzeDates())
          stop('Please set a valid "gistic2Date" parameter.')
    }
    trim <- function (x) gsub("^\\s+|\\s+$", "", x)

    resultClass <- new("FirehoseData", Dataset = dataset)
    if (!is.null(runDate)) {
      resultClass@runDate <- runDate
    } else {
      resultClass@runDate <- NA_character_
    }
    if (GISTIC) {
      resultClass@gistic2Date <- gistic2Date
    } else {
      resultClass@gistic2Date <- NA_character_
    }

  if (!is.null(runDate))
  {
    ##build URL for getting file links
    fh_url <- "https://gdac.broadinstitute.org/runs/stddata__"
    fh_url <- paste0(fh_url,substr(runDate,1,4),"_",substr(runDate,5,6),"_",substr(runDate,7,8),"/data/")
    fh_url <- paste0(fh_url,dataset,"/",runDate,"/")
    doc <- .files_from_html_table(fh_url)

    #Download clinical data
    if (clinical)
    {
      #Search for links
      plinks <- .getLinks(".Clinical_Pick_Tier1.Level_4","*.tar[.]gz$",NULL,doc)

      for(i in trim(plinks))
      {
        if (.checkFileSize(paste0(fh_url,i),fileSizeLimit))
        {
          export.file <- .exportFiles(paste0(fh_url,i),dataset,"-Clinical.tar.gz","*.clin.merged.picked.txt$",FALSE,
                       "-Clinical.txt",FALSE,destdir,forceDownload,runDate)
          raw.clin <- read.delim(export.file,colClasses="character")
          df.clin <- data.frame(do.call(rbind, raw.clin[, -1]),stringsAsFactors = FALSE)
          colnames(df.clin) <- raw.clin[, 1]
          resultClass@clinical <- df.clin
          gc()
        }
      }
    }

    #Download RNAseq gene level data
    if (RNASeqGene)
    {
      #Search for links
      plinks <- .getLinks("Level_3__gene_expression__data.Level_3","*.Merge_rnaseq__.*._rnaseq__.*.tar[.]gz$",NULL,doc)

      for(i in trim(plinks))
      {
        if (.checkFileSize(paste0(fh_url,i),fileSizeLimit))
        {
          export.file <- .exportFiles(paste0(fh_url,i),dataset,"-RNAseqGene.tar.gz",
                       "[.]rnaseq__.*.__Level_3__gene_expression__data.data.txt$",
                       TRUE,"-RNAseqGene.txt",FALSE,destdir,forceDownload,runDate)
          #Get selected type only
          resultClass@RNASeqGene <- .makeExprMat(export.file,RNAseqNorm,"RNAseq",mergeSize=1000,arrayData=FALSE)
          gc()
        }
      }
    }

    #Download RNAseq2 gene level data (scaled estimates = TPM / 1e6 or raw_count, not rounded)
    if (RNASeq2Gene)
    {
      #Search for links
      plinks <- .getLinks("Level_3__RSEM_genes__data.Level_3","*.Merge_rnaseqv2__.*._rnaseqv2__.*.tar[.]gz$",NULL,doc)

      for(i in trim(plinks))
      {
        if (.checkFileSize(paste0(fh_url,i),fileSizeLimit))
        {
          export.file <- .exportFiles(paste0(fh_url,i),dataset,"-RNAseq2Gene.tar.gz",
                                      "[.]rnaseqv2__.*.__Level_3__RSEM_genes__data.data.txt$",
                                      TRUE,"-RNAseq2Gene.txt",FALSE,destdir,forceDownload,runDate)
          #Get selected type only
          resultClass@RNASeq2Gene <- .makeExprMat(export.file, RNAseqNorm, "RNAseq2",mergeSize=1000,arrayData=FALSE)
          gc()
        }
      }
    }

    #Download RNAseq2 gene level data
    if (RNASeq2GeneNorm)
    {
      #Search for links

      RNAseq2Norm <- match.arg(RNAseq2Norm)
      if (identical(RNAseq2Norm, "normalized_counts")) {
        searchName <- "[.]rnaseqv2__.*.__Level_3__RSEM_genes_normalized__data.data.txt$"
        plinks <- .getLinks("Level_3__RSEM_genes_normalized__data.Level_3","*.Merge_rnaseqv2__.*._rnaseqv2__.*.tar[.]gz$",NULL,doc)
      } else {
        searchName <- paste0(".*\\.mRNAseq_", RNAseq2Norm, "\\.txt$")
        plinks <- .getLinks("mRNAseq_Preprocess",".*\\.Level_3\\..*\\.tar\\.gz$",NULL,doc)
      }
      plinks <- stats::setNames(
        vapply(plinks, function(x)
            .checkFileSize(paste0(fh_url, x), fileSizeLimit), logical(1L)
        ),
        plinks)
      plinks <- plinks[plinks]
      res <- lapply(seq_along(plinks), function(k) {
        i <- names(plinks)[k]
        export.file <- .exportFiles(
            fileLink = paste0(fh_url,i),
            dataset = dataset,
            fileExt = "-RNAseq2GeneNorm.tar.gz",
            searchName = searchName,
            subSearch = TRUE,
            exportName =
              paste0("-RNAseq2GeneNorm-", RNAseq2Norm, "-", k, ".txt"),
            manifest = FALSE,
            destdir = destdir,
            forceDownload = forceDownload,
            runDate = runDate
        )
            new("FirehosemRNAArray",Filename=i,
                DataMatrix=.makeExprMat(export.file,
                    RNAseq2Norm,"RNAseq2",mergeSize=1000,arrayData=TRUE))
      })
      # res <- res[[which.max(vapply(res, ncol, integer(1L)))]]
      resultClass@RNASeq2GeneNorm <- res
    }

    #Download miRNAseq gene level data
    if (miRNASeqGene)
    {
      #Search for links
      plinks <- .getLinks("Level_3__miR_gene_expression__data.Level_3","[.]Merge_mirnaseq__.*.hiseq_mirnaseq__.*.tar[.]gz$",dataset,doc)
      miRNAtype <- match.arg(miRNASeqGeneType)

      for(i in trim(plinks))
      {
        if (.checkFileSize(paste0(fh_url,i),fileSizeLimit))
        {
          export.file <- .exportFiles(
              fileLink = paste0(fh_url,i),
              dataset = dataset,
              fileExt = "-miRNAseqGene.tar.gz",
              searchName = "[.]mirnaseq__.*.__Level_3__miR_gene_expression__data.data.txt$",
              subSearch = TRUE,
              exportName = paste0("-", miRNAtype, "-miRNAseqGene.txt"),
              manifest = FALSE,
              destdir = destdir,
              forceDownload = forceDownload,
              runDate = runDate
          )
          resultClass@miRNASeqGene <- .makeExprMat(export.file,miRNAtype,"miRNAseq",mergeSize=100,arrayData=FALSE)
          gc()
        }
      }
    }

    #Download CNA SNP data
    if (CNASNP)
    {
      #Search for links
      plinks <- .getLinks("Level_3__segmented_scna_hg19__seg.Level_3","[.]Merge_snp__.*.__Level_3__segmented_scna_hg19__seg.Level_3.*.tar[.]gz$",dataset,doc)

      for(i in trim(plinks))
      {
        if (.checkFileSize(paste0(fh_url,i),fileSizeLimit))
        {
          export.file <- .exportFiles(paste0(fh_url,i),dataset,
                      "-CNASNPHg19.tar.gz",
                      "[.]snp__.*.__Level_3__segmented_scna_hg19__seg.seg.txt$",
                      TRUE,
                      "-CNASNPHg19.txt",FALSE,destdir,forceDownload,runDate)
          #Get selected type only
          tmpMat = fread(export.file,header=TRUE,colClasses=c("character","numeric","numeric",
                                                                                                "numeric","numeric","numeric"),data.table = FALSE)
          resultClass@CNASNP <- tmpMat
        }
      }
    }

    #Download CNV SNP data
    if (CNVSNP)
    {
      #Search for links
      plinks <- .getLinks("Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3","[.]Merge_snp__.*.__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.Level_3.*.tar[.]gz$",dataset,doc)

      for(i in trim(plinks))
      {
        if (.checkFileSize(paste0(fh_url,i),fileSizeLimit))
        {
          export.file <- .exportFiles(paste0(fh_url,i),dataset,
                      "-CNVSNPHg19.tar.gz",
                      "[.]snp__.*.__Level_3__segmented_scna_minus_germline_cnv_hg19__seg.seg.txt$",
                      TRUE,
                      "-CNVSNPHg19.txt",FALSE,destdir,forceDownload,runDate)
          #Get selected type only
          tmpMat = fread(export.file,header=TRUE,colClasses=c("character","numeric","numeric",
                                                                                                "numeric","numeric","numeric"),data.table = FALSE)
          resultClass@CNVSNP <- tmpMat
        }
      }
    }

    #Download CNA DNAseq data
    if (CNASeq)
    {
      #Search for links
      plinks <- .getLinks("__Level_3__segmentation__seg.Level_3","[.]Merge_cna__.*.dnaseq.*.__Level_3__segmentation__seg.Level_3.*.tar[.]gz$",dataset,doc)

      for(i in trim(plinks))
      {
        if (.checkFileSize(paste0(fh_url,i),fileSizeLimit))
        {
          export.file <- .exportFiles(paste0(fh_url,i),dataset,
                      "-CNAseq.tar.gz",
                      "[.]cna__.*.__Level_3__segmentation__seg.seg.txt$",
                      TRUE,
                      "-CNAseq.txt",FALSE,destdir,forceDownload,runDate)
          #Get selected type only
          tmpMat = fread(export.file,
                         header=TRUE,colClasses=c("character","numeric","numeric","numeric","numeric","numeric"),
                         data.table = FALSE)
          #tmpMat = read.delim(paste0(runDate,"-",dataset,"-CNAseq.txt"),header=TRUE,colClasses=c("character","numeric","numeric",
          #                                                                                 "numeric","numeric"))
          resultClass@CNASeq <- tmpMat
        }
      }
    }

    #Download CNA CGH data
    if (CNACGH)
    {
      #Search for links
      plinks <- .getLinks("__Level_3__segmentation__seg.Level_3","[.]Merge_cna__.*.cgh.*.__Level_3__segmentation__seg.Level_3.*.tar[.]gz$",dataset,doc)

      dataLists <- list()
      listCount = 1
      for(i in trim(plinks))
      {
        if (.checkFileSize(paste0(fh_url,i),fileSizeLimit))
        {
          export.file <- .exportFiles(paste0(fh_url,i),dataset,
                      "-CNACGH.tar.gz",
                      "[.]cna__.*.__Level_3__segmentation__seg.seg.txt$",
                      TRUE,
                      paste0("-CNACGH-",listCount,".txt"),FALSE,destdir,forceDownload,runDate)
          #Get selected type only
          tmpMat = fread(export.file,
                         header=TRUE,colClasses=c("character","numeric","numeric","numeric","numeric","numeric"),
                         data.table = FALSE)
          #tmpMat = read.delim(paste0(runDate,"-",dataset,"-CNACGH-",listCount,".txt",sep=""),header=TRUE,colClasses=c("character","numeric","numeric",
          #                                                                                               "numeric","numeric"))
          tmpReturn <- new("FirehoseCGHArray",Filename=i,DataMatrix=tmpMat)
          dataLists[[listCount]] <- tmpReturn
          listCount = listCount + 1
        }
      }
      resultClass@CNACGH <- dataLists
    }

    #Download methylation
    if (Methylation)
    {
      #Search for links
      plinks <- .getLinks("__Level_3__within_bioassay_data_set_function__data.Level_3","[.]Merge_methylation__.*.methylation.*.__Level_3__within_bioassay_data_set_function__data.Level_3.*.tar[.]gz$",dataset,doc)

      dataLists <- list()
      listCount = 1
      for(ii in trim(plinks))
      {
        if (.checkFileSize(paste0(fh_url,ii),fileSizeLimit))
        {
          export.file <- .exportFiles(paste0(fh_url,ii),dataset,
                      "-Methylation.tar.gz",
                      "[.]methylation__.*.__Level_3__within_bioassay_data_set_function__data.data.txt$",
                      TRUE,
                      paste0("-Methylation-",listCount,".txt"),FALSE,destdir,forceDownload,runDate)

          #Get selected type only
          tmpCols <- read.delim(export.file,nrows=1,colClasses="character")
          BetaCols <- which(tmpCols[1L,] == "Beta_value")
          fixedCols <- match(c("Composite Element REF", "Gene_Symbol", "Chromosome", "Genomic_Coordinate"), tmpCols[1L, ])
          colOrder <- c(fixedCols, BetaCols)

          tmpMat <- fread(export.file,header=FALSE,colClasses = "character", select=colOrder, data.table = FALSE)
          fixedCols <- match(c("Composite Element REF", "Gene_Symbol", "Chromosome", "Genomic_Coordinate"), tmpMat[2L, ])
          BetaCols <- which(tmpMat[2L, ] == "Beta_value")
          colOrder <- c(fixedCols, BetaCols)
          tmpMat <- tmpMat[, colOrder]
          tmpMat[2L, ] <- gsub(" ", "", tmpMat[2L, ])
          firstFixed <- seq_along(fixedCols)
          BetaCols <- setdiff(seq_along(tmpMat), firstFixed)
          colnames(tmpMat) <- unlist(c(tmpMat[2L, firstFixed], tmpMat[1L, BetaCols]))

          tmpMat <- tmpMat[-c(1:2),]
          removeQM <- grepl("\\?\\|",tmpMat[,1])
          tmpMat <- tmpMat[!removeQM,]
          names1 <- tmpMat[,1]
          tmpMat <- tmpMat[,-1]
          rownames(tmpMat) <- names1
          tmpReturn <- new("FirehoseMethylationArray",Filename=ii,DataMatrix=tmpMat)
          dataLists[[listCount]] <- tmpReturn
          listCount = listCount + 1
        }
      }
      resultClass@Methylation <- dataLists
    }

    #Download mRNA array
    if (mRNAArray)
    {
      #Search for links
      plinks1 <- .getLinks("Merge_transcriptome__agilentg4502a_07","[.]Merge_transcriptome__agilentg4502a_.*.__Level_3__unc_lowess_normalization_gene_level__data.Level_3.*.tar[.]gz$",dataset,doc)
      plinks2 <- .getLinks("Merge_transcriptome__ht_hg_u133a","[.]Merge_transcriptome__ht_hg_u133a__.*.__Level_3__gene_rma__data.Level_3.*.tar[.]gz$",dataset,doc)
      plinks3 <- .getLinks("Merge_exon__huex_1_0_st_v2","[.]Merge_exon__huex_1_0_st_v2__.*.__Level_3__quantile_normalization_gene__data.Level_3.*.tar[.]gz$",dataset,doc)

      plinks = c(plinks1,plinks2,plinks3)
      plinks = unique(plinks[plinks != ""])
      dataLists <- list()
      listCount = 1
      for(ii in trim(plinks))
      {
        if (.checkFileSize(paste0(fh_url,ii),fileSizeLimit))
        {
          export.file <- .exportFiles(paste0(fh_url,ii),dataset,
                      "-mRNAArray.tar.gz",
                      "",
                      TRUE,
                      paste0("-mRNAArray-",listCount,".txt"),
                      TRUE,destdir,forceDownload,runDate)
          tmpReturn <- new("FirehosemRNAArray",Filename=ii,
                           DataMatrix=.makeExprMat(export.file,"","mRNAArray",1000,TRUE))
          dataLists[[listCount]] <- tmpReturn
          listCount = listCount + 1
        }
      }
      resultClass@mRNAArray <- dataLists
    }

    #Download miRNA array
    if (miRNAArray)
    {
      #Search for links
      plinks <- .getLinks("h_mirna_8x15k","[.]Merge_mirna__h_mirna_8x15k.*.data.Level_3.*.tar[.]gz$",dataset,doc)
      plinks = unique(plinks[plinks != ""])

      dataLists <- list()
      listCount = 1
      for(ii in trim(plinks))
      {
        if (.checkFileSize(paste0(fh_url,ii),fileSizeLimit))
        {
          export.file <- .exportFiles(paste0(fh_url,ii),dataset,
                      "-miRNAArray.tar.gz",
                      "",
                      TRUE,
                      paste0("-miRNAArray-",listCount,".txt"),
                      TRUE,destdir,forceDownload,runDate)
          tmpReturn <- new("FirehosemRNAArray",Filename=ii,
                           DataMatrix=.makeExprMat(export.file,"","miRNAArray",100,TRUE))
          dataLists[[listCount]] <- tmpReturn
          listCount = listCount + 1
        }
      }
      resultClass@miRNAArray <- dataLists
    }

    #Download RPPA array
    if (RPPAArray)
    {
      #Search for links
      plinks <- .getLinks("rppa_core","[.]Merge_protein_exp.*.protein_normalization__data.Level_3.*.tar[.]gz$",dataset,doc)
      plinks = unique(plinks[plinks != ""])

      dataLists <- list()
      listCount = 1
      for(ii in trim(plinks))
      {
        if (.checkFileSize(paste0(fh_url,ii),fileSizeLimit))
        {
          export.file <- .exportFiles(paste0(fh_url,ii),dataset,
                      "-RPPAArray.tar.gz",
                      "",
                      TRUE,
                      paste0("-RPPAArray-",listCount,".txt"),
                      TRUE,destdir,forceDownload,runDate)
          tmpReturn <- new("FirehosemRNAArray",Filename=ii,
                           DataMatrix=.makeExprMat(export.file,"","RPPAArray",100,TRUE))
          dataLists[[listCount]] <- tmpReturn
          listCount = listCount + 1
        }
      }
      resultClass@RPPAArray <- dataLists
    }

    #Download RPPA array
    if (Mutation)
    {
      #Search for links
      plinks <- .getLinks("Mutation_Packager_Calls","[.]Mutation_Packager_Calls[.]Level_3[.].*.tar[.]gz$",dataset,doc)
      plinks = unique(plinks[plinks != ""])

      dataLists <- list()
      listCount = 1
      for(ii in trim(plinks))
      {
        if (.checkFileSize(paste0(fh_url,ii),fileSizeLimit))
        {
          allsampsfile <- paste0(destdir,"/",runDate,"-",dataset,"-Mutations-AllSamples.txt")
          if (forceDownload || !file.exists(allsampsfile)) {
            download_link = paste(fh_url,ii,sep="")
            tarfile <- file.path(destdir, paste0(dataset, "-Mutation.tar.gz"))
            download.file(url=download_link,destfile=tarfile,method="auto",quiet = FALSE, mode = "wb")
            fileList <- untar(tarfile, list = TRUE)
            grepSearch = "MANIFEST.txt"
            fileList = fileList[!grepl(grepSearch,fileList)]
            ###
            untar(tarfile, files = fileList, exdir = destdir)
            datalist <- lapply(file.path(destdir, fileList),FUN=function(files) {
              read.delim(files,header=TRUE,colClasses="character")
            })
            dlengths <- lengths(datalist)
            if (!identical(length(unique(dlengths)), 1L)) {
                splitlist <- split(datalist, dlengths)
                splitlist <- lapply(splitlist, function(g) do.call(rbind, g))
                retMutations <-
                    Reduce(function(...) merge(..., all = TRUE), splitlist)
            } else {
                retMutations <- do.call(rbind, datalist)
            }
            delFolder <- file.path(destdir, dirname(fileList[[1L]]))
            unlink(delFolder, recursive = TRUE)
            file.remove(tarfile)
            write.table(retMutations,file=allsampsfile,sep="\t",row.names=FALSE,quote=FALSE)
          } else {
            retMutations <- fread(allsampsfile,header=TRUE,colClasses="character", data.table = FALSE)
          }
          resultClass@Mutation <- retMutations
        }
      }
    }
  }
  if (GISTIC) {
    tag <- switch(dataset, SKCM = "TM", LAML = "TB", "TP")
    dset <- paste0(dataset, "-", tag)
    ##build URL for getting file links
    fh_url <- "https://gdac.broadinstitute.org/runs/analyses__"
    fh_url <- paste(fh_url, substr(gistic2Date, 1, 4), "_",
        substr(gistic2Date, 5, 6), "_", substr(gistic2Date, 7, 8),
        "/data/", sep="")
    fh_url <- paste(fh_url, dset, "/", gistic2Date, "/", sep="")
    doc <- .files_from_html_table(fh_url)
    #Search for links
    plinks <- .getLinks("CopyNumber_Gistic2.Level_4",
        "[.]CopyNumber_Gistic2[.]Level_4.*.tar[.]gz$", dset, doc)
    for (ii in trim(plinks))
    {
      if (.checkFileSize(paste0(fh_url,ii),fileSizeLimit)) {
        if (forceDownload || !file.exists(paste0(destdir,"/",gistic2Date,"-",dataset,"-all_thresholded.by_genes.txt"))) {
          download_link <- paste(fh_url, ii, sep = "")
          fileLoc <- file.path(destdir, paste0(dataset,"-Gistic2.tar.gz"))
          download.file(url = download_link, destfile = fileLoc,
            method = "auto", quiet = FALSE, mode = "wb")

          fileList <- untar(fileLoc, list = TRUE)
          grepSearch = "all_data_by_genes.txt"
          fileList = fileList[grepl(grepSearch,fileList)]
          untar(fileLoc, files = fileList, exdir = destdir)
          filepaths <- file.path(destdir, fileList)
          tmpCNAll = fread(filepaths, header = TRUE, colClasses = "character",
              data.table = FALSE)
          file.rename(from = filepaths,
            to = paste0(destdir, "/", gistic2Date, "-",
                dataset, "-all_data_by_genes.txt"))

          fileList <- untar(fileLoc, list = TRUE)
          grepSearch = "all_thresholded.by_genes.txt"
          fileList = fileList[grepl(grepSearch,fileList)]
          untar(fileLoc, files = fileList, exdir = destdir)
          filepaths <- file.path(destdir, fileList)
          tmpCNThreshhold = fread(filepaths, header = TRUE,
              colClasses = "character", data.table = FALSE)
          file.rename(from = filepaths,
            to = paste0(destdir, "/", gistic2Date, "-",
                dataset, "-all_thresholded.by_genes.txt"))

          fileList <- untar(fileLoc, list = TRUE)
          grepSearch = "all_lesions.conf_99.txt"
          fileList = fileList[grepl(grepSearch,fileList)]
          untar(fileLoc, files = fileList, exdir = destdir)
          filepaths <- file.path(destdir, fileList)
          tmpPeaks <- fread(filepaths, header = TRUE,
              colClasses = "character", data.table = FALSE)
          file.rename(from = filepaths,
            to = paste0(destdir, "/", gistic2Date, "-",
                dataset, "-all_lesions.conf_99.txt"))

          delFodler <- file.path(destdir, dirname(fileList))
          unlink(delFodler, recursive = TRUE)
          file.remove(fileLoc)
        } else {
          tmpCNThreshhold = fread(paste0(destdir,"/",gistic2Date,"-",dataset,"-all_thresholded.by_genes.txt"),header=TRUE,colClasses = "character", data.table = FALSE)
          tmpCNAll = fread(paste0(destdir,"/",gistic2Date,"-",dataset,"-all_data_by_genes.txt"),header=TRUE,colClasses="character", data.table = FALSE)
          tmpPeaks = fread(paste0(destdir, "/", gistic2Date, "-", dataset, "-all_lesions.conf_99.txt"), header = TRUE, colClasses = "character", data.table = FALSE)
        }
        tmpReturn <- new("FirehoseGISTIC",Dataset=dataset,AllByGene=data.frame(tmpCNAll),
            ThresholdedByGene=data.frame(tmpCNThreshhold), Peaks = data.frame(tmpPeaks))
        resultClass@GISTIC <- tmpReturn
      }
    }
  }

  if (getUUIDs) {
    resultClass@BarcodeUUID <- .barcodeUUID(resultClass)
  }
  return(resultClass)
}

.setCache <- function(
    directory = tools::R_user_dir("RTCGAToolbox", "cache"),
    verbose = TRUE,
    ask = interactive()
) {
    stopifnot(
        is.character(directory),
        S4Vectors::isSingleString(directory),
        !is.na(directory)
    )

    if (!dir.exists(directory)) {
        if (ask) {
            qtxt <- sprintf(
                "Create RTCGAToolbox cache at \n    %s? [y/n]: ",
                directory
            )
            answer <- .getAnswer(qtxt, allowed = c("y", "Y", "n", "N"))
            if ("n" == answer)
                stop("RTCGAToolbox directory not created")
        }
        dir.create(directory, recursive = TRUE, showWarnings = FALSE)
    }

    if (verbose)
        message("RTCGAToolbox cache directory set to:\n    ", directory)
    invisible(directory)
}

.getAnswer <- function(msg, allowed) {
    if (interactive()) {
        repeat {
            message(msg)
            answer <- readLines(n = 1)
            if (answer %in% allowed)
                break
        }
        tolower(answer)
    } else {
        "n"
    }
}
mksamur/RTCGAToolbox documentation built on Dec. 22, 2024, 7:11 p.m.