R/cBioDataPack.R
In cBioPortalData: Exposes and makes available data from the cBioPortal web resources

Documented in cBioDataPack downloadStudy loadStudy untarStudy

# previously http://download.cbioportal.org
.url_location <- "https://cbioportal-datahub.s3.amazonaws.com"

getRelevantFilesFromStudy <- function(filelist) {
    ## Remove files that are corrupt / hidden (start with ._)
    datafiles <- grep(x = filelist, pattern = "data.*\\.(txt|seg)$",
        value = TRUE)
    datafiles <- c(datafiles, grep("meta_study", filelist, value = TRUE),
        grep("/LICENSE", filelist, value = TRUE))
    datafiles
}

cbioportal2metadata <- function(meta_file, lic_file) {
    if (!length(meta_file) & !length(lic_file))
        return(list())
    md <- readLines(meta_file, warn = FALSE)
    mdl <- lapply(seq_along(md), function(i) {
        sub(".+: ", "", md[[i]])
    })
    names(mdl) <- sub(":.+", "", md)
    if (length(lic_file)) {
        lic <- readLines(lic_file, warn = FALSE)
        lic <- paste0(lic[lic != ""], collapse = "\n")
        lic <- list(LICENSE = lic)
    }
    c(mdl, if (exists("lic")) lic)
}

.subBCLetters <- function(df, ptID = "PATIENT_ID") {
    idVector <- df[[ptID]]
    allBC <- all(grepl("[A-Z]{4}.[0-9]{2}.[0-9]{4}", idVector))
    noTCGAstart <- is.character(idVector) && !all(startsWith(idVector, "TCGA"))
    if (allBC && noTCGAstart) {
        idVector <- gsub("^[A-Z]{4}", "TCGA", idVector)
        df[[ptID]] <- idVector
    }
    df
}

.silentRead <- function(file, comm = "#", mxlines = Inf, ...) {
    suppressMessages({
        readr::read_tsv(
            file, comment = comm, n_max = mxlines, progress = FALSE, ...
        )
    })
}

.processMeta <- function(clinmeta) {
    cnames <- unlist(unname(clinmeta[5L, ]))
    clinmeta <- clinmeta[-c(3L:5L), ]
    clinmeta <- t(clinmeta)
    clinmeta <- sub("^\\#", "", clinmeta)
    colnames(clinmeta) <- c("column", "definition")
    res <- lapply(setNames(seq_along(cnames), cnames), function(i) {
        clinmeta[i, ]
    })
    as(res, "DataFrame")
}

.getClinMeta <- function(clinfiles) {
    allmeta <- lapply(setNames(nm = clinfiles), function(x) {
        .silentRead(x, comm = "", mxlines = 5L, col_names = FALSE)
    })
    lapply(allmeta, .processMeta)
}

.readAll <- function(namedlist) {
    lapply(setNames(nm = names(namedlist)), function(x)
        .silentRead(x)
    )
}

.readSeparateMerge <- function(datalist) {
    alldata <- .readAll(datalist)
    Reduce(function(x, y) {
        merge(x, y, all = TRUE)
    }, alldata)
}

cbioportal2clinicaldf <- function(files) {
    if (length(files) > 1) {
        mappers <- lapply(setNames(nm = files), function(file)
            .whichMappers(.silentRead(file, mxlines = 5L))
        )
        hasMappers <- lengths(mappers) == 2L
        if (any(hasMappers)) {
            combdata <- mappers[hasMappers]
            clindata <- .readSeparateMerge(combdata)
        }
        ## try merge single mapper data to bigger merged
        singleCols <- lengths(mappers) == 1L
        if (all(singleCols)) {
            clindata <- .readSeparateMerge(mappers[singleCols])
        } else if (any(singleCols)) {
            singles <- .readAll(mappers[singleCols])
            clindata <- Reduce(function(x, y) {
                merge(x, y, all = TRUE)
            }, c(list(clindata), singles))
        }
    } else {
        clindata <- .silentRead(file, mxlines = 5L)
    }
    clinmeta <- .getClinMeta(files)
    clindata <- as(clindata, "DataFrame")
    metadata(clindata) <- clinmeta

    clindata <- .subBCLetters(clindata)
    rownames(clindata) <- clindata[["PATIENT_ID"]]
    clindata
}

.validStudyID <- function(cancer_study_id) {

    if (missing(cancer_study_id))
        stop("Provide a valid 'cancer_study_id' from 'studiesTable'")

    stopifnot(is.character(cancer_study_id),
        !is.na(cancer_study_id), length(cancer_study_id) == 1L)

    cancer_study_id <- tolower(cancer_study_id)
    ## Load dataset to envir
    loc_data <- new.env(parent = emptyenv())
    data("studiesTable", envir = loc_data, package = "cBioPortalData")
    studiesTable <- loc_data[["studiesTable"]]

    ## Ensure study ID is valid
    inTable <- cancer_study_id %in% studiesTable[["cancer_study_id"]]

    if (!inTable)
        stop("Study identifier not found in look up table")
    else
        inTable
}

.download_data_file <-
    function(fileURL, cancer_study_id, verbose = FALSE, force = FALSE)
{
    bfc <- .get_cache()
    rid <- bfcquery(bfc, cancer_study_id, "rname", exact = TRUE)$rid
    if (!length(rid)) {
        rid <- names(bfcadd(bfc, cancer_study_id, fileURL, download = FALSE))
    }
    if (!.cache_exists(bfc, cancer_study_id) || force) {
        if (verbose)
            message("Downloading study file: ", cancer_study_id, ".tar.gz")
            bfcdownload(bfc, rid, ask = FALSE)
    } else
        message("Study file in cache: ", cancer_study_id)

    bfcrpath(bfc, rids = rid)
}

.manageLocalFile <- function(cancer_study_id, inpath) {
    bfc <- .get_cache()
    rid <- bfcquery(bfc, cancer_study_id, "rname", exact = TRUE)$rid
    if (!length(rid))
        stop("Can't update non-existing cache item")

    cachedir <- bfccache(bfc)
    finalname <- paste0(gsub("file", "", basename(tempfile())), "_",
        cancer_study_id, ".tar.gz")
    fileLoc <- file.path(cachedir, finalname)
    file.copy(inpath, fileLoc)

    bfcupdate(bfc, rids = rid, rpath = fileLoc)

    file.remove(inpath)

    bfcrpath(bfc, rids = rid)
}

.altDownload <- function(fileURL, cancer_study_id, verbose = FALSE) {
    if (verbose)
        message("Downloading study file: ", cancer_study_id, ".tar.gz")

    tmpFile <- file.path(tempdir(), paste0(cancer_study_id, ".tar.gz"))
    utils::download.file(fileURL, destfile = tmpFile, quiet = TRUE,
        method = "wget")

    .manageLocalFile(cancer_study_id, tmpFile)
}

#' @name downloadStudy
#'
#' @title Manually download, untar, and load study tarballs
#'
#' @description **Note** that these functions should be used when a particular
#' study is _not_ currently available as a `MultiAssayExperiment`
#' representation. Otherwise, use `cBioDataPack`. Provide a `cancer_study_id`
#' from the `studiesTable` and retrieve the study tarball from cBioPortal.
#' These functions are used by `cBioDataPack` under the hood to download,
#' untar, and load the tarball datasets with caching. As stated in
#' `?cBioDataPack`, not all studies are currently working as
#' `MultiAssayExperiment` objects. As of July 2020, about ~80% of
#' datasets can be successfully imported into the `MultiAssayExperiment` data
#' class. Please open an issue if you would like the team to prioritize a
#' study. You may also check `studiesTable$pack_build` for a more current
#' status.
#'
#' @param cancer_study_id character(1) The study identifier from cBioPortal as
#' in \url{https://cbioportal.org/webAPI}
#'
#' @param use_cache logical(1) (default TRUE) create the default cache location
#' and use it to track downloaded data. If data found in the cache, data will
#' not be re-downloaded. A path can also be provided to data cache location.
#'
#' @param force logical(1) (default FALSE) whether to force re-download data from
#' remote location
#'
#' @param url_location character(1)
#' (default "https://cbioportal-datahub.s3.amazonaws.com") the URL location for
#' downloading packaged data. Can be set using the 'cBio_URL' option (see
#' `?cBioDataPack` for more details)
#'
#' @param names.field A character vector of possible column names for the column
#' that is used to label ranges from a mutations or copy number file.
#'
#' @param cancer_study_file character(1) indicates the on-disk location
#' of the downloaded tarball
#'
#' @param exdir character(1) indicates the folder location to *put*
#' the contents of the tarball (default `tempdir()`; see also `?untar`)
#'
#' @param filepath character(1) indicates the folder location where
#' the contents of the tarball are *located* (usually the same as `exdir`)
#'
#' @return \itemize{
#'   \item {downloadStudy - The file location of the data tarball}
#'   \item {untarStudy - The directory location of the contents}
#'   \item {loadStudy - A MultiAssayExperiment-class object}
#' }
#'
#' @md
#'
#' @seealso \link{cBioDataPack}, \linkS4class{MultiAssayExperiment}
#'
#' @examples
#'
#' (acc_file <- downloadStudy("acc_tcga"))
#'
#' (file_dir <- untarStudy(acc_file, tempdir()))
#'
#' loadStudy(file_dir)
#'
#' @export
downloadStudy <- function(cancer_study_id, use_cache = TRUE, force = FALSE,
    url_location = getOption("cBio_URL", .url_location))
{
    .validStudyID(cancer_study_id)

    url_file <- file.path(url_location, paste0(cancer_study_id, ".tar.gz"))

    if (is.character(use_cache) && length(use_cache) == 1L)
        cBioCache(directory = use_cache)
    else if (isTRUE(use_cache))
        cBioCache()
    else
        stop("Use 'setCache' or specify a download location")

    tryCatch({
        .download_data_file(url_file, cancer_study_id, verbose = TRUE,
            force = force)
        },
        error = function(cond) {
            message("\n", cond)
            message("\nRetrying download with alternative function...")
            .altDownload(url_file, cancer_study_id, verbose = TRUE)
        }
    )
}

#' @rdname downloadStudy
#'
#' @export
untarStudy <- function(cancer_study_file, exdir = tempdir()) {
    exarg <- if (identical(.Platform$OS.type, "unix") &&
        Sys.info()["sysname"] != "Darwin")
        "--warning=no-unknown-keyword" else NULL

    filelist <- untar(cancer_study_file, list = TRUE, extras = exarg)
    filelist <- gsub("^\\.\\/", "", filelist)
    filekeepind <- grep("^\\._", basename(filelist), invert = TRUE)
    filelist <- filelist[filekeepind]
    datafiles <- getRelevantFilesFromStudy(filelist)

    folder <- basename(cancer_study_file)
    exdir <- file.path(exdir, gsub(".tar.gz", "", folder))
    if (!dir.exists(exdir))
        dir.create(exdir)

    untar(cancer_study_file, files = datafiles, exdir = exdir, extras = exarg)
    exdir
}

#' @rdname downloadStudy
#'
#' @export
loadStudy <-
    function(
        filepath, names.field = c("Hugo_Symbol", "Entrez_Gene_Id", "Gene")
    )
{
    datafiles <- getRelevantFilesFromStudy(
        list.files(filepath, recursive = TRUE)
    )

    exptfiles <- file.path(filepath,
        grep("clinical|study|LICENSE|fusion|gistic", datafiles, invert = TRUE,
            value = TRUE))
    clinicalfiles <- file.path(filepath,
        grep("clinical", datafiles, value = TRUE))
    mdatafile <- file.path(filepath,
        grep("meta_study", datafiles, value = TRUE))
    licensefile <- file.path(filepath,
        grep("/LICENSE", datafiles, value = TRUE))
    fusionExtra <- file.path(filepath, grep("fusion", datafiles,
        value = TRUE, ignore.case = TRUE))
    gisticExtra <- file.path(filepath, grep("gistic", datafiles,
        value = TRUE, ignore.case = TRUE))

    expnames <- sub(".*data_", "", sub("\\.txt", "", basename(exptfiles)))
    expseq <- seq_along(exptfiles)
    names(expseq) <- expnames

    coldata <- cbioportal2clinicaldf(clinicalfiles)

    exptlist <- lapply(expseq, function(i, files, xpnames) {
        fname <- files[[i]]
        message(paste0("Working on: ", fname))
        dat <- utils::read.delim(
            fname, sep = "\t", comment.char = "#", stringsAsFactors = FALSE,
            check.names = FALSE
        )
        dat <- .cleanHugo(dat)
        dat <- .cleanStrands(dat)
        dat <- .standardizeBuilds(dat)

        names.field <- .findValidNames(dat, names.field)
        names.field <- .findUniqueField(dat, names.field)
        names.field <- .findMinDupField(dat, names.field)

        dat <- as(dat, "DataFrame")
        if (!RTCGAToolbox:::.hasExperimentData(dat, coldata[["PATIENT_ID"]]))
            return(dat)
        cexp <- xpnames[[i]]
        if (grepl("meth", cexp)) {
            .getMixedData(dat, names.field)
        } else {
            .biocExtract(dat, names.field, coldata[["PATIENT_ID"]])
        }
    }, files = exptfiles, xpnames = expnames)

    names(exptlist) <-
        sub(".*data_", "", sub("\\.txt", "", basename(exptfiles)))

    .checkNonExpData <- function(exp) {
        is(exp, "GRanges") || is(exp, "DataFrame")
    }

    metadats <- Filter(.checkNonExpData, exptlist)
    exptlist <- Filter(function(expt) {!.checkNonExpData(expt)}, exptlist)
    mdat <- cbioportal2metadata(mdatafile, licensefile)

    if (length(fusionExtra))
        fudat <- list(Fusion = .silentRead(fusionExtra))
    else
        fudat <- list()

    if (length(gisticExtra))
        gist <- lapply(gisticExtra, function(x) {
            gfile <- .silentRead(x)
            .getGisticData(gfile)
        })
    else
        gist <- list()

    mdat <- c(mdat, metadats, fudat, gist)
    exptlist <- MultiAssayExperiment::ExperimentList(exptlist)

    if (any(.TCGAcols(coldata))) {
        gmap <- TCGAutils::generateMap(exptlist, coldata,
            TCGAutils::TCGAbarcode)
    } else if (.hasMappers(coldata)) {
        gmap <- TCGAutils::generateMap(exptlist, coldata,
            sampleCol = "SAMPLE_ID", patientCol = "PATIENT_ID")
    } else {
        stop("Experiment data could not be mapped to colData")
    }

    MultiAssayExperiment(experiments = exptlist,
        colData = coldata, sampleMap = gmap, metadata = mdat)
}

#' @name cBioDataPack
#'
#' @title Obtain pre-packaged data from cBioPortal and represent as
#' a MultiAssayExperiment object
#'
#' @description The `cBioDataPack` function allows the user to
#' download and process cancer study datasets found in MSKCC's cBioPortal.
#' Output datasets use the \linkS4class{MultiAssayExperiment} data
#' representation to faciliate analysis and data management operations.
#'
#' @details The list of datasets can be found in the `studiesTable` dataset
#' by doing `data("studiesTable")`. Some datasets may not be available
#' for download and are not guaranteed to be represented as MultiAssayExperiment
#' data objects. After taking a random sample of 100
#' (using \code{set.seed(1234)}), we were able to succesfully represent about
#' 76 percent of the study identifiers as MultiAssayExperiment objects. Please
#' refer to the #' \href{http://cbioportal.org/data_sets.jsp}{website} for the
#' full list of available datasets. Users who would like to prioritize
#' particular datasets should open GitHub issues at the URL in the `DESCRIPTION`
#' file. For a more fine-grained approach to downloading data from the
#' cBioPortal API, refer to the `cBioPortalData` function.
#'
#' @section cBio_URL:
#' The `cBioDataPack` function accesses data from the `cBio_URL` option.
#' By default, it points to an Amazon S3 bucket location. Previously, it
#' pointed to 'http://download.cbioportal.org'. This recent change
#' (> 2.1.17) should provide faster and more reliable downloads for all users.
#' See the URL using `cBioPortalData:::.url_location`. This can be changed
#' if there are mirrors that host this data by setting the `cBio_URL` option
#' with `getOption("cBio_URL", "https://some.url.com/")` before running the
#' function.
#'
#' @inheritParams downloadStudy
#'
#' @param names.field A character vector of possible column names for the column
#' that is used to label ranges from a mutations or copy number file.
#'
#' @param ask A logical vector of length one indicating whether to prompt the
#' the user before downloading and loading study `MultiAssayExperiment`. If
#' TRUE, the user will be prompted to continue for studies that are not
#' currently building as `MultiAssayExperiment` based on previous testing
#' (in a non-interactive session, no data will be downloaded and built unless
#' `ask = FALSE`).
#'
#' @return A \linkS4class{MultiAssayExperiment} object
#'
#' @seealso \url{https://www.cbioportal.org/datasets}, \link{cBioPortalData}
#'
#' @author Levi Waldron, Marcel R., Ino dB.
#' @include utils.R
#'
#' @md
#'
#' @examples
#'
#' data(studiesTable)
#'
#' head(studiesTable[["cancer_study_id"]])
#'
#' # ask=FALSE for non-interactive use
#' mae <- cBioDataPack("acc_tcga", ask = FALSE)
#'
#' @export
cBioDataPack <- function(cancer_study_id, use_cache = TRUE,
    names.field = c("Hugo_Symbol", "Entrez_Gene_Id", "Gene"), ask = TRUE) {

    denv <- new.env(parent = emptyenv())
    data("studiesTable", package = "cBioPortalData", envir = denv)
    studiesTable <- denv[["studiesTable"]]

    intable <- studiesTable[["cancer_study_id"]] %in% cancer_study_id
    if (!any(intable))
        stop("'cancer_study_id', ", cancer_study_id, ", not found.",
            " See 'data(\"studiesTable\")'.")

    builds <- studiesTable[["pack_build"]]
    hasbuilt <- unlist(builds[intable])

    if (!hasbuilt && any(builds)) {
        qtxt <- sprintf(
            paste0("Based on our tests, '%s' is not currently building.",
                "\n Proceed anyway? [y/n]: "),
            cancer_study_id
        )
        if (ask && .getAnswer(qtxt, allowed = c("y", "Y", "n", "N")) == "n")
            stop("'", cancer_study_id, "' is not yet supported.",
                " \n Use 'downloadStudy()' to obtain the study files.")
    }

    cancer_study_file <- downloadStudy(cancer_study_id, use_cache)
    exdir <- untarStudy(cancer_study_file)
    loadStudy(exdir, names.field)
}

Any scripts or data that you put into this service are public.

cBioPortalData documentation built on April 17, 2021, 6:07 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

cBioPortalData
Exposes and makes available data from the cBioPortal web resources

R/cBioDataPack.R
In cBioPortalData: Exposes and makes available data from the cBioPortal web resources

Defines functions cBioDataPack loadStudy untarStudy downloadStudy .altDownload .manageLocalFile .download_data_file .validStudyID cbioportal2clinicaldf .readSeparateMerge .readAll .getClinMeta .processMeta .silentRead .subBCLetters cbioportal2metadata getRelevantFilesFromStudy

Documented in cBioDataPack downloadStudy loadStudy untarStudy

Try the cBioPortalData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

cBioPortalData Exposes and makes available data from the cBioPortal web resources

R/cBioDataPack.R In cBioPortalData: Exposes and makes available data from the cBioPortal web resources

Defines functions cBioDataPack loadStudy untarStudy downloadStudy .altDownload .manageLocalFile .download_data_file .validStudyID cbioportal2clinicaldf .readSeparateMerge .readAll .getClinMeta .processMeta .silentRead .subBCLetters cbioportal2metadata getRelevantFilesFromStudy

Documented in cBioDataPack downloadStudy loadStudy untarStudy

Try the cBioPortalData package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

cBioPortalData
Exposes and makes available data from the cBioPortal web resources

R/cBioDataPack.R
In cBioPortalData: Exposes and makes available data from the cBioPortal web resources