R/cBioPortalData.R

Defines functions cBioPortalData update.args eval.args match.args std.args .updateRowData .getRagEx .portalExperiments

Documented in cBioPortalData

.portalExperiments <- function(
    api, by, genePanelId, genes, studyId, molecularProfileIds,
    sampleListId, sampleIds
) {
    expers <- getDataByGenes(api, genes = genes, genePanelId = genePanelId,
        studyId = studyId, molecularProfileIds = molecularProfileIds,
        sampleListId = sampleListId, sampleIds = sampleIds, by = by)

    sampmap <- lapply(expers, function(x) {
        if (length(x)) {
            smap <- x[, c("molecularProfileId", "patientId", "sampleId")]
            names(smap) <- c("assay", "primary", "colname")
            smap[["assay"]] <- factor(smap[["assay"]])
            smap[!duplicated(smap), ]
        } else {
            tibble::tibble(assay = character(0L), primary = character(0L),
                colname = character(0L))
        }
    })
    sampleMap <- dplyr::bind_rows(sampmap)

    experlist <- lapply(setNames(nm = names(expers)),
        function(molprof) {
            byGene <- expers[[molprof]]
            isMut <- grepl("mutation", molprof, ignore.case = TRUE)
            if (isMut)
                colsOI <- c(by, "chr", "startPosition", "endPosition",
                    "ncbiBuild", "sampleId", "mutationType")
            else
                colsOI <- c(by, "sampleId", "value")
            if (length(byGene)) {
                colsoi <- colsOI[colsOI %in% names(byGene)]

                if (isMut) {
                    res <- tidyr::pivot_wider(byGene[, colsoi],
                        names_from = "sampleId",
                        values_from = "mutationType",
                        values_fn = list(mutationType =
                            function(x) paste0(x, collapse = ";")
                        )
                    )
                    .getMutationData(res, by)
                } else {
                    res <- tidyr::pivot_wider(byGene[, colsoi],
                        names_from = "sampleId",
                        values_from = "value"
                    )
                    .getMixedData(res, by)
                }
            } else {
                SummarizedExperiment::SummarizedExperiment()
            }
        }
    )
    experlist <- as(Filter(length, experlist), "List")

    isTCGA <- grepl("tcga", studyId, ignore.case = TRUE)

    metalist <- lapply(names(experlist), function(molprof) {
        isMut <- grepl("mutation", molprof, ignore.case = TRUE)
        byGene <- expers[[molprof]]
        if (isMut) {
            colsOI <- c(by, "chr", "startPosition", "endPosition",
                "ncbiBuild", "sampleId", "mutationType")
            metaGene <- byGene[, !names(byGene) %in% colsOI]
            if (isTCGA) {
                ragex <- .getRagEx(byGene)
                experlist <<- c(experlist, list(proteinPos = ragex))
            }
        } else {
            colsOI <- c(by, "sampleId", "value")
            metaGene <- byGene[, !names(byGene) %in% colsOI]
            experlist <<- .updateRowData(metaGene, by, molprof, experlist)
        }
        metaGene
    })

    list(
        sampleMap = as(sampleMap, "DataFrame"),
        experiments = experlist,
        metadata = metalist
    )
}

.getRagEx <- function(metainfo, byname, assayname, explist) {
    ncbiBuildCol <- grep("ncbiBuild", names(metainfo), value = TRUE,
        fixed = TRUE)
    rangeSet <- cbind(
        metainfo[["proteinPosStart"]], metainfo[["proteinPosEnd"]]
    )
    metainfo[["proteinPosStart"]] <- apply(rangeSet, 1L, min, na.rm = TRUE)
    metainfo[["proteinPosEnd"]] <- apply(rangeSet, 1L, max, na.rm = TRUE)
    splitframe <- GenomicRanges::makeGRangesListFromDataFrame(metainfo,
        split.field = "sampleId", start.field = "proteinPosStart",
        end.field = "proteinPosEnd", keep.extra.columns = TRUE)
    ptIds <- TCGAutils::TCGAbarcode(names(splitframe))
    rex <- RaggedExperiment::RaggedExperiment(splitframe, colData =
        S4Vectors::DataFrame(row.names = ptIds)
    )
    if (length(ncbiBuildCol))
        genome(rex) <- TCGAutils::uniformBuilds(metainfo[[ncbiBuildCol]])
    rex
}

.updateRowData <- function(metainfo, byname, assayname, explist) {
    newby <- grep(byname, x = c("hugoGeneSymbol", "entrezGeneId"), value = TRUE,
        fixed = TRUE, invert = TRUE)
    stopifnot(is.character(newby), length(newby) == 1L)
    if (length(newby)) {
        exptoupdate <- explist[[assayname]]
        altNames <- unique(metainfo[[newby]])
        allName <- if (!is.null(altNames)) {
            all.equal(
                altNames,
                Reduce(intersect, split(metainfo[[newby]], metainfo[["patientId"]]))
            )
        } else { FALSE }
        if (isTRUE(allName)) {
            altDF <- DataFrame(altNames)
            names(altDF) <- newby
            rowData(exptoupdate) <- altDF
            explist[[assayname]] <- exptoupdate
        }
    }
    explist
}

std.args <- function(call, formals) {
    callargs <- as.list(call)[-1]
    toadd <- setdiff(names(formals), names(callargs))
    call[toadd] <- formals[toadd]
    call
}

match.args <- function(fun, call, ...) {
    funfor <- formals(fun)
    exargs <- intersect(names(funfor), names(call))
    c(as.list(call)[-1][exargs], ...)
}

eval.args <- function(args) {
    toeval <- !names(args) %in% c("api", "idConvert", "studyId")
    evalargs <- lapply(args[toeval], eval)
    stud <- dynGet("studyId")
    args[["studyId"]] <- stud
    api <- dynGet("api")
    args[["api"]] <- api
    args[toeval] <- evalargs
    args
}

update.args <- function(args) {
    molecularProfileIds <- args[["molecularProfileIds"]]
    studyId <- args[["studyId"]]
    api <- args[["api"]]
    if (is.null(molecularProfileIds)) {
        molProfs <- molecularProfiles(api, studyId)
        ## data type not working yet
        ## https://github.com/cBioPortal/cbioportal/issues/7816
        args[["molecularProfileIds"]] <- molProfs[
            molProfs[["molecularAlterationType"]] != "STRUCTURAL_VARIANT",
            "molecularProfileId",
            drop = TRUE
        ]
    }

    args[["molecularProfileIds"]] <-
        setNames(nm = args[["molecularProfileIds"]])

    args
}

#' Download data from the cBioPortal API
#'
#' Obtain a `MultiAssayExperiment` object for a particular gene panel,
#' `studyId`, `molecularProfileIds`, and `sampleListIds` combination. Default
#' `molecularProfileIds` and `sampleListIds` are set to NULL for including all
#' data. This option is best for users who wish to obtain a section of the
#' study data that pertains to a specific molecular profile and gene panel
#' combination. For users looking to download the entire study data as provided
#' by the \url{https://www.cbioportal.org/datasets}, refer to `cBioDataPack`.
#'
#' @details We are able to succesfully represent 98 percent of the study
#'     identifiers as `MultiAssayExperiment` objects as obtained via
#'     `cBioPortalData` with the `IMPACT341` `genePanelId` as the example
#'     gene panel. Datasets that currently fail to import
#'     can be seen in the `getStudies(..., buildReport = TRUE)` dataset
#'     under the `"api_build"` column.
#'     Note that changes to the cBioPortal API may affect this rate at any
#'     time. If you encounter any issues, please open a GitHub issue at the
#'     \url{https://github.com/waldronlab/cBioPortalData/issues/} page with
#'     a fully reproducible example.
#'
#' @inheritParams cBioPortal
#'
#' @param check_build logical(1L) Whether to check the build status of the
#'     `studyId` using an internal dataset. This argument should be set to
#'     `FALSE` if using alternative `hostnames`, e.g.,
#'     'pedcbioportal.kidsfirstdrc.org'
#'
#' @param ask logical(1) Whether to prompt the the user before downloading and
#'   loading study `MultiAssayExperiment` that is not currently building based
#'   on previous testing. Set to `interactive()` by default. In a
#'   non-interactive session, data download will be attempted; equivalent to
#'   `ask = FALSE`. The argument will also be used when a cache directory needs
#'   to be created when using `downloadStudy`.
#'
#' @md
#'
#' @examples
#'
#' cbio <- cBioPortal()
#'
#' samps <- samplesInSampleLists(cbio, "acc_tcga_rppa")[[1]]
#'
#' getGenePanelMolecular(
#'     cbio, molecularProfileIds = c("acc_tcga_rppa", "acc_tcga_linear_CNA"),
#'     samps
#' )
#'
#' acc_tcga <- cBioPortalData(
#'     cbio, by = "hugoGeneSymbol",
#'     studyId = "acc_tcga",
#'     genePanelId = "AmpliSeq",
#'     molecularProfileIds =
#'         c("acc_tcga_rppa", "acc_tcga_linear_CNA", "acc_tcga_mutations")
#' )
#'
#' @return A \linkS4class{MultiAssayExperiment} object
#'
#' @seealso \link{cBioDataPack}, \link{removeDataCache}
#'
#' @export
cBioPortalData <-
    function(api, studyId = NA_character_,
        genePanelId = NA_character_,
        genes = NA_character_,
        molecularProfileIds = NULL,
        sampleListId = NULL,
        sampleIds = NULL,
        by = c("entrezGeneId", "hugoGeneSymbol"),
        check_build = TRUE,
        ask = interactive()
    )
{
    if (missing(api))
        stop("Provide a valid 'api' from 'cBioPortal()'")

    by <- match.arg(by)

    formals <- formals()
    formals[["by"]] <- by
    call <- std.args(match.call(), formals)
    exargs <- match.args(.portalExperiments, call)
    exargs <- eval.args(exargs)
    exargs <- update.args(exargs)

    if (check_build)
        .is_study_id_building(exargs[["studyId"]], "api_build", ask = ask)

    lists <- do.call(.portalExperiments, exargs)

    clinargs <- match.args(clinicalData, call)
    clinargs <- eval.args(clinargs)
    clin <- do.call(clinicalData, clinargs)
    clin <- as(clin, "DataFrame")

    # resolve duplicate IDs
    if (anyDuplicated(clin[["patientId"]])) {
        mets <- clin[duplicated(clin[["patientId"]]), ]
        metadata(clin) <- list(duplicated = mets)
        clin <- clin[!duplicated(clin[["patientId"]]), ]
    }
    rownames(clin) <- clin[["patientId"]]

    lists[["colData"]] <- clin
    do.call(MultiAssayExperiment, lists)
}
waldronlab/cBioPortalData documentation built on Nov. 4, 2024, 9:15 a.m.