.assaysAvailable <- function(listfiles) {
slots <- c("metadata", "colData", "sampleMap")
assaysAvailable <-
vapply(strsplit(gsub("(^[A-Z]*_)(.*)", "\\2", listfiles), "-"),
`[[`, character(1L), 1L)
assaysAvailable <- unique(assaysAvailable)
sort(assaysAvailable[!assaysAvailable %in% slots])
}
.removeExt <- function(fileNames) {
gsub("\\.[RrHh][Dd5][AaSs]?$", "", fileNames)
}
.getComboSort <- function(...) {
sort(apply(expand.grid(..., stringsAsFactors = FALSE), MARGIN = 1L,
FUN = paste, collapse = "_"))
}
.conditionToIndex <- function(FUN, reference, position) {
reference <-
vapply(strsplit(reference, "-"), utils::head, character(1L), 1L)
logmat <- vapply(position, FUN, logical(length(reference)), x = reference)
apply(logmat, 1L, any)
}
.loadMethyl <- function(ehub, methylpaths, verbose) {
fact <- gsub("_assays\\.[Hh]5|_se\\.[Rr][Dd][Ss]", "", methylpaths)
methList <- split(sort(methylpaths), fact)
fnames <- basename(unique(fact))
names(methList) <- fnames
lapply(methList, function(methfile, fn) {
if (verbose)
message("Loading: ", paste(fn, collapse = ",\n "))
assaydat <- query(ehub, methfile[1L])[[1L]]
se <- query(ehub, methfile[2L])[[1L]]
h5array <- HDF5Array::HDF5Array(assaydat, "assay001")
SummarizedExperiment::`assays<-`(
x = se, withDimnames = FALSE,
value = list(counts = h5array)
)
}, fn = names(methList))
}
#' @importFrom methods is
.checkRaggedExperiment <- function(reslist) {
reclass <- vapply(reslist, is, logical(1L), "RaggedExperiment")
if (any(reclass)) {
if (!requireNamespace("RaggedExperiment", quietly = TRUE))
stop("Install 'RaggedExperiment' to load these data.")
}
}
.getResources <- function(ExperimentHub, resTable, verbose) {
fileNames <- stats::setNames(resTable[["RDataPath"]], resTable[["Title"]])
anyMeth <- grepl("Methyl", fileNames, ignore.case = TRUE)
resources <- lapply(fileNames[!anyMeth], function(res) {
if (verbose)
message(
"Querying and downloading: ",
basename(tools::file_path_sans_ext(res))
)
query(ExperimentHub, res)[[1L]]
})
if (any(anyMeth))
resources <- c(resources,
.loadMethyl(ExperimentHub, fileNames[anyMeth], verbose))
.checkRaggedExperiment(resources)
resources
}
.searchFromInputs <- function(glob, searchFields) {
regGlob <- glob2rx(unique(glob))
res <- unlist(lapply(regGlob, function(x) {
grep(x, searchFields, ignore.case = TRUE, value = TRUE)
}))
if (!length(res))
stop("No matches found, modify search criteria")
res
}
.resolveNames <- function(datList) {
commonNames <- Reduce(intersect, lapply(datList, names))
mLogic <- vapply(commonNames, function(y) {
classes <- unlist(lapply(datList, function(x) class(x[[y]])))
length(unique(classes)) == 1L
}, logical(1L))
unMergeable <- names(which(!mLogic))
lapply(seq_along(datList), function(i, x) {
varIdx <- match(unMergeable, names(x[[i]]))
DFnames <- names(x[[i]])
DFnames[varIdx] <- paste0(unMergeable, ".", i)
names(x[[i]]) <- DFnames
x[[i]]
}, x = datList)
}
.test_eh <- function(...) {
tryCatch({
ExperimentHub(...)
}, error = function(e) {
emsg <- conditionMessage(e)
if (grepl("Timeout", emsg))
warning("[experimenthub.bioconductor.org] timeout, localHub=TRUE",
call.=FALSE)
ExperimentHub(..., localHub = TRUE)
})
}
.queryResources <- function(ExperimentHub, resTable, verbose) {
fileNames <- stats::setNames(resTable[["RDataPath"]], resTable[["Title"]])
lapply(fileNames, function(res) {
if (verbose)
message(
"Querying EH with: ", basename(tools::file_path_sans_ext(res))
)
query(ExperimentHub, res)
})
}
.getResourceInfo <- function(ExperimentHub, resTable, verbose) {
infos <- .queryResources(ExperimentHub, resTable, verbose)
resID <- vapply(infos, names, character(1L))
restab <- AnnotationHub::getInfoOnIds(ExperimentHub, resID)
restab <-
restab[, !names(restab) %in% c("fetch_id", "status", "biocversion")]
sizes <- as.numeric(restab[["file_size"]])
class(sizes) <- "object_size"
restab <- as.data.frame(append(
restab,
list(file_size = format(sizes, units = "Mb")),
which(names(restab) == "title")
))
restab[, -length(restab)]
}
.filterRecent <- function(metadf) {
metabiocver <- metadf[["BiocVersion"]]
if (identical(length(unique(metabiocver)), 1L))
return(metadf)
if (nzchar(system.file(package = "BiocVersion")))
biocver <- utils::packageVersion("BiocVersion")[, 1:2]
else
stop("Bioconductor version not found; see BiocManager vignette")
recentver <- metabiocver == biocver
iscurr <- any(recentver)
if (!iscurr)
recentver <- metabiocver == max(as.package_version(unique(metabiocver)))
metadf[recentver, ]
}
.genSampMap <- function(expname, colnms) {
assays <- factor(rep(expname, lengths(colnms)))
allnames <- unlist(colnms, use.names = FALSE)
primaries <- .part_bcode(allnames)
S4Vectors::DataFrame(
assay = assays,
primary = primaries,
colname = allnames
)
}
.onTheFlySampleMap <- function(explist, peripherals) {
affected <- grepl("RNASeq2GeneNorm", names(explist), fixed = TRUE)
odl <- explist[affected]
bydx <- split(odl, gsub("RNASeq2GeneNorm", "sampleMap", names(odl)))
sampmaps <- lapply(bydx, function(dxdata) {
assayName <- names(dxdata)
cnames <- colnames(dxdata)
.genSampMap(assayName, cnames)
})
nmaps <- Map(function(x, y) {
x <- x[x[["assay"]] != levels(y[["assay"]]), ]
rbind(x, y)
}, x = peripherals[names(sampmaps)], y = sampmaps)
peripherals[names(nmaps)] <- nmaps
peripherals
}
.VALID_VERSIONS <- c("1.1.38", "2.0.1", "2.1.0", "2.1.1")
.VALID_VERSIONS_DISPLAY <- paste(.VALID_VERSIONS, collapse = ", ")
#' Create a MultiAssayExperiment from specific assays and cohorts
#'
#' @description curatedTCGAData assembles data on-the-fly from ExperimentHub
#' to provide cohesive \linkS4class{MultiAssayExperiment} container objects.
#' All the user has to do is to provide TCGA disease code(s) and assay types.
#' It is highly recommended to use the companion package `TCGAutils`,
#' developed to work with TCGA data specifically from `curatedTCGAData` and
#' some flat files.
#'
#' @details This function will check against available resources in
#' ExperimentHub. Only the latest runDate ("2016-01-28") is supported.
#' Use the \code{dry.run = FALSE} to download remote datasets and
#' build an integrative \linkS4class{MultiAssayExperiment} object.
#' For a list of 'diseaseCodes', see the \link{curatedTCGAData-package}
#' help page.
#'
#' @param diseaseCode character() A vector of TCGA cancer cohort codes
#' (e.g., `COAD`)
#'
#' @param assays character() A vector of TCGA assays, glob matches allowed;
#' see below for more details
#'
#' @param version character(1) One of `1.1.38`, `2.0.1`, `2.1.0`, or `2.1.1`
#' indicating the data version to obtain from `ExperimentHub`. Version `2.1.1`
#' includes various improvements as well as the addition of the `RNASeq2Gene`
#' assay and subtype updates. See `version` section details.
#'
#' @param dry.run logical(1) Whether to return the dataset names
#' before actual download (default TRUE)
#'
#' @param ... Additional arguments passed on to the
#' \code{\link[ExperimentHub:ExperimentHub-class]{ExperimentHub}}
#' constructor
#'
#' @param verbose logical(1) Whether to show the dataset currenlty being
#' (down)loaded (default TRUE)
#'
#' @section Available Assays:
#'
#' Below is a list of partial `ExperimentList` assay names and their respective
#' description. These assays can be entered as part of the \code{assays}
#' argument in the main function. Partial glob matches are allowed such as:
#' \code{'CN*'} for "CNASeq", "CNASNP", "CNVSNP" assays. Credit: Ludwig G.
#' \preformatted{
#'
#' ExperimentList data types Description
#' ----------------------------------------------------------------------------
#' SummarizedExperiment*
#' RNASeqGene Gene expression values
#' RNASeq2Gene RSEM TPM gene expression values
#' RNASeq2GeneNorm Upper quartile log2 normalized RSEM TPM gene
#' expression values
#' miRNAArray Probe-level miRNA expression values
#' miRNASeqGene Gene-level log2 RPM miRNA expression values
#' mRNAArray Unified gene-level mRNA expression values
#' mRNAArray_huex Gene-level mRNA expression values from Affymetrix
#' Human Exon Array
#' mRNAArray_TX_g4502a Gene-level mRNA expression values from Agilent
#' 244K Array
#' mRNAArray_TX_ht_hg_u133a Gene-level mRNA expression values from Affymetrix
#' Human Genome U133 Array
#' GISTIC_AllByGene Gene-level GISTIC2 copy number values
#' GISTIC_ThresholdedByGene Gene-level GISTIC2 thresholded discrete copy
#' number values
#' RPPAArray Reverse Phase Protein Array normalized protein
#' expression values
#' RangedSummarizedExperiment
#' GISTIC_Peaks GISTIC2 thresholded discrete copy number values
#' in recurrent peak regions
#' SummarizedExperiment with HDF5Array DelayedMatrix
#' Methylation_methyl27 Probe-level methylation beta values from Illumina
#' HumanMethylation 27K BeadChip
#' Methylation_methyl450 Probe-level methylation beta values from Infinium
#' HumanMethylation 450K BeadChip
#' RaggedExperiment
#' CNASNP Segmented somatic Copy Number Alteration calls
#' from SNP array
#' CNVSNP Segmented germline Copy Number Variant calls from
#' SNP Array
#' CNASeq Segmented somatic Copy Number Alteration calls
#' from low pass DNA Sequencing
#' Mutation* Somatic mutations calls
#' CNACGH_CGH_hg_244a Segmented somatic Copy Number Alteration calls
#' from CGH Agilent Microarray 244A
#' CNACGH_CGH_hg_415k_g4124a Segmented somatic Copy Number Alteration calls
#' from CGH Agilent Microarray 415K
#' * All can be converted to RangedSummarizedExperiment (except RPPAArray) with
#' TCGAutils
#'
#' }
#'
#' @section version:
#'
#' Version `2.1.1` provides a couple of corrections to the `colData` for ovarian
#' cancer (`OV`) and skin cancer (`SKCM`). In these new data, the cancer
#' subtype variables are fully available. One get obtain the mapping of columns
#' to subtypes in the `colData` with the `getSubtypeMap` function in
#' `TCGAutils`.
#'
#' Version `2.1.0` provides gene-level log2 RPM miRNA expression values for
#' `miRNASeqGene` data log2 normalized RSEM for `RNASeq2GeneNorm` assays.
#' Previously, the data provided were read counts and normalized counts,
#' respectively. See issue [#53 on
#' GitHub](https://github.com/waldronlab/curatedTCGAData/issues/53) for
#' additional details.
#'
#' The new version `2.0.1` includes various improvements including an
#' additional assay that provides `RNASeq2Gene` data as RSEM TPM gene
#' expression values (issue #38). Additional changes include genomic
#' information for `RaggedExperiment` type data objects where '37' is now
#' 'GRCh37' as reported in issue #40. Datasets (e.g., OV, GBM) that contain
#' multiple assays that could be merged are now provided as merged assays
#' (issue #27). We corrected an issue where `mRNAArray` assays were returning
#' `DataFrame`s instead of `matrix` type data (issue #31). Version `1.1.38`
#' provides the original run of `curatedTCGAData` and is provided due to legacy
#' reasons.
#'
#' @seealso curatedTCGAData-package
#'
#' @return a \linkS4class{MultiAssayExperiment} of the specified assays and
#' cancer codes or informative data.frame of resources when `dry.run` is `TRUE`
#'
#' @examples
#'
#' curatedTCGAData(
#' diseaseCode = c("GBM", "ACC"), assays = "CNASNP", version = "2.0.1"
#' )
#'
#' curatedTCGAData("BRCA", "GISTIC*", "2.0.1")
#'
#' @md
#'
#' @export curatedTCGAData
curatedTCGAData <-
function(
diseaseCode = "*", assays = "*", version,
dry.run = TRUE, verbose = TRUE, ...
)
{
runDate <- "20160128"
if (missing(version) || !version %in% .VALID_VERSIONS)
stop(
"'version' must be one of ", .VALID_VERSIONS_DISPLAY," ; ",
"see '?curatedTCGAData'"
)
assays_file <- system.file("extdata", "metadata.csv",
package = "curatedTCGAData", mustWork = TRUE)
assay_metadat <- read.csv(assays_file, stringsAsFactors = FALSE)
assay_metadat <-
assay_metadat[assay_metadat[["SourceVersion"]] == version, ]
assay_metadat <- .filterRecent(assay_metadat)
eh_assays <- assay_metadat[["ResourceName"]]
tcgaCodes <- sort(unique(gsub("(^[A-Z]*)_(.*)", "\\1", eh_assays)))
assaysAvail <- .assaysAvailable(eh_assays)
diseaseCode <- toupper(diseaseCode)
resultCodes <- .searchFromInputs(diseaseCode, tcgaCodes)
resultAssays <- .searchFromInputs(assays, assaysAvail)
codeAssay <- .getComboSort(resultCodes, resultAssays)
fileIdx <- .conditionToIndex(endsWith, eh_assays, codeAssay)
fileMatches <- assay_metadat[fileIdx, c("Title", "DispatchClass")]
if (!length(nrow(fileMatches)))
stop("Cancer and data type combination(s) not available")
eh <- .test_eh(...)
if (dry.run) {
message("See '?curatedTCGAData' for 'diseaseCode' and 'assays' inputs")
return(
.getResourceInfo(
eh, assay_metadat[fileIdx, c("Title", "RDataPath")], verbose
)
)
}
assay_list <- .getResources(
eh, assay_metadat[fileIdx, c("Title", "RDataPath")], verbose
)
eh_experiments <- ExperimentList(assay_list)
ess_names <- c("colData", "metadata", "sampleMap")
ess_idx <- .conditionToIndex(
startsWith, eh_assays, .getComboSort(resultCodes, ess_names)
)
ess_list <- .getResources(eh,
assay_metadat[ess_idx, c("Title", "RDataPath")], verbose)
hasRNAS2GN <- any(grepl("RNASeq2GeneNorm", names(assay_list)))
if (identical(version, "2.1.1") && hasRNAS2GN)
ess_list <- .onTheFlySampleMap(eh_experiments, ess_list)
if (length(resultCodes) > 1L) {
# Save metadata from all datasets
colDatIdx <- grepl("colData", names(ess_list), ignore.case = TRUE)
metas <- lapply(ess_list[colDatIdx], metadata)
metas <- do.call(c, metas)
ess_list <- lapply(seq_along(ess_names), function(i, grp, funs) {
grpd <- grepl(grp[i], names(ess_list), fixed = TRUE)
dats <- ess_list[grpd]
if (identical(funs[[i]], merge)) {
dats <- .resolveNames(dats)
mObj <- Reduce(function(x, y) {
merge(x, y, by = intersect(names(x), names(y)),
all = TRUE, sort = FALSE)
}, dats)
rownames(mObj) <- mObj[["patientID"]]
} else if (identical(funs[[i]], c)) {
mObj <- dats
} else {
mObj <- Reduce(funs[[i]], dats)
}
mObj
}, grp = ess_names, funs = list(merge, c, rbind))
names(ess_list) <- ess_names
## Include all metadata from colData(s)
metadata(ess_list[["colData"]]) <- metas
} else {
meta_idx <- grepl("metadata", names(ess_list), ignore.case = TRUE)
ess_list[meta_idx] <- list(metadata = ess_list[meta_idx])
names(ess_list) <- gsub("[A-Z]*_(.*)-[0-9]*", "\\1", names(ess_list))
}
MultiAssayExperiment(
experiments = eh_experiments,
colData = ess_list[["colData"]],
sampleMap = ess_list[["sampleMap"]],
metadata = ess_list[["metadata"]]
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.