#' @importFrom GenomicFeatures genes microRNAs
#' @importFrom GenomeInfoDb keepStandardChromosomes seqlevelsStyle
#' seqlevelsStyle<-
NULL
.checkHas <-
function(x, pattern = c("^hsa", "^cg", "symbols"), threshold = 0.9) {
if (identical(pattern, "symbols"))
pattern <- "^[A-Z0-9]{1,6}|^C[0-9]orf[0-9]{1,4}"
mean(c(FALSE, grepl(pattern, rownames(x))), na.rm = TRUE) > 0.9
}
.isSummarizedExperiment <- function(x) {
is(x, "SummarizedExperiment") & !is(x, "RangedSummarizedExperiment")
}
.convertTo <- function(x, which, FUN, keep, unmap) {
for (i in which(which)) {
lookup <- FUN(rownames(x[[i]]))
ranges <- lookup[["mapped"]]
rse <- x[[i]][names(ranges), ]
# rowData not merged with mcols of RHS in `rowRanges<-` method
mcols(ranges) <-
S4Vectors::DataFrame(rowData(rse), S4Vectors::mcols(ranges))
SummarizedExperiment::rowRanges(rse) <- ranges
x <- c(x, setNames(S4Vectors::List(rse),
paste0(names(x)[i], "_ranged")))
if (length(lookup[["unmapped"]]) && unmap) {
se <- x[[i]][lookup[["unmapped"]], ]
x <- c(x, setNames(S4Vectors::List(se),
paste0(names(x)[i], "_unranged")))
}
}
if (!keep & any(which))
x <- x[, , -match(names(which(which)), names(x))]
x
}
#' @name hidden-helpers
#' @title A small document for helper functions
#' @param x A character vector
#' @param gn A GRanges object with some of its names found in x
#' @return A list of length 2: unmapped (character vector) and mapped (GRanges)
#' @keywords internal
.makeListRanges <- function(x, gn) {
res <- list(unmapped = x[!x %in% names(gn)])
x <- x[x %in% names(gn)]
gn <- gn[match(x, names(gn))]
res[["mapped"]] <- gn
return(res)
}
.getGN <- function(gen, FUN) {
stopifnot(is.character(gen), length(gen) == 1L)
fun <- switch(FUN,
genes = GenomicFeatures::genes,
microRNAs = GenomicFeatures::microRNAs
)
txdb <- if (identical(gen, "hg18"))
TxDb.Hsapiens.UCSC.hg18.knownGene::TxDb.Hsapiens.UCSC.hg18.knownGene
else if (identical(gen, "hg19"))
TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene
gn <- keepStandardChromosomes(fun(txdb), pruning.mode = "coarse")
seqlevelsStyle(gn) <- "NCBI"
if (identical(FUN, "genes"))
names(gn) <- AnnotationDbi::mapIds(
org.Hs.eg.db::org.Hs.eg.db,
names(gn),
keytype = "ENTREZID",
column = "SYMBOL"
)
else if (identical(FUN, "microRNAs"))
names(gn) <- mcols(gn)[["mirna_id"]]
gn
}
#' @rdname hidden-helpers
#' @return list of length 2: "unmapped" is a character vector providing
#' unmapped symbols, "mapped" is a GRanges object with ranges of mapped symbols
#' @keywords internal
.getRangesOfSYMBOLS <- function(x) {
gn <- .getGN("hg19", "genes")
.makeListRanges(x, gn)
}
#' @rdname hidden-helpers
#' @param x A SummarizedExperiment containing hsa miR IDs as rownames
#' @keywords internal
.getRangesOfMir <- function(x) {
mr <- .getGN("hg19", "microRNAs")
.makeListRanges(x, mr)
}
.checkPkgsAvail <- function(pkgnames) {
vapply(pkgnames, function(pkgname) {
if (!requireNamespace(pkgname, quietly = TRUE)) {
func <- as.character(sys.call(-3L)[[1L]])
func <- func[!(func %in% c("::", "TCGAutils"))]
stop("Install the '", pkgname, "' package to use '", func, "'",
call. = FALSE)
} else
TRUE
}, logical(1L))
}
.getRangesOfCpG <- function(x) {
local_data_store <- new.env(parent = emptyenv())
data("Locations", envir = local_data_store,
package = "IlluminaHumanMethylation450kanno.ilmn12.hg19")
Locations <- local_data_store[["Locations"]]
clist <- list(seqnames = "chr", pos = "pos", strand = "strand")
gps <- do.call(GenomicRanges::GPos,
lapply(clist, function(x) Locations[, x]))
names(gps) <- rownames(Locations)
seqlevelsStyle(gps) <- "NCBI"
.makeListRanges(x, gps)
}
#' @rdname simplifyTCGA
#'
#' @title Functions to convert rows annotations to ranges and RaggedExperiment
#' to RangedSummarizedExperiment
#'
#' @description This group of functions will convert row annotations as
#' either gene symbols or miRNA symbols to row ranges based on database
#' resources 'TxDB' and 'org.Hs' packages. It will also simplify the
#' representation of
#' [RaggedExperiment][RaggedExperiment::RaggedExperiment-class] objects to
#' [RangedSummarizedExperiment][SummarizedExperiment::RangedSummarizedExperiment-class].
#'
#' @details The original `SummarizedExperiment` containing either gene symbol
#' or miR annotations is replaced or supplemented by a
#' [RangedSummarizedExperiment][SummarizedExperiment::RangedSummarizedExperiment-class]
#' for those that could be mapped to
#' [GRanges][GenomicRanges::GRanges-class], and optionally another
#' [SummarizedExperiment][SummarizedExperiment::SummarizedExperiment-class]
#' for annotations that could not be mapped to
#' [GRanges][GenomicRanges::GRanges-class].
#'
#' @section qreduceTCGA:
#'
#' Using `TxDb.Hsapiens.UCSC.hg19.knownGene` as the reference, `qreduceTCGA`
#' reduces the data by applying either the `weightedmean` or `nonsilent`
#' function (see below) to non-mutation or mutation data, respectively.
#' Internally, it uses [RaggedExperiment::qreduceAssay()] to reduce the ranges
#' to the gene-level.
#'
#' `qreduceTCGA` will update `genome(x)` based on the NCBI reference annotation
#' which includes the patch number, e.g., GRCh37.p14, as provided by the
#' `seqlevelsStyle` setter, `seqlevelsStyle(gn) <- "NCBI"`. `qreduceTCGA`
#' uses the NCBI genome annotation as the default reference.
#'
#' nonsilent <- function(scores, ranges, qranges)
#' any(scores != "Silent")
#'
#' `RaggedExperiment` mutation objects become a genes by patients
#' `RangedSummarizedExperiment` object containing '1' if there is a non-silent
#' mutation somewhere in the gene, and '0' otherwise as obtained from the
#' `Variant_Classification` column in the data.
#'
#' weightedmean <- function(scores, ranges, qranges) {
#' isects <- GenomicRanges::pintersect(ranges, qranges)
#' sum(scores * BiocGenerics::width(isects)) /
#' sum(BiocGenerics::width(isects))
#' }
#'
#' "CNA" and "CNV" segmented copy number are reduced using a weighted mean in
#' the rare cases of overlapping (non-disjoint) copy number regions.
#'
#' These functions rely on `TxDb.Hsapiens.UCSC.hg19.knownGene` and
#' `org.Hs.eg.db` to map to the 'hg19' NCBI build. Use the `liftOver` procedure
#' for datasets that are provided against a different reference genome (usually
#' 'hg18'). See an example in the vignette.
#'
#' @param obj A `MultiAssayExperiment` object obtained from `curatedTCGAData`
#'
#' @param keep.assay logical (default FALSE) Whether to keep the
#' `SummarizedExperiment` assays that have been converted to
#' `RangedSummarizedExperiment`
#'
#' @param unmapped logical (default TRUE) Include an assay of data that was
#' not able to be mapped in reference database
#'
#' @param suffix character (default "_simplified") A character string to append
#' to the newly modified assay for `qreduceTCGA`.
#'
#' @return A
#' [`MultiAssayExperiment`][MultiAssayExperiment::MultiAssayExperiment-class]
#' with any gene expression, miRNA, copy number, and mutations converted to
#' [`RangedSummarizedExperiment`][SummarizedExperiment::RangedSummarizedExperiment-class]
#' objects
#'
#' @author L. Waldron
#'
#' @md
#'
#' @examples
#'
#' library(curatedTCGAData)
#' library(GenomeInfoDb)
#'
#' accmae <-
#' curatedTCGAData(diseaseCode = "ACC",
#' assays = c("CNASNP", "Mutation", "miRNASeqGene", "GISTICT"),
#' version = "1.1.38",
#' dry.run = FALSE)
#'
#' ## update genome annotation
#' rex <- accmae[["ACC_Mutation-20160128"]]
#'
#' ## Translate build to "hg19"
#' tgenome <- vapply(genome(rex), translateBuild, character(1L))
#' genome(rex) <- tgenome
#'
#' accmae[["ACC_Mutation-20160128"]] <- rex
#'
#' simplifyTCGA(accmae)
#'
#' @export
simplifyTCGA <- function(obj, keep.assay = FALSE, unmapped = TRUE) {
obj <- qreduceTCGA(obj, keep.assay)
obj <- mirToRanges(obj, keep.assay, unmapped)
symbolsToRanges(obj, keep.assay, unmapped)
}
#' @name simplifyTCGA
#' @aliases symbolsToRanges
#' @export
symbolsToRanges <- function(obj, keep.assay = FALSE, unmapped = TRUE) {
can.fix <- vapply(experiments(obj), function(y) {
.checkHas(y, "symbols") & .isSummarizedExperiment(y)
}, logical(1L))
.checkPkgsAvail(c("TxDb.Hsapiens.UCSC.hg19.knownGene", "org.Hs.eg.db"))
.convertTo(
x = obj,
which = can.fix,
FUN = .getRangesOfSYMBOLS,
keep = keep.assay,
unmap = unmapped
)
}
#' @name simplifyTCGA
#' @aliases mirToRanges
#' @export
mirToRanges <- function(obj, keep.assay = FALSE, unmapped = TRUE) {
can.fix <- vapply(experiments(obj), function(y) {
.checkHas(y, "^hsa") & .isSummarizedExperiment(y)
}, logical(1L))
.checkPkgsAvail(c("TxDb.Hsapiens.UCSC.hg19.knownGene", "mirbase.db"))
.convertTo(
x = obj,
which = can.fix,
FUN = .getRangesOfMir,
keep = keep.assay,
unmap = unmapped
)
}
#' @name simplifyTCGA
#' @aliases CpGtoRanges
#' @export
CpGtoRanges <- function(obj, keep.assay = FALSE, unmapped = TRUE) {
can.fix <- vapply(experiments(obj), function(y) {
.checkHas(y, "^cg") & .isSummarizedExperiment(y)
}, logical(1L))
.checkPkgsAvail("IlluminaHumanMethylation450kanno.ilmn12.hg19")
.convertTo(
x = obj,
which = can.fix,
FUN = .getRangesOfCpG,
keep = keep.assay,
unmap = unmapped
)
}
#' @name simplifyTCGA
#' @aliases qreduceTCGA
#' @export
qreduceTCGA <- function(obj, keep.assay = FALSE, suffix = "_simplified") {
.checkPkgsAvail(c("TxDb.Hsapiens.UCSC.hg19.knownGene", "org.Hs.eg.db"))
gn <- genes(
TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene)
gn <- keepStandardChromosomes(GenomicRanges::granges(gn),
pruning.mode = "coarse")
seqlevelsStyle(gn) <- "NCBI"
names(gn) <- AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db, names(gn),
keytype = "ENTREZID", column = "SYMBOL")
weightedmean <- function(scores, ranges, qranges) {
isects <- GenomicRanges::pintersect(ranges, qranges)
sum(scores * BiocGenerics::width(isects)) /
sum(BiocGenerics::width(isects))
}
nonsilent <- function(scores, ranges, qranges)
any(scores != "Silent")
isRE <-
function(x) vapply(experiments(x), is, logical(1L), "RaggedExperiment")
isMut <- function(x) grepl("Mutation", names(x))
for (i in which(isMut(obj))) {
sqls <- seqlevelsStyle(obj[[i]])
seqlevelsStyle(gn) <- sqls
## remove patch release info
gname <- genome(gn)
genome(gn) <- gsub("\\.p[0-9]{1,2}$", "", genome(gn))
mutations <- RaggedExperiment::qreduceAssay(obj[[i]], gn, nonsilent,
"Variant_Classification")
rownames(mutations) <- names(gn)
mutations[is.na(mutations)] <- 0
remove.rows <- is.na(rownames(mutations))
mut_ranges <- gn[!remove.rows]
## replace patch release info
genome(mut_ranges) <- gname
mutations <- SummarizedExperiment(mutations[!remove.rows,],
rowRanges = mut_ranges)
el <- ExperimentList(x = mutations)
names(el) <- paste0(names(obj)[i], suffix)
obj <- c(obj, el)
}
for (i in which(isRE(obj) & !isMut(obj))) {
sqls <- seqlevelsStyle(obj[[i]])
seqlevelsStyle(gn) <- sqls
suppressWarnings(
cn <- RaggedExperiment::qreduceAssay(obj[[i]], gn,
weightedmean, "Segment_Mean")
)
rownames(cn) <- names(gn)
remove.rows <- is.na(rownames(cn))
cn <- SummarizedExperiment(
cn[!remove.rows, ], rowRanges = gn[!remove.rows]
)
el <- ExperimentList(x = cn)
names(el) <- paste0(names(obj)[i], suffix)
obj <- c(obj, el)
}
if (!keep.assay) {
obj <- obj[, , !isRE(obj)]
}
return(obj)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.