Nothing
########################################################################################################################
## readGEO.R
## created: 2012-03-27
## creator: Yassen Assenov
## ---------------------------------------------------------------------------------------------------------------------
## Loading array-based methylation datasets from GEO.
########################################################################################################################
## G L O B A L S #######################################################################################################
GEO.PLATFORMS <- c(
"GPL8490" = "probes27",
"GPL13534" = "probes450",
"GPL16304" = "probes450",
"GPL21145" = "probesEPIC")
## F U N C T I O N S ###################################################################################################
#' Preprocesse sample characteristics
#'
#' Preprocesses sample characteristics lines from a series matrix definition file.
#'
#' @param txt Lines in the file to be parsed.
#' @param regex.sample.info Regular expression capturing a definition of sample characteristics.
#' @return \code{character} vector storing sequence of preprocessed lines defining sample characteristics; an empty
#' \code{vector} if none of the lines in \code{txt} matches the given regular expression.
#'
#' @author Yassen Assenov
#' @noRd
rnb.geo.parse.sample.info <- function(txt, regex.sample.info = "^!Sample_([^\t]+)\\t(.+)$") {
s.info.lines <- grep(regex.sample.info, txt, value = TRUE)
if (length(s.info.lines) == 0) {
return(character())
}
result <- gsub(regex.sample.info, "\\2", s.info.lines)
names(result) <- gsub(regex.sample.info, "\\1", s.info.lines)
result
}
########################################################################################################################
#' Parses sample annotation
#'
#' Parses the sample annotation table from the corresponding lines in a series matrix definition file.
#'
#' @param txt Lines of the series matrix definition file that define sample characteristics. These should be
#' preprocessed and not contain the initial sample specification (the token starting with \code{"!Sample_"}).
#' @return Sample annotation table as a \code{data.frame}.
#'
#' @author Yassen Assenov
#' @noRd
rnb.geo.parse.sample.anno <- function(txt) {
## Validate that the sample annotation forms a table
regex.sample.value <- "^\"([^\"]*)\"$"
s.infos <- strsplit(txt, "\t", fixed = TRUE)
s.valid <- mapply(grepl, x = s.infos, MoreArgs = list(pattern = regex.sample.value), SIMPLIFY = FALSE)
if (!all(unlist(s.valid))) {
stop("Cannot parse sample information; quotation marks and/or TAB characters are probably escaped")
}
s.infos <- lapply(s.infos, function(x) { gsub(regex.sample.value, "\\1", x) })
sample.count <- range(sapply(s.infos, length))
if (sample.count[1] != sample.count[2]) {
stop("Incosistent number of samples in the annotation")
}
sample.count <- sample.count[1]
## Parse characteristics_ch1
for (i in which(names(s.infos) == "characteristics_ch1")) {
ch1.values <- strsplit(s.infos[[i]], ": ", fixed = TRUE)
if (all(sapply(ch1.values, length) %in% c(0, 2))) {
j <- which(sapply(ch1.values, length) == 2)
if (length(j) == 0) { next }
column.title <- unique(sapply(ch1.values[j], '[', 1L))
if (length(column.title) == 1) {
s.infos[[i]] <- sapply(ch1.values, '[', 2L)
names(s.infos)[i] <- column.title
}
}
}
suppressWarnings(rm(i, ch1.values, j, column.title))
## Detect column types
s.infos <- as.data.frame(s.infos, check.names = FALSE, stringsAsFactors = FALSE)
buffer <- character()
b.connection <- textConnection('buffer', open = "w", local = TRUE)
write.table(s.infos, file = b.connection, quote = FALSE, sep = "\t", row.names = FALSE)
close(b.connection)
b.connection <- textConnection(buffer)
s.infos <- read.delim(b.connection, quote = "", check.names = FALSE)
close(b.connection)
for (i in 1:ncol(s.infos)) {
if (is.factor(s.infos[, i]) && nlevels(s.infos[, i]) == sample.count) {
s.infos[, i] <- as.character(s.infos[, i])
}
}
## Auto-rename columns in case there are duplicates
if (ncol(s.infos) > 1) {
s.infos <- s.infos[, rep(TRUE, ncol(s.infos))]
}
s.infos
}
########################################################################################################################
#' Parse sample identifiers
#'
#' Parses column names from the first line of series matrix table definition.
#' @param txt Line in the series matrix file that defines table columns; this is the line following the matrix
#' definition \code{"series_matrix_table_begin"}.
#' @return All column names in the form of a \code{character} vector.
#'
#' @author Yassen Assenov
#' @noRd
rnb.geo.parse.ids <- function(txt) {
txt <- strsplit(txt, "\t", fixed = TRUE)[[1]]
if (length(txt) < 2 || txt[1] != '"ID_REF"') {
stop("Unexpected header of series matrix table")
}
regex.id <- "^\\\"(GSM\\d+)\\\"$"
if (!all(grepl(regex.id, txt[-1]))) {
stop("Unexpected header of series matrix table")
}
gsub(regex.id, "\\1", txt[-1])
}
########################################################################################################################
#' Initialize methylation matrix
#'
#' Initializes the beta value matrix from a series matrix file.
#' @param txt Line in the series matrix file that defines table columns; this is the line following the matrix
#' definition \code{"series_matrix_table_begin"}.
#' @param N.expected Number of columns expected, based on the sample definition lines above.
#' @param assay.type Expected assay. This must be one of \code{"probes27"}, \code{"probes450"} or \code{"probesEPIC"}.
#' @return Newly initialized \code{matrix} of type \code{double}.
#'
#' @author Yassen Assenov
#' @noRd
rnb.geo.init.matrix <- function(txt, N.expected, assay.type) {
sample.ids <- rnb.geo.parse.ids(txt)
if (length(sample.ids) != N.expected) {
stop("Inconsistent sample characteristics and series matrix table")
}
probe.ids <- rnb.get.annotation(assay.type, assembly = "hg19")
probe.ids <- unlist(lapply(probe.ids, names), use.names = FALSE)
## TODO: Consider rnb.getOption("disk.dump.big.matrices") and initialize different things
matrix(NA_real_, length(probe.ids), length(sample.ids), dimnames = list(probe.ids, sample.ids))
}
########################################################################################################################
#' Parse a series matrix file
#'
#' Parses the sample annotation data and methylation values from a series matrix file.
#'
#' @param fname Name of the file that contains the series matrix definition. This can also be gzipped.
#' @param verbose Flag indicating if messages are to be sent to the logger.
#' @return \code{list} with three elements: sample annotation table (\code{data.frame}), methylation beta value matrix,
#' and a platform, specified as one of \code{"probes27"}, \code{"probes450"} or \code{"probesEPIC"}.
#'
#' @author Yassen Assenov
#' @noRd
rnb.geo.parse.series.matrix <- function(fname, verbose) {
if (grepl("\\.gz$", fname)) {
con <- gzfile(fname, "rt")
} else {
con <- file(fname, "rt")
}
if (verbose) {
logger.start("Parsing Series Matrix File")
}
N.buffer <- 100000L # size of buffer, in number of lines
regex.platform <- "^\\!Series_platform_id\t\\\"(GPL\\d+)\\\"$"
data.platform <- NULL
s.infos <- character()
state.data.matrix <- 0L # 0: not started, 1: parsing header, 2: parsing data, 3: ready
data.matrix <- NULL
regex.data <- "^\\\"([^\"]+)\\\"\t(.*)$"
skipped.records <- 0L
repeat {
txt <- NULL; invisible(gc())
txt <- scan(con, "", n = N.buffer, sep = "\n", quiet = TRUE)
if (length(txt) == 0) {
break
}
## Parse platform
i <- grep(regex.platform, txt)
if (length(i) == 1) {
if (!is.null(data.platform)) {
stop("Multiple platforms defined")
}
data.platform <- gsub(regex.platform, "\\1", txt[i])
if (!(data.platform %in% names(GEO.PLATFORMS))) {
stop(paste("Platform", data.platform, "is not supported"))
}
if (verbose) {
i <- paste0("Identified platform ", data.platform, " (", GEO.PLATFORMS[data.platform], ")")
logger.status(i)
}
} else if (length(i) > 1) {
stop("Multiple platforms defined; RnBeads supports a single platform only")
}
i <- which(txt == "!series_matrix_table_begin")
if (length(i) == 0) {
if (state.data.matrix == 0L) {
s.infos <- c(s.infos, rnb.geo.parse.sample.info(txt))
next
}
if (state.data.matrix == 1L) {
data.matrix <- rnb.geo.init.matrix(txt[1], nrow(s.infos), GEO.PLATFORMS[data.platform])
i.first <- 2L
state.data.matrix <- 2L
} else if (state.data.matrix == 2L) {
i.first <- 1L
}
} else if (length(i) == 1 && state.data.matrix == 0L) {
## Parse sample annotation table from the identified lines
if (i > 1) {
s.infos <- c(s.infos, rnb.geo.parse.sample.info(head(txt, i - 1)))
}
s.infos <- rnb.geo.parse.sample.anno(s.infos)
## Initialize data matrix
if (i + 1L <= length(txt)) {
data.matrix <- rnb.geo.init.matrix(txt[i + 1], nrow(s.infos), GEO.PLATFORMS[data.platform])
state.data.matrix <- 2L
if (i + 1L == length(txt)) {
next
}
i.first <- i + 2L
} else {
state.data.matrix <- 1L
next
}
} else {
stop("Multiple series matrix defined; RnBeads supports a single series matrix only")
}
i <- which(txt == "!series_matrix_table_end")
if (length(i) == 0) {
if (state.data.matrix != 2L) {
next
}
i.last <- length(txt)
} else if (length(i) == 1 && state.data.matrix == 2L) {
i.last <- i - 1L
if (i.last < i.first) {
stop("Empty series matrix definition")
}
} else {
stop("Multiple or misplaced end definition(s) of a series matrix")
}
## Parse lines to fill in the beta value matrix
if (state.data.matrix == 2L) {
txt <- txt[i.first:i.last]
i.probes <- grep(regex.data, txt, invert = TRUE)
if (length(i.probes) != 0) {
stop("Invalid format of data table")
}
i.probes <- match(gsub(regex.data, "\\1", txt), rownames(data.matrix))
i.found <- which(i.probes != 0L)
skipped.records <- skipped.records + length(i.probes) - length(i.found)
if (length(i.found) != 0) {
txt <- gsub(regex.data, "\\2", txt[i.found])
i.probes <- i.probes[i.found]
txt <- gsub("\t$", "\t\t", txt)
mm <- suppressWarnings(sapply(strsplit(txt, "\t", fixed = TRUE), as.double))
if (!(is.matrix(mm) && nrow(mm) == ncol(data.matrix))) {
stop("Invalid format of data table")
}
data.matrix[i.probes, ] <- t(mm)
}
suppressWarnings(rm(i.probes, i.found, mm))
if (length(i) == 1) {
state.data.matrix <- 3L
}
}
}
close(con)
if (!(is.data.frame(s.infos) && state.data.matrix == 3L)) {
stop("Missing or incomplete data table")
}
if (skipped.records != 0 && verbose) {
i <- ifelse(skipped.records == 1, "record", "records")
logger.warning(paste("Skipped", skipped.records, i, "referring to unsupported probes"))
}
data.matrix <- data.matrix[!apply(is.na(data.matrix), 1, all), , drop = FALSE]
colnames(data.matrix) <- NULL
if (verbose) {
logger.status(paste("Loaded non-missing data for", nrow(data.matrix), "probes"))
logger.completed()
}
list("annotation" = s.infos, "betas" = data.matrix, "platform" = GEO.PLATFORMS[data.platform])
}
########################################################################################################################
#' Import methylation data from GEO
#'
#' Imports Infinium 450K or MethylationEPIC data series from the Gene Expression Omnibus. This function uses the
#' series matrix file.
#'
#' @param accession Character string, starting with \code{"GSE"}, representing the GEO series for download and parsing.
#' Alternatively, this parameter can specify the file name of a previously downloaded GEO series
#' matrix file or its gzipped representation (in which case the filename must end in \code{".gz"}).
#' Other file formats, such as SOFT files, are not supported.
#' @param verbose Flag indicating if messages should be created informing about the progress. If the logger is
#' initialized prior to calling this function, the informative messages are sent to the logger.
#' Warnings and errors are not affected by this parameters, the function always outputs them.
#' @param destdir The destination directory for any downloads. Defaults to the (architecture-dependent) temporary
#' directory. Keep in mind that GEO series can be demanding in terms of storage space.
#' @return \code{\linkS4class{RnBeadSet}} object with phenotypic and beta value information.
#'
#' @author Yassen Assenov
#' @export
rnb.read.geo <- function(accession = NULL, verbose = logger.isinitialized(), destdir = tempdir()) {
if (verbose) {
rnb.logger.start("Loading GEO Data Series")
}
if (!(is.character(accession) && length(accession) == 1 && isTRUE(accession != ""))) {
stop("Invalid value for accession")
}
if (isTRUE(file.info(accession)[1, "isdir"] == FALSE)) {
## Specified existing file
fname <- accession
} else if (!grepl("^GSE\\d\\d+$", accession)) {
stop("Invalid value for accession; expected GEO data series identifier or an existing file")
} else {
## Specified GSE data series, download from GEO
fname <- tempfile("gse", destdir, ".txt.gz")
gse.url <- gsub("^(GSE\\d\\d).*$", "\\1", accession)
gse.url <- sprintf("%snnn/%s/matrix/%s", gse.url, accession, accession)
gse.url <- paste0("ftp://ftp.ncbi.nlm.nih.gov/geo/series/", gse.url, "_series_matrix.txt.gz")
if (verbose) {
rnb.info(paste("Using URL", gse.url))
rnb.info(paste("Using download method", getOption("download.file.method")))
}
result <- tryCatch( # quiet = TRUE would be cleaner, but it seems to fail on MacOS
download.file(gse.url, fname, quiet = FALSE, mode = "wb"),
error = function(err) { return(-99L) })
if (result != 0) {
invisible(file.remove(fname))
stop(paste("Could not download GEO Data Series, error code", result))
}
if (verbose) {
rnb.status(paste("Downloaded GSE matrix to", fname))
}
rm(gse.url, result)
}
## Parse the series matrix file
result <- rnb.geo.parse.series.matrix(fname, verbose)
x <- c("probes27" = "27k", "probes450" = "450k", "probesEPIC" = "EPIC")[result[[3]]]
RnBeadSet(pheno = result[[1]], betas = result[[2]], platform = x)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.