#' Get GSE text from GEO
#'
#' @param gse_name GEO study name to get metadata for
#'
#' @return Character vector of lines on GSE record.
#' @export
#'
#' @examples
#' gse_text <- crawl_gse('GSE111459')
#'
crawl_gse <- function(gse_name) {
# get html text for GSE page
gse_url <- paste0(
"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=",
gse_name,
"&targ=self&form=text&view=full")
gse_text <- NULL
attempt <- 1
while(is.null(gse_text) && attempt <= 3) {
con <- url(gse_url)
try(gse_text <- readLines(con))
if(is.null(gse_text)) Sys.sleep(15)
close(con)
attempt <- attempt + 1
}
return(gse_text)
}
#' Extract GSMs needed to download RNA-seq data for a series
#'
#'
#' @param gse_text GSE text returned from \code{\link{crawl_gse}}
#'
#' @return Character vector of sample GSMs for the series \code{gse_name}
#' @export
#'
#' @examples
#'
#' gse_text <- crawl_gse('GSE111459')
#' gsm_names <- extract_gsms(gse_text)
#'
extract_gsms <- function(gse_text) {
# GSM names
gsm_lines <- grep('^!Series_sample_id', gse_text)
gsm_names <- gsub(
'^!Series_sample_id = (GSM\\d+)$', '\\1',
gse_text[gsm_lines])
return(gsm_names)
}
#' Crawls SRX pages for each GSM to get metadata.
#'
#' Goes to each GSM page to get SRX then to each SRX page to get some more
#' metadata.
#'
#'
#' @param gsm_names Character vector of GSMs.
#' @param max.workers Maximum number of parallel workers to split task betweem
#' @importFrom foreach %dopar%
#'
#' @return data.frame
#' @export
#'
#' @examples
#' srp_meta <- crawl_gsms("GSM3031462")
#'
#' # returns NULL because records on dbGAP for privacy reasons
#' srp_meta <- crawl_gsms("GSM2439650")
#'
#' # example with empty values
#' srp_meta <- crawl_gsms('GSM4043025')
#'
crawl_gsms <- function(gsm_names, max.workers = 50) {
# for R CMD Check
j = NULL
nsamp <- length(gsm_names)
cat(nsamp, 'GSMs to process\n')
cl <- parallel::makeCluster(min(max.workers, nsamp))
doParallel::registerDoParallel(cl)
srp_meta <- foreach::foreach(
j = seq_len(nsamp),
.combine = plyr::rbind.fill) %dopar% {
# save in srp_meta
srp_meta <- data.frame(stringsAsFactors = FALSE)
gsm_name <- gsm_names[j]
# get html text
geo_url <- "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
gsm_url <- paste0(
geo_url, gsm_name, '&targ=self&form=text&view=full')
gsm_text <- NULL
attempt <- 1
while(is.null(gsm_text) && attempt <= 3) {
con <- url(gsm_url)
try(gsm_text <- readLines(con))
if(is.null(gsm_text)) Sys.sleep(5)
close(con)
attempt <- attempt + 1
}
# get SRA number for this GSM
# some won't (e.g. submitted to dbGAP for privacy reasons)
has.srx <- grep('!Sample_relation = SRA: .+?SRX\\d+', gsm_text)
if (!length(has.srx)) return(NULL)
experiment <- gsub('^.+?(SRX\\d+)$', '\\1', gsm_text[has.srx])
# fix for e.g. GSM4043025 with empty values
gsm_text <- gsm_text[gsm_text != ""]
info <- gsub('^!Sample_', '', gsm_text[-1])
cols <- gsub('^(.+?) = .+?$', '\\1', info)
cols <- make.unique(cols)
vals <- gsub('^.+? = (.+?$)', '\\1', info)
names(vals) <- cols
if (!is.na(experiment)) {
# extract SRR runs info -----
srx_url <- paste0(
"https://www.ncbi.nlm.nih.gov/sra/",
experiment,
'[accn]?report=FullXml')
srx_html <- NULL
attempt <- 1
while(is.null(srx_html) && attempt <= 3) {
try(srx_html <- xml2::read_html(srx_url))
if(is.null(srx_html)) Sys.sleep(5)
attempt <- attempt + 1
}
srx_text <- rvest::html_text(srx_html)
runs <- stringr::str_extract_all(
srx_text, '<PRIMARY_ID>SRR\\d+</PRIMARY_ID>')[[1]]
runs <- gsub(
'<PRIMARY_ID>(SRR\\d+)</PRIMARY_ID>', '\\1', unique(runs))
taxon_id <- stringr::str_extract(
srx_text, '<TAXON_ID>\\d+</TAXON_ID>')
library_source <- stringr::str_extract(
srx_text, '<LIBRARY_SOURCE>.+?</LIBRARY_SOURCE>')
library_layout <- stringr::str_extract(
srx_text, '<LIBRARY_LAYOUT>.+?</LIBRARY_LAYOUT>')
library_layout <- stringr::str_extract(
library_layout, 'SINGLE|PAIRED')
if (length(runs)) {
# add info to srp_meta
srp_meta[runs, 'run'] <- runs
srp_meta[runs, 'experiment'] <- experiment
srp_meta[runs, 'gsm_name'] <- gsm_name
for (col in cols) srp_meta[runs, col] <- vals[col]
srp_meta[runs, 'library_source'] <- gsub(
'<LIBRARY_SOURCE>(.+?)</LIBRARY_SOURCE>',
'\\1',
library_source)
srp_meta[runs, 'library_layout'] <- library_layout
srp_meta[runs, 'taxon_id'] <- gsub(
'<TAXON_ID>(\\d+)</TAXON_ID>',
'\\1',
taxon_id)
srp_meta[runs, 'ebi_dir'] <-
vapply(runs, get_dldir, 'ebi')
srp_meta[runs, 'ncbi_dir'] <-
vapply(runs, get_dldir, 'ncbi')
}
}
return(srp_meta)
}
parallel::stopCluster(cl)
row.names(srp_meta) <- srp_meta$run
return(srp_meta)
}
#' Gets part of path to download bulk RNAseq sample from EBI or NCBI
#'
#' @param srr SRR/ERR run name
#' @param type Either \code{'ebi'} or \code{'ncbi'}
#'
#' @return String path used by \code{\link{get_fastqs}}.
#' @export
#' @examples
#'
#' get_dldir('SRR014242')
#'
get_dldir <- function(srr, type = c('ebi', 'ncbi')) {
dir1 <- substr(srr, 1, 6)
if (type[1] == 'ebi') {
digits <- gsub('^SRR|^ERR', '', srr)
ndigits <- nchar(digits)
if (ndigits == 7) {
dir2 <- paste0('00', substr(digits, 7, 7))
} else if (ndigits == 8) {
dir2 <- paste0('0', substr(digits, 7, 8))
} else if (ndigits == 9) {
dir2 <- substr(digits, 7, 9)
} else if (ndigits == 6) {
return(file.path(dir1, srr))
}
return(file.path(dir1, dir2, srr))
} else if (type[1] == 'ncbi') {
return(file.path(dir1, srr))
}
}
#' Download and RNA-seq fastq data from EBI
#'
#' First tries to get RNA-Seq fastq files from EBI.
#'
#' @param srp_meta \code{data.frame} with SRP meta info. Returned from
#' \code{\link{crawl_gsms}}.
#' @param data_dir Path to folder that fastq files will be downloaded to. Will
#' be created if doesn't exist.
#' @param method One of \code{'aspera'} or \code{'ftp'}. \code{'aspera'} is
#' generally faster but requires the
#' ascp command line utility to be on your path and in the authors experience
#' frequently stalls.
#' @param max_rate Used when \code{method = 'aspera'} only. Sets the target
#' transfer rate. The default is \code{'300m'}.
#'
#' @return Named vector of integer return codes from \code{ascp} or
#' \code{download.file}. Names are SRR runs.
#' @export
#'
#' @examples
#' gsm_name <- 'GSM3926903'
#' srp_meta <- crawl_gsms(gsm_name)
#' data_dir <- tempdir()
#' res <- get_fastqs(srp_meta, data_dir)
#'
get_fastqs <- function(
srp_meta, data_dir, method = c('ftp', 'aspera'), max_rate = '1g') {
# setup fastq directory
dir.create(data_dir, showWarnings = FALSE)
# seperate runs based on GSM (can be multiple per GSM)
srr_names <- srp_meta$run
gsm_names <- unique(srp_meta$gsm_name)
srr_names_list <- lapply(
gsm_names, function(gsm_name)srr_names[srp_meta$gsm_name %in% gsm_name])
names(srr_names_list) <- gsm_names
method <- method[1]
ngsm <- length(gsm_names)
res <- c()
for (i in seq_len(ngsm)) {
# download everything
srr_names <- srr_names_list[[i]]
resi <- c()
for (srr_name in srr_names) {
# try to get fastq from ebi
resi <- c(
resi,
tryCatch(
get_ebi_fastqs(
srp_meta, srr_name, data_dir, method, max_rate),
error = function(e) {
warning(e$message, call. = FALSE)
return(1)
}))
}
names(resi) <- srr_names
res <- c(res, resi)
}
return(res)
}
#' Download fastqs from EBI
#'
#' Much faster to use aspera than ftp
#'
#' @param srr_name Run accession as string.
#' @inheritParams get_fastqs
#' @keywords internal
#'
#' @return Integer return code from ascp or \code{download.file}.
#'
get_ebi_fastqs <- function(
srp_meta, srr_name, data_dir, method=c('ftp', 'aspera'), max_rate='300m') {
pre_url <- 'ftp://ftp.sra.ebi.ac.uk/vol1/fastq/'
url <- paste0(pre_url, srp_meta[srr_name, 'ebi_dir'], '/')
resp <- RCurl::getURL(url)
resp <- strsplit(resp, '\n')[[1]]
resp <- strsplit(resp, ' +')
fnames <- vapply(resp, `[`, '', 9)
fsizes <- vapply(resp, `[`, '', 5)
if (method[1] == 'aspera') {
ascp_path <- system2('which', 'ascp', stdout = TRUE)
ascp_pubkey <- gsub('bin/ascp$',
'etc/asperaweb_id_dsa.openssh',
ascp_path)
# only overwrite if different from source
ascp_args <- c(
'--overwrite=diff',
'-k1', '-QT', '-l', max_rate, '-P33001', '-i', ascp_pubkey)
files <- paste0('era-fasp@fasp.sra.ebi.ac.uk:/vol1/fastq/',
srp_meta[srr_name, 'ebi_dir'], '/', fnames)
for (i in seq_along(files)) {
destfile <- file.path(data_dir, fnames[i])
if (file.exists(destfile) && file.size(destfile) == fsizes[i]) {
res <- 0
} else {
res <- ascpR(ascp_args, files[i], data_dir)
}
}
} else if (method[1] == 'ftp') {
files <- paste0('ftp://ftp.sra.ebi.ac.uk/vol1/fastq/',
srp_meta[srr_name, 'ebi_dir'], '/', fnames)
for (i in seq_along(files)) {
# check for existing file with same size
destfile <- file.path(data_dir, fnames[i])
if (file.exists(destfile) && file.size(destfile) == fsizes[i]) {
res <- 0
} else {
res <- utils::download.file(files[i], destfile)
}
}
}
return(res)
}
#' Utility function to run aspera
#'
#' @param ascp_args Character vector of arguments to \code{ascp}.
#' @param file Url to aspera file to download.
#' @param destDir Path to directory to download \code{files} into.
#' @keywords internal
#'
#' @return return code from call to ascp
#'
ascpR <- function (ascp_args, file, destDir = getwd()) {
ret <- system2('ascp', c(ascp_args, file, destDir))
return(ret)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.