#' Download data from GDC Data Portal and GDC Legacy Archive
#'
#' \code{download_gdc} is a function designed to download methylation, mutation,
#' clinical data, protein expression, MAGETAB, gene expression, isoform
#' expression, miRNA expression and clinical images data from GDC Data Portal
#' and GDC Legacy Archive.
#'
#' @param data_type Type of data. It could be \code{"methylation", "mutation",
#' "clinical_supplement", "biospecimen", "gene", or "clinical"(biotab)}.
#' \itemize{ \item{Only present
#' in "Legacy" database:}{\code{"protein", "Exon quantification", "miRNA gene
#' quantification", "miRNA isoform quantification", "isoform", and "image"}.}
#' \item{Only present in "GDC" database:}{\code{"miRNA
#' Expression Quantification", and "Isoform Expression Quantification"
#' (miRNA)}.}}
#' @param tumor A character string contaning one of the 33 tumors available in
#' the TCGA project. For instance, the \code{"BRCA"} stands for breast
#' cancer.
#' @param data_base A character string specifying \code{"GDC"} for GDC Data
#' Portal or \code{"legacy"} for GDC Legacy Archive.
#' @param htseq A character string indicating which HTSeq workflow data should
#' be downloaded: \code{"Counts", "FPKM", or "all"}. The default is
#' \code{"all"}.
#' @param work_dir A character string specifying the path to work directory.
#' @param all_files A logical value. Set \code{FALSE} to avoid the download of
#' not used data to reduce download size, e.g. quantification files. The
#' default is \code{FALSE}.
#' @param platform A character string indicating the platform name for
#' methylation, exon quantificaton, miRNA, and mutation data. \itemize{
#' \item{For mutation and exon quantificaton data:}{\code{"Illumina GA",
#' "Illumina HiSeq" or "all"}.} \item{For methylation data}{\code{"Illumina
#' Human Methylation 450", "Illumina Human Methylation 27" or "all"}.}
#' \item{For miRNA data:}{\code{"Illumina GA", "Illumina HiSeq",
#' "H-miRNA_8x15K" (for GBM tumor), "H-miRNA_8x15Kv2" (for OV tumor), or
#' "all"}.} }The default for all data_type cited is \code{"all"} (when
#' downloading data).
#'
#' @return the files download are stored inside the determined folders in the
#' user machine.
#'
#' @import AnnotationDbi clusterProfiler devtools DOSE ggbiplot ggplot2 methods
#' stringi survminer yarrr
#' @export
#'
#' @importFrom curl curl
#' @importFrom httr content
#' @importFrom httr GET
#' @importFrom jsonlite fromJSON
#' @importFrom tools md5sum
#' @importFrom grDevices dev.off hsv png svg
#' @importFrom graphics abline axis hist image layout legend lines matplot
#' mtext par plot.new points rect rug text title
#' @importFrom stats TukeyHSD anova aov as.dendrogram as.dist chisq.test coef
#' confint cor density dist formula hclust kruskal.test median model.matrix
#' na.exclude na.omit order.dendrogram p.adjust pairwise.wilcox.test prcomp
#' reorder residuals sd shapiro.test summary.aov summary.lm
#' @importFrom utils combn read.csv read.delim read.table setTxtProgressBar
#' txtProgressBar untar write.csv write.table
#'
#' @examples
#' library(DOAGDC)
#'
#' # Downloading gene expression data from GDC Legacy Archive
#' download_gdc("gene", "CHOL", "legacy", work_dir = "~/Desktop")
download_gdc <- function(data_type = "gene",
tumor,
data_base = "legacy",
htseq = "",
work_dir,
all_files = FALSE,
platform = "all") {
# local functions ####
download_httr <- function(url, destfile) {
first <- httr::GET(url = url)
second <- httr::content(x = first, as = "raw")
writeBin(object = second, con = destfile)
}
size_par <- function(tumor, type_of_data, db) {
if (db == "legacy") {
first_part <- "https://api.gdc.cancer.gov/legacy/projects/TCGA-"
} else {
first_part <- "https://api.gdc.cancer.gov/projects/TCGA-"
}
url <- paste0(
first_part, toupper(tumor),
"?expand=summary,summary.data_categories&pretty=true"
)
jason <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)
jason <- jason$data
jason <- jason$summary
jason <- jason$data_categories
jason$data_category <- tolower(jason$data_category)
size <- as.numeric(subset(
x = jason,
subset = data_category == tolower(type_of_data),
"file_count"
))
return(size)
}
# old api url https://gdc-api.nci.nih.gov/
# selecting the right API
# code ####
db_bool <- tolower(data_base) == "legacy"
if (db_bool) {
inicio <- "https://api.gdc.cancer.gov/legacy/data/"
url_inicio <- "https://api.gdc.cancer.gov/legacy/files/"
status <- "https://api.gdc.cancer.gov/legacy/status"
folder_name <- paste0(tolower(data_type), "_data")
} else if (tolower(data_base) == "gdc") {
inicio <- "https://api.gdc.cancer.gov/data/"
url_inicio <- "https://api.gdc.cancer.gov/files/"
status <- "https://api.gdc.cancer.gov/status"
folder_name <- paste(tolower(data_base), tolower(data_type),
"data",
sep = "_"
)
} else {
stop("Please insert a data base name!")
}
message("Please wait, accessing GDC server...")
tryCatch(tmp <- read.csv(status),
error = function(e) {
stop(message(cat(
"GDC server or your internet conection is",
" off. \n Please try again later!"
)))
}
)
dir.create(path = file.path(work_dir, "DOAGDC"), showWarnings = FALSE)
dir.create(
path = file.path(work_dir, "DOAGDC", toupper(tumor)),
showWarnings = FALSE
)
# legacy ####
if (db_bool) {
# NOTE gene and isoform ####
if ("gene" %in% tolower(data_type) || "isoform" %in% tolower(data_type)) {
size_par_rsem <- function(tumor) {
url <- paste0(
"https://api.gdc.cancer.gov/legacy/",
"projects/TCGA-", toupper(tumor),
"?expand=summary,summary.data_categories&",
"pretty=true"
)
jason <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)
jason <- jason$data
jason <- jason$summary
jason <- jason$data_categories
size <- subset(
x = jason,
subset = data_category == "Gene expression",
"file_count"
)
return(size)
}
size <- size_par_rsem(tumor = tumor)
if ("gene" %in% tolower(data_type)) {
dir.create(
path = file.path(
work_dir, "DOAGDC",
toupper(tumor), "gene_data"
),
showWarnings = FALSE
)
direc <- file.path(work_dir, "DOAGDC", toupper(tumor),
"gene_data")
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,",
"center,analysis&size=",
size, "&filters=%7B%22op%22:%22and%22,%22",
"content%22:%5B%7B%22op%22:%22",
"in%22,%22content%22:%7B%22field%22:%22",
"files.access%22,%22value%22:%5B",
"%22open%22%5D%7D%7D,%7B%22op%22:%22in%22,%22",
"content%22:%7B%22field%22:",
"%22cases.project.project_id%22,%22value%22:",
"%5B%22TCGA-", toupper(tumor),
"%22%5D%7D%7D,%7B%22op%22:%22in%22,%22content",
"%22:%7B%22field%22:%22files",
".data_category%22,%22value%22:%5B%22Gene%20",
"expression%22%5D%7D%7D,%7B%",
"22op%22:%22in%22,%22content%22:%7B%22field",
"%22:%22files.data_type%22,%",
"22value%22:%5B%22Gene%20expression%20",
"quantification%22%5D%7D%7D%5D%7D&",
"pagination=%7B%22files%22:%7B%22from%22:0,%22",
"size%22:100,%22sort%22:",
"%22cases.project.project_id:asc%22%7D%7D&",
"format=JSON"
)
} else if ("isoform" %in% tolower(data_type)) {
dir.create(
path = file.path(
work_dir, "DOAGDC", toupper(tumor),
"isoform_data"
),
showWarnings = FALSE
)
direc <- file.path(
work_dir, "DOAGDC", toupper(tumor),
"isoform_data"
)
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,",
"center,analysis&size=",
size, "&filters=%7B%22op%22:%22and%22,%22",
"content%22:%5B%7B%22op%22:%22in",
"%22,%22content%22:%7B%22field%22:%22files.",
"access%22,%22value%22:%5B%22",
"open%22%5D%7D%7D,%7B%22op%22:%22in%22,%22",
"content%22:%7B%22field%22:%22",
"cases.project.project_id%22,%22value%22:",
"%5B%22TCGA-", toupper(tumor),
"%22%5D%7D%7D,%7B%22op%22:%22in%22,%22content",
"%22:%7B%22field%22:%22",
"files.data_category%22,%22value%22:%5B%22",
"Gene%20expression%22%5D%7D%",
"7D,%7B%22op%22:%22in%22,%22content%22:%7B%22",
"field%22:%22files.data_",
"type%22,%22value%22:%5B%22Isoform%20expression",
"%20quantification%22%5D",
"%7D%7D%5D%7D&pagination=%7B%22files%22:%7B%22",
"from%22:0,%22size%22:100,%",
"22sort%22:%22cases.project.project_id:asc",
"%22%7D%7D&format=JSON"
)
}
message("\n\nDownloading manifest...\n")
json <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)
manifest_df <- json$data$hits
manipular <- manifest_df[, "cases"]
# 16(submitter_id) will be cases
manifest_df[, c(
"center", "acl", "state_comment", "cases",
"tags"
)] <- NULL
cases <- matrix(nrow = nrow(manifest_df), ncol = 1)
for (index in seq_len(length(manipular))) {
patient_code <- as.character(unlist(manipular[[index]][1]))
tmp <- patient_code[grep("TCGA", patient_code)]
if (tmp == paste0('TCGA-', toupper(tumor))) {
patient_code <- as.character(unlist(manipular[[index]][2]))
cases[index, 1] <- patient_code[grep("TCGA", patient_code)]
} else {
cases[index, 1] <- tmp
}
}
manifest_df$submitter_id <- cases
if (!all_files) {
# no need to download these files
manifest_df <- manifest_df[!grepl(
"^.+(.pergene.txt)$",
manifest_df$file_name
), ]
manifest_df <- manifest_df[!grepl(
"^.+(.quantification.txt)$",
manifest_df$file_name
), ]
manifest_df <- manifest_df[!grepl(
"^.+(level3.data.txt)$",
manifest_df$file_name
), ]
manifest_df <- manifest_df[!grepl(
"^.+(tags.txt)$",
manifest_df$file_name
), ]
manifest_df <- manifest_df[!grepl(
"^.+(genes.txt)$",
manifest_df$file_name
), ]
}
write.table(
x = manifest_df, file = paste0(direc, "/manifest.sdrf"),
quote = FALSE, row.names = FALSE, sep = "\t"
)
manifest_df <- manifest_df[, c("file_name", "md5sum", "file_id")]
colnames(manifest_df) <- c("filename", "md5", "id")
id_matrix <- manifest_df[, "id"]
}
} else if (tolower(data_base) == "gdc") {
# NOTE gene GDC ####
if ("gene" %in% tolower(data_type)) {
dir.create(
path = file.path(
work_dir, "DOAGDC", toupper(tumor),
"gdc_gene_data"
),
showWarnings = FALSE
)
direc <- file.path(work_dir, "DOAGDC", toupper(tumor),
"gdc_gene_data")
size_par <- function(tumor) {
url <- paste0(
"https://api.gdc.cancer.gov/projects/TCGA-",
toupper(tumor),
"?expand=summary,summary.",
"data_categories&pretty=true"
)
jason <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)
jason <- jason$data
jason <- jason$summary
jason <- jason$data_categories
tmp <- jason$data_category == "Transcriptome Profiling"
size <- subset(x = jason, subset = tmp, "file_count")
return(as.numeric(size))
}
size <- size_par(tumor = tumor)
# selecting which HTSeq data to download
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,",
"center,analysis&size=",
size, "&filters=%7B%22op%22:%22and%22,%22",
"content%22:%5B%7B%22op%22:%22",
"in%22,%22content%22:%7B%22field%22:%22files.",
"data_type%22,%22value%22:",
"%5B%22Gene%20Expression%20Quantification%22",
"%5D%7D%7D,%7B%22op%22:%22",
"in%22,%22content%22:%7B%22field%22:%22files.",
"access%22,%22value%22:%5B%22",
"open%22%5D%7D%7D,%7B%22op%22:%22in%22,%22",
"content%22:%7B%22field%22:%22",
"files.data_format%22,%22value%22:%5B%22TXT%22%",
"5D%7D%7D,%7B%22op%22:%22",
"in%22,%22content%22:%7B%22field%22:%22files.",
"analysis.workflow_type%22,%22",
"value%22:%5B%22HTSeq%20-%20Counts%22,",
"%22HTSeq%20-%20FPKM%22%5D%7D%7D,%7B%22op%22:%22in%22,",
"%22content%22:%7B%22field%22:%22",
"files.experimental_strategy%22,%22value%22:",
"%5B%22RNA-Seq%22%5D%7D%7D,%7B%22",
"op%22:%22in%22,%22content%22:%7B%22field%22:",
"%22cases.project.project_id%22,",
"%22value%22:%5B%22TCGA-", toupper(tumor),
"%22%5D%7D%7D,%7B%22op%22:%22in%22,",
"%22content%22:%7B%22field%22:%22files.data_",
"category%22,%22value%22:%5B%22",
"Transcriptome%20Profiling%22%5D%7D%7D%5D%7D&",
"format=JSON"
)
message("\n\nDownloading manifest...\n")
sw <- function(x) {
suppressWarnings(x)
}
sw(json <- jsonlite::fromJSON(readLines(curl::curl(url))))
manifest_df <- json$data$hits
manipular <- manifest_df[, "cases"]
# 16(submitter_id) will be cases
manifest_df[, c("analysis", "acl")] <- NULL
cases <- matrix(nrow = nrow(manifest_df), ncol = 1)
for (index in seq_len(length(manipular))) {
patient_code <- as.character(unlist(manipular[[index]][1]))
tmp <- patient_code[grep("TCGA", patient_code)]
if (tmp == paste0('TCGA-', toupper(tumor))) {
patient_code <- as.character(unlist(manipular[[index]][2]))
cases[index, 1] <- patient_code[grep("TCGA", patient_code)]
} else {
cases[index, 1] <- tmp
}
}
manifest_df$cases <- cases
manifest_df[, "cases"] <- as.character(manifest_df[, "cases"])
write.table(
x = manifest_df, file = file.path(
direc, "manifest.sdrf"
),
quote = FALSE, row.names = FALSE, sep = "\t"
)
seletor <- unname(sapply(
manifest_df$submitter_id,
function(w) {
paste0(unlist(strsplit(w, "_"))[2])
}
))
manifest_df <- manifest_df[
grep(tolower(htseq), seletor),
c("file_name", "md5sum", "file_id")
]
colnames(manifest_df) <- c("filename", "md5", "id")
id_matrix <- manifest_df[, "id"]
} else if ("isoform" %in% tolower(data_type)) {
stop(message(
"\nThere is no isoform data in GDC data",
" base! (unfortunately...)\n"
))
}
}
# NOTE mutation ####
if ("mutation" %in% tolower(data_type)) {
dir.create(
path = file.path(
work_dir, "DOAGDC", toupper(tumor),
folder_name
),
showWarnings = FALSE
)
direc <- file.path(work_dir, "DOAGDC", toupper(tumor), folder_name)
if (db_bool) {
# legacy exclusive
if (tolower(platform) %in% "all") {
tmpform <- "Illumina GA|Illumina HiSeq"
} else if (tolower(platform) %in% "illumina ga") {
tmpform <- "Illumina GA"
} else if (tolower(platform) %in% "illumina hiseq") {
tmpform <- "Illumina HiSeq"
}
platform <- paste0(
"%22value%22:%5B%22Illumina%20GA%22,",
"%22Illumina%20HiSeq%22%5D%7D%7D"
)
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,center,",
"analysis&filters=%7B%22op%22",
":%22and%22,%22content%22:%5B%7B%22op%22:%22in%22,",
"%22content%22:%7B%22field%22",
":%22files.data_format%22,%22value%22:%5B%22MAF%22%",
"5D%7D%7D,%7B%22op%22:%22in%",
"22,%22content%22:%7B%22field%22:%22files.access%22",
",%22value%22:%5B%22open%22%",
"5D%7D%7D,%7B%22op%22:%22in%22,%22content%22:%7B%",
"22field%22:%22files.experimental",
"_strategy%22,%22value%22:%5B%22DNA-Seq%22%5D%7D%7D",
",%7B%22op%22:%22in%22,%22content",
"%22:%7B%22field%22:%22files.platform%22,", platform,
",%7B%22op%22:%22in%22,%22",
"content%22:%7B%22field%22:%22cases.project.",
"project_id%22,%22value%22:%5B%22TCGA-",
toupper(tumor), "%22%5D%7D%7D%5D%7D&format=JSON"
)
} else if (tolower(data_base) == "gdc") {
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,center,",
"analysis&filters=%7B%",
"22op%22:%22and%22,%22content%22:%5B%7B%22op%22:%22",
"in%22,%22content%22:",
"%7B%22field%22:%22files.data_format%22,%22value%22",
":%5B%22MAF%22%5D%7D%",
"7D,%7B%22op%22:%22in%22,%22content%22:%7B%22field",
"%22:%22files.access%",
"22,%22value%22:%5B%22open%22%5D%7D%7D,%7B%22op%22:",
"%22in%22,%22content%",
"22:%7B%22field%22:%22files.data_category%22,%22",
"value%22:%5B%22Simple%20",
"Nucleotide%20Variation%22%5D%7D%7D,%7B%22op%22:%22",
"in%22,%22content%22:",
"%7B%22field%22:%22files.data_type%22,%22value%22:",
"%5B%22Masked%20Somatic",
"%20Mutation%22%5D%7D%7D,%7B%22op%22:%22in%22,%22",
"content%22:%7B%22field%22",
":%22files.experimental_strategy%22,%22value%22:",
"%5B%22WXS%22%5D%7D%7D,%7B",
"%22op%22:%22in%22,%22content%22:%7B%22field%22:",
"%22cases.project.project_id",
"%22,%22value%22:%5B%22TCGA-", toupper(tumor),
"%22%5D%7D%7D%5D%7D&facetTab=cases&format=JSON"
)
}
json <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)
manifest_df <- json$data$hits
# checkin' if there are available data to download (e.g. LAML)
if (length(manifest_df) == 0) {
stop("There're not data to be downloaded for this cancer type!")
}
# center only available at legacy db
if (!is.null(manifest_df$center)) {
center <- manifest_df$center
manifest_df[, c("tags", "acl", "center")] <- NULL
manifest_df <- cbind(center, manifest_df)
} else {
analysis <- manifest_df$analysis
manifest_df[, c("analysis", "acl")] <- NULL
manifest_df <- cbind(analysis, manifest_df)
}
manipular <- manifest_df[, "cases"]
manifest_df[, "cases"] <- NULL
pre_cases <- matrix(data = "",
nrow = nrow(manifest_df),
ncol = ncol(manipular[[1]]["project"][[1]][1, ])
)
colnames(pre_cases) <- colnames(manipular[[1]]["project"][[1]][1, ])
for (i in seq_len(length(manipular))) {
cases <- unlist(manipular[[i]]["samples"])
cases <- as.character(cases[grep("TCGA", cases)])
cases <- unname(sapply(
cases,
function(w) {
paste(unlist(strsplit(w, "-"))[1:4], collapse = "-")
}
))
manifest_df[i, "cases"] <- paste(cases, collapse = "/")
pre_cases[i, ] <- as.matrix(manipular[[i]]["project"][[1]][1, ])
}
manifest_df <- cbind(manifest_df, pre_cases)
write.table(
x = manifest_df, file = paste0(direc, "/manifest.sdrf"),
quote = FALSE, row.names = FALSE, sep = "\t"
)
if (db_bool) {
manifest_df <- manifest_df[grep(tmpform, manifest_df$platform), ]
}
manifest_df <- manifest_df[, c("file_name", "md5sum", "file_id")]
colnames(manifest_df) <- c("filename", "md5", "id")
id_matrix <- manifest_df[, "id"]
}
# NOTE methylation ####
if ("methylation" %in% tolower(data_type)) {
dir.create(
path = file.path(
work_dir, "DOAGDC", toupper(tumor),
folder_name
),
showWarnings = FALSE
)
direc <- file.path(work_dir, "DOAGDC", toupper(tumor), folder_name)
if (tolower(platform) %in% "all") {
platform <- paste0(
"%22value%22:%5B%22Illumina%20Human%20Methylation",
"%20450%22,%22Illumina%20Human%20Methylation%2027%22%5D%7D%7D"
)
tmpform <- paste0(
"Illumina Human Methylation 450|Illumina ",
"Human Methylation 27"
)
} else if (tolower(platform) %in% "illumina human methylation 450") {
platform <- paste0(
"%22value%22:%5B%22Illumina%20Human%20",
"Methylation%20450%22%5D%7D%7D"
)
tmpform <- "Illumina Human Methylation 450"
} else if (tolower(platform) %in% "illumina human methylation 27") {
platform <- paste0(
"%22value%22:%5B%22Illumina%20Human%20",
"Methylation%2027%22%5D%7D%7D"
)
tmpform <- "Illumina Human Methylation 27"
}
size <- size_par(
type_of_data = "DNA Methylation", tumor = tumor,
db = tolower(data_base)
)
if (db_bool) {
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,center,",
"analysis&size=", size,
"&filters=%7B%22op%22:%22and%22,%22content%22:",
"%5B%7B%22op%22:%22in%22,",
"%22content%22:%7B%22field%22:%22files.data_format",
"%22,%22value%22:%5B",
"%22TXT%22%5D%7D%7D,%7B%22op%22:%22in%22,%22",
"content%22:%7B%22field%22",
":%22files.access%22,%22value%22:%5B%22open%22",
"%5D%7D%7D,%7B%22op%22:%",
"22in%22,%22content%22:%7B%22field%22:%22files",
".platform%22,", platform,
",%7B%22op%22:%22in%22,%22content%22:%7B%22",
"field%22:%22files.data_category%22,%22value%22:",
"%5B%22DNA%20methylation",
"%22%5D%7D%7D,%7B%22op%22:%22in%22,%22content%22:",
"%7B%22field%22:%22cases",
".project.project_id%22,%22value%22:%5B%22TCGA-",
toupper(tumor),
"%22%5D%7D%7D%5D%7D&format=JSON"
)
} else if (tolower(data_base) == "gdc") {
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,center,",
"analysis&size=",
size, "&filters=%7B%22op%22:%22and%22,%22content%22",
":%5B%7B%22op%22:%22in",
"%22,%22content%22:%7B%22field%22:%22files.data_",
"category%22,%22value%22:",
"%5B%22DNA%20Methylation%22%5D%7D%7D,%7B%22op%22:",
"%22in%22,%22content%22:",
"%7B%22field%22:%22files.platform%22,", platform,
",%7B%22op%22:%22in%22,%22content%22:%7B%",
"22field%22:%22files.access%22,%22value%22:%5B%22",
"open%22%5D%7D%7D,%7B%",
"22op%22:%22in%22,%22content%22:%7B%22field%22:%22",
"files.data_format%22,%",
"22value%22:%5B%22TXT%22%5D%7D%7D,%7B%22op%22:%22",
"in%22,%22content%22:%7B",
"%22field%22:%22cases.project.project_id%22,%22",
"value%22:%5B%22TCGA-", toupper(tumor),
"%22%5D%7D%7D%5D%7D&format=JSON"
)
}
message("\n\nDownloading manifest...\n")
json <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)$data$hits
manifest_df <- json
# checkin' if there are available data to download (e.g. LAML)
if (length(manifest_df) == 0) {
stop("There're not data to be downloaded for this cancer type!")
}
# center only available at legacy db
if (!is.null(manifest_df$center)) {
center <- manifest_df$center
manifest_df[, c("tags", "acl", "center")] <- NULL
manifest_df <- cbind(center, manifest_df)
} else {
analysis <- manifest_df$analysis
manifest_df[, c("analysis", "acl")] <- NULL
manifest_df <- cbind(analysis, manifest_df)
}
manipular <- manifest_df[, "cases"]
cases <- matrix(nrow = nrow(manifest_df), ncol = 1)
for (index in seq_len(length(manipular))) {
# patient_code <- manipular[[index]][ifelse(db_bool, 2, 1)]
patient_code <- as.character(unlist(manipular[[index]][1]))
tmp <- patient_code[grep("TCGA", patient_code)]
if (tmp == paste0('TCGA-', toupper(tumor))) {
patient_code <- as.character(unlist(manipular[[index]][2]))
cases[index, 1] <- patient_code[grep("TCGA", patient_code)]
} else {
cases[index, 1] <- tmp
}
}
manifest_df$cases <- cases
write.table(
x = manifest_df, file = paste0(direc, "/manifest.sdrf"),
quote = FALSE, row.names = FALSE, sep = "\t"
)
manifest_df <- manifest_df[
grep(tmpform, manifest_df$platform),
c("file_name", "md5sum", "file_id")
]
colnames(manifest_df) <- c("filename", "md5", "id")
id_matrix <- manifest_df[, "id"]
}
# NOTE clinical and image ####
tmp <- c("clinical", "biospecimen", "clinical_supplement", "image")
if (tolower(data_type) %in% tmp) {
dir.create(
path = file.path(
work_dir, "DOAGDC", toupper(tumor),
folder_name
),
showWarnings = FALSE
)
direc <- file.path(work_dir, "DOAGDC", toupper(tumor), folder_name)
if (db_bool) {
size <- size_par(
tumor = tumor, type_of_data = "Clinical",
db = "legacy"
)
if ("image" == tolower(data_type)) {
message(
"A lot of data are going to be downloaded,",
" please wait..."
)
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,",
"center,analysis&size=10000&filters=%7B%22op%22",
":%22and%22,%22content%22:%5B%7B%22op%22:%22",
"in%22,%22content%22:%7B%22field%22",
":%22files.data_category%22,%22value%22:%5B%22",
"Clinical%22%5D%7D%7D,%7B%22op%22",
":%22in%22,%22content%22:%7B%22field%22:%22",
"files.access%22,%22value%22:%5B%22",
"open%22%5D%7D%7D,%7B%22op%22:%22in%22,%22",
"content%22:%7B%22field%22:%22files.",
"platform%22,%22value%22:%5B%22Clinical%22%",
"5D%7D%7D,%7B%22op%22:%22in%22,%22",
"content%22:%7B%22field%22:%22cases.project.",
"project_id%22,%22value%22:%5B%22TCGA-",
toupper(tumor), "%22%5D%7D%7D,%7B%22op%22:%22",
"in%22,%22content%22:%7B%22field%22",
":%22files.data_format%22,%22value%22:%5B%22",
"SVS%22%5D%7D%7D%5D%7D&format=JSON"
)
} else if ("clinical_supplement" == tolower(data_type)) {
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,",
"center,analysis&size=", size, "&filters=",
"%7B%22op%22:%22and%22,%22content%22:%5B%7B",
"%22op%22:%22in%22,%22content%22:%7B%22",
"field%22:%22files.access%22,%22value%22:%5B",
"%22open%22%5D%7D%7D,%7B%22op%22:%22in%",
"22,%22content%22:%7B%22field%22:%22cases.",
"project.program.name%22,%22value%22:%5B%",
"22TCGA%22%5D%7D%7D,%7B%22op%22:%22in%22,%22",
"content%22:%7B%22field%22:%22cases.pro",
"ject.project_id%22,%22value%22:%5B%22TCGA-",
toupper(tumor),
"%22%5D%7D%7D,%7B%22op%22:%22in%22,",
"%22content%22:%7B%22field%22:%22files.data_",
"format%22,%22value%22:%5B%22BCR%20XML",
"%22%5D%7D%7D,%7B%22op%22:%22in%22,%22content",
"%22:%7B%22field%22:%22files.data_type",
"%22,%22value%22:%5B%22Clinical%20Supplement",
"%22%5D%7D%7D%5D%7D&format=JSON"
)
} else if ("clinical" == tolower(data_type)) {
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,",
"center,analysis&size=", size, "&filters=%7B%",
"22op%22:%22and%22,%22content%22:%5B%7B%22op",
"%22:%22in%22,%22content%22:%7B%22field%22",
":%22files.access%22,%22value%22:%5B%22open",
"%22%5D%7D%7D,%7B%22op%22:%22in%22,%22content",
"%22:%7B%22field%22:%22cases.project.program",
".name%22,%22value%22:%5B%22TCGA%22%5D%7D%7D,",
"%7B%22op%22:%22in%22,%22content%22:%7B%22",
"field%22:%22cases.project.project_id%22,%22",
"value%22:%5B%22TCGA-", toupper(tumor),
"%22%5D%7D%7D,%7B%22op%22:%22in%22,%22content%",
"22:%7B%22field%22:%22files.data_category%22,",
"%22value%22:%5B%22Clinical%22%5D%7D%7D,",
"%7B%22op%22:%22in%22,%22content%22:%7B%22",
"field%22:%22files.data_format%22,%22value%",
"22:%5B%22Biotab%22%5D%7D%7D%5D%7D&format=JSON"
)
} else if ("biospecimen" == tolower(data_type)) {
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,",
"center,analysis&size=", size, "&filters=%7B%22",
"op%22:%22and%22,%22content%22:%5B%7B%22op%22:",
"%22in%22,%22content%22:%7B%22field%22:%22",
"files.access%22,%22value%22:%5B%22open%22%5D",
"%7D%7D,%7B%22op%22:%22in%22,%22content%22:",
"%7B%22field%22:%22cases.project.program.name",
"%22,%22value%22:%5B%22TCGA%22%5D%7D%7D,%7B%",
"22op%22:%22in%22,%22content%22:%7B%22field%22",
":%22cases.project.project_id%22,%22value%",
"22:%5B%22TCGA-", toupper(tumor), "%22%5D%7D%",
"7D,%7B%22op%22:%22in%22,%22content%22:%7B%",
"22field%22:%22files.data_format%22,%22value",
"%22:%5B%22BCR%20XML%22%5D%7D%7D,%7B%22op%22",
":%22in%22,%22content%22:%7B%22field%22:%22",
"files.data_type%22,%22value%22:%5B%22Biospeci",
"men%20Supplement%22%5D%7D%7D%5D%7D&format=JSON"
)
}
} else if (tolower(data_base) == "gdc") {
size <- size_par(
tumor = tumor, type_of_data = "Clinical",
db = "gdc"
)
if ("clinical_supplement" == tolower(data_type)) {
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,",
"center,analysis&size=", size,
"&filters=%7B%22op%",
"22%3A%22and%22%2C%22content%22%3A%5B%7B%22op%",
"22%3A%22in%22%2C%22content%22%3A%7B%22field%22",
"%3A%22cases.project.project_id%22%2C%22value",
"%22%3A%5B%22TCGA-", toupper(tumor), "%22%5D%",
"7D%7D%2C%7B%22op%22%3A%22in%22%2C%22content",
"%22%3A%7B%22field%22%3A%22files.access%22%2C%22",
"value%22%3A%5B%22open%22%5D%7D%7D%2C%7B%22",
"op%22%3A%22in%22%2C%22content%22%3A%7B%22field",
"%22%3A%22files.data_format%22%2C%22value%",
"22%3A%5B%22BCR%20XML%22%5D%7D%7D%2C%7B%22op%22%",
"3A%22in%22%2C%22content%22%3A%7B%22field%22",
"%3A%22files.data_type%22%2C%22value%22%3A%5B%22",
"Clinical%20Supplement%22%5D%7D%7D%5D%7D&",
"format=JSON"
)
} else if ("clinical" == tolower(data_type)) {
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,",
"center,analysis&size=", size, "&filters=",
"%7B%22op%22%3A%22and%22%2C%22content%22",
"%3A%5B%7B%22op%22%3A%22in%22%2C%22content",
"%22%3A%7B%22field%22%3A%22cases.project.",
"program.name%22%2C%22value%22%3A%5B%22",
"TCGA%22%5D%7D%7D%2C%7B%22op%22%3A%22in%22",
"%2C%22content%22%3A%7B%22field%22%3A%22",
"cases.project.project_id%22%2C%22value%22%3A%5B",
"%22TCGA-", toupper(tumor), "%22%5D%7D%7D%2C",
"%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B",
"%22field%22%3A%22files.access%22%2C%22value",
"%22%3A%5B%22open%22%5D%7D%7D%2C%7B%22op%22",
"%3A%22in%22%2C%22content%22%3A%7B%22field",
"%22%3A%22files.data_category%22%2C%22value",
"%22%3A%5B%22Clinical%22%5D%7D%7D%2C%7B%22",
"op%22%3A%22in%22%2C%22content%22%3A%7B%22",
"field%22%3A%22files.data_format%22%2C%22",
"value%22%3A%5B%22BCR%20Biotab%22%5D%7D",
"%7D%5D%7D&format=JSON"
)
} else if ("biospecimen" == tolower(data_type)) {
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.proje",
"ct,center,analysis&size=", size,
"&filters=%7B%22op%",
"22%3A%22and%22%2C%22content%22%3A%5B%7B%22op",
"%22%3A%22in%22%2C%22content%22%3A%7B%22field%",
"22%3A%22cases.project.project_id%22%2C%22value%",
"22%3A%5B%22TCGA-", toupper(tumor),
"%22%5D%7D%7D%2C%7B%22op",
"%22%3A%22in%22%2C%22content%22%3A%7B%22field",
"%22%3A%22files.access%22%2C%22value%22%3A%5B%22",
"open%22%5D%7D%7D%2C%7B%22op%22%3A%22in%22%2C%",
"22content%22%3A%7B%22field%22%3A%22files.",
"data_format%22%2C%22value%22%3A%5B%22BCR%20XML",
"%22%5D%7D%7D%2C%7B%22op%22%3A%22in%22%2C%22",
"content%22%3A%7B%22field%22%3A%22files.data_",
"type%22%2C%22value%22%3A%5B%22Biospecimen%20",
"Supplement%22%5D%7D%7D%5D%7D&format=JSON"
)
}
}
message("\n\nDownloading manifest...\n")
json <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)
manifest_df <- json$data$hits
# checkin' if there are available data to download (e.g. LAML)
if (length(manifest_df) == 0) {
stop("There're not data to be downloaded for this cancer type!")
}
# center only available at legacy db
if (!is.null(manifest_df$center)) {
center <- manifest_df$center
manifest_df[, c("tags", "acl", "center", "cases")] <- NULL
manifest_df <- cbind(center, manifest_df)
} else {
manifest_df[, c("tags", "acl", "cases")] <- NULL
}
write.table(
x = manifest_df, file = paste0(direc, "/manifest.sdrf"),
quote = FALSE, row.names = FALSE, sep = "\t"
)
manifest_df <- manifest_df[, c("file_name", "md5sum", "file_id")]
colnames(manifest_df) <- c("filename", "md5", "id")
id_matrix <- manifest_df[, "id"]
}
# NOTE protein ####
if ("protein" == tolower(data_type)) {
if (tolower(data_base) == "gdc") {
stop(message(
"\nThrere is no protein expression data in",
" GDC data base!!",
"\nPlease use 'legacy' data base"
))
}
dir.create(
path = file.path(
work_dir, "DOAGDC", toupper(tumor),
folder_name
),
showWarnings = FALSE
)
direc <- file.path(work_dir, "DOAGDC", toupper(tumor), folder_name)
size <- size_par(
tumor = tumor, type_of_data = "Protein expression",
db = "legacy"
)
message("Cheking if size is different of zero...")
stopifnot(!is.na(size))
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,center,",
"analysis&size=", size, "&filters=%7B%22op%22",
":%22and%22,%22content%22:%5B%7B%22op%22:%22in%22,%22",
"content%22:%7B%22field%22",
":%22files.data_category%22,%22value%22:%5B%22",
"Protein%20expression%22%5D%7D%",
"7D,%7B%22op%22:%22in%22,%22content%22:%7B%22field",
"%22:%22files.access%22,%22",
"value%22:%5B%22open%22%5D%7D%7D,%7B%22op%22:%22in%22",
",%22content%22:%7B%22",
"field%22:%22cases.project.project_id%22,%22value%22",
":%5B%22TCGA-",
toupper(tumor), "%22%5D%7D%7D%5D%7D&format=JSON"
)
message("\n\nDownloading manifest...\n")
json <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)
manifest_df <- json$data$hits
# checkin' if there are available data to download (e.g. LAML)
if (length(manifest_df) == 0) {
stop("There're not data to be downloaded for this cancer type!")
}
center <- manifest_df$center
manifest_df[, c("center", "acl", "cases")] <- NULL
manifest_df <- cbind(center, manifest_df)
# for some reason sometimes- 16(submitter_id) will be cases
patient_code <- manifest_df$submitter_id
write.table(
x = manifest_df, file = paste0(direc, "/manifest.sdrf"),
quote = FALSE, row.names = FALSE, sep = "\t"
)
manifest_df <- manifest_df[, c("file_name", "md5sum", "file_id")]
colnames(manifest_df) <- c("filename", "md5", "id")
id_matrix <- manifest_df[, "id"]
}
# NOTE mage ####
if ("mage" == tolower(data_type)) {
dir.create(
path = file.path(
work_dir, "DOAGDC", toupper(tumor),
folder_name
),
showWarnings = FALSE
)
direc <- file.path(work_dir, "DOAGDC", toupper(tumor), folder_name)
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,center,",
"analysis&size=1000000&filters=%7B%22op%22:",
"%22and%22,%22content%22:%5B%7B%22op%22:%22in%22,%22",
"content%22:%7B%22field%22:",
"%22files.data_format%22,%22value%22:%5B%22MAGETAB%",
"22%5D%7D%7D,%7B%22op%22:%22",
"in%22,%22content%22:%7B%22field%22:%22files.access",
"%22,%22value%22:%5B%22open%",
"22%5D%7D%7D,%7B%22op%22:%22in%22,%22content%22:%7B",
"%22field%22:%22files.data_type",
"%22,%22value%22:%5B%22TCGA%20DCC%20Archive%22%5D%",
"7D%7D,%7B%22op%22:%22in%22,%22",
"content%22:%7B%22field%22:%22files.data_category",
"%22,%22value%22:%5B%22Archive%22",
"%5D%7D%7D%5D%7D&pagination=%7B%22files%22:%7B%22from",
"%22:0,%22size%22:100,%22sort",
"%22:%22cases.project.project_id:asc%22%7D",
"%7D&format=JSON"
)
message("\n\nDownloading manifest...\n")
json <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)
manifest_df <- json$data$hits
# checkin' if there are available data to download (e.g. LAML)
if (nrow(manifest_df) == 0) {
stop("There're not data to be downloaded for this cancer type!")
}
# 16(submitter_id) will be cases
manifest_df[, c("acl", "cases")] <- NULL
manifest_df <- manifest_df[grepl(toupper(tumor), manifest_df$file_name,
ignore.case = FALSE, perl = TRUE
), ]
write.table(
x = manifest_df, file = paste0(direc, "/manifest.sdrf"),
quote = FALSE, row.names = FALSE, sep = "\t"
)
manifest_df <- manifest_df[, c("file_name", "md5sum", "file_id")]
colnames(manifest_df) <- c("filename", "md5", "id")
id_matrix <- manifest_df[, "id"]
}
# NOTE mirna ####
is_mirna <- "mirna" %in% strsplit(tolower(data_type), split = " ")[[1]][1]
is_isoform <- "isoform expression quantification" == tolower(data_type)
if (is_mirna || is_isoform) {
dir.create(
path = file.path(
work_dir, "DOAGDC", toupper(tumor),
folder_name
),
showWarnings = FALSE
)
direc <- file.path(work_dir, "DOAGDC", toupper(tumor), folder_name)
if (tolower(platform) %in% "all") {
platform <- ""
tmp <- "Illumina HiSeq|Illumina GA|H-miRNA_8x15Kv2|H-miRNA_8x15Kv"
} else if (tolower(platform) %in% "illumina hiseq") {
platform <- paste0(
",%7B%22op%22:%22in%22,%22content%22:%7B%22",
"field%22:%22files.platform%22,%22",
"value%22:%5B%22Illumina%20HiSeq%22%5D%7D%7D"
)
tmp <- "Illumina HiSeq"
} else if (tolower(platform) %in% "illumina ga") {
platform <- paste0(
",%7B%22op%22:%22in%22,%22content%22:%7B%22",
"field%22:%22files.platform%22,%22value%22",
":%5B%22Illumina%20GA%22%5D%7D%7D"
)
tmp <- "Illumina GA"
} else if (tolower(platform) %in% "h-mirna_8x15kv2") {
platform <- paste0(
",%7B%22op%22:%22in%22,%22content%22:%7B%22",
"field%22:%22files.platform%22,%22",
"value%22:%5B%22H-miRNA_8x15Kv2%22%5D%7D%7D"
)
tmp <- "H-miRNA_8x15Kv2"
} else if (tolower(platform) %in% "h-mirna_8x15kv") {
platform <- paste0(
",%7B%22op%22:%22in%22,%22content%22:",
"%7B%22field%22:%22files.platform%22,",
"%22value%22:%5B%22H-miRNA_8x15Kv%22%5D%7D%7D"
)
tmp <- "H-miRNA_8x15Kv"
}
if (db_bool) {
size <- size_par(
type_of_data = "Gene expression", tumor = tumor,
db = tolower(data_base)
)
if ("mirna gene quantification" %in% tolower(data_type)) {
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,",
"center,analysis&size=", size,
"&filters=%7B%22op%22:%22and%22,%22content%22:",
"%5B%7B%22op%22:%22in%22,%",
"22content%22:%7B%22field%22:%22files.data_type",
"%22,%22value%22:%5B%22miRNA%",
"20gene%20quantification%22%5D%7D%7D,%7B%22op",
"%22:%22in%22,%22content%22:%7B%22",
"field%22:%22files.access%22,%22value%22:%5B%",
"22open%22%5D%7D%7D,%7B%22op%22:",
"%22in%22,%22content%22:%7B%22field%22:%22",
"cases.project.project_id%22,%22",
"value%22:%5B%22TCGA-", toupper(tumor),
"%22%5D%7D%7D", platform, "%5D%7D&format=JSON"
)
print(url)
} else if ("mirna isoform quantification" %in% tolower(data_type)) {
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,",
"center,analysis&size=", size,
"&filters=%7B%22op%22:%22and%22,%22content%22",
":%5B%7B%22op%22:%22in%22,%22",
"content%22:%7B%22field%22:%22files.data_type",
"%22,%22value%22:%5B%22miRNA%",
"20isoform%20quantification%22%5D%7D%7D,%7B%22",
"op%22:%22in%22,%22content%22",
":%7B%22field%22:%22files.access%22,%22value",
"%22:%5B%22open%22%5D%7D%7D,%7B",
"%22op%22:%22in%22,%22content%22:%7B%22field",
"%22:%22cases.project.project_id",
"%22,%22value%22:%5B%22TCGA-", toupper(tumor),
"%22%5D%7D%7D", platform, "%5D%7D&format=JSON"
)
}
} else if (tolower(data_base) == "gdc") {
size <- size_par(
type_of_data = "Transcriptome Profiling",
tumor = tumor, db = tolower(data_base)
)
if ("mirna expression quantification" %in% tolower(data_type)) {
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,",
"center,analysis&size=",
size, "&filters=%7B%22op%22%3A%22and%22%2C%",
"22content%22%3A%5B%7B",
"%22op%22%3A%22in%22%2C%22content%22%3A%7B%",
"22field%22%3A%22cases.",
"project.project_id%22%2C%22value%22%3A%",
"5B%22TCGA-", toupper(tumor),
"%22%5D%7D%7D%2C%7B%22op%22%3A%22in%22%",
"2C%22content%22%3A%7B%22",
"field%22%3A%22files.access%22%2C%22value",
"%22%3A%5B%22open%22%5D%7D%",
"7D%2C%7B%22op%22%3A%22in%22%2C%22content%",
"22%3A%7B%22field%22%3A%22",
"files.data_type%22%2C%22value%22%3A%5B%22",
"miRNA%20Expression%20Quan",
"tification%22%5D%7D%7D%5D%7D&format=JSON"
)
} else if (is_isoform) {
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,",
"center,analysis&size=",
size, "&filters=%7B%22op%22%3A%22and%22%2C%22",
"content%22%3A%5B%7B%22",
"op%22%3A%22in%22%2C%22content%22%3A%7B%22",
"field%22%3A%22cases.project",
".project_id%22%2C%22value%22%3A%5B%22TCGA-",
toupper(tumor),
"%22%5D%7D%7D%2C%7B%22op%22%3A%22in%22%2C%22",
"content%22%3A%7B%22field",
"%22%3A%22files.access%22%2C%22value%22%3A%5B",
"%22open%22%5D%7D%7D%2C%",
"7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%",
"22field%22%3A%22files.",
"data_type%22%2C%22value%22%3A%5B%22Isoform%",
"20Expression%20Quantifi",
"cation%22%5D%7D%7D%5D%7D&format=JSON"
)
}
}
message("\n\nDownloading manifest...\n")
json <- jsonlite::fromJSON(url, simplifyDataFrame = TRUE)
manifest_df <- json$data$hits
# checkin' if there are available data to download (e.g. LAML)
if (length(manifest_df) == 0) {
stop("There're not data to be downloaded for this cancer type!")
}
# center only available at legacy db
if (!is.null(manifest_df$center)) {
center <- manifest_df$center
manifest_df[, c("tags", "acl", "center")] <- NULL
manifest_df <- cbind(center, manifest_df)
} else {
analysis <- manifest_df$analysis
manifest_df[, c("analysis", "acl")] <- NULL
manifest_df <- cbind(analysis, manifest_df)
}
manipular <- manifest_df[, "cases"]
cases <- matrix(nrow = nrow(manifest_df), ncol = 1)
for (index in seq_len(length(manipular))) {
patient_code <- as.character(unlist(manipular[[index]][1]))
tmp <- patient_code[grep("TCGA", patient_code)]
if (tmp == paste0('TCGA-', toupper(tumor))) {
patient_code <- as.character(unlist(manipular[[index]][2]))
cases[index, 1] <- patient_code[grep("TCGA", patient_code)]
} else {
cases[index, 1] <- tmp
}
}
manifest_df$cases <- cases
write.table(
x = manifest_df, file = paste0(direc, "/manifest.sdrf"),
quote = FALSE, row.names = FALSE, sep = "\t"
)
if (db_bool) {
manifest_df <- manifest_df[grep(tmp, manifest_df$platform), ]
}
manifest_df <- manifest_df[, c("file_name", "md5sum", "file_id")]
colnames(manifest_df) <- c("filename", "md5", "id")
id_matrix <- manifest_df[, "id"]
}
# NOTE Exon quantification ####
if ("exon" == strsplit(tolower(data_type), split = " ")[[1]][1]) {
if (tolower(data_base) == "gdc") {
stop(message(
"\nThrere is no Exon quantification ",
"data in GDC data base!!",
"\nPlease use 'legacy' data base"
))
}
dir.create(
path = file.path(
work_dir, "DOAGDC", toupper(tumor),
folder_name
),
showWarnings = FALSE
)
direc <- file.path(work_dir, "DOAGDC", toupper(tumor), folder_name)
if (tolower(platform) %in% "all") {
tmpform <- "Illumina GA|Illumina HiSeq"
} else if (tolower(platform) %in% "illumina ga") {
tmpform <- "Illumina GA"
} else if (tolower(platform) %in% "illumina hiseq") {
tmpform <- "Illumina HiSeq"
}
platform <- paste0(
"%22value%22:%5B%22Illumina%20GA%22,",
"%22Illumina%20HiSeq%22%5D%7D%7D"
)
size <- size_par(
tumor = tumor, type_of_data = "Gene expression",
db = "legacy"
)
url <- paste0(
url_inicio, "?pretty=true&expand=cases.samples.",
"portions.analytes.aliquots,cases.project,center,",
"analysis&size=", size, "&filters=",
"%7B%22op%22:%22and%22,%22content%22:%5B%7B%22op%22:",
"%22in%22,%22content%22:%7B%22",
"field%22:%22cases.project.program.name%22,%22value",
"%22:%5B%22TCGA%22%5D%7D%7D,%7B",
"%22op%22:%22in%22,%22content%22:%7B%22field%22:%22",
"files.data_type%22,%22value%22:",
"%5B%22Exon%20quantification%22%5D%7D%7D,%7B%22op%22:",
"%22in%22,%22content%22:%7B%22",
"field%22:%22files.access%22,%22value%22:%5B%22open%22%",
"5D%7D%7D,%7B%22op%22:%22in%",
"22,%22content%22:%7B%22field%22:%22files.platform%22,",
platform, ",%7B%22op%22:",
"%22in%22,%22content%22:%7B%22field%22:%22cases.project",
".project_id%22,%22value%22",
":%5B%22TCGA-", toupper(tumor),
"%22%5D%7D%7D%5D%7D&format=JSON"
)
message("\n\nDownloading manifest...\n")
json <- tryCatch(jsonlite::fromJSON(url, simplifyDataFrame = TRUE),
error = function(e) stop(e),
finally = message(
"The tumor ", toupper(tumor),
" does not have ", tolower(data_type),
"data!"
)
)
manifest_df <- json$data$hits
# checkin' if there are available data to download (e.g. LAML)
if (length(manifest_df) == 0) {
stop("There're not data to be downloaded for this cancer type!")
}
center <- manifest_df$center
manifest_df[, c("center", "acl", "cases", "tags")] <- NULL
manifest_df <- cbind(center, manifest_df)
# for some reason sometimes- 16(submitter_id) will be cases
patient_code <- manifest_df$submitter_id # same protein problem
write.table(
x = manifest_df, file = paste0(direc, "/manifest.sdrf"),
quote = FALSE, row.names = FALSE, sep = "\t"
)
manifest_df <- manifest_df[
grep(tmpform, manifest_df$platform),
c("file_name", "md5sum", "file_id")
]
colnames(manifest_df) <- c("filename", "md5", "id")
id_matrix <- manifest_df[, "id"]
}
# NOTE Download PPD ####
if (length(dir(direc)) > 1) {
# verifying if the data is already downloaded
pattern <- paste(".sdrf", "Data_access_time.txt", sep = "|")
already_downloaded <- dir(
path = direc, include.dirs = FALSE,
recursive = FALSE,
full.names = TRUE
)[!grepl(
pattern,
dir(path = direc)
)]
message("Checking md5 from downloaded files\n")
already_downloaded_md5 <- as.vector(tools::md5sum(already_downloaded))
selector <- manifest_df[, "md5"] %in% already_downloaded_md5
id_matrix <- manifest_df[!selector, "id"]
}
if (length(id_matrix) != 0) {
message("OK!\n")
### download data
url <- paste0(inicio, id_matrix)
pb <- txtProgressBar(min = 0, max = length(url), style = 3)
cont <- 0
for (id in url) {
cont <- cont + 1
message(paste("\nDownloading", tumor, data_type, cont, "of",
length(url),
sep = " "
))
setTxtProgressBar(pb, cont)
tmp <- manifest_df[manifest_df$id == id_matrix[cont], "filename"]
download_httr(url = id, destfile = paste0(direc, "/", tmp))
md5 <- tools::md5sum(dir(
path = direc, pattern = tmp,
full.names = TRUE
))
tmp_md5 <- manifest_df[manifest_df$id == id_matrix[cont], "md5"]
while (md5[[1]] != tmp_md5) {
message(paste0(
"The md5 of file '", tmp,
"' is wrong. Downloading again...\n"
))
download_httr(url = id, destfile = paste0(direc, "/", tmp))
md5 <- tools::md5sum(dir(direc, tmp, full.names = TRUE))
}
}
close(pb)
# from python
# file_endpt = 'https://api.gdc.cancer.gov/files/'
# file_uuid = 'd853e541-f16a-4345-9f00-88e03c2dc0bc'
# response = requests.get(file_endpt + file_uuid)
# message(sprintf(
# "On %s I realized %s was...\n%s by the street", Sys.Date(), person,
# action))
message("\n\nDownload is done!\n\n")
} else {
message(paste0(
"There is nothing to download for ", tumor,
". You already have all data available",
" in the selected data base!"
))
}
# saving accession data
write.table(Sys.time(), paste0(direc, "/Data_access_time.txt"),
quote = FALSE,
row.names = FALSE, col.names = FALSE
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.