# Document-wide knitr settings: echo chunk source in the rendered output
# and switch off knitr's progress reporting.
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(progress = FALSE)
TCGAbiolinks provides a few functions to search the GDC database.
library(TCGAbiolinks) library(SummarizedExperiment) library(dplyr) library(DT)
You can easily search GDC data using the `GDCquery` function.
Using a summary of filters as used in the TCGA portal, the function works with the following arguments:
| project | A list of valid projects (see table below) | |
|---|---|---|
| data.category | A valid data category (see list with `TCGAbiolinks:::getProjectSummary(project)`) | |
| data.type | A data type to filter the files to download | |
| workflow.type | GDC workflow type | |
| access | Filter by access type. Possible values: controlled, open | |
| platform | Example: | |
| | CGH-1x1M_G4447A | IlluminaGA_RNASeqV2 |
| | AgilentG4502A_07 | IlluminaGA_mRNA_DGE |
| | Human1MDuo | HumanMethylation450 |
| | HG-CGH-415K_G4124A | IlluminaGA_miRNASeq |
| | HumanHap550 | IlluminaHiSeq_miRNASeq |
| | ABI | H-miRNA_8x15K |
| | HG-CGH-244A | SOLiD_DNASeq |
| | IlluminaDNAMethylation_OMA003_CPI | IlluminaGA_DNASeq_automated |
| | IlluminaDNAMethylation_OMA002_CPI | HG-U133_Plus_2 |
| | HuEx-1_0-st-v2 | Mixed_DNASeq |
| | H-miRNA_8x15Kv2 | IlluminaGA_DNASeq_curated |
| | MDA_RPPA_Core | IlluminaHiSeq_TotalRNASeqV2 |
| | HT_HG-U133A | IlluminaHiSeq_DNASeq_automated |
| | diagnostic_images | microsat_i |
| | IlluminaHiSeq_RNASeq | SOLiD_DNASeq_curated |
| | IlluminaHiSeq_DNASeqC | Mixed_DNASeq_curated |
| | IlluminaGA_RNASeq | IlluminaGA_DNASeq_Cont_automated |
| | IlluminaGA_DNASeq | IlluminaHiSeq_WGBS |
| | pathology_reports | IlluminaHiSeq_DNASeq_Cont_automated |
| | Genome_Wide_SNP_6 | bio |
| | tissue_images | Mixed_DNASeq_automated |
| | HumanMethylation27 | Mixed_DNASeq_Cont_curated |
| | IlluminaHiSeq_RNASeqV2 | Mixed_DNASeq_Cont |
| file.type | To be used in the legacy database for some platforms, to define which file types to be used. | |
| barcode | A list of barcodes to filter the files to download | |
| experimental.strategy | Filter to experimental strategy. Harmonized: WXS, RNA-Seq, miRNA-Seq, Genotyping Array. | |
| sample.type | A sample type to filter the files to download | |
The options for the field `project` are below:
# Interactive, filterable table of the projects returned by getGDCprojects().
TCGAbiolinks:::getGDCprojects() %>%
  datatable(
    filter = "top",
    options = list(scrollX = TRUE, keys = TRUE, pageLength = 10),
    rownames = FALSE,
    caption = "List of projects"
  )
The options for the field `sample.type` are below:
# Interactive, filterable table of the values returned by
# getBarcodeDefinition(), shown under the caption "List sample types".
TCGAbiolinks:::getBarcodeDefinition() %>%
  datatable(
    filter = "top",
    options = list(scrollX = TRUE, keys = TRUE, pageLength = 10),
    rownames = FALSE,
    caption = "List sample types"
  )
The other fields (`data.category`, `data.type`, `workflow.type`, `platform`, `file.type`) can be found below. Please note that these tables are still incomplete.
# Read the CSV export of the Google Sheet that lists the remaining query
# fields and show it as an interactive table, 40 rows per page.
readr::read_csv(
  "https://docs.google.com/spreadsheets/d/1f98kFdj9mxVDc1dv4xTZdx8iWgUiDYO-qiFJINvmTZs/export?format=csv&gid=2046985454",
  col_types = readr::cols()
) %>%
  datatable(
    filter = "top",
    options = list(scrollX = TRUE, keys = TRUE, pageLength = 40),
    rownames = FALSE
  )
In this example we will access the harmonized database and search for all DNA methylation data for recurrent glioblastoma multiforme (GBM) and low grade glioma (LGG) samples.
# DNA methylation files (450k platform) for recurrent tumor samples across
# the TCGA-GBM and TCGA-LGG projects.
query <- GDCquery(
  project = c("TCGA-GBM", "TCGA-LGG"),
  data.category = "DNA Methylation",
  platform = c("Illumina Human Methylation 450"),
  sample.type = "Recurrent Tumor"
)

# Show the query results as an interactive table, 5 rows per page.
getResults(query) %>%
  datatable(
    filter = "top",
    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
    rownames = FALSE
  )
In this example we will access the harmonized database and search for all patients with DNA methylation (platform HumanMethylation450k) and gene expression data for Colon Adenocarcinoma tumor (TCGA-COAD).
# DNA methylation files (450k platform) for TCGA-COAD.
query_met <- GDCquery(
  project = "TCGA-COAD",
  data.category = "DNA Methylation",
  platform = c("Illumina Human Methylation 450")
)

# Gene expression quantification (STAR - Counts workflow) for TCGA-COAD.
query_exp <- GDCquery(
  project = "TCGA-COAD",
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  workflow.type = "STAR - Counts"
)

# Get all patients that have DNA methylation and gene expression.
# The first 12 characters of each case barcode are used as the patient
# identifier when intersecting the two result sets.
common.patients <- intersect(
  substr(getResults(query_met, cols = "cases"), 1, 12),
  substr(getResults(query_exp, cols = "cases"), 1, 12)
)

# Only select the first 5 patients.
# head() is used instead of common.patients[1:5]: indexing past the end of
# the vector pads the result with NA when fewer than 5 patients are shared,
# and those NA values would be passed to GDCquery() as barcodes.
query_met <- GDCquery(
  project = "TCGA-COAD",
  data.category = "DNA Methylation",
  platform = c("Illumina Human Methylation 450"),
  barcode = head(common.patients, 5)
)

query_exp <- GDCquery(
  project = "TCGA-COAD",
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification",
  workflow.type = "STAR - Counts",
  barcode = head(common.patients, 5)
)
# Data type and case barcodes of the DNA methylation query results.
getResults(query_met, cols = c("data_type", "cases")) %>%
  datatable(
    filter = "top",
    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
    rownames = FALSE
  )

# Data type and case barcodes of the gene expression query results.
getResults(query_exp, cols = c("data_type", "cases")) %>%
  datatable(
    filter = "top",
    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
    rownames = FALSE
  )
This example shows how the user can search for adrenocortical carcinoma (TCGA-ACC) raw sequencing data (controlled access) and verify the names of the files and the barcodes associated with them.
# Aligned-read BAM files for TCGA-ACC, queried once per workflow type.
# `query` is overwritten by each search, so after this chunk it holds the
# result of the last one ("BWA with Mark Duplicates and BQSR").

# Workflow: STAR 2-Pass Transcriptome
query <- GDCquery(
  project = "TCGA-ACC",
  data.category = "Sequencing Reads",
  data.type = "Aligned Reads",
  data.format = "bam",
  workflow.type = "STAR 2-Pass Transcriptome"
)
# Only first 10 to make render faster
getResults(query, rows = 1:10, cols = c("file_name", "cases")) %>%
  datatable(
    filter = "top",
    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
    rownames = FALSE
  )

# Workflow: STAR 2-Pass Genome
query <- GDCquery(
  project = "TCGA-ACC",
  data.category = "Sequencing Reads",
  data.type = "Aligned Reads",
  data.format = "bam",
  workflow.type = "STAR 2-Pass Genome"
)
# Only first 10 to make render faster
getResults(query, rows = 1:10, cols = c("file_name", "cases")) %>%
  datatable(
    filter = "top",
    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
    rownames = FALSE
  )

# Workflow: STAR 2-Pass Chimeric
query <- GDCquery(
  project = "TCGA-ACC",
  data.category = "Sequencing Reads",
  data.type = "Aligned Reads",
  data.format = "bam",
  workflow.type = "STAR 2-Pass Chimeric"
)
# Only first 10 to make render faster
getResults(query, rows = 1:10, cols = c("file_name", "cases")) %>%
  datatable(
    filter = "top",
    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
    rownames = FALSE
  )

# Workflow: BWA-aln
query <- GDCquery(
  project = "TCGA-ACC",
  data.category = "Sequencing Reads",
  data.type = "Aligned Reads",
  data.format = "bam",
  workflow.type = "BWA-aln"
)
# Only first 10 to make render faster
getResults(query, rows = 1:10, cols = c("file_name", "cases")) %>%
  datatable(
    filter = "top",
    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
    rownames = FALSE
  )

# Workflow: BWA with Mark Duplicates and BQSR
query <- GDCquery(
  project = "TCGA-ACC",
  data.category = "Sequencing Reads",
  data.type = "Aligned Reads",
  data.format = "bam",
  workflow.type = "BWA with Mark Duplicates and BQSR"
)
# Only first 10 to make render faster
getResults(query, rows = 1:10, cols = c("file_name", "cases")) %>%
  datatable(
    filter = "top",
    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
    rownames = FALSE
  )
If you want to get the manifest file from the query object, you can use the function `getManifest`. If you set `save` to `TRUE`, a txt file will be created that can be used with the GDC-client Data Transfer Tool (DTT) or with its GUI version, dtt-ui.
# Build the manifest for the current `query` object with save = FALSE.
getManifest(query, save = FALSE)
For the moment, ATAC-seq data is available at the GDC publication page. Also, for more details, you can check an ATAC-seq workshop at http://rpubs.com/tiagochst/atac_seq_workshop
The list of available files is below:
# File name and file size of every entry in the ATAC-seq query results,
# shown as an interactive table with 5 rows per page.
datatable(
  getResults(TCGAbiolinks:::GDCquery_ATAC_seq())[, c("file_name", "file_size")],
  filter = "top",
  options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
  rownames = FALSE
)
You can use the function `GDCquery_ATAC_seq` to filter the manifest table and `GDCdownload` to save the data locally.
# Query the ATAC-seq files with file.type = "rds" and download them
# using the "client" download method.
query <- TCGAbiolinks:::GDCquery_ATAC_seq(file.type = "rds")
GDCdownload(query, method = "client")

# Same for the files with file.type = "bigWigs".
query <- TCGAbiolinks:::GDCquery_ATAC_seq(file.type = "bigWigs")
GDCdownload(query, method = "client")
Retrieve the number of files under each data_category + data_type + experimental_strategy + platform, similar to the view at https://portal.gdc.cancer.gov/exploration
# Per-sample file summary for the TCGA-ACC project.
tab <- getSampleFilesSummary(project = "TCGA-ACC")

# Show only the head() of the summary as an interactive table.
head(tab) %>%
  datatable(
    filter = "top",
    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
    rownames = FALSE
  )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.