Nothing
## ----init, results='hide', echo=FALSE, warning=FALSE, message=FALSE-----------
library(knitr)
opts_chunk$set(warning=FALSE, message=FALSE)
BiocStyle::markdown()
## ----install_bioc, eval=FALSE-------------------------------------------------
# if (!require("BiocManager"))
# install.packages("BiocManager")
# BiocManager::install('GenomicDataCommons')
## ----libraries, message=FALSE-------------------------------------------------
library(GenomicDataCommons)
## ----statusQS-----------------------------------------------------------------
GenomicDataCommons::status()
## ----statusCheck--------------------------------------------------------------
stopifnot(GenomicDataCommons::status()$status=="OK")
## ----manifest-----------------------------------------------------------------
ge_manifest = files() %>%
filter( cases.project.project_id == 'TCGA-OV') %>%
filter( type == 'gene_expression' ) %>%
filter( analysis.workflow_type == 'HTSeq - Counts') %>%
manifest()
head(ge_manifest)
fnames = lapply(ge_manifest$id[1:20],gdcdata)
## ----downloadQS---------------------------------------------------------------
fnames = lapply(ge_manifest$id[1:20],gdcdata)
## ----gdc_clinical-------------------------------------------------------------
case_ids = cases() %>% results(size=10) %>% ids()
clindat = gdc_clinical(case_ids)
names(clindat)
## ----clinData-----------------------------------------------------------------
head(clindat[["main"]])
head(clindat[["diagnoses"]])
## ----metadataQS---------------------------------------------------------------
expands = c("diagnoses","annotations",
"demographic","exposures")
clinResults = cases() %>%
GenomicDataCommons::select(NULL) %>%
GenomicDataCommons::expand(expands) %>%
results(size=50)
str(clinResults[[1]],list.len=6)
# or listviewer::jsonedit(clinResults)
## ----projectquery-------------------------------------------------------------
pquery = projects()
## ----pquery-------------------------------------------------------------------
str(pquery)
## ----pquerycount--------------------------------------------------------------
pcount = count(pquery)
# or
pcount = pquery %>% count()
pcount
## ----pqueryresults------------------------------------------------------------
presults = pquery %>% results()
## ----presultsstr--------------------------------------------------------------
str(presults)
## ----presultsall--------------------------------------------------------------
length(ids(presults))
presults = pquery %>% results_all()
length(ids(presults))
# includes all records
length(ids(presults)) == count(pquery)
## ----defaultfields------------------------------------------------------------
default_fields('files')
# The number of fields available for files endpoint
length(available_fields('files'))
# The first few fields available for files endpoint
head(available_fields('files'))
## ----selectexample------------------------------------------------------------
# Default fields here
qcases = cases()
qcases$fields
# set up query to use ALL available fields
# Note that checking of fields is done by select()
qcases = cases() %>% GenomicDataCommons::select(available_fields('cases'))
head(qcases$fields)
## ----aggexample---------------------------------------------------------------
# total number of files of a specific type
res = files() %>% facet(c('type','data_type')) %>% aggregations()
res$type
## ----allfilesunfiltered-------------------------------------------------------
qfiles = files()
qfiles %>% count() # all files
## ----onlyGeneExpression-------------------------------------------------------
qfiles = files() %>% filter( type == 'gene_expression')
# here is what the filter looks like after translation
str(get_filter(qfiles))
## ----filtAvailFields----------------------------------------------------------
grep('pro',available_fields('files'),value=TRUE) %>%
head()
## ----filtProgramID------------------------------------------------------------
files() %>%
facet('cases.project.project_id') %>%
aggregations() %>%
head()
## ----filtfinal----------------------------------------------------------------
qfiles = files() %>%
filter( cases.project.project_id == 'TCGA-OV' & type == 'gene_expression')
str(get_filter(qfiles))
qfiles %>% count()
## ----filtChain----------------------------------------------------------------
qfiles2 = files() %>%
filter( cases.project.project_id == 'TCGA-OV') %>%
filter( type == 'gene_expression')
qfiles2 %>% count()
(qfiles %>% count()) == (qfiles2 %>% count()) #TRUE
## ----filtAndManifest----------------------------------------------------------
manifest_df = qfiles %>% manifest()
head(manifest_df)
## ----filterForHTSeqCounts-----------------------------------------------------
qfiles = files() %>% filter( ~ cases.project.project_id == 'TCGA-OV' &
type == 'gene_expression' &
analysis.workflow_type == 'HTSeq - Counts')
manifest_df = qfiles %>% manifest()
nrow(manifest_df)
## ----authenNoRun, eval=FALSE--------------------------------------------------
# token = gdc_token()
# transfer(...,token=token)
# # or
# transfer(...,token=get_token())
## ----singlefileDL-------------------------------------------------------------
fnames = gdcdata(manifest_df$id[1:2],progress=FALSE)
## ----bulkDL-------------------------------------------------------------------
fnames = gdcdata(manifest_df$id[3:10], access_method = 'client')
## ----casesPerProject----------------------------------------------------------
res = cases() %>% facet("project.project_id") %>% aggregations()
head(res)
library(ggplot2)
ggplot(res$project.project_id,aes(x = key, y = doc_count)) +
geom_bar(stat='identity') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## ----casesInTCGA--------------------------------------------------------------
cases() %>% filter(~ project.program.name=='TARGET') %>% count()
## ----casesInTARGET------------------------------------------------------------
cases() %>% filter(~ project.program.name=='TCGA') %>% count()
## ----casesTCGABRCASampleTypes-------------------------------------------------
# The need to do the "&" here is a requirement of the
# current version of the GDC API. I have filed a feature
# request to remove this requirement.
resp = cases() %>% filter(~ project.project_id=='TCGA-BRCA' &
project.project_id=='TCGA-BRCA' ) %>%
facet('samples.sample_type') %>% aggregations()
resp$samples.sample_type
## ----casesTCGABRCASolidNormal-------------------------------------------------
# The need to do the "&" here is a requirement of the
# current version of the GDC API. I have filed a feature
# request to remove this requirement.
resp = cases() %>% filter(~ project.project_id=='TCGA-BRCA' &
samples.sample_type=='Solid Tissue Normal') %>%
GenomicDataCommons::select(c(default_fields(cases()),'samples.sample_type')) %>%
response_all()
count(resp)
res = resp %>% results()
str(res[1],list.len=6)
head(ids(resp))
## ----filesVCFCount------------------------------------------------------------
res = files() %>% facet('type') %>% aggregations()
res$type
ggplot(res$type,aes(x = key,y = doc_count)) + geom_bar(stat='identity') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## ----filesRNAseqGeneGBM-------------------------------------------------------
q = files() %>%
GenomicDataCommons::select(available_fields('files')) %>%
filter(~ cases.project.project_id=='TCGA-GBM' &
data_type=='Gene Expression Quantification')
q %>% facet('analysis.workflow_type') %>% aggregations()
# so need to add another filter
file_ids = q %>% filter(~ cases.project.project_id=='TCGA-GBM' &
data_type=='Gene Expression Quantification' &
analysis.workflow_type == 'HTSeq - Counts') %>%
GenomicDataCommons::select('file_id') %>%
response_all() %>%
ids()
## ----filesRNAseqGeneGBMforBAM-------------------------------------------------
q = files() %>%
GenomicDataCommons::select(available_fields('files')) %>%
filter(~ cases.project.project_id == 'TCGA-GBM' &
data_type == 'Aligned Reads' &
experimental_strategy == 'RNA-Seq' &
data_format == 'BAM')
file_ids = q %>% response_all() %>% ids()
## ----slicing10, eval=FALSE----------------------------------------------------
# bamfile = slicing(file_ids[1],regions="chr12:6534405-6538375",token=gdc_token())
# library(GenomicAlignments)
# aligns = readGAlignments(bamfile)
## ----sessionInfo--------------------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.