Data Introduction

This package provides all necessary data to run and compile TCGAbiolinks package.

Loading the data

library(TCGAbiolinksData)
data("case1")
data("case2")
data("case3")
data("geneInfoHT")
data("geneInfo")

Data Creation

Case study n. 1: Pan Cancer downstream analysis BRCA

This case of study downlods Breast Invasive Carcinoma (BRCA) Gene Expression Quantification data aligned against GRCh37/hg19, performs DEA analysis, and correlation between it with the survival analysis.

library(SummarizedExperiment)
library(TCGAbiolinks)
query.exp <- GDCquery(project = "TCGA-BRCA", 
                      legacy = TRUE,
                      data.category = "Gene expression",
                      data.type = "Gene expression quantification",
                      platform = "Illumina HiSeq", 
                      file.type = "results",
                      experimental.strategy = "RNA-Seq",
                      sample.type = c("Primary solid Tumor","Solid Tissue Normal"))
GDCdownload(query.exp)
brca.exp <- GDCprepare(query = query.exp)

# get subtype information
dataSubt <- TCGAquery_subtype(tumor = "BRCA")

# get clinical data
dataClin <- GDCquery_clinic(project = "TCGA-BRCA","clinical") 

# which samples are solid tissue normal 
group1 <- TCGAquery_SampleTypes(colnames(brca.exp), typesample = c("NT"))

# Which samples are primary solid tumor
group2 <- TCGAquery_SampleTypes(colnames(brca.exp), typesample = c("TP"))

dataPrep <- TCGAanalyze_Preprocessing(object = brca.exp, cor.cut = 0.6)                      

dataNorm <- TCGAanalyze_Normalization(tabDF = dataPrep,
                                      geneInfo = geneInfo,
                                      method = "gcContent")                

dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm,
                                  method = "quantile", 
                                  qnt.cut =  0.25)   

dataDEGs <- TCGAanalyze_DEA(mat1 = dataFilt[,group1],
                            mat2 = dataFilt[,group2],
                            Cond1type = "Normal",
                            Cond2type = "Tumor",
                            fdr.cut = 0.01 ,
                            logFC.cut = 1,
                            method = "glmLRT")  

dataSurv <- TCGAanalyze_SurvivalKM(clinical_patient = dataClin,
                                   dataGE = dataFilt,
                                   Genelist = rownames(dataDEGs),
                                   Survresult = FALSE,
                                   ThreshTop = 0.67,
                                   ThreshDown = 0.33,
                                   p.cut = 0.05, 
                                   group1 = group1, 
                                   group2 = group2)
library(dnet)  
org.Hs.string <- dRDataLoader(RData = "org.Hs.string")
TabCoxNet <- TCGAvisualize_SurvivalCoxNET(dataClin,
                                          dataFilt, 
                                          Genelist = rownames(dataSurv),
                                          scoreConfidence = 700,
                                          org.Hs.string = org.Hs.string,
                                          titlePlot = "Case Study n.1 dnet")

save(brca.exp,dataClin,dataSurv,dataSubt, group1, group2, file = "case1.rda", compress = "xz")

Case study n. 2: Pan Cancer downstream analysis LGG

This case of study downlods low grade gliomas (LGG) Gene Expression Quantification data aligned against GRCh38/hg38, performs clusterization algorithm, which result will be visualized thourgh a heatmap. Finally evaluate the correlation between the clustering and the survival of each group.

library(TCGAbiolinks)
library(SummarizedExperiment)

query.exp <- GDCquery(project = "TCGA-LGG", 
                      data.category = "Transcriptome Profiling",
                      data.type = "Gene Expression Quantification", 
                      workflow.type = "HTSeq - Counts",
                      sample.type = "Primary solid Tumor")
GDCdownload(query.exp)
lgg.exp <- GDCprepare(query = query.exp)

# get subtype information
dataSubt <- TCGAquery_subtype(tumor = "LGG")

# get indexed clinical data
dataClin <- GDCquery_clinic(project = "TCGA-LGG", "Clinical")

# expression data with molecular subtypes
lgg.exp <- subset(lgg.exp, select = colData(lgg.exp)$patient %in% dataSubt$patient)

dataPrep <- TCGAanalyze_Preprocessing(object = lgg.exp,cor.cut = 0.6)
dataNorm <- TCGAanalyze_Normalization(tabDF = dataPrep,
                                      geneInfo = geneInfoHT, # infor for hg38 data
                                      method = "gcContent")

datFilt1 <- TCGAanalyze_Filtering(tabDF = dataNorm,method = "varFilter")
datFilt2 <- TCGAanalyze_Filtering(tabDF = datFilt1,method = "filter1")
datFilt <- TCGAanalyze_Filtering(tabDF = datFilt2,method = "filter2")

data_Hc1 <- TCGAanalyze_Clustering(tabDF = datFilt,
                                   method = "hclust",
                                   methodHC = "ward.D2")
data_Hc2 <- TCGAanalyze_Clustering(tabDF = datFilt,
                                   method = "consensus",
                                   methodHC = "ward.D2")

#------  Add cluster information
cluster <- data.frame("groupsHC" = data_Hc2[[4]]$consensusClass)
cluster$groupsHC <- paste0("EC",cluster$groupsHC)
cluster$patient <-  substr(colData(lgg.exp)$patient,1,12)

# Add information about gropus from consensus Cluster in clinical data
dataClin <- merge(dataClin,cluster, by.x="bcr_patient_barcode", by.y="patient")

# Merge subtype and clinical data
clin_subt <- merge(dataClin,dataSubt, by.x="bcr_patient_barcode", by.y="patient")
clin_subt_all <- merge(dataClin,dataSubt, 
                       by.x="bcr_patient_barcode", by.y="patient", all.x = TRUE)

LGGmut <- GDCquery_Maf(tumor = "LGG", pipelines = "muse")
# Selecting gene
mRNAsel <- "ATRX"
LGGselected <- LGGmut[LGGmut$Hugo_Symbol == mRNAsel,]

dataMut <- LGGselected[!duplicated(LGGselected$Tumor_Sample_Barcode),]
dataMut$Tumor_Sample_Barcode <- substr(dataMut$Tumor_Sample_Barcode,1,12)

# Adding the Expression Cluster classification found before
dataMut <- merge(dataMut, cluster, by.y="patient", by.x="Tumor_Sample_Barcode")
dataMut <- dataMut[dataMut$Variant_Classification!=0,]
save(clin_subt,clin_subt_all, lgg.exp, LGGmut, datFilt,file = "case2.rda", compress = "xz")

Case study n. 3: Integration of methylation and expression for ACC

In this case of study, we downloaded both adrenal cortical carcinoma (ACC) DNA methylation data for HumanMethylation450k platform and Gene Expression Quantification data aligned against GRCh38/hg38.

Also, by default TCGAbiolinks adds subtypes already published by researchers.

We selected two molecular subtypes CIMP-low and CIMP-high to perform an integrative analysis usng RNA expression and DNA methylation.

library(TCGAbiolinks)
library(SummarizedExperiment)
dir.create("case3")
setwd("case3")
#-----------------------------------
# STEP 1: Search, download, prepare |
#-----------------------------------
# 1.1 - DNA methylation
# ----------------------------------
query.met <- GDCquery(project = "TCGA-ACC", 
                      data.category = "DNA Methylation",
                      platform = "Illumina Human Methylation 450")
GDCdownload(query.met,chunks.per.download = 5)
acc.met <- GDCprepare(query = query.met)

#-----------------------------------
# 1.2 - RNA expression
# ----------------------------------
query.exp <- GDCquery(project = "TCGA-ACC", 
                      data.category = "Transcriptome Profiling",
                      data.type = "Gene Expression Quantification", 
                      workflow.type = "HTSeq - Counts")
GDCdownload(query.exp)
acc.exp <- GDCprepare(query = query.exp)

For DNA methylation, we perform a DMR (different methylated region) analysis, which will give the difference of DNA methylation for the probes of the groups and their significance value.

# na.omit: remove probes with NAs
acc.met <- subset(acc.met,subset = (rowSums(is.na(assay(acc.met))) == 0))

# Volcano plot
acc.met <- TCGAanalyze_DMR(acc.met, 
                           groupCol = "subtype_MethyLevel",
                           group1 = "CIMP-high",
                           group2 = "CIMP-low",
                           p.cut = 10^-5,
                           diffmean.cut = 0.25,
                           legend = "State",
                           plot.filename = "CIMP-highvsCIMP-low_metvolcano.png")

For the gene expression data, we perform a DEA (differential expression analysis) which will give the fold change of gene expression and their significance value.

#-------------------------------------------------
# 2.3 - DEA - Expression analysis - volcano plot
# ------------------------------------------------
acc.exp.aux <- subset(acc.exp, 
                      select = colData(acc.exp)$subtype_MethyLevel %in% c("CIMP-high","CIMP-low"))

idx <- colData(acc.exp.aux)$subtype_MethyLevel %in% c("CIMP-high")
idx2 <- colData(acc.exp.aux)$subtype_MethyLevel %in% c("CIMP-low")

dataPrep <- TCGAanalyze_Preprocessing(object = acc.exp.aux, cor.cut = 0.6)

dataNorm <- TCGAanalyze_Normalization(tabDF = dataPrep,
                                      geneInfo = geneInfoHT,
                                      method = "gcContent")

dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm,
                                  qnt.cut = 0.25,
                                  method='quantile')

acc.dataDEGs <- TCGAanalyze_DEA(mat1 = dataFilt[,idx],
                                mat2 = dataFilt[,idx2],
                                Cond1type = "CIMP-high",
                                Cond2type = "CIMP-low",
                                method = "glmLRT")

save(acc.met, acc.exp, acc.dataDEGs, file = "case3.rda", compress = "xz")

Session info

pander::pander(sessionInfo(), compact = FALSE)


BioinformaticsFMRP/TCGAbiolinksData documentation built on May 6, 2019, 7:54 a.m.