# LinkedOmics Firehose Check
# Gabriel Odom
# 2019-05-13
library(tidyverse)
library(pathwayPCA)
# Download the clinical annotation, PNNL proteome data for tumor samples
# (log-ratio normalized), and gene-level copy-number change (GISTIC2 log
# ratio) from:
# http://linkedomics.org/data_download/TCGA-OV/
###### Clinical Data ########################################################
# This file is named a much longer "firehose" name:
# Human__TCGA_OV__MS__Clinical__Clinical__01_28_2016__BI__Clinical__Firehose.tsi,
# but I saved it under Human_TCGA_OV_Clinical_Annotation.tsi instead.
linkedOmics_TCGAOVPheno_df <- read_delim(
"inst/extdata/Human_TCGA_OV_Clinical_Annotation.tsi",
"\t", escape_double = FALSE, trim_ws = TRUE
)
LOPhenoT_df <- TransposeAssay(linkedOmics_TCGAOVPheno_df) %>%
# Replace "." with "-"
mutate(Sample = str_replace_all(Sample, "\\.", "-")) %>%
# character to numeric and change case to lower
mutate(years_to_birth = as.integer(years_to_birth)) %>%
rename(tumor_purity = Tumor_purity) %>%
mutate(tumor_purity = as.numeric(tumor_purity)) %>%
rename(platinum_status = Platinum_status) %>%
# Recode radiation to logical
mutate(radiation_therapy = radiation_therapy == "yes") %>%
# survival time and status
rename(OS_time = overall_survival) %>%
mutate(OS_time = as.numeric(OS_time)) %>%
rename(OS_status = status) %>%
mutate(OS_status = as.integer(OS_status)) %>%
select(-overallsurvival)
# 591 obs x 10 features
LOsurv_df <- LOPhenoT_df %>%
select(Sample, OS_time, OS_status)
ovPheno_df <- LOsurv_df[complete.cases(LOsurv_df), ]
# 565 obs x 3 features
rm(linkedOmics_TCGAOVPheno_df, LOPhenoT_df, LOsurv_df)
# usethis::use_data(ovPheno_df)
###### Proteomics ###########################################################
# The "firehose" name for this file is:
# Human__TCGA_OV__PNNL__Proteome__Velos___QExact__01_28_2016__PNNL__Gene__CDAP_iTRAQ_UnsharedLogRatio_r2.cct
# I named it Human_TCGA_OV_PNNL_Proteome_logRatio.cct.
linkedOmics_TCGAOV_Proteomics_df <- read_delim(
"inst/extdata/Human_TCGA_OV_PNNL_Proteome_logRatio.cct",
"\t", escape_double = FALSE, trim_ws = TRUE
)
# For some bizarre reason, this object is also a "spec_tbl_df"?
LOProteomicsT_df <- TransposeAssay(linkedOmics_TCGAOV_Proteomics_df) %>%
# Replace "." with "-"
mutate(Sample = str_replace_all(Sample, "\\.", "-"))
rm(linkedOmics_TCGAOV_Proteomics_df)
### Check Missingness ###
LOProtMissing_num <- sapply(LOProteomicsT_df, function(x) mean(is.na(x)))
summary(LOProtMissing_num)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.0000 0.0000 0.0000 0.1225 0.2143 0.5952
LOProtSampMissing_num <- apply(
LOProteomicsT_df,
MARGIN = 1,
function(x) mean(is.na(x))
)
summary(LOProtSampMissing_num)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.04046 0.06841 0.11736 0.12251 0.15936 0.23873
# Since we have some features with a ton of missing data, we are first going to
# match the clinical data to the proteomics data to find if any subjects are
# also missing survival time / outcome.
### Subset by Recorded Clinical Outcome ###
keepProtSamples_char <- intersect(
ovPheno_df$Sample,
LOProteomicsT_df$Sample
)
LOProteomicsT_df <- LOProteomicsT_df %>%
filter(Sample %in% keepProtSamples_char)
# We lose 1
# Now, we can cut any proteins with greater than 20% missingness
LOProtMissing_num <- sapply(LOProteomicsT_df, function(x) mean(is.na(x)))
summary(LOProtMissing_num)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.0000 0.0000 0.0000 0.1227 0.2169 0.6024
LOProtTrimT_df <- LOProteomicsT_df[, which(LOProtMissing_num < 0.2)]
# Subsetting the LOProteomicsT_df object removes the "spec_tbl_df" class?
rm(
keepProtSamples_char,
LOProtMissing_num,
LOProtSampMissing_num,
LOProteomicsT_df
)
mean(is.na(LOProtTrimT_df[, -1]))
# 2.85% missingness
### Impute Missing Data ###
# Use kNN from Bioconductor's package impute:: with default settings
LOProtTrim_mat <-
LOProtTrimT_df %>%
dplyr::select(-Sample) %>%
t
colnames(LOProtTrim_mat) <- LOProtTrimT_df$Sample
LOProtTrim_ls <- impute::impute.knn(LOProtTrim_mat)
ovProteome_df <- TransposeAssay(
as_tibble(LOProtTrim_ls$data, rownames = "Proteins"),
omeNames = "firstCol"
)
rm(LOProtTrimT_df, LOProtTrim_mat, LOProtTrim_ls)
# usethis::use_data(ovProteome_df)
###### Copy Number Data #####################################################
# The "firehose" name for this file is:
# Human__TCGA_OV__BI__SCNA__SNP_6.0__01_28_2016__BI__Gene__Firehose_GISTIC2.cct
# I named it Human_TCGA_OV_SCNV_GISTIC2.cct.
linkedOmics_TCGAOV_CNV_df <- read_delim(
"inst/extdata/Human_TCGA_OV_SCNV_GISTIC2.cct",
"\t", escape_double = FALSE, trim_ws = TRUE
)
# This also has class "spec_tbl_df". I think it's from readr' import "specs"
ovCNV_df <- TransposeAssay(linkedOmics_TCGAOV_CNV_df) %>%
# Replace "." with "-"
mutate(Sample = str_replace_all(Sample, "\\.", "-"))
anyNA(ovCNV_df)
rm(linkedOmics_TCGAOV_CNV_df)
# When I subsetted the proteome data frame, I lose that "spec" class. From the
# readr 1.3.0 NEWS, subsetting removes this class and returns a regular
# tibble. See https://cran.r-project.org/web/packages/readr/news/news.html.
ovCNV_df <- ovCNV_df[]
# usethis::use_data(ovCNV_df)
###### gene expression Data #####################################################
require(TCGAbiolinks)
queryDown.OV <- GDCquery(project = "TCGA-OV",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
experimental.strategy = "RNA-Seq",
workflow.type = "HTSeq - FPKM-UQ")
GDCdownload(query = queryDown.OV)
dataPrep.OV <- GDCprepare(query = queryDown.OV)
save(dataPrep.OV, file = "TCGA_dataPrep.OV.hg38.Rdata")
dataPrep2 <- TCGAanalyze_Preprocessing(
object = dataPrep.OV,
cor.cut = 0.6)
dataNorm <- TCGAanalyze_Normalization(
tabDF = as.matrix(dataPrep2),
geneInfo = geneInfoHT,
method = "gcContent")
boxplot(dataNorm[,c(1:40)],outline = FALSE)
dataNorm <- as.data.frame(dataNorm)
dataNorm <- rownames_to_column(dataNorm, var = "ensembl_gene_id")
###change ensemble id into gene name
load("~/Bioc2019pathwayPCA/inst/extdata/Human_genes__GRCh38_p12_.rda")
gene.location <- gene.location[,c(5,7)]
mRNA_norm <- left_join(dataNorm, gene.location, by = "ensembl_gene_id")
####rm duplicate
bool <- duplicated(mRNA_norm$ensembl_gene_id)
mRNA_norm_sub <- mRNA_norm[!bool,]
mRNA_norm_sub <- mRNA_norm_sub[,-1]
mRNA_norm_sub <- mRNA_norm_sub[,c(ncol(mRNA_norm_sub),1:ncol(mRNA_norm_sub)-1)]
ovmRNA_df <- TransposeAssay(mRNA_norm_sub) %>%
# Replace "." with "-"
mutate(Sample = str_replace_all(Sample, "\\.", "-"))
ovmRNA_df$Sample <- str_sub(ovmRNA_df$Sample, start = 1, end = 12)
#usethis::usedata(ovmRNA_df)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.