# Vignette setup ----
# Show the source of every code chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
# Load the package with devtools rather than from an installed library.
devtools::load_all()
This vignette extends the xcore user guide by showing how to perform gene level expression modeling using RNA-seq and microarray input data.
In xcore we model expression as a function of the promoter's sequence, described by ChIP-seq signatures. Similarly to differential expression analysis, one can consider expression at the level of genes or transcripts. While most of the analysis steps are identical for the two variants, the key difference lies in the choice of promoter annotation. In particular, different transcripts of the same gene can have distinct promoters. On the other hand, gene-level expression is an aggregation of the expression of the gene's transcripts, yet we would like to use only one promoter to describe a single gene.
xcoredata provides ChIP-seq based molecular signatures constructed against
FANTOM5 promoters. These promoters correspond to transcripts' promoters making
them very easy to use with CAGE data mapped to FANTOM5 reference (as exemplified
in
xcore user guide).
Additionally, xcoredata provides a set of core FANTOM5 promoters
(promoters_f5_core
) which defines only one promoter per gene. Using this core
set of promoters it is possible to construct a gene level analysis with
non-CAGE data, by matching input gene identifiers with core FANTOM5 promoters.
The remainder of this document shows how to construct a gene-level xcore analysis. For this purpose we use a publicly available RNA-seq time-series dataset obtained from a classic TGFβ-induced epithelial-to-mesenchymal transition experiment conducted in the A-549 cell line (Chang H, et al. NAR. 2016., GSE69667). The other example uses a microarray time-series dataset, also from a TGFβ-induced epithelial-to-mesenchymal transition experiment conducted in the A-549 cell line (Sartor MA, et al. Bioinformatics. 2010., GSE17708).
library("xcore")
library("xcoredata")
We have downloaded the raw counts and metadata for the GSE69667 study from the GREIN portal, which provides raw counts for many RNA-seq datasets deposited in the GEO repository. Before these can be used in the xcore framework, the downloaded metadata needs to be combined with the raw counts.
If you are interested in the processing details, look up the Rmd source file of this vignette.
# Build the GSE69667 counts matrix from the downloaded CSV files ----

# Raw gene-level counts; the "X" column provides the row names.
counts_rna_seq <- read.csv("GSE69667_GeneLevel_Raw_data.csv", row.names = "X")

# Flag every gene symbol that occurs more than once (all occurrences, not
# just the repeats) so that ambiguous symbols are dropped entirely.
gene_symbol <- counts_rna_seq$gene_symbol
i_dups <- gene_symbol %in% gene_symbol[duplicated(gene_symbol)]

# Keep unambiguous genes, drop the symbol column and round to whole counts.
counts_rna_seq <- round(as.matrix(
  counts_rna_seq[!i_dups, !(colnames(counts_rna_seq) %in% "gene_symbol")]
))
rownames(counts_rna_seq) <- gene_symbol[!i_dups]
mode(counts_rna_seq) <- "integer"

# Rename samples after the "characteristics" metadata field, replacing the
# first space with an underscore.
metadata <- read.csv("GSE69667_metadata.csv", row.names = "X")
colnames(counts_rna_seq) <- sub(
  " ",
  "_",
  metadata[colnames(counts_rna_seq), "characteristics"]
)

# NOTE(review): system.file("inst", "extdata", ...) only resolves inside a
# source tree that still has an inst/ directory -- confirm this chunk is
# meant to be run by package developers only.
out_prefix <- system.file("inst", "extdata", package = "xcore")
save(counts_rna_seq, file = file.path(out_prefix, "GSE69667.rda"))
We recommend downloading the ready-to-use counts matrix from our server instead.
# Fetch the pre-built counts matrix into the working directory ...
download.file(
  "https://zdglab.iimcb.gov.pl/mmigdal/GSE69667.rda",
  "GSE69667.rda"
)
# ... and restore the objects stored in it (presumably `counts_rna_seq`, as
# used by the following chunks -- confirm against the file's contents).
load("GSE69667.rda")
The code below shows how to construct design matrix, by taking advantage of the patterns present in the sample names.
# Render the first six rows of the counts matrix as a table.
knitr::kable(head(counts_rna_seq, n = 6L))
# Build the design matrix from the sample names ----

# The condition of a sample is its column name with everything from the
# first underscore onwards removed.
cond <- sub(pattern = "_.*", replacement = "", x = colnames(counts_rna_seq))
ncond <- length(unique(cond))

# One row per sample, one column per condition (in order of first
# appearance); a cell is 1 when the sample belongs to the condition.
# Membership is tested per sample, so the result is correct even when the
# samples of one condition are not stored in adjacent columns, and it stays
# a matrix when there is only a single condition.
design <- 1 * outer(cond, unique(cond), FUN = "==")
dimnames(design) <- list(colnames(counts_rna_seq), unique(cond))

knitr::kable(design)
# Retrieve the annotation and signatures from xcoredata ----

# Core FANTOM5 promoters.
promoters_f5_core <- xcoredata::promoters_f5_core()

# Molecular signatures passed to addSignatures() as `remap` further below.
remap_promoters_f5 <- xcoredata::remap_promoters_f5()
This is the key step in gene level analysis. First we obtain a character vector
mapping gene symbols to FANTOM5 core promoters. Then we can use it to translate
our counts matrix using translateCounts
function.
# Translate gene symbols to FANTOM5 core promoters ----

# ExperimentHub record "EH7700" is used as the symbol -> promoter dictionary.
eh <- ExperimentHub::ExperimentHub()
symbol2fantom <- eh[["EH7700"]]

# Re-key the counts matrix using that dictionary.
counts_rna_seq_fantom <- translateCounts(
  counts_rna_seq,
  dict = symbol2fantom
)
For the purpose of making this vignette compile in a reasonable time we are
using only a small selection of signatures (selected_signatures
). Outside the
scope of this vignette one would rather use the whole set of remap_promoters_f5
signatures.
# Prepare the regression inputs ----

# Default preparation, with "0h" as the base level.
# NOTE(review): `drop_base_lvl = NULL` is kept as in the original -- confirm
# that NULL is an accepted value for this argument.
mae_rna_seq <- prepareCountsForRegression(
  counts = counts_rna_seq_fantom,
  design = design,
  base_lvl = "0h",
  drop_base_lvl = NULL
)
mae_rna_seq <- addSignatures(mae_rna_seq, remap = remap_promoters_f5)
mae_rna_seq <- filterSignatures(mae_rna_seq, min = 0.05, max = 0.95)

# Same preparation with `log2 = FALSE`.
# NOTE(review): the `_nb` suffix suggests this object is intended for the
# negative binomial model, but no later chunk in this file references it --
# confirm.
mae_rna_seq_nb <- prepareCountsForRegression(
  counts = counts_rna_seq_fantom,
  design = design,
  base_lvl = "0h",
  drop_base_lvl = NULL,
  log2 = FALSE
)
mae_rna_seq_nb <- addSignatures(mae_rna_seq_nb, remap = remap_promoters_f5)
mae_rna_seq_nb <- filterSignatures(mae_rna_seq_nb, min = 0.05, max = 0.95)
As always, depending on the infrastructure you are using, this step can be time consuming.
# Fit the expression models ----

# MASS provides the negative.binomial() family used below.
library(MASS)

# Register a doMC parallel backend with 15 workers.
doMC::registerDoMC(cores = 15L)

# Fix the RNG state before the cross-validated fits.
set.seed(314159265)

# Gaussian model.
res_rna_seq_normal <- modelGeneExpression(
  mae = mae_rna_seq,
  xnames = "remap",
  family = "gaussian",
  nfolds = 5,
  pvalues = FALSE
)

# Negative binomial model (theta = 5).
# NOTE(review): this fit uses `mae_rna_seq`, not `mae_rna_seq_nb`, and is
# stored as `res_rna_seq_nb_pval` although `pvalues = FALSE` -- confirm both
# are intended.
res_rna_seq_nb_pval <- modelGeneExpression(
  mae = mae_rna_seq,
  xnames = "remap",
  family = negative.binomial(theta = 5),
  nfolds = 5,
  pvalues = FALSE
)
# Heatmap of the top molecular signatures ----

# First 15 rows of the ReMap results table.
# NOTE(review): `res_rna_seq` is not created by any chunk visible in this
# file -- confirm which model fit it should refer to.
top_signatures <- head(res_rna_seq$results$remap, 15)

# Columns 2 to 8 are plotted and the `name` column labels the rows; columns
# are kept in their original order. The colour scale runs from -0.1 to 0.1
# in 35 bins.
pheatmap::pheatmap(
  mat = top_signatures[, 2:8],
  labels_row = top_signatures$name,
  cluster_cols = FALSE,
  color = colorRampPalette(c("blue", "white", "red"))(35),
  breaks = seq(from = -0.1, to = 0.1, length.out = 36),
  main = "TGFꞵ EMT in A-549 cell line (GSE69667)\nReMap2020 molecular signatures activity"
)
# Estimate R^2 per replicate for the Gaussian and negative binomial fits ----
#
# For each time point, the columns of Y whose name matches the pattern are
# selected and estimateStat() is run once per selected column, using the
# lambda.min of the model fitted for that column. Columns and models are
# addressed by their position in Y (via which(j)), so the code also works
# when only a single column matches; this assumes the model list holds one
# model per column of Y, in the same order.
#
# NOTE(review): `res_rna_seq_nb` is not created by any chunk visible in this
# file (the negative binomial fit above is stored as `res_rna_seq_nb_pval`)
# -- confirm which object is meant.

## 0h
rsq0h <- list()
j <- grepl("0h", colnames(mae_rna_seq[["Y"]]))
nm <- "remap"
rsq0h$normal <- foreach(i = which(j), .combine = c) %dopar%
  estimateStat(
    x = mae_rna_seq[[nm]],
    y = mae_rna_seq[["Y"]][, i],
    u = mae_rna_seq[["U"]],
    s = res_rna_seq_normal[["regression_models"]][[nm]][[i]]$lambda.min,
    statistic = rsq
  )
rsq0h$nb <- foreach(i = which(j), .combine = c) %dopar%
  estimateStat(
    x = mae_rna_seq[[nm]],
    y = mae_rna_seq[["Y"]][, i],
    u = mae_rna_seq[["U"]],
    s = res_rna_seq_nb[["regression_models"]][[nm]][[i]]$lambda.min,
    statistic = rsq,
    family = negative.binomial(theta = 5),
    type = "response"
  )

## 72h
rsq72h <- list()
j <- grepl("72h", colnames(mae_rna_seq[["Y"]]))
nm <- "remap"
rsq72h$normal <- foreach(i = which(j), .combine = c) %dopar%
  estimateStat(
    x = mae_rna_seq[[nm]],
    y = mae_rna_seq[["Y"]][, i],
    u = mae_rna_seq[["U"]],
    s = res_rna_seq_normal[["regression_models"]][[nm]][[i]]$lambda.min,
    statistic = rsq
  )
rsq72h$nb <- foreach(i = which(j), .combine = c) %dopar%
  estimateStat(
    x = mae_rna_seq[[nm]],
    y = mae_rna_seq[["Y"]][, i],
    u = mae_rna_seq[["U"]],
    s = res_rna_seq_nb[["regression_models"]][[nm]][[i]]$lambda.min,
    statistic = rsq,
    family = negative.binomial(theta = 5),
    type = "response"
  )
#' Plot the molecular signatures sets comparison figure
#'
#' Draws a title panel spanning the top of the device and, below it, two
#' side-by-side boxplot panels of R^2 values, one box per molecular
#' signatures set. The data are read from the objects `rsqA` and `rsqM`,
#' which are not arguments: they must be visible from the environment in
#' which this function is defined, and each must have a `dpi` element that
#' can be indexed by the set names "remap", "chip_atlas", "jaspar",
#' "swissregulon" and "remap_randomized".
#'
#' Graphical parameters are restored when the function exits.
#'
#' @return `NULL`, invisibly; called for its side effect of drawing.
plotMolecularSignaturesSetComparison <- function() {
  cex.axis <- 0.8

  # Hold device output while the figure is composed; flush it and restore
  # the graphical parameters on exit.
  dev.hold()
  on.exit(dev.flush())
  op <- par(no.readonly = TRUE)
  on.exit(par(op), add = TRUE)

  # Signature sets to plot and their axis labels (same order).
  xsel <- c("remap", "chip_atlas", "jaspar", "swissregulon", "remap_randomized")
  xlabs <- c(
    "ReMap2020",
    "ChIP-Atlas",
    "Jaspar",
    "SwissRegulon",
    "randomized\nReMap2020"
  )

  # Layout: panel 1 (title) spans the top row, panels 2 and 3 share the
  # bottom row.
  graphics::layout(
    mat = matrix(data = c(1, 2, 1, 3), nrow = 2, ncol = 2),
    heights = c(2, 7)
  )

  # Title panel.
  par(mar = c(0.5, 1.1, 1.1, 1.1))
  plot.new()
  graphics::text(
    x = 0.5,
    y = 0.5,
    adj = c(0.5, 0.5),
    labels = expression(atop(
      "Molecular signatures sets comparison",
      paste("CV ", R^2, " across replicates, 24h DPI")
    )),
    cex = 1.8,
    font = 2
  )

  # Boxplot panels; both share every setting except the data and the title.
  par(mar = c(9.1, 5.1, 2.1, 1.1), font.main = 1, cex.main = 1.1, cex = 1.0)
  draw_panel <- function(rsq_by_set, panel_title) {
    graphics::boxplot(
      x = rsq_by_set,
      names = xlabs,
      main = panel_title,
      ylab = expression(R ^ 2),
      xlab = "",
      ylim = c(0, 0.4),
      las = 2,
      cex.axis = cex.axis
    )
  }
  draw_panel(rsqA$dpi[xsel], "A-549")
  draw_panel(rsqM$dpi[xsel], "MDA-321-D")

  invisible()
}
# Draw the figure on the current graphics device.
plotMolecularSignaturesSetComparison()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.