options(width=120) knitr::opts_chunk$set(warning=FALSE, comment="# ", collapse=TRUE)
This guide follows the Bioconductor RNA-Seq workflow to find differentially
expressed genes in GSE132056 using DESeq2 version r gsub("‘’", "",
packageVersion("DESeq2"))
. For more details about the statistics, check the
original paper or online tutorials like the one from Harvard.
Load the sample table with treatment and diet in the extdata
directory.
library(tidyverse) extdata <- system.file("extdata", package="hciR") samples <- read_tsv(paste(extdata, "liver_samples.tsv", sep="/")) samples
Load the combined featureCounts matrix.
counts <- read_tsv(paste(extdata, "liver_counts.tsv", sep="/")) counts[, 1:8]
Remove 17597 features with zero counts and 16431 features with 5 or fewer reads in every sample to create a final count matrix with 19773 rows.
library(hciR) counts <- filter_counts(counts, n=5)
Load the mouse annotations from Ensembl 92 in hciRdata and check genes with the highest number of assigned reads.
options(width=110) library(hciRdata) n1 <- rowMeans(as_matrix(counts)) inner_join( dplyr::select(mouse92, 1:4,8), tibble(id= names(n1), mean_count = n1)) %>% mutate(description=trimws(substr(description, 1, 40))) %>% arrange(desc(mean_count))
Drop the two MT rRNAs.
counts <- semi_join(counts, filter(mouse92, biotype!="Mt_rRNA"), by=c(geneid="id"))
Following the DESeq2 vignette on interactions, there are a few ways to model the data.
Combine treatment and diet into a new column and order the factor levels. NOTE: Starting in hciR version 1.5, control groups should be listed first and pairwise combinations are selected in reverse order.
samples <- mutate(samples, trt_diet = gsub("Degs1_", "", paste(trt, diet, sep="_"))) samples$trt_diet <- factor(samples$trt_diet, levels = c("Control_NCD", "Control_HFD", "KO_NCD", "KO_HFD"))
Run DESeq using the new trt_diet column in the design formula and get the regularized log (rlog) counts for sample visualizations. These values are similar to the log2 normalized counts except the variance in low count genes is reduced.
dds1 <- deseq_from_tibble(counts, samples, design = ~ trt_diet ) rld1 <- r_log(dds1)
Plot the first two principal components using the rlog values from the top 500 variable genes.
# plot_pca(rld1, "trt_diet", tooltip=c("id", "name", "diet") , width=700) plot_pca(rld1, "trt_diet", ggplot=TRUE, label="id")
Cluster all the rlog values using the R function dist
to calculate the
Euclidean distance between samples.
plot_dist(rld1, c("trt", "diet"), na_col="white")
Run check_contrasts
to list the pairwise comparisons.
data.frame(vs=check_contrasts(dds1$trt_diet))
Use the subset
option to skip the 3rd and 4th contrasts and
compare the remaining rows using a 5% false discovery rate (FDR).
res <- results_all(dds1, mouse92, subset=c(2,5,6,1))
Plot fold changes and p-values from high fat KO vs. Control in the first contrast in a volcano plot.
plot_volcano(res[[1]], pvalue=3)
Plot the mean normalized count and fold change in an MA-plot.
plot_ma(res[[1]])
Cluster the rlog values from all 682 significant genes and scale by rows, so values represent the number of standard deviations from the mean rlog value.
x <- top_counts(res[[1]], rld1, top=1000) nrow(x) plot_genes(x, c("trt", "diet"), scale ="row", annotation_names_col=FALSE, show_rownames=FALSE)
Optionally, drop the normal chow samples.
x <- filter_top_counts(x, diet == "HFD") plot_genes(x, "trt", scale ="row", annotation_names_col=FALSE, show_rownames=FALSE)
Find genes in the PPAR Signaling Pathway using the MSigDB pathways in hciRdata. Note the mouse annotations include human homologs from MGI.
p1 <- filter(res[[1]], human_homolog %in% msig_pathways$KEGG[["PPAR Signaling Pathway"]]) dplyr::select(p1, 1:7,12)
Cluster the PPAR genes in a heatmap. There are 70 expressed genes but only 3 are significant - this will plot 37 genes with an FDR < 50%.
x <- top_counts( filter(p1, padj < 0.5), rld1, filter=FALSE) nrow(x) x <- filter_top_counts(x, diet == "HFD") plot_genes(x, "trt", fontsize_row=8, scale = "row")
Run DESeq
using ~ trt * diet in the design formula.
dds2 <- deseq_from_tibble(counts, samples, design = ~ trt * diet) rld2 <- r_log(dds2)
Check if the treatment effect differs across diets using a 5% false discovery rate (FDR). There are 105 signfiicant interactions.
DESeq2::resultsNames(dds2) int <- DESeq2::results(dds2, name = "trtDegs1_KO.dietNCD", alpha = 0.05) DESeq2::summary(int)
Add gene names and biotypes to the results.
int <- annotate_results(int, mouse92)
Create an interaction plot using the scaled rlog values from the top 25 genes sorted by adjusted p-value.
x <- top_counts( int, rld1, top=25) plot_interactions(x, c( "diet", "trt"), ylab="Z-score") + theme_bw()
Save the DESeq and interaction results, raw counts, normalized counts, regularized log counts
and mouse annotations to a single Excel file in DESeq.xlsx
and R objects to a
binary data file to load into a new session.
res_all <- c(res, list(Interactions=int)) write_deseq(res_all, dds1, rld1, mouse92) save(res, int, dds1, rld1, dds2, rld2, file="dds.rda")
There are a number of options for pathway analysis and most can be divided into one of two groups based on the input dataset. Gene set enrichment methods like Broad's GSEA require all expressed genes sorted by fold change and calculate a running sum statistic to find pathways that are enriched with either up- or down-regulated genes.
Over representation methods require a smaller subset of significant genes and use a Fisher's test to identify significant pathways. There are many online tools like Enrichr that accept a list of significant genes as input and return enriched sets. To get a list of genes, just sort the DESeq results in the Excel file by adjusted p-value and copy and paste the gene names into the search box.
The fgsea package (fast gene set enrichment analysis) is similar to Broad's
GSEA and finds pathways that are enriched with either up- or down-regulated
human genes. Load the KEGG pathways from MSigDB and run fgsea
using a 10% FDR.
set.seed(77) k1 <- fgsea_all(res, msig_pathways$KEGG)
Print the top pathways from KO_HFD vs. KO_NCD and check the GSEA user guide for details about the statistics.
options(width=110) group_by(k1[[1]][, -8], enriched) %>% top_n(4, abs(NES)) %>% ungroup()
Get the fold change vector and create an enrichment plot for Ribosome.
library(fgsea) fc <- write_gsea_rnk(res, write=FALSE) head(fc[[1]]) plotEnrichment(msig_pathways$KEGG[["Ribosome"]], fc[[1]]) + ggplot2::labs(title="Ribosome")
Compare to ECM Receptor Interaction with mostly up-regulated genes.
plotEnrichment(msig_pathways$KEGG[["ECM Receptor Interaction"]], fc[[1]]) + ggplot2::labs(title="ECM Receptor Interaction")
Plot NES scores from significant pathways in two or more contrasts.
plot_fgsea(k1, fontsize_row=7, sets =2)
Save the enriched pathways to an Excel file.
openxlsx::write.xlsx(k1, file = "KEGG_pathways.xlsx")
The genes from MSigDB are saved as a list of vectors and include hallmark, pathways, go, motifs, cancer, immunologic and oncogenic sets.
lapply(msig_hallmark[1:3], head, 7)
Four datasets are a list of lists and include two or more groups, so select a list
element like msig_pathways$REACTOME
to return the sets.
options(width=110) names(msig_pathways) names(msig_go) names(msig_motifs) names(msig_cancer)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.