## These are the options I tend to favor library(hpgltools) library(hpgldata) ## tt <- devtools::load_all("~/hpgltools") knitr::opts_knit$set(progress = TRUE, verbose = TRUE, width = 90, echo = TRUE) knitr::opts_chunk$set(error = TRUE, fig.width = 8, fig.height = 8, dpi = 96) old_options <- options(digits = 4, stringsAsFactors = FALSE, knitr.duplicate.label = "allow") ggplot2::theme_set(ggplot2::theme_bw(base_size = 10)) set.seed(1) rmd_file <- "c-03_fission_differential_expression.Rmd"
This document aims to provide further examples in how to use the hpgltools in order to perform a set of differential expression analyses.
Here are the commands I invoke to get ready to play with new data, including everything required to install hpgltools, the software it uses, and the fission data.
library(hpgltools) tt <- sm(library(fission)) tt <- data(fission)
Later on in this, I will do some ontology shenanigans. But I can grab some annotations from biomart now.
pombe_annotations <- load_biomart_annotations( host = "fungi.ensembl.org", trymart = "fungal_mart", trydataset = "spombe_eg_gene", gene_requests = c("pombase_transcript", "ensembl_gene_id", "ensembl_transcript_id", "hgnc_symbol", "description", "gene_biotype"), species = "spombe", overwrite = TRUE) pombe_mart <- pombe_annotations[["mart"]] annotations <- pombe_annotations[["annotation"]] rownames(annotations) <- make.names(gsub(pattern = "\\.\\d+$", replacement = "", x = rownames(annotations)), unique = TRUE)
All the work I do in Dr. El-Sayed's lab makes some pretty hard assumptions about how data is stored. As a result, to use the fission data set I will do a little bit of shenanigans to match it to the expected format. Now that I have played a little with fission, I think its format is quite nice and am likely to have my experiment class instead be a SummarizedExperiment.
## Extract the meta data from the fission dataset meta <- as.data.frame(fission@colData) ## Make conditions and batches meta[["condition"]] <- paste(meta$strain, meta$minute, sep = ".") meta[["batch"]] <- meta[["replicate"]] meta[["sample.id"]] <- rownames(meta) ## Grab the count data fission_data <- fission@assays[["data"]][["counts"]] ## This will make an experiment superclass called 'expt' and it contains ## an ExpressionSet along with any arbitrary additional information one might want to include. ## Along the way it writes a Rdata file which is by default called 'expt.Rdata' fission_expt <- create_expt(metadata = meta, count_dataframe = fission_data, gene_info = annotations)
My tools by default attempt all possible pairwise comparisons, this can take a long time.
fun_data <- subset_expt(fission_expt, subset = "condition=='wt.120'|condition=='wt.30'") fun_filt <- normalize_expt(fun_data, filter = "simple") fun_norm <- sm(normalize_expt(fun_filt, batch = "limma", norm = "quant", transform = "log2", convert = "cpm"))
The following block will perform all pairwise comparisons of the fission dataset using limma. If one were to add the model_batch argument, the statistical model could include a known batch effect or estimates from SVA. If one wishes to use combat modified data, then one must use a normalize function to modify the counts with combat.
limma_comparison <- limma_pairwise(fun_filt) names(limma_comparison[["all_tables"]]) summary(limma_comparison[["all_tables"]][["wt30_vs_wt120"]]) scatter_wt_mut <- extract_coefficient_scatter(limma_comparison, type = "limma", x = "wt30", y = "wt120") scatter_wt_mut[["scatter"]] scatter_wt_mut[["both_histogram"]][["plot"]] + ggplot2::scale_y_continuous(limits = c(0, 0.20)) ma_wt_mut <- extract_de_plots(limma_comparison, type = "limma") ma_wt_mut[["ma"]][["plot"]] ma_wt_mut[["volcano"]][["plot"]]
deseq2_pairwise() invokes DESeq2 in as an identical fashion as possible to how limma was run previously. Note, I commented out the extract_de_plots() call because I changed it so that, for the moment, it only works when using combine_de_tables(). This is something I intend to change soon.
deseq_comparison <- deseq2_pairwise(fun_filt) summary(deseq_comparison[["all_tables"]][["wt30_vs_wt120"]]) scatter_wt_mut <- extract_coefficient_scatter(deseq_comparison, type = "deseq", x = "wt30", y = "wt120") scatter_wt_mut[["scatter"]] #plots_wt_mut <- extract_de_plots(deseq_comparison, type = "deseq") #plots_wt_mut[["ma"]][["plot"]] #plots_wt_mut[["volcano"]][["plot"]]
edger_pairwise(), as I suspect you already guessed, does the same process using EdgeR. This time I made explicit the model_batch parameter. It is also worth noting that I am feeding these programs the filtered expression data. This is not required, but will generally help the fidelity of the result. One may also filter the data at runtime via the 'filter' argument to *_pairwise().
edger_comparison <- edger_pairwise(fun_filt, model_batch = TRUE) #plots_wt_mut <- extract_de_plots(edger_comparison, type = "edger") scatter_wt_mut <- extract_coefficient_scatter(edger_comparison, type = "edger", x = "wt30", y = "wt120") scatter_wt_mut[["scatter"]] #plots_wt_mut[["ma"]][["plot"]] #plots_wt_mut[["volcano"]][["plot"]]
EBSeq is a bit of an outlier method. It is a purely Bayesian method and makes some peculiar decisions with respect to how it handles the inputs. In addition, it is much slower for larger datasets and is therefore automatically disabled when there are more than a few contrasts in a dataset.
ebseq_comparison <- ebseq_pairwise(fun_filt) head(ebseq_comparison$all_tables[[1]])
basic_pairwise() is also an outlier method. It explicitly does not perform any statistical operations on the data beyond a log2(cpm()) subtraction of the data. Thus, if another method agrees with this, the various normalizations, modelling, etc of that method had no effect on the result.
basic_comparison <- basic_pairwise(fun_filt) summary(basic_comparison$all_tables$wt30_vs_wt120) scatter_wt_mut <- extract_coefficient_scatter(basic_comparison, type = "basic", x = "wt30", y = "wt120") scatter_wt_mut[["scatter"]] #plots_wt_mut <- extract_de_plots(basic_comparison, type = "basic") #plots_wt_mut[["ma"]][["plot"]] #plots_wt_mut[["volcano"]][["plot"]]
This is a new entrant to the hpgltools. It has its own way of handling batches/surrogates. As a result, it ignores the model_batch parameter. Noiseq also prints a lot of random stuff to the user that (to my eyes) has no utility. I am therefore wrapping it in the sm() function to silence it.
noiseq_comparison <- sm(noiseq_pairwise(fun_filt)) summary(noiseq_comparison$all_tables$wt30_vs_wt120)
This is another recent addition. The authors of variancePartition showed in their last paper how one might include the variance estimates produced in order to supplement a statistical model. In their paper, they used limma to implement these ideas; I assume that the way voom models variance is most appropriate for this method and so in my implementation did not change that aspect of it. dream_pairwise also ignores the model_batch parameter.
all_comparisons <- all_pairwise(fun_data, model_batch = TRUE) all_comparisons all_combined <- combine_de_tables(all_comparisons, excel = "excel/wt30_vs_wt120.xlsx") all_combined save(file = "de_table.rda", list = "all_combined")
I am not sure how this will render in a rmarkdown.
## This is a neat interactive shiny widget which I forgot I wrote slide_de_threshold(all_combined)
## Here we see that edger and deseq agree the least: all_comparisons[["comparison"]][["comp"]] ## And here we can look at the set of 'significant' genes according to various tools: yeast_sig <- sm(extract_significant_genes(all_combined, excel = FALSE)) yeast_barplots <- sm(significant_barplots(combined = all_combined)) yeast_barplots[["limma"]] yeast_barplots[["edger"]] yeast_barplots[["deseq"]]
Since I didn't acquire this data in a 'normal' way, I am going to post-generate a gff file which may be used by clusterprofiler, topgo, and gostats.
Therefore, I am going to make use of TxDb to make the requisite gff file.
limma_results <- limma_comparison[["all_tables"]] ## The set of comparisons performed names(limma_results) table <- limma_results[["wt30_vs_wt120"]] dim(table) gene_names <- rownames(table) updown_genes <- get_sig_genes(table, p = 0.05, lfc = 0.4, p_column = "P.Value") tt <- please_install("GenomicFeatures") tt <- please_install("biomaRt") available_marts <- biomaRt::listMarts(host = "fungi.ensembl.org") available_marts ensembl_mart <- biomaRt::useMart("fungi_mart", host = "fungi.ensembl.org") available_datasets <- biomaRt::listDatasets(ensembl_mart) pombe_hit <- grep(pattern = "pombe", x = available_datasets[["description"]]) pombe_name <- available_datasets[pombe_hit, "dataset"] pombe_mart <- biomaRt::useDataset(pombe_name, mart = ensembl_mart) pombe_goids <- biomaRt::getBM(attributes = c("pombase_transcript", "go_id"), values = gene_names, mart = pombe_mart) colnames(pombe_goids) <- c("ID", "GO")
The above worked, it provided a table of ID and ontology. It was however a bit fraught. Here is another way.
## In theory, the above should work with a single function call: pombe_goids_simple <- load_biomart_go(species = "spombe", overwrite = TRUE, dl_rows = c("pombase_transcript", "go_id"), host = "fungi.ensembl.org") head(pombe_goids_simple[["go"]]) head(pombe_goids) ## This used to work, but does so no longer and I do not know why. ## pombe <- sm(GenomicFeatures::makeTxDbFromBiomart(biomart = "fungal_mart", ## dataset = "spombe_eg_gene", ## host = "fungi.ensembl.org")) ## I bet I can get all this information from ensembl now. ## This was found at the bottom of: https://www.biostars.org/p/232005/ link <- "ftp://ftp.ensemblgenomes.org/pub/release-34/fungi/gff3/schizosaccharomyces_pombe/Schizosaccharomyces_pombe.ASM294v2.34.gff3.gz" pombe <- GenomicFeatures::makeTxDbFromGFF(link, format = "gff3", taxonomyId = 4896, organism = "Schizosaccharomyces pombe") pombe_transcripts <- as.data.frame(GenomicFeatures::transcriptsBy(pombe)) lengths <- pombe_transcripts[, c("group_name","width")] colnames(lengths) <- c("ID","width") lengths[["ID"]] <- gsub(x = lengths[["ID"]], pattern = "^gene:", replacement = "") ## Something useful I didn't notice before: ## makeTranscriptDbFromGFF() ## From GenomicFeatures, much like my own gff2df() gff_from_txdb <- GenomicFeatures::asGFF(pombe) ## why is GeneID: getting prefixed to the IDs!? gff_from_txdb$ID <- gsub(x = gff_from_txdb$ID, pattern = "GeneID:", replacement = "") written_gff <- rtracklayer::export.gff3(gff_from_txdb, con = "pombe.gff")
summary(updown_genes) test_genes <- updown_genes[["down_genes"]] ##rownames(test_genes) <- paste0(rownames(test_genes), ".1") ##lengths[["ID"]] <- paste0(lengths[["ID"]], ".1") goseq_result <- simple_goseq(sig_genes = test_genes, go_db = pombe_goids, length_db = lengths) head(goseq_result[["all_data"]]) goseq_result[["pvalue_plots"]][["mfp_plot_over"]] goseq_result[["pvalue_plots"]][["bpp_plot_over"]] test_genes <- updown_genes[["up_genes"]] ##rownames(test_genes) <- paste0(rownames(test_genes), ".1") goseq_result <- simple_goseq(sig_genes = test_genes, go_db = pombe_goids, length_db = lengths) head(goseq_result[["all_data"]]) goseq_result[["pvalue_plots"]][["mfp_plot_over"]] goseq_result[["pvalue_plots"]][["bpp_plot_over"]]
clusterProfiler really prefers an orgdb instance to use, which is probably smart, as they are pretty nice. Sadly, there is no pre-defined orgdb for pombe...
## holy crap makeOrgPackageFromNCBI is slow, no slower than some of mine, so who am I to complain. if (! "org.Spombe.eg.db" %in% installed.packages()) { orgdb <- AnnotationForge::makeOrgPackageFromNCBI( version = "0.1", author = "atb <abelew@gmail.com>", maintainer = "atb <abelew@gmail.com>", tax_id = "4896", genus = "Schizosaccharomyces", species = "pombe") ## This created the directory 'org.spombe.eg.db' devtools::install_local("org.Spombe.eg.db") } library(org.Spombe.eg.db) ## Don't forget to remove the terminal .1 from the gene names... ## If you do forget this, it will fail for no easily visible reason until you remember ## this and get really mad at yourself. rownames(test_genes) <- gsub(pattern = ".1$", replacement = "", x = rownames(test_genes)) pombe_goids[["ID"]] <- gsub(pattern = ".1$", replacement = "", x = pombe_goids[["ID"]]) cp_result <- simple_clusterprofiler(sig_genes = test_genes, do_david = FALSE, do_gsea = FALSE, de_table = all_combined$data[[1]], orgdb = org.Spombe.eg.db, orgdb_to = "ALIAS") cp_result[["pvalue_plots"]][["ego_all_mf"]] ## Yay bar plots!
## Get rid of those stupid terminal .1s. #rownames(test_genes) <- gsub(pattern = ".1$", replacement = "", x = rownames(test_genes)) #pombe_goids[["ID"]] <- gsub(pattern = ".1$", replacement = "", x = pombe_goids[["ID"]]) tp_result <- simple_topgo(sig_genes = test_genes, go_db = pombe_goids, pval_column = "limma_adjp") tp_result
## Get rid of those stupid terminal .1s. ##rownames(test_genes) <- gsub(pattern = ".1$", replacement = "", x = rownames(test_genes)) ##pombe_goids[["ID"]] <- gsub(pattern = ".1$", replacement = "", x = pombe_goids[["ID"]]) ## universe_merge is the column in the final data frame when. ## gff_type is the field in the gff file providing the id, this may be redundant with ## universe merge, that is something to check on... gst_result <- simple_gostats(sig_genes = test_genes, go_db = pombe_goids, universe_merge = "id", gff_type = "gene", gff = "pombe.gff", pval_column = "limma_adjp") gst_result
pander::pander(sessionInfo())
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.