inst/doc/mulea.R

## ----global_options, include=TRUE, echo=FALSE---------------------------------
knitr::opts_chunk$set(
    echo = TRUE,
    warning = TRUE,
    message = TRUE,
    error = FALSE)

## ----'install1', eval=FALSE, message=FALSE, warning=FALSE---------------------
#  # Installing the BiocManager package if needed
#  if (!require("BiocManager", quietly = TRUE))
#      install.packages("BiocManager")
#  # Installing the fgsea package with the BiocManager
#  BiocManager::install("fgsea")

## ----'install2', eval=FALSE, message=FALSE, warning=FALSE---------------------
#  install.packages("mulea")

## ----'install3', eval=FALSE, message=FALSE, warning=FALSE---------------------
#  # Installing the devtools package if needed
#  if (!require("devtools", quietly = TRUE))
#      install.packages("devtools")
#  
#  # Installing the mulea package from GitHub
#  devtools::install_github("https://github.com/ELTEbioinformatics/mulea")

## ----'calling1', eval=FALSE---------------------------------------------------
#  library(mulea)
#  library(tidyverse)

## ----'calling2', echo=FALSE---------------------------------------------------
suppressMessages(library(mulea))
suppressMessages(library(tidyverse))

## ----'read_GMT1', eval=FALSE--------------------------------------------------
#  # Reading the mulea GMT file locally
#  tf_ontology <- read_gmt("Transcription_factor_RegulonDB_Escherichia_coli_GeneSymbol.gmt")

## ----'read_GMT2'--------------------------------------------------------------
# Reading the GMT file from the GitHub repository
tf_ontology <- read_gmt("https://raw.githubusercontent.com/ELTEbioinformatics/GMT_files_for_mulea/main/GMT_files/Escherichia_coli_83333/Transcription_factor_RegulonDB_Escherichia_coli_GeneSymbol.gmt")

## ----'read_GMT3', eval=FALSE--------------------------------------------------
#  # Reading the Enrichr GMT file locally
#  tf_enrichr_ontology <- read_gmt("TRRUST_Transcription_Factors_2019.txt")
#  
#  # The ontology_name is empty, therefore we need to fill it with the ontology_id
#  tf_enrichr_ontology$ontology_name <- tf_enrichr_ontology$ontology_id
#  

## ----'read_GMT4', eval=FALSE--------------------------------------------------
#  # Reading the MsigDB GMT file locally
#  tf_msigdb_ontology <- read_gmt("c3.tft.v2023.2.Hs.symbols.gmt")

## ----'read_muleaData', eval=FALSE---------------------------------------------
#  # Installing the ExperimentHub package from Bioconductor
#  BiocManager::install("ExperimentHub")
#  
#  # Calling the ExperimentHub library.
#  library(ExperimentHub)
#  
#  # Downloading the metadata from ExperimentHub.
#  eh <- ExperimentHub()
#  
#  # Creating the muleaData variable.
#  muleaData <- query(eh, "muleaData")
#  
#  # Looking for the ExperimentalHub ID of the ontology.
#  EHID <- mcols(muleaData) %>%
#    as.data.frame() %>%
#    dplyr::filter(title == "Transcription_factor_RegulonDB_Escherichia_coli_GeneSymbol.rds") %>%
#    rownames()
#  
#  # Reading the ontology from the muleaData package.
#  tf_ontology <- muleaData[[EHID]]
#  
#  # Change the header
#  tf_ontology <- tf_ontology %>%
#    rename(ontology_id = "ontologyId",
#           ontology_name = "ontologyName",
#           list_of_values = "listOfValues")

## ----'exclude_ontology'-------------------------------------------------------
# Filtering the ontology
tf_ontology_filtered <- filter_ontology(gmt = tf_ontology,
                                        min_nr_of_elements = 3,
                                        max_nr_of_elements = 400)

## ----'save_gmt', eval=FALSE---------------------------------------------------
#  # Saving the ontology to GMT file
#  write_gmt(gmt = tf_ontology_filtered,
#            file = "Filtered.gmt")

## ----'list_to_gmt_example', eval=FALSE----------------------------------------
#  # Creating a list of gene sets
#  ontology_list <- list(gene_set1 = c("gene1", "gene2", "gene3"),
#                        gene_set2 = c("gene4", "gene5", "gene6"))
#  
#  # Converting the list to a ontology (GMT) object
#  new_ontology_df <- list_to_gmt(ontology_list)

## ----'reading_target_bg'------------------------------------------------------
# Taget set
target_set <- readLines("https://raw.githubusercontent.com/ELTEbioinformatics/mulea/master/inst/extdata/target_set.txt")

# Background set
background_set  <- readLines("https://raw.githubusercontent.com/ELTEbioinformatics/mulea/master/inst/extdata/background_set.txt")

## ----'ora'--------------------------------------------------------------------
# Creating the ORA model using the GMT variable
ora_model <- ora(gmt = tf_ontology_filtered, 
                 # Test set variable
                 element_names = target_set, 
                 # Background set variable
                 background_element_names = background_set, 
                 # p-value adjustment method
                 p_value_adjustment_method = "eFDR", 
                 # Number of permutations
                 number_of_permutations = 10000,
                 # Number of processor threads to use
                 nthreads = 2, 
                 # Setting a random seed for reproducibility
                 random_seed = 1) 

# Running the ORA
ora_results <- run_test(ora_model)

## ----'ora_size'---------------------------------------------------------------
ora_results %>%
  # Rows where the eFDR < 0.05
  filter(eFDR < 0.05) %>% 
  # Number of such rows
  nrow()

## ----'print_ora', eval=FALSE--------------------------------------------------
#  ora_results %>%
#    # Arrange the rows by the eFDR values
#    arrange(eFDR) %>%
#    # Rows where the eFDR < 0.05
#    filter(eFDR < 0.05)

## ----'print_ora2', echo=FALSE-------------------------------------------------
ora_results %>%
  # Arrange the rows by the eFDR values
  arrange(eFDR) %>% 
  # Rows where the eFDR < 0.05
  filter(eFDR < 0.05) %>% 
  knitr::kable()

## ----'init_plot_ora'----------------------------------------------------------
# Reshapeing the ORA results for visualisation
ora_reshaped_results <- reshape_results(model = ora_model, 
                                        model_results = ora_results, 
                                        # Choosing which column to use for the
                                        #     indication of significance
                                        p_value_type_colname = "eFDR")

## ----'lollipop_plot_ora'------------------------------------------------------
plot_lollipop(reshaped_results = ora_reshaped_results,
              # Column containing the names we wish to plot
              ontology_id_colname = "ontology_id",
              # Upper threshold for the value indicating the significance
              p_value_max_threshold = 0.05,
              # Column that indicates the significance values
              p_value_type_colname = "eFDR")

## ----'bar_plot_ora'-----------------------------------------------------------
plot_barplot(reshaped_results = ora_reshaped_results,
              # Column containing the names we wish to plot
              ontology_id_colname = "ontology_id",
              # Upper threshold for the value indicating the significance
              p_value_max_threshold = 0.05,
              # Column that indicates the significance values
              p_value_type_colname = "eFDR")

## ----'network_plot_ora'-------------------------------------------------------
plot_graph(reshaped_results = ora_reshaped_results,
           # Column containing the names we wish to plot
           ontology_id_colname = "ontology_id",
           # Upper threshold for the value indicating the significance
           p_value_max_threshold = 0.05,
           # Column that indicates the significance values
           p_value_type_colname = "eFDR")

## ----'heatmap_ora'------------------------------------------------------------
plot_heatmap(reshaped_results = ora_reshaped_results,
             # Column containing the names we wish to plot
             ontology_id_colname = "ontology_id",
             # Column that indicates the significance values
             p_value_type_colname = "eFDR")

## ----'ora_bh_bonferroni'------------------------------------------------------
# Creating the ORA model using the Benjamini-Hochberg p-value correction method
BH_ora_model <- ora(gmt = tf_ontology_filtered, 
                 # Test set variable
                 element_names = target_set, 
                 # Background set variable
                 background_element_names = background_set, 
                 # p-value adjustment method
                 p_value_adjustment_method = "BH",
                 # Number of processor threads to use
                 nthreads = 2) 

# Running the ORA
BH_results <- run_test(BH_ora_model)

# Creating the ORA model using the Bonferroni p-value correction method
Bonferroni_ora_model <- ora(gmt = tf_ontology_filtered, 
                            # Test set variable
                            element_names = target_set, 
                            # Background set variable
                            background_element_names = background_set, 
                            # p-value adjustment method
                            p_value_adjustment_method = "bonferroni",
                            # Number of processor threads to use
                            nthreads = 2) 

# Running the ORA
Bonferroni_results <- run_test(Bonferroni_ora_model)

## ----'compare_p.adj'----------------------------------------------------------
# Merging the Benjamini-Hochberg and eFDR results
merged_results <- BH_results %>% 
  # Renaming the column
  rename(BH_adjusted_p_value = adjusted_p_value) %>% 
  # Selecting the necessary columns
  select(ontology_id, BH_adjusted_p_value) %>%
  # Joining with the eFDR results
  left_join(ora_results, ., by = "ontology_id") %>% 
  # Converting the data.frame to a tibble
  tibble()

# Merging the Bonferroni results with the merged results
merged_results <- Bonferroni_results %>% 
  # Renaming the column
  rename(Bonferroni_adjusted_p_value = adjusted_p_value) %>% 
  # Selecting the necessary columns
  select(ontology_id, Bonferroni_adjusted_p_value) %>%
  # Joining with the eFDR results
  left_join(merged_results, ., by = "ontology_id") %>% 
  # Arranging by the p-value
  arrange(p_value)

# filter the p-value < 0.05 results
merged_results_filtered <- merged_results %>% 
  filter(p_value < 0.05) %>% 
  # remove the unnecessary columns
  select(-ontology_id, -nr_common_with_tested_elements, 
         -nr_common_with_background_elements)

## ----'print_compare_p.adj', echo=FALSE----------------------------------------
merged_results_filtered %>% 
  knitr::kable()

## ----'reading_ordered'--------------------------------------------------------
# Reading the tsv containing the ordered set
ordered_set <- read_tsv("https://raw.githubusercontent.com/ELTEbioinformatics/mulea/master/inst/extdata/ordered_set.tsv")

## ----'gsea', warning=FALSE, message=FALSE-------------------------------------
# Creating the GSEA model using the GMT variable
gsea_model <- gsea(gmt = tf_ontology_filtered,
                   # Names of elements to test
                   element_names = ordered_set$Gene.symbol,
                   # LogFC-s of elements to test
                   element_scores = ordered_set$logFC,
                   # Consider elements having positive logFC values only
                   element_score_type = "pos",
                   # Number of permutations
                   number_of_permutations = 10000)

# Running the GSEA
gsea_results <- run_test(gsea_model)

## ----'gsea_size'--------------------------------------------------------------
gsea_results %>%
  # rows where the adjusted_p_value < 0.05
  filter(adjusted_p_value < 0.05) %>% 
  # the number of such rows
  nrow()

## ----'print_gsea', eval=FALSE-------------------------------------------------
#  gsea_results %>%
#    # arrange the rows by the adjusted_p_value values
#    arrange(adjusted_p_value) %>%
#    # rows where the adjusted_p_value < 0.05
#    filter(adjusted_p_value < 0.05)

## ----'print_gsea2', echo=FALSE------------------------------------------------
gsea_results %>%
  # arrange the rows by the adjusted_p_value values
  arrange(adjusted_p_value) %>% 
  # rows where the adjusted_p_value < 0.05
  filter(adjusted_p_value < 0.05) %>% 
  knitr::kable()

## ----'init_plot_gsea'---------------------------------------------------------
# Reshaping the GSEA results for visualisation
gsea_reshaped_results <- reshape_results(model = gsea_model, 
                                         model_results = gsea_results, 
                                         # choosing which column to use for the
                                         # indication of significance
                                         p_value_type_colname = "adjusted_p_value")

## ----'network_plot_gsea'------------------------------------------------------
plot_graph(reshaped_results = gsea_reshaped_results,
           # the column containing the names we wish to plot
           ontology_id_colname = "ontology_id",
           # upper threshold for the value indicating the significance
           p_value_max_threshold = 0.05,
           # column that indicates the significance values
           p_value_type_colname = "adjusted_p_value")

## ----'DE1', eval=TRUE, message=FALSE, warning=FALSE---------------------------
# Importing necessary libraries and reading the DE results table
geo2r_result_tab <- read_tsv("https://raw.githubusercontent.com/ELTEbioinformatics/mulea/master/inst/extdata/GSE55662.table_wt_non_vs_cipro.tsv")

## ----'print_geo1', eval=FALSE-------------------------------------------------
#  # Printing the first few rows of the data frame
#  geo2r_result_tab %>%
#    head(3)

## ----'print_geo2', echo=FALSE-------------------------------------------------
# Printing the first few rows of the data frame
geo2r_result_tab %>%  
  head(3) %>% 
  knitr::kable()

## ----'format_geo'-------------------------------------------------------------
# Formatting the data frame
geo2r_result_tab <- geo2r_result_tab %>% 
  # Extracting the primary gene symbol and removing extraneous information
  mutate(Gene.symbol = str_remove(string = Gene.symbol,
                                  pattern = "\\/.*")) %>% 
  # Filtering out rows with NA gene symbols
  filter(!is.na(Gene.symbol)) %>% 
  # Sorting by logFC
  arrange(desc(logFC))

## ----'print_geo_formatted1', eval=FALSE---------------------------------------
#  # Printing the first few rows of the formatted data frame
#  geo2r_result_tab %>%
#    head(3)

## ----'print_geo_formatted2', echo=FALSE---------------------------------------
# Printing the first few rows of the formatted data frame
geo2r_result_tab %>%  
  head(3) %>% 
  knitr::kable()

## ----'target_set'-------------------------------------------------------------
target_set <- geo2r_result_tab %>% 
  # Filtering for adjusted p-value < 0.05 and logFC > 1
  filter(adj.P.Val < 0.05 & logFC > 1) %>% 
  # Selecting the Gene.symbol column
  select(Gene.symbol) %>% 
  # Converting the tibble to a vector
  pull() %>% 
  # Removing duplicates
  unique()

## ----'target_head'------------------------------------------------------------
target_set %>% 
  head(10)

## ----'target_gene_nr'---------------------------------------------------------
target_set %>% 
  length()

## ----'background_set'---------------------------------------------------------
background_set <- geo2r_result_tab %>% 
  # Selecting the Gene.symbol column
  select(Gene.symbol) %>% 
  # Converting the tibble to a vector
  pull() %>% 
  # Removing duplicates
  unique()

## ----'background_gene_nr'-----------------------------------------------------
background_set %>% 
  length()

## ----'save_target_bg', eval=FALSE---------------------------------------------
#  # Save taget set to text file
#  target_set %>%
#    writeLines("target_set.txt")
#  
#  # Save background set to text file
#  background_set %>%
#    writeLines("inst/extdata/background_set.txt")

## ----'gsea_input'-------------------------------------------------------------
# If there are duplicated Gene.symbols keep the first one only
ordered_set <- geo2r_result_tab %>% 
  # Grouping by Gene.symbol to be able to filter
  group_by(Gene.symbol) %>%
  # Keeping the first row for each Gene.symbol from rows with the same 
  #     Gene.symbol
  filter(row_number()==1) %>% 
  # Ungrouping
  ungroup() %>% 
  # Arranging by logFC in descending order
  arrange(desc(logFC)) %>%
  select(Gene.symbol, logFC)

## ----'ordered_genes_length'---------------------------------------------------
ordered_set %>% 
  nrow()

## ----'save_ordered', eval=FALSE-----------------------------------------------
#  # Save ordered set to text file
#  ordered_set %>%
#    write_tsv("ordered_set.tsv")

## ----'session_info'-----------------------------------------------------------
sessionInfo()

Try the mulea package in your browser

Any scripts or data that you put into this service are public.

mulea documentation built on Sept. 30, 2024, 9:44 a.m.