knitr::opts_chunk$set(echo = TRUE, 
                      message = params$debug,
                      warning = params$debug,
                      cache = FALSE, 
                      error = params$error,
                      results = if(isTRUE(params$interact)) 'asis' else 'markup')
### Create Output Directory ###
# if save_output = TRUE
if(params$save_output){
  outfile_dir <- file.path(params$output_dir,"EpiCompare_file")
  if(!dir.exists(outfile_dir)){
    dir.create(outfile_dir, showWarnings = FALSE, recursive = TRUE)
  }
}

#### ------ Prepare genome builds ------ ####
# e.g. genome_build <- list(reference="hg19",peakfiles="hg38",blacklist="hg19")
# or... genome_build <- "hg19"
builds <- prepare_genome_builds(genome_build = params$genome_build,
                                blacklist = params$blacklist)
## Standardise all data to hg19 build 
output_build <- prepare_output_build(params$genome_build_output)

#### ------ Prepare peaklist(s) ------ ####
# and check that the list is named, if not default filenames are used 
peaklist <- prepare_peaklist(peaklist = params$peakfiles) 
peaklist <- liftover_grlist(grlist = peaklist, 
                            input_build = builds$peaklist,
                            output_build = output_build)

#### ------ Prepare reference(s) ------ ####
reference <- prepare_reference(reference = params$reference)
reference <- liftover_grlist(grlist = reference, 
                            input_build = builds$reference,
                            output_build = output_build)

#### ------ Prepare blacklist ------ ####
blacklist <- prepare_blacklist(blacklist = params$blacklist,
                               output_build = output_build,
                               blacklist_build = builds$blacklist) 

### Standardise peaklist(s) ###
peaklist <- tidy_peakfile(peaklist = peaklist,
                          blacklist = blacklist)
peaklist_tidy <- peaklist

### Standardise reference(s) ###
# and include in peaklist
reference_tidy <- reference
if (!is.null(reference)){
  reference_tidy <- tidy_peakfile(peaklist = reference,
                                  blacklist = blacklist)
  peaklist_tidy <- c(peaklist_tidy, reference_tidy)
}

### Obtain Genome Annotation ### 
txdb <- check_genome_build(genome_build = output_build)

### Dynamic Figure Height ###  
fig_height <- fig_length(default_size = 7,
                         number_of_items = length(peaklist_tidy),
                         max_items = 10)

needs_ref <- function(arg,
                      reference=NULL){ 
  if(isTRUE(arg)){
    if(is.null(reference)){
      cat("NOTE: This plot is not generated when reference=NULL.")
      return(FALSE)
    } else{
      return(TRUE)
    }
  } else {
    return(FALSE)
  }
}

EpiCompare compares epigenomic datasets for quality control and benchmarking purposes.

Summary {-}

Table of Contents {-}

The report consists of three sections:

  1. General Metrics: Metrics on peaks for each sample: % blacklisted and non-standard peaks, peak widths, fragments, duplication rates.

  2. Peak Overlap: Frequency, percentage, statistical significance of overlapping and non-overlapping peaks. This also includes upset, precision-recall, and correlation plots.

  3. Functional Annotation: Functional annotation (ChromHMM analysis, peak annotation, and enrichment analysis) of peaks. This also includes peak distributions around transcription start sites (TSS).

Input Datasets {-}

# print peak file names and numerate 
cat(paste0(" - File ",seq_len(length(names(peaklist_tidy))),": ",
        names(peaklist_tidy),
        collapse = "\n\n"))

Processed, filtered and lifted files for: peaklist, reference, blacklist

download_button(object = list(peaklist = peaklist,
                              reference = reference_tidy,
                              blacklist = blacklist),
                save_output = params$save_output, 
                filename = paste0("processed_peakfiles_",output_build), 
                outfile_dir = outfile_dir,
                add_download_button = TRUE) # Always include button

Command {-}

The EpiCompare function call used to generate the report:

cmd <- report_command(params = params, 
                      peaklist_tidy = peaklist_tidy,
                      reference_tidy = reference_tidy)
cat(cmd)

General Metrics {#general_metrics .tabset}

Peak Information

Column descriptions

:information: Note: All EpiCompare analyses conducted on the peak files after they have been filtered (i.e. blacklisted regions and non-standard chromosomes removed) and lifted (as needed).


peak_info_df <- peak_info(peaklist = peaklist,
                          blacklist = blacklist)
download_button(object = peak_info_df,
                filename = "peak_info_df", 
                output_extension = ".csv",
                add_download_button = params$add_download_button)
# Print table 
knitr::kable(peak_info_df, format = "markdown") 
save_output(save_output = params$save_output,
            file = peak_info_df,
            file_type = "data.frame",
            filename = "peak_info",
            outpath = outfile_dir) 
remove(peak_info_df)

Fragment Information

Metrics on fragments is shown only if Picard summary is provided. See manual for help.

Column descriptions


if (!is.null(params$picard_files)){
  fragment_info_df <- fragment_info(picard_list = params$picard_files)
  download_button(object = fragment_info_df,
                  filename = "fragment_info",
                  output_extension = ".csv",
                  add_download_button = params$add_download_button)
  # Print data frame
  knitr::kable(fragment_info_df, format = "markdown") 
  save_output(save_output = params$save_output,
              file = fragment_info_df,
              file_type = "data.frame",
              filename = "fragment_info",
              outpath = outfile_dir) 
  remove(fragment_info_df)
}   

Peak widths

Distribution of peak widths in samples


width_plot <- width_boxplot(peaklist = peaklist_tidy, 
                            interact = params$interact)
download_button(object = width_plot,
                filename = "width_boxplot",
                self_contained = params$interact,
                add_download_button = params$add_download_button)
width_plot$plot
# Save boxplot
save_output(save_output = params$save_output,
            file = width_plot$plot,
            file_type = "ggplot",
            filename = "width_plot",
            outpath = outfile_dir, 
            interactive = params$interact)
# Remove variable
remove(width_plot)

Peak Overlap {#peak_overlap .tabset}

Percentage Overlap

Percentage of overlapping peaks between samples. Hover over heatmap for percentage values.

The heatmap can be interpreted as follows:


overlap_heatmap <- overlap_heatmap(peaklist = peaklist_tidy,
                                   interact = params$interact)
download_button(object = overlap_heatmap,
                filename = "overlap_heatmap", 
                self_contained = params$interact,
                add_download_button = params$add_download_button)
overlap_heatmap$plot

# Save output 
save_output(save_output = params$save_output,
            file = overlap_heatmap$plot,
            file_type = "ggplot",
            filename = "samples_percent_overlap",
            outpath = outfile_dir,
            interactive = params$interact)
# Delete variable
remove(overlap_heatmap)

Upset Plot

Upset plot of overlapping peaks between samples. See here on how to interpret the plot.

upset_plot <- NULL
if(isTRUE(params$upset_plot)){
  upset_plot <- overlap_upset_plot(peaklist = peaklist_tidy)
  download_button(object = upset_plot,
                  filename = "upset_plot",
                  add_download_button = params$add_download_button)
  save_output(save_output = params$save_output,
              file = upset_plot,
              file_type = "image",
              filename = "upset_plot",
              outpath = outfile_dir)
}
upset_plot
remove(upset_plot)

Statistical Significance

Depending on the format of the reference file, EpiCompare produces different plots:

Reference peakfile: r names(reference_tidy)

Keys


if (needs_ref(params$stat_plot, reference)){
  stat_plot <- overlap_stat_plot(reference = reference_tidy,
                                 peaklist = peaklist,
                                 txdb = txdb, 
                                 interact = params$interact)
  download_button(object = stat_plot,
                  filename = "overlap_stat_plot", 
                  button_label = "Download overlap stat plot",
                  self_contained = params$interact,
                  add_download_button = params$add_download_button)
  stat_plot$plot
  # Save output 
  save_output(save_output = params$save_output,
              file = stat_plot$plot,
              file_type = "ggplot",
              filename = "stat_plot",
              outpath = outfile_dir, 
              interactive = params$interact)
  # Remove variables
  remove(stat_plot)
}

Precision-Recall Curves

The first plot shows the balance between precision and recall across multiple peak calling stringency thresholds.

The second plot shows F1 score (a score that combines precision and recall) across the different peak calling stringency thresholds.

pr_out <- NULL
if(needs_ref(params$precision_recall_plot, reference)){
    #### Create save path ####
    save_path <- if(isFALSE(params$save_output)){NULL}else{
        file.path(outfile_dir,"precision_recall.csv")
    }
    pr_out <- plot_precision_recall(peakfiles = peaklist, 
                                    reference = reference_tidy,
                                    n_threshold = params$n_threshold,
                                    workers = params$workers,
                                    show_plot = FALSE,
                                    verbose = FALSE,
                                    save_path = save_path, 
                                    interact = params$interact)
    download_button(object = pr_out,
                    filename = "precision_recall", 
                    self_contained = params$interact,
                    add_download_button = params$add_download_button)  
}
pr_out$precision_recall_plot
cat("\n\n")
pr_out$f1_plot
remove(pr_out)

Correlation Plot

The correlation plot shows the correlation between the quantiles when the genome is binned at a set size. These quantiles are based on the intensity of the peak, dependent on the peak caller used (q-value for MACS2):

cp_out <- NULL
if(isTRUE(params$corr_plot)){
    #### Create save path ####
    save_path <- if(isFALSE(params$save_output)){NULL}else{
        file.path(outfile_dir,"corr.csv.gz")
    }
    cp_out <- plot_corr(peakfiles = peaklist_tidy, 
                        # reference can be NULL
                        reference = reference_tidy,
                        genome_build = output_build,
                        bin_size = params$bin_size,
                        workers = params$workers,
                        show_plot = FALSE,
                        save_path = save_path,
                        interact = params$interact)  
    download_button(object = cp_out,
                    filename = "correlation_plot", 
                    self_contained = params$interact,
                    add_download_button = params$add_download_button)
}
cp_out$corr_plot
remove(cp_out)

Functional Annotation {#functional_annotation}

ChromHMM {.tabset}

ChromHMM annotates and characterises peaks into different chromatin states.

ChromHMM annotations used in EpiCompare were obtained from here.

ChromHMM annotation definitions:

For more information on ChromHMM states, please see here

  1. Active promoter: being transcribed
  2. Weak promoter: less transcriptional activity
  3. Poised promoter: ready for transcriptional activity
  4. Strong enhancer: activate transcription at high level
  5. Strong enhancer: activate transcription at high level
  6. Weak enhancer: activate transcription at low level
  7. Weak enhancer: activate transcription at low level
  8. Insulator: block transcription
  9. Txn elongation: transcription elongation
  10. Txn elongation: transcription elongation
  11. Weak Txn: weak transcription
  12. Repressed: decreased transcription
  13. Heterochrom.Io: heterochromatin and/or low signal
  14. Repetitive CNV: repetitive or copy number variation
  15. Repetitive CNV: repetitive or copy number variation


All samples

ChromHMM annotation of individual samples.

samples_chromHMM <- NULL
if(isTRUE(params$chromHMM_plot)){
  # Get ChromHMM annotation file
  chromHMM_list <- get_chromHMM_annotation(cell_line = params$chromHMM_annotation)
  # Plot chromHMM
  samples_chromHMM <- plot_chromHMM(peaklist = peaklist_tidy,
                                    chromHMM_annotation = chromHMM_list,
                                    genome_build = output_build,
                                    interact = params$interact)
  download_button(object = samples_chromHMM,
                  filename = "samples_ChromHMM",
                  self_contained = params$interact,
                  add_download_button = params$add_download_button)
  save_output(save_output = params$save_output,
              file = samples_chromHMM,
              file_type = "ggplot",
              filename = "samples_ChromHMM",
              outpath = outfile_dir,
              interactive = params$interact)
}
samples_chromHMM
remove(samples_chromHMM)

Overlap: Sample peaks in Reference peaks

Percentage of Sample peaks found in Reference peaks (Reference peakfile: r names(reference_tidy))

if(needs_ref(params$chromHMM_plot, reference)){
  # generate data frame of percentage overlap 
  sample_in_ref_df <- overlap_percent(peaklist1 = peaklist_tidy,
                                      peaklist2 = reference_tidy,
                                      invert = FALSE)
  download_button(object = sample_in_ref_df,
                  filename = "sample_in_ref_df",
                  output_extension = ".csv",
                  add_download_button = params$add_download_button)
  knitr::kable(sample_in_ref_df, format = "markdown") 
}

ChromHMM annotation of sample peaks found in reference peaks.

sample_in_ref_chromHMM <- NULL
if(needs_ref(params$chromHMM_plot, reference)){ 
  # Obtain overlapping peaks 
  sample_in_ref_list <- mapply(peaklist_tidy, FUN=function(file){
      IRanges::subsetByOverlaps(x = file, 
                                ranges = reference_tidy[[1]])
  }) 
  # Run ChromHMM
  sample_in_ref_chromHMM <- plot_chromHMM(peaklist = sample_in_ref_list,
                                          chromHMM_annotation = chromHMM_list,
                                          genome_build = output_build,
                                          interact = params$interact)
  download_button(object = sample_in_ref_chromHMM,
                filename = "sample_in_ref_chromHMM", 
                self_contained = params$interact,
                add_download_button = params$add_download_button) 
  save_output(save_output = params$save_output,
              file = sample_in_ref_chromHMM,
              file_type = "ggplot",
              filename = "sample_in_ref_ChromHMM",
              outpath = outfile_dir,
              interactive = params$interact)  
}
sample_in_ref_chromHMM
remove(sample_in_ref_chromHMM)

Overlap: Reference peaks in Sample peaks

Percentage of Reference peaks found in Sample peaks (Reference peakfile: r names(reference_tidy))

if (needs_ref(params$chromHMM_plot, reference)){
    # Data frame of overlapping peaks 
    ref_in_sample_df <- overlap_percent(peaklist1 = reference_tidy,
                                        peaklist2 = peaklist_tidy,
                                        invert = FALSE)
   download_button(object = ref_in_sample_df,
                  filename = "ref_in_sample_df",
                  output_extension = ".csv",
                  add_download_button = params$add_download_button)
   knitr::kable(ref_in_sample_df, format = "markdown")
}

ChromHMM annotation of reference peaks found in sample peaks.

ref_in_sample_chromHMM <- NULL
if (needs_ref(params$chromHMM_plot, reference)){
  # Subset overlapping peaks
  ref_in_sample_list <- mapply(peaklist_tidy, FUN = function(file){
    IRanges::subsetByOverlaps(x = reference_tidy[[1]], 
                              ranges = file)
    }) 
  # Plot ChromHMM
  ref_in_sample_chromHMM <- plot_chromHMM(peaklist = ref_in_sample_list,
                                          chromHMM_annotation = chromHMM_list,
                                          genome_build = output_build,
                                          interact = params$interact)
  download_button(object = ref_in_sample_chromHMM,
                  filename = "ref_in_sample_chromHMM", 
                  self_contained = params$interact,
                  add_download_button = params$add_download_button)  
  save_output(save_output = params$save_output,
              file = ref_in_sample_chromHMM,
              file_type = "ggplot",
              filename = "ref_in_sample_ChromHMM",
              outpath = outfile_dir,
              interactive = params$interact) 
}
ref_in_sample_chromHMM
remove(ref_in_sample_chromHMM)

Unique: Sample peaks not in Reference peaks

Percentage of sample peaks not found in reference peaks (Reference peakfile: r names(reference_tidy))

if (needs_ref(params$chromHMM_plot, reference)){
  # Data frame of non-overlapping peaks 
  sample_not_in_ref_df <- overlap_percent(peaklist1 = peaklist_tidy,
                                          peaklist2 = reference_tidy,
                                          invert = TRUE)
  download_button(object = sample_not_in_ref_df,
                  filename = "sample_not_in_ref_df", 
                  output_extension = ".csv",
                  add_download_button = params$add_download_button)
  knitr::kable(sample_not_in_ref_df, format = "markdown")
}

ChromHMM annotation of sample peaks not found in reference peaks.

sample_not_in_ref_chromHMM <- NULL
if (needs_ref(params$chromHMM_plot, reference)){
  sample_not_in_ref_list <- mapply(peaklist_tidy, FUN = function(file){
    IRanges::subsetByOverlaps(x = file, 
                              ranges = reference_tidy[[1]], 
                              invert = TRUE)
    }) 
  # Run ChromHMM
  sample_not_in_ref_chromHMM<-plot_chromHMM(peaklist = sample_not_in_ref_list,
                                            chromHMM_annotation = chromHMM_list,
                                            genome_build = output_build,
                                            interact = params$interact)
  download_button(object = sample_not_in_ref_chromHMM,
                 filename = "sample_not_in_ref_chromHMM", 
                 self_contained = params$interact,
                 add_download_button = params$add_download_button)  
  save_output(save_output = params$save_output,
              file = sample_not_in_ref_chromHMM,
              file_type = "ggplot",
              filename = "sample_not_in_ref_ChromHMM",
              outpath = outfile_dir,
              interactive = params$interact)  
} 
sample_not_in_ref_chromHMM
remove(sample_not_in_ref_chromHMM)

Unique: Reference peaks not in Sample peaks

Percentage of reference peaks not found in sample peaks (Reference peakfile: r names(reference_tidy))

if (needs_ref(params$chromHMM_plot, reference)){
  # Data frame of non-overlapping peaks 
  ref_not_in_sample_df <- overlap_percent(peaklist1 = reference_tidy,
                                          peaklist2 = peaklist_tidy,
                                          invert = TRUE)
  download_button(object = ref_not_in_sample_df,
                  filename = "ref_not_in_sample_df", 
                  output_extension = ".csv",
                  add_download_button = params$add_download_button)
  knitr::kable(ref_not_in_sample_df, format = "markdown")
}

ChromHMM annotation of reference peaks not found in sample peaks.

ref_not_in_sample_chromHMM <- NULL
if (needs_ref(params$chromHMM_plot, reference)){
  # Subset unique peaks 
  ref_not_in_sample_list <- mapply(peaklist_tidy, FUN = function(file){
    IRanges::subsetByOverlaps(x = reference_tidy[[1]], 
                              ranges = file, 
                              invert = TRUE)
    })
  # Run ChromHMM
  ref_not_in_sample_chromHMM<-plot_chromHMM(peaklist = ref_not_in_sample_list,
                                            chromHMM_annotation = chromHMM_list,
                                            genome_build = output_build,
                                            interact = params$interact)
  download_button(object = ref_not_in_sample_chromHMM,
                  filename = "ref_not_in_sample_chromHMM", 
                  self_contained = params$interact,
                  add_download_button = params$add_download_button)  
  save_output(save_output = params$save_output,
              file = ref_not_in_sample_chromHMM,
              file_type = "ggplot",
              filename = "ref_not_in_sample_ChromHMM",
              outpath = outfile_dir,
              interactive = params$interact) 
}
ref_not_in_sample_chromHMM
remove(ref_not_in_sample_chromHMM)

Annotate Peaks

EpiCompare uses ChIPseeker::annotatePeak() to annotate peaks with the nearest gene and genomic regions where the peak is located. The peaks are annotated with genes taken from human genome annotations (hg19 or hg38) distributed by Bioconductor.

chipseeker_plot <- NULL
if(isTRUE(params$chipseeker_plot)){
  chipseeker_plot <- plot_ChIPseeker_annotation(
    peaklist = peaklist_tidy,
    txdb = txdb, 
    tss_distance = params$tss_distance,
    interact = params$interact)
  download_button(object = chipseeker_plot,
                  filename = "chipseeker_plot", 
                  self_contained = params$interact,
                  add_download_button = params$add_download_button) 
  save_output(save_output = params$save_output,
              file = chipseeker_plot,
              file_type = "ggplot",
              filename = "chipseeker_annotation",
              outpath = outfile_dir, 
              interactive = params$interact) 
}
chipseeker_plot
remove(chipseeker_plot)

Functional Enrichment Analyses {.tabset}

EpiCompare performs KEGG pathway and GO enrichment analysis using clusterProfiler. ChIPseeker::annotatePeak() is first used to assign peaks to nearest genes. Biological themes amongst the genes are identified using ontologies (KEGG and GO). The peaks are annotated with genes taken from annotations of human genome (hg19 or hg38) provided by Bioconductor.

enrichment_plots <- NULL
if (isTRUE(params$enrichment_plot)){
  enrichment_plots <- plot_enrichment(peaklist = peaklist_tidy,
                                      txdb = txdb, 
                                      tss_distance = params$tss_distance,
                                      interact = params$interact) 
  # Figure height
  max_terms <- max(
    length(unique(enrichment_plots$kegg_plot$data$Description)),
    length(unique(enrichment_plots$go_plot$data$Description))
  )
  fig_height <- fig_length(default_size = 10,
                           number_of_items = max_terms,
                           max_items = 20)
}

KEGG

Kyoto Encyclopedia of Genes and Genomes (KEGG) enrichment results.

if (isTRUE(params$enrichment_plot)){
  download_button(object = enrichment_plots$kegg_plot,
                  filename = "KEGG_plot", 
                  self_contained = params$interact,
                  add_download_button = params$add_download_button)  
  save_output(save_output = params$save_output,
              file = enrichment_plots$kegg_plot,
              file_type = "ggplot",
              filename = "KEGG_analysis",
              outpath = outfile_dir, 
              interactive = params$interact)
}
enrichment_plots$kegg_plot

GO

Gene Ontology (GO) enrichment results.

GeneRatio definition:

GeneRatio is the number of observed genes divided by the number of expected genes from each GO category. GO terms can be useful when assessing different biological themes present in each epigenomic dataset.

if (isTRUE(params$enrichment_plot)){
  download_button(object = enrichment_plots$go_plot,
                  filename = "GO_plot", 
                  self_contained = params$interact,
                  add_download_button = params$add_download_button) 
  save_output(save_output = params$save_output,
              file = enrichment_plots$go_plot,
              file_type = "ggplot",
              filename = "GO_analysis",
              outpath = outfile_dir,
              interactive = params$interact)  
} 
enrichment_plots$go_plot
remove(enrichment_plots)

Peak Frequency around TSS

This plots peaks that are mapping to transcriptional start sites (TSS). TSS regions are defined as the flanking sequence of the TSS sites.

tssplt <- NULL
if (isTRUE(params$tss_plot)){
  tssplt <- tss_plot(peaklist = peaklist_tidy,
                     txdb = txdb,
                     tss_distance = params$tss_distance,
                     workers = params$workers,
                     interact = params$interact)
  download_button(object = tssplt,
                  filename = "tss_plots", 
                  self_contained = params$interact,
                  add_download_button = params$add_download_button)
}
tssplt
remove(tssplt,p)

Citation {-}

If you use EpiCompare, please cite:

cat(utils::citation("EpiCompare")$textVersion)

Session Info {-}

utils::sessionInfo()




neurogenomics/EpiCompare documentation built on Oct. 18, 2024, 11:04 p.m.