R/diffAnalysis.R

Defines functions diffAnalysis

## Differential peak calling with MeRIP-Seq.
## return the peaks in GRangesList format.
diffAnalysis <- function(bam_IP,
                         bam_input,
                         bam_IP_treated,
                         bam_input_treated,
                         txdb = NULL,
                         genome = NULL,
                         bin_size = 25,
                         step_size = 25,
                         fragment_length = 100,
                         strandness = c("unstrand", "1st_strand", "2nd_strand"),
                         gff = NULL,
                         test_method = c("Poisson", "DESeq2"),
                         p_cutoff = 1e-10,
                         lfc_threshold = 0,
                         diff_p_cutoff = 0.01,
                         alt_hypothesis = c("greaterAbs", "lessAbs", "greater", "less"),
                         plot_gc = FALSE,
                         parallel = 1,
                         motif_based = FALSE,
                         motif_sequence = "DRACH",
                         absolute_diff = FALSE,
                         fig_dir = "exomePeak2_output",
                         mode = c("exon","full_transcript","whole_genome"),
                         confounding_factor = NULL){
  #require(GenomicRanges)
  #require(GenomicFeatures)
  #require(SummarizedExperiment)
  #require(DESeq2)

  strandness <- match.arg(strandness)
  alt_hypothesis <- match.arg(alt_hypothesis)
  test_method <- match.arg(test_method)
  stopifnot(fragment_length > 0 & bin_size > 0 & step_size > 0)

  if(is.null(gff) & is.null(txdb)){
    stop("Please at least provide one among gff and TxDb for transcript annotation.")
  }

  if(is.null(txdb) & !is.null(gff)){
    txdb <- makeTxDbFromGFF(gff)
  }

  if(!(all(file.exists(bam_IP)) &
       all(file.exists(bam_input)) &
       all(file.exists(bam_IP_treated)) &
       all(file.exists(bam_input_treated)))){
    stop("At least one bam file directories provided cannot be found, please check it.")
  }

  if(is.null(genome) & motif_based){
    stop("Motif cannot be extracted without providing the 'genome' argument, please try to find the reference genome.")
  }

  if(is.null(genome)){
    warning("Reference genome not provided, GC content bias is left uncorrected.")
  }

  #Extract bins for count
  message("Extract bin features ... ", appendLF = FALSE)
  exByGene  <- exonsByiGenes(txdb) %>% quiet
  if(!motif_based){
    peakBins <- exonicBins(exByGene, bin_size, step_size) %>% quiet
  }else{
    peakBins <- sampleSequence(motif_sequence, exByGene, genome) %>% quiet
    peakBins <-  resize(peakBins, 1, fix = "center") %>% quiet
  }
  mcols(peakBins) <- NULL
  if(is(peakBins, "GRanges")) peakBins <- split(peakBins, seq_along(peakBins)) %>% quiet
  message("OK")

  #Count the bam files
  message("Count reads on bin features ... ", appendLF = FALSE)
  bam_dirs <- c(bam_IP, bam_IP_treated, bam_input, bam_input_treated)
  se <- featuresCounts(peakBins, bam_dirs, strandness, parallel) %>% quiet
  message("OK")

  #Annotate SummarizedExperiment
  length_indx <- c(length(bam_IP), length(bam_IP_treated),
                   length(bam_input), length(bam_input_treated))
  se$IP_input <- rep(c("IP", "IP", "input", "input"), length_indx)
  se$Perturbation <- rep(c("C", "Treated", "C", "Treated"), length_indx)
  rm(peakBins, length_indx)


  #Identify Backgrounds
  message("Identify background features ... ", appendLF = FALSE)
  se <- classifyBackground(se) %>% quiet
  message("OK")

  #Estimate sample size factors
  message("Estimate sample sepecific size factors from the background ... ", appendLF = FALSE)
  se <- estimateColumnFactors(se) %>% quiet
  message("OK")

  if(!is.null(genome)){
  #Calculate GC contents
  message("Calculate bin GC contents on exons ... ", appendLF = FALSE)
  se <- calculateGCcontents(se, fragment_length, exByGene, genome) %>% quiet
  message("OK")

  #Fit GC content biases
  message("Fit GC curves with smoothing splines ... ", appendLF = FALSE)
  se <- fitBiasCurves(se) %>% quiet
  message("OK")

  #Estimate matrix correction factors
  message("Calculate offset matrix for bins ... ", appendLF = FALSE)
  se <- estimateMatrixFactors(se) %>% quiet
  message("OK")

  ## Plot GC bias fits
  if(plot_gc) plotGCbias(se, fig_dir) %>% quiet

  }else{
  #Assign matrix correction factors without GC offsets
  se <- estimateMatrixFactors(se) %>% quiet
  }
  
  #Filter low count rows if not exon mode
  if(mode %in% c("full_transcript","whole_genome")){
    se <- se[rowMeans(assay(se)) >= 5,]
  }
  
  #Peak calling
  message("Detect peaks with GLM ... ", appendLF = FALSE)
  peaks <- callPeaks(se, txdb, test_method, p_cutoff, exByGene, bin_size, motif_based, confounding_factor) %>% quiet
  message("OK")

  #Count the bam files
  message("Count reads on peaks ... ", appendLF = FALSE)
  se2 <- featuresCounts(peaks, bam_dirs, strandness, parallel) %>% quiet
  rm(bam_dirs, peaks)
  message("OK")

  #Re-annotate SummarizedExperiment
  colData(se2) <- colData(se)
  metadata(se2) <- metadata(se)
  rm(se)

  #Calculate GC contents
  se2 <- calculateGCcontents(se2, fragment_length, exByGene, genome) %>% quiet

  #Estimate matrix correction factors
  message("Calculate offset matrix for peaks ... ", appendLF = FALSE)
  se2 <- estimateMatrixFactors(se2) %>% quiet
  message("OK")

  #Differential calling
  if(!absolute_diff){
    message("Detect differentially modified peaks with interactive GLM ... ", appendLF = FALSE)  
  }else{
    message("Detect absolute differentially modified peaks with GLM ... ", appendLF = FALSE)  
  }
  diffPeaks <- callDiff(se2, txdb, test_method, diff_p_cutoff, exByGene, bin_size, alt_hypothesis, lfc_threshold, motif_based, absolute_diff, confounding_factor) %>% quiet
  message("OK")

  return(diffPeaks)
}
ZW-xjtlu/exomePeak2 documentation built on Oct. 12, 2024, 3:30 a.m.