cragr: R package for CRAG

stop_quietly <- function() {
  opt <- options(show.error.messages = FALSE)
  on.exit(options(opt))
  stop()
}

library(rlang)

ifs_parser <- optparse::OptionParser(
  option_list = list(
    optparse::make_option(
      opt_str = c("-i", "--input"),
      help = "Path to the input fragment file. The file should be in bgzip-compressed BED format, alongside with the .tbi index file."
    ),
    optparse::make_option(c("-o", "--output"), type = "character", help = "Path to the output file."),
    optparse::make_option(c("--genome"), type = "character", help = "Which reference genome the input fragment file is based on. Should be either GRCh37 or GRCh38."),
    optparse::make_option(
      c("-g", "--gc-correct"),
      default = FALSE,
      action = "store_true",
      help = "Perform GC correction."
    ),
    optparse::make_option(
      c("--gc-correct-method"),
      default = "standard",
      help = "GC correction method. Should be either standard or caret [standard]."
    ),
    optparse::make_option(c("--gc-correct-n"),
      default = 1e6L,
      help = "Maximal number of data points for GC correction model training [1000000]."
    ),
    optparse::make_option(
      c("-m", "--high-mappability"),
      type = "character",
      help = "Path to the mappability file, which should be in BED format [NULL]."
    ),
    optparse::make_option(
      c("--chrom"),
      type = "character",
      help = "Perform the analysis for the specified chromosome."
    ),
    optparse::make_option(c("--min-mapq"),
      default = 30L,
      help = "Minimal MAPQ for fragments included in the analysis."
    ),
    optparse::make_option(c("--min-fraglen"),
      default = 50L,
      help = "Minimal length for fragments included in the analysis."
    ),
    optparse::make_option(c("--max-fraglen"),
      default = 1000L,
      help = "Maximal length for fragments included in the analysis."
    ),
    optparse::make_option(
      c("--exclude-region"),
      type = "character",
      default = NULL,
      # "encode.blacklist.hs37-1kg",
      help = "BED files defining regions to be excluded from the analysis, separated by colon. Default is the ENCODE Blacklist: https://www.nature.com/articles/s41598-019-45839-z, which is included in this R package"
    ),
    # optparse::make_option(
    #   c("--exclude-soft-clipping"),
    #   action = "store_true",
    #   default = FALSE,
    #   help = "Exclude fragments with leading soft-clipping from the analysis"
    # ),
    optparse::make_option(c("-w", "--window-size"),
      default = 200L,
      help = "Size of the sliding window [200]"
    ),
    optparse::make_option(c("-s", "--step-size"), default = 20L, help = "Step size of the sliding window [20]"),
    optparse::make_option(c(
      "-t", "--thread",
      default = 1L, help = "Number of threads [1]"
    )),
    optparse::make_option(c("--verbose"), default = FALSE, action = "store_true")
  )
)


peak_parser <- optparse::OptionParser(
  option_list = list(
    optparse::make_option(c("-i", "--input"), help = "Path to the input file. If there are multiple input files, they should be separated by colons"),
    optparse::make_option(c("-o", "--output"), type = "character", help = "Path to output file"),
    optparse::make_option(c("--genome"), type = "character", help = "Genome of the input"),
    optparse::make_option(
      c("-g", "--gc-correct"),
      default = FALSE,
      action = "store_true",
      help = "Whether to perform GC correction"
    ),
    optparse::make_option(
      c("--gc-correct-method"),
      default = "standard",
      help = "Methods used in GC correction. Should be either standard or caret [standard]"
    ),
    optparse::make_option(
      c("--gc-correct-n"),
      default = 1e6L,
      help = "Maximal sample size for GC correction model training [1e6L]"
    ),
    optparse::make_option(
      c("-w", "--window-size"),
      default = 200L,
      help = "Size of the sliding window [200]"
    ),
    optparse::make_option(c("-s", "--step-size"), default = 20L, help = "Step size of the sliding window [20]"),
    optparse::make_option(c("-t", "--thread", default = 1L, help = "Number of threads [1]")),
    optparse::make_option(c("--verbose"), default = FALSE, action = "store_true")
  )
)


signal_parser <- optparse::OptionParser(
  option_list = list(
    optparse::make_option(c("-i", "--input"), help = "Path to the input file. If there are multiple input files, they should be separated by colons"),
    optparse::make_option(c("--hotspot"), help = "Path to the hotspot file"),
    optparse::make_option(c("-o", "--output"), type = "character", help = "Path to output file"),
    optparse::make_option(c("--genome"), type = "character", help = "Genome of the input"),
    optparse::make_option(
      c("-g", "--gc-correct"),
      default = FALSE,
      action = "store_true",
      help = "Whether to perform GC correction"
    ),
    optparse::make_option(
      c("--gc-correct-method"),
      default = "standard",
      help = "Methods used in GC correction. Should be either standard or caret [standard]"
    ),
    optparse::make_option(
      c("--gc-correct-n"),
      default = 1e6L,
      help = "Maximal sample size for GC correction model training [1e6L]"
    ),
    optparse::make_option(
      c("-m", "--high-mappability"),
      type = "character",
      help = "Path to the mappability file. Default is NULL, i.e. do NOT exclude fragments from low-mappability regions"
    ),
    optparse::make_option(
      c("--chrom"),
      type = "character",
      help = "Perform the analysis only for a selected group of chromosomes. Separated by colons, such as 12:16:X. If not provided, all chromosomes found in the input file will be used"
    ),
    optparse::make_option(c("--min-mapq"),
      default = 30L,
      help = "Minimal MAPQ for fragments included in the analysis"
    ),
    optparse::make_option(c("--min-fraglen"),
      default = 50L,
      help = "Minimal length for fragments included in the analysis"
    ),
    optparse::make_option(c("--max-fraglen"),
      default = 1000L,
      help = "Maximal length for fragments included in the analysis"
    ),
    optparse::make_option(
      c("--exclude-region"),
      type = "character",
      default = NULL, # "encode.blacklist.hs37-1kg",
      help = "BED files defining regions to be excluded from the analysis, separated by colon. Default is the ENCODE Blacklist: https://www.nature.com/articles/s41598-019-45839-z, which is included in this R package"
    ),
    # optparse::make_option(
    #   c("--exclude-soft-clipping"),
    #   action = "store_true",
    #   default = FALSE,
    #   help = "Exclude fragments with leading soft-clipping from the analysis"
    # ),
    optparse::make_option(
      c("-w", "--window-size"),
      default = 200L,
      help = "Size of the sliding window [200]"
    ),
    # optparse::make_option(c("-s", "--step-size"), default = 20L, help = "Step size of the sliding window [20]"),
    optparse::make_option(c("-t", "--thread", default = 1L, help = "Number of threads [1]")),
    optparse::make_option(c("--verbose"), default = FALSE, action = "store_true")
  )
)



parse_script_args <- function() {
  if (interactive()) {
    script_args <- get0("script_args")
    subcommand <- get0("subcommand")

    if (is_null(script_args) || is_null(subcommand)) {
      stop()
    } else {
      return(list(subcommand, script_args))
    }
  } else {
    # The first argument is subcommand. Should be one of the following:
    # * main: run the full pipeline, i.e. calculate IFS scores and then call hotspots
    # * ifs: only calculate IFS scores
    # * hotspot: only call hotspots from a existing IFS score track
    args <- commandArgs(trailingOnly = TRUE)
    subcommand <- args[1]

    if (!subcommand %in% c("ifs", "peak", "signal")) {
      stop("Subcommand should be one of the following: ifs, peak, hotspot, signal")
    }

    if (is_true(subcommand == "ifs")) {
      parser <- ifs_parser
    } else if (is_true(subcommand == "peak")) {
      parser <- peak_parser
    } else if (is_true(subcommand == "signal")) {
      parser <- signal_parser
    } else {
      stop("Subcommand should be one of the following: ifs, peak, hotspot, signal")
    }

    args <- args[-1]
    script_args <-
      optparse::parse_args(parser,
        args = args,
        convert_hyphens_to_underscores = TRUE
      )

    library(dplyr)
    library(purrr)
    library(stringr)
    library(magrittr)

    # Process arguments
    if ("input" %in% names(script_args)) {
      script_args$input <-
        str_split(script_args$input, pattern = ":")[[1]]
    }

    if (!("chrom" %in% names(script_args))) {
      script_args$chrom <- NULL
    }

    c("chrom", "exclude_region") %>%
      walk(function(arg) {
        if (!is_null(script_args[[arg]])) {
          script_args[[arg]] <<-
            str_split(script_args[[arg]], pattern = "[:,]")[[1]]
        }
      })

    c(
      "min_mapq",
      "min_fraglen",
      "max_fraglen",
      "window_size",
      "step_size"
    ) %>%
      walk(function(arg) {
        if (!is_null(script_args) && arg %in% names(script_args)) {
          script_args[[arg]] <<- as.numeric(script_args[[arg]])
        }
      })

    if (is_true(script_args$window_size %% script_args$step_size != 0)) {
      stop("window_size must be multiples of step_size")
    }

    # genome must be one of GRCh37, GRCh38, hg19 and hg38
    # internally it can only can be GRCh37 or GRCh38
    if (script_args$genome %in% c("GRCh37", "hg19")) {
      script_args$genome <- "GRCh37"
    } else if (script_args$genome %in% c("GRCh38", "hg38")) {
      script_args$genome <- "GRCh38"
    } else {
      stop(paste0("Unsupported genome: ", script_args$genome))
    }

    return(list(subcommand, script_args))
  }
}


# Make adjustment to IFS interval start/end, and write to disk
write_ifs_as_bedgraph <- function(ifs, script_args, comments) {
  # In IFS, each row corresponds to a 200bp region, and they are overlapping.
  # This is not a bedGraph track.
  #
  # We're more interested in a bedGraph-style non-overlapping track
  #
  #    chrom    start      end    score cov mappability    gc   score0 cov_corrected      pval pval_adjust pval_cpois pval_cpois_adjust
  # 1:    21  9412520  9412720 27.58913   9   0.9050833 0.295 17.90432      13.78962 0.9599079           1  0.9611725                 1
  # 2:    21  9412540  9412740 29.36389  10   0.9750000 0.290 19.67908      14.78962 0.9841276           1  0.9828842                 1
  # 3:    21  9412560  9412760 27.48691   9   0.9417500 0.270 17.80209      13.78962 0.9599079           1  0.9594159                 1
  # 4:    21 10872940 10873140 71.61589  31   0.9162698 0.470 61.93108      35.78962 1.0000000           1  1.0000000                 1
  # 5:    21 10873160 10873360 73.56209  33   0.9200000 0.510 63.87727      37.78962 1.0000000           1  1.0000000                 1
  #
  # After this transformation, the IFS track looks like this:
  #    chrom    start      end    score cov mappability    gc   score0 cov_corrected      pval pval_adjust pval_cpois pval_cpois_adjust
  # 1:    21  9412610  9412630 27.58913   9   0.9050833 0.295 17.90432      13.78962 0.9599079           1  0.9611725                 1
  # 2:    21  9412630  9412650 29.36389  10   0.9750000 0.290 19.67908      14.78962 0.9841276           1  0.9828842                 1
  # 3:    21  9412650  9412670 27.48691   9   0.9417500 0.270 17.80209      13.78962 0.9599079           1  0.9594159                 1
  # 4:    21 10873030 10873050 71.61589  31   0.9162698 0.470 61.93108      35.78962 1.0000000           1  1.0000000                 1
  # 5:    21 10873250 10873270 73.56209  33   0.9200000 0.510 63.87727      37.78962 1.0000000           1  1.0000000                 1
  ws_ratio <- script_args$window_size %/% script_args$step_size
  offset <- as.integer((ws_ratio - 1) / 2 * script_args$step_size)

  GenomicRanges::start(ifs) <- GenomicRanges::start(ifs) + offset
  GenomicRanges::width(ifs) <- script_args$step_size

  # # Rearrage orders
  # df <- GenomicRanges::mcols(ifs) %>%
  #   as_tibble() %>%
  #   relocate(c(z_score, score), .after = end) %>% select(-gc, -mappability)
  # GenomicRanges::mcols(ifs) <- df

  bedtorch::write_bed(ifs,
    file_path = script_args$output,
    comments = comments
  )
}


log_mem <- function(label = "Unknown") {
  if (requireNamespace("lobstr")) {
    mem <- as.numeric(lobstr::mem_used()) / 1024**2
    logging::logdebug(str_interp("[${label}] Memory used: ${mem} MB"))
  }
}


# example
# script_args <- list(
#   input = "sandbox/xh_intermediate/batch2_res_merged_sort.chr21.bed.gz",
#   input_ifs = "sandbox/xh_intermediate/batch2_res_merged_sort.chr21.ifs.bed.gz",
#   input_hotspot = "sandbox/xh_intermediate/batch2_res_merged_sort.chr21.hotspot.bed.gz",
#   genome = "hs37-1kg",
#   gc_correct = TRUE,
#   high_mappability = "sandbox/mappability.hs37-1kg.w200.s20.0_9.bed.gz",
#   chrom = NULL,
#   min_mapq = 30L,
#   min_fraglen = 50L,
#   max_fraglen = 1000L,
#   exclude_region = "encode.blacklist.hs37-1kg",
#   exclude_soft_clipping = FALSE,
#   window_size = 200L,
#   step_size = 20L,
#   cpois = FALSE,
#   fdr = 0.01,
#   local_pval = 1e-5,
#   verbose = TRUE
# )


raw_ifs_helper <- function(script_args, chrom) {
  frag <- script_args$input %>%
    map(function(input_file) {
      logging::loginfo(str_interp("Loading fragment file ${input_file} ..."))
      read_fragments(input_file,
        range = chrom,
        genome = script_args$genome
      )
    }) %>%
    do.call(c, args = .) %>%
    sort()

  logging::loginfo("Fragments summary:")
  print(frag)

  logging::loginfo("Calculating IFS scores ...")
  ifs <- ifs_score(
    frag,
    window_size = script_args$window_size,
    step_size = script_args$step_size,
    gc_correct = script_args$gc_correct,
    blacklist_region = script_args$exclude_region,
    high_mappability_region = script_args$high_mappability,
    min_mapq = script_args$min_mapq,
    min_fraglen = script_args$min_fraglen,
    max_fraglen = script_args$max_fraglen,
    exclude_soft_clipping = FALSE #script_args$exclude_soft_clipping
  )$ifs

  list(frag = frag, ifs = calc_gc(ifs))
}


# 1. Read fragments
# 2. Calculate raw IFS scores (without GC)
# 3. Calculate GC content for each fragment
subcommand_ifs <- function(script_args) {
  # Make sure the genome is available
  bsgenome <- switch(script_args$genome,
    "GRCh37" = "BSgenome.Hsapiens.1000genomes.hs37d5",
    "GRCh38" = "BSgenome.Hsapiens.NCBI.GRCh38",
    stop(paste0("Invalid genome: ", script_args$genome))
  )

  assertthat::assert_that(requireNamespace(bsgenome), msg = str_interp("${bsgenome} is required"))
  assertthat::assert_that(!is_null(script_args$chrom))

  ifs <- map(script_args$chrom, function(chrom) {
    logging::loginfo(str_interp("Process chromosome ${chrom} ..."))
    raw_ifs_helper(script_args, chrom)$ifs
  }) %>%
    do.call(c, args = .)

  logging::loginfo("Raw IFS summary:")
  print(ifs)
  log_mem("Done calculating raw IFS scores")

  bedtorch::write_bed(ifs, file_path = script_args$output, comments = comments)
}


subcommand_peak <- function(script_args) {
  # Load IFS score from input file
  logging::loginfo(str_interp("Loading raw IFS scores: ${script_args$input} ..."))
  ifs <-
    bedtorch::read_bed(
      script_args$input,
      col.names = c("chrom", "start", "end", "score", "cov", "fraglen", "gc")
    )
  ifs <- assign_seqinfo(ifs, script_args$genome)

  logging::loginfo("Raw IFS summary:")
  print(ifs)

  if (script_args$gc_correct) {
    logging::loginfo("Performing GC correction ...")
    ifs <-
      gc_correct(
        ifs,
        span = 0.75,
        method = script_args$gc_correct_method,
        max_training_dataset = script_args$gc_correct_n,
        thread = script_args$thread,
      )
    log_mem("Done GC correction")
  } else {
    # No GC correction, just placeholder
    ifs$score_pre_gc <- ifs$score
  }

  logging::loginfo("Calculating z-scores ...")
  ifs <- calc_ifs_z_score(ifs)

  logging::loginfo("Calculating global p-values ...")
  ifs <- calc_pois_pval(ifs)
  log_mem("Done calculating global p-values")

  logging::loginfo("Calculating local p-values ...")
  ifs <-
    calc_pois_pval_local(
      ifs,
      window_size = script_args$window_size,
      step_size = script_args$step_size,
      local_layout = list(`50k` = 50e3L) # list(`5k` = 5e3L, `10k` = 10e3L, `25k` = 25e3L, `50k` = 50e3L)
    )
  log_mem("Done calculating local p-values")

  logging::loginfo("IFS summary:")
  print(ifs)

  write_ifs_as_bedgraph(ifs, script_args, comments)
}


# Perform signal-level analysis
subcommand_signal <- function(script_args) {
  script_args$step_size <- script_args$window_size

  # Make sure the genome is available
  bsgenome <- switch(script_args$genome,
    "GRCh37" = "BSgenome.Hsapiens.1000genomes.hs37d5",
    "hs37-1kg" = "BSgenome.Hsapiens.1000genomes.hs37d5",
    "GRCh38" = "BSgenome.Hsapiens.NCBI.GRCh38",
    stop(paste0("Invalid genome: ", genome_name))
  )

  assertthat::assert_that(requireNamespace(bsgenome), msg = str_interp("${bsgenome} is required"))
  assertthat::assert_that(!is_null(script_args$chrom))

  ifs2 <- map(script_args$chrom, function(chrom) {
    logging::loginfo(str_interp("Process chromosome ${chrom} ..."))
    result <- raw_ifs_helper(script_args, chrom)
    ifs <- result$ifs
    ifs$score_pre_gc <- ifs$score
    frag <- result$frag
    rm(result)

    if (script_args$gc_correct) {
      logging::loginfo("Perform GC correction ...")

      gc_result <-
        gc_correct(
          ifs,
          span = 0.75,
          method = script_args$gc_correct_method,
          max_training_dataset = script_args$gc_correct_n,
          thread = script_args$thread,
          return_model = TRUE
        )
      gc_model <- gc_result$model
      ifs <- gc_result$ifs
    } else {
      gc_model <- NULL
    }
    score_mean <- mean(ifs$score, na.rm = TRUE)
    score_sd <- sd(ifs$score, na.rm = TRUE)

    hotspot <- bedtorch::read_bed(script_args$hotspot) %>%
      assign_seqinfo(genome = script_args$genome) %>%
      GenomicRanges::resize(width = script_args$window_size, fix = "center")
    
    ifs2 <- ifs_score(
      frag,
      interval = hotspot,
      window_size = NULL,
      step_size = NULL,
      gc_correct = script_args$gc_correct,
      blacklist_region = script_args$exclude_region,
      high_mappability_region = script_args$high_mappability,
      min_mapq = script_args$min_mapq,
      min_fraglen = script_args$min_fraglen,
      max_fraglen = script_args$max_fraglen,
      exclude_soft_clipping = FALSE #script_args$exclude_soft_clipping
    )$ifs %>%
      calc_gc()
    ifs2$score_pre_gc <- ifs2$score

    if (script_args$gc_correct) {
      na_idx <- is.na(ifs2$gc)
      pred <-
        predict(gc_model, newdata = data.frame(gc = ifs2$gc[!na_idx]))
      ifs2$score <- NA
      ifs2$score[!na_idx] <-
        pmax(0, ifs2$score_pre_gc[!na_idx] - pred + mean(ifs$score_pre_gc, na.rm = TRUE))
    }

    ifs2$z_score <- (ifs2$score - score_mean) / score_sd
    ifs2
  }) %>%
    do.call(c, args = .)

  bedtorch::write_bed(ifs2, file_path = script_args$output, comments = comments)
}

# Main ----
parse_script_args_result <- parse_script_args()
subcommand <- parse_script_args_result[[1]]
script_args <- parse_script_args_result[[2]]
rm(parse_script_args_result)

# Build comment lines
comments <- c(
  paste0("cragr_version=", as.character(packageVersion("cragr"))),
  paste0("bedtorch_version=", as.character(packageVersion("bedtorch"))),
  paste0("timestamp=", lubridate::now() %>% format("%Y-%m-%dT%H:%M:%S%z")),
  # All items in script_args
  names(script_args) %>% purrr::map_chr(function(name) {
    v <- script_args[[name]]
    v_str <- if (!is_null(v)) {
      v %>%
        purrr::map_chr(as.character) %>%
        paste(collapse = ":")
    } else {
      ""
    }
    paste0(name, "=", v_str)
  })
)

if (script_args$verbose) {
  logging::setLevel("DEBUG")
}

logging::loginfo(str_interp("Argument summary:"))
comments %>% purrr::walk(function(v) {
  logging::loginfo(v)
})

library(cragr)

if (subcommand == "ifs") {
  subcommand_ifs(script_args)
} else if (subcommand == "peak") {
  subcommand_peak(script_args)
  # } else if (subcommand == "hotspot") {
  #   subcommand_hotspot(script_args)
} else if (subcommand == "signal") {
  subcommand_signal(script_args)
} else {
  stop("Invalid subcommand")
}