R/Mappability.R

Defines functions .run_IRFinder_MapExclusionRegions .run_IRFinder_GenerateMapReads .gmr_check_params Mappability_CalculateExclusions Mappability_GenReads

Documented in Mappability_CalculateExclusions Mappability_GenReads

#' Calculate low mappability genomic regions
#'
#' @description
#' These functions empirically calculate low-mappability (Mappability Exclusion)
#' regions using the given genome FASTA file. A splice-aware alignment software
#' capable of aligning reads to the genome is required.
#' See details and examples below.
#'
#' @details Creating a Mappability Exclusion BED file is a three-step process.
#' \cr
#' * First, using `Mappability_GenReads()`,
#' synthetic reads are systematically generated using the given genome contained
#' within `reference_path`.
#' * Second, an aligner
#' such as STAR (preferably the same aligner used for the subsequent RNA-seq
#' experiment) is required to align these reads to the source genome. Poorly
#' mapped regions of the genome will be reflected by regions of low coverage
#' depth.
#' * Finally, the BAM file containing the aligned reads is analysed
#' using `Mappability_CalculateExclusions()`, to identify
#' low-mappability regions to compile the Mappability Exclusion BED file.
#'
#' It is recommended to leave all parameters to their default settings. Regular
#' users should only specify `reference_path`, `aligned_bam` and `n_threads`,
#' as required.
#'
#' NB: [STAR_Mappability] runs all 3 steps required, using the `STAR` aligner.
#' This only works in systems where `STAR` is installed.
#'
#' NB2: In systems where `STAR` is not available, consider using HISAT2 or
#' Rsubread. A working example using Rsubread is shown below.
#'
#' @param reference_path The directory of the reference prepared by
#'   `GetReferenceResource()`
#' @param read_len The nucleotide length of the synthetic reads
#' @param read_stride The nucleotide distance between adjacent synthetic reads
#' @param error_pos The position of the procedurally-generated nucleotide error
#'   from the start of each synthetic reads
#' @param verbose Whether additional status messages are shown
#' @param alt_fasta_file (Optional) The path to the user-supplied genome fasta
#'   file, if different to that found inside the `resource` subdirectory of the
#'   `reference_path`. If `GetReferenceResource()` has already been run,
#'   this parameter should be omitted.
#' @param aligned_bam The BAM file of alignment of the synthetic reads generated
#'   by `Mappability_GenReads()`. Users should use a genome splice-aware
#'   aligner, preferably the same aligner used to align the samples in their
#'   experiment.
#' @param threshold Genomic regions with this alignment read depth (or below)
#'   in the aligned synthetic read BAM are defined as low
#'   mappability regions.
#' @param n_threads The number of threads used to calculate mappability
#'   exclusion regions from aligned bam file of synthetic reads.
#' @return
#' * For `Mappability_GenReads`: writes `Reads.fa` to the `Mappability`
#'   subdirectory inside the given `reference_path`.
#' * For `Mappability_CalculateExclusions`: writes a gzipped BED file
#'   named `MappabilityExclusion.bed.gz` to the `Mappability` subdirectory
#'   inside `reference_path`.
#'   This BED file is automatically used by `BuildReference()` if
#'   `MappabilityRef` is not specified.
#' @examples
#'
#' # (1a) Creates genome resource files
#'
#' ref_path <- file.path(tempdir(), "Reference")
#'
#' GetReferenceResource(
#'     reference_path = ref_path,
#'     fasta = chrZ_genome(),
#'     gtf = chrZ_gtf()
#' )
#'
#' # (1b) Systematically generate reads based on the NxtIRF example genome:
#'
#' Mappability_GenReads(
#'     reference_path = ref_path
#' )
#' \dontrun{
#'
#' # (2) Align the generated reads using Rsubread:
#'
#' # (2a) Build the Rsubread genome index:
#'
#' setwd(ref_path)
#' Rsubread::buildindex(basename = "./reference_index",
#'     reference = chrZ_genome())
#'
#' # (2b) Align the synthetic reads using Rsubread::subjunc()
#'
#' Rsubread::subjunc(
#'     index = "./reference_index",
#'     readfile1 = file.path(ref_path, "Mappability", "Reads.fa"),
#'     output_file = file.path(ref_path, "Mappability", "AlignedReads.bam"),
#'     useAnnotation = TRUE,
#'     annot.ext = chrZ_gtf(),
#'     isGTF = TRUE
#' )
#'
#' # (3) Analyse the aligned reads in the BAM file for low-mappability regions:
#'
#' Mappability_CalculateExclusions(
#'     reference_path = ref_path,
#'     aligned_bam = file.path(ref_path, "Mappability", "AlignedReads.bam")
#' )
#'
#' # (4) Build the NxtIRF reference using the calculated Mappability Exclusions
#'
#' BuildReference(ref_path)
#'
#' # NB the default is to search for the BED file generated by
#' # `Mappability_CalculateExclusions()` in the given reference_path
#' }
#' @name Mappability-methods
#' @aliases
#' Mappability_GenReads
#' Mappability_CalculateExclusions
#' @seealso [BuildReference]
#' @md
NULL

#' @describeIn Mappability-methods Generates synthetic reads from a
#' genome FASTA file, for mappability calculations.
#' @export
Mappability_GenReads <- function(reference_path,
        read_len = 70, read_stride = 10, error_pos = 35,
        verbose = TRUE, alt_fasta_file) {
    .gmr_check_params(read_len, read_stride, error_pos + 1)
    if (missing(alt_fasta_file)) {
        alt_fasta_file <- .STAR_get_FASTA(reference_path)
        if (!file.exists(alt_fasta_file)) .log(paste("In Mappability_GenReads,",
            "failed to generate genome fasta file from given reference"))
    } else if (!file.exists(alt_fasta_file)) {
        .log(paste("In Mappability_GenReads,",
            "given fasta file", alt_fasta_file, "not found"))
    }
    .validate_path(file.path(normalizePath(reference_path), "Mappability"))
    # Run map read generator:
    outfile <- file.path(normalizePath(reference_path),
        "Mappability", "Reads.fa")
    .log(paste("Generating synthetic reads, saving to", outfile), "message")
    .run_IRFinder_GenerateMapReads(
        normalizePath(alt_fasta_file), outfile,
        read_len, read_stride, error_pos + 1
    )
    .STAR_clean_temp_FASTA_GTF(reference_path)
}

#' @describeIn Mappability-methods Generate a BED file defining
#' low mappability regions, using reads generated by
#' \code{Mappability_GenReads()}, aligned to the genome.
#' @export
Mappability_CalculateExclusions <- function(reference_path,
        aligned_bam = file.path(reference_path, "Mappability",
            "Aligned.out.bam"),
        threshold = 4, n_threads = 1) {
    if (!file.exists(aligned_bam))
        .log(paste("In Mappability_CalculateExclusions(),",
            aligned_bam, "BAM file does not exist"))

    .validate_path(file.path(normalizePath(reference_path), "Mappability"))
    output_file <- file.path(normalizePath(reference_path), "Mappability",
        "MappabilityExclusion.bed")

    .log(paste("Calculating Mappability Exclusion regions from:",
        aligned_bam), type = "message")
    .run_IRFinder_MapExclusionRegions(
        bamfile = normalizePath(aligned_bam),
        output_file = output_file,
        threshold = threshold,
        n_threads = n_threads
    )
}

# Checks whether given parameters are valid
.gmr_check_params <- function(read_len, read_stride, error_pos) {
    if (!is.numeric(read_len) || read_len < 30) {
        .log(paste("In Mappability_GenReads,",
            "read_len must be numerical and at least 30"))
    }
    if (!is.numeric(read_stride) || read_stride > read_len) {
        .log(paste("In Mappability_GenReads,",
            "read_stride must be numerical and less than read_len"))
    }
    if (!is.numeric(error_pos) || error_pos > read_len) {
        .log(paste("In Mappability_GenReads,",
            "error_pos must be numerical and less than read_len"))
    }
}

# Wrappers to native Rcpp functions:

.run_IRFinder_GenerateMapReads <- function(genome.fa = "", out.fa,
    read_len = 70, read_stride = 10, error_pos = 36) {
    return(
        IRF_GenerateMappabilityReads(normalizePath(genome.fa),
            file.path(normalizePath(dirname(out.fa)), basename(out.fa)),
            read_len = read_len,
            read_stride = read_stride,
            error_pos = error_pos)
    )
}

.run_IRFinder_MapExclusionRegions <- function(bamfile = "", output_file,
        threshold = 4, includeCov = FALSE, n_threads = 1) {
    s_bam <- normalizePath(bamfile)

    IRF_GenerateMappabilityRegions(s_bam,
        output_file,
        threshold = threshold,
        includeCov = includeCov,
        verbose = TRUE, n_threads = n_threads
    )
    # check file is actually made; then gzip it
    if (file.exists(paste0(output_file, ".txt"))) {
        R.utils::gzip(filename = paste0(output_file, ".txt"),
            destname = paste0(output_file, ".gz"))
    } else {
        .log(paste(paste0(output_file, ".txt"), "was not produced"))
    }
}
alexchwong/NxtIRFcore documentation built on Oct. 31, 2022, 9:14 a.m.