#' Calculate low mappability genomic regions
#'
#' @description
#' These functions empirically calculate low-mappability (Mappability Exclusion)
#' regions using the given genome FASTA file. A splice-aware alignment software
#' capable of aligning reads to the genome is required.
#' See details and examples below.
#'
#' @details Creating a Mappability Exclusion BED file is a three-step process.
#' \cr
#' * First, using `Mappability_GenReads()`,
#' synthetic reads are systematically generated using the given genome contained
#' within `reference_path`.
#' * Second, an aligner
#' such as STAR (preferably the same aligner used for the subsequent RNA-seq
#' experiment) is required to align these reads to the source genome. Poorly
#' mapped regions of the genome will be reflected by regions of low coverage
#' depth.
#' * Finally, the BAM file containing the aligned reads is analysed
#' using `Mappability_CalculateExclusions()`, to identify
#' low-mappability regions to compile the Mappability Exclusion BED file.
#'
#' It is recommended to leave all parameters to their default settings. Regular
#' users should only specify `reference_path`, `aligned_bam` and `n_threads`,
#' as required.
#'
#' NB: [STAR_Mappability] runs all 3 steps required, using the `STAR` aligner.
#' This only works in systems where `STAR` is installed.
#'
#' NB2: In systems where `STAR` is not available, consider using HISAT2 or
#' Rsubread. A working example using Rsubread is shown below.
#'
#' @param reference_path The directory of the reference prepared by
#' `GetReferenceResource()`
#' @param read_len The nucleotide length of the synthetic reads
#' @param read_stride The nucleotide distance between adjacent synthetic reads
#' @param error_pos The position of the procedurally-generated nucleotide error
#' from the start of each synthetic reads
#' @param verbose Whether additional status messages are shown
#' @param alt_fasta_file (Optional) The path to the user-supplied genome fasta
#' file, if different to that found inside the `resource` subdirectory of the
#' `reference_path`. If `GetReferenceResource()` has already been run,
#' this parameter should be omitted.
#' @param aligned_bam The BAM file of alignment of the synthetic reads generated
#' by `Mappability_GenReads()`. Users should use a genome splice-aware
#' aligner, preferably the same aligner used to align the samples in their
#' experiment.
#' @param threshold Genomic regions with this alignment read depth (or below)
#' in the aligned synthetic read BAM are defined as low
#' mappability regions.
#' @param n_threads The number of threads used to calculate mappability
#' exclusion regions from aligned bam file of synthetic reads.
#' @return
#' * For `Mappability_GenReads`: writes `Reads.fa` to the `Mappability`
#' subdirectory inside the given `reference_path`.
#' * For `Mappability_CalculateExclusions`: writes a gzipped BED file
#' named `MappabilityExclusion.bed.gz` to the `Mappability` subdirectory
#' inside `reference_path`.
#' This BED file is automatically used by `BuildReference()` if
#' `MappabilityRef` is not specified.
#' @examples
#'
#' # (1a) Creates genome resource files
#'
#' ref_path <- file.path(tempdir(), "Reference")
#'
#' GetReferenceResource(
#' reference_path = ref_path,
#' fasta = chrZ_genome(),
#' gtf = chrZ_gtf()
#' )
#'
#' # (1b) Systematically generate reads based on the NxtIRF example genome:
#'
#' Mappability_GenReads(
#' reference_path = ref_path
#' )
#' \dontrun{
#'
#' # (2) Align the generated reads using Rsubread:
#'
#' # (2a) Build the Rsubread genome index:
#'
#' setwd(ref_path)
#' Rsubread::buildindex(basename = "./reference_index",
#' reference = chrZ_genome())
#'
#' # (2b) Align the synthetic reads using Rsubread::subjunc()
#'
#' Rsubread::subjunc(
#' index = "./reference_index",
#' readfile1 = file.path(ref_path, "Mappability", "Reads.fa"),
#' output_file = file.path(ref_path, "Mappability", "AlignedReads.bam"),
#' useAnnotation = TRUE,
#' annot.ext = chrZ_gtf(),
#' isGTF = TRUE
#' )
#'
#' # (3) Analyse the aligned reads in the BAM file for low-mappability regions:
#'
#' Mappability_CalculateExclusions(
#' reference_path = ref_path,
#' aligned_bam = file.path(ref_path, "Mappability", "AlignedReads.bam")
#' )
#'
#' # (4) Build the NxtIRF reference using the calculated Mappability Exclusions
#'
#' BuildReference(ref_path)
#'
#' # NB the default is to search for the BED file generated by
#' # `Mappability_CalculateExclusions()` in the given reference_path
#' }
#' @name Mappability-methods
#' @aliases
#' Mappability_GenReads
#' Mappability_CalculateExclusions
#' @seealso [BuildReference]
#' @md
NULL
#' @describeIn Mappability-methods Generates synthetic reads from a
#' genome FASTA file, for mappability calculations.
#' @export
Mappability_GenReads <- function(reference_path,
read_len = 70, read_stride = 10, error_pos = 35,
verbose = TRUE, alt_fasta_file) {
.gmr_check_params(read_len, read_stride, error_pos + 1)
if (missing(alt_fasta_file)) {
alt_fasta_file <- .STAR_get_FASTA(reference_path)
if (!file.exists(alt_fasta_file)) .log(paste("In Mappability_GenReads,",
"failed to generate genome fasta file from given reference"))
} else if (!file.exists(alt_fasta_file)) {
.log(paste("In Mappability_GenReads,",
"given fasta file", alt_fasta_file, "not found"))
}
.validate_path(file.path(normalizePath(reference_path), "Mappability"))
# Run map read generator:
outfile <- file.path(normalizePath(reference_path),
"Mappability", "Reads.fa")
.log(paste("Generating synthetic reads, saving to", outfile), "message")
.run_IRFinder_GenerateMapReads(
normalizePath(alt_fasta_file), outfile,
read_len, read_stride, error_pos + 1
)
.STAR_clean_temp_FASTA_GTF(reference_path)
}
#' @describeIn Mappability-methods Generate a BED file defining
#' low mappability regions, using reads generated by
#' \code{Mappability_GenReads()}, aligned to the genome.
#' @export
Mappability_CalculateExclusions <- function(reference_path,
aligned_bam = file.path(reference_path, "Mappability",
"Aligned.out.bam"),
threshold = 4, n_threads = 1) {
if (!file.exists(aligned_bam))
.log(paste("In Mappability_CalculateExclusions(),",
aligned_bam, "BAM file does not exist"))
.validate_path(file.path(normalizePath(reference_path), "Mappability"))
output_file <- file.path(normalizePath(reference_path), "Mappability",
"MappabilityExclusion.bed")
.log(paste("Calculating Mappability Exclusion regions from:",
aligned_bam), type = "message")
.run_IRFinder_MapExclusionRegions(
bamfile = normalizePath(aligned_bam),
output_file = output_file,
threshold = threshold,
n_threads = n_threads
)
}
# Checks whether given parameters are valid
.gmr_check_params <- function(read_len, read_stride, error_pos) {
if (!is.numeric(read_len) || read_len < 30) {
.log(paste("In Mappability_GenReads,",
"read_len must be numerical and at least 30"))
}
if (!is.numeric(read_stride) || read_stride > read_len) {
.log(paste("In Mappability_GenReads,",
"read_stride must be numerical and less than read_len"))
}
if (!is.numeric(error_pos) || error_pos > read_len) {
.log(paste("In Mappability_GenReads,",
"error_pos must be numerical and less than read_len"))
}
}
# Wrappers to native Rcpp functions:
.run_IRFinder_GenerateMapReads <- function(genome.fa = "", out.fa,
read_len = 70, read_stride = 10, error_pos = 36) {
return(
IRF_GenerateMappabilityReads(normalizePath(genome.fa),
file.path(normalizePath(dirname(out.fa)), basename(out.fa)),
read_len = read_len,
read_stride = read_stride,
error_pos = error_pos)
)
}
.run_IRFinder_MapExclusionRegions <- function(bamfile = "", output_file,
threshold = 4, includeCov = FALSE, n_threads = 1) {
s_bam <- normalizePath(bamfile)
IRF_GenerateMappabilityRegions(s_bam,
output_file,
threshold = threshold,
includeCov = includeCov,
verbose = TRUE, n_threads = n_threads
)
# check file is actually made; then gzip it
if (file.exists(paste0(output_file, ".txt"))) {
R.utils::gzip(filename = paste0(output_file, ".txt"),
destname = paste0(output_file, ".gz"))
} else {
.log(paste(paste0(output_file, ".txt"), "was not produced"))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.