#' Import BAMs
#'
#' @description
#' Import and process BAM files.
#'
#' @inheritParams common_params
#' @param paired Whether the BAMs are paired (TRUE) or unpaired (FALSE).
#' @param soft_remove Remove read if greater than this number of soft-clipped bases
#' is present at its 5' most end.
#' @param proper_pair Remove reads flagged as improperly paired.
#' TRUE by default when data is paired-end.
#' @param remove_secondary Remove secondary alignments (TRUE).
#' @param remove_duplicate Remove duplicate reads (paired-end only) (TRUE).
#'
#' @details
#' Import BAMs using the information from the sample sheet.
#' If the BAMs are from paired-end data,
#' 'proper_pair' allows removal of reads without a proper-pair SAM flag.
#' Additionally 'remove_secondary' and 'remove_duplicate' will remove reads
#' with the secondary alignment and duplicate flags set.
#'
#' Most TSS mapping methodologies tend to add at least one non-templated base
#' at the 5' end of the read.
#' Furthermore, template switching reverse transcription (TSRT)-based methods such
#' as STRIPE-seq or nanoCAGE can have up to 3 or 4 non-templated 5' bases.
#' We recommend setting `soft_remove` to at minimum 3 because of this,
#' which removes the read if the given number of soft-clip bases is exceeded.
#'
#' @return TSRexploreR object with BAM GRanges and soft-clipped base information.
#'
#' @examples
#' bam_file <- system.file("extdata", "S288C.bam", package="TSRexploreR")
#' assembly <- system.file("extdata", "S288C_Assembly.fasta", package="TSRexploreR")
#' samples <- data.frame(sample_name="S288C", file_1=bam_file, file_2=NA)
#'
#' exp <- tsr_explorer(sample_sheet=samples, genome_assembly=assembly)
#' import_bams(exp, paired=TRUE)
#'
#' @export
import_bams <- function(
experiment,
paired,
sample_sheet=NULL,
soft_remove=3,
proper_pair=NULL,
remove_secondary=TRUE,
remove_duplicate=FALSE
) {
## Check if GenomicAlignments is installed.
if (!requireNamespace("GenomicAlignments", quietly = TRUE)) {
stop("Package \"GenomicAlignments\" needed for this function to work. Please install it.",
call. = FALSE)
}
## Input checks.
assert_that(is(experiment, "tsr_explorer"))
assert_that(is.flag(paired))
assert_that(
is.null(sample_sheet) ||
(is.character(sample_sheet) | is.data.frame(sample_sheet))
)
assert_that(is.null(soft_remove) || is.count(soft_remove))
assert_that(is.null(proper_pair) || is.flag(proper_pair))
assert_that(is.flag(remove_secondary))
assert_that(is.flag(remove_duplicate))
## Prepare sample sheet if necessary.
sample_sheet_type <- case_when(
is.null(sample_sheet) ~ "internal",
is.character(sample_sheet) ~ "file",
is.data.frame(sample_sheet) ~ "data_frame"
)
sample_sheet <- switch(sample_sheet_type,
"internal"=experiment@meta_data$sample_sheet,
"file"=fread(sample_sheet, sep="\t"),
"data_frame"=as.data.table(sample_sheet)
)
samples <- as.list(sample_sheet[, file_1])
names(samples) <- sample_sheet[, sample_name]
## Specifying flag settings for filtering.
flag_args <- list(
isUnmappedQuery=FALSE # Remove unmapped.
)
if (remove_secondary) {
flag_args <- c(flag_args, list(
isSecondaryAlignment=FALSE # Remove non-primary.
))
}
if (paired & (!is.null(proper_pair) && proper_pair)) {
flag_args <- c(flag_args, list(
isPaired=TRUE, # Return only paired reads.
isProperPair=TRUE, #Remove improperly paired reads.
hasUnmappedMate=FALSE #Remove reads with unmapped mate.
))
}
if (remove_duplicate) {
flag_args <- c(flag_args, list(isDuplicate=FALSE))
}
## Import BAMs.
if (paired) {
bams <- map(samples, function(x) {
bam <- GenomicAlignments::readGAlignmentPairs(
x, param=ScanBamParam(
what="seq", flag=do.call(scanBamFlag, flag_args)
)
)
bam <- as.data.table(bam)[, .(
seqnames=seqnames.first, start=start.first, end=end.first,
strand=strand.first, cigar=cigar.first, seq=seq.first
)]
return(bam)
})
} else {
bams <- map(samples, function(x) {
bam <- GenomicAlignments::readGAlignments(
x, param=ScanBamParam(
what="seq", flag=do.call(scanBamFlag, flag_args)
)
)
bam <- as.data.table(bam)[, .(seqnames, start, end, strand, cigar, seq)]
return(bam)
})
}
## Get soft-clipped bases.
walk(bams, function(x) {
x[, n_soft := as.numeric(ifelse(
strand == "+",
str_extract(cigar, "^[[:digit:]]+(?=S)"), # For + strand soft-clip is at cigar beginning.
str_extract(cigar, "[[:digit:]]+S$") # For - strand soft-clip is at cigar end.
))][,
n_soft := replace_na(n_soft, 0)
][
n_soft == 0, seq := "none" # If there are no soft-clipped bases, replace seq with 'none'
][
n_soft > 0, # Only subset sequences with soft-clipped bases.
seq := str_sub(seq, end=n_soft) # The sequence for negative strand is reverse complement.
]
setnames(x, old="seq", new="seq_soft")
return(x)
})
## Remove reads with too many soft-clipped bases.
bams <- map(bams, ~ .x[is.na(n_soft) | n_soft <= soft_remove])
## Convert to GRanges.
walk(bams, function(x) {
x[,
start := ifelse(strand == "+", start, end)
][,
end := start
]
x[, cigar := NULL]
})
bams <- map(bams, as_granges)
## Add GRanges and sample sheet to TSRexploreR object.
experiment@experiment$TSSs <- bams
experiment@meta_data$sample_sheet <- sample_sheet
return(experiment)
}
#' Aggregate TSSs
#'
#' @description
#' Aggregate overlapping TSSs to generate positional scores.
#'
#' @inheritParams common_params
#'
#' @details
#' 'import_bams' generates a GRanges object of non-aggregated TSSs.
#' This function aggregates overlapping TSSs into a sum total score
#' per genomic position.
#'
#' @return TSRexploreR object with GRanges of aggregated TSSs.
#'
#' @examples
#' bam_file <- system.file("extdata", "S288C.bam", package="TSRexploreR")
#' assembly <- system.file("extdata", "S288C_Assembly.fasta", package="TSRexploreR")
#' samples <- data.frame(sample_name="S288C", file_1=bam_file, file_2=NA)
#'
#' exp <- tsr_explorer(sample_sheet=samples, genome_assembly=assembly)
#' exp <- exp %>%
#' import_bams(paired=TRUE) %>%
#' tss_aggregate
#'
#' @export
tss_aggregate <- function(experiment) {
## Check inputs.
assert_that(is(experiment, "tsr_explorer"))
## Get samples.
samples <- experiment@experiment$TSSs
## Aggregate TSSs.
samples <- map(samples, function(x) {
x <- as.data.table(x)
x <- x[, .(score=.N), by=.(seqnames, start, end, strand)]
x <- as_granges(x)
return(x)
})
## Add samples back to TSRexploreR object.
experiment@experiment$TSSs <- samples
return(experiment)
}
#' Correct G artifact
#'
#' @description
#' Correct overrepresentation of 5' G bases added during reverse transcription.
#'
#' @param experiment TSRexploreR object.
#' @param assembly Genome assembly in FASTA or BSgenome format.
#'
#' @details
#' A common artifact in most TSS mapping methods is the presence of a G base upstream
#' of the true TSS, presumably templated by the 5' cap during reverse transcription.
#' Soft-clipping analysis can remove such Gs if they are not incidentally templated
#' onto the genome; however, in cases where they match the genome during alignment,
#' they cannot be distinguished from true TSSs. In order to account for this artifact,
#' TSRexploreR first determines the frequency of reads with a soft-clipped G in a
#' given sample. For each read with a non-soft-clipped G at its 5' end, a Bernoulli
#' trial is performed, with the above-mentioned frequency used as the probability of
#' "success" (removal of the 5' G).
#'
#' @return TSRexploreR object with G-corrected TSS GRanges.
#'
#' @seealso
#' \code{\link{import_bams}} to import BAMs.
#'
#' @examples
#' bam_file <- system.file("extdata", "S288C.bam", package="TSRexploreR")
#' assembly <- system.file("extdata", "S288C_Assembly.fasta", package="TSRexploreR")
#' samples <- data.frame(sample_name="S288C", file_1=bam_file, file_2=NA)
#'
#' exp <- tsr_explorer(sample_sheet=samples, genome_assembly=assembly) %>%
#' import_bams(paired=TRUE)
#'
#' exp <- G_correction(exp, assembly=assembly)
#'
#' @export
G_correction <- function(
experiment,
assembly=NULL
) {
## Check inputs.
assert_that(is(experiment, "tsr_explorer"))
assert_that(
is.null(assembly) ||
(is.character(assembly) | is(assembly, "BSgenome"))
)
## Prepare assembly.
assembly <- .prepare_assembly(assembly, experiment)
assembly_type <- case_when(
is(assembly, "FaFile") ~ "fafile",
is(assembly, "BSgenome") ~ "bsgenome"
)
## Get samples.
select_samples <- experiment@experiment$TSSs
## Retrieve +1 base.
select_samples <- map(select_samples, function(x) {
seq <- switch(
assembly_type,
"fafile"=Rsamtools::getSeq(assembly, x),
"bsgenome"=BSgenome::getSeq(assembly, x)
)
x$plus_one <- as.character(seq)
x <- as.data.table(x)
return(x)
})
## Correct for frequency of soft-clipped Gs and templated Gs.
select_samples <- map(select_samples, function(x) {
# Frequency of soft-clipped Gs.
sfreq <- x[, .(count=.N), by=seq_soft]
sfreq[, freq := count / sum(count)]
sfreq <- sfreq[seq_soft == "G", freq]
# Correct frequencies.
x[
plus_one == "G" & n_soft == 0,
start := ifelse(
rbinom(nrow(x[plus_one == "G" & n_soft == 0]), 1, sfreq) == 1,
ifelse(strand == "+", start + 1, start - 1),
start
)
][,
c("end", "plus_one") := list(start, NULL)
]
return(x)
})
## Add data back to TSRexploreR object.
select_samples <- map(select_samples, as_granges)
experiment@experiment$TSSs <- select_samples
return(experiment)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.