#' Perform stranded bin counts
#'
#' Counts reads per genomic window separately for the plus-strand and the
#' minus-strand windows, stacks both results into a single object and drops
#' windows that received no reads in any sample.
#'
#' @param bam.files character vector. BAM files to use
#' @param restrictChrs character vector. chromosomes to use
#' @param bam_param ScanBamParam object (e.g. from \code{Rsamtools::ScanBamParam})
#' @param bp_param BiocParallel parameter object, handed on as \code{BPPARAM}
#' @param window_size integer. size of window to use
#' @param sliding logical. perform sliding window counts? If TRUE, windows are
#'                shifted by \code{floor(window_size/2)}.
#' @param func function to preprocess reads
#' @importFrom SummarizedExperiment assay rowRanges
#' @return RangedSE object with forward and reverse strand counts
strandBinCounts <- function(bam.files, restrictChrs, bam_param,
                            bp_param, window_size, sliding = FALSE,
                            func) {
    if (!sliding) {
        # fixed bins
        windows <- getChromBins(bam.files, restrictChr = restrictChrs,
                                binSize = window_size)
        ignoreMultiMap <- TRUE
    } else {
        # windows shifted by half a window; a read may then fall into more
        # than one window, so inter.feature is switched off below
        windows <- getChromWindows(bam.files, restrictChr = restrictChrs,
                                   binSize = window_size,
                                   stepSize = floor(window_size / 2))
        ignoreMultiMap <- FALSE
    }

    # Shared counting call for both strands.
    # NOTE(review): the function name was missing in the reviewed source; the
    # argument set (features/reads/mode/inter.feature/singleEnd/fragments/
    # preprocess.reads/param/BPPARAM) matches
    # GenomicAlignments::summarizeOverlaps -- confirm.
    countStrand <- function(features) {
        GenomicAlignments::summarizeOverlaps(
            features = features,
            reads = bam.files,
            mode = "IntersectionStrict",
            ignore.strand = FALSE,
            inter.feature = ignoreMultiMap,
            singleEnd = TRUE,
            fragments = FALSE,
            preprocess.reads = func,
            param = bam_param,
            BPPARAM = bp_param)
    }

    # NOTE(review): the element name after `windows$` was missing for the
    # forward strand; `gr.plus` is assumed as counterpart of `gr.minus`.
    fdata <- countStrand(windows$gr.plus)
    rdata <- countStrand(windows$gr.minus)

    # per-sample library totals for each strand
    coldat <- S4Vectors::DataFrame(
        bam.files = bam.files,
        forward.totals = BiocGenerics::colSums(assay(fdata)),
        reverse.totals = BiocGenerics::colSums(assay(rdata)),
        ext = NA,
        rlen = 1L)

    # stack forward and reverse windows into one object
    combined <- SummarizedExperiment::SummarizedExperiment(
        rbind(assay(fdata, "counts"), assay(rdata, "counts")),
        rowRanges = c(rowRanges(fdata), rowRanges(rdata)),
        colData = coldat)

    # drop empty bins
    nonEmpty <- BiocGenerics::rowSums(assay(combined)) > 0
    combined <- combined[nonEmpty, ]
    # Suggestion : Drop bins with counts < threshold ?
    combined$totals <- combined$forward.totals + combined$reverse.totals

    combined
}
#' Detection of Transcription start sites based on local enrichment
#' @rdname detectTSS
#' @param CSobject CapSet object created using \code{\link{newCapSet}} function
#' @param groups a character vector that contains group name of the sample, for replicate-based TSS
#' calling (see example)
#' @param outfile_prefix Output name prefix for the .Rdata file containing window counts, background counts
#' and filtering statistics calculated during TSS detection.
#' @param windowSize Size of the window to bin the genome for TSS detection. By default, a window size of
#' 10 is used for binning the genome, however smaller window sizes can optionally be provided
#' for higher resolution TSS detection. Note that the background size is set to 200x the
#' window size (2kb for 10bp windows) to calculate local enrichment. Subsequently enriched windows
#' are merged, unless the mergeLength is increased.
#' @param sliding TRUE/FALSE. Indicating whether or not to use sliding windows. The windows are shifted by length which
#' is half of the specified window length.
#' @param foldChange Numeric. A fold change cutoff of local enrichment to detect the TSS. If the
#' samples have good signal enrichment over background (inspect in genome browser),
#' a low cutoff of 2-fold can be used. For samples with low sequencing depth it's
#' also desirable to have a low cutoff of 2-fold. The final "score" of detected TSS
#' is the mean fold-change of all merged windows that passed the foldChange cutoff.
#' TSSs can therefore also be filtered using this score after detectTSS is run.
#' @param mergeLength Integer. Merge the windows within this distance that pass the foldChange cutoff.
#' Default (1L) means that only subsequently enriched windows would be merged.
#' @param restrictChr Chromosomes to restrict the analysis to.
#' @param ncores No. of cores/threads to use
#' @param readPos character. position of read to use. Options are "start", "end" and "center".
#' For TSS detection, the "start" of reads are used (default). But center or end might be
#' useful for detecting RNA-binding proteins (in iCLIP-like data)
#' @return .bed files containing TSS position for each group, along with a bed file for consensus
#' (union) TSS sites of all samples.
#' @export
#' @importFrom utils write.table
#' @importFrom S4Vectors aggregate
#' @importFrom SummarizedExperiment mcols mcols<- colData colData<- rowRanges
#' @importFrom csaw readParam strandedCounts regionCounts filterWindows mergeWindows
#' @examples
#' # before running this
#' # 1. Create a CapSet object
#' # 2. de-multiplex the fastqs
#' # 3. map them
#' # 4. filter duplicate reads from mapped BAM
#' # load a previously saved CapSet object
#' cs <- exampleCSobject()
#' # detect TSS (samples in same group are treated as replicates)
#' cs <- detectTSS(cs, groups = rep(c("wt","mut"), each = 2), outfile_prefix = "testTSS",
#' foldChange = 6, restrictChr = "X", ncores = 1)
signature = "CapSet",
readPos) {
# Method body for CapSet objects: counts reads in windows and in a wider
# background region, keeps windows enriched over the background per group,
# merges them into TSS ranges, stores them in CSobject@tss_detected and saves
# the intermediate counts to "<outfile_prefix>.Rdata".
# NOTE(review): the method header/parameter list above `readPos` and several
# expressions and closing braces further down are missing from the text under
# review; the affected spots are marked individually below.
# check whether group and outfile_prefix is provided
if (missing(outfile_prefix))
stop("Please provide outfile_prefix!")
if (missing(groups))
stop("Please provide groups!")
# convert group to char
si <- sampleInfo(CSobject)
# one row per sample (row names = si$samples) with its group label
design <-
data.frame(row.names = si$samples, group = as.character(groups))
# Choose the BAM files: mapped files are the fallback (with a warning),
# filtered files are used otherwise.
# NOTE(review): the condition below is truncated (the test applied to
# `$filtered_file` and the object it belongs to are missing).
if (all($filtered_file))) {
warning("Filtered files not found under sampleInfo(CSobject). Using mapped files")
bam.files <- si$mapped_file
} else {
bam.files <- si$filtered_file
# NOTE(review): the condition inside `any(` is missing on the next line.
if (any( stop("Some or all of the bam files are not defined!")
# every listed BAM file has to exist on disk
if (sum(file.exists(bam.files)) != length(bam.files)) {
stop("One or more bam files don't exist! Check sampleInfo(CSobject) ")
# Counting params
# countall is TRUE when the CapSet is not paired-end; it is passed to
# getBamFlags() here and to numReadsInBed() further down
countall = !(CSobject@paired_end)
bamParams <- Rsamtools::ScanBamParam(
flag = getBamFlags(countAll = countall))
bpParams <- getMCparams(ncores)
# register parallel backend
# NOTE(review): the body of this `if` is not visible in the text under review.
if (!BiocParallel::bpisup(bpParams)) {
# window size
bin_size <- windowSize
# background region size (200 x Window size)
surrounds <- 200*bin_size
## resize to read pos as requested
# switch() has no default branch here, so a readPos other than
# "start"/"end"/"center" leaves ppfunc as NULL
ppfunc <- switch(readPos,
"start" = readsTo5p,
"end" = readsTo3p,
"center" = readsToCenter)
# Count reads into sliding windows
data <- strandBinCounts(bam.files, restrictChr,
bam_param = bamParams,
bp_param = bpParams,
window_size = bin_size,
sliding = sliding,
func = ppfunc)
# add metadata
#mdat <- list(spacing = bin_size, width = bin_size,
# shift = 0, bin = TRUE, final.ext = 1)
#S4Vectors::metadata(data) <- mdat
# label columns by sample and attach the group column to colData
colnames(data) <- rownames(design)
colData(data) <- c(colData(data), design)
# Get counts for background region
# NOTE(review): the expression inside trim() is truncated; only the
# `surrounds, fix = "center"` arguments of an inner call remain, so the
# ranges being widened are not visible here.
neighbors <- suppressWarnings(GenomicRanges::trim(
surrounds, fix = "center")
# NOTE(review): the name of the counting function is missing after `wider <-`;
# the arguments count reads over `neighbors` using the same BAM files,
# read preprocessing and parameters as above.
wider <-
features = neighbors,
reads = bam.files,
mode = "IntersectionStrict",
ignore.strand = FALSE,
inter.feature = FALSE,
singleEnd = TRUE,
fragments = FALSE,
preprocess.reads = ppfunc,
param = bamParams,
BPPARAM = bpParams)
#S4Vectors::metadata(wider) <- mdat
# set totals to same value as data (to avoid error from filterWindows)
colData(wider) <- colData(data)
colnames(wider) <- rownames(design)
# NOTE(review): colData(data) already had `design` appended before it was
# copied, so this appends the design columns a second time -- confirm intended.
colData(wider) <- c(colData(wider), design)
## take out groups --> Generate filter statistics for each group (based on local enrichment)
# one localFilter() call per unique group, on the columns of that group only
# NOTE(review): the end of this lapply() closure is not visible.
filterstat <- lapply(unique(design$group), function(x) {
stat <- localFilter(data[, data$group == x],
wider[, wider$group == x])
# add filter stats as metadata to the data
mcols(data) <- filterstat
# Require X-fold enrichment over local background to keep the window (similar to MACS)
# logFC is compared against log2(foldChange), i.e. foldChange is given on the
# linear scale
keep <- lapply(filterstat, function(x) {
kp <- x$logFC > log2(foldChange)
# NOTE(review): the assignment target of the following lapply() is missing;
# it subsets `data` by each group's logical `keep` vector.
}) <- lapply(keep, function(keep) {
return(data[keep,]) # mcols are carried over
## merge nearby windows (within bin_size) to get broader TSS
## final fold change = avgFC of windows
# NOTE(review): the list being iterated (`seq_along(` / `granges([[d]])`) is
# missing in the next two code lines.
merged <- lapply(seq_along(, function(d) {
dr <- GenomicRanges::granges([[d]])
drm <- mcols(dr)
dr$logFC <- drm[[d]]$logFC
# strand-aware reduce; with.revmap = TRUE records which input windows were
# merged into each output range
dr_reduced <- GenomicRanges::reduce(dr,
min.gapwidth = mergeLength,
ignore.strand = FALSE,
with.revmap = TRUE)
# score of a merged range = mean logFC of its windows
# NOTE(review): the grouping argument of aggregate() is not visible here.
mcols(dr_reduced) <- aggregate(dr,
score = BiocGenerics::mean(logFC))
# update the Capset object
names(merged) <- unique(as.character(groups))
CSobject@tss_detected <- GenomicRanges::GRangesList(merged)
## Calculate prop reads in TSS per group
message("Counting reads within detected TSS")
# union of the TSS ranges of all groups, used to count reads per BAM file
mergedall <- base::Reduce(S4Vectors::union, merged)
si$num_intss <- as.numeric(numReadsInBed(mergedall, bam.files, countall = countall))
sampleInfo(CSobject) <- si
# Add the results as a list and save as .Rdata
# NOTE(review): the name of the first list element (holding `data`) is missing.
output <- list( = data,
counts.background = wider)
if (!(is.null(outfile_prefix))) {
message("Writing filtering information as .Rdata")
save(output, file = paste0(outfile_prefix, ".Rdata"))
#' Export the detected TSS from CapSet object as .bed files
#' @rdname exportTSS
#' @param CSobject The modified CapSet object after running \code{\link{detectTSS}} function
#' @param outfile_prefix Prefix (with path) for output .bed files
#' @param pergroup If TRUE, write output per group of samples
#' @param merged If TRUE, write merged bed file (union of all groups)
#' @return .bed file(s) containing detected TSS.
#' @importFrom rtracklayer export.bed
#' @export
#' @examples
#' # load a previously saved CapSet object
#' cs <- exampleCSobject()
#' # export tss
#' exportTSS(cs, merged = TRUE, outfile_prefix = "testTSS")
signature = "CapSet",
merged) {
# Method body for CapSet objects: writes the ranges stored in
# CSobject@tss_detected to .bed files, per group and/or as one merged file.
# NOTE(review): the method header/parameter list above `merged`, the calls
# wrapping the lines marked below and the closing braces are missing from the
# text under review.
mergedBED <- CSobject@tss_detected
if (isTRUE(pergroup)) {
## write merged output for each group
message("Writing output .bed files per group")
# NOTE(review): the call that applies this function over the
# bedfile/group pairs is not visible here.
function(bedfile, group) {
export.bed(object = bedfile, con = group)
# one output path per element of mergedBED: <outfile_prefix>_<name>.bed
bedfile = mergedBED,
group = paste0(outfile_prefix, "_" , names(mergedBED), ".bed")
if (isTRUE(merged)) {
## write out the union of GRanges
message("Writing merged .bed files")
mergedall <- base::Reduce(S4Vectors::union, mergedBED)
# NOTE(review): the call receiving this `con` argument is not visible;
# the path built here is <outfile_prefix>_merged.bed
con = paste(outfile_prefix, "merged.bed", sep = "_"))
