#' @importFrom data.table fread
#' @importFrom data.table fwrite
#' @importFrom data.table data.table
#' @importFrom data.table as.data.table
#' @importFrom data.table dcast
#' @importFrom data.table merge.data.table
#' @importFrom data.table setorder
#' @importFrom data.table setkey
#' @importFrom data.table setDT
#' @importFrom data.table setattr
#' @importFrom GenomicRanges makeGRangesFromDataFrame
#' @importFrom GenomicRanges seqnames
#' @importFrom GenomicRanges reduce
#' @importFrom BiocGenerics sort
#' @importFrom BiocGenerics width
#' @importFrom stats ecdf
#' @importFrom stats setNames
#' @importFrom stats density
#' @importFrom methods is
#' @importFrom utils globalVariables
#' @importFrom utils packageVersion
#' @importFrom utils head
#' @importFrom Rcpp evalCpp
#' @useDynLib epialleleR, .registration=TRUE
# internal globals, constants and helper functions
#
################################################################################
# Globals, unload
################################################################################
utils::globalVariables(
c(".", ".I", ".N", ":=", "bedmatch", "context", "rname", "start", "strand",
"templid", "FALSE+", "FALSE-", "TRUE+", "TRUE-", "REF", "ALT",
"M+Ref","U+Ref","M+Alt","U+Alt", "M-Ref","U-Ref","M-Alt","U-Alt",
"M+A", "M+C", "M+G", "M+T", "M-A", "M-C", "M-G", "M-T",
"U+A", "U+C", "U+G", "U+T", "U-A", "U-C", "U-G", "U-T",
".SD", "bin", "count", "code", "pos", "cntx", "base", "meth", "x", "y",
"label")
)
.onUnload <- function (libpath) {library.dynam.unload("epialleleR", libpath)}
################################################################################
# Constants
################################################################################
# descr: methylation context to XM bases.
# The "u" and "U" are simply ignored - just as Bismark does
# in order to save us from confusion at the analysis stage
.context.to.bases <- list (
"CG" = list (ctx.meth = "Z", ctx.unmeth = "z",
ooctx.meth = "XH", ooctx.unmeth = "xh"),
"CHG" = list (ctx.meth = "X", ctx.unmeth = "x",
ooctx.meth = "ZH", ooctx.unmeth = "zh"),
"CHH" = list (ctx.meth = "H", ctx.unmeth = "h",
ooctx.meth = "ZX", ooctx.unmeth = "zx"),
"CxG" = list (ctx.meth = "ZX", ctx.unmeth = "zx",
ooctx.meth = "H", ooctx.unmeth = "h"),
"CX" = list (ctx.meth = "ZXH", ctx.unmeth = "zxh",
ooctx.meth = "", ooctx.unmeth = "")
)
################################################################################
# Functions: reading/writing files
################################################################################
# descr: check BAM file before reading (using HTSlib)
# value: list with tags and counters from rcpp_check_bam or error
# if BAM loading is not possible
.checkBam <- function (bam.file,
verbose)
{
if (verbose) message("Checking BAM file: ", appendLF=FALSE)
bam.file <- path.expand(bam.file)
bam.check <- rcpp_check_bam(bam.file)
# predominantly PE:
bam.check$paired <- (bam.check$npaired > bam.check$nrecs/2)
# name-sorted:
bam.check$sorted <- (bam.check$ntempls > 0) &
any(bam.check$ntempls >= c(bam.check$nrecs, bam.check$npaired)%/%2)
# main logic
if (bam.check$nrecs==0) { # no records
stop("Empty file provided! Exiting",
call.=FALSE)
} else if (is.null(bam.check$XG) & !is.null(bam.check$YD)) { # YD but no XG
stop("No XG tags found (though YD tags are there)! BWA-meth alignment?\n",
"If so, make methylation calls using epialleleR::callMethylation.\n",
"Exiting", call.=FALSE)
} else if (is.null(bam.check$XG) & !is.null(bam.check$ZS)) { # ZS but no XG
stop("No XG tags found (though ZS tags are there)! BSMAP alignment?\n",
"If so, make methylation calls using epialleleR::callMethylation.\n",
"Exiting", call.=FALSE)
} else if (is.null(bam.check$XM) & !is.null(bam.check$XG)) { # XG but no XM
stop("No XM tags found! Was methylation called successfully?\n",
"If not, make methylation calls using epialleleR::callMethylation.\n",
"Exiting", call.=FALSE)
} else if (!is.null(bam.check$MM) | !is.null(bam.check$Mm)) { # MM or Mm
bam.check$tagged <- "MM"
} else if (!is.null(bam.check$XG) & !is.null(bam.check$XM)) { # XG and XM
bam.check$tagged <- "XM"
} else {
stop("No known methylation tags found!\n",
"If you believe it's a mistake, please submit an issue at GitHub.\n",
"Exiting", call.=FALSE)
}
if (bam.check$paired & !bam.check$sorted) {# paired but not name-sorted
stop("BAM file seems to be paired-end but not sorted by name!\n",
"Please sort using 'samtools sort -n -o out.bam in.bam'.\n",
"Exiting", call.=FALSE)
}
if (verbose) message(
ifelse(bam.check$tagged=="XM", "short-read, ", "long-read, "),
ifelse(bam.check$paired, "paired-end, ", "single-end, "),
ifelse(bam.check$sorted, "name-sorted", "unsorted"),
" alignment detected", appendLF=TRUE
)
return(bam.check)
}
################################################################################
# descr: reads genomic (bgzipped) FASTA files using HTSlib
# value: list
.readGenome <- function (genome.file,
nthreads,
verbose)
{
if (verbose) message("Reading reference genome file ", appendLF=FALSE)
tm <- proc.time()
genome.file <- path.expand(genome.file)
genome.processed <- rcpp_read_genome(genome.file, nthreads)
if (verbose) message(sprintf("[%.3fs]",(proc.time()-tm)[3]), appendLF=TRUE)
return(genome.processed)
}
################################################################################
# descr: reads BAM files using HTSlib
# value: data.table
.readBam <- function (bam.file,
bam.check,
min.mapq,
min.baseq,
min.prob,
highest.prob,
skip.duplicates,
skip.secondary,
skip.qcfail,
skip.supplementary,
trim,
nthreads,
verbose)
{
if (verbose) message("Reading ", ifelse(bam.check$paired, "paired", "single"),
"-end BAM file ", appendLF=FALSE)
tm <- proc.time()
bam.file <- path.expand(bam.file)
skip.flags <- sum(c(4, 256, 512, 1024, 2048)[ # 4==BAM_FUNMAP
c(TRUE, skip.secondary, skip.qcfail, skip.duplicates, skip.supplementary)])
if (bam.check$tagged=="XM") { # short-read alignment
if (bam.check$paired) { # paired-end
skip.flags <- skip.flags + 8 # 8==BAM_FMUNMAP
bam.processed <- rcpp_read_bam_paired(bam.file, min.mapq, min.baseq,
skip.flags, trim[1], trim[2],
nthreads)
} else { # single-end
bam.processed <- rcpp_read_bam_single(bam.file, min.mapq, min.baseq,
skip.flags, trim[1], trim[2],
nthreads)
}
} else { # long-read alignment
bam.processed <- rcpp_read_bam_mm_single(bam.file, min.mapq, min.baseq,
min.prob, highest.prob,
skip.flags, trim[1], trim[2],
nthreads)
}
data.table::setDT(bam.processed)
bam.processed[,templid:=c(0:(.N-1))]
data.table::setorder(bam.processed, rname, start)
if (verbose) message(sprintf("[%.3fs]",(proc.time()-tm)[3]), appendLF=TRUE)
return(bam.processed)
}
################################################################################
# descr: (fast) reads BED file with amplicons
# value: object of type GRanges
.readBed <- function (bed.file,
zero.based.bed,
verbose)
{
if (verbose) message("Reading BED file ", appendLF=FALSE)
tm <- proc.time()
bed.df <- data.table::fread(file=bed.file, sep="\t", blank.lines.skip=TRUE,
data.table=FALSE)
colnames(bed.df)[seq_len(3)] <- c("chr", "start", "end")
bed <- GenomicRanges::makeGRangesFromDataFrame(
bed.df, keep.extra.columns=TRUE, ignore.strand=TRUE,
starts.in.df.are.0based=zero.based.bed
)
if (verbose) message(sprintf("[%.3fs]",(proc.time()-tm)[3]), appendLF=TRUE)
return(bed)
}
################################################################################
# descr: (fast) reads VCF file
# value: object of VCF type (VariantAnnotation)
.readVcf <- function (vcf.file,
vcf.style,
bed,
verbose)
{
if (verbose) message("Reading VCF file ", appendLF=FALSE)
tm <- proc.time()
vcf.genome <- "unknown"
vcf.param <- VariantAnnotation::ScanVcfParam(info=NA, geno=NA)
if (!is.null(bed)) {
bed <- GenomicRanges::reduce(bed)
bed.style <- GenomeInfoDb::seqlevelsStyle(bed)
if (!is.null(vcf.style))
GenomeInfoDb::seqlevelsStyle(bed) <- vcf.style
bed.levels <- levels(GenomicRanges::seqnames(bed))
vcf.genome <- stats::setNames(rep("unknown",length(bed.levels)), bed.levels)
vcf.param <- VariantAnnotation::ScanVcfParam(info=NA, geno=NA, which=bed)
}
vcf <- VariantAnnotation::readVcf(
file=vcf.file,
genome=vcf.genome,
param=vcf.param
)
if (exists("bed.style")) {
vcf.ranges <- SummarizedExperiment::rowRanges(vcf)
suppressPackageStartupMessages({
GenomeInfoDb::seqlevelsStyle(vcf.ranges) <- bed.style
})
SummarizedExperiment::rowRanges(vcf) <- vcf.ranges
}
if (verbose) message(sprintf("[%.3fs]",(proc.time()-tm)[3]), appendLF=TRUE)
return(vcf)
}
################################################################################
# descr: (fast) writes the report
# value: void
.writeReport <- function (report,
report.file,
gzip,
verbose)
{
if (verbose) message("Writing the report ", appendLF=FALSE)
tm <- proc.time()
data.table::fwrite(report, file=report.file, quote=FALSE, sep="\t",
row.names=FALSE, col.names=TRUE,
compress=if (gzip) "gzip" else "none")
if (verbose) message(sprintf("[%.3fs]",(proc.time()-tm)[3]), appendLF=TRUE)
}
################################################################################
# Functions: processing
################################################################################
# descr: writing out example BAM
# value: number of records written
.simulateBam <- function (output.bam.file,
qname, flag, rname,
pos, mapq, cigar, rnext,
pnext, tlen, seq, qual, ...,
verbose)
{
if (verbose) message("Writing sample BAM ", appendLF=FALSE)
tm <- proc.time()
if (!is.null(output.bam.file)) output.bam.file <- path.expand(output.bam.file)
tags <- list(...)
nrecs <- max(
sapply(c(as.list(environment()), tags), length)[names(match.call())],
1, na.rm=TRUE
)
if (is.null(qname)) qname <- sprintf("q%.04i", seq_len(nrecs))
else qname <- rep_len(qname, nrecs)
if (is.null(flag)) flag <- rep_len(0, nrecs)
else flag <- rep_len(flag, nrecs)
if (is.null(rname)) rname <- factor(rep_len("chrS", nrecs))
else rname <- factor(rep_len(rname, nrecs))
if (is.null(pos)) pos <- rep_len(1, nrecs)
else pos <- rep_len(pos, nrecs)
if (is.null(mapq)) mapq <- rep_len(60, nrecs)
else mapq <- rep_len(mapq, nrecs)
if (is.null(seq)) {
if ("XM" %in% names(tags)) {nbases <- nchar(tags$XM)}
else if (!is.null(tlen)) {nbases <- tlen}
else {nbases <- 10}
seq <- sapply(nbases, function (l) {
paste(sample(c("A","C","T","G"), l, replace=TRUE), collapse="")
})
}
seq <- rep_len(seq, nrecs)
if (is.null(cigar)) cigar <- paste0(nchar(seq), "M")
else cigar <- rep_len(cigar, nrecs)
if (is.null(rnext)) rnext <- factor(rep_len("chrS", nrecs))
else rnext <- factor(rep_len(rnext, nrecs))
if (is.null(pnext)) pnext <- rep_len(1, nrecs)
else pnext <- rep_len(pnext, nrecs)
if (is.null(tlen)) tlen <- nchar(seq)
else tlen <- rep_len(tlen, nrecs)
if (is.null(qual)) qual <- sapply(lapply(nchar(seq), rep_len, x="F"), paste, collapse="")
else qual <- rep_len(qual, nrecs)
header <- c(
sprintf("@SQ\tSN:%s\tLN:%i", levels(rname), max(pos, pnext)+max(tlen)-1),
sprintf("@PG\tID:epialleleR\tPN:epialleleR\tVN:%s\tCL:rcpp_simulate_bam()",
utils::packageVersion("epialleleR"))
)
fields <- data.table::data.table(
qname=qname, flag=flag, tid=as.integer(rname)-1, pos=pos-1, mapq=mapq,
cigar=cigar, mtid=as.integer(rnext)-1, mpos=pnext-1,
isize=tlen, seq=seq, qual=qual
)
i_tags <- data.table::as.data.table(tags[sapply(tags, is.integer)])
i_tags <- i_tags[rep_len(seq_len(nrow(i_tags)), nrecs)]
f_tags <- data.table::as.data.table(tags[sapply(tags, is.double)])
f_tags <- f_tags[rep_len(seq_len(nrow(f_tags)), nrecs)]
s_tags <- data.table::as.data.table(tags[sapply(tags, is.character)])
s_tags <- s_tags[rep_len(seq_len(nrow(s_tags)), nrecs)]
a_tags <- data.table::as.data.table(tags[sapply(tags, is.list)])
a_tags <- a_tags[rep_len(seq_len(nrow(a_tags)), nrecs)]
a_types <- apply(a_tags, 2, function (t) {
t.all <- unlist(t)
t.class <- class(t.all)
if (t.class=="numeric") {
return("f")
} else if (t.class=="integer") {
t.min <- min(t.all)
t.max <- max(t.all)
if (t.min<0 & t.min>-2**7 & t.max<2**7) { # int8_t
return("c")
} else if (t.min>=0 & t.max<2**8) { # uint8_t
return("C")
} else if (t.min<0 & t.min>-2**15 & t.max<2**15) { # int16_t
return("s")
} else if (t.min>=0 & t.max<2**16) { # uint16_t
return("S")
} else if (t.min<0) { # int32_t
return("i")
} else { # uint32_t
return("I")
}
} else if (length(t.all)>0) { # unsupported array tags
stop("BAM file format does not support non-numeric arrays", call.=FALSE)
} else {
return(character(0)) # no array tags
}
})
if (!is.null(output.bam.file))
result <- rcpp_simulate_bam(header, fields,
i_tags, f_tags, s_tags,
a_tags, as.character(a_types),
output.bam.file)
else
result <- cbind(fields, i_tags, f_tags, s_tags, a_tags)
if (verbose) message(sprintf("[%.3fs]",(proc.time()-tm)[3]), appendLF=TRUE)
return(result)
}
################################################################################
# descr: calling methylation (writing XG/XM)
# value: simple statistics and output BAM
.callMethylation <- function (input.bam.file, output.bam.file,
genome, nthreads, verbose)
{
if (verbose) message("Making methylation calls ", appendLF=FALSE)
tm <- proc.time()
input.bam.file <- path.expand(input.bam.file)
output.bam.file <- path.expand(output.bam.file)
bam.check <- rcpp_check_bam(input.bam.file)
if (bam.check$nrecs==0) { # no records
stop("Empty file provided! Exiting", call.=FALSE)
} else if (!is.null(bam.check$XG)) {
tag <- "XG"
} else if (!is.null(bam.check$YD)) {
tag <- "YD"
} else if (!is.null(bam.check$ZS)) {
tag <- "ZS"
} else stop("Unable to call methylation: neither of XG/YD/ZS tags is present",
" (genome strand unknown).\nExiting", call.=FALSE)
result <- rcpp_call_methylation_genome(
input.bam.file, output.bam.file, genome, tag, nthreads
)
if (verbose) message(sprintf("[%.3fs]",(proc.time()-tm)[3]), appendLF=TRUE)
return(result)
}
################################################################################
# descr: apply thresholding criteria to processed BAM reads
# value: bool vector with true for reads passing the threshold
.thresholdReads <- function (bam.processed,
ctx.meth, ctx.unmeth, ooctx.meth, ooctx.unmeth,
min.context.sites, min.context.beta,
max.outofcontext.beta, verbose)
{
if (verbose) message("Thresholding reads ", appendLF=FALSE)
tm <- proc.time()
# fast thresholding, vectorised
pass <- rcpp_threshold_reads(
bam.processed,
ctx.meth, ctx.unmeth, ooctx.meth, ooctx.unmeth,
min.context.sites, min.context.beta, max.outofcontext.beta
)
if (verbose) message(sprintf("[%.3fs]",(proc.time()-tm)[3]), appendLF=TRUE)
return(pass)
}
################################################################################
# descr: matching BED target (amplicon/capture)
# value: numeric vector
.matchTarget <- function (bam.processed, bed, bed.type,
match.tolerance, match.min.overlap)
{
# fast, vectorised
bed.dt <- data.table::as.data.table(bed)
bed.dt[, seqnames := factor(seqnames, levels=levels(bam.processed$rname))]
if (bed.type=="amplicon") {
bed.match <- rcpp_match_amplicon(bam.processed, bed.dt, match.tolerance)
} else if (bed.type=="capture") {
bed.match <- rcpp_match_capture(bam.processed, bed.dt, match.min.overlap)
}
return(bed.match)
}
################################################################################
# Functions: reporting
################################################################################
# descr: prepare cytosine report for processed reads according to filter
# value: data.table with Bismark-like cytosine report
.getCytosineReport <- function (bam.processed,
pass,
ctx,
verbose)
{
if (verbose) message("Preparing cytosine report ", appendLF=FALSE)
tm <- proc.time()
# must be ordered
cx.report <- rcpp_cx_report(bam.processed, pass, ctx)
data.table::setDT(cx.report)
if (verbose) message(sprintf("[%.3fs]",(proc.time()-tm)[3]), appendLF=TRUE)
return(cx.report)
}
################################################################################
# descr: prepare lMHL report for processed reads
# value: data.table with lMHL report
.getMhlReport <- function (bam.processed,
ctx, max.window, min.length, max.ooctx.beta,
verbose)
{
if (verbose) message("Preparing lMHL report ", appendLF=FALSE)
tm <- proc.time()
# must be ordered
mhl.report <- rcpp_mhl_report(bam.processed, ctx, max.window, min.length,
max.ooctx.beta)
data.table::setDT(mhl.report)
if (verbose) message(sprintf("[%.3fs]",(proc.time()-tm)[3]), appendLF=TRUE)
return(mhl.report)
}
################################################################################
# descr: BED-assisted (amplicon/capture) report
.getBedReport <- function (bam.processed, pass, bed, bed.type,
match.tolerance, match.min.overlap,
verbose)
{
if (verbose) message("Preparing ", bed.type, " report ", appendLF=FALSE)
tm <- proc.time()
bam.subset <- data.table::data.table(
strand=bam.processed$strand,
bedmatch=.matchTarget(bam.processed=bam.processed, bed=bed,
bed.type=bed.type, match.tolerance=match.tolerance,
match.min.overlap=match.min.overlap),
pass=factor(pass, levels=c(TRUE,FALSE))
)
data.table::setkey(bam.subset, bedmatch)
bam.dt <- data.table::dcast(
bam.subset[, list(nreads=.N), by=list(bedmatch,strand,pass), nomatch=0],
bedmatch~pass+strand, value.var=c("nreads"), sep="", drop=FALSE, fill=0
)
bam.dt[,`:=` (`nreads+`=`FALSE+`+`TRUE+`,
`nreads-`=`FALSE-`+`TRUE-`,
VEF=(`TRUE+`+`TRUE-`)/(`FALSE+`+`TRUE+`+`FALSE-`+`TRUE-`) )]
bed.dt <- data.table::as.data.table(bed)
# bed.cl <- colnames(bed.dt)
bed.dt[, bedmatch:=.I]
bed.report <- data.table::merge.data.table(bed.dt, bam.dt, by="bedmatch",
all=TRUE)[order(bedmatch)]
if (verbose) message(sprintf("[%.3fs]",(proc.time()-tm)[3]), appendLF=TRUE)
return(bed.report[,setdiff(names(bed.report),
c("bedmatch","FALSE+","FALSE-","TRUE+","TRUE-")),
with=FALSE])
}
################################################################################
# descr: calculates beta values and returns ECDF functions for BED file entries
# value: list of lists with context and out-of-context ECDF functions
.getBedEcdf <- function (bam.processed, bed, bed.type, bed.rows,
match.tolerance, match.min.overlap,
ctx.meth, ctx.unmeth, ooctx.meth, ooctx.unmeth,
verbose)
{
if (verbose) message("Computing ECDFs for within- and out-of-context",
" per-read beta values ", appendLF=FALSE)
tm <- proc.time()
bed.match <- .matchTarget(bam.processed=bam.processed, bed=bed,
bed.type=bed.type, match.tolerance=match.tolerance,
match.min.overlap=match.min.overlap)
# Rcpp::sourceCpp("rcpp_get_xm_beta.cpp")
ctx.beta <- rcpp_get_xm_beta(bam.processed, ctx.meth, ctx.unmeth)
ooctx.beta <- rcpp_get_xm_beta(bam.processed, ooctx.meth, ooctx.unmeth)
all.bed.rows <- sort(unique(bed.match), na.last=TRUE)
if (is.null(bed.rows))
bed.rows <- all.bed.rows
else
bed.rows <- intersect(bed.rows, all.bed.rows)
bed.match.isna <- is.na(bed.match)
bed.ecdf <- lapply(bed.rows, function (n) {
matched.rows <- if (is.na(n))
c(bed.match.isna)
else
c(!bed.match.isna & bed.match==n)
return(c(context=stats::ecdf(ctx.beta[matched.rows]),
out.of.context=stats::ecdf(ooctx.beta[matched.rows])))
})
names(bed.ecdf) <- as.character(as.character(bed)[bed.rows])
if (verbose) message(sprintf("[%.3fs]",(proc.time()-tm)[3]), appendLF=TRUE)
return(bed.ecdf)
}
################################################################################
# descr: calculates base frequences at particular positions
# value: data.table with base freqs
.getBaseFreqReport <- function (bam.processed, pass, vcf,
verbose)
{
if (verbose) message("Extracting base frequences ", appendLF=FALSE)
tm <- proc.time()
vcf.ranges <- BiocGenerics::sort(SummarizedExperiment::rowRanges(vcf))
vcf.ranges <- vcf.ranges[BiocGenerics::width(vcf.ranges)==1 &
vapply(as.character(vcf.ranges$ALT),
nchar,
FUN.VALUE=numeric(1), USE.NAMES=FALSE)==1]
# GenomeInfoDb::seqlevels(vcf.ranges, pruning.mode="coarse") <-
# levels(bam.processed$rname)
vcf.dt <- data.table::as.data.table(vcf.ranges)
vcf.dt[, seqnames := factor(seqnames, levels=levels(bam.processed$rname))]
if (all(is.na(vcf.dt$seqnames)))
stop("Looks like seqlevels styles of BAM and VCF don't match. ",
"Please provide VCF as an object with correct seqlevels.")
freqs <- rcpp_get_base_freqs(bam.processed, pass, vcf.dt)
colnames(freqs) <- c("U+A","U+C","U+G","U+T","U+N",
"U-A","U-C","U-G","U-T","U-N",
"M+A","M+C","M+G","M+T","M+N",
"M-A","M-C","M-G","M-T","M-N")
bf.report <- data.table::data.table(
name=names(vcf.ranges),
vcf.dt[,.(seqnames, range=start, REF, ALT)],
freqs[,grep("[ACTG]$",colnames(freqs))]
)
bf.report[REF=="A" & ALT=="C", `:=` (`M+Ref`=`M+A`, `U+Ref`=`U+A`, `M-Ref`=`M-A`, `U-Ref`=`U-A`,
`M+Alt`=`M+C`+`M+T`, `U+Alt`=`U+C`+`U+T`, `M-Alt`=`M-C`, `U-Alt`=`U-C` )]
bf.report[REF=="A" & ALT=="T", `:=` (`M+Ref`=`M+A`, `U+Ref`=`U+A`, `M-Ref`=`M-A`, `U-Ref`=`U-A`,
`M+Alt`=`M+T`, `U+Alt`=`U+T`, `M-Alt`=`M-T`, `U-Alt`=`U-T` )]
bf.report[REF=="A" & ALT=="G", `:=` (`M+Ref`=`M+A`, `U+Ref`=`U+A`, `M-Ref`=NA, `U-Ref`=NA,
`M+Alt`=`M+G`, `U+Alt`=`U+G`, `M-Alt`=NA, `U-Alt`=NA )]
bf.report[REF=="C" & ALT=="A", `:=` (`M+Ref`=`M+C`+`M+T`, `U+Ref`=`U+C`+`U+T`, `M-Ref`=`M-C`, `U-Ref`=`U-C`,
`M+Alt`=`M+A`, `U+Alt`=`U+A`, `M-Alt`=`M-A`, `U-Alt`=`U-A` )]
bf.report[REF=="C" & ALT=="T", `:=` (`M+Ref`=NA, `U+Ref`=NA, `M-Ref`=`M-C`, `U-Ref`=`U-C`,
`M+Alt`=NA, `U+Alt`=NA, `M-Alt`=`M-T`, `U-Alt`=`U-T` )]
bf.report[REF=="C" & ALT=="G", `:=` (`M+Ref`=`M+C`+`M+T`, `U+Ref`=`U+C`+`U+T`, `M-Ref`=`M-C`, `U-Ref`=`U-C`,
`M+Alt`=`M+G`, `U+Alt`=`U+G`, `M-Alt`=`M-A`+`M-G`, `U-Alt`=`U-A`+`U-G`)]
bf.report[REF=="T" & ALT=="A", `:=` (`M+Ref`=`M+T`, `U+Ref`=`U+T`, `M-Ref`=`M-T`, `U-Ref`=`U-T`,
`M+Alt`=`M+A`, `U+Alt`=`U+A`, `M-Alt`=`M-A`, `U-Alt`=`U-A` )]
bf.report[REF=="T" & ALT=="C", `:=` (`M+Ref`=NA, `U+Ref`=NA, `M-Ref`=`M-T`, `U-Ref`=`U-T`,
`M+Alt`=NA, `U+Alt`=NA, `M-Alt`=`M-C`, `U-Alt`=`U-C` )]
bf.report[REF=="T" & ALT=="G", `:=` (`M+Ref`=`M+T`, `U+Ref`=`U+T`, `M-Ref`=`M-T`, `U-Ref`=`U-T`,
`M+Alt`=`M+G`, `U+Alt`=`U+G`, `M-Alt`=`M-A`+`M-G`, `U-Alt`=`U-A`+`U-G`)]
bf.report[REF=="G" & ALT=="A", `:=` (`M+Ref`=`M+G`, `U+Ref`=`U+G`, `M-Ref`=NA, `U-Ref`=NA,
`M+Alt`=`M+A`, `U+Alt`=`U+A`, `M-Alt`=NA, `U-Alt`=NA )]
bf.report[REF=="G" & ALT=="C", `:=` (`M+Ref`=`M+G`, `U+Ref`=`U+G`, `M-Ref`=`M-A`+`M-G`, `U-Ref`=`U-A`+`U-G`,
`M+Alt`=`M+C`+`M+T`, `U+Alt`=`U+C`+`U+T`, `M-Alt`=`M-C`, `U-Alt`=`U-C` )]
bf.report[REF=="G" & ALT=="T", `:=` (`M+Ref`=`M+G`, `U+Ref`=`U+G`, `M-Ref`=`M-A`+`M-G`, `U-Ref`=`U-A`+`U-G`,
`M+Alt`=`M+T`, `U+Alt`=`U+T`, `M-Alt`=`M-T`, `U-Alt`=`U-T` )]
bf.report[, `:=` (SumRef=rowSums(bf.report[,.(`M+Ref`,`U+Ref`,`M-Ref`,`U-Ref`)], na.rm=TRUE),
SumAlt=rowSums(bf.report[,.(`M+Alt`,`U+Alt`,`M-Alt`,`U-Alt`)], na.rm=TRUE))]
# FEp <- function (x) { if (any(is.na(x))) NA else stats::fisher.test(matrix(x, nrow=2))$p.value }
# bf.report[, `:=` (`FEp+`=apply(bf.report[,.(`M+Ref`,`U+Ref`,`M+Alt`,`U+Alt`)], 1, FEp),
# `FEp-`=apply(bf.report[,.(`M-Ref`,`U-Ref`,`M-Alt`,`U-Alt`)], 1, FEp))]
bf.report[, `:=` (`FEp+`=rcpp_fep(bf.report, c("M+Ref","U+Ref","M+Alt","U+Alt")),
`FEp-`=rcpp_fep(bf.report, c("M-Ref","U-Ref","M-Alt","U-Alt")))]
if (verbose) message(sprintf("[%.3fs]",(proc.time()-tm)[3]), appendLF=TRUE)
return(bf.report)
}
################################################################################
# descr: extracts methylation patterns for particular range
# value: data.table with patterns
.getPatterns <- function (bam.processed, bed, bed.row, match.min.overlap,
extract.context, min.context.freq,
clip.patterns, strand.offset, highlight.positions,
verbose)
{
if (verbose) message("Extracting methylation patterns ", appendLF=FALSE)
tm <- proc.time()
bed.dt <- data.table::as.data.table(bed)[bed.row]
bed.dt[, `:=` (seqnames = factor(seqnames, levels=levels(bam.processed$rname)),
strand = "*")]
highlight.positions <- sort(unique(
highlight.positions[highlight.positions>=bed.dt$start &
highlight.positions<=bed.dt$end]
))
patterns <- rcpp_extract_patterns(bam.processed,
as.integer(bed.dt$seqnames),
as.integer(bed.dt$start),
as.integer(bed.dt$end),
match.min.overlap, extract.context,
min.context.freq,
clip.patterns, strand.offset,
highlight.positions)
data.table::setDT(patterns)
colnames(patterns) <- sub("^X([0-9]+)$", "\\1", colnames(patterns))
data.table::setattr(patterns, "bed", as.character(bed)[bed.row])
if (verbose) message(sprintf("[%.3fs]",(proc.time()-tm)[3]), appendLF=TRUE)
return(patterns)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.