###################################################################
#'
#' Generate a peak x cell UMI count matrix
#'
#' Generates a UMI count matrix where rows are peaks and columns are cells. Only counts reads from
#' cells identified through a provided 'whitelist' of cell barcodes. If alignment was done using CellRanger,
#' this will be, for example, the barcodes.tsv file contained in the 'filtered_gene_matrices_mex' folder.
#'
#' @param peak.sites.file a file containing peak coordinates generated by FindPeaks
#' @param gtf.file reference (GTF) file
#' @param bamfile scRNA-seq BAM file
#' @param whitelist.file file of cell barcodes to count
#' @param output.dir name of directory to write output (will be created if it doesn't exist)
#' @param countUMI whether to count UMIs (default: TRUE)
#' @param ncores Number of cores for multithreading
#' @param chr.names a list of valid chromosome names to use (default: NULL)
#' @param filter.chr whether to filter the GTF file for chromosome names (default: FALSE)
#' @param gene.symbol.ref field in the GTF file containing the gene symbol
#' @param CBtag cell barcode tag identifier present in BAM file. Default 'CB'.
#' @param UMItag UMI barcode tag identifier present in BAM file. Default 'UB'.
#' @return NULL. Writes counts to file.
#' @examples
#'
#' extdata_path <- system.file("extdata",package = "Sierra")
#' reference.file <- paste0(extdata_path,"/Vignette_cellranger_genes_subset.gtf")
#'
#' bamfile <- c(paste0(extdata_path,"/Vignette_example_TIP_sham.bam"),
#'              paste0(extdata_path,"/Vignette_example_TIP_mi.bam"))
#'
#' whitelist.bc.file <- paste0(extdata_path,"/example_TIP_sham_whitelist_barcodes.tsv")
#'
#' peak.merge.output.file <- paste0(extdata_path, "/TIP_merged_peaks.txt")
#' count.dirs <- c("example_TIP_sham_counts", "example_TIP_MI_counts")
#'
#' \dontrun{
#' CountPeaks(peak.sites.file = peak.merge.output.file,
#'            gtf.file = reference.file,
#'            bamfile = bamfile[1],
#'            whitelist.file = whitelist.bc.file[1],
#'            output.dir = count.dirs[1],
#'            countUMI = TRUE,
#'            ncores = 1)
#' }
#'
#'
#' @importFrom magrittr "%>%"
#' @importFrom foreach "%dopar%"
#' @importFrom Matrix writeMM
#' @import utils
#'
#' @export
CountPeaks <- function(peak.sites.file,
                       gtf.file,
                       bamfile,
                       whitelist.file,
                       output.dir,
                       countUMI = TRUE,
                       ncores = 1,
                       chr.names = NULL,
                       filter.chr = FALSE,
                       gene.symbol.ref = 'gene_name',
                       CBtag = 'CB',
                       UMItag = 'UB')
{
lock <- tempfile()
whitelist.bc <- read.table(whitelist.file, stringsAsFactors = FALSE)
whitelist.bc <- whitelist.bc[,1]
n.bcs <- length(whitelist.bc)
message("There are ", n.bcs, " whitelist barcodes.")
n.columns <- n.bcs + 1
# read in gene reference
genes.ref <- make_reference(gtf_file = gtf.file, chr.names = chr.names, filter.chr = filter.chr, gene.symbol.ref = gene.symbol.ref)
chr.names <- as.character(unique(genes.ref$chr))
n.genes <- nrow(genes.ref)
peak.sites <- read.table(peak.sites.file, header = T, sep = "\t", quote = '',
stringsAsFactors = FALSE)
# Count the peaks
n.total.sites <- nrow(peak.sites)
message("There are ", n.total.sites, " sites")
message("Doing counting for each site...")
# Set up multiple workers
system.name <- Sys.info()['sysname']
new_cl <- FALSE
if (system.name == "Windows") {
new_cl <- TRUE
cluster <- parallel::makePSOCKcluster(rep("localhost", ncores))
doParallel::registerDoParallel(cluster)
} else {
doParallel::registerDoParallel(cores=ncores)
}
#print(chr.names)
mat.to.write <- foreach::foreach(each.chr = chr.names, .combine = 'rbind', .packages=c("magrittr")) %dopar% {
mat.per.chr <- c()
message("Processing chr: ", each.chr)
for(strand in c(1, -1) ) {
message(" and strand ", strand)
isMinusStrand <- if(strand==1) FALSE else TRUE
peak.sites.chr <- dplyr::filter(peak.sites, Chr == each.chr & Strand == strand) %>%
dplyr::select(Gene, Chr, Fit.start, Fit.end, Strand)
peak.sites.chr$Fit.start <- as.integer(peak.sites.chr$Fit.start)
peak.sites.chr$Fit.end <- as.integer(peak.sites.chr$Fit.end)
peak.sites.chr <- dplyr::filter(peak.sites.chr, Fit.start < Fit.end)
# If there are no sites in this range, then just keep going
if(nrow(peak.sites.chr) == 0) {
next
}
which <- GenomicRanges::GRanges(seqnames = each.chr, ranges = IRanges::IRanges(1, max(peak.sites.chr$Fit.end) ))
param <- Rsamtools::ScanBamParam(tag=c(CBtag, UMItag),
which = which,
flag=Rsamtools::scanBamFlag(isMinusStrand=isMinusStrand))
aln <- GenomicAlignments::readGAlignments(bamfile, param=param)
nobarcodes <- which(unlist(is.na(GenomicRanges::mcols(aln)[CBtag])))
noUMI <- which(unlist(is.na(GenomicRanges::mcols(aln)[UMItag])))
to.remove <- dplyr::union(nobarcodes, noUMI)
if (length(to.remove) > 0) {
aln <- aln[-to.remove]
}
whitelist.pos <- which(unlist(GenomicRanges::mcols(aln)[CBtag]) %in% whitelist.bc)
aln <- aln[whitelist.pos]
# Check that at least one whitelisted barcode was identified.
if (length(aln) == 0) {
next
}
# De-duplicate UMIs by keeping only the first read seen
# for each cell barcode + UMI combination
if(countUMI) {
GenomicRanges::mcols(aln)$CB_UB <- paste0(unlist(GenomicRanges::mcols(aln)[CBtag]),
"_", unlist(GenomicRanges::mcols(aln)[UMItag]))
uniqUMIs <- which(!duplicated(GenomicRanges::mcols(aln)$CB_UB))
aln <- aln[uniqUMIs]
}
aln <- GenomicRanges::split(aln, unlist(GenomicRanges::mcols(aln)[CBtag]))
polyA.GR <- GenomicRanges::GRanges(seqnames = peak.sites.chr$Chr,
IRanges::IRanges(start = peak.sites.chr$Fit.start,
end = as.integer(peak.sites.chr$Fit.end)))
n.polyA <- length(polyA.GR)
barcodes.gene <- names(aln)
res <- sapply(barcodes.gene, function(x) GenomicRanges::countOverlaps(polyA.GR, aln[[x]]))
# Reorder the columns of the res matrix to match the whitelist barcodes
res.mat <- matrix(0L, nrow = n.polyA, ncol = n.bcs)
res.mat[,match(barcodes.gene, whitelist.bc)] <- res
# Return a sparse matrix
mat.per.strand <- Matrix::Matrix(res.mat, sparse = TRUE)
polyA.ids <- paste0(peak.sites.chr$Gene, ":", peak.sites.chr$Chr, ":", peak.sites.chr$Fit.start,
"-", peak.sites.chr$Fit.end, ":", peak.sites.chr$Strand )
rownames(mat.per.strand) <- polyA.ids
# Need to combine the two matrices from each strand
if(is.null(mat.per.chr)) {
mat.per.chr <- mat.per.strand
} else {
mat.per.chr <- rbind(mat.per.chr, mat.per.strand)
}
} # Loop for strand
# Return sparse matrix for each chromosome for combining across all threads
return(mat.per.chr)
} # Loop for chr
if (new_cl) { ## Shut down cluster if on Windows
## stop cluster
parallel::stopCluster(cluster)
}
if (!dir.exists(output.dir)){
dir.create(output.dir)
}
Matrix::writeMM(mat.to.write, file = paste0(output.dir, "/matrix.mtx"))
write.table(whitelist.bc, file = paste0(output.dir, "/barcodes.tsv"), quote = FALSE, row.names = FALSE, col.names = FALSE)
write.table(rownames(mat.to.write), file = paste0(output.dir, "/sitenames.tsv"), quote = FALSE, row.names = FALSE, col.names = FALSE)
## Compress the output files
R.utils::gzip(paste0(output.dir, "/matrix.mtx"), overwrite = TRUE)
R.utils::gzip(paste0(output.dir, "/barcodes.tsv"), overwrite = TRUE)
R.utils::gzip(paste0(output.dir, "/sitenames.tsv"), overwrite = TRUE)
} # End function
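## Illustrative sketch (not part of the package API): the three files written
## by CountPeaks can be reassembled into a labelled peak x cell sparse matrix.
## The directory name is taken from the examples above and is an assumption;
## the package's ReadPeakCounts() does this for you. The `if (FALSE)` guard
## keeps the sketch from executing when the package is loaded.
if (FALSE) {
  count.dir <- "example_TIP_sham_counts"
  counts <- Matrix::readMM(gzfile(paste0(count.dir, "/matrix.mtx.gz")))
  rownames(counts) <- readLines(gzfile(paste0(count.dir, "/sitenames.tsv.gz")))
  colnames(counts) <- readLines(gzfile(paste0(count.dir, "/barcodes.tsv.gz")))
  counts[1:5, 1:5]  # peak x cell UMI counts
}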
###################################################################
#'
#' Collapse positions into exon blocks
#'
#' Helper function that collapses a sorted vector of genomic positions into
#' a string of '(start,end)' exon blocks, breaking wherever consecutive
#' positions differ by more than one.
#'
#' @param x a sorted vector of genomic positions
#' @return a string of '(start,end)' exon blocks
#' @examples
#' \dontrun{
#' make_exons(c(100, 101, 102, 200, 201, 202))
#' }
#'
#'
make_exons <- function(x) {
  to.write <- paste0("(", as.character(x[1]), ",")
  niters <- length(x) - 1
  for (i in 1:niters) {
    if (x[i+1] - x[i] == 1) {
      next
    } else {
      to.write <- paste0(to.write, as.character(x[i]), ")(",
                         as.character(x[i+1]), ",")
    }
  }
  to.write <- paste0(to.write, x[niters+1], ")")
  return(to.write)
}
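## Quick illustration of make_exons: contiguous runs of positions collapse
## into '(start,end)' blocks, splitting wherever a gap larger than 1 occurs.
## Guarded with `if (FALSE)` so it never runs at package load.
if (FALSE) {
  make_exons(c(100, 101, 102, 200, 201, 202))
  ## returns "(100,102)(200,202)"
}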
###################################################################
#'
#' Build gene start-end reference from a GTF file
#'
#' Takes a GTF file as input and creates a table of chromosome start-end
#' positions for each gene. Works with GTF files downloaded from the 10x Genomics website.
#'
#' @param gtf_file a GTF file
#' @param chr.names a list of valid chromosome names to use
#' @param filter.chr whether to filter chromosomes in the GTF file
#' @param gene.symbol.ref field in the GTF file containing the gene symbol
#'
make_reference <- function(gtf_file,
                           chr.names = NULL,
                           filter.chr = FALSE,
                           gene.symbol.ref = 'gene_name') {
  ## Read in the GTF file
  gtf_gr <- rtracklayer::import(gtf_file)
  gtf_TxDb <- GenomicFeatures::makeTxDbFromGFF(gtf_file, format = "gtf")
  ## Build a table of gene start-end positions
  genes <- GenomicFeatures::genes(gtf_TxDb)
  genes.ref <- as.data.frame(genes)
  rownames(genes.ref) <- genes.ref$gene_id
  ## Build a unique map of Ensembl ID to gene symbol
  ensembl.symbol.map <- data.frame(EnsemblID = gtf_gr@elementMetadata@listData$gene_id,
                                   GeneName = gtf_gr@elementMetadata@listData[[gene.symbol.ref]],
                                   stringsAsFactors = FALSE)
  ensembl.symbol.map %>% dplyr::distinct(EnsemblID, .keep_all = TRUE) -> ensembl.symbol.map.unique
  rownames(ensembl.symbol.map.unique) <- ensembl.symbol.map.unique$EnsemblID
  ## Add gene symbol to the gene table
  overlapping.gene.ids <- dplyr::intersect(rownames(genes.ref), ensembl.symbol.map.unique$EnsemblID)
  ensembl.symbol.map.unique <- ensembl.symbol.map.unique[overlapping.gene.ids, ]
  genes.ref$Gene <- ensembl.symbol.map.unique$GeneName
  ## Update column names in the reference table
  colnames(genes.ref) <- c("chr", "start", "end", "width", "strand", "EnsemblID", "Gene")
  genes.ref <- genes.ref[, c("EnsemblID", "chr", "start", "end", "strand", "Gene")]
  genes.ref$strand <- plyr::mapvalues(genes.ref$strand, from = c("-", "+"), to = c("-1", "1"))
  ## Filter the chromosome names
  if (filter.chr) {
    if (!is.null(chr.names)) {
      # Use provided chromosome names
      chr.use <- chr.names
    } else {
      # Assume we're working with human or mouse
      chr.use1 <- as.character(c(1:22, c("X", "Y", "MT")))
      chr.use2 <- paste0("chr", as.character(c(1:22, c("X", "Y", "MT"))))
      chr.use <- c(chr.use1, chr.use2)
    }
    genes.ref.use <- subset(genes.ref, chr %in% chr.use)
    genes.ref.use$chr <- droplevels(genes.ref.use$chr)
    if (nrow(genes.ref.use) == 0) {
      warning("No entries were left after filtering GTF for chromosome names. Reverting to original entries.")
      genes.ref.use <- genes.ref
    }
  } else {
    # Use GTF as is
    genes.ref.use <- genes.ref
  }
  return(genes.ref.use)
}
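## Sketch of make_reference usage with the vignette GTF shipped in extdata.
## The result has one row per gene with columns EnsemblID, chr, start, end,
## strand (coded "1"/"-1") and Gene. Guarded so it never runs at package load.
if (FALSE) {
  extdata_path <- system.file("extdata", package = "Sierra")
  gtf.file <- paste0(extdata_path, "/Vignette_cellranger_genes_subset.gtf")
  genes.ref <- make_reference(gtf_file = gtf.file, filter.chr = TRUE)
  head(genes.ref)
}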
###################################################################
#'
#' Fit a Gaussian curve to the coverage
#'
#' Given read coverage data, fit a Gaussian curve using either
#' a nonlinear least squares (NLS) or maximum likelihood (MLE) fit.
#'
#' @param fit.data read coverage to fit
#' @param maxval maximum read coverage, used to initialise the peak height
#' @param fit.method either 'NLS' or 'MLE'
#' @param mu initialised value for the centre of the peak (default: 300)
#'
#'
fit_gaussian <- function(fit.data, maxval, fit.method, mu = 300) {
  if (fit.method == "NLS") {
    nls.res <- NULL
    tryCatch({
      nls.res <- nls(y ~ k * exp(-1/2 * (x - mu)^2 / sigma^2),
                     start = c(mu = mu, sigma = 100, k = maxval), data = fit.data)
    }, error = function(err) { })
    return(nls.res)
  } else if (fit.method == "MLE") {
    x <- fit.data$x
    y <- fit.data$y
    # Negative log-likelihood function for the MLE fit
    LL <- function(mu, sigma, k) {
      R <- dnorm(y, k * exp(-1/2 * (x - mu)^2 / sigma^2), 10, log = TRUE)
      return(-sum(R))
    }
    mle.fit <- NULL
    tryCatch({
      # stats4::mle avoids relying on the stats4 package being attached
      mle.fit <- stats4::mle(LL, start = list(mu = mu, sigma = 100, k = maxval))
    }, error = function(err) { })
    return(mle.fit)
  } else {
    stop("Fit method needs to be either NLS or MLE")
  }
}
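## Sketch of fit_gaussian on simulated coverage: a Gaussian bump centred at
## position 300 (the default mu) plus noise. With fit.method = "NLS" the
## return value is an nls object, or NULL if the fit fails. Guarded so it
## never runs at package load.
if (FALSE) {
  set.seed(1)
  x <- 1:600
  y <- 50 * exp(-1/2 * (x - 300)^2 / 80^2) + abs(rnorm(600, sd = 2))
  fit <- fit_gaussian(data.frame(x = x, y = y), maxval = max(y), fit.method = "NLS")
  if (!is.null(fit)) coef(fit)  # estimates of mu, sigma and k
}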
###################################################################
#'
#' Perform splice-aware peak calling on a BAM file produced from a scRNA-seq experiment
#'
#' Takes as input a BAM file produced from a barcoded scRNA-seq experiment, the reference (GTF) file used during alignment and
#' a BED file of junctions produced by regtools. For each gene in the reference file, the peak-calling process first splits
#' the read coverage into 'across junction' and 'no junction' subsets. Within each subset, the site of maximum coverage
#' is identified and a peak is called by fitting a Gaussian to the read coverage in a 600bp window around this site.
#' After a peak is called, the local read coverage is removed and the next site of maximum coverage is identified. This process
#' runs iteratively until at least one of two stopping criteria is reached. The first criterion is on the remaining read
#' coverage, defined by a minimum cutoff (min.cov.cutoff) and a minimum proportion of the total coverage (min.cov.prop).
#' The second criterion is on the height of the current peak, with both an absolute threshold (min.peak.cutoff) and a
#' threshold relative to the maximum peak height for the gene (min.peak.prop).
#'
#' @param output.file file to write the called peak coordinates to
#' @param gtf.file reference (GTF) file
#' @param bamfile scRNA-seq BAM file
#' @param junctions.file BED file (as produced by regtools) or SJ.out.tab file (STAR aligner) containing splice junction coordinates
#' @param min.jcutoff minimum number of spliced reads across a junction for it to be considered (default: 50).
#' @param min.jcutoff.prop minimum proportion of junction reads out of all junction reads for that gene (default: 0.05)
#' @param min.cov.cutoff minimum number of reads to consider a peak (default: 500)
#' @param min.cov.prop minimum proportion of reads to consider a peak (default: 0.05)
#' @param min.peak.cutoff minimum peak height (default: 200)
#' @param min.peak.prop minimum ratio of current peak height relative to maximum peak height for this gene (default: 0.05)
#' @param ncores number of cores to use
#' @param chr.names a list of valid chromosome names to use (default: NULL)
#' @param filter.chr whether to filter the GTF file for chromosome names (default: FALSE)
#' @param gene.symbol.ref field in the GTF file containing the gene symbol
#' @param fit.method method for fitting the Gaussian: either 'NLS' or 'MLE' (default: 'NLS')
#' @return NULL. Writes the called peaks to output.file.
#' @examples
#'
#' extdata_path <- system.file("extdata",package = "Sierra")
#' reference.file <- paste0(extdata_path,"/Vignette_cellranger_genes_subset.gtf")
#' junctions.file <- paste0(extdata_path,"/Vignette_example_TIP_sham_junctions.bed")
#'
#' bamfile <- c(paste0(extdata_path,"/Vignette_example_TIP_sham.bam"),
#'              paste0(extdata_path,"/Vignette_example_TIP_mi.bam"))
#'
#' peak.output.file <- c("Vignette_example_TIP_sham_peaks.txt", "Vignette_example_TIP_MI_peaks.txt")
#'
#'
#' FindPeaks(output.file = peak.output.file[1], gtf.file = reference.file,
#'           bamfile = bamfile[1], junctions.file = junctions.file)
#'
#' @importFrom magrittr "%>%"
#' @importFrom foreach "%dopar%"
#' @import GenomicRanges
#'
#' @export
#'
FindPeaks <- function(output.file,
                      gtf.file,
                      bamfile,
                      junctions.file,
                      min.jcutoff = 50,
                      min.jcutoff.prop = 0.05,
                      min.cov.cutoff = 500,
                      min.cov.prop = 0.05,
                      min.peak.cutoff = 200,
                      min.peak.prop = 0.05,
                      ncores = 1,
                      chr.names = NULL,
                      filter.chr = FALSE,
                      gene.symbol.ref = 'gene_name',
                      fit.method = "NLS") {
if(!fit.method %in% c("NLS", "MLE") ) { stop("fit.method needs to be either NLS or MLE. ")}
lock <- tempfile()
## Read in the gtf file
genes.ref = make_reference(gtf_file = gtf.file, chr.names = chr.names, filter.chr = filter.chr, gene.symbol.ref = gene.symbol.ref)
n.genes = nrow(genes.ref)
message(paste(n.genes, "gene entries to process"))
# Initiate the output file
write("Gene\tChr\tStrand\tMaxPosition\tFit.max.pos\tFit.start\tFit.end\tmu\tsigma\tk\texon/intron\texon.pos\tLogLik",
file = output.file)
## Read in junctions from either bed file or SJ file from STAR aligner
# step1 determine file type by extension
idx.star <- grep(pattern="SJ\\.out\\.tab(\\.gz)?$", x=junctions.file)
idx.bed <- grep(pattern="\\.bed(\\.gz)?$", x=junctions.file)
junctions <- read.table(junctions.file, sep = "\t",header = FALSE)
if (length(idx.star) > 0)
{
junctions.GR <- GenomicRanges::GRanges(seqnames = junctions$V1,
IRanges::IRanges(start =junctions$V2, end = junctions$V3),
counts = junctions$V8)
}
else if(length(idx.bed) > 0)
{
junctions <- cbind(junctions,
reshape2::colsplit(junctions$V11, ",", c("blocks1","blocks2")))
junctions$start <- junctions$V2+junctions$blocks1
junctions$end <- junctions$V3-junctions$blocks2
junctions.GR <- GenomicRanges::GRanges(seqnames = junctions$V1,
IRanges::IRanges(start = junctions$start,
end = junctions$end), counts = junctions$V5)
}
else # unrecognisable format
{
warning(paste0("Unrecognisable junction file format.\n",
"File must have either bed or SJ.out.tab extension.\n",
"Cannot continue to findPeaks\n"))
return(NULL)
}
# Set up multiple workers
system.name <- Sys.info()['sysname']
new_cl <- FALSE
if (system.name == "Windows") {
new_cl <- TRUE
cluster <- parallel::makePSOCKcluster(rep("localhost", ncores))
doParallel::registerDoParallel(cluster)
} else {
doParallel::registerDoParallel(cores=ncores)
}
foreach::foreach(i = 1:n.genes, .packages = c("GenomicRanges")) %dopar% {
gene.name <- genes.ref[i, "Gene"]
seq.name <- genes.ref[i,"chr"]
gene.start <- genes.ref[i,"start"]
gene.end <- genes.ref[i, "end"]
strand <- genes.ref[i,"strand"]
#message(i, " :", gene.name)
isMinusStrand <- if(strand==1) FALSE else TRUE
which <- GenomicRanges::GRanges(seqnames = seq.name, ranges = IRanges::IRanges(gene.start, gene.end))
param <- Rsamtools::ScanBamParam(which = which,
flag=Rsamtools::scanBamFlag(isMinusStrand=isMinusStrand))
aln <- GenomicAlignments::readGAlignments(bamfile, param=param)
aln_cov <- GenomicRanges::coverage(aln)[seq.name][[1]]
if (length(aln_cov@values) > 1) { ## check for 0 read coverage for this gene
data <- data.frame(pos = seq(gene.start, gene.end),
coverage = S4Vectors::runValue(aln_cov)[S4Vectors::findRun(gene.start:gene.end, aln_cov)])
# Find the junction which overlaps this gene
j.cutoff <- max(min.jcutoff,min.jcutoff.prop*max(data$coverage))
hits <- GenomicRanges::findOverlaps(which, junctions.GR)
this.junctions.GR <- junctions.GR[hits@to]
this.junctions.GR <- this.junctions.GR[this.junctions.GR$counts > j.cutoff]
#this.junctions.GR <- IRanges::subset(this.junctions.GR, counts > j.cutoff)
n.junctions <- length(this.junctions.GR)
data.no.juncs <- data
## Filter out positions spanned by junction reads.
## NB: this is a pretty slow way to do the filtering and
## could definitely be improved computationally!
if(n.junctions > 0) {
for(j in 1:n.junctions) { # use 'j' to avoid shadowing the gene index 'i'
j.start <- IRanges::start(this.junctions.GR[j])
j.end <- IRanges::end(this.junctions.GR[j])
data.no.juncs <- data.no.juncs %>%
dplyr::filter(pos < j.start | pos > j.end)
}
}
## Find peaks
totalcov <- sum(data$coverage)
cutoff <- max(min.cov.cutoff,min.cov.prop*totalcov)
covsum <- totalcov
maxpeakval <- max(data$coverage)
maxpeakcutoff <- max(min.peak.cutoff,min.peak.prop*maxpeakval )
#message("Finding exonic sites...")
n.points <- nrow(data.no.juncs)
if(n.points > 0) {
while(covsum > cutoff) {
maxpeak <- which.max(data.no.juncs$coverage)
if(data.no.juncs[maxpeak, "coverage"] < maxpeakcutoff) { break }
start <- maxpeak - 300
end <- maxpeak + 299
if(start < 1 ) { start <- 1 }
if(end > n.points) { end <- n.points }
maxval <- max(data.no.juncs$coverage)
#message(start, ",", end, ",", maxval)
#message("length: ", data.no.juncs[end,"pos"] - data.no.juncs[start, "pos"])
#message("max peak: ", data.no.juncs[maxpeak, "pos"])
fit.data <- data.frame(x = seq(1,end-start+1), y = data.no.juncs[start:end,"coverage"])
gaussian.fit <- fit_gaussian(fit.data, maxval, fit.method)
if(!is.null(gaussian.fit)) {
est_mu <- coef(gaussian.fit)["mu"]
est_sigma <- abs(coef(gaussian.fit)["sigma"]) # this has to be positive in the Gaussian
est_k <- coef(gaussian.fit)["k"]
fit.loglik <- logLik(gaussian.fit)[1]
fitted.peak <- maxpeak - 300 + floor(est_mu)
from <- fitted.peak - 3*floor(est_sigma)
to <- fitted.peak + 3*floor(est_sigma)
# Handle the cases where the peak is too close to either the start or the end
if(from < 1) { from = 1 }
if(to > n.points) { to = n.points}
if(fitted.peak <= 0) {
peak.pos <- "Negative"
} else {
peak.pos <- data.no.juncs[fitted.peak, "pos"]
}
isGapped <- FALSE
if(to <= 0) {
to.pos <- "Negative"
} else {
to.pos <- data.no.juncs[to, "pos"]
# Check if this is a spliced region
pos.gaps <- diff(data.no.juncs[from:to, "pos"])
if(any(pos.gaps > 1)) {
isGapped <- TRUE
}
}
if(isGapped) {
exon.pos <- make_exons(data.no.juncs[from:to, "pos"])
line <- paste(gene.name, seq.name, strand, data.no.juncs[maxpeak, "pos"],
peak.pos,
data.no.juncs[from, "pos"],
to.pos,
est_mu, est_sigma, est_k,
"across-junctions", exon.pos, fit.loglik, sep="\t")
} else {
exon.pos <- "NA"
line <- paste(gene.name, seq.name, strand, data.no.juncs[maxpeak, "pos"],
peak.pos,
data.no.juncs[from, "pos"],
to.pos,
est_mu, est_sigma, est_k,
"no-junctions", exon.pos, fit.loglik, sep="\t")
}
#print(line)
locked <- flock::lock(lock)
write(line,file=output.file,append=TRUE)
flock::unlock(locked)
} else {
line <- paste(gene.name, seq.name, strand, data.no.juncs[maxpeak, "pos"],
"NA", "NA", "NA", "NA", "NA", "NA", "no-junctions", "NA", "NA", sep="\t")
locked <- flock::lock(lock)
write(line,file=output.file,append=TRUE)
flock::unlock(locked)
}
data.no.juncs[start:end, "coverage"] <- 0
covsum <- sum(data.no.juncs$coverage)
#print(covsum)
}
}
## Now let's see if there are any peaks in the introns
## test each intron separate
#message("Finding intronic peaks...")
reduced.junctions <- GenomicRanges::reduce(this.junctions.GR)
n.rjunctions <- length(reduced.junctions)
if(n.rjunctions > 0) {
for(j in 1:n.rjunctions) { # use 'j' to avoid shadowing the gene index 'i'
#message(j)
j.start <- IRanges::start(reduced.junctions[j])
j.end <- IRanges::end(reduced.junctions[j])
intron.data <- data %>%
dplyr::filter(pos > j.start & pos < j.end)
if(nrow(intron.data) == 0) { next }
maxpeak <- which.max(intron.data$coverage)
maxval <- intron.data[maxpeak, "coverage"]
#message(maxpeak, " ", maxval)
if(maxval < maxpeakcutoff) { next }
fit.data <- data.frame(x = seq(1,nrow(intron.data)),
y = intron.data[,"coverage"])
gaussian.fit <- fit_gaussian(fit.data, maxval, fit.method, mu=maxpeak)
if(!is.null(gaussian.fit)) {
est_mu <- coef(gaussian.fit)["mu"]
est_sigma <- abs(coef(gaussian.fit)["sigma"])
est_k <- coef(gaussian.fit)["k"]
fit.loglik <- logLik(gaussian.fit)[1]
fitted.peak <- floor(est_mu)
from <- fitted.peak - 3*floor(est_sigma)
to <- fitted.peak + 3*floor(est_sigma)
# Make sure position is within the intron
if(from < 1) { from = 1 }
this.n.points <- nrow(intron.data)
if(to > this.n.points) { to = this.n.points}
if(fitted.peak <= 0) {
peak.pos <- "Negative"
} else {
peak.pos <- intron.data[fitted.peak, "pos"]
}
if(to <= 0) {
to.pos <- "Negative"
} else {
to.pos <- intron.data[to, "pos"]
}
line=paste(gene.name, seq.name, strand, intron.data[maxpeak, "pos"],
peak.pos,
intron.data[from, "pos"],
to.pos,
est_mu, est_sigma, est_k,
"no-junctions", "NA", fit.loglik, sep="\t")
#print(line)
locked <- flock::lock(lock)
write(line,file=output.file,append=TRUE)
flock::unlock(locked)
} else {
line=paste(gene.name, seq.name, strand, intron.data[maxpeak, "pos"],
"NA", "NA", "NA", "NA", "NA", "NA", "no-junctions", "NA", "NA", sep="\t")
locked <- flock::lock(lock)
write(line,file=output.file,append=TRUE)
flock::unlock(locked)
}
}
}
}
} # End loop for genes
if (new_cl) { ## Shut down cluster if on Windows
## stop cluster
parallel::stopCluster(cluster)
}
## As a final step, read in the peak file, filter, and add Peak IDs
peak.sites.file = output.file
peak.sites <- read.table(peak.sites.file, header = T, sep = "\t", quote = '',
stringsAsFactors = FALSE)
## Filter the polyA sites
n.total.sites <- nrow(peak.sites)
to.filter <- which(peak.sites$Fit.max.pos == "Negative")
to.filter <- dplyr::union(to.filter, which(peak.sites$Fit.start == "Negative"))
to.filter <- dplyr::union(to.filter, which(peak.sites$Fit.end == "Negative"))
to.filter <- dplyr::union(to.filter, which(is.na(peak.sites$Fit.start)))
to.filter <- dplyr::union(to.filter, which(is.na(peak.sites$Fit.end)))
to.filter <- dplyr::union(to.filter, which(is.na(peak.sites$Fit.max.pos)))
if (length(to.filter) > 0)
peak.sites <- peak.sites[-to.filter,]
## Filter any peaks where the fitted end precedes the fitted start
sites.diffs <- as.numeric(peak.sites$Fit.end) - as.numeric(peak.sites$Fit.start)
to.filter <- which(sites.diffs < 0)
if (length(to.filter) > 0)
peak.sites <- peak.sites[-to.filter,]
n.filt.sites <- nrow(peak.sites)
message("There are ", n.total.sites, " unfiltered sites and ", n.filt.sites, " filtered sites")
## Add polyA IDs to the table
polyA.ids <- paste0(peak.sites$Gene, ":", peak.sites$Chr, ":", peak.sites$Fit.start,
"-", peak.sites$Fit.end, ":", peak.sites$Strand )
peak.sites$polyA_ID = polyA.ids
## Remove any duplicates
peak.sites %>% dplyr::distinct(polyA_ID, .keep_all = TRUE) -> peak.sites
n.updated.sites = nrow(peak.sites)
message("There are ", n.updated.sites, " sites following duplicate removal")
## re-write the updated table
write.table(peak.sites, file = output.file, sep="\t", quote = FALSE, row.names = FALSE)
} # End function
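## Sketch: inspecting the peak table written by FindPeaks. The file is
## tab-delimited with one row per called peak; the polyA_ID column added in
## the final step above is the identifier used by CountPeaks. The filename
## is taken from the examples and is an assumption. Guarded so it never runs
## at package load.
if (FALSE) {
  peaks <- read.table("Vignette_example_TIP_sham_peaks.txt", header = TRUE,
                      sep = "\t", quote = "", stringsAsFactors = FALSE)
  head(peaks[, c("Gene", "Chr", "Strand", "Fit.start", "Fit.end", "polyA_ID")])
}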
###################################################################
#'
#' Aggregate multiple peak count outputs together
#'
#' Aggregates the output from multiple runs of CountPeaks together. By default, this function will
#' update the cell barcodes in a format consistent with the CellRanger aggr program. That is, for
#' n experiments to aggregate, cell barcodes will be appended with '-1', '-2', ..., '-n'. For downstream
#' analysis, if using the PeakSeuratFromTransfer function, it is important to ensure that these match what is
#' in the gene count matrix. The expected separator character can be changed with the 'exp.sep'
#' parameter, and preferred labels can be specified with the 'exp.labels' parameter. The barcode updates can
#' be turned off by setting update.labels = FALSE if manual labelling is preferred.
#'
#' @param peak.sites.file a file containing peak site coordinates
#' @param count.dirs a list of output directories from CountPeaks
#' @param output.dir output directory for aggregate count matrix
#' @param update.labels whether to append an experiment label to the cell barcode (default: TRUE)
#' @param exp.sep a character separating the cell barcode from experiment label (default: '-')
#' @param exp.labels optional labels to append to cell barcodes corresponding to count.dirs
#' @return NULL. Writes counts to file.
#' @examples
#'
#' library(Sierra)
#' extdata_path <- system.file("extdata",package = "Sierra")
#' reference.file <- paste0(extdata_path,"/Vignette_cellranger_genes_subset.gtf")
#' junctions.file <- paste0(extdata_path,"/Vignette_example_TIP_sham_junctions.bed")
#' bamfile <- c(paste0(extdata_path,"/Vignette_example_TIP_sham.bam"),
#'              paste0(extdata_path,"/Vignette_example_TIP_mi.bam"))
#' whitelist.bc.file <- c(paste0(extdata_path,"/example_TIP_sham_whitelist_barcodes.tsv"),
#'                        paste0(extdata_path,"/example_TIP_MI_whitelist_barcodes.tsv"))
#'
#' ### Peak calling
#' peak.output.file <- c("Vignette_example_TIP_sham_peaks.txt",
#'                       "Vignette_example_TIP_MI_peaks.txt")
#' FindPeaks(output.file = peak.output.file[1],  # output filename
#'           gtf.file = reference.file,          # gene model as a GTF file
#'           bamfile = bamfile[1],               # BAM alignment filename
#'           junctions.file = junctions.file,    # BED file of splice junctions existing in the BAM file
#'           ncores = 1)                         # number of cores to use
#'
#'
#' FindPeaks(output.file = peak.output.file[2],  # output filename
#'           gtf.file = reference.file,          # gene model as a GTF file
#'           bamfile = bamfile[2],               # BAM alignment filename
#'           junctions.file = junctions.file,    # BED file of splice junctions existing in the BAM file
#'           ncores = 1)
#'
#' #### Peak merging
#' peak.dataset.table <- data.frame(Peak_file = peak.output.file,
#'                                  Identifier = c("TIP-example-Sham", "TIP-example-MI"),
#'                                  stringsAsFactors = FALSE)
#'
#' peak.merge.output.file = "TIP_merged_peaks.txt"
#' MergePeakCoordinates(peak.dataset.table, output.file = peak.merge.output.file, ncores = 1)
#'
#' count.dirs <- c("example_TIP_sham_counts", "example_TIP_MI_counts")
#' #sham data set
#' CountPeaks(peak.sites.file = peak.merge.output.file, gtf.file = reference.file,
#'            bamfile = bamfile[1], whitelist.file = whitelist.bc.file[1],
#'            output.dir = count.dirs[1], countUMI = TRUE, ncores = 1)
#' # MI data set
#' CountPeaks(peak.sites.file = peak.merge.output.file, gtf.file = reference.file,
#'            bamfile = bamfile[2], whitelist.file = whitelist.bc.file[2],
#'            output.dir = count.dirs[2], countUMI = TRUE, ncores = 1)
#'
#' out.dir <- "example_TIP_aggregate"
#' AggregatePeakCounts(peak.sites.file = peak.merge.output.file, count.dirs = count.dirs,
#'                     exp.labels = c("Sham", "MI"), output.dir = out.dir)
#'
#' @export
#'
AggregatePeakCounts <- function(peak.sites.file,
                                count.dirs,
                                output.dir,
                                update.labels = TRUE,
                                exp.sep = "-",
                                exp.labels = NULL) {
if (!is.null(exp.labels)) {
if (length(exp.labels) != length(count.dirs)) {
stop("number of count directories should equal number of labels")
}
}
peak.table <- read.table(peak.sites.file, sep="\t", quote = '', header = TRUE, stringsAsFactors = FALSE)
all.peaks <- peak.table$polyA_ID
peaks.use <- all.peaks
## Get intersection of the peak IDs from each of the input files
for (i in 1:length(count.dirs)) {
this.dir <- count.dirs[i]
## First check if files are compressed
file.list <- list.files(this.dir)
if (sum(endsWith(file.list, ".gz")) == 3) {
gzipped = TRUE
} else{
gzipped = FALSE
}
if (gzipped) {
sites.file <- paste0(this.dir, "/sitenames.tsv.gz")
peaks.con <- gzfile(sites.file)
this.peak.set <- readLines(peaks.con)
close(peaks.con)
} else {
sites.file <- paste0(this.dir, "/sitenames.tsv")
this.peak.set <- readLines(sites.file)
}
peaks.use <- dplyr::intersect(peaks.use, this.peak.set)
## Check if peak IDs from the sites file are different to the count file
if (length(dplyr::setdiff(all.peaks, this.peak.set)) > 0 | length(dplyr::setdiff(this.peak.set, all.peaks)) > 0) {
warning(paste0("Some peaks in file ", this.dir, " do not match input peak coordinate file."))
}
}
print(paste0("Aggregating counts for ", length(peaks.use), " peak coordinates"))
aggregate.counts <- c()
for (i in 1:length(count.dirs)) {
this.dir <- count.dirs[i]
this.data <- ReadPeakCounts(this.dir)
if (update.labels) { ## update cell barcode names
cell.names = colnames(this.data)
pattern <- paste0("(.*)", exp.sep, ".*")
barcodes = sub(pattern, "\\1", cell.names)
if (is.null(exp.labels)) {
cell.names.update = paste0(barcodes, exp.sep, i)
} else{
cell.names.update = paste0(barcodes, exp.sep, exp.labels[i])
}
colnames(this.data) = cell.names.update
}
this.data <- this.data[peaks.use, ]
aggregate.counts <- cbind(aggregate.counts, this.data)
}
if (!dir.exists(output.dir)){
dir.create(output.dir)
}
Matrix::writeMM(aggregate.counts, file = paste0(output.dir, "/matrix.mtx"))
writeLines(colnames(aggregate.counts), paste0(output.dir, "/barcodes.tsv"))
writeLines(rownames(aggregate.counts), paste0(output.dir, "/sitenames.tsv"))
## Compress the output files
R.utils::gzip(paste0(output.dir, "/matrix.mtx"), overwrite = TRUE)
R.utils::gzip(paste0(output.dir, "/barcodes.tsv"), overwrite = TRUE)
R.utils::gzip(paste0(output.dir, "/sitenames.tsv"), overwrite = TRUE)
}
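## Sketch of the barcode relabelling performed when update.labels = TRUE:
## the existing experiment suffix is stripped with sub() and replaced with
## either the experiment index or the matching entry of exp.labels. The
## barcodes shown are made up for illustration. Guarded so it never runs at
## package load.
if (FALSE) {
  cells <- c("AAACCTGAGCGT-1", "AAACCTGTCAAG-1")
  barcodes <- sub("(.*)-.*", "\\1", cells)   # strip the existing suffix
  paste0(barcodes, "-", "MI")                # append the experiment label
  ## returns "AAACCTGAGCGT-MI" "AAACCTGTCAAG-MI"
}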