## query : GRanges : value has one row per unique name in GRanges
## query : GRangesList : value has one row per name of GRangesList
## query : TxDb : value has has one row per 'reportLevel'
## query : NULL & reportLevel=="junction": value has has one row per 'junction'
#' Quantify alignments
#'
#' Quantify alignments from sequencing data.
#'
#' \code{qCount} is used to count alignments in each sample from a
#' \code{qProject} object. The features to be quantified, together with
#' the mode of quantification, are specified by the \code{query}
#' argument, which is one of:
#' \describe{
#' \item{\code{\link[GenomicRanges:GRanges-class]{GRanges}}}{: Overlapping alignments
#' are counted separately for each coordinate region. If multiple
#' regions have identical names, their counts will be summed, counting
#' each alignment only once even if it overlaps more than one of these
#' regions. Alignments may be counted more than once if they overlap
#' multiple regions that have different names.
#' This mode is for example used to quantify ChIP-seq alignments in
#' promoter regions, or gene expression levels in an RNA-seq experiment
#' (using a \code{query} with exon regions named by gene).}
#' \item{\code{\link[GenomicRanges:GRangesList-class]{GRangesList}}}{: Alignments are
#' counted and summed for each list element in \code{query} if they
#' overlap with any of the regions contained in the list element. The
#' order of the list elements defines a hierarchy for quantification:
#' Alignment will only be counted for the first element (the one with
#' the lowest index in \code{query}) that they overlap, but not for any
#' potential further list elements containing overlapping regions.
#' This mode can be used to hierarchically and uniquely count (assign)
#' each alignment to a one of several groups of regions (the elements
#' in \code{query}), for example to estimate the fractions of different
#' classes of RNA in an RNA-seq experiment (rRNA, tRNA, snRNA, snoRNA,
#' mRNA, etc.)}
#' \item{\code{\link[GenomicFeatures:TxDb-class]{TxDb}}}{: Used to extract
#' regions from annotation and report alignment counts depending on the
#' value of \code{reportLevel}. If \code{reportLevel="exon"},
#' alignments overlapping each exon in \code{query} are counted.
#' If \code{reportLevel="gene"}, alignment counts for all exons of a
#' gene will be summed, counting each alignment only once even if it
#' overlaps multiple annotated exons of a gene. These are useful to
#' calculate exon or gene expression levels in RNA-seq experiments
#' based on the annotation in a \code{TxDb} object. If
#' \code{reportLevel="promoter"}, the \code{promoters} function from package
#' \pkg{GenomicFeatures} is used with default arguments to extract
#' promoter regions around transcript start sites, e.g. to quantify
#' alignments inf a ChIP-seq experiment.}
#' \item{any of the above or \code{NULL} for
#' \code{reportLevel="junction"}}{: The \code{query} argument is ignored
#' if \code{reportLevel} is set to \code{"junction"}, and \code{qCount}
#' will count the number of alignments supporting each exon-exon
#' junction detected in any of the samples in \code{proj}. The
#' arguments \code{selectReadPosition}, \code{shift},
#' \code{orientation}, \code{useRead} and \code{mask} will have no
#' effect in this quantification mode.}
#' }
#'
#' The additional arguments allow to fine-tune the quantification:
#'
#' \code{selectReadPosition} defines the part of the alignment that has
#' to be contained within a query region for an overlap. The values
#' \code{start} (default) and \code{end} refer to the biological start
#' (5'-end) and end (3'-end) of the alignment. For example, the
#' \code{start} of an alignment on the plus strand is its leftmost
#' (lowest) base, and the \code{end} of an alignment on the minus strand
#' is also the leftmost base.
#'
#' \code{shift} allows on-the-fly shifting of alignments towards their
#' 3'-end prior to overlap determination and counting. This can be
#' helpful to increase resolution of ChIP-seq experiments by moving
#' alignments by half the immuno-precipitated fragment size towards the
#' middle of fragments. \code{shift} is either an \dQuote{integer} vector
#' with one value per alignment file in \code{proj}, or a single
#' \dQuote{integer} value, in which case all alignment files will be
#' shifted by the same value. For paired-end experiments, it can be
#' alternatively set to "halfInsert", which will estimate the true
#' fragment size from the distance between aligned read pairs and shift
#' the alignments accordingly.
#'
#' \code{orientation} controls the interpretation of alignment strand
#' when counting, relative to the strand of the query region. \code{any}
#' will count all overlapping alignments, irrespective of the alignment
#' strand (e.g. used in an unstranded RNA-seq experiment). \code{same}
#' will only count the alignments on the same strand as the query region
#' (e.g. in a stranded RNA-seq experiment that generates reads from the
#' sense strand), and \code{opposite} will only
#' count the alignments on the opposite strand from the query region
#' (e.g. to quantify anti-sense transcription in a stranded RNA-seq
#' experiment).
#'
#' \code{includeSpliced} and \code{includeSecondary} can be used to
#' include or exclude spliced or secondary alignments,
#' respectively. \code{mapqMin} and \code{mapqMax} allow to select alignments
#' based on their mapping qualities. \code{mapqMin} and \code{mapqMax} can
#' take integer values between 0 and 255 and equal to
#' \eqn{-10 log_{10} Pr(\textnormal{mapping position is wrong})}{-10
#' log10 Pr(mapping position is wrong)}, rounded to the nearest
#' integer. A value 255 indicates that the mapping quality is not available.
#'
#' In paired-end experiments, \code{useRead} allows to quantify either
#' all alignments (\code{useRead="any"}), or only the first
#' (\code{useRead="first"}) or last (\code{useRead="last"}) read from a
#' read pair or read group. Note that for \code{useRead="any"} (the
#' default), an alignment pair that is fully contained within a query
#' region will contribute two counts to the value of that
#' region. \code{absIsizeMin} and \code{absIsizeMax} can be used to
#' select alignments based on their insert size (TLEN field in SAM Spec
#' v1.4).
#'
#' \code{auxiliaryName} selects the reference sequence for which
#' alignments should be quantified. \code{NULL} (the default) will
#' select alignments against the genome. If set to a character string
#' that matches one of the auxiliary target names (as specified in
#' the \code{auxiliaryFile} argument of \code{\link[QuasR]{qAlign}}), the
#' corresponding alignments will be counted.
#'
#' \code{mask} can be used to specify a
#' \code{\link[GenomicRanges:GRanges-class]{GRanges}} object with regions in the
#' reference sequence to be excluded from quantification. The regions
#' will be considered unstranded (\code{strand="*"}). Alignments that
#' overlap with a region in \code{mask} will not be counted. Masking may
#' reduce the effective width of query regions reported by \code{qCount},
#' even down to zero for regions that are fully contained in \code{mask}.
#'
#' If \code{clObj} is set to an object that inherits from class
#' \code{cluster}, for example an object returned by
#' \code{\link[parallel]{makeCluster}} from package \pkg{parallel}, the
#' quantification task is split into multiple chunks and processed in
#' parallel using \code{\link[parallel:clusterApply]{clusterMap}}. Currently, not all
#' tasks will be efficiently parallelized: For example, a single query
#' region and a single (group of) bam files will not be split into
#' multiple chunks.
#'
#' @param proj A \code{\linkS4class{qProject}} object representing a
#' sequencing experiment as returned by \code{\link[QuasR]{qAlign}}
#' @param query An object of type \code{\link[GenomicRanges:GRanges-class]{GRanges}},
#' \code{\link[GenomicRanges:GRangesList-class]{GRangesList}} or
#' \code{\link[GenomicFeatures:TxDb-class]{TxDb}} with the regions to be
#' quantified. The type of \code{query} will determine the mode of
#' quantification (see \sQuote{Details}). For
#' \code{reportLevel="junction"}, \code{query} is ignored and can also
#' be \code{NULL}.
#' @param reportLevel Level of quantification (\code{query} of type
#' \code{TxDb} or \code{NULL}), one of
#' \describe{
#' \item{\code{gene} (default)}{: one value per gene}
#' \item{\code{exon}}{: one value per exon}
#' \item{\code{promoter}}{: one value per promoter}
#' \item{\code{junction}}{: one count per detected exon-exon junction
#' (\code{query} will be ignored in this case) }
#' }
#' @param selectReadPosition defines the part of the alignment that has
#' to be contained within a query region to produce an overlap (see
#' \sQuote{Details}). Possible values are:
#' \describe{
#' \item{\code{start} (default)}{: start of the alignment}
#' \item{\code{end}}{: end of the alignment}
#' }
#' @param shift controls the shifting alignments towards their 3'-end before
#' quantification. \code{shift} can be one of:
#' \itemize{
#' \item an \dQuote{integer} vector of the same length as the
#' number of alignment files
#' \item a single \dQuote{integer} value
#' \item the character string \code{"halfInsert"} (only available for
#' paired-end experiments)
#' }
#' The default of \code{0} will not shift any alignments.
#' @param orientation sets the required orientation of the alignments relative
#' to the query region in order to be counted, one of:
#' \describe{
#' \item{\code{any} (default)}{: count alignment on the same and opposite strand}
#' \item{\code{same}}{: count only alignment on the same strand}
#' \item{\code{opposite}}{: count only alignment on the opposite strand}
#' }
#' @param useRead For paired-end experiments, selects the read mate whose
#' alignments should be counted, one of:
#' \describe{
#' \item{\code{any} (default)}{: count all alignments}
#' \item{\code{first}}{: count only alignments from the first read}
#' \item{\code{last}}{: count only alignments from the last read}
#' }
#' @param auxiliaryName Which bam files to use in an experiments with
#' auxiliary alignments (see \sQuote{Details}).
#' @param mask If not \code{NULL}, a \code{\link[GenomicRanges:GRanges-class]{GRanges}}
#' object with reference regions to be masked, i.e. excluded from the
#' quantification, such as unmappable or highly repetitive regions (see
#' \sQuote{Details}).
#' @param collapseBySample If \code{TRUE} (the default), sum alignment
#' counts from bam files with the same sample name.
#' @param includeSpliced If \code{TRUE} (the default), include spliced
#' alignments when counting. A spliced alignment is defined as an
#' alignment with a gap in the read of at least 60 bases.
#' @param includeSecondary If \code{TRUE} (the default), include alignments
#' with the secondary bit (0x0100) set in the \code{FLAG} when counting.
#' @param mapqMin Minimal mapping quality of alignments to be included when
#' counting (mapping quality must be greater than or equal to
#' \code{mapqMin}). Valid values are between 0 and 255. The default (0)
#' will include all alignments.
#' @param mapqMax Maximal mapping quality of alignments to be included when
#' counting (mapping quality must be less than or equal to \code{mapqMax}).
#' Valid values are between 0 and 255. The default (255) will include
#' all alignments.
#' @param absIsizeMin For paired-end experiments, minimal absolute insert
#' size (TLEN field in SAM Spec v1.4) of alignments to be included when
#' counting. Valid values are greater than 0 or \code{NULL} (default),
#' which will not apply any minimum insert size filtering.
#' @param absIsizeMax For paired-end experiments, maximal absolute insert
#' size (TLEN field in SAM Spec v1.4) of alignments to be included when
#' counting. Valid values are greater than 0 or \code{NULL} (default),
#' which will not apply any maximum insert size filtering.
#' @param maxInsertSize Maximal fragment size of the paired-end experiment.
#' This parameter is used if \code{shift="halfInsert"} and will
#' ensure that query regions are made wide enough to emcompass all
#' alignment pairs whose mid falls into the query region. The default
#' value is \code{500} bases.
#' @param clObj A cluster object to be used for parallel processing (see
#' \sQuote{Details}).
#'
#' @name qCount
#' @aliases qCount
#'
#' @return
#' A \code{matrix} with effective query regions width in the first
#' column, and alignment counts in subsequent columns, or a
#' \code{GRanges} object if \code{reportLevel="junction"}.
#'
#' The effective query region width returned as first column in the
#' matrix is calculated by the number of unique, non-masked bases in the
#' reference sequence that contributed to the count of this query
#' name (irrespective if the bases were covered by alignments or not).
#' An effective width of zero indicates that the region was fully
#' masked and will have zero counts in all samples.
#'
#' The alignment counts in the matrix are contained from column two
#' onwards. For projects with allele-specific quantification, i.e. if a
#' file with single nucleotide polymorphisms was supplied to the
#' \code{snpFile} argument of \code{\link[QuasR]{qAlign}}, there will be
#' three columns per bam file (number of alignments for Reference,
#' Unknown and Alternative genotypes, with suffixed _R, _U and
#' _A). Otherwise there is a single columns per bam file.
#'
#' If \code{collapseBySample}=\code{TRUE}, groups of bam files with identical
#' sample name are combined by summing their alignment counts.
#'
#' For \code{reportLevel="junction"}, the return value is a
#' \code{GRanges} object. The start and end coordinates correspond to the
#' first and last base in each detected intron. Plus- and minus-strand
#' alignments are quantified separately, so that in an unstranded RNA-seq
#' experiment, the same intron may be represented twice; once for each
#' strand. The counts for each sample are contained in the \code{mcols}
#' of the \code{GRanges} object.
#'
#' @author Anita Lerch, Dimos Gaidatzis and Michael Stadler
#' @keywords utilities misc
#'
#' @export
#'
#' @seealso
#' \code{\link[QuasR]{qAlign}},
#' \code{\linkS4class{qProject}},
#' \code{\link[parallel]{makeCluster}} from package \pkg{parallel}
#'
#' @examples
#' library(GenomicRanges)
#' library(Biostrings)
#' library(Rsamtools)
#'
#' # copy example data to current working directory
#' file.copy(system.file(package="QuasR", "extdata"), ".", recursive=TRUE)
#'
#' # load genome sequence
#' genomeFile <- "extdata/hg19sub.fa"
#' gseq <- readDNAStringSet(genomeFile)
#' chrRegions <- GRanges(names(gseq), IRanges(start=1,width=width(gseq),names=names(gseq)))
#'
#' # create alignments (paired-end experiment)
#' sampleFile <- "extdata/samples_rna_paired.txt"
#' proj <- qAlign(sampleFile, genomeFile, splicedAlignment=TRUE)
#'
#' # count reads using a "GRanges" query
#' qCount(proj, query=chrRegions)
#' qCount(proj, query=chrRegions, useRead="first")
#'
#' # hierarchical counting using a "GRangesList" query
#' library(rtracklayer)
#' annotationFile <- "extdata/hg19sub_annotation.gtf"
#' gtfRegions <- import.gff(annotationFile, format="gtf", feature.type="exon")
#' names(gtfRegions) <- mcols(gtfRegions)$source
#' gtfRegionList <- split(gtfRegions, names(gtfRegions))
#' names(gtfRegionList)
#'
#' res3 <- qCount(proj, gtfRegionList)
#' res3
#'
#' # gene expression levels using a "TxDb" query
#' library("txdbmaker")
#' genomeRegion <- scanFaIndex(genomeFile)
#' chrominfo <- data.frame(chrom=as.character(seqnames(genomeRegion)),
#' length=end(genomeRegion),
#' is_circular=rep(FALSE, length(genomeRegion)))
#' txdb <- makeTxDbFromGFF(annotationFile,
#' format="gtf",
#' chrominfo=chrominfo,
#' dataSource="Ensembl modified",
#' organism="Homo sapiens")
#'
#' res4 <- qCount(proj, txdb, reportLevel="gene")
#' res4
#'
#' # exon-exon junctions
#' res5 <- qCount(proj, NULL, reportLevel="junction")
#' res5
#'
#' @importFrom Rsamtools scanBamHeader
#' @importFrom GenomeInfoDb seqlevels seqlevelsInUse seqlengths
#' @importFrom parallel clusterMap clusterEvalQ splitIndices
#' @importFrom GenomicRanges GRanges reduce findOverlaps seqnames
#' @importFrom IRanges IRanges ranges
#' @importFrom S4Vectors mcols elementNROWS endoapply Rle subjectHits queryHits
#' split
#' @importFrom BiocGenerics width strand end start setdiff unlist
#' @importFrom GenomicFeatures exons promoters exonsBy
qCount <- function(proj,
query,
reportLevel = c(NULL, "gene", "exon", "promoter", "junction"),
selectReadPosition = c("start", "end"),
shift = 0L,
orientation = c("any", "same", "opposite"),
useRead = c("any", "first", "last"),
auxiliaryName = NULL,
mask = NULL,
collapseBySample = TRUE,
includeSpliced = TRUE,
includeSecondary = TRUE,
mapqMin = 0L,
mapqMax = 255L,
absIsizeMin = NULL,
absIsizeMax = NULL,
maxInsertSize = 500L,
clObj = NULL) {
## setup variables from 'proj' ---------------------------------------------
## 'proj' is correct type?
if (!inherits(proj, "qProject", which = FALSE))
stop("'proj' must be an object of type 'qProject' (returned by 'qAlign')")
samples <- proj@alignments$SampleName
nsamples <- length(samples)
bamfiles <-
if (is.null(auxiliaryName))
proj@alignments$FileName
else if(!is.na(i <- match(auxiliaryName, proj@aux$AuxName)))
unlist(proj@auxAlignments[i, ], use.names = FALSE)
else
stop("unknown 'auxiliaryName', should be one of: NULL, ",
paste(sprintf("'%s'", proj@aux$AuxName), collapse = ", "))
## validate parameters -----------------------------------------------------
reportLevel <- match.arg(reportLevel)
selectReadPosition <- match.arg(selectReadPosition)
orientation <- match.arg(orientation)
useRead <- match.arg(useRead)
if ((!is.null(absIsizeMin) || !is.null(absIsizeMax)) && proj@paired == "no")
stop("'absIsizeMin' and 'absIsizeMax' can only be used for paired-end experiments")
if (is.null(absIsizeMin)) # -1L -> do not apply TLEN filtering
absIsizeMin <- -1L
if (is.null(absIsizeMax))
absIsizeMax <- -1L
## check shift
if (length(shift) == 1 && shift == "halfInsert") {
if (proj@paired == "no") {
stop("'shift=\"halfInsert\"' can only be used for paired-end experiments")
} else {
shifts <- rep(-1000000L, nsamples)
broaden <- as.integer(ceiling(maxInsertSize/2))
}
} else {
if (!is.numeric(shift) || (length(shift) > 1 && length(shift) != nsamples))
stop(sprintf("'shift' must be 'halfInsert', a single integer or an integer vector with %d values", nsamples))
else if(length(shift) == 1)
shifts <- rep(as.integer(shift), nsamples)
else
shifts <- as.integer(shift)
broaden <- 0L
}
## all query chromosomes present in all bamfiles?
trTab <- table(unlist(lapply(Rsamtools::scanBamHeader(bamfiles),
function(bh) names(bh$targets))))
trCommon <- names(trTab)[trTab == length(bamfiles)]
queryseqs <- NULL
if (inherits(query, c("GRanges","GRangesList"))) {
queryseqs <- GenomeInfoDb::seqlevelsInUse(query)
} else if (inherits(query,"TxDb")) { # only use active sequences
queryseqs <- GenomeInfoDb::seqlevels(query) # isActiveSeq is deprecated; no seqlevelsInUse for TxDb yet
}
if (!is.null(query) && any(f <- !(queryseqs %in% trCommon)))
stop(sprintf("sequence levels in 'query' not found in alignment files: %s",
paste(GenomeInfoDb::seqlevels(query)[f], collapse = ", ")))
## 'query' is correct type?
if (reportLevel == "junction") {
if (proj@splicedAlignment != TRUE && proj@samplesFormat != "bam")
stop("reportLevel=\"junction\" cannot be used for non-spliced alignments")
if (absIsizeMin != -1L || absIsizeMax != -1L) {
warning("ignoring 'absIsizeMin' and 'absIsizeMax' for reportLevel=\"junction\"")
absIsizeMin <- absIsizeMax <- -1L
}
if (!is.null(query))
warning("ignoring 'query' for reportLevel=\"junction\"")
### reportLevel == "junction" ------------------------------------------
## setup tasks for parallelization -------------------------------------
if (!is.null(clObj) & inherits(clObj, "cluster", which = FALSE)) {
loadQuasR(clObj)
taskTargets <- rep(trCommon, nsamples)
bamfiles <- rep(bamfiles, each = length(trCommon))
iByBamfile <- split(seq_along(bamfiles), bamfiles)[unique(bamfiles)]
myapply <- function(...) {
ret <- parallel::clusterMap(clObj, ..., SIMPLIFY = FALSE,
.scheduling = "dynamic")
# fuse
if (!is.na(proj@snpFile)) {
# ret is a list of list(id,R,U,A)
ret <- lapply(iByBamfile, function(i)
list(id = do.call(c, lapply(ret[i], "[[", "id")),
R = do.call(c, lapply(ret[i], "[[", "R")),
U = do.call(c, lapply(ret[i], "[[", "U")),
A = do.call(c, lapply(ret[i], "[[", "A"))))
} else {
# ret is a list of named vectors
ret <- lapply(iByBamfile, function(i) do.call(c, unname(ret[i])))
}
ret
}
} else {
taskTargets <- rep(list(NULL), length(bamfiles))
myapply <- function(...) {
ret <- mapply(..., SIMPLIFY = FALSE)
ret
}
}
## count junctions -----------------------------------------------------
message("counting junctions...", appendLF = FALSE)
resL <- myapply(countJunctionsOneBamfile,
bamfile = bamfiles,
targets = taskTargets,
MoreArgs = list(allelic = !is.na(proj@snpFile),
includeSecondary = includeSecondary,
mapqmin = as.integer(mapqMin)[1],
mapqmax = as.integer(mapqMax)[1]))
message("done")
## make result rectangular and collapse (sum) counts by sample if necessary
if (!is.na(proj@snpFile)) {
# ret is a list of list(id,R,U,A)
allJunctions <- unique(Reduce(c, lapply(resL, "[[", "id")))
res <- matrix(0, nrow = length(allJunctions),
ncol = 3 * length(resL), dimnames = list(allJunctions, NULL))
for (i in seq_len(length(resL))) # make res a matrix with 3 columns per sample
res[resL[[i]]$id, ((i - 1) * 3 + 1):((i - 1) * 3 + 3)] <-
do.call(cbind, resL[[i]][c("R", "U", "A")])
if (nsamples > length(unique(samples))) {
if (collapseBySample) {
message("collapsing counts by sample...", appendLF = FALSE)
iBySample <- split(seq_len(nsamples), samples)[unique(samples)]
res <- do.call(cbind, lapply(iBySample, function(i)
cbind(R = rowSums(res[, (i - 1) * 3 + 1, drop = FALSE]),
U = rowSums(res[, (i - 1) * 3 + 2, drop = FALSE]),
A = rowSums(res[, (i - 1) * 3 + 3, drop = FALSE]))))
colnames(res) <- paste(rep(names(iBySample), each = 3),
c("R", "U", "A"), sep = "_")
message("done")
} else {
# unify non-collapsed identical sample names
colnames(res) <- paste(rep(displayNames(proj), each = 3),
c("R", "U", "A"), sep = "_")
}
} else {
colnames(res) <- paste(rep(samples, each = 3), c("R", "U", "A"),
sep = "_")
}
} else {
# ret is a list of named vectors
allJunctions <- unique(Reduce(c, lapply(resL, names)))
res <- matrix(0, nrow = length(allJunctions), ncol = length(resL),
dimnames = list(allJunctions, NULL))
for (i in seq_len(length(resL)))
res[names(resL[[i]]),i] <- resL[[i]]
if (nsamples > length(unique(samples))) {
if (collapseBySample) {
message("collapsing counts by sample...", appendLF = FALSE)
iBySample <- split(seq_len(nsamples), samples)[unique(samples)]
res <- do.call(cbind, lapply(iBySample, function(i)
rowSums(res[, i, drop = FALSE])))
message("done")
} else {
# unify non-collapsed identical sample names
colnames(res) <- displayNames(proj)
}
} else {
colnames(res) <- samples
}
}
## make GRanges object
res2 <- GenomicRanges::GRanges(
seqnames = sub("^(.+):[0-9]+:[0-9]+:.$", "\\1", allJunctions),
ranges = IRanges::IRanges(
start = as.integer(sub("^.+:([0-9]+):[0-9]+:.$",
"\\1", allJunctions)),
end = as.integer(sub("^.+:[0-9]+:([0-9]+):.$",
"\\1", allJunctions))),
strand = sub("^.+:[0-9]+:[0-9]+:(.)$", "\\1", allJunctions))
S4Vectors::mcols(res2) <- res
## return results
return(res2)
} else {
### reportLevel != "junction" ------------------------------------------
if (!inherits(query, c("GRanges", "GRangesList", "TxDb")))
stop("'query' must be either an object of type 'GRanges', 'GRangesList' or 'TxDb', or NULL for reportLevel=\"junction\"")
## 'useRead' set but not a paired-end experiment?
if (useRead != "any" && proj@paired == "no")
warning("ignoring 'useRead' for single read experiments")
## preprocess query ----------------------------------------------------
## --> create 'flatquery', 'querynames', 'querylengths' and 'zeroquerynames'
## GRanges query ----------------------------------------------------
if (inherits(query, "GRanges")) {
if (!is.null(names(query)) && length(query) > length(unique(names(query)))) {
# remove redundancy from 'query' by names
tmpquery <- GenomicRanges::reduce(S4Vectors::split(query, names(query))[unique(names(query))])
flatquery <- BiocGenerics::unlist(tmpquery, use.names = FALSE)
querynames <- rep(names(tmpquery), S4Vectors::elementNROWS(tmpquery))
rm(tmpquery)
} else {
flatquery <- query
querynames <- if (is.null(names(query)))
as.character(seq_len(length(query))) else names(query)
}
querylengths <- BiocGenerics::width(flatquery)
zeroquerynames <- character(0)
## GRangesList query --------------------------------------------
} else if (inherits(query, "GRangesList")) {
if (any(i <- S4Vectors::elementNROWS(query) == 0)) {
warning(sprintf("removing %d elements from 'query' with zero regions: %s",
sum(i), paste(names(query)[i], collapse = ", ")))
query <- query[-i]
}
# hierarchically remove redundancy from 'query'
message("hierarchically removing redundancy from 'query'...",
appendLF = FALSE)
if (orientation == "any")
BiocGenerics::strand(query) <-
S4Vectors::endoapply(BiocGenerics::strand(query), function(x)
S4Vectors::Rle(factor("*", levels = c("+", "-", "*")),
lengths = length(x)))
query <- GenomicRanges::reduce(query)
if (length(query) > 1) {
cumquery <- query[[1]]
for (i in 2:length(query)) {
query[[i]] <- BiocGenerics::setdiff(query[[i]], cumquery)
cumquery <- c(query[[i]], cumquery)
}
}
message("done")
flatquery <- BiocGenerics::unlist(query, use.names = FALSE)
querynames <- rep(if(is.null(names(query))) as.character(seq_len(length(query))) else names(query),
S4Vectors::elementNROWS(query))
querylengths <- unlist(BiocGenerics::width(query), use.names = FALSE)
zeroquerynames <- (if(is.null(names(query))) as.character(seq_len(length(query))) else names(query))[S4Vectors::elementNROWS(query) == 0]
## TxDb query -----------------------------------------------
} else if (inherits(query, "TxDb")) {
if (is.null(reportLevel))
stop("'reportLevel' must be set to a non-NULL value for 'query' of type 'TxDb'")
message(sprintf("extracting %s regions from TxDb...", reportLevel),
appendLF = FALSE)
if (reportLevel == "gene") {
tmpquery <- GenomicRanges::reduce(
GenomicFeatures::exonsBy(query, by = "gene")
)
flatquery <- BiocGenerics::unlist(tmpquery, use.names = FALSE)
querynames <- rep(names(tmpquery), S4Vectors::elementNROWS(tmpquery))
querylengths <- unlist(BiocGenerics::width(tmpquery), use.names = FALSE)
rm(tmpquery)
} else if (reportLevel == "exon") {
flatquery <- GenomicFeatures::exons(query, columns = "exon_id")
querynames <- as.character(S4Vectors::mcols(flatquery)$exon_id)
querylengths <- BiocGenerics::width(flatquery)
} else if (reportLevel == "promoter") {
flatquery <- GenomicFeatures::promoters(query,
columns = c("tx_id", "tx_name"))
querynames <- paste(as.character(S4Vectors::mcols(flatquery)$tx_id),
as.character(S4Vectors::mcols(flatquery)$tx_name), sep = ";")
querylengths <- BiocGenerics::width(flatquery)
}
zeroquerynames <- character(0)
message("done")
}
if (length(flatquery) == 0)
stop("'query' is empty - nothing to do")
## from now on, only use 'flatquery' (GRanges object) with names in 'querynames' and lengthes in 'querylengths'
## apply 'mask' to flatquery -------------------------------------------
if (!is.null(mask)) {
if (!inherits(mask, "GRanges"))
stop("'mask' must be an object of type 'GRanges'")
message("removing 'mask' ranges from 'query'...", appendLF = FALSE)
BiocGenerics::strand(mask) <- "*"
mask <- GenomicRanges::reduce(mask)
ov <- GenomicRanges::findOverlaps(flatquery, mask)
qOM <- unique(S4Vectors::queryHits(ov))
gr1 <- GenomicRanges::GRanges(
seqnames = S4Vectors::Rle(qOM),
ranges = IRanges::IRanges(start = BiocGenerics::start(flatquery)[qOM],
end = BiocGenerics::end(flatquery)[qOM])
)
gr2 <- GenomicRanges::GRanges(
seqnames = S4Vectors::Rle(S4Vectors::queryHits(ov)),
ranges = IRanges::IRanges(
start = BiocGenerics::start(mask)[S4Vectors::subjectHits(ov)],
end = BiocGenerics::end(mask)[S4Vectors::subjectHits(ov)]
)
)
SD <- BiocGenerics::setdiff(gr1, gr2)
notOverlappingMaskInd <- which(!((seq_len(length(flatquery))) %in% qOM))
completelyMaskedInd <-
qOM[!(qOM %in% as.numeric(as.character(unique(GenomicRanges::seqnames(SD)))))]
# 'zeroquery' contains regions that are completely masked (will get zero count and length)
#zeroquery <- flatquery[completelyMaskedInd]
tmpnames <- querynames[completelyMaskedInd]
zeroquerynames <- c(zeroquerynames, tmpnames[!(tmpnames %in% querynames[-completelyMaskedInd])])
#zeroquerylengths <- rep(0L, length(completelyMaskedInd))
# masked 'flatquery' maybe split into several non-masked pieces
flatquery <- c(flatquery[notOverlappingMaskInd],
GenomicRanges::GRanges(
seqnames = GenomicRanges::seqnames(flatquery)[as.numeric(as.character(GenomicRanges::seqnames(SD)))],
ranges = IRanges::ranges(SD),
strand = GenomicRanges::strand(flatquery)[as.numeric(as.character(GenomicRanges::seqnames(SD)))],
seqlengths = GenomeInfoDb::seqlengths(flatquery)),
ignore.mcols = TRUE)
querynames <- querynames[c(notOverlappingMaskInd, as.numeric(as.character(GenomicRanges::seqnames(SD))))]
querylengths <- BiocGenerics::width(flatquery)
message("done")
}
## setup tasks for parallelization -------------------------------------
## TODO: if sum(width(flatquery)) close to sum(seqlengths(genome)) -> select variant counting algorithm (sequential walk through bamfiles)
if (!is.null(clObj) & inherits(clObj, "cluster", which = FALSE)) {
loadQuasR(clObj)
taskIByFlatQuery <- parallel::splitIndices(
nx = length(flatquery),
ncl = ceiling(length(clObj)/nsamples*2)
)
if (inherits(taskIByFlatQuery, "integer", which = FALSE))
taskIByFlatQuery <- list(taskIByFlatQuery) # make sure taskIByFlatQuery is a list, even if ceiling(length(clObj) /nsamples *2)==1
taskSamples <- rep(samples, each = length(taskIByFlatQuery))
taskBamfiles <- rep(bamfiles, each = length(taskIByFlatQuery))
flatquery <- lapply(taskIByFlatQuery, function(i) flatquery[i])
shifts <- rep(shifts, each = length(taskIByFlatQuery))
myapply <- function(...) {
ret <- parallel::clusterMap(clObj, ..., SIMPLIFY = FALSE,
.scheduling = "dynamic")
## fuse
iBySample <- split(seq_along(ret),names(ret))[unique(names(ret))]
names(ret) <- NULL
if (!is.na(proj@snpFile)) {
ret <- do.call(cbind, lapply(iBySample, function(i)
do.call(rbind, ret[i])))
postfix <- substring(colnames(ret), nchar(colnames(ret)))
## rename
dimnames(ret) <- list(querynames,
paste(rep(samples, each = 3),
postfix, sep = "_"))
} else {
ret <- do.call(cbind, lapply(iBySample, function(i)
do.call(c, ret[i])))
## rename
dimnames(ret) <- list(querynames, samples)
}
ret
}
} else {
taskSamples <- samples
taskBamfiles <- bamfiles
flatquery <- list(flatquery)
myapply <- function(...) {
ret <- do.call(cbind, mapply(..., SIMPLIFY = FALSE))
## rename
if (!is.na(proj@snpFile))
dimnames(ret) <- list(querynames,
paste(rep(samples, each = 3),
substring(colnames(ret),
nchar(colnames(ret))),
sep = "_"))
else
dimnames(ret) <- list(querynames, samples)
ret
}
}
## count alignments ----------------------------------------------------
message("counting alignments...", appendLF = FALSE)
res <- myapply(countAlignments,
bamfile = taskBamfiles,
regions = flatquery,
shift = shifts,
MoreArgs = list(
selectReadPosition = selectReadPosition,
orientation = orientation,
useRead = useRead,
broaden = broaden,
allelic = !is.na(proj@snpFile),
includeSpliced = includeSpliced,
includeSecondary = includeSecondary,
mapqmin = as.integer(mapqMin)[1],
mapqmax = as.integer(mapqMax)[1],
absisizemin = as.integer(absIsizeMin)[1],
absisizemax = as.integer(absIsizeMax)[1]))
message("done")
## collapse (sum) counts by sample if necessary
if (nsamples > length(unique(samples))) {
if (collapseBySample) {
message("collapsing counts by sample...", appendLF = FALSE)
if (is.na(proj@snpFile))
iBySample <- split(seq_len(nsamples), samples)[unique(samples)]
else
iBySample <- split(seq_len(ncol(res)),
colnames(res))[unique(colnames(res))]
res <- do.call(cbind, lapply(iBySample, function(i)
rowSums(res[, i, drop = FALSE])))
message("done")
} else {
# unify non-collapsed identical sample names
if (is.na(proj@snpFile))
colnames(res) <- displayNames(proj)
else
colnames(res) <- paste(rep(displayNames(proj), each = 3),
substring(colnames(res),
nchar(colnames(res))), sep = "_")
}
}
## add the region width as first column
res <- cbind(width = querylengths, res)
rm(querylengths)
## collapse (sum) counts by 'querynames'
if (length(querynames) > length(unique(querynames))) {
message("collapsing counts by query name...", appendLF = FALSE)
iByQuery <- split(seq_len(nrow(res)), querynames)[unique(querynames)]
res <- do.call(rbind, lapply(iByQuery, function(i)
colSums(res[i, seq_len(ncol(res)), drop = FALSE])))
rownames(res) <- querynames <- names(iByQuery)
message("done")
}
if (length(zeroquerynames) > length(unique(zeroquerynames)))
zeroquerynames <- unique(zeroquerynames)
## combine with zeroquery and reorder according to 'query'
res2 <- matrix(0, nrow = length(querynames) + length(zeroquerynames),
ncol = ncol(res),
dimnames = list(if(inherits(query, "TxDb"))
sort(c(querynames, zeroquerynames))
else if(is.null(names(query)))
as.character(seq_len(length(query)))
else unique(names(query)),
colnames(res)))
res2[rownames(res), ] <- res
## return results
return(res2)
}
}
## count junctions (with the C-function) for single bamfile and optionally selected target sequences
## return a named vector with junction elements (names of the form "chromosome:first_intronic_base:last_intronic_base:strand")
#' @keywords internal
#' @importFrom Rsamtools scanBamHeader
countJunctionsOneBamfile <- function(bamfile, targets, allelic,
includeSecondary, mapqmin, mapqmax) {
tryCatch({ # try catch block goes through the whole function
# prepare region vectors
bh <- Rsamtools::scanBamHeader(bamfile)[[1]]$targets
tid <- seq_along(bh) - 1L
if (!is.null(targets)) {
if (any(is.na(i <- match(targets, names(bh)))))
stop(sprintf("some targets not found in bamfile '%s': %s",
bamfile, targets[which(is.na(i))]))
bh <- bh[i]
tid <- tid[i]
}
start <- rep(0L, length(tid)) # samtools library has 0-based inclusive start
end <- unname(bh) ## samtool library has 0-based exclusiv end
# count junctions
count <- .Call(countJunctions, bamfile, tid, start, end, allelic,
includeSecondary, mapqmin, mapqmax)
return(count)
}, error = function(ex) {
emsg <- paste("Internal error on", Sys.info()['nodename'],
"query bamfile", bamfile,
"\n Error message is:", ex$message)
stop(emsg)
})
}
## count alignments (with the C-function) for single bamfile, single shift, and single set of regions
## return a numeric vector with length(regions) elements (same order as regions)
#' @keywords internal
#' @importFrom Rsamtools scanBamHeader
#' @importFrom GenomicRanges seqnames
#' @importFrom BiocGenerics strand start end match as.vector
countAlignments <- function(bamfile, regions, shift, selectReadPosition, orientation,
useRead, broaden, allelic, includeSpliced, includeSecondary,
mapqmin, mapqmax, absisizemin, absisizemax) {
tryCatch({ # try catch block goes through the whole function
## translate seqnames to tid and create region data.frame
seqnamesBamHeader <- names(Rsamtools::scanBamHeader(bamfile)[[1]]$targets)
## prepare region vectors
#tid <- IRanges::as.vector(IRanges::match(seqnames(regions), seqnamesBamHeader)) - 1L
tid <- BiocGenerics::as.vector(BiocGenerics::match(GenomicRanges::seqnames(regions),
seqnamesBamHeader) - 1L)
start <- BiocGenerics::start(regions) - 1L ## samtool library has 0-based inclusiv start
end <- BiocGenerics::end(regions) ## samtools library has 0-based exclusive end
## swap strand for 'orientation="opposite"'
if (orientation == "any")
strand <- rep("*", length(regions))
else if (orientation == "opposite")
strand <- c("+"="-", "-"="+", "*"="*")[as.character(BiocGenerics::strand(regions))]
else # orientation == "same"
strand <- as.character(BiocGenerics::strand(regions))
## translate useRead parameter
BAM_FREAD1 <- 64L
BAM_FREAD2 <- 128L
if (useRead == "any")
readBitMask <- BAM_FREAD1 + BAM_FREAD2
else if (useRead == "first")
readBitMask <- BAM_FREAD1
else if (useRead == "last")
readBitMask <- BAM_FREAD2
## translate includeSecondary parameter
BAM_FSECONDARY <- 256L
if (includeSecondary)
readBitMask <- readBitMask + BAM_FSECONDARY
## get counts
if (!allelic) {
count <- .Call(countAlignmentsNonAllelic, bamfile, tid, start, end, strand,
selectReadPosition, readBitMask, shift, broaden, includeSpliced,
mapqmin, mapqmax, absisizemin, absisizemax)
} else {
count <- as.matrix(as.data.frame(
.Call(countAlignmentsAllelic, bamfile, tid, start, end, strand,
selectReadPosition, readBitMask, shift, broaden, includeSpliced,
mapqmin, mapqmax, absisizemin, absisizemax)
))
}
return(count)
}, error = function(ex) {
reg <- regions[c(1, length(regions))]
emsg <- paste("Internal error on", Sys.info()['nodename'],
"query bamfile", bamfile,"with regions\n",
paste(GenomicRanges::seqnames(reg), BiocGenerics::start(reg),
"-" , BiocGenerics::end(reg),
BiocGenerics::strand(reg), collapse = "\n\t...\n"),
"\n Error message is:", ex$message)
stop(emsg)
})
}
# ## Counts alignments in a given set of regions which are located in a subspace of the genome
# ## using a per-base-coverage vector approach
# ## shift the read, broaden fetch region,
# ## return a numeric vector with length(regions) elements (same order as regions)
# countAlignmentsSubregionsC <- function(bamfile, regions, selectReadPosition, shift=0L, broaden=0L, includeSpliced=TRUE)
# {
# ## check if region are located on one chromosme
# seqName <- unique(seqnames(regions))
# if(length(seqName) > 1L)
# stop("regions should only be located on one chromosome")
#
# ## check broaden and shift parameter
# if(broaden < 0)
# stop("'broaden' should not be negative")
# # if(shift > 0 && selectReadPosition="midwithin")
# # stop("'shift' parameter must be zero if 'selectReadPosition' is set to midwithin")
# # if(broaden > 0 && (selectReadPosition="startwithin" || selectReadPosition="endwithin"))
# # stop("'broaden' parameter must be zero if 'selectReadPosition' is set to startwithin or endwithin")
#
# ## translate seqName to tid
# seqnamesList <- names(scanBamHeader(bamfile)[[1]]$targets)
# tidList <- as.integer(seq_along(seqnamesList)-1)
# tid <- tidList[ match(seqName, seqnamesList) ]
#
# ## convert grange to data.frame
# ## with 0-based start inclusive
# ## with 0-based end exclusive
# regionsTable <- data.frame(start=as.integer(start(regions)-1), ## samtool library has 0-based start
# end=as.integer(end(regions)),
# strand=as.character(strand(regions)),
# stringsAsFactors=FALSE
# )
#
# ## call c-function
# cnt <- .Call(countAlignmentsSubregions, bamfile, bamfile, tid, min(regionsTable$start), max(regionsTable$end),
# regionsTable, as.integer(shift), as.integer(broaden), selectReadPosition, includeSpliced)
#
# return(cnt)
# }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.