Nothing
## query : GRanges : value is list of matrices:
## one list element per sample with matrix:
## one(normal) or three(allelic) rows per unique query name and max(width(query)) columns
## e.g. res[['Sample1']]['HCP_TSS','-100']
qProfile <-
function(proj,
query,
upstream=1000,
downstream=upstream,
selectReadPosition=c("start", "end"),
shift=0L,
orientation=c("any","same","opposite"),
useRead=c("any","first","last"),
auxiliaryName=NULL,
mask=NULL,
collapseBySample=TRUE,
includeSpliced=TRUE,
includeSecondary=TRUE,
mapqMin=0L,
mapqMax=255L,
absIsizeMin=NULL,
absIsizeMax=NULL,
maxInsertSize=500L,
binSize=1L,
clObj=NULL) {
## setup variables from 'proj' -----------------------------------------------------------------------
## 'proj' is correct type?
if(!inherits(proj, "qProject", which=FALSE))
stop("'proj' must be an object of type 'qProject' (returned by 'qAlign')")
samples <- proj@alignments$SampleName
nsamples <- length(samples)
bamfiles <-
if(is.null(auxiliaryName))
proj@alignments$FileName
else if(!is.na(i <- match(auxiliaryName, proj@aux$AuxName)))
unlist(proj@auxAlignments[i,], use.names=FALSE)
else
stop("unknown 'auxiliaryName', should be one of: NULL, ",
paste(sprintf("'%s'", proj@aux$AuxName), collapse=", "))
## validate parameters -------------------------------------------------------------------------------
if(!is.numeric(upstream))
stop("'upstream' must be of type 'numeric'")
if(!is.numeric(downstream))
stop("'downstream' must be of type 'numeric'")
upstream <- as.integer(upstream)
downstream <- as.integer(downstream)
selectReadPosition <- match.arg(selectReadPosition)
orientation <- match.arg(orientation)
useRead <- match.arg(useRead)
if(!is.logical(collapseBySample) || length(collapseBySample) != 1L)
stop("'collapseBySample' must be either TRUE or FALSE")
if(!is.logical(includeSpliced) || length(includeSpliced) != 1L)
stop("'includeSpliced' must be either TRUE or FALSE")
if(!is.logical(includeSecondary) || length(includeSecondary) != 1L)
stop("'includeSecondary' must be either TRUE or FALSE")
if((!is.null(absIsizeMin) || !is.null(absIsizeMax)) && proj@paired == "no")
stop("'absIsizeMin' and 'absIsizeMax' can only be used for paired-end experiments")
if(is.null(absIsizeMin)) # -1L -> do not apply TLEN filtering
absIsizeMin <- -1L
if(is.null(absIsizeMax))
absIsizeMax <- -1L
if(!is.numeric(maxInsertSize) || length(maxInsertSize) != 1L)
stop("'maxInsertSize' must be a numerical scalar")
if(!is.numeric(binSize) || length(binSize) != 1L || binSize <= 0)
stop("'binSize' must be a single numerical value greater than zero")
if(binSize %% 2 != 1)
stop("'binSize' must be an odd number")
## check shift
if(length(shift) == 1 && shift == "halfInsert") {
if(proj@paired == "no") {
stop("'shift=\"halfInsert\"' can only be used for paired-end experiments")
} else {
shifts <- rep(-1000000L, nsamples)
broaden <- as.integer(ceiling(maxInsertSize/2))
}
} else {
if(!is.numeric(shift) || (length(shift)>1 && length(shift)!=nsamples))
stop(sprintf("'shift' must be 'halfInsert', a single integer or an integer vector with %d values",nsamples))
else if(length(shift) == 1)
shifts <- rep(as.integer(shift), nsamples)
else
shifts <- as.integer(shift)
broaden <- 0L
}
## all query chromosomes present in all bamfiles?
trTab <- table(unlist(lapply(scanBamHeader(bamfiles), function(bh) names(bh$targets))))
trCommon <- names(trTab)[trTab==length(bamfiles)]
if(any(f <- !(seqlevels(query) %in% trCommon)))
stop(sprintf("sequence levels in 'query' not found in alignment files: %s",
paste(seqlevels(query)[f],collapse=", ")))
## 'useRead' set but not a paired-end experiment?
if(useRead != "any" && proj@paired == "no")
warning("ignoring 'useRead' for single read experiments")
## preprocess query -------------------------------------------------------------------------------
## --> extract 'querynames', 'refpos', 'queryWin', 'maxUp', 'maxDown', 'maxUpBin', 'maxDownBin'
## split into 'queryWinL', 'refposL', 'querynamesIntL' by name
## GRanges query -------------------------------------------------------------------------------
if(inherits(query,"GRanges")) {
# define 'querynames'
if(!is.null(names(query)))
querynames <- names(query)
else
querynames <- rep("query",length(query)) # combine all regions
# define 'refpos' and 'queryWin'
if(length(upstream)==1)
upstream <- rep(upstream,length(query))
else if(length(upstream)!=length(query))
stop(sprintf("the length of 'upstream' (%d) must be one or the same as 'query' (%d)",
length(upstream), length(query)))
if(length(downstream)==1)
downstream <- rep(downstream,length(query))
else if(length(downstream)!=length(query))
stop(sprintf("the length of 'downstream' (%d) must be one or the same as 'query' (%d)",
length(downstream), length(query)))
# adjust 'upstream', 'downstream' to include full bins
upstream <- ceiling((upstream - (binSize - 1)/2) / binSize) * binSize + (binSize - 1)/2
downstream <- ceiling((downstream - (binSize - 1)/2) / binSize) * binSize + (binSize - 1)/2
# define 'maxUp', 'maxDown', 'maxUpBin' and 'maxDownBin'
# (have one bin that is centered at anchor point, relative position of bin mid at zero)
# maxUp, maxDown: number of bases upstream/downstream of start(query) --> includes half of middle bin, but not the anchor posittion
# (total number of covered bases: maxUp + maxDown + 1)
# maxUpBin, maxDownBin: number of bins upstream/downstream of middle bin
# (total number of bins: maxUpBin + maxDownBin + 1)
maxUpBin = as.integer(ceiling((max(upstream) + 1 - (binSize + 1)/2)/binSize))
maxDownBin = as.integer(ceiling((max(downstream) + 1 - (binSize + 1)/2)/binSize))
maxUp = as.integer(maxUpBin * binSize + (binSize - 1)/2)
maxDown = as.integer(maxDownBin * binSize + (binSize - 1)/2)
# err = (maxUp + maxDown + 1) %% binSize # has to be zero
binNames <- as.character(seq(-maxUpBin * binSize, maxDownBin * binSize, by = binSize))
if((maxUpBin + maxDownBin + 1) > 20000L)
warning(sprintf("profiling over large region (%d basepairs, %d bins) - may be slow and require a lot of memory",
maxUp + maxDown + 1, maxUpBin + maxDownBin))
plusStrand <- as.character(strand(query)) != "-"
refpos <- ifelse(plusStrand, start(query), end(query))
queryWin <- GRanges(seqnames(query),
IRanges(start = pmax(1, ifelse(plusStrand, refpos -upstream, refpos -downstream)),
end = ifelse(plusStrand, refpos +downstream, refpos +upstream)),
strand = strand(query))
# split 'queryWin' --> 'queryWinL' and 'refpos' --> 'refposL' by 'querynames'
if(!is.null(clObj) & inherits(clObj,"cluster",which=FALSE)) {
profileChunkL <- split(seq_len(length(queryWin)), factor(querynames,levels=unique(querynames)))
approxNumRegionsPerChunk <- length(queryWin) /(length(clObj)/length(bamfiles))
profileChunkToTask <- round(cumsum(sapply(profileChunkL,length)) /approxNumRegionsPerChunk)
taskL <- lapply(split(seq_along(profileChunkL), profileChunkToTask), function(i) do.call(c,profileChunkL[i]))
} else {
taskL <- list(seq_len(length(queryWin))) # single task per bam file
}
queryWinL <- lapply(taskL, function(i) queryWin[i])
refposL <- lapply(taskL, function(i) refpos[i])
querynamesInt <- as.integer(factor(querynames,levels=unique(querynames)))
querynamesIntL <- lapply(taskL, function(i) querynamesInt[i])
# calculate coverage
cvgBaseL <- lapply(seq_along(queryWin), function(i) (maxUp-upstream[i]+1):(maxUp+downstream[i]+1))
cvgBase <- do.call(rbind, lapply(split(seq_along(queryWin),factor(querynames,levels=unique(querynames))),
function(i) tabulate(unlist(cvgBaseL[i]), nbins = maxUp + maxDown + 1)))
cvg <- do.call(cbind, lapply(split(seq.int(maxUp + maxDown + 1), rep(seq(-maxUpBin, maxDownBin), each = binSize)),
function(i) rowSums(cvgBase[, i, drop = FALSE])))
colnames(cvg) <- binNames
} else {
stop("'query' must be an object of type 'GRanges'")
}
## from now on, only use 'queryWinL' (named list of GRanges objects) with reference positions in 'refposL'
## apply 'mask' to queryWinL ----------------------------------------------------------------------------
if(!is.null(mask)) {
if(!inherits(mask,"GRanges"))
stop("'mask' must be an object of type 'GRanges'")
stop("'mask' is not yet supported for qProfile")
}
## setup tasks for parallelization -------------------------------------------------------------------
if(!is.null(clObj) & inherits(clObj, "cluster", which=FALSE)) {
message("preparing to run on ", length(clObj), " nodes...", appendLF=FALSE)
ret <- clusterEvalQ(clObj, library("QuasR")) # load libraries on nodes
if(!all(sapply(ret, function(x) "QuasR" %in% x)))
stop("'QuasR' package could not be loaded on all nodes in 'clObj'")
myapply <- function(...) clusterMap(clObj, ..., SIMPLIFY=FALSE, .scheduling="dynamic")
message("done")
} else {
myapply <- function(...) mapply(..., SIMPLIFY=FALSE)
}
## count alignments ----------------------------------------------------------------------------------
message("profiling alignments...", appendLF=FALSE)
res <- myapply(profileAlignments,
bamfile=rep(bamfiles, each=length(queryWinL)),
queryids=querynamesIntL,
regions=queryWinL,
refpos=refposL,
shift=rep(shifts, each=length(queryWinL)),
MoreArgs=list(
selectReadPosition=selectReadPosition,
orientation=orientation,
useRead=useRead,
broaden=broaden,
allelic=!is.na(proj@snpFile),
maxUp=maxUp,
maxDown=maxDown,
maxUpBin=maxUpBin,
maxDownBin=maxDownBin,
includeSpliced=includeSpliced,
includeSecondary=includeSecondary,
mapqmin=as.integer(mapqMin)[1],
mapqmax=as.integer(mapqMax)[1],
absisizemin=as.integer(absIsizeMin)[1],
absisizemax=as.integer(absIsizeMax)[1],
binsize=as.integer(binSize),
binNames=binNames))
message("done")
## fuse by input file, rename and collapse by sample
if(!is.na(proj@snpFile)) {
res <- lapply(split(seq_along(res), rep(displayNames(proj), each=length(queryWinL))),
function(i) {
tmpR <- do.call(rbind, lapply(res[i],"[[","R"))
tmpU <- do.call(rbind, lapply(res[i],"[[","U"))
tmpA <- do.call(rbind, lapply(res[i],"[[","A"))
rownames(tmpR) <- rownames(tmpU) <- rownames(tmpA) <- unique(querynames)
list(R=tmpR, U=tmpU, A=tmpA)
})
if(collapseBySample) {
res <- lapply(split(seq_along(res), factor(samples,levels=unique(samples))),
function(i) list(R=Reduce("+", lapply(res[i],"[[","R")),
U=Reduce("+", lapply(res[i],"[[","U")),
A=Reduce("+", lapply(res[i],"[[","A"))))
}
nms <- paste(rep(names(res),each=3),c("R","U","A"),sep="_")
res <- do.call(c, res)
names(res) <- nms
} else {
res <- lapply(split(seq_along(res), rep(displayNames(proj), each=length(queryWinL))),
function(i) {
tmp <- do.call(rbind, res[i])
rownames(tmp) <- unique(querynames)
tmp
})
if(collapseBySample) {
res <- lapply(split(seq_along(res), factor(samples,levels=unique(samples))),
function(i) Reduce("+", res[i]))
}
}
## add the region coverage as first elemnt
res <- c(list(coverage=cvg), res)
## return results
return(res)
}
## profile alignments (with the C-function) for single bamfile, multiple regions, single shift
## return a numeric vector with maxWidth elements corresponding to the positions within the query regions
profileAlignments <- function(bamfile, queryids, regions, refpos, shift, selectReadPosition,
orientation, useRead, broaden, allelic, maxUp, maxDown, maxUpBin, maxDownBin,
includeSpliced, includeSecondary,
mapqmin, mapqmax, absisizemin, absisizemax, binsize, binNames)
{
tryCatch({ # try catch block contains whole function
# translate seqnames to tid and create region data.frame
seqnamesBamHeader <- names(scanBamHeader(bamfile)[[1]]$targets)
# prepare region vectors
tid <- as.vector(match(seqnames(regions), seqnamesBamHeader) - 1L)
s <- start(regions) - 1L # Samtools library has 0-based inclusive start
e <- end(regions) # Samtools library has 0-based exclusive end
rp <- refpos - 1L # Samtools library has 0-based inclusive start
## swap selstrand for 'orientation="opposite"'
regstrand <- as.character(strand(regions))
if(orientation == "any")
selstrand <- rep("*", length(regions))
else if(orientation == "opposite")
selstrand <- c("+"="-", "-"="+", "*"="*")[regstrand]
else # orientation == "same"
selstrand <- regstrand
## translate useRead parameter
BAM_FREAD1 <- 64L
BAM_FREAD2 <- 128L
if(useRead == "any")
readBitMask <- BAM_FREAD1 + BAM_FREAD2
else if (useRead == "first")
readBitMask <- BAM_FREAD1
else if (useRead == "last")
readBitMask <- BAM_FREAD2
## translate includeSecondary parameter
BAM_FSECONDARY <- 256L
if(includeSecondary)
readBitMask <- readBitMask + BAM_FSECONDARY
## count alignments by position
if(!allelic) {
count <- t(.Call(profileAlignmentsNonAllelic, bamfile, queryids, tid, s, e, rp, selstrand, regstrand,
selectReadPosition, readBitMask, shift, broaden, maxUp, maxDown, maxUpBin, maxDownBin,
includeSpliced, mapqmin, mapqmax, absisizemin, absisizemax, binsize, binNames))
} else {
count <- lapply(.Call(profileAlignmentsAllelic, bamfile, queryids, tid, s, e, rp, selstrand, regstrand,
selectReadPosition, readBitMask, shift, broaden, maxUp, maxDown, maxUpBin, maxDownBin,
includeSpliced, mapqmin, mapqmax, absisizemin, absisizemax, binsize, binNames), t)
}
return(count)
}, error = function(ex) {
reg <- regions[c(1, length(regions))]
emsg <- paste("Internal error on ", Sys.info()['nodename'], ", bamfile ", bamfile," with regions\n\t",
paste(seqnames(reg), ":", start(reg), "-" , end(reg), ":", strand(reg), sep="", collapse="\n\t...\n\t"),
"\n Error message is: ", ex$message, sep="")
stop(emsg)
})
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.