Nothing
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## ##
## Project : spliceSites ##
## Created : 29.Okt.2012 ##
## Author : W. Kaisers ##
## Content : Funktionality for working with RNA-seq splice-sites data ##
## based von rbamtools and refGenome packages. ##
## for usage in R ##
## Version : 1.1.1 ##
## ##
## Changelog : ##
## 12.11.12 : Corrected false calculation of gaplen in merge.alignGaps ##
## 12.12.12 : Added Dependency on BiocGenerics ##
## 03.12.12 : Corrected and checked Proteomic pipeline: lJunc,rJunc, ##
## lCodons,rCodons,lrJunc,lrCodons ##
## 04.12.12 : Added keyProfiler class ##
## 11.01.13 : 0.1.1 ##
## 06.02.13 : Added readMergedBamSites function which uses ##
## rbamtools:::bamSiteLists ##
## 18.02.13 : 0.2.1 ##
## 28.05.13 : 0.3.1 ##
## 03.06.13 : 0.3.3 added suppression of scientific notation in ##
## write.annDNAtables ##
## 03.06.13 : 0.3.4 readExpSet ##
## 03.07.13 : 0.4.0 Added truncateSeq (C-routine) and changed ##
## trypsinCleave to C-implementation ##
## 09.07.13 : 0.5.0 Added maxEnts ##
## 10.07.13 : 0.5.1 Changed position entries from 0-based to 1-based ##
## 10.07.13 : 0.5.2 Removed gapRanges, gapProbes and derived (aaX, dnaX) ##
## classes ##
## 18.07.13 : 0.6.0 Renamed gapAligns to alignGaps, removed ##
## readMergedBamGapProbes ##
## 24.07.13 : 0.7.0 Renamed alignGaps to gapSites ##
## 31.07.13 : 0.8.0 Removed dependency on data.table, C-version for ##
## alt_group ##
## 09.08.13 : 0.99.0 All C routines valgrind checked ##
## 21.08.13 : 0.99.7 Review by Marc Carlson: ##
## Added biocViews in DESCRIPTION ##
## (RNAseq,GeneExpression,DifferentialExpression,Proteomics) ##
## Added Collate entry in DESCRIPTION ##
## Added R-registration of C-routines ##
## Switched calloc to R_alloc in C (valgrind tested) ##
## 21.08.13 : 0.99.8 Added functions for hbond score (valgrind tested) ##
## 23.08.13 : 0.99.9 Submission update for Bioconductor ##
## 23.08.13 : 0.99.10 Added get_dna_nmers (valgrind tested) ##
## 03.12.13 : 1.1.1 Changed annotate.gapSites function ##
## 29.01.14 : 1.1.3 Added NA handling for Strings in maxent and hbond ##
## 11.11.14 : 1.3.3 Changed annotation procedure to overlapJuncs ##
## 27.11.15 : 1.11.0 Submitted update to BioC-devel via svn ##
## 27.03.17 : 1.23.5 Submitted update (correction of addGenomeData) ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
.onUnload <- function(libpath) { library.dynam.unload("spliceSites", libpath) }
strandlevels <- c("+", "-", "*")
## Unexported function for creating index values on alphabet (->readExpSet)
alphabetIndex<-function(idxLen, alpha)
{
return(.Call("get_alph_index",
as.integer(idxLen), alpha, PACKAGE="spliceSites"))
}
## 36 character alphabet
index_alphabet <- paste(paste(0:9, collapse=""),
paste(letters, collapse=""), sep="")
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## ##
## Declarations for cRanges: ##
## Centered Ranges ##
## Ranges which contain a pointer to a "position" of interest ##
## inside the range. ##
## Intended to represent a range around an exon-intron boundary ##
## where "position" represents the ##
## 1-based position of the last exon nucleotide ##
## ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("initialize", "cRanges", function(.Object,
seqid=NULL, start=NULL, end=NULL,
width=NULL, strand=NULL, position=0L, id=NULL)
{
## Allows to create an empty object
if(is.null(start) && is.null(end))
return(.Object)
n<-length(start)
if(is.null(id))
id<-1:n
if(!is.null(width))
end<-start + width-1L
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## strand
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
if(is.null(strand))
strand <- factor(rep("*", n), levels=strandlevels)
if(length(strand) == 1)
strand=factor(rep(strand, n), levels=strandlevels)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## seqid
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
if(length(seqid) == 1)
seqid <- factor(rep(seqid, n))
else
seqid <- factor(seqid)
dfr <- data.frame(seqid=seqid, start=start, end=end,
strand=strand, position=as.integer(position), id=id)
.Object@dt <- dfr[order(dfr$seqid, dfr$start, dfr$end), ]
return(.Object)
})
setMethod("initialize", "cdRanges", function(.Object,
seqid=NULL,
start=NULL, end=NULL, width=NULL,
strand=NULL,
position=0L, id=NULL, seq=NULL)
{
.Object <- callNextMethod(.Object, seqid, start,
end, width, strand, position, id)
n <- nrow(.Object@dt)
if(is.null(seq))
{
.Object@seq <- DNAStringSet()
}else if(is.character(seq)){
.Object@seq <- DNAStringSet(seq)
}else if(is(seq, "DNAStringSet")){
.Object@seq <- seq
}else{
stop("[initialize.cdRanges] seq must be character or DNAStringSet!")
}
return(.Object)
})
setValidity("cRanges", function(object)
{
if(any(object@dt$end-object@dt$start<0))
return("end must be >= start!")
## + + + + + + + + + + + +
## Range: 1 2 3 4
## Position: 3
## End-Start=3
## + + + + + + + + + + + +
if(any(object@dt$position>(object@dt$end-object@dt$start + 1)))
return("position must be inside of range (i.e. <= end-start + 1)!")
else
return(TRUE)
})
setMethod("seqid", "cRanges", function(x) return(x@dt$seqid))
setMethod("start", "cRanges", function(x) return(x@dt$start))
setMethod("end", "cRanges", function(x) return(x@dt$end))
setMethod("id", "cRanges", function(x) return(x@dt$id))
setMethod("strand","cRanges", function(x,...) return(x@dt$strand))
setMethod("count", "cRanges", function(x) return(nrow(x@dt)))
setMethod("width", "cRanges", function(x) return(x@dt$end - x@dt$start + 1L))
setMethod("getSequence","cdRanges", function(x) return(x@seq))
setMethod("getSequence","caRanges", function(x) return(x@seq))
setMethod("sortTable", "cRanges", function(x)
{
o <- order(x@dt$seqid, x@dt$start, x@dt$end)
cr <- new(class(x))
cr@dt <- x@dt[o, ]
if(.hasSlot(x, "seq"))
cr@seq <- x@seq[o]
return(cr)
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## xCodons functions
## Do upstream frame-shift and downstream trim to full codon.
## Position values are updated
## (position = 1-based position of last exon nucleotide)
## Eventually overwrite strand column
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
##
## lCodons:
## Left side frame-shift and right-trim to full codon
##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("lCodons", "cRanges", function(x, frame=1, keepStrand=TRUE)
{
if(!is.element(frame, 1:3))
stop("[lCodons.cRanges] frame must be one of 1, 2, 3!")
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Convert frame into 0-based shift value
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
nframe <- as.integer(frame - 1)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create return object
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
cr <- new("cRanges")
cr@dt <- x@dt
cr@dt$start <- cr@dt$start + nframe
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Calculate width and truncate
## width to full codon length
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
w <- cr@dt$end - cr@dt$start + 1L
w <- w - w%%3
cr@dt$end <- cr@dt$start + w-1L
cr@dt$position <- cr@dt$position-nframe
n <- nrow(cr@dt)
cr@dt$frame <- rep(frame, n)
if(!keepStrand)
cr@dt$strand <- factor(rep("+", n), levels=strandlevels)
if(any(cr@dt$position<0))
message("[lCodons.cRanges] Found position <0!")
if(any(cr@dt$position>w))
message("[lCodons.cRanges] Found position > width!")
return(cr)
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
##
## rCodons:
## Right side frame-shift and left-trim to full codon
## Intended to be used in combination with reverseComplement (later on)
##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("rCodons", "cRanges", function(x, frame=1, keepStrand=TRUE){
if(!is.element(frame, 1:3))
stop("[rCodons.cRanges] frame must be one of 1,2,3!")
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Convert frame into 0-based shift value
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
nframe <- as.integer(frame-1)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create return object
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
cr <- new("cRanges")
cr@dt <- x@dt
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Right shift
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
cr@dt$end <- cr@dt$end-nframe
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Calculate width and truncate
## width to full codon length
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
w <- cr@dt$end-cr@dt$start + 1L
w <- w-w%%3
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Left truncation
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
cr@dt$start <- cr@dt$end-w + 1L
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Correct position
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
cr@dt$position <- cr@dt$position-nframe
n <- nrow(cr@dt)
cr@dt$frame <- rep(frame, n)
if(!keepStrand)
cr@dt$strand <- factor(rep("-", n), levels=strandlevels)
if(any(cr@dt$position<0))
message("[rCodons.cRanges] Found position <0!")
if(any(cr@dt$position>w))
message("[rCodons.cRanges] Found position > width!")
return(cr)
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Fills the 'seq' slot.
## Possibly removes lines (when removeUnknownStrand is set)
## Result is position sorted
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("dnaRanges",
c("cRanges", "DNAStringSet", "missing"),
function(x, dnaset)
dnaRanges(x, dnaset, useStrand=TRUE,
removeUnknownStrand=TRUE, verbose=TRUE))
setMethod("dnaRanges",
c("cRanges", "DNAStringSet", "logical"),
function(x, dnaset, useStrand=TRUE,
removeUnknownStrand=TRUE, verbose=TRUE, ...)
{
# if(!is(dnaset, "DNAStringSet"))
# stop("dnaset must be DNAStringSet!")
#
# if(!is.logical(useStrand))
# stop("useStrand must be logical!")
dnames <- names(dnaset)
if(is.null(dnames))
stop("[dnaRanges.cRanges] names(dnaset) must not be NULL!")
bm <- Sys.localeconv()[7]
if(removeUnknownStrand && useStrand[1])
{
lstr <- x@dt$strand != "*"
nlstr <- sum(lstr)
if(nlstr == 0)
stop("Removing all(!) rows from cRanges due to unknown strand!")
if(verbose)
message("[dnaRanges.cRanges] Removing ",
format(length(lstr) - nlstr, big.mark=bm),
" rows from cRanges due to unknown strand (rest: ",
format(nlstr, big.mark=bm),
" rows).")
x@dt <- x@dt[lstr, ]
}
n <- length(dnames)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## New table as template
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
ans <- x@dt[0, ]
ans$seq <- character()
for(i in 1:n)
{
lg <- as.logical(seqid(x) == dnames[i])
if(any(lg))
{
dt <- x@dt[x@dt$seqid == dnames[i], ]
dt$seq <- as.character(Views(dnaset[[i]], dt$start, dt$end))
# Add data for seqid to data.frame
ans <- rbind(dt, ans)
}
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Sort ans
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
ans <- ans[order(ans$seqid, ans$start, ans$end), ]
if(nrow(ans) == 0)
stop("No match of sequence names between ",
"seqid(cRanges) and names(dnaset)!")
dss <- DNAStringSet(ans$seq)
ans$seq <- NULL
if(useStrand)
{
lg <- as.logical(ans$strand == "-")
dss[lg] <- reverseComplement(dss[lg])
if(verbose)
{
if(removeUnknownStrand)
{
message("[dnaRanges.cRanges] Reversed ",
format(sum(lg), big.mark=bm), " sequences.")
}else{
message("[dnaRanges.cRanges] useStrand: ",
"'*' is treated as ' + '. Reversed ",
format(sum(lg), big.mark=bm), " sequences.")
}
}
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create returned object
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
res=new("cdRanges")
res@dt <- ans
res@seq <- dss
return(res)
})
setMethod("extractRange", "gapSites", function(object, seqid, start, end)
{
if(!is.character(seqid))
stop("[extractRange.gapSites] seqid must be character!")
if(!is.numeric(start))
stop("[extractRange.gapSites] start must be numeric!")
start <- as.integer(start)
if(!is.numeric(end))
stop("[extractRange.gapSites] end must be numeric!")
end <- as.integer(end)
if(start >= end)
stop("[extractRange.gapSites] end must be greater than start!")
sq <- object@dt$seqid == seqid
st <- object@dt$lstart >= start
se <- object@dt$rend <= end
res <- sq & st & se
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create returned object
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
exr <- new(class(object))
exr@dt <- object@dt[res, ]
exr@nAligns <- object@nAligns
exr@nAlignGaps <- object@nAlignGaps
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Add annotation table
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
mtc <- match(exr@dt$id, object@annotation$id)
exr@annotation <- object@annotation[mtc, ]
if(.hasSlot(object, "seq"))
exr@seq <- object@seq[res]
return(exr)
})
setMethod("extractRange", "cRanges", function(object, seqid, start, end)
{
if(!is.character(seqid))
stop("seqid must be character!")
if(!is.numeric(start))
stop("start must be numeric!")
start <- as.integer(start)
if(!is.numeric(end))
stop("end must be numeric!")
end <- as.integer(end)
if(start >= end)
stop("end must be greater than start!")
sq <- object@dt$seqid == seqid
st <- object@dt$start >= start
se <- object@dt$end <= end
res <- sq & st & se
exr <- new(class(object))
exr@dt <- object@dt[res, ]
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## cdRanges and caRanges
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
if(.hasSlot(object, "seq"))
exr@seq <- object@seq[res]
return(exr)
})
setMethod("extractByGeneName", "cRanges", function(object, geneNames, src, ...)
{
if(missing(geneNames))
stop("geneNames argument is not optional!")
if(missing(src))
stop("src argument is not optional!")
if(!is.character(geneNames))
stop("geneNames must be character!")
if(!is(src, "refGenome"))
stop("src must be of class 'refGenome'!")
reg <- extractByGeneName(src, geneNames)
gpos <- getGenePositions(reg)
n <- nrow(gpos)
if(n == 0)
stop("Empty gene position table (no matches in geneNames?)!")
res <- extractRange(object, seqid=as.character(gpos$seqid[1]),
start=gpos$start[1], end=gpos$end[1])
if(n>1)
{
for(i in 1:n)
res <- c(res, extractRange(object,
seqid=as.character(gpos$seqid[i]), start=gpos$start[i],
end=gpos$end[i]))
}
return(res)
})
setMethod("extractByGeneName", "gapSites", function(object, geneNames, src, ...)
{
if(missing(geneNames))
stop("geneNames argument is not optional!")
if(missing(src))
stop("src argument is not optional!")
if(!is.character(geneNames))
stop("geneNames must be character!")
if(!is(src, "refGenome"))
stop("src must be of class 'refGenome'!")
reg <- extractByGeneName(src, geneNames)
gpos <- getGenePositions(reg)
n <- nrow(gpos)
if(n == 0)
stop("Empty gene position table (no matches in geneNames?)!")
res <- extractRange(object, seqid=as.character(gpos$seqid[1]),
start=gpos$start[1], end=gpos$end[1])
if(n>1)
{
for(i in 1:n)
{
res <- c(res, extractRange(object,
seqid=as.character(gpos$seqid[i]), start=gpos$start[i],
end=gpos$end[i]))
}
}
return(res)
})
setMethod("seqlogo", "cdRanges", function(x, strand="+", useStrand=TRUE, ...)
{
if(length(x@seq) == 0)
stop("length(DNAStringSet) = 0!")
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Generates seqLogo for whole DNAStringSet
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
bm <- Sys.localeconv()[7]
if(useStrand)
{
strandseqs <- x@dt$strand == strand
message("[seqlogo.cdRanges] + : ", format(sum(strandseqs), big.mark=bm),
", total: ", format(nrow(x@dt), big.mark=bm), ".")
if(sum(strandseqs) == 0)
stop("No range found for strand '", strand, "'!")
cs <- consensusMatrix(x@seq[strandseqs], as.prob=T, baseOnly=TRUE)
}else{
cs <- consensusMatrix(x@seq, as.prob=T, baseOnly=TRUE)
}
pwm <- makePWM(cs[1:4, ])
## Plot seqLogo
seqLogo(pwm, ic.scale=F)
})
setMethod("translate", "cdRanges", function(x)
{
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Exclude data with N's in Sequence
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
af <- alphabetFrequency(x@seq)
withN <- af[, "N"] > 0
noN <- !withN
bm <- Sys.localeconv()[7]
if(sum(withN)>0)
{
message("[translate.cdRanges] Excluding ",
format(sum(withN), big.mark=bm), "/",
format(nrow(x@dt), big.mark=bm), " rows because of N's!")
x@dt <- x@dt[noN, ]
x@seq <- x@seq[noN]
}
ca <- new("caRanges")
ca@dt <- x@dt
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Range: 1 2 3 | 4 5 6 | 7 8 9
## Position: 4
## Corrected position: 2 (4L %/% 3L = 1L)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
ca@dt$position <- (ca@dt$position %/% 3L) + 1L
ca@seq <- translate(x@seq)
return(ca)
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## truncSeq - functions
## Truncation functions for amino-acid containing objects
## Intended to cut sequences at stop-codons and recalculate dependent positions
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
truncate_seq <- function(seq, pos, id, rme=TRUE, trunc=42L)
{
if(!is.character(seq))
stop("seq must be character!")
if(!is.numeric(pos))
stop("pos must be numeric!")
if(length(seq) != length(pos))
stop("seq and pos must have same length!")
if(missing(id))
id <- 1:length(seq)
else{
if(!is.numeric(id))
stop("id must be numeric!")
id <- as.integer(id)
}
pos <- as.integer(pos)
if(!is.logical(rme))
stop("rme must be logical!")
if(!is.numeric(trunc))
stop("trunc must be numeric!")
trunc <- as.integer(trunc)
ret <- .Call("trunc_pos", id, pos, seq, trunc[1], PACKAGE="spliceSites")
if(rme)
ret <- ret[ret$lseq>0, ]
return(ret)
}
trnctsq <- function(x, rme=TRUE, trunc=42L)
{
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## rme: remove empty seqs
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
if(!is.logical(rme))
stop("rme must be logical!")
ret <- .Call("trunc_pos", x@dt$id,
as.integer(x@dt$position), as.character(x@seq),
as.integer(trunc[1]), PACKAGE="spliceSites")
rr <- new(class(x))
if(rme[1])
{
nn <- ret$lseq>0
rr@dt <- x@dt[nn, ]
rr@dt$position <- ret$pos[nn]
rr@dt$lseq <- ret$lseq[nn]
rr@seq <- AAStringSet(ret$seq[nn])
return(rr)
}
rr@dt <- x@dt
rr@dt$position <- ret$pos
rr@dt$lseq <- ret$lseq
rr@seq <- AAStringSet(ret$seq)
return(rr)
}
setMethod("truncateSeq", "caRanges", function(x, rme=TRUE, trunc=42L)
{ trnctsq(x, rme=TRUE, trunc=trunc) })
setMethod("truncateSeq","aaGapSites", function(x, rme=TRUE, trunc=42L)
{ trnctsq(x, rme=TRUE, trunc=trunc)})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## trypsinCleave:
## Performs in silico trypsinization
##
## The C-routine implements the "Keil"-rule, where sites are described by
## the regex "[RK](?!P)". The cut position is between [RK] and
## the following character.
##
## The function returns the fragment which contains the position depicted
## exon-intron boundary.
##
## Sequences where the trypsinization product is shorter then minLen are
## excluded from the result.
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
trclv <- function(x, minLen=5, ...)
{
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## position: 1-based position of last AA on left side
## position must be given as 0-based for "silic_tryp_pos"
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
ret <- .Call("silic_tryp_pos", as.integer(x@dt$id),
as.integer(x@dt$position - 1),
as.character(x@seq), PACKAGE="spliceSites")
nn <- ret$lseq >= minLen
rr <- new(class(x))
rr@dt <- x@dt[nn, ]
rr@dt$position <- ret$pos[nn]
rr@dt$lseq <- ret$lseq[nn]
rr@seq <- AAStringSet(ret$seq[nn])
## Remove too short AA sequences
bm <- Sys.localeconv()[7]
if(sum(nn) == 0)
{
warning(paste("[trypsinCleave] No AA-Sequence left",
"by filter width >= minLen (=", minLen, ")!"))
}
message("[trypsinCleave] Removing ", format(sum(!nn), big.mark=bm),
" sequences shorter than ", minLen, ".")
return(rr)
}
setMethod("trypsinCleave", "caRanges", function(x, minLen=5, ...)
{trclv(x, minLen, ...)} )
setMethod("trypsinCleave", "aaGapSites", function(x, minLen=5, ...)
{trclv(x , minLen, ...)})
silic_tryp <- function(seq, pos, id)
{
if(!is.character(seq))
stop("seq must be character!")
if(!is.numeric(pos))
stop("pos must be numeric!")
if(length(seq) != length(pos))
stop("seq and pos must have same length!")
if(missing(id)){
id <- 1:length(seq)
}else{
if(!is.numeric(id))
stop("id must be numeric!")
id <- as.integer(id)
}
pos <- as.integer(pos)
return(.Call("silic_tryp_pos", id, pos, seq, PACKAGE="spliceSites"))
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("write.files", "caRanges", function(x, path, filename, ...)
{
if(!file.exists(path))
dir.create(path)
tablefile <- file.path(path, paste(filename, ".csv", sep=""))
fastafile <- file.path(path, paste(filename, ".fa", sep=""))
x@dt$fasta_id <- 1:nrow(x@dt)
x@dt$seq <- as.character(x@seq)
names(x@seq) <- paste(x@dt$fasta_id, filename, sep="|")
write.table(x@dt, tablefile, sep=";", dec=",", row.names=F, ...)
writeXStringSet(x@seq, fastafile)
return(invisible())
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## ##
## Definitions for gapSites class ##
## ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Model: ##
## ##
## ##
## 1000 > > > ascending reference positions > > > 2000 ##
## ##
## left right ##
## ##
## lstart lend rstart rend ##
## ###############<- gaplen ->############### ##
## | lfeat | gap | rfeat | ##
## ##
## ##
## alt_left : alt group ##
## alt_right: group alt ##
## ##
## ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
##
## Constructor function:
## Makes 'gapSites' object out of raw coordinate values.
##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
gapSites <- function(
seqid=factor(),
lstart=integer(),
lend=integer(),
rstart=integer(),
rend=integer(),
gaplen,
strand,
nr_aligns=1,
nAligns=sum(nr_aligns),
nAlignGaps=sum(nr_aligns),
nProbes=1)
{
n <- length(seqid)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Return empty object
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
if(n == 0)
return(new("gapSites"))
if(!is(seqid, "factor"))
seqid <- factor(seqid)
if(length(lstart) != n)
{
stop("length(lStart) ", length(lstart),
" unequal length(seqid) ", n, "!")
}
if(length(lend) != n)
{
stop("length(lend) ", length(lend),
" unequal length(seqid) ", n, "!")
}
if(length(rstart) != n)
{
stop("length(rstart) ", length(rstart),
" unequal length(seqid) ", n, "!")
}
if(length(lend) != n){
stop("length(lend) ", length(lend),
" unequal length(seqid) ", n, "!")
}
if(missing(gaplen))
gaplen <- rstart-lend - 1L
if(length(gaplen) != n)
{
stop("length(gaplen) ", length(gaplen),
" unequal length(seqid) ", n, "!")
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Align number values
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
if(length(nr_aligns) == 1)
nr_aligns <- rep(nr_aligns, n)
if(length(nr_aligns) != n)
{
stop("length(nr_aligns) ", length(nr_aligns),
" unequal length(seqid) ", n, "!")
}
if(length(nAligns) != 1)
stop("[gapSites] length(nAligns) ", length(nAligns), " must be 1!")
if(length(nAlignGaps) != 1)
stop("[gapSites] length(nAlignGaps) ", length(nAligns), " must be 1!")
nAligns <- as.double(nAligns)
nAlignGaps <- as.double(nAlignGaps)
nProbes <- as.double(nProbes)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## strand
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
if(missing(strand))
strand <- factor(rep("*", n), levels=strandlevels)
if(length(strand) == 1)
{
if( (strand == 1) || (strand == "-") )
{
strand <- factor(rep("-", n), levels=strandlevels)
}else{
strand <- factor(rep("+", n), levels=strandlevels)
}
}
else if(length(strand) == n)
{
lgl <- logical(n)
if(is.character(strand)){
strand <- factor(ifelse(strand == "+", "+", "-"),
levels=strandlevels)
}else if(is.numeric(strand)){
strand <- factor(ifelse(strand == 0, "+", "-"),
levels=strandlevels)
}else if(!is(strand, "factor")){
if(!setequal(levels(strand), strandlevels))
stop("strand levels must be ' + -*'!")
}
}else{
stop("length(strand) must be 1 or length(seqid)=", length(seqid), "!")
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## rpmg and gptm (align number derived values)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
rpmg_fac <- 1e6/nAlignGaps
gptm_fac <- 1e7/nAligns
gptm <- round(nr_aligns*gptm_fac, 3)
rpmg <- round(nr_aligns*rpmg_fac, 3)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Data table
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
dt <- data.frame(id=1:n, seqid=seqid, lstart=lstart, lend=lend,
rstart=rstart, rend=rend, gaplen=gaplen,
strand=strand, nAligns=nr_aligns, gptm=gptm,
rpmg=rpmg, nProbes=nProbes)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create returned object
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
ga <- new("gapSites")
ga@dt <- dt[order(dt$seqid, dt$lend, dt$rstart), ]
ga@nAligns <- round(nAligns, 0)
ga@nAlignGaps <- round(nAlignGaps, 0)
return(ga)
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
##
## Accessors + Operators for gapSites, dnaGapSites and aaGapSites classes
##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Accessors for table column vectors
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("seqid" , "gapSites",function(x) x@dt$seqid)
setMethod("lstart", "gapSites",function(x) x@dt$lstart)
setMethod("lend" , "gapSites",function(x) x@dt$lend)
setMethod("rstart", "gapSites",function(x) x@dt$rstart)
setMethod("rend" , "gapSites",function(x) x@dt$rend)
setMethod("strand", "gapSites",function(x,...) x@dt$strand)
setMethod("nAligns", "gapSites",function(object) object@nAligns)
setMethod("nAlignGaps","gapSites",function(object) object@nAlignGaps)
setMethod("gptm", "gapSites",function(x) x@dt$gptm)
setMethod("rpmg", "gapSites",function(x) x@dt$rpmg)
setMethod("seqnames", "gapSites",function(x) levels(x@dt$seqid))
setMethod("getProfile","gapSites",function(x) x@profile)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## "[" Operator
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Added this operator for extraction of strand specific subsets as input
## for lJunc or rJunc
setMethod("[", "gapSites", function(x, i, j, drop){
ga <- new("gapSites")
ga@dt <- x@dt[i, ]
if(!is.null(x@annotation))
ga@annotation <- x@annotation[i, ]
ga@nAligns <- x@nAligns
ga@nAlignGaps <- x@nAlignGaps
return(ga)
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## sortTable
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("sortTable", "gapSites", function(x)
{
o <- order(x@dt$seqid, x@dt$lend, x@dt$rstart)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create sorted copy
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
cr <- new(class(x))
cr@dt <- x@dt[o, ]
cr@annotation <- x@annotation[o, ]
cr@nAligns <- x@nAligns
cr@nAlignGaps <- x@nAlignGaps
cr@profile <- x@profile
if(.hasSlot(x, "seq"))
{
cr@seq <- x@seq[o]
}
return(cr)
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Merge for gapSites
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
merge.gapSites <- function(x, y, digits=10, ...)
{
if(!is(y, "gapSites"))
stop("y must be of class gapSites!")
message("strand is set to undefined ('*')!")
bm <- Sys.localeconv()[7]
message("[merge.gapSites] rbind for ",
format(x@nAligns, big.mark=bm, digits=digits),
" aligns from x and ",
format(y@nAligns, big.mark=bm, digits=digits), " aligns from y.")
dt <- rbind(x@dt, y@dt)
nAligns=as.double(x@nAligns) + as.double(y@nAligns)
nAlignGaps=as.double(x@nAlignGaps) + as.double(y@nAlignGaps)
sm <- summaryBy(nAligns + nProbes ~ seqid + lend + rstart,
data=dt, FUN=sum, keep.names=TRUE)
mx <- summaryBy(rend + gaplen ~ seqid + lend + rstart,
data=dt, FUN=max, keep.names=TRUE)
mn <- summaryBy(lstart ~ seqid + lend + rstart,
data=dt, FUN=min, keep.names=TRUE)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create returned object
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
ga <- gapSites(seqid=sm$seqid, lstart=mn$lstart,
lend=sm$lend, rstart=sm$rstart, rend=mx$rend,
gaplen=mx$gaplen, nr_aligns=sm$nAligns,
nAligns=nAligns, nAlignGaps=nAlignGaps ,nProbes=sm$nProbes)
return(ga)
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
##
## Junction Sites
##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("trim_left", "gapSites", function(x, maxlen)
{
if(!is.numeric(maxlen))
stop("maxlen must be numeric!")
if(length(maxlen)>1)
stop("length(maxlen) must be 1!")
maxlen <- as.integer(maxlen)-1L
cobj <- deparse(substitute(x))
etxt <- paste(cobj, "@dt$lstart<-pmax(", cobj, "@dt$lstart,", cobj,
"@dt$lend-", maxlen, ")", sep="")
eval.parent(parse(text=etxt))
return(invisible())
})
setMethod("trim_right", "gapSites", function(x, maxlen)
{
if(!is.numeric(maxlen))
stop("maxlen must be numeric!")
if(length(maxlen)>1)
stop("length(maxlen) must be 1!")
maxlen <- as.integer(maxlen)-1L
cobj <- deparse(substitute(x))
etxt <- paste(cobj, "@dt$rend<-pmax(", cobj, "@dt$rend,", cobj,
"@dt$rstart + ", maxlen, ")", sep="")
eval.parent(parse(text=etxt))
return(invisible())
})
setMethod("resize_left", "gapSites", function(x, len)
{
if(!is.numeric(len))
stop("len must be numeric!")
if(length(len)>1)
stop("length(len) must be 1!")
len <- as.integer(len)-1L
cobj <- deparse(substitute(x))
etxt <- paste(cobj, "@dt$lstart<-", cobj, "@dt$lend-", len, sep="")
eval.parent(parse(text=etxt))
return(invisible())
})
setMethod("resize_right", "gapSites", function(x, len)
{
if(!is.numeric(len))
stop("len must be numeric!")
if(length(len) > 1)
stop("length(len) must be 1!")
len <- as.integer(len) - 1L
cobj <- deparse(substitute(x))
etxt <- paste(cobj, "@dt$rend<-", cobj, "@dt$rstart + ", len, sep="")
eval.parent(parse(text=etxt))
return(invisible())
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
##
## lJunc function
##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("lJunc", "gapSites", function(x, featlen,
gaplen, unique=FALSE, strand, ...)
{
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Prepare featlen and gaplen
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
if(!is.numeric(featlen))
stop("featlen must be numeric!")
if(length(featlen) > 1)
stop("featlen must have length 1!")
if(!is.numeric(gaplen))
stop("gaplen must be numeric!")
if(length(gaplen) > 1)
stop("gaplen must have length 1!")
featlen <- as.integer(featlen)
gaplen <- as.integer(gaplen)
## Position: 1-based last-exon position
llen <- featlen-1L
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## featlen gaplen
## | |
## 4321 12345
## feat #### ----- gap
## |
## lend
##
## position=featlen
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Prepare strand
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
if(missing(strand))
stop("strand argument is not optional!")
if(!is.character(strand))
stop("strand must be character!")
if(length(strand) > 1)
stop("strand must have length 1!")
if(!is.element(strand, strandlevels))
stop("strand must be valid strand level ( + ,-,*)!")
cr <- new("cRanges", seqid=x@dt$seqid, start=x@dt$lend - llen,
width=featlen + gaplen, strand=strand,
position=featlen, id=x@dt$id)
cr@dt$nAligns <- x@dt$nAligns
## Remove multiplets which arise from alternative (right) sites
if(unique)
{
## Group table
smin <- summaryBy(id + position~seqid + start + end,
data=cr@dt, FUN=min, keep.names=TRUE)
ssum <- summaryBy(nAligns~seqid + start + end,
data=cr@dt, FUN=sum, keep.names=TRUE)
slen <- summaryBy(id~seqid + start + end,
data=cr@dt, FUN=length, keep.names=TRUE)
## Copy strand (cannot be grouped)
dtu <- cbind(smin[, 1:3],
data.frame(
strand=cr@dt$strand[smin$id],
position=smin$position,
id=smin$id,
nAligns=ssum$nAligns))
cr@dt <- dtu
}
return(cr)
})
setMethod("rJunc", "gapSites",
function(x, featlen, gaplen, unique=FALSE, strand, ...)
{
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Prepare featlen and gaplen
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
if(!is.numeric(featlen))
stop("featlen must be numeric!")
if(length(featlen) > 1)
stop("featlen must have length 1!")
if(!is.numeric(gaplen))
stop("gaplen must be numeric!")
if(length(gaplen) > 1)
stop("gaplen must have length 1!")
featlen <- as.integer(featlen)
gaplen <- as.integer(gaplen)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## gaplen featlen
## | |
## 4321 12345
## gap ---- ##### feat
## |
## rstart
##
##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Prepare strand
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
if(missing(strand))
stop("strand argument is not optional!")
if(!is.character(strand))
stop("strand must be character!")
if(length(strand) > 1)
stop("strand must have length 1!")
if(!is.element(strand, strandlevels))
stop("strand must be valid strand level ( + , -, *)!")
cr <- new("cRanges")
cr@dt <- data.frame(seqid=x@dt$seqid, start=x@dt$rstart - gaplen,
end=x@dt$rstart + featlen - 1L, strand=strand,
position=featlen, id=x@dt$id)
cr@dt$nAligns <- x@dt$nAligns
## Remove multiplets which arise from alternative (right) sites
if(unique)
{
smin <- summaryBy(id + position~seqid + start + end,
data=cr@dt, FUN=min, keep.names=TRUE)
ssum <- summaryBy(nAligns~seqid + start + end,
data=cr@dt, FUN=sum, keep.names=TRUE)
slen <- summaryBy(id~seqid + start + end,
data=cr@dt, FUN=length, keep.names=TRUE)
## Copy strand (cannot be grouped)
dtu <- cbind(smin[, 1:3],
data.frame(strand=cr@dt$strand[smin$id],
position=smin$position, id=smin$id, nAligns=ssum$nAligns))
cr@dt <- dtu
}
return(cr)
})
setMethod("lrJunc", "gapSites", function(x, lfeatlen, rfeatlen, strand, ...)
{
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Prepare featlen and gaplen
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
if(missing(lfeatlen))
stop("lfeatlen is not optional!")
if(!is.numeric(lfeatlen))
stop("lfeatlen must be numeric!")
if(length(lfeatlen) > 1)
stop("lfeatlen must have length 1!")
if(missing(rfeatlen))
stop("rfeatlen is not optional!")
if(!is.numeric(rfeatlen))
stop("rfeatlen must be numeric!")
if(length(rfeatlen) > 1)
stop("rfeatlen must have length 1!")
lfeatlen <- as.integer(lfeatlen)
rfeatlen <- as.integer(rfeatlen)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Prepare strand
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
if(missing(strand))
stop("strand argument is mandatory!")
if(length(strand) > 1)
stop("strand must be one unique value!")
ga <- gapSites(seqid=x@dt$seqid,
lstart=x@dt$lend - lfeatlen + 1L,
lend=x@dt$lend,
rstart=x@dt$rstart,
rend=x@dt$rstart + rfeatlen - 1L,
gaplen=x@dt$gaplen,
strand=strand,
nr_aligns=x@dt$nAligns,
nAligns=x@nAligns,
nAlignGaps=x@nAlignGaps)
ga@dt$lfeatlen <- lfeatlen
ga@dt$rfeatlen <- rfeatlen
return(ga)
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## lJuncStrand + rJuncStrand
## Extracting cRanges objects from gapSites
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("lJuncStrand", "gapSites", function(x, featlen, gaplen, ...)
{
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Difference to lJunc function:
## A) lJunc additionally takes nonoptional strand argument
## -> Strand is not a copy of x@dt$strand
## B) Removes multiplets (which arise from alternative sites)
##
## lJuncStrand does not remove lines or reorder
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Prepare featlen and gaplen
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
if(!is.numeric(featlen))
stop("featlen must be numeric!")
if(length(featlen) > 1)
stop("featlen must have length 1!")
if(!is.numeric(gaplen))
stop("gaplen must be numeric!")
if(length(gaplen) > 1)
stop("gaplen must have length 1!")
featlen <- as.integer(featlen)
gaplen <- as.integer(gaplen)
## Position: 1-based last-exon position
llen <- featlen - 1L
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## featlen gaplen
## | |
## 4321 12345
## feat #### ----- gap
## |
## lend
##
## position=featlen
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
cr <- new("cRanges", seqid=x@dt$seqid, start=x@dt$lend - llen,
width=featlen + gaplen, strand=x@dt$strand,
position=featlen, id=x@dt$id)
cr@dt$nAligns <- x@dt$nAligns
return(cr)
})
setMethod("rJuncStrand", "gapSites", function(x, featlen, gaplen, ...)
{
##------------------------------------------------------------------------##
## Prepare featlen and gaplen
##------------------------------------------------------------------------##
if(!is.numeric(featlen))
stop("featlen must be numeric!")
if(length(featlen) > 1)
stop("featlen must have length 1!")
if(!is.numeric(gaplen))
stop("gaplen must be numeric!")
if(length(gaplen) > 1)
stop("gaplen must have length 1!")
featlen <- as.integer(featlen)
gaplen <- as.integer(gaplen)
##------------------------------------------------------------------------##
## gaplen featlen
## | |
## 4321 12345
## gap ---- ##### feat
## |
## rstart
##
##
##------------------------------------------------------------------------##
cr <- new("cRanges")
cr@dt <- data.frame(seqid=x@dt$seqid, start=x@dt$rstart - gaplen,
end=x@dt$rstart + featlen - 1L, strand=x@dt$strand,
position=featlen, id=x@dt$id)
cr@dt$nAligns <- x@dt$nAligns
return(cr)
})
setMethod("lrJuncStrand", "gapSites", function(x, lfeatlen, rfeatlen, ...)
{
##------------------------------------------------------------------------##
## Prepare featlen and gaplen
##------------------------------------------------------------------------##
if(missing(lfeatlen))
stop("lfeatlen is not optional!")
if(!is.numeric(lfeatlen))
stop("lfeatlen must be numeric!")
if(length(lfeatlen) > 1)
stop("lfeatlen must have length 1!")
if(missing(rfeatlen))
stop("rfeatlen is not optional!")
if(!is.numeric(rfeatlen))
stop("rfeatlen must be numeric!")
if(length(rfeatlen) > 1)
stop("rfeatlen must have length 1!")
lfeatlen <- as.integer(lfeatlen)
rfeatlen <- as.integer(rfeatlen)
ga <- gapSites(seqid=x@dt$seqid,
lstart=x@dt$lend - lfeatlen + 1L,
lend=x@dt$lend,
rstart=x@dt$rstart,
rend=x@dt$rstart + rfeatlen - 1L,
gaplen=x@dt$gaplen,
strand=x@dt$strand,
nr_aligns=x@dt$nAligns,
nAligns=x@nAligns,
nAlignGaps=x@nAlignGaps)
ga@dt$lfeatlen <- lfeatlen
ga@dt$rfeatlen <- rfeatlen
return(ga)
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
##
## lrCodons (for gapSites)
##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("lrCodons", "gapSites", function(x, frame=1L)
{
if(!is.element(frame, 1:3))
stop("frame must be one of 1, 2, 3!")
## Make frame 0-based
nframe <- as.integer(frame) - 1L
if(sum(table(x@dt$strand) > 0) > 1)
stop("Strand must be homogenous. Use 'lrJunc'!")
if(x@dt$strand[1] == ' + ')
{
## + + + + + + + + + + + + + + + + + + + + + + ##
## We're reading from left to right, so
## shift left and truncateSeq right to get full codons in frame
## Shift left side
## + + + + + + + + + + + + + + + + + + + + + + ##
lstart <- x@dt$lstart + nframe
lfeatlen <- x@dt$lfeatlen - nframe
if(any(lstart >= x@dt$lend))
stop("Left feature size <= 0 in frame", frame, "and ( + )-strand!")
# truncateSeq right side
diff_width <- (lfeatlen + x@dt$rfeatlen) %% 3
rend <- x@dt$rend - diff_width
rfeatlen <- x@dt$rfeatlen - diff_width
if(any(x@dt$rstart >= rend))
{
stop("Right feature size <= 0 after truncation to codon in frame",
frame, "and ( + )-strand!")
}
}else{
## + + + + + + + + + + + + + + + + + + + + + + ##
## We're reading from right to left, so
## shift right and truncateSeq left to get full codons in frame
## Shift right
## + + + + + + + + + + + + + + + + + + + + + + ##
## truncateSeq left
rend <- x@dt$rend - nframe
rfeatlen <- x@dt$rfeatlen - nframe
diff_width <- (rfeatlen + x@dt$lfeatlen) %% 3
if(any(rend <= x@dt$rstart))
stop("Right feature size <=0 in frame", frame, "and (-)-strand!")
lstart <- x@dt$lstart + diff_width
lfeatlen <- x@dt$lfeatlen - diff_width
if(any(lstart >= x@dt$lend))
{
stop("Left feature size <=0 after truncation to codon in frame",
frame, "and (-)-strand!")
}
}
dt <- data.frame(
id=1:length(lstart),
seqid=x@dt$seqid,
lstart=lstart, lend=x@dt$lend,
rstart=x@dt$rstart, rend=rend,
gaplen=x@dt$gaplen, strand=x@dt$strand,
nAligns=x@dt$nAligns,
gptm=x@dt$gptm, rpmg=x@dt$rpmg,
nProbes=x@dt$nProbes,
lfeatlen=lfeatlen, rfeatlen=rfeatlen,
frame=frame
)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## copy featlen's from input
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
ga <- new("gapSites")
ga@dt <- dt
ga@nAligns <- x@nAligns
ga@nAlignGaps <- x@nAlignGaps
return(ga)
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
##
## dnaGapSites : Add dna sequence to gapSites object
##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("dnaGapSites",c("gapSites","DNAStringSet"),function(x, dnaset, strand)
{
dnames <- names(dnaset)
if(is.null(dnames))
stop("names(dnaset) must not be NULL!\n")
sqnames <- seqid(x)
n <- length(dnames)
seqs <- DNAStringSet()
lvl <- list()
rvl <- list()
## Create empty template
ans <- x@dt[0]
j <- 1
for(i in 1:n)
{
lg <- match(dnames[i], sqnames)
if(!is.na(lg))
{
dt <- x@dt[x@dt$seqid == dnames[i],]
## Add data for seqid to data.frame
ans <- rbind(ans, dt)
## Add seqname
## #ans_sqnames[j]<-dnames[i]
## Read dna-ranges
lvl[[j]] <- Views(dnaset[[i]], start=dt$lstart, end=dt$lend)
rvl[[j]] <- Views(dnaset[[i]], start=dt$rstart, end=dt$rend)
j <- j + 1
}
}
## chain up dna
lds <- DNAStringSet(unlist(lapply(lvl, as.character)))
rds <- DNAStringSet(unlist(lapply(rvl, as.character)))
dss <- xscat(lds, rds)
## Remove unused factor levels
ans$seqid <- factor(ans$seqid)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## strand
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
nrows <- nrow(ans)
if(missing(strand))
{
mtc <- match(ans$id, x@dt$id)
ans$strand <- x@dt$strand[mtc]
revStrand <- ans$strand == "-"
dss[revStrand] <- reverseComplement(dss[revStrand])
gap_pos <- nchar(lds)
}else{
if( (strand == "-") || (strand == 0) )
{
ans$strand <- factor(rep("-", nrows), levels=strandlevels)
dss <- reverseComplement(dss)
gap_pos <- nchar(lds)
}else{
ans$strand <- factor(rep("+", nrows), levels=strandlevels)
gap_pos <- nchar(rds)
}
}
## Assembly of result
o <- order(ans$seqid, ans$lend, ans$rstart)
ga <- new("dnaGapSites")
ga@dt <- ans[o, ]
ga@nAligns <- x@nAligns
ga@nAlignGaps <- x@nAlignGaps
## Position = 1-based postion of last exon nucleotide
ga@dt$position <- gap_pos
ga@seq <- dss[o]
return(ga)
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## ##
## Proteomic functions: translate and trypsinCleave ##
## ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("translate", "dnaGapSites", function(x)
{
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Exclude data with N's in Sequence
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
af <- alphabetFrequency(x@seq)
withN <- af[, "N"]>0
noN <- !withN
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create new object
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
aa <- new("aaGapSites")
aa@nAligns <- x@nAligns
aa@nAlignGaps <- x@nAlignGaps
if(sum(withN)>0)
{
message("[translate.dnaGapSites] Excluding ", sum(withN), "/",
nrow(x@dt), " rows because of N's!")
aa@dt <- x@dt[noN, ]
aa@seq <- translate(x@seq[noN])
}else{
aa@dt <- x@dt
aa@seq <- translate(x@seq)
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Range: 1 2 3 | 4 5 6 | 7 8 9
## Position: 4
## Corrected position: 2 (4L %/% 3L = 1L)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
aa@dt$position <- (aa@dt$position %/% 3L) + 1L
return(aa)
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## seqlogo
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("seqlogo", "dnaGapSites", function(x, strand="+", useStrand=TRUE, ...)
{
if(length(x@seq) == 0)
stop("[seqlogo.cdRanges] length(DNAStringSet)=0!")
## Generates seqLogo for whole DNAStringSet
if(useStrand)
{
strandseqs <- x@dt$strand == strand
if(sum(strandseqs) == 0)
stop("No range found for strand '", strand, "'!")
cm <- consensusMatrix(x@seq[strandseqs])
}else{
cm <- consensusMatrix(x@seq)
}
## Extract first four lines (ACGT - lines)
cm <- cm[1:4, ]
## Make columns to prob's
sm <- apply(cm, 2, sum)
pwm <- makePWM(t((t(cm)/sm)))
seqLogo(pwm, ic.scale=FALSE)
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## write.files
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("write.files", "aaGapSites", function(x, path, filename, ...)
{
tablefile <- file.path(path, paste(filename, ".csv", sep=""))
fastafile <- file.path(path, paste(filename, ".fa", sep=""))
df <- x@dt
fasta_id <- 1:nrow(df)
df$fasta_id <- fasta_id
df$seq <- as.character(x@seq)
seq <- x@seq
names(seq) <- paste(fasta_id, filename, sep="|")
write.table(df, tablefile, sep=";", dec=", ", row.names=F, ...)
writeXStringSet(seq, fastafile)
return(invisible())
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## ##
## Read gapAlings and merged gapSites from BAM via rbamtools ##
## ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Not exported
do_group_align_data <- function(dt, nAligns, nAlignGaps, startid=1)
{
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Expects a data.frame with columns:
## seqid, lstart, lend, rstart, lend, nAligns
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
# Do grouping by lend, rstart
dtlen <- summaryBy(position~seqid + lend + rstart,
dt, FUN=length, keep.names=TRUE)
dtmin <- summaryBy(lstart~seqid + lend + rstart,
dt, FUN=min, keep.names=TRUE)
dtmax <- summaryBy(rend~seqid + lend + rstart,
dt, FUN=max, keep.names=TRUE)
n <- nrow(dtlen)
## RESET id + strand:
sites <- data.frame(id=1:n,
seqid=dtlen$seqid,
lstart=dtmin$lstart,
lend=dtlen$lend,
rstart=dtlen$rstart,
rend=dtmax$rend,
gaplen=dtlen$rstart-dtlen$lend-1L,
nAligns=dtlen$position,
strand=factor(rep("*", n), levels=strandlevels))
## Calculate new columns for full gapSites object:
rpmg_fac <- 1e6/nAlignGaps
gptm_fac <- 1e7/nAligns
sites$gaplen <- sites$rstart - sites$lend - 1L
sites$gptm <- round(sites$nAligns*gptm_fac, 3)
sites$rpmg <- round(sites$nAligns*rpmg_fac, 3)
return(invisible(sites))
}
getGapSites <- function(reader, seqid, startid=1)
{
if(missing(reader))
stop("reader argument is not optional!")
if(!is(reader, "bamReader"))
stop("reader must be bamReader!")
if(!indexInitialized(reader))
stop("reader must have initialized Index!")
if(!is.numeric(seqid))
stop("seqid must be numeric!")
if(length(seqid) != 1)
stop("seqid must have length 1")
if(seqid <= 0)
stop("seqid must be positive")
ref <- getRefData(reader)
n <- nrow(ref)
if(seqid>n)
stop("seqid '", seqid, "' must be <=", n, "!")
## Read gaps from BAM
coords <- c(ref$ID[seqid], 0, ref$LN[seqid])
gl <- gapList(reader, coords)
## BAM-file may contain no reads for given ref
if(size(gl) == 0)
return(invisible(new("gapSites")))
## Calculate needed columns
dt <- data.frame(as.data.frame.gapList(gl))
names(dt)[c(1, 5, 7)] <- c("seqid", "lend", "rstart")
dt$lstart <- dt$lend-dt$left_cigar_len + 1
dt$rend <- dt$rstart + dt$right_cigar_len-1
sites <- do_group_align_data(dt, nAligns(gl),
nAlignGaps(gl), startid=startid)
nRows <- nrow(sites)
sites$seqid <- factor(rep(ref$SN[seqid], nRows))
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## create gapSites Object
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
ga <- new("gapSites")
ga@dt <- sites
ga@dt$nProbes <- 1
ga@nAligns <- nAligns(gl)
ga@nAlignGaps <- nAlignGaps(gl)
return(invisible(ga))
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## alignGapProbes: Reads align gaps from entire bam file.
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
alignGapProbes <- function(reader, startid=1)
{
if(missing(reader))
stop("reader argument is not optional!")
if(!is(reader, "bamReader"))
stop("reader must be bamReader!")
if(!rbamtools::isOpen(reader))
stop("reader must be open!")
if(!indexInitialized(reader))
stop("reader must have initialized index!")
l <- list()
ref <- getRefData(reader)
n <- nrow(ref)
bm <- Sys.localeconv()[7]
nAligns <- 0
k <- 1
for(i in 1:n)
{
gal <- getGapSites(reader=reader, seqid=i, startid=startid)
if(nrow(gal@dt)>0)
{
l[[k]] <- gal
nAligns <- nAligns + nAligns(l[[k]])
startid=max(l[[k]]@dt$id) + 1
k <- k + 1
message("[alignGapProbes] seqid: ",
format(i, width=2),
"\tnAligns:",
format(nAligns, big.mark=bm, width=10))
}else{
message("[alignGapProbes] seqid: ",
format(i, width=2),
"\tnAligns:",
format(0, big.mark=bm, width=10))
}
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create returned object
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
ga <- new("gapSites")
ga@dt <- do.call(rbind, lapply(l, getDataFrame))
ga@dt$nProbes <- 1
## Push nProbes behind nAligns
ga@dt <- ga@dt[, c(1:9, 12, 10, 11)]
ga@nAligns <- sum(unlist(lapply(l, nAligns)))
ga@nAlignGaps <- sum(unlist(lapply(l, nAlignGaps)))
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## rpmg and gptm
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
rpmg_fac <- 1e6/ga@nAlignGaps
gptm_fac <- 1e7/ga@nAligns
ga@dt$gptm <- round(ga@dt$nAligns*gptm_fac, 3)
ga@dt$rpmg <- round(ga@dt$nAligns*rpmg_fac, 3)
return(invisible(ga))
}
alignGapList <- function(reader, digits=3)
{
if(missing(reader))
stop("reader argument is not optional!")
if(!is(reader, "bamReader"))
stop("reader must be bamReader!")
if(!rbamtools::isOpen(reader))
stop("reader must be open!")
if(!indexInitialized(reader))
stop("reader must have initialized index!")
bm <- Sys.localeconv()[7]
bgl <- bamGapList(reader)
dtb <- as.data.frame(bgl)
n <- nrow(dtb)
dtb$strand <- factor(rep("*", n), levels=strandlevels)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## rpmg and gptm
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
n_aligns <- nAligns(bgl)
n_gap_aligns <- nAlignGaps(bgl)
rpmg_fac <- 1e6/n_gap_aligns
gptm_fac <- 1e7/n_aligns
dtb$gptm <- round(dtb$nAligns*gptm_fac, digits=digits)
dtb$rpmg <- round(dtb$nAligns*rpmg_fac, digits=digits)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create gapSites object
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
ga <- new("gapSites")
ga@nAligns <- n_aligns
ga@nAlignGaps <- n_gap_aligns
ga@dt <- dtb[order(dtb$seqid, dtb$lend, dtb$rstart), ]
return(ga)
}
## + + + + + + + + + + + + + + + + + + + + + + + + ##
readMergedBamGaps <- function(infiles,
idxInfiles=paste(infiles, ".bai", sep=""), digits=3)
{
bm <- Sys.localeconv()[7]
n <- length(infiles)
profile <- data.frame(infile=infiles)
profile$nAligns <- 0 ## nAligns for each BAM-file
profile$nAlignGaps <- 0 ## nAlignGaps for each BAM-file
profile$nSites <- 0 ## number of sites for each BAM-file
profile$cSites <- 0 ## number of collected sites (after merging)
for(i in 1:n)
{
bam <- infiles[i]
reader <- bamReader(bam)
if(!file.exists(idxInfiles[i]))
{
message("[readMergedBamGaps] Creating BAM-index.", appendLF=FALSE)
createIndex(reader, idxInfiles[i])
message("Finished.")
}
loadIndex(reader, idxInfiles[i])
message("[readMergedBamGaps] i:(",
format(i, width=2), "/", n, ")", appendLF=FALSE)
if(i == 1)
{
ga <- bamGapList(reader)
profile$nAligns[1] <- nAligns(ga)
profile$nAlignGaps[1] <- nAlignGaps(ga)
profile$nSites[1] <- size(ga)
profile$cSites[1] <- size(ga)
}else{
ga1 <- bamGapList(reader)
ga <- merge.bamGapList(ga, ga1)
profile$nAligns[i] <- nAligns(ga1)
profile$nAlignGaps[i] <- nAlignGaps(ga1)
profile$nSites[i] <- size(ga1)
profile$cSites[i] <- size(ga)
}
message("\tnr sites: ", format(size(ga), big.mark=bm, width=9))
}
dtb <- as.data.frame(ga)
n <- nrow(dtb)
dtb$strand <- factor(rep("*", n), levels=strandlevels)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## rpmg and gptm
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
n_aligns <- nAligns(ga)
n_gap_aligns <- nAlignGaps(ga)
rpmg_fac <- 1e6/n_gap_aligns
gptm_fac <- 1e7/n_aligns
dtb$gptm <- round(dtb$nAligns*gptm_fac, digits=digits)
dtb$rpmg <- round(dtb$nAligns*rpmg_fac, digits=digits)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create returned object
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
gap <- new("gapSites")
## push gqs, nlstart, qmm, nMcs to end of table
gap@dt <- dtb[order(dtb$seqid, dtb$lend, dtb$rstart),
c(1:7, 14, 8, 9, 15, 16, 10:13)]
gap@nAligns <- n_aligns
gap@nAlignGaps <- n_gap_aligns
gap@profile <- profile
return(gap)
}
readTabledBamGaps <- function(infiles,
idxInfiles=paste(infiles, ".bai", sep=""),
prof, rpmg=TRUE)
{
## Check types
if(!is.character(infiles))
stop("infiles must be character!")
if(!is.character(idxInfiles))
stop("idxInfiles must be character!")
if(!is.data.frame(prof))
stop("prof must be data.frame!")
if(!is.logical(rpmg))
stop("rpmg must be logical!")
## Check sizes
n <- length(infiles)
if(nrow(prof) != n)
stop("Length of infiles must be equal to row number in prof!")
if(length(idxInfiles) != n)
stop("Length of infiles must be equal length of idxInfiles!")
if(!all(lapply(prof, class) == "factor"))
stop("All columns of 'prof' (profile) must be factor!")
## Remove unused factor levels
for(i in 1:ncol(prof))
prof[, i] <- factor(prof[, i])
## Create profile table
profile <- prof
profile$nAligns <- 0 ## nAligns for each BAM-file
profile$nAlignGaps <- 0 ## nAlignGaps for each BAM-file
profile$nSites <- 0 ## number of sites for each BAM-file
profile$cSites <- 0 ## number of collected sites (after merging)
profile$infile <- infiles
bm <- Sys.localeconv()[7]
for(i in 1:n)
{
bam <- infiles[i]
reader <- bamReader(bam)
if(!file.exists(idxInfiles[i]))
{
message("[readTabledBamGaps] Creating BAM-index.", appendLF=FALSE)
createIndex(reader, idxInfiles[i])
message("Finished.")
}
loadIndex(reader, idxInfiles[i])
message("[readTabledBamGaps] i:(",
format(i, width=2), "/", n, ")",
appendLF=FALSE)
if(i == 1)
{
ga <- bamGapList(reader)
dt <- as.data.frame(ga)
kpc <- new("keyProfiler",
keyTable=dt[, c("seqid", "lend", "rstart")],
prof=prof, index=1, unique=TRUE)
kpa <- new("keyProfiler",
keyTable=dt[, c("seqid", "lend", "rstart")],
prof=prof, index=1, values=dt$nAligns, unique=TRUE)
profile$nAligns[1] <- nAligns(ga)
profile$nAlignGaps[1] <- nAlignGaps(ga)
profile$nSites[1] <- size(ga)
profile$cSites[1] <- size(ga)
}else{
ga1 <- bamGapList(reader)
dt <- as.data.frame(ga1)
addKeyTable(kpc, keyTable=dt[, c("seqid", "lend", "rstart")],
index=i)
addKeyTable(kpa, keyTable=dt[, c("seqid", "lend", "rstart")],
index=i, values=dt$nAligns)
profile$nAligns[i] <- nAligns(ga1)
profile$nAlignGaps[i] <- nAlignGaps(ga1)
profile$nSites[i] <- size(ga1)
## Merge
ga <- merge.bamGapList(ga, ga1)
profile$cSites[i] <- size(ga)
}
message("\tnr sites: ", format(size(ga), big.mark=bm, width=9))
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create data.frame from bamGapList
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
dtb <- as.data.frame(ga)
n <- nrow(dtb)
dtb$strand <- factor(rep("*", n), levels=strandlevels)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## rpmg and gptm
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
n_aligns <- nAligns(ga)
n_gap_aligns <- nAlignGaps(ga)
rpmg_fac <- 1e6/n_gap_aligns
gptm_fac <- 1e7/n_aligns
dtb$gptm <- round(dtb$nAligns*gptm_fac, 3)
dtb$rpmg <- round(dtb$nAligns*rpmg_fac, 3)
#setkeyv(dtb, c("seqid", "lend", "rstart"))
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Append KeyTable values
res <- appendKeyTable(kpc, dtb, prefix="c.")
res <- appendKeyTable(kpa, res, prefix="aln.")
if(rpmg)
{
res <- appendKeyTable(kpa, res, prefix="rpmg.",
rateFactor=1e6, digits=3)
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create gapSites object
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
gap <- new("gapSites")
gap@nAligns <- n_aligns
gap@nAlignGaps <- n_gap_aligns
gap@profile <- profile
## Reorder first 16 columns
nCol <- ncol(res)
gap@dt <- res[order(res$seqid, res$lend, res$rstart),
c(4, 1, 5, 2, 3, 6, 7, 14, 8:13, 15, 16, 17:nCol)]
return(gap)
}
rangeByGeneName <- function(reader, genome, gene, complex=TRUE)
{
if(!is(reader, "bamReader"))
stop("'reader' must be of class 'bamReader'!")
if(!is(genome, "refGenome"))
stop("'genome' must be of class 'refGenome'!")
if(!rbamtools::isOpen(reader))
stop("'reader' must be opened!")
if(!indexInitialized(reader))
stop("'reader' must have initialized index!")
if(!is.character(gene))
stop("'gene' must be character!")
if(length(gene) > 1)
stop("'gene' must have length 1!")
if(!is.logical(complex))
stop("'complex' must be logical!")
if(length(complex) > 1)
stop("'complex' must have length 1!")
exons <- extractByGeneName(genome, gene)
if(is.null(exons))
{
message("[rangeByGeneName] Gene '", gene,
"' does not match any annotation. Returning NULL.")
return(invisible(NULL))
}
pos <- getGenePositions(exons)
seq <- as.character(pos$seqid)
start <- as.numeric(pos$start)
end <- as.numeric(pos$end)
ref <- getRefData(reader)
mtc <- match(seq, ref$SN)
if(is.na(mtc))
stop("Missing seqid for reference sequence '", seq, "'!")
seqid <- ref$ID[mtc]
return(bamRange(reader, coords=c(seqid, start, end), complex=complex))
}
countByGeneName <- function(object,
infiles,
idxInfiles=paste(infiles, ".bai", sep=""),
gene, tag="N")
{
if(!is(object, "refGenome"))
stop("'object' must be of class 'refGenome'!")
if(missing(gene))
stop("'gene' argument is not optional!")
if(length(gene) > 1)
stop("Only single genes!")
if(length(tag) > 1)
stop("Only single tags!")
exons <- extractByGeneName(object, gene)
## Gene does not match?
if(is.null(exons))
{
message("[countByGeneName] Gene '", gene,
"' does not match any annotation. Returning NULL.\n")
return(invisible(NULL))
}
## Calls 'exon'-specific version of getGenePositions
## Different for ensemblGenome and ucscGenome (see: 'by' argument)
pos <- getGenePositions(exons)
## Try to deal with multiple results
seq <- as.character(unique(pos$seqid))
if(length(seq) > 1)
stop("Found gene on multiple seqid's!")
start <- min(as.numeric(pos$start))
end <- max(as.numeric(pos$end))
n <- length(infiles)
res <- numeric(n)
tag_match <- 0
for(i in 1:n)
{
message("[countByGeneName] i:(", format(i, width=2), "/", n, ")")
bam <- infiles[i]
reader <- bamReader(bam)
ref <- getRefData(reader)
mtc <- match(seq, ref$SN)
if(is.na(mtc))
{
stop("Annotation seqid '", seq,
"' not found in BAM-file (", i, ")!")
}
if(!file.exists(idxInfiles[i]))
{
message("[countByGeneName] Creating BAM-index.", appendLF=FALSE)
createIndex(reader, idxInfiles[i])
message("Finished.\n")
}
loadIndex(reader, idxInfiles[i])
count <- bamCount(reader, coords=c(ref$ID[mtc], start, end))
if(tag_match == 0)
{
tag_match <- match(tag, names(count))
if(is.na(tag_match))
stop("[countByGeneName] Tag '", tag, "' not found in count values!")
}
res[i] <- count[tag_match]
}
message("[countByGeneName] Finished.")
return(res)
}
##===========================================================================##
## Beginn annotation routines ##
##===========================================================================##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Overlap and merge gapSites data with annotation (refGenome)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
overlap_genome <- function(qry, ref, verbose=FALSE)
{
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## expects qry and ref as data.frames with columns
## id, start, end, seqid
## qryRefs<-table(qry$seqid)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
qRefNames <- levels(qry$seqid)
nQryRefs <- length(qRefNames)
l <- list()
k <- 1
bm <- Sys.localeconv()[7]
for(i in 1:nQryRefs)
{
qrs <- qry[qry$seqid == qRefNames[i], ]
qrs <- qrs[order(qrs$start), ]
rfs <- ref[ref$seqid == qRefNames[i], ]
rfs <- rfs[order(rfs$start), ]
nq <- nrow(qrs)
nr <- nrow(rfs)
if(verbose)
{
message("[overlap_genome] i: ", format(i, width=2), "\tseqid: ",
format(qRefNames[i], width=4), "\tquery set: ",
format(nq, width=7, big.mark=bm),
"\tref set:", format(nr, width=6, big.mark=bm))
if((nq == 0) | (nr == 0)) {
message("\t\tskipped!")
}else{
message("")
l[[k]] <- overlap(qry=qrs, ref=rfs)
k <- k + 1
}
}else{ # verbose=FALSE
if(nq>0&nr>0)
{
l[[k]] <- overlap(qry=qrs, ref=rfs)
k <- k + 1
}
}
}
return(do.call(rbind, l))
}
setMethod("annotate", c("gapSites", "refJunctions"), function(object, junc)
{
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# The annotates via the 'refGenome' overlapJuncs method
# which finds the best hit of alignment gap positions
# in genomic position pairs of splice - junctions.
#
# Both sides of the junction are annotated at once.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
res <- overlapJuncs(
object@dt[,c("id", "seqid", "lstart", "lend", "rstart", "rend")],
junc)
# Rename first column from 'qid' to 'id'
# (as preparation for later table merging)
names(res)[1] <- "id"
class(res) <- c("data.frame", "annotation")
message("[annotate.gapSites] Finished.")
return(invisible(res))
})
setMethod("annotate", c("ExpressionSet", "refJunctions"),
function(object, junc)
{
fdo <- featureData(object)
fd <- fdo@data
fd$id <- 1:nrow(fd)
ov <- overlapJuncs(
fd[,c("id","seqid","lstart","lend","rstart","rend")],
junc)
names(ov)[1] <- "id"
res <- merge(fd,
ov[, c("id", "gene_id", "gene_name")],
by="id",
all.x=TRUE)
meta <- data.frame(labelDescription=
c("Identifier", "Sequence identifier",
"Left end", "Right start",
"Left start", "Right end", "Gene id",
"Gene name"
),
row.names=colnames(res)
)
adf <- new("AnnotatedDataFrame", data=res, varMetadata=meta)
message("[annotate] Finished.")
return(adf)
})
setReplaceMethod("annotation", "gapSites", function(object, value)
{
# - - - - - - - - - - - - - - - - - - - - - #
# The function inserts annotation tables
# obtained from either 'annotate' or
# 'annotateJuncs' into the appropriate
# slots of gapSites object.
# - - - - - - - - - - - - - - - - - - - - - #
if(is(value, "annotation"))
{
object@annotation <- value
}else{
stop("Value must be of type 'annotation'. Use 'annotate'!")
}
return(object)
})
setMethod("annotation", "gapSites", function(object)
{
# - - - - - - - - - - - - - - - - - - - - - #
# The function merges annotation table(s)
# with the main table (dt, contains
# gap - site coordinates)
# and returns the result
# - - - - - - - - - - - - - - - - - - - - - #
if(!is.null(object@annotation))
{
# Annotation from 'annotateJuncs' function given
dt <- merge(
object@dt,
object@annotation,
by="id",
all.x=TRUE
)
return(invisible(
new("annGapSites",
dt=dt,
nAligns=object@nAligns,
nAlignGaps=object@nAlignGaps
)
)
)
}else
stop("Annotation table is NULL! Use 'annotation<-' and 'annotate'!")
})
setMethod("getAnnStrand", "gapSites", function(object)
{
# - - - - - - - - - - - - - - - - - - - - - #
# Returns strand information from
# contained annotation table as
# data.frame
# (for usage as input for 'strand<-')
# - - - - - - - - - - - - - - - - - - - - - #
if(is.null(object@annotation))
{
stop("annotation table must not be NULL! ",
"Use 'annotate' and 'annotation<-'!")
}
annStrand <- object@annotation[, c("id","strand")]
class(annStrand) <- c("data.frame", "AnnStrandInfo")
return(invisible(annStrand))
})
setReplaceMethod("strand", "gapSites", function(x, value)
{
# - - - - - - - - - - - - - - - - - - - - - #
# Replaces strand column in contained
# align - gap position table
# (with table obtained from 'getAnnStrand'
# function)
# - - - - - - - - - - - - - - - - - - - - - #
if(!is(value, "AnnStrandInfo"))
stop("value must be of class 'AnnStrandInfo'. Use 'getAnnStrand'!")
# Overwrite existing strand column
n <- nrow(x@dt)
x@dt$strand <- factor(rep("*", n), levels=strandlevels)
mtc <- match(x@dt$id, value$id)
nmtc <- !is.na(mtc)
x@dt$strand[nmtc] <- value$strand[mtc[nmtc]]
return(x)
})
setMethod("addGeneAligns", "gapSites", function(x)
{
# - - - - - - - - - - - - - - - - - - - - - #
# This function adds the number of
# read alignments per gene as new column
# to the table of alignment gap positions
# - - - - - - - - - - - - - - - - - - - - - #
if(!is.null(x@annotation)){
# Annotation table 'annJunc'
atb <- annotation(x)@dt
# Sum nAligns over gene_id's
sal <- tapply(atb$nAligns, atb$gene_id, sum)
tal <- data.frame(
gene_aligns=sal,
gene_id=names(sal)
)
# Create copy of incoming object
ga <- x
# Add gene_aligns
mtc <- match(atb$gene_id, tal$gene_id)
atb$gene_aligns <- tal$gene_aligns[mtc]
mtc <- match(ga@dt$id, atb$id)
ga@dt$gene_aligns <- atb$gene_aligns[mtc]
return(ga)
}
stop("Annotation missing. Use 'annotation(x)<-annotate(x, ...)'!")
})
## Plots tabled distance between inner gap-site and annotation borders
setMethod("plot_diff", "annGapSites", function(x, n=20)
{
tbl1l <- table(abs(x@dt$ldiff))
tbl1r <- table(abs(x@dt$rdiff))
op <- par(mfrow=c(1, 2))
barplot(tbl1l[1:n], main="Annotation left-gap_site distance")
barplot(tbl1r[1:n], main="Annotation right-gap site distance")
par(op)
return(invisible())
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## End annotation routines
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## ##
## Beginn alternative sites routines ##
## ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Basic alt_left_ranks and alt_right_ranks functions
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## + + + + + + + + + + + + + + + + + + + + + + + ##
## Model: ##
## lstart lend rstart rend ##
## ############### ############### ##
## | left exon | intron | right exon | ##
## ##
## alt_left: alt group ##
## ##
## + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("alt_left_ranks", "gapSites", function(x)
{
## Function core is generic for left and right:
## alt_left: grouping by rstart and alt(ernatives) = lend
## Extraction of alt-table
gt <- as.data.frame(x@dt)[, c(1, 2, 4, 5, 7)]
gt <- data.frame(lapply(gt, as.integer))
gt <- gt[order(gt$seqid, gt$rstart, gt$gaplen), ]
return(invisible(.Call("alt_group", gt$id, gt$seqid, gt$rstart, gt$gaplen)))
})
## + + + + + + + + + + + + + + + + + + + + + + + ##
## Model: ##
## lstart lend rstart rend ##
## 5' -> ############### ############### -> 3' ##
## | left exon | intron | right exon | ##
## ##
## alt_right: group alt ##
## ##
## + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("alt_right_ranks", "gapSites", function(x)
{
## Function core is generic for left and right:
## alt_left: grouping by rstart and alt(ernatives) = lend
## Extraction of alt-table
gt <- as.data.frame(x@dt)[, c(1, 2, 4, 5, 7)]
gt <- data.frame(lapply(gt, as.integer))
gt <- gt[order(gt$seqid, gt$lend, gt$gaplen), ]
return(invisible(.Call("alt_group", gt$id, gt$seqid, gt$lend, gt$gaplen)))
})
setMethod("alt_ranks", "gapSites", function(object)
{
alt_left <- alt_left_ranks(object)
alt_right <- alt_right_ranks(object)
res <- merge(alt_left, alt_right, by="id", suffixes=c("_left", "_right"))
res$seqid <- object@dt$seqid
res$lend <- object@dt$lend
res$rstart <- object@dt$rstart
return(res)
})
setMethod("plot_diff_ranks", "gapSites", function(x){
alt_left <- alt_left_ranks(x)
alt_right <- alt_right_ranks(x)
tbll <- table(alt_left$gap_diff)
tblr <- table(alt_right$gap_diff)
op <- par(mfrow=c(1, 2))
barplot(tbll[2:50], main="table left_gap_diff")
barplot(tblr[2:50], main="table right_gap_diff")
par(op)
return(invisible())
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## End alternative sites routines
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Coerce
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
as.data.frame.cRanges <- function(x, row.names=NULL, optional=FALSE, ...)
{return(x@dt)}
as.data.frame.gapSites <- function(x, row.names=NULL, optional=FALSE, ...)
{
if(!is.null(x@annotation))
{
# Annotation from 'annotation' function given
dt <- merge(
x@dt,
x@annotation[,c(
"id",
"sod",
"gene_id",
"transcript_id",
"gene_name")
],
by="id",
all.x=TRUE
)
return(dt)
}
return(x@dt)
}
# Not exported:
getDataFrame <- function(x) {return(x@dt)}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
##============================================================================##
## Export functions: write csv files
##============================================================================##
setMethod("write.annDNA.tables",
c("gapSites", "DNAStringSet", "character") ,
function(object, dnaset, filename, featlen=3, gaplen=8,
sep=";", dec=",", row.names=FALSE)
{
# if(!is(dnaset, "DNAStringSet"))
# stop("dnaset must be 'DNAStringSet'!")
if(is.null(object@annotation))
stop("No annotation table available! Use 'annotation<- '!")
# if(!is.character(filename))
# stop("filename must be 'character'!")
dt <- annotation(object)@dt
dt <- dt[order(dt$id), ]
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Reverses Strand for "-" entries
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## lJuncStrand does not remove lines or reorder
## dnaRanges (removeUnknownStrand=FALSE) does not remove lines
ljseq <- dnaRanges(lJuncStrand(object, featlen=featlen, gaplen=gaplen),
dnaset, useStrand=TRUE, removeUnknownStrand=FALSE, verbose=FALSE)
lsq <- ljseq@dt
lsq$leftseq <- as.character(ljseq@seq)
names(lsq)[names(lsq) == "position"] <- "left_position"
lsq$seqid <- NULL
lsq$start <- NULL
lsq$end <- NULL
lsq$strand <- NULL
lsq$nAligns <- NULL
## merge by standard sorts by "id" here
dt <- merge(dt, lsq, by="id", all.x=TRUE)
## Reverses Strand for "-" entries
rjseq <- dnaRanges(rJuncStrand(object, featlen=featlen, gaplen=gaplen),
dnaset, useStrand=TRUE, removeUnknownStrand=FALSE, verbose=FALSE)
rsq <- rjseq@dt
rsq$rightseq <- as.character(rjseq@seq)
names(lsq)[names(lsq) == "position"] <- "right_position"
rsq$seqid <- NULL
rsq$start <- NULL
rsq$end <- NULL
rsq$strand <- NULL
rsq$nAligns <- NULL
## merge by standard sorts by "id" here
dt <- merge(dt, rsq, by="id", all.x=TRUE)
## Scientific notation penalty:
## scipen=999 prevents all scientific notation
old_sci <- options()$scipen
options(scipen=999)
write.table(dt, file=filename, sep=sep, dec=dec, row.names=row.names, na="")
options(scipen=old_sci)
message("[write.AnnotatedDNA.tables.gapSites] Finished.")
return(invisible())
})
##===========================================================================##
## keyProfiler: ##
## ##
## counts occurrence of profile (prof) items defined by factor levels in ##
## profile table in successively added tables by keys defined in ##
## object@ev$dt ##
## ##
## unique: allows adding keys only once for each profile line ##
## values: when given, conditions are not simply counted but the given ##
## valuesare summed up for each condition. ##
## values must either be absent or present for all accumulated data ##
##===========================================================================##
setMethod("initialize","keyProfiler",
function(.Object,
keyTable,
prof,
values,
unique=TRUE,
index=1L)
{
.Object@ev=new.env()
if(!is(prof, "data.frame"))
stop("prof must be data.frame!")
n <- nrow(prof)
if(n < 2)
{
stop("prof must at least have two rows (=categories), ",
"otherwise there's nothing to count!")
}
if(!all(unlist(lapply(prof, "class")) == "factor"))
stop("All columns in prof must be of class 'factor'!")
if(!is.logical(unique))
stop("unique must be 'logical'!")
if(length(unique) > 1)
stop("unique must have length 1!")
if(!is.numeric(index))
stop("index must be numeric!")
if(!is.integer(index))
index <- as.integer(index)
if(length(index) > 1)
stop("index must have length 1!")
if(index<0 || index>n)
stop("index<0 or index>", n, " (points to row in prof)!")
useValues <- !missing(values)
if(useValues)
{
if(!is.numeric(values))
stop("Given values must be numeric (they are summed up)!")
if(length(values) != nrow(keyTable))
stop("length(values) unequal row-number in keyTable!")
}
## Reshape profile prof-table for counting
.Object@ev$prof <- prof
nProfCols <- ncol(.Object@ev$prof)
profLevels <- lapply(.Object@ev$prof, levels)
for(i in 1:nProfCols)
{
tag <- names(profLevels)[i]
colNames <- paste(tag, profLevels[[i]], sep=".")
nCol <- length(colNames)
for(j in 1:nCol)
{
lgl <- ifelse(.Object@ev$prof[, tag] == profLevels[[i]][j],
1, 0)
.Object@ev$prof[, colNames[j]] <- lgl
}
.Object@ev$prof[, tag] <- NULL
}
## Add profiling columns to keytable
nProfCols <- ncol(.Object@ev$prof)
colNames <- names(.Object@ev$prof)
kt <- data.frame(keyTable)
if(useValues)
{
# Sum up values
for(i in 1:nProfCols)
{
if(as.integer(.Object@ev$prof[index, i]) == 1)
kt[, colNames[i]] <- values
else
kt[, colNames[i]] <- 0
}
}else{
## Do simple counting
for(i in 1:nProfCols)
kt[, colNames[i]] <- as.integer(.Object@ev$prof[index, i])
}
.Object@ev$dtb <- kt
.Object@ev$keyTableNames <- names(keyTable)
.Object@unique <- unique
.Object@useValues <- useValues
.Object@ev$groupExpr <- parse(text=
paste("dtb[,lapply(.SD,sum),by=list(", paste(names(keyTable),
collapse=","), ")]", sep=""))
if(unique)
{
.Object@counted <- rep(FALSE,nrow(prof))
.Object@counted[index] <- TRUE
}
return(.Object)
})
setMethod("addKeyTable", "keyProfiler", function(object, keyTable, index, values)
{
if(object@unique)
{
if(object@counted[index]){
stop("For each index, only one table can be added ",
"(or set unique=FALSE)!")
}
object@counted[index] <- TRUE
}
## Add profiling columns to keytable
nProfCols <- ncol(object@ev$prof)
colNames <- names(object@ev$prof)
if(object@useValues)
{
## Sum up values
for(i in 1:nProfCols)
{
if(as.integer(object@ev$prof[index, i]) == 1)
{
keyTable[, colNames[i]] <- values
}else{
keyTable[, colNames[i]] <- 0
}
}
}else{
## Do simple counting
for(i in 1:nProfCols)
keyTable[, colNames[i]] <- as.integer(object@ev$prof[index, i])
}
dtb <- data.frame(rbind(object@ev$dtb, keyTable))
object@ev$dtb <- summaryBy(.~ seqid + lend + rstart,
dtb, FUN=sum, keep.names=TRUE)
return(invisible())
})
setMethod("getKeyTable", "keyProfiler", function(object)
{return(object@ev$dtb)})
setMethod("appendKeyTable", "keyProfiler",
function(object, keytable, prefix, valFactor, rateFactor, digits)
{
if(!missing(valFactor))
{
if(!is.numeric(valFactor))
stop("valFactor must be numeric!")
if(length(valFactor) > 1)
stop("valFactor must have length 1!")
if(!missing(rateFactor))
message("rateFactor omitted because valFactor is present!")
## When a value Factor is given, each profile column
## will be multiplied with the given value
keylen <- length(object@ev$keyTableNames)
tbl_width <- ncol(object@ev$dtb)
if(missing(digits))
{
## Walk through profile columns
## ToDo: Check for valid table size? (keylen + 1)<tbl_width ?
for(i in (keylen + 1):tbl_width)
object@ev$dtb[, i] <- object@ev$dtb[, i]*valFactor
}else{
if(!is.numeric(digits))
stop("digits must be numeric!")
if(length(digits) > 1)
stop("digits must have length 1!")
## Walk through profile columns
for(i in (keylen + 1):tbl_width)
{
object@ev$dtb[, i] <- round(object@ev$dtb[, i]*valFactor,
digits=as.integer(digits))
}
}
}
else if(!missing(rateFactor))
{
if(!is.numeric(rateFactor))
stop("rateFactor must be numeric!")
if(length(rateFactor) > 1)
stop("rateFactor must have length 1!")
## + + + + + + + + + + + + + + + + + + + + + + ##
## When a rate Factor is given,
## each profile column will be converted into a rate
## i.e. each profile value will be divided by its column-sum.
##
## In order to get a sensible rate, the profile columns
## are then multiplied with the rateFactor.
## + + + + + + + + + + + + + + + + + + + + + + ##
keylen <- length(object@ev$keyTableNames)
tbl_width <- ncol(object@ev$dtb)
if(missing(digits))
{
# Walk through profile columns
for(i in (keylen + 1):tbl_width)
{
col_sum <- sum(object@ev$dtb[, i])
if(col_sum > 0)
{
col_factor <- rateFactor/col_sum
object@ev$dtb[, i] <- object@ev$dtb[, i] * col_factor
}
}
}else{
if(!is.numeric(digits))
stop("digits must be numeric!")
if(length(digits) > 1)
stop("digits must have length 1!")
## Walk through profile columns
for(i in (keylen + 1):tbl_width)
{
col_sum <- sum(object@ev$dtb[, i])
if(col_sum > 0)
{
col_factor <- rateFactor/col_sum
object@ev$dtb[, i] <- round(object@ev$dtb[, i] * col_factor,
digits=as.integer(digits))
}
}
}
}
ans <- merge(keytable, object@ev$dtb, by=object@ev$keyTableNames)
if(!missing(prefix))
{
colNames <- names(object@ev$prof)
mtc <- match(colNames, names(ans))
names(ans)[mtc] <- paste(prefix, colNames, sep="")
}
return(ans)
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## End keyProfiler
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Read Gene FPKM values from cufflinks files into an ExpressionSet
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
readCuffGeneFpkm <- function(cuff, phenoData, summ="max")
{
if(!is.character(cuff))
stop("'cuff' must be character!")
if(!all(file.exists(cuff)))
stop("Non existing files in cuff vector!")
if(!is(phenoData, "AnnotatedDataFrame"))
stop("phenoData must be AnnotatedDataFrame!")
if(nrow(pData(phenoData)) != length(cuff))
stop("Number of rows in pData(phenoData) must equal length(cuff)!")
if(summ == "max")
{
sm <- 1
}else if(summ == "sum"){
sm <- 2
}else{
stop("summ must be 'max' or 'sum'!")
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Read count data from bam-files
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
n <- length(cuff)
probes <- rownames(pData(phenoData))
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## First cuff-file
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
message("[", format(1, width=2), "/", n, "]\tprobe: ", probes[1])
tbl <- read.table(cuff[1], sep="\t", header=TRUE)
##colnames<-c("tracking_id", "gene_short_name", "locus", "FPKM")
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## A handful of tracking id's occur up to four times
## (because of different transcripts)
## The one with the maximal FPKM is chosen
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
track <- unique(tbl$tracking_id)
mtc <- match(track, tbl$tracking_id)
gene <- tbl$gene_short_name[mtc]
locus <- tbl$locus[mtc]
res <- data.frame(tracking_id=track, gene=gene, locus=locus)
if(sm == 1)
{
fpkm <- tapply(tbl$FPKM, tbl$tracking_id, max)
}else if(sm == 2)
{
fpkm <- tapply(tbl$FPKM, tbl$tracking_id, sum)
}
mtc <- match(res$tracking_id, names(fpkm))
res[, probes[1]] <- fpkm[mtc]
for(i in 2:n)
{
## + + + + + + + + + + + + + + + + + + + + + + ##
## Subsequent cuff-file
## + + + + + + + + + + + + + + + + + + + + + + ##
message("[", format(i, width=2), "/", n, "]\tprobe: ", probes[i])
tbl <- read.table(cuff[i], sep="\t", header=TRUE)
if(sm == 1)
{
fpkm <- tapply(tbl$FPKM, tbl$tracking_id, max)
}else if(sm == 2){
fpkm <- tapply(tbl$FPKM, tbl$tracking_id, sum)
}
mtc <- match(names(fpkm), res$tracking_id)
res[, probes[i]] <- 0L
res[, probes[i]][mtc] <- fpkm
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create ExpressionSet from data
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
row.names(res) <- res$tracking_id
exprm <- as.matrix(res[, -(1:3)])
row.names(exprm) <- res$tracking_id
meta <- data.frame(labelDescription=c("Ensembl gene id",
"Gene name", "Gene locus"),
row.names=c("tracking_id", "gene", "locus"))
fd <- new("AnnotatedDataFrame", data=res[, 1:3], varMetadata=meta)
exSet <- ExpressionSet(assayData=exprm, phenoData=phenoData,
featureData=fd)
return(invisible(exSet))
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## SpliceCountSet ##
## ##
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("initialize", "SpliceCountSet", function(.Object, assayData,
phenoData, featureData, exprs, ...)
{
.Object <- callNextMethod(.Object, phenoData=phenoData,
featureData=featureData, exprs=exprs, ...)
})
##===========================================================================##
## maxEnt ##
## Calculates splice site scores ##
##===========================================================================##
load.maxEnt <- function(file)
{
if(missing(file))
file <- system.file("extdata", "maxent.RData", package="spliceSites")
if(dirname(file) == ".")
mxe <- new("maxEnt", ev=new.env(), basedir=getwd())
else
mxe <- new("maxEnt", ev=new.env(), basedir=dirname(file))
load(file, envir=mxe@ev)
return(invisible(mxe))
}
read.maxEnt <- function(basedir=getwd())
{
me <- new("maxEnt", ev=new.env(), basedir=basedir)
me@ev$me2x5 <- scan(file.path(basedir, "me2x5"))
# scan max-ent files
maxent_files=file.path(basedir, "splicemodels",
c('me2x3acc1', 'me2x3acc2', 'me2x3acc3', 'me2x3acc4', 'me2x3acc5',
'me2x3acc6', 'me2x3acc7', 'me2x3acc8', 'me2x3acc9'))
wmm_files=file.path(basedir, "splicemodels",
c('me1s0acc1', 'me1s0acc2', 'me1s0acc3', 'me1s0acc4', 'me1s0acc5',
'me1s0acc6', 'me1s0acc7', 'me1s0acc8', 'me1s0acc9'))
emm_files=file.path(basedir,"splicemodels",
c('me2s0acc1', 'me2s0acc2', 'me2s0acc3', 'me2s0acc4', 'me2s0acc5',
'me2s0acc6', 'me2s0acc7', 'me2s0acc8', 'me2s0acc9'))
for(i in 1:length(maxent_files))
me@ev$maxent[[i]] <- scan(maxent_files[i])
for(i in 1:length(wmm_files))
me@ev$wmm[[i]] <- scan(wmm_files[i])
for(i in 1:length(emm_files))
me@ev$emm[[i]] <- scan(emm_files[i])
return(me)
}
setMethod("saveMaxEnt", "maxEnt",
function(object, filename, useBasedir=FALSE, ...)
{
if(!is.character(filename))
stop("'filename' must be character!")
if(useBasedir && (length(object@basedir) > 0) )
{
save(file=file.path(object@basedir, filename),
list=ls(envir=object@ev), envir=object@ev, ...)
}else{
save(file=filename, list=ls(envir=object@ev), envir=object@ev, ...)
}
return(invisible())
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## basedir
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Generics from refGenome
setMethod("basedir","maxEnt",function(object) {return(object@basedir)})
setReplaceMethod("basedir","maxEnt",function(object,value)
{
if(!file.exists(value))
{
warning("[basedir.maxEnt] Directory '",
value, "' does not exist!\n", sep="")
}
object@basedir <- value
return(object)
})
setMethod("score5", "maxEnt", function(x, seq, pos, ...)
{
if(!is.numeric(pos))
stop("pos must be numeric!")
if(is(seq,"DNAStringSet"))
seq <- as.character(seq)
if(!is(seq,"character"))
stop("seq must be character (or DNAStringSet)!")
pos <- as.integer(pos)
if(length(pos) == 1)
pos <- rep(pos,length(seq))
if(length(seq) != length(pos))
stop("seq and pos must have same length!")
if(any(pos < 3))
stop("pos must be >= 3! At least 3 exon nucs needed!\n")
if(any(nchar(seq) < pos + 6))
{
stop("Sequence length must be >= pos + 6",
pos + 6, ". At least 6 intron nucs needed.\n")
}
return(.Call("maxent_score5", seq, pos, x@ev$me2x5, PACKAGE="spliceSites"))
})
setMethod("scoreSeq5", "maxEnt", function(x, seq, frame){
if(!is.character(seq))
stop("seq must be character!")
nc <- nchar(seq)
if(nc<9) # =l5seq
stop("nchar(seq) must be >= 9!")
if(missing(frame))
frame <- c(3, nc - 6)
if(!is.numeric(frame))
stop("frame must be numeric!")
if(length(frame) != 2)
stop("frame must have length 2!")
## pos=1-based last exon nuc = 3, l5seq=9
## |
## ATG | GTC | ATCGAA
## 123 456 789012
## | |
## | end=6
## start=3
return(.Call("maxent_seq_score5", seq, as.integer(frame),
x@ev$me2x5, PACKAGE="spliceSites"))
})
setMethod("scoreSeq3", "maxEnt", function(x, seq, frame, which="ent", ...)
{
if(!is.character(seq))
stop("seq must be character!")
nc <- nchar(seq)
if(nc<23) # =l3seq
stop("nchar(seq) must be >= 23!")
if(missing(frame))
frame <- c(20, nc - 3)
if(!is.numeric(frame))
stop("frame must be numeric!")
if(length(frame) != 2)
stop("frame must have length 2!")
if(which == "ent")
{
return(.Call("maxent_seq_score3", seq, as.integer(frame),
x@ev$maxent, PACKAGE="spliceSites"))
}
if(which == "wmm")
{
return(.Call("maxent_seq_score3", seq, as.integer(frame),
x@ev$wmm, PACKAGE="spliceSites"))
}
if(which == "emm")
{
return(.Call("maxent_seq_score3", seq, as.integer(frame),
x@ev$emm, PACKAGE="spliceSites"))
}
})
setMethod("score3", "maxEnt", function(x, seq, pos, which="ent", ...)
{
if(!is.numeric(pos))
stop("pos must be numeric!")
if(is(seq, "DNAStringSet"))
seq <- as.character(seq)
if(!is(seq, "character"))
stop("seq must be character (or DNAStringSet)!")
pos <- as.integer(pos)
if(length(pos) == 1)
pos <- rep(pos, length(seq))
if(length(pos) != length(seq))
stop("[score3.maxEnt] seq and pos must have same length!")
if(any(pos < 20))
stop("pos must be >= 20: At least 20 intron nucs needed!\n")
if(any(nchar(seq) < pos + 3))
{
stop("Sequence length must be >= pos + 3", pos + 3,
". At least 3 exon nucs needed!\n")
}
if(!is.character(which))
stop("Which argument must be character!")
if(which == "ent")
{
return(.Call("maxent_score3", seq, pos,
x@ev$maxent, PACKAGE="spliceSites"))
}
if(which == "wmm")
{
return(.Call("maxent_score3", seq, pos,
x@ev$wmm, PACKAGE="spliceSites"))
}
if(which == "emm")
{
return(.Call("maxent_score3", seq, pos,
x@ev$emm, PACKAGE="spliceSites"))
}
})
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Calculate maxEnts for gapSites objects
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("addMaxEnt", "gapSites", function(x, dna, maxent, digits=1)
{
if(!is(dna, "DNAStringSet"))
stop("dna must be DNAStringSet!")
if(!is(maxent, "maxEnt"))
stop("maxEnt must be of class 'maxEnt'")
if(!is.numeric(digits))
stop("digits must be numeric!")
if(length(digits) > 1)
stop("digits mus have length 1!")
featlen <- 3 # nr of exonic nucleotides
gaplen <- 20 # nr of intronic nucleotides
res <- new("gapSites")
res@dt <- x@dt
res@nAligns <- x@nAligns
res@nAlignGaps <- x@nAlignGaps
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Calculate scores for "+"-strand
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
lj <- lJunc(x, featlen, gaplen, strand="+")
dlj <- dnaRanges(lj, dna, useStrand=TRUE, verbose=FALSE)
mtc <- match(res@dt$id, dlj@dt$id)
if(any(is.na(mtc)))
{
bm <- Sys.localeconv()[7]
message("[addMaxEnt.gapSites] Removing ",
format(sum(is.na(mtc)), big.mark=bm),
" records due to missing match in dna (DNAStringSet)." )
res@dt <- x@dt[!is.na(mtc), ]
res@dt$seqid <- factor(res@dt$seqid)
mtc <- match(res@dt$id, dlj@dt$id)
}
res@dt$mxe_ps5 <- round(score5(maxent, dlj@seq[mtc], featlen), digits)
rj <- rJunc(x, featlen, gaplen, strand="+", unique=FALSE)
drj <- dnaRanges(rj, dna, useStrand=TRUE, verbose=FALSE)
mtc <- match(res@dt$id, drj@dt$id)
res@dt$mxe_ps3 <- round(score3(maxent, drj@seq[mtc], gaplen), digits)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Calculate scores for "-"-strand
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
rj <- rJunc(x, featlen, gaplen, strand="-", unique=FALSE)
drj <- dnaRanges(rj, dna, useStrand=TRUE, verbose=FALSE)
mtc <- match(res@dt$id, drj@dt$id)
res@dt$mxe_ms5 <- round(score5(maxent, drj@seq[mtc], featlen), digits)
lj <- lJunc(x, featlen, gaplen, strand="-", unique=FALSE)
dlj <- dnaRanges(lj, dna, useStrand=TRUE, verbose=FALSE)
mtc <- match(res@dt$id, dlj@dt$id)
res@dt$mxe_ms3 <- round(score3(maxent, dlj@seq[mtc], gaplen), digits)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Combine score information to strand information
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
res@dt$s5strand <- .Call("maxent_score2strand", res@dt$mxe_ps5,
res@dt$mxe_ms5, PACKAGE="spliceSites")
res@dt$s3strand <- .Call("maxent_score2strand", res@dt$mxe_ps3,
res@dt$mxe_ms3, PACKAGE="spliceSites")
res@dt$meStrand <- .Call("maxent_combine_strand", res@dt$s5strand,
res@dt$s3strand, PACKAGE="spliceSites")
return(res)
})
setMethod("setMeStrand", "gapSites", function(x)
{
mtc <- match("meStrand", names(x@dt))
if(is.na(mtc))
stop("meStrand column not found. Use 'addMaxEnt!")
res <- new("gapSites")
res@dt <- x@dt
res@dt$strand <- res@dt$meStrand
return(res)
})
setMethod("getMeStrand", "gapSites", function(x)
{
mtc <- match("meStrand", names(x@dt))
if(is.na(mtc))
stop("meStrand column not found. Use 'addMaxEnt!")
return(x@dt$meStrand)
})
##===========================================================================##
## hbond ##
## Calculates splice site scores ##
##===========================================================================##
load.hbond <- function(file)
{
if(missing(file))
file <- system.file("extdata", "hbs.RData", package="spliceSites")
if(dirname(file) == ".")
hb <- new("hbond", ev=new.env(), basedir=getwd())
else
hb <- new("hbond", ev=new.env(), basedir=dirname(file))
load(file, envir=hb@ev)
return(invisible(hb))
}
# Generics from refGenome
setMethod("basedir", "hbond", function(object) {return(object@basedir)})
setReplaceMethod("basedir", "hbond", function(object, value)
{
if(!file.exists(value))
{
warning("[basedir.hbond] Directory '",
value, "' does not exist!\n", sep="")
}
object@basedir <- value
return(object)
})
setMethod("hbond", "hbond", function(x, seq, pos, ...)
{
if(!is.numeric(pos))
stop("pos must be numeric!")
if(is(seq, "DNAStringSet"))
seq <- as.character(seq)
if(!is(seq, "character"))
stop("seq must be character (or DNAStringSet)!")
pos <- as.integer(pos)
if(length(pos) == 1)
pos <- rep(pos, length(seq))
if(length(pos) != length(seq))
stop("seq and pos must have same length!")
if(any(pos < 3))
stop("pos must be >= 3: 3 exon nucs required!\n")
if(any(nchar(seq) < pos + 8))
{
stop("Sequence length must be >= pos + 8. ",
"At least 8 intron nucs required!\n")
}
return(.Call("hbond_score", substr(seq, pos - 2, pos + 8),
x@ev$hbs, PACKAGE="spliceSites"))
})
setMethod("addHbond", "cdRanges", function(x)
{
hb <- load.hbond()
x@dt$hbond <- hbond(hb, x@seq, x@dt$pos)
return(x)
})
setMethod("addHbond", "gapSites", function(x, dna)
{
if(!is(dna, "DNAStringSet"))
stop("[addMaxEnd.gapSites] dna must be DNAStringSet!")
featlen <- 3 ## nr of exonic nucleotides
gaplen <- 8 ## nr of intronic nucleotides
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create output object
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
res <- new("gapSites")
res@dt <- x@dt
res@nAligns <- x@nAligns
res@nAlignGaps <- x@nAlignGaps
## Loading takes 8 ms so it's ok not to reuse object
hb <- load.hbond()
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Calculate scores separated for left side ("+"-strand)
## and right side ("-"-strand)
## (lJunc and dnaRanges do not remove lines)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
lj <- lJunc(x, featlen, gaplen, unique=FALSE, strand="+")
dlj <- dnaRanges(lj, dna, useStrand=TRUE, verbose=FALSE)
mtc <- match(res@dt$id, dlj@dt$id)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## There are NA's present due to seqid entries in lj
## which are not present in dna (e.g. M=Mitochondrial)
## Their hbond is returned as NA
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
res@dt$lhbond <- .Call("hbond_score", as.character(dlj@seq)[mtc],
hb@ev$hbs, PACKAGE="spliceSites")
rj <- rJunc(x, featlen, gaplen, unique=FALSE, strand="-")
drj <- dnaRanges(rj, dna, useStrand=TRUE, verbose=FALSE)
mtc <- match(res@dt$id, drj@dt$id)
res@dt$rhbond <- .Call("hbond_score", as.character(drj@seq)[mtc],
hb@ev$hbs, PACKAGE="spliceSites")
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Terminate routine
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
return(res)
})
##============================================================================##
## ExpressionSet related functions:
## readExpSet
##============================================================================##
padd <- function(text, len=5L, char="#", left=TRUE)
{
if(!is.character(text))
stop("'text' must be character!")
if(!is.integer(len))
stop("'len' must be integer!")
if(length(len) > 1)
stop("'len' must have length 1!")
if(!is.character(char))
stop("'char' must be character")
if(!is.logical(left))
stop("'left' must be logical")
slen <- nchar(text)
if(any(slen)>len)
stop("'len' must be >= nchar(text)!")
padstring <- paste(rep(char, len), collapse="")
padlen <- len-slen # >= 0
if(left)
return(sprintf("%s%s", substring(padstring, 1, padlen), text))
else
return(sprintf("%s%s)", text, substring(padstring, 1, padlen)))
}
## ExpressionSet: featureData
readExpSet <- function(bam, idx, val="nAligns", phenoData, expData)
{
## bam: vector of bam-file names
## idx: optional vector of index-file names
## val: gptm or rpmg
## probes: vector containing probe identifier
vals <- c("gptm", "rpmg", "nAligns")
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Check incoming args
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
if(!is.character(bam))
stop("'bam' argument must be character!")
if(!missing(idx))
{
if(length(idx) != length(bam))
stop("idx must have same length as bam!")
}
if(!is(phenoData, "AnnotatedDataFrame"))
stop("phenoData must be of class 'AnnotatedDataFrame")
if(nrow(pData(phenoData)) != length(bam))
stop("Number of rows in pData(phenoData) must equal length(bam)!")
if(!is.character(val))
stop("'val' argument must be character!")
if(length(val) > 1)
stop("'val' argument must have length 1!")
if(!is.element(val, vals))
stop("'val' argument must be 'gptm', 'rpmg' or 'nAligns'!")
if(!missing(expData))
{
if(!is(expData, "MIAME"))
stop("expData must be MIAME")
}
bm <- Sys.localeconv()[7]
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Read count data from bam-files
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
n <- length(bam)
probes <- rownames(pData(phenoData))
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## First bam-file
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
message("[", format(1, width=2), "/", n, "] ", appendLF=FALSE)
if(missing(idx))
reader <- bamReader(bam[1], idx=TRUE)
else
reader <- bamReader(bam[1], indexname=idx[1])
bsl <- bamGapList(reader)
dfr <- as.data.frame(bsl)
bamClose(reader)
message("size: ", format(size(bsl), big.mark=bm))
if(val == "gptm")
{
n_aligns <- nAligns(bsl)
gptm_fac <- 1e7/n_aligns
dfr$gptm <- round(dfr$nAligns*gptm_fac, 5)
}else if(val == "rpmg")
{
n_gap_aligns <- nAlignGaps(bsl)
rpmg_fac <- 1e6/n_gap_aligns
dfr$rpmg <- round(dfr$nAligns*rpmg_fac, 5)
}
res <- dfr[, c("seqid", "lstart", "lend", "rstart", "rend", val)]
## Name new column with probe-id
names(res)[ncol(res)] <- probes[1]
for(i in 2:n)
{
## + + + + + + + + + + + + + + + + + + + + + + ##
## Subsequent bam-files
## + + + + + + + + + + + + + + + + + + + + + + ##
message("[", format(i, width=2), "/", n, "] ", appendLF=FALSE)
if(missing(idx))
reader <- bamReader(bam[i], idx=TRUE)
else
reader <- bamReader(bam[i], indexname=idx[i])
bsl <- bamGapList(reader)
dfr <- as.data.frame(bsl)
bamClose(reader)
message("size: ", format(size(bsl), big.mark=bm))
if(val == "gptm")
{
n_aligns <- nAligns(bsl)
gptm_fac <- 1e7/n_aligns
dfr$gptm <- round(dfr$nAligns*gptm_fac, 5)
}else if(val == "rpmg")
{
n_gap_aligns <- nAlignGaps(bsl)
rpmg_fac <- 1e6/n_gap_aligns
dfr$rpmg <- round(dfr$nAligns*rpmg_fac, 5)
}else{
dfr$count <- dfr$nAligns
}
dfr <- dfr[, c("seqid", "lstart", "lend", "rstart", "rend", val)]
## Name new column with probe-id
names(dfr)[ncol(dfr)] <- probes[i]
## Merge
res <- merge(res, dfr, by=c("seqid", "lend", "rstart"),
all=T, suffixes=c("", "2"))
res$lstart <- pmin(res$lstart, res$lstart2, na.rm=T)
res$lstart2 <- NULL
res$rend <- pmax(res$rend, res$rend2, na.rm=T)
res$rend2 <- NULL
## + + + + + + + + + + + + + + + + + + + + + + ##
}
res[is.na(res)] <- 0
res <- res[order(res$seqid, res$lend, res$rstart), ]
## Remove possibly prefixing 'chr' (because uninformative)
seqid <- padd(sub("^chr", "", res$seqid), 2L, char="0")
## Unique alphanumerical index
index <- alphabetIndex(nrow(res), index_alphabet)
## Index as suffix: fast search (and sort) from left.
rownames(res) <- paste(index, seqid, sep="")
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create ExpressionSet from data
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## assay data
message("[readExpSet] Create ExpressionSet.")
ncols <- ncol(res)
exprm <- as.matrix(res[, 6:ncols])
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## featureData
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
meta <- data.frame(labelDescription=c("sequence identifier", "left start",
"left end", "right start", "right end" ),
row.names=c("seqid", "lstart", "lend", "rstart", "rend"))
featureData <- new("AnnotatedDataFrame", data=res[, 1:5], varMetadata=meta)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## ExpressionSet
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
if(missing(expData))
{
exSet <- ExpressionSet(assayData=exprm, phenoData=phenoData,
featureData=featureData)
}else{
exSet <- ExpressionSet(assayData=exprm, phenoData=phenoData,
experimentData=expData, featureData=featureData)
}
message("[readExpSet] Finished.")
return(exSet)
} ## End of function
setMethod("uniqueJuncAnn", c("ExpressionSet", "refJunctions"),
function(object, junc, ann=TRUE, ...)
{
if(!is.logical(ann))
stop("ann must be logical!")
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Extract sub-tables from ExpressionSet
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
message("[uniqueJuncAnn.ExpressionSet] Unifying juncs.")
uj <- unifyJuncs(junc)
ex <- exprs(object)
fd <- featureData(object)@data
fd$fid <- row.names(fd)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Merge featureData with annotation data
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
message("[uniqueJuncAnn.ExpressionSet] Merge with annotation.")
mrg <- merge(fd, uj@ev$gtf[, c("seqid", "lend", "rstart", "id", "gene_id",
"strand", "fexid")],
by=c("seqid", "lend", "rstart"), all.x=TRUE)
mrg <- mrg[order(mrg$fid), ]
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Remove unannotated junction sites
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
idna <- is.na(mrg$id)
bm <- Sys.localeconv()[7]
if(ann)
{
message("[uniqueJuncAnn.ExpressionSet] Remove ",
format(sum(idna), big.mark=bm), " unannotated sites.")
nna <- !idna
rfd <- mrg[nna,]
row.names(rfd) <- rfd$fid
rfd$fid <- NULL
rex <- ex[nna, ]
}else{
message("[uniqueJuncAnn.ExpressionSet] Remove ",
format(sum(!idna), big.mark=bm), " annotated sites.")
rfd <- mrg[idna, ]
row.names(rfd) <- rfd$fid
rfd$fid <- NULL
rex <- ex[idna, ]
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Assemble result
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
message("[uniqueJuncAnn.ExpressionSet] Assemble result.")
meta <- data.frame(labelDescription=c("sequence identifier",
"left start", "left end", "right start", "right end",
"id (junction id)", "gene id", "strand", "first exon id" ),
row.names=c("seqid", "lstart", "lend", "rstart", "rend","id",
"gene_id", "strand", "fexid"))
fdn <- new("AnnotatedDataFrame", data=rfd, varMetadata=meta)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Terminate routine
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
message("[uniqueJuncAnn.ExpressionSet] Finished.")
return(ExpressionSet(assayData=rex, phenoData=phenoData(object),
featureData=fdn))
})
setMethod("addGenomeData", c("ExpressionSet", "DNAStringSet", "refJunctions"),
function(object, dna, junc)
{
## ToDo:
## Check for seqnames and positions on DNAStringSet
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Add annotation to featureData
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
fd <- annotate(object, junc)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Extract featureData
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
vl <- varLabels(fd)
vmd <- varMetadata(fd)
fdt <- fd@data
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create gapSites object
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
fdg <- gapSites(seqid=fdt$seqid,
lstart=fdt$lstart, lend=fdt$lend,
rstart=fdt$rstart, rend=fdt$rend)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Add Maxent scores
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
message("[addGenomeData] Add MaxEnt scores.")
mes <- load.maxEnt()
fdmx <- addMaxEnt(fdg, dna, mes)
fdmxt <- fdmx@dt
fdt$mxe_ps5 <- fdmxt$mxe_ps5
fdt$mxe_ps3 <- fdmxt$mxe_ps3
fdt$mxe_ms5 <- fdmxt$mxe_ms5
fdt$mxe_ms3 <- fdmxt$mxe_ms3
fdt$meStrand <- fdmxt$meStrand
message("[addGenomeData] add left sequence data")
fdlj <- lJunc(fdg, featlen=3, gaplen=8, strand='+')
fdljd <- dnaRanges(fdlj, dna)
fdt$ljseq <- as.character(fdljd@seq)
fdt$ldin <- substr(fdt$ljseq, 4, 5)
message("[addGenomeData] add right sequence data")
fdrj <- rJunc(fdg, featlen=3, gaplen=8, strand="-")
fdrjd <- dnaRanges(fdrj, dna)
fdt$rjseq <- as.character(fdrjd@seq)
fdt$rdin <- substr(fdt$rjseq, 4, 5)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Create new featureData object
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
vlg <- c(vl,
"mxe_ps5", "mxe_ps3", "mxe_ms5", "mxe_ms3", "meStrand",
"ljseq", "ldin", "rjseq", "rdin")
new_labs <- c("MaxEnt + strand score 5",
"MaxEnt + strand score 3",
"MaxEnt - strand score 5",
"MaxEnt - strand score 3",
"MaxEnt derived Strand",
"Left junction seq (strand=+, featlen=3, gaplen=8)",
"Left junction intron dinucleotide",
"Right junction seq (strand=-, featlen=3, gaplen=8)",
"Right junction intron dinucleotide")
meta <- data.frame(labelDescription=c(vmd$labelDescription, new_labs),
row.names = vlg)
adf <- new("AnnotatedDataFrame", data=fdt, varMetadata=meta)
featureData(object) <- adf
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Terminate routine
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
message("[addGenomeData] Finished.")
return(object)
})
setMethod("addGenomeData", c("gapSites", "DNAStringSet", "refJunctions"),
function(object, dna, junc)
{
# Big mark:
bm <- Sys.localeconv()[7]
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Annotate object gap-site data
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
message("[addGenomeData] Add annotation.")
ann <- annotate(object, junc)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Add maxent scores
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
message("[addGenomeData] Add MaxEnt score")
mes <- load.maxEnt()
object <- addMaxEnt(object, dna, mes)
object@annotation <- ann
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Add Junction sequence data
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
message("[addGenomeData] add left sequence data")
fdlj <- lJunc(object, featlen=3, gaplen=8, strand='+')
fdljd <- dnaRanges(fdlj, dna)
object@dt$ljseq <- as.character(fdljd@seq)
object@dt$ldin <- substr(object@dt$ljseq, 4, 5)
message("[addGenomeData] add right sequence data")
fdrj <- rJunc(object, featlen=3, gaplen=8, strand="-")
fdrjd <- dnaRanges(fdrj, dna)
# rJunc is not in same order as object@dt
mtc <- match(object@dt$id, fdrjd@dt$id)
object@dt$rjseq <- as.character(fdrjd@seq[mtc])
object@dt$rdin <- substr(object@dt$rjseq, 4, 5)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Add WGIS (= weighted gap information score = gqmx1)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
message("[addGenomeData] Add wgis score.")
## Columns nlstart and qsm may be missing when object is not created
## using readTabledBamGaps
if(is.null(object@dt$nlstart)) {
message("[addGenomeData] 'nlstart' column missing. Adding nlstart=1.")
object@dt$nlstart <- 1
}
if(is.null(object@dt$qsm)) {
message("[addGenomeData] 'qsm' column missing. Adding qsm=1.")
object@dt$qsm <- 1
}
# Remove NA values from maxEnt scores
mx_na_val <- (-50)
mxep5 <- object@dt$mxe_ps5
mxep5[is.na(mxep5)] <- mx_na_val
mxen5 <- object@dt$mxe_ms5
mxen5[is.na(mxen5)] <- mx_na_val
mxep3 <- object@dt$mxe_ps3
mxep3[is.na(mxep3)] <- mx_na_val
mxen3 <- object@dt$mxe_ms3
mxen3[is.na(mxen3)] <- mx_na_val
# Derive strand information from maxent score:
# a) Determines whether mxe_ps5 or mxe_ms5 is used for wgis
# b) Will be used as algebraic sign for score indicating strand
mxstrp <- mxep3 > mxen3
mxe5 <- ifelse(mxep5 > mxen5, mxep5, mxen5)
mxe5 <- pmax(mxe5, 1)
mxe3 <- ifelse(mxep3 > mxen3, mxep3, mxen3)
mxe3 <- pmax(mxe3, 1)
# Weighted version of nlstart values
nlss <- (log2(log2(object@dt$nlstart) + 1) + 1)
# All qsm <=15 are set to 0
qsms <- log2(log2(pmax(object@dt$qsm - 13, 2)))
# WGIS = weighted gap information score (= gqmx1)
object@dt$wgis <- round(nlss * qsms * log2(mxe5) * log2(mxe3), 1)
# strand-factor:
strfac <- (as.numeric(mxstrp) * 2) - 1
object@dt$wgis <- object@dt$wgis * strfac
# GQL
message("[addGenomeData] Add GQL")
object@dt$gql <- 0
object@dt$gql[object@dt$wgis != 0] <- 1
object@dt$gql[abs(object@dt$wgis) > 30] <- 2
object@dt$gql[abs(object@dt$wgis) > 80] <- 3
tbl <- table(mxstrp)
message("[addGenomeData] MaxEnt score3 based strand: \t+: ",
format(tbl["FALSE"], big.mark=bm), "\t-: ",
format(tbl["TRUE"], big.mark=bm))
message("[addGenomeData] Finished.")
return(object)
})
##============================================================================##
## Miscellaneous
## get_dna_nmers
## plotGeneAlignDepth
##============================================================================##
get_dna_nmers <- function(len)
{
if(!is.numeric(len))
stop("[get_dna_nmers] len must be numeric!")
len <- as.integer(len)
if(len <= 0)
stop("len must be >0")
if(len > 32)
stop("len must be <=32")
return(.Call("create_dna_n_mers", len, PACKAGE="spliceSites"))
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Plot align depth for transcripts
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
setMethod("plotGeneAlignDepth", "bamReader",
function(reader, genome, gene=NULL, transcript=NULL,
log="y", cex.main=2,
col="grey50", fill="grey90", grid=TRUE,
box.col="grey20", box.border="grey80")
{
if(!isOpen(reader))
stop("reader must be opened.")
if(!indexInitialized(reader))
stop("reader must have initialized index.")
if(!is(genome, "refGenome"))
stop("genome must be refGenome.")
if(is.null(gene))
stop("gene argument is not optional.")
if(!is.character(gene))
stop("gene must be 'character'.")
if(length(gene)!=1)
stop("gene vector must have length 1.")
if(!is.null(transcript))
{
if(!is.character(transcript))
stop("transcript must be 'character'.")
if(length(transcript) != 1)
stop("transcript must have length 1.")
}
gen <- extractByGeneName(genome, gene)
genpos <- getGenePositions(gen)[1, ]
xlim <- c(genpos$start, genpos$end)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Calculate alignment depth in target region
# - - - - - - - - - - - - - - - - - - - - - - - - - - - #
ref <- getRefData(reader)
refid <- ref$ID[ref$SN==genpos$seqid]
if( length(refid) == 0 )
{
stop("No match for seqid between refGenome and BAM header.\n",
"(Alignment against different genome?)")
}
coords <- c(refid[1], genpos$start, genpos$end)
enr <- bamRange(reader, coords)
ad <- alignDepth(enr)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Use transcript information for drawing
# of exon boxes
# - - - - - - - - - - - - - - - - - - - - - - - - - - - #
features <- gen@ev$gtf
if(!is.null(transcript))
{
if( all(features$transcript_id!=transcript))
{
cat("[plotGeneAlignDepth] No match for for transcript in transcript_id:")
print(table(factor(features$transcript_id)))
exons <- NULL
}else{
features <- features[features$transcript_id==transcript, ]
exons <- features[features$feature=="exon", ]
if(nrow(exons)==0 )
exons <- NULL
}
}
# - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Do plot
# - - - - - - - - - - - - - - - - - - - - - - - - - - - #
plotAlignDepth(ad, lwd = 2, xlim = xlim,
main = paste("Alignment depth for gene", gene),
ylab = "Alignment depth",
start = exons$start,
end = exons$end,
strand = genpos$strand,
transcript = paste("Chromosome ",
genpos$seqid,
"\tGene ", genpos$gene_id,
"\tTranscript", transcript
),
log=log, cex.main=cex.main,
col=col, fill=fill, grid=grid,
box.col=box.col, box.border=box.border
)
return(invisible())
})
################################################################################
## END OF FILE
################################################################################
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.