Nothing
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## ##
## Project : seqTools ##
## Created : 26.August.2013 ##
## Author : W. Kaisers ##
## Content : Doing some diagnostic and interventional tasks on fastq ##
## and fasta ##
## esp. DNA k-mer counts. ##
## Version : 0.99.34 ##
## ##
## Changelog : ##
## 26.08.13 : 0.0.1 Project created ##
## 03.09.13 : 0.0.6 C-Code valgrind tested ##
## 27.09.13 : 0.1.0 Added fastqDnaKmers ##
## 14.10.13 : 0.1.1 Added C-library for parsing gz fasta and fastq files ##
## 17.10.13 : 0.1.3 C-Wrapper for fastq working. ##
## 17.10.13 : 0.1.6 First version of R package ##
## 21.10.13 : 0.3.0 New C library for fastq parsing ##
## 28.10.13 : 0.4.0 Added fastq-loc functions ##
## 29.10.13 : 0.4.4 seqTools C-code valgrind tested. ##
## 01.11.13 : 0.5.0 Distance matrices implemented ##
## 02.11.13 : 0.5.1 First working version with clustering based on ##
## K-mers ##
## 07.11.13 : 0.5.4 countFastaKmers now resizes arrays automatically ##
## 09.11.13 : 0.6.0 count_fastq now resizes arrays automatically ##
## 11.11.13 : 0.6.5 Added fastq simulation functions ##
## 19.11.13 : 0.7.0 Added trimFastq function ##
## 30.11.13 : 0.9.0 Separated R source files ##
## 22.12.13 : 0.99.2 Added '['-operator for Fastqq class ##
## 19.01.14 : 0.99.7 Added zlib version control for correction of ##
## gzBuffer ##
## error ##
## + checks: cran win-builder + valgrind ##
## 21.05.14 : 0.99.34 Corrected error in count_fasta_Kmers ##
## which freezed function ##
## 07.10.15 : 1.2.4 Added kmerSvd function ##
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
.onUnload <- function(libpath) { library.dynam.unload("seqTools", libpath) }
## see: http://bioconductor.org/developers/how-to/coding-style/
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Data collection functions:
## Fastqq, fastqKmerLocs, fastqKmerSubsetLocs
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
fastqq <- function(filenames, k=6, probeLabel)
{
k <- as.integer(k)
tl <- list()
tl$start <- Sys.time()
filenames <- path.expand(filenames)
res <- .Call("count_fastq", filenames, k, PACKAGE="seqTools")
tl$end <- Sys.time()
res@collectTime <- tl
# Correct minSeqLen when empty files are counted
if(any(res@nReads==0))
res@seqLen[1,res@nReads==0] <- 0
if(!missing(probeLabel))
{
if(length(probeLabel) == res@nFiles)
res@probeLabel <- as.character(probeLabel)
else{
warning("[Fastqq] probeLabel and filenames must have equal length.")
res@probeLabel <- as.character(1:res@nFiles)
}
}else{
res@probeLabel <- as.character(1:res@nFiles)
}
return(res)
}
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## fastq K-mer locs
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
fastqKmerLocs <- function(filenames, k=4)
{
if(!is.numeric(k))
stop("'k' must be numeric.")
k <- as.integer(k)
if( (k < 0) || (k > max_k) )
stop("'k' must be in range 0, ... , 16.")
return(.Call("fastq_Kmer_locs", filenames, k, PACKAGE="seqTools"))
}
fastqKmerSubsetLocs <- function(filenames, k=4, kIndex)
{
# Returns list with matrix elements.
if(!is.numeric(k))
stop("'k' must be numeric.")
k <- as.integer(k)
if( (k < 0) || (k > max_k) )
stop("'k' must be in range 0, ... , ", max_k, ".")
if(!is.numeric(kIndex))
stop("'kIndex' must be numeric.")
kIndex <- as.integer(kIndex)
if(any(kIndex < 0))
stop("No negative 'kIndex' values allowed.")
if(any(kIndex > (4^k)) )
stop("'kIndex' out of range (>4^k).")
return(.Call("fastq_KmerSubset_locs",
filenames, k, kIndex, PACKAGE="seqTools"))
}
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## End: Data collection functions.
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Standard slot accessor functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("fileNames", "Fastqq", function(object)
{return(object@filenames)})
setMethod("collectTime", "Fastqq", function(object)
{return(object@collectTime)})
setMethod("collectDur", "Fastqq", function(object) {
return(as.numeric(difftime(object@collectTime$end, object@collectTime$start,
units = "secs")))
})
setMethod("getK", "Fastqq", function(object) {return(object@k)})
setMethod("nFiles", "Fastqq", function(object) {return(object@nFiles)})
setMethod("nNnucs", "Fastqq", function(object) {return(object@nN)})
setMethod("nReads", "Fastqq", function(object) {return(object@nReads)})
setMethod("maxSeqLen", "Fastqq", function(object) {return(object@maxSeqLen)})
setMethod("seqLenCount", "Fastqq", function(object)
{
res<-object@seqLenCount
colnames(res) <- object@probeLabel
return(res)
})
setMethod("nucFreq", "Fastqq", function(object, i)
{
if(missing(i))
stop("Argument 'i' is not optional.")
if(!is.numeric(i))
stop("'i' must be numeric.")
i <- as.integer(i)
if( (i < 1) || (i > object@nFiles) )
stop("'i' must be >0 and < nFiles.")
return(object@nac[[i]])
})
setMethod("gcContent", "Fastqq", function(object, i)
{
if(missing(i))
stop("Argument 'i' is not optional.")
if(!is.numeric(i))
stop("'i' must be numeric.")
i <- as.integer(i)
if( (i < 1) || (i > object@nFiles))
stop("'i' must be >0 and < nFiles.")
return(object@gcContent[, i])
})
setMethod("gcContentMatrix", "Fastqq", function(object)
{
gcc <- object@gcContent
colnames(gcc) <- object@probeLabel
return(gcc)
})
setMethod("seqLen", "Fastqq", function(object)
{
sql <- object@seqLen
colnames(sql) <- object@probeLabel
return(sql)
})
setMethod("kmerCount", "Fastqq", function(object)
{
kmer <- object@kmer
colnames(kmer) <- object@probeLabel
return(kmer)
})
setMethod("probeLabel", "Fastqq", function(object){return(object@probeLabel)})
setReplaceMethod("probeLabel", "Fastqq", function(object, value)
{
if(length(value) != nFiles(object))
stop("'value' must have length ", nFiles(object), ".")
val <- as.character(value)
if(any(table(val)) > 1)
{
warning("[probeLabel <- .Fastqq] probeLabel unique suffix added.")
val <- paste(1:nFiles(object), val, sep="_")
}
object@probeLabel <- val
return(object)
})
setMethod("phred", signature="Fastqq", definition=function(object, i)
{
if(missing(i))
stop("Argument 'i' is not optional.")
if(!is.numeric(i))
stop("'i' must be numeric.")
i <- as.integer(i)
if( (i < 1) || (i > object@nFiles) )
stop("'i' must be >0 and < nFiles.")
return(object@phred[[i]])
})
setMethod("kmerSvd", "Fastqq", function(object)
{
# x = Fastqq
km <- t(log(object@kmer + 1))
# Column wise centerin
kmc <- apply(km, 2, function(x) {return(scale(x, center=TRUE, scale=FALSE))})
ksv <- svd(kmc)
# Extract Left singular vectors and Singular values
return(list(lsv=ksv$u, sv=ksv$d))
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## End: Standard slot accessor functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("show", "Fastqq", function(object)
{
bm <- Sys.localeconv()[7]
w <- 20
r <- "right"
cat("Class : ", format(class(object), w=w, j=r) , "\n", sep="")
cat("nFiles : ", format(format(nFiles(object) , big.m=bm), w=w, j=r) , "\n", sep="")
cat("maxSeqLen : ", format(format(maxSeqLen(object) , big.m=bm), w=w, j=r) , "\n", sep="")
cat("k (Kmer len): ", format(format(getK(object) , big.m=bm), w=w, j=r) , "\n", sep="")
cat("\n")
cat("nReads : ", format(format(sum(as.numeric(nReads(object))) , big.m=bm), w=w, j=r) , "\n", sep="")
cat("nr N nuc : ", format(format(sum(nNnucs(object)) , big.m=bm), w=w, j=r) , "\n", sep="")
cat("Min seq len : ", format(format(min(seqLen(object)[1, ]), big.m=bm), w=w, j=r) , "\n", sep="")
cat("Max seq len : ", format(format(max(seqLen(object)[2, ]), big.m=bm), w=w, j=r) , "\n", sep="")
return(invisible())
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Phred related functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("phredQuantiles", "Fastqq", function(object, quantiles, i, ...)
{
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Checking arguments
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
## Check quantiles argument
if(missing(quantiles))
stop("'quantiles' argument is not optional")
if(!is.numeric(quantiles))
stop("Quantiles must be numeric.")
if(!(all(quantiles >= 0) & all(quantiles <= 1) ))
stop("All quantiles mustbe in [0, 1]")
quantiles <- sort(unique(round(quantiles, 2)))
## Check 'i' argument
if(missing(i))
stop("'i' argument is not optional")
if(!is.numeric(i))
stop("'i' must be numeric.")
if(length(i) > 1)
stop("'i' must have length 1")
i <- as.integer(i)
if( (i < 1) || (i > object@nFiles) )
stop("'i' must be >0 and <nFiles.")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Count qual values for each sequence position
# Convert integer counts into column-wise relative values
# Maximum counted read length
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
vec <- 1:seqLen(object)[2, i]
qrel <- as.data.frame(apply(object@phred[[i]][, vec], 2, rel_int))
names(qrel) <- vec
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Walk through each column and extract row number
# for given quantile values
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
res <- .Call("get_column_quantiles", quantiles, qrel, PACKAGE="seqTools")
return(res)
})
setMethod("plotPhredQuant", "Fastqq", function(object, i, main, ...){
if(!is.numeric(i))
stop("'i' must be numeric.")
i <- as.integer(i)
if( (i < 1) || (i > object@nFiles) )
stop("'i' must be >0 and <nFiles.")
quant <- c(0.1, 0.25, 0.5, 0.75, 0.9)
cols <- c("#1F78B4", "#FF7F00", "#E31A1C", "#FF7F00", "#1F78B4")
qq <- phredQuantiles(object, quant, i)
maxQ = floor(1.2*max(qq))
xv <- 1:ncol(qq)
if(missing(main))
{
main <- paste("Position wise Phred Quantiles (",
probeLabel(object)[i], ")", sep="")
}
plot(xv, xv, ylim=c(0, maxQ), type="n", bty="n", las=1,
ylab = "Phred score", xlab="Sequence position", main=main, ...)
lines(xv, qq[1, ], col=cols[1], lty=2)
lines(xv, qq[2, ], col=cols[2], lty=1)
lines(xv, qq[3, ], col=cols[3], lwd=2)
lines(xv, qq[4, ], col=cols[4], lty=1)
lines(xv, qq[5, ], col=cols[5], lty=2)
legend("top", ncol=6, lty=c(2, 1, 1, 1, 2),
lwd=c(1, 1, 2, 1, 1), col=cols, xjust=0.5,
legend=c("10%", "25%", "50%", "75%", "90%"), bty="n", cex=0.8)
return(invisible())
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Global Phred content functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("phredDist", "Fastqq", function(object, i){
idx <- 1:nFiles(object)
if(missing(i))
i <- idx
else
{
if(!is.numeric(i))
stop("'i' must be numeric.")
if(!all(is.element(i, idx)))
stop("'i' is out of range.")
}
phred <- Reduce("+", object@phred[i])
phred <- matrix(as.numeric(phred), nrow=nrow(phred))
phred_vals <- apply(phred, 1, sum)
phred_dist <- phred_vals/sum(phred_vals)
names(phred_dist) <- rownames(object@phred[[1]])
return(phred_dist)
})
setMethod("plotPhredDist", "Fastqq", function(object, i, maxp=45, col, ...)
{
if(!is.numeric(maxp))
stop("maxp must be numeric")
if(!is.integer(maxp))
maxp<-as.integer(maxp)
if(maxp <= 0)
stop("maxp must be >=0")
if(missing(col))
col <- topo.colors(10)[3]
phred <- phredDist(object, i)
maxy <- ceiling(max(phred) * 5) / 5
x <- 1:maxp
xmax <- 10 * (ceiling(maxp / 10))
plot(x, phred[x], ylim=c(0, maxy), xlim=c(0, xmax), type="l", lwd=2,
col=col, yaxt="n", bty="n", xlab="Phred value",
ylab="Content (%)", ...)
ylb <- 0:(10 * maxy) / 10
axis(side=2, at=ylb, labels=100 * ylb, las=1)
return(invisible())
})
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Not exported:
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
pd_l10 <- function(x){ x <- phredDist(x); return(sum(x[1:10]) / sum(x))}
setMethod("propPhred","Fastqq",function(object, greater=30, less=93)
{
if(!is.numeric(greater))
stop("'greater' must be numeric.")
if(length(greater) != 1)
stop("'greater' must have length 1.")
if(!is.numeric(less))
stop("'less' must be numeric.")
if(length(less) != 1)
stop("'less must have length 1.")
## + + + + + + + + + + + + + + + + + + + + + + ##
## greater and less shall be used as
## array indices: increase greater
## + + + + + + + + + + + + + + + + + + + + + + ##
greater<-as.integer(greater+1)
less<-as.integer(less)
if(greater < 1)
stop("'greater' must be >=0.")
if(less > 93)
stop("'less' must be < 94.")
if(greater >= less)
stop("'greater' must be <= 'less'")
n <- nFiles(object)
res <- numeric(n)
for(i in 1:n)
{
pd <- phredDist(object, i)
res[i] <- sum(pd[greater:less])
}
names(res) <- probeLabel(object)
return(res)
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## End: Global Phred content functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("mergedPhred", "Fastqq", function(object){
sql <- seqLen(object)
maxSeqLen <- max(sql[2, ])
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## as.numeric: Sum on integer is likely to exceed
## maximal 32-bit integer values
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
if(sql[2, 1] < maxSeqLen)
{
mp <- as.numeric(.Call("r_enlarge_int_mat", object@phred[[1]],
c(nrow(object@phred[[1]]), maxSeqLen), PACKAGE="seqTools"))
}else{
mp <- as.numeric(object@phred[[1]])
}
n <- nFiles(object)
if(n > 1)
{
for(i in 2:n)
{
if(sql[2, i] < maxSeqLen)
{
mp <- mp + as.numeric(.Call("r_enlarge_int_mat",
object@phred[[i]],
c(nrow(object@phred[[i]]), maxSeqLen),
PACKAGE="seqTools"))
}else{
mp <- mp + as.numeric(object@phred[[i]])
}
}
}
mp <- matrix(mp, ncol = maxSeqLen)
rownames(mp) <- rownames(object@phred[[1]])
colnames(mp) <- 1:maxSeqLen
return(mp)
})
setMethod("mergedPhredQuantiles", "Fastqq", function(object, quantiles)
{
if(!(all(quantiles >= 0) & all(quantiles <= 1)) )
stop("[mergedPhredQuantiles.Fastqq] all quantiles mustbe in [0, 1]")
quantiles <- sort(unique(round(quantiles, 2)))
sql <- seqLen(object)
maxSeqLen <- max(sql[2, ])
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
## Count qual values for each sequence position
## Convert counts into column-wise relative values
## Maximum counted read length
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
vec <- 1:maxSeqLen
mrg <- mergedPhred(object)
qrel <- as.data.frame(apply(mrg[, vec], 2, rel_real))
names(qrel)
names(qrel) <- vec
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
## Walk through each column and extract row number
## for given quantile values
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
res <- .Call("get_column_quantiles", quantiles, qrel, PACKAGE="seqTools")
return(res)
})
setMethod("plotMergedPhredQuant", "Fastqq", function(object, main, ...)
{
quant <- c(0.1, 0.25, 0.5, 0.75, 0.9)
cols <- c("#1F78B4", "#FF7F00", "#E31A1C", "#FF7F00", "#1F78B4")
qq <- mergedPhredQuantiles(object, quant)
maxQ = floor(1.2*max(qq))
xv <- 1:ncol(qq)
if(missing(main))
main <- paste("Merged position wise Phred Quantiles.", sep = "")
plot(xv, xv, ylim=c(0, maxQ), type="n", bty="n", las=1,
ylab="Phred score", xlab="Sequence position", main=main, ...)
lines(xv, qq[1, ], col=cols[1], lty=2)
lines(xv, qq[2, ], col=cols[2], lty=1)
lines(xv, qq[3, ], col=cols[3], lwd=2)
lines(xv, qq[4, ], col=cols[4], lty=1)
lines(xv, qq[5, ], col=cols[5], lty=2)
legend("top", ncol=6, lty=c(2, 1, 1, 1, 2),
lwd=c(1, 1, 2, 1, 1), col=cols, xjust=0.5,
legend=c("10%", "25%", "50%", "75%", "90%"), bty="n", cex=0.8)
return(invisible())
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## End: Phred related functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Nucleotide frequency related functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("plotNucFreq", "Fastqq", function(object, i, main, maxx, ...)
{
if(!is.numeric(i))
stop("'i' must be numeric.")
i <- as.integer(i)
if( (i < 1) || (i > object@nFiles) )
stop("'i' must be >0 and <nFiles.")
maxSeqlen <- max(seqLen(object)[2, ])
if(missing(maxx))
{
maxx <- maxSeqlen
}
else
{
if(!is.numeric(maxx))
stop("'maxx' must be numeric.")
maxx <- as.integer(maxx)
if(maxx < 1)
stop("'maxx' must be >0.")
if(maxx > maxSeqlen)
maxx <- maxSeqlen
}
# Skip extra line
x <- 1:maxx
nac <- object@nac[[i]][1:4, x]
nacrel <- apply(nac, 2, rel_int)
maxY = round(1.4 * max(nacrel), 1)
# Maximum counted read length
nacrel <- apply(nac, 2, rel_int)
cols <- c("#1F78B4", "#33A02C", "#E31A1C", "#FF7F00")
if(missing(main))
main <- paste("Position wise Nucleotide frequency (",
probeLabel(object)[i], ")", sep="")
plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1,
ylab="Nucleotide fequency", xlab="sequence position",
main=main, ...)
lines(x, nacrel[1, ], col=cols[1], lwd=2)
lines(x, nacrel[2, ], col=cols[2], lwd=2)
lines(x, nacrel[3, ], col=cols[3], lwd=2)
lines(x, nacrel[4, ], col=cols[4], lwd=2)
legend("top", ncol=6,
lwd=c(1, 1, 2, 1, 1), col=cols, xjust=0.5,
legend=c("A", "C", "G", "T"), bty="n", cex=0.8)
return(invisible())
})
setMethod("plotGCcontent", "Fastqq", function(object,main,...)
{
cols <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99",
"#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A")
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
## Normalize matrix to colsum = 1
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
gc <- apply(object@gcContent, 2, rel_int)
maxY = round(1.3*max(gc), 2)
nf <- nFiles(object)
x <- 1:nrow(gc)
if(missing(main))
main<-"GC content"
plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1,
ylab="Proportion of reads (%)", xlab="Relative GC content (%)",
main=main, ...)
for(i in 1:nf)
lines(x, gc[, i], col=cols[i], lwd=2)
legend("right", lwd=2, col=cols, legend=probeLabel(object),
bty="n", cex=0.8)
return(invisible())
})
setMethod("getGCcontent", "Fastqq", function(object){
f <- function(x) { return(sum(as.numeric(x) * 0:100) / sum(as.numeric(x)))}
n <- length(object@filenames)
gc <- apply(object@gcContent, 2, f)
names(gc) <- probeLabel(object)
return(gc)
})
setMethod("plotNucCount", "Fastqq", function(object, nucs=16, maxx, ...)
{
# j = 16: N, j = 2:3: gc
if(!is.numeric(nucs))
stop("'nucs' must be numeric.")
nucs <- as.integer(nucs)
if(any(nucs < 1) || any(nucs > 19))
stop("'nucs' must be >0 and <20.")
maxSeqlen <- max(seqLen(object)[2, ])
if(missing(maxx))
{
maxx <- maxSeqlen
}
else
{
if(!is.numeric(maxx))
stop("'maxx' must be numeric.")
maxx <- as.integer(maxx)
if(maxx<1)
stop("'maxx must be >0.")
if(maxx > maxSeqlen)
maxx <- maxSeqlen
}
n <- nFiles(object)
fvec <- 1:n
i <- 1
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Skip extra line
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
x <- 1:maxx
nac <- object@nac[[i]][, x]
nacrel <- apply(nac, 2, rel_int)
if(length(nucs) == 1){
dfr <- data.frame(a = nacrel[nucs, ])
}else{
dfr <- data.frame(a = apply(nacrel[nucs, ], 2, sum))
}
if(n > 1)
{
for(i in 2:n)
{
nac <- object@nac[[i]][, x]
nacrel <- apply(nac, 2, rel_int)
if(length(nucs) == 1){
dfr[, i] <- nacrel[nucs, ]
}else{
dfr[, i] <- apply(nacrel[nucs, ], 2, sum)
}
}
}
maxY = round(1.4 * max(dfr), 3)
cols <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99",
"#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A")
nv <- paste(iupac_chars[nucs], collapse = "")
plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1,
ylab="Nucleotide fequency", xlab="sequence position",
main=paste("Position wise Nucleotide frequency: '",
nv, "'", sep=""))
for(i in fvec)
lines(x, dfr[, i], col=cols[i %% 10], lwd=2)
legend("top", ncol=6,
lwd=c(1, 1, 2, 1, 1), col=cols[fvec %% 10], xjust=0.5,
legend=probeLabel(object), bty="n", cex=0.8)
return(invisible())
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## End: Nucleotide frequency related functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("plotKmerCount", "Fastqq",
function(object, index, mxey, main="K-mer count", ...)
{
n <- nFiles(object)
if(missing(index))
{
index <- 1:n
}else{
if(!is.numeric(index))
stop("'index' must be numeric.")
index <- sort(unique(as.integer(index)))
if(any(index < 0) || any(index > n))
stop("'index' must be in 1, .., ", n, " ( = nFiles).")
}
if(!missing(mxey))
{
if(!is.numeric(mxey))
stop("'mxey' must be numeric.")
mxey <- as.integer(mxey)
if(mxey<1)
stop("'mxey' must be positive.")
}
cols <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99",
"#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A")
pk <- 6
if(object@k <= pk)
{
pk <- object@k
x <- 1:(4^pk)
if(missing(mxey))
lg2y <- floor(log2(1.2 * (max(object@kmer))))
else
lg2y <- mxey
maxY <- 2^lg2y
plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1,
ylab="K-mer count", xlab="K-mer index", main=main,
axes=FALSE, ...)
for(i in index)
lines(x, object@kmer[, i], col=cols[i], lwd=2)
}
else
{
x <- 1:(4^pk)
melt_factor <- as.integer(4^(object@k - pk))
y_factor <- max(.Call("melt_vector", object@kmer[, 1], melt_factor,
PACKAGE="seqTools")) / max(object@kmer[, 1])
if(missing(mxey))
lg2y <- floor(log2(1.2 * (max(object@kmer)) * y_factor))
else
lg2y <- mxey
maxY <- 2^lg2y
plot(x, x, ylim=c(0, maxY), type="n", bty="n", las=1,
ylab="K-mer count", xlab="K-mer index",
main=main, axes=FALSE, ...)
for(i in index)
{
lines(x, .Call("melt_vector", object@kmer[, i], melt_factor,
PACKAGE="seqTools"), col=cols[i], lwd=2)
}
}
axis(side=1, at=0:4 * 4^(pk - 1), labels=c("A", "C", "G", "T", ""))
axis(side=2, at=c(0, maxY),
labels=c(0, paste("2^", lg2y, sep="")), las=1)
legend("right", lwd=2, col=cols[index],
legend=probeLabel(object)[index], bty="n", cex=0.8)
return(invisible())
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Merging and melting Fastqq objects
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("mergeFastqq", "Fastqq", function(lhs, rhs)
{
res <- new("Fastqq")
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Simple concatenations
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
res@filenames <- c(lhs@filenames, rhs@filenames)
res@nFiles <- lhs@nFiles+rhs@nFiles
res@nReads <- c(lhs@nReads, rhs@nReads)
res@nN <- c(lhs@nN, rhs@nN)
res@seqLenCount <- cbind(lhs@seqLenCount, rhs@seqLenCount)
res@gcContent <- cbind(gcContentMatrix(lhs), gcContentMatrix(rhs))
res@seqLen <- cbind(lhs@seqLen, rhs@seqLen)
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Singularize probeLabel
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
res@probeLabel <- c(lhs@probeLabel, rhs@probeLabel)
if(any(table(res@probeLabel) > 1))
{
message("[mergeFastqq] Singularizing probeLabel (new suffix).")
res@probeLabel <- paste(1:res@nFiles, res@probeLabel, sep="_")
}
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
## Eventually resize arrays when lhs and rhs have different maxSeqLen
## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ##
if(lhs@maxSeqLen > rhs@maxSeqLen)
{
message("[mergeFastqq] Resizing rhs.")
msl <- lhs@maxSeqLen
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## nac
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
new_dim <- as.integer(c(nrow(rhs@nac), msl))
rhs_nac <- .Call("r_enlarge_int_mat", rhs@nac, new_dim,
PACKAGE="seqTools")
res@nac <- c(lhs@nac, rhs_nac)
# seqLenCount
new_dim <- as.integer(c(msl, rhs@nFiles))
rhs_seqLenCount <- .Call("r_enlarge_int_mat",
rhs@seqLenCount, new_dim, PACKAGE="seqTools")
res@seqLenCount <- c(lhs@seqLenCount, rhs_seqLenCount)
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## phred
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
new_dim <- as.integer(nrow(rhs@phred), msl)
rhs_phred_list <- list()
for(i in 1:nFiles(rhs)){
rhs_phred_list[[i]] <- .Call("r_enlarge_int_mat",
rhs@phred[[i]], new_dim, PACKAGE="seqTools")
}
res@phred <- c(lhs@phred, rhs_phred_list)
res@maxSeqLen <- lhs@maxSeqLen
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
} else if(rhs@maxSeqLen > lhs@maxSeqLen)
{
message("[mergeFastqq] Resizing lhs.")
msl <- rhs@maxSeqLen
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## nac
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
new_dim <- as.integer(c(nrow(lhs@nac), msl))
lhs_nac <- .Call("r_enlarge_int_mat",
lhs@nac, new_dim, PACKAGE="seqTools")
res@nac <- c(rhs@nac, lhs_nac)
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## seqLenCount
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
new_dim <- as.integer(c(msl, lhs@nFiles))
lhs_seqLenCount <- .Call("r_enlarge_int_mat",
lhs@seqLenCount, new_dim, PACKAGE="seqTools")
res@seqLenCount <- c(rhs@seqLenCount, lhs_seqLenCount)
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## phred
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
new_dim <- as.integer(nrow(lhs@phred), msl)
lhs_phred_list <- list()
for(i in 1:nFiles(lhs))
{
lhs_phred_list[[i]] <- .Call("r_enlarge_int_mat",
lhs@phred[[i]], new_dim, PACKAGE="seqTools")
}
res@phred <- c(rhs@phred, lhs_phred_list)
res@maxSeqLen <- rhs@maxSeqLen
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
} else {
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## rhs@maxSeqLen == lhs@maxSeqLen
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
res@maxSeqLen = lhs@maxSeqLen
res@nac <- c(lhs@nac, rhs@nac)
res@phred <- c(lhs@phred, rhs@phred)
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
}
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Eventually melt down k-mer count matrix
## when lhs and rhs have different k
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
res@k <- pmin(lhs@k, rhs@k)
kml <- kmerCount(lhs)
if(lhs@k > res@k)
{
kml <- .Call("melt_kmer_matrix",
kml, c(lhs@k, res@k), PACKAGE="seqTools")
}
kmr <- kmerCount(rhs)
if(rhs@k > res@k)
{
kmr <- .Call("melt_kmer_matrix",
kmr, c(rhs@k, res@k), PACKAGE="seqTools")
}
res@kmer <- cbind(kml, kmr)
fkml <- lhs@firstKmer
if(lhs@k > res@k)
{
fkml <- .Call("melt_kmer_matrix",
fkml, c(lhs@k, res@k), PACKAGE="seqTools")
}
fkmr <- rhs@firstKmer
if(rhs@k > res@k)
{
fkmr <- .Call("melt_kmer_matrix",
fkmr, c(rhs@k, res@k), PACKAGE="seqTools")
}
res@firstKmer <- cbind(fkml, fkmr)
return(res)
})
setMethod("meltDownK", "Fastqq", function(object, newK)
{
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
if(!is.numeric(newK))
stop("'newK' must be numeric")
newK <- as.integer(newK)
if(length(newK) != 1)
stop("'newK' must have length 1.")
if(newK < 1 || newK >= getK(object))
stop("'newK' must be >= 1 and <= ", getK(object), ".")
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
res <- new("Fastqq")
res@filenames <- object@filenames
res@nFiles <- object@nFiles
res@k <- newK
res@maxSeqLen <- object@maxSeqLen
res@kmer <- .Call("melt_kmer_matrix",
object@kmer, c(getK(object), newK), PACKAGE="seqTools")
res@firstKmer <- .Call("melt_kmer_matrix",
object@firstKmer, c(getK(object), newK), PACKAGE="seqTools")
res@nReads <- object@nReads
res@nN <- object@nN
res@seqLenCount <- object@seqLenCount
res@gcContent <- object@gcContent
res@nac <- object@nac
res@phred <- object@phred
res@seqLen <- object@seqLen
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
return(res)
})
listMelt <- function(x, oldK, newK)
{
f <- function(x) .Call("melt_kmer_matrix", x,
as.integer(c(oldK, newK)), PACKAGE="seqTools")
return(lapply(x, f))
}
setMethod("[", "Fastqq", function(x, i, j, drop="missing")
{
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
res <- new("Fastqq")
res@filenames <- x@filenames[i]
res@probeLabel <- x@probeLabel[i]
res@nFiles <- length(i)
res@k <- x@k
res@seqLen <- x@seqLen[, i, drop=FALSE]
res@maxSeqLen <- max(res@seqLen[2, ])
res@kmer <- x@kmer[, i, drop=FALSE]
res@firstKmer <- x@firstKmer[, i, drop=FALSE]
res@nN <- x@nN[i]
res@seqLenCount <- x@seqLenCount[, i, drop=FALSE]
res@gcContent <- x@gcContent[, i, drop=FALSE]
res@nac <- x@nac[i]
res@phred <- x@phred[i]
res@nReads <- x@nReads[i]
res@collectTime <- x@collectTime
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
return(res)
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Calculation of distance matrix based on Canberra distance
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("cbDistMatrix", "Fastqq",
function(object, nReadNorm=max(nReads(object)))
{
if(!is.numeric(nReadNorm))
stop("'nReadNorm' must be numeric.")
nReadNorm <- as.integer(nReadNorm)
if(nReadNorm < max(nReads(object)))
stop("'nReadNorm' must be greater than all nRead.")
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Column-wise normalizing read counts (by upscaling)
## so that column sums become nearly equal in order to
## compensate sequencing depth artifacts in
## Canberra distance values
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
scale <- nReadNorm/nReads(object)
scaled <- .Call("scale_kmer_matrix",
kmerCount(object), scale, PACKAGE="seqTools")
colnames(scaled) <- object@probeLabel
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
return(.Call("cb_distance_matrix", scaled, PACKAGE="seqTools"))
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## END OF FILE
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.