Nothing
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## ##
## Project : seqTools ##
## Created : 26.August.2013 ##
## Author : W. Kaisers ##
## Content : Doing some diagnostic and interventional tasks on fastq ##
## and fasta ##
## esp. DNA k-mer counts. ##
## Version : 0.99.34 ##
## ##
## Changelog : ##
## 26.08.13 : 0.0.1 Project created ##
## 03.09.13 : 0.0.6 C-Code valgrind tested ##
## 27.09.13 : 0.1.0 Added fastqDnaKmers ##
## 14.10.13 : 0.1.1 Added C-library for parsing gz fasta and fastq files ##
## 17.10.13 : 0.1.3 C-Wrapper for fastq working. ##
## 17.10.13 : 0.1.6 First version of R package ##
## 21.10.13 : 0.3.0 New C library for fastq parsing ##
## 28.10.13 : 0.4.0 Added fastq-loc functions ##
## 29.10.13 : 0.4.4 seqTools C-code valgrind tested. ##
## 01.11.13 : 0.5.0 Distance matrices implemented ##
## 02.11.13 : 0.5.1 First working version with clustering based on ##
## K-mers ##
## 07.11.13 : 0.5.4 countFastaKmers now resizes arrays automatically ##
## 09.11.13 : 0.6.0 count_fastq now resizes arrays automatically ##
## 11.11.13 : 0.6.5 Added fastq simulation functions ##
## 19.11.13 : 0.7.0 Added trimFastq function ##
## 30.11.13 : 0.9.0 Separated R source files ##
## 22.12.13 : 0.99.2 Added '['-operator for Fastqq class ##
## 19.01.14 : 0.99.7 Added zlib version control for correction of ##
## gzBuffer ##
## error ##
## + checks: cran win-builder + valgrind ##
## 21.05.14 : 0.99.34 Corrected error in count_fasta_Kmers ##
## which freezed function ##
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
.onUnload <- function(libpath) { library.dynam.unload("seqTools", libpath) }
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Data collection functions:
## Fastqq, fastqKmerLocs, fastqKmerSubsetLocs
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
fastqq <- function(filenames, k = 6, probeLabel)
{
k <- as.integer(k)
tl <- list()
tl$start <- Sys.time()
res <- .Call("count_fastq", filenames, k, PACKAGE = "seqTools")
tl$end <- Sys.time()
res@collectTime <- tl
if(!missing(probeLabel))
{
if(length(probeLabel) == res@nFiles)
res@probeLabel <- as.character(probeLabel)
else{
warning("[Fastqq] probeLabel and filenames must have equal length.")
res@probeLabel <- as.character(1:res@nFiles)
}
}else{
res@probeLabel <- as.character(1:res@nFiles)
}
return(res)
}
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## fastq K-mer locs
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
fastqKmerLocs <- function(filenames, k = 4)
{
if(!is.numeric(k))
stop("'k' must be numeric.")
k <- as.integer(k)
if( (k < 0) || (k > max_k) )
stop("'k' must be in range 0, ... , 16.")
return(.Call("fastq_Kmer_locs", filenames, k, PACKAGE = "seqTools"))
}
fastqKmerSubsetLocs <- function(filenames, k = 4, kIndex)
{
# Returns list with matrix elements.
if(!is.numeric(k))
stop("'k' must be numeric.")
k <- as.integer(k)
if( (k < 0) || (k > max_k) )
stop("'k' must be in range 0, ... , ", max_k, ".")
if(!is.numeric(kIndex))
stop("'kIndex' must be numeric.")
kIndex <- as.integer(kIndex)
if(any(kIndex < 0))
stop("No negative 'kIndex' values allowed.")
if(any(kIndex > (4^k)) )
stop("'kIndex' out of range (>4^k).")
return(.Call("fastq_KmerSubset_locs", filenames, k, kIndex,
PACKAGE = "seqTools"))
}
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## End: Data collection functions.
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Standard slot accessor functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("fileNames", "Fastqq", function(object)
{return(object@filenames)})
setMethod("collectTime", "Fastqq", function(object)
{return(object@collectTime)})
setMethod("collectDur", "Fastqq", function(object) {
return(as.numeric(difftime(object@collectTime$end, object@collectTime$start,
units = "secs")))
})
setMethod("getK", "Fastqq", function(object) {return(object@k)})
setMethod("nFiles", "Fastqq", function(object) {return(object@nFiles)})
setMethod("nNnucs", "Fastqq", function(object) {return(object@nN)})
setMethod("nReads", "Fastqq", function(object) {return(object@nReads)})
setMethod("maxSeqLen", "Fastqq", function(object) {return(object@maxSeqLen)})
setMethod("seqLenCount", "Fastqq", function(object)
{
res<-object@seqLenCount
colnames(res) <- object@probeLabel
return(res)
})
setMethod("nucFreq", "Fastqq", function(object, i)
{
if(missing(i))
stop("Argument 'i' is not optional.")
if(!is.numeric(i))
stop("'i' must be numeric.")
i <- as.integer(i)
if( (i < 1) || (i > object@nFiles) )
stop("'i' must be >0 and < nFiles.")
return(object@nac[[i]])
})
setMethod("gcContent", "Fastqq", function(object, i)
{
if(missing(i))
stop("Argument 'i' is not optional.")
if(!is.numeric(i))
stop("'i' must be numeric.")
i <- as.integer(i)
if( (i < 1) || (i > object@nFiles))
stop("'i' must be >0 and < nFiles.")
return(object@gcContent[, i])
})
setMethod("gcContentMatrix", "Fastqq", function(object)
{
gcc <- object@gcContent
colnames(gcc) <- object@probeLabel
return(gcc)
})
setMethod("seqLen", "Fastqq", function(object)
{
sql <- object@seqLen
colnames(sql) <- object@probeLabel
return(sql)
})
setMethod("kmerCount", "Fastqq", function(object)
{
kmer <- object@kmer
colnames(kmer) <- object@probeLabel
return(kmer)
})
setMethod("probeLabel", "Fastqq", function(object){return(object@probeLabel)})
setReplaceMethod("probeLabel", "Fastqq", function(object, value)
{
if(length(value) != nFiles(object))
stop("'value' must have length ", nFiles(object), ".")
val <- as.character(value)
if(any(table(val)) > 1)
{
warning("[probeLabel <- .Fastqq] probeLabel unique suffix added.")
val <- paste(1:nFiles(object), val, sep = "_")
}
object@probeLabel <- val
return(object)
})
setMethod("phred", signature = "Fastqq", definition = function(object, i)
{
if(missing(i))
stop("Argument 'i' is not optional.")
if(!is.numeric(i))
stop("'i' must be numeric.")
i <- as.integer(i)
if( (i < 1) || (i > object@nFiles) )
stop("'i' must be >0 and < nFiles.")
return(object@phred[[i]])
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## End: Standard slot accessor functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("show", "Fastqq", function(object){
bm <- Sys.localeconv()[7]
w <- 20
r <- "right"
cat("Class : ", format(class(object), w = w, j = r) , "\n", sep = "")
cat("nFiles : ", format(format(nFiles(object) , big.m = bm), w = w, j = r) , "\n", sep = "")
cat("maxSeqLen : ", format(format(maxSeqLen(object) , big.m = bm), w = w, j = r) , "\n", sep = "")
cat("k (Kmer len): ", format(format(getK(object) , big.m = bm), w = w, j = r) , "\n", sep = "")
cat("\n")
cat("nReads : ", format(format(sum(as.numeric(nReads(object))) , big.m = bm), w = w, j = r) , "\n", sep = "")
cat("nr N nuc : ", format(format(sum(nNnucs(object)) , big.m = bm), w = w, j = r) , "\n", sep = "")
cat("Min seq len : ", format(format(min(seqLen(object)[1, ]), big.m = bm), w = w, j = r) , "\n", sep = "")
cat("Max seq len : ", format(format(max(seqLen(object)[2, ]), big.m = bm), w = w, j = r) , "\n", sep = "")
return(invisible())
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Phred related functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("phredQuantiles", "Fastqq", function(object, quantiles, i, ...){
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Checking arguments
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
## Check quantiles argument
if(missing(quantiles))
stop("'quantiles' argument is not optional")
if(!is.numeric(quantiles))
stop("Quantiles must be numeric.")
if(!(all(quantiles >= 0) & all(quantiles <= 1) ))
stop("All quantiles mustbe in [0, 1]")
quantiles <- sort(unique(round(quantiles, 2)))
## Check 'i' argument
if(missing(i))
stop("'i' argument is not optional")
if(!is.numeric(i))
stop("'i' must be numeric.")
if(length(i)>1)
stop("'i' must have length 1")
i <- as.integer(i)
if( (i < 1) || (i > object@nFiles) )
stop("'i' must be >0 and <nFiles.")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Count qual values for each sequence position
# Convert integer counts into column-wise relative values
# Maximum counted read length
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
vec <- 1:seqLen(object)[2, i]
qrel <- as.data.frame(apply(object@phred[[i]][, vec], 2, rel_int))
names(qrel) <- vec
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Walk through each column and extract row number
# for given quantile values
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
res <- .Call("get_column_quantiles", quantiles, qrel, PACKAGE = "seqTools")
return(res)
})
setMethod("plotPhredQuant", "Fastqq", function(object, i, main, ...){
if(!is.numeric(i))
stop("'i' must be numeric.")
i <- as.integer(i)
if( (i < 1) || (i > object@nFiles) )
stop("'i' must be >0 and <nFiles.")
quant <- c(0.1, 0.25, 0.5, 0.75, 0.9)
cols <- c("#1F78B4", "#FF7F00", "#E31A1C", "#FF7F00", "#1F78B4")
qq <- phredQuantiles(object, quant, i)
maxQ = floor(1.2*max(qq))
xv <- 1:ncol(qq)
if(missing(main))
main <- paste("Position wise Phred Quantiles (", probeLabel(object)[i], ")", sep = "")
plot(xv, xv, ylim = c(0, maxQ), type = "n", bty = "n", las = 1, ylab = "Phred score",
xlab = "Sequence position", main = main, ...)
lines(xv, qq[1, ], col = cols[1], lty = 2)
lines(xv, qq[2, ], col = cols[2], lty = 1)
lines(xv, qq[3, ], col = cols[3], lwd = 2)
lines(xv, qq[4, ], col = cols[4], lty = 1)
lines(xv, qq[5, ], col = cols[5], lty = 2)
legend("top", ncol = 6, lty = c(2, 1, 1, 1, 2),
lwd = c(1, 1, 2, 1, 1), col = cols, xjust = 0.5,
legend = c("10%", "25%", "50%", "75%", "90%"), bty = "n", cex = 0.8)
return(invisible())
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Global Phred content functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("phredDist", "Fastqq", function(object, i){
idx <- 1:nFiles(object)
if(missing(i))
i <- idx
else
{
if(!is.numeric(i))
stop("'i' must be numeric.")
if(!all(is.element(i, idx)))
stop("'i' is out of range.")
}
phred <- Reduce("+", object@phred[i])
phred <- matrix(as.numeric(phred), nrow = nrow(phred))
phred_vals <- apply(phred, 1, sum)
phred_dist <- phred_vals/sum(phred_vals)
names(phred_dist) <- rownames(object@phred[[1]])
return(phred_dist)
})
setMethod("plotPhredDist", "Fastqq", function(object, i, maxp = 45, col, ...)
{
if(!is.numeric(maxp))
stop("maxp must be numeric")
if(!is.integer(maxp))
maxp<-as.integer(maxp)
if(maxp<=0)
stop("maxp must be >=0")
if(missing(col))
col <- topo.colors(10)[3]
phred <- phredDist(object, i)
maxy <- ceiling(max(phred) * 5) / 5
x <- 1:maxp
xmax <- 10*(ceiling(maxp/10))
plot(x, phred[x], ylim = c(0, maxy), xlim = c(0, xmax), type = "l", lwd = 2,
col = col, yaxt = "n", bty = "n", xlab = "Phred value",
ylab = "Content (%)", ...)
ylb <- 0:(10 * maxy) / 10
axis(side = 2, at = ylb, labels = 100*ylb, las = 1)
return(invisible())
})
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Not exported:
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
pd_l10 <- function(x){ x <- phredDist(x); return(sum(x[1:10]) / sum(x))}
setMethod("propPhred","Fastqq",function(object, greater = 30, less = 93)
{
if(!is.numeric(greater))
stop("'greater' must be numeric.")
if(length(greater) != 1)
stop("'greater' must have length 1.")
if(!is.numeric(less))
stop("'less' must be numeric.")
if(length(less) != 1)
stop("'less must have length 1.")
# + + + + + + + + + + + + + + + + + + + + + + #
# greater and less shall be used as
# array indices: increase greater
# + + + + + + + + + + + + + + + + + + + + + + #
greater<-as.integer(greater+1)
less<-as.integer(less)
if(greater < 1)
stop("'greater' must be >=0.")
if(less > 93)
stop("'less' must be < 94.")
if(greater>=less)
stop("'greater' must be <= 'less'")
n<-nFiles(object)
res<-numeric(n)
for(i in 1:n)
{
pd<-phredDist(object,i)
res[i]<-sum(pd[greater:less])
}
names(res)<-probeLabel(object)
return(res)
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## End: Global Phred content functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("mergedPhred", "Fastqq", function(object){
sql <- seqLen(object)
maxSeqLen <- max(sql[2, ])
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# as.numeric: Sum on integer is likely to exceed
# maximal 32-bit integer values
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
if(sql[2, 1] < maxSeqLen)
{
mp <- as.numeric(.Call("r_enlarge_int_mat", object@phred[[1]],
c(nrow(object@phred[[1]]), maxSeqLen), PACKAGE = "seqTools"))
}else{
mp <- as.numeric(object@phred[[1]])
}
n <- nFiles(object)
if(n>1)
{
for(i in 2:n)
{
if(sql[2, i] < maxSeqLen)
{
mp <- mp+as.numeric(.Call("r_enlarge_int_mat", object@phred[[i]],
c(nrow(object@phred[[i]]), maxSeqLen), PACKAGE = "seqTools"))
}else{
mp <- mp+as.numeric(object@phred[[i]])
}
}
}
mp <- matrix(mp, ncol = maxSeqLen)
rownames(mp) <- rownames(object@phred[[1]])
colnames(mp) <- 1:maxSeqLen
return(mp)
})
setMethod("mergedPhredQuantiles", "Fastqq", function(object, quantiles){
if(!(all(quantiles >= 0) & all(quantiles <= 1)) )
stop("[mergedPhredQuantiles.Fastqq] all quantiles mustbe in [0, 1]")
quantiles <- sort(unique(round(quantiles, 2)))
sql <- seqLen(object)
maxSeqLen <- max(sql[2, ])
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Count qual values for each sequence position
# Convert counts into column-wise relative values
# Maximum counted read length
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
vec <- 1:maxSeqLen
mrg <- mergedPhred(object)
qrel <- as.data.frame(apply(mrg[, vec], 2, rel_real))
names(qrel)
names(qrel) <- vec
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Walk through each column and extract row number
# for given quantile values
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
res <- .Call("get_column_quantiles", quantiles, qrel, PACKAGE = "seqTools")
return(res)
})
setMethod("plotMergedPhredQuant", "Fastqq", function(object, main, ...){
quant <- c(0.1, 0.25, 0.5, 0.75, 0.9)
cols <- c("#1F78B4", "#FF7F00", "#E31A1C", "#FF7F00", "#1F78B4")
qq <- mergedPhredQuantiles(object, quant)
maxQ = floor(1.2*max(qq))
xv <- 1:ncol(qq)
if(missing(main))
main <- paste("Merged position wise Phred Quantiles.", sep = "")
plot(xv, xv, ylim = c(0, maxQ), type = "n", bty = "n", las = 1,
ylab = "Phred score", xlab = "Sequence position", main = main, ...)
lines(xv, qq[1, ], col = cols[1], lty = 2)
lines(xv, qq[2, ], col = cols[2], lty = 1)
lines(xv, qq[3, ], col = cols[3], lwd = 2)
lines(xv, qq[4, ], col = cols[4], lty = 1)
lines(xv, qq[5, ], col = cols[5], lty = 2)
legend("top", ncol = 6, lty = c(2, 1, 1, 1, 2),
lwd = c(1, 1, 2, 1, 1), col = cols, xjust = 0.5,
legend = c("10%", "25%", "50%", "75%", "90%"), bty = "n", cex = 0.8)
return(invisible())
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## End: Phred related functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Nucleotide frequency related functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("plotNucFreq", "Fastqq", function(object, i, main, maxx, ...){
if(!is.numeric(i))
stop("'i' must be numeric.")
i <- as.integer(i)
if( (i < 1) || (i > object@nFiles) )
stop("'i' must be >0 and <nFiles.")
maxSeqlen <- max(seqLen(object)[2, ])
if(missing(maxx))
{
maxx <- maxSeqlen
}
else
{
if(!is.numeric(maxx))
stop("'maxx' must be numeric.")
maxx <- as.integer(maxx)
if(maxx < 1)
stop("'maxx' must be >0.")
if(maxx > maxSeqlen)
maxx <- maxSeqlen
}
# Skip extra line
x <- 1:maxx
nac <- object@nac[[i]][1:4, x]
nacrel <- apply(nac, 2, rel_int)
maxY = round(1.4*max(nacrel), 1)
# Maximum counted read length
nacrel <- apply(nac, 2, rel_int)
cols <- c("#1F78B4", "#33A02C", "#E31A1C", "#FF7F00")
if(missing(main))
main <- paste("Position wise Nucleotide frequency (",
probeLabel(object)[i], ")", sep = "")
plot(x, x, ylim = c(0, maxY), type = "n", bty = "n", las = 1,
ylab = "Nucleotide fequency", xlab = "sequence position",
main = main, ...)
lines(x, nacrel[1, ], col = cols[1], lwd = 2)
lines(x, nacrel[2, ], col = cols[2], lwd = 2)
lines(x, nacrel[3, ], col = cols[3], lwd = 2)
lines(x, nacrel[4, ], col = cols[4], lwd = 2)
legend("top", ncol = 6,
lwd = c(1, 1, 2, 1, 1), col = cols, xjust = 0.5,
legend = c("A", "C", "G", "T"), bty = "n", cex = 0.8)
return(invisible())
})
setMethod("plotGCcontent", "Fastqq", function(object,main,...)
{
cols <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99",
"#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Normalize matrix to colsum = 1
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
gc <- apply(object@gcContent, 2, rel_int)
maxY = round(1.3*max(gc), 2)
nf <- nFiles(object)
x <- 1:nrow(gc)
if(missing(main))
main<-"GC content"
plot(x, x, ylim = c(0, maxY), type = "n", bty = "n", las = 1,
ylab = "Proportion of reads (%)", xlab = "Relative GC content (%)",
main = main, ...)
for(i in 1:nf)
lines(x, gc[, i], col = cols[i], lwd = 2)
legend("right", lwd = 2, col = cols, legend = probeLabel(object),
bty = "n", cex = 0.8)
return(invisible())
})
setMethod("plotNucCount", "Fastqq", function(object, nucs = 16, maxx, ...){
# j = 16: N, j = 2:3: gc
if(!is.numeric(nucs))
stop("'nucs' must be numeric.")
nucs <- as.integer(nucs)
if(any(nucs < 1) || any(nucs > 19))
stop("'nucs' must be >0 and <20.")
maxSeqlen <- max(seqLen(object)[2, ])
if(missing(maxx))
{
maxx <- maxSeqlen
}
else
{
if(!is.numeric(maxx))
stop("'maxx' must be numeric.")
maxx <- as.integer(maxx)
if(maxx<1)
stop("'maxx must be >0.")
if(maxx>maxSeqlen)
maxx <- maxSeqlen
}
n <- nFiles(object)
fvec <- 1:n
i <- 1
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Skip extra line
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
x <- 1:maxx
nac <- object@nac[[i]][, x]
nacrel <- apply(nac, 2, rel_int)
if(length(nucs) == 1){
dfr <- data.frame(a = nacrel[nucs, ])
}else{
dfr <- data.frame(a = apply(nacrel[nucs, ], 2, sum))
}
if(n > 1)
{
for(i in 2:n)
{
nac <- object@nac[[i]][, x]
nacrel <- apply(nac, 2, rel_int)
if(length(nucs) == 1){
dfr[, i] <- nacrel[nucs, ]
}else{
dfr[, i] <- apply(nacrel[nucs, ], 2, sum)
}
}
}
maxY = round(1.4*max(dfr), 3)
cols <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99",
"#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A")
nv <- paste(iupac_chars[nucs], collapse = "")
plot(x, x, ylim = c(0, maxY), type = "n", bty = "n", las = 1,
ylab = "Nucleotide fequency", xlab = "sequence position",
main = paste("Position wise Nucleotide frequency: '",
nv, "'", sep = ""))
for(i in fvec)
lines(x, dfr[, i], col = cols[i%%10], lwd = 2)
legend("top", ncol = 6,
lwd = c(1, 1, 2, 1, 1), col = cols[fvec%%10], xjust = 0.5,
legend = probeLabel(object), bty = "n", cex = 0.8)
return(invisible())
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## End: Nucleotide frequency related functions
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("plotKmerCount", "Fastqq", function(object, index, mxey,
main = "K-mer count", ...){
n <- nFiles(object)
if(missing(index))
{
index <- 1:n
}else{
if(!is.numeric(index))
stop("'index' must be numeric.")
index <- sort(unique(as.integer(index)))
if(any(index < 0) || any(index > n))
stop("'index' must be in 1, .., ", n, " ( = nFiles).")
}
if(!missing(mxey))
{
if(!is.numeric(mxey))
stop("'mxey' must be numeric.")
mxey <- as.integer(mxey)
if(mxey<1)
stop("'mxey' must be positive.")
}
cols <- c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99",
"#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A")
pk <- 6
if(object@k <= pk)
{
pk <- object@k
x <- 1:(4^pk)
if(missing(mxey))
lg2y <- floor(log2(1.2*(max(object@kmer))))
else
lg2y <- mxey
maxY <- 2^lg2y
plot(x, x, ylim = c(0, maxY), type = "n", bty = "n", las = 1,
ylab = "K-mer count", xlab = "K-mer index", main = main,
axes = FALSE, ...)
for(i in index)
lines(x, object@kmer[, i], col = cols[i], lwd = 2)
}
else
{
x <- 1:(4^pk)
melt_factor <- as.integer( 4^(object@k - pk) )
y_factor <- max(.Call("melt_vector", object@kmer[, 1], melt_factor,
PACKAGE = "seqTools"))/max(object@kmer[, 1])
if(missing(mxey))
lg2y <- floor(log2(1.2*(max(object@kmer))*y_factor))
else
lg2y <- mxey
maxY <- 2^lg2y
plot(x, x, ylim = c(0, maxY), type = "n", bty = "n", las = 1,
ylab = "K-mer count", xlab = "K-mer index",
main = main, axes = FALSE, ...)
for(i in index)
{
lines(x, .Call("melt_vector", object@kmer[, i], melt_factor,
PACKAGE = "seqTools"), col = cols[i], lwd = 2)
}
}
axis(side = 1, at = 0:4 * 4^(pk - 1), labels = c("A", "C", "G", "T", ""))
axis(side = 2, at = c(0, maxY),
labels = c(0, paste("2^", lg2y, sep = "")), las = 1)
legend("right", lwd = 2, col = cols[index],
legend = probeLabel(object)[index], bty = "n", cex = 0.8)
return(invisible())
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Merging and melting Fastqq objects
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("mergeFastqq", "Fastqq", function(lhs, rhs){
res <- new("Fastqq")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Simple concatenations
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
res@filenames <- c(lhs@filenames, rhs@filenames)
res@nFiles <- lhs@nFiles+rhs@nFiles
res@nReads <- c(lhs@nReads, rhs@nReads)
res@nN <- c(lhs@nN, rhs@nN)
res@seqLenCount <- cbind(lhs@seqLenCount, rhs@seqLenCount)
res@gcContent <- cbind(gcContentMatrix(lhs), gcContentMatrix(rhs))
res@seqLen <- cbind(lhs@seqLen, rhs@seqLen)
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Singularize probeLabel
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
res@probeLabel <- c(lhs@probeLabel, rhs@probeLabel)
if(any(table(res@probeLabel)>1))
{
message("[mergeFastqq] Singularizing probeLabel (new suffix).")
res@probeLabel <- paste(1:res@nFiles, res@probeLabel, sep = "_")
}
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Eventually resize arrays when lhs and rhs have different maxSeqLen
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
if(lhs@maxSeqLen>rhs@maxSeqLen)
{
message("[mergeFastqq] Resizing rhs.")
msl <- lhs@maxSeqLen
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# nac
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
new_dim <- as.integer(c(nrow(rhs@nac), msl))
rhs_nac <- .Call("r_enlarge_int_mat", rhs@nac, new_dim, PACKAGE = "seqTools")
res@nac <- c(lhs@nac, rhs_nac)
# seqLenCount
new_dim <- as.integer(c(msl, rhs@nFiles))
rhs_seqLenCount <- .Call("r_enlarge_int_mat", rhs@seqLenCount, new_dim,
PACKAGE = "seqTools")
res@seqLenCount <- c(lhs@seqLenCount, rhs_seqLenCount)
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# phred
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
new_dim <- as.integer(nrow(rhs@phred), msl)
rhs_phred_list <- list()
for(i in 1:nFiles(rhs))
rhs_phred_list[[i]] <- .Call("r_enlarge_int_mat", rhs@phred[[i]],
new_dim, PACKAGE = "seqTools")
res@phred <- c(lhs@phred, rhs_phred_list)
res@maxSeqLen <- lhs@maxSeqLen
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
} else if(rhs@maxSeqLen>lhs@maxSeqLen) {
message("[mergeFastqq] Resizing lhs.")
msl <- rhs@maxSeqLen
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# nac
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
new_dim <- as.integer(c(nrow(lhs@nac), msl))
lhs_nac <- .Call("r_enlarge_int_mat", lhs@nac, new_dim,
PACKAGE = "seqTools")
res@nac <- c(rhs@nac, lhs_nac)
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# seqLenCount
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
new_dim <- as.integer(c(msl, lhs@nFiles))
lhs_seqLenCount <- .Call("r_enlarge_int_mat", lhs@seqLenCount,
new_dim, PACKAGE = "seqTools")
res@seqLenCount <- c(rhs@seqLenCount, lhs_seqLenCount)
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# phred
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
new_dim <- as.integer(nrow(lhs@phred), msl)
lhs_phred_list <- list()
for(i in 1:nFiles(lhs))
lhs_phred_list[[i]] <- .Call("r_enlarge_int_mat", lhs@phred[[i]], new_dim, PACKAGE = "seqTools")
res@phred <- c(rhs@phred, lhs_phred_list)
res@maxSeqLen <- rhs@maxSeqLen
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
} else {
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# rhs@maxSeqLen == lhs@maxSeqLen
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
res@maxSeqLen = lhs@maxSeqLen
res@nac <- c(lhs@nac, rhs@nac)
res@phred <- c(lhs@phred, rhs@phred)
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
}
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Eventually melt down k-mer count matrix when lhs and rhs have different k
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
res@k <- pmin(lhs@k, rhs@k)
kml <- kmerCount(lhs)
if(lhs@k>res@k)
kml <- .Call("melt_kmer_matrix", kml, c(lhs@k, res@k),
PACKAGE = "seqTools")
kmr <- kmerCount(rhs)
if(rhs@k>res@k)
kmr <- .Call("melt_kmer_matrix", kmr, c(rhs@k, res@k),
PACKAGE = "seqTools")
res@kmer <- cbind(kml, kmr)
fkml <- lhs@firstKmer
if(lhs@k>res@k)
fkml <- .Call("melt_kmer_matrix", fkml, c(lhs@k, res@k),
PACKAGE = "seqTools")
fkmr <- rhs@firstKmer
if(rhs@k>res@k)
fkmr <- .Call("melt_kmer_matrix", fkmr, c(rhs@k, res@k),
PACKAGE = "seqTools")
res@firstKmer <- cbind(fkml, fkmr)
return(res)
})
setMethod("meltDownK", "Fastqq", function(object, newK){
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
if(!is.numeric(newK))
stop("'newK' must be numeric")
newK <- as.integer(newK)
if(length(newK) != 1)
stop("'newK' must have length 1.")
if(newK < 1 || newK >= getK(object))
stop("'newK' must be >= 1 and <= ", getK(object), ".")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
res <- new("Fastqq")
res@filenames <- object@filenames
res@nFiles <- object@nFiles
res@k <- newK
res@maxSeqLen <- object@maxSeqLen
res@kmer <- .Call("melt_kmer_matrix", object@kmer, c(getK(object), newK),
PACKAGE = "seqTools")
res@firstKmer <- .Call("melt_kmer_matrix", object@firstKmer,
c(getK(object), newK), PACKAGE = "seqTools")
res@nReads <- object@nReads
res@nN <- object@nN
res@seqLenCount <- object@seqLenCount
res@gcContent <- object@gcContent
res@nac <- object@nac
res@phred <- object@phred
res@seqLen <- object@seqLen
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
return(res)
})
listMelt <- function(x, oldK, newK) {
f <- function(x) .Call("melt_kmer_matrix", x,
as.integer(c(oldK, newK)), PACKAGE = "seqTools")
return(lapply(x, f))
}
setMethod("[", "Fastqq", function(x, i, j, drop = "missing"){
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
res <- new("Fastqq")
res@filenames <- x@filenames[i]
res@probeLabel <- x@probeLabel[i]
res@nFiles <- length(i)
res@k <- x@k
res@seqLen <- x@seqLen[, i, drop = FALSE]
res@maxSeqLen <- max(res@seqLen[2, ])
res@kmer <- x@kmer[, i, drop = FALSE]
res@firstKmer <- x@firstKmer[, i, drop = FALSE]
res@nN <- x@nN[i]
res@seqLenCount <- x@seqLenCount[, i, drop = FALSE]
res@gcContent <- x@gcContent[, i, drop = FALSE]
res@nac <- x@nac[i]
res@phred <- x@phred[i]
res@nReads <- x@nReads[i]
res@collectTime <- x@collectTime
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
return(res)
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Calculation of distance matrix based on Canberra distance
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
setMethod("cbDistMatrix", "Fastqq", function(object,
nReadNorm = max(nReads(object)))
{
if(!is.numeric(nReadNorm))
stop("'nReadNorm' must be numeric.")
nReadNorm <- as.integer(nReadNorm)
if(nReadNorm < max(nReads(object)))
stop("'nReadNorm' must be greater than all nRead.")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Column-wise normalizing read counts (by upscaling)
# so that column sums become nearly equal in order to
# compensate sequencing depth artifacts in
# Canberra distance values
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
scale <- nReadNorm/nReads(object)
scaled <- .Call("scale_kmer_matrix", kmerCount(object), scale,
PACKAGE = "seqTools")
colnames(scaled) <- object@probeLabel
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
return(.Call("cb_distance_matrix", scaled, PACKAGE = "seqTools"))
})
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## END OF FILE
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.