Nothing
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## ##
## Project : seqTools ##
## Created : 26.August.2013 ##
## Author : W. Kaisers ##
## Version : 0.99.13 ##
## File : simFunctions.r ##
## Content : Functions which produce fastq files with simulated data ##
## (Done for testing Fastqq and DNA k-mer counting) ##
## writeSimFastq, writeSimContFastq, simFastqqRunTimes, ##
## sim_fq ##
## ##
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Create fastq files with simulated k-mers
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
writeSimFastq<-function(k = 6, nk = 5, nSeq = 10, filename = "sim.fq.gz")
{
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# k
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
if(!is.numeric(k))
stop("'k' must be numeric.")
if(length(k) != 1)
stop("'k' must have length 1.")
k <- as.integer(k)
if( (k < 1) || (k > max_k) )
stop("'k' out of range.")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# nk
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
if(!is.numeric(nk))
stop("'nk' must be numeric.")
if(length(nk) != 1)
stop("'nk' must have length 1.")
nk<-as.integer(nk)
if( (nk < 1) || (nk > 1000) )
stop("nk must be positive and <1000.")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# nSeq
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
if(!is.numeric(nSeq))
stop("'nSeq' must be numeric.")
if(nSeq < 1)
stop("'nSeq' must be positive.")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# filename
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
if(!is.character(filename))
stop("'filename' must be string.")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Do the work
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
bm <- Sys.localeconv()[7]
val <- as.integer(c(k, nk, nSeq))
pseq <- .Call("sim_dna_k_mer", val, PACKAGE = "seqTools")
res <- .Call("gzwrite_fastq_dna", val, pseq, filename, PACKAGE = "seqTools")
message("[writeSimFastq] file '", basename(filename), "': ",
format(res,big.mark=bm), " Bytes written.")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Terminate function.
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
return(invisible())
}
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
writeSimContFastq<-function(k = 6, nk = 5, nSeq = 10, pos = 1,
kIndex = 1, nContam = nSeq, filename = "simc.fq.gz")
{
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# k
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
if(!is.numeric(k))
stop("k must be numeric.")
if(length(k) != 1)
stop("k must have length 1.")
k<-as.integer(k)
if( (k < 1) || (k > max_k) )
stop("k out of range.")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# nk
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
if(!is.numeric(nk))
stop("nk must be numeric.")
if(length(nk) != 1)
stop("nk must have length 1.")
nk<-as.integer(nk)
if( (nk < 1) || (nk > 1000) )
stop("nk must be positive and <1000.")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# nSeq
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
if(!is.numeric(nSeq))
stop("nSeq must be numeric.")
if(length(nSeq) != 1)
stop("nSeq must have length 1.")
if(nSeq < 1)
stop("nSeq must be positive.")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# pos
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
if(!is.numeric(pos))
stop("pos must be numeric.")
pos<-as.integer(pos)
if( any(pos < 1) || any(pos > (k*(nk-1)+1) ) )
stop("pos out of range.")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# writeSimContFastq should take 1-based positions
# while "set_dna_k_mer" takes 0-based positions
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
pos<-pos - 1L
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# kIndex
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
if(!is.numeric(kIndex))
stop("kIndex must be numeric.")
kIndex<-as.integer(kIndex)
if(length(pos) != length(kIndex))
stop("pos and kIndex must have equal length.")
if( any(kIndex < 1L) || any(kIndex > 4^k) )
stop("kIndex out of range.")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# writeSimContFastq should take 1-based k-indices
# while "set_dna_k_mer" takes 0-based k-indices
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
kIndex <- kIndex-1L
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# nContam: Deterministic replacements are copied
# into the first 'nContam' reads
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
if(!is.numeric(nContam))
stop("nContam must be integer.")
nContam<-as.integer(nContam)
if(length(nContam) != 1)
stop("nContam must have length 1.")
if( (nContam < 1) || (nContam > nSeq) )
stop("nContam must be >0 and <nSeq.")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# filename
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
if(!is.character(filename))
stop("filename must be character.")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Do the work
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
bm <- Sys.localeconv()[7]
val <- as.integer(c(k, nk, nSeq))
pseq <- .Call("sim_dna_k_mer", val, PACKAGE = "seqTools")
prseq <- .Call("set_dna_k_mer", val, pseq,pos, kIndex,
nContam, PACKAGE = "seqTools")
res <- .Call("gzwrite_fastq_dna", val, prseq, filename,
PACKAGE = "seqTools")
message("[writeSimContFastq] file '", basename(filename),
"': ", format(res,big.mark=bm),
" Bytes written.")
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Terminate function
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
return(invisible())
}
simFastqqRunTimes<-function(k, nSeq, filedir = ".")
{
if(missing(k))
k <- 2:(max_k)
else
{
if(!is.numeric(k))
stop("'k' must be numeric.")
k <- sort(unique(as.integer(k)))
if(any(k < 1) || any(k > max_k))
stop("'k' out of range.")
}
if(missing(nSeq))
nSeq <- as.integer(c(1e2, 1e3, 1e4, 1e5, 1e6, 1e7))
else {
if(!is.numeric(nSeq))
stop("'nSeq' must be numeric.")
nSeq <- as.integer(nSeq)
if(any(nSeq < 1) )
stop("'nSeq' must be positive.")
}
if(!is.character(filedir))
stop("'filedir' must be character.")
if(!file.exists(filedir))
{
if(!dir.create(filedir))
stop("Cannot create filedir '",filedir,"'.")
}
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Write simulated fastq files
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
message("[simFastqqRunTimes] Creating fastq files:")
fqFiles <- character(length(nSeq))
bm <- Sys.localeconv()[7]
for(j in 1:length(nSeq))
{
fqFiles[j] <- file.path(filedir,
paste("sfqrt_nSeq", nSeq[j], ".fq.gz", sep=""))
message("[simFastqqRunTimes] (",format(j , w = 3),"/", length(nSeq),
") nSeq=", format(nSeq[j], big.mark = bm, w = 12))
# Allways equal sized reads: k=6, read length=102
writeSimFastq(k = 6, nk = 17, nSeq = nSeq[j], filename = fqFiles[j])
}
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
# Prepare result table
# + + + + + + + + + + + + + + + + + + + + + + + + + + + + #
nSim <- length(k) * length(nSeq)
m <- 1
res<-data.frame(id = 1:nSim, k = 0, nSeq = 0, runtime = 0)
for(i in 1:length(k))
{
for(j in 1:length(nSeq))
{
message("[simFastqqRunTimes] (", format(m, w = 3),"/", nSim,
") Fastqq run.")
fq <- fastqq(fqFiles[j], k = k[i])
res$k[m] <- k[i]
res$nSeq[m] <- nSeq[j]
res$runtime[m] <- collectDur(fq)
m <- m + 1
}
}
return(res)
}
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Encapsulates whole simulation studies
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
sim_fq<-function(nRep = 2, nContamVec = c(100,1000), grSize = 20, nSeq = 1e4,
k = 6, kIndex = 1365, pos = 20)
{
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Arguments
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
# nContam : Absolute number of contaminated sequences
# grSize : Nr of fastq files in control and contamination group
# nSeq : Nr of reads per fastq file
# k : k-mer length used in fastqq function
# kIndex : k-mer index of contaminating sequence default is "CCCCCC"
# pos : Contamination is inserted at position in read sequence,
# default is 20
if(length(kIndex) != length(pos))
stop("'kIndex' and 'pos' must have equal length.")
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Fixed internal values
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
# ksim : k-mer sized used for simulation
ksim <- 6
# nksim : number of k-mers per read (determines read length)
nksim <- 17
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Write simulated fastq files for control group
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
# ctrl_vec=leaf labels for control group
ctrl_vec <- 1:grSize
ctrl_files <- paste("sfq_i", ctrl_vec, "_ctrl.fq.gz", sep="")
for(i in ctrl_vec)
writeSimFastq(k = ksim, nk = nksim, nSeq = nSeq, filename = ctrl_files[i])
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Prepare values for contamination and result
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
# cont_vec=leaf labels for contamination group
# (must be nonoverlapping with ctrl_vec)
cont_vec <- ctrl_vec + 100
contamVecLen <- length(nContamVec)
nSim = nRep * contamVecLen
# Result data.frame
res <- data.frame(id = 1:nSim, nContam = 0, rep = 0, sum = NA)
m <- 1
for(i in 1:length(nContamVec))
{
nContam<-nContamVec[i]
for(j in 1:nRep)
{
message("[sim_fq] Simulation (",format(m, width = 3),"/", nSim, ").")
cont_files <- paste("sfq_i", cont_vec, "_cont.fq.gz", sep = "")
for(i in 1:length(cont_vec))
writeSimContFastq(k = ksim, nk = nksim, nSeq = nSeq, pos = pos,
nContam = nContam, kIndex = kIndex,
filename = cont_files[i])
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## read Fastqq
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
ctrl_fqq <- fastqq(ctrl_files, k = k)
probeLabel(ctrl_fqq) <- ctrl_vec
cont_fqq <- fastqq(cont_files, k = k)
probeLabel(cont_fqq) <- cont_vec
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## merge, dist, hclust
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
mrg <- mergeFastqq(ctrl_fqq, cont_fqq)
mtx <- cbDistMatrix(mrg)
hc <- hclust(as.dist(mtx))
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## Calculate results and write into res
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
res$nContam[m] <- nContam
res$rep[m] <- j
# Calculate how many labels of the contamination group
# are in the first half of leaf labels
res$sum[m] <- sum(hc$order[ctrl_vec] > grSize)
m <- m + 1
}
}
res$lat <- pmin(res$sum, grSize - res$sum) / grSize * 100
res$perc <- res$nContam / nSeq * 100
return(res)
}
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
## END OF FILE
## + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ##
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.