context("preprocessReads")
# prepare sample data
# Fragment design:
# Left Adapter ATACTG
# Insert TGTGACAGACCTCGGGGCCACATGCACTGACTCCTCAGCTGCCAGATGTGCAGTCCAAGCTGGGCCGAGGTC
# Right Adapter ATCTCGTATGCCGTCTTCTGCTTG
# ... create two temporary fasta files and return file names
faFiles <- tempfile(fileext = rep(".fa", 2), tmpdir = "extdata")
writeLines(c(">seq1", "CCTCAGCTGCCAGATGTGCAGTCCAAGCTGGGCCGA", # onlyInsert
">seq2", "AAAAANNAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", # low complexity, 2N
">seq3", "AAAATACTGTGTGACAGACCTCGGGGCCACATGCAC", # A.., LAdapter
">seq4", "ATACTGTGTGACAGACCTCGGGGCCACATGCACTGA", # LAdapter
">seq5", "TGTGACAGACCTCGGGGCCACATGCACTGACTCCTC", # Insert
">seq6", "ATACTGATCTCGTATGCCGTCTTCTGCTTGAAAAAA", # LAdapter, noInsert, RAdapter, A..
">seq7", "ATACTGATCTCGTATGCCGTCTTCTGCTTGAAAANN", # LAdapter, noInsert, RAdapter, A.., 2N
">seq8", "ATACTGGAGGTCATCTCGTATGCCGTCTTCTGCTTG", # LAdapter, shortInsert, RAdapter
">seq9", "TGACAGACCTCGGGGCCAC"), # shortLength
faFiles[1])
writeLines(c(">seq1", "AAAAAAAAAAAAAAAAAAA", # low complexity
">seq2", "TCAGCTGCCAGATGTGCAGTCCAAGCTGGGCCGAGG", # low complexity, shortLength
">seq3", "CTCCTCAGCTGCCAGATGTGCAGTCCAAGCTGGGCC", # onlyInsert
">seq4", "GAGGTCATCTCGTATGCCGTCTTCTGCTTGAAAAAA", # RAdapter
">seq5", "AGCTGCCAGATGTGCAGTCCAAGCTGGGCCGAGGTC", # Insert
">seq6", "ATACTGATCTCGTATGCCGTCTTCTGCTTGAAAAAA", # LAdapter, noInsert, RAdapter, A..
">seq7", "ATACTGATCTCGTATGCCGTCTTCTGCTTGAAAANN", # LAdapter, noInsert, RAdapter, A.., 2N
">seq8", "AAAAATACTGGAGGTCATCTCGTATGCCGTCTTCTG", # A.., LAdapter, shortInsert, RAdapter
">seq9", "CAGCTGCCAGATGTGCAGTCCAAGCTGGGCCGAGGT"),# Insert
faFiles[2])
# ... create two temporary fastq files and return file names
fqFiles <- tempfile(fileext = rep(".fq", 2), tmpdir = "extdata")
writeLines(c("@seq1", "CCTCAGCTGCCAGATGTGCAGTCCAAGCTGGGCCGA", "+", "5..49<494*<49493####################",
"@seq2", "AAAAANNAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", "+", "+4544944444444######################",
"@seq3", "AAAATACTGTGTGACAGACCTCGGGGCCACATGCAC", "+", "BCCBCCBBB60>CA;5;@BB@A6+;8@BC?0:B@/=",
"@seq4", "ATACTGTGTGACAGACCTCGGGGCCACATGCACTGA", "+", "BCBBB>ACBCCCBCC@BCC@*7@82=BBBB1>CABC",
"@seq5", "TGTGACAGACCTCGGGGCCACATGCACTGACTCCTC", "+", "BA?AB??60>A6?0BBBB95>057;@*.<(434;+4",
"@seq6", "ATACTGATCTCGTATGCCGTCTTCTGCTTGAAAAAA", "+", "+BCCCCBCCCBACB:?BBCCCCCCBCBC>;(>BBB@",
"@seq7", "ATACTGATCTCGTATGCCGTCTTCTGCTTGAAAANN", "+", "####################################",
"@seq8", "ATACTGGAGGTCATCTCGTATGCCGTCTTCTGCTTG", "+", "(33(;?B@AB43*,/;9(6</7>5;<##########",
"@seq9", "TGACAGACCTCGGGGCCAC", "+", "###################"),
fqFiles[1])
writeLines(c("@seq1", "AAAAAAAAAAAAAAAAAAA", "+", "3@?3/>9A8@A1-*/4@BB",
"@seq2", "TCAGCTGCCAGATGTGCAGTCCAAGCTGGGCCGAGG", "+", "4-093<##############################",
"@seq3", "CTCCTCAGCTGCCAGATGTGCAGTCCAAGCTGGGCC", "+", "CB?(8=(<A/<=-(07+7&883@#############",
"@seq4", "GAGGTCATCTCGTATGCCGTCTTCTGCTTGAAAAAA", "+", "5..49<494*<4949#####################",
"@seq5", "AGCTGCCAGATGTGCAGTCCAAGCTGGGCCGAGGTC", "+", "+45449444444########################",
"@seq6", "ATACTGATCTCGTATGCCGTCTTCTGCTTGAAAAAA", "+", "BCCBCCBBB60>CA;5;@BB@A6+;8@BC?0:B@A<",
"@seq7", "ATACTGATCTCGTATGCCGTCTTCTGCTTGAAAANN", "+", "BCBBB>ACBCCCBCC@BCC@*7@82=BBBB1>CABB",
"@seq8", "AAAAATACTGGAGGTCATCTCGTATGCCGTCTTCTG", "+", "BA?AB??60>A6?0BBBB95>057;@*.<(434;@B",
"@seq9", "CAGCTGCCAGATGTGCAGTCCAAGCTGGGCCGAGGT", "+", "+BCCCCBCCCBACB:?BBCCCCCCBCBC>;(>B@4;"),
fqFiles[2])
test_that("arguments in preprocessReads are digested correctly", {
tmptxt <- tempfile(fileext = ".txt", tmpdir = "extdata")
writeLines("AAA", tmptxt)
tmpfagz <- tempfile(fileext = ".fa.gz", tmpdir = "extdata")
Biostrings::writeXStringSet(DNAStringSet(x = "AAA"),
filepath = tmpfagz, compress = TRUE)
expect_error(preprocessReads(1L),
".filename. must be of type character")
expect_error(preprocessReads(c("a","b"), "c"),
".filename. and .outputFilename. must have equal length")
expect_error(preprocessReads("in1.fa", "out1.fa"),
"non-existing input file.s.: in1.fa")
expect_error(preprocessReads("a", 1L),
".outputFilename. must be of type character")
expect_error(preprocessReads(tmptxt, NULL),
"unsupported file format")
expect_error(preprocessReads(faFiles[1], "b.fastq"),
"format of .filename. and .outputFilename. must be identical")
expect_error(preprocessReads(tmpfagz, "b.fa.gz"),
"compressed .fasta. input is not yet supported")
expect_error(preprocessReads(faFiles[1], "out1.fa", faFiles[2], "out2.fa", Lpattern = "AAA"),
"Removing adapters from paired-end samples is not yet supported")
expect_error(preprocessReads(faFiles[1], "out1.fa", 1L, "out2.fa"),
".filenameMate. must be of type character")
expect_error(preprocessReads(faFiles[1], "out1.fa", faFiles[2], 1L),
".outputFilenameMate. must be of type character")
expect_error(preprocessReads(faFiles[1], "out1.fa", faFiles[2], c("o2a.fa","o2b.fa")),
".filenameMate. and .outputFilenameMate. must have equal length")
expect_error(preprocessReads(faFiles[1], "out1.fa", faFiles[2], "out2.fq"),
"format of .filenameMate. and .outputFilenameMate. must be identical")
expect_error(preprocessReads(faFiles[1], "out1.fa.gz", faFiles[2], "out2.fa.bz2"),
"compression format of .outputFilename. and .outputFilenameMate. must be identical")
})
test_that("preprocessReads correctly processes single-read files", {
# results for default parameters
outFilesFasta <- tempfile(fileext = rep(".fa", 2), tmpdir = "extdata")
resSoll <- cbind(c(9,0,0,0,0,0,9), c(9,0,0,0,0,0,9))
rownames(resSoll) <- c("totalSequences","matchTo5pAdapter","matchTo3pAdapter",
"tooShort","tooManyN","lowComplexity","totalPassed")
colnames(resSoll) <- basename(faFiles)
outFilesFastq <- tempfile(fileext = rep(".fq", 2), tmpdir = "extdata")
# default
res <- preprocessReads(faFiles, outFilesFasta, clObj = clObj)
expect_equal(resSoll, res)
# existing output file
expect_error(preprocessReads(faFiles, outFilesFasta))
unlink(outFilesFasta)
# complexity
res <- preprocessReads(fqFiles, outFilesFastq, complexity = 0.5)
expect_equal(c(1, 1), unname(res["lowComplexity",]))
unlink(outFilesFastq)
# minLength
res <- preprocessReads(faFiles, outFilesFasta, minLength = 30)
expect_equal(c(1, 1), unname(res["tooShort",]))
unlink(outFilesFasta)
# nBases
res <- preprocessReads(fqFiles, outFilesFastq, nBases = 1)
expect_equal(c(2, 1), unname(res["tooManyN",]))
unlink(outFilesFastq)
# truncateStartBases
res <- preprocessReads(faFiles, outFilesFasta, truncateStartBases = 2)
expect_equal(nchar(readLines(faFiles[1]))[seq(2, 18, by = 2)] - 2L,
nchar(readLines(outFilesFasta[1])[seq(2, 18, by = 2)]))
unlink(outFilesFasta)
# truncateEndBases
res <- preprocessReads(fqFiles, outFilesFastq, truncateEndBases = 3)
expect_equal(nchar(readLines(fqFiles[2]))[seq(2, 36, by = 4)] - 3L,
nchar(readLines(outFilesFastq[2])[seq(2, 36, by = 4)]))
unlink(outFilesFastq)
# Lpattern
res <- preprocessReads(faFiles, outFilesFasta, Lpattern = "ATACTG")
expect_equal(c(7, 4), unname(res["matchTo5pAdapter",]))
unlink(outFilesFasta)
# Rpattern
res <- preprocessReads(fqFiles, outFilesFastq, Rpattern = "ATCTCGTATGCCGTCTTCTGCTTG")
expect_equal(c(6, 5), unname(res["matchTo3pAdapter",]))
unlink(outFilesFastq)
})
test_that("preprocessReads correctly processes paired-end files", {
# results for default parameters
outFilesFasta <- tempfile(fileext = rep(".fa", 2), tmpdir = "extdata")
resSoll <- matrix(c(9,NA,NA,0,0,0,9), nrow = 7, ncol = 1,
dimnames = list(c("totalSequences","matchTo5pAdapter","matchTo3pAdapter",
"tooShort","tooManyN","lowComplexity","totalPassed"),
paste(basename(faFiles), collapse = ":")))
outFilesFastq <- tempfile(fileext = rep(".fq", 2), tmpdir = "extdata")
# default
res <- preprocessReads(faFiles[1], outFilesFasta[1], faFiles[2], outFilesFasta[2])
expect_equal(resSoll, res)
unlink(outFilesFasta)
# complexity
res <- preprocessReads(fqFiles[1], outFilesFastq[1], fqFiles[2], outFilesFastq[2],
complexity = 0.5)
expect_equal(2, res["lowComplexity",])
unlink(outFilesFastq)
# minLength
res <- preprocessReads(faFiles[1], outFilesFasta[1], faFiles[2], outFilesFasta[2],
minLength = 30)
expect_equal(2, res["tooShort",])
unlink(outFilesFasta)
# nBases
res <- preprocessReads(fqFiles[1], outFilesFastq[1], fqFiles[2], outFilesFastq[2],
nBases = 1)
expect_equal(2, res["tooManyN",])
unlink(outFilesFastq)
# truncateStartBases
res <- preprocessReads(faFiles[1], outFilesFasta[1], faFiles[2], outFilesFasta[2],
truncateStartBases = 2)
expect_equal(nchar(readLines(faFiles[1]))[seq(2, 18, by = 2)] - 2L,
nchar(readLines(outFilesFasta[1])[seq(2, 18, by = 2)]))
expect_equal(nchar(readLines(faFiles[2]))[seq(2, 18, by = 2)] - 2L,
nchar(readLines(outFilesFasta[2])[seq(2, 18, by = 2)]))
unlink(outFilesFasta)
# truncateEndBases and compressed output
res <- preprocessReads(fqFiles[1], outFilesFastq[1], fqFiles[2], outFilesFastq[2],
truncateEndBases = 3)
res <- preprocessReads(fqFiles[1], paste0(outFilesFastq[1], ".gz"),
fqFiles[2], paste0(outFilesFastq[2], ".gz"),
truncateEndBases = 3)
expect_equal(nchar(readLines(fqFiles[1]))[seq(2, 36, by = 4)] - 3L,
nchar(readLines(paste0(outFilesFastq[1],".gz"))[seq(2, 36, by = 4)]))
expect_equal(nchar(readLines(fqFiles[2]))[seq(2, 36, by = 4)] - 3L,
nchar(readLines(paste0(outFilesFastq[2],".gz"))[seq(2, 36, by = 4)]))
unlink(paste0(outFilesFastq,".gz"))
})
test_that("preprocessReads correctly (de-)compresses files", {
# results for default parameters
resSoll <- matrix(c(9,0,0,0,0,0,9), ncol = 1,
dimnames = list(c("totalSequences","matchTo5pAdapter","matchTo3pAdapter",
"tooShort","tooManyN","lowComplexity","totalPassed"),
basename(faFiles[1])))
outFile <- tempfile(fileext = ".fa.gz", tmpdir = "extdata")
res <- preprocessReads(faFiles[1], outFile)
expect_equal(resSoll, res)
unlink(outFile)
outFile <- tempfile(fileext = ".fa.bz2", tmpdir = "extdata")
res <- preprocessReads(faFiles[1], outFile)
expect_equal(resSoll, res)
unlink(outFile)
outFile <- tempfile(fileext = ".fa.xz", tmpdir = "extdata")
res <- preprocessReads(faFiles[1], outFile)
expect_equal(resSoll, res)
unlink(outFile)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.