## Generate test data for linkMultipleVariants
## Construct: C(6) - V(24) - C(10) - V(30) - C(10) - V(40) - C(6)
## Forward read length 90
## Reverse read length 70
nReads <- 100
status <- rep("OK", nReads)
## First constant region
c1 <- rep("ACGTGA", nReads)
set.seed(1)
i1 <- sample(seq_len(nReads), 4) ## reads with mutation
c1[i1] <- "ACTTGA"
status[i1] <- gsub("OK,", "", paste0(status[i1], ",C1mut"))
## Second constant region
c2 <- rep("TTGATCCCGG", nReads)
set.seed(2)
i2 <- sample(seq_len(nReads), 5) ## reads with mutation
c2[i2] <- "TTGATCGCGG"
status[i2] <- gsub("OK,", "", paste0(status[i2], ",C2mut"))
## Third constant region
c3 <- rep("TAAATACCGG", nReads)
set.seed(3)
i3 <- sample(seq_len(nReads), 5) ## reads with mutation
c3[i3] <- "TAGATACCGG"
status[i3] <- gsub("OK,", "", paste0(status[i3], ",C3mut"))
## Fourth constant region
c4 <- rep("ACCTGA", nReads)
set.seed(4)
i4 <- sample(seq_len(nReads), 4) ## reads with mutation
c4[i4] <- "ACCCGA"
status[i4] <- gsub("OK,", "", paste0(status[i4], ",C4mut"))
## First variable region (barcodes)
tmpstatus <- rep("", nReads)
set.seed(1)
v1true <- rep(c(paste(sample(c("A", "C", "G", "T"), 24, replace = TRUE), collapse = ""),
paste(sample(c("A", "C", "G", "T"), 24, replace = TRUE), collapse = ""),
paste(sample(c("A", "C", "G", "T"), 24, replace = TRUE), collapse = "")),
c(floor(nReads/3), floor(nReads/3), nReads - 2 * floor(nReads/3)))
v1 <- v1true
i1 <- sample(seq_len(nReads), 10) ## reads with mutation
for (i in i1) {
tmp <- strsplit(v1[i], "")[[1]]
i0 <- sample(seq_along(tmp), 1)
tmp[i0] <- ifelse(tmp[i0] == "A", "T", "A")
v1[i] <- paste(tmp, collapse = "")
tmpstatus[i] <- ",V1mut"
}
i2 <- sample(seq_len(nReads), 4) ## reads with N
for (i in i2) {
tmp <- strsplit(v1[i], "")[[1]]
tmp[sample(seq_along(tmp), 1)] <- "N"
v1[i] <- paste(tmp, collapse = "")
tmpstatus[i] <- paste0(tmpstatus[i], ",V1N")
}
ordr <- sample(seq_along(v1true), length(v1true))
v1true <- v1true[ordr]
v1 <- v1[ordr]
tmpstatus <- tmpstatus[ordr]
status <- gsub("OK,", "", paste0(status, tmpstatus))
## Second variable region (multiple WT sequences)
tmpstatus <- rep("", nReads)
set.seed(2)
V2v1 <- paste(sample(c("A", "C", "G", "T"), 30, replace = TRUE), collapse = "")
V2v2 <- paste(sample(c("A", "C", "G", "T"), 30, replace = TRUE), collapse = "")
V2v3 <- paste(sample(c("A", "C", "G", "T"), 30, replace = TRUE), collapse = "")
v2true <- rep(c(V2v1, V2v2, V2v3),
c(floor(nReads/3), floor(nReads/3), nReads - 2 * floor(nReads/3)))
n2true <- rep(c("V2v1", "V2v2", "V2v3"),
c(floor(nReads/3), floor(nReads/3), nReads - 2 * floor(nReads/3)))
v2 <- v2true
i1 <- sample(seq_len(nReads), 20) ## reads with mutation
for (i in i1) {
tmp <- strsplit(v2[i], "")[[1]]
i0 <- sample(seq_along(tmp), 1)
repl <- ifelse(tmp[i0] == "A", "T", "A")
tmp[i0] <- repl
v2[i] <- paste(tmp, collapse = "")
tmpstatus[i] <- paste0(",", n2true[i], ".", i0, ".", repl)
}
i3 <- sample(seq_len(nReads), 3) ## reads with deletion
## don't modify the reads here, since that won't generate an invalid overlap
## (the modification will be in both the fwd and rev reads). Just record
## which reads to modify
for (i in i3) {
tmpstatus[i] <- paste0(tmpstatus[i], ",V2del")
}
ordr <- sample(seq_along(v2true), length(v2true))
v2true <- v2true[ordr]
n2true <- n2true[ordr]
v2 <- v2[ordr]
tmpstatus <- tmpstatus[ordr]
status <- gsub("OK,", "", paste0(status, tmpstatus))
## Third variable region (one WT sequence)
tmpstatus <- rep("", nReads)
set.seed(3)
V3 <- paste(sample(c("A", "C", "G", "T"), 40, replace = TRUE), collapse = "")
v3true <- rep(V3, nReads)
v3 <- v3true
i1 <- sample(seq_len(nReads), 30) ## reads with mutation
for (i in i1) {
tmp <- strsplit(v3[i], "")[[1]]
i0 <- sample(seq_along(tmp), 1)
repl <- ifelse(tmp[i0] == "A", "T", "A")
tmp[i0] <- repl
v3[i] <- paste(tmp, collapse = "")
tmpstatus[i] <- paste0(",V3.", i0, ".", repl)
}
ordr <- sample(seq_along(v3true), length(v3true))
v3true <- v3true[ordr]
v3 <- v3[ordr]
tmpstatus <- tmpstatus[ordr]
status <- gsub("OK,", "", paste0(status, tmpstatus))
## Final reads
reads <- paste0(c1, v1, c2, v2, c3, v3, c4)
qualities <- rep(paste(rep("I", 126), collapse = ""), nReads)
readsR1 <- substr(reads, start = 1, stop = 90)
readsR2 <- substr(reads, start = 57, stop = 126)
qualitiesR1 <- substr(qualities, start = 1, stop = 90)
qualitiesR2 <- substr(qualities, start = 57, stop = 126)
## Add the deletion to the forward reads
idx <- grep("V2del", status)
for (i in idx) {
tmp <- strsplit(readsR1[i], "")[[1]]
tmp[41] <- ""
tmp <- c(tmp, substr(reads[i], start = 91, stop = 91))
tmp <- paste(tmp, collapse = "")
readsR1[i] <- tmp
}
fastqR1 <- Biostrings::QualityScaledDNAStringSet(
x = Biostrings::DNAStringSet(
x = structure(readsR1, names = paste0("R", seq_len(nReads)))
),
quality = Biostrings::PhredQuality(
Biostrings::BStringSet(
x = structure(qualitiesR1, names = paste0("R", seq_len(nReads)))
)
)
)
fastqR2 <- Biostrings::QualityScaledDNAStringSet(
x = Biostrings::reverseComplement(
Biostrings::DNAStringSet(
x = structure(readsR2, names = paste0("R", seq_len(nReads)))
)
),
quality = Biostrings::PhredQuality(
Biostrings::BStringSet(
x = structure(qualitiesR2, names = paste0("R", seq_len(nReads)))
)
)
)
Biostrings::writeQualityScaledXStringSet(fastqR1, filepath = "multipleVariableRegions_R1.fastq.gz", compress = TRUE)
Biostrings::writeQualityScaledXStringSet(fastqR2, filepath = "multipleVariableRegions_R2.fastq.gz", compress = TRUE)
## True sequences
barcode <- v1true
V2var <- vapply(strsplit(status, ","), function(x) {
i <- grep("V2.*\\.", x)
if (length(i) > 0) {
x[i]
} else {
NA_character_
}}, NA_character_)
V2var[is.na(V2var)] <- paste0(n2true[is.na(V2var)], ".0.WT")
V3var <- vapply(strsplit(status, ","), function(x) {
i <- grep("V3.*\\.", x)
if (length(i) > 0) {
x[i]
} else {
NA_character_
}}, NA_character_)
V3var[is.na(V3var)] <- "V3.0.WT"
WTV2 <- c(V2v1 = V2v1, V2v2 = V2v2, V2v3 = V2v3)
WTV3 <- c(V3 = V3)
constFwd <- "ACGTGATTGATCCCGGTAAATACCGG"
constRev <- "TAAATACCGGACCTGA"
truth <- data.frame(read = paste0("R", seq_len(nReads)),
trueBarcode = barcode,
trueV2 = V2var,
trueV3 = V3var,
obsBarcode = v1,
status = status)
saveRDS(list(truth = truth, WTV2 = WTV2, WTV3 = WTV3,
constFwd = constFwd, constRev = constRev),
"multipleVariableRegions_truth.rds")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.