readSimFFPE <- function(sourceSeq, referencePath, PhredScoreProfile, outFile,
coverage, readLen=150, meanInsertLen=250,
sdInsertLen=80, enzymeCut=FALSE, chimericProp=0.1,
sameChrProp=0.43, windowLen=5000, adjChimProp=0.63,
sameStrandProp=0.65, meanLogSRCRLen=1.8,
sdLogSRCRLen=0.55, maxSRCRLen=32,
meanLogSRCRDist=4.7, sdLogSRCRDist=0.35,
distWinLen=5000, spikeWidth = 1500,
betaShape1=0.5, betaShape2=0.5, sameTarRegionProb=0,
adjFactor=1.65, distFactor=1.65,
chimMutRate=0.003, noiseRate=0.0015,
highNoiseRate=0.08, highNoiseProp=0.01,
pairedEnd=TRUE, prefix="SimFFPE", threads=1,
adjChimeric=TRUE, distChimeric=TRUE,
normalReads=TRUE, overWrite=FALSE) {
if(missing(sourceSeq)) stop("sourceSeq is required")
if(missing(referencePath)) stop("referencePath is required")
if(missing(PhredScoreProfile)) stop("PhredScoreProfile is required")
if(missing(outFile)) stop("outFile is required")
if(missing(coverage)) stop("coverage is required")
if(max(betaShape1, betaShape2) > 1) {
stop ("betaShape1 and betaShape2 should not be greater than 1")
}
if(min(betaShape1, betaShape2) <= 0) {
stop ("betaShape1 and betaShape2 should be greater than 0")
}
if (nrow(PhredScoreProfile) != readLen) {
stop("The number of rows in PhredScoreProfile should be the same as",
" the input readLen")
}
if (overWrite) {
overWriteFastq(outFile = outFile, pairedEnd = pairedEnd)
}
covFFPE <- coverage * chimericProp
covNorm <- coverage * (1 - chimericProp)
adjChimProp <- sameChrProp * adjChimProp
sameStrandProp <- sameStrandProp * 1.05
reference <- readDNAStringSet(referencePath)
reference <- reference[width(reference) >= distWinLen]
names(reference) <- unlist(lapply(names(reference),
function(x) strsplit(x, " ")[[1]][1]))
names(sourceSeq) <- unlist(lapply(names(sourceSeq),
function(x) strsplit(x, " ")[[1]][1]))
nReadsLocal <- 0
nReadsDistant <- 0
nReadsNorm <- 0
cl <- makeCluster(threads)
registerDoParallel(cl)
message("Using ", threads, " thread(s) for simulation.")
for (i in seq_along(sourceSeq)) {
chr <- names(sourceSeq)[i]
message("Simulating FFPE reads on the chromosome ", chr, "...")
fullSeq <- sourceSeq[[i]]
if (adjChimeric) {
startPoses <- seq(1, length(fullSeq), by=windowLen)
endPoses <- startPoses + windowLen - 1
endPoses[length(endPoses)] <- length(fullSeq)
if(pairedEnd) {
nSeedsAll <- round(covFFPE * adjChimProp * adjFactor *
length(fullSeq) / readLen / 2)
} else {
enzymeCut <- FALSE
nSeedsAll <- round(covFFPE * adjChimProp * adjFactor *
length(fullSeq) / readLen)
}
nSeedsRegional <- floor(nSeedsAll / length(startPoses))
nSeeds <- rep(nSeedsRegional, length(startPoses))
restSeeds <- nSeedsAll - sum(nSeeds)
nSeeds[length(startPoses)] <-
round(nSeedsRegional / windowLen *
(length(fullSeq) - startPoses[length(startPoses)]+1))
tmp <- sample(seq_along(startPoses), restSeeds)
nSeeds[tmp] <- nSeeds[tmp] + 1
startPos <- NULL
endPos <- NULL
nSeed <- NULL
simReads <-
foreach(startPos = startPoses,
endPos = endPoses,
nSeed = nSeeds,
.combine = "c",
.inorder = TRUE,
.verbose = FALSE,
.errorhandling = "remove",
.packages = c("Biostrings"),
.export = c("generateRegionalChimericReads",
"generateRegionalChimericSeqs",
"generateEnzymicCutSeq",
"addRandomMutation",
"addNoise",
"generateReads",
"rtruncnorm")
) %dopar% {
generateRegionalChimericReads(
seq = fullSeq[startPos:endPos],
nSeed = nSeed,
meanLogSRCRLen = meanLogSRCRLen,
sdLogSRCRLen = sdLogSRCRLen,
maxSRCRLen=maxSRCRLen,
meanLogSRCRDist = meanLogSRCRDist,
sdLogSRCRDist = sdLogSRCRDist,
meanInsertLen = meanInsertLen,
sdInsertLen = sdInsertLen,
enzymeCut = enzymeCut,
readLen = readLen,
chimMutRate = chimMutRate,
noiseRate = noiseRate,
highNoiseRate = highNoiseRate,
highNoiseProp = highNoiseProp,
pairedEnd = pairedEnd,
sameStrandProp = sameStrandProp)
}
readsToFastq(simReads = simReads,
PhredScoreProfile = PhredScoreProfile,
prefix = prefix,
prefix2 = "adjChimeric",
chr = chr,
pairedEnd = pairedEnd,
outFile = outFile,
threads = threads)
message("Generated ", length(simReads), " adjacent chimeric reads ",
"on chromosome ", chr, ".")
nReadsLocal <- nReadsLocal + length(simReads)
simReads <- NULL
}
if (distChimeric) {
allChr <- names(reference)
totalLen <- do.call("sum", lapply(reference, length))
chrProb <- unlist(lapply(reference, length)) / totalLen
tmpProb <- (sameChrProp - adjChimProp) / (1 - adjChimProp)
chrProb[chr] <- 0
chrProb <- chrProb / sum(chrProb) * (1 - tmpProb)
chrProb[chr] <- tmpProb
if (pairedEnd) {
nSeedDistant <- round(covFFPE * (1 - adjChimProp) * distFactor *
length(fullSeq) / readLen / 2)
} else {
nSeedDistant <- round(covFFPE * (1 - adjChimProp) * distFactor *
length(fullSeq) / readLen)
}
if (nSeedDistant > 1e+4) {
distWindow <- round(length(fullSeq) / (nSeedDistant / 1e+4))
nTimes <- ceiling(length(fullSeq) / distWindow / threads)
nSeedDistant <-
round(nSeedDistant * distWindow / length(fullSeq))
} else {
distWindow <- length(fullSeq)
nTimes <- 1
}
allStartPos <- seq(1, length(fullSeq), by = distWindow)
nReadsDistantChr <- 0
firstID <- 1
for (j in seq_len(nTimes)) {
tmp <- min(j * threads, length(allStartPos))
startPos <- allStartPos[((j - 1) * threads + 1):tmp]
simReads <-
foreach(startPos = startPos,
.combine = "c",
.inorder = TRUE,
.verbose = FALSE,
.packages = "Biostrings",
.errorhandling = "remove",
.export = c("generateDistantChimericReads",
"generateDistantChimericSeqs",
"generateReads",
"addRandomMutation",
"addNoise",
"generateTargetSeqs",
"rbeta", "round",
"rtruncnorm")
) %dopar% {
generateDistantChimericReads(
fullSeq = fullSeq,
referencePath = referencePath,
startPos = startPos,
windowLen = distWindow,
distWinLen = distWinLen,
nSeed = nSeedDistant,
meanLogSRCRLen = meanLogSRCRLen,
sdLogSRCRLen = sdLogSRCRLen,
maxSRCRLen = maxSRCRLen,
meanInsertLen = meanInsertLen,
sdInsertLen = sdInsertLen,
readLen = readLen,
allChr = allChr,
chrProb = chrProb,
chimMutRate = chimMutRate,
noiseRate = noiseRate,
highNoiseRate = highNoiseRate,
highNoiseProp = highNoiseProp,
pairedEnd = pairedEnd,
spikeWidth = spikeWidth,
betaShape1 = betaShape1,
betaShape2 = betaShape2,
sameTarRegionProb = sameTarRegionProb,
sameStrandProp= 0.5)
}
readsToFastq(simReads = simReads,
PhredScoreProfile = PhredScoreProfile,
prefix = prefix,
prefix2 = "distChimeric",
chr = chr,
firstID = firstID,
pairedEnd = pairedEnd,
outFile = outFile,
threads = threads)
if (pairedEnd) {
firstID = firstID + length(simReads) / 2
} else {
firstID = firstID + length(simReads)
}
nReadsDistantChr <- nReadsDistantChr + length(simReads)
simReads <- NULL
}
message("Generated ", nReadsDistantChr,
" distant chimeric reads ",
"on chromosome ", chr, ".")
nReadsDistant <- nReadsDistant + nReadsDistantChr
}
if (normalReads) {
if (pairedEnd) {
nSeqNorm <- round(covNorm * length(fullSeq) / readLen / 2)
} else {
nSeqNorm <- round(covNorm * length(fullSeq) / readLen)
}
if (nSeqNorm > 5e+6) {
nSeqPerTime <- c(rep(5e+6, floor(nSeqNorm / 5e+6)),
nSeqNorm %% 5e+6)
nTimes <- ceiling(nSeqNorm / 5e+6)
} else {
nSeqPerTime <- nSeqNorm
nTimes <- 1
}
nReadsNormChr <- 0
firstID <- 1
for (j in seq_len(nTimes)) {
nSeqPerCluster <- round(nSeqPerTime[j] / threads)
simReads <- foreach(i = seq_len(threads),
.combine = "c",
.inorder = TRUE,
.verbose = FALSE,
#.errorhandling = "remove",
.packages = c("Biostrings"),
.export = c("addNoise",
"rtruncnorm",
"generateNormalReads")
) %dopar% {
generateNormalReads(
fullSeq = fullSeq,
nSeq = nSeqPerCluster,
meanInsertLen = meanInsertLen,
sdInsertLen = sdInsertLen,
readLen = readLen,
noiseRate = noiseRate,
highNoiseRate = highNoiseRate,
highNoiseProp = highNoiseProp,
pairedEnd = pairedEnd)
}
readsToFastq(simReads = simReads,
PhredScoreProfile = PhredScoreProfile,
prefix = prefix,
prefix2 = "Normal",
chr = chr,
firstID = firstID,
pairedEnd = pairedEnd,
outFile = outFile,
threads = threads)
if (pairedEnd) {
firstID = firstID + length(simReads) / 2
} else {
firstID = firstID + length(simReads)
}
nReadsNormChr <- nReadsNormChr + length(simReads)
simReads <- NULL
}
message("Generated ", nReadsNormChr, " normal reads ",
"on chromosome ", chr, ".")
nReadsNorm <- nReadsNorm + nReadsNormChr
}
}
stopCluster(cl)
message("In totoal ", nReadsLocal, " adjacent chimeric reads, ",
nReadsDistant, " distant chimeric reads, ",
nReadsNorm, " normal reads were generated.",
"\nAlltogether ", nReadsLocal + nReadsDistant + nReadsNorm,
" reads were generated. ", nReadsLocal + nReadsDistant,
" reads (", round((nReadsLocal + nReadsDistant) /
(nReadsLocal + nReadsDistant + nReadsNorm),
4) * 100,
"%, adjusted by chimericProp) are artifact chimeric reads.",
"\nOf all artifact chimeric reads, ", nReadsLocal, " reads (",
round(nReadsLocal / (nReadsLocal + nReadsDistant), 4) * 100,
"%, adjusted by sameChrProp * adjChimProp)",
" are adjacent chimeric reads.",
"\nSimulation done.")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.