targetReadSimFFPE <- function(referencePath, PhredScoreProfile,
targetRegions, outFile, coverage,
readLen=150, meanInsertLen=250, sdInsertLen=80,
enzymeCut=FALSE, padding=50, minGap=5,
chimericProp=0.1, sameChrProp=0.43,
windowLen=5000, adjChimProp=0.63,
sameStrandProp=0.65, meanLogSRCRLen=1.8,
sdLogSRCRLen=0.55, maxSRCRLen=32,
meanLogSRCRDist=4.7, sdLogSRCRDist=0.35,
distWinLen=5000, spikeWidth=1500,
betaShape1=0.5, betaShape2=0.5,
sameTarRegionProb=0, adjFactor = 1.3,
distFactor = 1.3, chimMutRate=0.003,
noiseRate=0.0015, highNoiseRate=0.08,
highNoiseProp=0.01, pairedEnd=TRUE,
prefix="SimFFPE", threads=1,
adjChimeric=TRUE, distChimeric=TRUE,
normalReads=TRUE, overWrite=FALSE) {
if(missing(referencePath)) stop("referencePath is required")
if(missing(PhredScoreProfile)) stop("PhredScoreProfile is required")
if(missing(targetRegions)) stop("targetRegions is required")
if(missing(outFile)) stop("outFile is required")
if(missing(coverage)) stop("coverage is required")
if(max(betaShape1, betaShape2) > 1) {
stop ("betaShape1 and betaShape2 should not be greater than 1")
}
if(min(betaShape1, betaShape2) <= 0) {
stop ("betaShape1 and betaShape2 should be greater than 0")
}
if (nrow(PhredScoreProfile) != readLen) {
stop("The number of rows in PhredScoreProfile should be the same as",
" the input readLen")
}
if (overWrite) {
overWriteFastq(outFile = outFile, pairedEnd = pairedEnd)
}
covFFPE <- coverage * chimericProp
covNorm <- coverage * (1 - chimericProp)
adjChimProp <- sameChrProp * adjChimProp
sameStrandProp <- sameStrandProp * 1.05
if(!pairedEnd) {
enzymeCut <- FALSE
}
reference <- readDNAStringSet(referencePath)
reference <- reference[width(reference) >= distWinLen]
names(reference) <- unlist(lapply(names(reference),
function(x) strsplit(x, " ")[[1]][1]))
if (is(targetRegions, "GenomicRanges")) {
targetRegions <- data.frame(seqnames(targetRegions),
start(targetRegions),
end(targetRegions))
}
colnames(targetRegions) <- c("chr", "start", "end")
targetRegions <- targetRegions[targetRegions$chr %in% names(reference), ]
targetRegions <- targetRegions[targetRegions$end >= targetRegions$start, ]
nReadsLocal <- 0
nReadsDistant <- 0
nReadsNorm <- 0
cl <- makeCluster(threads)
registerDoParallel(cl)
message("Using ", threads, " thread(s) for simulation.")
for (chr in unique(targetRegions$chr)) {
chrName <- chr
targetRegion <- targetRegions[targetRegions$chr == chrName, ]
if (nrow(targetRegion) > 0) {
fullSeq <- reference[[chr]]
message("Simulating FFPE reads on the chromosome ", chr, "...")
targetRegion$start <- unlist(lapply(targetRegion$start,
function(x)
max(0, x - padding)))
chrLen <- length(fullSeq)
targetRegion$end <- unlist(lapply(targetRegion$end,
function(x)
min(chrLen, x + padding)))
targetRegion <- targetRegion[order(targetRegion$start),]
if (nrow(targetRegion) > 1) {
starts <- targetRegion$start
starts <- starts[2:length(starts)]
ends <- targetRegion$end
ends <- ends[seq_len(length(ends) - 1)]
ifMerge <- (starts - ends) <= minGap
allStartPos <- c(targetRegion$start[1], starts[!ifMerge])
allEndPos <- c(ends[!ifMerge],
targetRegion$end[nrow(targetRegion)])
keep <- (allEndPos - allStartPos + 1) >= readLen
allStartPos <- allStartPos[keep]
allEndPos <- allEndPos[keep]
startPoses <- unlist(mapply(function(x, y) seq(from = x,
to = y,
by = windowLen),
allStartPos, allEndPos))
endPoses <- mapply(function(x, y) seq(from = x,
to = y,
by = windowLen) +
windowLen - 1,
allStartPos, allEndPos)
for (i in seq_along(endPoses)) {
endPoses[[i]][endPoses[[i]] > allEndPos[i]] <- allEndPos[i]
}
endPoses <- unlist(endPoses)
endPoses[endPoses > length(fullSeq)] <- length(fullSeq)
allStartPos <- unlist(allStartPos)
allEndPos <- unlist(allEndPos)
} else {
startPoses <- targetRegion$start
endPoses <- targetRegion$end
allStartPos <- targetRegion$start
allEndPos <- targetRegion$end
}
if (adjChimeric) {
if (pairedEnd) {
nSeeds <- round((endPoses - startPoses + 1) * adjFactor *
covFFPE * adjChimProp / readLen / 2)
} else {
nSeeds <- round((endPoses - startPoses + 1) * adjFactor *
covFFPE * adjChimProp / readLen)
}
matchPaddings <-
round((windowLen - (endPoses - startPoses + 1)) / 2)
matchPaddings[matchPaddings < 0] <- 0
targetStarts <- startPoses - matchPaddings
targetEnds <- endPoses + matchPaddings
remove <- targetStarts < 0 | targetEnds > length(fullSeq)
targetStarts[remove] <- startPoses[remove]
targetEnds[remove] <- endPoses[remove]
matchPaddings[remove] <- 0
startPos <- NULL
endPos <- NULL
targetStart <- NULL
targetEnd <- NULL
nSeed <- NULL
paddingLen <- NULL
simReads <- foreach(targetStart = targetStarts,
targetEnd = targetEnds,
nSeed = nSeeds,
paddingLen = matchPaddings,
.combine = "c",
.inorder = TRUE,
.verbose = FALSE,
.errorhandling = "remove",
.packages = c("Biostrings"),
.export = c(
"targetRegionalChimericReads",
"generateRegionalChimericSeqs",
"generateEnzymicCutSeq",
"addRandomMutation",
"generateReads",
"addNoise",
"rtruncnorm",
"round")
) %dopar% {
targetRegionalChimericReads(
targetSeq = fullSeq[targetStart:targetEnd],
targetPadding = paddingLen,
nSeed = nSeed,
meanLogSRCRLen = meanLogSRCRLen,
sdLogSRCRLen = sdLogSRCRLen,
maxSRCRLen = maxSRCRLen,
meanLogSRCRDist = meanLogSRCRDist,
sdLogSRCRDist = sdLogSRCRDist,
meanInsertLen = meanInsertLen,
sdInsertLen = sdInsertLen,
enzymeCut = enzymeCut,
readLen = readLen,
chimMutRate = chimMutRate,
noiseRate = noiseRate,
highNoiseRate = highNoiseRate,
highNoiseProp = highNoiseProp,
pairedEnd = pairedEnd,
sameStrandProp = sameStrandProp)
}
readsToFastq(simReads = simReads,
PhredScoreProfile = PhredScoreProfile,
prefix = prefix,
prefix2 = "adjChimeric",
chr = chr,
pairedEnd = pairedEnd,
outFile = outFile,
threads = threads)
message("Generated ", length(simReads),
" adjacent chimeric reads ",
"on chromosome ", chr, ".")
nReadsLocal <- nReadsLocal + length(simReads)
simReads <- NULL
}
if (distChimeric) {
allChr <- names(reference)
totalLen <- do.call("sum", lapply(reference, length))
chrProb <- unlist(lapply(reference, length)) / totalLen
tmpProb <- (sameChrProp - adjChimProp)/(1 - adjChimProp)
chrProb[chr] <- 0
chrProb <- chrProb / sum(chrProb) * (1 - tmpProb)
chrProb[chr] <- tmpProb
if (pairedEnd) {
allSeeds <- round((allEndPos - allStartPos + 1) *
covFFPE * distFactor /
readLen / 2)
} else {
allSeeds <- round((allEndPos - allStartPos + 1) *
covFFPE * distFactor / readLen)
}
if (length(allStartPos) > 1000) {
nTimes <- ceiling(length(allStartPos) / 1000 / threads)
} else {
nTimes <- 1
}
nReadsDistantChr <- 0
firstID <- 1
for (j in seq_len(nTimes)) {
startPos <- split(allStartPos,
ceiling(seq_along(allStartPos) /
1000 / threads))[[j]]
endPos <- split(allEndPos,
ceiling(seq_along(allEndPos) /
1000 / threads))[[j]]
nSeeds <- split(allSeeds,
ceiling(seq_along(allSeeds) /
1000 / threads))[[j]]
tmp <- ceiling(length(startPos) / threads)
startPos <- split(startPos,
ceiling(seq_along(startPos) / tmp))
endPos <- split(endPos,
ceiling(seq_along(endPos) / tmp))
nSeeds <- split(nSeeds,
ceiling(seq_along(nSeeds) / tmp))
simReads <-
foreach(i = seq_along(startPos),
.combine = "c",
.inorder = TRUE,
.verbose = FALSE,
.packages = "Biostrings",
.errorhandling = "remove",
.export = c("targetDistantChimericReads",
"generateTarDistChimericReads",
"generateDistantChimericSeqs",
"generateReads",
"addRandomMutation",
"addNoise",
"generateTargetSeqs",
"rbeta", "round",
"rtruncnorm")
) %dopar% {
targetDistantChimericReads(
fullSeq = fullSeq,
referencePath = referencePath,
startPoses = startPos[[i]],
endPoses = endPos[[i]],
distWinLen = distWinLen,
nSeeds = nSeeds[[i]],
meanLogSRCRLen = meanLogSRCRLen,
sdLogSRCRLen = sdLogSRCRLen,
maxSRCRLen = maxSRCRLen,
meanInsertLen = meanInsertLen,
sdInsertLen = sdInsertLen,
readLen = readLen,
allChr = allChr,
chrProb = chrProb,
chimMutRate = chimMutRate,
noiseRate = noiseRate,
highNoiseRate = highNoiseRate,
highNoiseProp = highNoiseProp,
pairedEnd = pairedEnd,
spikeWidth = spikeWidth,
betaShape1 = betaShape1,
betaShape2 = betaShape2,
sameTarRegionProb = sameTarRegionProb,
sameStrandProp = 0.5)
}
readsToFastq(simReads = simReads,
PhredScoreProfile = PhredScoreProfile,
prefix = prefix,
prefix2 = "distChimeric",
chr = chr,
firstID = firstID,
pairedEnd = pairedEnd,
outFile = outFile,
threads = threads)
if (pairedEnd) {
firstID = firstID + length(simReads) / 2
} else {
firstID = firstID + length(simReads)
}
nReadsDistantChr <- nReadsDistantChr + length(simReads)
simReads <- NULL
}
message("Generated ", nReadsDistantChr,
" distant chimeric reads ",
"on chromosome ", chr, ".")
nReadsDistant <- nReadsDistant + nReadsDistantChr
}
if (normalReads) {
if (pairedEnd) {
nSeeds <- round((allEndPos - allStartPos + 1) * covNorm /
readLen / 2)
} else {
nSeeds <- round((allEndPos - allStartPos + 1) * covNorm
/ readLen)
}
simReads <- foreach(start = allStartPos,
end = allEndPos,
nSeed = nSeeds,
.combine = "c",
.inorder = TRUE,
.verbose = FALSE,
.errorhandling = "remove",
.packages = c("Biostrings"),
.export = c("addNoise",
"rtruncnorm",
"generateNormalReads")
) %dopar% {
generateNormalReads(
fullSeq = fullSeq[start:end],
nSeq = nSeed,
meanInsertLen = meanInsertLen,
sdInsertLen = sdInsertLen,
readLen = readLen,
noiseRate = noiseRate,
highNoiseRate = highNoiseRate,
highNoiseProp = highNoiseProp,
pairedEnd = pairedEnd)
}
readsToFastq(simReads = simReads,
PhredScoreProfile = PhredScoreProfile,
prefix = prefix,
prefix2 = "Normal",
chr = chr,
pairedEnd = pairedEnd,
outFile = outFile,
threads = threads)
message("Generated ", length(simReads), " normal reads ",
"on chromosome ", chr, ".")
nReadsNorm <- nReadsNorm + length(simReads)
simReads <- NULL
}
}
}
stopCluster(cl)
message("In totoal ", nReadsLocal, " adjacent chimeric reads, ",
nReadsDistant, " distant chimeric reads, ",
nReadsNorm, " normal reads were generated.",
"\nAlltogether ", nReadsLocal + nReadsDistant + nReadsNorm,
" reads were generated. ", nReadsLocal + nReadsDistant,
" reads (", round((nReadsLocal + nReadsDistant) /
(nReadsLocal + nReadsDistant + nReadsNorm),
4) * 100,
"%, adjusted by chimericProp) are artifact chimeric reads.",
"\nOf all artifact chimeric reads, ", nReadsLocal, " reads (",
round(nReadsLocal / (nReadsLocal + nReadsDistant), 4) * 100,
"%, adjusted by sameChrProp * adjChimProp)",
" are adjacent chimeric reads.",
"\nSimulation done.")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.