SimFFPE: NGS Read Simulator for FFPE Tissue

Documented in targetReadSimFFPE

targetReadSimFFPE <- function(referencePath, PhredScoreProfile,
                              targetRegions, outFile, coverage,
                              readLen=150, meanInsertLen=250, sdInsertLen=80,
                              enzymeCut=FALSE, padding=50, minGap=5, 
                              chimericProp=0.1, sameChrProp=0.43, 
                              windowLen=5000, adjChimProp=0.63, 
                              sameStrandProp=0.65, meanLogSRCRLen=1.8, 
                              sdLogSRCRLen=0.55, maxSRCRLen=32,
                              meanLogSRCRDist=4.7, sdLogSRCRDist=0.35, 
                              distWinLen=5000, spikeWidth=1500, 
                              betaShape1=0.5, betaShape2=0.5, 
                              sameTarRegionProb=0, adjFactor = 1.3, 
                              distFactor = 1.3, chimMutRate=0.003, 
                              noiseRate=0.0015, highNoiseRate=0.08, 
                              highNoiseProp=0.01, pairedEnd=TRUE, 
                              prefix="SimFFPE", threads=1,
                              adjChimeric=TRUE, distChimeric=TRUE,
                              normalReads=TRUE, overWrite=FALSE) {
    
    if(missing(referencePath)) stop("referencePath is required")
    if(missing(PhredScoreProfile)) stop("PhredScoreProfile is required")
    if(missing(targetRegions)) stop("targetRegions is required")
    if(missing(outFile)) stop("outFile is required")
    if(missing(coverage)) stop("coverage is required")
    
    if(max(betaShape1, betaShape2) > 1) {
        stop ("betaShape1 and betaShape2 should not be greater than 1")
    }
    if(min(betaShape1, betaShape2) <= 0) {
        stop ("betaShape1 and betaShape2 should be greater than 0")
    }
    
    if (nrow(PhredScoreProfile) != readLen) {
        stop("The number of rows in PhredScoreProfile should be the same as", 
             " the input readLen")
    }
    
    if (overWrite) {
        overWriteFastq(outFile = outFile, pairedEnd = pairedEnd)
    }
    
    covFFPE <- coverage * chimericProp
    covNorm <- coverage * (1 - chimericProp)
    adjChimProp <- sameChrProp * adjChimProp
    sameStrandProp <- sameStrandProp * 1.05
    
    if(!pairedEnd) {
        enzymeCut <- FALSE
    }
    
    reference <- readDNAStringSet(referencePath)
    reference <- reference[width(reference) >= distWinLen]
    names(reference) <- unlist(lapply(names(reference),
                                      function(x) strsplit(x, " ")[[1]][1]))
    
    if (is(targetRegions, "GenomicRanges")) {
        targetRegions <- data.frame(seqnames(targetRegions), 
                                    start(targetRegions), 
                                    end(targetRegions))
    }
    colnames(targetRegions) <- c("chr", "start", "end")
    targetRegions <- targetRegions[targetRegions$chr %in% names(reference), ]
    targetRegions <- targetRegions[targetRegions$end >= targetRegions$start, ]
    
    
    nReadsLocal <- 0
    nReadsDistant <- 0
    nReadsNorm <- 0
    
    cl <- makeCluster(threads)
    registerDoParallel(cl)
    message("Using ", threads, " thread(s) for simulation.")
    for (chr in unique(targetRegions$chr)) {
        chrName <- chr
        targetRegion <- targetRegions[targetRegions$chr == chrName, ]
        
        if (nrow(targetRegion) > 0) {
            fullSeq <- reference[[chr]]
            message("Simulating FFPE reads on the chromosome ", chr, "...")
            
            targetRegion$start <- unlist(lapply(targetRegion$start,
                                                function(x)
                                                    max(0, x - padding)))
            chrLen <- length(fullSeq)
            targetRegion$end <- unlist(lapply(targetRegion$end,
                                              function(x)
                                                  min(chrLen, x + padding)))
            
            targetRegion <- targetRegion[order(targetRegion$start),]
            
            if (nrow(targetRegion) > 1) {
                starts <- targetRegion$start
                starts <- starts[2:length(starts)]
                ends <- targetRegion$end
                ends <- ends[seq_len(length(ends) - 1)]
                ifMerge <- (starts - ends) <= minGap
                allStartPos <- c(targetRegion$start[1], starts[!ifMerge])
                allEndPos <- c(ends[!ifMerge],
                               targetRegion$end[nrow(targetRegion)])
                keep <- (allEndPos - allStartPos + 1) >= readLen
                allStartPos <- allStartPos[keep]
                allEndPos <- allEndPos[keep]
                startPoses <- unlist(mapply(function(x, y) seq(from = x,
                                                               to = y,
                                                               by = windowLen),
                                            allStartPos, allEndPos))
                endPoses <- mapply(function(x, y) seq(from = x,
                                                      to = y,
                                                      by = windowLen) +
                                       windowLen - 1,
                                   allStartPos, allEndPos)
                for (i in seq_along(endPoses)) {
                    endPoses[[i]][endPoses[[i]] > allEndPos[i]] <- allEndPos[i]
                }
                endPoses <- unlist(endPoses)
                endPoses[endPoses > length(fullSeq)] <- length(fullSeq)
                allStartPos <- unlist(allStartPos)
                allEndPos <- unlist(allEndPos)
            } else {
                startPoses <- targetRegion$start
                endPoses <- targetRegion$end
                allStartPos <- targetRegion$start
                allEndPos <- targetRegion$end
            }
            
            if (adjChimeric) {
                
                if (pairedEnd) {
                    nSeeds <- round((endPoses - startPoses + 1)  * adjFactor * 
                                        covFFPE * adjChimProp / readLen / 2)
                } else {
                    nSeeds <- round((endPoses - startPoses + 1) * adjFactor * 
                                        covFFPE * adjChimProp / readLen)
                }
                matchPaddings <- 
                    round((windowLen - (endPoses - startPoses + 1)) / 2)
                matchPaddings[matchPaddings < 0] <- 0
                targetStarts <- startPoses - matchPaddings
                targetEnds <- endPoses + matchPaddings
                remove <- targetStarts < 0 | targetEnds > length(fullSeq)
                targetStarts[remove] <- startPoses[remove]
                targetEnds[remove] <- endPoses[remove]
                matchPaddings[remove] <- 0
                startPos <- NULL
                endPos <- NULL
                targetStart <- NULL
                targetEnd <- NULL
                nSeed <- NULL
                paddingLen <- NULL
                simReads <- foreach(targetStart = targetStarts,
                                    targetEnd = targetEnds,
                                    nSeed = nSeeds,
                                    paddingLen = matchPaddings,
                                    .combine = "c",
                                    .inorder = TRUE,
                                    .verbose = FALSE,
                                    .errorhandling    = "remove",
                                    .packages = c("Biostrings"),
                                    .export = c(
                                        "targetRegionalChimericReads",
                                        "generateRegionalChimericSeqs",
                                        "generateEnzymicCutSeq",
                                        "addRandomMutation",
                                        "generateReads",
                                        "addNoise",
                                        "rtruncnorm",
                                        "round")
                ) %dopar% {
                    targetRegionalChimericReads(
                        targetSeq = fullSeq[targetStart:targetEnd],
                        targetPadding = paddingLen,
                        nSeed = nSeed,
                        meanLogSRCRLen = meanLogSRCRLen,
                        sdLogSRCRLen = sdLogSRCRLen,
                        maxSRCRLen = maxSRCRLen,
                        meanLogSRCRDist = meanLogSRCRDist,
                        sdLogSRCRDist = sdLogSRCRDist,
                        meanInsertLen = meanInsertLen,
                        sdInsertLen = sdInsertLen,
                        enzymeCut = enzymeCut,
                        readLen = readLen,
                        chimMutRate = chimMutRate,
                        noiseRate = noiseRate,
                        highNoiseRate = highNoiseRate,
                        highNoiseProp = highNoiseProp,
                        pairedEnd = pairedEnd,
                        sameStrandProp = sameStrandProp)
                }
                readsToFastq(simReads = simReads,
                             PhredScoreProfile = PhredScoreProfile,
                             prefix = prefix,
                             prefix2 = "adjChimeric",
                             chr = chr,
                             pairedEnd = pairedEnd,
                             outFile = outFile,
                             threads = threads)
                message("Generated ", length(simReads), 
                        " adjacent chimeric reads ",
                        "on chromosome ", chr, ".")
                nReadsLocal <- nReadsLocal + length(simReads)
                simReads <- NULL
            }
            
            if (distChimeric) {
                
                allChr <- names(reference)
                totalLen <- do.call("sum", lapply(reference, length))
                chrProb <- unlist(lapply(reference, length)) / totalLen
                tmpProb <- (sameChrProp - adjChimProp)/(1 - adjChimProp)
                chrProb[chr] <- 0
                chrProb <- chrProb / sum(chrProb) * (1 - tmpProb)
                chrProb[chr] <- tmpProb
                
                if (pairedEnd) {
                    allSeeds <- round((allEndPos - allStartPos + 1) * 
                                          covFFPE * distFactor / 
                                          readLen / 2)
                } else {
                    allSeeds <- round((allEndPos - allStartPos + 1) *
                                          covFFPE * distFactor / readLen)
                }
                
                if (length(allStartPos) > 1000) {
                    nTimes <- ceiling(length(allStartPos) / 1000 / threads)
                } else {
                    nTimes <- 1
                }
                
                nReadsDistantChr <- 0
                firstID <- 1
                
                for (j in seq_len(nTimes)) {
                    startPos <- split(allStartPos, 
                                      ceiling(seq_along(allStartPos) / 
                                                  1000 / threads))[[j]]
                    endPos <- split(allEndPos, 
                                    ceiling(seq_along(allEndPos) / 
                                                1000 / threads))[[j]]
                    
                    nSeeds <- split(allSeeds, 
                                    ceiling(seq_along(allSeeds) / 
                                                1000 / threads))[[j]]
                    tmp <- ceiling(length(startPos) / threads)
                    startPos <- split(startPos, 
                                      ceiling(seq_along(startPos) / tmp))
                    endPos <- split(endPos, 
                                    ceiling(seq_along(endPos) / tmp))
                    nSeeds <- split(nSeeds, 
                                    ceiling(seq_along(nSeeds) / tmp))
                    
                    simReads <- 
                        foreach(i = seq_along(startPos),
                                .combine = "c",
                                .inorder = TRUE,
                                .verbose = FALSE,
                                .packages = "Biostrings",
                                .errorhandling = "remove",
                                .export = c("targetDistantChimericReads",
                                            "generateTarDistChimericReads",
                                            "generateDistantChimericSeqs",
                                            "generateReads",
                                            "addRandomMutation",
                                            "addNoise",
                                            "generateTargetSeqs",
                                            "rbeta", "round",
                                            "rtruncnorm")
                        ) %dopar% {
                            targetDistantChimericReads(
                                fullSeq = fullSeq,
                                referencePath = referencePath,
                                startPoses = startPos[[i]],
                                endPoses = endPos[[i]],
                                distWinLen = distWinLen,
                                nSeeds = nSeeds[[i]],
                                meanLogSRCRLen = meanLogSRCRLen,
                                sdLogSRCRLen = sdLogSRCRLen,
                                maxSRCRLen = maxSRCRLen,
                                meanInsertLen = meanInsertLen,
                                sdInsertLen = sdInsertLen,
                                readLen = readLen,
                                allChr = allChr,
                                chrProb = chrProb,
                                chimMutRate = chimMutRate,
                                noiseRate = noiseRate,
                                highNoiseRate = highNoiseRate,
                                highNoiseProp = highNoiseProp,
                                pairedEnd = pairedEnd,
                                spikeWidth = spikeWidth,
                                betaShape1 = betaShape1,
                                betaShape2 = betaShape2,
                                sameTarRegionProb = sameTarRegionProb,
                                sameStrandProp = 0.5)
                        }
                    readsToFastq(simReads = simReads,
                                 PhredScoreProfile = PhredScoreProfile,
                                 prefix = prefix,
                                 prefix2 = "distChimeric",
                                 chr = chr,
                                 firstID = firstID,
                                 pairedEnd = pairedEnd,
                                 outFile = outFile,
                                 threads = threads)
                    if (pairedEnd) {
                        firstID = firstID + length(simReads) / 2
                    } else {
                        firstID = firstID + length(simReads)
                    }
                    nReadsDistantChr <- nReadsDistantChr + length(simReads)
                    simReads <- NULL
                }
                message("Generated ", nReadsDistantChr,
                        " distant chimeric reads ",
                        "on chromosome ", chr, ".")
                nReadsDistant <- nReadsDistant + nReadsDistantChr
            }
            
            if (normalReads) {
                if (pairedEnd) {
                    nSeeds <- round((allEndPos - allStartPos + 1) * covNorm /
                                        readLen / 2)
                } else {
                    nSeeds <- round((allEndPos - allStartPos + 1) * covNorm
                                    / readLen)
                }
                simReads <- foreach(start = allStartPos,
                                    end = allEndPos,
                                    nSeed = nSeeds,
                                    .combine = "c",
                                    .inorder = TRUE,
                                    .verbose = FALSE,
                                    .errorhandling  = "remove",
                                    .packages = c("Biostrings"),
                                    .export = c("addNoise",
                                                "rtruncnorm",
                                                "generateNormalReads")
                ) %dopar% {
                    generateNormalReads(
                        fullSeq = fullSeq[start:end],
                        nSeq = nSeed,
                        meanInsertLen = meanInsertLen,
                        sdInsertLen = sdInsertLen,
                        readLen = readLen,
                        noiseRate = noiseRate,
                        highNoiseRate = highNoiseRate,
                        highNoiseProp = highNoiseProp,
                        pairedEnd = pairedEnd)
                }
                
                readsToFastq(simReads = simReads,
                             PhredScoreProfile = PhredScoreProfile,
                             prefix = prefix,
                             prefix2 = "Normal",
                             chr = chr,
                             pairedEnd = pairedEnd,
                             outFile = outFile,
                             threads = threads)
                
                message("Generated ", length(simReads), " normal reads ",
                        "on chromosome ", chr, ".")
                nReadsNorm <- nReadsNorm + length(simReads)
                simReads <- NULL
            }
        }
    }
    stopCluster(cl)
    message("In totoal ", nReadsLocal, " adjacent chimeric reads, ",
            nReadsDistant, " distant chimeric reads, ",
            nReadsNorm, " normal reads were generated.",
            "\nAlltogether ", nReadsLocal + nReadsDistant + nReadsNorm,
            " reads were generated. ", nReadsLocal + nReadsDistant, 
            " reads (", round((nReadsLocal + nReadsDistant) / 
                                  (nReadsLocal + nReadsDistant + nReadsNorm), 
                              4) * 100, 
            "%, adjusted by chimericProp) are artifact chimeric reads.",
            "\nOf all artifact chimeric reads, ", nReadsLocal, " reads (",
            round(nReadsLocal / (nReadsLocal + nReadsDistant), 4) * 100,
            "%, adjusted by sameChrProp * adjChimProp)",
            " are adjacent chimeric reads.",
            "\nSimulation done.")
}