# QTLseq simulation functions
#' Randomly calculates an alternate allele frequency within a bulk
#'
#' @param n non-negative integer. The number of individuals in each bulk
#' @param pop the population structure. Defaults to "F2" and assumes "RIL"
#' population otherwise.
#'
#' @return an alternate allele frequency within the bulk. Used for simulating
#' SNP-indeces.
#'
simulateAlleleFreq <- function(n, pop = "F2") {
if (pop == "F2") {
mean(sample(
x = c(0, 0.5, 1),
size = n,
prob = c(1, 2, 1),
replace = TRUE
))
} else {
mean(sample(
x = c(0, 1),
size = n,
prob = c(1, 1),
replace = TRUE
))
}
}
#' Simulates a delta SNP-index with replication
#'
#' @param depth integer. A read depth for which to replicate SNP-index calls.
#' @param altFreq1 numeric. The alternate allele frequency for bulk A.
#' @param altFreq2 numeric. The alternate allele frequency for bulk B.
#' @param replicates integer. The number of bootstrap replications.
#' @param filter numeric. an optional minimum SNP-index filter
#'
#' @return Returns a vector of length replicates delta SNP-indeces
simulateSNPindex <-
function(depth,
altFreq1,
altFreq2,
replicates = 10000,
filter = NULL) {
SNPindex_H <- rbinom(replicates, size = depth, altFreq1) / depth
SNPindex_L <- rbinom(replicates, size = depth, altFreq2) / depth
deltaSNP <- SNPindex_H - SNPindex_L
if (!is.null(filter)) {
deltaSNP <- deltaSNP[SNPindex_H >= filter | SNPindex_L >= filter]
}
deltaSNP
}
#' Simulation of delta SPP index confidence intervals
#'
#' The method for simulating delta SNP-index confidence interval thresholds
#' as described in Takagi et al., (2013). Genotypes are randomly assigned for
#' each indvidual in the bulk, based on the population structure. The total
#' alternative allele frequency in each bulk is calculated at each depth used to simulate
#' delta SNP-indeces, with a user defined number of bootstrapped replication.
#' The requested confidence intervals are then calculated from the bootstraps.
#'
#' @param popStruc the population structure. Defaults to "F2" and assumes "RIL"
#' @param bulkSize non-negative integer vector. The number of individuals in
#' each simulated bulk. Can be of length 1, then both bulks are set to the
#' same size. Assumes the first value in the vector is the simulated high
#' bulk.
#' @param depth integer. A read depth for which to replicate SNP-index calls.
#' @param replications integer. The number of bootstrap replications.
#' @param filter numeric. An optional minimum SNP-index filter
#' @param intervals numeric vector of probabilities with values in [0,1]
#' corresponding to the requested confidence intervals
#'
#' @return A data frame of delta SNP-index thresholds corrisponding to the
#' requested confidence intervals at the user set depths.
#' @export simulateConfInt
#'
#' @examples CI <-
#' simulateConfInt(
#' popStruc = "F2",
#' bulkSize = 50,
#' depth = 1:100,
#' intervals = c(0.05, 0.95, 0.025, 0.975, 0.005, 0.995, 0.0025, 0.9975)
simulateConfInt <- function(popStruc = "F2",
bulkSize,
depth = 1:100,
replications = 10000,
filter = 0.3,
intervals = c(0.05, 0.025)) {
if (popStruc == "F2") {
message(
"Assuming bulks selected from F2 population, with ",
paste(bulkSize, collapse = " and "),
" individuals per bulk."
)
} else {
message(
"Assuming bulks selected from RIL population, with ",
bulkSize,
" individuals per bulk."
)
}
if (length(bulkSize) == 1) {
message("The 'bulkSize' argument is of length 1, setting number of individuals in both bulks to: ", bulkSize)
bulkSize[2] <- bulkSize[1]
}
if (length(bulkSize) > 2) {
message("The 'bulkSize' argument is larger than 2. Using the first two values as the bulk size.")
}
if (any(bulkSize < 0)) {
stop("Negative bulkSize values")
}
#makes a vector of possible alt allele frequencies once. this is then sampled for each replicate
tmp_freq <-
replicate(n = replications * 10, simulateAlleleFreq(n = bulkSize[1], pop = popStruc))
tmp_freq2 <-
replicate(n = replications * 10, simulateAlleleFreq(n = bulkSize[2], pop = popStruc))
message(
paste0(
"Simulating ",
replications,
" SNPs with reads at each depth: ",
min(depth),
"-",
max(depth)
)
)
message(paste0(
"Keeping SNPs with >= ",
filter,
" SNP-index in both simulated bulks"
))
# tmp allele freqs are sampled to produce 'replicate' numbers of probablities. these
# are then used as altFreq probs to simulate SNP index values, per bulk.
CI <- sapply(
X = depth,
FUN = function(x)
{
quantile(
x = simulateSNPindex(
depth = x,
altFreq1 = sample(
x = tmp_freq,
size = replications,
replace = TRUE
),
altFreq2 = sample(
x = tmp_freq2,
size = replications,
replace = TRUE
),
replicates = replications,
filter = filter
),
probs = intervals,
names = TRUE
)
}
)
CI <- as.data.frame(CI)
if (length(CI) > 1) {
CI <- data.frame(t(CI))
}
names(CI) <- paste0("CI_", 100 - (intervals * 200))
CI <- cbind(depth, CI)
#to long format for easy plotting
# tidyr::gather(data = CI,
# key = interval,
# convert = TRUE,
# value = SNPindex,-depth) %>%
# dplyr::mutate(Confidence = factor(ifelse(
# interval > 0.5,
# paste0(round((1 - interval) * 200, digits = 1), "%"),
# paste0((interval * 200), "%")
# )))
CI
}
#' Calculates delta SNP confidence intervals for QTLseq analysis
#'
#'The method for simulating delta SNP-index confidence interval thresholds
#' as described in Takagi et al., (2013). Genotypes are randomly assigned for
#' each indvidual in the bulk, based on the population structure. The total
#' alternative allele frequency in each bulk is calculated at each depth used to simulate
#' delta SNP-indeces, with a user defined number of bootstrapped replication.
#' The requested confidence intervals are then calculated from the bootstraps.
#'
#' @param SNPset The data frame imported by \code{ImportFromGATK}
#' @param windowSize the window size (in base pairs) bracketing each SNP for which to calculate the statitics.
#' @param popStruc the population structure. Defaults to "F2" and assumes "RIL" otherwise
#' @param bulkSize non-negative integer vector. The number of individuals in
#' each simulated bulk. Can be of length 1, then both bulks are set to the
#' same size. Assumes the first value in the vector is the simulated high
#' bulk.
#' @param depth integer. A read depth for which to replicate SNP-index calls.
#' @param replications integer. The number of bootstrap replications.
#' @param filter numeric. An optional minimum SNP-index filter
#' @param intervals numeric vector. Confidence intervals supplied as two-sided
#' percentiles. i.e. If intervals = '95' will return the two sided 95\%
#' confidence interval, 2.5\% on each side.
#' @param ... Other arguments passed to \code{\link[locfit]{locfit}} and
#' subsequently to \code{\link[locfit]{locfit.raw}}() (or the lfproc). Usefull
#' in cases where you get "out of vertex space warnings"; Set the maxk higher
#' than the default 100. See \code{\link[locfit]{locfit.raw}}(). But if you
#' are getting that warning you should seriously consider increasing your
#' window size.
#'
#' @return A SNPset data frame with delta SNP-index thresholds corrisponding to the
#' requested confidence intervals matching the tricube smoothed depth at each SNP.
#' @export runQTLseqAnalysis
#'
#' @examples df_filt <- runQTLseqAnalysis(
#' SNPset = df_filt,
#' bulkSize = c(25, 35)
#' windowSize = 1e6,
#' popStruc = "F2",
#' replications = 10000,
#' intervals = c(95, 99)
#' )
#'
runQTLseqAnalysis <- function(SNPset,
windowSize = 1e6,
popStruc = "F2",
bulkSize,
depth = NULL,
replications = 10000,
filter = 0.3,
intervals = c(95, 99),
...) {
message("Counting SNPs in each window...")
SNPset <- SNPset %>%
dplyr::group_by(CHROM) %>%
dplyr::mutate(nSNPs = countSNPs_cpp(POS = POS, windowSize = windowSize))
message("Calculating tricube smoothed delta SNP index...")
SNPset <- SNPset %>%
dplyr::mutate(tricubeDeltaSNP = tricubeStat(POS = POS, Stat = deltaSNP, windowSize, ...))
#convert intervals to quantiles
if (all(intervals >= 1)) {
message(
"Returning the following two sided confidence intervals: ",
paste(intervals, collapse = ", ")
)
quantiles <- (100 - intervals) / 200
} else {
stop(
"Convidence intervals ('intervals' paramater) should be supplied as two-sided percentiles. i.e. If intervals = '95' will return the two sided 95% confidence interval, 2.5% on each side."
)
}
#calculate min depth per snp between bulks
SNPset <-
SNPset %>%
dplyr::mutate(minDP = pmin(DP.LOW, DP.HIGH))
SNPset <-
SNPset %>%
dplyr::group_by(CHROM) %>%
dplyr::mutate(tricubeDP = floor(tricubeStat(POS, minDP, windowSize = windowSize, ...)))
if (is.null(depth)) {
message(
"Variable 'depth' not defined, using min and max depth from data: ",
min(SNPset$minDP),
"-",
max(SNPset$minDP)
)
depth <- min(SNPset$minDP):max(SNPset$minDP)
}
#simualte confidence intervals
CI <-
simulateConfInt(
popStruc = popStruc,
bulkSize = bulkSize,
depth = depth,
replications = replications,
filter = filter,
intervals = quantiles
)
#match name of column for easier joining of repeat columns
names(CI)[1] <- "tricubeDP"
#use join as a quick way to match min depth to matching conf intervals.
SNPset <-
dplyr::left_join(x = SNPset,
y = CI #, commented out becuase of above change. need to remove eventually
# by = c("tricubeDP" = "depth"))
)
as.data.frame(SNPset)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.