simulateGenotypeMatrix <- function(n.snps=10,
n.chromosomes=10,
n.samples=1000,
filename,
file.type=c("gds", "ncdf"),
silent = TRUE) {
file.type <- match.arg(file.type)
m <- n.snps*n.chromosomes # num of snps; num of chromosomes*num of snps per chromosome
n <- n.samples # this is the number of samples
geno <- matrix(NA, m, n)
## simulate data to be put in the variables
## get allelic freq by random sampling of a uniform dist on (0,1)
afreq <- runif(m)
## simulate genotypes by random sampling 2 gametes for each sample from binomial dist
for (i in 1:m) geno[i,] <- rbinom(n, 2, afreq[i])
## simulate the missing calls for the genotypes
mrate <- 0.05 # assumed to be the same across all snps
nmiss <- rbinom(1, n*m, mrate) # find the number of missing over entire matrix
imiss <- sample(1:(n*m),nmiss) # vectorized indices for calls to become missing
geno[imiss] <- -1 # set the indices of particular genotypes to be missing
# -1 is missing value
table(geno)
## missing call rate for each snp
miss_rate <- apply(geno, 1, function(x) length(x[x==-1])/length(x))
summary(miss_rate)
## generate snp and sample integer ids (consecutive integer values)
snpID <- 1:m
sampleID <- 1:n
## snp chromosome and position within chromosome values
chromosome <- sort(rep(1:n.chromosomes, n.snps))
position <- rep(seq(1:n.snps),n.chromosomes)
## create the file
snp.annotation <- data.frame(snpID, chromosome, position)
if (file.type == "gds") {
var.data <- list(sample.id=sampleID, genotype=geno)
nc <- .createGds(snp.annotation, filename, variables="genotype",
var.data=var.data)
} else if (file.type == "ncdf") {
if (!(requireNamespace("ncdf4"))) {
stop("please install ncdf4 to work with NetCDF files")
}
var.data <- list(sampleID=sampleID, genotype=geno)
nc <- .createNcdf(snp.annotation, filename, variables="genotype",
n.samples=length(sampleID), var.data=var.data)
}
.close(nc)
if (!silent)
return(table(geno))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.