BAFfromGenotypes <- function(
intenData,
genoData,
filename,
file.type=c("gds", "ncdf"),
min.n.genotypes = 2,
call.method = c("by.plate", "by.study"),
plate.name = "plate",
block.size = 5000,
precision = "single", compress = "LZMA_RA:1M",
verbose = TRUE) {
# check that dimensions of intenData and genoData are equal
intenSnpID <- getSnpID(intenData)
genoSnpID <- getSnpID(genoData)
if (!all(intenSnpID == genoSnpID)) stop("snp dimensions of intenData and genoData differ")
intenScanID <- getScanID(intenData)
genoScanID <- getScanID(genoData)
if (!all(intenScanID == genoScanID)) stop("scan dimensions of intenData and genoData differ")
# Check that plate is included in data if by.plate is being used
call.method <- match.arg(call.method)
if(call.method == "by.plate") {
if (hasScanVariable(intenData, plate.name)) {
plate <- getScanVariable(intenData, plate.name)
} else if (hasScanVariable(genoData, plate.name)) {
plate <- getScanVariable(genoData, plate.name)
} else stop("call.method==by.plate but plate.name not found in intenData or genoData")
}
## get file type
file.type <- match.arg(file.type)
## create data file
snp.annotation <- getSnpVariable(intenData, c("snpID", "chromosome", "position"))
variables <- c("BAlleleFreq", "LogRRatio")
if (file.type == "gds") {
genofile <- .createGdsBySnp(intenScanID, snp.annotation, filename, variables, precision, compress)
} else if (file.type == "ncdf") {
if (!(requireNamespace("ncdf4"))) {
stop("please install ncdf4 to work with NetCDF files")
}
genofile <- .createNcdf(snp.annotation, filename, variables, nscan(intenData),
precision, var.data=list(sampleID=intenScanID))
}
# make a matrix of T/F indicating which plates the samples are on
N <- length(intenScanID)
if(call.method == "by.plate") {
sidPlate <- cbind(intenScanID, plate)
orderedPlate <- levels(as.factor(plate))
Q <- length(orderedPlate)
P <- matrix(rep(FALSE,Q*N), nrow=Q, ncol=N)
dimnames(P) <- list(orderedPlate, intenScanID)
for(i in 1:nrow(sidPlate)) {
P[sidPlate[i,2],sidPlate[i,1]] <- TRUE
}
} else {
# fill P with all trues to essentially make a one plate study if want to analyze 'by study' rather than 'by plate'
Q <- 1
P <- matrix(rep(TRUE, Q*N), nrow=Q, ncol=N)
}
# define block.sizes
n <- length(intenSnpID)
nblock <- floor(n/block.size)
remain <- n-block.size*nblock
rm(n)
if(remain == 0) {M <- nblock} else { M <- nblock+1 }
# M is the number of iterations needed to get all samples
m <- 1 # first snp of first block.size
# loop through block.sizes, get X, Y and G and use them to calculate B and L
for (i in 1:M) {
if (verbose)
message(paste("block", i, "of", M))
if(i <= nblock) { n <- block.size } else { n <- remain }
# size of current block.size = number of snps to get # CHANGE columns to snps (rows)
geno <- getGenotype(genoData, snp=c(m,n), scan=c(1,-1), drop=FALSE)
x <- getX(intenData, snp=c(m,n), scan=c(1,-1), drop=FALSE)
y <- getY(intenData, snp=c(m,n), scan=c(1,-1), drop=FALSE)
# output matrices
BAF <- matrix(0.0, nrow=n, ncol=N)
LRR <- matrix(0.0, nrow=n, ncol=N)
for (s in 1:n) { # for each snp within the block.size
# get geno, x and y vectors for the current snp
gv <- geno[s,]; xv <- x[s,]; yv <- y[s,]
T <- rep(NA, N); R <- rep(NA, N)
B <- rep(NA, N); L <- rep(NA, N)
T <- atan(yv/xv)*(2/pi)
R <- xv+yv
for(k in 1:Q) { # loop through plates, if by study, essentially one plate so Q=1
p <- P[k,] # T for all sample.ids that are on the current plate k
# sum(p) # check to ensure this is the number of samples per plate
AAind <- p & gv==2 & !is.na(gv)
ABind <- p & gv==1 & !is.na(gv)
BBind <- p & gv==0 & !is.na(gv)
if(sum(AAind) < min.n.genotypes | sum(ABind) < min.n.genotypes | sum(BBind) < min.n.genotypes)
{ next }
tAA <- mean(T[AAind], na.rm=TRUE) # ADDED na.rm=TRUE arg
tAB <- mean(T[ABind], na.rm=TRUE) # note: can't use na.rm=T because T is theta
tBB <- mean(T[BBind], na.rm=TRUE)
rAA <- mean(R[AAind], na.rm=TRUE)
rAB <- mean(R[ABind], na.rm=TRUE)
rBB <- mean(R[BBind], na.rm=TRUE)
TC <- matrix(rep(NA, 4*N),nrow=4, ncol=N)
TC[1,] <- T < tAA
TC[2,] <- tAA <= T & T < tAB
TC[3,] <- tAB <= T & T < tBB
TC[4,] <- tBB <= T
# check to ensure there are no NAs in TC matrix
# TC[is.na(TC)] # should be no entries # WHAT IF THERE ARE NAs?
TC[is.na(TC)] <- FALSE # ADDED
# calculate and store B allele frequencies
# ensure only loading for those samples on the current plate
B[p & TC[1,]] <- 0
c2 <- p & TC[2,]
B[c2] <- (.5*(T[c2]-tAA))/(tAB-tAA)
c3 <- p & TC[3,]
B[c3] <- .5+(.5*(T[c3]-tAB))/(tBB-tAB)
B[p & TC[4,]] <- 1
# calculate log R ratio for samples on current plate
c <- (p & TC[1,]) | (p & TC[2,])
rhat <- rAA + ((T[c]-tAA)*(rAB-rAA))/(tAB-tAA) # if rhat is negative, make it missing (NA)
rhat[which(rhat<0)] <- NA
L[c] <- log2(R[c]/rhat)
c2 <- (p & TC[3,]) | (p & TC[4,])
rhat2 <- rAB + ((T[c2]-tAB)*(rBB-rAB))/(tBB-tAB) # if rhat2 is negative, make it missing (NA)
rhat2[which(rhat2<0)] <- NA
L[c2] <- log2(R[c2]/rhat2)
} # end for loop through plates
BAF[s,] <- B
LRR[s,] <- L
} # end for loop through SNPs within block.size
dat <- list("BAlleleFreq"=BAF, "LogRRatio"=LRR)
.addDataBySnp(genofile, variables, dat, snp.start=m, snp.count=n)
m <- m+n
} # end for loop through number of block.sizes
.close(genofile, verbose=verbose)
return(invisible(NULL))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.