## get logical vector of samples to keep
.keepSamples <- function(genoData, scan.exclude) {
if (!is.null(scan.exclude)) {
!(getScanID(genoData) %in% scan.exclude)
} else {
rep(TRUE, nscan(genoData))
}
}
## check for sex variable if X or Y
## if chr=Y, returns keep vector with males only
.checkSexChr <- function(genoData, chr, keep) {
if (XchromCode(genoData) %in% chr & !hasSex(genoData)) {
stop("Sex values for the samples are required to compute MAF for X chromosome SNPs")
}
if (YchromCode(genoData) %in% chr) {
## check for sex variable
if (!hasSex(genoData)) {
stop("Sex values for the samples are required for Y chromosome SNPs")
}
if (!all(chr == YchromCode(genoData))) {
stop("Y chromosome must be analyzed separately")
}
## only keep males
keep <- keep & (getSex(genoData) == "M")
}
keep
}
## get data frame with outcome and covariates
.modelData <- function(genoData, chr, outcome, covar, ivar=NULL) {
mod.vars <- outcome
if (!is.null(covar)) {
cvnames <- unique(unlist(strsplit(covar,"[*:]")))
mod.vars <- c(mod.vars, cvnames)
miss.vars <- setdiff(mod.vars, getScanVariableNames(genoData))
if (length(miss.vars) > 0) {
stop("Variables ", paste(miss.vars, collapse=","), "not found in scan annotation of genoData")
}
if (YchromCode(genoData) %in% chr & "sex" %in% cvnames) {
stop("Sex not allowed as a covariate for Y chromosome")
}
}
if (!is.null(ivar)) {
if (is.null(covar) | !(ivar %in% cvnames)) {
stop("ivar should also be present in covar")
}
}
dat <- as.data.frame(getScanVariable(genoData, mod.vars))
names(dat)[1] <- outcome ## fix name in case of only one column
dat
}
## geno is a matrix of genotypes [snp,sample]
## chrom is chromosome corresponding to snp
## keep is the sample index corresponding to sample
.freqFromGeno <- function(genoData, geno, chrom, keep) {
freq <- 0.5*rowMeans(geno, na.rm=TRUE)
## for X chr
if (XchromCode(genoData) %in% chrom) {
## which are on X chr
Xidx <- chrom == XchromCode(genoData)
## males
m <- (getSex(genoData) == "M")[keep]
f <- (getSex(genoData) == "F")[keep]
## calculate allele freq for X
freq[Xidx] <- (0.5 * rowSums(geno[Xidx,m,drop=FALSE], na.rm=TRUE) +
rowSums(geno[Xidx,f,drop=FALSE], na.rm=TRUE)) /
(rowSums(!is.na(geno[Xidx,m,drop=FALSE])) +
2*rowSums(!is.na(geno[Xidx,f,drop=FALSE])))
}
freq
}
# A function to Transform Genotypes based on gene action
.transformGenotype <- function(geno, gene.action) {
switch(gene.action,
additive = {},
dominant = {geno[geno == 1] <- 2},
recessive = {geno[geno == 1] <- 0}
)
geno
}
## geno is a matrix of genotypes [snp,sample]
.monomorphic <- function(geno, outcome, model.type) {
freq <- 0.5*rowMeans(geno, na.rm=TRUE)
mono <- is.na(freq) | freq == 0 | freq == 1
if (model.type %in% c("logistic", "firth")) {
for (case.status in c(0,1)) {
cc <- outcome == case.status
freq <- 0.5*rowMeans(geno[,cc,drop=FALSE], na.rm=TRUE)
mono.cc <- is.na(freq) | freq == 0 | freq == 1
mono <- mono | mono.cc
}
}
mono
}
.CI <- function(Est, SE, CI) {
LL <- Est + qnorm((1-CI)*0.5)*SE
UL <- Est + qnorm(1-((1-CI)*0.5))*SE
c(LL=LL, UL=UL)
}
.waldTest <- function(Est, cov) {
if (length(Est) == 1) {
W <- (Est^2)/cov
} else {
W <- as.numeric(t(Est) %*% solve(cov) %*% Est)
}
pval <- pchisq(W, df=length(Est), lower.tail=FALSE)
c(Wald.Stat=W, Wald.pval=pval)
}
## give the data frame a crazy name so we don't overwrite something important
.runRegression <- function(model.string, ..model..data.., model.type, CI, robust, LRtest) {
model.formula <- as.formula(model.string)
tryCatch({
if (model.type == "linear") {
mod <- lm(model.formula, data=..model..data..)
} else if (model.type == "logistic") {
mod <- glm(model.formula, data=..model..data.., family=binomial())
} else if (model.type == "poisson") {
mod <- glm(model.formula, data=..model..data.., family=poisson())
}
if (!robust) {
## model based variance
Vhat <- vcov(mod)
} else {
## sandwich variance
Vhat <- vcovHC(mod, type="HC0")
}
Est <- unname(coef(mod)["genotype"])
cov <- Vhat["genotype","genotype"]
SE <- sqrt(cov)
ret <- c(Est=Est, SE=SE, .CI(Est, SE, CI), .waldTest(Est, cov))
if (LRtest) {
## promote the data frame to the global environment so lrtest can find it
..model..data.. <<- ..model..data..
mod2 <- lrtest(mod, "genotype")[2,]
ret <- c(ret, LR.Stat=mod2[["Chisq"]], LR.pval=mod2[["Pr(>Chisq)"]])
rm(..model..data.., envir=.GlobalEnv)
}
## GxE
GxE.idx <- grep(":genotype", names(coef(mod)), fixed=TRUE)
if (length(GxE.idx) > 0) {
GxE.Est <- unname(coef(mod)[GxE.idx])
GxE.cov <- Vhat[GxE.idx,GxE.idx]
test.GxE <- setNames(.waldTest(GxE.Est, GxE.cov), c("GxE.Stat", "GxE.pval"))
if (length(GxE.idx) == 1) {
test.GxE <- c(GxE.Est=GxE.Est, GxE.SE=sqrt(GxE.cov), test.GxE)
} else {
test.GxE <- c(GxE.Est=NA, GxE.SE=NA, test.GxE)
}
Gj.idx <- grep("genotype", names(coef(mod)), fixed=TRUE)
test.Joint <- setNames(.waldTest(coef(mod)[Gj.idx], Vhat[Gj.idx,Gj.idx]),
c("Joint.Stat", "Joint.pval"))
ret <- c(ret, test.GxE, test.Joint)
}
ret
}, warning=function(w) NA, error=function(e) NA)
}
.runFirth <- function(model.string, model.data, CI, PPLtest, geno.index=NULL) {
model.formula <- as.formula(model.string)
tryCatch({
mod <- logistf(model.formula, data=model.data, alpha=(1-CI),
pl=PPLtest, plconf=geno.index, dataout=FALSE)
ind <- which(mod$terms == "genotype")
if (PPLtest) stopifnot(ind == geno.index)
Est <- unname(coef(mod)[ind])
cov <- vcov(mod)[ind,ind]
LL <- unname(mod$ci.lower[ind])
UL <- unname(mod$ci.upper[ind])
pval <- unname(mod$prob[ind])
Stat <- qchisq(pval, df=1, lower.tail=FALSE)
ret <- c(Est=Est, SE=sqrt(cov), LL=LL, UL=UL)
if (PPLtest) {
c(ret, .waldTest(Est, cov), PPL.Stat=Stat, PPL.pval=pval)
} else {
c(ret, Wald.Stat=Stat, Wald.pval=pval)
}
}, warning=function(w) NA, error=function(e) NA)
}
assocRegression <- function(genoData,
outcome,
model.type = c("linear", "logistic", "poisson", "firth"),
gene.action = c("additive", "dominant", "recessive"),
covar = NULL,
ivar = NULL,
scan.exclude = NULL,
CI = 0.95,
robust = FALSE,
LRtest = FALSE,
PPLtest = TRUE,
effectAllele = c("minor", "alleleA"),
snpStart = NULL,
snpEnd = NULL,
block.size = 5000,
verbose = TRUE) {
## check that arguments are valid
model.type <- match.arg(model.type)
gene.action <- match.arg(gene.action)
effectAllele <- match.arg(effectAllele)
## set snpStart and snpEnd
if (is.null(snpStart)) snpStart <- 1
if (is.null(snpEnd)) snpEnd <- nsnp(genoData)
## set which samples to keep
keep <- .keepSamples(genoData, scan.exclude)
## get chromosome information
chr <- getChromosome(genoData, index=snpStart:snpEnd)
## sex chromosome checks
keep <- .checkSexChr(genoData, chr, keep)
## no LR test for interactions
if (LRtest & !is.null(ivar)) {
warning("LR test not available for GxE")
LRtest <- FALSE
}
## read in outcome and covariate data
if (verbose) message("Reading in Phenotype and Covariate Data...")
dat <- .modelData(genoData, chr, outcome, covar, ivar)
## identify samples with any missing data
keep <- keep & complete.cases(dat)
dat <- dat[keep,,drop=FALSE]
if (!is.null(ivar)) ivar <- paste0(ivar, ":genotype")
model.string <- paste(outcome, "~", paste(c(covar, ivar, "genotype"), collapse=" + "))
## for firth test - determine index of genotype in model matrix
if (model.type == "firth") {
tmp <- cbind(dat, "genotype"=0)
geno.index <- which(colnames(model.matrix(as.formula(model.string), tmp)) == "genotype")
rm(tmp)
}
## sample size, assuming no missing genotypes
n <- nrow(dat)
if (verbose) message("Running analysis with ", n, " Samples")
## number of SNPs in the segment
nsnp.seg <- snpEnd - snpStart + 1
nblocks <- ceiling(nsnp.seg/block.size)
## set up results matrix
nv <- c("snpID", "chr", "effect.allele", "EAF", "MAF", "n")
if (model.type %in% c("logistic", "firth")) nv <- c(nv, "n0", "n1")
nv <- c(nv, "Est", "SE", "LL", "UL", "Wald.Stat", "Wald.pval")
if (LRtest & model.type != "firth") nv <- c(nv, "LR.Stat", "LR.pval")
if (PPLtest & model.type == "firth") nv <- c(nv, "PPL.Stat", "PPL.pval")
if (!is.null(ivar) & model.type != "firth") nv <- c(nv, "GxE.Est", "GxE.SE", "GxE.Stat", "GxE.pval", "Joint.Stat", "Joint.pval")
res <- matrix(NA, nrow=nsnp.seg, ncol=length(nv), dimnames=list(NULL, nv))
reg.cols <- which(colnames(res) == "Est"):ncol(res)
## chromosome
res[,"chr"] <- chr
if (verbose) message("Beginning Calculations...")
for (b in 1:nblocks) {
## keep track of time for rate reporting
startTime <- Sys.time()
snp.start.pos <- snpStart + (b-1)*block.size
nsnp.block <- ifelse(snp.start.pos + block.size > snpEnd,
snpEnd - snp.start.pos + 1, block.size)
bidx <- ((b-1)*block.size + 1):((b-1)*block.size + nsnp.block)
## get genotypes for the block
geno <- getGenotype(genoData, snp=c(snp.start.pos, nsnp.block), scan=c(1,-1), drop=FALSE)
geno <- geno[,keep,drop=FALSE]
## allele frequency
freq <- .freqFromGeno(genoData, geno, chr[bidx], keep)
major <- freq > 0.5 & !is.na(freq)
maf <- ifelse(major, 1-freq, freq)
res[bidx,"MAF"] <- maf
## effect allele
if (effectAllele == "minor") {
geno[major,] <- 2 - geno[major,]
## minor allele coding: A = 1, B = 0
res[bidx,"effect.allele"] <- ifelse(major, 0, 1)
res[bidx,"EAF"] <- maf
} else {
res[bidx,"effect.allele"] <- 1
res[bidx,"EAF"] <- freq
}
## transform genotype for gene action
geno <- .transformGenotype(geno, gene.action)
## check for monomorphic SNPs
mono <- .monomorphic(geno, dat[[outcome]], model.type)
## sample size
res[bidx, "n"] <- rowSums(!is.na(geno))
if (model.type %in% c("logistic", "firth")) {
res[bidx, "n0"] <- rowSums(!is.na(geno[, dat[[outcome]] == 0, drop=FALSE]))
res[bidx, "n1"] <- rowSums(!is.na(geno[, dat[[outcome]] == 1, drop=FALSE]))
}
## loop through SNPs in block
midx <- (1:nsnp.block)[!mono]
for (i in midx) {
mdat <- cbind(dat, genotype=geno[i,])
mdat <- mdat[complete.cases(mdat),]
if (model.type %in% c("linear", "logistic")) {
tmp <- .runRegression(model.string, mdat, model.type, CI, robust, LRtest)
} else if (model.type == "firth") {
tmp <- .runFirth(model.string, mdat, CI, PPLtest, geno.index)
}
res[bidx[i], reg.cols] <- tmp
}
rate <- format(Sys.time() - startTime, digits=4)
if (verbose) message(paste("Block", b, "of", nblocks, "Completed -", rate))
}
## results data frame
res <- as.data.frame(res)
res$snpID <- getSnpID(genoData, index=snpStart:snpEnd)
## convert effect.allele coding back to A/B
res$effect.allele[res$effect.allele == 1] <- "A"
res$effect.allele[res$effect.allele == 0] <- "B"
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.