or relatedness

## function that gets an n \times p matrix of p genotypes of n individuals, and a null model, and tests the genotypes associations with the outcomes. 
## Genetic data are always assumed complete. 
## Types of tests: 
## Variant set: SKAT, burden, SKAT-O. Multiple types of p-values. Default: Davies with Kuonen if does not converge. 


testVariantSet <- function( nullmod, G, weights, 
                            test = c("Burden", "SKAT", "fastSKAT", "SMMAT", "fastSMMAT", "SKATO", "BinomiRare", "CMP"),
                            # burden.test = c("Score"), 
                            neig = 200, ntrace = 500, 
                            rho = seq(from = 0, to = 1, by = 0.1)){
                           # pval.method = c("davies", "kuonen", "liu"),
                           # return.scores = FALSE, return.scores.cov = FALSE){

    test <- match.arg(test)
    # burden.test <- match.arg(burden.test)
    # pval.method <- match.arg(pval.method)

    G <- .genoAsMatrix(nullmod, G)

    if (test == "Burden") {
        out <- .testVariantSetBurden(nullmod, G, weights, burden.test = "Score")
    }
    if (test == "BinomiRare") {
        out <- .testVariantSetBurden(nullmod, G, weights, burden.test = "BinomiRare")
    }
    if (test == "CMP") {
        out <- .testVariantSetBurden(nullmod, G, weights, burden.test = "CMP")
    }
    if (test == "SKAT") {
        out <- .testVariantSetSKAT(nullmod, G, weights, neig = Inf, ntrace = Inf)
                                   # return.scores, return.scores.cov)
    }
    if(test == "fastSKAT"){
        out <- .testVariantSetSKAT(nullmod, G, weights, neig, ntrace)
    }
    if (test == "SMMAT") {
        out <- .testVariantSetSMMAT(nullmod, G, weights, neig = Inf, ntrace = Inf)
    }
    if(test == "fastSMMAT"){
        out <- .testVariantSetSMMAT(nullmod, G, weights, neig, ntrace)
    }
    if(test == "SKATO"){
        out <- .testVariantSetSKATO(nullmod, G, weights, rho)
    }
    return(out)
}



## create the burden score, than calls the appropriate single variant test function. 
## can easily implement GxE interaction with the burden score... later!
.testVariantSetBurden <- function(nullmod, G, weights, burden.test){
    # multiply G by weights and compute burden
    if(is(G, "Matrix")){
        burden <- rowSums(G %*% Diagonal(x = weights))
    }else{
        burden <- colSums(t(G) * weights)
    }
    
    # adjust burden for covariates and random effects
    if (burden.test == "Score") {
        Gtilde <- calcGtilde(nullmod, burden)
        if(is.null(nullmod$RSS0)){
            nullmod$RSS0 <- as.numeric(crossprod(nullmod$Ytilde))
        }
        out <- .testGenoSingleVarScore(Gtilde, G = burden, resid = nullmod$resid, RSS0 = nullmod$RSS0) 
    }
    # if (burden.test == "Wald"){
    #     out <- .testGenoSingleVarWald(Gtilde, Ytilde = nullmod$Ytilde,
    #                                   n = length(nullmod$Ytilde), k = ncol(nullmod$model.matrix))
    # }

    if (burden.test == "BinomiRare"){
        ## if this is a mixed model, used conditional probabilities
        if (nullmod$family$mixedmodel) {
            phat <- expit(nullmod$workingY - nullmod$resid.conditional)
        } else {
            phat <- nullmod$fitted.values
        }
        out <- .testGenoSingleVarBR(nullmod$outcome, probs=phat, G=matrix(burden))
    }

    if (burden.test == "CMP"){
        if (nullmod$family$mixedmodel) {

            phat <- expit(nullmod$workingY - nullmod$resid.conditional)
            out <- .testGenoSingleVarCMP(nullmod$outcome, probs=phat, G=matrix(burden))
        } else{ ## not a mixed model
            phat <- nullmod$fitted.values
            out <- .testGenoSingleVarBR(nullmod$outcome, probs=phat, G=matrix(burden))
        }
    }
    return(out)
}


## new function that runs both SKAT and fastSKAT
.testVariantSetSKAT <- function(nullmod, G, weights, neig = 200, ntrace = 500, verbose = FALSE){
    # multiply G by weights 
    if(is(G, "Matrix")){
        G <- G %*% Diagonal(x = weights)        
    }else{
        G <- t(t(G) * weights)
    }

    # scores
    U <- as.vector(crossprod(G, nullmod$resid)) # WGPY
    # SKAT test statistic
    Q <- sum(U^2)

    # adjust G for covariates and random effects
    G <- calcGtilde(nullmod, G) # P^{1/2}GW

    # compute the p-value
    out <- .calcPvalVCTest(Q = Q, G = G, neig = neig, ntrace = ntrace, verbose = verbose)

    return(list(Q = Q, pval = out$pval, err = out$err, pval.method = out$pval.method))
}

## function for SMMAT and fastSMMAT
.testVariantSetSMMAT <- function(nullmod, G, weights, neig = 200, ntrace = 500, verbose = FALSE) {
    # multiply G by weights 
    if(is(G, "Matrix")){
        G <- G %*% Diagonal(x = weights)        
    }else{
        G <- t(t(G) * weights)
    }

    # scores
    U <- as.vector(crossprod(G, nullmod$resid)) # WGPY
    U.sum <- sum(U) # 1WGPY

    # adjust G for covariates and random effects
    G <- calcGtilde(nullmod, G) # P^{1/2}GW

    # compute burden p-value
    G.rowSums <- rowSums(G) # P^{1/2}GW1
    GG1 <- crossprod(G, G.rowSums) # WGPGW1  # O(mn)
    V.sum <- sum(GG1) # 1WGPGW1
    burden.pval <- pchisq(U.sum^2/V.sum, df=1, lower.tail=FALSE)

    # adjust U and G for burden
    U <- U - GG1*U.sum/V.sum # WGPY - WGPGW1 * 1WGPY/(1WGPGW1)
    G <- G - tcrossprod(G.rowSums, GG1)/V.sum # O(mn)

    # SMMAT test statistic
    Q <- sum(U^2)

    ### alternative to part above; seems to be slower from testing; this is how presented in SMMAT paper ###
    # V <- crossprod(G) # WGPGW  # O(m^2n)
    # GG1 <- rowSums(V) # WGPGW1
    # # denominator for burden
    # V.sum <- sum(GG1) # 1WGPGW1
    # # burden p-value
    # burden.pval <- pchisq(U.sum^2/V.sum, df=1, lower.tail=FALSE)
    # # adjust for burden
    # U <- U - GG1*U.sum/V.sum
    # V <- V - tcrossprod(GG1)/V.sum  # O(m^2)

    # compute the p-value for the "adjusted SKAT" part
    out <- .calcPvalVCTest(Q = Q, G = G, neig = neig, ntrace = ntrace, verbose = verbose)
    theta.pval <- out$pval
    err <- out$err

    # Fisher's method to combine p-values
    smmat.pval <- tryCatch(pchisq(-2*log(burden.pval)-2*log(theta.pval), df=4, lower.tail = FALSE), error = function(e) { NA })
    if(is.na(smmat.pval)) {
        err <- 1
        smmat.pval <- burden.pval
    }
    return(list(pval_burden = burden.pval, pval_theta = theta.pval, pval_SMMAT = smmat.pval, err = err, pval_theta.method = out$pval.method))
}


.regular <- function(Q, V, ncolG) {
    if(ncolG == 1){
        pv <- list(pval = pchisq(as.numeric(Q/V), df=1, lower.tail=FALSE), method = "integration")
        
    }else{
        lambda <- eigen(V, only.values = TRUE, symmetric=TRUE)$values
        # lambda <- lambda[lambda > 0] 
        pv <- .pchisqsum(x = Q, df = rep(1, length(lambda)), a = lambda)
        # pv <- tryCatch({
        #             list(pval = .pchisqsum(x = Q, df = rep(1, length(lambda)), a = lambda, method = "integration"),
        #                  method = "integration")
        #         }, warning = function(w){
        #             list(pval = pchisqsum(x = Q, df = rep(1, length(lambda)), a = lambda, method = "saddlepoint"),
        #                  method = "saddlepoint")
        #         }, error = function(e){ 
        #             list(pval = NA_real_, 
        #                  method = NA_character_)
        #         })
    }
    pv$err <- ifelse(is.na(pv$pval), 1, 0)
    return(pv)
}

.fastH <- function(Q, V, neig) {
    pv <- list(pval = NA_real_, method = NA_character_)
    pval.try = 0
    while(is.na(pv$pval) & pval.try < 10){
        pv <- tryCatch( pchisqsum_ssvd(x = Q, M = as.matrix(V), n = neig, p = 10, q = 1),
                       error = function(e){ list(pval = NA_real_, method = "error") } )
        pval.try <- pval.try + 1
    }
    pv$method <- paste0('ssvd_', pv$method)
    if(is.na(pv$pval)){
        err <- 1
    }else if(pval.try > 1){
        err <- 2
    }else{
        err <- 0
    }
    pv[["err"]] <- err
    return(pv)
}

.fastG <- function(Q, G, neig, ntrace) {
    pv <- list(pval = NA_real_, method = NA_character_)
    pval.try = 0
    while(is.na(pv$pval) & pval.try < 10){
        pv <- tryCatch( pchisqsum_rsvd(x = Q, M = as.matrix(G), n = neig, p = 10, q = 3, tr2.sample.size = ntrace), 
                       error = function(e){ list(pval = NA_real_, method = "error") }   )
        pval.try <- pval.try + 1
    }
    pv$method <- paste0('rsvd_', pv$method)
    if(is.na(pv$pval)){
        err <- 1
    }else if(pval.try > 1){
        err <- 2
    }else{
        err <- 0
    }
    pv[["err"]] <- err
    return(pv)
}

.calcPvalVCTest <- function(Q, G, neig, ntrace, verbose){
    if(!requireNamespace("survey")) stop("package 'survey' must be installed to calculate p-values for SKAT or SMMAT")
    if(!requireNamespace("CompQuadForm")) stop("package 'CompQuadForm' must be installed to calculate p-values for SKAT or SMMAT")

    ncolG <- ncol(G) # number of snps
    nrowG <- nrow(G) # number of samples
    if (verbose) message('nsamp = ', nrowG, '; nsnp = ', ncolG)

    if(min(ncolG, nrowG) < 6000 + 20*neig){
        if(ncolG <= nrowG){
            V <- crossprod(G) # WGPGW
        }else{
            V <- tcrossprod(G) # same eigenspace but smaller matrix
        }
        if(mean(abs(V)) < sqrt(.Machine$double.eps)){
            return(list(pval = NA_real_, pval.method = NA_character_, err = 1))
        }

        if(min(ncolG, nrowG) < 2*neig){
            # use "regular" method
            pv <- .regular(Q, V, ncolG)

        }else{
            # use "fast H" method
            if (verbose) message("using method fast_H")
            pv <- .fastH(Q, V, neig)
        }

    }else{
        # use "fast G" method
        if (verbose) message("using method fast_G")
        pv <- .fastG(Q, G, neig, ntrace)
    }

    return(list(pval = pv$pval, pval.method = pv$method, err = pv$err))
}

.pchisqsum <- function(x, df, a){

    ## check for bad.df
    ## can happen with randomised trace estimator if most remaining singular values are very small
    ## leads to unreliable p-values
    if(any(df < 1)){
        stop("Negative/fractional df")
    }

    df<-round(df)
 
    ## try integration
    f <- suppressWarnings(CompQuadForm::davies(x, a, df, acc = 1e-9))
    if((f$ifault > 0) | (f$Qq < 1e3*.Machine$double.eps) | (f$Qq > 1)){
        ## try saddlepoint
        pval <- survey:::saddle(x, rep(a, df))
        method <- "saddlepoint"
    }else{
        pval <- f$Qq
        method <- "integration"
    }

    return(list(pval = pval, method = method))
}

# .calcPval <- function(Q, lambda, pval.method) {
#     if(!requireNamespace("survey")) stop("package 'survey' must be installed to calculate p-values for SKAT")
#     if(!requireNamespace("CompQuadForm")) stop("package 'CompQuadForm' must be installed to calculate p-values for SKAT")
    
#     err <- 0
#     if(pval.method == "kuonen"){
#         pval <- survey:::saddle(x = Q, lambda = lambda)
#         err <- ifelse(is.na(pval), 1, 0)

#     }else if(pval.method == "davies"){
#         tmp <- suppressWarnings(CompQuadForm::davies(q = Q, lambda = lambda, acc = 1e-06))
#         pval <- tmp$Qq
#         if((tmp$ifault > 0) | (pval <= 0) | (pval >= 1)) {
#             pval <- survey:::saddle(x = Q, lambda = lambda)
#         }
#         err <- ifelse(is.na(pval), 1, 0)

#     }else if(pval.method == "liu"){
#         pval <- CompQuadForm::liu(q = Q, lambda = lambda)
#         err <- 0
#     }
    
#     if(err > 0){
#         pval <- CompQuadForm::liu(q = Q, lambda = lambda)
#     }

#     return(c(pval=pval, err=err))
# }


## new function just for SKAT-O
.testVariantSetSKATO <- function(nullmod, G, weights, rho = 0){
#                                 # return.scores = FALSE, return.scores.cov = FALSE){
    # scores
    scores <- as.vector(crossprod(G, nullmod$resid))

    # adjust G for covariates and random effects
    geno.adj <- calcGtilde(nullmod, G)

    # covariance of scores
    V <- crossprod(geno.adj)

    # vectors to hold output
    nrho <- length(rho)
    out.Q <- rep(NA, nrho); names(out.Q) <- paste("Q", rho, sep="_")
    out.pval <- rep(NA, nrho); names(out.pval) <- paste("pval", rho, sep="_")
    out.err <- rep(NA, nrho); names(out.err) <- paste("err", rho, sep="_")
    lambdas <- vector("list", nrho)

    # get p-value for each choice of rho
    for(i in 1:nrho){
        if(rho[i] == 0){
            # Variance Component Test
            Q <- sum((weights*scores)^2) # sum[(w*scores)^2]  # for some reason SKAT_emmaX divides this by 2
            distMat <- weights*t(weights*V)  # (weights) V (weights) = (weights) X' P X (weights)

        }else if(rho[i] == 1){
            # Burden Test
            Q <- sum(weights*scores)^2 # (sum[w*scores])^2  # for some reason SKAT_emmaX divides this by 2
            distMat <- crossprod(weights,crossprod(V, weights)) # weights^T V weights

        }else if(rho[i] > 0 & rho[i] < 1){
            rhoMat <- matrix(rho[i], nrow=length(scores), ncol=length(scores)); diag(rhoMat) <- 1
            cholRhoMat <- t(chol(rhoMat, pivot=TRUE))
            Q <- crossprod(crossprod(weights*cholRhoMat,scores)) # scores' (weights) (rhoMat) (weights) scores
            distMat <- crossprod(cholRhoMat, crossprod(weights*t(weights*V), cholRhoMat)) # (cholRhoMat) (weights) X' P X (weights) (cholRhoMat)
        }

        # p value calculation
        if(length(scores) == 1){
            lambdas[[i]] <- as.numeric(distMat)
            pval <- pchisq(as.numeric(Q/distMat), df=1, lower.tail=FALSE)
            err <- ifelse(is.na(pval), 1, 0)
        }else{
            lambda <- eigen(distMat, only.values = TRUE, symmetric=TRUE)$values
            # lambda <- lambda[lambda > 0]
            lambdas[[i]] <- lambda
            pv <- .pchisqsum(x = Q, df = rep(1, length(lambda)), a = lambda)
            pval <- pv$pval
            err <- ifelse(is.na(pval), 1, 0)
        }

        # update results
        out.Q[i] <- Q
        out.pval[i] <- pval
        out.err[i] <- err
    }
    out <- as.list(c(out.Q, out.pval, out.err))

    # get SKAT-O p-value
    if(length(scores) == 1){
        # pvalue is the same for all rhos
        out2 <- list(min.pval=out.pval[1], opt.rho=NA, pval_SKATO=out.pval[1])

    }else{
        # find the minimum p-value
        minp <- min(out.pval)
        out2 <- list(min.pval=minp, opt.rho=rho[which.min(out.pval)])

        # get qmin(rho); i.e. the (1-minp)th percentile of dist of each Q
        qmin <- rep(NA, nrho)
        for(i in 1:nrho){
            qmin[i] <- skatO_qchisqsum(minp, lambdas[[i]])
        }

        # calculate other terms
        Z <- t(t(geno.adj)*weights)
        zbar <- rowMeans(Z)
        zbarTzbar <- sum(zbar^2)
        M <- tcrossprod(zbar)/zbarTzbar
        ZtImMZ <- crossprod(Z, crossprod(diag(nrow(M)) - M, Z))
        lambda.k <- eigen(ZtImMZ, symmetric = TRUE, only.values = TRUE)
        lambda.k <- lambda.k$values[lambda.k$values > 0]
        mua <- sum(lambda.k)
        sum.lambda.sq <- sum(lambda.k^2)
        sig2a <- 2*sum.lambda.sq
        trMatrix <- crossprod(crossprod(Z,crossprod(M,Z)),ZtImMZ)
        sig2xi <- 4*sum(diag(trMatrix))
        kera <- sum(lambda.k^4)/sum.lambda.sq^2 * 12
        ldf <- 12/kera

        # calculate tau(rho)
        tau <- ncol(Z)^2*rho*zbarTzbar + (1-rho)*sum(crossprod(zbar, Z)^2)/zbarTzbar

        # find min{(qmin(rho)-rho*chisq_1)/(1-rho)} with integration                            
        otherParams <- c(mu = mua, degf = ldf, varia = sig2a+sig2xi)
        # integrate
        re <- tryCatch({
            integrate(integrateFxn, lower = 0, upper = 40, subdivisions = 2000, qmin = qmin, otherParams = otherParams, tau = tau, rho = rho, abs.tol = 10^-25)
        }, error=function(e) NA)
        out2[["pval_SKATO"]] <- 1-re[[1]]
    }
    
    # update results
    out <- c(out, out2)

    # return results
    return(out) 
}


# function to calculate q_min value
# basically a qchisqsum() function that takes the quantile/percentile and the lambda values
# matches the first 2 moments and the kurtosis
# based upon liu et al (2009) paper
skatO_qchisqsum <- function(p, lambdas){
    mu <- sum(lambdas)
    sum.lambda.sq <- sum(lambdas^2)

    s1 <- sum(lambdas^3)/(sum.lambda.sq^(3/2))
    s2 <- sum(lambdas^4)/(sum.lambda.sq^2)
    if(s1^2 > s2){
    	a <- 1/(s1-sqrt(s1^2-s2))
    	d <- s1*a^3 - a^2
    	l <- a^2 - 2*d
    }else{ # s1^2 <= s2
        l <- 1/s2 # in liu et al, this is l=1/s1^2; matches kurtosis instead of skewness to improve tail prob estimates
    }  	
    
    qmin <- qchisq(1-p, df=l)
    pval <- (qmin - l)/sqrt(2*l) * sqrt(2*sum.lambda.sq) + mu
    
    return(pval)
}


## function to integrate; the first term of the optimal integrand
# it's a non-central sum of weighted chi-squares
integrateFxn <- function(x, qmin, otherParams, tau, rho){
    n.r <- length(rho)
    n.x <- length(x)
    
    t1 <- tau %x% t(x)
    tmp <- (qmin - t1)/(1-rho)
    minval <- apply(tmp,2,min)

    degf <- otherParams["degf"]
    mu <- otherParams["mu"]
    varia <- otherParams["varia"]

    temp.q<-(minval - mu)/sqrt(varia)*sqrt(2*degf) + degf

    re<-pchisq(temp.q ,df=degf) * dchisq(x,df=1)

    return(re)
}

Any scripts or data that you put into this service are public.

GENESIS documentation built on Jan. 30, 2021, 2:01 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

GENESIS
GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness

R/variantSetTests.R
In GENESIS: GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness

Defines functions integrateFxn skatO_qchisqsum .testVariantSetSKATO .pchisqsum .calcPvalVCTest .fastG .fastH .regular .testVariantSetSMMAT .testVariantSetSKAT .testVariantSetBurden testVariantSet

Try the GENESIS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GENESIS GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness

R/variantSetTests.R In GENESIS: GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness

Defines functions integrateFxn skatO_qchisqsum .testVariantSetSKATO .pchisqsum .calcPvalVCTest .fastG .fastH .regular .testVariantSetSMMAT .testVariantSetSKAT .testVariantSetBurden testVariantSet

Try the GENESIS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GENESIS
GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness

R/variantSetTests.R
In GENESIS: GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness