R/gGlobalAncova.R

Defines functions gGlobalAncova gGAteststats Tperm.catcat SSred row.orth2d hat.matrix Gsquared devtest getdev

Documented in gGlobalAncova

######################################
### univariate test statistics
######################################

### categorical response 

## calculate deviances

getdev <- function(x, y, weights=rep(1, nobs)){
  fit <- glm.fit(x, y, family=binomial())
  return(deviance(fit))
}


### categorical/ordinal response - general deviance test

devtest <- function(y, D.full, D.red, family=c("binomial", "multinomial", "propodds")){
  requireNamespace("VGAM")
  OK <- !is.na(y)
  y <- y[OK]
  D.full <- D.full[OK,,drop=FALSE]
  D.red <- D.red[OK,,drop=FALSE]
  
  # binomial
  family <- match.arg(family)
  if(family == "binomial"){
    dev.full <- getdev(x=D.full, y=y)
    dev.red  <- getdev(x=D.red, y=y)
    statistic <- dev.red - dev.full
  }
  
  # multinomial or ordinal
  else{
    # get model formulas
    int <- "(Intercept)" %in% colnames(D.full)
    # with intercept
    if(int){
      covars.full <- paste(colnames(D.full)[-1], collapse="+")
      covars.red  <- paste(c("1", colnames(D.red)[-1]), collapse="+")
    }
    # without
    else{
      covars.full <- paste(c("-1", colnames(D.full)), collapse="+")
      covars.red  <- paste(c("-1", colnames(D.red)), collapse="+")
    }
    
    if("y" %in% covars.full)
      covars.full[covars.full == "y"] <- "y.1"
    
    data <- data.frame(y=y, D.full)
    form.full  <- formula(paste("y ~", covars.full))
    form.red   <- formula(paste("y ~", covars.red))
    model.full <- VGAM::vglm(form.full, family=family, data=data)
    model.red  <- VGAM::vglm(form.red, family=family, data=data)
    statistic  <- VGAM::deviance(model.red) - deviance(model.full)
    
    # multinomial: get corresponding statistic for chi^2 distribution w. G-1 df (with G = number of categories of tested variable)
    # (relevant for globaltest w. mixed data, but also in case not all y's have observations in all categories...)
    if(family == "multinomial"){
      K <- length(na.omit(unique(y)))
      df <- model.red@df.residual - model.full@df.residual
      p <- pchisq(statistic, df=df)
      statistic <- qchisq(p, df=df / (K-1))
    }
  }
  
  return(statistic)
}  



## equivalent alternative when both regressor and dependent variables are categorical: G^2 statistic
# + transformation to chi^2 distribution w. G-1 df (with G = number of categories of tested variable)

Gsquared <- function(y, D.full, D.red){
  OK <- !is.na(y)
  y <- factor(y[OK])
  
  # group variable
  testcol <- setdiff(colnames(D.full), colnames(D.red))
  # ..binary
  if(length(testcol) == 1)
    x <- D.full[OK, testcol]
  # ..multicategorical
  else
    x <- colSums(t(D.full[OK, testcol]) * 1:length(testcol))
  
  # contingency table  
  tab <- table(y, x)
  K <- nrow(tab)
  L <- ncol(tab)
  
  # if there are 0s in the table, use glm (otherwise G2 will be NaN)
  if(any(tab == 0)){
    if(K == 2){
      test <- glm(y ~ x, family="binomial")
      G2 <- test$null.deviance - test$deviance
    }
    
    # in case x is multinomial
    else if(K > 2){
      requireNamespace("VGAM")
      model.full <- VGAM::vglm(y ~ x, family=multinomial)
      model.red <- VGAM::vglm(y ~ 1, family=multinomial)
      G2 <- VGAM::deviance(model.red) - deviance(model.full)
    }
  }
  
  # else, calculate it by standard formula
  else{
    n <- sum(tab)
    sr <- rowSums(tab)
    sc <- colSums(tab)
    E <- outer(sr, sc, "*")/n
    G2 <- 2 * sum(tab * log(tab / E))
  }
  
  # transform to chi^2 distribution w. G-1 df (with G = number of categories of tested variable)
  # (relevant for globaltest w. mixed data, but also in case not all y's have observations in all categories...)
  if(K > 2){
    df <- (K-1) * (L-1)
    p <- pchisq(G2, df=df)
    G2 <- qchisq(p, df=df / (K-1))
  }
  
  return(G2)
}



### continuous response - reduction in sum of squares (GlobalAncova nominator)

hat.matrix <- function(x)
  x %*% solve(t(x) %*% x) %*% t(x)

row.orth2d <- function(xx, D)
  xx %*% (diag(dim(D)[1]) - hat.matrix(D))


SSred <- function(y, D.full, D.red){
  OK <- !is.na(y)
  y <- y[OK]
  D.full <- D.full[OK,,drop=FALSE]
  D.red <- D.red[OK,,drop=FALSE]
  
  R.full <- row.orth2d(y, D.full)
  R.red  <- row.orth2d(y, D.red) 
  SS.red <- sum(R.red * R.red)
  SS.full <- sum(R.full * R.full)
  
  statistic <- SS.red - SS.full
  return(statistic)
}



#########################################
### global permutation test
#########################################

## all permutation statistics for all variables - when regressor and outcome are both categorical (for general kxl tables)
# + transformation to chi^2 distribution w. G-1 df (with G = number of categories of tested variable)
Tperm.catcat <- function(x, Perms){
  # x: data.frame (or matrix) of categorical variables
  # Perms: matrix of permutations of group variable
  
  # code x and Perms numeric from 0 to k-1/l-1
  x     <- t(apply(x, 2, function(b) as.numeric(factor(b))))
  Perms <- apply(Perms, 2, function(b) as.numeric(factor(b)))
  
  # total counts
  n <- ncol(x)
  
  # numbers of categories -1
  K <- apply(x, 1, max)  # outcome variables could have different numbers of categories
  k <- max(K)     
  l <- max(Perms) # just one group variable
  
  # marginal counts for group variable
  n.cols <- table(Perms[,1])
  
  G2 <- matrix(0, nrow=nrow(x), ncol=ncol(Perms))
  for(i in 1:k){
    xi <- x == i
    
    # marginal count for outcome variable, category i
    n.rows <- rowSums(xi)
    
    # remove variables that do not have category i
    zero <- n.rows == 0
    xi <- xi[!zero,]
    
    for(j in 1:l){
      Permsj <- Perms == j
      
      # counts for crosstable entry i.j for all variables, for all permutations
      n.ij <- xi %*% Permsj
      
      # expected counts
      e.ij <- n.cols[j] * n.rows[!zero] / n
      
      # G^2 summand
      G2[!zero,] <- G2[!zero,] + n.ij * log(n.ij / e.ij) 
    }
  }
  
  G2 <- 2 * G2
  
  # set permutations corresponding to crosstables with 0 entries (for any variable) to NA
  na <- apply(G2, 2, function(x) any(is.na(x)))
  #G2 <- G2[, !na, drop=FALSE]
  G2[, na] <- NA
  
  # transform to chi^2 distribution w. G-1 df (with G = number of categories of tested variable)
  # (relevant for globaltest w. mixed data, but also in case not all x's have observations in all categories...)
  if(k > 2){
    for(i in 1:nrow(x)){
      if(K[i] > 2){
        df <- (K[i]-1) * (l-1)
        p <- pchisq(G2[i,], df=df)
        G2[i,] <- qchisq(p, df=df / (K[i]-1))
      }
    }
  }
  
  rownames(G2) <- rownames(x)
  return(G2)
}



## univariate observed and permutation test statistics, also for mixed data

gGAteststats <- function(data, formula.full, formula.red=~1, model.dat, perm=10000){
  # data: data.frame (columns=variables)
  # formula.full, formula.red: models to be compared
  # model.dat: data.frame of regressors, containing variables specified in formula.full and formula.red
  # perm: number of permutations; if set to 0, only observed statistics are computed
  
  if(is.matrix(data)){
    data0 <- data
    if(is.null(colnames(data0)))
      colnames(data0) <- 1:ncol(data0)
    data <- data.frame(data)
    names(data) <- colnames(data0)  # prevent adding 'X' at variable names in case input data had no colnames
  }
  
  # remove missing values in regressors
  design.terms <- as.character(attr(terms(formula.full), "variables"))[-1]
  test.dat <- model.dat[,design.terms]
  OK <- complete.cases(test.dat)
  data <- data[OK,,drop=FALSE]
  model.dat <- model.dat[OK,,drop=FALSE]
  test.dat  <- model.dat[,design.terms]
  n <- nrow(model.dat)
  
  # remove outcome variables with only one observed level
  nlevels <- apply(data, 2, function(x) length(na.omit(unique(x))))
  if(any(nlevels < 2)){
    w <- which(nlevels < 2)
    data <- data[,-w]
    warning(paste(length(w), "variables removed with only one level"))
  }
  
  # model matrices
  D.full <- model.matrix(formula.full, data=model.dat)
  D.red  <- model.matrix(formula.red, data=model.dat)
  
  # model columns to be tested
  testcol <- which(!(colnames(D.full) %in% colnames(D.red)))
  
  # permutations
  if(perm > 0){
    nperm <- .nPerms(D.full, model.dat, formula.full)
    faccounts <- nperm$counts
    nperm <- nperm$nPerms
  
    # all permutations if possible
    if(nperm < perm){
      print(paste("enumerating all", nperm, "permutations"))
      if(is.null(faccounts))  # design with continuous covariates
        Perms <- .allperms(1:n)
      else
        Perms <- .allpermsG(faccounts, faccounts) # factorial design
      Perms <- apply(Perms, 2, order)  # get possible orderings
    }
  
    # random permutations otherwise
    else
      Perms <- replicate(perm, sample(n))
  }
  
  # group variables according to data type
  dc <- sapply(data, data.class)
  
  comments <- length(unique(dc)) > 1 & perm > 0
  
  # get binary variables
  bin <- sapply(data, function(x) length(na.omit(unique(x))) == 2)
  dc[bin] <- "binary"
  
  Tobs.bin <- Tobs.cat  <- Tobs.ord  <- Tobs.quant  <- NULL
  if(perm > 0)
    Tperm.bin <- Tperm.cat <- Tperm.ord <- Tperm.quant <- NULL
  
  ## continuous variables
  if(any(dc == "numeric")){
    if(comments)
      print("testing quantitative variables")
    xx <- data[,dc == "numeric", drop=FALSE]
    
    # observed univariate statistics
    Tobs.quant <- sapply(xx, SSred, D.full, D.red)
    
    # univariate permutation statistics
    if(perm > 0)
      Tperm.quant <- apply(Perms, 2, function(x) 
        sapply(xx, SSred, D.full=cbind(D.full[,-testcol, drop=FALSE], D.full[x, testcol, drop=FALSE]), D.red=D.red))
  }
  
  ## ordinal variables
  if(any(dc == "ordered")){
    if(comments)
      print("testing ordinal variables")
    xx <- data[,dc == "ordered", drop=FALSE]
    
    # observed univariate statistics
    Tobs.ord <- sapply(xx, devtest, D.full, D.red, family="propodds")
    
    # univariate permutation statistics
    if(perm > 0)
      Tperm.ord <- apply(Perms, 2, function(x) 
        sapply(xx, devtest, D.full=cbind(D.full[,-testcol, drop=FALSE], D.full[x, testcol, drop=FALSE]), D.red=D.red, family="propodds"))
  }
  
  # check if tested term is single categorical variable
  cat.group <- length(design.terms) == 1 & (data.class(test.dat) != "numeric" | length(unique(test.dat)) == 2)
  
  ## binary variables
  if(any(dc == "binary")){
    if(comments)
      print("testing binary variables")
    xx <- data[,dc == "binary", drop=FALSE]
    
    # if tested term is single categorical variable, use G^2 statistic
    if(cat.group){
      # observed univariate statistics
      Tobs.bin <- sapply(xx, Gsquared, D.full, D.red)
      
      # univariate permutation statistics
      if(length(testcol) == 1)
        group <- D.full[,testcol]
      else
        group <- colSums(t(D.full[,testcol]) * 1:length(testcol))
      
      if(perm > 0){
        Perms2 <- apply(Perms, 2, function(x) group[x])
        Tperm.bin <- Tperm.catcat(xx, Perms2)
      }
    }
    
    # in case of covariables etc., use deviance test
    else{
      Tobs.bin <- sapply(xx, devtest, D.full, D.red, family="binomial")
      if(perm > 0)
        Tperm.bin <- apply(Perms, 2, function(x) 
          sapply(xx, devtest, D.full=cbind(D.full[,-testcol, drop=FALSE], D.full[x, testcol, drop=FALSE]), D.red=D.red, family="binomial"))
    }
  }
  
  ## multi-categorical variables
  if(any(dc == "factor")){
    if(comments)
      print("testing multi-categorical variables")
    xx <- data[,dc == "factor", drop=FALSE]
    
    # if tested term is single categorical variable, use G^2 statistic
    if(cat.group){
      # observed univariate statistics
      Tobs.cat <- sapply(xx, Gsquared, D.full, D.red)
      
      # univariate permutation statistics
      if(length(testcol) == 1)
        group <- D.full[,testcol]
      else
        group <- colSums(t(D.full[,testcol]) * 1:length(testcol))
      
      if(perm > 0){
        Perms2 <- apply(Perms, 2, function(x) group[x])
        Tperm.cat <- Tperm.catcat(xx, Perms2)
      }
    }
    
    # in case of covariables etc., use deviance test
    else{
      Tobs.bin <- sapply(xx, devtest, D.full, D.red, family="multinomial")
      if(perm > 0)
        Tperm.bin <- apply(Perms, 2, function(x) 
          sapply(xx, devtest, D.full=cbind(D.full[,-testcol, drop=FALSE], D.full[x, testcol, drop=FALSE]), D.red=D.red, family="multinomial"))
    }
  }
  
  # put together
  Tobs <- c(Tobs.quant, Tobs.ord, Tobs.bin, Tobs.cat)
  if(perm > 0){
    Tperm <- rbind(Tperm.quant, Tperm.ord, Tperm.bin, Tperm.cat)
    rownames(Tperm) <- sub("Tperm.", "", rownames(Tperm))  # in case there is only 1 variable for a data type, 'Tperm.' will be added to the variable name...
  }
  
  # bring in original order
  Tobs <- Tobs[names(data)]
  if(perm > 0)
    Tperm <- Tperm[names(data),]
  
  if(perm > 0)
    return(list(Tobs=Tobs, Tperm=Tperm))
  else
    return(Tobs)
}



## 'generalized GlobalAncova' group test, also for mixed data

gGlobalAncova <- function(data, formula.full, formula.red=~1, model.dat, Sets, sumstat=sum, perm=10000){
  # data: data.frame of variables to be tested in sets (columns=variables)
  # formula.full, formula.red: models to be compared
  # model.dat: data.frame of regressors, containing variables specified in formula.full and formula.red
  # Sets: vector of variable names or indices or list of those, defining sets of variables
  # sumstat: function for summarizing univariate test statistics
  # perm: number of permutations
  
  if(!missing(Sets)){
    vars <- unique(unlist(Sets))
    if(!is.character(vars)){
      vars <- colnames(data)[vars]
      Sets <- lapply(Sets, function(x) colnames(data)[x])
    }
    data <- data[,vars]
  }
  else
    Sets <- colnames(data)
  
  if(!is.list(Sets))
    Sets <- list(Sets)
  
  unistats <- gGAteststats(data=data, formula.full=formula.full, formula.red=formula.red, model.dat=model.dat, perm=perm)
  Tobs  <- unistats$Tobs
  Tperm <- unistats$Tperm
  
  # observed global statistics 
  Sobs <- sapply(Sets, function(x) sumstat(Tobs[x]))
  
  # global permutation statistics 
  Sperm <- apply(Tperm, 2, function(x) sapply(Sets, function(y) sumstat(x[y])))
  if(is.null(dim(Sperm)))
    Sperm <- matrix(Sperm, ncol=ncol(Tperm))
  
  # mid p-values (see thesis of Monika Jelizarow, p. 44)
  p <- rowMeans(Sperm > Sobs, na.rm=TRUE) + 0.5 * rowMeans(Sperm == Sobs, na.rm=TRUE)
  
  return(data.frame(p.value=p, Statistic=Sobs))
}

Try the GlobalAncova package in your browser

Any scripts or data that you put into this service are public.

GlobalAncova documentation built on Nov. 8, 2020, 8:10 p.m.