# R/test_predictions_ggeffects.R

.test_predictions_ggeffects <- function(object,
                                        by = NULL,
                                        test = "pairwise",
                                        equivalence = NULL,
                                        scale = "response",
                                        p_adjust = NULL,
                                        df = NULL,
                                        ci_level = 0.95,
                                        collapse_levels = FALSE,
                                        verbose = TRUE,
                                        ...) {
  # sanity check for certain arguments that are not (yet) supported
  if (!is.null(equivalence)) {
    insight::format_error("Equivalence testing is currently not supported for `engine = \"ggeffects\"`.")
  }
  if (!is.null(scale) && scale != "response") {
    insight::format_error("Only `scale = \"response\"` is supported for `engine = \"ggeffects\"`.")
  }

  # check test-argument
  if (is.null(test) || test %in% c("trend", "slope")) {
    test <- "contrast"
  }
  test <- insight::validate_argument(test, c("contrast", "pairwise", "interaction"))

  # we convert the ggeffects object to a data frame, using the original
  # names of the focal terms as column names
  predictions <- datagrid <- as.data.frame(object, terms_to_colnames = TRUE)

  # some attributes we need
  focal_terms <- attributes(object)$terms
  original_terms <- attributes(object)$original.terms
  at_list <- attributes(object)$at.list
  type <- attributes(object)$type
  margin <- attributes(object)$margin
  std_errors <- attributes(object)$standard_error
  dof <- attributes(object)$df
  condition <- attributes(object)$condition
  is_latent <- !is.null(attributes(object)$latent_thresholds)

  # warn for very long at-list
  values_at_lengths <- lengths(at_list)
  if (any(values_at_lengths > 20) && verbose) {
    warning(insight::format_message(
      paste0(
        "The number of levels to compare is very high for the following terms: ",
        toString(names(values_at_lengths)[values_at_lengths > 20]),
        ". This may lead to a large number of pairwise comparisons."
      )
    ), call. = FALSE, immediate. = TRUE)
  }

  # vcov matrix, for adjusting se
  vcov_matrix <- .safe(stats::vcov(object, verbose = FALSE, ...))

  # we now need to get the model object
  object <- .get_model_object(object)
  minfo <- insight::model_info(object)
  pred_type <- "response"

  # set defaults
  if (is.null(df) || is.na(df)) {
    if (is.null(dof)) {
      df <- .get_df(object)
    } else {
      df <- dof
    }
  }
  if (is.null(ci_level) || is.na(ci_level)) {
    ci_level <- 0.95
  }
  crit_factor <- (1 + ci_level) / 2
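  # e.g., ci_level = 0.95 gives crit_factor = 0.975, the upper-tail quantile
  # probability later passed to stats::qt() for two-sided confidence intervals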

  ## TODO: For Bayesian models, we always use the returned standard errors
  # need to check whether scale is always correct

  # for non-Gaussian models, we need to adjust the standard errors
  if (!minfo$is_linear && !minfo$is_bayesian && !is_latent && type != "simulate") {
    # zero-inflated models? If so, we need to find the correct prediction type
    # since we allow predictions / comparisons for the different model parts
    if (minfo$is_zero_inflated) {
      pred_type <- .get_zi_prediction_type(object, type)
      # for zi_prob, we need to set margin to "mean_reference",
      # else we get wrong confidence intervals
      if (type == "zi_prob") {
        margin <- "mean_reference"
      }
    }
    se_from_predictions <- tryCatch(
      {
        .datagrid <- data_grid(object, terms = original_terms, condition = condition)
        # arguments for predict(), to get SE on response scale
        # for non-Gaussian models
        my_args <- list(
          object,
          newdata = .datagrid,
          type = pred_type,
          se.fit = TRUE
        )
        # for mixed models, need to set re.form to NULL or NA
        if (insight::is_mixed_model(object)) {
          if (identical(type, "random") && !identical(margin, "empirical")) {
            my_args$re.form <- NULL
          } else {
            my_args$re.form <- NA
          }
        }
        do.call(stats::predict, my_args)
      },
      error = function(e) {
        e
      }
    )
    # check if everything worked as expected
    if (inherits(se_from_predictions, "error")) {
      insight::format_error(
        "This model (family) is probably not supported. The error that occured was:",
        se_from_predictions$message
      )
    }
    # check if we have standard errors
    if (is.null(se_from_predictions$se.fit)) {
      insight::format_error("Could not extract standard errors from predictions.")
    }
    preds_with_se <- merge(
      predictions,
      cbind(.datagrid, se_prob = se_from_predictions$se.fit),
      sort = FALSE,
      all = TRUE
    )
    predictions$std.error <- preds_with_se$se_prob
  } else {
    # for linear models, we don't need adjustment of standard errors
    vcov_matrix <- NULL
  }

  # check for valid by-variable
  by <- .validate_by_argument(by, predictions)

  # compute contrasts or comparisons
  out <- switch(test,
    contrast = .compute_contrasts(predictions, df),
    pairwise = .compute_comparisons(predictions, df, vcov_matrix, at_list, focal_terms, crit_factor),
    interaction = .compute_interactions(predictions, df, vcov_matrix, at_list, focal_terms, crit_factor)
  )

  # for pairwise comparisons, we may have comparisons within one level when we
  # have multiple focal terms, like "1-1" and "a-b". In this case, the comparison
  # of 1 to 1 ("1-1") is just the contrast for the level "1", so we can
  # collapse that string
  if (isTRUE(collapse_levels)) {
    out <- .collapse_levels(out, predictions, focal_terms, by)
  }

  # filter by-variables?
  if (!is.null(by)) {
    for (by_factor in by) {
      # values in "by" are character vectors, which are saved as "level-level".
      # we now extract the unique values, and filter the data frame
      unique_values <- unique(predictions[[by_factor]])
      by_levels <- paste0(unique_values, "-", unique_values)
      keep_rows <- out[[by_factor]] %in% c(by_levels, unique_values)
      # filter final data frame
      out <- out[keep_rows, , drop = FALSE]
      # finally, replace "level-level" just by "level"
      for (i in seq_along(by_levels)) {
        out[[by_factor]] <- gsub(
          by_levels[i],
          unique_values[i],
          out[[by_factor]],
          fixed = TRUE
        )
      }
    }
    # remove by-terms from focal terms
    focal_terms <- focal_terms[!focal_terms %in% by]
    # re-arrange columns, so it matches output from "emmeans" (by comes after focal)
    out <- datawizard::data_relocate(out, select = focal_terms, before = by)
  }

  # p-value adjustment?
  if (!is.null(p_adjust)) {
    out <- .p_adjust(out, p_adjust, predictions, focal_terms, out$statistic, df, verbose)
  }

  # where possible, restore original data types before arranging the data
  out <- .restore_focal_types(out, focal = c(by, focal_terms), model_data = predictions)
  out <- suppressWarnings(datawizard::data_arrange(out, c(by, focal_terms), safe = TRUE))

  class(out) <- c("ggcomparisons", "data.frame")
  attr(out, "ci_level") <- ci_level
  attr(out, "test") <- test
  attr(out, "p_adjust") <- p_adjust
  attr(out, "df") <- df
  attr(out, "verbose") <- verbose
  attr(out, "scale") <- "response"
  attr(out, "engine") <- "ggeffects"
  attr(out, "by_factor") <- by
  attr(out, "datagrid") <- datagrid
  attr(out, "scale_label") <- .scale_label(minfo, pred_type)
  attr(out, "standard_error") <- out$std.error
  attr(out, "link_inverse") <- .link_inverse(object, ...)
  attr(out, "link_function") <- insight::link_function(object)
  attr(out, "linear_model") <- minfo$is_linear
  attr(out, "estimate_name") <- "Contrast"

  # remove unused variables
  out$std.error <- NULL
  out$statistic <- NULL

  out
}
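
# Usage sketch (illustrative model and data; commented out so nothing runs at
# package load). `predict_response()` and `test_predictions()` are the
# exported ggeffects entry points that dispatch to this engine:
#
#   m <- lm(Sepal.Length ~ Species * Petal.Width, data = iris)
#   pr <- predict_response(m, terms = c("Species", "Petal.Width [0.5, 1.5]"))
#   test_predictions(pr, test = "pairwise", engine = "ggeffects")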


# contrasts ---------------------------------------------------------------
.compute_contrasts <- function(predictions, df) {
  # for contrasts, we simply add test statistics and p-values to the predictions
  out <- predictions
  out$statistic <- out$predicted / out$std.error
  out$p.value <- 2 * stats::pt(abs(out$statistic), df = df, lower.tail = FALSE)
  out
}
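
# Worked example of the t-test used above (toy numbers, not actual output):
#   statistic = predicted / std.error, e.g. 0.4 / 0.1 = 4
#   p.value   = 2 * pt(4, df = 100, lower.tail = FALSE)  # ~ 0.00012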


# pairwise comparisons ----------------------------------------------------
.compute_comparisons <- function(predictions, df, vcov_matrix, at_list, focal_terms, crit_factor) {
  # pairwise comparisons are a bit more complicated, as we need to create
  # pairwise combinations of the levels of the focal terms.

  # since we split at "." later, we need to replace "." in all levels
  # with a unique character combination
  at_list <- lapply(at_list, function(i) {
    gsub(".", "#_#", as.character(i), fixed = TRUE)
  })
  # create pairwise combinations
  level_pairs <- interaction(expand.grid(at_list))
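  # e.g., at_list = list(x = c("a", "b"), y = c("1", "2")) yields the combined
  # levels "a.1", "b.1", "a.2", "b.2"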
  # keeping only the upper triangle of the matrix gives us all unique
  # pairwise combinations, excluding self-comparisons
  M <- matrix(
    1,
    nrow = length(level_pairs),
    ncol = length(level_pairs),
    dimnames = list(level_pairs, level_pairs)
  )
  M[!upper.tri(M)] <- NA
  # table() works fine to create variables of these pairwise combinations
  pairs_data <- stats::na.omit(as.data.frame(as.table(M)))
  pairs_data$Freq <- NULL
  pairs_data <- lapply(pairs_data, as.character)
  # the levels are combined by ".". we need to split them and then create
  # a list of data frames, where each data frame contains the levels of
  # the focal terms as variables
  pairs_data <- lapply(pairs_data, function(i) {
    # split at ".", which is the separator char for levels
    pair <- strsplit(i, ".", fixed = TRUE)
    # since we replaced "." with "#_#" in original levels,
    # we need to replace it back here
    pair <- lapply(pair, gsub, pattern = "#_#", replacement = ".", fixed = TRUE)
    datawizard::data_rotate(as.data.frame(pair))
  })
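  # "pairs_data" is now a list of two data frames (the two sides of each pair),
  # each with one row per comparison and one column per focal term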
  # now we iterate over all pairs and try to find the corresponding predictions
  out <- do.call(rbind, lapply(seq_len(nrow(pairs_data[[1]])), function(i) {
    pos1 <- predictions[[focal_terms[1]]] == pairs_data[[1]][i, 1]
    pos2 <- predictions[[focal_terms[1]]] == pairs_data[[2]][i, 1]

    if (length(focal_terms) > 1) {
      pos1 <- pos1 & predictions[[focal_terms[2]]] == pairs_data[[1]][i, 2]
      pos2 <- pos2 & predictions[[focal_terms[2]]] == pairs_data[[2]][i, 2]
    }
    if (length(focal_terms) > 2) {
      pos1 <- pos1 & predictions[[focal_terms[3]]] == pairs_data[[1]][i, 3]
      pos2 <- pos2 & predictions[[focal_terms[3]]] == pairs_data[[2]][i, 3]
    }
    # once we have found the correct rows for the pairs, we can calculate
    # the contrast. We need the predicted values first
    predicted1 <- predictions$predicted[pos1]
    predicted2 <- predictions$predicted[pos2]
    # we then create labels for the pairs. "result" is a data frame with
    # the labels (of the pairwise contrasts) as columns.
    result <- as.data.frame(do.call(cbind, lapply(seq_along(focal_terms), function(j) {
      paste(pairs_data[[1]][i, j], pairs_data[[2]][i, j], sep = "-")
    })))
    colnames(result) <- focal_terms
    # we then add the contrast and the standard error. for linear models, the
    # SE is sqrt(se1^2 + se2^2).
    result$Contrast <- predicted1 - predicted2
    # sum of squared standard errors
    sum_se_squared <- predictions$std.error[pos1]^2 + predictions$std.error[pos2]^2
    # for non-Gaussian models, we subtract the covariance of the two predictions,
    # but only if the vcov_matrix is not NULL and has the correct dimensions.
    # check for NULL first, because the dimension checks would fail otherwise
    if (is.null(vcov_matrix)) {
      vcov_sub <- 0
    } else {
      correct_row_dims <- nrow(vcov_matrix) > 0 && all(nrow(vcov_matrix) >= which(pos1))
      correct_col_dims <- ncol(vcov_matrix) > 0 && all(ncol(vcov_matrix) >= which(pos2))
      if (correct_row_dims && correct_col_dims) {
        vcov_sub <- vcov_matrix[which(pos1), which(pos2)]^2
      } else {
        vcov_sub <- 0
      }
    }
    # Avoid negative values in sqrt()
    if (vcov_sub >= sum_se_squared) {
      result$std.error <- sqrt(sum_se_squared)
    } else {
      result$std.error <- sqrt(sum_se_squared - vcov_sub)
    }
    result
  }))
  # add CI and p-values
  out$conf.low <- out$Contrast - stats::qt(crit_factor, df = df) * out$std.error
  out$conf.high <- out$Contrast + stats::qt(crit_factor, df = df) * out$std.error
  out$statistic <- out$Contrast / out$std.error
  out$p.value <- 2 * stats::pt(abs(out$statistic), df = df, lower.tail = FALSE)
  out
}
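
# Note on the standard errors above: the SE of a pairwise contrast is
#   sqrt(se1^2 + se2^2 - vcov_sub)
# where vcov_sub is derived from the covariance of the two predictions when a
# usable vcov matrix is available, and 0 otherwise, in which case the formula
# reduces to the familiar sqrt(se1^2 + se2^2) for independent predictions.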


# interaction contrasts  ----------------------------------------------------
.compute_interactions <- function(predictions, df, vcov_matrix, at_list, focal_terms, crit_factor) {
  ## TODO: interaction contrasts currently only work for two focal terms
  if (length(focal_terms) != 2) {
    msg <- "Interaction contrasts currently only work for two focal terms."
    if (length(focal_terms) > 2) {
      cleaned_f3 <- .clean_terms(focal_terms[3])
      s1 <- "pr <- predict_response(\n    model,"
      s2 <- paste0("terms = c(", datawizard::text_concatenate(focal_terms[1:2], enclose = "\"", sep = ", ", last = ", "), "),") # nolint
      s3 <- paste0("condition = c(", cleaned_f3, " = \"", at_list[[cleaned_f3]][1], "\")")
      s4 <- ")"
      msg <- c(
        msg,
        "You can try to fix remaining focal terms to specific values, using the `condition` argument, e.g.:.",
        paste0("\n  ", insight::color_text(s1, "cyan")),
        paste0("  ", insight::color_text(s2, "cyan")),
        paste0("  ", insight::color_text(s3, "cyan")),
        paste0(insight::color_text(s4, "cyan")),
        insight::color_text("test_predictions(pr, engine = \"ggeffects\")", "cyan")
      )
    }
    insight::format_error(msg)
  }

  # create pairwise combinations of first focal term
  level_pairs <- at_list[[1]]
  M <- matrix(
    1,
    nrow = length(level_pairs),
    ncol = length(level_pairs),
    dimnames = list(level_pairs, level_pairs)
  )
  M[!upper.tri(M)] <- NA
  # table() works fine to create variables of these pairwise combinations
  pairs_focal1 <- stats::na.omit(as.data.frame(as.table(M)))
  pairs_focal1$Freq <- NULL

  # create pairwise combinations of second focal term
  level_pairs <- at_list[[2]]
  M <- matrix(
    1,
    nrow = length(level_pairs),
    ncol = length(level_pairs),
    dimnames = list(level_pairs, level_pairs)
  )
  M[!upper.tri(M)] <- NA
  # table() works fine to create variables of these pairwise combinations
  pairs_focal2 <- stats::na.omit(as.data.frame(as.table(M)))
  pairs_focal2$Freq <- NULL

  # now we iterate over all pairs and try to find the corresponding predictions
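  # each interaction contrast is a difference-in-differences: the change
  # across two levels of the first focal term, compared between two levels
  # of the second focal term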
  out <- do.call(rbind, lapply(seq_len(nrow(pairs_focal1)), function(i) {
    # differences between levels of first focal term
    pos1 <- predictions[[focal_terms[1]]] == pairs_focal1[i, 1]
    pos2 <- predictions[[focal_terms[1]]] == pairs_focal1[i, 2]

    do.call(rbind, lapply(seq_len(nrow(pairs_focal2)), function(j) {
      # difference between levels of first focal term, *within* first
      # level of second focal term
      pos_1a <- pos1 & predictions[[focal_terms[2]]] == pairs_focal2[j, 1]
      pos_1b <- pos2 & predictions[[focal_terms[2]]] == pairs_focal2[j, 1]
      # difference between levels of first focal term, *within* second
      # level of second focal term
      pos_2a <- pos1 & predictions[[focal_terms[2]]] == pairs_focal2[j, 2]
      pos_2b <- pos2 & predictions[[focal_terms[2]]] == pairs_focal2[j, 2]
      # once we have found the correct rows for the pairs, we can calculate
      # the contrast. We need the predicted values first
      predicted1 <- predictions$predicted[pos_1a] - predictions$predicted[pos_1b]
      predicted2 <- predictions$predicted[pos_2a] - predictions$predicted[pos_2b]
      # we then create labels for the pairs. "result" is a data frame with
      # the labels (of the pairwise contrasts) as columns.
      result <- data.frame(
        a = paste(pairs_focal1[i, 1], pairs_focal1[i, 2], sep = "-"),
        b = paste(pairs_focal2[j, 1], pairs_focal2[j, 2], sep = " and "),
        stringsAsFactors = FALSE
      )
      colnames(result) <- focal_terms
      # we then add the contrast and the standard error. for linear models,
      # the SE is the square root of the sum of the four squared SEs
      result$Contrast <- predicted1 - predicted2
      sum_se_squared <- sum(
        predictions$std.error[pos_1a]^2, predictions$std.error[pos_1b]^2,
        predictions$std.error[pos_2a]^2, predictions$std.error[pos_2b]^2
      )
      # for non-Gaussian models, we subtract the covariance of the predictions,
      # but only if the vcov_matrix is not NULL and has the correct dimensions.
      # check for NULL first, because the dimension checks would fail otherwise
      if (is.null(vcov_matrix)) {
        vcov_sub <- 0
      } else {
        correct_row_dims <- nrow(vcov_matrix) > 0 && all(nrow(vcov_matrix) >= which(pos_1a)) && all(nrow(vcov_matrix) >= which(pos_2a)) # nolint
        correct_col_dims <- ncol(vcov_matrix) > 0 && all(ncol(vcov_matrix) >= which(pos_1b)) && all(ncol(vcov_matrix) >= which(pos_2b)) # nolint
        if (correct_row_dims && correct_col_dims) {
          vcov_sub <- sum(
            vcov_matrix[which(pos_1a), which(pos_1b)]^2,
            vcov_matrix[which(pos_2a), which(pos_2b)]^2
          )
        } else {
          vcov_sub <- 0
        }
      }
      # Avoid negative values in sqrt()
      if (vcov_sub >= sum_se_squared) {
        result$std.error <- sqrt(sum_se_squared)
      } else {
        result$std.error <- sqrt(sum_se_squared - vcov_sub)
      }
      result
    }))
  }))
  # add CI and p-values
  out$conf.low <- out$Contrast - stats::qt(crit_factor, df = df) * out$std.error
  out$conf.high <- out$Contrast + stats::qt(crit_factor, df = df) * out$std.error
  out$statistic <- out$Contrast / out$std.error
  out$p.value <- 2 * stats::pt(abs(out$statistic), df = df, lower.tail = FALSE)
  out
}
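
# Usage sketch for interaction contrasts (illustrative; requires exactly two
# focal terms, as enforced above):
#   m <- lm(mpg ~ factor(cyl) * factor(am), data = mtcars)
#   pr <- predict_response(m, terms = c("cyl", "am"))
#   test_predictions(pr, test = "interaction", engine = "ggeffects")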