#' Aggregate the index of a `tsibble`
#'
#' @description
#'
#' `r lifecycle::badge("experimental")`
#'
#' `aggregate_index()` allows you to aggregate the index of a
#' [`tsibble`][tsibble::tsibble()] object by applying a specific function to its
#' measured variables.
#'
#' @details
#'
#' `aggregate_index()` was created to easily regularize
#' [`tsibble`][tsibble::tsibble()] objects. If you need more control while doing
#' this operation, check the [`index_by()`][tsibble::index_by()] function
#' provided by the [`tsibble`](https://tsibble.tidyverts.org/) package.
#'
#' ## Default function
#'
#' If `fun == NULL`, `aggregate_index()` will use the following function to
#' transform each measured variable:
#'
#' ```
#' function(x) {
#' checkmate::assert_atomic_vector(x)
#'
#' if (is.numeric(x) && !all(nchar(x) == 1, na.rm = TRUE)) {
#' mean(x, na.rm = TRUE)
#' } else {
#' y <- x[which(!is.na(x))]
#' unique <- unique(y)
#' unique[which.max(tabulate(match(y, unique)))]
#' }
#' }
#' ```
#' This function average values for numeric variables and assigning the most
#' frequent value (mode) for single integer or other type of variables. If no
#' mode can be found, the function will return the first value of `x`.
#'
#' @param data A [`tsibble`][tsibble::tsibble()] object.
#' @param unit A string indicating at which time unit the index must be
#' aggregated. Valid values are: `“seconds”`, `“minutes”`, `“hours”`,
#' `“days”`, `“weeks”`, `“months”`, `“quarters”`, and `“years”`) (default:
#' `"minutes"`).
#' @param fun (optional) The `function` to be applied to each measure variable
#' of `data`. If `NULL`, `aggregate_index()` will apply its default function
#' (see the Details section to learn more) (default: `NULL`).
#' @param week_start (optional) an integer number indicating the day on which
#' the week starts (`1` for Monday and `7` for `Sunday`). This is only used
#' when `unit == "weeks` (default: `1`).
#'
#' @return A [`tsibble`][tsibble::tsibble()] object.
#'
#' @family utility functions
#' @export
#'
#' @examples
#' acttrust
#'
#' aggregate_index(acttrust, unit = "hour")
#'
#' aggregate_index(acttrust, unit = "day")
#'
#' aggregate_index(acttrust, unit = "week")
#'
#' aggregate_index(acttrust, unit = "month")
#'
#' aggregate_index(acttrust, unit = "quarter")
#'
#' aggregate_index(acttrust, unit = "year")
aggregate_index <- function(data, unit, fun = NULL, week_start = 1) {
unit_choices <- c("second", "minute", "hour", "day", "week", "month",
"quarter", "year")
unit_choices <- append(unit_choices, paste0(unit_choices, "s"))
assert_tsibble(data, min.rows = 2, min.cols = 2)
assert_index_class(data, c("Date", "POSIXt"))
assert_epoch_compatibility(data, unit)
checkmate::assert_choice(unit, unit_choices)
checkmate::assert_function(fun, null.ok = TRUE)
checkmate::assert_choice(week_start, c(1, 7))
# R CMD Check variable bindings fix (see: https://bit.ly/3z24hbU)
# nolint start: object_usage_linter, object_name_linter.
. <- .iNdEx_PlAcEhOlDeR <- NULL
# nolint end
index_var <- tsibble::index_var(data)
index <- data[[index_var]]
# Workaround to avoid problems with dplyr::select()
data <- data %>%
dplyr::rename(.iNdEx_PlAcEhOlDeR = tsibble::index_var(data))
if (is.null(fun)) fun <- aggregate_index_default_fun
if (grepl("^day*", unit)) {
group <- lubridate::floor_date(index, "days") %>% as.Date()
} else if (grepl("^week*", unit)) {
group <- tsibble::yearweek(index, week_start = week_start)
} else if (grepl("^month*", unit)) {
group <- tsibble::yearmonth(index)
} else if (grepl("^quarter*", unit)) {
group <- tsibble::yearquarter(index)
} else if (grepl("^year*", unit)) {
group <- lubridate::year(index)
} else {
group <- lubridate::floor_date(index, unit)
}
data %>%
tsibble::index_by(.InDeX_pLaCeHoLdEr2 = group) %>%
dplyr::summarise(dplyr::across(dplyr::everything(), fun)) %>%
dplyr::select(-.iNdEx_PlAcEhOlDeR) %>%
dplyr::rename_with(~ gsub("^.InDeX_pLaCeHoLdEr2$", index_var, .x))
}
aggregate_index_default_fun <- function(x) {
checkmate::assert_atomic_vector(x)
if (is.numeric(x) && !all(nchar(x) == 1, na.rm = TRUE)) {
mean(x, na.rm = TRUE)
} else {
# Return value that has highest number of occurrences (mode)
y <- x[which(!is.na(x))]
unique <- unique(y)
unique[which.max(tabulate(match(y, unique)))]
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.