hierinf: Hierarchical Inference

require(testthat)

## random number generator
RNGkind("L'Ecuyer-CMRG")

test_that("cluster_var: expect error", {
  expect_error(cluster_var(x = NULL, d = NULL, method = "average", block = NULL,
                           use = "everything"),
               "Please specify x or d.")

  expect_error(cluster_var(x = matrix(1:4, ncol = 2), d = matrix(1:4, ncol = 2),
                           method = "average", block = NULL, use = "everything"),
               "Please specify only x or d.")

  # expect an error if x has no column names
  expect_error(cluster_var(x = matrix(1:4, ncol = 2), d = NULL, method = "average",
                           block = NULL, use = "everything"),
               "The matrices (or matrix) which are stored in x are required to have column names. If there is no natural naming convention, then one can set them to some integer, say, 1 to p.",
               fixed = TRUE)

  tt <- matrix(c(1:3, NA), ncol = 2)
  colnames(tt) <- c("c1", "c2")
  expect_error(cluster_var(x = tt, d = NULL, method = "average",
                           block = NULL, use = "everything"),
               "The matrix x is required to have no missing values.",
               fixed = TRUE)

  tt1 <- tt2 <- matrix(c(1:4), ncol = 2)
  colnames(tt1) <- c("c1", "c2")
  colnames(tt2) <- c("c1", "c5")
  temp <- attr(suppressWarnings(cluster_var(x = list(tt1, tt2), d = NULL,
                                            method = "average",
                                            block = NULL,
                                            use = "everything")$res.tree),
               "errorMsgs")
  names(temp) <- NULL
  expect_equal(temp,
               "There are NA's in the calculated dissimilarity matrix / distance matrix. The variables cannot be clustered. This might be due to multiple data sets which do not contain all the same variables.",
               fixed = TRUE)

  expect_error(cluster_var(x = NULL, d = matrix(1:4, ncol = 2), method = "average",
                           block = data.frame(c("c1", "c2"), c(1, 1)),
                           use = "everything"),
               "The argument block can only be specified in connection with the input x.")

  expect_error(cluster_var(x = matrix(1:4, ncol = 2), d = NULL, method = "average",
                           block = data.frame(c("c1", "c2"), c(1, 1)),
                           use = "everything"),
               "The matrices (or matrix) which are stored in x are required to have column names. If there is no natural naming convention, then one can set them to some integer, say, 1 to p.",
               fixed  = TRUE)

  tt <- matrix(1:4, ncol = 2)
  colnames(tt) <- c("c1", "c2")
  expect_error(cluster_var(x = tt, d = NULL, method = "average",
                           block = data.frame(c("c1", "c2"), c(1, 1),
                                              stringsAsFactors = TRUE),
                           use = "everything"),
               "The first column of block (column names of x) is required to be of type character.",
               fixed = TRUE)

  expect_error(cluster_var(x = tt, d = NULL, method = "average",
                           block = data.frame(c("c1", "c2"), c(1, 1),
                                              stringsAsFactors = FALSE),
                           use = "everything"),
               "The second column of the input block is required to encode at least two blocks.")

  # error because the distance matrix does not have labels
  expect_error(cluster_var(d = dist(matrix(rnorm(100), ncol = 10))),
               "The distance matrix d is required to have labels. The labels should correspond to the column names of the data set or data sets stored in x. For example, the function dist uses the row names to set the labels.")

  expect_error(cluster_var(d = matrix(rnorm(80), ncol = 8)),
               "The matrix d is required to have the same number of columns and rows.")

  d <- as.matrix(dist(matrix(rnorm(100), ncol = 10)))
  rownames(d) <- colnames(d) <- NULL
  expect_error(cluster_var(d = d),
               "The matrix d is required to have column and row names.")

  d <- matrix(rnorm(100), ncol = 10)
  rownames(d) <- colnames(d) <- letters[1:10]
  expect_error(cluster_var(d = d),
               "The matrix d is required to be symmetric.")

  # NA's in block argument
  set.seed(888)
  tt <- matrix(rnorm(300), ncol = 30)
  colnames(tt) <- paste0("rs", 1:30)
  block <- data.frame(paste0("rs", 1:30), rep(c(1, 2), each = 15),
                      stringsAsFactors = FALSE)
  block[29:30, 2] <- NA
  expect_error(cluster_var(x = tt, d = NULL, block = block),
               "There are missing values in the input block.")

  # The argument block has less variables / column names than the data sets
  require("MASS")
  p <- 20
  n <- 80
  sim.geno1 <- mvrnorm(n = n, mu = rep(0, p), Sigma = toeplitz(0.8^(seq(0, p - 1))) )
  colnames(sim.geno1) <- paste0("rsid", 1:p)
  sim.geno2 <- mvrnorm(n = n, mu = rep(0, p), Sigma = toeplitz(0.8^(seq(0, p - 1))) )
  colnames(sim.geno2) <- paste0("rsid", 1:p)

  expect_error(cluster_var(x = list(sim.geno1, sim.geno2),
                           block = data.frame(paste0("rsid", 1:10),
                                              rep(1:2, each = 5),
                                              stringsAsFactors = FALSE)),
               "There are column name of x which have no corresponding values in the first column of block (column names of x).",
               fixed = TRUE)

  # The column names of x or each element of x (list containing data sets)
  # are required to have unique column names.
  expect_error(cluster_var(x = cbind(sim.geno1, sim.geno1)),
               "Each of the matrices (or matrix) which are stored in x are required to have unique column names.",
               fixed = TRUE)

  expect_error(cluster_var(x = list(cbind(sim.geno1, sim.geno1), sim.geno1)),
               "Each of the matrices (or matrix) which are stored in x are required to have unique column names.",
               fixed = TRUE)
})


test_that("cluster_var: check return object", {
  tt <- matrix(1:4, ncol = 2)
  colnames(tt) <- c("c1", "c2")
  res_simple <- cluster_var(x = tt, d = NULL, method = "average", block = NULL,
                            use = "everything")
  expect_is(res_simple, "hierD")
  expect_type(res_simple, "list")
  expect_is(res_simple$res.tree[[1]], "dendrogram")

  set.seed(888)
  tt <- matrix(rnorm(300), ncol = 30)
  colnames(tt) <- paste0("rs", 1:30)
  res <- cluster_var(x = tt, d = NULL, method = "average",
                     block = data.frame(paste0("rs", 1:30), rep(c(1, 2), each = 15),
                                        stringsAsFactors = FALSE),
                     use = "everything")
  expect_equal(res$block, data.frame(paste0("rs", 1:30), rep(c(1, 2), each = 15),
                                     stringsAsFactors = FALSE))
  expect_equal(length(res$res.tree), 2)
  expect_true(all(labels(res$res.tree[[1]]) %in% paste0("rs", 1:15))) # labels res1 - rs15
  expect_true(all(labels(res$res.tree[[2]]) %in% paste0("rs", 16:30))) # labels rs16 - rs30


  sim.geno <- matrix(rnorm(100), ncol = 10)
  rownames(sim.geno) <- letters[1:10]
  res_d <- cluster_var(d = dist(sim.geno))
  expect_is(res_d$res.tree[[1]], "dendrogram")
  expect_true(all(labels(res_d$res.tree[[1]]) %in% letters[1:10]))
})

test_that("cluster_var: check return object for multiple data sets", {
  set.seed(938)
  tt1 <- matrix(rnorm(40), ncol = 2)
  tt2 <- matrix(rnorm(40), ncol = 2)
  tt3 <- matrix(rnorm(40), ncol = 2)
  colnames(tt1) <- c("c1", "c2")
  colnames(tt2) <- c("c1", "c5")
  colnames(tt3) <- c("c2", "c5")

  # expect error because the argument use = "everything"
  res <- suppressWarnings(cluster_var(x = list(tt1, tt2, tt3), d = NULL,
                                      method = "average",
                                      block = NULL,
                                      use = "everything"))
  tmp <- attr(res$res.tree, "errorMsgs")
  names(tmp) <- NULL
  expect_equal(tmp,
               "There are NA's in the calculated dissimilarity matrix / distance matrix. The variables cannot be clustered. This might be due to multiple data sets which do not contain all the same variables.",
               fixed = TRUE)

  # use = "pairwise.complete.obs" (default)
  res <- cluster_var(x = list(tt1, tt2, tt3), d = NULL,
                     method = "average",
                     block = NULL)
  # plot(res$res.tree[[1]])
  expect_identical(labels(res$res.tree[[1]]), c("c5", "c1", "c2"))
})

test_that("cluster_var: check if the functions runs in parallel", {
  # Run this test only locally. (Not suitable for Windows)
  skip_on_bioc()
  skip_on_os("windows")

  set.seed(888)
  tt <- matrix(rnorm(300), ncol = 30)
  colnames(tt) <- paste0("rs", 1:30)
  set.seed(999)
  res_p <- cluster_var(x = tt, d = NULL, method = "average",
                     block = data.frame(paste0("rs", 1:30), rep(c(1, 2), each = 15),
                                        stringsAsFactors = FALSE),
                     use = "everything", sort.parallel = TRUE,
                     parallel = "multicore", ncpus = 2)
  expect_is(res_p, "hierD")
  expect_type(res_p, "list")
  expect_is(res_p$res.tree[[1]], "dendrogram")
  expect_is(res_p$res.tree[[2]], "dendrogram")
  expect_equal(length(res_p), 2)
  expect_true(all(labels(res_p$res.tree[[1]]) %in% paste0("rs", 1:15))) # labels res1 - rs15
  expect_true(all(labels(res_p$res.tree[[2]]) %in% paste0("rs", 16:30))) # labels rs16 - rs30
})