tests/testthat/test-blobfish.R

# unit-tests for BLOBFISH
context("test BLOBFISH functions")

test_that("[BLOBFISH] SignificantBreadthFirstSearch() function yields expected results", {
  
  # Construct a starting network, which will be modified.
  # Here, we expect genes A and B to be connected after 1 hop via TF2, genes A and D
  # to be connected after 1 hop via TF3, and genes A and C to be connected after 2
  # hops via TF4.
  startingNetwork <- data.frame(tf = c(rep(c("tf1", "tf2", "tf3", "tf4"), 4)),
                           gene = c(rep("geneA", 4), rep("geneB", 4), rep("geneC", 4), rep("geneD", 4)),
                           score = c(-3, 3, 5, -5, 0, 4, 0.0005, -0.5, 0, 0.0005, -1, 4, 0.0005, -2, 5, 3))
  addedNoise1 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), 
                                                        rep("geneB", 100),
                                                        rep("geneC", 100),
                                                        rep("geneD", 100)),
                            score = stats::rnorm(400) / 1000)
  addedNoise2 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), 
                                                         rep("geneB", 100),
                                                         rep("geneC", 100),
                                                         rep("geneD", 100)),
                            score = stats::rnorm(400) / 10000)
  addedNoise3 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), 
                                                         rep("geneB", 100),
                                                         rep("geneC", 100),
                                                         rep("geneD", 100)),
                            score = stats::rnorm(400) / 10000)
  fullNetwork1 <- rbind(startingNetwork, addedNoise1)
  rownames(fullNetwork1) <- paste(fullNetwork1$tf, fullNetwork1$gene, sep = "__")
  
  # Create a set of networks close to the original.
  fullNetworks <- fullNetwork1
  fullNetworks[,4] <- fullNetwork1$score - 0.00005
  fullNetworks[,5] <- fullNetwork1$score + 0.00005
  
  # Use all combined scores as the null distribution.
  null <- stats::rnorm(n = nrow(fullNetworks) * 3) / 100
  
  # Calculate the p-values.
  pvalues <- CalculatePValues(network = fullNetworks, pValueChunks = 2,
                              nullDistribution = null, verbose = TRUE)
  whichSig <- which(pvalues < 0.1)
  significantEdges <- rownames(fullNetworks)[whichSig]
  subnetwork <- fullNetworks[significantEdges, c(1:2)]
  pvalues = pvalues[significantEdges]
  
  # Test that errors are thrown when appropriate.
  expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues,
                                             startingNodes = c("blob", "fish"),
                                             nodesToExclude = c(), startFromTF = TRUE),
               "ERROR: Starting nodes do not overlap with network nodes")
  expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues,
                                             startingNodes = c("geneA", "geneB"),
                                             nodesToExclude = c(), startFromTF = TRUE),
               "ERROR: Starting nodes do not overlap with network nodes")
  expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues,
                                             startingNodes = c("tf1", "tf2"),
                                             nodesToExclude = c(), startFromTF = FALSE),
               "ERROR: Starting nodes do not overlap with network nodes")
  expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues,
                                             startingNodes = c("geneA", "geneB", "geneC"),
                                             nodesToExclude = c("blob", "fish"), startFromTF = FALSE),
               "ERROR: List of nodes to exclude does not overlap with network nodes")
  expect_error(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues,
                                             startingNodes = c("geneA", "geneB", "geneC"),
                                             nodesToExclude = c("geneA", "geneB"), startFromTF = FALSE),
               "ERROR: Starting nodes cannot overlap with nodes to exclude")
  
  # Ensure that, when starting from genes A, B, and C, we obtain the correct values.
  expect_true(length(setdiff(c("tf2__geneA", "tf2__geneB", "tf3__geneA", "tf4__geneC"),
                      rownames(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues,
                                                  startingNodes = c("geneA", "geneB", "geneC"),
                                                  nodesToExclude = c(), startFromTF = FALSE)))) == 0)
  
  # Ensure that, when starting from transcription factors TF2, TF3, and TF4 and removing genes A, B, and C, we obtain the correct values.
  expect_true(length(setdiff(c("tf3__geneD", "tf4__geneD"),
                             rownames(SignificantBreadthFirstSearch(networks = subnetwork, pValues = pvalues,
                                                                    startingNodes = c("tf2", "tf3", "tf4"),
                                                                    nodesToExclude = c("geneA", "geneB", "geneC"), 
                                                                    startFromTF = TRUE)))) == 0)
})
test_that("[BLOBFISH] FindConnectionsForAllHopCounts() function yields expected results", {
  
  # Set up the subnetwork from the previous example.
  geneANetHop1 <- data.frame(tf = c("tf2", "tf3"), gene = c("geneA", "geneA"))
  geneBNetHop1 <- data.frame(tf = "tf2", gene = "geneB")
  geneCNetHop1 <- data.frame(tf = "tf4", gene = "geneC")
  geneDNetHop1 <- data.frame(tf = c("tf3", "tf4"), gene = c("geneD", "geneD"))
  geneANetHop2 <- rbind(geneBNetHop1, geneDNetHop1[1,])
  geneBNetHop2 <- geneANetHop1[1,]
  geneCNetHop2 <- geneDNetHop1[2,]
  geneDNetHop2 <- rbind(geneANetHop1[2,], geneCNetHop1)
  geneANetHop3 <- geneDNetHop1[2,]
  geneBNetHop3 <- geneANetHop1[2,]
  geneCNetHop3 <- geneDNetHop1[1,]
  geneDNetHop3 <- geneANetHop1[1,]
  subnetworks <- list(geneA = list(geneANetHop1, geneANetHop2, geneANetHop3),
                      geneB = list(geneBNetHop1, geneBNetHop2, geneBNetHop3),
                      geneC = list(geneCNetHop1, geneCNetHop2, geneCNetHop3),
                      geneD = list(geneDNetHop1, geneDNetHop2, geneDNetHop3))
  
  # Obtain the overlaps for 1, 2, and 3 hops.
  expect_equal(rownames(FindConnectionsForAllHopCounts(subnetworks)), 
              c("tf2__geneA", "tf2__geneB", "tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD"))
  
  # Obtain the overlaps for only the first two hops, for only A and C.
  subnetworks <- list(geneA = list(geneANetHop1, geneANetHop2),
                      geneC = list(geneCNetHop1, geneCNetHop2))
  expect_equal(rownames(FindConnectionsForAllHopCounts(subnetworks)), 
               c("tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD"))
})
test_that("[BLOBFISH] FindSignificantEdgesForHop() function yields expected results",{
  
  # Construct a starting network, which will be modified.
  # Here, we expect genes A and B to be connected after 1 hop via TF2, genes A and D
  # to be connected after 1 hop via TF3, and genes A and C to be connected after 2
  # hops via TF4.
  startingNetwork <- data.frame(tf = c(rep(c("tf1", "tf2", "tf3", "tf4"), 4)),
                                gene = c(rep("geneA", 4), rep("geneB", 4), rep("geneC", 4), rep("geneD", 4)),
                                score = c(-3, 3, 5, -5, 0, 4, 0.0005, -0.5, 0, 0.0005, -1, 4, 0.0005, -2, 5, 3))
  addedNoise1 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), 
                                                         rep("geneB", 100),
                                                         rep("geneC", 100),
                                                         rep("geneD", 100)),
                            score = stats::rnorm(400) / 10000)
  addedNoise2 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), 
                                                         rep("geneB", 100),
                                                         rep("geneC", 100),
                                                         rep("geneD", 100)),
                            score = stats::rnorm(400) / 10000)
  addedNoise3 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), 
                                                         rep("geneB", 100),
                                                         rep("geneC", 100),
                                                         rep("geneD", 100)),
                            score = stats::rnorm(400) / 10000)
  fullNetwork1 <- rbind(startingNetwork, addedNoise1)
  rownames(fullNetwork1) <- paste(fullNetwork1$tf, fullNetwork1$gene, sep = "__")
  
  # Create a set of networks close to the original.
  fullNetworks <- fullNetwork1
  fullNetworks[,4] <- fullNetwork1$score - 0.00005
  fullNetworks[,5] <- fullNetwork1$score + 0.00005
  
  #  Null distribution is a narrower version of the normal distribution.
  null <- rnorm(n = nrow(fullNetworks) * 3) / 100
  
  # Calculate the p-values.
  pvalues <- CalculatePValues(network = fullNetworks, pValueChunks = 2,
                              nullDistribution = null, verbose = TRUE)
  whichSig <- which(pvalues < 0.1)
  significantEdges <- rownames(fullNetworks)[whichSig]
  subnetwork <- fullNetworks[significantEdges, c(1:2)]
  pvalues = pvalues[significantEdges]
  
  # Set up the subnetwork from the previous example.
  geneANetHop1 <- data.frame(tf = c("tf2", "tf3"), gene = c("geneA", "geneA"))
  rownames(geneANetHop1) <- paste(geneANetHop1$tf, geneANetHop1$gene, sep = "__")
  geneBNetHop1 <- data.frame(tf = "tf2", gene = "geneB")
  rownames(geneBNetHop1) <- paste(geneBNetHop1$tf, geneBNetHop1$gene, sep = "__")
  geneCNetHop1 <- data.frame(tf = "tf4", gene = "geneC")
  rownames(geneCNetHop1) <- paste(geneCNetHop1$tf, geneCNetHop1$gene, sep = "__")
  geneDNetHop1 <- data.frame(tf = c("tf3", "tf4"), gene = c("geneD", "geneD"))
  rownames(geneDNetHop1) <- paste(geneDNetHop1$tf, geneDNetHop1$gene, sep = "__")
  geneANetHop2 <- rbind(geneBNetHop1, geneDNetHop1[1,])
  rownames(geneANetHop2) <- paste(geneANetHop2$tf, geneANetHop2$gene, sep = "__")
  geneBNetHop2 <- geneANetHop1[1,]
  rownames(geneBNetHop2) <- paste(geneBNetHop2$tf, geneBNetHop2$gene, sep = "__")
  geneCNetHop2 <- geneDNetHop1[2,]
  rownames(geneCNetHop2) <- paste(geneCNetHop2$tf, geneCNetHop2$gene, sep = "__")
  geneDNetHop2 <- rbind(geneANetHop1[2,], geneCNetHop1)
  rownames(geneDNetHop2) <- paste(geneDNetHop2$tf, geneDNetHop2$gene, sep = "__")
  geneANetHop3 <- geneDNetHop1[2,]
  rownames(geneANetHop3) <- paste(geneANetHop3$tf, geneANetHop3$gene, sep = "__")
  geneBNetHop3 <- geneANetHop1[2,]
  rownames(geneBNetHop3) <- paste(geneBNetHop3$tf, geneBNetHop3$gene, sep = "__")
  geneCNetHop3 <- geneDNetHop1[1,]
  rownames(geneCNetHop3) <- paste(geneCNetHop3$tf, geneCNetHop3$gene, sep = "__")
  geneDNetHop3 <- geneANetHop1[1,]
  rownames(geneDNetHop3) <- paste(geneDNetHop3$tf, geneDNetHop3$gene, sep = "__")
  subnetworksFull <- list(geneA = list(geneANetHop1, geneANetHop2, geneANetHop3),
                      geneB = list(geneBNetHop1, geneBNetHop2, geneBNetHop3),
                      geneC = list(geneCNetHop1, geneCNetHop2, geneCNetHop3),
                      geneD = list(geneDNetHop1, geneDNetHop2, geneDNetHop3))
  
  # Test method with all genes as input.
  sigEdges <- FindSignificantEdgesForHop(geneSet = c("geneA", "geneB", "geneC", "geneD"),
                                         combinedNetwork = subnetwork, pValues = pvalues,
                                         hopConstraint = 3)
  expect_true(length(setdiff(rownames(subnetworksFull$geneA[[1]]), rownames(sigEdges$geneA[[1]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneA[[2]]), rownames(sigEdges$geneA[[2]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneA[[3]]), rownames(sigEdges$geneA[[3]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneB[[1]]), rownames(sigEdges$geneB[[1]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneB[[2]]), rownames(sigEdges$geneB[[2]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneB[[3]]), rownames(sigEdges$geneB[[3]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneC[[1]]), rownames(sigEdges$geneC[[1]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneC[[2]]), rownames(sigEdges$geneC[[2]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneC[[3]]), rownames(sigEdges$geneC[[3]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneD[[1]]), rownames(sigEdges$geneD[[1]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneD[[2]]), rownames(sigEdges$geneD[[2]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneD[[3]]), rownames(sigEdges$geneD[[3]]))) == 0)
  
  # Test method with only genes A, B, C.
  sigEdges <- FindSignificantEdgesForHop(geneSet = c("geneA", "geneB", "geneC"),
                                          combinedNetwork = subnetwork, hopConstraint = 3)
  expect_true(length(setdiff(rownames(subnetworksFull$geneA[[1]]), rownames(sigEdges$geneA[[1]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneA[[2]]), rownames(sigEdges$geneA[[2]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneA[[3]]), rownames(sigEdges$geneA[[3]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneB[[1]]), rownames(sigEdges$geneB[[1]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneB[[2]]), rownames(sigEdges$geneB[[2]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneB[[3]]), rownames(sigEdges$geneB[[3]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneC[[1]]), rownames(sigEdges$geneC[[1]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneC[[2]]), rownames(sigEdges$geneC[[2]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneC[[3]]), rownames(sigEdges$geneC[[3]]))) == 0)
  
  # Test method with only genes A and C.
  sigEdges <- FindSignificantEdgesForHop(geneSet = c("geneA", "geneC"),
                                          combinedNetwork = subnetwork,
                                          hopConstraint = 3)
  expect_true(length(setdiff(rownames(subnetworksFull$geneA[[1]]), rownames(sigEdges$geneA[[1]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneA[[2]]), rownames(sigEdges$geneA[[2]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneA[[3]]), rownames(sigEdges$geneA[[3]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneC[[1]]), rownames(sigEdges$geneC[[1]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneC[[2]]), rownames(sigEdges$geneC[[2]]))) == 0)
  expect_true(length(setdiff(rownames(subnetworksFull$geneC[[3]]), rownames(sigEdges$geneC[[3]]))) == 0)
})
test_that("[BLOBFISH] BuildSubnetwork() function yields expected results",{
  
  # Construct a starting network, which will be modified.
  # Here, we expect genes A and B to be connected after 1 hop via TF2, genes A and D
  # to be connected after 1 hop via TF3, and genes A and C to be connected after 2
  # hops via TF4.
  startingNetwork <- data.frame(tf = c(rep(c("tf1", "tf2", "tf3", "tf4"), 4)),
                                gene = c(rep("geneA", 4), rep("geneB", 4), rep("geneC", 4), rep("geneD", 4)),
                                score = c(-3, 3, 5, -5, 0, 4, 0.0005, -0.5, 0, 0.0005, -1, 4, 0.0005, -2, 5, 3))
  addedNoise1 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), 
                                                         rep("geneB", 100),
                                                         rep("geneC", 100),
                                                         rep("geneD", 100)),
                            score = stats::rnorm(400) / 10000)
  addedNoise2 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), 
                                                         rep("geneB", 100),
                                                         rep("geneC", 100),
                                                         rep("geneD", 100)),
                            score = stats::rnorm(400) / 10000)
  addedNoise3 <- data.frame(tf = rep(1:100, 4), gene = c(rep("geneA", 100), 
                                                         rep("geneB", 100),
                                                         rep("geneC", 100),
                                                         rep("geneD", 100)),
                            score = stats::rnorm(400) / 1000)
  fullNetwork1 <- rbind(startingNetwork, addedNoise1)
  rownames(fullNetwork1) <- paste(fullNetwork1$tf, fullNetwork1$gene, sep = "__")
  
  # Create a set of networks close to the original.
  fullNetwork2 <- fullNetwork1
  fullNetwork3 <- fullNetwork1
  fullNetwork2$score <- c(fullNetwork1$score - 0.00005)
  fullNetwork3$score <- c(fullNetwork1$score + 0.00005)
  
  # Paste together the networks.
  combinedNetwork <- fullNetwork1
  combinedNetwork[,4] <- fullNetwork2$score
  combinedNetwork[,5] <- fullNetwork3$score
  
  # Null distribution is a narrower version of the normal distribution.
  null <- rnorm(n = nrow(combinedNetwork) * 3) / 100
  
  # Verify that FindConnectionsForAllHopCounts() and FindSignificantEdgesForHop() are
  # working in tandem.
  # Obtain the overlaps for 1, 2, and 3 hops.
  expect_true(length(setdiff(c("tf2__geneA", "tf2__geneB", "tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD"),
                             rownames(BuildSubnetwork(geneSet = c("geneA", "geneB", "geneC", "geneD"),
                                        networks = list(fullNetwork1, fullNetwork2, fullNetwork3), 
                                        alpha = 0.1, hopConstraint = 4, nullDistribution = null)))) == 0)
  
  # Obtain the overlaps for only the first two hops, for only A and C.
  expect_true(length(setdiff(c("tf3__geneA", "tf3__geneD", "tf4__geneC", "tf4__geneD"),
                             rownames(BuildSubnetwork(geneSet = c("geneA", "geneC"),
                                                      networks = list(fullNetwork1, fullNetwork2, fullNetwork3), 
                                                      alpha = 0.1, hopConstraint = 4, nullDistribution = null)))) == 0)
})
test_that("[BLOBFISH] RunBLOBFISH() function yields expected results",{
  
  # Check errors.
  expect_error(RunBLOBFISH(geneSet = 4, networks = list(), alpha = 0.5, hopConstraint = 5, nullDistribution = c(0,0,0)),
               paste("Wrong input type! geneSet must be a character vector. networks must be a list.",
               "alpha and hopConstraint must be scalar numeric values."))
  expect_error(RunBLOBFISH(geneSet = "g1", networks = 67, alpha = 0.5, hopConstraint = 5, nullDistribution = c(0,0,0)),
               paste("Wrong input type! geneSet must be a character vector. networks must be a list.",
                     "alpha and hopConstraint must be scalar numeric values."))
  expect_error(RunBLOBFISH(geneSet = "g1", networks = list(), alpha = "hi", hopConstraint = 5, nullDistribution = c(0,0,0)),
               paste("Wrong input type! geneSet must be a character vector. networks must be a list.",
                     "alpha and hopConstraint must be scalar numeric values."))
  expect_error(RunBLOBFISH(geneSet = "g1", networks = list(), alpha = 0.5, hopConstraint = "hi", nullDistribution = c(0,0,0)),
               paste("Wrong input type! geneSet must be a character vector. networks must be a list.",
                     "alpha and hopConstraint must be scalar numeric values."))
  expect_error(RunBLOBFISH(geneSet = "g1", networks = list(1,2,3), alpha = 0.5, hopConstraint = 5, nullDistribution = c(0,0,0)),
               "Each network must be a data frame.")
  expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA, blah = NA)), 
                           alpha = 0.5, hopConstraint = 5, nullDistribution = c(0,0,0)),
               paste("Each network must have transcription factors in the first column,",
               "target genes in the second column, and scores in the third column."))
  expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), 
                           alpha = -1, hopConstraint = 5, nullDistribution = c(0,0,0)), "alpha must be between 0 and 1, not including 0.")
  expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), 
                           alpha = 0, hopConstraint = 5, nullDistribution = c(0,0,0)), "alpha must be between 0 and 1, not including 0.")
  expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), 
                           alpha = 1.2, hopConstraint = 5, nullDistribution = c(0,0,0)), "alpha must be between 0 and 1, not including 0.")
  expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), 
                           alpha = 1, hopConstraint = -4, nullDistribution = c(0,0,0)), "hopConstraint must be an even number of at least 2.")
  expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), 
                           alpha = 1, hopConstraint = 7, nullDistribution = c(0,0,0)), "hopConstraint must be an even number of at least 2.")
  expect_error(RunBLOBFISH(geneSet = "g1", networks = list(data.frame(tf = NA, gene = NA, score = NA)), 
                           alpha = 1, hopConstraint = 4, nullDistribution = "hi"), "nullDistribution must be numeric.")
})
netZoo/netZooR documentation built on Oct. 16, 2024, 10:23 p.m.