In microbiome/miaSim: Microbiome Data Simulation

knitr::opts_chunk$set(
  cache = FALSE,
  fig.width = 9,
  message = FALSE,
  eval=FALSE,
  warning = FALSE)

Case study with miaSim: Strongly interacting species (SIS) explaining community types

This document provides the detailed code required to replicate case study 1 discussed in Gao et al. (2023). Methods in Ecology and Evolution. DOI: 10.1111/2041-210X.14129

For general instructions and examples on using the miaSim package tools, see the vignette. miaSim implements tools for microbiome data simulation based on different ecological modeling assumptions. These can be used to simulate species abundance matrices, including time series. For a detailed function documentation, see the function reference page

Reference: On the Origins and Control of Community Types in the Human Microbiome

The aim of this case study is to recalculate this Figure 2 which demonstrates the occurrence of SIS and the clustering results of communities.

In this figure, from left to right, different power-law distributions (defined by alpha = 7, 3, 2, 1.6, 1.01) controls the strength frequency of inter-species interactions. On the second row, inter-species interactions are visualized in networks. On the third row, the different clustering results in PCoA are demonstrated.

In this case study, we used their same sets of parameters, and acquired the similar results.

Load dependencies

library(ggplot2)
# library(igraph)
library(colourvalues)
library(GGally)
library(network)
library(sna)
library(ape)
library(dplyr)
library(philentropy)
library(cluster)
library(miaSim)

Define a function to generate power-law distribution

# FIXME: probably R already has some function for this; use that to simplify the code and
# avoid unnecessary duplication
rpower <- function(n, alpha, norm = FALSE) {
  u <- runif(n, min = 0, max = 1 - .Machine$double.eps^0.5)
  power <- (1 - u)^(1 / (1 - alpha))
  if (norm) {
    return(power / mean(power))
  } else {
    return(power)
  }
}

Load parameters used by Gibson et al.

params <- data.frame(
  alpha = c(7, 3, 2, 1.6, 1.01),
  t_end = c(20, 10, 5, 2, 1),
  t_step = c(0.02, 0.01, 0.005, 0.002, 0.001)
)
set.seed(42)
n_species <- 100

# Define local communities;
# we use here a smaller number in order to speed up demo calculations
# local_communities <- seq_len(100) # Too slow!    
local_communities <- c(1, 5, 10, 20) # Faster

create variables to store results

H <- list()
df <- list()
plot_hist <- list()
N <- list()
interactions_custom <- list()
A <- list()
g <- list()
g_plot <- list()
local_species_pool <- list()
local_A <- list()

generate constants and plot different distributions

set.seed(42)
for (row in seq_len(nrow(params))) {
  # for each power-law distribution
  print(paste("row =", row))

  H[[row]] <- rpower(n = n_species, alpha = params$alpha[row], norm = TRUE)
  df[[row]] <- data.frame(H[[row]])
  plot_hist[[row]] <- ggplot(df[[row]], aes(H[[row]])) +
    ggtitle(paste("alpha =", params$alpha[[row]])) +
    geom_histogram(fill = "steelblue", color = "grey20", bins = 10) +
    scale_y_log10(breaks = c(1, 10, 100), limits = c(1, 100)) +
    theme_bw() + 
    theme(plot.title = element_text(hjust = 0.5))

  # plot_hist[[row]]
  print(plot_hist[[row]])

  N[[row]] <- matrix(rnorm(n = n_species^2, mean = 0, sd = 1), nrow = n_species)
  diag(N[[row]]) <- 0
  interactions_custom[[row]] <- N[[row]] %*% diag(H[[row]])
  A[[row]] <- randomA(
    n_species = 100,
    diagonal = -1,
    scale_off_diagonal = 0.07,
    connectance = 1,
    interactions = interactions_custom[[row]]
  )
  g[[row]] <- graph_from_adjacency_matrix(A[[row]], weighted = TRUE, diag = FALSE)
  print(paste("max edge weight =", max(E(g[[row]])$weight)))
  E(g[[row]])$color <- head(colour_values(c(abs(E(g[[row]])$weight), 0), alpha = 0, include_alpha = TRUE), -1)

  g_plot[[row]] <- ggnet2(
    net = g[[row]],
    mode = "circle",
    node.color = "black",
    node.size = 1,
    edge.color = "color", edge.alpha = 0.25
  ) + 
    labs(title=paste("alpha =", params$alpha[[row]])) + 
    theme(plot.title = element_text(hjust = 0.5))

  print(g_plot[[row]])

  local_species_pool[[row]] <- list()
  local_A[[row]] <- list()

  for (local_community in local_communities) {
    local_species_pool[[row]][[local_community]] <- sample(x = 100, size = 80)
    local_A[[row]][[local_community]] <- A[[row]][local_species_pool[[row]][[local_community]], local_species_pool[[row]][[local_community]]]
  }
}

identify SIS using H[[row]]

SIS <- list()
group_sis <- list()
for (row in seq_len(nrow(params))) {
  SIS[[row]] <- head(order(H[[row]], decreasing = TRUE), 1)
  group_sis[[row]] <- list()
  for (local_community in local_communities) {
    if (paste0("sp", SIS[[row]]) %in% rownames(local_A[[row]][[local_community]])) {
      group_sis[[row]][[local_community]] <- "with SIS"
    } else {
      group_sis[[row]][[local_community]] <- "without SIS"
    }
  }
  group_sis[[row]] <- unlist(group_sis[[row]])
}

simulate GLV models using different growth rates in 4 scenarios

The first scenario is the original one from Gibson et al., and the rest are some explorations (the second scenario is assigning various growth rates to random species; the third scenario is assigning faster growth rate to SIS; and the fourth scenario is assigning faster growth rate to non-SIS).

Different scenarios are implemented by overwriting the default random growth rates in simulation function.

prepare result lists and scenarios

simulation_GLV <- list()
otu_table <- list()
jsd <- list()
PCoA <- list()
PCoA_coord <- list()
bestk <- list()
PAM <- list()
SI <- list()
groups <- list()
pcoa_plot <- list()
dfs_to_bind <- list()

scenarios <- c("original", "various growth rates", "SIS grows faster", "non-SIS grows faster")
growthRates <- vector(mode = "list", length = 4)
growthRates[[1]] <- rep(1, 80)
growthRates[[2]] <- NULL
# growthRates for scenario 3 and 4 are assigned in the following loop
customPalette <- c("#d95319", "#0072bd", "#edb120", "#7e2f8e")

In this case study, we only focus on the first scenario.

This example uses a smaller number of local communities for faster computations but the code can be easily adapted to vary the numbers.

In order to run over several scenarios, you can replace scenario <- 1 with a for loop like for (scenario in seq_along(scenarios)) {...}.

  scenario <- 1
  print(paste0("scenario: ", scenarios[scenario]))

  ### make new lists ####
  simulation_GLV[[scenario]] <- list()
  otu_table[[scenario]] <- list()
  jsd[[scenario]] <- list()
  PCoA[[scenario]] <- list()
  PCoA_coord[[scenario]] <- list()
  bestk[[scenario]] <- list()
  PAM[[scenario]] <- list()
  SI[[scenario]] <- list()
  groups[[scenario]] <- list()
  pcoa_plot[[scenario]] <- list()

  ### run simulations ####
  for (row in seq_len(nrow(params))) {
    simulation_GLV[[scenario]][[row]] <- list()
    otu_table[[scenario]][[row]] <- data.frame(matrix(nrow = 0, ncol = 100))
    colnames(otu_table[[scenario]][[row]]) <- paste0("sp", 1:100)
    dfs_to_bind[[scenario]] <- list()

    #### overwrite growth rates ####    
    # for (local_community in local_communities) { 
    for (local_community in local_communities) {   
      if (scenario == 3) {
        growthRates[[scenario]] <- 9.9 * local_species_pool[[row]][[local_community]] == SIS[[row]] + 0.1
      }
      if (scenario == 4) {
        growthRates[[scenario]] <- -9.9 * local_species_pool[[row]][[local_community]] == SIS[[row]] + 10
      }

      simulation_GLV[[scenario]][[row]][[local_community]] <- simulateGLV(
        n_species = 80,
        names_species = paste0("sp", local_species_pool[[row]][[local_community]]),
        A = local_A[[row]][[local_community]],
        x0 = rep(1, 80),
        growth_rates = growthRates[[scenario]],
        t_end = params$t_end[row],
        t_step = params$t_step[row],
        stochastic = FALSE,
        norm = TRUE
      )

      # makePlot(simulation_GLV[[scenario]][[row]][[local_community]]$matrix)
      dfs_to_bind[[scenario]] <- append(dfs_to_bind[[scenario]], list(simulation_GLV[[scenario]][[row]][[local_community]]$matrix[time = 1000, ]))
    }

    ### form otu tables ####
    otu_table[[scenario]][[row]] <- bind_rows(dfs_to_bind[[scenario]])
    otu_table[[scenario]][[row]] <- subset(otu_table[[scenario]][[row]], select = -time)
    otu_table[[scenario]][[row]][is.na(otu_table[[scenario]][[row]])] <- 0

    ### calculate jensen-shannon distance and plot PCoA ####
    jsd[[scenario]][[row]] <- JSD(as.matrix(otu_table[[scenario]][[row]]))
    PCoA[[scenario]][[row]] <- ape::pcoa(as.dist(jsd[[scenario]][[row]]))
    PCoA_coord[[scenario]][[row]] <- PCoA[[scenario]][[row]]$vectors[, 1:2]
    colnames(PCoA_coord[[scenario]][[row]]) <- c("PCo1", "PCo2")

    bestk[[scenario]][[row]] <- 2
    PAM[[scenario]][[row]] <- pam(PCoA[[scenario]][[row]]$vectors, k = 2)
    SI[[scenario]][[row]] <- PAM[[scenario]][[row]]$silinfo$avg.width

    # Too slow, let us limit k to 2..3 in the example (earlier 3:10)
    for (k in 2:3) {
      PAMtemp <- pam(PCoA[[scenario]][[row]]$vectors, k = k)
      if (PAMtemp$silinfo$avg.width > SI[[scenario]][[row]]) {
        SI[[scenario]][[row]] <- PAMtemp$silinfo$avg.width
        bestk[[scenario]][[row]] <- k
        PAM[[scenario]][[row]] <- PAMtemp
      }
    }

    groups[[scenario]][[row]] <- factor(PAM[[scenario]][[row]]$clustering)

    dataframe_pcoa <- as.data.frame(PCoA_coord[[scenario]][[row]])
    dataframe_pcoa$group <- groups[[scenario]][[row]]
    p <- ggplot(
      dataframe_pcoa,
      aes(x = PCo1, y = PCo2, color = group)
    ) +
      labs(title = paste("scenario", scenario, scenarios[scenario], ";", "alpha =", params$alpha[[row]])) +
      geom_point() +
      theme_bw() +
      scale_color_manual(values = customPalette) +
      theme(legend.position = "none", plot.title = element_text(hjust = 0.5))

    # Store the plot
    pcoa_plot[[scenario]][[row]] <- p
    # Show the plot
    print(pcoa_plot[[scenario]][[row]])
  }

validate relationship between cluster and SIS

If we compare the previou PCoA results with this PCoA results rendered by with/without SIS, we can find that in last figures (alpha = 1.6 and alpha=1.01, where there's one SIS), the community types are clustered in the same way as they are rendered with/without SIS, indicating that the occurrence of SIS can influence the community types.

pcoa_plot_sis <- list()
for (row in seq_len(nrow(params))) {
  pcoa_plot_sis[[row]] <- ggplot(as.data.frame(PCoA_coord[[1]][[row]]), aes(x = PCo1, y = PCo2)) +
    geom_point(aes(colour = group_sis[[row]])) +
    labs(title=paste("alpha =", params$alpha[[row]])) +
    theme_bw() + 
    theme(plot.title = element_text(hjust = 0.5))
  print(pcoa_plot_sis[[row]]) # pcoa_validation_ plots
}