tests/testthat/test-merge.R

# testthat tests don't do anything when successful.
library("phyloseq"); packageVersion("phyloseq")
library("testthat"); packageVersion("testthat")


# # # Tests!

################################################################################
# merge_samples
data(GlobalPatterns)
# GP <- prune_taxa(taxa_sums(GlobalPatterns)>0, GlobalPatterns)
GP  <- GlobalPatterns
mGP <- merge_samples(GlobalPatterns, "SampleType")

test_that("Classes of merged phyloseq objects are as expected", {
	expect_is(merge_samples(otu_table(GP), get_variable(GP, "SampleType")), ("otu_table"))
  expect_is(merge_samples(sample_data(GP), "SampleType"), ("sample_data"))
  expect_is(mGP, ("phyloseq"))
})

test_that("Same sam_data result for separate and combined merge in merge_samples", {
	expect_identical(
		merge_samples(sample_data(GP), "SampleType"),
		(sample_data(mGP))
	)
})

test_that("Same otu_table result for separate and combined merge in merge_samples", {
  expect_identical(
		merge_samples(otu_table(GP), get_variable(GP, "SampleType")),
		(otu_table(mGP))
	)
})

test_that("Sample Names of merged object now same set as merging factor levels", {
	sampleTypes = levels(data.frame(sample_data(GP))$SampleType)
	expect_identical(
	  setdiff(sampleTypes, sample_names(mGP)), 
	  (character()))
})

test_that("Counts from merged-samples are summed...", {
	OTUnames10 = names(sort(taxa_sums(GP), TRUE)[1:10])
	GP10  = prune_taxa(OTUnames10,  GP)
	mGP10 = prune_taxa(OTUnames10, mGP)
	# Loop to check the correct summation has occured for all OTUs.
	for( i in OTUnames10 ){
		isum = as(tapply(get_sample(GP10, i), get_variable(GP10, "SampleType"), sum), "numeric")
		expect_equivalent(isum, (get_sample(mGP10, i)))
	}
})

################################################################################
# merge_phyloseq
test_that("merge_phyloseq: Break apart GP based on human-association,
          then merge back together.", {
	data(GlobalPatterns)
	GP  <- prune_taxa(taxa_names(GlobalPatterns)[1:100], GlobalPatterns)
	sample_data(GP)$human <- factor(get_variable(GP, "SampleType") %in% c("Feces", "Mock", "Skin", "Tongue"))
	h1 <- subset_samples(GP, human=="TRUE")
	h0 <- subset_samples(GP, human=="FALSE")
	GP1 <- merge_phyloseq(h0, h1)

	# The species order is fixed to the tree, 
	# so should be the same between the original and merged
	expect_identical(taxa_names(GP), (taxa_names(GP1)))
	expect_identical(phy_tree(h1), (phy_tree(h0)))

	# However, the sample order has been shuffled by the split/merge. 
	# Fix the sample order by re-ordering the otu_table, and reassigning
	sa.order <- sample_names(GP)
	sa.order <- sa.order[sa.order %in% sample_names(GP1)]
	otu_table(GP1) <- otu_table(GP1)[, sa.order]
	expect_equal(sample_names(GP), sample_names(GP1))
	expect_equal(sample_names(sample_data(GP)), sample_names(sample_data(GP1)))
	# Sample data entries are the same, irrespective of factor levels
	GPfactors = which(sapply(sample_data(GP1), inherits, "factor"))
	for(j in GPfactors){
	  expect_equal(as.character(get_variable(GP, j)),
	               as.character(get_variable(GP1, j)))
	}
	# Reconcile factor level order for remaining tests
	GP1factors = which(sapply(sample_data(GP1), inherits, "factor"))
	for(j in names(GP1factors)){
	  varj = as.character(get_variable(GP1, j))
	  sample_data(GP1)[, j] <- factor(varj, levels = sort(unique(varj)))
	}
	GPfactors = which(sapply(sample_data(GP), inherits, "factor"))
	for(j in names(GPfactors)){
	  varj = as.character(get_variable(GP, j))
	  sample_data(GP)[, j] <- factor(varj, levels = sort(unique(varj)))
	}
	# Check a specific variable
	expect_equal(sample_data(GP1)$SampleType,
	             sample_data(GP)$SampleType)
	# Should be fixed now. Full object and components now identical
	expect_equal(GP1, GP) 
	expect_identical(GP1, GP)
	expect_identical(otu_table(GP1), (otu_table(GP)))
	expect_identical(tax_table(GP1), (tax_table(GP)))
	expect_identical(phy_tree(GP1), (phy_tree(GP)))
	
	## Check factor levels
	# The set
	expect_identical(sort(levels(sample_data(GP1)$SampleType)),
	                 sort(levels(sample_data(GP)$SampleType)))
	# The order
	expect_identical(levels(sample_data(GP1)$SampleType),
	                 levels(sample_data(GP)$SampleType))	
	# Overall
	expect_identical(sample_data(GP1), sample_data(GP))
	expect_identical(droplevels(sample_data(GP1)), droplevels(sample_data(GP)))
	
	# Check variable names are all there (set)
	expect_equal(
	  object = sort(intersect(colnames(sample_data(GP1)), colnames(sample_data(GP)))),
	  expected = sort(colnames(sample_data(GP1))))
	# Check column classes
	expect_equal(sapply(sample_data(GP1), class), sapply(sample_data(GP), class))
	# Check column names
	expect_equal(colnames(sample_data(GP1)), colnames(sample_data(GP)))
	# Check sample name order
	expect_equal(sample_names(sample_data(GP1)), sample_names(sample_data(GP)))
	expect_equal(sample_names(GP1), sample_names(GP))
})

################################################################################
# tax_glom
# Load data
data("GlobalPatterns")
GP.chl = subset_taxa(GlobalPatterns, Phylum == "Chlamydiae")
test_that("the tax_table slot is identical whether tax_glom()ed by itself or as component", {
	expect_is(tax_glom(tax_table(GP.chl), "Family"), ("taxonomyTable"))
	expect_is(n1<-tax_glom(GP.chl, "Family"), ("phyloseq"))
	expect_equal(ntaxa(n1), (4L))
	expect_equivalent(
		tax_glom(tax_table(GP.chl), taxrank="Family"),
		tax_table(tax_glom(GP.chl, taxrank="Family"))
	)
	n1 = as(tax_glom(tax_table(GP.chl), taxrank="Family", NArm=FALSE), "matrix")[, "Family"]
	n2 = tax_glom(GP.chl, taxrank="Family", NArm=FALSE)
  expect_true(setequal(n1, as(tax_table(n2), "matrix")[, "Family"]))
	expect_equal(ntaxa(n2), (5L))	
})
test_that("tax_glom() handles clearly agglomeration to one taxa", {
	expect_warning(n1 <- tax_glom(GP.chl, "Phylum"))
	expect_is(n1, ("phyloseq"))
	expect_equal(ntaxa(n1), (1L))
	expect_is(access(n1, "phy_tree"), ("NULL"))
})
test_that("tax_glom() can handle even the highest rank glom", {
  expect_warning(tax_glom(GP.chl, "Kingdom"))
  gpk = tax_glom(GlobalPatterns, "Kingdom")
  expect_is(gpk, "phyloseq")
  expect_equivalent(ntaxa(gpk), 2)
  expect_equivalent(taxa_sums(gpk), c(195598, 28021080))
})
################################################################################
# prune_taxa
# Use the GP.chl dataset from previous testing block
test_that("prune_taxa() handles clearly pruning to one taxa", {
	# throws warning, and NULL-tre
	expect_warning(n1 <- prune_taxa(taxa_names(GP.chl)[1:1], GP.chl))
	expect_equal(ntaxa(n1), (1L))
	expect_is(n1, ("phyloseq"))
	expect_is(access(n1, "phy_tree"), ("NULL"))
	expect_is(access(n1, "otu_table"), ("otu_table"))
})
test_that("prune_taxa() properly handles standard-cases", {
	# throws warning, and NULL-tre
  expect_is(n1 <- prune_taxa(taxa_names(GP.chl)[1:5], GP.chl), ("phyloseq"))
	expect_equal(ntaxa(n1), (5L))
	expect_is(access(n1, "phy_tree"), ("phylo"))
	expect_is(access(n1, "otu_table"), ("otu_table"))
	expect_is(access(n1, "sam_data"), ("sample_data"))
	expect_is(access(n1, "tax_table"), ("taxonomyTable"))
	# Use logical vector, and get same answer
	L2 <- vector(length=ntaxa(GP.chl))
	L2[1:5] <- TRUE
	expect_is(n2 <- prune_taxa(L2, GP.chl), ("phyloseq"))
	expect_identical(n2, (n1))	
})
################################################################################
# merge_taxa
# Use the GP.chl dataset from previous testing block
test_that("merge_taxa() properly handles standard-cases", {
	expect_is(n1 <- merge_taxa(GP.chl, c("24341", "579085")), ("phyloseq"))
	expect_equal(ntaxa(n1), (20L))
	# The first name is kept, others removed
	expect_equal("579085" %in% taxa_names(n1), (FALSE))
	expect_equal("24341" %in% taxa_names(n1), (TRUE))
	# Try a 3-element merge, check that the largest-count remains.
  OTUIDs = c("579085", "24341", "547579")
	biggestOTU = names(which.max(taxa_sums(GP.chl)[OTUIDs]))
  # Perform the merge of `OTUIDs`, and check the resulting class while at it.
	expect_is(n2 <- merge_taxa(GP.chl, OTUIDs), "phyloseq")
  # Check that there are now the correct, fewer number of OTUs
	expect_equal(ntaxa(n2), (ntaxa(GP.chl)-length(OTUIDs)+1))
	# The biggest OTU is kept, others merged
  expect_true(biggestOTU %in% taxa_names(n2))
	expect_true(!any(setdiff(OTUIDs, biggestOTU) %in% taxa_names(n2)))
  # Merge again, but only use the tax_table. No counts changes default retained to first in vector
	expect_is(n2b <- merge_taxa(tax_table(GP.chl), OTUIDs), "taxonomyTable")
	# Check that there are now the correct, fewer number of OTUs
	expect_equal(ntaxa(n2b), (ntaxa(GP.chl)-length(OTUIDs)+1))
	# The biggest OTU is kept, others merged
	expect_true(OTUIDs[1] %in% taxa_names(n2b))
	expect_true(!any(setdiff(OTUIDs, OTUIDs[1]) %in% taxa_names(n2b)))
	# Merge again, but specify the retained OTU name as the 3rd one, rather than the default
	expect_is(n3 <- merge_taxa(GP.chl, eqtaxa=OTUIDs, archetype=OTUIDs[3]), ("phyloseq"))
	# "547579" is kept, others removed
	expect_true(OTUIDs[3] %in% taxa_names(n3))
	expect_true(!any(setdiff(OTUIDs, OTUIDs[3]) %in% taxa_names(n3)))
	# Check that the remaining OTU has the sum of the values merged
	expect_identical(get_sample(n3, OTUIDs[3]), 
    colSums(as(otu_table(GP.chl), "matrix")[OTUIDs, ]))
})
test_that("merge_taxa() replaces disagreements in taxonomy with NA", {
	# Try a more difficult merge from a different subset
	GP20 <- prune_taxa(taxa_names(GlobalPatterns)[1:20], GlobalPatterns)
	# Arbitrary merge into taxa "951", NA in ranks after Phylum
	OTUIDs = c("951", "586076", "141782", "30678", "30405")
	biggestOTU = names(which.max(taxa_sums(GP20)[OTUIDs]))  
	n5 = merge_taxa(GP20, OTUIDs)
	# The biggest OTU is kept, others merged
	expect_true(biggestOTU %in% taxa_names(n5))
	expect_true(!any(setdiff(OTUIDs, biggestOTU) %in% taxa_names(n5)))
	# The taxonomy should be NA_character_ after Phylum (OTUIDs chosen carefully in this case)
	n5_merged_taxonomy <- as(tax_table(n5), "matrix")[biggestOTU, ]
	expect_true(!any(is.na(n5_merged_taxonomy[1:2])))
	expect_true(all(is.na(n5_merged_taxonomy[3:7])))	
	# Test how well it works at a different level (say first or last ranks)
	OTUIDs <- c("1126", "31759")
	biggestOTU = names(which.max(taxa_sums(GP20)[OTUIDs]))  
	n6 <- merge_taxa(GP20, OTUIDs)
	# The biggest OTU is kept, others merged
	expect_true(biggestOTU %in% taxa_names(n6))
	expect_true(!any(setdiff(OTUIDs, biggestOTU) %in% taxa_names(n6)))
	# Test that the taxonomy is NA after Order
	n6_merged_taxonomy <- as(tax_table(n6), "matrix")[biggestOTU, ]
	expect_true( !any(is.na(n6_merged_taxonomy[1:4])) )
	expect_true( all(is.na(n6_merged_taxonomy[5:7])) )	
	# Test that it works for differences at the first rank
	GP20f <- GP20
	tax_table(GP20f)[1, 1] <- "Bacteria"
  OTUIDs = taxa_names(GP20f)[1:2]
	biggestOTU = names(which.max(taxa_sums(GP20f)[OTUIDs]))  
  expect_is(n7 <- merge_taxa(GP20f, OTUIDs), "phyloseq")
	# Should be all NA taxonomy
	expect_equal( all(is.na(as(tax_table(n7), "matrix")[biggestOTU, ])), (TRUE))
	# Test that it works for differences at the last rank
	# First, make the first taxa the same as "951"
	tax_table(GP20f)[1, ] <- tax_table(GP20f)["951", ]
	# Now change the last rank of this entry to something else
	tax_table(GP20f)[1, length(rank_names(GP20f))] <- "species_phyloseq_test"
  OTUIDs = c("951", biggestOTU)
	biggestOTU = names(which.max(taxa_sums(GP20f)[OTUIDs]))
	expect_is(n8 <- merge_taxa(GP20f, OTUIDs), "phyloseq")
	t951 <- as(tax_table(n8), "matrix")[biggestOTU, ]	
	expect_equal( sum(is.na(t951)), 1L )
	expect_true( is.na(t951[length(rank_names(n8))]) )
	expect_identical( t951[-7],  as(tax_table(GP20f), "matrix")["951", ][-7] )
	# Test that it works if the taxonomies completely agree
	GP20f <- GP20	
	# Make the first taxa the same as "951"
	tax_table(GP20f)[1, ] <- tax_table(GP20f)["951", ]
	merge_these <- c("549322", "951")
	n9   <- merge_taxa(GP20f, merge_these)
	n9t1 <- as(tax_table(n9), "matrix")["549322", ]
	# None should be NA
	expect_equal(any(is.na(n9t1)), (FALSE))
	expect_equal(length(n9t1), (7L))
	# Merge worked, "951" is gone.
	expect_equal("951" %in% taxa_names(n9), (FALSE))	
})
test_that("merge_taxa() properly handles different types and orders of taxa specified by the eqtaxa and archetype arguments, and also handles refseq data", {
  # Test merge_taxa on data with a reference sequence file.
  otufile <- system.file("extdata", "GP_otu_table_rand_short.txt.gz", package="phyloseq")
  mapfile <- system.file("extdata", "master_map.txt", package="phyloseq")
  trefile <- system.file("extdata", "GP_tree_rand_short.newick.gz", package="phyloseq")
  rs_file <- system.file("extdata", "qiime500-refseq.fasta", package="phyloseq")
  rs0 <- import_qiime(otufile, mapfile, trefile, rs_file)
  rs1 = merge_taxa(rs0, c("71074", "10517", "8096"))
  rs2 = merge_taxa(rs0, c("71074", "8096", "10517"), "71074")
  rs3 = merge_taxa(rs0, c("71074", "10517", "8096"), 3)
  rs4 = merge_taxa(rs0, c("8096", "71074", "10517"))
  # rs1 and rs2 should be identical
  # rs3 and rs4 should be identical
  expect_equivalent(rs1, rs2)
  expect_true(!identical(rs1, rs3))
  expect_equivalent(rs3, rs4)
  # double-check that components are all there
  expect_equal(length(getslots.phyloseq(rs1)), (5L))
  expect_equal(length(getslots.phyloseq(rs2)), (5L))
  expect_equal(length(getslots.phyloseq(rs3)), (5L))
  expect_equal(length(getslots.phyloseq(rs4)), (5L))
  # The number of taxa should be the same as the original less two
  expect_equal(ntaxa(rs1), (ntaxa(rs0)-2L))
  expect_equal(ntaxa(rs2), (ntaxa(rs0)-2L))
  expect_equal(ntaxa(rs3), (ntaxa(rs0)-2L))
  expect_equal(ntaxa(rs4), (ntaxa(rs0)-2L))	
  # merge_taxa() errors when a bad archetype is provided
  # Throws error because keepIndex is NULL
  expect_error(merge_taxa(rs0, c("71074", "10517", "8096"), "wtf"))
  # Throws error because keepIndex is not part of eqtaxa (logic error, invalid merge)
  expect_error(merge_taxa(rs0, c("71074", "10517", "8096"), "13662"))
})
joey711/phyloseq documentation built on Nov. 4, 2022, 1:16 a.m.