################################################################################
#' Search for exact matching of sequences
#'
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' Search for exact matching of sequences using complement,
#' reverse and reverse-complement. It is useful to check for
#' primers issues after cutadapt step.
#'
#' @inheritParams clean_pq
#' @param seq2search A DNAStringSet object of sequences to search for.
#' @return A list of data-frames for each input sequences with the name,
#' the sequences and the
#' number of occurrences of the original sequence, the complement sequence,
#' the reverse sequence and the reverse-complement sequence.
#' @export
#'
#' @examples
#' data("data_fungi")
#' search_primers <- search_exact_seq_pq(data_fungi,
#' seq2search = Biostrings::DNAStringSet(c("TTGAACGCACATTGCGCC", "ATCCCTACCTGATCCGAG"))
#' )
#' @author Adrien Taudière
search_exact_seq_pq <- function(physeq, seq2search) {
sequences <- seq2search
res <- vector("list", length(sequences))
for (i in seq_along(sequences)) {
original <- sequences[[i]]
rev <- Biostrings::reverse(sequences[[i]])
rev_comp <- Biostrings::reverseComplement(sequences[[i]])
comp <- Biostrings::complement(sequences[[i]])
original_count <- sum(grepl(original, physeq@refseq))
rev_count <- sum(grepl(rev, physeq@refseq))
rev_comp_count <- sum(grepl(rev_comp, physeq@refseq))
comp_count <- sum(grepl(comp, physeq@refseq))
res[[i]] <- data.frame(
c("original", as.character(original), original_count),
c("rev", as.character(rev), rev_count),
c("rev_comp", as.character(rev_comp), rev_comp_count),
c("comp", as.character(comp), comp_count)
)
colnames(res[[i]]) <- c("Names", "Sequences", "Number of occurrences")
}
return(res)
}
################################################################################
#' Calculate ecological distance among positive controls vs
#' distance for all samples
#' @description
#'
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' Compute distance among positive controls,
#' i.e. samples which are duplicated
#' to test for variation, for example in
#' (i) a step in the sampling,
#' (ii) a step in the extraction,
#' (iii) a step in the sequencing.
#' @aliases dist_pos_control
#' @inheritParams clean_pq
#' @param samples_names (required) a vector of names for samples with
#' positives controls of the same samples having the same name
#' @param method (default: "bray") a method to calculate
#' the distance, parsed to [vegan::vegdist()]. See ?vegdist for a list
#' of possible values.
#' @return A list of two data-frames with
#' (i) the distance among positive controls and
#' (ii) the distance among all samples
#' @export
#'
#' @examples
#' data("enterotype")
#' sam_name_factice <- gsub("TS1_V2", "TS10_V2", sample_names(enterotype))
#' res_dist_cont <- dist_pos_control(enterotype, sam_name_factice)
#' hist(unlist(res_dist_cont$distAllSamples))
#' abline(
#' v = mean(unlist(res_dist_cont$dist_controlontrolSamples), na.rm = TRUE),
#' col = "red", lwd = 3
#' )
#' @author Adrien Taudière
dist_pos_control <- function(physeq, samples_names, method = "bray") {
verify_pq(physeq)
res <- vector("list", 2)
dist_control <- vector()
for (i in levels(as.factor(samples_names))) {
interm <- physeq@otu_table[samples_names == i, ]
if (dim(interm)[1] > 1) {
dist_control <-
c(dist_control, as.numeric(vegan::vegdist(interm, method = method)))
} else {
dist_control <- c(dist_control, NA)
}
}
names(dist_control) <- levels(as.factor(samples_names))
# Compute distance among all samples
if (taxa_are_rows(physeq)) {
matdist_interm <- t(physeq@otu_table)
} else {
matdist_interm <- physeq@otu_table
}
res[[1]] <-
data.frame("distAllSamples" = as.vector(vegan::vegdist(
matdist_interm,
diag = FALSE, upper = TRUE
)))
res[[2]] <-
data.frame("dist_control_samples" = as.vector(stats::na.omit(dist_control)))
names(res) <- c("distAllSamples", "dist_controlontrolSamples")
return(res)
}
################################################################################
################################################################################
#' Subset taxa using a taxa control or distribution based method
#'
#' @description
#' <a href="https://adrientaudiere.github.io/MiscMetabar/articles/Rules.html#lifecycle">
#' <img src="https://img.shields.io/badge/lifecycle-experimental-orange" alt="lifecycle-experimental"></a>
#'
#' There is 3 main methods : discard taxa (i) using a control taxa (e.g. truffle root tips),
#' (ii) using a mixture models to detect bimodality in pseudo-abundance distribution or
#' (iii) using a minimum difference threshold pseudo-abundance. Each cutoff is defined at
#' the sample level.
#'
#' @aliases subset_taxa_tax_control
#' @inheritParams clean_pq
#' @param taxa_distri (required) a vector of length equal to the number of
#' samples with the number of sequences per sample for the taxa control
#' @param method (default: "mean") a method to calculate the cut-off value.
#' There are 6 available methods:
#' 1. `cutoff_seq`: discard taxa with less than the number of sequence
#' than taxa control,
#' 2. `cutoff_mixt`: using mixture models,
#' 3. `cutoff_diff`: using a minimum difference threshold
#' (need the argument min_diff_for_cutoff)
#' 4. `min`: the minimum of the three firsts methods
#' 5. `max`: the maximum of the three firsts methods
#' 6. `mean`: the mean of the three firsts methods
#' @param min_diff_for_cutoff (int) argument for method `cutoff_diff`.
#' Required if method is `cutoff_diff`, `min`, `max` or `mean`
#' @return A new \code{\link[phyloseq]{phyloseq-class}} object.
#' @export
#'
#' @examples
#'
#' subset_taxa_tax_control(data_fungi,
#' as.numeric(data_fungi@otu_table[, 300]),
#' min_diff_for_cutoff = 2
#' )
#'
#' @author Adrien Taudière
subset_taxa_tax_control <-
function(physeq,
taxa_distri,
method = "mean",
min_diff_for_cutoff = NULL) {
verify_pq(physeq)
cutoff_seq <- vector(mode = "numeric", length = nsamples(physeq))
cutoff_mixt <-
vector(mode = "numeric", length = nsamples(physeq))
cutoff_diff <-
vector(mode = "numeric", length = nsamples(physeq))
cutoffs <- vector(mode = "numeric", length = nsamples(physeq))
if (method %in% c("min", "max", "mean", "cutoff_diff")) {
if (is.null(min_diff_for_cutoff)) {
stop("You need to set the `min_diff_for_cutoff` values
when using one of the following methods : min, max, mean and
cutoff_diff.")
}
}
for (i in seq_along(sample_names(physeq))) {
# for each sample
if (method %in% c("min", "max", "mean", "cutoff_mixt")) {
find_cutoff <- function(proba = 0.5, il = index_lower) {
## Cutoff such that Pr[drawn from bad component] == proba
f <- function(x) {
proba - (
model$lambda[il] * stats::dnorm(
x, model$mu[il],
model$sigma[il]
) /
(
model$lambda[1] * stats::dnorm(
x, model$mu[1],
model$sigma[1]
) +
model$lambda[2] * stats::dnorm(
x, model$mu[2],
model$sigma[2]
)
)
)
}
return(stats::uniroot(
f = f,
lower = 1,
upper = 1000
)$root) # Careful with division by zero if changing lower and upper
}
physeq_mixture <- as.vector(physeq@otu_table[i, ])
x_mixture <- physeq_mixture[physeq_mixture > 0]
if (length(x_mixture) > 25) {
try(
model <-
mixtools::normalmixEM(
x = x_mixture,
k = 2,
epsilon = 1e-03,
maxrestarts = 1000
),
silent = TRUE
)
try(index_lower <-
which.min(model$mu), silent = TRUE)
# Index of component with lower mean)
cutoff_mixt[i] <-
try(find_cutoff(proba = 0.5, il = index_lower))
} else {
cutoff_mixt[i] <- NA
}
cutoff_mixt <- as.numeric(cutoff_mixt)
}
if (method %in% c("min", "max", "mean", "cutoff_diff")) {
physeq_diff <- sort(as.vector(as.vector(physeq@otu_table[i, ])))
cond <- diff(physeq_diff) > min_diff_for_cutoff
cutoff_diff[i] <- min(physeq_diff[cond])
}
if (method %in% c("min", "max", "mean", "cutoff_seq")) {
physeq_seq <- sort(as.vector(as.vector(physeq@otu_table[i, ])))
cutoff_seq[i] <- min(physeq_seq[physeq_seq > taxa_distri[i]])
if (is.infinite(cutoff_seq[i])) {
cutoff_seq[i] <- taxa_distri[i]
}
}
}
if (method == "cutoff_mixt") {
cutoffs <- cutoff_mixt
} else if (method == "cutoff_diff") {
cutoffs <- cutoff_diff
} else if (method == "cutoff_seq") {
cutoffs <- cutoff_seq
} else if (method == "mean") {
cutoffs <-
apply(cbind(cutoff_mixt, cutoff_diff, cutoff_seq), 1, mean,
na.rm = TRUE
)
} else if (method == "min") {
cutoffs <-
apply(cbind(cutoff_mixt, cutoff_diff, cutoff_seq), 1, min,
na.rm = TRUE
)
} else if (method == "max") {
cutoffs <-
apply(cbind(cutoff_mixt, cutoff_diff, cutoff_seq), 1, max,
na.rm = TRUE
)
} else {
stop("The method name is not valid.")
}
new_physeq <- physeq
for (i in seq_along(sample_names(physeq))) {
new_physeq@otu_table[i, new_physeq@otu_table[i, ] < floor(cutoffs)[i]] <-
0
}
new_physeq <-
prune_taxa(colSums(new_physeq@otu_table) > 0, new_physeq)
ntaxa_discard <- ntaxa(physeq) - ntaxa(new_physeq)
nseq_discard <-
sum(physeq@otu_table) - sum(new_physeq@otu_table)
message(
paste(
"The filtering processes discard",
ntaxa_discard,
"taxa and",
nseq_discard,
"sequences",
sep = " "
)
)
return(new_physeq)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.