################################################################################
#' Determine if input sequence is a bimera of putative parent sequences.
#'
#' This function attempts to find an exact bimera of the parent sequences that
#' matches the input sequence. A bimera is a two-parent chimera, in which the
#' left side is made up of one parent sequence, and the right-side made up of
#' a second parent sequence. If an exact bimera is found TRUE is returned,
#' otherwise FALSE. Bimeras that are one-off from exact are also identified if
#' the allowOneOff argument is TRUE.
#'
#' @param sq (Required). A \code{character(1)}.
#' The sequence being evaluated as a possible bimera.
#'
#' @param parents (Required). Character vector.
#' A vector of possible "parent" sequence that could form the left and right
#' sides of the bimera.
#'
#' @param allowOneOff (Optional). A \code{logical(1)}. Default is FALSE.
#' If FALSE, sq will be identified as a bimera if it is one mismatch or indel away
#' from an exact bimera.
#'
#' @param minOneOffParentDistance (Optional). A \code{numeric(1)}. Default is 4.
#' Only sequences with at least this many mismatches to sq are considered as possible
#' "parents" when flagging one-off bimeras. There is no such screen when identifying
#' exact bimeras.
#'
#' @param maxShift (Optional). A \code{numeric(1)}. Default is 16.
#' Maximum shift allowed when aligning sequences to potential "parents".
#'
#' @return \code{logical(1)}.
#' TRUE if sq is a bimera of two of the parents. Otherwise FALSE.
#'
#' @seealso
#' \code{\link{isBimeraDenovo}}, \code{\link{removeBimeraDenovo}}
#'
#' @export
#'
#' @examples
#' derep1 = derepFastq(system.file("extdata", "sam1F.fastq.gz", package="dada2"))
#' sqs1 <- getSequences(derep1)
#' isBimera(sqs1[[20]], sqs1[1:10])
#'
isBimera <- function(sq, parents, allowOneOff=FALSE, minOneOffParentDistance=4, maxShift=16) {
rval <- C_is_bimera(sq, parents, allowOneOff, minOneOffParentDistance,
getDadaOpt("MATCH"), getDadaOpt("MISMATCH"), getDadaOpt("GAP_PENALTY"), maxShift)
return(rval)
}
################################################################################
#' Identify bimeras from collections of unique sequences.
#'
#' This function is a wrapper around \code{\link{isBimera}} for collections of unique
#' sequences (i.e. sequences with associated abundances). Each sequence is evaluated
#' against a set of "parents" drawn from the sequence collection that are sufficiently
#' more abundant than the sequence being evaluated. A logical vector is returned, with
#' an entry for each input sequence indicating whether it was (was not) consistent with
#' being a bimera of those more abundant "parents".
#'
#' @param unqs (Required). A \code{\link{uniques-vector}} or any object that can be coerced
#' into one with \code{\link{getUniques}}.
#'
#' @param minFoldParentOverAbundance (Optional). A \code{numeric(1)}. Default is 2.
#' Only sequences greater than this-fold more abundant than a sequence can be its
#' "parents".
#'
#' @param minParentAbundance (Optional). A \code{numeric(1)}. Default is 8.
#' Only sequences at least this abundant can be "parents".
#'
#' @param allowOneOff (Optional). A \code{logical(1)}. Default is FALSE.
#' If FALSE, sequences that have one mismatch or indel to an exact bimera are also
#' flagged as bimeric.
#'
#' @param minOneOffParentDistance (Optional). A \code{numeric(1)}. Default is 4.
#' Only sequences with at least this many mismatches to the potential bimeric sequence
#' considered as possible "parents" when flagging one-off bimeras. There is
#' no such screen when considering exact bimeras.
#'
#' @param maxShift (Optional). A \code{numeric(1)}. Default is 16.
#' Maximum shift allowed when aligning sequences to potential "parents".
#'
#' @return \code{logical} of length the number of input unique sequences.
#' TRUE if sequence is a bimera of more abundant "parent" sequences. Otherwise FALSE.
#'
#' @param multithread (Optional). Default is FALSE.
#' If TRUE, multithreading is enabled and the number of available threads is automatically determined.
#' If an integer is provided, the number of threads to use is set by passing the argument on to
#' \code{\link{mclapply}}.
#'
#' @param verbose (Optional). \code{logical(1)} indicating verbose text output. Default FALSE.
#'
#' @seealso
#' \code{\link{isBimera}}, \code{\link{removeBimeraDenovo}}
#'
#' @export
#'
#' @importFrom parallel mclapply
#' @importFrom parallel detectCores
#'
#' @examples
#' derep1 = derepFastq(system.file("extdata", "sam1F.fastq.gz", package="dada2"))
#' dada1 <- dada(derep1, err=tperr1, errorEstimationFunction=loessErrfun, selfConsist=TRUE)
#' is.bim <- isBimeraDenovo(dada1)
#' is.bim2 <- isBimeraDenovo(dada1$denoised, minFoldParentOverAbundance = 2, allowOneOff=TRUE)
#'
isBimeraDenovo <- function(unqs, minFoldParentOverAbundance = 2, minParentAbundance = 8, allowOneOff=FALSE, minOneOffParentDistance=4, maxShift=16, multithread=FALSE, verbose=FALSE) {
if(any(duplicated(getSequences(unqs)))) message("Duplicate sequences detected.")
unqs.int <- getUniques(unqs, silence=TRUE) # Internal, keep input unqs for proper return value when duplications
abunds <- unname(unqs.int)
seqs <- names(unqs.int)
seqs.input <- getSequences(unqs)
rm(unqs); gc(verbose=FALSE)
# Parse multithreading argument
if(is.logical(multithread)) {
if(multithread==TRUE) { mc.cores <- getOption("mc.cores", detectCores()) }
} else if(is.numeric(multithread)) {
mc.cores <- multithread
multithread <- TRUE
} else {
warning("Invalid multithread parameter. Running as a single thread.")
multithread <- FALSE
}
loopFun <- function(i, unqs.loop, minFoldParentOverAbundance, minParentAbundance, allowOneOff, minOneOffParentDistance, maxShift) {
sq <- names(unqs.loop)[[i]]
abund <- unqs.loop[[i]]
pars <- names(unqs.loop)[(unqs.loop>(minFoldParentOverAbundance*abund) & unqs.loop>minParentAbundance)]
if(length(pars) < 2) {
return(FALSE)
} else {
isBimera(sq, pars, allowOneOff=allowOneOff, minOneOffParentDistance=minOneOffParentDistance, maxShift=maxShift)
}
}
if(multithread) {
mc.indices <- sample(seq_along(unqs.int), length(unqs.int)) # load balance
bims <- mclapply(mc.indices, loopFun, unqs.loop=unqs.int,
allowOneOff=allowOneOff, minFoldParentOverAbundance=minFoldParentOverAbundance,
minParentAbundance=minParentAbundance,
minOneOffParentDistance=minOneOffParentDistance, maxShift=maxShift,
mc.cores=mc.cores)
bims <- bims[order(mc.indices)]
} else {
bims <- lapply(seq_along(unqs.int), loopFun, unqs.loop=unqs.int,
allowOneOff=allowOneOff, minFoldParentOverAbundance=minFoldParentOverAbundance,
minParentAbundance=minParentAbundance,
minOneOffParentDistance=minOneOffParentDistance, maxShift=maxShift)
}
bims <- unlist(bims)
bims.out <- seqs.input %in% seqs[bims]
names(bims.out) <- seqs.input
if(verbose) message("Identified ", sum(bims.out), " bimeras out of ", length(bims.out), " input sequences.")
return(bims.out)
}
################################################################################
#' Identify bimeras in a sequence table.
#'
#' This function implements a table-specific version of de novo bimera detection. In short,
#' bimeric sequences are flagged on a sample-by-sample basis. Then, a vote is performed for
#' each sequence across all samples in which it appeared. If the sequence is flagged in a
#' sufficiently high fraction of samples, it is identified as a bimera. A logical vector is
#' returned, with an entry for each sequence in the table indicating whether it was identified
#' as bimeric by this consensus procedure.
#'
#' @param seqtab (Required). A sequence table. That is, an integer matrix with colnames
#' corresponding to DNA sequences.
#'
#' @param minSampleFraction (Optional). Default is 0.9.
#' The fraction of samples in which a sequence must be flagged as bimeric in order for it to
#' be classified as a bimera.
#'
#' @param ignoreNNegatives (Optional). Default is 1.
#' The number of unflagged samples to ignore when evaluating whether the fraction of samples
#' in which a sequence was flagged as a bimera exceeds \code{minSampleFraction}. The purpose
#' of this parameter is to lower the threshold at which sequences found in few samples are
#' flagged as bimeras.
#'
#' @param minFoldParentOverAbundance (Optional). Default is 1.5.
#' Only sequences greater than this-fold more abundant than a sequence can be its
#' "parents". Evaluated on a per-sample basis.
#'
#' @param minParentAbundance (Optional). Default is 2.
#' Only sequences at least this abundant can be "parents". Evaluated on a per-sample basis.
#'
#' @param allowOneOff (Optional). Default is FALSE.
#' If FALSE, sequences that have one mismatch or indel to an exact bimera are also
#' flagged as bimeric.
#'
#' @param minOneOffParentDistance (Optional). Default is 4.
#' Only sequences with at least this many mismatches to the potential bimeric sequence
#' considered as possible "parents" when flagging one-off bimeras. There is
#' no such screen when considering exact bimeras.
#'
#' @param maxShift (Optional). Default is 16.
#' Maximum shift allowed when aligning sequences to potential "parents".
#'
#' @param multithread (Optional). Default is FALSE.
#' If TRUE, multithreading is enabled. NOT YET IMPLEMENTED.
#'
#' @param verbose (Optional). Default FALSE.
#' Print verbose text output.
#'
#' @return \code{logical} of length equal to the number of sequences in the input table.
#' TRUE if sequence is identified as a bimera. Otherwise FALSE.
#'
#' @seealso
#' \code{\link{isBimera}}, \code{\link{removeBimeraDenovo}}
#'
#' @export
#'
#' @examples
#' derep1 = derepFastq(system.file("extdata", "sam1F.fastq.gz", package="dada2"))
#' derep2 = derepFastq(system.file("extdata", "sam2F.fastq.gz", package="dada2"))
#' dd <- dada(list(derep1,derep2), err=NULL, errorEstimationFunction=loessErrfun, selfConsist=TRUE)
#' seqtab <- makeSequenceTable(dd)
#' isBimeraDenovoTable(seqtab)
#' isBimeraDenovoTable(seqtab, allowOneOff=TRUE, minSampleFraction=0.5)
#'
isBimeraDenovoTable <- function(seqtab, minSampleFraction=0.9, ignoreNNegatives=1, minFoldParentOverAbundance = 1.5, minParentAbundance = 2, allowOneOff=FALSE, minOneOffParentDistance=4, maxShift=16, multithread=FALSE, verbose=FALSE) {
sqs <- colnames(seqtab)
if(!(is.matrix(seqtab) && is.integer(seqtab) && !is.null(sqs))) {
stop("Input must be a valid sequence table.")
}
if(any(duplicated(sqs))) stop("Duplicate sequences detected in input.")
# Parse multithreading argument
if(is.logical(multithread)) {
if(multithread==TRUE) { RcppParallel::setThreadOptions(numThreads = "auto") }
else { RcppParallel::setThreadOptions(numThreads = 1) }
} else if(is.numeric(multithread)) {
RcppParallel::setThreadOptions(numThreads = multithread)
} else {
warning("Invalid multithread parameter. Running as a single thread.")
RcppParallel::setThreadOptions(numThreads = 1)
}
bimdf <- C_table_bimera2(seqtab, sqs,
minFoldParentOverAbundance, minParentAbundance, allowOneOff, minOneOffParentDistance,
getDadaOpt("MATCH"), getDadaOpt("MISMATCH"), getDadaOpt("GAP_PENALTY"), maxShift)
is.bim <- function(nflag, nsam, minFrac, ignoreN) {
nflag >= nsam || (nflag > 0 && nflag >= (nsam-ignoreN)*minFrac)
}
bims.out <- mapply(is.bim, bimdf$nflag, bimdf$nsam, minFrac=minSampleFraction, ignoreN=ignoreNNegatives)
names(bims.out) <- sqs
if(verbose) message("Identified ", sum(bims.out), " bimeras out of ", length(bims.out), " input sequences.")
return(bims.out)
}
################################################################################
#' Remove bimeras from collections of unique sequences.
#'
#' This function is a convenience interface for chimera removal. Two methods to identify chimeras are
#' supported: Identification from pooled sequences (see \code{\link{isBimeraDenovo}} for details)
#' and identification by consensus across samples (see \code{\link{isBimeraDenovoTable}} for details).
#' Sequence variants identified as bimeric are removed, and a bimera-free collection of unique sequences
#' is returned.
#'
#' @param unqs (Required). A \code{\link{uniques-vector}} or any object that can be coerced
#' into one with \code{\link{getUniques}}. A list of such objects can also be provided.
#'
#' @param method (Optional). Default is "consensus". Only has an effect if a sequence table is provided.
#'
#' If "pooled": The samples in the sequence table are all pooled together for bimera
#' identification (\code{\link{isBimeraDenovo}}).
#'
#' If "consensus": The samples in a sequence table are independently checked for bimeras,
#' and a consensus decision on each sequence variant is made (\code{\link{isBimeraDenovoTable}}).
#'
#' If "per-sample": The samples in a sequence table are independently checked for bimeras,
#' and sequence variants are removed (zeroed-out) from samples independently (\code{\link{isBimeraDenovo}}).
#'
#' @param ... (Optional). Arguments to be passed to \code{\link{isBimeraDenovo}} or \code{\link{isBimeraDenovoTable}}.
#' The documentation of those methods detail the additional algorithmic parameters that can be adjusted.
#'
#' @param verbose (Optional). Default FALSE.
#' Print verbose text output.
#'
#' @return A uniques vector, or an object of matching class if a data.frame or sequence table is provided.
#' A list of such objects is returned if a list of input unqs was provided.
#'
#' @seealso \code{\link{isBimeraDenovoTable}}, \code{\link{isBimeraDenovo}}
#'
#' @export
#'
#' @importFrom methods is
#'
#' @examples
#' derep1 = derepFastq(system.file("extdata", "sam1F.fastq.gz", package="dada2"))
#' dada1 <- dada(derep1, err=tperr1, errorEstimationFunction=loessErrfun, selfConsist=TRUE)
#' out.nobim <- removeBimeraDenovo(dada1)
#' out.nobim <- removeBimeraDenovo(dada1$clustering, method="pooled", minFoldParentOverAbundance = 2)
#'
removeBimeraDenovo <- function(unqs, method = "consensus", ..., verbose=FALSE) {
if(is(unqs, "dada") || is(unqs, "derep") || is(unqs, "data.frame")) { unqs <- list(unqs) }
if(!is.list(unqs)) { unqs <- list(unqs) }
# Should consider removing the list-wise functionality here. Adds unnecessary complexity.
if("tableMethod" %in% names(list(...))) {
stop("DEFUNCT: The tableMethod argument has been replaced by the method argument. Please update your code.")
}
outs <- list()
for(i in seq_along(unqs)) {
# The following code is adapted from getUniques
if(is.integer(unqs[[i]]) && length(names(unqs[[i]])) != 0 && !any(is.na(names(unqs[[i]])))) { # Named integer vector already
bim <- isBimeraDenovo(unqs[[i]], ..., verbose=verbose)
outs[[i]] <- unqs[[i]][!bim]
} else if(is(unqs[[i]], "dada")) { # dada return
bim <- isBimeraDenovo(unqs[[i]], ..., verbose=verbose)
outs[[i]] <- unqs[[i]]$denoised[!bim]
} else if(is(unqs[[i]], "derep")) {
bim <- isBimeraDenovo(unqs[[i]], ..., verbose=verbose)
outs[[i]] <- unqs[[i]]$uniques[!bim]
} else if(is.data.frame(unqs[[i]]) && all(c("sequence", "abundance") %in% colnames(unqs[[i]]))) {
bim <- isBimeraDenovo(unqs[[i]], ..., verbose=verbose)
outs[[i]] <- unqs[[i]][!bim,]
} else if(is.matrix(unqs[[i]]) && !any(is.na(colnames(unqs[[i]])))) { # Tabled sequences
if(method == "pooled") {
bim <- isBimeraDenovo(unqs[[i]], ..., verbose=verbose)
} else if(method == "consensus") {
bim <- isBimeraDenovoTable(unqs[[i]], ..., verbose=verbose)
} else if(method == "per-sample") {
bim <- t(apply(unqs[[i]], 1, function(x) isBimeraDenovo(x, ..., verbose=verbose)))
} else {
stop("Valid values for method: 'pooled', 'consensus', or 'per-sample'")
}
if (method %in% c("pooled", "consensus")) {
outs[[i]] <- unqs[[i]][,!bim,drop=FALSE]
} else if (method %in% c("per-sample")) {
outs[[i]] <- unqs[[i]]
outs[[i]][which(bim, arr.ind=TRUE)] <- 0
cbim <- colSums(outs[[i]])==0
outs[[i]] <- outs[[i]][,!cbim,drop=FALSE]
}
else {
stop("Valid values for method: 'pooled', 'consensus', or 'per-sample'")
}
} else {
stop("Unrecognized format: Requires named integer vector, dada-class, derep-class, sequence matrix, or a data.frame with $sequence and $abundance columns.")
}
}
names(outs) <- names(unqs)
if(length(outs) == 1) {
return(outs[[1]])
}
return(outs)
}
################################################################################
#' Identify sequences that are identical to a more abundant sequence up to an
#' overall shift.
#'
#' This function is a wrapper around isShift for collections of unique
#' sequences. Each unique sequence is evaluated against a set of "parents" drawn from
#' the sequence collection that are more abundant than the sequence being evaluated.
#'
#' @param unqs (Required). A \code{\link{uniques-vector}} or any object that can be coerced
#' into one with \code{\link{getUniques}}.
#'
#' @param minOverlap (Optional). A \code{numeric(1)}. Default is 20.
#' Minimum overlap required to call something a shift.
#'
#' @param flagSubseqs (Optional). A \code{logical(1)}. Default is FALSE.
#' Whether or not to flag strict subsequences as shifts.
#'
#' @return \code{logical} of length the number of input unique sequences.
#' TRUE if sequence is an exact shift of a more abundant sequence. Otherwise FALSE.
#'
#' @param verbose (Optional). \code{logical(1)} indicating verbose text output. Default FALSE.
#'
#' @seealso \code{\link{isBimera}}
#'
#' @export
#'
#' @examples
#' derep1 = derepFastq(system.file("extdata", "sam1F.fastq.gz", package="dada2"))
#' dada1 <- dada(derep1, err=tperr1, errorEstimationFunction=loessErrfun, selfConsist=TRUE)
#' is.shift <- isShiftDenovo(dada1)
#' is.shift <- isShiftDenovo(dada1$denoised, minOverlap=50, verbose=TRUE)
#'
isShiftDenovo <- function(unqs, minOverlap = 20, flagSubseqs=FALSE, verbose=FALSE) {
unqs.int <- getUniques(unqs, silence=TRUE) # Internal, keep input unqs for proper return value when duplications
abunds <- unname(unqs.int)
seqs <- names(unqs.int)
loopFun <- function(sq, abund) {
pars <- seqs[abunds>abund]
if(length(pars) == 0) {
if(verbose) print("No possible parents.")
return(FALSE)
} else {
isShift(sq, pars, minOverlap=minOverlap)
}
}
shifts <- mapply(loopFun, seqs, abunds)
shifts.out <- getSequences(unqs) %in% seqs[shifts]
names(shifts.out) <- getSequences(unqs)
return(shifts.out)
}
# Internal function that determines if two sequences are identical up to a shift
# Uses NW alignment with ends-free gapping
#
# @param sq1 A \code{character(1)}. The first DNA sequence.
#
# @param sq2 A \code{character(1)}. The second DNA sequence.
#
# @param minOverlap (Optional). A \code{numeric(1)}. Default is 20.
# Minimum overlap required to call something a shift.
#
isShiftedPair <- function(sq1, sq2, minOverlap=20, flagSubseqs=FALSE) {
al <- nwalign(sq1, sq2, band=-1)
foo <- C_eval_pair(al[1], al[2])
return((foo["match"] < nchar(sq1) || flagSubseqs) && (foo["match"] < nchar(sq2) || flagSubseqs) &&
foo["match"] >= minOverlap && foo["mismatch"]==0 && foo["indel"]==0)
}
isShift <- function(sq, pars, minOverlap=20, flagSubseqs=FALSE) {
return(any(sapply(pars, function(par) isShiftedPair(sq, par, minOverlap=minOverlap, flagSubseqs=flagSubseqs))))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.