##' Annotate amplified sequence variants (ASVs) with taxa labels
##' derived from a BLAST search.
##'
##' Based on a BLAST search taxonomic labels are assigned to
##' ASVs. Currently supported taxonomy levels are c("superkingdom",
##' "phylum", "class", "order", "family", "genus", "species"). For
##' each taxonomic level unique best taxa (based on bitscores) are
##' reported. If no unique best taxon exists at a particular level NA
##' is returned for this. The function combines evidence for taxnomic
##' annotations in case of seperate HSPs for (concatenated) non-merged
##' sequences.
##'
##' @title blastTaxAnnot
##' @param MA A MultiAmplicon for which taxa shoudl be annotated. This
##' should contain a \code{sequenceTableNoChime} slot, meaning
##' that the MultiAmplicon pipeline needs to be followed to that
##' point.
##' @param db The blast database. Either a full path or the path
##' relative to the you data base directory set in an
##' environmental variable.
##' @param num_threads The number of threads used for the blast
##' search.
##' @param negative_gilist A file containing NCBI GI numbers to
##' exclude from blast searches.
##' @param infasta A fasta file generated for the blast searche, a
##' temporary file in the respective R temporary folder by
##' default.
##' @param outblast A blast tabular output file generated by the blast
##' searche, a temporary file in the respective R temporary folder
##' by default.
##' @param taxonSQL An SQL file generated by the package
##' \code{\link[taxonomizr]{taxonomizr}}.
##' @param ... String (of other options) passed to blastn (see blastn
##' -help in the terminal)
##' @return A MultiAmplicon object with the taxonTable slot filled
##' @importFrom taxonomizr getTaxonomy
##' @importFrom utils read.csv
##' @importClassesFrom phyloseq taxonomyTable
##' @importFrom data.table as.data.table
##' @author Emanuel Heitlinger
##' @export
blastTaxAnnot <- function (MA, db="nt/nt",
num_threads= getOption("mc.cores", 1L),
negative_gilist = system.file("extdata", "uncultured.gi",
package = "MultiAmplicon"),
infasta=paste0(tempfile(), ".fasta"),
outblast=paste0(tempfile(), ".blt"),
taxonSQL="/SAN/db/taxonomy/taxonomizr.sql",
...
) {
SEQ <- getSequencesFromTable(MA)
if(!file.exists(infasta)){
Ssplit <- lapply(seq_along(SEQ), function (i) {
if(!is.null(SEQ[[i]])){
ampSEQ <- SEQ[[i]]
SEQnames <- paste("A", i, "S", 1:length(ampSEQ), "R_", sep="_")
names(ampSEQ) <- SEQnames
Ssplit <- strsplit(ampSEQ, "NNNNNNNNNN")
unlist(Ssplit)
} else (NULL)
})
Biostrings::writeXStringSet(DNAStringSet(unlist(Ssplit)),
infasta)
message("wrote file ", infasta,
" storing input sequences for blast\n")
} else {
message("file ", infasta, " exists, using existing file!",
" To extract input sequences for blast again delete the file\n")
}
if(!file.exists(outblast)){
## We blast this file against NR with a gi-list excluding all
## uncultured sequences
## create the gi-list as a download from an NCBI Entrez Nucleotide
## search '"environmental samples"[organism] OR metagenomes[orgn]'
## or via command line: esearch -db nucleotide -query '"environmental
## samples"[organism] OR metagenomes[orgn]' | efetch -format gi -mode
## text > /SAN/db/blastdb/uncultured_gilist.txt
command <- paste("blastn",
"-negative_gilist", negative_gilist,
"-query", infasta,
"-db", db,
"-evalue", 1e-15,
"-num_threads", num_threads,
"-max_hsps", 1,
"-max_target_seqs 25",
"-out", outblast,
"-outfmt", "'10 qaccver saccver pident length mismatch gapopen",
"qstart qend sstart send evalue bitscore staxid'",
...)
message("RUNNING BLAST with command:\n",
command)
system(command)
cat("\nFINISHED running blast\n")
} else {
message("file ", outblast, " exists, using existing file!",
" To run blast again delete file")
}
## we read the blast file
blast <- read.csv(outblast, header=FALSE)
names(blast) <- c("query", "subject", "pident", "length", "mismatch",
"gapopen", "qstart", "qend", "sstart", "send", "evalue",
"bitscore", "staxid")
blastT <- as.data.table(blast)
blastT$staxid <- as.character(blastT$staxid)
blastT$ampProd <- gsub("_R_\\d+", "", blastT$query)
## retain only the maximal bitscore for each query and staxid
blastT$qtaxid <- as.factor(paste(blastT$query, blastT$staxid, sep="|"))
blastT <- blastT[blastT[, .I[bitscore == max(bitscore)], by=qtaxid][, V1]]
## unique in case of multiple best hits in one taxID
blastT <- blastT[!duplicated(blastT$qtaxid)]
blastT <- blastT[blastT[, .I[bitscore == max(bitscore)], by=qtaxid][, V1]]
## for each amplification product get the sum of the bitscores for
## forward and reverse query (for non-merged sequences that is)
blt <- blastT[, list(bitsum = sum(bitscore)), by=c("ampProd", "staxid")]
blast.tax <- getTaxonomy(unique(blt$staxid), taxonSQL)
blast.tax <- as.data.table(blast.tax, keep.rownames="staxid")
blast.tax$staxid <- gsub("\\s*", "", blast.tax$staxid)
blt <- merge(blt, blast.tax, by="staxid", all=TRUE)
## get the best bitscore for each amplification product
blt <- blt[blt[, .I[bitsum == max(bitsum)], by=ampProd][, V1]]
get.unique.tax <- function(x, Nsupport=1) {
## unique taxa at that level excluding potential NA's
agnostic <- as.character(x)
taxa <- agnostic[!is.na(agnostic)]
ux <- unique(taxa)
## but return NA if they are not unique
if(length(taxa)>=Nsupport && ## number of supporting annotations
length(ux)==1){ ## has to be a unique answer
return(ux)
} else {as.character(NA)}
}
## now if multiple taxa with bitscores are given, set NA at any
## level
B <- blt[,
{list(superkingdom = get.unique.tax(superkingdom),
phylum = get.unique.tax(phylum),
class = get.unique.tax(class),
order = get.unique.tax(order),
family = get.unique.tax(family),
genus = get.unique.tax(genus),
species = get.unique.tax(species))
},
by=ampProd]
B$amplicon <- gsub("A_(\\d+)_S_(\\d+)(_R_)?", "\\1", B$ampProd)
B$sequence <- gsub("A_(\\d+)_S_(\\d+)(_R_)?", "\\2", B$ampProd)
annot.l <- by(B, B$amplicon, function (x){
taxDF <- as.data.frame(x)
n.amp <- as.numeric(unique(x$amplicon))
n.seq <- as.numeric(x$sequence)
rownames(taxDF) <- SEQ[[n.amp]][n.seq]
taxDF <- taxDF[SEQ[[n.amp]],
c("superkingdom", "phylum", "class", "order", "family",
"genus", "species")]
rownames(taxDF) <- SEQ[[n.amp]]
as.matrix(taxDF)
})
names(annot.l) <- names(SEQ)[as.numeric(names(annot.l))]
annot.l <- lapply(annot.l[names(SEQ)], function (x) x) ## drop array
taxTab.l <- lapply(seq_along(annot.l), function (i) {
if (!is.null(annot.l[[i]])) {
new("taxonomyTable", cbind(annot.l[[i]],
amplicon=names(annot.l)[i]))
} else {NULL}
})
names(taxTab.l) <- names(getPrimerPairsSet(MA))
MultiAmplicon(.Data=MA@.Data,
PairedReadFileSet = getPairedReadFileSet(MA),
PrimerPairsSet = getPrimerPairsSet(MA),
sampleData = getSampleData(MA),
stratifiedFilesF = getStratifiedFilesF(MA, dropEmpty=FALSE),
stratifiedFilesR = getStratifiedFilesR(MA, dropEmpty=FALSE),
rawCounts = getRawCounts(MA),
derepF = getDerepF(MA, dropEmpty=FALSE),
derepR = getDerepR(MA, dropEmpty=FALSE),
dadaF = getDadaF(MA, dropEmpty=FALSE),
dadaR = getDadaR(MA, dropEmpty=FALSE),
mergers = getMergers(MA, dropEmpty=FALSE),
sequenceTable = getSequenceTable(MA),
sequenceTableNoChime = getSequenceTableNoChime(MA),
taxonTable = taxTab.l
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.