R/annotateTaxa.R

##' Annotate amplified sequence variants (ASVs) with taxa labels
##' derived from a BLAST search.
##'
##' Based on a BLAST search taxonomic labels are assigned to
##' ASVs. Currently supported taxonomy levels are c("superkingdom",
##' "phylum", "class", "order", "family", "genus", "species"). For
##' each taxonomic level unique best taxa (based on bitscores) are
##' reported. If no unique best taxon exists at a particular level NA
##' is returned for this. The function combines evidence for taxnomic
##' annotations in case of seperate HSPs for (concatenated) non-merged
##' sequences.
##' 
##' @title blastTaxAnnot
##' @param MA A MultiAmplicon for which taxa shoudl be annotated. This
##'     should contain a \code{sequenceTableNoChime} slot, meaning
##'     that the MultiAmplicon pipeline needs to be followed to that
##'     point.
##' @param db The blast database. Either a full path or the path
##'     relative to the you data base directory set in an
##'     environmental variable.
##' @param num_threads The number of threads used for the blast
##'     search.
##' @param negative_gilist A file containing NCBI GI numbers to
##'     exclude from blast searches.
##' @param infasta A fasta file generated for the blast searche, a
##'     temporary file in the respective R temporary folder by
##'     default.
##' @param outblast A blast tabular output file generated by the blast
##'     searche, a temporary file in the respective R temporary folder
##'     by default.
##' @param taxonSQL An SQL file generated by the package
##'     \code{\link[taxonomizr]{taxonomizr}}.
##' @param ... String (of other options) passed to blastn (see blastn
##'     -help in the terminal)
##' @return A MultiAmplicon object with the taxonTable slot filled
##' @importFrom taxonomizr getTaxonomy
##' @importFrom utils read.csv
##' @importClassesFrom phyloseq taxonomyTable
##' @importFrom data.table as.data.table
##' @author Emanuel Heitlinger
##' @export


blastTaxAnnot <- function (MA, db="nt/nt",
                           num_threads= getOption("mc.cores", 1L),
                           negative_gilist = system.file("extdata", "uncultured.gi",
                                                         package = "MultiAmplicon"),
                           infasta=paste0(tempfile(), ".fasta"),
                           outblast=paste0(tempfile(), ".blt"),
                           taxonSQL="/SAN/db/taxonomy/taxonomizr.sql",
                           ...
                              ) {

    SEQ <- getSequencesFromTable(MA)    
    if(!file.exists(infasta)){
        Ssplit <- lapply(seq_along(SEQ), function (i) {
            if(!is.null(SEQ[[i]])){
                ampSEQ <- SEQ[[i]]
                SEQnames <- paste("A", i, "S", 1:length(ampSEQ), "R_", sep="_")
                names(ampSEQ) <- SEQnames
                Ssplit <- strsplit(ampSEQ, "NNNNNNNNNN")
                unlist(Ssplit)
            } else (NULL)
        })
        Biostrings::writeXStringSet(DNAStringSet(unlist(Ssplit)),
                                    infasta)
        message("wrote file ", infasta, 
                " storing input sequences for blast\n")
    } else {
        message("file ", infasta, " exists, using existing file!",
                " To extract input sequences for blast  again delete the file\n")
    }

    if(!file.exists(outblast)){
        ## We blast this file against NR with a gi-list excluding all
        ## uncultured sequences
        ## create the gi-list as a download from an NCBI Entrez Nucleotide
        ## search '"environmental samples"[organism] OR metagenomes[orgn]'
        ## or via command line: esearch -db nucleotide -query '"environmental
        ## samples"[organism] OR metagenomes[orgn]' | efetch -format gi -mode
        ## text > /SAN/db/blastdb/uncultured_gilist.txt
        command <- paste("blastn", 
                         "-negative_gilist",  negative_gilist,
                         "-query", infasta,
                         "-db", db,
                         "-evalue",  1e-15,
                         "-num_threads", num_threads,
                         "-max_hsps", 1,
                         "-max_target_seqs 25",
                         "-out", outblast,
                         "-outfmt", "'10 qaccver saccver pident length mismatch gapopen",
                         "qstart qend sstart send evalue bitscore staxid'",
                         ...)
        message("RUNNING BLAST with command:\n",
                command)
        system(command)
        cat("\nFINISHED running blast\n")
    } else {
        message("file ", outblast, " exists, using existing file!", 
                " To run blast again delete file")
    }
    ## we read the blast file  
    blast <- read.csv(outblast, header=FALSE)
    names(blast) <- c("query", "subject", "pident", "length", "mismatch",
                      "gapopen", "qstart", "qend", "sstart", "send", "evalue",
                      "bitscore", "staxid")
    blastT <- as.data.table(blast)
    blastT$staxid <- as.character(blastT$staxid)
    blastT$ampProd <- gsub("_R_\\d+", "", blastT$query)

    
    ## retain only the maximal bitscore for each query and staxid
    blastT$qtaxid <- as.factor(paste(blastT$query, blastT$staxid, sep="|"))
    blastT <- blastT[blastT[, .I[bitscore == max(bitscore)], by=qtaxid][, V1]]

    ## unique in case of multiple best hits in one taxID
    blastT <- blastT[!duplicated(blastT$qtaxid)]
    
    blastT <- blastT[blastT[, .I[bitscore == max(bitscore)], by=qtaxid][, V1]]

    ## for each amplification product get the sum of the bitscores for
    ## forward and reverse query (for non-merged sequences that is)
    blt <- blastT[, list(bitsum = sum(bitscore)), by=c("ampProd", "staxid")]

    blast.tax <- getTaxonomy(unique(blt$staxid), taxonSQL)
    blast.tax <- as.data.table(blast.tax, keep.rownames="staxid")
    blast.tax$staxid <- gsub("\\s*", "", blast.tax$staxid)
    blt <- merge(blt, blast.tax, by="staxid", all=TRUE)
                    
    ## get the best bitscore for each amplification product
    blt <- blt[blt[, .I[bitsum == max(bitsum)], by=ampProd][, V1]]

    get.unique.tax <- function(x, Nsupport=1) {
        ## unique taxa at that level excluding potential NA's 
        agnostic <- as.character(x)
        taxa <- agnostic[!is.na(agnostic)]
        ux <- unique(taxa)
        ## but return NA if they are not unique
        if(length(taxa)>=Nsupport && ## number of supporting annotations
           length(ux)==1){ ## has to be a unique answer
            return(ux)
        } else {as.character(NA)}
    }

    ## now if multiple taxa with bitscores are given, set NA at any
    ## level
    B <- blt[,
    {list(superkingdom = get.unique.tax(superkingdom),
          phylum = get.unique.tax(phylum),
          class = get.unique.tax(class),
          order = get.unique.tax(order),
          family = get.unique.tax(family),
          genus = get.unique.tax(genus),
          species = get.unique.tax(species))
    },
    by=ampProd]

    B$amplicon <- gsub("A_(\\d+)_S_(\\d+)(_R_)?", "\\1", B$ampProd)
    B$sequence <- gsub("A_(\\d+)_S_(\\d+)(_R_)?", "\\2", B$ampProd)

    annot.l <- by(B, B$amplicon, function (x){
        taxDF <- as.data.frame(x)
        n.amp <- as.numeric(unique(x$amplicon))
        n.seq <- as.numeric(x$sequence)
        rownames(taxDF) <- SEQ[[n.amp]][n.seq]
        taxDF <- taxDF[SEQ[[n.amp]],
                       c("superkingdom", "phylum", "class", "order", "family",
                         "genus", "species")]
        rownames(taxDF) <- SEQ[[n.amp]]
        as.matrix(taxDF)
    })
    names(annot.l) <- names(SEQ)[as.numeric(names(annot.l))]
    annot.l <- lapply(annot.l[names(SEQ)], function (x) x) ## drop array
    taxTab.l <- lapply(seq_along(annot.l), function (i) {
        if (!is.null(annot.l[[i]])) {
            new("taxonomyTable", cbind(annot.l[[i]],
                                       amplicon=names(annot.l)[i]))
        } else {NULL}
    })
    names(taxTab.l) <- names(getPrimerPairsSet(MA))
    MultiAmplicon(.Data=MA@.Data,
                  PairedReadFileSet = getPairedReadFileSet(MA),
                  PrimerPairsSet = getPrimerPairsSet(MA),
                  sampleData = getSampleData(MA),
                  stratifiedFilesF = getStratifiedFilesF(MA, dropEmpty=FALSE),
                  stratifiedFilesR = getStratifiedFilesR(MA, dropEmpty=FALSE),
                  rawCounts = getRawCounts(MA),
                  derepF = getDerepF(MA, dropEmpty=FALSE),
                  derepR = getDerepR(MA, dropEmpty=FALSE),
                  dadaF = getDadaF(MA, dropEmpty=FALSE),
                  dadaR = getDadaR(MA, dropEmpty=FALSE),
                  mergers = getMergers(MA, dropEmpty=FALSE),
                  sequenceTable = getSequenceTable(MA),
                  sequenceTableNoChime = getSequenceTableNoChime(MA),
                  taxonTable = taxTab.l
                  )
}
derele/MultiAmplicon documentation built on Dec. 11, 2024, 12:09 p.m.