R/error_matrix_multicore.R

Defines functions error_m

#' Reference-based error correction of amplicon sequencing data
#'
#' @description
#' This function calculate the error matrix
#'
#' @details Ruben Garrido-Oter's group, Plant-Microbe interaction, Max Planck Institute for Plant Breeding Research
#' @author  Pengfan Zhang
#'
#' @param fq the path of merged amplicon sequencing reads in fastq format (Ns are not allowed in the reads)
#' @param ref the unique reference sequences of the reference seqeunces, each sequence must be in one line (Ns are not allowed in the sequences)
#' @param sample_size the sampling size of reads to generate the transition matrix
#' @param threads the number of threads used to align the query reads to reference sequences
#' @param ascii ascii characters used to encode phred scores
#' @param ref_seeker the method for finding the candidate error-producing reference sequence for a tag showing identical lowest K-mer distance to multiple references. 1 for the abundance-based method; 2 for the transition probability-based method, 1 in default.
#'
#'
#' @import dada2
#' @import foreach
#' @import doParallel
#'
#' @usage error_m(fq, ref, sample_size, threads, ascii, ref_seeker)
#'
#' @return The output is a 20 by 43 transition probability matrix
#'
#' @noRd
#'

error_m <- function(fq, ref, sample_size=10000, threads, ascii, ref_seeker) {

    x <- NA
    t1 <- Sys.time()

    # read FASTQ file
    if (grepl(".gz", fq, fixed = TRUE)) {
        raw_data <- read_lines(gzfile(fq))
    }
    else {
        raw_data <- read_lines(fq)
    }
    if (sample_size > length(raw_data)/4){
      stop("The sampling size ", sample_size ," exceeds the total number of reads ", length(raw_data)/4 ," in the input file")
    }

    # calculate the initial abundance of reference sequences
    derep1 <- derepFastq(fq)
    ref$abd <- derep1[["uniques"]][match(ref$ref_seq, names(derep1[["uniques"]]))]
    ref <- ref[!is.na(ref$abd), ]
    ref$est_abd <- 0

    # find the reference sequences with highest similarities for each unique sequence
    uniseqs <- names(derep1[["uniques"]])

    findbest <- function(que) {
        # use the kmer distance function implemented in DADA2
        kdist <- function(que, ref, kmer=7) {
            dis <- kmer_dist(que, ref, kmer)
            return(dis)
        }
        # calculate k-mer distances for all sequences
        kdist_res <- unlist(lapply(ref$ref_seq, function(x) kdist(que, x)))
        best_match <- which.min(kdist_res)
        # if there are more than 1 hits showing the same maximum identity,
        # the reference with the highest abundance is chosen as the best reference sequence
        if (length(best_match) > 1) {
          if (ref_seeker==1){
		message("Searching best reference based on abundace information.")
            ref_candidate <- ref[best_match, ]
            ref_best <- ref_candidate[which.max(ref_candidate[, 2]), 1]
          }
          else{ # concatenate the reference sequences
		  message("Searching best reference based on transition probability.")
            ref_best <- paste(ref[best_match, 1], collapse = "/")
          }
        }
        else {
          ref_best <- ref[best_match, 1]
        }
        return(c(que, ref_best))
    }

    #cl <- makeCluster(threads)
    registerDoParallel(cores=threads)
    que2ref <- foreach::foreach(x=uniseqs, .combine='rbind') %dopar% findbest(x)
    stopImplicitCluster()
    que2ref <- as.character(que2ref[, 2])
    message("Finished finding the best reference sequences for each unique sequences.")
    t2 <- Sys.time()
    message(t2-t1)

    # sample raw sequences for transition matrix generation
    sample_label <- sample(seq_len(length(raw_data)/4), sample_size)
    seqs <- raw_data[sample_label*4-2]
    seqs <- toupper(seqs)
    qual <- raw_data[sample_label*4]
    bestref <- que2ref[derep1[["map"]][sample_label]]
    query <- data.frame(seqs=seqs, qual=qual, refs=bestref)
    if (ref_seeker==2 & length(which(grepl('/', query$refs)))>0 ){
      query <- query[-which(grepl('/', query$refs)), ]
    }

    # calculate the transition matrix and error matrix
    trans_matrix <- trans_m(query, ascii)
    error_matrix <- loessErr(trans_matrix)

    # choose the best ref with the highest transition probability for ref_seeker=2
    if (ref_seeker==2 & length(which(grepl('/', que2ref)))>0){
      for (i in which(grepl('/', que2ref))){
        lambda_choose <- c()
        ref_choose <- c()
        for (j in unlist(strsplit(que2ref[i], '/'))){
          align_out <- nwalign(derep1$uniques[i], j)
          Seq1align <- align_out[1]
          Seq2align <- align_out[2]
          Seq1align <- unlist(strsplit(Seq1align, ""))
          Seq2align <- unlist(strsplit(Seq2align, ""))
          align_pairs <- paste(Seq2align, Seq1align, sep="2")
          align_pairs <- align_pairs[! endsWith(align_pairs, "-")]
          qual <- round(derep1[["quals"]][match(seq, names(derep1[["uniques"]])), ], digits=0)
          qual <- as.character(qual[!is.na(qual)])
          retain_index <- which(!align_pairs %in% c("A2A", "C2C", "G2G", "T2T"))
          align_pairs <- align_pairs[retain_index]
          qual <- qual[retain_index]
          data4lambda <- data.frame(align_pairs, qual)
          tp <- apply(data4lambda, 1, function(x) error_matrix[x[1], x[2]])
          lambda0 <- prod(tp)
          ref_choose <- c(ref_choose, j)
          lambda_choose <- c(lambda_choose, lambda0)
        }
        que2ref[i] <- ref_choose[which.max(lambda_choose)]
      }
    }
    
    derep1[["bestref"]] <- que2ref

    error_ref_matrix <- list(ref=ref, err=error_matrix, derep=derep1, total_reads=length(raw_data)/4)
    return(error_ref_matrix)

}
PengfanZhang/Rbec documentation built on Sept. 13, 2023, 1 a.m.