#' Reference-based error correction of amplicon sequencing data
#'
#' @description
#' This function calculate the error matrix
#'
#' @details Ruben Garrido-Oter's group, Plant-Microbe interaction, Max Planck Institute for Plant Breeding Research
#' @author Pengfan Zhang
#'
#' @param fq the path of merged amplicon sequencing reads in fastq format (Ns are not allowed in the reads)
#' @param ref the unique reference sequences of the reference seqeunces, each sequence must be in one line (Ns are not allowed in the sequences)
#' @param sample_size the sampling size of reads to generate the transition matrix
#' @param threads the number of threads used to align the query reads to reference sequences
#' @param ascii ascii characters used to encode phred scores
#' @param ref_seeker the method for finding the candidate error-producing reference sequence for a tag showing identical lowest K-mer distance to multiple references. 1 for the abundance-based method; 2 for the transition probability-based method, 1 in default.
#'
#'
#' @import dada2
#' @import foreach
#' @import doParallel
#'
#' @usage error_m(fq, ref, sample_size, threads, ascii, ref_seeker)
#'
#' @return The output is a 20 by 43 transition probability matrix
#'
#' @noRd
#'
error_m <- function(fq, ref, sample_size=10000, threads, ascii, ref_seeker) {
x <- NA
t1 <- Sys.time()
# read FASTQ file
if (grepl(".gz", fq, fixed = TRUE)) {
raw_data <- read_lines(gzfile(fq))
}
else {
raw_data <- read_lines(fq)
}
if (sample_size > length(raw_data)/4){
stop("The sampling size ", sample_size ," exceeds the total number of reads ", length(raw_data)/4 ," in the input file")
}
# calculate the initial abundance of reference sequences
derep1 <- derepFastq(fq)
ref$abd <- derep1[["uniques"]][match(ref$ref_seq, names(derep1[["uniques"]]))]
ref <- ref[!is.na(ref$abd), ]
ref$est_abd <- 0
# find the reference sequences with highest similarities for each unique sequence
uniseqs <- names(derep1[["uniques"]])
findbest <- function(que) {
# use the kmer distance function implemented in DADA2
kdist <- function(que, ref, kmer=7) {
dis <- kmer_dist(que, ref, kmer)
return(dis)
}
# calculate k-mer distances for all sequences
kdist_res <- unlist(lapply(ref$ref_seq, function(x) kdist(que, x)))
best_match <- which.min(kdist_res)
# if there are more than 1 hits showing the same maximum identity,
# the reference with the highest abundance is chosen as the best reference sequence
if (length(best_match) > 1) {
if (ref_seeker==1){
message("Searching best reference based on abundace information.")
ref_candidate <- ref[best_match, ]
ref_best <- ref_candidate[which.max(ref_candidate[, 2]), 1]
}
else{ # concatenate the reference sequences
message("Searching best reference based on transition probability.")
ref_best <- paste(ref[best_match, 1], collapse = "/")
}
}
else {
ref_best <- ref[best_match, 1]
}
return(c(que, ref_best))
}
#cl <- makeCluster(threads)
registerDoParallel(cores=threads)
que2ref <- foreach::foreach(x=uniseqs, .combine='rbind') %dopar% findbest(x)
stopImplicitCluster()
que2ref <- as.character(que2ref[, 2])
message("Finished finding the best reference sequences for each unique sequences.")
t2 <- Sys.time()
message(t2-t1)
# sample raw sequences for transition matrix generation
sample_label <- sample(seq_len(length(raw_data)/4), sample_size)
seqs <- raw_data[sample_label*4-2]
seqs <- toupper(seqs)
qual <- raw_data[sample_label*4]
bestref <- que2ref[derep1[["map"]][sample_label]]
query <- data.frame(seqs=seqs, qual=qual, refs=bestref)
if (ref_seeker==2 & length(which(grepl('/', query$refs)))>0 ){
query <- query[-which(grepl('/', query$refs)), ]
}
# calculate the transition matrix and error matrix
trans_matrix <- trans_m(query, ascii)
error_matrix <- loessErr(trans_matrix)
# choose the best ref with the highest transition probability for ref_seeker=2
if (ref_seeker==2 & length(which(grepl('/', que2ref)))>0){
for (i in which(grepl('/', que2ref))){
lambda_choose <- c()
ref_choose <- c()
for (j in unlist(strsplit(que2ref[i], '/'))){
align_out <- nwalign(derep1$uniques[i], j)
Seq1align <- align_out[1]
Seq2align <- align_out[2]
Seq1align <- unlist(strsplit(Seq1align, ""))
Seq2align <- unlist(strsplit(Seq2align, ""))
align_pairs <- paste(Seq2align, Seq1align, sep="2")
align_pairs <- align_pairs[! endsWith(align_pairs, "-")]
qual <- round(derep1[["quals"]][match(seq, names(derep1[["uniques"]])), ], digits=0)
qual <- as.character(qual[!is.na(qual)])
retain_index <- which(!align_pairs %in% c("A2A", "C2C", "G2G", "T2T"))
align_pairs <- align_pairs[retain_index]
qual <- qual[retain_index]
data4lambda <- data.frame(align_pairs, qual)
tp <- apply(data4lambda, 1, function(x) error_matrix[x[1], x[2]])
lambda0 <- prod(tp)
ref_choose <- c(ref_choose, j)
lambda_choose <- c(lambda_choose, lambda0)
}
que2ref[i] <- ref_choose[which.max(lambda_choose)]
}
}
derep1[["bestref"]] <- que2ref
error_ref_matrix <- list(ref=ref, err=error_matrix, derep=derep1, total_reads=length(raw_data)/4)
return(error_ref_matrix)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.