#' Reference-based error correction of amplicon sequencing data
#'
#' @description
#' This function iterates the error matrix till reaching the stable stage
#'
#' @details Ruben Garrido-Oter's group, Plant-Microbe interaction, Max Planck Institute for Plant Breeding Research
#' @author Pengfan Zhang
#'
#' @param fq the path of the fastq file (Ns are not allowed in the reads)
#' @param derep the derepliacted reads by dada2 function
#' @param ref the reference sequences, each sequence should take up one line (Ns are not allowed in the reads)
#' @param lambda_out the matrix containg lambda value and pvalue from the former iteration
#' @param sampling_size the subsampling size of the reads
#' @param ascii ascii characters used to encode phred scores
#' @param min_E the minimum expectation value of the Possion distribution for detecting paralogs within the same strain
#' @param min_P the P value cutoff for identifying errouneous reads
#' @param max_diff_abs the maximum absolute difference in number of corrected reads between two iterations, together with max_diff_ratio, before jumping out of the iiteration
#' @param max_diff_ratio the maximum difference in the percentages of corrected reads between two iterations
#'
#' @usage consis_err(fq, derep, ref, lambda_out, sampling_size, ascii, min_E=0.05, min_P=1e-40, max_diff_abs=100, max_diff_ratio=0.01)
#'
#' @return Returns the final files
#'
#' @noRd
#'
consis_err <- function(fq, derep, ref, lambda_out, sampling_size, ascii, min_E=0.05, min_P=1e-40, max_diff_abs=100, max_diff_ratio=0.01){
raw_data <- read_lines(gzfile(fq))
n <- 1
repeat {
message("Start ", n ," iterations")
# calculate total number of corrected reads and total abundances from the previous iteration
previous_round <- which(lambda_out$E>=min_E | lambda_out$pval>=min_P)
previous_sum <- sum(lambda_out[previous_round, "Obs_abd"])
# update abundances of reference sequences from erroneous reads asigned to them
est_abd <- numeric(0)
for (i in seq_along(unique(lambda_out$Ref_ID))) {
# find all sequences with high expectation and P-val which could have been generated by a given reference sequence
idx <-which((lambda_out$E>=min_E | lambda_out$pval>=min_P) & lambda_out$Ref_ID==unique(lambda_out$Ref_ID)[i])
# calculate estimated abundances of a given reference sequence by adding up all the abundances of assigned erroneous sequences
ref_i_abd <- sum(lambda_out$Obs_abd[idx])
est_abd <- c(est_abd, ref_i_abd)
}
# update estimated abundances for each reference sequence
ref$est_abd[unique(lambda_out$Ref_ID)] <- est_abd
message(ref$est_abd)
# extract raw sequences that belong to the error-corrected unique sequences found in the previous round
current_round <- which(derep[["map"]] %in% previous_round)
# sample the extracted sequences for transition matrix generation
current_sample <- sample(current_round, sampling_size)
seqs <- raw_data[current_sample*4-2]
seqs <- toupper(seqs)
quals <- raw_data[current_sample*4]
bestref <- derep[["bestref"]][derep[["map"]][current_sample]]
# calculate transition matrix and update error matrix
query <- data.frame(seqs=seqs, qual=quals, ref=bestref)
trans_matrix <- trans_m(query, ascii)
error_matrix <- loessErr(trans_matrix)
lambda_out <- abd_prob(derep, ref, error_matrix)
# calculate total number of corrected reads and total abundances from the current iteration
current_round <- which(lambda_out$E>=min_E | lambda_out$pval>=min_P)
current_sum <- sum(lambda_out[current_round, "Obs_abd"])
# check for convergence
diff_abs <- abs(previous_sum - current_sum)
diff_ratio <- diff_abs/previous_sum
if (diff_abs < max_diff_abs & diff_ratio <= max_diff_ratio) {
message(diff_abs)
message(diff_ratio)
message("The model reached consistency after ", n ," iteration(s)")
final_abd <- numeric(0)
for (i in seq_along(unique(lambda_out$Ref_ID))){
# find all sequences with high expectation and P-val which could have been generated by a given reference sequence
idx <- which((lambda_out$E>=min_E | lambda_out$pval>=min_P) & lambda_out$Ref_ID==unique(lambda_out$Ref_ID)[i])
# calculate final abundances of a given reference sequence by adding up all the abundances of assigned erroneous sequences
ref_i_abd <- sum(lambda_out$Obs_abd[idx])
final_abd <- c(final_abd, ref_i_abd)
}
out_table <- data.frame(Ref_seq=ref[unique(lambda_out$Ref_ID), 1], Corrected_Abundance=final_abd)
lambda_out$Total_corrected <- sum(lambda_out[which(lambda_out$E>=min_E | lambda_out$pval>=min_P), "Obs_abd"])
lambda_out$Corrected <- "No"
lambda_out$Corrected[which(lambda_out$E>=min_E | lambda_out$pval>=min_P)] <- "Yes"
lambda_out$Ref_ID <- ref[lambda_out$Ref_ID, 1]
out_file <- list(lambda_out=lambda_out, err=error_matrix, table=out_table, total_corrected=sum(final_abd))
return(out_file)
break
}
else {
message(diff_abs)
message(diff_ratio)
n <- n+1
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.