#' Reference-based error correction of amplicon sequencing data
#'
#' @description
#' This function corrects amplicon sequencing reads from synthetic communities where the reference sequences are known a priori.
#'
#' @details Ruben Garrido-Oter's group, Plant-Microbe Interaction, Max Planck Institute for Plant Breeding Research
#' @author Pengfan Zhang
#'
#' @param fastq the path of the fastq file containing merged amplicon sequencing reads (Ns are not allowed in the reads)
#' @param reference the path of the unique reference sequences, each sequence must be on a single line (Ns are not allowed in the sequences)
#' @param outdir the output directory (created by the function if it does not already exist)
#' @param threads the number of threads used, default 1
#' @param sampling_size the sampling size for calculating the error matrix, default 5000
#' @param ascii the ASCII offset used to encode Phred quality scores (33 or 64), default 33
#' @param min_cont_obs_abd the minimum observed abundance of unique tags for detecting contamination sequences, default 200
#' @param min_cont_abd the minimum relative abundance of unique tags for detecting contamination sequences that cannot be corrected by any of the references, default 0.03
#' @param min_E the minimum expectation of the Poisson distribution for the identification of paralogues, default 0.05
#' @param min_P the minimum P-value threshold of the Poisson distribution to correct a read, default 1e-40
#' @param ref_seeker the method for finding the candidate error-producing reference sequence for a tag showing the same lowest k-mer distance to multiple references: 1 for the abundance-based method, 2 for the transition-probability-based method, default 1
#' @param cn the path to the copy number table documenting the copy number of the marker gene in each strain (header inclusive); otherwise Rbec normalizes the abundance based on the internally inferred copy number, which tends to underestimate the true copy number, default NULL
#'
#' @import readr
#' @import dada2
#' @import doParallel
#' @import foreach
#'
#' @usage Rbec(fastq, reference, outdir, threads=1, sampling_size=5000, ascii=33, min_cont_obs_abd=200, min_cont_abd=0.03, min_E=0.05, min_P=1e-40, ref_seeker=1, cn=NULL)
#' @examples
#' fastq <- system.file("extdata", "test_raw_merged_reads.fastq.gz", package = "Rbec")
#'
#' ref <- system.file("extdata", "test_ref.fasta", package = "Rbec")
#'
#' Rbec(fastq=fastq, reference=ref, outdir=tempdir(), threads=1, sampling_size=500, ascii=33)
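#'
#' # A hedged sketch (not run): normalizing strain abundances with a
#' # user-supplied copy number table; "my_copy_numbers.tsv" is a hypothetical
#' # tab-delimited file formatted as described for the 'cn' parameter.
#' \dontrun{
#' Rbec(fastq=fastq, reference=ref, outdir=tempdir(), threads=1,
#'      sampling_size=500, ascii=33, cn="my_copy_numbers.tsv")
#' }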
#'
#' @return lambda_final.out the lambda value and pvalue of the Poisson distribution for each read
#' @return error_matrix_final.out the error matrix in the final iteration
#' @return strain_table.txt the strain composition of the sample
#' @return strain_table_normalized.txt the copy-number-normalized strain composition of the sample; normalized by the user-supplied copy number table if provided, otherwise by the internally inferred copy numbers
#' @return contamination_seq.fna the potential sequences generated by contaminants
#' @return rbec.log percentage of corrected reads, which can be used to predict contaminated samples
#' @return paralogue_seq.fna paralogue sequences detected in each strain in addition to the reference sequence provided
#'
#' @export
#'
Rbec <- function(fastq, reference, outdir, threads=1, sampling_size=5000, ascii=33, min_cont_obs_abd=200, min_cont_abd=0.03, min_E=0.05, min_P=1e-40, ref_seeker=1, cn=NULL){
t1 <- Sys.time()
# parse input parameters and read reference sequences
tryCatch(
para_check <- as.numeric(threads),
error = function(e) {stop("parameter 'threads' should be an integer!")},
warning = function(w) {stop("parameter 'threads' should be an integer!")}
)
tryCatch(
para_check <- as.numeric(sampling_size),
error = function(e) {stop("parameter 'sampling_size' should be an integer!")},
warning = function(w) {stop("parameter 'sampling_size' should be an integer!")}
)
tryCatch(
para_check <- as.numeric(ascii),
error = function(e) {stop("parameter 'ascii' should be an integer!")},
warning = function(w) {stop("parameter 'ascii' should be an integer!")}
)
ref <- read_lines(reference)
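# the reference FASTA is expected in two-line records: headers on odd lines, sequences on even lines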
ref_name <- data.frame(name=ref[which(seq_along(ref)%%2==1)], seq=toupper(ref[which(seq_along(ref)%%2==0)]), stringsAsFactors=FALSE)
ref_name$name <- sub(">", "", ref_name$name)
ref <- data.frame(ref_seq=toupper(ref[which(seq_along(ref)%%2==0)]), stringsAsFactors=FALSE)
# calculate the error matrix
error_ref_matrix <- error_m(fastq, ref, sampling_size, threads, ascii, ref_seeker)
error_matrix <- error_ref_matrix[["err"]]
ref <- error_ref_matrix[["ref"]]
derep <- error_ref_matrix[["derep"]]
message("Finished calculating the transition and error matrices.")
# calculate abundance probabilities
lambda_out <- abd_prob(derep, ref, error_matrix)
final_list <- consis_err(fastq, derep, ref, lambda_out, sampling_size, ascii, min_E, min_P)
straintab <- final_list[["table"]]
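# map reference sequences in the strain table back to their FASTA headers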
straintab[, 1] <- ref_name$name[match(straintab[, 1], ref_name[, 2])]
# normalize the abundance if copy number is available
if (!is.null(cn)){
cp_n <- as.data.frame(read_delim(cn, delim="\t", col_names=FALSE), stringsAsFactors = FALSE)
straintab_norm <- straintab
straintab_norm$Corrected_Abundance <- round(straintab_norm$Corrected_Abundance/cp_n[match(straintab_norm$Ref_seq, cp_n[, 1]), 2])
}
lambda_out <- final_list[["lambda_out"]]
lambda_out[, 2] <- ref_name$name[match(lambda_out[, 2], ref_name[, 2])]
# identify potential contaminants: reads that cannot be corrected and whose observed abundance is at least min_cont_obs_abd or whose relative abundance is at least min_cont_abd
contamination <- which(final_list$lambda_out$Corrected=="No" &
(final_list$lambda_out$Obs_abd >= min_cont_obs_abd | final_list$lambda_out$Obs_abd/error_ref_matrix$total_reads>=min_cont_abd))
# output sequences of potential contaminants
conta_out <- data.frame(stringsAsFactors=FALSE)
c <- 1
for (i in contamination){
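# globally align the contaminant tag to its assigned reference sequence (dada2::nwalign) and compute the percent identity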
conta_align <- nwalign(names(derep$uniques[i]), final_list$lambda_out$Ref_ID[i])
conta_align1 <- unlist(strsplit(conta_align[1], ""))
conta_align2 <- unlist(strsplit(conta_align[2], ""))
iden <- paste(round(length(which(conta_align1==conta_align2))/length(conta_align1)*100, 1), "%", sep="")
conta_iden <- paste(lambda_out$Ref_ID[i], iden, sep=":")
conta_rec <- paste(">Contamination_seq", c, sep="")
conta_abd <- paste("Observed Abundance:", derep$uniques[i], sep="")
conta_rate <- paste("Relative Abundance:", round(derep$uniques[i]/error_ref_matrix$total_reads*100, 1), "%", sep="")
conta_rec <- paste(conta_rec, conta_abd, conta_rate, conta_iden, sep="; ")
conta_out <- rbind(conta_out, conta_rec, stringsAsFactors = FALSE)
conta_rec <- names(derep$uniques[i])
conta_out <- rbind(conta_out, conta_rec, stringsAsFactors = FALSE)
c <- c+1
}
# output paralogue sequences for each strain
paralog_out <- data.frame(stringsAsFactors=FALSE)
c <- 1
cn_internal_inference <- c()
for (i in seq(nrow(lambda_out))){
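# a corrected tag is flagged as a putative paralogue when it is not identical to its reference (lambda != 1), its expected abundance is at least min_E, and its observed abundance is at least 1/15 of the most abundant tag assigned to the same reference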
if (lambda_out$Lambda[i]!=1 & lambda_out$E[i]>=min_E & lambda_out$Corrected[i]=="Yes" & lambda_out$Obs_abd[i]/max(lambda_out$Obs_abd[which(lambda_out$Ref_ID==lambda_out$Ref_ID[i])])>=1/15){
paralog_rec <- paste(paste(">", "Seq_", c, sep=""), paste("Paralogue", lambda_out$Ref_ID[i], sep = "="), sep = ";")
paralog_out <- rbind(paralog_out, paralog_rec, stringsAsFactors=FALSE)
paralog_out <- rbind(paralog_out, names(derep$uniques[i]), stringsAsFactors=FALSE)
cn_internal_inference <- c(cn_internal_inference, lambda_out$Ref_ID[i])
c <- c+1
}
}
if (is.null(cn)){
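# no copy number table supplied: infer copy numbers as the number of detected paralogues per reference plus one (the reference itself); references without paralogues default to a copy number of 1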
cp_n <- as.data.frame(table(cn_internal_inference))
if (nrow(cp_n)>0){
cp_n$Freq <- cp_n$Freq + 1
straintab_norm <- straintab
names(cp_n) <- c("Var1", "Freq")
cp_n <- rbind(as.data.frame(table(straintab_norm$Ref_seq[!straintab_norm$Ref_seq %in% cp_n[,1]])), cp_n)
straintab_norm$Corrected_Abundance <- round(straintab_norm$Corrected_Abundance/cp_n[match(straintab_norm$Ref_seq, cp_n[, 1]), 2])
}
else{straintab_norm <- straintab}
}
# output final tables and logs
dir.create(outdir)
lambdaout <- paste(outdir, "lambda_final.out", sep="/")
emout <- paste(outdir, "error_matrix_final.out", sep="/")
straintabout <- paste(outdir, "strain_table.txt", sep="/")
straintabout_norm <- paste(outdir, "strain_table_normalized.txt", sep="/")
contaout <- paste(outdir, "contamination_seq.fna", sep="/")
paralogout <- paste(outdir, "paralogue_seq.fna", sep="/")
logout <- paste(outdir, "rbec.log", sep="/")
write.table(lambda_out, file= lambdaout, quote=FALSE, sep="\t", row.names = FALSE)
write.table(final_list[["err"]], file= emout, quote=FALSE, sep="\t")
write.table(straintab, file= straintabout, quote=FALSE, sep="\t", row.names=FALSE)
write.table(straintab_norm, file= straintabout_norm, quote=FALSE, sep="\t", row.names=FALSE)
write.table(conta_out, file = contaout, quote=FALSE, sep="", col.names = FALSE, row.names = FALSE)
write.table(paralog_out, file=paralogout, quote=FALSE, sep="", col.names = FALSE, row.names = FALSE)
t2 <- Sys.time()
time.used <- difftime(t2, t1, units = "mins")
time.used <- paste("Total time used:", as.numeric(time.used), "mins", sep=" ")
summ <- paste(final_list$total_corrected/error_ref_matrix$total_reads*100, "% of reads were corrected!", sep="")
log_out <- rbind(time.used, summ)
write.table(log_out, file = logout, quote=FALSE, sep="", col.names = FALSE, row.names = FALSE)
}