#' Get data to create new ShortReadQ object after barcode trimming
#'
#' @param type character. expType of the CapSet object
#' @param fq_R1 character. fastq Read 1
#' @param fq_R2 character. fastq Read 2
#'
#' @importFrom ShortRead sread
#' @importFrom Biostrings quality DNAStringSet
#' @importFrom BiocGenerics width
#' @importFrom IRanges narrow
#'
#' @return A list with new R1 and R2 sequence, quality, barcode string and sample id
#'
get_newfastq <- function(type, fq_R1, fq_R2) {
# get R1 and R2 reads and quality
fq_R1read <- sread(fq_R1)
fq_R1qual <- quality(fq_R1)
fq_R2read <- sread(fq_R2)
fq_R2qual <- quality(fq_R2)
if (type == "MAPCap") {
# trim barcodes (pos 1 to 13) from R2 and prepare header
# copy the index sequence (position 6 to 11)
sample_idx <- narrow(fq_R2read, 6, 11)
# copy pcr barcode (pos 1 to 5 + pos 12 to 13)
umi_barcodes <-
DNAStringSet(paste0(
narrow(fq_R2read, 1, 5),
narrow(fq_R2read, 12, 13)
))
#copy replicate demultiplexing barcode (pos 3 to 4)
rep_idx <- narrow(fq_R2read, 3, 4)
barcode_string <-
paste(sample_idx, umi_barcodes, rep_idx, sep = ":")
# now trim the first 13 bp off the read and quality
fq_R2read <- narrow(fq_R2read, 14, width(fq_R2))
fq_R2qual <- narrow(fq_R2qual, 14, width(fq_R2))
} else if (type == "RAMPAGE") {
# trim barcodes first 15 bases from R2 and first 6 bases from R1 and prepare header
# copy the index sequence (position 1 to 6 of R1)
sample_idx <- narrow(fq_R1read, 1, 6)
# copy pcr barcode (pos 1 to 15 of R2)
umi_barcodes <- narrow(fq_R2read, 1, 15)
barcode_string <- paste(sample_idx, umi_barcodes, sep = ":")
# now trim the first 15 bp off the read and quality of R2
fq_R2read <- narrow(fq_R2read, 16, width(fq_R2))
fq_R2qual <- narrow(fq_R2qual, 16, width(fq_R2))
fq_R1read <- narrow(fq_R1read, 7, width(fq_R1read))
fq_R1qual <- narrow(fq_R1qual, 7, width(fq_R1qual))
} else {
message("type of protocol not MAPCap or RAMPAGE. Reads are not being trimmed.")
barcode_string <- NA
}
outlist <- list(
fq_R1read = fq_R1read,
fq_R1qual = fq_R1qual,
fq_R2read = fq_R2read,
fq_R2qual = fq_R2qual,
barcode_string = barcode_string,
sample_idx = sample_idx
)
return(outlist)
}
#' Split paired-end fastq by barcodes
#'
#' @param expType character. experiment type (RAMPAGE, MAPCap or CAGE)
#' @param idx_name character. barcode ID
#' @param outfile_R1 character. output fastq file : Read 1
#' @param outfile_R2 character. output fastq file : Read 2
#' @param fastq_R1 character. input fastq file : Read 1
#' @param fastq_R2 character. input fastq file : Read 2
#' @param max_mismatch integer. max allowed mismatches
#'
#' @return kept reads corresponding to each barcode.
#'
split_fastq <- function(expType,
idx_name,
outfile_R1,
outfile_R2,
fastq_R1,
fastq_R2,
max_mismatch) {
## open input stream
stream_R1 <- ShortRead::FastqStreamer(fastq_R1)
stream_R2 <- ShortRead::FastqStreamer(fastq_R2)
on.exit(close(stream_R1))
on.exit(close(stream_R2), add = TRUE)
kept_reads <- 0
## filter fastq
repeat {
# input chunk
fq_R1 <- ShortRead::yield(stream_R1)
fq_R2 <- ShortRead::yield(stream_R2)
if (length(fq_R1) == 0) {
break
}
## modify R1/R2 as per the protocol
outlist <- get_newfastq(expType, fq_R1, fq_R2)
## make a new ShortReadQ object with new header and trimmed reads
# new fastq R2
fqid_R2 <- ShortRead::id(fq_R2)
fqid_R2 <-
sub(" ", "_", fqid_R2) # replace space by underscore in readName
fq_R2new <- ShortRead::ShortReadQ(outlist$fq_R2read,
outlist$fq_R2qual,
Biostrings::BStringSet(paste(
fqid_R2,
outlist$barcode_string,
sep = "#"
)))
# new fastq R1 (seq/qual not modified, just barcodes copied from R2)
fqid_R1 <- ShortRead::id(fq_R1)
fqid_R1 <-
sub(" ", "_", fqid_R1) # replace space by underscore in readName
fq_R1new <- ShortRead::ShortReadQ(outlist$fq_R1read,
outlist$fq_R1qual,
Biostrings::BStringSet(paste(
fqid_R1,
outlist$barcode_string,
sep = "#"
)))
# demultiplex using given sample barcodes
idx_name <- Biostrings::DNAString(idx_name)
sample_idx <- Biostrings::DNAStringSet(outlist$sample_idx)
id2keep <-
as.logical(
Biostrings::vcountPattern(idx_name,
outlist$sample_idx,
max.mismatch = max_mismatch)
)
#id2keep <- filter_byIDx(idx_name,
# fq_id = ShortRead::id(fq_R2),
# maxM = max_mismatch)
# append to destination
ShortRead::writeFastq(fq_R1new[id2keep], outfile_R1, "a")
ShortRead::writeFastq(fq_R2new[id2keep], outfile_R2, "a")
# add to count
kept_reads <- kept_reads + sum(id2keep)
}
return(kept_reads)
}
#' Demultiplex and tag fastq files using sample barcodes
#' @rdname demultiplexFASTQ
#' @param CSobject CapSet object created using \code{\link{newCapSet}} function
#' @param outdir character. path to output directory
#' @param max_mismatch integer. maximum allowed mismatches in the sample barcode
#' @param ncores integrer. No. of cores/threads to use
#'
#' @return de-multiplxed fastq files corresponding to each barcode. The files are written
#' on disk with the corresponding sample names as specified in the CapSet object
#'
#' @export
#' @examples
#' # load a previously saved CapSet object
#' cs <- exampleCSobject()
#'
#' # demultiplex allowing one mismatch in sample indexes
#' dir.create("demult_fastq")
#' cs <- demultiplexFASTQ(cs, outdir = "demult_fastq", max_mismatch = 1)
#'
setMethod("demultiplexFASTQ",
signature = "CapSet",
function(
CSobject,
outdir,
max_mismatch,
ncores) {
protocol <- CSobject@expMethod
sampleinfo <- sampleInfo(CSobject)
destinations <- as.character(sampleinfo[, 1])
idx_list <- as.character(rownames(sampleinfo))
## get the fastq to split (raise error if fastq untrimmed/not existing)
fastq_R1 <- CSobject@fastq_R1
fastq_R2 <- CSobject@fastq_R2
# register parallel backend
param <- getMCparams(ncores)
if (!BiocParallel::bpisup(param)) {
BiocParallel::bpstart(param)
on.exit(BiocParallel::bpstop(param))
}
message("de-multiplexing the FASTQ file")
## filter and write
info <-
BiocParallel::bplapply(seq_along(destinations), function(i) {
split1 <- file.path(outdir, paste0(destinations[i], "_R1.fastq.gz"))
split2 <-
file.path(outdir, paste0(destinations[i], "_R2.fastq.gz"))
## stop if the outfiles already exist (otherwise the output would be appended)
if (file.exists(split1) | file.exists(split2)) {
warning("Output files already exist! Overwriting..")
if (file.exists(split1))
file.remove(split1)
if (file.exists(split1))
file.remove(split2)
}
## split files and save kept read number
kept <- split_fastq(
expType = protocol,
idx_name = idx_list[i],
outfile_R1 = split1,
outfile_R2 = split2,
fastq_R1,
fastq_R2,
max_mismatch
)
dfout <- data.frame(R1 = split1,
R2 = split2,
kept_reads = kept)
return(dfout)
}, BPPARAM = param)
## add post-demult info to sampleInfo
keptinfo <- do.call(rbind, info)
sampleinfo$demult_reads <- keptinfo$kept_reads
sampleinfo$demult_R1 <- as.character(keptinfo$R1)
sampleinfo$demult_R2 <- as.character(keptinfo$R2)
## return object
sampleInfo(CSobject) <- sampleinfo
return(CSobject)
}
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.