# Vignette set-up chunk.
# CSS that accompanied the vignette, kept as a comment for reference:
# <style>
# body {
#   text-align: justify}
# </style>
BiocStyle::markdown()

# Only `collapse` is set; the `comment` option was left disabled in the
# original chunk and stays disabled here.
knitr::opts_chunk$set(
  collapse = TRUE
  # , comment = "#>"
)

# Use a 120-column output width for printed results
options(width = 120)
This document gives an overview of the DNABarcodeCompatibility R package with a brief description of the set of tools that it contains. The package includes six main functions that are briefly described below with examples. These functions allow one to load a list of DNA barcodes (such as the Illumina TruSeq small RNA kits), to filter these barcodes according to distance and nucleotide content criteria, to generate sets of compatible barcode combinations out of the filtered barcode list, and finally to generate an optimized selection of barcode combinations for multiplex sequencing experiments. In particular, the package provides an optimizer function to favour the selection of compatible barcode combinations with least heterogeneity in the frequencies of DNA barcodes, and allows one to keep barcodes that are robust against substitution and insertion/deletion errors, thereby facilitating the demultiplexing step.
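The DNABarcodeCompatibility package also contains:

* a workflow function, `experiment_design()`, allowing one to perform all steps in one go;
* two datasets, `IlluminaIndexesRaw` and `IlluminaIndexes`, for running and testing examples.

The package deals with the three existing sequencing-by-synthesis chemistries from Illumina: four-channel, two-channel and one-channel (selected in the examples below with `platform = 4`, `platform = 2` and `platform = 1`, respectively).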
library("DNABarcodeCompatibility")
# Helper created for the purpose of this documentation: writes a data.frame
# to a temporary text file so it can be handed to the file-based functions.
#
# @param dataset A data.frame to export; defaults to the package dataset
#   `IlluminaIndexesRaw`.
# @return The path of the temporary file when `dataset` is a data.frame.
#   Otherwise nothing is written: a message is printed and that message is
#   returned invisibly.
export_dataset_to_file <- function(
    dataset = DNABarcodeCompatibility::IlluminaIndexesRaw) {
  if (!is.data.frame(dataset)) {
    # Best-effort behaviour kept on purpose: report the problem, do not abort
    msg <- paste("The input dataset isn't a data.frame:",
                 "NOT exported into file")
    print(msg)
    return(invisible(msg))
  }

  textfile <- tempfile()
  # Bare whitespace-separated table: no header, no row names, no quoting
  write.table(dataset, textfile,
              row.names = FALSE, col.names = FALSE, quote = FALSE)
  textfile
}
The function experiment_design()
uses a Shannon-entropy maximization approach
to identify a set of compatible barcode combinations in which the frequencies
of occurrences of the various DNA barcodes are as uniform as possible.
The optimization can be performed in the contexts of single and dual barcoding.
It performs either an exhaustive or a random search of compatible DNA-barcode
combinations, depending on the size of the DNA-barcode set used, and on the
number of samples to be multiplexed.
# Single barcoding, `platform = 4`: 12 samples at a multiplex level of 3.
# The barcode set is first written to a temporary file, because
# experiment_design() takes its input as a file path.
txtfile <- export_dataset_to_file(
  dataset = DNABarcodeCompatibility::IlluminaIndexesRaw
)
experiment_design(
  file1 = txtfile,
  sample_number = 12,
  mplex_level = 3,
  platform = 4
)
# Single barcoding, `platform = 2`: 12 samples at a multiplex level of 3.
# The barcode set is first written to a temporary file, because
# experiment_design() takes its input as a file path.
txtfile <- export_dataset_to_file(
  dataset = DNABarcodeCompatibility::IlluminaIndexesRaw
)
experiment_design(
  file1 = txtfile,
  sample_number = 12,
  mplex_level = 3,
  platform = 2
)
# Single barcoding, `platform = 1`: 12 samples at a multiplex level of 3.
# The barcode set is first written to a temporary file, because
# experiment_design() takes its input as a file path.
txtfile <- export_dataset_to_file(
  dataset = DNABarcodeCompatibility::IlluminaIndexesRaw
)
experiment_design(
  file1 = txtfile,
  sample_number = 12,
  mplex_level = 3,
  platform = 1
)
# Single barcoding, `platform = 4`, with an additional distance constraint:
# `metric = "hamming"` and `d = 3` are passed on to experiment_design().
txtfile <- export_dataset_to_file(
  dataset = DNABarcodeCompatibility::IlluminaIndexesRaw
)
experiment_design(
  file1 = txtfile,
  sample_number = 12,
  mplex_level = 3,
  platform = 4,
  metric = "hamming",
  d = 3
)
# Dual barcoding: two barcode files are supplied (`file1` and `file2`).

# Select the first half of barcodes from the dataset
txtfile1 <- export_dataset_to_file(
  DNABarcodeCompatibility::IlluminaIndexesRaw[1:24, ]
)

# Select the second half of barcodes from the dataset
txtfile2 <- export_dataset_to_file(
  DNABarcodeCompatibility::IlluminaIndexesRaw[25:48, ]
)

# Get compatible combinations of least redundant barcodes
experiment_design(
  file1 = txtfile1,
  sample_number = 12,
  mplex_level = 3,
  platform = 4,
  file2 = txtfile2
)
# Dual barcoding with a distance constraint: two barcode files are supplied
# (`file1` and `file2`) together with `metric = "hamming"` and `d = 3`.

# Select the first half of barcodes from the dataset
txtfile1 <- export_dataset_to_file(
  DNABarcodeCompatibility::IlluminaIndexesRaw[1:24, ]
)

# Select the second half of barcodes from the dataset
txtfile2 <- export_dataset_to_file(
  DNABarcodeCompatibility::IlluminaIndexesRaw[25:48, ]
)

# Get compatible combinations of least redundant barcodes
experiment_design(
  file1 = txtfile1,
  sample_number = 12,
  mplex_level = 3,
  platform = 4,
  file2 = txtfile2,
  metric = "hamming",
  d = 3
)
This section guides you through the detailed API of the package with the aim of
helping you build your own workflow. The package is designed to be flexible and
should be easily adaptable to most experimental contexts, either by using the
`experiment_design()` function as a template or by building your own workflow
from scratch.
The file_loading_and_checking()
function loads the file containing the DNA
barcodes set and analyzes its content. In particular, it checks that each
barcode in the set is unique and uniquely identified (removing any repetition
that occurs). It also checks the homogeneity of size of the barcodes,
calculates their GC content and detects the presence of homopolymers of
length >= 3.
# Write the raw Illumina barcode set to a temporary file, then load and
# check that file.
raw_barcodes_file <- export_dataset_to_file(
  dataset = DNABarcodeCompatibility::IlluminaIndexesRaw
)
file_loading_and_checking(file = raw_barcodes_file)
The total number of combinations depends on the number of available barcodes
and on the multiplex level. For 48 barcodes and a multiplex level of 3, the
total number of combinations (compatible or not) can be calculated using
`choose(48,3)`, which gives 17296 combinations. In many
cases the total number of combinations can become much larger (even gigantic),
and one cannot perform an exhaustive search
(see `get_random_combinations()` below).
# Exhaustive search at a multiplex level of 2.

# Total number of combinations
choose(48, 2)

# Load barcodes
barcodes <- DNABarcodeCompatibility::IlluminaIndexes

# Time for an exhaustive search
system.time(
  m <- get_all_combinations(
    index_df = barcodes,
    mplex_level = 2,
    platform = 4
  )
)

# Each row represents a compatible combination of barcodes
head(m)
# Exhaustive search at a multiplex level of 3.

# Total number of combinations
choose(48, 3)

# Load barcodes
barcodes <- DNABarcodeCompatibility::IlluminaIndexes

# Time for an exhaustive search
system.time(
  m <- get_all_combinations(
    index_df = barcodes,
    mplex_level = 3,
    platform = 4
  )
)

# Each row represents a compatible combination of barcodes
head(m)
When the total number of combinations is too high, it is recommended to pick combinations at random and then select those that are compatible.
# Random search at a multiplex level of 2.

# Total number of combinations (kept in step with `mplex_level = 2` below;
# the chunk previously computed choose(48,3) here)
choose(48, 2)

# Load barcodes
barcodes <- DNABarcodeCompatibility::IlluminaIndexes

# Time for a random search
system.time(
  m <- get_random_combinations(
    index_df = barcodes,
    mplex_level = 2,
    platform = 4
  )
)

# Each row represents a compatible combination of barcodes
head(m)
# Random search at a multiplex level of 4.

# Total number of combinations
choose(48, 4)

# Load barcodes
barcodes <- DNABarcodeCompatibility::IlluminaIndexes

# Time for a random search
system.time(
  m <- get_random_combinations(
    index_df = barcodes,
    mplex_level = 4,
    platform = 4
  )
)

# Each row represents a compatible combination of barcodes
head(m)
# Random search at a multiplex level of 6.

# Total number of combinations
choose(48, 6)

# Load barcodes
barcodes <- DNABarcodeCompatibility::IlluminaIndexes

# Time for a random search
system.time(
  m <- get_random_combinations(
    index_df = barcodes,
    mplex_level = 6,
    platform = 4
  )
)

# Each row represents a compatible combination of barcodes
head(m)
# Random search followed by a distance filter.

# Load barcodes
barcodes <- DNABarcodeCompatibility::IlluminaIndexes

# Perform a random search of compatible combinations
m <- get_random_combinations(
  index_df = barcodes,
  mplex_level = 3,
  platform = 4
)

# Keep barcodes that are robust against one substitution error
filtered_m <- distance_filter(
  index_df = barcodes,
  combinations_m = m,
  metric = "hamming",
  d = 3
)

# Each row represents a compatible combination of barcodes
head(filtered_m)
# Random search, Hamming-distance filter, then optimization.

# Load barcodes here so this chunk does not depend on an earlier one, and so
# the search and the filter are guaranteed to use the same barcode set
barcodes <- DNABarcodeCompatibility::IlluminaIndexes

# Keep set of compatible barcodes that are robust against one substitution
# error
filtered_m <- distance_filter(
  index_df = barcodes,
  combinations_m = get_random_combinations(
    index_df = barcodes,
    mplex_level = 3,
    platform = 4
  ),
  metric = "hamming",
  d = 3
)

# Use a Shannon-entropy maximization approach to reduce barcode redundancy
df <- optimize_combinations(
  combination_m = filtered_m,
  nb_lane = 12,
  index_number = 48
)

# Each row represents a compatible combination of barcodes, i.e. one lane of
# the flow cell
df
# Random search, `seqlev`-distance filter, then optimization.

# Load barcodes here so this chunk does not depend on an earlier one, and so
# the search and the filter are guaranteed to use the same barcode set
barcodes <- DNABarcodeCompatibility::IlluminaIndexes

# Keep set of compatible barcodes that are robust against multiple
# substitution and insertion/deletion errors
filtered_m <- distance_filter(
  index_df = barcodes,
  combinations_m = get_random_combinations(
    index_df = barcodes,
    mplex_level = 3,
    platform = 4
  ),
  metric = "seqlev",
  d = 4
)

# Use a Shannon-entropy maximization approach to reduce barcode redundancy
df <- optimize_combinations(
  combination_m = filtered_m,
  nb_lane = 12,
  index_number = 48
)

# Each row represents a compatible combination of barcodes, i.e. one lane of
# the flow cell
df
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.