inst/doc/abseqR.R

## ----setup, include=FALSE-----------------------------------------------------
library(abseqR)
library(png)
library(plotly)
library(grid)
library(gridExtra)
library(BiocStyle)
# knitr::opts_chunk$set(tidy = TRUE)
knitr::opts_chunk$set(message = FALSE)

## ----abseqR_vignette_helper_functions, include=FALSE--------------------------
sabseqR <- function() { return(Biocpkg("abseqR")); }
#sabseqR <- function() { return("`abseqR`"); }
sabseqPy <- function() { return("[abseqPy](https://github.com/malhamdoosh/abseqPy)"); }
sabseq <- function() { return("`AbSeq`"); }
read_png <- function(path) {
    if (file.exists(path)) {
        #rasterGrob(as.raster(readPNG(path)), interpolate = TRUE)
        # in windows, the path is \ but LaTEX does not appreciate these slashes
        knitr::include_graphics(gsub("\\", "/",
                                     normalizePath(path),
                                     fixed = TRUE))
    } else {
        stop("File at ", path, " not found, fatal.")
    }
}


## ----abseq-wf, fig.cap="AbSeq workflow. Comprehensive analyses and visualizations are generated from input sequencing files using abseqPy and abseqR.", echo=FALSE, out.width="100%"----
read_png("abseq_workflow_MA.jpg")

## ----abseq-sim-data, fig.cap="Generation of the three simulated datasets. The flowchart on the left shows the generation of the datasets in `\"./ex/\"` while the folder structure on the right shows the output generated by `abseqPy` on the three datasets.", echo=FALSE, out.width="100%"----
read_png("simulation_flowchart_full.jpg")

## ----abseqR_copy_data_files, results=FALSE------------------------------------
# substitute with any directory that you have read/write access to
sandboxDirectory <- tempdir()

# path to provided data (comes installed with abseqR)
exdata <- system.file("extdata", "ex", package = "abseqR")

# copy the provided data to sandboxDirectory
file.copy(exdata, sandboxDirectory, recursive = TRUE)

## ----abseqR_data_contents-----------------------------------------------------
# dataPath now holds the path to a local copy of the data
dataPath <- file.path(sandboxDirectory, "ex")
# the sample names can be found inside the auxiliary directory
list.files(path = file.path(dataPath, "auxiliary"))

## ----abseqR_quick_run, results=FALSE, warning=FALSE---------------------------
# This section will visualize all the datasets individually
# and compare PCR1 with PCR2 with PCR3

# Interim solution
if (Sys.info()["sysname"] == "Darwin") {
    BPPARAM <- BiocParallel::SerialParam()
} else {
    BPPARAM <- BiocParallel::bpparam()
}

# you should use report = 3 to generate a HTML report
samples <- abseqReport(dataPath,
                       compare = c("PCR1, PCR2, PCR3"),
                       report = 1,
                       BPPARAM = BPPARAM)

# ignore the message:
# "Sample output directory <path> is different from provided path
#   <path> assuming directory was moved"
# This warning message tells us that the directory has
# been moved (we copied the provided examples to "dataPath")

## ----abseq-final-folder-structure, fig.cap="Folder structure after `abseqR` completes. The landing page `index.html` provides a convenient way to navigate around the standalone HTML reports in `html\\_files/`.", echo=FALSE----
grid.raster(readPNG("post_report_fs.png"))

## ----abseqR_abseqReport_compare_argument, results=FALSE, warning=FALSE--------
# Example of 1 comparison
# Analyze all samples, but only compare PCR1 with PCR2
pcr1.pcr2 <- abseqReport(dataPath,
                         compare = c("PCR1, PCR2"),
                         report = 0)

# Example of 2 comparisons
# Analyze all samples. In addition, compare:
#  * PCR1 with PCR2
#  * PCR2 with PCR3
multiComparison <- abseqReport(dataPath,
                               compare = c("PCR1, PCR2", "PCR2, PCR3"),
                               report = 0)

## ----abseqR_show_return_of_abseqReport, results='hold'------------------------
# compare = c("PCR1,PCR2")
names(pcr1.pcr2)

# compare = c("PCR1, PCR2", "PCR2 ,PCR3")
names(multiComparison)

## ----abseqR_refine_comparison_simple, results=FALSE, warning=FALSE------------
# recall that samples is a named list where each element's name
# is the sample's own name (see names(samples))

# use report = 3 if the results should be collated in a HTML document
pcr1.pcr3 <- samples[["PCR1"]] + samples[["PCR3"]]
refinedComparison <- report(pcr1.pcr3,
                            outputDir = file.path(sandboxDirectory,
                                                  "refined_comparison"),
                            report = 1)

## ----abseqR_refine_comparison_sample_return_value, eval=TRUE------------------
names(refinedComparison)

## ----abseqR_adv_load_samples, eval=FALSE--------------------------------------
#  SRRControl <- abseqReport("SRR_CTRL")
#  SRRExp <- abseqReport("SRR_EXP")

## ----abseqR_adv_compare_old_new, eval=FALSE-----------------------------------
#  # short for SRRControl[[1]] + SRRControl[[2]] + ... + SRRExp[[1]] + ...
#  all.samples <- Reduce("+", c(SRRControl, SRRExp))
#  report(all.samples, outputDir = "SRRControl_vs_SRRExp")

## ----abseqR_adv_lazy_load, eval=FALSE-----------------------------------------
#  # in essence, this is identical to SRRControl <- abseqReport("SRR_CTRL"),
#  # but will not regenerate plots and reports!
#  SRRControl.lazy <- abseqReport("SRR_CTRL", report = 0)
#  
#  # exactly the same return value
#  stopifnot(names(SRRControl.lazy) == names(SRRControl))

## ----abseqR_adv_bpparam, eval=FALSE-------------------------------------------
#  
#  # use 4 CPU cores
#  nproc <- 4
#  samples <- abseqReport(dataPath,
#                         BPPARAM = BiocParallel::MulticoreParam(nproc))
#  
#  
#  # run sequentially - no multiprocessing
#  samples <- abseqReport(dataPath,
#                         BPPARAM = BiocParallel::SerialParam())

## ----abseqR_plot_path_setup, include=FALSE------------------------------------
# [s]ingle sample [dir]ectory
sdir <- file.path(dataPath, "auxiliary", "PCR1")
# [m]ulti sample [dir]ectory
mdir <- file.path(dataPath, "auxiliary", "PCR1_vs_PCR2_vs_PCR3")

## ----seq-len, fig.cap="Sequence length distribution of PCR1. Histogram of (merged) sequence lengths.", echo=FALSE, out.width="100%"----
read_png(file.path(sdir, "annot", "PCR1_all_clones_len_dist.png"))

## ----align-qual, fig.cap="Bitscore vs V germline alignment length in PCR1. Heatmap of bitscore vs alignment length shows percentage of sequences in a given value range.", echo=FALSE, out.width="100%"----
read_png(file.path(sdir, "abundance", "PCR1_igv_align_quality_bitscore_hm_static.png"))

## ----indel, fig.cap="Rate of insertions and deletions in PCR1. The percentage of indels found within the V germlines of PCR1.", echo=FALSE, out.width="100%"----
read_png(file.path(sdir, "abundance", "PCR1_igv_gaps_dist.png"))

## ----vgermline, fig.cap="V family germline distribution in PCR1. The percentage of IGHV families after germline annotation using IgBLAST.", echo=FALSE, out.width="100%"----
read_png(file.path(sdir, "abundance", "PCR1_igv_dist_family_level.png"))

## ----vjassoc, fig.cap="V-J family germline association in PCR1. Segment size represents germline proportion while the thickness of the arcs shows the proportion of associations between V and J germline families. This plot was heavily inspired by [VDJTools](https://github.com/mikessh/vdjtools/))", echo=FALSE, out.width="100%"----
read_png(file.path(sdir, "abundance", "PCR1_vjassoc.png"))

## ----prod-summ, fig.cap="Productivity rate of sequences in PCR1. This plot shows the percentage of productive and unproductive sequences, the reason for unproductive sequences are colour coded.", echo=FALSE, out.width="100%"----
read_png(file.path(sdir, "productivity", "PCR1_productivity.png"))

## ----frameshift, fig.cap="Rate of in-frame and out-of-frame sequences in PCR1. Misaligned V-J frame or non-multiple of 3 indels in either one of the framework or complementarity-determining regions causes a frameshift and therefore renders the sequence unproductive.", echo=FALSE, out.width="100%"----
read_png(file.path(sdir, "productivity", "PCR1_vjframe_dist.png"))

## ----stopcod, fig.cap="Proportion of stop codons in any given CDR or FR of out-of-frame sequences in PCR1. The percentages show the frequency of stop codons in a given region relative to the total number of stop codons present.", echo=FALSE, out.width="100%"----
read_png(file.path(sdir, "productivity", "PCR1_stopcodon_region_outframe.png"))

## ----mismatches, fig.cap="Proportion of mismatches in productive sequences of PCR1.", echo=FALSE, out.width="100%"----
read_png(file.path(sdir, "productivity", "PCR1_fr3_mismatches_dist.png"))

## ----gaps, fig.cap="Proportion of indels in productive sequences of PCR1.", echo=FALSE, out.width="100%"----
read_png(file.path(sdir, "productivity", "PCR1_fr3_gaps_dist.png"))

## ----gaps-out, fig.cap="Proportion of indels in out-of-frame sequences of PCR1.", echo=FALSE, out.width="100%"----
read_png(file.path(sdir, "productivity", "PCR1_fr3_gaps_dist_out_of_frame.png"))

## ----duplication, fig.cap="Proportion of duplicated clonotypes in PCR1. Higher-order duplication levels starting from 10 are binned.", echo=FALSE, out.width="100%"----
read_png(file.path(sdir, "diversity", "PCR1_cdr_duplication.png"))

## ----rarefaction, fig.cap="Rarefaction curve of clonotypes in PCR1. The number of deduplicated sequences are taken over the mean of 5 resampling rounds, where the shaded areas indicate 95\\% confidence interval.", echo=FALSE, out.width="100%"----
read_png(file.path(sdir, "diversity", "PCR1_cdr_rarefaction.png"))

## ----recapture, fig.cap="Capture-recapture of clonotypes in PCR1. The number of recaptured sequences are taken over the mean of 5 resampling rounds, where the shaded areas indicate 95\\% confidence interval.", echo=FALSE, out.width="100%"----
read_png(file.path(sdir, "diversity", "PCR1_cdr_recapture.png"))

## ----spectra, fig.cap="CDR3 amino acid spectratype for PCR1 with outliers removed.", echo=FALSE, out.width="100%"----
read_png(file.path(sdir, "diversity", "spectratypes", "PCR1_cdr3_spectratype_no_outliers.png"))

## ----comp, fig.cap="Amino acid composition logo for PCR1's CDR3. The unscaled (left) and the scaled (right) composition logos are colour coded and grouped by their physicochemical properties.", echo=FALSE, fig.wide=TRUE----
unscaled <- file.path(sdir, "diversity", "composition_logos", "CDR3", "PCR1_cumulative_logo.png")
scaled <- file.path(sdir, "diversity", "composition_logos", "CDR3", "PCR1_cumulative_logo_scaled.png")
img1 <-  rasterGrob(as.raster(readPNG(unscaled)), interpolate = TRUE)
img2 <- rasterGrob(as.raster(readPNG(scaled)), interpolate = TRUE)
grid.arrange(img1, img2, ncol = 2)

## ----topclones, fig.cap="Top 10 clonotypes from PCR1, PCR2, and PCR3. The clonotype proportions displayed are scaled relative to the top 10 clones in each sample.", echo=FALSE, out.width="100%"----
read_png(file.path(mdir, "clonotype_analysis", "PCR1_PCR2_PCR3_top10Clonotypes.png"))

## ----overlap, fig.cap="Number of unique overlapping clones found in PCR1, PCR2, and PCR3. The numbers in an intersection denotes the number of unique clones that are shared between the samples involved in the said intersection.", echo=FALSE, out.width="100%"----
read_png(file.path(mdir, "clonotype_analysis", "PCR1_PCR2_PCR3_cdr3_clonotypeIntersection.png"))

## ----scatter, fig.cap="Log-scaled scatter plot of clonotype frequencies in PCR1 and PCR2. Point size denotes mean frequency of the 2 samples and marginal density plots are colour coded by overlapping (blue), non-overlapping (purple), and all (grey) clonotypes.", echo=FALSE, out.width="100%"----
read_png(file.path(mdir, "clonotype_analysis", "PCR1_vs_PCR2_clone_scatter.png"))

## ----corr, fig.cap="Pearson correlation between PCR1, PCR2, and PCR3. Values denote the pearson correlation coefficient of the clonotypes frequencies between the samples. These values will appear with a cross 'x' to signify an insignificant coefficient if the p-value of the coefficient is larger than 0.05.", echo=FALSE, out.width="100%"----
read_png(file.path(mdir, "clonotype_analysis", "pearson.png"))

## ----cluster, fig.cap="Morisita-Horn distances of PCR1, PCR2, and PCR3. The distances are calculated using clonotype frequencies and are visualized as the length of the lines connecting samples or clusters.", echo=FALSE, out.width="100%"----
read_png(file.path(mdir, "clonotype_analysis", "morisita_horn.png"))

## ----mixr-test-params, echo=FALSE, message=FALSE, warnings=FALSE, results='asis'----
tabl <- "
| Parameter     | sample 1      | sample 2      | sample 3      |
|---------------|---------------|---------------|---------------|
| reads         |  10000        |  10000        | 10000         |
| clones        |  5000         |  5000         | 2000          |
| seed          |  4228         |  2428         | 2842          |
| conf          | MiSeq-300-300 | MiSeq-300-300 | MiSeq-300-300 |
| loci          |  IGH          | IGH           | IGH           |
| species       | hsa           | hsa           | hsa           |
"
cat(tabl)

## ----abseqR_session_info, echo=FALSE------------------------------------------
sessionInfo()

Try the abseqR package in your browser

Any scripts or data that you put into this service are public.

abseqR documentation built on Nov. 8, 2020, 8:28 p.m.