## Load required packages suppressPackageStartupMessages({ library(knitr) library(SummarizedExperiment) library(DESeq2) library(edgeR) library(tidyr) library(scales) library(plotly) })
## Import variables md5sum <- params$md5sum data_samples <- params$data_samples data_counts <- params$data_counts data_annotation <- params$data_annotation setGeneName <- params$setGeneName cpm_value <- params$cpm_value cpm_perc <- params$cpm_perc excluded_samples <- params$excluded_samples design_base <- params$design_base design_value <- params$design_value matrix_v1 <- params$matrix_v1 matrix_v2 <- params$matrix_v2 alpha <- params$alpha
## Values of variables
md5sum
cpm_value
cpm_perc
excluded_samples
alpha
design_base
design_value
matrix_v1
matrix_v2
## Match sample sheet & count data data_samples <- data_samples[!rownames(data_samples) %in% excluded_samples, , drop=FALSE] data_counts <- data_counts[,!colnames(data_counts) %in% excluded_samples] ## Read data as Summarized Experiment se <- readCountsFromTable(data_counts, data_samples) se <- addSamplesFromTableToSE(se, data_samples) ## Alignment summary plot_data <- alignment_summary(se) bar_plot( df = plot_data, x = "count", y = "sample", group = design_base, fill = "feature", rev = TRUE, title = "Count assignments", xlab = "Counts", ylab = "" ) ## Complexity plot_data <- complexity(se) line_plot( df = plot_data, x = "rank", y = "fraction", group = design_base, plot = "complexity", title = "Gene complexity", xlab = "Cumulative reads per number of genes", ylab = "Number of genes" ) ## Create final Summarized Experiment se <- readCountsFromTable(data_counts[!grepl('^__', rownames(data_counts)), ], data_samples) se <- addSamplesFromTableToSE(se, data_samples) if (!is.null(data_annotation)) { se <- addAnnotationsFromTableToSE(se, data_annotation) }
## Create DGE list dge <- DGEList(counts = assay(se), samples = colData(se), genes = rowData(se)) ## Remove genes with all 0 counts dge <- dge[rowSums(abs(dge$counts)) > 1, ] ## Count distribution tempDge <- dge tempDge$counts <- cpm(dge, log = TRUE, prior.count = 1) plot_data <- count_dist(tempDge) line_plot( df = plot_data, x = "x", y = "y", group = design_base, title = "Gene count distribution", xlab = "Log2CPM", ylab = "Density" )
## Create design get_design <- createDesign(dge$samples, design_base, design_value, matrix_v1, matrix_v2) get_design dge <- relevelSamples(dge, design_base, design_value, matrix_v1, matrix_v2) design <- model.matrix(eval(parse(text = get_design)), dge$samples) design ## Prepare analysis analysis <- DESeqDataSet(se, design = design) analysis <- analysis[rowSums(abs(assay(analysis))) > 1,] ## Filter data on low expressed genes counts <- cpm(counts(analysis), log = TRUE, prior.count = 1) selectedFeatures <- rownames(analysis)[apply(counts, 1, function(v) sum(v >= cpm_value)) >= (cpm_perc / 100) * ncol(counts)] ## Get high expressed genes analysis <- analysis[selectedFeatures, ]
## Run analysis analysis <- DESeq(analysis)
## Normalize data getSize <- estimateSizeFactors(analysis) normDge <- DGEList(counts = data.frame(counts(getSize, normalized = TRUE)), samples = dge$samples) normDge$counts <- cpm(normDge, log = TRUE, prior.count = 1) ## Count distribution plot_data <- count_dist(normDge) line_plot( df = plot_data, x = "x", y = "y", group = design_base, title = "Gene count distribution", xlab = "Log2CPM", ylab = "Density" ) ## PCA plot_data <- pca_data(normDge) scatter_plot( df = plot_data, x = "PC1", y = "PC2", group = design_base, size = 4, title = "PCA", xlab = paste0("PC1", " (", plot_data$percent[1], "%)"), ylab = paste0("PC2", " (", plot_data$percent[2], "%)") )
## Get contrast values get_matrix1 <- createMatrix(normDge, design_base, design_value, matrix_v1) get_matrix2 <- createMatrix(normDge, design_base, design_value, matrix_v2) get_matrix1 get_matrix2 ## Create contrast contrast <- createContrast(design, get_matrix1, get_matrix2) contrast ## Get expression results deTab <- results(analysis, contrast = contrast, alpha = alpha) summary(deTab)
## Reorder & rename table values deTab <- data.frame(deTab[, c("log2FoldChange", "pvalue", "padj")]) colnames(deTab) <- c("avgLog2FC", "P.Value", "adj.P.Val") deTab$avgLog2CPM <- rowMeans(normDge$counts) ## Set DE values deTab$DE <- 0 deTab$DE[deTab$adj.P.Val < alpha & deTab$avgLog2FC < 0] <- -1 deTab$DE[deTab$adj.P.Val < alpha & deTab$avgLog2FC > 0] <- 1
## Add annotation to deTab if (!is.null(data_annotation)) { deTab <- merge(data_annotation, deTab, by = 0, all = FALSE) rownames(deTab) <- deTab$Row.names deTab$Row.names <- NULL } ## Rename table values deTab <- rename(deTab, "FDR" = "adj.P.Val") deOrder <- c("avgLog2CPM", "avgLog2FC", "P.Value", "FDR", "DE") ## Reorder annotation (if present) if (!is.null(data_annotation)) { deTab <- deTab[colnames(deTab) [c(1, match(deOrder, names(deTab)), 2:(ncol(deTab) - 5))]] } else { deTab <- deTab[deOrder] } ## Add gene symbol if (setGeneName == "symbol" && !is.null(data_annotation)) { tempCol <- rownames(deTab) rownames(deTab) <- make.names(deTab$geneName, unique = TRUE) deTab$geneName <- tempCol colnames(deTab)[1] <- "geneId" rownames(normDge$counts) <- rownames(deTab) }
## MA index <- round(seq(1, nrow(deTab), length.out = 1000)) plot_data <- ma(deTab) scatter_plot( df = plot_data, x = "avgLog2CPM", y = "avgLog2FC", group = "DE", index = index, title = "MA Plot", xlab = "Average Log2CPM", ylab = "Average Log2FC" ) ## P-value plot_data <- pvalue_data(deTab) bar_plot( df = plot_data, x = "p", y = "x", title = "P-Value plot", xlab = "P-Value", ylab = "Count" )
## Save data files deTab <- deTab[order(deTab$FDR),] save(deTab, normDge, file = "analysis.RData")
## Session info sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.