## Load required packages suppressPackageStartupMessages({ library(knitr) library(SummarizedExperiment) library(edgeR) library(tidyr) library(scales) library(plotly) })
## Import variables md5sum <- params$md5sum data_samples <- params$data_samples data_counts <- params$data_counts data_annotation <- params$data_annotation setGeneName <- params$setGeneName cpm_value <- params$cpm_value cpm_perc <- params$cpm_perc excluded_samples <- params$excluded_samples design_base <- params$design_base design_value <- params$design_value matrix_v1 <- params$matrix_v1 matrix_v2 <- params$matrix_v2 alpha <- params$alpha
## Values of variables
md5sum
cpm_value
cpm_perc
excluded_samples
alpha
design_base
design_value
matrix_v1
matrix_v2
## Match sample sheet & count data data_samples <- data_samples[!rownames(data_samples) %in% excluded_samples, , drop = FALSE] data_counts <- data_counts[, !colnames(data_counts) %in% excluded_samples] ## Read data as Summarized Experiment se <- readCountsFromTable(data_counts, data_samples) se <- addSamplesFromTableToSE(se, data_samples) ## Alignment summary plot_data <- alignment_summary(se) bar_plot( df = plot_data, x = "count", y = "sample", group = design_base, fill = "feature", rev = TRUE, title = "Count assignments", xlab = "Counts", ylab = "" ) ## Complexity plot_data <- complexity(se) line_plot( df = plot_data, x = "rank", y = "fraction", group = design_base, plot = "complexity", title = "Gene complexity", xlab = "Cumulative reads per number of genes", ylab = "Number of genes" ) ## Create final Summarized Experiment se <- readCountsFromTable(data_counts[!grepl('^__', rownames(data_counts)), ], data_samples) se <- addSamplesFromTableToSE(se, data_samples) if (!is.null(data_annotation)) { se <- addAnnotationsFromTableToSE(se, data_annotation) }
## Create DGE list dge <- DGEList(counts = assay(se), samples = colData(se), genes = rowData(se)) row.names(dge$genes) <- row.names(dge$counts) ## Remove genes with all 0 counts dge <- dge[rowSums(abs(dge$counts)) > 1, ] ## Count distribution tempDge <- dge tempDge$counts <- cpm(dge, log = TRUE, prior.count = 1) plot_data <- count_dist(tempDge) line_plot( df = plot_data, x = "x", y = "y", group = design_base, title = "Gene count distribution", xlab = "Log2CPM", ylab = "Density" )
## Filter data on low expressed genes edger <- calcNormFactors(dge, method = "TMM") counts <- cpm(edger, log = TRUE, prior.count = 1) selectedFeatures <- rownames(edger)[apply(counts, 1, function(v) sum(v >= cpm_value)) >= (cpm_perc / 100) * ncol(counts)] ## Get high expressed genes highExprDge <- dge[selectedFeatures, , keep.lib.sizes = FALSE] ## Normalize data normDge <- calcNormFactors(highExprDge, method = "TMM") ## Count distribution tempDge <- normDge tempDge$counts <- cpm(normDge, log = TRUE, prior.count = 1) plot_data <- count_dist(tempDge) line_plot( df = plot_data, x = "x", y = "y", group = design_base, title = "Gene count distribution", xlab = "Log2CPM", ylab = "Density" ) ## PCA plot_data <- pca_data(tempDge) scatter_plot( df = plot_data, x = "PC1", y = "PC2", group = design_base, size = 4, title = "PCA", xlab = paste0("PC1", " (", plot_data$percent[1], "%)"), ylab = paste0("PC2", " (", plot_data$percent[2], "%)") )
## Create design get_design <- createDesign(normDge$samples, design_base, design_value, matrix_v1, matrix_v2) get_design normDge <- relevelSamples(normDge, design_base, design_value, matrix_v1, matrix_v2) design <- model.matrix(eval(parse(text = get_design)), normDge$samples) design ## Get contrast values get_matrix1 <- createMatrix(normDge, design_base, design_value, matrix_v1) get_matrix2 <- createMatrix(normDge, design_base, design_value, matrix_v2) get_matrix1 get_matrix2 ## Create contrast contrast <- createContrast(design, get_matrix1, get_matrix2) contrast
## Run analysis normDge <- estimateDisp(normDge, design) edfit <- glmQLFit(normDge, design) efit <- glmQLFTest(edfit, contrast = contrast) ## Get DE values efit$DE <- decideTests(efit, p.value = alpha) summary(efit$DE)
## Get DE tables deTab <- topTags(efit, n = Inf)$table deTab <- deTab[order(rownames(deTab)), ] deTab$DE <- c(efit$DE[order(rownames(efit$DE)), ]) ## Reorder & rename table values deTab$F <- NULL deTab <- rename(deTab, "avgLog2CPM" = "logCPM") deTab <- rename(deTab, "avgLog2FC" = "logFC") deTab <- rename(deTab, "P.Value" = "PValue") deOrder <- c("avgLog2CPM", "avgLog2FC", "P.Value", "FDR", "DE") ## Reorder annotation (if present) if (!is.null(data_annotation)) { deTab <- deTab[colnames(deTab) [c(1, match(deOrder, names(deTab)), 2:(ncol(deTab) - 5))]] } else { deTab <- deTab[deOrder] } ## Add gene symbol if (setGeneName == "symbol" && !is.null(data_annotation)) { tempCol <- rownames(deTab) rownames(deTab) <- make.names(deTab$geneName, unique = TRUE) deTab$geneName <- tempCol colnames(deTab)[1] <- "geneId" rownames(normDge$counts) <- rownames(deTab) }
## MA index <- round(seq(1, nrow(deTab), length.out = 1000)) plot_data <- ma(deTab) scatter_plot( df = plot_data, x = "avgLog2CPM", y = "avgLog2FC", group = "DE", index = index, title = "MA Plot", xlab = "Average Log2CPM", ylab = "Average Log2FC" ) ## P-value plot_data <- pvalue_data(deTab) bar_plot( df = plot_data, x = "p", y = "x", title = "P-Value plot", xlab = "P-Value", ylab = "Count" )
## Save data files deTab <- deTab[order(deTab$FDR),] normDge$counts <- cpm(normDge, log = TRUE, prior.count = 1) save(deTab, normDge, file = "analysis.RData")
## Session info sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.