Load packages

## Load required packages
suppressPackageStartupMessages({
  library(knitr)
  library(SummarizedExperiment)
  library(edgeR)
  library(tidyr)
  library(scales)
  library(plotly)
})

Input

Load input variables

## Import variables
md5sum <- params$md5sum
data_samples <- params$data_samples
data_counts <- params$data_counts
data_annotation <- params$data_annotation
setGeneName <- params$setGeneName
cpm_value <- params$cpm_value
cpm_perc <- params$cpm_perc
excluded_samples <- params$excluded_samples
design_base <- params$design_base
design_value <- params$design_value
matrix_v1 <- params$matrix_v1
matrix_v2 <- params$matrix_v2
alpha <- params$alpha

Show input variables

## Values of variables
md5sum
cpm_value
cpm_perc
excluded_samples
alpha
design_base
design_value
matrix_v1
matrix_v2

Pre-process data

Create Summarized Experiment

## Match sample sheet & count data
data_samples <- data_samples[!rownames(data_samples) %in% excluded_samples, , drop = FALSE]
data_counts <- data_counts[, !colnames(data_counts) %in% excluded_samples]

## Read data as Summarized Experiment
se <- readCountsFromTable(data_counts, data_samples)
se <- addSamplesFromTableToSE(se, data_samples)

## Alignment summary
plot_data <- alignment_summary(se)
bar_plot(
  df = plot_data,
  x = "count",
  y = "sample",
  group = design_base,
  fill = "feature",
  rev = TRUE,
  title = "Count assignments",
  xlab = "Counts",
  ylab = ""
)

## Complexity
plot_data <- complexity(se)
line_plot(
  df = plot_data,
  x = "rank",
  y = "fraction",
  group = design_base,
  plot = "complexity",
  title = "Gene complexity",
  xlab = "Cumulative reads per number of genes",
  ylab = "Number of genes"
)

## Create final Summarized Experiment
se <- readCountsFromTable(data_counts[!grepl('^__', rownames(data_counts)), ], data_samples)
se <- addSamplesFromTableToSE(se, data_samples)
if (!is.null(data_annotation)) {
  se <- addAnnotationsFromTableToSE(se, data_annotation)
}

Create DGE list

## Create DGE list
dge <- DGEList(counts = assay(se),
               samples = colData(se),
               genes = rowData(se))
row.names(dge$genes) <- row.names(dge$counts)

## Remove genes with all 0 counts
dge <- dge[rowSums(abs(dge$counts)) > 1, ]

## Count distribution
tempDge <- dge
tempDge$counts <- cpm(dge, log = TRUE, prior.count = 1)
plot_data <- count_dist(tempDge)
line_plot(
  df = plot_data,
  x = "x",
  y = "y",
  group = design_base,
  title = "Gene count distribution",
  xlab = "Log2CPM",
  ylab = "Density"
)

Filter & normalize data

## Filter data on low expressed genes
edger <- calcNormFactors(dge, method = "TMM")
counts <- cpm(edger, log = TRUE, prior.count = 1)
selectedFeatures <- rownames(edger)[apply(counts, 1, function(v)
  sum(v >= cpm_value)) >= (cpm_perc / 100) * ncol(counts)]

## Get high expressed genes
highExprDge <- dge[selectedFeatures, , keep.lib.sizes = FALSE]

## Normalize data
normDge <- calcNormFactors(highExprDge, method = "TMM")

## Count distribution
tempDge <- normDge
tempDge$counts <- cpm(normDge, log = TRUE, prior.count = 1)
plot_data <- count_dist(tempDge)
line_plot(
  df = plot_data,
  x = "x",
  y = "y",
  group = design_base,
  title = "Gene count distribution",
  xlab = "Log2CPM",
  ylab = "Density"
)

## PCA
plot_data <- pca_data(tempDge)
scatter_plot(
  df = plot_data,
    x = "PC1",
  y = "PC2",
  group = design_base,
  size = 4,
  title = "PCA",
  xlab = paste0("PC1", " (", plot_data$percent[1], "%)"),
  ylab = paste0("PC2", " (", plot_data$percent[2], "%)")
)

Create design and contrasts

## Create design
get_design <- createDesign(normDge$samples, design_base, design_value, matrix_v1, matrix_v2)
get_design
normDge <- relevelSamples(normDge, design_base, design_value, matrix_v1, matrix_v2)
design <- model.matrix(eval(parse(text = get_design)), normDge$samples)
design

## Get contrast values
get_matrix1 <- createMatrix(normDge, design_base, design_value, matrix_v1)
get_matrix2 <- createMatrix(normDge, design_base, design_value, matrix_v2)
get_matrix1
get_matrix2

## Create contrast
contrast <- createContrast(design, get_matrix1, get_matrix2)
contrast

Perform analysis

## Run analysis
normDge <- estimateDisp(normDge, design)
edfit <- glmQLFit(normDge, design)
efit <- glmQLFTest(edfit, contrast = contrast)

## Get DE values
efit$DE <- decideTests(efit, p.value = alpha)
summary(efit$DE)

Format DE output table

## Get DE tables
deTab <- topTags(efit, n = Inf)$table
deTab <- deTab[order(rownames(deTab)), ]
deTab$DE <- c(efit$DE[order(rownames(efit$DE)), ])

## Reorder & rename table values
deTab$F <- NULL
deTab <- rename(deTab, "avgLog2CPM" = "logCPM")
deTab <- rename(deTab, "avgLog2FC" = "logFC")
deTab <- rename(deTab, "P.Value" = "PValue")
deOrder <- c("avgLog2CPM", "avgLog2FC", "P.Value", "FDR", "DE")

## Reorder annotation (if present)
if (!is.null(data_annotation)) {
  deTab <- deTab[colnames(deTab)
                 [c(1, match(deOrder, names(deTab)), 2:(ncol(deTab) - 5))]]
} else {
  deTab <- deTab[deOrder]
}

## Add gene symbol
if (setGeneName == "symbol" && !is.null(data_annotation)) {
  tempCol <- rownames(deTab)
  rownames(deTab) <- make.names(deTab$geneName, unique = TRUE)
  deTab$geneName <- tempCol
  colnames(deTab)[1] <- "geneId"
  rownames(normDge$counts) <- rownames(deTab)
}

DE results

## MA
index <- round(seq(1, nrow(deTab), length.out = 1000))
plot_data <- ma(deTab)
scatter_plot(
  df = plot_data,
  x = "avgLog2CPM",
  y = "avgLog2FC",
  group = "DE",
  index = index,
  title = "MA Plot",
  xlab = "Average Log2CPM",
  ylab = "Average Log2FC"
)

## P-value
plot_data <- pvalue_data(deTab)
bar_plot(
  df = plot_data,
  x = "p",
  y = "x",
  title = "P-Value plot",
  xlab = "P-Value",
  ylab = "Count"
)

Save results

## Save data files
deTab <- deTab[order(deTab$FDR),]
normDge$counts <- cpm(normDge, log = TRUE, prior.count = 1)
save(deTab, normDge, file = "analysis.RData")

Session info

## Session info
sessionInfo()


LUMC/DGE_analysis documentation built on Aug. 12, 2022, 11:04 p.m.