DE analysis (by edgeR)

# DE.analysis <- function(m, group) {
#   d <- DGEList(counts = rsem.count, group = group)
#   design <- model.matrix( ~ group)
#   d <-
#   d[keep <-
#   rowSums(cpm(d) > 1) >= nrow(design.table) * params$min.expressed.sample / 100, , keep.lib.sizes =
#   FALSE]
#   d <- calcNormFactors(d)
#   d <- estimateDisp(d, design = design)
#   de <- exactTest(d)
#   de <- topTags(de, n = length(rownames(de$table)))
#   deg <- row.names(de$table[with(de$table, PValue < 0.05),][order(-de$table[with(de$table, PValue < 0.05),]$logCPM),][1:20,])
#   log.cpm <- cpm(d, prior.count=2, log=TRUE)
#   log.cpm <- data.frame(log.cpm)
#   heatmap.m <- log.cpm[deg,]
#   return(list(de$table, log.cpm, heatmap.m))
# }

suppressWarnings(rm(rsem.count))

rsem.count <- fread(type.list$RSEM)[Type!="protein_coding",]
rna.id <- rsem.count[,1]
rna.type <- rsem.count[,2]
design.table <- fread(type.list$Design, header = TRUE)
rsem.count <- rsem.count[, round(.SD), .SDcols = -(1:2), with = TRUE]
group <- as.factor(design.table$condition)
d <- edgeR::DGEList(counts = rsem.count, group = group)
design <- model.matrix( ~ group)
# Important!!! The function keep.lib.sizes belongs to subsetting family in package edgeR.
# The DGEList S3 generic must be imported first.
# According to the source code:
# assign("[.DGEList",
# function(object, i, j, keep.lib.sizes=TRUE)
# You should: 
# @importFrom edgeR "[.DGEList"
d <- d[keep <- rowSums(edgeR::cpm(d) > 1) >= nrow(design.table) * params$min.expressed.sample / 100, ,
       keep.lib.sizes = FALSE]
d <- calcNormFactors(d)
d <- estimateDisp(d, design = design)
de <- exactTest(d)
de <- topTags(de, n = length(rownames(de$table)))
deg <- row.names(de$table[with(de$table,
                               PValue < 0.05),][order(-de$table[with(de$table,
                                                                     PValue < 0.05),]$logCPM),])
log.cpm <- edgeR::cpm(d, prior.count=2, log=TRUE)
log.cpm <- data.frame(log.cpm)
heatmap.m <- log.cpm[deg,]
row.names(heatmap.m) <- as.vector(t(rna.id[,1]))[as.numeric(row.names(heatmap.m))]

Column {.tabset}

Vocalno plot

de$table$threshold <- as.factor(abs(de$table$logFC) > 2 & de$table$PValue < 0.05)
p <- ggplot(data = de$table, aes(x = logFC, y = -log10(PValue), colour = threshold)) +
  geom_point(size = 1.75, alpha = 0.4) + xlab("log2 fold change") + ylab("-log10 p-value") +
  # plotly does not support expression object now
  # xlab(expression("log"[2]*"(Fold Change)")) + ylab(expression("-log"[10]*"(p-value)")) +
  geom_vline(xintercept = 0, colour = "grey", linetype = "dashed", size = 1) +
  geom_hline(yintercept = -log10(0.05), colour = "grey", linetype = "dashed", size = 1) + 
  theme(legend.position = "none") + scale_x_continuous(limits = c(-10, 10)) + 
  get(paste0('scale_color_',params$theme))()
save_plot('vocano.tiff', p, base_height = 8.5, base_width = 11, dpi = 300, compression = 'lzw')
save_plot('vocano.pdf', p, base_height = 8.5, base_width = 11, dpi = 300)
p

Principal Components Analysis

pca.m <- prcomp(t(log.cpm), scale. = FALSE)
design.table$condition <- factor(toUpperFirstLetter(design.table$condition))
p <- ggbiplot::ggbiplot(pca.m, groups = design.table$condition, ellipse = FALSE, circle = TRUE, var.axes = FALSE) + scale_x_continuous(limits = c(-2, 2)) +
  labs(color = "Groups") + get(paste0('scale_color_',params$theme))()
save_plot('pca.tiff', p, base_height = 8.5, dpi = 300, compression = 'lzw')
save_plot('pca.pdf', p, base_height = 8.5, dpi = 300)
ggplotly(p)
rm(pca.m, p)
invisible(gc())

Heatmap

# pheatmap version
# pheatmap(heatmap.m, trace = 'none', margin = c(13, 13), display_numbers = TRUE, number_format = '%.1e')

# ggplot2 %>% plotly version
# ggheatmap <- function(exprs) {
#   # https://plot.ly/ggplot2/ggdendro-dendrograms/
#   require(ggdendro)
#   # require(stringr)
#   
#   features <- row.names(exprs)
#   exprs <- t(scale(t(exprs)))
#   
#   dd.col <- as.dendrogram(hclust(dist(exprs)))
#   dd.row <- as.dendrogram(hclust(dist(t(exprs))))
#   col.order <- order.dendrogram(dd.col)
#   dy <- dendro_data(dd.col)
#   row.order <- order.dendrogram(dd.row)
#   dx <- dendro_data(dd.row)
#   
#   ggdend <- function(df) {
#   ggplot() +
#   geom_segment(data = df, aes(
#   x = x,
#   y = y,
#   xend = xend,
#   yend = yend
#   )) +
#   labs(x = "", y = "") + theme_minimal() +
#   theme(
#   axis.text = element_blank(),
#   axis.ticks = element_blank(),
#   panel.grid = element_blank()
#   )
#   }
#   
#   exprs <-
#   data.table(features, t(scale(t(exprs)))[col.order, row.order])
#   exprs <- melt(exprs, id = 1, variable.name = 'sample.name')
#   
#   dendro.data.x <- dendro_data(dd.row)
#   dendro.data.y <- dendro_data(dd.col)
#   
#   px <- ggdend(dx$segments)
#   py <- ggdend(dy$segments) + coord_flip()
#   
#   heatmap.exprs <-
#   ggplot(exprs, aes(x = sample.name, y = features)) + geom_raster(aes(fill =
#   value)) + scale_fill_gradient2(
#   low = "green",
#   mid = "black",
#   high = "red",
#   midpoint = 0
#   ) + theme(axis.ticks = element_blank(),
#   axis.text = element_text(
#   size = rel(0.4),
#   angle = 45,
#   vjust = 0
#   ))
#   
#   eaxis <- list(showticklabels = FALSE,
#   showgrid = FALSE,
#   zeroline = FALSE)
#   
#   p_empty <- plot_ly() %>%
#   # note that margin applies to entire plot, so we can
#   # add it here to make tick labels more readable
#   layout(xaxis = eaxis,
#   yaxis = eaxis)
#   
#   subplot(
#   px,
#   p_empty,
#   heatmap.exprs,
#   py,
#   nrows = 2,
#   widths = c(0.8, 0.2),
#   heights = c(0.2, 0.8)
#   )
# }
# ggheatmap(heatmap.m)
# rm(ggheatmap)

heatmaply::heatmaply(heatmap.m, scale = 'row', xlab = "Sample", margins = c(60,110,40,20), row_text_angle = 45, column_text_angle = 30, cexRow = 0.8, cexCol = 0.8) %>% layout(margin = list(l = 100, b = 50, r = 0))
# invisible(gc())

Correlation heatmap

# ggplot2 %>% plotly version
# p <- ggplot() + geom_raster(data = melt(cor(heatmap.m)), aes(x = Var1, y = Var2, fill = value)) + labs(x = '', y = '') + guides(fill=guide_legend(title='Cor value')) + theme(axis.text = element_text(size = rel(0.8)), axis.text.x = element_text(angle = 60, vjust = 0.7))
# ggplotly(p)

heatmaply::heatmaply(cor(heatmap.m), margins = c(60, 80, 40, 20),
                     row_text_angle = 60, column_text_angle = 30, cexRow = 0.8, cexCol = 0.8,
          k_col = 2, k_row = 2,
          limits = c(-1,1)) %>% layout(margin = list(l = 50, b = 50))

Column

Description

Title: Differential expression analysis of novel and known lncRNA from LncPipe (edgeR)

edgeR adopts a quartile-adjusted conditional maximum likelihood estimator to determine overdispersion parameters in negative binomial distribution. This approach is of improved performance than standard maximum likelihood estimator, especially when there are few replicates per condition. Dispersion parameters are later refined by adjusting towards a consensus value using cross-transcript information via empirical Bayes procedure. Differentially expressed genes are validated by an exact test to determine if reads count under different condition fit the same negative binomial distribution.

What we have done: Design.matrix file and reads count matrix were fed into edgeR, an R packages for identification differential expressed gene from RNA-seq or Chip-seq experiment. Before exact test statitistics, raw reads count matrix were filtered according to the user-defined parameter min.expressed.sample, which can be explained as follows: raw reads count was first normalized in to cpm (Counts Per Million reads) matrix , gene with cpm >1 in more than min.expressed.sample numbers were retained into futher analysis. in current version, we only implemented the standard comparison that only two condition were supportted in each analysis. Experimental without repelicates were not supported . Source code of differential expression analysis can be freely checked by https://github.com/bioinformatist/LncPipeReporter/edit/master/inst/rmd/DE.Rmd. Complementary experimental design can be performed seperately besed kallisto.count.txt generated by lncPipe, this file can be imported into another software IDEA, which focus on comprehensive differential expression analysis from expression matrix.

What dose plots Mean: In this section, Density plots, Vocalno plots, PCA analysis and Heatmap plot were show at left panel.

Density plots descripts the reads count distribution among lncRNA features, it can help check the libaray quality of each sample.
LncRNAs from samples with the similar condition were expected have the same distribution. The axis X means the reads count number, axis Y denotes the density or fraction of lncRNA expressed at each reads count.

Vocalno plots show the mean foldchange of lncRNA expression in differ conditions against corrected test significant values (-logFDR). At default situation, points at up-left corner are considered as down regulated and the points at up-right are up regulated.

PCA analysis shows Principle Components Analysis result of lncRNA expression matrix, two highest PC were extracted as X and Y axis into scatter plot. Points were colored with different condition involved in this analysis. Theoretically, samples from the same library should be closer than those not. This plot canbe used to control the variance introduced by biological/technical replicates or treatment conditions. In our analysis, PCA analysis were performed using all expressed lncRNA and can be deem as a unsupervised clustering analysis method.

Heatmap heatmap plot the correlation matrix, all expressed lncRNA are involved and unsupervised hierarchical clustering method were applied for cluster samples based on correlation matrix, which was displayed in a dendrogram in side.

Reference

Robinson MD, McCarthy DJ and Smyth GK (2010). “edgeR: a Bioconductor package for differential expression analysis of digital gene expression data.” Bioinformatics, 26(1), pp. 139-140.

McCarthy, J. D, Chen, Yunshun, Smyth and K. G (2012). “Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation.” Nucleic Acids Research, 40(10), pp. 4288-4297.

Differential expressed lncRNAs table

row.names(de$table) <- as.vector(t(rna.id[,1]))[as.numeric(row.names(de$table))]
fwrite(de$table[, -5], 'DE_lncRNA.csv', row.names = TRUE)
DT::datatable(head(de$table[, -5], n = 80L)) %>% DT::formatRound(c('logFC', 'logCPM', 'PValue', "FDR"), digits = 2)

Compare

Column {.tabset}

Density

compare.dt <- data.table(Type = as.vector(t(rna.type))[as.numeric(row.names(log.cpm))], log.cpm)
p <- ggplot() + geom_density(data = melt(compare.dt, id = 'Type'), aes(x = value, colour = Type), size = 1.5) + get(paste0('scale_color_',params$theme))()
save_plot('compare_density.tiff', p, base_height = 8.5, base_width = 11, dpi = 300, compression = 'lzw')
save_plot('compare_density.pdf', p, base_height = 8.5, base_width = 11, dpi = 300)
ggplotly(p)

Violin plot

compare.dt <- data.table(Type = as.vector(t(rna.type))[as.numeric(row.names(log.cpm))], log.cpm)
p <- ggplot() + geom_violin(data = melt(compare.dt, id = 'Type'), aes(x = Type, y = value))
save_plot('compare_violin.tiff', p, base_height = 8.5, base_width = 11, dpi = 300, compression = 'lzw')
save_plot('compare_violin.pdf', p, base_height = 8.5, base_width = 11, dpi = 300)
ggplotly(p)

Column

Description

Title: Comparation analysis between novel identified lncRNA and known ones

What we have done: On the basis of identified transicrpts as well as their expression in differ samples, we performed comparation analysis between categories defined by LncPipe, which are "known", "novel" and "protein coding". while protein coding information were extracted from GENECODE. To note, catergory not presented in the plot means that the corresponding catergory was not included in result files. We do not present data at sample level or condition level but show the overview comparison included in all samples.

What dose plots Mean:

Desity plot Presents the density of lncRNA or coding genes at each abundence, this plot show the overview distibution of normalized readscount or expression in certain system. The X-axis denotes the increasing expression value, the Y-axis with range 0 to 1 represents the fraction of gene count in total genes.

Volin plot The same as boxplot, also gives a overview distribution of gene's or transcript's expression.

rsem.count <- fread(type.list$RSEM)
rna.id <- rsem.count[,1]
rna.type <- rsem.count[,2]
rsem.count <- rsem.count[, round(.SD), .SDcols = -(1:2), with = TRUE]
group <- as.factor(design.table$condition)
d <- edgeR::DGEList(counts = rsem.count, group = group)
design <- model.matrix( ~ group)
# Important!!! The function keep.lib.sizes belongs to subsetting family in package edgeR.
# The DGEList S3 generic must be imported first.
# According to the source code:
# assign("[.DGEList",
# function(object, i, j, keep.lib.sizes=TRUE)
# You should: 
# @importFrom edgeR "[.DGEList"
d <- d[keep <- rowSums(edgeR::cpm(d) > 1) >= nrow(design.table) * params$min.expressed.sample / 100, ,
       keep.lib.sizes = FALSE]
d <- calcNormFactors(d)
d <- estimateDisp(d, design = design)
de <- exactTest(d)
de <- topTags(de, n = length(rownames(de$table)))
row.names(de$table) <- as.vector(t(rna.id[,1]))[as.numeric(row.names(de$table))]
fwrite(de$table[, -5], 'DE_all.csv', row.names = TRUE)
DT::datatable(head(de$table[, -5], n = 80L)) %>% DT::formatRound(c('logFC', 'logCPM', 'PValue', "FDR"), digits = 2)


bioinformatist/LncPipeReporter documentation built on May 28, 2019, 7:11 p.m.