# DE.analysis <- function(m, group) { # d <- DGEList(counts = rsem.count, group = group) # design <- model.matrix( ~ group) # d <- # d[keep <- # rowSums(cpm(d) > 1) >= nrow(design.table) * params$min.expressed.sample / 100, , keep.lib.sizes = # FALSE] # d <- calcNormFactors(d) # d <- estimateDisp(d, design = design) # de <- exactTest(d) # de <- topTags(de, n = length(rownames(de$table))) # deg <- row.names(de$table[with(de$table, PValue < 0.05),][order(-de$table[with(de$table, PValue < 0.05),]$logCPM),][1:20,]) # log.cpm <- cpm(d, prior.count=2, log=TRUE) # log.cpm <- data.frame(log.cpm) # heatmap.m <- log.cpm[deg,] # return(list(de$table, log.cpm, heatmap.m)) # } suppressWarnings(rm(rsem.count)) rsem.count <- fread(type.list$RSEM)[Type!="protein_coding",] rna.id <- rsem.count[,1] rna.type <- rsem.count[,2] design.table <- fread(type.list$Design, header = TRUE) rsem.count <- rsem.count[, round(.SD), .SDcols = -(1:2), with = TRUE] group <- as.factor(design.table$condition) d <- edgeR::DGEList(counts = rsem.count, group = group) design <- model.matrix( ~ group) # Important!!! The function keep.lib.sizes belongs to subsetting family in package edgeR. # The DGEList S3 generic must be imported first. # According to the source code: # assign("[.DGEList", # function(object, i, j, keep.lib.sizes=TRUE) # You should: # @importFrom edgeR "[.DGEList" d <- d[keep <- rowSums(edgeR::cpm(d) > 1) >= nrow(design.table) * params$min.expressed.sample / 100, , keep.lib.sizes = FALSE] d <- calcNormFactors(d) d <- estimateDisp(d, design = design) de <- exactTest(d) de <- topTags(de, n = length(rownames(de$table))) deg <- row.names(de$table[with(de$table, PValue < 0.05),][order(-de$table[with(de$table, PValue < 0.05),]$logCPM),]) log.cpm <- edgeR::cpm(d, prior.count=2, log=TRUE) log.cpm <- data.frame(log.cpm) heatmap.m <- log.cpm[deg,] row.names(heatmap.m) <- as.vector(t(rna.id[,1]))[as.numeric(row.names(heatmap.m))]
de$table$threshold <- as.factor(abs(de$table$logFC) > 2 & de$table$PValue < 0.05) p <- ggplot(data = de$table, aes(x = logFC, y = -log10(PValue), colour = threshold)) + geom_point(size = 1.75, alpha = 0.4) + xlab("log2 fold change") + ylab("-log10 p-value") + # plotly does not support expression object now # xlab(expression("log"[2]*"(Fold Change)")) + ylab(expression("-log"[10]*"(p-value)")) + geom_vline(xintercept = 0, colour = "grey", linetype = "dashed", size = 1) + geom_hline(yintercept = -log10(0.05), colour = "grey", linetype = "dashed", size = 1) + theme(legend.position = "none") + scale_x_continuous(limits = c(-10, 10)) + get(paste0('scale_color_',params$theme))() save_plot('vocano.tiff', p, base_height = 8.5, base_width = 11, dpi = 300, compression = 'lzw') save_plot('vocano.pdf', p, base_height = 8.5, base_width = 11, dpi = 300) p
pca.m <- prcomp(t(log.cpm), scale. = FALSE) design.table$condition <- factor(toUpperFirstLetter(design.table$condition)) p <- ggbiplot::ggbiplot(pca.m, groups = design.table$condition, ellipse = FALSE, circle = TRUE, var.axes = FALSE) + scale_x_continuous(limits = c(-2, 2)) + labs(color = "Groups") + get(paste0('scale_color_',params$theme))() save_plot('pca.tiff', p, base_height = 8.5, dpi = 300, compression = 'lzw') save_plot('pca.pdf', p, base_height = 8.5, dpi = 300) ggplotly(p) rm(pca.m, p) invisible(gc())
# pheatmap version # pheatmap(heatmap.m, trace = 'none', margin = c(13, 13), display_numbers = TRUE, number_format = '%.1e') # ggplot2 %>% plotly version # ggheatmap <- function(exprs) { # # https://plot.ly/ggplot2/ggdendro-dendrograms/ # require(ggdendro) # # require(stringr) # # features <- row.names(exprs) # exprs <- t(scale(t(exprs))) # # dd.col <- as.dendrogram(hclust(dist(exprs))) # dd.row <- as.dendrogram(hclust(dist(t(exprs)))) # col.order <- order.dendrogram(dd.col) # dy <- dendro_data(dd.col) # row.order <- order.dendrogram(dd.row) # dx <- dendro_data(dd.row) # # ggdend <- function(df) { # ggplot() + # geom_segment(data = df, aes( # x = x, # y = y, # xend = xend, # yend = yend # )) + # labs(x = "", y = "") + theme_minimal() + # theme( # axis.text = element_blank(), # axis.ticks = element_blank(), # panel.grid = element_blank() # ) # } # # exprs <- # data.table(features, t(scale(t(exprs)))[col.order, row.order]) # exprs <- melt(exprs, id = 1, variable.name = 'sample.name') # # dendro.data.x <- dendro_data(dd.row) # dendro.data.y <- dendro_data(dd.col) # # px <- ggdend(dx$segments) # py <- ggdend(dy$segments) + coord_flip() # # heatmap.exprs <- # ggplot(exprs, aes(x = sample.name, y = features)) + geom_raster(aes(fill = # value)) + scale_fill_gradient2( # low = "green", # mid = "black", # high = "red", # midpoint = 0 # ) + theme(axis.ticks = element_blank(), # axis.text = element_text( # size = rel(0.4), # angle = 45, # vjust = 0 # )) # # eaxis <- list(showticklabels = FALSE, # showgrid = FALSE, # zeroline = FALSE) # # p_empty <- plot_ly() %>% # # note that margin applies to entire plot, so we can # # add it here to make tick labels more readable # layout(xaxis = eaxis, # yaxis = eaxis) # # subplot( # px, # p_empty, # heatmap.exprs, # py, # nrows = 2, # widths = c(0.8, 0.2), # heights = c(0.2, 0.8) # ) # } # ggheatmap(heatmap.m) # rm(ggheatmap) heatmaply::heatmaply(heatmap.m, scale = 'row', xlab = "Sample", margins = c(60,110,40,20), row_text_angle = 45, column_text_angle = 30, cexRow = 0.8, cexCol = 0.8) %>% layout(margin = list(l = 100, b = 50, r = 0)) # invisible(gc())
# ggplot2 %>% plotly version # p <- ggplot() + geom_raster(data = melt(cor(heatmap.m)), aes(x = Var1, y = Var2, fill = value)) + labs(x = '', y = '') + guides(fill=guide_legend(title='Cor value')) + theme(axis.text = element_text(size = rel(0.8)), axis.text.x = element_text(angle = 60, vjust = 0.7)) # ggplotly(p) heatmaply::heatmaply(cor(heatmap.m), margins = c(60, 80, 40, 20), row_text_angle = 60, column_text_angle = 30, cexRow = 0.8, cexCol = 0.8, k_col = 2, k_row = 2, limits = c(-1,1)) %>% layout(margin = list(l = 50, b = 50))
Title: Differential expression analysis of novel and known lncRNA from LncPipe (edgeR)
edgeR adopts a quartile-adjusted conditional maximum likelihood estimator to determine overdispersion parameters in negative binomial distribution. This approach is of improved performance than standard maximum likelihood estimator, especially when there are few replicates per condition. Dispersion parameters are later refined by adjusting towards a consensus value using cross-transcript information via empirical Bayes procedure. Differentially expressed genes are validated by an exact test to determine if reads count under different condition fit the same negative binomial distribution.
What we have done: Design.matrix
file and reads count matrix were fed into edgeR, an R packages for identification differential expressed gene from RNA-seq or Chip-seq experiment. Before exact test statitistics, raw reads count matrix were filtered according to the user-defined parameter min.expressed.sample
, which can be explained as follows: raw reads count was first normalized in to cpm (Counts Per Million reads) matrix , gene with cpm >1 in more than min.expressed.sample
numbers were retained into futher analysis. in current version, we only implemented the standard comparison that only two condition were supportted in each analysis. Experimental without repelicates were not supported . Source code of differential expression analysis can be freely checked by https://github.com/bioinformatist/LncPipeReporter/edit/master/inst/rmd/DE.Rmd. Complementary experimental design can be performed seperately besed kallisto.count.txt
generated by lncPipe, this file can be imported into another software IDEA, which focus on comprehensive differential expression analysis from expression matrix.
What dose plots Mean: In this section, Density plots
, Vocalno plots
, PCA analysis
and Heatmap plot
were show at left panel.
Density plots
descripts the reads count distribution among lncRNA features, it can help check the libaray quality of each sample.
LncRNAs from samples with the similar condition were expected have the same distribution. The axis X means the reads count number, axis Y denotes the density or fraction of lncRNA expressed at each reads count.
Vocalno plots
show the mean foldchange of lncRNA expression in differ conditions against corrected test significant values (-logFDR). At default situation, points at up-left corner are considered as down regulated and the points at up-right are up regulated.
PCA analysis
shows Principle Components Analysis result of lncRNA expression matrix, two highest PC were extracted as X and Y axis into scatter plot. Points were colored with different condition involved in this analysis. Theoretically, samples from the same library should be closer than those not. This plot canbe used to control the variance introduced by biological/technical replicates or treatment conditions. In our analysis, PCA analysis were performed using all expressed lncRNA and can be deem as a unsupervised clustering analysis method.
Heatmap
heatmap plot the correlation matrix, all expressed lncRNA are involved and unsupervised hierarchical clustering method were applied for cluster samples based on correlation matrix, which was displayed in a dendrogram in side.
Reference
Robinson MD, McCarthy DJ and Smyth GK (2010). “edgeR: a Bioconductor package for differential expression analysis of digital gene expression data.” Bioinformatics, 26(1), pp. 139-140.
McCarthy, J. D, Chen, Yunshun, Smyth and K. G (2012). “Differential expression analysis of multifactor RNA-Seq experiments with respect to biological variation.” Nucleic Acids Research, 40(10), pp. 4288-4297.
row.names(de$table) <- as.vector(t(rna.id[,1]))[as.numeric(row.names(de$table))] fwrite(de$table[, -5], 'DE_lncRNA.csv', row.names = TRUE) DT::datatable(head(de$table[, -5], n = 80L)) %>% DT::formatRound(c('logFC', 'logCPM', 'PValue', "FDR"), digits = 2)
compare.dt <- data.table(Type = as.vector(t(rna.type))[as.numeric(row.names(log.cpm))], log.cpm) p <- ggplot() + geom_density(data = melt(compare.dt, id = 'Type'), aes(x = value, colour = Type), size = 1.5) + get(paste0('scale_color_',params$theme))() save_plot('compare_density.tiff', p, base_height = 8.5, base_width = 11, dpi = 300, compression = 'lzw') save_plot('compare_density.pdf', p, base_height = 8.5, base_width = 11, dpi = 300) ggplotly(p)
compare.dt <- data.table(Type = as.vector(t(rna.type))[as.numeric(row.names(log.cpm))], log.cpm) p <- ggplot() + geom_violin(data = melt(compare.dt, id = 'Type'), aes(x = Type, y = value)) save_plot('compare_violin.tiff', p, base_height = 8.5, base_width = 11, dpi = 300, compression = 'lzw') save_plot('compare_violin.pdf', p, base_height = 8.5, base_width = 11, dpi = 300) ggplotly(p)
Title: Comparation analysis between novel identified lncRNA and known ones
What we have done: On the basis of identified transicrpts as well as their expression in differ samples, we performed comparation analysis between categories defined by LncPipe, which are "known", "novel" and "protein coding". while protein coding information were extracted from GENECODE. To note, catergory not presented in the plot means that the corresponding catergory was not included in result files. We do not present data at sample level or condition level but show the overview comparison included in all samples.
What dose plots Mean:
Desity plot
Presents the density of lncRNA or coding genes at each abundence, this plot show the overview distibution of normalized readscount or expression in certain system. The X-axis denotes the increasing expression value, the Y-axis with range 0 to 1 represents the fraction of gene count in total genes.
Volin plot
The same as boxplot, also gives a overview distribution of gene's or transcript's expression.
rsem.count <- fread(type.list$RSEM) rna.id <- rsem.count[,1] rna.type <- rsem.count[,2] rsem.count <- rsem.count[, round(.SD), .SDcols = -(1:2), with = TRUE] group <- as.factor(design.table$condition) d <- edgeR::DGEList(counts = rsem.count, group = group) design <- model.matrix( ~ group) # Important!!! The function keep.lib.sizes belongs to subsetting family in package edgeR. # The DGEList S3 generic must be imported first. # According to the source code: # assign("[.DGEList", # function(object, i, j, keep.lib.sizes=TRUE) # You should: # @importFrom edgeR "[.DGEList" d <- d[keep <- rowSums(edgeR::cpm(d) > 1) >= nrow(design.table) * params$min.expressed.sample / 100, , keep.lib.sizes = FALSE] d <- calcNormFactors(d) d <- estimateDisp(d, design = design) de <- exactTest(d) de <- topTags(de, n = length(rownames(de$table))) row.names(de$table) <- as.vector(t(rna.id[,1]))[as.numeric(row.names(de$table))] fwrite(de$table[, -5], 'DE_all.csv', row.names = TRUE) DT::datatable(head(de$table[, -5], n = 80L)) %>% DT::formatRound(c('logFC', 'logCPM', 'PValue', "FDR"), digits = 2)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.