# Load the lncRNA table from the path stored in type.list$lncRNA.
lncRNA <- fread(type.list$lncRNA)

# Assign the seven column names positionally. The sixth name is a placeholder
# (Chinese for roughly "how would I know what this is?"); it is kept verbatim
# because it is a runtime column name.
setnames(
  lncRNA,
  c('Gene', 'Transcript', 'Type', 'Potential', 'Length', '我怎么知道这是什么东西?', 'Class')
)

# Make the type label presentable: replace the first underscore with a space,
# then pass it through toUpperFirstLetter() (helper defined elsewhere;
# presumably capitalises the first letter -- confirm against its definition).
lncRNA[, Type := toUpperFirstLetter(sub("_", " ", Type, fixed = TRUE))]
# Order first, for using geom_line later (geom_step behaves badly when calling
# plotly, see https://github.com/ropensci/plotly/issues/1030)
# cdf <- cdf[order(cdf[, 1], cdf[, 2])]

# Subset data randomly: keep params$cdf.percent percent of the rows.
# The requested size is truncated to a whole number and clamped to
# [0, nrow(lncRNA)] so that a percentage above 100 uses every row instead of
# making the sampler fail with "cannot take a sample larger than the population".
subset.size <- floor(nrow(lncRNA) * params$cdf.percent / 100)
subset.size <- max(0, min(nrow(lncRNA), subset.size))
lncRNA.subset <- lncRNA[sample.int(nrow(lncRNA), subset.size), ]
# Empirical CDF of the coding-potential score, one curve per lncRNA type,
# drawn from the random subset. The dashed grey line marks y = 1.
# The colour scale is looked up by name: scale_color_<params$theme>().
p <- ggplot() +
  geom_line(
    data = lncRNA.subset,
    aes(x = Potential, colour = Type),
    size = 2,
    stat = 'ecdf'
  ) +
  scale_x_continuous(expand = c(.01, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  # Axis label spelling fixed ("Probablity" -> "Probability").
  labs(x = 'Coding Probability(CPAT)', title = "Coding Potential", y = "CDF") +
  geom_hline(yintercept = 1, colour = "grey", linetype = "dashed", size = 1) +
  get(paste0('scale_color_', params$theme))()

# Export the static figure as LZW-compressed TIFF and as PDF.
save_plot('CDF.tiff', p, base_height = 8.5, base_width = 11, dpi = 300, compression = 'lzw')
save_plot('CDF.pdf', p, base_height = 8.5, base_width = 11, dpi = 300)

# Interactive version for the report.
ggplotly(p)

# Drop the plot object and trigger garbage collection without printing.
rm(p)
invisible(gc())
# Use sum of exon length as transcript length.
# Length is replaced by its sum within each (Gene, Type) group, then one row
# per Gene is kept.
# copy() is required here: `:=` updates a data.table by reference, so without
# it the Length column of `lncRNA` itself would be overwritten with the group
# sums (and re-running this chunk would sum the sums again).
lncRNA.gtf <- unique(
  copy(lncRNA)[, Length := sum(Length), by = .(Gene, Type)],
  by = 'Gene'
)
# Cap lengths at params$max.lncrna.len so that everything longer falls on the
# last axis tick (labelled "<max>+").
lncRNA.gtf[Length > params$max.lncrna.len, Length := params$max.lncrna.len]

# Ten evenly spaced ticks from 200 to the cap. Labels are derived from the
# breaks themselves so every tick shows its own position; previously the
# labels came from a separate 9-point sequence and did not match the ticks.
# The last tick is labelled "<max>+" because it also holds the capped values.
length.breaks <- seq.int(200, params$max.lncrna.len, length.out = 10)
length.labels <- c(
  round(head(length.breaks, -1)),
  paste0(params$max.lncrna.len, '+')
)

# Density of transcript length, one curve per lncRNA type.
# The colour scale is looked up by name: scale_color_<params$theme>().
p <- ggplot() +
  geom_density(data = lncRNA.gtf, aes(x = Length, colour = Type), size = 1.5) +
  xlab('Transcript length') +
  ylab('Density') +
  scale_x_continuous(
    breaks = length.breaks,
    labels = length.labels,
    expand = c(0.01, 0)
  ) +
  scale_y_continuous(expand = c(0.01, 0)) +
  get(paste0('scale_color_', params$theme))()

# Export the static figure as LZW-compressed TIFF and as PDF.
save_plot('lncRNA_length_distribution_with_type.tiff', p, base_height = 8.5, base_width = 11, dpi = 300, compression = 'lzw')
save_plot('lncRNA_length_distribution_with_type.pdf', p, base_height = 8.5, base_width = 11, dpi = 300)

# Interactive version, with a 50 px right margin.
ggplotly(p) %>% layout(margin = list(r = 50))

# Drop the plot object and trigger garbage collection without printing.
rm(p)
invisible(gc())
# Ten evenly spaced ticks from 200 to params$max.lncrna.len. Labels are
# derived from the breaks themselves so every tick shows its own position;
# previously the labels came from a separate 9-point sequence and did not
# match the ticks. The last tick is labelled "<max>+".
length.breaks <- seq.int(200, params$max.lncrna.len, length.out = 10)
length.labels <- c(
  round(head(length.breaks, -1)),
  paste0(params$max.lncrna.len, '+')
)

# Histogram of lncRNA length over all types, in bins of width 100.
p <- ggplot() +
  geom_histogram(data = lncRNA.gtf, aes(x = Length), binwidth = 100) +
  scale_x_continuous(
    breaks = length.breaks,
    labels = length.labels,
    expand = c(0.01, 0)
  ) +
  scale_y_continuous(expand = c(0, 10)) +
  labs(x = 'lncRNA length', y = 'Count')

# Export the static figure as LZW-compressed TIFF and as PDF.
save_plot('lncRNA_length_distribution.tiff', p, base_height = 8.5, base_width = 11, dpi = 300, compression = 'lzw')
save_plot('lncRNA_length_distribution.pdf', p, base_height = 8.5, base_width = 11, dpi = 300)

# Interactive version, with a 50 px right margin.
ggplotly(p) %>% layout(margin = list(r = 50))

# Drop the plot object and trigger garbage collection without printing.
rm(p)
invisible(gc())
# Number of rows of lncRNA in each Class.
cls <- lncRNA[, .(count = .N), by = Class]

# Fixed palette of five sector colours.
colors <- c(
  'rgb(211,94,96)',
  'rgb(128,133,133)',
  'rgb(144,103,167)',
  'rgb(171,104,87)',
  'rgb(114,147,203)'
)

# Both axes are hidden in the same way, so the settings are defined once.
hidden.axis <- list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)

# Pie chart of the class counts: label and percentage inside each sector in
# white text, hover text showing the total count, no legend.
plot_ly(
  cls,
  labels = ~Class,
  values = ~count,
  type = 'pie',
  textposition = 'inside',
  textinfo = 'label+percent',
  insidetextfont = list(color = '#FFFFFF'),
  hoverinfo = 'text',
  text = ~paste(Class, 'total count:', count),
  marker = list(
    colors = colors,
    line = list(color = '#FFFFFF', width = 1)
  ),
  # The 'pull' attribute can also be used to create space between the sectors
  showlegend = FALSE
) %>%
  layout(xaxis = hidden.axis, yaxis = hidden.axis)
Title Statistics of lncRNAs identified by lncPipe
What have we done In this section, lncRNAs (novel and known) identified by lncPipe are systematically summarized and compared across the two categories based on length, coding potential and classification level. In LncPipe, the coding potential of the final lncRNAs was re-assessed by CNCI, so the CNCI CDS data were plotted accordingly, with coding genes also considered. In addition, we summarized the length distribution of novel and known lncRNAs and classified lncRNAs based on their genomic location relative to the nearest protein-coding genes. The following figure illustrates the classification categories.
knitr::include_graphics('Nucleotide-20170720011933.svg')
What do the plots mean As mentioned above, this section contains CDF plot
, length distribution plot
, lncRNA classification plot
and result table
. Of those :
CDF plot
represents the distribution of the lncRNA coding potential scores as a 'Cumulative Distribution Function' plot, which gives the probability that the variable takes a value less than or equal to x. The horizontal axis is the allowable domain for the given probability function. Since the vertical axis is a probability, it must fall between zero and one. It increases from zero to one as we go from left to right on the horizontal axis.
length distribution plot
gives the length distribution of known and novel lncRNAs; the X-axis is the length of the lncRNA sequence and the Y-axis is the fraction of lncRNAs at a specific length. Note that not all lengths are plotted individually: lengths larger than the max.length
parameter are grouped together and shown as, for instance, "10000+".
lncRNA classification plot
shows the fraction of different kinds of novel and known lncRNAs. Classification information was produced by LncPipe and illustrated as a pie-chart.
Reference:
# Result table: drop the sixth column of lncRNA.gtf by position, keep the
# first 80 rows, and render them as an interactive DT table.
lncRNA.gtf[, -6] %>%
  head(n = 80L) %>%
  DT::datatable()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.