# Read the lncRNA annotation table and assign working names to its seven
# columns. The sixth column's content is unidentified: its placeholder name is
# Chinese for "how would I know what this is?" and is kept verbatim because it
# is a runtime column name.
lncRNA <- fread(type.list$lncRNA)
setnames(
  lncRNA,
  c('Gene', 'Transcript', 'Type', 'Potential', 'Length', '我怎么知道这是什么东西?', 'Class')
)

# Make the type labels presentable: swap the first underscore for a space,
# then pass the result through toUpperFirstLetter().
type.labels <- sub("_", " ", lncRNA[["Type"]])
lncRNA[, Type := toUpperFirstLetter(type.labels)]


# The CDF is later drawn with geom_line rather than geom_step, because
# geom_step behaves badly when converted by plotly
# (see https://github.com/ropensci/plotly/issues/1030).
# Pre-sorting is currently disabled:
# cdf <- cdf[order(cdf[, 1], cdf[, 2])]

# Draw a random sample of rows (without replacement) whose size is
# params$cdf.percent percent of the table; `.N` is the row count of lncRNA.
lncRNA.subset <- lncRNA[sample(.N, .N * params$cdf.percent / 100)]

Column {.tabset}

CDF against CPAT

# Empirical CDF of the coding potential score, one curve per lncRNA type,
# computed on the random subset `lncRNA.subset`.
p <- ggplot() +
  geom_line(data = lncRNA.subset, aes(x = Potential, colour = Type), size = 2, stat = 'ecdf') +
  scale_x_continuous(expand  = c(.01, 0)) + scale_y_continuous(expand = c(0, 0)) +
  labs(x = 'Coding Probablity(CPAT)', title = "Coding Potential", y = "CDF") +
  # Dashed reference line at y = 1, the upper bound of a CDF.
  geom_hline(yintercept = 1, colour = "grey", linetype = "dashed", size = 1)

# The plot expression has to be complete before it is saved: a trailing `+`
# after the last layer would make save_plot() part of the ggplot chain, so it
# would run before `p` is assigned and its return value would be added to the
# plot.
save_plot('CDF.tiff', p, base_height = 8.5, base_width = 11, dpi = 300, compression = 'lzw')
save_plot('CDF.pdf', p, base_height = 8.5, base_width = 11, dpi = 300)
# Transcript length is taken as the sum of exon lengths within each
# (Gene, Type) group. NOTE: `:=` updates `lncRNA` itself by reference, so its
# Length column holds the group sums after this statement.
lncRNA[, Length := sum(Length), by = .(Gene, Type)]

# Keep one row per gene (the first occurrence) for the length plots and table.
lncRNA.gtf <- unique(lncRNA, by = "Gene")

lncRNA length distribution with type

# Cap lengths at params$max.lncrna.len (in place) so that every longer
# transcript falls on the last axis tick, which is labelled "<max>+".
lncRNA.gtf[Length > params$max.lncrna.len, Length := params$max.lncrna.len]

# Compute the tick positions once and derive the labels from those same
# positions. Building the labels from a separate 9-point sequence would give
# values that do not correspond to the first nine of the ten breaks.
length.breaks <- seq.int(200, params$max.lncrna.len, length.out = 10)
length.labels <- c(round(head(length.breaks, -1)),
                   paste0(params$max.lncrna.len, '+'))

# Density of transcript length, one curve per lncRNA type.
p <- ggplot() + geom_density(data = lncRNA.gtf, aes(x = Length, colour = Type), size = 1.5) +
  xlab('Transcript length') + ylab('Density') +
  scale_x_continuous(breaks = length.breaks,
                     labels = length.labels,
                     expand = c(0.01, 0)) +
  scale_y_continuous(expand = c(0.01, 0))

# The plot expression ends above; a trailing `+` here would chain save_plot()
# into the ggplot object and evaluate it before `p` is assigned.
save_plot('lncRNA_length_distribution_with_type.tiff', p, base_height = 8.5, base_width = 11, dpi = 300, compression = 'lzw')
save_plot('lncRNA_length_distribution_with_type.pdf', p, base_height = 8.5, base_width = 11, dpi = 300)

# Interactive version, with extra right margin.
ggplotly(p) %>% layout(margin = list(r = 50))

Total lncRNA length distribution

# Tick positions and their labels are derived from one sequence so that each
# label sits on the break it describes; the last tick (the maximum length) is
# labelled "<max>+". Building the labels from a separate 9-point sequence
# would not match the first nine of the ten breaks.
length.breaks <- seq.int(200, params$max.lncrna.len, length.out = 10)
length.labels <- c(round(head(length.breaks, -1)),
                   paste0(params$max.lncrna.len, '+'))

# Histogram of lncRNA length over all types, in bins of width 100.
p <- ggplot() + geom_histogram(data = lncRNA.gtf, aes(x = Length), binwidth = 100) +
  scale_x_continuous(breaks = length.breaks,
                     labels = length.labels,
                     expand = c(0.01, 0)) +
  scale_y_continuous(expand = c(0, 10)) + labs(x = 'lncRNA length', y = 'Count')
save_plot('lncRNA_length_distribution.tiff', p, base_height = 8.5, base_width = 11, dpi = 300, compression = 'lzw')
save_plot('lncRNA_length_distribution.pdf', p, base_height = 8.5, base_width = 11, dpi = 300)

# Interactive version, with extra right margin.
ggplotly(p) %>% layout(margin = list(r = 50))

lncRNA classification

# Number of rows of lncRNA in each classification category.
cls <- lncRNA[, list(count = .N), by = "Class"]

# Fill colours for the pie sectors.
colors <- c(
  'rgb(211,94,96)',
  'rgb(128,133,133)',
  'rgb(144,103,167)',
  'rgb(171,104,87)',
  'rgb(114,147,203)'
)

# Pie chart of the class counts: label and percentage are printed inside each
# sector in white, hovering shows the class with its total count, and the
# axes, grid and legend are hidden.
cls %>%
  plot_ly(
    labels = ~Class,
    values = ~count,
    type = 'pie',
    textposition = 'inside',
    textinfo = 'label+percent',
    insidetextfont = list(color = '#FFFFFF'),
    hoverinfo = 'text',
    text = ~paste(Class, 'total count:', count),
    # The 'pull' attribute can also be used to create space between the sectors
    marker = list(
      colors = colors,
      line = list(color = '#FFFFFF', width = 1)
    ),
    showlegend = FALSE
  ) %>%
  layout(
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
  )



Title Statistics of lncRNAs identified by lncPipe

What have we done In this section, lncRNAs (novel and known) identified by lncPipe are systematically summarized and compared across the two categories based on length, coding potential and classification level. In LncPipe, the coding potential of the final lncRNAs was re-assessed by CNCI, so CNCI CDS data were plotted accordingly, with coding genes also considered. In addition, we summarized the length distribution of novel and known lncRNAs and classified lncRNAs based on their genomic location relative to the nearest protein-coding genes. The following figure illustrates the classification categories.


What do the plots mean As mentioned above, this section contains a CDF plot, a length distribution plot, an lncRNA classification plot and a result table. Of those:

CDF plot shows the distribution of lncRNA coding potential scores as a 'Cumulative Distribution Function' (CDF), i.e. the probability that the variable takes a value less than or equal to x. The horizontal axis is the allowable domain for the given probability function. Since the vertical axis is a probability, it must fall between zero and one. It increases from zero to one as we go from left to right on the horizontal axis.

length distribution plot gives the length distribution of known and novel lncRNAs; the X-axis is the length of the lncRNA sequence and the Y-axis is the fraction of lncRNAs at a specific length. Note that not all lengths are plotted individually: lengths larger than the max.length parameter are grouped together and shown as "10000+", for instance.

lncRNA classification plot shows the fraction of different kinds of novel and known lncRNAs. Classification information was produced by LncPipe and illustrated as a pie-chart.


# Interactive table of the first 80 rows of the per-gene table, with the
# sixth column dropped by position.
lncRNA.gtf[, -6] %>%
  head(n = 80L) %>%
  DT::datatable()

