library(scales) library(readr) library(ggplot2) library(plotly) library(DT) library(scater) library(scran) library(scPipe) library(Rtsne)
sc_trim_barcode
r params$fq1
r params$fq2
r params$fqout
assume read1 contains the transcript
r params$bc1_info
r params$bc2_info
r params$umi_info
N
in its barcode or UMI: r params$rm_n
r params$rm_low
r if (params$rm_low){paste("\t* minimum read quality:", params$min_q, "\n", "\t* maximum number of base below minimum read quality:", params$num_bq, "\n")}
r params$fqout
r params$bam_align
r params$g_index
sc_exon_mapping
r params$bam_align
r params$bam_map
r params$anno_gff
r params$stnd
r params$fix_chr
sc_demultiplex
r params$bam_map
r params$outdir
r params$bc_anno
r params$max_mis
sc_gene_counting
r params$outdir
r params$bc_anno
r params$UMI_cor
r params$gene_fl
The organism is "r params$organism
", and gene id type is "r params$gene_id_type
".
if (is.null(params$organism) || is.na(params$organism)) { sce = create_sce_by_dir(params$outdir) } else { sce = create_sce_by_dir(params$outdir, organism=params$organism, gene_id_type=params$gene_id_type) } overall_stat = demultiplex_info(sce) datatable(overall_stat, width=800)
Plot barcode match statistics in pie chart:
plot_demultiplex(sce)
ggplotly(plot_mapping(sce, dataname=params$samplename, percentage = FALSE))
ggplotly(plot_mapping(sce, dataname=params$samplename, percentage = TRUE))
if (any(colSums(counts(sce)) == 0)) { zero_cells = sum(colSums(counts(sce)) == 0) sce = sce[, colSums(counts(sce)) > 0] } else { zero_cells = 0 }
r if (zero_cells > 0){paste(zero_cells, "cells have zero read counts, remove them.")}
Datatable of all QC metrics:
sce = calculate_QC_metrics(sce) if(!all(colSums(as.data.frame(QC_metrics(sce)))>0)){ QC_metrics(sce) = QC_metrics(sce)[, colSums(as.data.frame(QC_metrics(sce)))>0] } datatable(as.data.frame(QC_metrics(sce)), width=800, options=list(scrollX= TRUE))
Summary of all QC metrics:
datatable(do.call(cbind, lapply(QC_metrics(sce), summary)), width=800, options=list(scrollX= TRUE))
Number of reads mapped to exon before UMI deduplication VS number of genes detected:
ggplotly(ggplot(as.data.frame(QC_metrics(sce)), aes(x=mapped_to_exon, y=number_of_genes))+geom_point(alpha=0.8))
A robustified Mahalanobis Distance is calculated for each cell then outliers are detected based on the distance.
However, due to the complex nature of single cell transcriptomes and protocol used, such a method can only be used to
assist the quality control process. Visual inspection of the quality control metrics is still required. By default we
use comp = 2
and the algorithm will try to separate the quality control metrics into two gaussian clusters.
The number of outliers:
sce_qc = detect_outlier(sce, type="low", comp = 2) table(QC_metrics(sce_qc)$outliers)
Pairwise plot for QC metrics, colored by outliers:
plot_QC_pairs(sce_qc)
Remove low quality cells and plot highest expression genes.
sce_qc = remove_outliers(sce_qc) sce_qc = convert_geneid(sce_qc, returns="external_gene_name") sce_qc <- calculateQCMetrics(sce_qc) plotHighestExprs(sce_qc, n=20)
Plot the average count for each genes:
ave.counts <- rowMeans(counts(sce_qc)) hist(log10(ave.counts), breaks=100, main="", col="grey80", xlab=expression(Log[10]~"average count"))
As a loose filter we keep genes that are expressed in at least two cells and for cells that express that gene, the average count larger than two. However this is not the gold standard and the filter may variy depending on the data.
keep1 = (apply(counts(sce_qc), 1, function(x) mean(x[x>0])) > 1.1) # average count larger than 1.1 keep2 = (rowSums(counts(sce_qc)>0) > 5) # expressed in at least 5 cells sce_qc = sce_qc[(keep1 & keep2), ] dim(sce_qc)
We got r nrow(sce_qc)
genes left after removing low abundant genes.
scran
and scater
Compute the normalization size factor
ncells = ncol(sce_qc) if (ncells > 200) { sce_qc <- computeSumFactors(sce_qc) } else { sce_qc <- computeSumFactors(sce_qc, sizes=as.integer(c(ncells/7, ncells/6, ncells/5, ncells/4, ncells/3))) } summary(sizeFactors(sce_qc))
r if (min(sizeFactors(sce_qc)) <= 0){paste("We have negative size factors in the data. They indicate low quality cells and we have removed them. To avoid negative size factors, the best solution is to increase the stringency of the filtering.")}
if (min(sizeFactors(sce_qc)) <= 0) { sce_qc = sce_qc[, sizeFactors(sce_qc)>0] }
PCA plot using gene expressions as input, colored by the number of genes.
cpm(sce_qc) = calculateCPM(sce_qc, use_size_factors=FALSE) plotPCA(sce_qc, run_args = list(exprs_values="cpm"), colour_by="total_features")
The highly variable genes are chosen based on trendVar
from scran
with FDR > 0.05
and biological variation larger
than 0.5
. If the number of highly variable genes is smaller than 100 we will select the top 100 genes by
biological variation. If the number is larger than 500 we will only keep top 500 genes by biological variation.
sce_qc <- normalize(sce_qc) var.fit <- trendVar(sce_qc, method="loess", use.spikes=FALSE) var.out <- decomposeVar(sce_qc, var.fit) if (length(which(var.out$FDR <= 0.05 & var.out$bio >= 0.5)) < 500){ hvg.out <- var.out[order(var.out$bio, decreasing=TRUE)[1:500], ] }else if(length(which(var.out$FDR <= 0.05 & var.out$bio >= 0.5)) > 1000){ hvg.out <- var.out[order(var.out$bio, decreasing=TRUE)[1:1000], ] }else{ hvg.out <- var.out[which(var.out$FDR <= 0.05 & var.out$bio >= 0.5), ] hvg.out <- hvg.out[order(hvg.out$bio, decreasing=TRUE), ] } plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression", ylab="Variance of log-expression") o <- order(var.out$mean) lines(var.out$mean[o], var.out$tech[o], col="dodgerblue", lwd=2) points(var.out$mean[rownames(var.out) %in% rownames(hvg.out)], var.out$total[rownames(var.out) %in% rownames(hvg.out)], col="red", pch=16)
gene_exp = exprs(sce_qc) gene_exp = gene_exp[rownames(hvg.out)[1:100], ] hc.rows <- hclust(dist(gene_exp)) hc.cols <- hclust(dist(t(gene_exp))) gene_exp = gene_exp[hc.rows$order, hc.cols$order] m = list( l = 100, r = 40, b = 10, t = 10, pad = 0 ) plot_ly( x = colnames(gene_exp), y = rownames(gene_exp), z = gene_exp, type = "heatmap")%>% layout(autosize = F, margin = m)
plotPCA(sce_qc, run_args = list(exprs_values="logcounts"), colour_by="total_features")
set.seed(100) if (any(duplicated(t(logcounts(sce_qc)[rownames(hvg.out), ])))) { sce_qc = sce_qc[, !duplicated(t(logcounts(sce_qc)[rownames(hvg.out), ]))] } out5 <- plotTSNE(sce_qc, run_args = list(exprs_values="logcounts", perplexity=10,feature_set=rownames(hvg.out)), colour_by="total_features") + ggtitle("Perplexity = 10") out10 <- plotTSNE(sce_qc, run_args = list(exprs_values="logcounts", perplexity=20,feature_set=rownames(hvg.out)), colour_by="total_features", feature_set=rownames(hvg.out)) + ggtitle("Perplexity = 20") out20 <- plotTSNE(sce_qc, run_args = list(exprs_values="logcounts", perplexity=30,feature_set=rownames(hvg.out)), colour_by="total_features", feature_set=rownames(hvg.out)) + ggtitle("Perplexity = 30") multiplot(out5, out10, out20, cols=3)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.