#' Detect outliers based on robust linear regression of QQ plot
#' @param x a vector of mahalanobis distance
#' @param df degree of freedom for chi-square distribution
#' @param conf confidence for linear regression
#'
#' @return cell names of outliers
#' @importFrom stats ppoints qchisq coef qnorm
.qq_outliers_robust <- function(x, df, conf) {
n <- length(x)
P <- stats::ppoints(n)
z <- stats::qchisq(P, df=df)
ord.x <- x[order(x)]
coef <- stats::coef(MASS::rlm(ord.x ~ z, maxit=2000))
a <- coef[1]
b <- coef[2]
zz <- stats::qnorm(1 - (1 - conf)/2)
SE <- (b/stats::dchisq(z, df))*sqrt(P*(1 - P)/n)
fit.value <- a + b*z
upper <- fit.value + zz*SE
thr <- max(ord.x[ord.x < upper])
return(names(x[x > thr]))
}
#' Detect outliers based on QC metrics
#'
#' @description This algorithm will try to find \code{comp} number of components
#' in quality control metrics using a Gaussian mixture model. Outlier
#' detection is performed on the component with the most genes detected. The
#' rest of the components will be considered poor quality cells. More cells
#' will be classified low quality as you increase \code{comp}.
#
#' @param sce a \code{SingleCellExperiment} object containing QC metrics.
#' @param comp the number of component used in GMM. Depending on the quality of
#' the experiment.
#' @param sel_col a vector of column names which indicate the columns to use
#' for QC.
#' By default it will be the statistics generated by `calculate_QC_metrics()`
#' @param type only looking at low quality cells (`low`) or possible
#' doublets (`high`) or both (`both`)
#' @param conf confidence interval for linear regression at lower and
#' upper tails.Usually, this is smaller for lower tail because we hope
#' to pick out more low quality cells than doublets.
#' @param batch whether to perform quality control separately for each batch.
#' Default is FALSE. If set to TRUE then you should have a column called
#' `batch` in the `colData(sce)`.
#'
#' @details detect outlier using Mahalanobis distances
#'
#' @return an updated \code{SingleCellExperiment} object with an `outlier`
#' column in \code{colData}
#'
#' @importFrom stats cov pchisq mahalanobis complete.cases qchisq
#' @importFrom robustbase covMcd
#' @importFrom mclust Mclust mclustBIC
#' @importFrom glue glue glue_collapse
#'
#' @export
#' @examples
#' data("sc_sample_data")
#' data("sc_sample_qc")
#' sce = SingleCellExperiment(assays = list(counts = as.matrix(sc_sample_data)))
#' organism(sce) = "mmusculus_gene_ensembl"
#' gene_id_type(sce) = "ensembl_gene_id"
#' QC_metrics(sce) = sc_sample_qc
#' demultiplex_info(sce) = cell_barcode_matching
#' UMI_dup_info(sce) = UMI_duplication
#' # the sample qc data already run through function `calculate_QC_metrics`
#' # for a new sce please run `calculate_QC_metrics` before `detect_outlier`
#' sce = detect_outlier(sce)
#' table(QC_metrics(sce)$outliers)
#'
detect_outlier <- function(sce,
comp=1,
sel_col=NULL,
type=c("low", "both", "high"),
conf=c(0.9, 0.99),
batch=FALSE) {
type <- match.arg(type)
sce <- validObject(sce)
# check format:
if (is.null(sel_col)) {
sel_col <- c("number_of_genes", "total_count_per_cell",
"non_mt_percent", "non_ERCC_percent")
}
if (!all(sel_col %in% colnames(QC_metrics(sce)))) {
missing <- sel_col[!(sel_col %in% colnames(QC_metrics(sce)))]
missing_vars <- glue_collapse(glue("'{missing}'"), sep = ",", last = " and ")
message(glue("the following QC metrics not found in colData from sce: {missing_vars}"))
if (any(c("number_of_genes", "total_count_per_cell") %in% missing_vars)) {
stop("the quality control metrics should at least contain the number of
genes(`number_of_genes`) and counts per cell(`total_count_per_cell`)!")
}
}
qc_cols <- colnames(QC_metrics(sce)) %in% sel_col
x_all <- as.data.frame(QC_metrics(sce)[, qc_cols])
x_all$total_count_per_cell <- log2(x_all$total_count_per_cell + 1)
if (!all(stats::complete.cases(x_all))) {
stop("NAs found in the selected columns, check the quality control matrix")
}
if (is.null(dim(x_all))) {
QC_metrics(sce)$outliers <- FALSE
return(sce)
}
if (!batch) {
batch_info <- rep(1, nrow(x_all))
} else {
if ("batch" %in% colnames(colData(sce))) {
batch_info <- colData(sce)$batch
} else {
stop("did not find column `batch` in colData(sce).")
}
}
if(any(rowSums(x_all) == 0)){
stop("Some cells have zero counts. remove these cells before detecting outlier.")
}else if(any(colSums(x_all) == 0)){
stop("Some QC metrics are all zeros, check your quality control matrix by QC_metrics(sce).")
}
outliers <- rep(FALSE, nrow(x_all))
for (a_batch in unique(batch_info)) {
x <- x_all[batch_info == a_batch, ]
dist <- mahalanobis(x, center=colMeans(x), cov=cov(x))
keep <- !(dist > qchisq(0.99, ncol(x)))
mod <- Mclust(x[keep, ],
G=comp,
modelNames=c("EEE","EVV", "VVV"),
verbose=FALSE)
if (comp == 1) {
covr <- covMcd(x, alpha=0.7)
dist <- mahalanobis(x,
center=covr$center,
cov=covr$cov)
mean_diff <- sign(t(x) - covr$center)
QC_sign <- c(-1, 1)[as.factor(apply(mean_diff, 2, function(t) {sum(t) > 0}))]
neg_dist <- dist[QC_sign == -1]
pos_dist <- dist[QC_sign == 1]
if (type == "both") {
outlier_cells <- .qq_outliers_robust(neg_dist, ncol(x), conf[1])
outlier_cells <- c(outlier_cells,
.qq_outliers_robust(pos_dist, ncol(x), conf[2]))
} else if (type == "low") {
outlier_cells <- .qq_outliers_robust(neg_dist, ncol(x), conf[1])
} else if (type == "high") {
outlier_cells <- .qq_outliers_robust(pos_dist, ncol(x), conf[2])
}
} else {
ord_fst <- c(seq_len(comp))[order(mod$parameters$mean[1, ], decreasing = TRUE)]
poor_comp <- ord_fst[2:comp]
good_comp <- ord_fst[1]
keep1 <- rep(TRUE, nrow(x))
keep1[keep][mod$classification %in% poor_comp] <- FALSE
keep1[!keep] <- FALSE
sub_x <- x[keep1, ]
covr <- covMcd(sub_x, alpha=0.7)
sub_dist <- mahalanobis(sub_x,
center=covr$center,
cov=covr$cov)
mean_diff <- sign(t(sub_x) - covr$center)
QC_sign <- c(-1, 1)[as.factor(apply(mean_diff, 2, function(t) {sum(t) > 0}))]
neg_dist <- sub_dist[QC_sign == -1]
pos_dist <- sub_dist[QC_sign == 1]
outlier_cells <- .qq_outliers_robust(neg_dist, ncol(sub_x), conf[1])
outlier_cells <- c(outlier_cells,
.qq_outliers_robust(pos_dist, ncol(sub_x), conf[2]))
outlier_cells <- c(outlier_cells, rownames(x[!keep1, ]))
if (!(type == "both")) {
mean_diff <- sign(t(x)-mod$parameters$mean[, good_comp])
QC_sign <- c(-1, 1)[as.factor(apply(mean_diff, 2, function(t) {sum(t) > 0}))]
is_outlier_cell <- rownames(x) %in% outlier_cells
if (type == "low") {
outlier_cells <- rownames(x)[is_outlier_cell & (QC_sign == -1)]
} else if (type == "high") {
outlier_cells <- rownames(x)[is_outlier_cell & (QC_sign == 1)]
}
}
}
if (any(rownames(x) %in% outlier_cells)) {
outliers[batch_info == a_batch][rownames(x) %in% outlier_cells] <- TRUE
}
}
QC_metrics(sce)$outliers <- as.factor(outliers)
return(sce)
}
#' Calculate QC metrics from gene count matrix
#'
#' @param sce a \code{SingleCellExperiment} object containing gene counts
#' @details get QC metrics using gene count matrix.
#' The QC statistics added are \itemize{
#' \item{number_of_genes} number of genes detected.
#' \item{total_count_per_cell} sum of read number after UMI deduplication.
#' \item{non_mt_percent} 1 - percentage of mitochondrial gene counts.
#' Mitochondrial genes are retrived by GO term GO:0005739
#' \item{non_ERCC_percent} ratio of exon counts to ERCC counts
#' \item{non_ribo_percent} 1 - percentage of ribosomal gene counts
#' ribosomal genes are retrived by GO term GO:0005840.
#' }
#' @return an \code{SingleCellExperiment} with updated QC metrics
#'
#' @importFrom SummarizedExperiment assay assay<- assays assays<- assayNames rowData rowData<- colData colData<-
#'
#' @export
#' @examples
#' data("sc_sample_data")
#' data("sc_sample_qc")
#' sce <- SingleCellExperiment(assays = list(counts = as.matrix(sc_sample_data)))
#' organism(sce) <- "mmusculus_gene_ensembl"
#' gene_id_type(sce) <- "ensembl_gene_id"
#' QC_metrics(sce) <- sc_sample_qc
#' demultiplex_info(sce) <- cell_barcode_matching
#' UMI_dup_info(sce) <- UMI_duplication
#'
#' # The sample qc data already run through function `calculate_QC_metrics`.
#' # So we delete these columns and run `calculate_QC_metrics` to get them again:
#' colnames(colnames(QC_metrics(sce)))
#' QC_metrics(sce) <- QC_metrics(sce)[,c("unaligned","aligned_unmapped","mapped_to_exon")]
#' sce = calculate_QC_metrics(sce)
#' colnames(QC_metrics(sce))
#'
calculate_QC_metrics <- function(sce) {
sce <- validObject(sce) # check the sce object
if (!("counts" %in% names(assays(sce)))){
stop("counts not in names(assays(sce)). Cannot find count data.")
}
# get gene number
gene_number <- colSums(assay(sce,"counts")>0)
if (all(gene_number == 0)) {
stop("all genes have zero count. Check the expression matrix.")
}else{
if (!("scPipe" %in% names(sce@metadata))){
sce@metadata[["scPipe"]] <- list(QC_cols=c("number_of_genes"))
} else if (!("QC_cols" %in% names(sce@metadata$scPipe))){
sce@metadata[["scPipe"]] <- list(QC_cols=c("number_of_genes"))
QC_metrics(sce) <- DataFrame(row.names = colnames(sce))
# create a empty QC metrics if not exists
}
QC_metrics(sce)$number_of_genes <- gene_number
}
# get total count per cell
QC_metrics(sce)$total_count_per_cell <- colSums(assay(sce,"counts"))
# get ERCC ratio
if(any(grepl("^ERCC-", rownames(sce)))){
exon_count <- colSums(assay(sce,"counts")[!grepl("^ERCC-", rownames(sce)), , drop = FALSE])
ERCC_count <- colSums(assay(sce,"counts")[grepl("^ERCC-", rownames(sce)), , drop = FALSE])
QC_metrics(sce)$non_ERCC_percent <- exon_count/(ERCC_count+exon_count+1e-5)
}else{
message("no ERCC spike-in. Skip `non_ERCC_percent`")
}
# get mt percentage
if (!is.na(gene_id_type(sce))) {
mt_genes <- get_genes_by_GO(returns=gene_id_type(sce),
dataset=organism(sce),
go=c("GO:0005739"))
if (length(mt_genes)>0) {
if (sum(rownames(sce) %in% mt_genes)>1) {
mt_count <- colSums(assay(sce,"counts")[rownames(sce) %in% mt_genes, , drop = FALSE])
QC_metrics(sce)$non_mt_percent <- (colSums(assay(sce,"counts"))-
mt_count)/(colSums(assay(sce,"counts"))+1e-5)
# add 1e-5 to make sure they are not NA
}
}
}
else {
message("no gene_id_type, skip `non_mt_percent`")
}
# get ribosomal percentage
if (!is.na(gene_id_type(sce))) {
ribo_genes <- get_genes_by_GO(returns=gene_id_type(sce),
dataset=organism(sce),
go=c("GO:0005840"))
if (length(ribo_genes)>0) {
if (sum(rownames(sce) %in% ribo_genes)>1) {
ribo_count <- colSums(assay(sce,"counts")[rownames(sce) %in% ribo_genes, ])
QC_metrics(sce)$non_ribo_percent <- (colSums(assay(sce,"counts"))-
ribo_count)/(colSums(assay(sce,"counts"))+0.01)
}
}
}
else {
message("no gene_id_type, skip `non_ribo_percent`")
}
return(sce)
}
#' Plot GGAlly pairs plot of QC statistics from \code{SingleCellExperiment}
#' object
#'
#' @param sce a \code{SingleCellExperiment} object
#' @param sel_col a vector of column names which indicate the columns to use for
#' plot. By default it will be the statistics generated by
#' `calculate_QC_metrics()`
#' @export
#' @return a ggplot2 object
#' @examples
#' data("sc_sample_data")
#' data("sc_sample_qc")
#' sce = SingleCellExperiment(assays = list(counts = as.matrix(sc_sample_data)))
#' organism(sce) = "mmusculus_gene_ensembl"
#' gene_id_type(sce) = "ensembl_gene_id"
#' QC_metrics(sce) = sc_sample_qc
#' demultiplex_info(sce) = cell_barcode_matching
#' UMI_dup_info(sce) = UMI_duplication
#' sce = detect_outlier(sce)
#'
#' plot_QC_pairs(sce)
#'
plot_QC_pairs <- function(sce, sel_col=NULL) {
sce <- validObject(sce) # check the sce object
if (is.null(sel_col)) {
sel_col <- c("number_of_genes", "total_count_per_cell", "non_mt_percent",
"non_ERCC_percent", "non_ribo_percent", "outliers")
}
if (!any(sel_col %in% colnames(QC_metrics(sce)))){
stop("`sel_col` not in colnames(QC_metrics(sce))).")
}
x <- as.data.frame(QC_metrics(sce)[, colnames(QC_metrics(sce)) %in% sel_col])
if ("outliers" %in% colnames(x)) {
plot_output <- GGally::ggpairs(
x,
mapping = ggplot2::aes_string(colour = "outliers"),
upper = list(continuous = GGally::wrap("cor", size=3, hjust=0.8))
)
}
else {
plot_output <- GGally::ggpairs(x)
}
return(plot_output)
}
#' Plot mapping statistics for \code{SingleCellExperiment} object.
#'
#' @param sce a \code{SingleCellExperiment} object
#' @param sel_col a vector of column names, indicating the columns to use for
#' plot. by default it will be the mapping result.
#' @param percentage TRUE to convert the number of reads to percentage
#' @param dataname the name of this dataset, used as plot title
#'
#' @importFrom reshape melt
#' @importFrom stats prcomp reorder
#' @return a ggplot2 object
#' @export
#'
#' @examples
#' data("sc_sample_data")
#' data("sc_sample_qc")
#' sce = SingleCellExperiment(assays = list(counts = as.matrix(sc_sample_data)))
#' organism(sce) = "mmusculus_gene_ensembl"
#' gene_id_type(sce) = "ensembl_gene_id"
#' QC_metrics(sce) = sc_sample_qc
#' demultiplex_info(sce) = cell_barcode_matching
#' UMI_dup_info(sce) = UMI_duplication
#'
#' plot_mapping(sce,percentage=TRUE,dataname="sc_sample")
#'
plot_mapping <- function(sce,
sel_col=NULL,
percentage=FALSE,
dataname="") {
sce <- validObject(sce) # check the sce object
if (is.null(sel_col)) {
sel_col <- c("unaligned", "aligned_unmapped", "ambiguous_mapping",
"mapped_to_ERCC", "mapped_to_intron", "mapped_to_exon")
}
x <- as.data.frame(QC_metrics(sce)[, sel_col])
mapping_stat <- x
mapping_stat$sample_name <- stats::reorder(rownames(mapping_stat), mapping_stat$mapped_to_exon)
mapping_stat_prop <- as.data.frame(prop.table(as.matrix(mapping_stat[, sapply(mapping_stat, is.numeric)]), 1))
mapping_stat_prop$sample_name <- mapping_stat$sample_name
dat.m <- melt(mapping_stat, id.vars="sample_name")
dat.m1 <- melt(mapping_stat_prop, id.vars="sample_name")
if (!percentage) {
p <- ggplot2::ggplot(dat.m, ggplot2::aes_string(x="sample_name", y="value", fill="variable")) + ggplot2::scale_fill_brewer(palette="Set1")+
ggplot2::geom_bar(stat="identity", width=1)+
ggplot2::ylab("number of reads")+
ggplot2::xlab("cell sorted by number of reads mapped to exon")+
ggplot2::theme(axis.title.x=ggplot2::element_blank(), axis.text.x=ggplot2::element_blank())
if (dataname != "") {
p <- p + ggplot2::ggtitle(paste0("Overall mapping statistics of ", dataname, " (number of reads)"))
} else {
p <- p + ggplot2::ggtitle(paste0("Overall mapping statistics (number of reads)"))
}
}
else {
p <- ggplot2::ggplot(dat.m1, ggplot2::aes_string(x="sample_name", y="value", fill="variable")) + ggplot2::scale_fill_brewer(palette="Set1")+
ggplot2::geom_bar(stat="identity", width=1)+
ggplot2::ylab("percentage of reads")+
ggplot2::xlab("cell sorted by number of reads mapped_to_exon")+
ggplot2::scale_y_continuous(labels=scales::percent_format())+
ggplot2::theme(axis.title.x=ggplot2::element_blank(), axis.text.x=ggplot2::element_blank())
if (dataname != "") {
p <- p + ggplot2::ggtitle(paste0("Overall mapping statistics of ", dataname, " (percentage)"))
} else {
p <- p + ggplot2::ggtitle(paste0("Overall mapping statistics (percentage)"))
}
}
return(p)
}
#' plot_demultiplex
#'
#'@description Plot cell barcode demultiplexing result for the
#' \code{SingleCellExperiment}. The barcode demultiplexing result is shown
#' using a barplot, with the bars indicating proportions of total reads.
#' Barcode matches and mismatches are summarised along with whether or not the
#' read mapped to the genome. High proportion of genome aligned reads with no
#' barcode match may indicate barcode integration failure.
#'
#' @param sce a \code{SingleCellExperiment} object
#'
#' @return a ggplot2 bar chart
#'
#' @importFrom rlang .data
#' @export
#'
#' @examples
#' data("sc_sample_data")
#' data("sc_sample_qc")
#' sce = SingleCellExperiment(assays = list(counts = as.matrix(sc_sample_data)))
#' organism(sce) = "mmusculus_gene_ensembl"
#' gene_id_type(sce) = "ensembl_gene_id"
#' QC_metrics(sce) = sc_sample_qc
#' demultiplex_info(sce) = cell_barcode_matching
#' UMI_dup_info(sce) = UMI_duplication
#'
#' plot_demultiplex(sce)
#'
plot_demultiplex <- function(sce){
sce <- validObject(sce) # check the sce object
demultiplex_stat <- demultiplex_info(sce)
demultiplex_stat[,"count"] <- demultiplex_stat[,"count"]/sum(demultiplex_stat[,"count"])
if (is.null(demultiplex_stat)){
stop("`demultiplex_stat` does not exists in sce. demultiplex_info(sce) == NULL)")
}
demultiplex_stat$label_y <- demultiplex_stat[,"count"]+0.05
demultiplex_stat$label_tx <- scales::percent(demultiplex_stat[,"count"])
#kp = demultiplex_stat[,"count"]/sum(demultiplex_stat[,"count"])>0.2
#label_tx[!kp] = ""
blank_theme <- ggplot2::theme_minimal()+
ggplot2::theme(
axis.text.x = ggplot2::element_text(angle = 30, hjust=1,size=12),
panel.border = ggplot2::element_blank(),
plot.title = ggplot2::element_text(size=14, face="bold"),
legend.position = "none"
)
p <- ggplot2::ggplot(data=demultiplex_stat, ggplot2::aes(x=.data$status, y=.data$count,
fill=.data$status))+
ggplot2::geom_bar(width = 1, stat = "identity")+
#coord_polar("y", start=0)+
ggplot2::scale_fill_brewer(palette="Dark2")+
blank_theme+
ggplot2::geom_text(ggplot2::aes(y = .data$label_y, label = .data$label_tx))+
ggplot2::labs(title="Cell barcode demultiplexing statistics", y="percentage",x="")
return(p)
}
#' Plot UMI duplication frequency
#'
#' @description Plot the UMI duplication frequency.
#'
#' @param sce a \code{SingleCellExperiment} object
#' @param log10_x whether to use log10 scale for x axis
#'
#' @return a line chart of the UMI duplication frequency
#' @export
#'
#' @examples
#' data("sc_sample_data")
#' data("sc_sample_qc")
#' sce = SingleCellExperiment(assays = list(counts = as.matrix(sc_sample_data)))
#' organism(sce) = "mmusculus_gene_ensembl"
#' gene_id_type(sce) = "ensembl_gene_id"
#' QC_metrics(sce) = sc_sample_qc
#' demultiplex_info(sce) = cell_barcode_matching
#' UMI_dup_info(sce) = UMI_duplication
#'
#' plot_UMI_dup(sce)
#'
plot_UMI_dup <- function(sce, log10_x = TRUE){
sce <- validObject(sce) # check the sce object
tmp <- UMI_dup_info(sce)
tmp <- tmp[-nrow(tmp),] # remove 1000
the_sum <- cumsum(tmp)
for (i in seq_len(nrow(tmp)-1)){ # cut the zero counts
if (the_sum$count[nrow(tmp)] - the_sum$count[i]<10){
tmp <- tmp[seq_len(i),]
break
}
}
dup_col <- ""
if ("duplication_number" %in% colnames(tmp)){
dup_col <- "duplication_number"
}else if ("duplication.number" %in% colnames(tmp)){
dup_col <- "duplication.number"
}else{
message("cannot find column containing UMI duplication number.")
return(NULL)
}
p <- ggplot2::ggplot(data=tmp, ggplot2::aes(x=tmp[, dup_col], y=tmp[,"count"])) +
ggplot2::geom_smooth(method = "loess",se = FALSE)+ggplot2::theme_bw()
if (log10_x){
p <- p + ggplot2::scale_x_log10()+
ggplot2::labs(title="Distribution of UMI duplication numbers(log10).",
x="log10(UMI duplication number)",
y="frequency")
}else{
p <- p + ggplot2::labs(title="Distribution of UMI duplication numbers.",
x="UMI duplication number",
y="frequency")
}
return(p)
}
#' Remove outliers in \code{SingleCellExperiment}
#'
#' @description Removes outliers flagged by \code{detect_outliers()}
#'
#' @param sce a \code{SingleCellExperiment} object
#' @export
#' @return a \code{SingleCellExperiment} object without outliers
#' @examples
#' data("sc_sample_data")
#' data("sc_sample_qc")
#' sce = SingleCellExperiment(assays = list(counts = as.matrix(sc_sample_data)))
#' organism(sce) = "mmusculus_gene_ensembl"
#' gene_id_type(sce) = "ensembl_gene_id"
#' QC_metrics(sce) = sc_sample_qc
#' demultiplex_info(sce) = cell_barcode_matching
#' UMI_dup_info(sce) = UMI_duplication
#' sce = detect_outlier(sce)
#' dim(sce)
#' sce = remove_outliers(sce)
#' dim(sce)
#'
remove_outliers <- function(sce) {
sce <- validObject(sce) # check the sce object
if (!("outliers" %in% colnames(QC_metrics(sce)))) {
stop("No outlier information. Please run `detect_outlier()` to get the outliers.")
}
out_cell <- QC_metrics(sce)$outliers == FALSE
return(sce[, out_cell])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.