#' Count the number of UMIs for each gene and output count matrix
#'
#' Count unique \emph{UMI:gene} pairs for single cell RNA-sequencing alignment
#' files. Write resulting count matrix to output directory. Columns are
#' samples (cells) and rows are gene IDs. The input sequence alignment files
#' must be generated using FASTQ files generated by the \code{demultiplex}
#' function in scruff package. Return a SingleCellExperiment object containing
#' the count matrix, cell and gene annotations, and all QC metrics.
#'
#' @param sce A \code{SingleCellExperiment} object of which the \code{colData}
#' slot contains the \strong{alignment_path} column with paths to input
#' cell-specific sequence alignment files (BAM or SAM format).
#' @param reference Path to the reference GTF file. The TxDb object of the GTF
#' file will be generated and saved in the current working directory with
#' ".sqlite" suffix.
#' @param umiEdit Maximally allowed Hamming distance for UMI correction. For
#' read alignments in each gene, by comparing to a more abundant UMI with more
#' reads, UMIs having fewer reads and with mismatches equal or fewer than
#' \code{umiEdit} will be assigned a corrected UMI (the UMI with more reads).
#' Default is 0, meaning no UMI correction is performed. Doing UMI correction
#' will decrease the number of transcripts per gene.
#' @param format Format of input sequence alignment files. \strong{"BAM"} or
#' \strong{"SAM"}. Default is \strong{"BAM"}.
#' @param outDir Output directory for UMI counting results. UMI corrected count
#' matrix will be stored in this directory. Default is \code{"./Count"}.
#' @param cellPerWell Number of cells per well. Can be an integer (e.g. 1)
#' indicating the number of cells in each well or an vector with length equal
#' to the total number of cells in the input alignment files specifying the
#' number of cells in each file. Default is 1.
#' @param cores Number of cores used for parallelization. Default is
#' \code{max(1, parallel::detectCores() - 2)}, i.e. the number of available
#' cores minus 2.
#' @param outputPrefix Prefix for expression table filename. Default is
#' \code{"countUMI"}.
#' @param verbose Print log messages. Useful for debugging. Default to
#' \strong{FALSE}.
#' @param logfilePrefix Prefix for log file. Default is current date and time
#' in the format of \code{format(Sys.time(), "\%Y\%m\%d_\%H\%M\%S")}.
#' @return A \strong{SingleCellExperiment} object.
#' @examples
#' \dontrun{
#' data(barcodeExample, package = "scruff")
#' # The SingleCellExperiment object returned by alignRsubread function and the
#' # alignment BAM files are required for running countUMI function
#' # First demultiplex example FASTQ files
#' fastqs <- list.files(system.file("extdata", package = "scruff"),
#' pattern = "\\.fastq\\.gz", full.names = TRUE)
#'
#' de <- demultiplex(
#' project = "example",
#' experiment = c("1h1", "b1"),
#' lane = c("L001", "L001"),
#' read1Path = c(fastqs[1], fastqs[3]),
#' read2Path = c(fastqs[2], fastqs[4]),
#' barcodeExample,
#' bcStart = 1,
#' bcStop = 8,
#' umiStart = 9,
#' umiStop = 12,
#' keep = 75,
#' overwrite = TRUE)
#'
#' # Alignment
#' library(Rsubread)
#' # Create index files for GRCm38_MT.
#' fasta <- system.file("extdata", "GRCm38_MT.fa", package = "scruff")
#' # Specify the basename for Rsubread index
#' indexBase <- "GRCm38_MT"
#' buildindex(basename = indexBase, reference = fasta, indexSplit = FALSE)
#'
#' al <- alignRsubread(de, indexBase, overwrite = TRUE)
#'
#' # Counting
#' gtf <- system.file("extdata", "GRCm38_MT.gtf", package = "scruff")
#' sce = countUMI(al, gtf, cellPerWell=c(rep(1, 94), 0, 0, rep(1, 94), 300, 1))
#' }
#'
#' # or use the built-in SingleCellExperiment object generated using
#' # example dataset (see ?sceExample)
#' data(sceExample, package = "scruff")
#' @import data.table refGenome GenomicFeatures
#' @rawNamespace import(GenomicAlignments, except = c(second, last, first))
#' @export
countUMI <- function(sce,
reference,
umiEdit = 0,
format = "BAM",
outDir = "./Count",
cellPerWell = 1,
cores = max(1, parallel::detectCores() - 2),
outputPrefix = "countUMI",
verbose = FALSE,
logfilePrefix = format(Sys.time(), "%Y%m%d_%H%M%S")) {
.checkCores(cores)
isWindows <- .Platform$OS.type == "windows"
message(Sys.time(), " Start UMI counting ...")
print(match.call(expand.dots = TRUE))
if (ncol(sce) != length(cellPerWell) & length(cellPerWell) != 1) {
stop("The length of cellPerWell is not equal to the column number",
" of the SCE object.")
}
alignmentFilePaths <- SummarizedExperiment::colData(sce)$alignment_path
if (!all(file.exists(alignmentFilePaths))) {
stop("Partial or all alignment files nonexistent. ",
"Please check paths are correct.\n",
alignmentFilePaths)
}
logfile <- paste0(logfilePrefix, "_countUMI_log.txt")
if (verbose) {
.logMessages(Sys.time(),
"... Start UMI counting",
logfile = logfile,
append = FALSE)
.logMessages(Sys.time(),
alignmentFilePaths,
logfile = logfile,
append = TRUE)
message("... Input alignment files:")
print(alignmentFilePaths)
} else {
.logMessages(Sys.time(),
"... Start UMI counting",
logfile = NULL,
append = FALSE)
}
message(Sys.time(),
" ... Creating output directory",
outDir)
dir.create(file.path(outDir),
showWarnings = FALSE,
recursive = TRUE)
message(Sys.time(),
" ... Loading TxDb file")
features <- suppressPackageStartupMessages(.gtfReadDb(reference))
tryCatch(
GenomeInfoDb::seqlevelsStyle(features) <- "NCBI",
error = function(e) {
warning("found no sequence renaming map compatible with seqname",
" style 'NCBI' for the input reference ", basename(reference))
}
)
# parallelization BiocParallel
if (format == "SAM") {
if (isWindows) {
alignmentFilePaths <- BiocParallel::bplapply(
X = alignmentFilePaths,
FUN = .toBam,
BPPARAM = BiocParallel::SnowParam(
workers = cores),
logfile, overwrite = FALSE, index = FALSE)
} else {
alignmentFilePaths <- BiocParallel::bplapply(
X = alignmentFilePaths,
FUN = .toBam,
BPPARAM = BiocParallel::MulticoreParam(
workers = cores),
logfile, overwrite = FALSE, index = FALSE)
}
alignmentFilePaths <- unlist(alignmentFilePaths)
}
if (isWindows) {
exprL <- suppressPackageStartupMessages(
BiocParallel::bplapply(
X = alignmentFilePaths,
FUN = .countUmiUnit,
BPPARAM = BiocParallel::SnowParam(
workers = cores),
features,
umiEdit,
logfile,
verbose)
)
} else {
exprL <- suppressPackageStartupMessages(
BiocParallel::bplapply(
X = alignmentFilePaths,
FUN = .countUmiUnit,
BPPARAM = BiocParallel::MulticoreParam(
workers = cores),
features,
umiEdit,
logfile,
verbose)
)
}
expr <- do.call(cbind, exprL)
expr <- data.table::as.data.table(expr, keep.rownames = TRUE)
colnames(expr)[1] <- "geneid"
message(Sys.time(),
" ... Write UMI filtered count matrix to ",
file.path(outDir, paste0(
format(Sys.time(),
"%Y%m%d_%H%M%S"), "_",
outputPrefix, ".tsv"
))
)
data.table::fwrite(expr, file.path(outDir, paste0(
format(Sys.time(), "%Y%m%d_%H%M%S"), "_",
outputPrefix, ".tsv"
)), sep = "\t")
message(Sys.time(),
" ... Add count matrix and QC metrics to SCE object.")
scruffsce <- SingleCellExperiment::SingleCellExperiment(
assays = list(counts = as.matrix(
expr[!geneid %in% c("reads_mapped_to_genome",
"reads_mapped_to_genes",
"median_reads_per_umi",
"avg_reads_per_umi",
"median_reads_per_corrected_umi",
"avg_reads_per_corrected_umi"), -"geneid"],
rownames = expr[!geneid %in% c("reads_mapped_to_genome",
"reads_mapped_to_genes",
"median_reads_per_umi",
"avg_reads_per_umi",
"median_reads_per_corrected_umi",
"avg_reads_per_corrected_umi"),
geneid])))
readmapping <- t(expr[geneid %in% c("reads_mapped_to_genome",
"reads_mapped_to_genes",
"median_reads_per_umi",
"avg_reads_per_umi",
"median_reads_per_corrected_umi",
"avg_reads_per_corrected_umi"), -"geneid"])
colnames(readmapping) <- c("reads_mapped_to_genome",
"reads_mapped_to_genes",
"median_reads_per_umi",
"avg_reads_per_umi",
"median_reads_per_corrected_umi",
"avg_reads_per_corrected_umi")
SingleCellExperiment::isSpike(scruffsce,
"ERCC") <- grepl("ERCC-",
rownames(scruffsce))
geneAnnotation <- .getGeneAnnotationRefGenome(reference, features)
SummarizedExperiment::rowData(scruffsce) <-
S4Vectors::DataFrame(geneAnnotation[order(gene_id), ],
row.names = geneAnnotation[order(gene_id),
gene_id])
message(Sys.time(),
" ... Save gene annotation data to ",
file.path(outDir, paste0(
format(Sys.time(),
"%Y%m%d_%H%M%S"), "_",
outputPrefix, "_gene_annot.tsv"
))
)
data.table::fwrite(geneAnnotation,
sep = "\t",
file = file.path(outDir, paste0(
format(Sys.time(), "%Y%m%d_%H%M%S"), "_",
outputPrefix, "_gene_annot.tsv"
)))
# UMI filtered transcripts QC metrics
# total counts exclude ERCC
totalCounts <- base::colSums(as.data.frame(
SummarizedExperiment::assay(scruffsce)
[!SingleCellExperiment::isSpike(scruffsce, "ERCC"), ]))
# MT counts
mtCounts <- base::colSums(as.data.frame(
SummarizedExperiment::assay(scruffsce)
[grep("MT", SummarizedExperiment::rowData(scruffsce)
[, "seqid"], ignore.case = TRUE), ]))
# gene number exclude ERCC
cm <- SummarizedExperiment::assay(scruffsce)[
!SingleCellExperiment::isSpike(scruffsce, "ERCC"), ]
geneNumber <- vapply(colnames(cm), function(cells) {
sum(cm[, cells] != 0)
}, integer(1))
# protein coding genes
proteinCodingGene <- geneAnnotation[gene_biotype == "protein_coding",
gene_id]
proGene <- vapply(colnames(cm), function(cells) {
sum(cm[proteinCodingGene, cells] != 0)
}, integer(1))
# protein coding counts
proCounts <- base::colSums(as.data.frame(cm[proteinCodingGene, ]))
qcdf <- cbind(SummarizedExperiment::colData(sce),
readmapping,
list(total_counts = totalCounts,
mt_counts = mtCounts,
genes = geneNumber,
protein_coding_genes = proGene,
protein_coding_counts = proCounts,
number_of_cells = cellPerWell))
message(Sys.time(),
" ... Save cell-specific quality metrics to ",
file.path(outDir, paste0(
format(Sys.time(),
"%Y%m%d_%H%M%S"), "_",
outputPrefix, "_QC.tsv"
))
)
data.table::fwrite(as.data.frame(qcdf),
sep = "\t",
file = file.path(outDir, paste0(
format(Sys.time(), "%Y%m%d_%H%M%S"), "_",
outputPrefix, "_QC.tsv"
)))
SummarizedExperiment::colData(scruffsce) <- qcdf
message(Sys.time(),
" ... Save SingleCellExperiment object to ",
file.path(outDir, paste0(
format(Sys.time(),
"%Y%m%d_%H%M%S"), "_",
outputPrefix, "_sce.rda"
))
)
save(scruffsce, file = file.path(outDir, paste0(
format(Sys.time(), "%Y%m%d_%H%M%S"), "_",
outputPrefix, "_sce.rda"
)))
message(Sys.time(), " ... UMI counting done!")
return(scruffsce)
}
.countUmiUnit <- function(i, features, umiEdit, logfile, verbose) {
if (verbose) {
.logMessages(Sys.time(),
"... UMI counting sample",
i,
logfile = logfile,
append = TRUE)
}
# if sequence alignment file is empty
if (file.size(i) == 0) {
countUmiDt <- data.table::data.table(
gene_id = c(sort(names(features)),
"reads_mapped_to_genome",
"reads_mapped_to_genes",
"median_reads_per_umi",
"avg_reads_per_umi",
"median_reads_per_corrected_umi",
"avg_reads_per_corrected_umi"))
cell <- .removeLastExtension(i)
countUmiDt[[cell]] <- 0
return (data.frame(countUmiDt,
row.names = 1,
check.names = FALSE,
fix.empty.names = FALSE))
}
bfl <- Rsamtools::BamFile(i)
bamGA <- GenomicAlignments::readGAlignments(bfl, use.names = TRUE)
tryCatch(
GenomeInfoDb::seqlevelsStyle(bamGA) <- "NCBI",
error = function(e) {
warning("found no sequence renaming map compatible with seqname",
" style 'NCBI' for the BAM file ", basename(i))
}
)
genomeReads <- data.table::data.table(
name = names(bamGA),
seqnames = as.vector(GenomicAlignments::seqnames(bamGA)))
if (length(unique(genomeReads[, name])) != nrow(genomeReads)) {
stop("Corrupt BAM file ",
i,
". Duplicate read names detected.",
" Try rerunning demultiplexing and alignment functions",
" with appropriate number of cores and set nBestLocations = 1.")
}
# reads mapped to genome (exclude ERCC spike-in)
readsMappedToGenome <- nrow(
genomeReads[!grepl("ERCC", genomeReads[, seqnames]), .(name)])
# UMI filtering
ol <- GenomicAlignments::findOverlaps(features, bamGA)
oldt <- data.table::data.table(
gene_id = base::names(features)[S4Vectors::queryHits(ol)],
name = base::names(bamGA)[S4Vectors::subjectHits(ol)],
pos = BiocGenerics::start(bamGA)[S4Vectors::subjectHits(ol)]
)
# remove ambiguous gene alignments (union mode filtering)
if (nrow(oldt) != 0) {
oldt <- oldt[!(
base::duplicated(oldt, by = "name") |
base::duplicated(oldt, by = "name", fromLast = TRUE)), ]
}
# if 0 count in the cell
if (nrow(oldt) == 0) {
# clean up
countUmiDt <- data.table::data.table(
gene_id = c(names(features),
"reads_mapped_to_genome",
"reads_mapped_to_genes",
"median_reads_per_umi",
"avg_reads_per_umi",
"median_reads_per_corrected_umi",
"avg_reads_per_corrected_umi"))
cell <- .removeLastExtension(i)
countUmiDt[[cell]] <- 0
countUmiDt[gene_id == "reads_mapped_to_genome",
cell] <- readsMappedToGenome
# coerce to data frame to keep rownames for cbind combination
countUmiDt <- data.frame(countUmiDt,
row.names = 1,
check.names = FALSE,
fix.empty.names = FALSE)
} else {
# move UMI to a separate column
oldt[, umi := data.table::last(data.table::tstrsplit(name, ":"))]
oldt[, inferred_umi := umi]
# reads mapped to genes
readsMappedToGenes <- nrow(oldt[!grepl("ERCC", oldt[, gene_id ]), ])
# median number of reads per umi
rpu <- table(oldt[, umi])
medReadsUmi <- median(rpu)
avgReadsUmi <- mean(rpu)
medReadsUmic <- 0
avgReadsUmic <- 0
# UMI filtering and correction
# strict way of doing UMI filtering:
# reads with different pos are considered unique trancsript molecules
# countUmi <- base::table(
# unique(oldt[, .(gene_id, umi, pos)])
# [, gene_id])
# The way CEL-Seq pipeline does UMI filtering:
# Reads with different UMI tags are
# considered unique trancsript molecules
# Read positions do not matter
# UMI correction
if (umiEdit != 0) {
for (g in unique(oldt[, gene_id])) {
umis <- sort(table(oldt[gene_id == g, inferred_umi]),
decreasing = TRUE)
j <- 1
while (j < length(umis)) {
u <- names(umis)[j]
sdm <- stringdist::stringdistmatrix(u, names(umis),
method = "hamming", nthread = 1)
sdm[which(sdm == 0)] <- NA
mindist <- min(sdm, na.rm = TRUE)
if (mindist <= umiEdit & mindist != 0) {
inds <- which(sdm == mindist)
oldt[umi %in% names(umis)[inds],
inferred_umi := names(umis)[j]]
}
umis <- sort(table(oldt[gene_id == g, inferred_umi]),
decreasing = TRUE)
j <- j + 1
}
}
# after correction median numbre of reads per umi
rpuc <- table(oldt[, inferred_umi])
medReadsUmic <- median(rpuc)
avgReadsUmic <- mean(rpuc)
}
countUmi <- base::table(unique(oldt[,
.(gene_id, inferred_umi)])[, gene_id])
# clean up
countUmiDt <- data.table::data.table(
gene_id = c(names(features),
"reads_mapped_to_genome",
"reads_mapped_to_genes",
"median_reads_per_umi",
"avg_reads_per_umi",
"median_reads_per_corrected_umi",
"avg_reads_per_corrected_umi"))
cell <- .removeLastExtension(i)
countUmiDt[[cell]] <- 0
countUmiDt[gene_id == "reads_mapped_to_genome",
cell] <- readsMappedToGenome
countUmiDt[gene_id == "reads_mapped_to_genes",
cell] <- readsMappedToGenes
countUmiDt[gene_id == "median_reads_per_umi",
cell] <- medReadsUmi
countUmiDt[gene_id == "avg_reads_per_umi",
cell] <- avgReadsUmi
countUmiDt[gene_id == "median_reads_per_corrected_umi",
cell] <- medReadsUmic
countUmiDt[gene_id == "avg_reads_per_corrected_umi",
cell] <- avgReadsUmic
countUmiDt[gene_id %in% names(countUmi),
eval(cell) := as.numeric(countUmi[gene_id])]
# coerce to data frame to keep rownames for cbind combination
countUmiDt <- data.frame(countUmiDt,
row.names = 1,
check.names = FALSE,
fix.empty.names = FALSE)
}
return (countUmiDt)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.