# Last modified 2018-05-29

# Highly recommended to run this template on an HPC cluster
# stopifnot(detectHPC())

bcbioSingleCell::prepareSingleCellTemplate()
source("_setup.R")

# Directory paths
dir.create(params$results_dir, recursive = TRUE, showWarnings = FALSE)

# Load seurat object
seurat_name <- load(params$seurat_file)
seurat <- get(seurat_name, inherits = FALSE)
stopifnot(is(seurat, "seurat"))
invisible(validObject(seurat))

# Load presaved Seurat markers
if (!is.null(params$all_markers_file)) {
    # Load presaved `Seurat::FindAllMarkers()` return.
    # Use the original, unsanitized data.frame.
    all_markers_name <- load(params$all_markers_file)
    all_markers <- get(all_markers_name, inherits = FALSE)
}

# Default to using our internal cell type markers, managed on Google Sheets.
# File on issue on the bcbioSingleCell GitHub repo if this list needs to get
# updated or the desired organism isn't yet supported.
if (!is.null(params$cell_type_markers_file)) {
    gene2symbol <- gene2symbol(seurat)
    cell_type_markers <- readCellTypeMarkers(
        file = params$cell_type_markers_file,
        gene2symbol = gene2symbol
    )
} else {
    stopifnot(!is.null(params$organism))
    cell_type_markers <- bcbioSingleCell::cellTypeMarkers %>%
        .[[camel(params$organism)]]
}
stopifnot(is.data.frame(cell_type_markers))
assignAndSaveData(
    name = paste(seurat_name, "cell_type_markers", sep = "_"),
    object = cell_type_markers,
    dir = params$data_dir
)

# knitr arguments (for `rmarkdown::render()` looping)
# opts_chunk$set(
#     cache.path = paste(
#         seurat_name,
#         "markers",
#         "cache/",
#         sep = "_"
#     ),
#     fig.path = paste(
#         seurat_name,
#         "markers",
#         "files/",
#         sep = "_"
#     )
# )

This workflow is adapted from the following sources:


Differential expression

Seurat can help you find markers that define clusters via differential expression. By default, it identifes positive and negative markers of a single cluster (specified in ident.1), compared to all other cells. FindAllMarkers() automates this process for all clusters, but you can also test groups of clusters vs. each other, or against all cells.

The min.pct argument requires a gene to be detected at a minimum percentage in either of the two groups of cells, and the thresh.test argument requires a gene to be differentially expressed (on average) by some amount between the two groups. You can set both of these to 0, but with a dramatic increase in time - since this will test a large number of genes that are unlikely to be highly discriminatory. As another option to speed up these computations, max.cells.per.ident can be set. This will downsample each identity class to have no more cells than whatever this is set to. While there is generally going to be a loss in power, the speed increases can be significiant and the most highly differentially expressed genes will likely still rise to the top.

Seurat has multiple tests for differential expression which can be set with the test.use parameter. Currently it defaults to Wilcoxon rank sum test ("wilcox"), which tends to overreport significant markers. Instead we currently recommend using MAST (Finak et al, Genome Biology, 2015) or DESeq2 (Love et al, Genome Biology, 2014).

# MAST and DESeq2 are recommended over Wilcoxon rank sum test (default). This
# step can a long time to run and shouldn't be cached. We recommend running this
# step outside of R Markdown and then loading the presaved R data file instead.
if (!exists("all_markers", inherits = FALSE)) {
    all_markers <- FindAllMarkers(seurat)
    # Save the original, unmodified data.frame as a backup
    assignAndSaveData(
        name = paste(seurat_name, "markers", "original", sep = "_"),
        object = all_markers,
        dir = params$data_dir
    )
}

# Sanitize the markers data.frame and include gene metadata
all_markers <- sanitizeMarkers(seurat, markers = all_markers)
assignAndSaveData(
    name = paste(seurat_name, "markers", "sanitized", sep = "_"),
    object = all_markers,
    dir = params$data_dir
)
write_csv(
    all_markers,
    path = file.path(
        params$results_dir,
        paste0(seurat_name, "_markers_sanitized.csv.gz")
    )
)

Top markers per cluster {.tabset}

top_markers <- topMarkers(all_markers)
top_markers
plotTopMarkers(
    object = seurat,
    markers = top_markers,
    headerLevel = 2,
    dark = params$dark
)

Cluster heterogeneity

Heatmaps can also be a good way to examine heterogeneity within/between clusters. The DoHeatmap() function will generate a heatmap for given cells and genes. In this case, we are plotting the top markers for each cluster.

colors <- plasma(n = 3, begin = 0, end = 1)
DoHeatmap(
    object = seurat,
    genes.use = unique(top_markers$rowname),
    col.low = colors[[1]],
    col.mid = colors[[2]],
    col.high = colors[[3]],
    remove.key = FALSE,
    rotate.key = TRUE,
    slim.col.label = TRUE
)

Known cell type markers {.tabset}

known_markers_detected <- knownMarkersDetected(
    object = all_markers,
    known = cell_type_markers
)
known_markers_detected
assignAndSaveData(
    name = paste(seurat_name, "known_markers_detected", sep = "_"),
    object = known_markers_detected,
    dir = params$data_dir
)
plotKnownMarkersDetected(
    object = seurat,
    markers = known_markers_detected,
    headerLevel = 2,
    dark = params$dark
)

Cell types per cluster {.tabset}

cell_types_per_cluster <- cellTypesPerCluster(known_markers_detected)
cell_types_per_cluster
assignAndSaveData(
    name = paste(seurat_name, "cell_types_per_cluster", sep = "_"),
    object = cell_types_per_cluster,
    dir = params$data_dir
)
plotCellTypesPerCluster(
    object = seurat,
    cellTypesPerCluster = cell_types_per_cluster,
    headerLevel = 2,
    dark = params$dark
)



roryk/bcbioSinglecell documentation built on May 27, 2019, 10:44 p.m.