##########################################################################################
# R script to prepare benchmark datasets Samusik_01 and Samusik_all
#
# This is a 39-dimensional mass cytometry (CyTOF) dataset, consisting of expression levels
# of 39 surface protein markers. Cell population labels are available for 24 manually
# gated populations. The dataset contains cells from 10 replicate bone marrow samples from
# C57BL/6J mice (samples from 10 different mice): Samusik_01 contains data from sample 01
# only, and Samusik_all contains data from all samples.
#
# This R script loads the data, adds manually gated cell population labels, and exports it
# in SummarizedExperiment and flowSet formats.
#
# Source: Samusik et al. (2016), "Automated mapping of phenotype space with single-cell
# data", Nature Methods, 13(6), 493-496.
#
# Link to paper: https://www.ncbi.nlm.nih.gov/pubmed/27183440
# Link to data (.zip file): "https://web.stanford.edu/~samusik/Panorama BM 1-10.zip"
#
# Lukas Weber, Jan 2019
##########################################################################################
# original version of this script can be found at:
# https://github.com/lmweber/cytometry-clustering-comparison
suppressPackageStartupMessages({
library(flowCore)
library(SummarizedExperiment)
})
# -------------
# Download data
# -------------
# one FCS file per sample (samples 01 to 10)
# 39 surface markers (dimensions), 24 manually gated populations
# create temporary directories
DIR_TMP <- "tmp"
dir.create(file.path(DIR_TMP))
dir.create(file.path(DIR_TMP, "fcs_files"))
dir.create(file.path(DIR_TMP, "population_IDs"))
# download from 'imlspenticton' server
URL <- "http://imlspenticton.uzh.ch/robinson_lab/HDCytoData"
DIR <- "Samusik"
# download .fcs files
fcs_filename <- "Samusik_fcs_files.zip"
download.file(file.path(URL, DIR, fcs_filename), destfile = file.path(DIR_TMP, "fcs_files", fcs_filename))
unzip(file.path(DIR_TMP, "fcs_files", fcs_filename), exdir = file.path(DIR_TMP, "fcs_files"))
files_load_fcs <- list.files(file.path(DIR_TMP, "fcs_files"), pattern = "\\.fcs$", full.names = TRUE)
# download population IDs
pop_filename <- "Samusik_population_IDs.zip"
download.file(file.path(URL, DIR, pop_filename), destfile = file.path(DIR_TMP, "population_IDs", pop_filename))
unzip(file.path(DIR_TMP, "population_IDs", pop_filename), exdir = file.path(DIR_TMP, "population_IDs"))
file_gating <- file.path(DIR_TMP, "population_IDs", "population_assignments.txt")
# ----------------------
# Load population labels
# ----------------------
data_gating <- read.table(file_gating, header = FALSE, stringsAsFactors = FALSE, sep = "\t")
# remove line with error (line is partially cut off) and reset row names
data_gating[164401, ]
data_gating <- data_gating[-164401, ]
rownames(data_gating) <- NULL
dim(data_gating) # 514,386 cells
# extract sample numbers
sample <- as.factor(sapply(strsplit(data_gating[, 1], split = " "), function(s) s[1]))
str(sample)
length(sample)
# extract event (cell) numbers
event <- sapply(strsplit(data_gating[, 1], split = " "), function(s) s[3])
event <- as.numeric(event)
str(event)
# add 1 to event numbers (event numbers are provided as index-0, but R-based row numbers
# in the .fcs files are index-1)
event <- event + 1
all(event == floor(event)) # check: all integers
sum(event == 1) # check: multiple events no. 1 (but not necessarily one for every sample due to unassigned cells)
# split event numbers into one data frame per sample
event <- split(event, sample)
str(event)
# extract population labels
population <- data_gating[, 2]
str(population)
# split population labels into one data frame per sample
population <- split(population, sample)
str(population)
# ---------------------------------
# Load data and create row metadata
# ---------------------------------
# note: 'unassigned' refers to cells not assigned to any population by manual gating
# channel and marker names
channel_name <- as.character(pData(parameters(read.FCS(files_load_fcs[1])))$name)
marker_name <- as.character(pData(parameters(read.FCS(files_load_fcs[1])))$desc)
# original column names
col_names <- colnames(exprs(read.FCS(files_load_fcs[1])))
# marker classes (cell type, cell state, or none)
marker_class <- rep("none", length(marker_name))
marker_class[c(9:47)] <- "type"
data <- list()
row_data <- list()
for (i in 1:length(files_load_fcs)) {
data_i <- exprs(read.FCS(files_load_fcs[i], transformation = FALSE, truncate_max_range = FALSE))
names_i <- basename(files_load_fcs[i])
# sample IDs
sample_i <- data.frame(sample_id = rep(gsub("^.*_([0-9]+)_.*$", "\\1", files_load_fcs[[i]]), nrow(data_i)))
# population IDs ('unassigned' = not assigned to any population by manual gating)
pop_i <- data.frame(population_id = rep("unassigned", nrow(data_i)), stringsAsFactors = FALSE)
pop_i[event[[i]], "population_id"] <- population[[i]]
# convert to factor
pop_i[, "population_id"] <- as.factor(pop_i[, "population_id"])
data[[i]] <- data_i
names(data)[i] <- names_i
row_data[[i]] <- cbind(sample_i, pop_i)
names(row_data)[i] <- names_i
}
# number of cells per sample (check against Supp. Table. S2 Excel spreadsheet)
n_cells <- sapply(data, nrow)
n_cells
# number of assigned cells per sample
n_assigned <- sapply(event, length)
n_assigned
# proportion assigned
prop_assigned <- n_assigned / n_cells
prop_assigned
# total number of cells
sum(n_cells)
# check column names
for (i in seq_along(data)) {
stopifnot(all(colnames(data[[i]]) == channel_name))
}
# ----------------------
# Delete temporary files
# ----------------------
unlink(DIR_TMP, recursive = TRUE)
# ----------------------------------
# Create SummarizedExperiment object
# ----------------------------------
# set up row data
row_data <- do.call("rbind", row_data)
rownames(row_data) <- NULL
stopifnot(nrow(row_data) == sum(n_cells))
# set up column data
marker_info <- data.frame(
channel_name = as.character(channel_name),
marker_name = as.character(marker_name),
marker_class = factor(marker_class, levels = c("none", "type", "state")),
stringsAsFactors = FALSE
)
col_data <- marker_info
# set up expression data
d_exprs <- do.call("rbind", data)
# use marker names as column names (for SummarizedExperiment)
colnames(d_exprs) <- marker_name
stopifnot(nrow(d_exprs) == nrow(row_data))
stopifnot(ncol(d_exprs) == nrow(col_data))
# create SummarizedExperiment object for 'Samusik_all' (full dataset)
d_SE_all <- SummarizedExperiment(
assays = list(exprs = d_exprs),
rowData = row_data,
colData = col_data
)
# create SummarizedExperiment object for 'Samusik_01' (sample 01)
d_SE_01 <- d_SE_all[rowData(d_SE_all)$sample_id == "01", ]
# ---------------------
# Create flowSet object
# ---------------------
# note: row data (e.g. population IDs) is stored as additional columns of data in the
# expression matrices; additional information from row data and column data (e.g. marker
# classes, cell population names) is stored in 'description' slot
# table of cell population information
population_info <- data.frame(
population_id = seq_len(nlevels(row_data$population_id)),
population_name = levels(row_data$population_id),
stringsAsFactors = FALSE
)
# create list of extra columns of data for each sample
row_data_fs <- row_data[, "population_id", drop = FALSE]
row_data_fs$population_id <- as.numeric(row_data_fs$population_id)
row_data_fs_list <- lapply(split(row_data_fs, row_data$sample_id), as.matrix)
sample_id_names <- names(table(row_data$sample_id))
stopifnot(all(names(row_data_fs_list) == sample_id_names))
stopifnot(all(sapply(data, nrow) == sapply(row_data_fs_list, nrow)))
# add extra columns of data and create new flowSet object
exprs_fs_list <- data
exprs_fs_list <- lapply(exprs_fs_list, function(e) {
# use original column names (for flowSet)
colnames(e) <- col_names
e
})
d_flowFrames_list <- mapply(function(e, extra_cols) {
# combine and create flowFrame
stopifnot(nrow(e) == nrow(extra_cols))
ff <- flowFrame(cbind(e, extra_cols))
# include both channel and marker names in 'pData(parameters(.))'
stopifnot(length(c(marker_name, colnames(extra_cols))) == nrow(pData(parameters(ff))))
pData(parameters(ff))$desc <- c(marker_name, colnames(extra_cols))
ff
}, exprs_fs_list, row_data_fs_list)
# create flowSet object for 'Samusik_all' (full dataset)
d_flowSet_all <- flowSet(d_flowFrames_list)
# include additional information in 'description' slot
for (i in seq_along(d_flowSet_all)) {
# filename and sample information
description(d_flowSet_all[[i]])$FILENAME <- identifier(d_flowSet_all[[i]])
description(d_flowSet_all[[i]])$SAMPLE_ID <- gsub("^.*_([0-9]+)_.*$", "\\1", identifier(d_flowSet_all[[i]]))
stopifnot(description(d_flowSet_all[[i]])$SAMPLE_ID == sample_id_names[i])
# data frame of marker information
description(d_flowSet_all[[i]])$MARKER_INFO <- marker_info
# data frame of cell population information
description(d_flowSet_all[[i]])$POPULATION_INFO <- population_info
}
# create flowSet object for 'Samusik_01' (sample 01)
d_flowSet_01 <- flowSet(d_flowFrames_list[1])
# include additional information in 'description' slot
for (i in seq_along(d_flowSet_01)) {
# filename and sample information
description(d_flowSet_01[[i]])$FILENAME <- identifier(d_flowSet_01[[i]])
description(d_flowSet_01[[i]])$SAMPLE_ID <- gsub("^.*_([0-9]+)_.*$", "\\1", identifier(d_flowSet_01[[i]]))
stopifnot(description(d_flowSet_all[[i]])$SAMPLE_ID == sample_id_names[i])
# data frame of marker information
description(d_flowSet_01[[i]])$MARKER_INFO <- marker_info
# data frame of cell population information
description(d_flowSet_01[[i]])$POPULATION_INFO <- population_info
}
# ------------
# Save objects
# ------------
save(d_SE_all, file = "Samusik_all_SE.rda")
save(d_flowSet_all, file = "Samusik_all_flowSet.rda")
save(d_SE_01, file = "Samusik_01_SE.rda")
save(d_flowSet_01, file = "Samusik_01_flowSet.rda")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.