##########################################################################################
# R script to prepare benchmark dataset Levine_32dim
#
# This is a 32-dimensional mass cytometry (CyTOF) dataset, consisting of expression levels
# of 32 surface protein markers. Cell population labels are available for 14 manually
# gated populations. Cells are healthy human bone marrow mononuclear cells (BMMCs), from 2
# patients.
#
# This R script loads the data, adds manually gated cell population labels, and exports it
# in SummarizedExperiment and flowSet formats.
#
# Source: "benchmark data set 2" in the following paper:
# Levine et al. (2015), "Data-Driven Phenotypic Dissection of AML Reveals Progenitor-like
# Cells that Correlate with Prognosis", Cell, 162, 184-197.
#
# Link to paper: https://www.ncbi.nlm.nih.gov/pubmed/26095251
# Link to raw data: https://www.cytobank.org/cytobank/experiments/46102 (download the .zip
# file shown under "Exported Files")
#
# Lukas Weber, Jan 2019
##########################################################################################
# original version of this script can be found at:
# https://github.com/lmweber/cytometry-clustering-comparison
suppressPackageStartupMessages({
library(flowCore)
library(SummarizedExperiment)
library(magrittr)
})
# -------------
# Download data
# -------------
# create temporary directories
DIR_TMP <- "tmp"
dir.create(file.path(DIR_TMP))
dir.create(file.path(DIR_TMP, "fcs_files"))
# download from 'imlspenticton' server
URL <- "http://imlspenticton.uzh.ch/robinson_lab/HDCytoData"
DIR <- "Levine_32dim"
# download .fcs files
fcs_filename <- "Levine_32dim_fcs_files.zip"
download.file(file.path(URL, DIR, fcs_filename), destfile = file.path(DIR_TMP, "fcs_files", fcs_filename))
unzip(file.path(DIR_TMP, "fcs_files", fcs_filename), exdir = file.path(DIR_TMP, "fcs_files"))
files <- list.files(file.path(DIR_TMP, "fcs_files"), pattern = "\\.fcs$", full.names = TRUE)
# ---------
# Load data
# ---------
# one .fcs file per manually gated cluster, per patient (H1 and H2)
# 32 surface markers (dimensions), 14 manually gated populations, 2 patients (H1 and H2)
# "unassigned" cells are those where cluster labels are unavailable
files_assigned <- files[-grep("NotDebrisSinglets", files)]
files_unassigned <- files[grep("NotDebrisSinglets", files)]
# cell population names
pop_names <-
grep("H1\\.fcs$", files_assigned) %>%
files_assigned[.] %>%
gsub("(_H1\\.fcs$)", "", .) %>%
gsub("^.*_", "", .) %>%
gsub(" ", "_", .)
# channel and marker names
channel_name <- as.character(pData(parameters(read.FCS(files_assigned[1])))$name)
marker_name <- as.character(pData(parameters(read.FCS(files_assigned[1])))$desc)
# clean marker names
marker_name <- gsub("\\(.*", "", marker_name)
# original column names
col_names <- colnames(exprs(read.FCS(files_assigned[1])))
# marker classes (cell type, cell state, or none)
marker_class <- rep("none", length(marker_name))
marker_class[5:36] <- "type"
# vector of labels for patients H1 and H2
patient_names_assigned <- rep(NA, length(files_assigned))
patient_names_assigned[grep("_H1\\.fcs$", files_assigned)] <- "H1"
patient_names_assigned[grep("_H2\\.fcs$", files_assigned)] <- "H2"
patient_names_unassigned <- c("H1", "H2")
# load data and create vectors of population IDs and sample IDs (for "assigned" cells)
data_assigned <- matrix(nrow = 0, ncol = length(marker_name))
pop_id_assigned <- c()
patient_id_assigned <- c()
for (i in 1:length(files_assigned)) {
# load data
data_i <- exprs(read.FCS(files_assigned[i], transformation = FALSE, truncate_max_range = FALSE))
data_assigned <- rbind(data_assigned, data_i)
# population IDs
pop_id_assigned <- c(pop_id_assigned, rep(pop_names[((i - 1) %% length(pop_names)) + 1], nrow(data_i)))
# patient IDs
patient_id_assigned <- c(patient_id_assigned, rep(patient_names_assigned[i], nrow(data_i)))
}
dim(data_assigned) # 104,184 assigned cells, 32 dimensions
table(pop_id_assigned) # 14 manually gated clusters
table(patient_id_assigned) # 2 patient (72,463 and 31,721 assigned cells each)
stopifnot(nrow(data_assigned) == length(pop_id_assigned))
stopifnot(nrow(data_assigned) == length(patient_id_assigned))
# load data and create vectors of population IDs and sample IDs (for "unassigned" cells)
data_unassigned <- matrix(nrow = 0, ncol = length(marker_name))
pop_id_unassigned <- c()
patient_id_unassigned <- c()
for (i in 1:length(files_unassigned)) {
# load data
data_i <- exprs(read.FCS(files_unassigned[i], transformation = FALSE, truncate_max_range = FALSE))
data_unassigned <- rbind(data_unassigned, data_i)
# population IDs
pop_id_unassigned <- c(pop_id_unassigned, rep("unassigned", nrow(data_i)))
# patient IDs
patient_id_unassigned <- c(patient_id_unassigned, rep(patient_names_unassigned[i], nrow(data_i)))
}
dim(data_unassigned) # 161,443 unassigned cells
table(patient_id_unassigned) # 2 patients (118,888 and 42,555 unassigned cells each)
stopifnot(nrow(data_unassigned) == length(pop_id_unassigned))
stopifnot(nrow(data_unassigned) == length(patient_id_unassigned))
# combine assigned and unassigned cells
data_all <- rbind(data_assigned, data_unassigned)
pop_id_all <- c(pop_id_assigned, pop_id_unassigned)
patient_id_all <- c(patient_id_assigned, patient_id_unassigned)
stopifnot(nrow(data_all) == length(pop_id_all))
stopifnot(nrow(data_all) == length(patient_id_all))
# ----------------------
# Delete temporary files
# ----------------------
unlink(DIR_TMP, recursive = TRUE)
# ----------------------------------
# Create SummarizedExperiment object
# ----------------------------------
# set up row data
row_data <- data.frame(
patient_id = as.factor(patient_id_all),
population_id = as.factor(pop_id_all),
stringsAsFactors = FALSE
)
# set up column data
marker_info <- data.frame(
channel_name = as.character(channel_name),
marker_name = as.character(marker_name),
marker_class = factor(marker_class, levels = c("none", "type", "state")),
stringsAsFactors = FALSE
)
col_data <- marker_info
# set up expression data
d_exprs <- data_all
# use marker names as column names (for SummarizedExperiment)
colnames(d_exprs) <- marker_name
stopifnot(nrow(d_exprs) == nrow(row_data))
stopifnot(ncol(d_exprs) == nrow(col_data))
# create SummarizedExperiment object
d_SE <- SummarizedExperiment(
assays = list(exprs = d_exprs),
rowData = row_data,
colData = col_data
)
# ---------------------
# Create flowSet object
# ---------------------
# note: row data (e.g. population IDs) is stored as additional columns of data in the
# expression matrices; additional information from row data and column data (e.g. marker
# classes, cell population names) is stored in 'description' slot
# table of cell population information
population_info <- data.frame(
population_id = seq_len(length(pop_names) + 1),
population_name = c(pop_names, "unassigned"),
stringsAsFactors = FALSE
)
# create list of extra columns of data for each sample
cols_keep <- 2
row_data_fs <- do.call("cbind", lapply(row_data[, cols_keep, drop = FALSE], as.numeric))
stopifnot(nrow(row_data_fs) == nrow(row_data))
row_data_fs_list <- lapply(split(row_data_fs, row_data$patient_id), matrix, ncol = ncol(row_data_fs))
patient_id_names <- names(table(row_data$patient_id))
stopifnot(all(names(row_data_fs_list) == patient_id_names))
# replace column names
row_data_fs_list <- lapply(row_data_fs_list, function(d) {
colnames(d) <- colnames(row_data_fs)
d
})
# add extra columns of data and create new flowSet object
exprs_fs_list <- lapply(split(d_exprs, row_data$patient_id), matrix, ncol = ncol(d_exprs))
exprs_fs_list <- lapply(exprs_fs_list, function(e) {
# use original column names (for flowSet)
colnames(e) <- col_names
e
})
d_flowFrames_list <- mapply(function(e, extra_cols) {
# combine and create flowFrame
stopifnot(nrow(e) == nrow(extra_cols))
flowFrame(cbind(e, extra_cols))
}, exprs_fs_list, row_data_fs_list)
d_flowSet <- flowSet(d_flowFrames_list)
# include additional information in 'description' slot
for (i in seq_along(d_flowSet)) {
# sample information
description(d_flowSet[[i]])$PATIENT_ID <- identifier(d_flowSet[[i]])
# data frame of marker information
description(d_flowSet[[i]])$MARKER_INFO <- marker_info
# data frame of cell population information
description(d_flowSet[[i]])$POPULATION_INFO <- population_info
}
# ------------
# Save objects
# ------------
save(d_SE, file = "Levine_32dim_SE.rda")
save(d_flowSet, file = "Levine_32dim_flowSet.rda")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.