#' @rdname prepData
#' @title Data preparation
#'
#' @param x a \code{flowSet} holding all samples
#' or a path to a set of FCS files.
#' @param panel a data.frame containing, for each channel,
#' its column name in the input data, targeted protein marker,
#' and (optionally) class ("type", "state", or "none").
#' If `panel` is unspecified, it will be constructed
#' from the first input sample via \code{\link{guessPanel}}.
#' @param md a table with column describing the experiment.
#' An exemplary metadata table could look as follows: \itemize{
#' \item\code{file_name}: the FCS file name
#' \item\code{sample_id}: a unique sample identifier
#' \item\code{patient_id}: the patient ID
#' \item\code{condition}: brief sample description
#' (e.g. reference/stimulated, healthy/diseased)}
#' If `md` is unspecified, the \code{flowFrame/Set}
#' \code{\link[flowCore:identifier-methods]{identifier}}(s)
#' will be used as sample IDs with no additional metadata factors.
#' @param features a logical vector, numeric vector of column indices,
#' or character vector of channel names. Specified which column to keep
#' from the input data. Defaults to the channels listed in the input panel.
#' @param transform logical. Specifies whether an arcsinh-transformation with
#' cofactor cofactor should be performed, in which case expression values
#' (transformed counts) will be stored in assay(x, "exprs").
#' @param cofactor numeric cofactor(s) to use for optional
#' arcsinh-transformation when \code{transform = TRUE};
#' single value or a vector with channels as names.
#' @param panel_cols a names list specifying
#' the \code{panel} column names that contain channel names,
#' targeted protein markers, and (optionally) marker classes.
#' When only some \code{panel_cols} deviate from the defaults,
#' specifying only these is sufficient.
#' @param md_cols a named list specifying the column names of \code{md}
#' that contain the FCS file names, sample IDs, and factors of interest
#' (batch, condition, treatment etc.). When only some \code{md_cols} deviate
#' from the defaults, specifying only these is sufficient.
#' @param by_time logical; should samples be ordered by acquisition time?
#' Ignored if \code{!is.null(md)} in which case samples will be ordered
#' as they are listed in \code{md[[md_cols$file]]}. (see details)
#' @param FACS logical; is this FACS / flow cytometry data?
#' By default, \code{prepData} moves non-mass channels to
#' the output SCE's \code{int_colData}; \code{FACS = TRUE}
#' assures that all channels are kept as \code{assay} data.
#' If \code{FALSE}, \code{prepData} will try and access
#' the input \code{flowFrame/Set}'s \code{"$CYT"} descriptor
#' (\code{keyword(., "$CYT")}) to determine the data type;
#' this may be inaccurate for some cytometer descriptors.
#' @param fix_chs specifies the strategy to use in case of panel discrepancies.
#' \code{"common"} will retain only channels present in all frames/FCS files;
#' \code{"all"} will retain the union of channels across samples. In the
#' latter case, a logical matrix with rows = channels and columns = samples
#' will be stored under \code{metadata} slot \code{chs_by_fcs} specifying
#' which channels were/n't (\code{FALSE/TRUE}) measured in which samples.
#' @param ... additional arguments passed to \code{\link{read.FCS}}.
#' E.g., \code{channel_alias} in case of panel discrepancies between frames/
#' FCS files. By default, \code{transformation = truncate_max_range = FALSE}.
#'
#' @author Helena L Crowell \email{helena.crowell@@uzh.ch}
#'
#' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
#'
#' @details
#' By default, non-mass channels (e.g., time, event lengths) will be removed
#' from the output SCE's assay data and instead stored in the object's internal
#' cell metadata (\code{int_colData}) to assure these data are not subject to
#' transformations or other computations applied to the assay data.
#'
#' For more than 1 sample, \code{prepData} will concatenate cells into a single
#' \code{SingleCellExperiment} object. Note that cells will hereby be order by
#' \code{"Time"}, regardless of whether \code{by_time = TRUE} or \code{FALSE}.
#' Instead, \code{by_time} determines the sample (not cell!) order;
#' i.e., whether samples should be kept in their original order,
#' or should be re-ordered according to their acquision time
#' stored in \code{keyword(flowSet, "$BTIM")}.
#'
#' When a metadata table is specified (i.e. \code{!is.null(md)}),
#' argument \code{by_time} will be ignored and sample ordering
#' is instead determined by \code{md[[md_cols$file]]}.
#'
#' @examples
#' data(PBMC_fs, PBMC_panel, PBMC_md)
#' prepData(PBMC_fs, PBMC_panel, PBMC_md)
#'
#' # channel-specific transformation
#' cf <- sample(seq_len(10)[-1], nrow(PBMC_panel), TRUE)
#' names(cf) <- PBMC_panel$fcs_colname
#' sce <- prepData(PBMC_fs, cofactor = cf)
#' int_metadata(sce)$cofactor
#'
#' # input has different name for "condition"
#' md <- PBMC_md
#' m <- match("condition", names(md))
#' colnames(md)[m] <- "treatment"
#'
#' # add additional factor variable batch ID
#' md$batch_id <- sample(c("A", "B"), nrow(md), TRUE)
#'
#' # specify 'md_cols' that differ from defaults
#' factors <- list(factors = c("treatment", "batch_id"))
#' ei(prepData(PBMC_fs, PBMC_panel, md, md_cols = factors))
#'
#' # without panel & metadata tables
#' sce <- prepData(raw_data)
#'
#' # 'flowFrame' identifiers are used as sample IDs
#' levels(sce$sample_id)
#'
#' # panel was guess with 'guessPanel';
#' # non-mass channels are set to marker class "none"
#' rowData(sce)
#'
#' @importFrom methods new
#' @importFrom dplyr mutate_all rename
#' @importFrom flowCore colnames exprs exprs<- flowSet
#' fsApply identifier keyword parameters
#' @importFrom SingleCellExperiment SingleCellExperiment int_colData<- int_metadata<-
#' @importFrom SummarizedExperiment assay assay<-
#' @importFrom S4Vectors DataFrame
#' @export
prepData <- function(x, panel = NULL, md = NULL,
features = NULL, transform = TRUE, cofactor = 5,
panel_cols = list(channel = "fcs_colname",
antigen = "antigen", class = "marker_class"),
md_cols = list(
file = "file_name", id = "sample_id",
factors = c("condition", "patient_id")),
by_time = TRUE, FACS = FALSE,
fix_chs = c("common", "all"), ...) {
# read in data as 'flowSet'
fix_chs <- match.arg(fix_chs)
tmp <- .read_fs(x, fix_chs, ...)
fs <- tmp[[1]]; mtx <- tmp[[2]]
# reorder by acquisition time if 'md' is unspecified
if (is.null(md) && by_time && length(fs) > 1) {
ts <- keyword(fs, "$BTIM")
if(any(vapply(ts, is.null, logical(1)))) {
message("Not all samples contain information on their",
" acquisition time; ignoring argument 'by_time'.",
" Samples will be kept in their original order.")
} else {
o <- order(ts)
fs <- fs[o]
if (!is.null(mtx))
mtx <- mtx[, o, drop = FALSE]
}
}
# assure panel & metadata are data.frames
for (u in c("panel", "md"))
if (!is.null(get(u)))
assign(u, data.frame(get(u),
check.names = FALSE,
stringsAsFactors = FALSE))
# fill up missing values with function defaults
stopifnot(is.list(panel_cols), is.list(md_cols))
args <- as.list(environment())
for (i in c("md_cols", "panel_cols")) {
defs <- as.list(formals("prepData")[[i]][-1])
miss <- !names(defs) %in% names(args[[i]])
if (any(miss)) {
fill <- lapply(defs[miss], eval)
assign(i, c(args[[i]], fill)[names(defs)])
}
}
# if unspecified, construct panel & metadata tables
if (is.null(panel)) {
panel <- guessPanel(fs[[1]])
panel$marker_class <- ifelse(panel$use_channel, "state", "none")
} else stopifnot(
c("channel", "antigen") %in% names(panel_cols),
all(setdiff(unlist(panel_cols), "marker_class") %in% names(panel)))
if (is.null(md)) {
ids <- fsApply(fs, identifier)
md <- data.frame(
file_name = ids,
sample_id = basename(ids))
md_cols$factors <- NULL
} else stopifnot(
all(unlist(md_cols) %in% names(md)),
c("file", "id", "factors") %in% names(md_cols))
# check channels listed in panel
stopifnot(panel[[panel_cols$channel]] %in% colnames(fs))
if (is.null(features)) {
# default to channels listed in panel
features <- as.character(panel[[panel_cols$channel]])
} else {
# check validity of 'features'
chs <- colnames(fs)
check1 <- is.logical(features) && length(features) == length(chs)
check2 <- is.integer(features) && all(features %in% seq_along(chs))
check3 <- all(features %in% chs)
if (!any(check1, check2, check3))
stop("Invalid argument 'features'. Should be either",
" a logial vector,\n a numeric vector of indices, or",
" a character vector of column names.")
# subset panel & reorder features accordingly
m <- match(panel[[panel_cols$channel]], features, nomatch = 0)
panel <- panel[m != 0, , drop = FALSE]
features <- features[m]
}
# check that filenames or identifiers
# match b/w 'flowSet' & metadata
ids0 <- md[[md_cols$file]]
ids1 <- fsApply(fs, identifier)
ids2 <- keyword(fs, "FILENAME")
if (length(unlist(ids2)) == length(fs))
ids2 <- basename(ids2)
check1 <- all(ids1 %in% ids0)
check2 <- all(ids2 %in% ids0)
ids_use <- which(c(check1, check2))[1]
ids <- list(ids1, ids2)[[ids_use]]
if (is.null(ids)) {
stop("Couldn't match 'flowSet'/FCS filenames\n",
"with those listed in 'md[[md_cols$file]]'.")
} else {
# reorder 'flowSet' frames according to metadata table
fs <- fs[match(md[[md_cols$file]], ids)]
}
# assure correctness of formats
k <- c(md_cols$id, md_cols$factors)
md <- md[, k, drop = FALSE] %>%
mutate_all(factor) %>%
rename("sample_id" = md_cols$id)
# replace problematic characters
as <- panel[[panel_cols$antigen]]
as[is.na(as)] <- panel[[panel_cols$channel]][is.na(as)]
# column & panel subsetting
fs <- fs[, features]
chs0 <- colnames(fs)
# replace channel w/ antigen names
m1 <- match(panel[[panel_cols$channel]], chs0, nomatch=0)
m2 <- match(chs0, panel[[panel_cols$channel]], nomatch=0)
as <- as[m2]
ns <- table(as)
for (a in names(ns)) if (ns[a] > 1)
as[as == a] <- paste(a, seq_len(ns[a]), sep = ".")
flowCore::colnames(fs)[m1] <- as
chs <- colnames(fs)
# get exprs.
es <- matrix(fsApply(fs, exprs), byrow = TRUE,
nrow = length(chs), dimnames = list(chs, NULL))
# fix event times
t <- grep("time", colnames(fs), ignore.case = TRUE)
if (length(t) != 0) {
ns <- fsApply(fs, nrow)
t0 <- c(1, cumsum(ns) + 1)
tx <- t0[-1] - 1
for (i in seq_along(fs)[-1]) {
idx <- seq(t0[i], tx[i])
es[t, idx] <- es[t, idx] + es[t, tx[i - 1]]
}
}
# get & check marker classes if provided
mcs <- c("type", "state", "none")
if (is.null(panel_cols$class) || is.null(panel[[panel_cols$class]])) {
mcs <- factor("none", levels = mcs)
} else {
mcs <- factor(panel[[panel_cols$class]], levels = mcs)
if (any(is.na(mcs)))
stop("Invalid marker classes detected;",
" valid classes are 'type', 'state', and 'none'.")
}
# construct row/colData & int_metadata
rd <- DataFrame(
row.names = chs, channel_name = chs0,
marker_name = chs, marker_class = mcs)
m <- match(chs0, panel[[panel_cols$channel]], nomatch = 0)
rd$use_channel <- panel$use_channel
md$n_cells <- as.numeric(fsApply(fs, nrow))
k <- setdiff(names(md), "n_cells")
cd <- DataFrame(lapply(md[k], function(u) {
v <- as.character(rep(u, md$n_cells))
factor(v, levels = levels(u))
}), row.names = NULL)
# construct SCE
sce <- SingleCellExperiment(
assays = list(counts = es),
rowData = rd, colData = cd,
metadata = list(experiment_info = .get_ei(cd)))
if (is.null(mtx))
mtx <- matrix(TRUE, length(chs), length(fs))
dimnames(mtx) <- list(chs, ids)
metadata(sce)$chs_by_fcs <- mtx
ds <- keyword(fs[[1]])
l <- list(cyt = "\\$CYT$", sn = "\\$CYTSN$")
keep <- lapply(l, grep, names(ds))
int_metadata(sce)$description <- ds[unlist(keep)]
# move non-mass channels to internal cell metadata
if (!FACS && (length(keep$cyt) == 0 || !grepl("FACS", ds[[keep$cyt]]))) {
is_mass <- !is.na(.get_ms_from_chs(chs0))
icd <- t(es[!is_mass, , drop = FALSE])
icd <- DataFrame(icd, check.names = FALSE)
colnames(icd) <- rownames(es)[!is_mass]
# store internally & exclude from assay data
icd <- cbind(int_colData(sce), icd)
int_colData(sce) <- icd
sce <- sce[is_mass, ]
}
# (optionally) do arcsinh-transformation & return SCE
if (transform) .transform(sce, cofactor) else sce
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.