get_mtx_metadata <- function(mtx_file, n_cells_col_id = 2L) {
con <- file(mtx_file, "r")
n_to_skip <- 0L
while (TRUE) {
n_to_skip <- n_to_skip + 1L
line <- readLines(con, n = 1)
if (substr(line, 0, 1) != "%") break()
}
close(con)
metrics <- strsplit(line, split = " ", fixed = TRUE)[[1]] |> as.integer()
out <- list(n_cells = metrics[2], n_to_skip = n_to_skip)
return(out)
}
initialize_cellwise_covariates <- function(modality_names, n_cells) {
possible_covariates <- c("n_umis", "n_nonzero", "p_mito", "feature_w_max_expression", "frac_umis_max_feature")
covariates_to_compute <- list("Gene Expression" = c("n_umis", "n_nonzero", "p_mito"),
"CRISPR Guide Capture" = c("n_umis", "n_nonzero", "feature_w_max_expression", "frac_umis_max_feature"),
"Antibody Capture" = c("n_umis", "n_nonzero"))
out <- lapply(modality_names, function(modality_name) {
# evaluate the boolean vector for the modality
curr_covariates <- covariates_to_compute[[modality_name]]
bool_vect <- vapply(possible_covariates, function(possible_covariate) {
possible_covariate %in% curr_covariates
}, FUN.VALUE = logical(1))
covariate_list <- list(
n_umis = integer(length = if (bool_vect[["n_umis"]]) n_cells else 0L),
n_nonzero = integer(length = if (bool_vect[["n_nonzero"]]) n_cells else 0L),
p_mito = numeric(length = if (bool_vect[["p_mito"]]) n_cells else 0L),
feature_w_max_expression = integer(length = if (bool_vect[["feature_w_max_expression"]]) n_cells else 0L),
frac_umis_max_feature = numeric(length = if (bool_vect[["frac_umis_max_feature"]]) n_cells else 0L)
)
list(bool_vect = bool_vect,
covariate_list = covariate_list)
}) |> stats::setNames(modality_names)
return(out)
}
preprare_output_covariate_dt <- function(cellwise_covariates, new_modality_names, n_cells_per_batch, modality_feature_ids) {
names(cellwise_covariates) <- new_modality_names
l <- list()
for (i in seq_along(new_modality_names)) {
modality_name <- new_modality_names[i]
computed_feats <- cellwise_covariates[[modality_name]]$bool_vect
for (feat in names(computed_feats)) {
if (computed_feats[[feat]]) {
v <- cellwise_covariates[[modality_name]]$covariate_list[[feat]]
if (feat == "feature_w_max_expression") {
feature_ids <- modality_feature_ids[[i]]
v <- feature_ids[v + 1L]
}
l[[paste0(modality_name, "_", feat)]] <- v
}
}
}
batch <- paste0("batch_", rep(x = seq_along(n_cells_per_batch), times = n_cells_per_batch)) |> factor()
l$batch <- batch
data.table::setDT(l)
}
initialize_odms <- function(modality_names, file_names_in, n_nonzero_features_vector_list,
modality_feature_ids, n_cells, integer_id, chunk_size, compression_level) {
row_ptr_list <- lapply(seq_along(modality_names), function(k) {
file_name_in <- file_names_in[k]
create_odm(file_name_in = file_name_in,
n_nonzero_features = n_nonzero_features_vector_list[[k]],
feature_ids = modality_feature_ids[[k]],
n_cells = n_cells,
integer_id = integer_id,
chunk_size = chunk_size,
compression_level = compression_level)
read_row_ptr(file_name_in, length(n_nonzero_features_vector_list[[k]]))
}) |> stats::setNames(modality_names)
return(row_ptr_list)
}
process_input_files_round_1 <- function(matrix_fps, modality_names, modality_start_idx_features,
feature_idx_to_vector_idx_map, n_features_per_modality) {
# 0. initialize the outputs:
# a. modality_start_idx_mtx_list
# b. n_nonzero_features_vector_list
# c. n_cells
# d. n_cells_per_batch
# e. collapse_grna_counts
modality_start_idx_mtx_list <- vector(mode = "list", length = length(matrix_fps))
n_nonzero_features_vector_list <- lapply(seq_along(modality_names), function(k) {
integer(length = n_features_per_modality[k])
}) |> stats::setNames(modality_names)
n_cells_per_batch <- integer(length = length(matrix_fps))
collapse_grna_counts <- !is.null(feature_idx_to_vector_idx_map)
cat("Round 1/2 processing of the input files.\n")
for (i in seq_along(matrix_fps)) {
cat(paste0("\tProcessing file ", i , " of ", length(matrix_fps), ".\n"))
# 1. set the matrix fp and obtain the metadata
matrix_fp <- matrix_fps[i]
mtx_metadata <- get_mtx_metadata(matrix_fp)
n_cells_per_batch[i] <- mtx_metadata$n_cells
# 2. load the feature idxs, decrement, and sort
my_col_names <- if (collapse_grna_counts) c("feature_idx", "j") else "feature_idx"
my_col_classes <- if (collapse_grna_counts) c("integer", "integer", "NULL") else c("integer", "NULL", "NULL")
dt <- data.table::fread(file = matrix_fp,
skip = mtx_metadata$n_to_skip,
col.names = my_col_names,
colClasses = my_col_classes,
showProgress = FALSE, nThread = 1)
decrement_vector(dt$feature_idx) # decrement feature idx
data.table::setkey(dt, feature_idx) # radix sort on feature_idx
# 3. determine the start idx of the different modalities; convert into a list
modality_start_mtx <- c(vapply(seq_along(modality_names), function(k) {
modality_start_idx <- modality_start_idx_features[[k]]
dt[list(modality_start_idx), which = TRUE, mult = "first", roll = -Inf] - 1L # binary search for start idxs
}, FUN.VALUE = integer(1)), nrow(dt))
modality_start_mtx <- lapply(seq_along(modality_names), function(k) {
c(modality_start_mtx[k], modality_start_mtx[k + 1L] - 1L)
}) |> stats::setNames(modality_names)
modality_start_idx_mtx_list[[i]] <- modality_start_mtx
# 4. (possibly) collapse grna counts
if (collapse_grna_counts) {
modality_start_mtx <- collapse_grna_counts(dt = dt,
feature_idx_to_vector_idx_map = feature_idx_to_vector_idx_map,
modality_start_mtx = modality_start_mtx,
round_1 = TRUE)
}
# 5. update n_nonzero_features_vector for each modality
for (k in seq_along(modality_names)) {
start_idx <- modality_start_mtx[[k]][1]
end_idx <- modality_start_mtx[[k]][2]
update_n_features_vector(feature_idx = dt$feature_idx,
n_nonzero_features_vector = n_nonzero_features_vector_list[[k]],
offset = modality_start_idx_features[[k]],
start_idx = start_idx,
end_idx = end_idx)
}
rm(dt); gc() |> invisible()
}
return(list(modality_start_idx_mtx_list = modality_start_idx_mtx_list,
n_nonzero_features_vector_list = n_nonzero_features_vector_list,
n_cells = sum(n_cells_per_batch),
n_cells_per_batch = n_cells_per_batch))
}
process_input_files_round_2 <- function(matrix_fps, file_names_in, modality_names, modality_start_idx_features,
row_ptr_list, modality_start_idx_mtx_list, mt_feature_idxs,
cellwise_covariates, feature_idx_to_vector_idx_map) {
cat("Round 2/2 processing of the input files.\n")
n_cum_cells <- 0L
n_features <- diff(modality_start_idx_features) |> stats::setNames(modality_names)
for (i in seq_along(matrix_fps)) {
cat(paste0("\tProcessing file ", i , " of ", length(matrix_fps), ". "))
# 1. set the matrix fp and obtain the metadata
matrix_fp <- matrix_fps[i]
mtx_metadata <- get_mtx_metadata(matrix_fp)
n_cells <- mtx_metadata$n_cells
# 2. load the feature idxs, decrement, and sort
dt <- data.table::fread(file = matrix_fp, skip = mtx_metadata$n_to_skip,
col.names = c("feature_idx", "j", "x"),
colClasses = c("integer", "integer", "integer"),
showProgress = FALSE, nThread = 1)
decrement_vector(dt$feature_idx) # decrement feature_idx
add_value_to_vector(dt$j, n_cum_cells - 1L)
data.table::setorderv(dt, cols = c("feature_idx", "j")) # radix sort on feature_idx, cell_idx
# 3. collapse grna counts
modality_start_mtx <- modality_start_idx_mtx_list[[i]]
if (!is.null(feature_idx_to_vector_idx_map)) {
modality_start_mtx <- collapse_grna_counts(dt = dt,
feature_idx_to_vector_idx_map = feature_idx_to_vector_idx_map,
modality_start_mtx = modality_start_mtx,
round_1 = FALSE)
}
# 4. compute the cell-wise covariates for each modality
cat("Computing cellwise covariates. ")
for (k in seq_along(modality_names)) {
start_idx <- modality_start_mtx[[k]][1]
end_idx <- modality_start_mtx[[k]][2]
feature_offset <- modality_start_idx_features[[k]]
compute_cellwise_covariates(n_umis = cellwise_covariates[[k]]$covariate_list$n_umis,
n_nonzero = cellwise_covariates[[k]]$covariate_list$n_nonzero,
p_mito = cellwise_covariates[[k]]$covariate_list$p_mito,
feature_w_max_expression = cellwise_covariates[[k]]$covariate_list$feature_w_max_expression,
frac_umis_max_feature = cellwise_covariates[[k]]$covariate_list$frac_umis_max_feature,
feature_idx = dt$feature_idx,
j = dt$j,
x = dt$x,
mt_feature_idxs = mt_feature_idxs[[k]],
start_idx = start_idx,
end_idx = end_idx,
feature_offset = feature_offset,
cell_offset = n_cum_cells,
n_cells = n_cells,
compute_n_umis = cellwise_covariates[[k]]$bool_vect[["n_umis"]],
compute_n_nonzero = cellwise_covariates[[k]]$bool_vect[["n_nonzero"]],
compute_p_mito = cellwise_covariates[[k]]$bool_vect[["p_mito"]],
compute_feature_w_max_expression = cellwise_covariates[[k]]$bool_vect[["feature_w_max_expression"]],
compute_frac_umis_max_feature = cellwise_covariates[[k]]$bool_vect[["frac_umis_max_feature"]])
}
# 5. Write the data to disk in CSR format
cat("Writing to disk.\n")
for (k in seq_along(modality_names)) {
start_idx <- modality_start_mtx[[k]][1]
end_idx <- modality_start_mtx[[k]][2]
feature_offset <- modality_start_idx_features[[k]]
file_name_in <- file_names_in[k]
write_to_csr(file_name_in = file_name_in,
start_idx = start_idx,
end_idx = end_idx,
feature_offset = feature_offset,
cell_offset = n_cum_cells,
n_features = n_features[k],
f_row_ptr = row_ptr_list[[k]],
feature_idx = dt$feature_idx,
m_j = dt$j,
m_x = dt$x)
}
# 6. increment n_cum_cells
n_cum_cells <- n_cum_cells + n_cells
rm(dt); gc() |> invisible()
}
return(NULL)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.