knitr::opts_chunk$set( message = FALSE, warning = FALSE, collapse = TRUE, comment = "#>", fig.align = "center" )
In its simplest form, a reference matrix is built by averaging expression (also includes an option to take the median) of a single cell RNA-seq expression matrix by cluster. Both log transformed or raw count matrices are supported.
library(clustifyr) library(clustifyrdata) new_ref_matrix <- average_clusters( mat = pbmc_matrix, metadata = pbmc_meta$classified, # or use metadata = pbmc_meta, cluster_col = "classified" if_log = TRUE ) head(new_ref_matrix)
seurat
objectFor further convenience, a shortcut function for generating reference matrix from seurat
object is used here.
new_ref_matrix_v2 <- seurat_ref( seurat_object = s_small, cluster_col = "res.1" ) new_ref_matrix_v3 <- seurat_ref( seurat_object = s_small3, cluster_col = "RNA_snn_res.1" ) tail(new_ref_matrix_v3)
If given additional assay_name
, output ref will include features from the designated slots (such as for CITE-seq data).
new_ref_matrix_v2 <- seurat_ref( seurat_object = s_small, cluster_col = "res.1", assay_name = c("ADT", "ADT2") ) new_ref_matrix_v3 <- seurat_ref( seurat_object = s_small3, cluster_col = "RNA_snn_res.1", assay_name = c("ADT", "ADT2") ) tail(new_ref_matrix_v3)
Bulk RNA-Seq data can be obtained from any input source; in this example we will obtain a dataset from the recount2 database. This database provides > 2000 human RNA-Seq experiments that have been processed using a consistent pipeline. We have written a wrapper function to download a count matrix from recount2
, given an SRA ID.
library(dplyr) library(purrr) library(tidyr) library(stringr) library(recount2) dl_recount <- function(sra_id) { if (!file.exists(file.path(sra_id, "rse_gene.Rdata"))) { download_study(sra_id) } load(file.path(sra_id, "rse_gene.Rdata")) # no longer need to downloaded data unlink(sra_id, recursive = TRUE) rse <- scale_counts(rse_gene) read_counts <- assay(rse, "counts") gene_ids <- rownames(read_counts) # get gene symbols, which are stored in rowData id2symbol <- data_frame( ids = rowData(rse_gene)$gene_id, symbols = rowData(rse_gene)$symbol@listData ) %>% mutate(symbols = map_chr(symbols, ~ .x[1])) # clean up metadata into a dataframe print("cleaning up meta") mdata <- colData(rse) mdata_cols <- lapply( mdata$characteristics, function(x) { str_match(x, "^([^:]+):")[, 2] } ) %>% unique() %>% unlist() mdata <- data_frame( run = mdata$run, all_data = as.list(mdata$characteristics) ) %>% mutate(out = purrr::map_chr(all_data, ~ str_c(.x, collapse = "::"))) %>% tidyr::separate( out, sep = "::", into = mdata_cols ) %>% select(-all_data) %>% mutate_at( .vars = vars(-matches("run")), .funs = function(x) str_match(x, ": (.+)")[, 2] ) # convert ids to symbols row_ids_to_symbols <- left_join(data_frame(ids = gene_ids), id2symbol, by = "ids") if (length(gene_ids) != nrow(row_ids_to_symbols)) { warning("gene id mapping to symbols produce more or less ids") } row_ids_to_symbols <- filter(row_ids_to_symbols, !is.na(symbols)) out_df <- read_counts %>% as.data.frame() %>% tibble::rownames_to_column("gene_id") %>% left_join(., row_ids_to_symbols, by = c("gene_id" = "ids")) %>% dplyr::select(-gene_id) %>% dplyr::select(symbols, everything()) %>% filter(!is.na(symbols)) out_matrix <- tidyr::gather(out_df, library, expr, -symbols) %>% group_by(symbols, library) %>% summarize(expr = sum(expr)) %>% tidyr::spread(library, expr) %>% as.data.frame() if (tibble::has_rownames(out_matrix)) { out_matrix <- tibble::remove_rownames(out_matrix) } out_matrix <- out_matrix %>% tibble::column_to_rownames("symbols") %>% as.matrix() list( read_counts = out_matrix, meta_data = mdata ) } gtex_data <- dl_recount("SRP012682") gtex_data$read_counts[1:5, 1:5] gtex_data$meta_data[1:5, ]
clustifyr
works with microarray data-derived gene expression matrix as well (see Benchmark page). An example of generating a matrix for sorted immune cell type can be found below. The matrix can be found at clustifyrdata::hema_microarray_matrix
library(tidyverse) # read series matrix for column names GSE24759 <- read_tsv("GSE24759/GSE24759_series_matrix.txt", skip = 25, n_max = 1) # read series matrix for full dataset; apply column names GSE24759 <- read_tsv("GSE24759//GSE24759_series_matrix.txt", skip = 57, col_names = colnames(GSE24759)) %>% rename(ID = `!Sample_title`) # read array metadata table, remove control probes and missing gene symbols GPL4685 <- read_tsv("/Users/rf/microarray/GSE24759/GPL4685-15513.txt", skip = 14) %>% filter(!is.na(`Gene Symbol`), `Sequence Type` != "Control sequence") %>% separate(`Gene Symbol`, into = "gene_symbol", sep = " ") %>% select(ID, gene_symbol) # join data and metadata, collapse gene symbols GSE24759 <- inner_join(GSE24759, GPL4685) %>% select(-ID) %>% group_by(gene_symbol) %>% mutate_at(.vars = vars(-gene_symbol), .funs = list(~ log2(mean(2^.)))) # convert to matrix, add rownames GSE24759_mat <- ungroup(GSE24759) %>% select(-gene_symbol) %>% as.matrix() row.names(GSE24759_mat) <- GSE24759$gene_symbol # merge samples ref_hema_microarray <- GSE24759_mat %>% t() %>% as.data.frame() %>% rownames_to_column("sample") %>% mutate(sample = str_remove(sample, ", .*")) %>% group_by(sample) %>% summarise_all(.funs = list(~ log2(mean(2^.)))) if (tibble::has_rownames(ref_hema_microarray)) { ref_hema_microarray <- tibble::remove_rownames(ref_hema_microarray) } ref_hema_microarray <- ref_hema_microarray %>% column_to_rownames("sample") %>% t()
Essentially, any data in matrix form is accepted by clustifyR
.
clustifyrdata::ref_hema_microarray[1:5, 1:5]
clustify_lists
and clustify_nudge
uses dataframe or matrix of candidate genes for classification. Example formatting are shown below.
# directly from seurat function head(pbmc_markers) # manually enter into dataframe format betam <- c("INS", "IGF2", "IAPP", "MAFA", "NPTX2") alpham <- c("GCG", "PDK4", "LOXL2", "IRX2", "GC") deltam <- c("SST", "RBP4", "HHEX", "PCSK1", "LEPR") data.frame(alpham, betam, deltam) # use matrixize_markers function to convert mm <- matrixize_markers( marker_df = pbmc_markers, remove_rp = TRUE, unique = TRUE, n = 10 ) head(mm) # get markers from ref_matrix, and then matrixize_markers cbmc_m <- matrixize_markers( marker_df = ref_marker_select(cbmc_ref), remove_rp = TRUE, unique = TRUE, n = 3 ) # parse `garnett` marker files, see function `file_marker_parse`
A few reference datasets are built into clustifyr for vignettes and testing:
pbmc_bulk_matrix
,
cbmc_ref
,
pbmc_markers
,
cbmc_m
More reference data, including tabula muris, and code used to generate them are available at https://github.com/rnabioco/clustifyrdata.
Also see list for individual downloads at https://rnabioco.github.io/clustifyr/articles/download_refs.html
```{"Installation"}
devtools::install_github("rnabioco/clustifyrdata") ```
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.