rm(list = ls()) # Clear the environment
#options(warn=-1) # Turn off warning message globally
library(monocle3) # Load Monocle
library(tidymodels)
theme_set(theme_gray(base_size = 6))
expression_matrix = readRDS(url("http://staff.washington.edu/hpliner/data/packer_embryo_expression.rds"))
cell_metadata = readRDS(url("http://staff.washington.edu/hpliner/data/packer_embryo_colData.rds"))
gene_annotation = readRDS(url("http://staff.washington.edu/hpliner/data/packer_embryo_rowData.rds"))
cds <- new_cell_data_set(expression_matrix,
cell_metadata = cell_metadata,
gene_metadata = gene_annotation)
## Step 1: Normalize and pre-process the data
#cds <- estimate_size_factors(cds)
cds <- preprocess_cds(cds, num_dim = 50)
## Step 2: Correct for batch effects (optional)
cds <- align_cds(cds,
residual_model_formula_str = "~ bg.300.loading + bg.400.loading + bg.500.1.loading + bg.500.2.loading + bg.r17.loading + bg.b01.loading + bg.b02.loading",
alignment_group="batch")
set.seed(42)
## Step 2: Reduce the dimensionality of the data
cds <- reduce_dimension(cds, umap.fast_sgd = FALSE, cores=1)
plot_cells(cds, label_groups_by_cluster=FALSE, color_cells_by = "cell.type") + ggsave("embryo_umap_packer_cell_type.png", width=5, height=4, dpi = 600)
#plot_cells(cds, color_cells_by = "batch", label_cell_groups=FALSE) + ggsave("embryo_umap_batch.png", width=5, height=4, dpi = 600)
## Step 3: Cluster cells
#cds <- cluster_cells(cds, resolution=c(0, 1e-5, 1e-4, 1e-3, 1e-2))
cds <- cluster_cells(cds, resolution=5e-6)
plot_cells(cds, group_cells_by="partition", color_cells_by = "partition")+ ggsave("embryo_umap_partition.png", width=5, height=4, dpi = 600)
## Step 4: Learn cell trajectories
#
cds <- learn_graph(cds)
# Default is too coarse, need to set ncenter explicitly:
cds <- learn_graph(cds, learn_graph_control=list(ncenter=1000))
plot_cells(cds,
color_cells_by = "cell.type",
label_groups_by_cluster=FALSE,
label_leaves=FALSE,
label_branch_points=FALSE) + ggsave("embryo_pr_graph_packer_cell_type.png", width=5, height=4, dpi = 600)
plot_cells(cds,
color_cells_by = "embryo.time.bin",
label_cell_groups=FALSE,
label_leaves=TRUE,
label_branch_points=TRUE,
graph_label_size=1.5) + ggsave("embryo_pr_graph_by_time.png", width=5, height=4, dpi = 600)
cds = order_cells(cds)
plot_cells(cds,
color_cells_by = "pseudotime",
label_cell_groups=FALSE,
label_leaves=FALSE,
label_branch_points=FALSE,
graph_label_size=1.5) + ggsave("embryo_pr_graph_by_pseudotime.png", width=5, height=4, dpi = 600)
# a helper function to identify the root principal points:
get_earliest_principal_node <- function(cds, time_bin="130-170"){
cell_ids <- which(colData(cds)[, "embryo.time.bin"] == time_bin)
closest_vertex <-
cds@principal_graph_aux[["UMAP"]]$pr_graph_cell_proj_closest_vertex
closest_vertex <- as.matrix(closest_vertex[colnames(cds), ])
root_pr_nodes <-
igraph::V(principal_graph(cds)[["UMAP"]])$name[as.numeric(names
(which.max(table(closest_vertex[cell_ids,]))))]
root_pr_nodes
}
cds = order_cells(cds, root_pr_nodes=get_earliest_principal_node(cds))
plot_cells(cds,
color_cells_by = "pseudotime",
label_cell_groups=FALSE,
label_leaves=FALSE,
label_branch_points=FALSE,
graph_label_size=1.5) + ggsave("embryo_pr_graph_by_pseudotime_programmatically_ordered.png", width=5, height=4, dpi = 600)
#plot_cells(cds, genes=c("egl-21", "egl-1"))
#plot_genes_in_pseudotime(cds[rowData(cds)$gene_short_name %in% c("egl-21", "egl-1"),])
ciliated_genes = c("che-1",
"hlh-17",
"nhr-6",
"dmd-6",
"ceh-36",
"ham-1")
plot_cells(cds,
genes=ciliated_genes,
label_cell_groups=FALSE,
show_trajectory_graph=FALSE) + ggsave("embryo_ciliated_markers.png", width=5, height=4, dpi = 600)
#graph_test(cds_subset)
plot_percent_cells_positive(cds[rowData(cds)$gene_short_name %in% ciliated_genes,], group_cells_by="time.point") +
guides(fill=FALSE) +
theme(axis.text.x=element_text(angle=45, hjust=1))
# plot_genes_violin(cds[rowData(cds)$gene_short_name %in% ciliated_genes,], color_cells_by="cell.type") +
# guides(fill=FALSE) +
# theme(axis.text.x=element_text(angle=45, hjust=1))
### Basic differential expression:
cds_subset = cds[rowData(cds)$gene_short_name %in% ciliated_genes,]
gene_fits = fit_models(cds_subset, model_formula_str = "~embryo.time")
fit_coefs = coefficient_table(gene_fits)
write.csv(fit_coefs, "emb_terms.csv")
emb_time_terms = fit_coefs %>% filter(term == "embryo.time")
write.csv(emb_time_terms, "emb_time_terms.csv")
sig_emb_time_terms = emb_time_terms %>% filter (q_value < 0.05) %>% select(gene_short_name, term, q_value, estimate)
write.csv(sig_emb_time_terms, "emb_time_sig_terms.csv")
plot_genes_violin(cds[rowData(cds)$gene_short_name %in% c("che-1",
"nhr-6",
"dmd-6",
"ceh-36")], group_cells_by="embryo.time.bin", ncol=2) +
theme(axis.text.x=element_text(angle=45, hjust=1)) + ggsave("embryo_ciliated_markers_violin.png", width=4, height=4, dpi = 600)
### Subtracting unwanted effects:
gene_fits = fit_models(cds_subset, model_formula_str = "~embryo.time + batch")
fit_coefs = coefficient_table(gene_fits)
emb_terms = fit_coefs %>% filter(term != "(Intercept)")
write.csv(emb_terms, "emb_plus_batch_terms.csv")
### Evaluating fit
fit_evals = evaluate_fits(gene_fits)
write.csv(fit_evals, "emb_time_evals.csv")
gene_fits = fit_models(cds[rowData(cds)$gene_short_name %in% ciliated_genes,], model_formula_str = "~embryo.time + batch")
fit_coefs = coefficient_table(gene_fits)
emb_time_terms = fit_coefs %>% filter(term == "embryo.time")
emb_time_terms = emb_time_terms %>% mutate(q_value = stats::p.adjust(p_value))
emb_time_terms %>% filter (q_value < 0.05) %>% select(gene_short_name, term, q_value, estimate)
### Comparing models
time_models = fit_models(cds_subset, model_formula_str = "~embryo.time", expression_family="negbinomial")
time_batch_models = fit_models(cds_subset, model_formula_str = "~embryo.time + batch", expression_family="negbinomial")
emb_model_lr_test = compare_models(time_batch_models, time_models)
write.csv(emb_model_lr_test, "emb_model_lr_test.csv")
cds_subset = cds[rowData(cds)$gene_short_name %in% ciliated_genes,
is.finite(colData(cds)$pseudotime)]
gene_fits = fit_models(cds_subset, model_formula_str = "~splines::ns(pseudotime, df=3)", verbose=TRUE)
fit_coefs = coefficient_table(gene_fits)
emb_time_terms = fit_coefs %>% filter(grepl("pseudotime", term))
emb_time_terms = emb_time_terms %>% mutate(q_value = stats::p.adjust(p_value))
emb_time_terms %>% filter (q_value < 0.05) %>% select(gene_short_name, term, q_value, estimate)
# Principal graph test:
ciliated_cds_pr_test_res = graph_test(cds, neighbor_graph="principal_graph", cores=4)
pr_deg_ids = row.names(subset(ciliated_cds_pr_test_res, q_value < 0.05))
gene_module_df = monocle3:::find_gene_modules(cds[pr_deg_ids,], resolution=1e-3)
cell_group_df = tibble::tibble(cell=row.names(colData(cds)), cell_group=colData(cds)$cell.type)
agg_mat = aggregate_gene_expression(cds, gene_module_df, cell_group_df)
row.names(agg_mat) = stringr::str_c("Module ", row.names(agg_mat))
pheatmap::pheatmap(agg_mat,
scale="column", clustering_method="ward.D2",
fontsize=6, width=5, height=8,
file="emb_pseudotime_module_heatmap.png")
plot_cells(cds,
genes=gene_module_df,
label_cell_groups=FALSE,
show_trajectory_graph=FALSE) + ggsave("embryo_umap_selected_pseudotime_modules.png", width=5, height=4, dpi = 600)
rowData(cds)[gene_module_df %>% filter(module == 29) %>% pull(id),]
plot_cells(cds, genes=gene_module_df, color_cells_by="cell.type", show_trajectory_graph=FALSE)
plot_cells(cds, genes=c("hlh-4", "gcy-8", "dac-1", "oig-8"),
show_trajectory_graph=FALSE,
label_cell_groups=FALSE,
label_leaves=FALSE) + ggsave("embryo_umap_selected_markers.png", width=5, height=4, dpi = 600)
plot_cells(cds, show_trajectory_graph=FALSE) + ggsave("embryo_umap_cluster.png", width=5, height=4, dpi = 600)
AFD_lineage_cds = cds[,clusters(cds) %in% c(22, 28, 35)]
plot_genes_in_pseudotime(AFD_lineage_cds[rowData(AFD_lineage_cds)$gene_short_name %in% c("gcy-8", "dac-1", "oig-8"),],
color_cells_by="embryo.time.bin",
min_expr=0.5) +
ggsave("embryo_AFD_dynamic_genes.png", width=3, height=3, dpi = 600)
plot_cells(cds, genes=c("F29C4.2", "grld-1", "csn-5", "vamp-7"), label_branch_points=FALSE, label_roots=FALSE, label_leaves=FALSE)
######## Branch analysis:
cds_subset = choose_cells(cds)
subset_pr_test_res = graph_test(cds_subset, cores=4)
pr_deg_ids = row.names(subset(subset_pr_test_res, q_value < 0.05))
gene_module_df = monocle3:::find_gene_modules(cds_subset[pr_deg_ids,], resolution=1e-2)
cell_group_df = tibble::tibble(cell=row.names(colData(cds_subset)), cell_group=clusters(cds_subset)[colnames(cds_subset)])
agg_mat = aggregate_gene_expression(cds_subset, gene_module_df, cell_group_df)
gene_module_df$module <- factor(gene_module_df$module, levels = row.names(agg_mat)[module_dendro$order])
plot_cells(cds_subset,
genes=gene_module_df,
label_cell_groups=FALSE,
show_trajectory_graph=FALSE) + ggsave("embryo_umap_AFD_modules.png", width=5, height=4, dpi = 600)
subset_pr_test_res = inner_join(subset_pr_test_res, gene_module_df)
#top_diff_genes = subset_pr_test_res %>% group_by(module) %>% filter(q_value < 0.05) %>% top_n(5, morans_I)
top_AFD_genes = subset_pr_test_res %>% filter(q_value < 0.05 & module == 3) %>% top_n(3, morans_I)
top_AWC_genes = subset_pr_test_res %>% filter(q_value < 0.05 & module %in% c(11, 7)) %>% top_n(3, morans_I)
plot_cells(cds_subset,
genes=c(top_AFD_genes %>% pull(gene_short_name), top_AWC_genes %>% pull(gene_short_name)),
label_cell_groups=FALSE,
show_trajectory_graph=FALSE) + ggsave("embryo_umap_AFD_AWC_branch_genes.png", width=5, height=4, dpi = 600)
###### Making 3D trajectories:
cds_3d = reduce_dimension(cds, max_components = 3)
cds_3d = cluster_cells(cds_3d)
cds_3d = learn_graph(cds_3d)
cds_3d = order_cells(cds_3d, root_pr_nodes=get_earliest_principal_node(cds))
cds_3d_plot_obj = plot_cells_3d(cds_3d, color_cells_by="partition")
htmlwidgets::saveWidget(cds_3d_plot_obj, "emb_3d_by_partition.html")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.