Nothing
context("Testing outlier processing and sashimi plot functions")
# Load data ---------------------------------------------------------------
# use Genomic state to load txdb (GENCODE v31)
ref <- GenomicState::GenomicStateHub(version = "31", genome = "hg38", filetype = "TxDb")[[1]]
junctions_processed_path <- file.path(tempdir(), "junctions_processed.rda")
if (file.exists(junctions_processed_path)) {
load(junctions_processed_path)
} else {
junctions_processed <-
junction_process(
junctions_example,
count_thresh = NULL,
n_samp = NULL,
ref,
sd_const = 0.02
)
save(junctions_processed,
file = junctions_processed_path
)
}
# filter junctions to save time
junctions <- junctions_processed
# simulate random coverage scores to save time
coverage_scores <- matrix(
data = sample(-100:100, dim(junctions)[1] * dim(junctions)[2], replace = TRUE),
nrow = dim(junctions)[1],
ncol = dim(junctions)[2]
)
colnames(coverage_scores) <- dimnames(junctions)[[2]]
assays(junctions)[["coverage_score"]] <- coverage_scores
# Detecting outliers using an isolation forest ----------------------------
##### outlier_detect #####
junctions_w_outlier_scores <- outlier_detect(junctions,
feature_names = c("score", "coverage_score"),
random_state = 32L
)
up_indexes <- which(assays(junctions)[["direction"]][, 1] == 1)
outlier_up <- .outlier_score(
features =
data.frame(
scores = assays(junctions)[["score"]][, 1][up_indexes],
coverage_scores = assays(junctions)[["coverage_score"]][, 1][up_indexes]
),
random_state = 32L
)
down_indexes <- which(assays(junctions)[["direction"]][, 1] == -1)
outlier_down <- .outlier_score(
features =
data.frame(
scores = assays(junctions)[["score"]][, 1][down_indexes],
coverage_scores = assays(junctions)[["coverage_score"]][, 1][down_indexes]
),
random_state = 32L
)
test_that("outlier_detect has the correct output", {
# if junctions have been reordered either one of the direction
# or scoress of up/down would not match
expect_identical(
assays(junctions_w_outlier_scores)[["direction"]],
assays(junctions)[["direction"]]
)
expect_identical(
outlier_up %>% as.numeric(),
assays(junctions_w_outlier_scores)[["outlier_score"]][, 1][up_indexes]
)
expect_identical(
outlier_down %>% as.numeric(),
assays(junctions_w_outlier_scores)[["outlier_score"]][, 1][down_indexes]
)
})
test_that("outlier_detect catches user-input errors", {
expect_error(
outlier_detect(junctions, feature_names = c("not_an_assay")),
"Assays does not contain the following: "
)
assays(junctions)[["direction"]] <- NULL
expect_error(
outlier_detect(junctions),
"junctions must contain a 'direction' assay"
)
})
# Aggregating outlier scores to cluster-level -----------------------------
##### .outlier_wrangle #####
outlier_scores_samp <- .outlier_wrangle(junctions_w_outlier_scores, samp_id_col = "samp_id")
test_that(".outlier_wrangle has the correct output", {
expect_true(is(outlier_scores_samp, "list"))
expect_true(all(lapply(outlier_scores_samp, nrow) == length(junctions_w_outlier_scores)))
expect_identical(
outlier_scores_samp[[1]][["direction"]],
assays(junctions_w_outlier_scores)[["direction"]][, 1]
)
expect_identical(
outlier_scores_samp[[2]][["direction"]],
assays(junctions_w_outlier_scores)[["direction"]][, 2]
)
expect_identical(
outlier_scores_samp[[1]][["outlier_score"]],
assays(junctions_w_outlier_scores)[["outlier_score"]][, 1]
)
expect_identical(
outlier_scores_samp[[2]][["outlier_score"]],
assays(junctions_w_outlier_scores)[["outlier_score"]][, 2]
)
})
##### .outlier_cluster #####
# Generate key of cluster indexes vs junction indexes
clusters <- SummarizedExperiment::rowData(junctions)[["clusters"]]
names(clusters) <- seq_len(dim(junctions)[1])
clusters <- unlist(clusters)
clusters <- dplyr::tibble(
cluster_index = names(clusters),
junction_index = unname(clusters)
)
outlier_scores_samp <- BiocParallel::bplapply(outlier_scores_samp,
FUN = .outlier_cluster,
clusters = clusters
)
outlier_cluster_check <- function(junctions_w_outlier_scores, outlier_scores_samp) {
clusters <- unlist(SummarizedExperiment::rowData(junctions_w_outlier_scores)[["clusters"]])
junctions_by_cluster <- junctions_w_outlier_scores[unname(clusters)]
check <- TRUE
for (i in seq_along(outlier_scores_samp)) {
outlier_clusters <-
lapply(assays(junctions_by_cluster), FUN = function(x) x[, i]) %>%
as.data.frame() %>%
dplyr::mutate(cluster_index = names(clusters))
outlier_clusters <- outlier_clusters %>%
dplyr::group_by(cluster_index, direction) %>%
dplyr::filter(
!duplicated(outlier_score),
outlier_score == min(outlier_score)
) %>%
dplyr::group_by(cluster_index) %>%
dplyr::filter(dplyr::n() != 1)
outlier_clusters <- outlier_clusters %>%
dplyr::group_by(cluster_index) %>%
dplyr::summarise(mean_outlier_score = mean(outlier_score))
check <- all(check, all(outlier_scores_samp[[i]][["cluster_index"]] %in%
outlier_clusters[["cluster_index"]]))
check <- all(check, identical(
sort(unique(outlier_scores_samp[[i]][["mean_outlier_score"]])),
sort(unique(outlier_clusters[["mean_outlier_score"]]))
))
}
return(check)
}
test_that(".outlier_cluster has the correct output", {
expect_true(is(outlier_scores_samp, "list"))
expect_true(all(unlist(lapply(outlier_scores_samp[[1]], nrow)) == 2))
expect_true(all(unlist(lapply(outlier_scores_samp[[2]], nrow)) == 2))
expect_true(outlier_cluster_check(
junctions_w_outlier_scores,
outlier_scores_samp
))
})
##### .outlier_cluster_annot & .outlier_cluster_tidy #####
outlier_scores_samp <- BiocParallel::bplapply(outlier_scores_samp,
FUN = .outlier_cluster_annot,
BPPARAM = BiocParallel::SerialParam()
)
outlier_scores_tidy <- .outlier_cluster_tidy(outlier_scores_samp)
test_that(".outlier_cluster_tidy has the correct output", {
expect_true(is(outlier_scores_tidy, "DataFrame"))
expect_true(is(outlier_scores_tidy[["gene_id_cluster"]], "CharacterList"))
expect_true(is(outlier_scores_tidy[["junctions"]], "list"))
expect_true(all(colData(junctions_w_outlier_scores)[["samp_id"]] %in%
unique(outlier_scores_tidy[["samp_id"]])))
})
##### outlier_aggregate #####
test_that("outlier_aggregate catches user input errors", {
expect_error(
outlier_aggregate(junctions_example),
"junctions rowData must contain 'clusters'. Have you run junction_norm?"
)
expect_error(
outlier_aggregate(junctions),
"junctions must contain both 'direction' and 'outlier_score' assays"
)
SummarizedExperiment::rowData(junctions_w_outlier_scores)[["clusters"]][[1]] <-
length(junctions_w_outlier_scores) + 1
expect_error(
outlier_aggregate(junctions_w_outlier_scores),
"Not all cluster indexes match junctions. Have you filtered junctions after running junction_norm?"
)
})
# Sashimi plot functions --------------------------------------------------
# set one of the samples as control for testing plotting of controls
SummarizedExperiment::colData(junctions_processed)[["case_control"]] <- "control"
junctions_processed <-
junctions_processed %>%
junction_filter(count_thresh = c("raw" = 3))
##### .gene_tx_type_get #####
test_that(".gene_tx_type_get has the correct output", {
expect_equal(
.gene_tx_type_get("ENSG_example"),
list(gene_id = "ENSG_example")
)
expect_equal(
.gene_tx_type_get("ENST_example"),
list(tx_name = "ENST_example")
)
})
test_that(".gene_tx_type_get catches user-input errors", {
expect_error(
.gene_tx_type_get(NULL),
"gene_tx_id must be set and be of length 1"
)
expect_error(
.gene_tx_type_get(c("ENSG_example", "ENSG_example")),
"gene_tx_id must be set and be of length 1"
)
# for now, dasper does not accept gene symbols
expect_error(
.gene_tx_type_get(c("SNCA")),
"gene_tx_id does not include an ENST or ENSG prefix"
)
})
##### .exons_to_plot_get #####
# pick the gene with the most number of junctions
gene_id_to_plot <-
SummarizedExperiment::rowData(junctions_processed)[["gene_id_junction"]] %>%
unlist() %>%
table() %>%
sort() %>%
tail(1) %>%
names()
gene_tx_list <- .gene_tx_type_get(gene_id_to_plot)
exons_to_plot <- .exons_to_plot_get(
ref = ref,
gene_tx_list = gene_tx_list,
region = NULL
)
# take half the gene as the region of interest
region <- GRanges(
paste0(
seqnames(exons_to_plot) %>% as.character() %>% unique(),
":",
start(exons_to_plot) %>% min(),
"-",
mean(c(min(start(exons_to_plot)), max(end(exons_to_plot))))
)
)
exons_to_plot <- .exons_to_plot_get(
ref = ref,
gene_tx_list = gene_tx_list,
region = region
)
test_that(".exons_to_plot_get has the correct output", {
expect_equal(
.exons_to_plot_get(ref,
.gene_tx_type_get(gene_id_to_plot),
region = NULL
),
.gene_tx_type_get(gene_id_to_plot) %>%
GenomicFeatures::exons(ref, filter = .) %>%
GenomicRanges::disjoin()
)
expect_equal(
findOverlaps(exons_to_plot, region) %>% length(),
length(exons_to_plot)
)
})
test_that(".junctions_to_plot_get catches user input errors", {
expect_error(
.exons_to_plot_get(ref, gene_tx_list,
region = GRanges(c("chr22:1-1", "chr22:1-1"))
),
"region must be a GenomicRanges object of length 1"
)
expect_error(
.exons_to_plot_get(ref, gene_tx_list,
region = data.frame()
),
"region must be a GenomicRanges object of length 1"
)
expect_error(
.exons_to_plot_get(ref, list(gene_id = "ENSG_not_a_real_gene"),
region = NULL
),
"No exons found to plot"
)
expect_error(
.exons_to_plot_get(ref, gene_tx_list,
region = GRanges("chr22:1-1")
),
"No exons found to plot"
)
})
##### .junctions_to_plot_get #####
# select a transcript of interest
# pick the gene with the most number of junctions
tx_name_to_plot <-
SummarizedExperiment::rowData(junctions_processed)[["tx_name_start"]] %>%
unlist() %>%
table() %>%
sort() %>%
tail(1) %>%
names()
junctions_to_plot <- .junctions_to_plot_get(junctions_processed, gene_tx_list, region)
test_that("junctions_to_plot has the correct output", {
expect_true(all(any(mcols(junctions_to_plot)[["gene_id_start"]] == gene_tx_list) |
any(mcols(junctions_to_plot)[["gene_id_end"]] == gene_tx_list)))
expect_equal(
findOverlaps(junctions_to_plot, region) %>% length(),
length(junctions_to_plot)
)
junctions_to_plot_2 <- .junctions_to_plot_get(junctions_processed, list(tx_name = tx_name_to_plot), region = NULL)
expect_true(all(any(mcols(junctions_to_plot_2)[["tx_name_start"]] == tx_name_to_plot) |
any(mcols(junctions_to_plot_2)[["tx_name_end"]] == tx_name_to_plot)))
})
test_that(".junctions_to_plot_get catches user input errors", {
expect_error(
.junctions_to_plot_get(junctions,
list(gene_id = "ENSG_not_a_real_gene"),
region = NULL
),
"No junctions found to plot"
)
})
##### .coords_to_plot_get #####
gene_tx_to_plot <- GenomicFeatures::genes(ref, filter = gene_tx_list)
coords_to_plot <- .coords_to_plot_get(gene_tx_to_plot, exons_to_plot, junctions_to_plot, ext_factor = 20)
test_that("coords_to_plot has the correct output", {
expect_equal(
coords_to_plot[["chr"]],
seqnames(exons_to_plot) %>%
as.character() %>% unique()
)
expect_equal(
coords_to_plot[["strand"]],
strand(exons_to_plot) %>%
as.character() %>% unique()
)
min_start <- min(c(start(exons_to_plot), start(junctions_to_plot)))
max_end <- max(c(end(exons_to_plot), end(junctions_to_plot)))
expect_equal(
coords_to_plot[["x_min"]],
min_start - (max_end - min_start) / 20
)
expect_equal(
coords_to_plot[["x_max"]],
max_end + (max_end - min_start) / 20
)
})
##### .plot_gene_track #####
gene_track <- .plot_gene_track(coords_to_plot, exons_to_plot)
test_that("coords_to_plot has the correct output", {
expect_true(is(gene_track, "ggplot"))
})
##### .junctions_counts_type_get #####
junctions_to_plot_1 <- .junctions_counts_type_get(junctions_to_plot,
case_id = list(samp_id = "samp_1"),
sum_func = mean,
digits = 3,
assay_name = "norm"
)
junctions_to_plot_2 <- .junctions_counts_type_get(junctions_to_plot,
case_id = list(samp_id = c("samp_1", "samp_2")),
sum_func = NULL,
digits = 0,
assay_name = "raw"
)
test_that("coords_to_plot has the correct output", {
expect_true(is(junctions_to_plot_1, "data.frame"))
expect_true(is(junctions_to_plot_2, "data.frame"))
expect_true(length(junctions_to_plot_1) > 1)
expect_true(length(junctions_to_plot_2) > 1)
expect_true(all(c("samp_1", "control") %in% colnames(junctions_to_plot_1)))
expect_true(all(c("samp_1", "samp_2") %in% colnames(junctions_to_plot_2)))
expect_false("control" %in% colnames(junctions_to_plot_2))
check_count_val <- function(junctions_to_plot, samp_ids, range) {
counts <- junctions_to_plot[, samp_ids] %>%
unlist()
all(counts >= range[1] & counts <= range[2])
}
expect_true(check_count_val(
junctions_to_plot_1,
c("samp_1", "control"),
c(0, 1)
))
expect_false(check_count_val(
junctions_to_plot_2,
c("samp_1", "samp_2"),
c(0, 1)
))
})
##### plot_sashimi #####
sashimi1 <- plot_sashimi(
junctions = junctions_processed,
ref = ref,
gene_tx_id = gene_id_to_plot,
region = region,
count_label = FALSE
)
sashimi2 <- plot_sashimi(
junctions = junctions_processed,
ref = ref,
gene_tx_id = tx_name_to_plot,
case_id = list(samp_id = c("samp_1")),
sum_func = mean,
count_label = TRUE,
digits = 2
)
test_that("coords_to_plot has the correct output", {
expect_true(is(sashimi1, "ggplot"))
expect_true(is(sashimi2, "ggplot"))
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.