#' Drop attribute to abject
#'
#' @keywords internal
#'
#'
#' @param var A tibble
#' @param name A character name of the attribute
#'
#' @noRd
#'
#' @return A tibble with an additional attribute
drop_attr <- function(var, name) {
attr(var, name) <- NULL
var
}
#' Drop all attribute to abject
#'
#' @keywords internal
#'
#'
#' @param var A tibble
#' @param name A character name of the attribute
#'
#' @noRd
#'
#' @return A tibble with an additional attribute
drop_all_attr <- function(var) {
N <- var %>% attributes() %>% names()
for (n in N) {
var <- var %>% drop_attr(n)
}
var
}
#' Get matrix from tibble
#'
#' @keywords internal
#'
#' @importFrom magrittr set_rownames
#' @importFrom rlang quo_is_null
#'
#' @param tbl A tibble
#' @param rownames A character string of the rownames
#' @param do_check A boolean
#'
#' @return A matrix
#'
#' @examples
#'
#' as_matrix(head(dplyr::select(tidybulk::counts_mini, feature, count)), rownames=feature)
#'
#' @noRd
as_matrix <- function(tbl,
rownames = NULL,
do_check = TRUE) {
rownames <- enquo(rownames)
tbl %>%
# Throw warning if data frame is not numerical beside the rownames column (if present)
when(
do_check &&
(.) %>%
# If rownames defined eliminate it from the data frame
when(!quo_is_null(rownames) ~ (.)[,-1], ~ (.)) %>%
dplyr::summarise_all(class) %>%
tidyr::gather(variable, class) %>%
pull(class) %>%
unique() %>%
`%in%`(c("numeric", "integer")) %>% not() %>% any() ~ {
warning("tidybulk says: there are NON-numerical columns, the matrix will NOT be numerical")
(.)
},
~ (.)
) %>%
as.data.frame() %>%
# Deal with rownames column if present
when(
!quo_is_null(rownames) ~ (.) %>%
magrittr::set_rownames(tbl %>% pull(!!rownames)) %>%
select(-1),
~ (.)
) %>%
# Convert to matrix
as.matrix()
}
#' @importFrom tibble as_tibble
#' @importFrom SummarizedExperiment colData
#'
#' @keywords internal
#'
#' @param .data A tidySummarizedExperiment
#'
#' @noRd
to_tib <- function(.data) {
colData(.data) %>%
as.data.frame() %>%
as_tibble(rownames = "cell")
}
# Greater than
gt <- function(a, b) {
a > b
}
# Smaller than
st <- function(a, b) {
a < b
}
# Negation
not <- function(is) {
!is
}
# Raise to the power
pow <- function(a, b) {
a^b
}
# Equals
eq <- function(a, b) {
a == b
}
prepend <- function(x, values, before = 1) {
n <- length(x)
stopifnot(before > 0 && before <= n)
if (before == 1) {
c(values, x)
}
else {
c(x[seq_len(before - 1)], values, x[before:n])
}
}
#' Add class to abject
#'
#'
#' @keywords internal
#'
#' @param var A tibble
#' @param name A character name of the attribute
#'
#' @return A tibble with an additional attribute
#'
#' @noRd
add_class <- function(var, name) {
if (!name %in% class(var)) {
class(var) <- prepend(class(var), name)
}
var
}
#' Remove class to abject
#'
#' @keywords internal
#'
#'
#' @param var A tibble
#' @param name A character name of the class
#'
#' @return A tibble with an additional attribute
#'
#' @noRd
drop_class <- function(var, name) {
class(var) <- class(var)[!class(var) %in% name]
var
}
#' get abundance long
#'
#' @keywords internal
#'
#' @importFrom magrittr "%$%"
#' @importFrom utils tail
#' @importFrom SummarizedExperiment assays
#'
#' @param .data A tidySummarizedExperiment
#' @param transcripts A character
#' @param all A boolean
#'
#'
#' @return A tidySummarizedExperiment object
#'
#'
#' @noRd
get_abundance_sc_wide <- function(.data, transcripts = NULL, all = FALSE) {
# Solve CRAN warnings
. <- NULL
# For SCE there is not filed for variable features
variable_feature <- c()
# Check if output would be too big without forcing
if (
length(variable_feature) == 0 &&
is.null(transcripts) &&
!all
) {
stop("
Your object does not contain variable feature labels,
feature argument is empty and all arguments are set to FALSE.
Either:
1. use detect_variable_features() to select variable feature
2. pass an array of feature names
3. set all=TRUE (this will output a very large object, does your computer have enough RAM?)
")
}
# Get variable features if existing
if (
length(variable_feature) > 0 &&
is.null(transcripts) &&
!all
) {
variable_genes <- variable_feature
} else {
variable_genes <- NULL
}
# Just grub last assay
assays(.data) %>%
as.list() %>%
tail(1) %>%
.[[1]] %>%
when(
variable_genes %>% is.null() %>% `!`() ~ (.)[variable_genes, , drop = FALSE],
transcripts %>% is.null() %>% `!`() ~ (.)[transcripts, , drop = FALSE],
~ stop("It is not convenient to extract all genes, you should have either variable features or feature list to extract")
) %>%
as.matrix() %>%
t() %>%
as_tibble(rownames = "cell")
}
#' get abundance long
#'
#' @keywords internal
#'
#' @importFrom magrittr "%$%"
#' @importFrom tidyr pivot_longer
#' @importFrom tibble as_tibble
#' @importFrom purrr when
#' @importFrom purrr map2
#' @importFrom SummarizedExperiment assays
#'
#' @param .data A tidySummarizedExperiment
#' @param transcripts A character
#' @param all A boolean
#' @param exclude_zeros A boolean
#'
#' @return A tidySummarizedExperiment object
#'
#'
#' @noRd
get_abundance_sc_long <- function(.data, transcripts = NULL, all = FALSE,
exclude_zeros = FALSE) {
# Solve CRAN warnings
. <- NULL
# For SCE there is not filed for variable features
variable_feature <- c()
# Check if output would be too big without forcing
if (
length(variable_feature) == 0 &&
is.null(transcripts) &&
!all
) {
stop("
Your object does not contain variable feature labels,
feature argument is empty and all arguments are set to FALSE.
Either:
1. use detect_variable_features() to select variable feature
2. pass an array of feature names
3. set all=TRUE (this will output a very large object, does your computer have enough RAM?)
")
}
# Get variable features if existing
if (
length(variable_feature) > 0 &&
is.null(transcripts) &&
!all
) {
variable_genes <- variable_feature
} else {
variable_genes <- NULL
}
assay_names <- assays(.data) %>% names()
assays(.data) %>%
as.list() %>%
# Take active assay
map2(
assay_names,
~ .x %>%
when(
variable_genes %>% is.null() %>% `!`() ~ .x[variable_genes, , drop = FALSE],
transcripts %>% is.null() %>% `!`() ~ .x[toupper(rownames(.x)) %in% toupper(transcripts), , drop=FALSE],
all ~ .x,
~ stop("It is not convenient to extract all genes, you should have either variable features or feature list to extract")
) %>%
# Replace 0 with NA
when(exclude_zeros ~ (.) %>% {
x <- (.)
x[x == 0] <- NA
x
}, ~ (.)) %>%
as.matrix() %>%
data.frame() %>%
as_tibble(rownames=f_(.data)$name) %>%
tidyr::pivot_longer(
cols = -!!f_(.data)$symbol,
names_to = "cell",
values_to = "abundance" %>% paste(.y, sep = "_"),
values_drop_na = TRUE
)
# %>%
# mutate_if(is.character, as.factor) %>%
) %>%
Reduce(function(...) left_join(..., by = c(f_(.data)$name, "cell")), .)
}
#' @importFrom methods .hasSlot
#' @importFrom S4Vectors DataFrame
#' @importFrom SummarizedExperiment colData
#' @importFrom SummarizedExperiment rowData
#' @importFrom SummarizedExperiment colData<-
#' @importFrom SummarizedExperiment rowData<-
#' @importFrom S4Vectors head
#' @importFrom rlang .data
#'
#' @keywords internal
#'
#' @param .data_mutated A tibble
#' @param se A tidySummarizedExperiment
#'
#' @noRd
update_SE_from_tibble <- function(.data_mutated, se, column_belonging = NULL) {
# Comply to CRAN notes
. <- NULL
# Get the colnames of samples and feature datasets
colnames_col <-
colnames(colData(se)) %>%
c(s_(se)$name) %>%
# Forcefully add the column I know the source. This is useful in nesting
# where a unique value cannot be linked to sample or feature
c(names(column_belonging[column_belonging == s_(se)$name]))
colnames_row <- se %>%
when(
.hasSlot(., "rowData") | .hasSlot(., "elementMetadata") ~ colnames(rowData(.)),
TRUE ~ c()
) %>%
c(f_(se)$name) %>%
# Forcefully add the column I know the source. This is useful in nesting
# where a unique value cannot be linked to sample or feature
c(names(column_belonging[column_belonging == f_(se)$name]))
special_columns <- get_special_columns(
# Decrease the size of the dataset
se[1:min(100, nrow(se)), min(1, ncol(se)):min(20, ncol(se))]
)
# This is used if I have one column with one value that can be mapped to rows and columns
new_colnames_col = c()
# This condiction is because if I don't have any samples, the new column
# could be mapped to samples and return NA in the final SE
if(ncol(se) > 0)
new_colnames_col <-
.data_mutated %>%
select_if(!colnames(.) %in% setdiff(colnames_col, s_(se)$name)) %>%
# Eliminate special columns that are read only. Assays
select_if(!colnames(.) %in% special_columns) %>%
select_if(!colnames(.) %in% colnames_row) %>%
# Replace for subset
select(!!s_(se)$symbol, get_subset_columns(., !!s_(se)$symbol)) %>%
colnames()
col_data <-
.data_mutated %>%
select(c(colnames_col, new_colnames_col)) %>%
# Filter Sample is NA from SE that have 0 samples
filter(!is.na(!!s_(se)$symbol))
# This works even if I have 0 samples
duplicated_samples = col_data |> pull(!!s_(se)$symbol) %>% duplicated()
if(duplicated_samples |> which() |> length() > 0)
# Make fast distinct()
col_data = col_data |> filter(!duplicated_samples)
col_data =
col_data |>
# In case unitary SE subset does not work
data.frame(row.names = pull(col_data, !!s_(se)$symbol), check.names = FALSE) %>%
select(-!!s_(se)$symbol) %>%
DataFrame(check.names = FALSE)
# This to avoid the mismatch between missing column names for counts
# and numerical row names for colData
row_names_col =
col_data %>%
rownames() %>%
when(
colnames(se) %>% is.null ~ as.integer(.),
~ (.)
)
row_data <-
.data_mutated %>%
# Eliminate special columns that are read only
select_if(!colnames(.) %in% special_columns) %>%
#eliminate sample columns directly
select_if(!colnames(.) %in% c(s_(se)$name, colnames(col_data))) %>%
# select(one_of(colnames(rowData(se))))
# Replace for subset
select(!!f_(se)$symbol, get_subset_columns(., !!f_(se)$symbol) ) %>%
# Make fast distinct()
filter(pull(., !!f_(se)$symbol) %>% duplicated() %>% not()) %>%
# In case unitary SE subset does not work because all same
data.frame(row.names = pull(.,f_(se)$symbol), check.names = FALSE) %>%
select(-!!f_(se)$symbol) %>%
DataFrame(check.names = FALSE)
# This to avoid the mismatch between missing row names for counts
# and numerical row names for rowData
row_names_row <-
row_data %>%
rownames() %>%
when(
rownames(se) %>% is.null ~ as.integer(.),
~ (.)
)
# Subset if needed. This function is used by many dplyr utilities
se <- se[row_names_row, row_names_col]
# Update
colData(se) <- col_data
rowData(se) <- row_data
# Count-like data that is NOT in the assay slot already
colnames_assay <-
colnames(.data_mutated) %>%
setdiff(c(f_(se)$name, s_(se)$name, colnames(as.data.frame(head(rowRanges(se), 1))) )) %>%
setdiff(colnames(col_data)) %>%
setdiff(colnames(row_data)) %>%
setdiff(assays(se) %>% names)
if (length(colnames_assay) > 0)
assays(se) = #, withDimnames=FALSE) =
assays(se, withDimnames = FALSE) %>% c(
.data_mutated %>%
# Select assays column
select(!!f_(se)$symbol, !!s_(se)$symbol, colnames_assay) %>%
# Pivot for generalising to many assays
pivot_longer(cols = -c(!!s_(se)$symbol, !!f_(se)$symbol)) %>%
nest(data___ = c(!!s_(se)$symbol, !!f_(se)$symbol, value)) %>%
# Convert to matrix and to named list
mutate(data___ = map2(
data___, name,
~ {
.x =
.x %>%
spread(!!s_(se)$symbol, value) %>%
as_matrix(rownames = !!f_(se)$symbol) %>%
suppressWarnings()
# Rearrange if assays has colnames and rownames
if(!is.null(rownames(se)) & !is.null(rownames(.x))) .x = .x[rownames(se),,drop=FALSE]
if(!is.null(colnames(se)) & !is.null(colnames(.x))) .x = .x[,colnames(se),drop=FALSE]
.x %>%
list() %>%
setNames(.y)
}
)) %>%
# Create correct list
pull(data___) %>%
reduce(c)
)
# return
se
}
#' @importFrom methods is
slice_optimised <- function(.data, ..., .preserve=FALSE) {
. <- NULL
# This simulated tibble only gets samples and features so we know those that have been completely omitted already
# In order to save time for the as_tibble conversion
simulated_slice =
simulate_feature_sample_from_tibble(.data) %>%
dplyr::slice(..., .preserve = .preserve)
.data =
.data %>%
# Subset the object for samples and features present in the simulated data
.[rownames(.) %in% simulated_slice[,f_(.)$name],
colnames(.) %in% simulated_slice[,s_(.)$name]] %>%
inner_join(simulated_slice, by = c(f_(.)$name, s_(.)$name))
# If order do not match with the one proposed by slice convert to tibble
if (.data %>% is("tbl") %>% not()) {
.data_for_match <- .data %>% select(!!f_(.data)$symbol, !!s_(.data)$symbol) %>% as_tibble()
x <- c(pull(.data_for_match, !!f_(.data)$symbol), pull(.data_for_match, !!s_(.data)$symbol))
y <- c(pull(simulated_slice, !!f_(.data)$symbol), pull(simulated_slice, !!s_(.data)$symbol))
if (identical(x, y) %>% not()) {
left_join(simulated_slice, as_tibble(.data), by = c(f_(.data)$name, s_(.data)$name))
} else {
.data
}
}
}
#' @importFrom purrr map_chr
#' @importFrom dplyr select_if
#'
#' @keywords internal
#'
#' @param se A tidySummarizedExperiment
#'
#' @noRd
#'
get_special_columns <- function(.data) {
colnames_special <-
get_special_datasets(.data) %>%
# In case any of those have feature of sample in column names
map(
~ .x %>%
select_if(!colnames(.) %in% get_needed_columns(.data)) %>%
colnames()
) %>%
unlist() %>%
as.character()
colnames_counts <-
get_count_datasets(.data) %>%
select(-!!f_(.data)$symbol, -!!s_(.data)$symbol) %>%
colnames()
colnames_special %>% c(colnames_counts)
}
#' @importFrom dplyr select
#' @importFrom tidyselect one_of
#' @importFrom tibble as_tibble
#' @importFrom tibble tibble
#' @importFrom SummarizedExperiment rowRanges
#' @importFrom tibble rowid_to_column
#'
#' @noRd
get_special_datasets <- function(se) {
rr = se %>%
rowRanges()
rr %>%
when(
# If no ranges
as.data.frame(.) %>%
nrow() %>%
equals(0) ~ tibble(),
# If it is a range list (multiple rows per feature)
is(., "CompressedGRangesList") ~ {
# If GRanges does not have row names
if (is.null(rr@partitioning@NAMES)) {
rr@partitioning@NAMES <- as.character(1:nrow(se))
}
tibble::as_tibble(rr) %>%
eliminate_GRanges_metadata_columns_also_present_in_Rowdata(se) %>%
nest(GRangesList = -group_name) %>%
rename(!!f_(se)$symbol := group_name)
},
# If standard GRanges (one feature per line)
~ {
# If GRanges does not have row names
if (is.null(rr@ranges@NAMES)) {
rr@ranges@NAMES <- as.character(1:nrow(se))
}
tibble::as_tibble(rr) %>%
eliminate_GRanges_metadata_columns_also_present_in_Rowdata(se) %>%
mutate(!!f_(se)$symbol := rr@ranges@NAMES)
}
) %>%
list()
}
check_se_dimnames <- function(se) {
# Stop if any column or row names are duplicated
if (check_if_any_dimnames_duplicated(se, dim = "cols")) {
stop("tidySummarizedExperiment says: some column names are duplicated")
}
if (check_if_any_dimnames_duplicated(se, dim = "rows")) {
stop("tidySummarizedExperiment says: some row names are duplicated")
}
# Stop if column names of assays do not overlap, or if some assays have
# column names and others don't
if (check_if_assays_are_NOT_overlapped(se, dim = "cols")) {
warning(
"tidySummarizedExperiment says: at least one of the assays in your SummarizedExperiment have column names, but they don't completely overlap between assays. It is strongly recommended to make the assays consistent, to avoid erroneous matching of samples."
)
}
# Same for row names
if (check_if_assays_are_NOT_overlapped(se, dim = "rows")) {
warning(
"tidySummarizedExperiment says: at least one of the assays in your SummarizedExperiment have row names, but they don't completely overlap between assays. It is strongly recommended to make the assays consistent, to avoid erroneous matching of features."
)
}
# If the assays have dimnames but the SE does not, throw a warning and set
# the dimnames of the SE to those of the first assay with dimnames.
# (At this point we know that all assays have the same dimnames (could be
# NULL), but they could be in different order)
if (is.null(colnames(se)) &&
length(assays(se)) > 0) {
cn <- vapply(assays(se, withDimnames = FALSE), function(x) !is.null(colnames(x)), FALSE)
if (any(cn)) {
idx <- which(cn)[1]
warning(
"tidySummarizedExperiment says: the assays in your SummarizedExperiment have column names, but the SummarizedExperiment does not. Setting colnames(se) to column names of first assay with column names (assay ", idx, ")."
)
colnames(se) <- colnames(assays(se, withDimnames = FALSE)[[idx]])
}
}
if (is.null(rownames(se)) &&
length(assays(se)) > 0) {
rn <- vapply(assays(se, withDimnames = FALSE), function(x) !is.null(rownames(x)), FALSE)
if (any(rn)) {
idx <- which(rn)[1]
warning(
"tidySummarizedExperiment says: the assays in your SummarizedExperiment have row names, but the SummarizedExperiment does not. Setting rownames(se) to row names of first assay with row names (assay ", idx, ")."
)
rownames(se) <- rownames(assays(se, withDimnames = FALSE)[[idx]])
}
}
# If the assays as well as the SE have dimnames, but they don't overlap
# (they may be in different order), throw an error.
if (!is.null(colnames(se)) &&
length(assays(se)) > 0 &&
!is.null(colnames(assays(se, withDimnames = FALSE)[[1]])) &&
!all(colnames(assays(se, withDimnames = FALSE)[[1]]) %in% colnames(se))) {
warning(
"tidySummarizedExperiment says: the assays in your SummarizedExperiment have column names, but they don't agree with the column names of the SummarizedExperiment object itself. It is strongly recommended to make the assays consistent, to avoid erroneous matching of samples."
)
}
if (is.null(rownames(se)) &&
length(assays(se)) > 0) {
rn <- vapply(assays(se, withDimnames = FALSE), function(x) !is.null(rownames(x)), FALSE)
if (any(rn)) {
idx <- which(rn)[1]
warning(
"tidySummarizedExperiment says: the assays in your SummarizedExperiment have row names, but the SummarizedExperiment does not. Setting rownames(se) to row names of first assay with row names (assay ", idx, ")."
)
rownames(se) <- rownames(assays(se, withDimnames = FALSE)[[idx]])
}
}
# If the assays as well as the SE have dimnames, but they don't overlap
# (they may be in different order), throw an error.
if (!is.null(colnames(se)) &&
length(assays(se)) > 0 &&
!is.null(colnames(assays(se, withDimnames = FALSE)[[1]])) &&
!all(colnames(assays(se, withDimnames = FALSE)[[1]]) %in% colnames(se))) {
warning(
"tidySummarizedExperiment says: the assays in your SummarizedExperiment have column names, but they don't agree with the column names of the SummarizedExperiment object itself. It is strongly recommended to make the assays consistent, to avoid erroneous matching of samples."
)
}
if (!is.null(rownames(se)) &&
length(assays(se)) > 0 &&
!is.null(rownames(assays(se, withDimnames = FALSE)[[1]])) &&
!all(rownames(assays(se, withDimnames = FALSE)[[1]]) %in% rownames(se))) {
warning(
"tidySummarizedExperiment says: the assays in your SummarizedExperiment have row names, but they don't agree with the row names of the SummarizedExperiment object itself. It is strongly recommended to make the assays consistent, to avoid erroneous matching of features."
)
}
se
}
#' @importFrom tidyr gather
#' @importFrom dplyr rename
#' @importFrom dplyr left_join
#' @importFrom tibble as_tibble
#' @importFrom purrr reduce
#' @importFrom SummarizedExperiment assays
#' @importFrom magrittr equals
#'
#' @noRd
get_count_datasets <- function(se) {
# Check that dimnames are consistent
se <- check_se_dimnames(se)
# Join assays
list_assays =
map2(
assays(se, withDimnames = FALSE) %>% as.list(),
names(assays(se)),
~ {
# If the counts are in a sparse matrix convert to a matrix
# This might happen because the user loaded tidySummarizedExperiment and is
# print a SingleCellExperiment
if (is(.x, "dgCMatrix") | is(.x, "DelayedArray")) {
.x <- as.matrix(.x)
}
# Rearrange if assays has colnames and rownames
if (!is.null(rownames(se)) && !is.null(rownames(.x)) &&
all(rownames(se) %in% rownames(.x))) {
.x = .x[rownames(se), , drop = FALSE]
}
if (!is.null(colnames(se)) && !is.null(colnames(.x)) &&
all(colnames(se) %in% colnames(.x))) {
.x = .x[, colnames(se), drop = FALSE]
}
# If I don't have assay colnames and rownames add them
if (!is.null(rownames(se)) && is.null(rownames(.x))) rownames(.x) = rownames(se)
if (!is.null(colnames(se)) && is.null(colnames(.x))) colnames(.x) = colnames(se)
.x =
.x %>%
# matrix() %>%
# as.data.frame() %>%
# In case I have a sparse matrix
as.matrix() |>
tibble::as_tibble(rownames = f_(se)$name, .name_repair = "minimal") %>%
# If the matrix does not have sample names, fix column names
when(colnames(.x) %>% is.null() & is.null(colnames(se)) ~ setNames(., c(
f_(se)$name, seq_len(ncol(.x))
)),
~ (.)
)
# Avoid dug if SE if completely empty, no rows, no columns
if(.x |> select(-!!f_(se)$symbol) |> ncol() == 0) return(.x)
.x |>
pivot_longer(names_to = s_(se)$name, values_to = .y, cols = -!!f_(se)$symbol, cols_vary = "slowest")
#%>%
# rename(!!.y := count)
}) %>%
# Add dummy sample or feature if we have empty assay.
# This is needed for a correct visualisation of the tibble form
map(~when(
f_(se)$name %in% colnames(.x) %>% not ~ mutate(.x, !!f_(se)$symbol := as.character(NA)),
s_(se)$name %in% colnames(.x) %>% not ~ mutate(.x, !!s_(se)$symbol := as.character(NA)),
~ .x
))
# If assays is non empty
if(list_assays |> length() > 0)
list_assays |>
reduce(full_join, by = c(f_(se)$name, s_(se)$name))
# If assays is empty
else {
# If I don't have row column names
if(se |> rownames() |> is.null()) rn = nrow(se) |> seq_len() |> as.character()
else rn = rownames(se)
if(se |> colnames() |> is.null()) cn = ncol(se) |> seq_len() |> as.character()
else cn = colnames(se)
expand.grid( rn, cn ) |>
setNames(c(f_(se)$name, s_(se)$name)) |>
tibble::as_tibble()
}
}
get_needed_columns <- function(.data) {
c(f_(.data)$name, s_(.data)$name)
}
#' Convert array of quosure (e.g. c(col_a, col_b)) into character vector
#'
#' @keywords internal
#'
#' @importFrom rlang quo_name
#' @importFrom rlang quo_squash
#'
#' @param v A array of quosures (e.g. c(col_a, col_b))
#'
#' @return A character vector
#'
#' @noRd
quo_names <- function(v) {
v <- quo_name(quo_squash(v))
gsub("^c\\(|`|\\)$", "", v) %>%
strsplit(", ") %>%
unlist()
}
#' @importFrom purrr when
#' @importFrom dplyr select
#' @importFrom rlang expr
#' @importFrom tidyselect eval_select
#'
#' @noRd
select_helper <- function(.data, ...) {
loc <- tidyselect::eval_select(expr(c(...)), .data)
dplyr::select(.data, loc)
}
outersect <- function(x, y) {
sort(c(
setdiff(x, y),
setdiff(y, x)
))
}
#' @importFrom dplyr distinct_at
#' @importFrom dplyr vars
#' @importFrom purrr map
#' @importFrom magrittr equals
#'
#' @noRd
get_subset_columns <- function(.data, .col) {
# Comply with CRAN NOTES
. <- NULL
# Make col names
.col <- enquo(.col)
# x-annotation df
n_x <- .data %>%
distinct_at(vars(!!.col)) %>%
nrow()
# element wise columns
.data %>%
select(-!!.col) %>%
colnames() %>%
map(
~
.x %>%
when(
.data %>%
distinct_at(vars(!!.col, .x)) %>%
nrow() %>%
equals(n_x) ~ (.),
~NULL
)
) %>%
# Drop NULL
{
(.)[lengths((.)) != 0]
} %>%
unlist()
}
#' @importFrom purrr map_int
#' @importFrom purrr map
is_split_by_transcript <- function(.my_data) {
se <- .my_data[[1]]
tot_length <- .my_data %>% map(~ pull(.x, !!f_(se)$symbol) ) %>% unlist %>% unique() %>% length
all_lengths <- .my_data %>% map_int(~ pull(.x, !!f_(se)$symbol) %>% unique() %>% length)
all_lengths %>% unique %>% length() %>% gt(1) |
(all_lengths != tot_length) %>% any()
}
is_split_by_sample <- function(.my_data) {
se <- .my_data[[1]]
tot_length <- .my_data %>% map(~ pull(.x, !!s_(se)$symbol) ) %>% unlist %>% unique() %>% length
all_lengths <- .my_data %>% map_int(~ pull(.x, !!s_(se)$symbol) %>% unique() %>% length)
all_lengths %>% unique %>% length() %>% gt(1) |
(all_lengths != tot_length) %>% any()
}
get_GRanges_colnames <- function() {
"GenomicRanges"
}
eliminate_GRanges_metadata_columns_also_present_in_Rowdata <- function(.my_data, se) {
.my_data %>%
select(-one_of(colnames(rowData(se)))) %>%
# In case there is not metadata column
suppressWarnings()
}
subset_tibble_output <- function(.data, count_info, sample_info, gene_info, range_info, .subset) {
# This function outputs a tibble after subsetting the columns
.subset <- enquo(.subset)
# Build template of the output
output_colnames <-
slice(count_info, 0) %>%
left_join(slice(sample_info, 0), by = s_(.data)$name) %>%
left_join(slice(gene_info, 0), by = f_(.data)$name) %>%
when(nrow(range_info) > 0 ~ (.) %>% left_join(range_info, by = f_(.data)$name), ~ (.)) %>%
select(!!.subset) %>%
colnames()
# Sample table
sample_info <-
sample_info %>%
when(
colnames(.) %>% intersect(output_colnames) %>% length() %>% equals(0) ~ NULL,
select(., one_of(s_(.data)$name, output_colnames)) %>%
suppressWarnings()
)
# Ranges table
range_info <-
range_info %>%
when(
colnames(.) %>% intersect(output_colnames) %>% length() %>% equals(0) ~ NULL,
select(., one_of(f_(.data)$name, output_colnames)) %>%
suppressWarnings()
)
# Ranges table
gene_info <-
gene_info %>%
when(
colnames(.) %>% intersect(output_colnames) %>% length() %>% equals(0) ~ NULL,
select(., one_of(f_(.data)$name, output_colnames)) %>%
suppressWarnings()
)
# Ranges table
count_info <-
count_info %>%
when(
colnames(.) %>% intersect(output_colnames) %>% length() %>% equals(0) ~ NULL,
select(., one_of(f_(.data)$name, s_(.data)$name, output_colnames)) %>%
suppressWarnings()
)
if (
!is.null(count_info) &
(
!is.null(sample_info) & !is.null(gene_info) |
# Make exception for weirs cases (e.g. c(sample, counts))
(colnames(count_info) %>% outersect(c(f_(.data)$name, s_(.data)$name)) %>%
length() %>% gt(0))
)
) {
output_df <-
count_info %>%
when(!is.null(sample_info) ~ (.) %>% left_join(sample_info, by=s_(.data)$name), ~ (.)) %>%
when(!is.null(gene_info) ~ (.) %>% left_join(gene_info, by=f_(.data)$name), ~ (.)) %>%
when(!is.null(range_info) ~ (.) %>% left_join(range_info, by=f_(.data)$name), ~ (.))
} else if (!is.null(sample_info) ) {
output_df <- sample_info
} else if (!is.null(gene_info)) {
output_df <- gene_info %>%
# If present join GRanges
when(!is.null(range_info) ~ (.) %>% left_join(range_info, by=f_(.data)$name), ~ (.))
}
output_df %>%
# Cleanup
select(one_of(output_colnames)) %>%
suppressWarnings()
}
#' @importFrom stringr str_replace
change_reserved_column_names <- function(col_data, .data) {
# Fix NOTEs
. = NULL
col_data %>%
setNames(
colnames(.) %>%
sapply(function(x) if (x == f_(.data)$name) sprintf("%s.x", f_(.data)$name) else x) %>%
sapply(function(x) if (x == s_(.data)$name) sprintf("%s.x", s_(.data)$name) else x) %>%
str_replace("^coordinate$", "coordinate.x")
)
}
choose_name_if_present <- function(x) {
columns_query <- c()
for (i in 1:length(x)) {
if (is.null(names(x[i]))) columns_query[i] <- x[i]
else columns_query[i] <- names(x[i])
}
columns_query
}
#' @importFrom purrr when
join_efficient_for_SE <- function(x, y, by = NULL, copy = FALSE,
suffix = c(".x", ".y"), join_function,
force_tibble_route = FALSE,
...) {
# Comply to CRAN notes
. <- NULL
# Deprecation of special column names
if (is_sample_feature_deprecated_used(x, when(by, !is.null(.) ~ by, ~ colnames(y)))) {
x <- ping_old_special_column_into_metadata(x)
}
# Get the colnames of samples and feature datasets
colnames_col <- get_colnames_col(x)
colnames_row <- get_rownames_col(x)
# See if join done by sample, feature or both
columns_query <- by %>% when(
!is.null(.) ~ choose_name_if_present(.),
~ colnames(y) %>% intersect(c(colnames_col, colnames_row))
)
if (
# Complex join that it is not efficient yet
(any(columns_query %in% colnames_row) & any(columns_query %in% colnames_col)) |
# If join is with something else, the inefficient generic solution might work,
# or delegate the proper error downstream
(!any(columns_query %in% colnames_row) & !any(columns_query %in% colnames_col)) |
# Needed for internal recurrence if outcome is not valid
force_tibble_route) {
# If I have a big dataset
if (ncol(x) > 100) message("tidySummarizedExperiment says: if you are joining a dataframe both sample-wise and feature-wise, for efficiency (until further development), it is better to separate your joins and join datasets sample-wise OR feature-wise.")
x %>%
as_tibble(skip_GRanges = TRUE) %>%
join_function(y, by = by, copy = copy, suffix = suffix, ...) %>%
when(
# If duplicated sample-feature pair returns tibble
!is_not_duplicated(., x) | !is_rectangular(., x) ~ {
message(duplicated_cell_names)
message(data_frame_returned_message)
(.)
},
# Otherwise return updated tidySummarizedExperiment
~ update_SE_from_tibble(., x)
)
}
# Join only feature-wise
else if (any(columns_query %in% colnames_row) & !any(columns_query %in% colnames_col)) {
row_data_tibble <-
rowData(x) %>%
as_tibble(rownames = f_(x)$name) %>%
join_function(y, by = by, copy = copy, suffix = suffix, ...)
# Check if the result is not SE then take the tibble route
if (
is.na(pull(row_data_tibble, !!f_(x)$symbol)) %>% any |
duplicated(pull(row_data_tibble, !!f_(x)$symbol)) %>% any |
pull(row_data_tibble, !!f_(x)$symbol) %>% setdiff(rownames(colData(x))) %>% length() %>% gt(0)
) return(join_efficient_for_SE(x, y, by = by, copy = copy, suffix = suffix,
join_function, force_tibble_route = TRUE, ...))
row_data <-
row_data_tibble %>%
data.frame(row.names = pull(., !!f_(x)$symbol)) %>%
select(-!!f_(x)$symbol) %>%
DataFrame()
# Subset in case of an inner join, or a right join
x <- x[rownames(row_data),]
# Tranfer annotation
rowData(x) <- row_data
# Return
x
}
# Join only sample-wise
else if (any(columns_query %in% colnames_col) & !any(columns_query %in% colnames_row)) {
col_data_tibble <-
colData(x) %>%
as_tibble(rownames = s_(x)$name) %>%
join_function(y, by = by, copy = copy, suffix = suffix, ...)
# Check if the result is not SE then take the tibble route
if (
is.na(pull(col_data_tibble, !!s_(x)$symbol)) %>% any |
duplicated(pull(col_data_tibble, !!s_(x)$symbol)) %>% any |
pull(col_data_tibble, !!s_(x)$symbol) %>% setdiff(rownames(colData(x))) %>% length() %>% gt(0)
) return(join_efficient_for_SE(x, y, by = by, copy = copy, suffix = suffix,
join_function, force_tibble_route = TRUE, ...))
col_data <-
col_data_tibble %>%
data.frame(row.names = pull(., !!s_(x)$symbol)) %>%
select(-!!s_(x)$symbol) %>%
DataFrame()
# Subset in case of an inner join, or a right join
x <- x[,rownames(col_data)]
# Transfer annotation
colData(x) <- col_data
# Return
x
}
else stop("tidySummarizedExperiment says: ERROR FOR DEVELOPERS: this option should not exist. In join utility.")
}
get_ellipse_colnames <- function(...) {
(enquos(..., .ignore_empty = "all") %>% map(~ quo_name(.x)) %>% unlist)
}
get_colnames_col <- function(x) {
colnames(colData(x)) %>%
c(s_(x)$name)
}
get_rownames_col <- function(x) {
x %>%
when(
.hasSlot(., "rowData") | .hasSlot(., "elementMetadata") ~ colnames(rowData(.)),
TRUE ~ c()
) %>%
c(f_(x)$name)
}
# This function is used for the change of special sample column to .sample
# Check if "sample" is included in the query and is not part of any other existing annotation
#' @importFrom stringr str_detect
#' @importFrom stringr regex
is_sample_feature_deprecated_used <- function(.data, user_columns, use_old_special_names = FALSE) {
old_standard_is_used_for_sample <-
(
( any(str_detect(user_columns, regex("\\bsample\\b"))) &
!any(str_detect(user_columns, regex("\\W*(\\.sample)\\W*"))) ) |
"sample" %in% user_columns
) &
!"sample" %in% c(colnames(rowData(.data)), colnames(colData(.data)))
old_standard_is_used_for_feature <-
(
( any(str_detect(user_columns, regex("\\bfeature\\b"))) &
!any(str_detect(user_columns, regex("\\W*(\\.feature)\\W*"))) ) |
"feature" %in% user_columns
) &
!"feature" %in% c(colnames(rowData(.data)), colnames(colData(.data)))
old_standard_is_used <- old_standard_is_used_for_sample | old_standard_is_used_for_feature
if (old_standard_is_used) {
warning("tidySummarizedExperiment says: from version 1.3.1, the special columns including sample/feature id (colnames(se), rownames(se)) has changed to \".sample\" and \".feature\". This dataset is returned with the old-style vocabulary (feature and sample), however we suggest to update your workflow to reflect the new vocabulary (.feature, .sample)")
use_old_special_names <- TRUE
}
use_old_special_names
}
data_frame_returned_message <- "tidySummarizedExperiment says: A data frame is returned for independent data analysis."
duplicated_cell_names <- "tidySummarizedExperiment says: This operation lead to duplicated feature names. A data frame is returned for independent data analysis."
# Key column names
#' @importFrom S4Vectors metadata
#' @importFrom S4Vectors metadata<-
ping_old_special_column_into_metadata <- function(.data) {
metadata(.data)$feature__ <- get_special_column_name_symbol("feature")
metadata(.data)$sample__ <- get_special_column_name_symbol("sample")
.data
}
get_special_column_name_symbol <- function(name) {
list(name = name, symbol = as.symbol(name))
}
# This function produce artificially the feature ans sample column,
# to make optimisation before as_tibble is called
# for big datasets
simulate_feature_sample_from_tibble <- function(.data) {
. <- NULL
r <- rownames(.data) %>% .[rep(1:length(.), ncol(.data) )]
c <- colnames(.data) %>% .[rep(1:length(.), each = nrow(.data) )]
tibble(!!f_(.data)$symbol := r, !!s_(.data)$symbol := c)
}
feature__ <- get_special_column_name_symbol(".feature")
sample__ <- get_special_column_name_symbol(".sample")
#' @importFrom S4Vectors metadata
f_ <- function(x) {
# Check if old deprecated columns are used
if ("feature__" %in% names(metadata(x))) feature__ = metadata(x)$feature__
return(feature__)
}
#' @importFrom S4Vectors metadata
s_ <- function(x) {
if ("sample__" %in% names(metadata(x))) sample__ = metadata(x)$sample__
return(sample__)
}
split_SummarizedExperiment_by_feature_to_list <- function(.data) {
if (nrow(.data) > 1000)
message("tidySummarizedExperiment says: grouping a SummarizedExperiment by feature takes 1 minute for ~ 10,000 features.")
map(1:nrow(.data), ~ .data[.x,])
}
#' Add attribute to abject
#'
#' @keywords internal
#' @noRd
#'
#'
#' @param var A tibble
#' @param attribute An object
#' @param name A character name of the attribute
#'
#' @return A tibble with an additional attribute
add_attr <- function(var, attribute, name) {
attr(var, name) <- attribute
var
}
is_filer_columns_in_column_selection <- function(.data, ...) {
# columns = enquos(columns)
tryCatch({
.data |>
slice(0) |>
dplyr::filter(...)
TRUE
},
error = function(e) FALSE)
}
check_if_assays_are_NOT_consistently_ordered <- function(se) {
# If I have any assay at all
assays(se) |> length() |> gt(0) &&
# If I have more than one assay with colnames
Filter(
Negate(is.null),
assays(se, withDimnames = FALSE) |>
as.list() |>
map(colnames)
) |>
length() |>
gt(0) &&
# If I have lack of consistency
se |>
assays(withDimnames = FALSE) |>
as.list() |>
purrr::map_dfr(colnames) |>
apply(1, function(x) x |> unique() |> length()) |>
equals(1) |>
all() |>
not()
}
check_if_any_dimnames_duplicated <- function(se, dim = "cols") {
stopifnot(dim %in% c("rows", "cols"))
if (dim == "rows") {
dimnames_function <- rownames
nbr_unique_dimnames_function <- function(x) length(unique(rownames(x)))
length_function <- nrow
} else {
dimnames_function <- colnames
nbr_unique_dimnames_function <- function(x) length(unique(colnames(x)))
length_function <- ncol
}
# Check assays
# If I have any assay at all
assays_check <- assays(se) |> length() |> gt(0) &&
# If I have at least one assay with dimnames
Filter(
Negate(is.null),
assays(se, withDimnames = FALSE) |>
as.list() |>
map(dimnames_function)
) |>
length() |>
gt(0) &&
# If any named assay have fewer unique names than expected
assays(se, withDimnames = FALSE) |>
as.list() |>
map(dimnames_function) |>
Filter(Negate(is.null), x = _) |>
map(unique) |>
map(length) |>
reduce(min) |>
equals(length_function(se)) |>
not()
# Check SE object
se_check <- !is.null(dimnames_function(se)) &&
nbr_unique_dimnames_function(se) != length_function(se)
# Return TRUE if either of the two checks return TRUE
assays_check || se_check
}
check_if_assays_are_NOT_overlapped <- function(se, dim = "cols") {
stopifnot(dim %in% c("rows", "cols"))
if (dim == "rows") {
dimnames_function <- rownames
length_function <- nrow
} else {
dimnames_function <- colnames
length_function <- ncol
}
is_identical_for_reduce <- function(x,y) if (identical(x,y)) x else FALSE
# If I have any assay at all
assays(se) |> length() |> gt(0) &&
# If I have at least one assay with dimnames
Filter(
Negate(is.null),
assays(se, withDimnames = FALSE) |>
as.list() |>
map(dimnames_function)
) |>
length() |>
gt(0) &&
# If I have lack of consistency
# This will be TRUE also if some assays have dimnames and other don't
# For each assay, sort the dimnames, then check that they are all the
# same. Can't check for unique length, since some names may be repeated
# If they're not all the same, the reduce() step will return FALSE;
# otherwise, returns the (shared) dimnames
assays(se, withDimnames = FALSE) |>
as.list() |>
map(dimnames_function) |>
map(sort) |>
reduce(is_identical_for_reduce) |>
is.logical()
}
order_assays_internally_to_be_consistent <- function(se) {
se <- check_se_dimnames(se)
se |>
assays(withDimnames = FALSE) =
map2(
assays(se, withDimnames = FALSE) %>% as.list(),
names(assays(se)),
~ {
if (!is.null(rownames(se)) && !is.null(rownames(.x)) &&
all(rownames(se) %in% rownames(.x))) {
.x = .x[rownames(se), , drop = FALSE]
}
if (!is.null(colnames(se)) && !is.null(colnames(.x)) &&
all(colnames(se) %in% colnames(.x))) {
.x = .x[, colnames(se), drop = FALSE]
}
.x
})
se
}
#' @importFrom SummarizedExperiment cbind
reduce_cbind_se <- function(se_list){
do.call(cbind, se_list)
}
#' @importFrom purrr reduce
#' @importFrom purrr map
#' @importFrom SummarizedExperiment rbind
#' @importFrom SummarizedExperiment elementMetadata
#' @importFrom SummarizedExperiment elementMetadata<-
reduce_rbind_se <- function(se_list){
# rbind does not accept elementMetadata so I merge and take it off
element_metadata = se_list %>% map(elementMetadata) |> reduce(rbind)
# Drop elementMetadata
se_list = se_list |> map(~{
elementMetadata(.x) = NULL
.x
})
# Bind
se = do.call(rbind, se_list)
rm(se_list)
# Put elementMetadata back in - THE (safe) ASSUMPTION IS THAT THE ORDER DOES NOT CHANGE
elementMetadata(se) = element_metadata
# Return
se
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.