#' Expand/aggregate rows of matrix for many:many mappings
#'
#' Expand/aggregate rows of a matrix with any combination of
#' many:many mappings.
#' This method ensures that total counts per gene remain the
#' same regardless of how many genes it has split/condensed into.
#' This allows for many:many mappings that are otherwise not possible
#' using standard aggregation functions,
#' since they all require many:1 scenarios.\cr
#' Internally, this is done as follows:
#' \enumerate{
#' \item{Identify genes that appear more than once
#' in \code{gene_map[[input_col]]}.}
#' \item{For each gene identified, split its row into multiple
#' rows, where the number of new rows is equal to the number of times that
#' gene appears within \code{gene_map[[input_col]]}.
#' In the new expanded matrix, each row will be equal to the column sums
#' divided by the number of new rows. This means that averaged counts will be
#' split equally amongst the new rows, in a column-specific manner.\cr
#' Thus, the column sums of the output matrix will be equal
#' to the column sums in the input matrix.
#' In the case of gene expression count matrices,
#' this means that the total counts will remain equal between matrices,
#' while avoiding being forced to drop genes with many:many mappings
#' (as is the case with most other aggregation methods). }
#' \item{Map rownames of the expanded matrix onto the gene names
#' in \code{gene_map[[output_col]]}. Rows whose name cannot be mapped
#' (resulting in an \code{NA} row name) are removed.}
#' \item{[Optional] : When \code{aggregate_orthologs=TRUE},
#' aggregate rows of the expanded/mapped matrix
#' such that there will only be 1 row per ortholog gene,
#' using \link[orthogene]{aggregate_rows}.
#' The arguments \code{agg_fun}, \code{agg_method},
#' \code{as_sparse}, \code{as_DelayedArray}, and \code{dropNA} will all
#' be passed to \link[orthogene]{aggregate_rows} if this step is selected.
#' Otherwise, only the \code{as_sparse} and \code{as_DelayedArray}
#' conversions are applied.}
#' }
#' @param X Input matrix.
#' @param gene_map A \link[base]{data.frame} generated by
#' \link[orthogene]{map_orthologs},
#' with columns mapping \code{input_col} to \code{output_col}.
#' @param input_col Column name within \code{gene_map} with gene names matching
#' the row names of \code{X}.
#' @param output_col Column name within \code{gene_map} with gene names
#' that you wish to map the row names of \code{X} onto.
#' @param aggregate_orthologs [Optional] After performing an initial round of
#' many:many aggregation/expansion with \link[orthogene]{many2many_rows},
#' ensure each orthologous gene only appears in one row by using the
#' \link[orthogene]{aggregate_rows} function (default: \code{TRUE}).
#' @inheritParams aggregate_mapped_genes
#'
#' @return Expanded/aggregated matrix.
#'
#' @keywords internal
#' @importFrom methods is
#' @importFrom Matrix Matrix colSums
#' @importFrom stats setNames
#' @source
#' \code{
#' data("exp_mouse")
#' X <- exp_mouse
#' gene_map <- orthogene:::map_orthologs(genes = rownames(exp_mouse),
#' input_species = "mouse",
#' method="homologene")
#' X_agg <- orthogene:::many2many_rows(X = X,
#' gene_map = gene_map)
#' sum(duplicated(rownames(exp_mouse))) # 0
#' sum(duplicated(gene_map$input_gene)) # 46
#' sum(duplicated(gene_map$ortholog_gene)) # 56
#' sum(duplicated(rownames(X_agg))) # 56
#' }
many2many_rows <- function(X,
                           gene_map,
                           input_col = "input_gene",
                           output_col = "ortholog_gene",
                           agg_fun = "sum",
                           agg_method = c("monocle3", "stats"),
                           as_sparse = TRUE,
                           as_DelayedArray = FALSE,
                           dropNA = TRUE,
                           aggregate_orthologs = TRUE,
                           verbose = TRUE) {
    messager("Mapping many:many rows.", v = verbose)
    #### Step 1: expand/contract many:many genes ####
    groupings <- gene_map[[input_col]]
    #### Find NA genes ####
    na_genes <- find_all_nas(v = groupings)
    groupings[na_genes] <- NA
    #### Figure out how many times each input gene is duplicated ####
    ## Add +1 bc duplicated() doesn't count the first instance.
    ## table() omits NA by default, so NA genes never get expanded.
    dup_counts <- table(groupings[duplicated(groupings)]) + 1
    #### Expand/aggregate rows as needed per gene ####
    ## This method ensures that total counts per gene remain the
    ## same regardless of how many genes it's split/condensed into.
    ## This allows for many:many mappings that are otherwise not possible
    ## using standard aggregation functions,
    ## since they all require many:1 scenarios.
    #### Faster to work with sparse and then convert afterwards ####
    if (is_delayed_array(X)) {
        X <- to_sparse(X,
            allow_delayed_array = FALSE,
            verbose = verbose
        )
    }
    #### Select the correct function ####
    X_is_sparse <- is_sparse_matrix(X)
    colSums_func <- if (X_is_sparse) {
        function(x) {
            Matrix::colSums(x, sparseResult = TRUE)
        }
    } else {
        function(x) {
            Matrix::colSums(x)
        }
    }
    for (g in names(dup_counts)) {
        #### Extract all rows with that gene ####
        ## %in% never yields NA, so the mask is safe to index with.
        is_g <- rownames(X) %in% g
        rows1 <- X[is_g, , drop = FALSE]
        n1 <- nrow(rows1)
        #### Convert to a new number of rows ####
        ## Strip the table class/names so n2 is a plain integer scalar.
        n2 <- as.integer(dup_counts[g])
        ## Respect the caller's verbosity setting for per-gene messages.
        messager(g, ": converting", n1, "row(s) -->", n2, "row(s).",
            v = verbose
        )
        ## Get the colsum of all rows with the gene
        rows2_sum <- Matrix::Matrix(
            colSums_func(rows1),
            nrow = 1,
            ncol = ncol(rows1),
            sparse = X_is_sparse
        )
        rownames(rows2_sum) <- g
        ## Repeat the summed row n2 times,
        ## splitting the summed values evenly across n2 rows.
        ## The split row is identical for every repeat, so compute it once.
        split_row <- rows2_sum / n2
        rows3 <- do.call("rbind", rep(list(split_row), n2))
        ## drop = FALSE keeps X a matrix (with its row name) even when
        ## only a single other row remains.
        X <- rbind(
            X[!is_g, , drop = FALSE],
            rows3
        )
    }
    # Matrix::colSums(rows3) == Matrix::colSums(rows1)
    # Matrix::colSums(X) == Matrix::colSums(exp_mouse)
    #### Rename rows with output gene names ####
    ## NOTE(review): lookup by name returns the first matching entry, so all
    ## expanded copies of a gene receive the same output name. Entries whose
    ## output name repeats are also removed from the dictionary first, so
    ## input genes that only map to an already-seen output name become NA
    ## and are dropped below. Confirm this is the intended behaviour.
    gene_dict <- stats::setNames(
        gene_map[[output_col]],
        gene_map[[input_col]]
    )
    gene_dict <- gene_dict[!duplicated(gene_dict)]
    rownames(X) <- gene_dict[rownames(X)]
    ## drop = FALSE keeps X a matrix even if only one mapped row survives.
    X <- X[!is.na(rownames(X)), , drop = FALSE]
    #### Step 2: Aggregate orthologs ####
    if (isTRUE(aggregate_orthologs)) {
        X <- aggregate_rows(
            X = X,
            groupings = rownames(X),
            agg_fun = agg_fun,
            agg_method = agg_method,
            as_sparse = as_sparse,
            as_DelayedArray = as_DelayedArray,
            dropNA = dropNA,
            verbose = verbose
        )
    } else {
        #### Convert to sparse matrix ####
        if (as_sparse) {
            X <- to_sparse(
                gene_df2 = X,
                verbose = verbose
            )
        }
        #### Convert to DelayedArray ####
        X <- as_delayed_array(
            exp = X,
            as_DelayedArray = as_DelayedArray,
            verbose = verbose
        )
    }
    return(X)
}
# NOTE: the following two lines are website embed boilerplate, not R code.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.