orthogene: Interspecies gene mapping

Documented in many2many_rows

#' Expand/aggregate rows of matrix for many:many mappings
#'
#' Expand/aggregate rows of a matrix with any combination of 
#'  many:many mappings. 
#' This method ensures that total counts per gene remain the
#' same regardless of how many genes it has split/condensed into.
#' This allows for many:many mappings that are otherwise not possible
#' using standard aggregation functions,
#' since they all require many:1 scenarios.\cr
#' Internally, this is done as follows:
#' \enumerate{
#' \item{Identify genes that appear more than once
#'  in \code{gene_map[[input_col]]}.}
#' \item{For each gene identified, split its row into multiple
#'  rows, where the number of new rows is equal to the number of times that
#'  gene appears within \code{gene_map[[input_col]]}.
#' In the new expanded matrix, each row will be equal to the column sums
#' divided by the number of new rows. This means that averaged counts will be 
#' split equally amongst the new rows, in a column-specific manner.\cr
#' Thus, the column sums of the output matrix will be equal 
#' to the column sums in the input matrix. 
#' In the case of gene expression count matrices,
#'  this means that the total counts will remain equal between matrices, 
#'  while avoiding being forced to drop genes with many:many mappings 
#'  (as is the case with most other aggregation methods). } 
#' \item{Map rownames of the expanded matrix onto the orthologous gene names
#' from \code{gene_map$ortholog_gene}.}
#' \item{[Optional] : When \code{aggregate_orthologs=TRUE}, 
#' aggregate rows of the expanded/mapped matrix 
#' such that there will only be 1 row per ortholog gene, 
#' using \link[orthogene]{aggregate_rows}. 
#' The arguments \code{FUN}, \code{method}, 
#' \code{as_sparse}, \code{as_DelayedArray}, and \code{dropNA} will all
#' be passed to \link[orthogene]{aggregate_rows} if this step is selected.}
#' } 
#' @param X Input matrix.
#' @param gene_map A \link[base]{data.frame} generated by
#'  \link[orthogene]{map_orthologs}, 
#' with columns mapping \code{input_col} to \code{output_col}. 
#' @param input_col Column name within \code{gene_map} with gene names matching 
#' the row names of \code{X}. 
#' @param output_col Column name within \code{gene_map} with gene names
#' that you wish you map the row names of \code{X} onto.
#' @param aggregate_orthologs [Optional] After performing an initial round of 
#' many:many aggregation/expansion with \link[orthogene]{many2many_rows},
#'  ensure each orthologous gene only appears in one row by using the 
#' \link[orthogene]{aggregate_rows} function (default: \code{TRUE}).
#' @inheritParams aggregate_mapped_genes
#'
#' @return Expanded/aggregated matrix.
#'
#' @keywords internal
#' @importFrom methods is
#' @importFrom Matrix Matrix colSums
#' @importFrom stats setNames
#' @source 
#' \code{
#' data("exp_mouse")
#' X <- exp_mouse 
#' gene_map <- orthogene:::map_orthologs(genes = rownames(exp_mouse),
#'                                       input_species = "mouse",
#'                                       method="homologene")
#' X_agg <- orthogene:::many2many_rows(X = X,
#'                                     gene_map = gene_map)
#' sum(duplicated(rownames(exp_mouse))) # 0                                      
#' sum(duplicated(gene_map$input_gene)) # 46
#' sum(duplicated(gene_map$ortholog_gene)) # 56
#' sum(duplicated(rownames(X_agg))) # 56
#' }
many2many_rows <- function(X,
                           gene_map,
                           input_col = "input_gene",
                           output_col = "ortholog_gene",
                           agg_fun = "sum",
                           agg_method = c("monocle3", "stats"),
                           as_sparse = TRUE,
                           as_DelayedArray = FALSE,
                           dropNA = TRUE,
                           aggregate_orthologs = TRUE,
                           verbose = TRUE) {
    messager("Mapping many:many rows.", v = verbose) 
    #### Step 1: expand/contract many:many genes ####
    groupings <- gene_map[[input_col]]
    #### Find NA genes ####
    na_genes <- find_all_nas(v = groupings)
    groupings[na_genes] <- NA 
    # #### Figure out how times each input_gene is duplicated #### 
    # Add +1 bc duplicated() doesn't count the first instance  
    dup_counts <- table(groupings[duplicated(groupings)]) + 1  
    #### Expand/aggregate rows as needed per gene ####
    ## This method ensure that total counts per gene remain the 
    ## same regardless of how many genes it's split/condensed into.
    ## This allows for many:many mappings that are otherwise not possible
    ## using standard aggregation functions, 
    ## since they all require many:1 scenarios. 
    
    #### Faster to work with sparse and then convert afterwards ####
    if(is_delayed_array(X)){
        X <- to_sparse(X, 
                       allow_delayed_array = FALSE, 
                       verbose = verbose)
    } 
    #### Select the correct function ####
    X_is_sparse <- is_sparse_matrix(X) 
    colSums_func <- if(X_is_sparse){
        function(x){Matrix::colSums(x, sparseResult=TRUE) }
    } else {
        function(x){Matrix::colSums(x) }
    } 
    
    for(g in names(dup_counts)){ 
        #### Extract all rows with that genes ####
        rows1 <- X[rownames(X)==g,,drop=FALSE]
        n1 <- nrow(rows1)
        #### Convert to a new number of rows ####
        n2 <- dup_counts[g] 
        messager(g,": converting",n1,"row(s) -->",n2,"row(s).") 
        ## Get the colsum of all rows with the gene
        rows2_sum <- Matrix::Matrix(
            colSums_func(rows1),
            nrow = 1, 
            ncol = ncol(rows1), 
            sparse = X_is_sparse
        )   
        rownames(rows2_sum) <- g
        ## Repeat the summed row n2 times, 
        ## splitting the summed values evenly across n2 rows.
        rows3 <- do.call("rbind", 
                         lapply(seq_len(n2), 
                                FUN=function(x){rows2_sum/n2}
                                )
                         ) 
        X <- rbind(X[rownames(X)!=g,],
                   rows3)
    }
    # Matrix::colSums(rows3) == Matrix::colSums(rows1) 
    # Matrix::colSums(X) == Matrix::colSums(exp_mouse)
    
    #### Rename rows with ortholog_gene names ####
    gene_dict <-  stats::setNames(gene_map[[output_col]], 
                                  gene_map[[input_col]])
    gene_dict <- gene_dict[!duplicated(gene_dict)] 
    rownames(X) <- gene_dict[rownames(X)]
    X <- X[!is.na(rownames(X)),]
    #### Step 2: Aggregate orthologs ####
    if(isTRUE(aggregate_orthologs)){
        X <- aggregate_rows(X = X, 
                            groupings = rownames(X), 
                            agg_fun = agg_fun,
                            agg_method = agg_method, 
                            as_sparse = as_sparse, 
                            as_DelayedArray = as_DelayedArray,
                            dropNA = dropNA,
                            verbose = verbose)
    } else {
        #### Convert to sparse matrix ####
        if (as_sparse) {
            X <- to_sparse(
                gene_df2 = X,
                verbose = verbose
            )
        }
        #### Convert to DelayedArray ####
        X <- as_delayed_array(
            exp = X,
            as_DelayedArray = as_DelayedArray,
            verbose = verbose
        )
    }
    return(X)
}