R/homologene.R

Defines functions homologene_organisms homologene_uniprot_orthology homologene_download homologene_raw

Documented in homologene_download homologene_organisms homologene_raw homologene_uniprot_orthology

#!/usr/bin/env Rscript

#
#  This file is part of the `OmnipathR` R package
#
#  Copyright
#  2018-2024
#  Saez Lab, Uniklinik RWTH Aachen, Heidelberg University
#
#  File author(s): Alberto Valdeolivas
#                  Dénes Türei (turei.denes@gmail.com)
#                  Attila Gábor
#
#  Distributed under the MIT (Expat) License.
#  See accompanying file `LICENSE` or find a copy at
#      https://directory.fsf.org/wiki/License:Expat
#
#  Website: https://r.omnipathdb.org/
#  Git repo: https://github.com/saezlab/OmnipathR
#


#' Orthology data from NCBI HomoloGene
#'
#' Retrieves NCBI HomoloGene data without any processing. Processed tables
#' are more useful for most purposes, see below other functions that provide
#' those. Genes of various organisms are grouped into homology groups
#' ("hgroup" column). Organisms are identified by NCBI Taxonomy IDs, genes
#' are identified by four different identifier types.
#'
#' @return A data frame as provided by NCBI HomoloGene.
#'
#' @examples
#' hg <- homologene_raw()
#' hg
#' # # A tibble: 275,237 × 6
#' #    hgroup ncbi_taxid entrez  genesymbol  gi        refseqp
#' #     <int>      <int> <chr>   <chr>       <chr>     <chr>
#' #  1      3       9606 34      ACADM       4557231   NP_000007.1
#' #  2      3       9598 469356  ACADM       160961497 NP_001104286.1
#' #  3      3       9544 705168  ACADM       109008502 XP_001101274.1
#' #  4      3       9615 490207  ACADM       545503811 XP_005622188.1
#' #  5      3       9913 505968  ACADM       115497690 NP_001068703.1
#' # # . with 275,232 more rows
#'
#' # which organisms are available?
#' common_name(unique(hg$ncbi_taxid))
#' #  [1] "Human" "Chimpanzee" "Macaque" "Dog" "Cow" "Mouse" "Rat" "Zebrafish"
#' #  [9] "D. melanogaster" "Caenorhabditis elegans (PRJNA13758)"
#' # [11] "Tropical clawed frog" "Chicken"
#' # ...and 9 more organisms with missing English names.
#'
#' @importFrom magrittr %>% %T>%
#' @importFrom readr cols col_integer col_character
#' @export
#' @seealso \itemize{
#'     \item{\code{\link{homologene_download}}}
#' }
homologene_raw <- function(){

    # Hands a small placeholder value to the slow-doctest helper; what the
    # helper does with it is defined elsewhere in the package.
    .slow_doctest(value = list(ncbi_taxid = 'OmnipathR: no data'))

    # Explicit types are given for four of the six columns only: the two
    # keys are integers, while `entrez` and `gi` are kept as character.
    # `genesymbol` and `refseqp` are left to the reader's defaults.
    column_types <- cols(
        hgroup = col_integer(),
        ncbi_taxid = col_integer(),
        entrez = col_character(),
        gi = col_character()
    )

    # Column names are supplied here, in this order.
    reader_args <- list(
        col_names = c(
            'hgroup',
            'ncbi_taxid',
            'entrez',
            'genesymbol',
            'gi',
            'refseqp'
        ),
        col_types = column_types
    )

    # The tee pipe hands the downloaded table to `load_success()` and then
    # returns the table itself, not the value of `load_success()`.
    generic_downloader(
        'homologene',
        reader_param = reader_args,
        resource = 'NCBI HomoloGene'
    ) %T>%
    load_success()

}


#' Orthology table for a pair of organisms
#'
#' Orthologous pairs of genes for a pair of organisms from NCBI HomoloGene,
#' using one identifier type.
#'
#' @param target Character or integer: name or ID of the target organism.
#' @param source Character or integer: name or ID of the source organism.
#' @param id_type Symbol or character: identifier type, possible values are
#'     "genesymbol", "entrez", "refseqp" or "gi".
#' @param hgroup_size Logical: include a column with the size of the homology
#'     groups. This column distinguishes one-to-one and one-to-many or
#'     many-to-many mappings.
#'
#' @details
#' The operation of this function is symmetric, *source* and *target* are
#' interchangeable but determine the column layout of the output. The column
#' "hgroup" is a numeric identifier of the homology groups. Most of the
#' groups consist of one pair of orthologous genes (one-to-one mapping), and
#' a few of them multiple ones (one-to-many or many-to-many mappings).
#'
#' @return A data frame with orthologous identifiers between the two organisms.
#'
#' @examples
#' chimp_human <- homologene_download(chimpanzee, human, refseqp)
#' chimp_human
#' # # A tibble: 17,737 × 3
#' #    hgroup refseqp_source refseqp_target
#' #     <int> <chr>          <chr>
#' #  1      3 NP_000007.1    NP_001104286.1
#' #  2      5 NP_000009.1    XP_003315394.1
#' #  3      6 NP_000010.1    XP_508738.2
#' #  4      7 NP_001096.1    XP_001145316.1
#' #  5      9 NP_000014.1    XP_523792.2
#' # # . with 17,732 more rows
#'
#' @importFrom rlang !! enquo sym
#' @importFrom magrittr %>%
#' @importFrom dplyr inner_join filter select group_by mutate n ungroup
#' @export
#' @seealso \itemize{
#'     \item{\code{\link{homologene_raw}}}
#'     \item{\code{\link{homologene_uniprot_orthology}}}
#' }
homologene_download <- function(
    target = 10090L,
    source = 9606L,
    id_type = 'genesymbol',
    hgroup_size = FALSE
){

    .slow_doctest()

    # NSE vs. R CMD check workaround
    hgroup <- NULL

    # Normalise both organisms through `ncbi_taxid()`; the results are
    # compared against the `ncbi_taxid` column below.
    source <- source %>% ncbi_taxid()
    target <- target %>% ncbi_taxid()

    # `id_type` may arrive as a bare symbol or a string; from here on it is
    # a string, and `id_col` is the matching column symbol.
    id_type <- .nse_ensure_str(!!enquo(id_type))
    id_col <- sym(id_type)

    raw <- homologene_raw()

    # One slice per organism, each keeping only the homology group and the
    # requested identifier column. Joining by group pairs up the genes; the
    # identically named identifier columns receive the two suffixes.
    orthologs <- inner_join(
        raw %>% filter(ncbi_taxid == source) %>% select(hgroup, !!id_col),
        raw %>% filter(ncbi_taxid == target) %>% select(hgroup, !!id_col),
        by = 'hgroup',
        suffix = c('_source', '_target')
    )

    if(hgroup_size){

        # Number of joined rows within each homology group.
        orthologs <-
            orthologs %>%
            group_by(hgroup) %>%
            mutate(hgroup_size = n()) %>%
            ungroup()

    }

    orthologs

}


#' Orthology table with UniProt IDs
#'
#' Orthologous pairs of UniProt IDs for a pair of organisms, based on NCBI
#' HomoloGene data.
#'
#' @param target Character or integer: name or ID of the target organism.
#' @param source Character or integer: name or ID of the source organism.
#' @param by Symbol or character: the identifier type in NCBI HomoloGene
#'     to use. Possible values are "refseqp", "entrez", "genesymbol", "gi".
#' @param ... Further arguments passed to \code{\link{translate_ids}}.
#'
#' @return A data frame with orthologous pairs of UniProt IDs.
#'
#' @examples
#' homologene_uniprot_orthology(by = genesymbol)
#' # # A tibble: 14,235 × 2
#' #    source target
#' #    <chr>  <chr>
#' #  1 P11310 P45952
#' #  2 P49748 P50544
#' #  3 P24752 Q8QZT1
#' #  4 Q04771 P37172
#' #  5 Q16586 P82350
#' # # . with 14,230 more rows
#'
#' @importFrom rlang !! enquo sym :=
#' @importFrom magrittr %>%
#' @importFrom dplyr select filter distinct
#' @export
homologene_uniprot_orthology <- function(
    target = 10090L,
    source = 9606L,
    by = entrez,
    ...
){

    .slow_doctest()

    # NSE vs. R CMD check workaround
    entrez <- uniprot <- NULL

    # `by` may be a bare symbol or a string; it is a string afterwards.
    by <- .nse_ensure_str(!!enquo(by))
    log_trace('HomoloGene: translating to UniProt by `%s`.', by)
    source <- source %>% ncbi_taxid()
    target <- target %>% ncbi_taxid()

    # Symbols for the identifier type and for the two suffixed identifier
    # columns that `homologene_download()` creates by joining with the
    # suffixes `_source` and `_target`.
    by_sym <- sym(by)
    source_id_col <- sym(sprintf('%s_source', by))
    target_id_col <- sym(sprintf('%s_target', by))

    orthologs <- homologene_download(
        target = target,
        source = source,
        id_type = !!by_sym
    )

    # Two `translate_ids()` passes, one per organism. Each is given the
    # suffixed identifier column paired with the `by` identifier type and
    # requests `uniprot` under the name `source` or `target`; `...` is
    # forwarded to both calls. Only the two resulting columns are kept, and
    # rows where either one is missing are dropped.
    orthologs %>%
    translate_ids(
        !!source_id_col := !!by_sym,
        source = uniprot,
        organism = source,
        ...
    ) %>%
    translate_ids(
        !!target_id_col := !!by_sym,
        target = uniprot,
        organism = target,
        ...
    ) %>%
    select(source, target) %>%
    filter(!(is.na(source) | is.na(target))) %>%
    distinct()

}


#' Organisms in NCBI HomoloGene
#'
#' @param name_type Character: type of the returned name or identifier.
#'     Many synonyms are accepted, the shortest ones: "latin", "ncbi",
#'     "common", "ensembl". Case insensitive.
#'
#' @details Not all NCBI Taxonomy IDs can be translated to common or
#'     latin names. It means some organisms will be missing if translated
#'     to those name types. In the future we will address this issue, until
#'     then if you want to see all organisms use NCBI Taxonomy IDs.
#'
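#' @return A vector of organisms: integer NCBI Taxonomy IDs when
#'     \code{name_type} is "ncbi" (the default), otherwise a character
#'     vector of organism names.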
#'
#' @importFrom magrittr %>%
#' @importFrom dplyr pull arrange
#' @importFrom tibble tibble
#' @importFrom rlang !! enquo
#' @importFrom purrr map_chr
#' @export
homologene_organisms <- function(name_type = 'ncbi'){

    # `name_type` may be a bare symbol or a string; resolve it to a string
    # once and use only the string below.
    name_type_s <- .nse_ensure_str(!!enquo(name_type))

    # Distinct NCBI Taxonomy IDs present in the raw HomoloGene table.
    taxids <-
        homologene_raw() %>%
        pull(ncbi_taxid) %>%
        unique

    if(name_type_s == 'ncbi'){

        # The IDs are already the requested identifier type.
        taxids

    }else{

        # The IDs must be passed to `map_chr` explicitly: the previous
        # version called `map_chr(taxon_name, ...)` inside a magrittr brace
        # block, where the piped value is not inserted as first argument,
        # so it mapped over the function itself and failed.
        #
        # `do.call` puts the resolved string into the call as a constant,
        # so `taxon_name` receives the same value whether or not it quotes
        # its `name_type` argument.
        do.call(
            map_chr,
            list(taxids, taxon_name, name_type = name_type_s)
        )

    }

}
saezlab/OmnipathR documentation built on Oct. 16, 2024, 11:49 a.m.