#' Given 2 species names from the eupathdb, make orthology tables betwixt them.
#'
#' The eupathdb provides such a tremendous wealth of information. For me
#' though, it is difficult sometimes to boil it down into just the bits of
#' comparison I want for 1 species or between 2 species. A singularly common
#' question I am asked is: "What are the most similar genes between species x
#' and y among these two arbitrary parasites?" There are lots of ways to poke
#' at this question: run BLAST/fasta36, use biomart, query the ortholog tables
#' from the eupathdb, etc. However, in all these cases, it is not trivial to
#' ask the next question: What about: a:b and b:a?
#' This function attempts to address that for the case of two eupath species
#' from the same domain. (tritrypdb/fungidb/etc.) It does however assume that
#' the sqlite package has been installed locally, if not it suggests you run the
#' make_organismdbi function in order to do that.
#'
#' One other important caveat: this function assumes queries in the format
#' 'table_column' where in this particular instance, the table is further
#' assumed to be the ortholog table.
#'
#' @param db Species name (subset) from one eupath database.
#' @param master Primary keytype to use for indexing the various tables.
#' @param query_species A list of exact species names to search for. If uncertain
#' about them, add print_speciesnames=TRUE and be ready for a big blob of
#' text. If left null, then it will pull all species.
#' @param id_column What column in the database provides the set of ortholog IDs?
#' @param org_column What column provides the species name?
#' @param group_column Ortholog group column name.
#' @param name_column Name of the gene for this group.
#' @param count_column Name of the column with the count of species represented.
#' @param print_speciesnames Dump the species names for diagnostics?
#' @param webservice Which eupathdb project to query?
#' @return A big table of orthoMCL families, the columns are:
#' \enumerate{
#' \item GID: The gene ID
#' \item ORTHOLOG_ID: The gene ID of the associated ortholog.
#' \item ORTHOLOG_SPECIES: The species of the associated ortholog.
#' \item ORTHOLOG_URL: The OrthoMCL group ID's URL.
#' \item ORTHOLOG_COUNT: The number of all genes from all species represented in
#' this group.
#' \item ORTHOLOG_GROUP: The family ID
#' \item QUERIES_IN_GROUP: How many of the query species are represented in this
#' group?
#' \item GROUP_REPRESENTATION: ORTHOLOG_COUNT / the number of possible species.
#' }
#' @author atb
#' @export
extract_eupath_orthologs <- function(db, master = "GID", query_species = NULL,
id_column = "ORTHOLOGS_GID",
org_column = "ORTHOLOGS_ORGANISM",
group_column = "ANNOT_GENE_ORTHOMCL_NAME",
name_column = "ORTHOLOGS_PRODUCT",
count_column = "ORTHOLOGS_COUNT",
print_speciesnames = FALSE,
webservice = "eupathdb") {
pkg <- NULL
if (class(db)[1] == "OrgDb") {
pkg <- db
} else if ("character" %in% class(db)) {
pkg <- load_eupath_pkg(db, webservice=webservice)
} else if ("list" %in% class(db)) {
if ("character" %in% class(db[["orgdb_name"]])) {
pkg <- load_eupath_pkg(db[["orgdb_name"]], webservice=webservice)
} else {
stop("I only understand lists which contain package names.")
}
} else {
stop("I only understand orgdbs or the name of a species.")
}
columns <- c(id_column, group_column, org_column, name_column, count_column)
columns <- toupper(columns)
gene_set <- AnnotationDbi::keys(pkg, keytype = master)
column_set <- AnnotationDbi::columns(pkg)
column_intersect <- columns %in% column_set
if (sum(column_intersect) == length(columns)) {
message("Found all the required columns!")
} else {
missing_idx <- ! columns %in% column_set
missing <- columns[missing_idx]
message("Some columns were missing: ", toString(missing))
message("Removing them, which may end badly.")
columns <- columns[column_intersect]
}
all_orthos <- AnnotationDbi::select(x = pkg, keytype = master,
keys = gene_set, columns = columns)
all_orthos[[org_column]] <- as.factor(all_orthos[[org_column]])
num_possible <- 1
species_names <- levels(all_orthos[[org_column]])
if (is.null(query_species)) {
query_species <- species_names
} else if (! query_species %in% species_names) {
warning("Did not find the desired species in the set of all species.")
query_species <- species_names
}
num_possible <- length(species_names)
message("There are ", num_possible, " possible species in this group.")
if (isTRUE(print_speciesnames)) {
print(toString(species_names))
return(invisible())
}
## Now pull out the species of interest
found_species <- 0
for (sp in query_species) {
if (sp %in% all_orthos[[org_column]]) {
message("Found species: ", sp)
} else {
message("Did not find species: ", sp)
}
}
kept_orthos_idx <- all_orthos[[org_column]] %in% query_species
kept_orthos <- all_orthos[kept_orthos_idx, ]
## The following is not possible if we used the orthologslite table.
## In fact, the orthologslite table is (I am realizing) quite a disappointment.
## I might remove that query and just force the much slower orthologs table as it
## provides much more useful information.
if (is.null(all_orthos[["ORTHOLOGS_COUNT"]])) {
kept_orthos_dt <- data.table::as.data.table(kept_orthos)
} else {
colnames(kept_orthos) <- c(master, "ORTHOLOGS_ID", "ORTHOLOGS_GROUP", "ORTHOLOGS_SPECIES",
"ORTHOLOGS_NAME", "ORTHOLOGS_COUNT")
kept_orthos[["ORTHOLOGS_COUNT"]] <- as.integer(kept_orthos[["ORTHOLOGS_COUNT"]])
GID <- NULL
kept_orthos_dt <- data.table::as.data.table(kept_orthos) %>%
dplyr::group_by(GID) %>%
dplyr::add_count(GID)
colnames(kept_orthos_dt) <- c(master, "ORTHOLOGS_ID", "ORTHOLOGS_GROUP",
"ORTHOLOGS_SPECIES", "ORTHOLOGS_NAME", "ORTHOLOGS_COUNT",
"QUERIES_IN_GROUP")
kept_orthos_dt[["ORTHOLOGS_REPRESENTATION"]] <- kept_orthos_dt[["ORTHOLOGS_COUNT"]] / num_possible
num_queries <- length(query_species)
}
return(kept_orthos_dt)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.