### =========================================================================
### fetch_table_dump_from_Ensembl_FTP()
### -------------------------------------------------------------------------
###
### Nothing in this file is exported.

### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Ensembl Core schema (only for the tables we care about)
###
### List of Ensembl public MySQL servers/ports:
###   https://www.ensembl.org/info/data/mysql.html
### Ensembl Core Schema:
###   https://www.ensembl.org/info/docs/api/core/core_schema.html
###

### Fundamental Tables
### 'attrib_type' table.
### (331 rows in homo_sapiens_core_99_38)
.ENSEMBLDB_ATTRIB_TYPE_COLUMNS <- c(
    "attrib_type_id",       # primary key
    "code",
    "name",
    "description"
)

### Assembly Tables
### 'seq_region' table.
### (268443 rows in homo_sapiens_core_99_38)
.ENSEMBLDB_SEQ_REGION_COLUMNS <- c(
    "seq_region_id",        # primary key
    "name",
    "coord_system_id",      # ==> coord_system.coord_system_id
    "length"
)

### 'coord_system' table.
### (9 rows in homo_sapiens_core_99_38)
.ENSEMBLDB_COORD_SYSTEM_COLUMNS <- c(
    "coord_system_id",      # primary key
    "species_id",
    "name",
    "version",
    "rank",
    "attrib"
)

### 'seq_region_attrib' table.
### (5927 rows in homo_sapiens_core_99_38)
.ENSEMBLDB_SEQ_REGION_ATTRIB_COLUMNS <- c(
    "seq_region_id",        # ==> seq_region.seq_region_id
    "attrib_type_id",       # ==> attrib_type.attrib_type_id
    "value"
)

### 'seq_region_synonym' table.
### (2360 rows in homo_sapiens_core_99_38)
.ENSEMBLDB_SEQ_REGION_SYNONYM_COLUMNS <- c(
    "seq_region_synonym_id",  # primary key
    "seq_region_id",          # ==> seq_region.seq_region_id
    "synonym",
    "external_db_id"          # ==> external_db.external_db_id
)

### External References
### 'external_db' table.
### (446 rows in homo_sapiens_core_99_38)
.ENSEMBLDB_EXTERNAL_DB_COLUMNS <- c(
    "external_db_id",       # primary key
    "db_name",
    "db_release",
    "status",
    "priority",
    "db_display_name",
    "type",
    "secondary_db_name",
    "secondary_db_table",
    "description"
)

.ENSEMBLDB_COLUMNS <- list(
    attrib_type=.ENSEMBLDB_ATTRIB_TYPE_COLUMNS,
    seq_region=.ENSEMBLDB_SEQ_REGION_COLUMNS,
    coord_system=.ENSEMBLDB_COORD_SYSTEM_COLUMNS,
    seq_region_attrib=.ENSEMBLDB_SEQ_REGION_ATTRIB_COLUMNS,
    seq_region_synonym=.ENSEMBLDB_SEQ_REGION_SYNONYM_COLUMNS,
    external_db=.ENSEMBLDB_EXTERNAL_DB_COLUMNS
)
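
### The "==>" annotations above describe how these tables reference each
### other. The sketch below is purely illustrative (never called by the
### package; the function name is hypothetical): it shows how data frames
### fetched with fetch_table_dump_from_Ensembl_FTP() defined further below
### (with 'full.colnames=FALSE') could be joined along those foreign keys.
.example_join_core_tables <-
    function(seq_region, coord_system, seq_region_attrib, attrib_type)
{
    ## seq_region.coord_system_id ==> coord_system.coord_system_id
    ans <- merge(seq_region, coord_system, by="coord_system_id",
                 suffixes=c(".seq_region", ".coord_system"))
    ## seq_region_attrib.attrib_type_id ==> attrib_type.attrib_type_id
    attribs <- merge(seq_region_attrib, attrib_type, by="attrib_type_id")
    ## seq_region_attrib.seq_region_id ==> seq_region.seq_region_id
    merge(ans, attribs, by="seq_region_id", all.x=TRUE)
}
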
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### fetch_table_dump_from_Ensembl_FTP()
###
### Fetch a table (.txt.gz file) from one of the core DB dumps located on
### the Ensembl FTP server (ftp.ensembl.org).
### This provides access to data that is not available through biomaRt,
### such as the lengths of the sequences in the reference genome
### associated with a particular dataset and Ensembl release (e.g. for
### dataset "hsapiens_gene_ensembl" in release "64").
###
### TODO: Querying the Ensembl MySQL server (via RMariaDB) would probably
### be a better way to do this.
### Update (Feb 10, 2020): Some preliminary testing indicates that using
### RMariaDB to fetch full tables is actually significantly slower.
### For example, to fetch table "seq_region" from the homo_sapiens_core_99_38
### core DB (the table has 268443 rows and 4 columns):
###   o The utils::download.file() + utils::read.table() approach takes
###     about 7 sec to download and parse the MySQL dump located at:
###     ftp://ftp.ensembl.org/pub/current_mysql/homo_sapiens_core_99_38/
###   o The RMariaDB method takes about 22 sec to retrieve the table from
###     the MySQL server at ensembldb.ensembl.org.
###
### Table dumps from Ensembl releases < 99 can contain inline \r characters
### (carriage returns), which break utils::read.table().
.table_can_contain_CRs <- function(core_db_url)
{
    release <- sub("_.*$", "", sub("^.*_core_", "", core_db_url))
    release <- suppressWarnings(as.integer(release))
    !is.na(release) && release < 99L
}
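
### For reference only (never called by the package): a rough sketch of the
### RMariaDB route discussed in the TODO above. The function name is
### hypothetical, and the connection settings (host "ensembldb.ensembl.org",
### user "anonymous", port 3306) are assumptions based on the Ensembl public
### MySQL page linked at the top of this file; DBI and RMariaDB are not
### dependencies of this package.
.example_fetch_seq_region_via_RMariaDB <- function()
{
    con <- DBI::dbConnect(RMariaDB::MariaDB(),
                          host="ensembldb.ensembl.org",
                          port=3306,
                          username="anonymous",
                          dbname="homo_sapiens_core_99_38")
    on.exit(DBI::dbDisconnect(con))
    ## Retrieves the full 'seq_region' table (the query that took about
    ## 22 sec in the timing comparison above).
    DBI::dbReadTable(con, "seq_region")
}
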
### 'core_db_url' must be the full URL to a core DB directory located
### on the Ensembl FTP server, e.g.
###   ftp://ftp.ensembl.org/pub/release-99/mysql/mus_musculus_core_99_38/ or
###   ftp://ftp.ensembl.org/pub/grch37/release-87/mysql/homo_sapiens_core_87_37/
### The "ftp://" part and the trailing slash are both mandatory!
### Use get_Ensembl_FTP_core_db_url() defined in Ensembl-utils.R to obtain
### such a URL programmatically for a given species/release/division.
fetch_table_dump_from_Ensembl_FTP <-
    function(core_db_url, table, full.colnames=FALSE, nrows=-1L)
{
    columns <- .ENSEMBLDB_COLUMNS[[table]]
    if (!is.null(columns) && full.colnames)
        columns <- paste(table, columns, sep=".")
    url <- paste0(core_db_url, table, ".txt.gz")
    remove_CRs <- .table_can_contain_CRs(core_db_url)
    if (is.null(columns))
        warning(wmsg("unknown table: ", table, " (download might fail ",
                     "or the returned data frame will have automatic ",
                     "colnames)"),
                immediate.=TRUE)
    ## fetch_table_from_url() downloads the full file before reading it.
    ans <- fetch_table_from_url(url, colnames=columns, nrows=nrows,
                                remove_CRs=remove_CRs)
    if (is.null(columns) && full.colnames)
        colnames(ans) <- paste(table, colnames(ans), sep=".")
    ans
}
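
### Quick illustration (never called by the package; hypothetical name): how
### the 'seq_region' table used in the timing comparison above could be
### fetched. The URL follows the release-99 pattern shown in the comments
### above and is assumed to still be available on the Ensembl FTP server.
.example_fetch_seq_region_dump <- function()
{
    core_db_url <-
        "ftp://ftp.ensembl.org/pub/release-99/mysql/homo_sapiens_core_99_38/"
    ## Expected to return a 268443 x 4 data frame with colnames
    ## "seq_region_id", "name", "coord_system_id", and "length".
    fetch_table_dump_from_Ensembl_FTP(core_db_url, "seq_region")
}
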
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### OLD_fetch_table_dump_from_Ensembl_FTP()
###
### Replaced by fetch_table_dump_from_Ensembl_FTP() above, which uses
### fetch_table_from_url()'s newly added 'remove_CRs' argument to handle the
### carriage return problem found in table dumps from Ensembl releases < 99.
### TODO: Either get rid of this (and of the soft dependency on data.table
###       and R.utils), OR add the 'reader' argument to
###       fetch_table_dump_from_Ensembl_FTP() (choices would be reduced to
###       "read.table" or "fread" only, no more "auto", with the default
###       being "read.table").
.please_install_missing_CRAN_pkgs <- function(pkgs, reader0)
{
    fmt <- paste0("Couldn't load %s. The %s needed %s. Please install %s ",
                  "with 'install.packages(%s)' and try again.")
    if (length(pkgs) == 1L) {
        what1 <- paste0("package ", pkgs)
        what2 <- "package is"
        what3 <- "it"
        what4 <- paste0("\"", pkgs, "\"")
    } else {
        what1 <- paste0("the following packages: ",
                        paste0(pkgs, collapse=", "))
        what2 <- "packages are"
        what3 <- "them"
        what4 <- paste0("c(", paste0("\"", pkgs, "\"", collapse=", "), ")")
    }
    if (reader0 == "auto") {
        why <- paste0("to fetch data from Ensembl FTP server ",
                      "for Ensembl releases older than 99")
    } else {
        why <- paste0("when 'reader=\"", reader0, "\"'")
    }
    sprintf(fmt, what1, what2, why, what3, what4)
}
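
### Tiny illustration (never called by the package; hypothetical name): the
### message produced when both suggested packages are missing and 'reader'
### was left as "auto".
.example_missing_CRAN_pkgs_message <- function()
{
    msg <- .please_install_missing_CRAN_pkgs(c("R.utils", "data.table"),
                                             "auto")
    ## Starts with: "Couldn't load the following packages: R.utils, data.table."
    cat(strwrap(msg), sep="\n")
}
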
### 'core_db_url' must be the full URL to a core DB directory located
### on the Ensembl FTP server, e.g.
###   ftp://ftp.ensembl.org/pub/release-99/mysql/mus_musculus_core_99_38/ or
###   ftp://ftp.ensembl.org/pub/grch37/release-87/mysql/homo_sapiens_core_87_37/
### The "ftp://" part and the trailing slash are both mandatory!
### Use get_Ensembl_FTP_core_db_url() defined in Ensembl-utils.R to obtain
### such a URL programmatically for a given species/release/division.
OLD_fetch_table_dump_from_Ensembl_FTP <-
    function(core_db_url, table, full.colnames=FALSE, nrows=-1L,
             reader=c("auto", "read.table", "fread"))
{
    reader0 <- reader <- match.arg(reader)
    columns <- .ENSEMBLDB_COLUMNS[[table]]
    if (!is.null(columns) && full.colnames)
        columns <- paste(table, columns, sep=".")
    url <- paste0(core_db_url, table, ".txt.gz")
    if (reader == "auto") {
        ## Table dumps from Ensembl releases < 99 can contain inline \r
        ## characters (carriage returns) which break utils::read.table().
        ## However data.table::fread() seems to be slightly better at
        ## handling them. See
        ## https://github.com/Bioconductor/GenomeInfoDb/issues/98 and
        ## https://github.com/Bioconductor/GenomeInfoDb/issues/97.
        if (.table_can_contain_CRs(core_db_url)) {
            reader <- "fread"
        } else {
            reader <- "read.table"
        }
    }
    if (reader == "read.table") {
        if (is.null(columns))
            warning(wmsg("unknown table: ", table, " (download might fail ",
                         "or the returned data frame will have automatic ",
                         "colnames)"),
                    immediate.=TRUE)
        ## fetch_table_from_url() downloads the full file before reading it.
        ans <- fetch_table_from_url(url, colnames=columns, nrows=nrows)
        if (is.null(columns) && full.colnames)
            colnames(ans) <- paste(table, colnames(ans), sep=".")
    } else {
        if (is.null(columns))
            stop(wmsg("unknown table: ", table))
        ## data.table::fread() needs R.utils to read compressed files.
        missing_pkgs <- character(0)
        if (!requireNamespace("R.utils", quietly=TRUE))
            missing_pkgs <- c(missing_pkgs, "R.utils")
        if (!requireNamespace("data.table", quietly=TRUE))
            missing_pkgs <- c(missing_pkgs, "data.table")
        if (length(missing_pkgs) != 0L) {
            errmsg <- .please_install_missing_CRAN_pkgs(missing_pkgs, reader0)
            stop(wmsg(errmsg))
        }
        ## data.table::fread() downloads the full file before reading it.
        ans <- data.table::fread(url, nrows=nrows, strip.white=FALSE,
                                 showProgress=FALSE)
        ans <- as.data.frame(ans)
        colnames(ans) <- columns
    }
    ans
}
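
### Illustration only (never called by the package; hypothetical name): the
### "fread" path of the old implementation, applied to a pre-99 dump whose
### tables can contain inline \r characters. The URL is the GRCh37
### release-87 example quoted in the comments above.
.example_OLD_fetch_with_fread <- function()
{
    core_db_url <- paste0("ftp://ftp.ensembl.org/pub/grch37/release-87/",
                          "mysql/homo_sapiens_core_87_37/")
    OLD_fetch_table_dump_from_Ensembl_FTP(core_db_url, "seq_region",
                                          reader="fread")
}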