setClass("IDMapper",
slots=c(species="character",
mart="Mart"))
#----------------------------------------------------------------------------------------------------
#url.exists <- function(url) {
# HEAD(url)$headers$status == "200"
# }
#-------------------------------------------------------------------------------
IDMapper <- function(species)
{
if(!species %in% "9606") {
stop("IDMapper only supports human ID mapping for now")
}
# check (separately) 3 steps which must be working to create an IDMapper
message("checking for biomart access...")
host="www.ensembl.org"
url <- paste0("http://", host)
message(sprintf(" does '%s' respond?", url))
stopifnot(url.exists(url))
message(" creating ensembl mart");
mart <- useMart(biomart = "ensembl", host=host)
dataset <- "hsapiens_gene_ensembl"
message(sprintf(" %s dataset provided?", dataset))
available.datasets <- listDatasets(mart)$dataset
stopifnot(dataset %in% listDatasets(mart)[,1])
self <- new("IDMapper")
if(species == "9606"){
self@species <- "9606"
message("connecting to biomart...")
self@mart <- useMart(biomart="ENSEMBL_MART_ENSEMBL", dataset=dataset)
}
self
}
#-------------------------------------------------------------------------------
setGeneric("addGeneInfo", signature="object",
function(object, tbl)
standardGeneric("addGeneInfo"))
setGeneric("addStandardNames", signature="object",
function(object, tbl)
standardGeneric("addStandardNames"))
#-------------------------------------------------------------------------------
.categorize <- function(rawIDs)
{
x <- rawIDs
string.entries <- grep("string:", x, value=TRUE)
uniprotkb.entries <- grep("uniprotkb:", x, value=TRUE)
# string entries can be misconstrued as uniprot entries. prevent that
# here
if(length(string.entries) > 0)
uniprotkb.entries <- setdiff(uniprotkb.entries, string.entries)
locuslink.entries <- grep("locuslink:", x, value=TRUE)
refseq.entries <- grep("refseq:", x, value=TRUE)
ensembl.gene.entries <- grep("ensembl:ENSG", x, value=TRUE)
ensembl.prot.entries <- grep("ensembl:ENSP", x, value=TRUE)
recognized.entries <- c(uniprotkb.entries, locuslink.entries,
refseq.entries, ensembl.gene.entries,
ensembl.prot.entries, string.entries)
unrecognized.entries <- setdiff(x, recognized.entries)
list(ensemblGene=ensembl.gene.entries,
ensemblProt=ensembl.prot.entries,
locuslink=locuslink.entries,
refseq=refseq.entries,
string=string.entries,
uniprotkb=uniprotkb.entries,
unrecognized=unrecognized.entries)
} # .categorize
#-------------------------------------------------------------------------------
.translate.uniprotkb <- function(mart, entries)
{
uniprots <- gsub(".*uniprotkb:([A-Z0-9]*).*", "\\1", entries)
names(uniprots) <- entries
biomart.filter <- "uniprotsptrembl"
columns = c (biomart.filter, "entrezgene_id", "hgnc_symbol");
tbl.uniprot.trembl <- getBM(filters=biomart.filter, values=uniprots,
attributes=columns, mart=mart)
colnames(tbl.uniprot.trembl) <- c("id", "geneID", "symbol")
raw.ids <- names(uniprots)[match(tbl.uniprot.trembl$id,
as.character(uniprots))]
tbl.uniprot.trembl$raw.id <- raw.ids
biomart.filter <- "uniprotswissprot"
columns = c (biomart.filter, "entrezgene_id", "hgnc_symbol");
tbl.uniprot <- getBM(filters=biomart.filter, values=uniprots, attributes=columns,
mart=mart)
colnames(tbl.uniprot) <- c("id", "geneID", "symbol")
raw.ids <- names(uniprots)[match(tbl.uniprot$id, as.character(uniprots))]
tbl.uniprot$raw.id <- raw.ids
tbl <- rbind(tbl.uniprot.trembl, tbl.uniprot)
tbl$geneID <- as.character(tbl$geneID)
# sometimes (e.g., P34896) we get multiple geneIDs for one protein ID,
# which shows up as multiple rows. handle that here
dup.ids <- names(which(as.list(table(tbl$id)) > 1))
removers <- c()
# for each dup'd geneID, find and store the numerically larger one/s
for(r in seq_len(length(dup.ids))){
geneID.int <- as.integer(subset(tbl, id == dup.ids[r])$geneID)
biggerThanMin <- geneID.int[which(geneID.int != min(geneID.int))]
removers <- c(removers, biggerThanMin)
} # for r
# find the rows with these removers, one or more big, dup'd geneIDs
if(length(removers) > 0){
indices.to.remove <- match(as.character(removers), tbl$geneID)
tbl <- tbl[-indices.to.remove,]
} # if length
tbl
} # translate.uniprotkb
#-------------------------------------------------------------------------------
.translate.string <- function(mart, entries)
{
tbl.string <- data.frame(id=character(0), geneID=character(0),
symbol=character(0), raw.id=character(0))
if(length(entries) == 0)
return(tbl.string)
string.ensps <- gsub(".*\\.(ENSP[0-9]*)\\|.*", "\\1", entries)
names(string.ensps) <- entries
biomart.filter <- "ensembl_peptide_id"
columns = c (biomart.filter, "entrezgene_id", "hgnc_symbol");
tbl.string <- getBM(filters=biomart.filter, values=string.ensps,
attributes=columns, mart=mart)
colnames(tbl.string) <- c("id", "geneID", "symbol")
raw.ids <- names(string.ensps)[match(tbl.string$id, as.character(string.ensps))]
tbl.string$raw.id <- raw.ids
tbl.string$geneID <- as.character(tbl.string$geneID)
tbl.string
} # .translate.string
#-------------------------------------------------------------------------------
.translate.ensemblGene <- function(mart, entries)
{
tbl.ensg <- data.frame(id=character(0), geneID=character(0),
symbol=character(0), raw.id=character(0))
if(length(entries) == 0)
return(tbl.ensg)
ensembl.genes <- gsub(".*ensembl:([A-Z0-9_]*).*", "\\1", entries)
names(ensembl.genes) <- entries
biomart.filter <- "ensembl_gene_id"
columns = c (biomart.filter, "entrezgene_id", "hgnc_symbol");
tbl.ensg <- getBM(filters=biomart.filter, values=ensembl.genes,
attributes=columns, mart=mart)
colnames(tbl.ensg) <- c("id", "geneID", "symbol")
raw.ids <- names(ensembl.genes)[match(tbl.ensg$id,
as.character(ensembl.genes))]
tbl.ensg$raw.id <- raw.ids
tbl.ensg$geneID <- as.character(tbl.ensg$geneID)
tbl.ensg
} # .translate.ensemblGene
#-------------------------------------------------------------------------------
.translate.ensemblProt <- function(mart, entries)
{
tbl.ensp <- data.frame(id=character(0), geneID=character(0),
symbol=character(0), raw.id=character(0))
if(length(entries) == 0)
return(tbl.ensp)
ensembl.prots <- gsub(".*ensembl:([A-Z0-9_]*).*", "\\1", entries)
names(ensembl.prots) <- entries
biomart.filter <- "ensembl_peptide_id"
columns <- c(biomart.filter, "entrezgene_id", "hgnc_symbol")
tbl.ensp <- getBM(filters=biomart.filter, values=ensembl.prots,
attributes=columns, mart=mart)
colnames(tbl.ensp) <- c("id", "geneID", "symbol")
raw.ids <- names(ensembl.prots)[match(tbl.ensp$id,
as.character(ensembl.prots))]
tbl.ensp$raw.id <- raw.ids
tbl.ensp$geneID <- as.character(tbl.ensp$geneID)
tbl.ensp
} # .translate.ensemblProt
#-------------------------------------------------------------------------------
.translate.locuslink <- function(mart, entries)
{
tbl.entrezs <- data.frame(id=character(0), geneID=character(0),
symbol=character(0), raw.id=character(0))
if(length(entries) == 0)
return(tbl.entrezs)
entrezs <- gsub(".*locuslink:([A-Z0-9]*).*", "\\1", entries)
names(entrezs) <- entries
biomart.filter <- "entrezgene_id"
columns <- c(biomart.filter, "entrezgene_id", "hgnc_symbol")
tbl.entrezs <- getBM(filters=biomart.filter, values=entrezs, attributes=columns, mart=mart)
colnames(tbl.entrezs) <- c("id", "geneID", "symbol")
raw.ids <- names(entrezs)[match(tbl.entrezs$id, as.character(entrezs))]
tbl.entrezs$raw.id <- raw.ids
tbl.entrezs$geneID <- as.character(tbl.entrezs$geneID)
tbl.entrezs
} # .translate.locuslink
#-------------------------------------------------------------------------------
.translate.refseq <- function(mart, entries)
{
tbl.refseq <- data.frame(id=character(0), geneID=character(0),
symbol=character(0), raw.id=character(0))
if(length(entries) == 0)
return(tbl.refseq)
refseqs <- gsub(".*refseq:([A-Z0-9_]*).*", "\\1", entries)
names(refseqs) <- entries
biomart.filter <- "refseq_peptide"
columns <- c(biomart.filter, "entrezgene_id", "hgnc_symbol")
tbl.refseqs <- getBM(filters=biomart.filter, values=refseqs,
attributes=columns, mart=mart)
colnames(tbl.refseqs) <- c("id", "geneID", "symbol")
raw.ids <- names(refseqs)[match(tbl.refseqs$id, as.character(refseqs))]
tbl.refseqs$raw.id <- raw.ids
tbl.refseqs$geneID <- as.character(tbl.refseqs$geneID)
tbl.refseqs
} # .translate.refseq
#-------------------------------------------------------------------------------
.translate.geneSymbol <- function(mart, entries)
{
tbl.geneSymbol <- data.frame(id=character(0), geneID=character(0),
symbol=character(0), raw.id=character(0))
if(length(entries) == 0)
return(NA)
refseqs <- gsub(".*refseq:([A-Z0-9_]*).*", "\\1", entries)
names(refseqs) <- entries
biomart.filter <- "refseq_peptide"
columns <- c(biomart.filter, "entrezgene_id", "hgnc_symbol")
tbl.refseqs <- getBM(filters=biomart.filter, values=refseqs,
attributes=columns, mart=mart)
colnames(tbl.refseqs) <- c("id", "geneID", "symbol")
raw.ids <- names(refseqs)[match(tbl.refseqs$id, as.character(refseqs))]
tbl.refseqs$raw.id <- raw.ids
tbl.refseqs$geneID <- as.character(tbl.refseqs$geneID)
tbl.refseqs
} # .translate.geneSymbol
#-------------------------------------------------------------------------------
.translateAll <- function(mart, raw.ids)
{
categories <- .categorize(raw.ids)
result <- data.frame()
for(category in names(categories)){
ids <- categories[[category]]
if(length(ids) > 0){
if(category == "ensemblGene")
result <- rbind(result, .translate.ensemblGene(mart, ids))
if(category == "ensemblProt")
result <- rbind(result, .translate.ensemblProt(mart, ids))
if(category == "locuslink")
result <- rbind(result, .translate.locuslink(mart, ids))
if(category == "refseq")
result <- rbind(result, .translate.refseq(mart, ids))
if(category == "string")
result <- rbind(result, .translate.string(mart, ids))
if(category == "uniprotkb")
result <- rbind(result, .translate.uniprotkb(mart, ids))
} # if length
} # for category
result
} # .translateAll
#-------------------------------------------------------------------------------
setMethod("addGeneInfo", signature=c(object="IDMapper"),
function(object, tbl) {
# default assumption: all rows need geneInfo added
unmapped.rows <- seq_len(nrow(tbl))
# but in fact, some may not
some.geneInfo.present <- all(c("A.name", "B.name", "A.id", "B.id")
%in% colnames(tbl))
if(some.geneInfo.present)
unmapped.rows <- which(tbl$A.name == "-")
A <- tbl$A[unmapped.rows]
B <- tbl$B[unmapped.rows]
raw.ids <- unique(c(A, B))
tbl.xref <- .translateAll(object@mart, raw.ids)
# create two named lists, for fast lookup of tbl$A and $B
syms <- tbl.xref$symbol
names(syms) <- tbl.xref$raw.id
geneIDs <- tbl.xref$geneID
names(geneIDs) <- tbl.xref$raw.id
A.name <- as.character(syms[A])
B.name <- as.character(syms[B])
A.id <- as.character(geneIDs[A])
B.id <- as.character(geneIDs[B])
A.name[is.na(A.name)] <- "-"
B.name[is.na(B.name)] <- "-"
A.id[is.na(A.id)] <- "-"
B.id[is.na(B.id)] <- "-"
if(!some.geneInfo.present){
empty.column <- rep("-", nrow(tbl))
tbl <- cbind(tbl,
A.name=empty.column,
B.name=empty.column,
A.id=empty.column,
B.id=empty.column,
stringsAsFactors=FALSE)
}# no previous geneInfo
tbl$A.name[unmapped.rows] <- A.name
tbl$B.name[unmapped.rows] <- B.name
tbl$A.id[unmapped.rows] <- A.id
tbl$B.id[unmapped.rows] <- B.id
tbl
}) # addGeneInfo
#-------------------------------------------------------------------------------
setMethod("addStandardNames", signature=c(object="IDMapper"),
function(object, tbl) {
# default assumption: all rows need geneInfo added
unmapped.rows <- seq_len(nrow(tbl))
# but in fact, some may not
some.geneInfo.present <- all(c("A.name", "B.name", "A.id", "B.id")
%in% colnames(tbl))
if(some.geneInfo.present)
unmapped.rows <- which(tbl$A.name == "-")
A <- tbl$A[unmapped.rows]
B <- tbl$B[unmapped.rows]
raw.ids <- unique(c(A, B))
tbl.xref <- .translateAll(object@mart, raw.ids)
# create two named lists, for fast lookup of tbl$A and $B
syms <- tbl.xref$symbol
names(syms) <- tbl.xref$raw.id
geneIDs <- tbl.xref$geneID
names(geneIDs) <- tbl.xref$raw.id
A.name <- as.character(syms[A])
B.name <- as.character(syms[B])
A.id <- as.character(geneIDs[A])
B.id <- as.character(geneIDs[B])
A.name[is.na(A.name)] <- "-"
B.name[is.na(B.name)] <- "-"
A.id[is.na(A.id)] <- "-"
B.id[is.na(B.id)] <- "-"
if(!some.geneInfo.present){
empty.column <- rep("-", nrow(tbl))
tbl <- cbind(tbl,
A.name=empty.column,
B.name=empty.column,
A.id=empty.column,
B.id=empty.column,
stringsAsFactors=FALSE)
}# no previous geneInfo
tbl$A.name[unmapped.rows] <- A.name
tbl$B.name[unmapped.rows] <- B.name
tbl$A.id[unmapped.rows] <- A.id
tbl$B.id[unmapped.rows] <- B.id
tbl
}) # addGeneInfo
#-------------------------------------------------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.