Nothing
# jaspar/import.R
# notes: 3 matrices come w/o speciesID, tax = 'vertebrates'. not our problem to fix, at least not yet.
# TBP, HNF4A and CEBPA (MA0108.2, MA0114.1, MA0102.2)
#------------------------------------------------------------------------------------------------------------------------
library (org.Hs.eg.db)
library (org.Mm.eg.db)
#------------------------------------------------------------------------------------------------------------------------
printf <- function(...) print(noquote(sprintf(...)))
#------------------------------------------------------------------------------------------------------------------------
run = function (dataDir)
{
dataDir <- file.path(dataDir, "jaspar")
tbl.rmat = readRawMatrices (dataDir)
matrices = convertRawMatricesToStandard (tbl.rmat)
tbl.anno = createAnnotationTable (dataDir)
tbl.md = createMetadataTable (tbl.anno, matrices)
matrices = renameMatrices (matrices, tbl.md)
matrices = normalizeMatrices (matrices)
serializedFile <- "jaspar.RData"
save (matrices, tbl.md, file=serializedFile)
printf("saved %d matrices to %s", length(matrices), serializedFile)
printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
} # run
#------------------------------------------------------------------------------------------------------------------------
# jaspar's sql rendering of their matrices is unusual. see below. translate them to a 4-row, n-column form in
# convertRawMatricesToStandard, below.
# the id can be mapped to other names and information via simple subset lookup in tbl.anno
# id base pos count
# 1 9229 A 1 0
# 2 9229 A 2 3
# 3 9229 A 3 79
# ...
# 11 9229 C 1 94
# 12 9229 C 2 75
# 13 9229 C 3 4
# ...
# 20 9229 C 10 3
# 21 9229 G 1 1
# 22 9229 G 2 0
# ...
# 38 9229 T 8 81
# 39 9229 T 9 1
# 40 9229 T 10 6
readRawMatrices = function (dataDir)
{
file <- file.path(dataDir, 'MATRIX_DATA.txt')
tbl.matrices = read.table (file, sep='\t', header=FALSE, as.is=TRUE, colClasses=c ('character', 'character', 'numeric', 'numeric'))
colnames (tbl.matrices) = c ('id', 'base', 'pos', 'count')
invisible (tbl.matrices)
} # readRawMatrices
#------------------------------------------------------------------------------------------------------------------------
convertRawMatricesToStandard = function (tbl.rmat)
{
matrix.ids = unique (tbl.rmat$id)
result = vector ('list', length (matrix.ids))
i = 1
for (matrix.id in matrix.ids) {
tbl.sub = subset (tbl.rmat, id == matrix.id)
# sanity check this rather unusual representation of a position count matrix
base.count = as.data.frame (table (tbl.sub$base))
stopifnot (base.count$Var1 == c ('A', 'C', 'G', 'T'))
# conservative length check. actually expect sequence lengths of 6 - 20 bases
if (base.count$Freq [1] < 4 && base.count$Freq [1] > 30) {
printf ('matrix.id %s has sequence of length %d', matrix.id, base.count$Freq [1])
}
stopifnot (all (base.count$Freq == base.count$Freq [1]))
nucleotide.counts = tbl.sub$count
row.names = c('A', 'C', 'G', 'T');
col.names = 1:(nrow (tbl.sub) / 4)
m = matrix (nucleotide.counts, byrow=TRUE, nrow=4, dimnames=list(row.names, col.names))
result [[i]] = m
i = i + 1
} # for matrix.id
names (result) = matrix.ids
return (result)
} # convertRawMatricesToStandard
#------------------------------------------------------------------------------------------------------------------------
# read 'mysql' tables provide by jaspar:
# MATRIX.txt: 9229 CORE MA0001 1 AGL3
# MATRIX_PROTEIN.txt: 9229 P29383
# MATRIX_SPECIES.txt: 9229 3702
# MATRIX_ANNOTATION.txt:
# 9229 class Other Alpha-Helix
# 9229 comment -
# 9229 family MADS
# 9229 medline 7632923
# 9229 tax_group plants
# 9229 type SELEX
createAnnotationTable = function (dataDir)
{
file <- file.path(dataDir, "MATRIX.txt")
tbl.matrix = read.table (file, sep='\t', header=F, as.is=TRUE)
colnames (tbl.matrix) = c ('id', 'category', 'mID', 'version', 'binder')
file <- file.path(dataDir, "MATRIX_PROTEIN.txt")
tbl.protein = read.table (file, sep='\t', header=F, as.is=TRUE)
colnames (tbl.protein) = c ('id', 'proteinID')
file <- file.path(dataDir, "MATRIX_SPECIES.txt")
tbl.species = read.table (file, sep='\t', header=F, as.is=TRUE)
colnames (tbl.species) = c ('id', 'speciesID')
file <- file.path(dataDir, "MATRIX_ANNOTATION.txt")
tbl.anno = read.table (file, sep='\t', header=F, as.is=TRUE, quote="")
colnames (tbl.anno) = c ('id', 'attribute', 'value')
tbl.family = subset (tbl.anno, attribute=='family') [, -2];
colnames (tbl.family) = c ('id', 'family')
tbl.tax = subset (tbl.anno, attribute=='tax_group') [,-2];
colnames (tbl.tax) = c ('id', 'tax')
tbl.class = subset (tbl.anno, attribute=='class') [,-2];
colnames (tbl.class) = c ('id', 'class')
tbl.comment = subset (tbl.anno, attribute=='comment')[,-2];
colnames (tbl.comment) = c ('id', 'comment')
tbl.pubmed = subset (tbl.anno, attribute=='medline')[,-2];
colnames (tbl.pubmed) = c ('id', 'pubmed')
tbl.type = subset (tbl.anno, attribute=='type')[,-2];
colnames (tbl.type) = c ('id', 'type')
tbl.md = merge (tbl.matrix, tbl.species, all.x=TRUE)
tbl.md = merge (tbl.md, tbl.protein, all.x=TRUE)
tbl.md = merge (tbl.md, tbl.family, all.x=TRUE)
tbl.md = merge (tbl.md, tbl.tax, all.x=TRUE)
tbl.md = merge (tbl.md, tbl.class, all.x=TRUE)
tbl.md = merge (tbl.md, tbl.pubmed, all.x=TRUE)
tbl.md = merge (tbl.md, tbl.type, all.x=TRUE)
fullID = paste (tbl.md$mID, tbl.md$version, sep='.')
tbl.md = cbind (fullID, tbl.md, stringsAsFactors=FALSE)
invisible (tbl.md)
} # createAnnotationTable
#------------------------------------------------------------------------------------------------------------------------
# assemble these columns:
# names=character(), # source-species-gene: UniPROBE-Mmusculus-Rhox11-306b; ScerTF-Scerevisiae-ABF2-e73a
# nativeNames=character(), # badis.ABF2, Cell08/Rhox11_2205.1_pwm.txt
# geneSymbols=character(), # ABF2, Rhox11
# sequenceCounts=integer(), # often NA
# organisms=character(), # Scerevisiae, Mmusculus
# bindingMolecules=character(), # YMR072W, 194738
# bindingMoleculeIdTypes=character(), # SGD, entrez gene
# bindingDomainTypes=character(), # NA, Homeo
# dataSources=character(), # ScerTF, UniPROBE
# experimentTypes=character(), # NA, protein-binding microarray
# pubmedIDs=character(), # 19111667, 1858359
# tfFamilies=character()) # NA, NA
#
# from these
#
createMetadataTable = function (tbl.anno, matrices)
{
options (stringsAsFactors=FALSE)
tbl.md = data.frame ()
matrix.ids = names (matrices)
dataSource = 'JASPAR_CORE'
for (matrix.id in matrix.ids) {
matrix = matrices [[matrix.id]]
stopifnot (length (intersect (matrix.id, tbl.anno$id)) == 1)
tbl.sub = subset (tbl.anno, id==matrix.id)
if (nrow (tbl.sub) > 1) {
# the binder is a dimer, perhaps a homodimer, and two proteinIDs are provided. Arnt::Ahr
# some others, a sampling: P05412;P01100, P08047, P22814;Q24040, EAW80806;EAW53510
dimer = paste (unique (tbl.sub$proteinID), collapse=';')
tbl.sub [1, 'proteinID'] = dimer
}
anno = as.list (tbl.sub [1,])
taxon.code = anno$speciesID
geneId.info = assignGeneId (anno$proteinID)
new.row = list (providerName=anno$binder,
providerId=anno$fullID,
dataSource=dataSource,
geneSymbol=anno$binder,
geneId=geneId.info$geneId,
geneIdType=geneId.info$type,
proteinId=anno$proteinID,
proteinIdType=guessProteinIdentifierType (anno$proteinID),
organism=convertTaxonCode(anno$speciesID),
sequenceCount=as.integer (mean (colSums (matrix))),
bindingSequence=NA_character_,
bindingDomain=anno$class,
tfFamily=anno$family,
experimentType=anno$type,
pubmedID=anno$pubmed)
tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
full.name = sprintf ('%s-%s-%s-%s', convertTaxonCode(anno$speciesID), dataSource, anno$binder, anno$fullID)
rownames (tbl.md) [nrow (tbl.md)] = full.name
} # for i
# Mmusculus-JASPAR_CORE-NF-kappaB-MA0061.1 has 'NA' for proteinID, not <NA>
NA.string.indices = grep ('NA', tbl.md$proteinId)
if (length (NA.string.indices) > 0)
tbl.md$proteinId [NA.string.indices] = NA
invisible (tbl.md)
} # createMetadataTable
#------------------------------------------------------------------------------------------------------------------------
renameMatrices = function (matrices, tbl.md)
{
stopifnot (length (matrices) == nrow (tbl.md))
names (matrices) = rownames (tbl.md)
invisible (matrices)
} # renameMatrices
#------------------------------------------------------------------------------------------------------------------------
# full names: ('Mus musculus', 'Rattus norvegicus', 'Rattus rattus', 'Arabidopsis thaliana', 'Pisum sativum',
# 'Nicotiana sylvestris', 'Petunia hybrida', 'Antirrhinum majus', 'Hordeum vulgare', 'Triticum aestivam',
# 'Zea mays', 'Saccharomyces cerevisiae', 'Caenorhabditis elegans', 'Drosophila melanogaster',
# 'Halocynthia roretzi', 'Vertebrata', 'Onchorhynchus mykiss', 'Xenopus laevis', 'Xenopus tropicalis',
# 'Gallus gallus', 'Homo sapiens', 'Bos taurus', 'Oryctolagus cuniculus'),
convertTaxonCode = function (ncbi.code)
{
#if (is.na (ncbi.code))
# return (NA_character_)
ncbi.code = as.character (ncbi.code)
if (ncbi.code %in% c ('-', NA_character_, 'NA'))
return ('Vertebrata')
tbl = data.frame (code= c('10090', '10116', '10117', '3702', '3888', '4094', '4102',
'4151', '4513', '4565', '4577', '4932', '6239', '7227', '7729',
'7742', '8022', '8355', '8364', '9031', '9606', '9913', '9986'),
name=c ('Mmusculus', 'Rnorvegicus', 'Rrattus', 'Athaliana', 'Psativum',
'Nsylvestris', 'Phybrida', 'Amajus', 'Hvulgare', 'Taestivam',
'Zmays', 'Scerevisiae', 'Celegans', 'Dmelanogaster',
'Hroretzi', 'Vertebrata', 'Omykiss', 'Xlaevis', 'Xtropicalis',
'Gallus', 'Hsapiens', 'Btaurus', 'Ocuniculus'),
stringsAsFactors=FALSE)
ncbi.code = as.character (ncbi.code)
index = which (tbl$code == ncbi.code)
if (length (index) == 1)
return (tbl$name [index])
else {
write (sprintf (" unable to map organism code |%s|", ncbi.code), stderr ())
return (NA_character_)
}
} # convertTaxonCode
#------------------------------------------------------------------------------------------------------------------------
# an empirical and not altogether trustworthy solution to identifying identifier types.
guessProteinIdentifierType = function (moleculeName)
{
if (nchar (moleculeName) == 0)
return (NA_character_)
if (is.na (moleculeName))
return (NA_character_)
first.char = substr (moleculeName, 1, 1)
if (first.char == 'Y')
return ('SGD')
if (first.char %in% c ('P', 'Q', 'O', 'A', 'C'))
return ('UNIPROT')
if (length (grep ('^EAW', moleculeName)) == 1)
return ('NCBI')
if (length (grep ('^EAX', moleculeName)) == 1)
return ('NCBI')
if (length (grep ('^NP_', moleculeName)) == 1)
return ('REFSEQ')
if (length (grep ('^BAD', moleculeName)) == 1)
return ('EMBL')
return (NA_character_)
} # guessProteinIdentifierType
#------------------------------------------------------------------------------------------------------------------------
normalizeMatrices = function (matrices)
{
mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
invisible (mtx.normalized)
} # normalizeMatrices
#------------------------------------------------------------------------------------------------------------------------
assignGeneId = function (proteinId)
{
if (!exists ('id.uniprot.human')) {
tbl = toTable (org.Hs.egUNIPROT)
id.uniprot.human <<- as.character (tbl$gene_id)
names (id.uniprot.human) <<- tbl$uniprot_id
tbl = toTable (org.Hs.egREFSEQ)
tbl = tbl [grep ('^NP_', tbl$accession),]
id.refseq.human <<- as.character (tbl$gene_id)
names (id.refseq.human) <<- tbl$accession
tbl = toTable (org.Mm.egUNIPROT)
id.uniprot.mouse <<- as.character (tbl$gene_id)
names (id.uniprot.mouse) <<- tbl$uniprot_id
tbl = toTable (org.Mm.egREFSEQ)
tbl = tbl [grep ('^NP_', tbl$accession),]
id.refseq.mouse <<- as.character (tbl$gene_id)
names (id.refseq.mouse) <<- tbl$accession
}
proteinId = strsplit (proteinId, '\\.')[[1]][1] # remove any trailing '.*'
if (proteinId %in% names (id.uniprot.human))
return (list (geneId=as.character (id.uniprot.human [proteinId]), type='ENTREZ'))
if (proteinId %in% names (id.uniprot.mouse))
return (list (geneId=as.character (id.uniprot.mouse [proteinId]), type='ENTREZ'))
if (proteinId %in% names (id.refseq.human))
return (list (geneId=as.character (id.refseq.human [proteinId]), type='ENTREZ'))
if (proteinId %in% names (id.refseq.mouse))
return (list (geneId=as.character (id.refseq.mouse [proteinId]), type='ENTREZ'))
found.leading.Y = length (grep ("^Y", proteinId, perl=TRUE))
if (found.leading.Y == 1)
return (list (geneId=proteinId, type='SGD'))
return (list (geneId=NA_character_, type=NA_character_))
} # assignGeneId
#------------------------------------------------------------------------------------------------------------------------
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.