Nothing
# jaspar2014/import.R
#-----------------------------------------------------------------------------------
library (org.Hs.eg.db)
library (org.Mm.eg.db)
#------------------------------------------------------------------------------------------------------------------------
options (stringsAsFactors=FALSE)
printf <- function(...) print(noquote(sprintf(...)))
#------------------------------------------------------------------------------------------------------------------------
run = function (dataDir)
{
dataDir <- file.path(dataDir,"jaspar2014")
rawMatrixList <- readRawMatrices (dataDir)
matrices <- extractMatrices (rawMatrixList)
tbl.anno <- createAnnotationTable(dataDir)
tbl.md <- createMetadataTable (tbl.anno, matrices)
matrices <- normalizeMatrices (matrices)
matrices <- renameMatrices (matrices, tbl.md)
serializedFile <- file.path(dataDir, "jaspar2014.RData")
save (matrices, tbl.md, file=serializedFile)
printf("saved %d matrices to %s", length(matrices), serializedFile)
printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
} # run
#------------------------------------------------------------------------------------------------------------------------
readRawMatrices = function (dataDir)
{
# our convention is that there is a shared "dataDir" visible to
# the importer, and that within that directory there is one
# subdirectory for each data source.
# for this example importer, that directory will be <dataDir>/test
# within which we will look for one small file "sample.pcm"
filename <- file.path(dataDir, "matrix_data.txt")
stopifnot(file.exists(filename))
all.lines = scan (filename, what=character(0), sep='\n', quiet=TRUE)
title.lines = grep ('^>', all.lines)
title.line.count <<- length (title.lines)
max = title.line.count - 1
pwms = list ()
for (i in 1:max) {
start.line = title.lines [i]
end.line = title.lines [i+1] - 1
new.pwm = parsePwm (all.lines [start.line:end.line])
pwms = c (pwms, list (new.pwm))
} # for i
invisible (pwms)
} # readRawMatrices
#------------------------------------------------------------------------------------------------------------------------
extractMatrices = function (pwm.list)
{
matrices = sapply (pwm.list, function (element) element$matrix)
matrix.names <- sapply (pwm.list, function (element) element$title)
matrix.names <- sub("^>", "", matrix.names)
names (matrices) <- matrix.names
return (matrices)
} # extractMatrices
#------------------------------------------------------------------------------------------------------------------------
createAnnotationTable <- function(dataDir)
{
file <- file.path(dataDir, "MATRIX.txt")
tbl.matrix <- read.table(file, sep='\t', header=FALSE, as.is=TRUE)
colnames(tbl.matrix) <- c('id', 'category', 'mID', 'version', 'binder')
file <- file.path(dataDir, "MATRIX_PROTEIN.txt")
stopifnot(file.exists(file))
tbl.protein <- read.table(file, sep='\t', header=FALSE, as.is=TRUE)
colnames(tbl.protein) <- c('id', 'proteinID')
file <- file.path(dataDir, "MATRIX_SPECIES.txt")
stopifnot(file.exists(file))
tbl.species <- read.table(file, sep='\t', header=FALSE, as.is=TRUE)
colnames(tbl.species) <- c('id', 'speciesID')
file <- file.path(dataDir, "MATRIX_ANNOTATION.txt")
stopifnot(file.exists(file))
tbl.anno <- read.table(file, sep='\t', header=FALSE, as.is=TRUE, quote="")
colnames(tbl.anno) <- c('id', 'attribute', 'value')
file <- file.path(dataDir, "MATRIX_ANNOTATION.txt")
tbl.family <- subset(tbl.anno, attribute=='family') [, -2];
colnames(tbl.family) <- c('id', 'family')
# create 5 2-column data.frames out of tbl.anno
# which can all then be merged on the "id" column
tbl.tax <- subset(tbl.anno, attribute=='tax_group') [,-2];
colnames(tbl.tax) <- c('id', 'tax')
tbl.class <- subset(tbl.anno, attribute=='class') [,-2];
colnames(tbl.class) <- c('id', 'class')
tbl.comment <- subset(tbl.anno, attribute=='comment')[,-2];
colnames(tbl.comment) <- c('id', 'comment')
tbl.pubmed <- subset(tbl.anno, attribute=='medline')[,-2];
colnames(tbl.pubmed) <- c('id', 'pubmed')
tbl.type <- subset(tbl.anno, attribute=='type')[,-2];
colnames(tbl.type) <- c('id', 'type')
tbl.md <- merge(tbl.matrix, tbl.species, all.x=TRUE)
tbl.md <- merge(tbl.md, tbl.protein, all.x=TRUE)
tbl.md <- merge(tbl.md, tbl.family, all.x=TRUE)
tbl.md <- merge(tbl.md, tbl.tax, all.x=TRUE)
tbl.md <- merge(tbl.md, tbl.class, all.x=TRUE)
tbl.md <- merge(tbl.md, tbl.pubmed, all.x=TRUE)
tbl.md <- merge(tbl.md, tbl.type, all.x=TRUE)
fullID <- paste(tbl.md$mID, tbl.md$version, sep='.')
tbl.md <- cbind(fullID, tbl.md, stringsAsFactors=FALSE)
invisible(tbl.md)
} # createAnnotationTable
#-------------------------------------------------------------------------------
createMetadataTable = function (tbl.anno, matrices)
{
options (stringsAsFactors=FALSE)
tbl.md = data.frame ()
matrix.ids = names (matrices)
dataSource = 'JASPAR_2014'
for (matrix.id in matrix.ids) {
matrix = matrices [[matrix.id]]
tbl.sub = subset (tbl.anno, fullID==substr(matrix.id,0,8))
if (nrow (tbl.sub) > 1) {
# the binder is a dimer, perhaps a homodimer, and two proteinIDs are provided. Arnt::Ahr
# some others, a sampling: P05412;P01100, P08047, P22814;Q24040, EAW80806;EAW53510
dimer = paste (unique (tbl.sub$proteinID), collapse=';')
tbl.sub [1, 'proteinID'] = dimer
}
anno = as.list (tbl.sub [1,])
taxon.code = anno$speciesID
geneId.info = assignGeneId (anno$proteinID)
new.row = list (providerName=anno$binder,
providerId=anno$fullID,
dataSource=dataSource,
geneSymbol=anno$binder,
geneId=geneId.info$geneId,
geneIdType=geneId.info$type,
proteinId=anno$proteinID,
proteinIdType=guessProteinIdentifierType (anno$proteinID),
organism=convertTaxonCode(anno$speciesID),
sequenceCount=as.integer (mean (colSums (matrix))),
bindingSequence=NA_character_,
bindingDomain=anno$class,
tfFamily=anno$family,
experimentType=anno$type,
pubmedID=anno$pubmed)
tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
full.name = sprintf ('%s-%s-%s-%s', convertTaxonCode(anno$speciesID), dataSource, anno$binder, anno$fullID)
rownames (tbl.md) [nrow (tbl.md)] = full.name
} # for i
# Mmusculus-JASPAR_CORE-NF-kappaB-MA0061.1 has 'NA' for proteinID, not <NA>
NA.string.indices = grep ('NA', tbl.md$proteinId)
if (length (NA.string.indices) > 0)
tbl.md$proteinId [NA.string.indices] = NA
invisible (tbl.md)
}#createMetadataTable
#------------------------------------------------------------------------------------------------------------------------
renameMatrices = function (matrices, tbl.md)
{
stopifnot (length (matrices) == nrow (tbl.md))
names (matrices) = rownames (tbl.md)
invisible (matrices)
} # renameMatrices
#------------------------------------------------------------------------------------------------------------------------
# an empirical and not altogether trustworthy solution to identifying identifier types.
guessProteinIdentifierType = function (moleculeName)
{
if (nchar (moleculeName) == 0)
return (NA_character_)
if (is.na (moleculeName))
return (NA_character_)
first.char = substr (moleculeName, 1, 1)
if (first.char == 'Y')
return ('SGD')
if (first.char %in% c ('P', 'Q', 'O', 'A', 'C'))
return ('UNIPROT')
if (length (grep ('^NP_', moleculeName)) == 1)
return ('REFSEQ')
return (NA_character_)
} # guessProteinIdentifierType
#------------------------------------------------------------------------------------------------------------------------
normalizeMatrices = function (matrices)
{
mtx.normalized = sapply (matrices,
function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
invisible (mtx.normalized)
} # normalizeMatrices
#------------------------------------------------------------------------------------------------------------------------
parsePwm = function (text)
{
lines = strsplit (text, '\t')
#browser()
stopifnot(length(lines)==5) # title line, one line for each base
title = lines [[1]][1]
line.count = length(lines)
# expect 4 rows, and a number of columns we can discern from
# the incoming text.
cols <- length(lines[[2]])
result <- matrix (nrow=4, ncol=cols,
dimnames=list(c('A','C','G','T'),
as.character(1:cols)))
row = 1
for(i in 2:line.count){
result [row,] = as.numeric (lines[[i]])
row = row + 1
} # for i
#result = t (result)
#return (list (title=title, consensus.sequence=consensus.sequence, matrix=result))
return (list (title=title, matrix=result))
} # parsePwm
#----------------------------------------------------------------------------------------------------
#------------------------------------------------------------------------------------------------------------------------
# full names: ('Mus musculus', 'Rattus norvegicus', 'Rattus rattus', 'Arabidopsis thaliana', 'Pisum sativum',
# 'Nicotiana sylvestris', 'Petunia hybrida', 'Antirrhinum majus', 'Hordeum vulgare', 'Triticum aestivam',
# 'Zea mays', 'Saccharomyces cerevisiae', 'Caenorhabditis elegans', 'Drosophila melanogaster',
# 'Halocynthia roretzi', 'Vertebrata', 'Onchorhynchus mykiss', 'Xenopus laevis', 'Xenopus tropicalis',
# 'Gallus gallus', 'Homo sapiens', 'Bos taurus', 'Oryctolagus cuniculus'),
convertTaxonCode = function (ncbi.code)
{
#if (is.na (ncbi.code))
# return (NA_character_)
ncbi.code = as.character (ncbi.code)
if (ncbi.code %in% c ('-', NA_character_, 'NA'))
return ('Vertebrata')
tbl = data.frame (code= c('10090', '10116', '10117', '3702', '3888', '4094', '4102',
'4151', '4513', '4565', '4577', '4932', '6239', '7227', '7729',
'7742', '8022', '8355', '8364', '9031', '9606', '9913', '9986'),
name=c ('Mmusculus', 'Rnorvegicus', 'Rrattus', 'Athaliana', 'Psativum',
'Nsylvestris', 'Phybrida', 'Amajus', 'Hvulgare', 'Taestivam',
'Zmays', 'Scerevisiae', 'Celegans', 'Dmelanogaster',
'Hroretzi', 'Vertebrata', 'Omykiss', 'Xlaevis', 'Xtropicalis',
'Gallus', 'Hsapiens', 'Btaurus', 'Ocuniculus'),
stringsAsFactors=FALSE)
ncbi.code = as.character (ncbi.code)
index = which (tbl$code == ncbi.code)
if (length (index) == 1)
return (tbl$name [index])
if (nchar(ncbi.code)>6)
return ('Vertebrata')
else {
browser()
write (sprintf ("unable to map organism code |%s|", ncbi.code), stderr ())
return (NA_character_)
}
} # convertTaxonCode
#----------------------------------------------------------------------------------------------------
assignGeneId = function (proteinId)
{
if (!exists ('id.uniprot.human')) {
tbl = toTable (org.Hs.egUNIPROT)
id.uniprot.human <<- as.character (tbl$gene_id)
names (id.uniprot.human) <<- tbl$uniprot_id
tbl = toTable (org.Hs.egREFSEQ)
tbl = tbl [grep ('^NP_', tbl$accession),]
id.refseq.human <<- as.character (tbl$gene_id)
names (id.refseq.human) <<- tbl$accession
tbl = toTable (org.Mm.egUNIPROT)
id.uniprot.mouse <<- as.character (tbl$gene_id)
names (id.uniprot.mouse) <<- tbl$uniprot_id
tbl = toTable (org.Mm.egREFSEQ)
tbl = tbl [grep ('^NP_', tbl$accession),]
id.refseq.mouse <<- as.character (tbl$gene_id)
names (id.refseq.mouse) <<- tbl$accession
}
proteinId = strsplit (proteinId, '\\.')[[1]][1] # remove any trailing '.*'
if (proteinId %in% names (id.uniprot.human))
return (list (geneId=as.character (id.uniprot.human [proteinId]), type='ENTREZ'))
if (proteinId %in% names (id.uniprot.mouse))
return (list (geneId=as.character (id.uniprot.mouse [proteinId]), type='ENTREZ'))
if (proteinId %in% names (id.refseq.human))
return (list (geneId=as.character (id.refseq.human [proteinId]), type='ENTREZ'))
if (proteinId %in% names (id.refseq.mouse))
return (list (geneId=as.character (id.refseq.mouse [proteinId]), type='ENTREZ'))
found.leading.Y = length (grep ("^Y", proteinId, perl=TRUE))
if (found.leading.Y == 1)
return (list (geneId=proteinId, type='SGD'))
return (list (geneId=NA_character_, type=NA_character_))
} # assignGeneId
#-----------------------------------------------------------------------------
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.