Nothing
# uniprobe/import.R
#------------------------------------------------------------------------------------------------------------------------
#library (RMySQL)
library (RSQLite)
library (org.Hs.eg.db)
library (org.Mm.eg.db)
library (org.Sc.sgd.db)
library (org.Ce.eg.db)
library(tools) # for md5sum
#------------------------------------------------------------------------------------------------------------------------
printf <- function(...) print(noquote(sprintf(...)))
#------------------------------------------------------------------------------------------------------------------------
run = function (dataDir)
{
dataDir <- file.path(dataDir, "uniprobe")
stopifnot(file.exists(dataDir))
all.files = identifyFiles (file.path(dataDir, 'All_PWMs'))
matrices = readAndParse (all.files)
tbl.pubRef = createPublicationRefTable ()
tbl.geneRef = createGeneRefTable (dataDir)
tbl.md = createMetadata (matrices, tbl.pubRef, tbl.geneRef)
stopifnot (length (matrices) == nrow (tbl.md))
matrices = renameMatrices (matrices, tbl.md)
serializedFile <- "uniprobe.RData"
save (matrices, tbl.md, file=serializedFile)
printf("saved %d matrices to %s", length(matrices), serializedFile)
printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
} # run
#------------------------------------------------------------------------------------------------------------------------
createMatrixNameUniqifier = function (matrix)
{
temporary.file <- tempfile ()
write (as.character (matrix), file=temporary.file)
md5sum.string <- as.character (md5sum (temporary.file))
stopifnot (nchar (md5sum.string) == 32)
md5.6chars = substr (md5sum.string, 29, 32)
#unlink (temporary.file)
} # createMatrixNameUniqifier
#------------------------------------------------------------------------------------------------------------------------
parsePWMfromText = function (lines.of.text)
{
z = lines.of.text
stopifnot (sum (sapply (c ('A', 'C', 'T', 'G'), function (token) length (grep (token, lines.of.text)))) == 4)
# determine the number of columns
token.count.first.line = length (strsplit (lines.of.text [1], '\t')[[1]])
column.count = token.count.first.line - 1 # subtract off the 'A:\t' token
result = matrix (nrow=4, ncol=column.count, byrow=TRUE, dimnames=list (c ('A', 'C', 'G', 'T'), 1:column.count))
for (line in lines.of.text) {
tokens = strsplit (line, '\\s*[:\\|]') [[1]]
nucleotide = tokens [1]
numbers.raw = tokens [2]
number.tokens = strsplit (numbers.raw, '\\s+', perl=T)[[1]]
while (nchar (number.tokens [1]) == 0)
number.tokens = number.tokens [-1]
numbers = as.numeric (number.tokens)
#printf ('adding %s: %s', nucleotide, list.to.string (numbers))
result [nucleotide,] = numbers
}
return (result)
} # parsePWMfromText
#------------------------------------------------------------------------------------------------------------------------
extractPWMfromFile = function (filename)
{
text = scan (filename, sep='\n', what=character (0), quiet=TRUE)
matrix.start.lines = grep ('A\\s*[:\\|]', text)
stopifnot (length (matrix.start.lines) == 1)
#printf ('%50s: %s', filename, list.to.string (matrix.start.lines))
start.line = matrix.start.lines [1]
end.line = start.line + 3
lines = text [start.line:end.line]
pwm.matrix = parsePWMfromText (lines)
return (pwm.matrix)
} # extractPWMfromFile
#------------------------------------------------------------------------------------------------------------------------
createPublicationRefTable = function ()
{
options (stringsAsFactors=FALSE)
tbl.ref = data.frame (folder=c('CR09', 'Cell08', 'Cell09', 'EMBO10', 'GD09', 'GR09', 'MMB08', 'NBT06', 'PNAS08', 'SCI09'))
tbl.ref = cbind (tbl.ref, author=c('Scharer', 'Berger', 'Grove', 'Wei', 'Lesch', 'Zhu', 'Pompeani', 'Berger', 'De Silva', 'Badis'))
tbl.ref = cbind (tbl.ref, pmid=c('19147588','18585359','19632181','20517297','19204119','19158363','18681939','16998473','18541913','19443739'))
tbl.ref = cbind (tbl.ref, organism=c('Hsapiens', 'Mmusculus', 'Celegans', 'Mmusculus', 'Celegans', 'Scerevisiae', 'Vharveyi',
'Scerevisiae;Hsapiens;Mmusculus', 'Apicomplexa', 'Mmusculus'))
tbl.ref = cbind (tbl.ref, count=c(1, 168, 34, 25, 1, 89, 1, 5, 3, 104))
titles = c ('Genome-wide promoter analysis of the SOX4 transcriptional network in prostate cancer cells',
'Variation in homeodomain DNA binding revealed by high-resolution analysis of sequence preferences',
'A Multiparameter Network Reveals Extensive Divergence between C. elegans bHLH Transcription Factors',
'Genome-wide analysis of ETS-family DNA-binding in vitro and in vivo',
'Transcriptional regulation and stabilization of left right neuronal identity in C. elegans',
'High-resolution DNA-binding specificity analysis of yeast transcription factors',
'The Vibrio harveyi master quorum-sensing regulator, LuxR...',
'Compact, universal DNA microarrays...',
'Specific DNA-binding by Apicomplexan AP2 transcription factors',
'Diversity and complexity in DNA recognition by transcription factors')
tbl.ref = cbind (tbl.ref, title=titles)
tbl.ref
} # createPublicationRefTable
#------------------------------------------------------------------------------------------------------------------------
identifyFiles = function (filePath)
{
stopifnot(file.exists(filePath))
cmd = sprintf ('find %s -name "*pwm*"', filePath)
files.raw = system (cmd, intern=TRUE)
# all legit files end in ".pwm" or ".txt"
files = c (grep (".pwm$", files.raw, value=T, ignore.case=T),
grep (".txt$", files.raw, value=T, ignore.case=T))
reverseComplementMatrices = grep ('RC', files)
if (length (reverseComplementMatrices) > 0)
files = files [-reverseComplementMatrices]
# don't know why these were excluded. leave them in for now
embo10.excluders = grep ('EMBO', files)
if (length (embo10.excluders) > 0)
files = files [-embo10.excluders]
cell09.excluders = grep ('Cell09', files) # include these once the sql tables are updated
if (length (cell09.excluders) > 0)
files = files [-cell09.excluders]
secondary.excluders = grep ('secondary', files)
if (length (secondary.excluders) > 0)
files = files [-secondary.excluders]
invisible (files)
} # identifyFiles
#------------------------------------------------------------------------------------------------------------------------
readAndParse = function (file.list)
{
matrices = list ()
for (file in file.list) {
#printf ('read and parse %s', file)
text = scan (file, sep='\n', what=character (0), quiet=TRUE)
matrix.start.lines = grep ('A:', text)
stopifnot (length (matrix.start.lines) == 1)
start.line = matrix.start.lines [1]
end.line = start.line + 3
lines.of.text = text [start.line:end.line]
pwm.matrix = parsePWMfromText (lines.of.text)
name.tokens <- strsplit(file,"/")[[1]]
token.count <- length(name.tokens)
matrix.name <- paste(name.tokens[(token.count-1):token.count], collapse="/")
matrices [[matrix.name]] = pwm.matrix
}
invisible (matrices)
} # readAndParse
#------------------------------------------------------------------------------------------------------------------------
# eg, ./All_PWMs/GD09/Nsy-7.pwm to simply 'Nsy-7'
#
translateFileNameToGeneName = function (long.name)
{
if (length (grep ('/', long.name)) > 0) {
tokens = strsplit (long.name, '/')[[1]]
count = length (tokens)
gene.name.raw = tokens [count] # get the last one
}
else {
gene.name.raw = long.name
}
gene.name = strsplit (gene.name.raw, '\\.')[[1]][1] # remove any file suffix
gene.name
} # test.translateFileNameToGeneName
#------------------------------------------------------------------------------------------------------------------------
# and update matrix names
createMetadata = function (matrices, tbl.pubRef, tbl.geneRef)
{
options (stringsAsFactors=FALSE)
dataSource = 'UniPROBE'
trim = function (s) {sub (' +$', '', sub ('^ +', '', s))}
tbl.md = data.frame ()
removers = list (Cart1=110, Cutl1=115, Hoxa7=156, Irx3=185)
for (m in 1: length (matrices)) {
matrix.name = names (matrices) [m]
#printf ('%d: %s', m, matrix.name)
native.name.raw = gsub ('All_PWMs/', '', matrix.name)
native.name = extractNativeNames (native.name.raw)
# extractNativeNames needs to be rethought. but for now, just hack in some fixes
if (native.name == 'Cgd2') native.name='Cgd2_3490' # inconsistency at uniprobe, accomodated here
if (native.name == 'PF14') native.name='PF14_0633'
if (native.name == 'Uncx4') native.name='Uncx4.1'
if (!native.name %in% tbl.geneRef$name) {
browser (text='native.name not in tbl.geneRef')
}
experiment.folder = strsplit (native.name.raw, '/')[[1]][1]
up.id.number = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$id
# todo BUG here!
full.uniprobe.id = sprintf ('UP%05d', up.id.number)
if (length (full.uniprobe.id) != 1) {
browser (text='full.uniprobe.id has wrong length')
}
geneId = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$stdID
if (is.na (geneId) | geneId == '')
geneId = NA_character_
bindingDomain = trim (subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$domain)
organism = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$species
# a native name (a gene symbol) may have a dash, but for the long name, we want dashes to separate
# the organism-dataSource-geneIdentifier matrix name.
# so covert this here, but do not eclipse any dash-including native.names
native.name.no.dashes = gsub ('-', '_', native.name)
native.name.uniq = sprintf ('%s-%s-%s.%s', organism, dataSource, native.name.no.dashes, full.uniprobe.id)
if (native.name.uniq %in% rownames (tbl.md)) {
matrix = matrices [[m]]
uniqifier = createMatrixNameUniqifier (matrix)
#printf ('before, not unique: %s', native.name.uniq)
native.name.uniq = paste (native.name.uniq, uniqifier, sep='.')
#printf ('after, not unique: %s', native.name.uniq)
}
sequenceCount = NA_integer_
bindingSequence = NA_character_
tfFamily = NA_character_
experimentType = 'protein binding microarray'
pubmedID = subset (tbl.pubRef, folder==experiment.folder)$pmid
#printf ('%12s: %20s', native.name, organism)
if (is.na (geneId))
geneIdType = NA
else if (organism == 'Scerevisiae')
geneIdType = 'SGD'
else if (organism %in% c ('Mmusculus', 'Hsapiens', 'Celegans'))
geneIdType = 'ENTREZ'
else
geneIdType = 'todo'
proteinId = NA_character_
proteinIdType = NA_character_
proteinId.tmp = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$uniprot
if (!is.na (proteinId.tmp) & nchar (proteinId.tmp) > 1) {
#printf ('getting uniprot id for %s: %s', native.name, proteinId.tmp)
proteinId = proteinId.tmp
proteinIdType = 'UNIPROT'
} # found good id in the uniprot column
if (is.na (proteinId.tmp) | nchar (proteinId.tmp) == 0) {
proteinId.tmp = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$refseq_id
if (!is.na (proteinId.tmp) & nchar (proteinId.tmp) > 1) {
#printf ('getting refseq id for %s: %s', native.name, proteinId.tmp)
proteinId = proteinId.tmp
proteinIdType = 'REFSEQ'
}
} # need to look in the refseq column
bindingSequence = subset (tbl.geneRef, name==native.name & folder_name==experiment.folder)$bindingSequence
#printf ('%12s: %40s', native.name, bindingSequence)
new.row = list (providerName=native.name.raw,
providerId=full.uniprobe.id,
dataSource=dataSource,
geneSymbol=native.name,
geneId=geneId,
geneIdType=geneIdType,
proteinId=proteinId,
proteinIdType=proteinIdType,
organism=organism,
sequenceCount=NA,
bindingSequence=bindingSequence,
bindingDomain=bindingDomain,
tfFamily=tfFamily,
experimentType=experimentType,
pubmedID=pubmedID)
if (native.name.uniq %in% rownames (tbl.md)) browser (text='dup row name')
tbl.md = rbind (tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
if (length (native.name.uniq) != 1) browser (text='native.name.unique length != 1')
rownames (tbl.md) [m] = native.name.uniq
}
invisible (tbl.md)
} # createMetadata
#------------------------------------------------------------------------------------------------------------------------
extractNativeNames = function (native.names.raw)
{
name.count = length (native.names.raw)
result = vector ('character', name.count)
for (i in 1:name.count) {
native.name.raw = native.names.raw [i]
tokens = strsplit (native.name.raw, '/') [[1]]
count = length (tokens)
cooked.1 = native.name.raw
if (count > 1)
cooked.1 = tokens [length (tokens)]
tokens = strsplit (cooked.1, '[_\\.]')[[1]]
cooked.2 = tokens [1]
result [i] = cooked.2
}
invisible (result)
} # extractNativeNames
#------------------------------------------------------------------------------------------------------------------------
# 'standard' is usually entrez gene ID. for yeast it is orf. for fly ...
uniprotToStandardID = function (organism, uniprot.ids)
{
# uniprot.ids = unique (uniprot.ids)
if (!exists ('lst.yeast.uniprot')) {
tbl.tmp = toTable (org.Sc.sgdUNIPROT)
yeast.uniprot <<- tbl.tmp$systematic_name
names (yeast.uniprot) <<- tbl.tmp$uniprot_id
}
if (!exists ('mouse.uniprot')) {
tbl.tmp = toTable (org.Mm.egUNIPROT)
mouse.uniprot <<- tbl.tmp$gene_id
names (mouse.uniprot) <<- tbl.tmp$uniprot_id
}
if (!exists ('human.uniprot')) {
tbl.tmp = toTable (org.Hs.egUNIPROT)
human.uniprot <<- tbl.tmp$gene_id
names (human.uniprot) <<- tbl.tmp$uniprot_id
}
if (!exists ('worm.uniprot')) {
tbl.tmp = toTable (org.Ce.egUNIPROT)
worm.uniprot <<- tbl.tmp$gene_id
names (worm.uniprot) <<- tbl.tmp$uniprot_id
}
organism = unique (organism)
stopifnot (length (unique (organism)) == 1)
stopifnot (organism %in% (c ('human', 'mouse', 'yeast', 'worm')))
if (organism == 'human') {
ids = human.uniprot [uniprot.ids]
}
else if (organism == 'mouse') {
ids = mouse.uniprot [uniprot.ids]
}
else if (organism == 'yeast') {
ids = yeast.uniprot [uniprot.ids]
}
else if (organism == 'worm') {
ids = worm.uniprot [uniprot.ids]
}
# embarrassing but true: could not get this to work by creating a list directly
# round-about solution: make a 1-column data.frame, then construct a named list from that
df = data.frame (uniprot=uniprot.ids, std=as.character (ids), stringsAsFactors=FALSE)
#rownames (df) = uniprot.ids
#result = df$stdID
#names (result) = uniprot.ids
return (df)
} # uniprotToStandardID
#------------------------------------------------------------------------------------------------------------------------
# see ~/v/snotes/log "* load uniprobe sql dump file (11 may 2012)" for info on the uniprobe mysql database
# used here.
createGeneRefTable = function (dataDir)
{
if (!exists ('db')){
dbFile <- file.path(dataDir, "uniprobe.sqlite")
stopifnot(file.exists(dbFile))
db <<- dbConnect (dbDriver("SQLite"), dbFile)
}
tbl.pubmed = data.frame (folder=c('CR09', 'Cell08', 'Cell09', 'EMBO10', 'GD09', 'GR09', 'MMB08', 'NBT06', 'PNAS08', 'SCI09'),
pmid=c('19147588', '18585359', '19632181', '20517297', '19204119', '19158363', '18681939', '16998473', '18541913', '19443739'),
stringsAsFactors=FALSE)
#CR09 1 Scharer 19147588 human Genome-wide promoter analysis of the SOX4 transcriptional
# network in prostate cancer cells
#Cell08 168 Berger 18585359 mouse Variation in homeodomain DNA binding revealed by high-resolution
# analysis of sequence preferences
#Cell09 34 Grove 19632181 worm A Multiparameter Network Reveals Extensive Divergence between
# C. elegans bHLH Transcription Factors
#EMBO10 25 Wei 20517297 mouse Genome-wide analysis of ETS-family DNA-binding in vitro and in vivo
#GD09 1 Lesch 19204119 worm Transcriptional regulation and stabilization of left right neuronal
# identity in C. elegans
#GR09 89 Zhu 19158363 yeast High-resolution DNA-binding specificity analysis of yeast transcription factors
#MMB08 1 Pompeani 18681939 Vibrio The Vibrio harveyi master quorum-sensing regulator, LuxR...
#NBT06 5 Berger 16998473 yeast;human;mouse Compact, universal DNA microarrays...
#PNAS08 3 De Silva 18541913 apicomplexa Specific DNA-binding by Apicomplexan AP2 transcription factors
#SCI09 104 Badis 19443739 mouse Diversity and complexity in DNA recognition by transcription factors
tbl.gene = dbGetQuery (db, 'select * from gene_ids_public')
colnames (tbl.gene) = c ('id', 'name', 'species', 'pub', 'type')
tbl.genomic = dbGetQuery (db, 'select * from genomic_info') [, -c(2,3,8:11)] # remove the longish 'description' field
tbl.pub = dbGetQuery (db, 'select * from publication_ids') [,-3] # remove 'full_ref' for legibility
tbl = merge (tbl.gene, tbl.pub [, c (1,4)], by.x='pub', by.y='publication_id', all.x=TRUE)
tbl = merge (tbl, tbl.pubmed, by.x='folder_name', by.y='folder', all.x=TRUE)
tbl = merge (tbl, tbl.genomic, all.x=TRUE, by.x='name', by.y='gene_name')
redundant.column = grep ('species.y', colnames (tbl))
stopifnot (length (redundant.column) == 1)
tbl = tbl [, -redundant.column]
column.to.rename = grep ('species.x', colnames (tbl))
stopifnot (length (column.to.rename) == 1)
colnames (tbl) [column.to.rename] = 'species'
# now add 'standard IDs' -- orf names for yeast, entrez geneIDs for everything else
stdID = getAllStandardIDs (tbl)
tbl = cbind (tbl, stdID)
duplicates = which (duplicated (tbl [, c (1,2)]))
if (length (duplicates) > 0)
tbl = tbl [-duplicates,]
# fix the organism (species) name, from (e.g.) "Homo sapiens" to "Hsapiens"
tbl$species = standardizeSpeciesNames (tbl$species)
invisible (tbl)
# now add bindingSequence, from DBDs: gene_name + seq, apparently the binding sequence when known, of 171 of 372 genes
tbl.dbds = dbGetQuery (db, 'select * from DBDs')
dbds.list = tbl.dbds$seq
names (dbds.list) = tbl.dbds$gene_name
bindingSequence = rep (NA_character_, nrow (tbl))
names (bindingSequence) = tbl$name
shared.names = unique (intersect (tbl.dbds$gene_name, tbl$name))
bindingSequence [shared.names] = dbds.list [shared.names]
tbl = cbind (tbl, bindingSequence)
invisible (tbl)
} # createGeneRefTable
#------------------------------------------------------------------------------------------------------------------------
# make successive species-specific calls to 'uniprotToStandardID', assembling a new column to be added to tbl.geneRef
getAllStandardIDs = function (tbl.geneRef)
{
mouse.rows = grep ('Mus musculus', tbl.geneRef$species)
yeast.rows = grep ('Saccharomyces cerevisiae', tbl.geneRef$species)
human.rows = grep ('Homo sapiens', tbl.geneRef$species)
worm.rows = grep ('Caenorhabditis elegans', tbl.geneRef$species)
stdID = rep (NA_character_, nrow (tbl.geneRef))
mouse.uids = tbl.geneRef$uniprot [mouse.rows]
yeast.uids = tbl.geneRef$uniprot [yeast.rows]
human.uids = tbl.geneRef$uniprot [human.rows]
worm.uids = tbl.geneRef$uniprot [worm.rows]
# add mouse geneIDs
tbl.mouse = uniprotToStandardID ('mouse', mouse.uids)
stopifnot (length (mouse.rows) == nrow (tbl.mouse))
stdID [mouse.rows] = tbl.mouse$std
# add yeast orfs
tbl.yeast = uniprotToStandardID ('yeast', yeast.uids)
stopifnot (length (yeast.rows) == nrow (tbl.yeast))
stdID [yeast.rows] = tbl.yeast$std
# add human geneIDs
tbl.human = uniprotToStandardID ('human', human.uids)
stopifnot (length (human.rows) == nrow (tbl.human))
stdID [human.rows] = tbl.human$std
# add worm geneIDs
tbl.worm = uniprotToStandardID ('worm', worm.uids)
stopifnot (length (worm.rows) == nrow (tbl.worm))
stdID [worm.rows] = tbl.worm$std
invisible (stdID)
} # getAllStandardIDs
#------------------------------------------------------------------------------------------------------------------------
# change, e.g., "Homo sapiens" to "Hsapiens"
standardizeSpeciesNames = function (names)
{
fix = function (name) {
tokens = strsplit (name, ' ')[[1]]
stopifnot (length (tokens) == 2)
genus = tokens [1]
species = tokens [2]
return (paste (substr (genus, 1, 1), species, sep=''))
}
fixed.names = as.character (sapply (names, fix))
invisible (fixed.names)
} # standardizeSpeciesNames
#------------------------------------------------------------------------------------------------------------------------
renameMatrices = function (matrices, tbl.md)
{
stopifnot (length (matrices) == nrow (tbl.md))
names (matrices) = rownames (tbl.md)
invisible (matrices)
} # renameMatrices
#------------------------------------------------------------------------------------------------------------------------
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.