# jolma2013/import.R
#------------------------------------------------------------------------------------------------------------------------
library (org.Hs.eg.db)
library (org.Mm.eg.db)
#------------------------------------------------------------------------------------------------------------------------
printf <- function(...) print(noquote(sprintf(...)))
#------------------------------------------------------------------------------------------------------------------------
run = function (dataDir)
{
dataDir <- file.path(dataDir, "jolma2013")
x <- readRawMatrices(dataDir)
matrices <- x$matrices
tbl.mAnno <- x$anno
tbl.mAnno$symbol[tbl.mAnno$symbol=="Tp53"] <- "Trp53"
tbl.mAnno$symbol[tbl.mAnno$symbol=="Tp73"] <- "Trp73"
tbl.anno <- readRawAnnotationTable(dataDir)
mouse.syms <- subset(tbl.anno, organism=="mouse")$symbol
organism <- rep("Hsapiens", nrow(tbl.mAnno))
mouse.indices <- which(names(matrices) %in% mouse.syms)
organism[mouse.indices] <- "Mmusculus"
tbl.mAnno <- cbind(tbl.mAnno, organism, stringsAsFactors=FALSE)
tbl.md <- createMetadataTable(tbl.mAnno, matrices)
rownames(tbl.md) <- tbl.md$providerName
matrices <- normalizeMatrices(matrices)
names(matrices) <- tbl.md$providerName
serializedFile <- "jolma2013.RData"
save (matrices, tbl.md, file=serializedFile)
printf("saved %d matrices to %s", length(matrices), serializedFile)
printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, and rebuild package", serializedFile)
browser("rr")
} # run
#-------------------------------------------------------------------------------
normalizeMatrices = function (matrices)
{
mtx.normalized = sapply (matrices, function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
invisible (mtx.normalized)
} # normalizeMatrices
#------------------------------------------------------------------------------------------------------------------------
createMetadataTable = function (tbl.mAnno, matrices)
{
options (stringsAsFactors=FALSE)
tbl.md = data.frame ()
dataSource = 'Jolma2013'
provider.names <- createProviderNames(tbl.mAnno)
stopifnot(length(which(duplicated(provider.names))) == 0)
stopifnot(nrow(tbl.mAnno) == length(matrices))
size <- length(provider.names)
tbl <- data.frame(list(providerName=vector("character", size),
providerId=vector("character", size),
dataSource=vector("character",size),
geneSymbol=vector("character",size),
geneId=vector("character",size),
geneIdType=vector("character",size),
proteinId=vector("character",size),
proteinIdType=vector("character",size),
organism=vector("character",size),
sequenceCount=vector("integer", size),
bindingSequence=vector("character",size),
bindingDomain=vector("character",size),
tfFamily=vector("character",size),
experimentType=vector("character",size),
pubmedID=vector("character",size)))
#rownames(tbl) <- provider.names
geneIDs <- determineGeneIDs(tbl.mAnno$symbol)
counts = as.integer (sapply (matrices, function (mtx) as.integer (mean (round (colSums (mtx))))))
for (r in 1:length(provider.names)){
tbl[r, "providerName"] <- provider.names[r]
tbl[r, "providerId"] <- tbl.mAnno$symbol[r]
tbl[r, "dataSource"] <- "jolma2013"
tbl[r, "geneSymbol"] <- tbl.mAnno$symbol[r]
tbl[r, "geneId"] <- geneIDs[r]
tbl[r, "geneIdType"] <- "ENTREZ"
tbl[r, "proteinId"] <- NA_character_
tbl[r, "proteinIdType"] <- NA_character_
tbl[r, "organism"] <- tbl.mAnno$organism[r]
tbl[r, "sequenceCount"] <- counts[r]
tbl[r, "bindingSequence"] <- tbl.mAnno$sequence[r]
tbl[r, "bindingDomain"] <- NA_character_
tbl[r, "tfFamily"] <- tbl.mAnno$family[r]
tbl[r, "experimentType"] <- "SELEX"
tbl[r, "pubmedID"] <- "23332764"
} # for r
invisible (tbl)
} # createMetadataTable
##------------------------------------------------------------------------------------------------------------------------
readRawMatrices = function (dataDir)
{
data.file <- file.path(dataDir, "TableS3-PWM-models.tsv")
tbl.raw <- read.table(data.file, skip=15, header=FALSE, as.is=TRUE, fill=TRUE, sep="\t")
# the tsv file, exported from the excel spreadsheet, \r converted to \n,
# has an unconventional structure, with 5 rows per pwm, including 1 header line
# followed by by 4 rows of base counts, in which the first column is A,C,T or G
# each header line has 10 columns, apparently these
# symbol family clone type ligand sequence batch Seed multinomial cycle
# BCL6B C2H2 DBD TGCGGG20NGA AC TGCTTTCTAGGAATTMM 2 4 monomeric yes
# CTCF C2H2 full TAGCGA20NGCT AJ NGCGCCMYCTAGYGGTN 2 4 monomeric yes
# EGR1 C2H2 DBD TCTCTT20NGA Y NMCGCCCMCGCANN 2 2 monomeric yes
# EGR1 C2H2 full TACTAT20NATC AA NACGCCCACGCANN 2 4 monomeric no
# EGR2 C2H2 DBD TCGGCC20NGA W MCGCCCACGCA 2 3 monomeric no
# though there are some left-over column titles. don't know quite what to make of them
# site:
# type:
# comment:
# Matrix is one of the representative PWMs:
matrix.start.rows <- seq(3, nrow(tbl.raw), 5)
tbl.anno <- tbl.raw[matrix.start.rows, 1:10]
colnames(tbl.anno) <- c("symbol", "family", "clone", "type", "ligand",
"sequence", "batch", "Seed", "multinomial",
"cycle")
tbl.indices <- data.frame(start = matrix.start.rows + 1, end = matrix.start.rows + 4)
matrix.list = apply(tbl.indices, 1, function(x) tbl.raw[x[1]:x[2],])
names(matrix.list) = tbl.anno$symbol
matrix.list = lapply(matrix.list, function(mtx) {
tmp = mtx[,-1] # drop the ACTG column
# identify empty or NA columns
columns.to.discard = apply(tmp, 2, function(x) all(is.na(x) | all(x == "")))
tmp = tmp[, ! columns.to.discard]
tmp2 = as.numeric(unlist(tmp))
tmp = matrix(tmp2, nrow = 4, ncol = length(tmp2)/4)
rownames(tmp) = mtx[,1]
colnames(tmp) = 1:ncol(tmp)
tmp
})
unique.ids <- apply(tbl.anno, 1, function(s) paste(s, collapse="."))
#names(matrix.list) <- unique.ids
# insert some naming repairs discovered by diego diez
tbl.anno$symbol[tbl.anno$symbol=="Tp53"] <- "Trp53"
tbl.anno$symbol[tbl.anno$symbol=="Tp73"] <- "Trp73"
invisible (list(matrices=matrix.list, anno=tbl.anno))
} # readRawMatrices
#------------------------------------------------------------------------------------------------------------------------
readRawAnnotationTable <- function(dataDir)
{
data.file <- file.path(dataDir, "Cell_2013_Jolma_annotations.tsv")
tbl.raw <- read.table(data.file, header=TRUE, as.is=TRUE, fill=TRUE, sep="\t")
# no need for the protein sequence
tbl.raw <- tbl.raw[,-3]
# insert some naming repairs discovered by diego diez
tbl.raw$symbol[tbl.raw$symbol=="Egr1_E410D_FARSDERtoFARSDDR"] <- "Egr1"
invisible(tbl.raw)
} # readRawAnnotationTable
#------------------------------------------------------------------------------------------------------------------------
determineGeneIDs <- function(symbols)
{
geneIDs <- mget(symbols, org.Hs.egSYMBOL2EG, ifnotfound=NA)
failures <- unique(names(which(is.na(geneIDs))))
geneIDs <- geneIDs[which(!is.na(geneIDs))]
alias.failures <- c()
if(length(failures) > 0){ #
aliasGeneIDs <- mget(failures, org.Hs.egALIAS2EG, ifnotfound=NA)
alias.failures <- names(which(is.na(aliasGeneIDs)))
alias.success <- aliasGeneIDs[which(!is.na(aliasGeneIDs))]
if(length(alias.success) > 0)
geneIDs <- c(geneIDs, alias.success)
if(length(alias.failures) > 0){
mouseGeneIDs <- mget(alias.failures, org.Mm.egSYMBOL2EG, ifnotfound=NA)
mouse.failures <- names(which(is.na(mouseGeneIDs)))
mouse.success <- mouseGeneIDs[which(!is.na(aliasGeneIDs))]
if(length(mouse.success) > 0)
geneIDs <- c(geneIDs, mouse.success)
if(length(mouse.failures) > 0){
names(mouse.failures) <- mouse.failures
mouse.failures [mouse.failures] <- NA
geneIDs <- c(geneIDs, mouse.failures)
} # if mouse.failures
} # if alias.failures
}# if failures
result <- as.character(sapply (symbols,
function(sym) {target <- sprintf("^%s$", toupper(sym));
as.character(geneIDs[grep(target, names(geneIDs))])[1]}))
names(result) <- symbols
result
} # determineGeneIDs
#-------------------------------------------------------------------------------
createProviderNames <- function(tbl.mAnno)
{
x <- as.character(apply(tbl.mAnno, 1,
function(row) paste(c(row[11], "jolma2013", row[c(1)]), collapse="-")))
dups <- which(duplicated(x))
# printf("incoming dups count: %d", length(dups))
suffix <- 1
while(length(dups) > 0) {
suffix <- suffix + 1
fixed.names <- as.character(sapply(x[dups],
function(dup) paste(c(dup, suffix), collapse="-")))
x[dups] <- fixed.names
dups <- which(duplicated(x))
} # while dups
x <- sub("2$", "2", x)
x <- sub("2-3$", "3", x)
x <- sub("2-3-4$", "4", x)
x <- sub("2-3-4-5$", "5", x)
x <- sub("2-3-4-5-6$", "6", x)
x <- sub("2-3-4-5-6-7$", "7", x)
x <- sub("2-3-4-5-6-7-8$", "8", x)
x <- sub("2-3-4-5-6-7-8-9$", "9", x)
x
} # createProviderNames
#-------------------------------------------------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.