Nothing
# hocomoco.file <- "HOCOMOCOv11_core_HUMAN_mono_jaspar_format.txt"
# metadata.file <- "HOCOMOCOv11_core_annotation_HUMAN_mono.tsv"
# outputFile <- "hocomocov11-core.RData"
hocomoco.file <- "HOCOMOCOv11_full_HUMAN_mono_jaspar_format.txt"
metadata.file <- "HOCOMOCOv11_full_annotation_HUMAN_mono.tsv"
outputFile <- "hocomocov11-secondary.RData"
# MotifDb/inst/scripes/import/HOCOMOCO/import.R
#------------------------------------------------------------------------------------------------------------------------
library(RUnit)
library(RCurl)
#------------------------------------------------------------------------------------------------------------------------
runTests <- function()
{
test_readRawMatrices()
test_extractMatrices()
test_createMetaDataTable()
test_renameMatrices()
test_normalizeMatrices()
} # runTests
#------------------------------------------------------------------------------------------------------------------------
readRawMatrices <- function (dataDir, count=-1)
{
# our convention is that there is a shared "dataDir" visible to
# the importer, and that within that directory there is one
# subdirectory for each data source.
# for this example importer, that directory will be <dataDir>/test
# within which we will look for one small file "sample.pcm"
filename <- file.path(dataDir, hocomoco.file)
stopifnot(file.exists(filename))
all.lines <- scan(filename, what=character(0), sep='\n', quiet=TRUE)
title.lines <- grep('^>', all.lines)
title.line.count <- length(title.lines)
max <- title.line.count - 1
if(count > 0)
max <- count
pwms <- list ()
for (i in 1:max){ # loop through all motifs in the matrix file, one motif at a time
start.line <- title.lines[i]
end.line <- title.lines[i+1] - 1
new.pwm <- parsePwm(all.lines [start.line:end.line])
pwms <- c(pwms, list(new.pwm))
} # for i
invisible (pwms)
} # readRawMatrices
#------------------------------------------------------------------------------------------------------------------------
test_readRawMatrices <- function()
{
printf("--- test_readRawMatrices")
pwms <- readRawMatrices("./", count=3)
checkEquals(length(pwms), 3)
pwm.1 <- pwms[[1]]
checkEquals(pwm.1$title, "AHR_HUMAN.H11MO.0.B")
mtx <- pwm.1$matrix
checkEquals(dim(mtx), c(4, 9))
checkEquals(rownames(mtx), c("A", "C", "G", "T"))
checkEquals(sum(mtx), 1386)
} # test_readRawMatrices
#------------------------------------------------------------------------------------------------------------------------
# simple operation: for each element in the pwm.list, create an intem in a new list where
# rather than a sublist of title and matrix, each element is simply a matrix, with the element named
extractMatrices <- function(pwm.list)
{
matrices <- sapply(pwm.list, function (element) element$matrix)
matrix.names <- sapply(pwm.list, function (element) element$title)
matrix.names <- sub("^> ", "", matrix.names)
names(matrices) <- matrix.names
matrices
} # extractMatrices
#------------------------------------------------------------------------------------------------------------------------
test_extractMatrices <- function()
{
printf("--- test_extractMatrices")
pwms <- readRawMatrices("./", count=3)
matrices <- extractMatrices(pwms)
checkEquals(length(matrices), 3)
checkEquals(names(matrices), c("AHR_HUMAN.H11MO.0.B", "AIRE_HUMAN.H11MO.0.C", "ALX1_HUMAN.H11MO.0.B"))
} # test_extractMatrices
#------------------------------------------------------------------------------------------------------------------------
createMetaDataTable <- function(dataDir, matrices, raw.metadata.filename)
{
filename <- file.path(dataDir, raw.metadata.filename)
#printf("checking for readable metadata file:")
#printf(" %s", filename)
stopifnot(file.exists(filename))
tbl.raw <- read.table(filename, sep="\t", header=TRUE, as.is=TRUE)
tbl.md <- data.frame ()
matrix.ids <- names(matrices)
for (matrix.id in matrix.ids) {
tokens <- strsplit(matrix.id, ".", fixed=TRUE)[[1]]
reliability.score <- tokens[length(tokens)]
matrix <- matrices[[matrix.id]]
geneSymbol <- sub("_HUMAN.*$", "", matrix.id)
tbl.sub <- subset(tbl.raw, Model==matrix.id)
entrez.gene <- tbl.sub$EntrezGene[1]
uniprot.id <- tbl.sub$UniProt.ID[1]
tf.family <- paste(tbl.sub$TF.family[1], tbl.sub$TF.subfamily[1], sep=": ")
experiment.type <- tbl.sub$Data.source
dataSource <- sprintf("HOCOMOCOv11%s", reliability.score)
organism <- "Hsapiens"
new.row <- list (providerName=matrix.id,
providerId=matrix.id, #"HOCOMOCO v8 and ChiPMunk 3.1"
dataSource=dataSource,
geneSymbol=geneSymbol,
geneId=entrez.gene,
geneIdType="ENTREZ",
proteinId=uniprot.id,
proteinIdType="UNIPROT",
organism="Hsapiens",
sequenceCount=max(colSums(matrix)),
bindingSequence=NA_character_,
bindingDomain=NA,
tfFamily=tf.family,
experimentType=experiment.type,
pubmedID="23175603")
#printf("matrix.id: %s", matrix.id);
# inefficient but acceptable:
tbl.md <- rbind(tbl.md, data.frame (new.row, stringsAsFactors=FALSE))
full.name <- sprintf ('%s-%s-%s', organism, dataSource, matrix.id)
rownames (tbl.md) [nrow (tbl.md)] <- full.name
} # for matrix.id
invisible (tbl.md)
} # createMetaDataTable
#------------------------------------------------------------------------------------------------------------------------
test_createMetaDataTable <- function()
{
printf("--- test_createMetaDataTable")
pwms <- readRawMatrices("./", count=3)
matrices <- extractMatrices(pwms)
tbl.md <- createMetaDataTable("./", matrices, raw.metadata.filename=metadata.file)
checkEquals(dim(tbl.md), c(3, 15))
# make sure the reliability score (A-D) is searchably found in the matrix rowname
checkEquals(length(grep("HOCOMOCOv11[ABCD]", rownames(tbl.md))), 3)
checkEquals(length(grep("HOCOMOCOv11[ABCD]", tbl.md$dataSource)), 3)
pwms <- readRawMatrices("./", count=-1)
matrices <- extractMatrices(pwms)
tbl.md <- createMetaDataTable("./", matrices, raw.metadata.filename=metadata.file)
checkEquals(length(grep("HOCOMOCOv11[ABCD]", rownames(tbl.md))), nrow(tbl.md))
} # test_createMetaDataTable
#------------------------------------------------------------------------------------------------------------------------
# we now have metadata rownames, one per matrix, each of which has all (or at least at lot) of information
# used in querying. For instance, the hocomoco give name for the first matrix is
# "AHR_HUMAN.H11MO.0.B"
# we transform that here to
# "Hsapiens-HOCOMOCOv11B-AHR_HUMAN.H11MO.0.B"
renameMatrices <- function (matrices, tbl.md)
{
stopifnot(length(matrices) == nrow(tbl.md))
names(matrices) <- rownames(tbl.md)
invisible(matrices)
} # renameMatrices
#------------------------------------------------------------------------------------------------------------------------
test_renameMatrices <- function()
{
printf("--- test_renameMatrices")
pwms <- readRawMatrices("./", count=3)
matrices <- extractMatrices(pwms)
checkEquals(names(matrices), c("AHR_HUMAN.H11MO.0.B",
"AIRE_HUMAN.H11MO.0.C",
"ALX1_HUMAN.H11MO.0.B"))
tbl.md <- createMetaDataTable("./", matrices, raw.metadata.filename=metadata.file)
matrices.renamed <- renameMatrices(matrices, tbl.md)
checkEquals(names(matrices.renamed), c("Hsapiens-HOCOMOCOv11B-AHR_HUMAN.H11MO.0.B",
"Hsapiens-HOCOMOCOv11C-AIRE_HUMAN.H11MO.0.C",
"Hsapiens-HOCOMOCOv11B-ALX1_HUMAN.H11MO.0.B"))
} # test_renameMatrices
#------------------------------------------------------------------------------------------------------------------------
normalizeMatrices <- function(matrices)
{
mtx.normalized <- sapply (matrices,
function (mtx) apply (mtx, 2, function (colvector) colvector / sum (colvector)))
invisible (mtx.normalized)
} # normalizeMatrices
#------------------------------------------------------------------------------------------------------------------------
test_normalizeMatrices <- function()
{
printf("--- test_normalizeMatrices")
pwms <- readRawMatrices("./", count=3)
matrices <- extractMatrices(pwms)
matrices.norm <- normalizeMatrices(matrices)
# after normalization, each matrix column should total to 1.0
# so sum(matrix) should be equals to the number of columns
# after normalization, the summed row counts should be in the same order,
# indicating that the structure has been preserved
for(i in 1:3){
checkEquals(sum(matrices.norm[[i]]), ncol(matrices.norm[[i]]))
checkEquals(order(rowSums(matrices[[i]])), order(rowSums(matrices.norm[[i]])))
} # for i
} # test_normalizeMatrices
#------------------------------------------------------------------------------------------------------------------------
parsePwm <- function (text)
{
lines <- strsplit (text, '\t')
stopifnot(length(lines)==5) # title line, one line for each base
title <- sub(">", "", lines [[1]][1], fixed=TRUE)
line.count <- length(lines)
# expect 4 rows, and a number of columns we can discern from
# the incoming text.
secondLineParsed <- unlist(strsplit(lines[[2]], " "))
cols <- length(secondLineParsed)
result <- matrix (nrow=4, ncol=cols,
dimnames=list(c('A','C','G','T'),
as.character(1:cols)))
# loop over the four lines (for each base respectively)
row <- 1
for(i in 2:line.count){
lineParsed <- unlist(strsplit(lines[[i]], "\t"))
#print(lineParsed)
result [row,] <- as.numeric(lineParsed)
row <- row + 1
} # for i
#browser()
#return (list (title=title, consensus.sequence=consensus.sequence, matrix=result))
return (list (title=title, matrix=result))
} # parsePwm
#----------------------------------------------------------------------------------------------------
readMetaDataTable <- function()
{
tbl <- read.table(metadata.file, sep="\t", header=TRUE, as.is=TRUE)
dim(tbl)
# Model AHR_HUMAN.H11MO.0.B
# Transcription.factor AHR
# Model.length 9
# Quality B
# Model.rank 0
# Consensus dKhGCGTGh
# Model.release HOCOMOCOv9
# Data.source Integrative
# Best.auROC..human. <NA>
# Best.auROC..mouse. <NA>
# Peak.sets.in.benchmark..human. <NA>
# Peak.sets.in.benchmark..mouse. <NA>
# Aligned.words 157
# TF.family PAS domain factors{1.2.5}
# TF.subfamily Ahr-like factors{1.2.5.1}
# HGNC 348
# EntrezGene 196
# UniProt.ID AHR_HUMAN
# UniProt.AC P35869
# Mmusculus-jaspar2014-Arnt-MA0004
# providerName "MA0004.1 Arnt"
# providerId "MA0004.1 Arnt"
# dataSource "jaspar2014"
# geneSymbol "Arnt"
# geneId NA
# geneIdType NA
# proteinId "P53762"
# proteinIdType "UNIPROT"
# organism "Mmusculus"
# sequenceCount "20"
# bindingSequence NA
# bindingDomain NA
# tfFamily "Helix-Loop-Helix"
# experimentType "SELEX"
# pubmedID "24194598"
} # readMetaDataTable
#----------------------------------------------------------------------------------------------------
run <- function (dataDir=".")
{
rawMatrixList <- readRawMatrices("./", dataDir)
length(rawMatrixList)
matrices <- extractMatrices (rawMatrixList)
length(matrices)
matrices[1]
tbl.md <- createMetaDataTable(dataDir, matrices, raw.metadata.filename=metadata.file)
matrices <- normalizeMatrices(matrices)
matrices <- renameMatrices(matrices, tbl.md)
serializedFile <- file.path(dataDir, outputFile)
printf("writing %s to %s", outputFile, dataDir)
save(matrices, tbl.md, file=serializedFile)
printf("saved %d matrices to %s", length(matrices), serializedFile)
printf("next step: copy %s to <packageRoot>/MotifDb/inst/extdata, rebuild package", serializedFile)
} # run
#------------------------------------------------------------------------------------------------------------------------
blendCoreAndSecondaryDataSets <- function()
{
load("hocomocov11-full.Rdata")
matrices.secondary <- matrices
tbl.secondary <- tbl.md
length(matrices.secondary); # 768
dim(tbl.secondary) # 768 15
checkTrue(all(names(matrices.secondary) %in% rownames(tbl.secondary)))
load("hocomocov11-core.RData")
matrices.core <- matrices
tbl.core <- tbl.md
checkTrue(all(names(matrices.core) %in% rownames(tbl.core)))
length(matrices.core); # 400
dim(tbl.core) # 400 15
checkTrue(all(names(matrices.core) %in% rownames(tbl.secondary)))
checkTrue(all(names(matrices.core) %in% names(matrices.secondary)))
# above tests establish that tbl.secondary and matrices.secondary are a proper superset of the core set
# all we need is to now mark the tbl.secondary$dataSource to distinguish core & secondary
mapping <- match(rownames(tbl.core), rownames(tbl.secondary))
checkEquals(length(mapping), 400)
unmapped.secondary.only <- setdiff(seq_len(nrow(tbl.secondary)), mapping)
checkEquals(length(unmapped.secondary.only), 368)
# are all core matrices in secondary, with the same names
checkTrue(all(names(matrices.core) == names(matrices.secondary)[mapping]))
checkTrue(all(rownames(tbl.core) == rownames(tbl.secondary)[mapping]))
nrow(subset(tbl.secondary, geneId=="")) # 3
rownames(subset(tbl.secondary, geneId==""))
# [1] "Hsapiens-HOCOMOCOv11D-FOXO6_HUMAN.H11MO.0.D"
# [2] "Hsapiens-HOCOMOCOv11D-KLF14_HUMAN.H11MO.0.D"
# [3] "Hsapiens-HOCOMOCOv11D-ZN713_HUMAN.H11MO.0.D"
# FOXO6 forkhead box O6 [Homo sapiens (human)]
# Gene ID: 100132074, updated on 13-Mar-2020
# KLF14 Kruppel like factor 14 [Homo sapiens (human)]
# Gene ID: 136259, updated on 13-Mar-2020
# ZNF713 zinc finger protein 713 [Homo sapiens (human)],
# Gene ID: 349075, updated on 13-Mar-2020
tbl.secondary["Hsapiens-HOCOMOCOv11D-FOXO6_HUMAN.H11MO.0.D", "geneId"] <- "100132074"
tbl.secondary["Hsapiens-HOCOMOCOv11D-KLF14_HUMAN.H11MO.0.D", "geneId"] <- "136259"
tbl.secondary["Hsapiens-HOCOMOCOv11D-ZN713_HUMAN.H11MO.0.D", "geneId"] <- "349075"
nrow(subset(tbl.secondary, geneId=="")) # was 3, should now be zero
length(mapping)
length(unmapped.secondary.only)
# revise all entries in the dataSource column
# the dataSource starts off: HOCOMOCOv11[ABCD]
# we want, in order to support MotifDb::query
# HOCOMOCOv11-[core|secondary]-[ABCD]
x <- tbl.secondary$dataSource
# pick off just the category: A, B, C, D
class <- substr(x, 12, 12)
tbl.secondary$dataSource[unmapped.secondary.only] <- paste0("HOCOMOCOv11-secondary-", class[unmapped.secondary.only])
tbl.secondary$dataSource[mapping] <- paste0("HOCOMOCOv11-core-", class[mapping])
# now revise all of the tbl.md rownames: they need "core" and "secondary"
# inserted also
x <- rownames(tbl.md)
matrices <- matrices.secondary
tbl.md <- tbl.secondary
save(matrices, tbl.md, file="hocomoco11.RData")
} # blendCoreAndSecondaryDataSets
#------------------------------------------------------------------------------------------------------------------------
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.