# flyFactorSurvey/test.R
#-------------------------------------------------------------------------------
library (RUnit)
#-------------------------------------------------------------------------------
source("import.R")
#-------------------------------------------------------------------------------
run.tests = function (dataDir)
{
dataDir <- file.path(dataDir, "flyFactorSurvey")
freshStart ()
test.fbgnToIDs()
x.list.BD <- test.createXrefBindingDomain (dataDir)
x.xref <- test.createXref ()
x.filenames <- test.getMatrixFilenames (dataDir)
x.tbl.ref <- test.createExperimentRefTable ()
x.pwm <- test.parsePWMfromText (dataDir)
x.mat3 <- test.readAndParse (dataDir)
x.tbl.md <- test.createMetadata (x.mat3, x.tbl.ref, x.xref, x.list.BD)
x.mat3f <- test.normalizeMatrices (x.mat3)
x.mat3fr <- test.renameMatrices (x.mat3f, x.tbl.md [1:3,])
} # run.tests
#-------------------------------------------------------------------------------
freshStart = function ()
{
output.files.easy.to.regenerate = grep ('RData$', dir (), value=TRUE)
if (length (output.files.easy.to.regenerate) > 0)
unlink (output.files.easy.to.regenerate)
} # freshStart
#-------------------------------------------------------------------------------
test.createXrefBindingDomain = function (dataDir)
{
print ('--- test.createXrefBindingDomain')
list.BD = createXrefBindingDomain (dataDir)
checkEquals (length (list.BD), 777)
checkTrue (all (grepl ('FBgn', names (list.BD))))
# most abundant domain is zf-C2H2
checkEquals(names (sort (table (as.character (list.BD)), decreasing=TRUE)[1]),
"zf-C2H2")
invisible(list.BD)
} # test.createXrefBindingDomain
#-------------------------------------------------------------------------------
test.createXref = function ()
{
print ('--- test.createXref')
xref = createXref ()
checkEquals (xref [['FBgn0261819']], 'F0J881')
#checkEquals (xref [['FBgn0259750']], 'E1JHF4') # gone with org.Dm.eg.db_2.9.0
checkEquals (xref [['FBgn0000014']], 'E1JIM9')
invisible (xref)
} # test.createXref
#-------------------------------------------------------------------------------
test.getMatrixFilenames = function (dataDir)
{
print ('--- test.getMatrixFilenames')
filenames = getMatrixFilenames (dataDir)
# as of (03 apr 2013): 614 matrix files, one binding domains extra
# file, TFfile2b.tsv
checkTrue(length (filenames) > 600)
sample.files <- c("Abd-A_FlyReg_FBgn0000014.pfm", "Abd-B_FlyReg_FBgn0000015.pfm",
"AbdA_Cell_FBgn0000014.pfm")
checkTrue(all(sample.files %in% filenames))
invisible (filenames)
} # test.getMatrixFilenames
#-------------------------------------------------------------------------------
test.createExperimentRefTable = function ()
{
print ('--- test.createExperimentRefTable')
tbl.ref = createExperimentRefTable ()
checkEquals (nrow (tbl.ref), 6)
invisible (tbl.ref)
} # test.createExperimentRefTable
#-------------------------------------------------------------------------------
test.parsePWMfromText = function (dataDir)
{
print ('--- test.parsePWMfromText')
path <- file.path(dataDir, 'tup_SOLEXA_10_FBgn0003896.pfm')
checkTrue(file.exists(path))
lines.of.text = scan (path, sep='\n',
what=character(0), comment='#', quiet=TRUE)
pwm.tup = parsePWMfromText (lines.of.text)
checkEquals (dim (pwm.tup), c (4, 9))
checkEquals (colnames (pwm.tup), as.character (1:9))
checkEquals (rownames (pwm.tup), c ('A', 'C', 'G', 'T'))
invisible (pwm.tup)
} # test.parsePWMfromText
#-------------------------------------------------------------------------------
test.readAndParse = function (dataDir)
{
print ('--- test.readAndParse')
all.files = file.path(dataDir,
getMatrixFilenames (dataDir))
sample.files <- c("Abd-A_FlyReg_FBgn0000014.pfm",
"Abd-B_FlyReg_FBgn0000015.pfm",
"AbdA_Cell_FBgn0000014.pfm")
checkTrue(all(sample.files %in% list.files(dataDir)))
sample.files <- file.path(dataDir, sample.files)
checkTrue(all(file.exists(sample.files)))
# by sorting these filenames, we know the order of the three returned
# matrices, and results can easily be checked
mtx.test = readAndParse (sample.files)
checkEquals (length (mtx.test), 3)
# names are alphabetized, and sort differently in linux and macos
# work around that by
expected.names = c("Abd-A_FlyReg_FBgn0000014.pfm",
"Abd-B_FlyReg_FBgn0000015.pfm",
"AbdA_Cell_FBgn0000014.pfm")
actual.names = sort(names(mtx.test))
checkEquals(expected.names, actual.names)
checkEquals (dim (mtx.test [[1]]), c (4, 8))
checkEquals (dim (mtx.test [[2]]), c (4, 8))
checkEquals (dim (mtx.test [[3]]), c (4, 7))
invisible (mtx.test)
} # test.readAndParse
#-------------------------------------------------------------------------------
test.createMetadata = function (matrices, tbl.ref, xref, list.BD)
{
print ('--- test.createMetadata')
tbl.md = createMetadata (matrices, tbl.ref, xref, list.BD)
checkEquals (nrow (tbl.md), length (matrices))
checkEquals (ncol (tbl.md), 15)
expected <- c("providerName", "providerId", "dataSource", "geneSymbol",
"geneId", "geneIdType", "proteinId", "proteinIdType",
"organism", "sequenceCount", "bindingSequence",
"bindingDomain", "tfFamily", "experimentType", "pubmedID")
checkEquals (colnames (tbl.md), expected)
tester = "Dmelanogaster-FlyFactorSurvey-Abd.A_FlyReg_FBgn0000014"
checkTrue (tester %in% rownames (tbl.md))
x = as.list (tbl.md [tester,])
checkEquals (x$providerName, "Abd-A_FlyReg_FBgn0000014")
checkEquals (x$providerId, "FBgn0000014")
checkEquals (x$geneSymbol, "abd-A")
checkEquals (x$geneId, "42037")
checkEquals (x$geneIdType, "ENTREZ")
checkEquals (x$proteinId, "E1JIM9")
checkEquals (x$proteinIdType, "UNIPROT")
checkEquals (x$organism, "Dmelanogaster")
checkEquals (x$sequenceCount, 37)
checkTrue (is.na (x$bindingSequence))
checkEquals (x$bindingDomain, 'Homeobox')
checkTrue (is.na (x$tfFamily))
checkEquals (x$experimentType, "DNASE I footprinting")
checkTrue (is.na (x$pubmedID))
invisible (tbl.md)
} # test.createMetadata
#-------------------------------------------------------------------------------
test.normalizeMatrices = function (matrices)
{
print ('--- test.normalizeMatrices')
colsums = as.integer (sapply (matrices, function (mtx) as.integer (mean (round (colSums (mtx))))))
checkTrue (all (colsums > 1))
matrices.norm = normalizeMatrices (matrices)
colsums = as.integer (sapply (matrices.norm, function (mtx) as.integer (mean (round (colSums (mtx))))))
checkTrue (all (colsums == 1))
invisible (matrices.norm)
} # test.normalizeMatrices
#------------------------------------------------------------------------------------------------------------------------
test.renameMatrices = function (matrices, tbl.md)
{
print ('--- test.renameMatrices')
stopifnot (length (matrices) == 3)
stopifnot (nrow (tbl.md) == 3)
old.names = names (matrices)
mtxr = renameMatrices (matrices, tbl.md)
# make sure the dash in the incoming, eg,
# "Abd-A_FlyReg_FBgn0000014.pfm"
# is converted to a '.', reserving dashes for
# the separation of dataSource-species-nativeGeneName
checkEquals (names (mtxr) [1],
"Dmelanogaster-FlyFactorSurvey-Abd.A_FlyReg_FBgn0000014")
checkEquals (names (mtxr) [2],
"Dmelanogaster-FlyFactorSurvey-Abd.B_FlyReg_FBgn0000015")
checkEquals (names (mtxr) [3],
"Dmelanogaster-FlyFactorSurvey-AbdA_Cell_FBgn0000014")
invisible (mtxr)
} # renameMatrices
#-------------------------------------------------------------------------------
test.fbgnToIDs <- function()
{
print("--- test.fbgnToIDs")
# first, make sure that unmappable FBgn ids are handled properly
bogus <- fbgnToIDs("bogus")
checkEquals(dim(bogus), c(1,4))
checkEquals(rownames(bogus), "bogus")
checkEquals(as.character(as.list(bogus[1,],)), c(rep("bogus", 3), "FLYBASE"))
# disable the translation of NA to rowname
bogus <- fbgnToIDs("bogus", useInputForMissingValues=FALSE)
checkEquals(dim(bogus), c(1,4))
checkEquals(rownames(bogus), "bogus")
checkTrue(all(is.na(bogus[1,1:3])))
fbgns <- c ("FBgn0003267", "FBgn0000611", "FBgn0004394", "FBgn0027364",
"FBgn0000413")
tbl.ids <- fbgnToIDs(fbgns)
checkEquals(tbl.ids$flybase_id, fbgns)
checkEquals(tbl.ids$symbol, c("ro", "exd", "pdm2", "Six4", "da"))
checkEquals(tbl.ids$geneIdType, rep("ENTREZ", 5))
# 0259750 is obsolete, replaced by 0264442
x <- "FBgn0259750"
tbl.ids <- fbgnToIDs(x)
checkEquals(as.character(tbl.ids[1,]), c(rep(x,3), "FLYBASE"))
# http://flybase.org/static_pages/downloads/IDConv.html
# FBgn0003986 FBgn0261930 FBgn0261930 vnd
x <- "FBgn0003986"
tbl.ids <- fbgnToIDs(x)
checkEquals(as.character(tbl.ids[1,]), c(rep(x, 3), "FLYBASE"))
serializedFbgnsForTesting <- system.file(package="MotifDb", "scripts",
"import", "fbgns.RData")
checkTrue(file.exists(serializedFbgnsForTesting))
load(serializedFbgnsForTesting)
checkEquals(length(fbgns), 326)
tbl.ids <- fbgnToIDs(fbgns)
checkEquals(dim(tbl.ids), c(length(fbgns), 4))
# no NA's should survive
checkTrue(!any(is.na(tbl.ids)))
# what percentage of the 326 ids fail to map? do NOT convert NAs
tbl.ids <- fbgnToIDs(fbgns, useInputForMissingValues=FALSE)
failure.count <- length(which(is.na(tbl.ids$flybase_id)))
browser()
checkTrue(failure.count < nrow(tbl.ids)/4) # ad hoc limit: not more than 25% failures,
} # test.fbgnToIDsp
#-------------------------------------------------------------------------------
# robert stojnic reported that MotifDb 1.1.6 (and before) had some incorrect
# gene symbols, as seen here:
# library(MotifDb)
# mdb <- MotifDb
# x <- query(mdb, "ab_SANGER_10_FBgn0259750")
# t(as.matrix(mcols(x)))
# [,1]
# providerName "ab_SANGER_10_FBgn0259750"
# providerId "FBgn0259750"
# dataSource "FlyFactorSurvey"
# geneSymbol "Ab"
# geneId "FBgn0259750"
# geneIdType "FLYBASE"
# proteinId "E1JHF4"
# proteinIdType "UNIPROT"
# organism "Dmelanogaster"
# sequenceCount "20"
# bindingSequence NA
# bindingDomain "zf-C2H2"
# tfFamily NA
# experimentType "bacterial 1-hybrid, SANGER sequencing"
# pubmedID NA
#
# FBgn0259750 is obsolete. flybase provides a converter:
# http://flybase.org/static_pages/downloads/IDConv.html shows this, and provide the current FBgn:
#
# Submitted ID Current ID Converted ID Related record
# FBgn0259750 FBgn0264442 FBgn0264442 ab
#
# make sure we get this right
# the matrix files are:
#
# dataDir/ab_SANGER_10_FBgn0259750.pfm
# dataDir/ab_SOLEXA_5_FBgn0259750.pfm
#
test.geneNaming <- function(dataDir)
{
dataDir <- "/Users/pshannon/s/data/public/TFBS/flyFactorSurvey/unpacked"
filenames = getMatrixFilenames (dataDir)
removers <- grep (bindingDomainXrefSourceFile(), filenames)
if(length(removers) > 0)
filenames <- filenames[-removers]
keepers <- grep("FBgn0259750", filenames)
checkTrue(length(keepers) == 2)
filenames <- filenames[keepers]
full.filenames = file.path(dataDir, filenames)
list.BD = createXrefBindingDomain (dataDir)
xref = createXref () # maps flybase ids to uniprot
tbl.ref = createExperimentRefTable ()
matrices = readAndParse (full.filenames)
tbl.md = createMetadata (matrices, tbl.ref, xref, list.BD)
# now normalize, not in readAndParse, because we need the counts to go into tbl.md$sequenceCount
checkEquals(tbl.md$geneSymbol, c("ab", "ab"))
matrices = normalizeMatrices (matrices)
matrices = renameMatrices (matrices, tbl.md)
} # test.geneNaming
#-------------------------------------------------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.