R/sqlForge_baseMapBuilder.R

Defines functions makeUniversalMapping getMapForArabidopsisChipPkg getMapForYeastChipPkg getMapForOtherChipPkg getMapForBiocChipPkg probe2gene labelDuplicatedProbes makeBaseMaps scrubAffyFileComments printOutBaseMaps cleanRefSeqs cleanSrcMap

## Just a utility to prevent empty IDs from ever causing any more mayhem
cleanSrcMap <- function(file) {
    fileVals <- read.delim(file=file, header=FALSE, sep="\t", quote="", 
                           colClasses = "character")
    fileVals <- fileVals[fileVals[,1]!='',]
    invalid <- which(is.na(fileVals[,1]))
    if (any(invalid))
        stop(paste0("invalid probe value in 'fileName' at row(s): ",
                    paste(invalid, collapse=',')))

    ##For the case where someone puts no value in, we need things to be an NA
    fileVals[!is.na(fileVals[,2]) & fileVals[,2]=='',2] <- NA
    
    ##Expand IDs to match all the ones available
    probe <- fileVals[,1]
    id <- fileVals[,2]
    id <- gsub(" ","", id)
    id <- strsplit(id, split=";", fixed=TRUE)
    ##Otherwise, the following may remove probes that map to nothing
    id_count <- sapply(id, length)  
    id_probe <- rep(probe, id_count)
    id <- unlist(id)

    insVals <- cbind(id_probe, id)
    insVals <- as.data.frame(insVals)
    insVals
}

cleanRefSeqs <- function(baseMap){
    baseMap = as.matrix(baseMap)
    baseMap[,2] = sub("\\.\\d+?$", "", baseMap[,2], perl=TRUE)
    baseMap = cbind(baseMap[,1],baseMap[,2])
    baseMap = as.data.frame(baseMap)
    baseMap
}

printOutBaseMaps <- function(baseMap, pkgName, otherSrcObjs){
    write.table(baseMap, paste(pkgName,"_baseMap.txt", sep=""), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
    for(i in 1:length(otherSrcObjs)){
        write.table(otherSrcObjs[i], paste(pkgName,"_otherSrcMap_",i,".txt", sep=""), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
    }
}


## scrubs Affy file of comments and returns a data.frame that is free of them
scrubAffyFileComments <- function(file){
  ## 1st use readLines to pull in file and clean out the cruft.
  con <- file(file)
  rows <- readLines(con)
  filtInd <- grep("^#", rows)
  if(length(filtInd)>0){## check if there were actually any comments
    rows <- rows[-filtInd] ##if so, remove those lines.
  }
  tmpFl <- tempfile()
  write(rows, file=tmpFl)
  res <- read.csv(tmpFl, as.is=TRUE, na.strings="---", colClasses="character")
  unlink(tmpFl)
  close(con)
  res
}

makeBaseMaps <- function(csvFileName,
                          GenBankIDName="Representative.Public.ID",
                          EntrezGeneIDName="Entrez.Gene",
                          targetDir="."
                          ) {
    ##csvFile <- read.csv(csvFileName, as.is=TRUE, na.strings="---", colClasses="character")
    csvFile <- scrubAffyFileComments(csvFileName)
      
    probe <- csvFile[,1]
    gb <- csvFile[, GenBankIDName]
    eg <- csvFile[, EntrezGeneIDName]
    rm(csvFile)
    gb <- lapply(unlist(gb), function(x) toupper(strsplit(x,"\\.")[[1]][1]))
    gb <- unlist(gb)
    eg <- strsplit(eg, split=" /// ", fixed=TRUE)
    eg_count <- sapply(eg, length)
    eg_probe <- rep(probe, eg_count)
    eg <- unlist(eg)
    
    baseFiles = list()
    
    baseFiles[[1]] = cbind(probe, gb)
    baseFiles[[2]] = cbind(eg_probe, eg)
    baseFiles
}


##For labelDuplicatedProbes() The 1st col HAS to be probes, and the 2nd col
##MUST be the central ID for the platform!
labelDuplicatedProbes <- function(frame){
    ##I need to test 1st if the probe matches more than one thing in the 1st
    ##col (numProbes) Then I need to test if among those matches there is more
    ##than one entrez gene ID.  If more than one, then I need to put a 1
    ##There is a corner case however, if I have 1 unmapped and one
   tmp <- tapply(1:nrow(frame), frame[,1], function(x) frame[x,2])
   tmp2 <- sapply(tmp, length)
   tmp <- sapply(tmp, function(x) length(unique(x[!is.na(x)])))
   out <- tmp > 1 & tmp2 > 1
   out <- out[frame[,1]]
   is_multiple <- as.numeric(out)
   out <- cbind(frame, is_multiple)
   out
}


probe2gene <- function(baseMap, otherSrc, baseMapType=c("gb", "eg", 
                       "refseq", "gbNRef", "image"), chipMapSrc, chipSrc, 
                       pkgName, outputDir=".", allowHomologyGene=FALSE) {
        ## message(cat(paste("Using '",chipSrc,"' for chipSrc.", sep="")))
	baseMapType <- match.arg(baseMapType)
	drv <- dbDriver("SQLite")
	outputFile <- file.path(outputDir, paste(pkgName, "sqlite", sep="."))	
	db <- dbConnect(drv, outputFile)
	dbExecute(db, 
		"CREATE TABLE metadata (name VARCHAR(80) PRIMARY KEY, value VARCHAR(255) );")
	metadata_sql <- paste("INSERT INTO metadata VALUES ('PKGNAME', '",
				pkgName, "');", sep="", collapse="")
	dbExecute(db, metadata_sql)
	dbExecute(db, "CREATE TABLE curr_map (probe_id TEXT, gene_id TEXT);")  #curr_map is the baseMap file
	dbExecute(db, "CREATE TABLE probes_ori (probe_id TEXT);")             #all the probes from curr_map
	dbExecute(db, "CREATE TABLE other (probe_id TEXT, gene_id TEXT);")     #This holds the otherSrc data
	dbExecute(db, "CREATE TABLE probe2gene (probe_id TEXT, gene_id TEXT);") 
	dbExecute(db, "CREATE TABLE probe2acc (probe_id TEXT, gene_id TEXT);") 
	dbExecute(db, "CREATE TABLE temp_probe_map (probe_id TEXT, gene_id TEXT, accession TEXT);")
	dbExecute(db, "CREATE TABLE probe_map (probe_id TEXT, gene_id TEXT, accession TEXT, is_multiple SMALLINT NOT NULL);")

        #If there might be refseq IDs, then they might have unwanted .<digit> extensions which must be removed.
        #This (unfortunately cannot be done for the otherSrc files since I won't know what I am getting in that case).
        if(baseMapType=='refseq' || baseMapType=='gbNRef'){  #need to verify that this function is not destructive to genbank IDs
            baseMap=cleanRefSeqs(baseMap)
        }
        
        #populate the contents of baseMap into curr_map
        clnVals <-baseMap
        sqlIns <- "INSERT INTO curr_map (probe_id,gene_id) VALUES (?,?);"
        dbBegin(db)
        rset <- dbSendQuery(db, sqlIns, unclass(unname(clnVals)))
        dbClearResult(rset)
        dbCommit(db)

        dbExecute(db, "INSERT INTO probes_ori SELECT DISTINCT probe_id FROM curr_map;")
	dbExecute(db,
		"DELETE FROM curr_map WHERE gene_id='NA' OR gene_id='';")
	attach_sql <- paste("ATTACH DATABASE '",chipMapSrc,"' AS src;", sep="", collapse="")
	dbExecute(db, attach_sql)
	if(baseMapType=='eg') {
                message(cat("baseMapType is eg"))
		dbExecute(db, "INSERT INTO probe2acc SELECT probe_id, NULL FROM probes_ori;")
		sql <- paste("INSERT INTO probe2gene",
                             "SELECT probe_id, gene_id",  ##To make things map to multiples, I will have to remove the DISTINCT and GROUP BY clauses when I insert into probe2gene
                             "FROM curr_map;", sep=" ", collapse="")
		dbExecute(db, sql)
	} else if (baseMapType=='image') {
                message(cat("baseMapType is image"))
		dbExecute(db, "INSERT INTO probe2acc SELECT probe_id, NULL FROM probes_ori;")
		sql <- paste("INSERT INTO probe2gene",
			    "SELECT c.probe_id, i.gene_id",
			    "FROM curr_map as c, src.image_acc_from_uni as i",
			    "WHERE c.gene_id=i.accession;", sep=" ", collapse="")
		dbExecute(db, sql)
	} else if (baseMapType=='refseq') {
                message(cat("baseMapType is refseq"))
		dbExecute(db, "CREATE INDEX cm1 ON curr_map(probe_id);")
                dbExecute(db, "CREATE INDEX cm2 ON curr_map(gene_id);")
                ##just keep ALL of the accessions that went in from the base map (whether or NOT they successfully map over)
		dbExecute(db, "INSERT INTO probe2acc SELECT DISTINCT probe_id, gene_id FROM curr_map;")
		sql <- paste("INSERT INTO probe2gene", 
			    "SELECT c.probe_id, a.gene_id",
			    "FROM curr_map as c, src.refseq as a",
			    "WHERE c.gene_id=a.accession;", sep=" ", collapse="") 
		dbExecute(db, sql)
	} else 	{ ### type='gb' or 'gbNRef'
                message(cat("baseMapType is gb or gbNRef"))
		dbExecute(db, "CREATE INDEX cm1 ON curr_map(probe_id);")
		dbExecute(db, "CREATE INDEX cm2 ON curr_map(gene_id);")
                ##just keep ALL of the accessions that went in from the base map (whether or NOT they successfully map over)
		dbExecute(db, "INSERT INTO probe2acc SELECT DISTINCT probe_id, gene_id FROM curr_map;")
		sql <- paste("INSERT INTO probe2gene", 
			    "SELECT c.probe_id, a.gene_id",
			    "FROM curr_map as c, src.accession as a",
			    "WHERE c.gene_id=a.accession;", sep=" ", collapse="") 
		dbExecute(db, sql)
		sql <- paste("DELETE FROM curr_map WHERE probe_id IN",
			"(SELECT probe_id FROM probe2gene);", sep=" ", collapse="")
 		dbExecute(db, sql)
		sql <- paste("INSERT INTO probe2gene ", 
			    "SELECT c.probe_id, a.gene_id ",
			    "FROM curr_map as c, src.accession as a",
                            "WHERE c.gene_id=a.accession;", sep=" ", collapse="")
		dbExecute(db, sql)
	}
        
        ##Code for dealing with other sources of IDs (try to find other EG matches and if so, stick them in!)        
	dbExecute(db, "DROP TABLE curr_map;")
	lapply(otherSrc, function(thisOtherName) {
            clnVals <- thisOtherName
            sqlIns <- "INSERT INTO other (probe_id,gene_id) VALUES(?,?);"
            dbBegin(db)
            rset <- dbSendQuery(db, sqlIns, unclass(unname(clnVals)))
            dbClearResult(rset)
            dbCommit(db)
	})
	dbExecute(db, "DELETE FROM other WHERE gene_id IN ('NA', '');")

        ##I need to know if there is refseq, unigene or GenBank data for the organism in question.
        tables = dbGetQuery(db, "SELECT name FROM src.sqlite_master;")       
        
        ##The following will insert any missing refseq IDs into the database AS entrez gene IDs, (by joining with the other table)
        if(length(grep("refseq",tables))>0){
            sql <- "INSERT INTO probe2gene SELECT DISTINCT m.probe_id, r.gene_id FROM other as m INNER JOIN src.refseq as r WHERE m.gene_id=r.accession;"
            dbExecute(db, sql)
        }        
        ##The following will insert any missing GenBank IDs into the database AS entrez gene IDs, (by joining with the other table)
        if(length(grep("accession",tables))>0){
            sql <- "INSERT INTO probe2gene SELECT DISTINCT m.probe_id, gb.gene_id FROM other as m INNER JOIN src.accession as gb WHERE m.gene_id=gb.accession;"
            dbExecute(db, sql)
        }
        
        ##The following will insert any missing Entrez Gene IDs into the database AS themselves, 
        if(length(grep("EGList",tables))>0){
            sql <- "INSERT INTO probe2gene SELECT DISTINCT probe_id, gene_id FROM other WHERE gene_id IN (SELECT gene_id FROM src.EGList);"
            dbExecute(db, sql)
        }

        dbExecute(db, "INSERT INTO probe2gene SELECT probe_id, NULL FROM probes_ori WHERE probe_id NOT IN (SELECT probe_id FROM probe2gene);")
	dbExecute(db, "CREATE INDEX p1 ON probe2gene(probe_id);")
	dbExecute(db, "INSERT INTO temp_probe_map SELECT DISTINCT p.probe_id, p.gene_id, a.gene_id FROM probe2acc as a LEFT OUTER JOIN probe2gene as p ON a.probe_id=p.probe_id;")
        dbExecute(db, "DROP TABLE other;")
        dbExecute(db, "DROP TABLE probe2gene;")
        dbExecute(db, "DROP TABLE probe2acc;")
        dbExecute(db, "DROP TABLE probes_ori;")
        
        ##Now I need to attach the chipsrc and use verify that all the gene_IDs are in fact contained in the genes table of the corresponding chipsrc DB...
        dbExecute(db, paste("ATTACH '", chipSrc,"' AS chipSrc;", sep="", collapse=""))
        dbExecute(db, paste("UPDATE temp_probe_map SET gene_id = NULL WHERE gene_id NOT IN (SELECT gene_id FROM chipSrc.genes);",sep=""))

        
        ##Now I need to decide which probes have multiple mappings and flag those.
        probeData <- dbGetQuery(db, "SELECT * FROM temp_probe_map;")
        modifiedData <- labelDuplicatedProbes(probeData)
        sqlIns <- "INSERT INTO probe_map (probe_id,gene_id,accession,is_multiple) VALUES (?,?,?,?);"
        dbBegin(db)
        rset <- dbSendQuery(db, sqlIns, unclass(unname(modifiedData)))
        dbClearResult(rset)
        dbCommit(db)

##         ##Drop that temp table
        dbExecute(db, "DROP TABLE temp_probe_map;")

        dbDisconnect(db)
	pkgName
}



getMapForBiocChipPkg <- function(csvFileName, pkgName, chipMapSrc, chipSrc, 
				otherSrc=character(0),
				baseMapType="gbNRef", 
				outputDir=".",
				allowHomologyGene=FALSE) {
    baseMaps = makeBaseMaps(csvFileName=csvFileName, targetDir=outputDir)
    
    #pre-clean the otherSrc's and pass them along as lists of lists:  #TODO: this fails if there are not any otherSrc's
    otherSrcObjs = list()
    if(length(otherSrc)==0){
        otherSrc = otherSrcObjs
    }else{
        for(i in 1:length(otherSrc)){
            otherSrcObjs[[i]] = cleanSrcMap(otherSrc[[i]])
        }
    }
    #The 1st item in baseMaps is always the genbank ID
    if (baseMapType == "eg"){
        baseMap <- as.data.frame(baseMaps[[2]])
    } else {
        baseMap <- as.data.frame(baseMaps[[1]])
        otherSrcObjs[[length(otherSrcObjs) + 1]] <- as.data.frame(baseMaps[[2]])
    }
    
    ##just for debugging (disable the rest of the time)
    if(FALSE){
        printOutBaseMaps(baseMap, pkgName, otherSrcObjs)
    }
    
    probe2gene(baseMap=baseMap, 
               baseMapType=baseMapType, 
               otherSrc=otherSrcObjs,
               chipMapSrc=chipMapSrc,
               chipSrc=chipSrc,
               pkgName=pkgName,
               outputDir=outputDir,
               allowHomologyGene=allowHomologyGene)
}

getMapForOtherChipPkg <- function(filePath,
                                  pkgName,
                                  chipMapSrc,
                                  chipSrc,
                                  otherSrc=character(0),
                                  baseMapType="gbNRef",
                                  outputDir=".",
                                  allowHomologyGene=FALSE) {
    #pre-clean the otherSrc's and pass them along as lists of lists:  TODO: fix no otherSrc bug here too
    otherSrcObjs = list()
    if(length(otherSrc)==0){
        otherSrc = otherSrcObjs
    }else{
        for(i in 1:length(otherSrc)){
            otherSrcObjs[[i]] = cleanSrcMap(otherSrc[[i]])
        }
    }
    baseMap = cleanSrcMap(filePath)

    ##just for debugging (disable the rest of the time)
    if(FALSE){
        printOutBaseMaps(baseMap, pkgName, otherSrcObjs)
    }

    probe2gene(baseMap=baseMap,
               baseMapType=baseMapType,
               otherSrc=otherSrcObjs,
               chipMapSrc=chipMapSrc,
               chipSrc=chipSrc,
               pkgName=pkgName,
               outputDir=outputDir,
               allowHomologyGene=allowHomologyGene)
}


getMapForYeastChipPkg <- function(affy, fileName, pkgName, outputDir=".") {

    baseMaps = data.frame()
    if(affy==TRUE){
        baseMaps = makeBaseMaps (csvFileName=fileName,
                        targetDir=outputDir) ##,
                        ##outputPrefix=pkgName)
    }
	drv <- dbDriver("SQLite")
	outputFile <- file.path(outputDir, paste(pkgName, "sqlite", sep="."))	
	db <- dbConnect(drv, outputFile)
	dbExecute(db,
                "CREATE TABLE metadata (name VARCHAR(80) PRIMARY KEY, value VARCHAR(255) );")
        metadata_sql <- paste("INSERT INTO metadata VALUES ('PKGNAME', '",
                                pkgName, "');", sep="", collapse="")
	dbExecute(db, metadata_sql)
	dbExecute(db, 
		"CREATE TABLE probe_map (probe_id TEXT NOT NULL, systematic_name TEXT, is_multiple SMALLINT NOT NULL);");
    if(affy==TRUE){
        probeData <- as.data.frame(baseMaps[[1]])
        clnVals <- labelDuplicatedProbes(probeData)
        sqlIns <- "INSERT INTO probe_map VALUES(?,?,?);"
        dbBegin(db)
        rset <- dbSendQuery(db, sqlIns, unclass(unname(clnVals)))
        dbClearResult(rset)
        dbCommit(db)        
    }else
    {
        probeData <- cleanSrcMap(fileName)
        clnVals <- labelDuplicatedProbes(probeData)
        sqlIns <- "INSERT INTO probe_map VALUES(?,?,?);"
        dbBegin(db)
        rset <- dbSendQuery(db, sqlIns, unclass(unname(clnVals)))
        dbClearResult(rset)
        dbCommit(db)
    }
	dbExecute(db,
		"UPDATE probe_map SET systematic_name=NULL WHERE systematic_name='NA';")
	dbDisconnect(db)
    
	pkgName
}


#Need to add fileName and affy param here
getMapForArabidopsisChipPkg <- function(affy, fileName, pkgName, chipMapSrc, outputDir=".") {

    #Note: this function and the associated chipmapsrc database for Arabidopsis both assume that affy will only
    #ever make two arrays for arabidposis.
    #if this changes, then the database and this function will need to be updated (this is unlikely however)
    #for non-affy functions, the map will be read in "as is" and used to make a package.
	drv <- dbDriver("SQLite")
	outputFile <- file.path(outputDir, paste(pkgName, "sqlite", sep="."))	
	db <- dbConnect(drv, outputFile)
	dbExecute(db,
                "CREATE TABLE metadata (name VARCHAR(80) PRIMARY KEY, value VARCHAR(255) );")
        metadata_sql <- paste("INSERT INTO metadata VALUES ('PKGNAME', '",
                                pkgName, "');", sep="", collapse="")
	dbExecute(db, metadata_sql)
	dbExecute(db, 
		"CREATE TABLE probe_map (probe_id TEXT NOT NULL, gene_id TEXT);");
        dbExecute(db, paste("ATTACH DATABASE '",chipMapSrc,"' AS src;",sep=""))

    if(affy==TRUE){#Warning: This will only work if the arabidopsis package is one of "the" TWO affy packages.

	if (pkgName == "ag"){url_name="TAIRAGURL"}
	else if (pkgName == "ath1121501"){url_name="TAIRATHURL"}

        insert_sql <- paste("INSERT INTO metadata SELECT 'TAIRCHIPMAPURL', value FROM src.metadata WHERE name='", url_name, "';", sep="")
	dbExecute(db, insert_sql);
                
      	insert_sql <- paste("INSERT INTO probe_map SELECT * FROM src.", pkgName, ";", sep="")
       	dbExecute(db, insert_sql);
    }else
    {        
        clnVals <- cleanSrcMap(fileName)
        sqlIns <- "INSERT INTO probe_map VALUES(?,?);"
        dbBegin(db)
        rset <- dbSendQuery(db, sqlIns, unclass(unname(clnVals)))
        dbClearResult(rset)
        dbCommit(db)
        
    }
	dbExecute(db, "DETACH src;");			
	dbDisconnect(db)
	pkgName
}


makeUniversalMapping <- function(pkgName,
                                chipSrc,
                                baseMapType="eg",
                                outputDir=".") {

	## The rest of this will just make a map of ALL the EGs
        drv <- dbDriver("SQLite")
        outputFile <- file.path(outputDir, paste(pkgName, "sqlite", sep="."))
        db <- dbConnect(drv, outputFile)
        dbExecute(db,
                "CREATE TABLE metadata (name VARCHAR(80) PRIMARY KEY, value VARCHAR(255) );")
        metadata_sql <- paste("INSERT INTO metadata VALUES ('PKGNAME', '",
                                pkgName, "');", sep="", collapse="")
        dbExecute(db, metadata_sql)

	#no source file so we have to grab this from the chipmapsrc
        create_sql <- "CREATE TABLE probe_map (gene_id TEXT NOT NULL);"
        dbExecute(db, create_sql)

        attach_sql <- paste("ATTACH DATABASE '",chipSrc,"' AS src;", sep="", collapse="")
        dbExecute(db, attach_sql)

        #just grab ALL Entrez genes (without any constraint other than that they be UNIQUE) from chipsrc_XXX
        insert_sql <- "INSERT INTO probe_map SELECT DISTINCT gene_id from src.genes ;"
        dbExecute(db, insert_sql)
        
        dbDisconnect(db)
        
        pkgName

}
Bioconductor/AnnotationForge documentation built on Nov. 2, 2024, 6:30 a.m.