Nothing
# TODO: Add comment
#
# Author: lgoff
###############################################################################
#####################
#File Archetype parsing
#####################
#RunInfo
loadRunInfo<-function(runInfoFile,
dbConn,
path,
fileArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
sep="\t",
na.string = "-",
header = TRUE,
quote = "",
stringsAsFactors=FALSE,
row.names=NULL,
...) {
#Setup and reporting
write(paste("Reading Run Info File ",runInfoFile,sep=""),stderr())
fileArgs$file = runInfoFile
#Read Run Info file
runInfo = as.data.frame(do.call(read.table,fileArgs))
#Parsing
#not needed...
#Load into database (runInfo table)
write("Writing runInfo Table",stderr())
insert_SQL<-'INSERT INTO runInfo VALUES(:param, :value)'
bulk_insert(dbConn,insert_SQL,runInfo)
}
#ReplicateTable
loadRepTable<-function(repTableFile,
dbConn,
path,
fileArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
sep="\t",
na.string = "-",
header = TRUE,
quote = "",
stringsAsFactors=FALSE,
row.names=NULL,
...) {
#Setup and reporting
write(paste("Reading Read Group Info ",repTableFile,sep=""),stderr())
fileArgs$file = repTableFile
#Read Run Info file
full = as.data.frame(read.delim(repTableFile))
#print(head(full))
#Fix sample_names
full$condition<-make.db.names(dbConn,as.character(full$condition),unique=FALSE)
#Parsing
#For now, I need to concatenate condition and replicate number
full$rep_name<-paste(full$condition,full$replicate_num,sep="_")
#Load into database (replicates table)
write("Writing replicates Table",stderr())
insert_SQL<-'INSERT INTO replicates VALUES(:file, :condition, :replicate_num, :rep_name, :total_mass, :norm_mass, :internal_scale, :external_scale)'
bulk_insert(dbConn,insert_SQL,full)
}
#ReplicateTable
loadVarModelTable<-function(VarModelFile,
dbConn,
path,
fileArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
sep="\t",
na.string = "-",
header = TRUE,
quote = "",
stringsAsFactors=FALSE,
row.names=NULL,
...) {
#Setup and reporting
write(paste("Reading Var Model Info ",VarModelFile,sep=""),stderr())
fileArgs$file = VarModelFile
#Read Run Info file
full = as.data.frame(read.delim(VarModelFile))
#print(head(full))
#Fix sample_names
full$condition<-make.db.names(dbConn,as.character(full$condition),unique=FALSE)
#Load into database (replicates table)
write("Writing varModel Table",stderr())
insert_SQL<-'INSERT INTO varModel VALUES(:condition, :locus, :compatible_count_mean, :compatible_count_var, :total_count_mean, :total_count_var, :fitted_var)'
bulk_insert(dbConn,insert_SQL,full)
}
#Genes
loadGenes<-function(fpkmFile,
diffFile,
promoterFile,
countFile,
replicateFile,
dbConn,
path,
#Arguments to read.* methods
fpkmArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
diffArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
promoterArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
countArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
replicateArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
sep="\t",
na.string = "-",
header = TRUE,
quote = "",
stringsAsFactors = FALSE,
row.names=NULL,
...) {
#Error Trapping
if (missing(fpkmFile))
stop("fpkmFile cannot be missing!")
if (missing(dbConn))
stop("Must provide a dbConn connection")
#TODO test dbConn connection and database structure
idCols = c(1:9)
#Read primary file
write(paste("Reading ",fpkmFile,sep=""),stderr())
fpkmArgs$file = fpkmFile
full = as.data.frame(do.call(read.table,fpkmArgs))
########
#Handle Sample Names
########
#Check that samples table is populated
write("Checking samples table...",stderr())
samples<-getSamplesFromColnames(full)
samples<-make.db.names(dbConn,samples,unique=FALSE)
dbSamples<-dbReadTable(dbConn,"samples")
if (dim(dbSamples)[1]>0) {
if (all(samples %in% dbSamples$sample_name)){
write ("OK!",stderr())
}else{
stop("Sample mismatch!")
}
}else{
write("Populating samples table...",stderr())
populateSampleTable(samples,dbConn)
}
######
#Populate genes table
######
genesTable<-full[,c(1:3,5,7:9)]
write("Writing genes table",stderr())
#dbWriteTable(dbConn,'genes',genesTable,row.names=F,append=T)
insert_SQL<-'INSERT INTO genes VALUES(:tracking_id, :class_code, :nearest_ref_id, :gene_short_name, :locus, :length, :coverage)'
bulk_insert(dbConn,insert_SQL,genesTable)
######
#Populate geneData table
######
write("Reshaping geneData table",stderr())
genemelt<-melt(full,id.vars=c("tracking_id"),measure.vars=-idCols,variable_name="sample_name")
colnames(genemelt)[colnames(genemelt)=='variable']<-'sample_name'
#Clean up and normalize data
genemelt$measurement = ""
genemelt$measurement[grepl("_FPKM$",genemelt$sample_name)] = "fpkm"
genemelt$measurement[grepl("_conf_lo$",genemelt$sample_name)] = "conf_lo"
genemelt$measurement[grepl("_conf_hi$",genemelt$sample_name)] = "conf_hi"
genemelt$measurement[grepl("_status$",genemelt$sample_name)] = "status"
genemelt$sample_name<-gsub("_FPKM$","",genemelt$sample_name)
genemelt$sample_name<-gsub("_conf_lo$","",genemelt$sample_name)
genemelt$sample_name<-gsub("_conf_hi$","",genemelt$sample_name)
genemelt$sample_name<-gsub("_status$","",genemelt$sample_name)
#Adjust sample names with make.db.names
genemelt$sample_name <- make.db.names(dbConn,as.vector(genemelt$sample_name),unique=FALSE)
#Recast
write("Recasting",stderr())
genemelt<-as.data.frame(dcast(genemelt,...~measurement))
#debugging
#write(colnames(genemelt),stderr())
#Write geneData table
write("Writing geneData table",stderr())
#dbWriteTable(dbConn,'geneData',as.data.frame(genemelt[,c(1:2,5,3,4,6)]),row.names=F,append=T)
insert_SQL<-'INSERT INTO geneData VALUES(:tracking_id,:sample_name,:fpkm,:conf_hi,:conf_lo,:status)'
#write(insert_SQL,stderr())
bulk_insert(dbConn,insert_SQL,genemelt[,c(1:2,5,3,4,6)])
#######
#Handle gene_exp.diff
#######
if(file.exists(diffFile)){
#Read diff file
write(paste("Reading ",diffFile,sep=""),stderr())
diffArgs$file = diffFile
#Something like this to make sure sample names are treated as character values and not numeric, logical, etc.
#diffArgs$colClasses<-c(rep('character',7),rep('numeric',6),'character')
diff<-as.data.frame(do.call(read.table,diffArgs))
if(dim(diff)[1]>0){
#Adjust sample names with make.db.names
diff$sample_1<-make.db.names(dbConn,as.vector(diff$sample_1),unique=FALSE)
diff$sample_2<-make.db.names(dbConn,as.vector(diff$sample_2),unique=FALSE)
write("Writing geneExpDiffData table",stderr())
diffCols<-c(1,5:14)
colnames(diff)[colnames(diff)=="log2.fold_change."]<-"log2_fold_change" # Not sure why this is the only place this is screwy...
#debugging
#write(colnames(diff[,diffCols]),stderr())
#dbWriteTable(dbConn,'geneExpDiffData',diff[,diffCols],row.names=F,append=T)
insert_SQL<-"INSERT INTO geneExpDiffData VALUES(:test_id,:sample_1,:sample_2,:status,:value_1,:value_2,:log2_fold_change,:test_stat,:p_value,:q_value,:significant)"
bulk_insert(dbConn,insert_SQL,diff[,diffCols])
}else{
write(paste("No records found in", diffFile),stderr())
}
}
########
#TODO: Handle promoters.diff
########
if(file.exists(promoterFile)){
#Read promoterFile
write(paste("Reading ",promoterFile,sep=""),stderr())
promoterArgs$file = promoterFile
promoter<-as.data.frame(do.call(read.table,promoterArgs))
write("Writing promoterDiffData table",stderr())
promoterCols<-c(2,5:14)
if(dim(promoter)[1]>0){
#dbWriteTable(dbConn,'promoterDiffData',promoter[,promoterCols],row.names=F,append=T)
insert_SQL<-"INSERT INTO promoterDiffData VALUES(?,?,?,?,?,?,?,?,?,?,?)"
bulk_insert(dbConn,insert_SQL,promoter[,promoterCols])
}else{
write(paste("No records found in", promoterFile),stderr())
}
}
#########
#Handle Feature Data (this will actually be done on CuffData objects instead...but I may include something here as well)
#########
###########
#Handle Counts .count_tracking
###########
if(file.exists(countFile)){
idCols = c(1)
#Read countFile
write(paste("Reading ", countFile,sep=""),stderr())
countArgs$file = countFile
counts<-as.data.frame(do.call(read.table,countArgs))
if(dim(counts)[1]>0){
#Reshape geneCount table
write("Reshaping geneCount table",stderr())
countmelt<-melt(counts,id.vars=c("tracking_id"),measure.vars=-idCols)
colnames(countmelt)[colnames(countmelt)=='variable']<-'sample_name'
countmelt$measurement = ""
countmelt$measurement[grepl("_count$",countmelt$sample_name)] = "count"
countmelt$measurement[grepl("_count_variance$",countmelt$sample_name)] = "variance"
countmelt$measurement[grepl("_count_uncertainty_var$",countmelt$sample_name)] = "uncertainty"
countmelt$measurement[grepl("_count_dispersion_var$",countmelt$sample_name)] = "dispersion"
countmelt$measurement[grepl("_status$",countmelt$sample_name)] = "status"
countmelt$sample_name<-gsub("_count$","",countmelt$sample_name)
countmelt$sample_name<-gsub("_count_variance$","",countmelt$sample_name)
countmelt$sample_name<-gsub("_count_uncertainty_var$","",countmelt$sample_name)
countmelt$sample_name<-gsub("_count_dispersion_var$","",countmelt$sample_name)
countmelt$sample_name<-gsub("_status$","",countmelt$sample_name)
#Adjust sample names with make.db.names
countmelt$sample_name <- make.db.names(dbConn,as.vector(countmelt$sample_name),unique=FALSE)
#Recast
write("Recasting",stderr())
countmelt<-as.data.frame(dcast(countmelt,...~measurement))
#debugging
#write(colnames(countmelt),stderr())
#Write geneCount table
write("Writing geneCount table",stderr())
insert_SQL<-'INSERT INTO geneCount VALUES(:tracking_id,:sample_name,:count,:variance,:uncertainty,:dispersion,:status)'
bulk_insert(dbConn,insert_SQL,countmelt)
}else{
write(paste("No records found in", countFile),stderr())
}
}
###########
#Handle Replicates .rep_tracking
###########
if(file.exists(replicateFile)){
idCols = 1
#Read countFile
write(paste("Reading read group info in ", replicateFile,sep=""),stderr())
replicateArgs$file = replicateFile
reps<-as.data.frame(do.call(read.table,replicateArgs))
#print(head(reps))
if(dim(reps)[1]>0){
#Adjust sample names with make.db.names
reps$condition <- make.db.names(dbConn,as.character(reps$condition),unique=FALSE)
#Create unique rep name
reps$rep_name<-paste(reps$condition,reps$replicate,sep="_")
colnames(reps)[colnames(reps)=="condition"]<-"sample_name"
#Write geneReplicateData table
write("Writing geneReplicateData table",stderr())
insert_SQL<-'INSERT INTO geneReplicateData VALUES(:tracking_id,:sample_name,:replicate,:rep_name,:raw_frags,:internal_scaled_frags,:external_scaled_frags,:FPKM,:effective_length,:status)'
bulk_insert(dbConn,insert_SQL,reps)
}else{
write(paste("No records found in", replicateFile),stderr())
}
}
}
#Isoforms
loadIsoforms<-function(fpkmFile,
diffFile,
countFile,
replicateFile,
dbConn,
path,
#Arguments to read.* methods
fpkmArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
diffArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
countArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
replicateArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
sep="\t",
na.string = "-",
header = TRUE,
quote = "",
stringsAsFactors = FALSE,
row.names=NULL,
...) {
#Error Trapping
if (missing(fpkmFile))
stop("fpkmFile cannot be missing!")
if (missing(dbConn))
stop("Must provide a dbConn connection")
#TODO test dbConn connection and database structure
idCols = c(1:9)
#Read primary file
write(paste("Reading ",fpkmFile,sep=""),stderr())
fpkmArgs$file = fpkmFile
full = as.data.frame(do.call(read.table,fpkmArgs))
########
#Handle Sample Names
########
#Check that samples table is populated
write("Checking samples table...",stderr())
samples<-getSamplesFromColnames(full)
samples<-make.db.names(dbConn,samples,unique=FALSE)
dbSamples<-dbReadTable(dbConn,"samples")
if (dim(dbSamples)[1]>0) {
if (all(samples %in% dbSamples$sample_name)){
write ("OK!",stderr())
}else{
write(samples,stderr())
stop("Sample mismatch!")
}
}else{
write("Populating samples table...",stderr())
populateSampleTable(samples,dbConn)
}
######
#Populate isoforms table
######
isoformCols<-c(1,4,5,6,2,3,7:9)
isoformsTable<-full[,isoformCols]
#This is a temporary fix until p_id is added to the 'isoforms.fpkm_tracking' file
isoformsTable<-cbind(isoformsTable[,1:2],data.frame(CDS_id=rep("NA",dim(isoformsTable)[1])),isoformsTable[,-c(1:2)])
#print (head(isoformsTable))
write("Writing isoforms table",stderr())
#dbWriteTable(dbConn,'isoforms',as.data.frame(isoformsTable),row.names=F,append=T)
insert_SQL<-'INSERT INTO isoforms VALUES(?,?,?,?,?,?,?,?,?,?)'
bulk_insert(dbConn,insert_SQL,isoformsTable)
######
#Populate isoformData table
######
write("Reshaping isoformData table",stderr())
isoformmelt<-melt(full,id.vars=c("tracking_id"),measure.vars=-idCols,variable_name="sample_name")
colnames(isoformmelt)[colnames(isoformmelt)=='variable']<-'sample_name'
#Clean up and normalize data
isoformmelt$measurement = ""
isoformmelt$measurement[grepl("_FPKM$",isoformmelt$sample_name)] = "fpkm"
isoformmelt$measurement[grepl("_conf_lo$",isoformmelt$sample_name)] = "conf_lo"
isoformmelt$measurement[grepl("_conf_hi$",isoformmelt$sample_name)] = "conf_hi"
isoformmelt$measurement[grepl("_status$",isoformmelt$sample_name)] = "status"
isoformmelt$sample_name<-gsub("_FPKM$","",isoformmelt$sample_name)
isoformmelt$sample_name<-gsub("_conf_lo$","",isoformmelt$sample_name)
isoformmelt$sample_name<-gsub("_conf_hi$","",isoformmelt$sample_name)
isoformmelt$sample_name<-gsub("_status$","",isoformmelt$sample_name)
#Adjust sample names with make.db.names
isoformmelt$sample_name <- make.db.names(dbConn,as.vector(isoformmelt$sample_name),unique=FALSE)
#Recast
write("Recasting",stderr())
isoformmelt<-as.data.frame(dcast(isoformmelt,...~measurement))
#Write geneData table
write("Writing isoformData table",stderr())
#dbWriteTable(dbConn,'isoformData',as.data.frame(isoformmelt[,c(1:2,5,3,4,6)]),row.names=F,append=T)
insert_SQL<-"INSERT INTO isoformData VALUES(?,?,?,?,?,?)"
bulk_insert(dbConn,insert_SQL,isoformmelt[,c(1:2,5,3,4,6)])
#######
#Handle isoform_exp.diff
#######
if(file.exists(diffFile)){
#Read diff file
write(paste("Reading ",diffFile,sep=""),stderr())
diffArgs$file = diffFile
diff<-as.data.frame(do.call(read.table,diffArgs))
if(dim(diff)[1]>0){
#Adjust sample names with make.db.names
diff$sample_1<-make.db.names(dbConn,as.vector(diff$sample_1),unique=FALSE)
diff$sample_2<-make.db.names(dbConn,as.vector(diff$sample_2),unique=FALSE)
write("Writing isoformExpDiffData table",stderr())
diffCols<-c(1,5:14)
#dbWriteTable(dbConn,'isoformExpDiffData',diff[,diffCols],row.names=F,append=T)
insert_SQL<-"INSERT INTO isoformExpDiffData VALUES(?,?,?,?,?,?,?,?,?,?,?)"
bulk_insert(dbConn,insert_SQL,diff[,diffCols])
}else{
write(paste("No records found in",diffFile),stderr())
}
}
###########
#Handle Counts .count_tracking
###########
if(file.exists(countFile)){
idCols = c(1)
#Read countFile
write(paste("Reading ", countFile,sep=""),stderr())
countArgs$file = countFile
counts<-as.data.frame(do.call(read.table,countArgs))
if(dim(counts)[1]>0){
#Reshape isoformCount table
write("Reshaping isoformCount table",stderr())
countmelt<-melt(counts,id.vars=c("tracking_id"),measure.vars=-idCols)
colnames(countmelt)[colnames(countmelt)=='variable']<-'sample_name'
countmelt$measurement = ""
countmelt$measurement[grepl("_count$",countmelt$sample_name)] = "count"
countmelt$measurement[grepl("_count_variance$",countmelt$sample_name)] = "variance"
countmelt$measurement[grepl("_count_uncertainty_var$",countmelt$sample_name)] = "uncertainty"
countmelt$measurement[grepl("_count_dispersion_var$",countmelt$sample_name)] = "dispersion"
countmelt$measurement[grepl("_status$",countmelt$sample_name)] = "status"
countmelt$sample_name<-gsub("_count$","",countmelt$sample_name)
countmelt$sample_name<-gsub("_count_variance$","",countmelt$sample_name)
countmelt$sample_name<-gsub("_count_uncertainty_var$","",countmelt$sample_name)
countmelt$sample_name<-gsub("_count_dispersion_var$","",countmelt$sample_name)
countmelt$sample_name<-gsub("_status$","",countmelt$sample_name)
#Adjust sample names with make.db.names
countmelt$sample_name <- make.db.names(dbConn,as.vector(countmelt$sample_name),unique=FALSE)
#Recast
write("Recasting",stderr())
countmelt<-as.data.frame(dcast(countmelt,...~measurement))
#debugging
#write(colnames(countmelt),stderr())
#Write isoformCount table
write("Writing isoformCount table",stderr())
insert_SQL<-'INSERT INTO isoformCount VALUES(:tracking_id,:sample_name,:count,:variance,:uncertainty,:dispersion,:status)'
bulk_insert(dbConn,insert_SQL,countmelt)
}else{
write(paste("No records found in",countFile),stderr())
}
}
###########
#Handle Replicates .rep_tracking
###########
if(file.exists(replicateFile)){
idCols = 1
#Read countFile
write(paste("Reading read group info in ", replicateFile,sep=""),stderr())
replicateArgs$file = replicateFile
reps<-as.data.frame(do.call(read.table,replicateArgs))
#print(head(reps))
if(dim(reps)[1]>0){
#Adjust sample names with make.db.names
reps$condition <- make.db.names(dbConn,as.character(reps$condition),unique=FALSE)
#Create unique rep name
reps$rep_name<-paste(reps$condition,reps$replicate,sep="_")
colnames(reps)[colnames(reps)=="condition"]<-"sample_name"
#Write isoformReplicateData table
write("Writing isoformReplicateData table",stderr())
insert_SQL<-'INSERT INTO isoformReplicateData VALUES(:tracking_id,:sample_name,:replicate,:rep_name,:raw_frags,:internal_scaled_frags,:external_scaled_frags,:FPKM,:effective_length,:status)'
bulk_insert(dbConn,insert_SQL,reps)
}else{
write(paste("No records found in",replicateFile),stderr())
}
}
}
#TSS groups
loadTSS<-function(fpkmFile,
diffFile,
splicingFile,
countFile,
replicateFile,
dbConn,
path,
#Arguments to read.* methods
fpkmArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
diffArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
splicingArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
countArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
replicateArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
sep="\t",
na.string = "-",
header = TRUE,
quote = "",
stringsAsFactors = FALSE,
row.names=NULL,
...) {
#Error Trapping
if (missing(fpkmFile))
stop("fpkmFile cannot be missing!")
if (missing(dbConn))
stop("Must provide a dbConn connection")
#TODO test dbConn connection and database structure
idCols = c(1:9)
#Read primary file
write(paste("Reading ",fpkmFile,sep=""),stderr())
fpkmArgs$file = fpkmFile
full = as.data.frame(do.call(read.table,fpkmArgs))
########
#Handle Sample Names
########
#Check that samples table is populated
write("Checking samples table...",stderr())
samples<-getSamplesFromColnames(full)
samples<-make.db.names(dbConn,samples,unique=FALSE)
dbSamples<-dbReadTable(dbConn,"samples")
if (dim(dbSamples)[1]>0) {
if (all(samples %in% dbSamples$sample_name)){
write ("OK!",stderr())
}else{
stop("Sample mismatch!")
}
}else{
write("Populating samples table...",stderr())
populateSampleTable(samples,dbConn)
}
######
#Populate TSS table
######
tssTable<-full[,c(1:5,7:9)]
write("Writing TSS table",stderr())
#dbWriteTable(dbConn,'TSS',tssTable,row.names=F,append=T)
if (nrow(tssTable)>0){
insert_SQL<-"INSERT INTO TSS VALUES(?,?,?,?,?,?,?,?)"
bulk_insert(dbConn,insert_SQL,tssTable)
######
#Populate geneData table
######
write("Reshaping TSSData table",stderr())
tssmelt<-melt(full,id.vars=c("tracking_id"),measure.vars=-idCols,variable_name="sample_name")
colnames(tssmelt)[colnames(tssmelt)=='variable']<-'sample_name'
#Clean up and normalize data
tssmelt$measurement = ""
tssmelt$measurement[grepl("_FPKM$",tssmelt$sample_name)] = "fpkm"
tssmelt$measurement[grepl("_conf_lo$",tssmelt$sample_name)] = "conf_lo"
tssmelt$measurement[grepl("_conf_hi$",tssmelt$sample_name)] = "conf_hi"
tssmelt$measurement[grepl("_status$",tssmelt$sample_name)] = "status"
tssmelt$sample_name<-gsub("_FPKM$","",tssmelt$sample_name)
tssmelt$sample_name<-gsub("_conf_lo$","",tssmelt$sample_name)
tssmelt$sample_name<-gsub("_conf_hi$","",tssmelt$sample_name)
tssmelt$sample_name<-gsub("_status$","",tssmelt$sample_name)
#Adjust sample names with make.db.names
tssmelt$sample_name <- make.db.names(dbConn,as.vector(tssmelt$sample_name),unique=FALSE)
#Recast
write("Recasting",stderr())
tssmelt<-as.data.frame(dcast(tssmelt,...~measurement))
#Write geneData table
write("Writing TSSData table",stderr())
#dbWriteTable(dbConn,'TSSData',as.data.frame(tssmelt[,c(1:2,5,3,4,6)]),row.names=F,append=T)
insert_SQL<-"INSERT INTO TSSData VALUES(?,?,?,?,?,?)"
bulk_insert(dbConn,insert_SQL,tssmelt[,c(1:2,5,3,4,6)])
}else{
write(paste("No records found in",fpkmFile),stderr())
write("TSS FPKM tracking file was empty.",stderr())
}
#######
#Handle tss_groups_exp.diff
#######
if(file.exists(diffFile)){
#Read diff file
write(paste("Reading ",diffFile,sep=""),stderr())
diffArgs$file = diffFile
diff<-as.data.frame(do.call(read.table,diffArgs))
if(dim(diff)[1]>0){
#Adjust sample names with make.db.names
diff$sample_1<-make.db.names(dbConn,as.vector(diff$sample_1),unique=FALSE)
diff$sample_2<-make.db.names(dbConn,as.vector(diff$sample_2),unique=FALSE)
write("Writing TSSExpDiffData table",stderr())
diffCols<-c(1,5:14)
#dbWriteTable(dbConn,'TSSExpDiffData',diff[,diffCols],row.names=F,append=T)
insert_SQL<-"INSERT INTO TSSExpDiffData VALUES(?,?,?,?,?,?,?,?,?,?,?)"
bulk_insert(dbConn,insert_SQL,diff[,diffCols])
}else{
write(paste("No records found in",diffFile),stderr())
}
}
#########
#TODO: Handle splicing.diff
########
if(file.exists(splicingFile)){
#Read promoterFile
write(paste("Reading ",splicingFile,sep=""),stderr())
splicingArgs$file = splicingFile
splicing<-as.data.frame(do.call(read.table,splicingArgs))
if(dim(splicing)[1]>0){
write("Writing splicingDiffData table",stderr())
splicingCols<-c(1:2,5:14)
#dbWriteTable(dbConn,'splicingDiffData',splicing[,splicingCols],row.names=F,append=T)
insert_SQL<-"INSERT INTO splicingDiffData VALUES(?,?,?,?,?,?,?,?,?,?,?,?)"
bulk_insert(dbConn,insert_SQL,splicing[,splicingCols])
}else{
write(paste("No records found in",splicingFile),stderr())
}
}
###########
#Handle Counts .count_tracking
###########
if(file.exists(countFile)){
idCols = c(1)
#Read countFile
write(paste("Reading ", countFile,sep=""),stderr())
countArgs$file = countFile
counts<-as.data.frame(do.call(read.table,countArgs))
if(dim(counts)[1]>0){
#Reshape TSSCount table
write("Reshaping TSSCount table",stderr())
countmelt<-melt(counts,id.vars=c("tracking_id"),measure.vars=-idCols)
colnames(countmelt)[colnames(countmelt)=='variable']<-'sample_name'
countmelt$measurement = ""
countmelt$measurement[grepl("_count$",countmelt$sample_name)] = "count"
countmelt$measurement[grepl("_count_variance$",countmelt$sample_name)] = "variance"
countmelt$measurement[grepl("_count_uncertainty_var$",countmelt$sample_name)] = "uncertainty"
countmelt$measurement[grepl("_count_dispersion_var$",countmelt$sample_name)] = "dispersion"
countmelt$measurement[grepl("_status$",countmelt$sample_name)] = "status"
countmelt$sample_name<-gsub("_count$","",countmelt$sample_name)
countmelt$sample_name<-gsub("_count_variance$","",countmelt$sample_name)
countmelt$sample_name<-gsub("_count_uncertainty_var$","",countmelt$sample_name)
countmelt$sample_name<-gsub("_count_dispersion_var$","",countmelt$sample_name)
countmelt$sample_name<-gsub("_status$","",countmelt$sample_name)
#Adjust sample names with make.db.names
countmelt$sample_name <- make.db.names(dbConn,as.vector(countmelt$sample_name),unique=FALSE)
#Recast
write("Recasting",stderr())
countmelt<-as.data.frame(dcast(countmelt,...~measurement))
#debugging
#write(colnames(countmelt),stderr())
#Write TSSCount table
write("Writing TSSCount table",stderr())
insert_SQL<-'INSERT INTO TSSCount VALUES(:tracking_id,:sample_name,:count,:variance,:uncertainty,:dispersion,:status)'
bulk_insert(dbConn,insert_SQL,countmelt)
}else{
write(paste("No records found in",countFile),stderr())
}
}
###########
#Handle Replicates .rep_tracking
###########
if(file.exists(replicateFile)){
idCols = 1
#Read countFile
write(paste("Reading read group info in ", replicateFile,sep=""),stderr())
replicateArgs$file = replicateFile
reps<-as.data.frame(do.call(read.table,replicateArgs))
#print(head(reps))
if(dim(reps)[1]>0){
#Adjust sample names with make.db.names
reps$condition <- make.db.names(dbConn,as.character(reps$condition),unique=FALSE)
#Create unique rep name
reps$rep_name<-paste(reps$condition,reps$replicate,sep="_")
colnames(reps)[colnames(reps)=="condition"]<-"sample_name"
#Write TSSReplicateData table
write("Writing TSSReplicateData table",stderr())
insert_SQL<-'INSERT INTO TSSReplicateData VALUES(:tracking_id,:sample_name,:replicate,:rep_name,:raw_frags,:internal_scaled_frags,:external_scaled_frags,:FPKM,:effective_length,:status)'
bulk_insert(dbConn,insert_SQL,reps)
}else{
write(paste("No records found in",replicateFile),stderr())
}
}
}
#CDS
loadCDS<-function(fpkmFile,
diffFile,
CDSDiff,
countFile,
replicateFile,
dbConn,
path,
#Arguments to read.* methods
fpkmArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
diffArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
CDSDiffArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
countArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
replicateArgs = list(sep=sep, header=header, row.names = row.names, quote=quote, na.string=na.string, ...),
sep="\t",
na.string = "-",
header = TRUE,
quote = "",
stringsAsFactors = FALSE,
row.names=NULL,
...) {
#Error Trapping
if (missing(fpkmFile))
stop("fpkmFile cannot be missing!")
if (missing(dbConn))
stop("Must provide a dbConn connection")
#TODO test dbConn connection and database structure
idCols = c(1:9)
#Read primary file
write(paste("Reading ",fpkmFile,sep=""),stderr())
fpkmArgs$file = fpkmFile
full = as.data.frame(do.call(read.table,fpkmArgs))
########
#Handle Sample Names
########
#Check that samples table is populated
write("Checking samples table...",stderr())
samples<-getSamplesFromColnames(full)
samples<-make.db.names(dbConn,samples,unique=FALSE)
dbSamples<-dbReadTable(dbConn,"samples")
if (dim(dbSamples)[1]>0) {
if (all(samples %in% dbSamples$sample_name)){
write ("OK!",stderr())
}else{
stop("Sample mismatch!")
}
}else{
write("Populating samples table...",stderr())
populateSampleTable(samples,dbConn)
}
######
#Populate CDS table
######
cdsTable<-full[,c(1:5,6:9)]
write("Writing CDS table",stderr())
#dbWriteTable(dbConn,'CDS',cdsTable,row.names=F,append=T)
if (nrow(cdsTable)>0){
insert_SQL<-"INSERT INTO CDS VALUES(?,?,?,?,?,?,?,?,?)"
bulk_insert(dbConn,insert_SQL,cdsTable)
######
#Populate geneData table
######
write("Reshaping CDSData table",stderr())
cdsmelt<-melt(full,id.vars=c("tracking_id"),measure.vars=-idCols,variable_name="sample_name")
colnames(cdsmelt)[colnames(cdsmelt)=='variable']<-'sample_name'
#Clean up and normalize data
cdsmelt$measurement = ""
cdsmelt$measurement[grepl("_FPKM$",cdsmelt$sample_name)] = "fpkm"
cdsmelt$measurement[grepl("_conf_lo$",cdsmelt$sample_name)] = "conf_lo"
cdsmelt$measurement[grepl("_conf_hi$",cdsmelt$sample_name)] = "conf_hi"
cdsmelt$measurement[grepl("_status$",cdsmelt$sample_name)] = "status"
cdsmelt$sample_name<-gsub("_FPKM$","",cdsmelt$sample_name)
cdsmelt$sample_name<-gsub("_conf_lo$","",cdsmelt$sample_name)
cdsmelt$sample_name<-gsub("_conf_hi$","",cdsmelt$sample_name)
cdsmelt$sample_name<-gsub("_status$","",cdsmelt$sample_name)
#Adjust sample names with make.db.names
cdsmelt$sample_name <- make.db.names(dbConn,as.vector(cdsmelt$sample_name),unique=FALSE)
#Recast
write("Recasting",stderr())
cdsmelt<-as.data.frame(dcast(cdsmelt,...~measurement))
#Write geneData table
write("Writing CDSData table",stderr())
#dbWriteTable(dbConn,'CDSData',as.data.frame(cdsmelt[,c(1:2,5,3,4,6)]),row.names=F,append=T)
insert_SQL<-"INSERT INTO CDSData VALUES(?,?,?,?,?,?)"
bulk_insert(dbConn,insert_SQL,cdsmelt[,c(1:2,5,3,4,6)])
}else {
write(paste("No records found in",fpkmFile),stderr())
write("CDS FPKM tracking file was empty.",stderr())
}
#######
#Handle cds_groups_exp.diff
#######
if(file.exists(diffFile)){
#Read diff file
write(paste("Reading ",diffFile,sep=""),stderr())
diffArgs$file = diffFile
diff<-as.data.frame(do.call(read.table,diffArgs))
if(dim(diff)[1]>0){
#Adjust sample names with make.db.names
diff$sample_1<-make.db.names(dbConn,as.vector(diff$sample_1),unique=FALSE)
diff$sample_2<-make.db.names(dbConn,as.vector(diff$sample_2),unique=FALSE)
write("Writing CDSExpDiffData table",stderr())
diffCols<-c(1,5:14)
#dbWriteTable(dbConn,'CDSExpDiffData',diff[,diffCols],row.names=F,append=T)
insert_SQL<-"INSERT INTO CDSExpDiffData VALUES(?,?,?,?,?,?,?,?,?,?,?)"
bulk_insert(dbConn,insert_SQL,diff[,diffCols])
}else{
write(paste("No records found in",diffFile),stderr())
}
}
#########
#TODO: Handle CDS.diff
########
if(file.exists(CDSDiff)){
#Read promoterFile
write(paste("Reading ",CDSDiff,sep=""),stderr())
CDSDiffArgs$file = CDSDiff
CDS<-as.data.frame(do.call(read.table,CDSDiffArgs))
if(dim(CDS)[1]>0){
write("Writing CDSDiffData table",stderr())
CDSCols<-c(2,5:14)
#dbWriteTable(dbConn,'CDSDiffData',CDS[,CDSCols],row.names=F,append=T)
insert_SQL<-"INSERT INTO CDSDiffData VALUES(?,?,?,?,?,?,?,?,?,?,?)"
bulk_insert(dbConn,insert_SQL,CDS[,CDSCols])
}else{
write(paste("No records found in",CDSDiff),stderr())
}
}
###########
#Handle Counts .count_tracking
###########
if(file.exists(countFile)){
idCols = c(1)
#Read countFile
write(paste("Reading ", countFile,sep=""),stderr())
countArgs$file = countFile
counts<-as.data.frame(do.call(read.table,countArgs))
if(dim(counts)[1]>0){
#Reshape CDSCount table
write("Reshaping CDSCount table",stderr())
countmelt<-melt(counts,id.vars=c("tracking_id"),measure.vars=-idCols)
colnames(countmelt)[colnames(countmelt)=='variable']<-'sample_name'
countmelt$measurement = ""
countmelt$measurement[grepl("_count$",countmelt$sample_name)] = "count"
countmelt$measurement[grepl("_count_variance$",countmelt$sample_name)] = "variance"
countmelt$measurement[grepl("_count_uncertainty_var$",countmelt$sample_name)] = "uncertainty"
countmelt$measurement[grepl("_count_dispersion_var$",countmelt$sample_name)] = "dispersion"
countmelt$measurement[grepl("_status$",countmelt$sample_name)] = "status"
countmelt$sample_name<-gsub("_count$","",countmelt$sample_name)
countmelt$sample_name<-gsub("_count_variance$","",countmelt$sample_name)
countmelt$sample_name<-gsub("_count_uncertainty_var$","",countmelt$sample_name)
countmelt$sample_name<-gsub("_count_dispersion_var$","",countmelt$sample_name)
countmelt$sample_name<-gsub("_status$","",countmelt$sample_name)
#Adjust sample names with make.db.names
countmelt$sample_name <- make.db.names(dbConn,as.vector(countmelt$sample_name),unique=FALSE)
#Recast
write("Recasting",stderr())
countmelt<-as.data.frame(dcast(countmelt,...~measurement))
#debugging
#write(colnames(countmelt),stderr())
#Write CDSCount table
write("Writing CDSCount table",stderr())
insert_SQL<-'INSERT INTO CDSCount VALUES(:tracking_id,:sample_name,:count,:variance,:uncertainty,:dispersion,:status)'
bulk_insert(dbConn,insert_SQL,countmelt)
}else{
write(paste("No records found in",countFile),stderr())
}
}
###########
#Handle Replicates .rep_tracking
###########
if(file.exists(replicateFile)){
idCols = 1
#Read countFile
write(paste("Reading read group info in ", replicateFile,sep=""),stderr())
replicateArgs$file = replicateFile
reps<-as.data.frame(do.call(read.table,replicateArgs))
#print(head(reps))
if(dim(reps)[1]>0){
#Adjust sample names with make.db.names
reps$condition <- make.db.names(dbConn,as.character(reps$condition),unique=FALSE)
#Create unique rep name
reps$rep_name<-paste(reps$condition,reps$replicate,sep="_")
colnames(reps)[colnames(reps)=="condition"]<-"sample_name"
#Write CDSReplicateData table
write("Writing CDSReplicateData table",stderr())
insert_SQL<-'INSERT INTO CDSReplicateData VALUES(:tracking_id,:sample_name,:replicate,:rep_name,:raw_frags,:internal_scaled_frags,:external_scaled_frags,:FPKM,:effective_length,:status)'
bulk_insert(dbConn,insert_SQL,reps)
}else{
write(paste("No records found in",replicateFile),stderr())
}
}
}
########################
#Add FeatureData
########################
#####################
#Database Setup Functions
#####################
createDB_noIndex<-function(dbFname="cuffData.db",driver="SQLite") {
#Builds sqlite db at 'dbFname' and returns a dbConnect object pointing to newly created database.
#No indexes are present
drv<-dbDriver(driver)
db <- dbConnect(drv,dbname=dbFname)
schema.text<-'
-- Creator: MySQL Workbench 5.2.33/ExportSQLite plugin 2009.12.02
-- Author: Loyal Goff
-- Caption: New Model
-- Project: Name of the project
-- Changed: 2012-04-30 22:21
-- Created: 2011-05-02 12:52
PRAGMA foreign_keys = OFF;
-- Schema: cuffData
BEGIN;
DROP TABLE IF EXISTS "genes";
CREATE TABLE "genes"(
"gene_id" VARCHAR(45) PRIMARY KEY NOT NULL,
"class_code" VARCHAR(45),
"nearest_ref_id" VARCHAR(45),
"gene_short_name" VARCHAR(45),
"locus" VARCHAR(45),
"length" INTEGER,
"coverage" FLOAT
);
DROP TABLE IF EXISTS "biasData";
CREATE TABLE "biasData"(
"biasData_id" INTEGER PRIMARY KEY NOT NULL
);
DROP TABLE IF EXISTS "samples";
CREATE TABLE "samples"(
"sample_index" INTEGER NOT NULL,
"sample_name" VARCHAR(45) PRIMARY KEY NOT NULL
);
DROP TABLE IF EXISTS "TSS";
CREATE TABLE "TSS"(
"TSS_group_id" VARCHAR(45) PRIMARY KEY NOT NULL,
"class_code" VARCHAR(45),
"nearest_ref_id" VARCHAR(45),
"gene_id" VARCHAR(45) NOT NULL,
"gene_short_name" VARCHAR(45),
"locus" VARCHAR(45),
"length" INTEGER,
"coverage" FLOAT,
CONSTRAINT "fk_TSS_genes1"
FOREIGN KEY("gene_id")
REFERENCES "genes"("gene_id")
);
DROP TABLE IF EXISTS "TSSData";
CREATE TABLE "TSSData"(
"TSS_group_id" VARCHAR(45) NOT NULL,
"sample_name" VARCHAR(45) NOT NULL,
"fpkm" FLOAT,
"conf_hi" FLOAT,
"conf_lo" FLOAT,
"quant_status" VARCHAR(45),
CONSTRAINT "fk_TSSData_TSS1"
FOREIGN KEY("TSS_group_id")
REFERENCES "TSS"("TSS_group_id"),
CONSTRAINT "fk_TSSData_samples1"
FOREIGN KEY("sample_name")
REFERENCES "samples"("sample_name")
);
DROP TABLE IF EXISTS "CDS";
CREATE TABLE "CDS"(
"CDS_id" VARCHAR(45) PRIMARY KEY NOT NULL,
"class_code" VARCHAR(45),
"nearest_ref_id" VARCHAR(45),
"gene_id" VARCHAR(45),
"gene_short_name" VARCHAR(45),
"TSS_group_id" VARCHAR(45),
"locus" VARCHAR(45),
"length" INTEGER,
"coverage" FLOAT,
CONSTRAINT "fk_CDS_genes1"
FOREIGN KEY("gene_id")
REFERENCES "genes"("gene_id"),
CONSTRAINT "fk_CDS_TSS1"
FOREIGN KEY("TSS_group_id")
REFERENCES "TSS"("TSS_group_id")
);
DROP TABLE IF EXISTS "CDSData";
CREATE TABLE "CDSData"(
"CDS_id" VARCHAR(45) NOT NULL,
"sample_name" VARCHAR(45) NOT NULL,
"fpkm" FLOAT,
"conf_hi" FLOAT,
"conf_lo" FLOAT,
"quant_status" VARCHAR(45),
CONSTRAINT "fk_CDSData_CDS1"
FOREIGN KEY("CDS_id")
REFERENCES "CDS"("CDS_id"),
CONSTRAINT "fk_CDSData_samples1"
FOREIGN KEY("sample_name")
REFERENCES "samples"("sample_name")
);
DROP TABLE IF EXISTS "splicingDiffData";
CREATE TABLE "splicingDiffData"(
"TSS_group_id" VARCHAR(45) NOT NULL,
"gene_id" VARCHAR(45) NOT NULL,
"sample_1" VARCHAR(45) NOT NULL,
"sample_2" VARCHAR(45) NOT NULL,
"status" VARCHAR(45),
"value_1" FLOAT,
"value_2" FLOAT,
"JS_dist" FLOAT,
"test_stat" FLOAT,
"p_value" FLOAT,
"q_value" FLOAT,
"significant" VARCHAR(45),
CONSTRAINT "fk_splicingDiffData_samples1"
FOREIGN KEY("sample_1")
REFERENCES "samples"("sample_name"),
CONSTRAINT "fk_splicingDiffData_samples2"
FOREIGN KEY("sample_2")
REFERENCES "samples"("sample_name"),
CONSTRAINT "fk_splicingDiffData_TSS1"
FOREIGN KEY("TSS_group_id")
REFERENCES "TSS"("TSS_group_id"),
CONSTRAINT "fk_splicingDiffData_genes1"
FOREIGN KEY("gene_id")
REFERENCES "genes"("gene_id")
);
DROP TABLE IF EXISTS "TSSExpDiffData";
CREATE TABLE "TSSExpDiffData"(
"TSS_group_id" VARCHAR(45) NOT NULL,
"sample_1" VARCHAR(45) NOT NULL,
"sample_2" VARCHAR(45) NOT NULL,
"status" VARCHAR(45),
"value_1" FLOAT,
"value_2" FLOAT,
"log2_fold_change" FLOAT,
"test_stat" FLOAT,
"p_value" FLOAT,
"q_value" FLOAT,
"significant" VARCHAR(45),
CONSTRAINT "fk_TSSExpDiffData_TSS1"
FOREIGN KEY("TSS_group_id")
REFERENCES "TSS"("TSS_group_id"),
CONSTRAINT "fk_TSSExpDiffData_samples1"
FOREIGN KEY("sample_1")
REFERENCES "samples"("sample_name"),
CONSTRAINT "fk_TSSExpDiffData_samples2"
FOREIGN KEY("sample_2")
REFERENCES "samples"("sample_name")
);
DROP TABLE IF EXISTS "CDSDiffData";
CREATE TABLE "CDSDiffData"(
"gene_id" VARCHAR(45) NOT NULL,
"sample_1" VARCHAR(45) NOT NULL,
"sample_2" VARCHAR(45) NOT NULL,
"status" VARCHAR(45),
"value_1" FLOAT,
"value_2" FLOAT,
"JS_dist" FLOAT,
"test_stat" FLOAT,
"p_value" FLOAT,
"q_value" FLOAT,
"significant" VARCHAR(45),
CONSTRAINT "fk_CDSDiffData_samples1"
FOREIGN KEY("sample_1")
REFERENCES "samples"("sample_name"),
CONSTRAINT "fk_CDSDiffData_samples2"
FOREIGN KEY("sample_2")
REFERENCES "samples"("sample_name"),
CONSTRAINT "fk_CDSDiffData_genes1"
FOREIGN KEY("gene_id")
REFERENCES "genes"("gene_id")
);
DROP TABLE IF EXISTS "CDSExpDiffData";
CREATE TABLE "CDSExpDiffData"(
"CDS_id" VARCHAR(45) NOT NULL,
"sample_1" VARCHAR(45) NOT NULL,
"sample_2" VARCHAR(45) NOT NULL,
"status" VARCHAR(45),
"value_1" FLOAT,
"value_2" FLOAT,
"log2_fold_change" FLOAT,
"test_stat" FLOAT,
"p_value" FLOAT,
"q_value" FLOAT,
"significant" VARCHAR(45),
CONSTRAINT "fk_CDSExpDiffData_CDS1"
FOREIGN KEY("CDS_id")
REFERENCES "CDS"("CDS_id"),
CONSTRAINT "fk_CDSExpDiffData_samples1"
FOREIGN KEY("sample_1")
REFERENCES "samples"("sample_name"),
CONSTRAINT "fk_CDSExpDiffData_samples2"
FOREIGN KEY("sample_2")
REFERENCES "samples"("sample_name")
);
DROP TABLE IF EXISTS "promoterDiffData";
CREATE TABLE "promoterDiffData"(
"gene_id" VARCHAR(45) NOT NULL,
"sample_1" VARCHAR(45) NOT NULL,
"sample_2" VARCHAR(45) NOT NULL,
"status" VARCHAR(45),
"value_1" FLOAT,
"value_2" FLOAT,
"JS_dist" FLOAT,
"test_stat" FLOAT,
"p_value" FLOAT,
"q_value" FLOAT,
"significant" VARCHAR(45),
CONSTRAINT "fk_promoterDiffData_genes1"
FOREIGN KEY("gene_id")
REFERENCES "genes"("gene_id"),
CONSTRAINT "fk_promoterDiffData_samples1"
FOREIGN KEY("sample_1")
REFERENCES "samples"("sample_name"),
CONSTRAINT "fk_promoterDiffData_samples2"
FOREIGN KEY("sample_2")
REFERENCES "samples"("sample_name")
);
DROP TABLE IF EXISTS "geneFeatures";
CREATE TABLE "geneFeatures"(
"gene_id" VARCHAR(45) NOT NULL,
CONSTRAINT "fk_geneFeatures_genes1"
FOREIGN KEY("gene_id")
REFERENCES "genes"("gene_id")
);
DROP TABLE IF EXISTS "TSSFeatures";
CREATE TABLE "TSSFeatures"(
"TSS_group_id" VARCHAR(45) NOT NULL,
CONSTRAINT "fk_TSSFeatures_TSS1"
FOREIGN KEY("TSS_group_id")
REFERENCES "TSS"("TSS_group_id")
);
DROP TABLE IF EXISTS "CDSFeatures";
CREATE TABLE "CDSFeatures"(
"CDS_id" VARCHAR(45) NOT NULL,
CONSTRAINT "fk_CDSFeatures_CDS1"
FOREIGN KEY("CDS_id")
REFERENCES "CDS"("CDS_id")
);
DROP TABLE IF EXISTS "model_transcripts";
CREATE TABLE "model_transcripts"(
"model_transcript_id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL
);
DROP TABLE IF EXISTS "geneCount";
CREATE TABLE "geneCount"(
"gene_id" VARCHAR(45) NOT NULL,
"sample_name" VARCHAR(45) NOT NULL,
"count" FLOAT,
"variance" FLOAT,
"uncertainty" FLOAT,
"dispersion" FLOAT,
"status" VARCHAR(45),
CONSTRAINT "fk_geneCount_samples1"
FOREIGN KEY("sample_name")
REFERENCES "samples"("sample_name"),
CONSTRAINT "fk_geneCount_genes1"
FOREIGN KEY("gene_id")
REFERENCES "genes"("gene_id")
);
DROP TABLE IF EXISTS "CDSCount";
CREATE TABLE "CDSCount"(
"CDS_id" VARCHAR(45) NOT NULL,
"sample_name" VARCHAR(45) NOT NULL,
"count" FLOAT,
"variance" FLOAT,
"uncertainty" FLOAT,
"dispersion" FLOAT,
"status" VARCHAR(45),
CONSTRAINT "fk_CDSCount_CDS1"
FOREIGN KEY("CDS_id")
REFERENCES "CDS"("CDS_id"),
CONSTRAINT "fk_CDSCount_samples1"
FOREIGN KEY("sample_name")
REFERENCES "samples"("sample_name")
);
DROP TABLE IF EXISTS "TSSCount";
CREATE TABLE "TSSCount"(
"TSS_group_id" VARCHAR(45) NOT NULL,
"sample_name" VARCHAR(45) NOT NULL,
"count" FLOAT,
"variance" FLOAT,
"uncertainty" FLOAT,
"dispersion" FLOAT,
"status" VARCHAR(45),
CONSTRAINT "fk_TSSCount_TSS1"
FOREIGN KEY("TSS_group_id")
REFERENCES "TSS"("TSS_group_id"),
CONSTRAINT "fk_TSSCount_samples1"
FOREIGN KEY("sample_name")
REFERENCES "samples"("sample_name")
);
DROP TABLE IF EXISTS "replicates";
CREATE TABLE "replicates"(
"file" INTEGER NOT NULL,
"sample_name" VARCHAR(45) NOT NULL,
"replicate" INT NOT NULL,
"rep_name" VARCHAR(45) PRIMARY KEY NOT NULL,
"total_mass" FLOAT,
"norm_mass" FLOAT,
"internal_scale" FLOAT,
"external_scale" FLOAT,
CONSTRAINT "fk_replicates_samples1"
FOREIGN KEY("sample_name")
REFERENCES "samples"("sample_name")
);
DROP TABLE IF EXISTS "geneReplicateData";
CREATE TABLE "geneReplicateData"(
"gene_id" VARCHAR(45) NOT NULL,
"sample_name" VARCHAR(45) NOT NULL,
"replicate" INTEGER,
"rep_name" VARCHAR(45) NOT NULL,
"raw_frags" FLOAT,
"internal_scaled_frags" FLOAT,
"external_scaled_frags" FLOAT,
"fpkm" FLOAT,
"effective_length" FLOAT,
"status" VARCHAR(45),
CONSTRAINT "fk_geneData_genes10"
FOREIGN KEY("gene_id")
REFERENCES "genes"("gene_id"),
CONSTRAINT "fk_geneReplicateData_replicates1"
FOREIGN KEY("rep_name")
REFERENCES "replicates"("rep_name"),
CONSTRAINT "fk_geneReplicateData_samples1"
FOREIGN KEY("sample_name")
REFERENCES "samples"("sample_name")
);
DROP TABLE IF EXISTS "CDSReplicateData";
CREATE TABLE "CDSReplicateData"(
"CDS_id" VARCHAR(45) NOT NULL,
"sample_name" VARCHAR(45) NOT NULL,
"replicate" INTEGER,
"rep_name" VARCHAR(45) NOT NULL,
"raw_frags" FLOAT,
"internal_scaled_frags" FLOAT,
"external_scaled_frags" FLOAT,
"fpkm" FLOAT,
"effective_length" FLOAT,
"status" VARCHAR(45),
CONSTRAINT "fk_geneReplicateData_replicates100"
FOREIGN KEY("rep_name")
REFERENCES "replicates"("rep_name"),
CONSTRAINT "fk_CDSReplicateData_CDS1"
FOREIGN KEY("CDS_id")
REFERENCES "CDS"("CDS_id"),
CONSTRAINT "fk_CDSReplicateData_samples1"
FOREIGN KEY("sample_name")
REFERENCES "samples"("sample_name")
);
DROP TABLE IF EXISTS "TSSReplicateData";
CREATE TABLE "TSSReplicateData"(
"TSS_group_id" VARCHAR(45) NOT NULL,
"sample_name" VARCHAR(45) NOT NULL,
"replicate" VARCHAR(45),
"rep_name" VARCHAR(45) NOT NULL,
"raw_frags" FLOAT,
"internal_scaled_frags" FLOAT,
"external_scaled_frags" FLOAT,
"fpkm" FLOAT,
"effective_length" FLOAT,
"status" VARCHAR(45),
CONSTRAINT "fk_geneReplicateData_replicates10000"
FOREIGN KEY("rep_name")
REFERENCES "replicates"("rep_name"),
CONSTRAINT "fk_TSSReplicateData_TSS1"
FOREIGN KEY("TSS_group_id")
REFERENCES "TSS"("TSS_group_id"),
CONSTRAINT "fk_TSSReplicateData_samples1"
FOREIGN KEY("sample_name")
REFERENCES "samples"("sample_name")
);
DROP TABLE IF EXISTS "runInfo";
CREATE TABLE "runInfo"(
"param" VARCHAR(45),
"value" TEXT
);
DROP TABLE IF EXISTS "geneData";
CREATE TABLE "geneData"(
"gene_id" VARCHAR(45) NOT NULL,
"sample_name" VARCHAR(45) NOT NULL,
"fpkm" FLOAT,
"conf_hi" FLOAT,
"conf_lo" FLOAT,
"quant_status" VARCHAR(45),
CONSTRAINT "fk_geneData_genes1"
FOREIGN KEY("gene_id")
REFERENCES "genes"("gene_id"),
CONSTRAINT "fk_geneData_samples1"
FOREIGN KEY("sample_name")
REFERENCES "samples"("sample_name")
);
DROP TABLE IF EXISTS "phenoData";
CREATE TABLE "phenoData"(
"sample_name" VARCHAR(45) NOT NULL,
"parameter" VARCHAR(45) NOT NULL,
"value" VARCHAR(45),
CONSTRAINT "fk_phenoData_samples"
FOREIGN KEY("sample_name")
REFERENCES "samples"("sample_name")
);
DROP TABLE IF EXISTS "geneExpDiffData";
CREATE TABLE "geneExpDiffData"(
"gene_id" VARCHAR(45) NOT NULL,
"sample_1" VARCHAR(45) NOT NULL,
"sample_2" VARCHAR(45) NOT NULL,
"status" VARCHAR(45),
"value_1" FLOAT,
"value_2" FLOAT,
"log2_fold_change" FLOAT,
"test_stat" FLOAT,
"p_value" FLOAT,
"q_value" FLOAT,
"significant" VARCHAR(45),
CONSTRAINT "fk_geneExpDiffData_genes1"
FOREIGN KEY("gene_id")
REFERENCES "genes"("gene_id"),
CONSTRAINT "fk_geneExpDiffData_samples1"
FOREIGN KEY("sample_1")
REFERENCES "samples"("sample_name"),
CONSTRAINT "fk_geneExpDiffData_samples2"
FOREIGN KEY("sample_2")
REFERENCES "samples"("sample_name")
);
DROP TABLE IF EXISTS "isoforms";
CREATE TABLE "isoforms"(
"isoform_id" VARCHAR(45) PRIMARY KEY NOT NULL,
"gene_id" VARCHAR(45),
"CDS_id" VARCHAR(45),
"gene_short_name" VARCHAR(45),
"TSS_group_id" VARCHAR(45),
"class_code" VARCHAR(45),
"nearest_ref_id" VARCHAR(45),
"locus" VARCHAR(45),
"length" INTEGER,
"coverage" FLOAT,
CONSTRAINT "fk_isoforms_TSS1"
FOREIGN KEY("TSS_group_id")
REFERENCES "TSS"("TSS_group_id"),
CONSTRAINT "fk_isoforms_CDS1"
FOREIGN KEY("CDS_id")
REFERENCES "CDS"("CDS_id"),
CONSTRAINT "fk_isoforms_genes1"
FOREIGN KEY("gene_id")
REFERENCES "genes"("gene_id")
);
DROP TABLE IF EXISTS "isoformData";
CREATE TABLE "isoformData"(
"isoform_id" VARCHAR(45) NOT NULL,
"sample_name" VARCHAR(45) NOT NULL,
"fpkm" FLOAT NOT NULL,
"conf_hi" FLOAT,
"conf_lo" FLOAT,
"quant_status" VARCHAR(45),
CONSTRAINT "fk_isoformData_samples1"
FOREIGN KEY("sample_name")
REFERENCES "samples"("sample_name"),
CONSTRAINT "fk_isoformData_isoforms1"
FOREIGN KEY("isoform_id")
REFERENCES "isoforms"("isoform_id")
);
DROP TABLE IF EXISTS "isoformExpDiffData";
CREATE TABLE "isoformExpDiffData"(
"isoform_id" VARCHAR(45) NOT NULL,
"sample_1" VARCHAR(45) NOT NULL,
"sample_2" VARCHAR(45) NOT NULL,
"status" VARCHAR(45),
"value_1" FLOAT,
"value_2" FLOAT,
"log2_fold_change" FLOAT,
"test_stat" FLOAT,
"p_value" FLOAT,
"q_value" FLOAT,
"significant" VARCHAR(45),
CONSTRAINT "fk_isoformExpDiffData_isoforms1"
FOREIGN KEY("isoform_id")
REFERENCES "isoforms"("isoform_id"),
CONSTRAINT "fk_isoformExpDiffData_samples1"
FOREIGN KEY("sample_1")
REFERENCES "samples"("sample_name"),
CONSTRAINT "fk_isoformExpDiffData_samples2"
FOREIGN KEY("sample_2")
REFERENCES "samples"("sample_name")
);
DROP TABLE IF EXISTS "isoformFeatures";
CREATE TABLE "isoformFeatures"(
"isoform_id" VARCHAR(45) NOT NULL,
CONSTRAINT "fk_isoformFeatures_isoforms1"
FOREIGN KEY("isoform_id")
REFERENCES "isoforms"("isoform_id")
);
DROP TABLE IF EXISTS "features";
CREATE TABLE "features"(
-- GTF Features (all lines/records from reference .gtf file)
"feature_id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
"gene_id" VARCHAR(45) NOT NULL,
"isoform_id" VARCHAR(45) NOT NULL,
"seqnames" VARCHAR(45) NOT NULL,
"source" VARCHAR(45) NOT NULL,
"type" INTEGER,
"start" INTEGER,
"end" INTEGER,
"score" FLOAT,
"strand" VARCHAR(45),
"frame" VARCHAR(45),
CONSTRAINT "fk_features_genes1"
FOREIGN KEY("gene_id")
REFERENCES "genes"("gene_id"),
CONSTRAINT "fk_features_isoforms1"
FOREIGN KEY("isoform_id")
REFERENCES "isoforms"("isoform_id")
);
DROP TABLE IF EXISTS "attributes";
CREATE TABLE "attributes"(
"attribute_lookup_id" INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
"feature_id" INTEGER NOT NULL,
"attribute" VARCHAR(45) NOT NULL,
"value" VARCHAR(45) NOT NULL,
CONSTRAINT "fk_attribute_lookup_features1"
FOREIGN KEY("feature_id")
REFERENCES "features"("feature_id")
);
DROP TABLE IF EXISTS "isoformCount";
CREATE TABLE "isoformCount"(
"isoform_id" VARCHAR(45) NOT NULL,
"sample_name" VARCHAR(45) NOT NULL,
"count" FLOAT,
"variance" FLOAT,
"uncertainty" FLOAT,
"dispersion" FLOAT,
"status" VARCHAR(45),
CONSTRAINT "fk_isoformCount_isoforms1"
FOREIGN KEY("isoform_id")
REFERENCES "isoforms"("isoform_id"),
CONSTRAINT "fk_isoformCount_samples1"
FOREIGN KEY("sample_name")
REFERENCES "samples"("sample_name")
);
DROP TABLE IF EXISTS "isoformReplicateData";
CREATE TABLE "isoformReplicateData"(
"isoform_id" VARCHAR(45) NOT NULL,
"sample_name" VARCHAR(45) NOT NULL,
"replicate" INTEGER,
"rep_name" VARCHAR(45) NOT NULL,
"raw_frags" FLOAT,
"internal_scaled_frags" FLOAT,
"external_scaled_frags" FLOAT,
"fpkm" FLOAT,
"effective_length" FLOAT,
"status" VARCHAR(45),
CONSTRAINT "fk_geneReplicateData_replicates10"
FOREIGN KEY("rep_name")
REFERENCES "replicates"("rep_name"),
CONSTRAINT "fk_isoformReplicateData_isoforms1"
FOREIGN KEY("isoform_id")
REFERENCES "isoforms"("isoform_id"),
CONSTRAINT "fk_isoformReplicateData_samples1"
FOREIGN KEY("sample_name")
REFERENCES "samples"("sample_name")
);
DROP TABLE IF EXISTS "varModel";
CREATE TABLE "varModel"(
"condition" VARCHAR(45) NOT NULL,
"locus" VARCHAR(45) NOT NULL,
"compatible_count_mean" FLOAT,
"compatible_count_var" FLOAT,
"total_count_mean" FLOAT,
"total_count_var" FLOAT,
"fitted_var" FLOAT,
CONSTRAINT "fk_varModel_locus1"
FOREIGN KEY("locus")
REFERENCES "genes"("locus")
CONSTRAINT "fk_varModel_condition1"
FOREIGN KEY("condition")
REFERENCES "samples"("sample_name")
);
COMMIT;
'
create.sql <- strsplit(schema.text, "\n")[[1]]
create.sql <- paste(collapse="\n", create.sql)
create.sql <- strsplit(create.sql, ";")[[1]]
create.sql <- create.sql[-length(create.sql)] #nothing to run here
tmp <- sapply(create.sql,function(x) dbGetQuery(db,x))
db
}
createIndices<-function(dbFname="cuffData.db",driver="SQLite",verbose=F){
drv<-dbDriver(driver)
db <- dbConnect(drv,dbname=dbFname)
index.text<-
'CREATE INDEX "genes.gsn_index" ON "genes"("gene_short_name");
CREATE INDEX "genes.cc_index" ON "genes"("class_code");
CREATE INDEX "TSS.fk_TSS_genes1" ON "TSS"("gene_id");
CREATE INDEX "TSSData.fk_TSSData_TSS1" ON "TSSData"("TSS_group_id");
CREATE INDEX "TSSData.fk_TSSData_samples1" ON "TSSData"("sample_name");
CREATE INDEX "CDS.fk_CDS_genes1" ON "CDS"("gene_id");
CREATE INDEX "CDS.fk_CDS_TSS1" ON "CDS"("TSS_group_id");
CREATE INDEX "CDSData.fk_CDSData_CDS1" ON "CDSData"("CDS_id");
CREATE INDEX "CDSData.fk_CDSData_samples1" ON "CDSData"("sample_name");
CREATE INDEX "splicingDiffData.fk_splicingDiffData_samples1" ON "splicingDiffData"("sample_1");
CREATE INDEX "splicingDiffData.fk_splicingDiffData_samples2" ON "splicingDiffData"("sample_2");
CREATE INDEX "splicingDiffData.fk_splicingDiffData_TSS1" ON "splicingDiffData"("TSS_group_id");
CREATE INDEX "splicingDiffData.fk_splicingDiffData_genes1" ON "splicingDiffData"("gene_id");
CREATE INDEX "TSSExpDiffData.fk_TSSExpDiffData_TSS1" ON "TSSExpDiffData"("TSS_group_id");
CREATE INDEX "TSSExpDiffData.fk_TSSExpDiffData_samples1" ON "TSSExpDiffData"("sample_1");
CREATE INDEX "TSSExpDiffData.fk_TSSExpDiffData_samples2" ON "TSSExpDiffData"("sample_2");
CREATE INDEX "TSSExpDiffData.TSSExpDiffData_sig_index" ON "TSSExpDiffData"("test_stat","p_value","q_value","significant");
CREATE INDEX "CDSDiffData.fk_CDSDiffData_samples1" ON "CDSDiffData"("sample_1");
CREATE INDEX "CDSDiffData.fk_CDSDiffData_samples2" ON "CDSDiffData"("sample_2");
CREATE INDEX "CDSDiffData.fk_CDSDiffData_genes1" ON "CDSDiffData"("gene_id");
CREATE INDEX "CDSExpDiffData.fk_CDSExpDiffData_CDS1" ON "CDSExpDiffData"("CDS_id");
CREATE INDEX "CDSExpDiffData.fk_CDSExpDiffData_samples1" ON "CDSExpDiffData"("sample_1");
CREATE INDEX "CDSExpDiffData.fk_CDSExpDiffData_samples2" ON "CDSExpDiffData"("sample_2");
CREATE INDEX "CDSExpDiffData.CDSExpDiffData_sig_index" ON "CDSExpDiffData"("test_stat","p_value","q_value","significant");
CREATE INDEX "promoterDiffData.fk_promoterDiffData_genes1" ON "promoterDiffData"("gene_id");
CREATE INDEX "promoterDiffData.fk_promoterDiffData_samples1" ON "promoterDiffData"("sample_1");
CREATE INDEX "promoterDiffData.fk_promoterDiffData_samples2" ON "promoterDiffData"("sample_2");
CREATE INDEX "geneFeatures.fk_geneFeatures_genes1" ON "geneFeatures"("gene_id");
CREATE INDEX "TSSFeatures.fk_TSSFeatures_TSS1" ON "TSSFeatures"("TSS_group_id");
CREATE INDEX "CDSFeatures.fk_CDSFeatures_CDS1" ON "CDSFeatures"("CDS_id");
CREATE INDEX "geneCount.fk_geneCount_samples1" ON "geneCount"("sample_name");
CREATE INDEX "geneCount.fk_geneCount_genes1" ON "geneCount"("gene_id");
CREATE INDEX "CDSCount.fk_CDSCount_CDS1" ON "CDSCount"("CDS_id");
CREATE INDEX "CDSCount.fk_CDSCount_samples1" ON "CDSCount"("sample_name");
CREATE INDEX "TSSCount.fk_TSSCount_TSS1" ON "TSSCount"("TSS_group_id");
CREATE INDEX "TSSCount.fk_TSSCount_samples1" ON "TSSCount"("sample_name");
CREATE INDEX "replicates.fk_replicates_samples1" ON "replicates"("sample_name");
CREATE INDEX "geneReplicateData.fk_geneReplicateData_genes1" ON "geneReplicateData"("gene_id");
CREATE INDEX "geneReplicateData.fk_geneReplicateData_replicates1" ON "geneReplicateData"("rep_name");
CREATE INDEX "geneReplicateData.fk_geneReplicateData_samples1" ON "geneReplicateData"("sample_name");
CREATE INDEX "CDSReplicateData.fk_CDSReplicateData_replicates1" ON "CDSReplicateData"("rep_name");
CREATE INDEX "CDSReplicateData.fk_CDSReplicateData_CDS1" ON "CDSReplicateData"("CDS_id");
CREATE INDEX "CDSReplicateData.fk_CDSReplicateData_samples1" ON "CDSReplicateData"("sample_name");
CREATE INDEX "TSSReplicateData.fk_TSSReplicateData_replicates1" ON "TSSReplicateData"("rep_name");
CREATE INDEX "TSSReplicateData.fk_TSSReplicateData_TSS1" ON "TSSReplicateData"("TSS_group_id");
CREATE INDEX "TSSReplicateData.fk_TSSReplicateData_samples1" ON "TSSReplicateData"("sample_name");
CREATE INDEX "geneData.fk_geneData_genes1" ON "geneData"("gene_id");
CREATE INDEX "geneData.fk_geneData_samples1" ON "geneData"("sample_name");
CREATE INDEX "phenoData.fk_phenoData_samples" ON "phenoData"("sample_name");
CREATE INDEX "geneExpDiffData.fk_geneExpDiffData_genes1" ON "geneExpDiffData"("gene_id");
CREATE INDEX "geneExpDiffData.fk_geneExpDiffData_samples1" ON "geneExpDiffData"("sample_1");
CREATE INDEX "geneExpDiffData.fk_geneExpDiffData_samples2" ON "geneExpDiffData"("sample_2");
CREATE INDEX "geneExpDiffData.geneExpDiff_status_index" ON "geneExpDiffData"("status");
CREATE INDEX "geneExpDiffData.geneExpDiff_sig_index" ON "geneExpDiffData"("significant","p_value","q_value","test_stat");
CREATE INDEX "isoforms.fk_isoforms_TSS1" ON "isoforms"("TSS_group_id");
CREATE INDEX "isoforms.fk_isoforms_CDS1" ON "isoforms"("CDS_id");
CREATE INDEX "isoforms.fk_isoforms_genes1" ON "isoforms"("gene_id");
CREATE INDEX "isoformData.fk_isoformData_samples1" ON "isoformData"("sample_name");
CREATE INDEX "isoformData.fk_isoformData_isoforms1" ON "isoformData"("isoform_id");
CREATE INDEX "isoformExpDiffData.fk_isoformExpDiffData_isoforms1" ON "isoformExpDiffData"("isoform_id");
CREATE INDEX "isoformExpDiffData.fk_isoformExpDiffData_samples1" ON "isoformExpDiffData"("sample_1");
CREATE INDEX "isoformExpDiffData.fk_isoformExpDiffData_samples2" ON "isoformExpDiffData"("sample_2");
CREATE INDEX "isoformExpDiffData.isoformExpDiffData_sig_index" ON "isoformExpDiffData"("test_stat","p_value","q_value","significant");
CREATE INDEX "isoformFeatures.fk_isoformFeatures_isoforms1" ON "isoformFeatures"("isoform_id");
CREATE INDEX "attributes.fk_attributes_feature_id" ON "attributes"("feature_id");
CREATE INDEX "attributes.attributes_attribute_index" ON "attributes"("attribute");
CREATE INDEX "attributes.attributes_value_index" ON "attributes"("value");
CREATE INDEX "isoformCount.fk_isoformCount_isoforms1" ON "isoformCount"("isoform_id");
CREATE INDEX "isoformCount.fk_isoformCount_samples1" ON "isoformCount"("sample_name");
CREATE INDEX "isoformReplicateData.fk_isoformReplicateData_replicates1" ON "isoformReplicateData"("rep_name");
CREATE INDEX "isoformReplicateData.fk_isoformReplicateData_isoforms1" ON "isoformReplicateData"("isoform_id");
CREATE INDEX "isoformReplicateData.fk_isoformReplicateData_samples1" ON "isoformReplicateData"("sample_name");
CREATE INDEX "features.features_seqnames_index" ON "features"("seqnames");
CREATE INDEX "features.features_type_index" ON "features"("type");
CREATE INDEX "features.features_strand_index" ON "features"("strand");
CREATE INDEX "features.features_start_end_index" ON "features"("start","end");
CREATE INDEX "features.fk_features_genes1" ON "features"("gene_id");
CREATE INDEX "features.fk_features_isoforms1" ON "features"("isoform_id");
CREATE INDEX "varModel.varModel_condition1" ON "varModel"("condition");
CREATE INDEX "varModel.varModel_locus1" ON "varModel"("locus");
CREATE INDEX "splicingDiffData.fk_splicingDiffData_allSamples" ON "splicingDiffData"("sample_1","sample_2");
CREATE INDEX "TSSExpDiffData.fk_TSSExpDiffData_allSamples" ON "TSSExpDiffData"("sample_1","sample_2");
CREATE INDEX "CDSDiffData.fk_CDSDiffData_allSamples" ON "CDSDiffData"("sample_1","sample_2");
CREATE INDEX "CDSExpDiffData.fk_CDSExpDiffData_allSamples" ON "CDSExpDiffData"("sample_1","sample_2");
CREATE INDEX "promoterDiffData.fk_promoterDiffData_allSamples" ON "promoterDiffData"("sample_1","sample_2");
CREATE INDEX "isoformExpDiffData.fk_isoformExpDiffData_allSamples" ON "isoformExpDiffData"("sample_1","sample_2");
CREATE INDEX "geneExpDiffData.fk_geneExpDiffData_allSamples" ON "geneExpDiffData"("sample_1","sample_2");
'
create.sql <- strsplit(index.text,"\n")[[1]]
tmp <- sapply(create.sql,function(x){
if (verbose){
write(paste(x,sep=""),stderr())
}
dbGetQuery(db,x)
})
}
getSamples<-function(fpkmDF){
sample_name<-unique(fpkmDF$sample)
#sample_name<-as.data.frame(sample_name)
}
getSamplesFromColnames<-function(fpkmDF){
samples<-gsub("_FPKM$","",colnames(fpkmDF)[grepl("_FPKM$",colnames(fpkmDF))])
}
populateSampleTable<-function(samples,dbConn){
samples<-make.db.names(dbConn,samples,unique=FALSE)
samples<-data.frame(sample_index=c(1:length(samples)),sample_name=samples)
#print(samples)
dbWriteTable(dbConn,'samples',samples,row.names=F,append=T)
}
bulk_insert <- function(dbConn,sql,bound.data)
{
dbBegin(dbConn)
dbGetPreparedQuery(dbConn, sql, bind.data = bound.data)
dbCommit(dbConn)
}
#############
#readCufflinks
#############
#TODO: Add count and replicate files
readCufflinks<-function(dir = getwd(),
dbFile="cuffData.db",
gtfFile=NULL,
runInfoFile="run.info",
repTableFile="read_groups.info",
geneFPKM="genes.fpkm_tracking",
geneDiff="gene_exp.diff",
geneCount="genes.count_tracking",
geneRep="genes.read_group_tracking",
isoformFPKM="isoforms.fpkm_tracking",
isoformDiff="isoform_exp.diff",
isoformCount="isoforms.count_tracking",
isoformRep="isoforms.read_group_tracking",
TSSFPKM="tss_groups.fpkm_tracking",
TSSDiff="tss_group_exp.diff",
TSSCount="tss_groups.count_tracking",
TSSRep="tss_groups.read_group_tracking",
CDSFPKM="cds.fpkm_tracking",
CDSExpDiff="cds_exp.diff",
CDSCount="cds.count_tracking",
CDSRep="cds.read_group_tracking",
CDSDiff="cds.diff",
promoterFile="promoters.diff",
splicingFile="splicing.diff",
varModelFile="var_model.info",
driver = "SQLite",
genome = NULL,
rebuild = FALSE,
verbose = FALSE,
...){
#Set file locations with directory
dbFile=file.path(dir,dbFile)
runInfoFile=file.path(dir,runInfoFile)
repTableFile=file.path(dir,repTableFile)
geneFPKM=file.path(dir,geneFPKM)
geneDiff=file.path(dir,geneDiff)
geneCount=file.path(dir,geneCount)
geneRep=file.path(dir,geneRep)
isoformFPKM=file.path(dir,isoformFPKM)
isoformDiff=file.path(dir,isoformDiff)
isoformCount=file.path(dir,isoformCount)
isoformRep=file.path(dir,isoformRep)
TSSFPKM=file.path(dir,TSSFPKM)
TSSDiff=file.path(dir,TSSDiff)
TSSCount=file.path(dir,TSSCount)
TSSRep=file.path(dir,TSSRep)
CDSFPKM=file.path(dir,CDSFPKM)
CDSExpDiff=file.path(dir,CDSExpDiff)
CDSCount=file.path(dir,CDSCount)
CDSRep=file.path(dir,CDSRep)
CDSDiff=file.path(dir,CDSDiff)
promoterFile=file.path(dir,promoterFile)
splicingFile=file.path(dir,splicingFile)
varModelFile=file.path(dir,varModelFile)
#Check to see whether dbFile exists
if (!file.exists(dbFile) || rebuild == TRUE){
#if not, create it
write(paste("Creating database ",dbFile,sep=""),stderr())
dbConn<-createDB_noIndex(dbFile)
#populate DB
if(file.exists(runInfoFile)){
loadRunInfo(runInfoFile,dbConn)
}
if(file.exists(repTableFile)){
loadRepTable(repTableFile,dbConn)
}
if(file.exists(varModelFile)){
loadVarModelTable(varModelFile,dbConn)
}
if(!is.null(gtfFile)){
if(!is.null(genome)){
.loadGTF(gtfFile,genome,dbConn)
}else{
stop("'genome' cannot be NULL if you are supplying a .gtf file!")
}
}
loadGenes(geneFPKM,geneDiff,promoterFile,countFile=geneCount,replicateFile=geneRep,dbConn)
loadIsoforms(isoformFPKM,isoformDiff,isoformCount,isoformRep,dbConn)
loadTSS(TSSFPKM,TSSDiff,splicingFile,TSSCount,TSSRep,dbConn)
loadCDS(CDSFPKM,CDSExpDiff,CDSDiff,CDSCount,CDSRep,dbConn)
#Create Indexes on DB
write("Indexing Tables...",stderr())
createIndices(dbFile,verbose=verbose)
#load Distribution Tests
#loadDistTests(promoterFile,splicingFile,CDSDiff)
}
dbConn<-dbConnect(dbDriver(driver),dbFile)
return (
new("CuffSet",DB = dbConn,
#TODO: need to add replicate and count tables here and in AllClasses.R
genes = new("CuffData",DB = dbConn, tables = list(mainTable = "genes",dataTable = "geneData",expDiffTable = "geneExpDiffData",featureTable = "geneFeatures",countTable="geneCount",replicateTable="geneReplicateData"), filters = list(),type = "genes",idField = "gene_id"),
isoforms = new("CuffData", DB = dbConn, tables = list(mainTable = "isoforms",dataTable = "isoformData",expDiffTable = "isoformExpDiffData",featureTable = "isoformFeatures",countTable="isoformCount",replicateTable="isoformReplicateData"), filters = list(),type="isoforms",idField = "isoform_id"),
TSS = new("CuffData", DB = dbConn, tables = list(mainTable = "TSS",dataTable = "TSSData",expDiffTable = "TSSExpDiffData",featureTable = "TSSFeatures",countTable="TSSCount",replicateTable="TSSReplicateData"), filters = list(),type = "TSS",idField = "TSS_group_id"),
CDS = new("CuffData", DB = dbConn, tables = list(mainTable = "CDS",dataTable = "CDSData",expDiffTable = "CDSExpDiffData",featureTable = "CDSFeatures",countTable="CDSCount",replicateTable="CDSReplicateData"), filters = list(),type = "CDS",idField = "CDS_id"),
promoters = new("CuffDist", DB = dbConn, table = "promoterDiffData",type="promoter",idField="gene_id"),
splicing = new("CuffDist", DB = dbConn, table = "splicingDiffData",type="splicing",idField="TSS_group_id"),
relCDS = new("CuffDist", DB = dbConn, table = "CDSDiffData",type="relCDS",idField="gene_id")
)
)
}
############
# Handle GTF file
############
#loadGTF<-function(gtfFile,dbConn) {
#
# #Error Trapping
# if (missing(gtfFile))
# stop("GTF file cannot be missing!")
#
# if (missing(dbConn))
# stop("Must provide a dbConn connection")
#
# write("Reading GTF file")
# gtf<-read.table(gtfFile,sep="\t",header=F)
#
# write("Melting attributes")
# attributes<-melt(strsplit(as.character(gtf$V9),"; "))
# colnames(attributes)<-c("attribute","featureID")
# attributes<-paste(attributes$attribute,attributes$featureID)
# attributes<-strsplit(as.character(attributes)," ")
# attributes<-as.data.frame(do.call("rbind",attributes))
#
# colnames(attributes)<-c("attribute","value","featureID")
# attributes<-attributes[,c(3,1,2)]
#
# #Grab only gene_ID and transcript_ID to add to features table
# id.attributes<-attributes[attributes$attribute %in% c("gene_id","transcript_id"),]
# id.attributes$featureID<-as.numeric(as.character(id.attributes$featureID))
# id.attributes<-dcast(id.attributes,...~attribute)
#
# #Main features table
# features<-gtf[,c(1:8)]
# colnames(features)<-c("seqname","source","type","start","end","score","strand","frame")
# features$featureID<-as.numeric(as.character(rownames(features)))
#
# #Merge features and id.attributes
# features<-merge(features,id.attributes,by.x='featureID',by.y='featureID')
# features<-features[,c(1,10:11,2:9)]
#
# #strip gene_id and transcript_id from attributes
# attributes<-attributes[!(attributes$attribute %in% c("gene_id","transcript_id")),]
#
# #Write features table
# write("Writing features table",stderr())
# #dbWriteTable(dbConn,'geneData',as.data.frame(genemelt[,c(1:2,5,3,4,6)]),row.names=F,append=T)
# dbWriteTable(dbConn,'features',as.data.frame(features),append=T)
#
# #Write features attribtues table
# #write("Writing feature attributes table",stderr())
# dbWriteTable(dbConn,'attributes',as.data.frame(attributes),append=T)
#
#}
.loadGTF<-function(gtfFile,genomebuild,dbConn){
#Error Trapping
if (missing(gtfFile))
stop("GTF file cannot be missing!")
if (missing(dbConn))
stop("Must provide a dbConn connection")
write("Reading GTF file",stderr())
gr<-import(gtfFile)
gr<-as(gr,"data.frame")
#gr$genome<-genomebuild
colnames(gr)[grepl('^transcript_id$',colnames(gr))]<-'isoform_id'
colnames(gr)[grepl('^tss_id$',colnames(gr))]<-'TSS_group_id'
colnames(gr)[grepl('^p_id$',colnames(gr))]<-'CDS_id'
write("Writing GTF features to 'features' table...",stderr())
#dbSendQuery(dbConn,"DROP TABLE IF EXISTS 'features'")
#dbBegin(dbConn)
dbWriteTable(dbConn,'features',gr,row.names=F,overwrite=T)
#record Genome build
.recordGenome(genomebuild,dbConn)
#dbCommit(dbConn)
return()
}
.recordGenome<-function(genome,dbConn){
genomeInsertQuery<-paste("INSERT INTO runInfo VALUES('genome', '",genome,"')",sep="")
#print(genomeInsertQuery)
dbSendQuery(dbConn,genomeInsertQuery)
}
.cuff2db <- function(gtfFile, out.file = NULL, verbose = TRUE) {
#require(rtracklayer)
#require(GenomicRanges)
#require(GenomicFeatures)
requiredAttribs <- c("gene_id", "transcript_id", "exon_number")
if (verbose) message("Importing ", gtfFile)
tmp <- import(gtfFile)
#dispose of unspliced unstranded transcripts
#tmp <- tmp[ which(strand(tmp) %in% c('+','-')) ]
# fix the gene IDs
#values(tmp)$gene_id <- gsub('CUFF.', '', values(tmp)$gene_id)
# fix the exon IDs
#values(tmp)$transcript_id <- gsub('CUFF.', '', values(tmp)$transcript_id)
# split the object into transcript and exon pieces
#by.type = split(tmp, values(tmp)$type)
#browser()
#make transcripts table
tmpT <- split(tmp,
values(tmp)$transcript_id)
if(verbose) message('Attempting to create the transcripts data.frame')
transcripts <- data.frame(
tx_id=1:length(tmpT),
tx_name=names(tmpT),
tx_chrom=as.character(seqnames(unlist(tmpT))[start(tmpT@partitioning)]),
tx_strand=as.character(strand(unlist(tmpT))[start(tmpT@partitioning)]),
tx_start=sapply(start(ranges(tmpT)), min),
tx_end=sapply(end(ranges(tmpT)), max),
stringsAsFactors=FALSE
)
#make splicings table
tmpS <- split(tmp, values(tmp)$transcript_id)
if(verbose) message('Attempting to create the splicings data.frame')
splicings <- data.frame(
tx_id=rep(1:length(tmpS), elementNROWS(tmpS)),
exon_rank=as.integer(values(unlist(tmpS))$exon_number),
exon_chrom=as.character(seqnames(unlist(tmpS))),
exon_strand=as.character(strand(unlist(tmpS))),
exon_start=start(unlist(tmpS)),
exon_end=end(unlist(tmpS)),
stringsAsFactors=FALSE
)
#make genes table
if(verbose) message('Attempting to create the genes data.frame')
gene_txs <- tapply(values(tmp)$transcript_id, values(tmp)$gene_id, unique)
genes <- data.frame(
tx_name=unlist(gene_txs),
gene_id=rep(names(gene_txs), sapply(gene_txs, length)),
stringsAsFactors=FALSE)
#create the db
if (verbose) message("Creating TranscriptDb")
tmpdb <- makeTxDb(transcripts, splicings, genes=genes)
if (verbose) message("Use saveFeatures() to save the database to a file")
return(tmpdb)
}
#library(Gviz)
#myGeneId<-'XLOC_000071'
#geneQuery<-paste("SELECT start,end,source AS feature,gene_id as gene,exon_number AS exon,transcript_id as transcript,gene_name as symbol, exon_number as rank, strand FROM features WHERE gene_id ='",myGeneId,"'",sep="")
#geneFeatures<-dbGetQuery(cuff@DB,geneQuery)
#geneFeatures$symbol[is.na(geneFeatures$symbol)]<-"NA"
#grtrack<-GeneRegionTrack(geneFeatures,genome="hg19",chromosome="chr1",name="CuffDiff",showId=T,stacking="pack")
#biomTrack<-BiomartGeneRegionTrack(genome="hg19",chromosome="chr1",start=min(start(grtrack)),end=max(end(grtrack)),name="ENSEMBL",showId=T,stacking="pack")
#ideoTrack <- IdeogramTrack(genome = "hg19", chromosome = "chr1")
#axTrack <- GenomeAxisTrack()
#conservation <- UcscTrack(genome = "hg19", chromosome = "chr1",
# track = "Conservation", table = "phyloP46wayAll",
# from = min(start(grtrack)), to = max(end(grtrack)), trackType = "DataTrack",
# start = "start", end = "end", data = "score",
# type = "hist", window = "auto", col.histogram = "darkblue",
# fill.histogram = "darkblue", ylim = c(-3.7, 4),
# name = "Conservation")
#
#
#plotTracks(list(ideoTrack,axTrack,grtrack,biomTrack,conservation),from=min(start(grtrack))-1000,to=max(end(grtrack))+1000)
#plotTracks(list(axTrack,grtrack),from=min(start(grtrack))-1000,to=max(end(grtrack))+1000)
#######
#Unit Test
#######
#dbConn<-createDB()
#date()
#loadGenes("genes.fpkm_tracking","gene_exp.diff",dbConn)
#loadIsoforms("isoforms.fpkm_tracking","isoform_exp.diff",dbConn)
#loadTSS("tss_groups.fpkm_tracking","tss_group_exp.diff",dbConn)
#loadCDS("cds.fpkm_tracking","cds_exp.diff",dbConn)
#date()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.