#methods-CuffData.R
#
#Author: Loyal A. Goff
#
#
####################
##################
#Initialize
##################
setMethod("initialize","CuffData",
function(.Object,
DB,
tables=list(mainTable = "",dataTable = "",expDiffTable = "",featureTable = "",countTable = "", replicateTable = ""),
filters=list(),
type = c("genes","isoforms","TSS","CDS"),
idField,
... ){
.Object<-callNextMethod(.Object,
DB = DB,
tables = tables,
filters = filters,
type = type,
idField = idField,
...)
}
)
setValidity("CuffData",function(object){
TRUE
}
)
################
#Class Methods
################
setMethod("show","CuffData",
function(object){
size<-dim(object)
cat(class(object), "instance with:\n\t",size[1],"features and",size[2],"samples\n")
}
)
setMethod("dim","CuffData",
function(x){
countQuery<-paste("SELECT COUNT(",x@idField,") as n FROM ",x@tables$mainTable)
nIds<-dbGetQuery(x@DB,countQuery)
sampleQuery<-("SELECT COUNT(sample_name) as n FROM samples")
nSamples<-dbGetQuery(x@DB,sampleQuery)
c(nIds$n,nSamples$n)
}
)
#####################
#Feature Table
#####################
.addFeatures<-function(object,features,...){
if(!is.data.frame(features)){
stop("features must be a data.frame")
}
colnames(features)[1]<-object@idField
colnames(features)<-dbQuoteIdentifier(object@DB,colnames(features),unique=T)
dbWriteTable(object@DB,object@tables$featureTable,features,row.names=F,overwrite=T)
indexQuery<-paste("CREATE INDEX ",object@idField," ON ", object@tables$featureTable," (",object@idField,")",sep="")
res<-dbGetQuery(object@DB,indexQuery)
}
setMethod("addFeatures",signature="CuffData",.addFeatures)
###################
#Accessors
###################
.annotation<-function(object){
featureQuery<-paste("SELECT * FROM ",object@tables$mainTable," x LEFT JOIN features xf USING (",object@idField,")",sep="")
dbGetQuery(object@DB, featureQuery)
}
setMethod(BiocGenerics::annotation,signature(object="CuffData"),.annotation)
.featureNames<-function(object){
featureQuery<-paste("SELECT ",object@idField," FROM ",object@tables$mainTable, sep="")
res<-dbGetQuery(object@DB,featureQuery)
res[,1]
}
setMethod("featureNames","CuffData",.featureNames)
.samples<-function(object){
res<-dbReadTable(object@DB,'samples')
res<-res$sample_name
res
}
setMethod("samples","CuffData",.samples)
.replicates<-function(object){
res<-dbReadTable(object@DB,'replicates')
res<-res$rep_name
res
}
setMethod("replicates","CuffData",.replicates)
.fpkm<-function(object,features=FALSE,sampleIdList){
#Sample subsetting
if(!missing(sampleIdList)){
if(.checkSamples(object@DB,sampleIdList)){
myLevels<-sampleIdList
}else{
stop("Sample does not exist!")
}
}else{
myLevels<-getLevels(object)
}
#Sample Search String (SQL)
sampleString<-'('
for (i in myLevels){
sampleString<-paste(sampleString,"'",i,"',",sep="")
}
sampleString<-substr(sampleString,1,nchar(sampleString)-1)
sampleString<-paste(sampleString,")",sep="")
if(!features){
FPKMQuery<-paste("SELECT * FROM ",object@tables$dataTable," WHERE sample_name IN ",sampleString,sep="")
}else{
FPKMQuery<-paste("SELECT xf.*,xm.*,x.sample_name,x.fpkm,x.conf_hi,x.conf_lo FROM ",object@tables$dataTable," x LEFT JOIN features xf ON x.",object@idField,"=xf.",object@idField," LEFT JOIN ",object@tables$mainTable," xm ON x.",object@idField,"=xm.",object@idField," WHERE x.sample_name IN ",sampleString,sep="")
#print(FPKMQuery)
}
res<-dbGetQuery(object@DB,FPKMQuery)
res$sample_name<-factor(res$sample_name,levels=getLevels(object))
res$stdev<-(res$conf_hi-res$fpkm)/2
res
}
setMethod("fpkm","CuffData",.fpkm)
.repFpkm<-function(object,features=FALSE,repIdList){
#Sample subsetting
if(!missing(repIdList)){
if(.checkReps(object@DB,repIdList)){
myLevels<-repIdList
}else{
stop("Replicate does not exist!")
}
}else{
myLevels<-getRepLevels(object)
}
#Sample Search String (SQL)
sampleString<-'('
for (i in myLevels){
sampleString<-paste(sampleString,"'",i,"',",sep="")
}
sampleString<-substr(sampleString,1,nchar(sampleString)-1)
sampleString<-paste(sampleString,")",sep="")
if(!features){
FPKMQuery<-paste("SELECT * FROM ",object@tables$replicateTable," WHERE rep_name IN ",sampleString,sep="")
}else{
FPKMQuery<-paste("SELECT xf.*,xm.*,x.rep_name,x.raw_frags,x.internal_scaled_frags,x.external_scaled_frags,x.fpkm,x.effective_length,x.status FROM ",object@tables$replicateTable," x LEFT JOIN features xf on x.",object@idField,"=xf.",object@idField," LEFT JOIN ",object@tables$mainTable," xm ON x.",object@idField,"=xm.",object@idField," WHERE x.rep_name IN ",sampleString,sep="")
}
#print(FPKMQuery)
res<-dbGetQuery(object@DB,FPKMQuery)
res$rep_name<-factor(res$rep_name,levels=getRepLevels(object))
#res$stdev<-(res$conf_hi-res$fpkm)/2 #Not really available yet since conf_hi and conf_lo are not
res
}
setMethod("repFpkm","CuffData",.repFpkm)
.count<-function(object,sampleIdList){
#Sample subsetting
if(!missing(sampleIdList)){
if(.checkSamples(object@DB,sampleIdList)){
myLevels<-sampleIdList
}else{
stop("Sample does not exist!")
}
}else{
myLevels<-getLevels(object)
}
#Sample Search String (SQL)
sampleString<-'('
for (i in myLevels){
sampleString<-paste(sampleString,"'",i,"',",sep="")
}
sampleString<-substr(sampleString,1,nchar(sampleString)-1)
sampleString<-paste(sampleString,")",sep="")
CountQuery<-FPKMQuery<-paste("SELECT * FROM ",object@tables$countTable," WHERE sample_name IN ",sampleString,sep="")
res<-dbGetQuery(object@DB,CountQuery)
res$sample_name<-factor(res$sample_name,levels=getLevels(object))
res
}
setMethod("count","CuffData",.count)
.fpkmMatrix<-function(object,fullnames=FALSE,sampleIdList){
#TODO: fix fullnames for CuffData::fpkmMatrix
#Sample subsetting
if(!missing(sampleIdList)){
if(.checkSamples(object@DB,sampleIdList)){
myLevels<-sampleIdList
}else{
stop("Sample does not exist!")
}
}else{
myLevels<-getLevels(object)
}
samp<-samples(object)
FPKMMatQuery<-paste("select x.",object@idField,", x.gene_short_name, ",sep="")
for (i in samp){
FPKMMatQuery<-paste(FPKMMatQuery,"sum(case when xd.sample_name ='",i,"' then fpkm end) as ",i,",",sep="")
}
FPKMMatQuery<-substr(FPKMMatQuery, 1, nchar(FPKMMatQuery)-1)
FPKMMatQuery<-paste(FPKMMatQuery," from ",object@tables$mainTable," x LEFT JOIN ",object@tables$dataTable," xd on x.",object@idField," = xd.",object@idField," group by x.",object@idField,sep="")
res<-dbGetQuery(object@DB,FPKMMatQuery)
if(fullnames){
res<-data.frame(res[,-c(1,2)],row.names=paste(res[,2],res[,1],sep="|"))
}else{
res<-data.frame(res[,-c(1,2)],row.names=res[,1])
}
if(!missing(sampleIdList)){
res<-data.frame(res[,sampleIdList],row.names=rownames(res))
colnames(res)<-sampleIdList
}
res
}
setMethod("fpkmMatrix","CuffData",.fpkmMatrix)
.repFpkmMatrix<-function(object,fullnames=FALSE,repIdList){
#Sample subsetting
if(!missing(repIdList)){
if(.checkReps(object@DB,repIdList)){
myLevels<-repIdList
}else{
stop("Replicate does not exist!")
}
}else{
myLevels<-getRepLevels(object)
}
samp<-replicates(object)
FPKMMatQuery<-paste("select x.",object@idField,", x.gene_short_name, ",sep="")
for (i in samp){
FPKMMatQuery<-paste(FPKMMatQuery,"sum(case when xd.rep_name ='",i,"' then fpkm end) as ",i,",",sep="")
}
FPKMMatQuery<-substr(FPKMMatQuery, 1, nchar(FPKMMatQuery)-1)
FPKMMatQuery<-paste(FPKMMatQuery," from ",object@tables$mainTable," x LEFT JOIN ",object@tables$replicateTable," xd on x.",object@idField," = xd.",object@idField," group by x.",object@idField,sep="")
res<-dbGetQuery(object@DB,FPKMMatQuery)
if(fullnames){
res<-data.frame(res[,-c(1,2)],row.names=paste(res[,2],res[,1],sep="|"))
}else{
res<-data.frame(res[,-c(1,2)],row.names=res[,1])
}
if(!missing(repIdList)){
res<-data.frame(res[,repIdList],row.names=rownames(res))
colnames(res)<-repIdList
}
res
}
setMethod("repFpkmMatrix","CuffData",.repFpkmMatrix)
.countMatrix<-function(object,fullnames=FALSE,sampleIdList){
#Sample subsetting
if(!missing(sampleIdList)){
if(.checkSamples(object@DB,sampleIdList)){
myLevels<-sampleIdList
}else{
stop("Sample does not exist!")
}
}else{
myLevels<-getLevels(object)
}
samp<-samples(object)
CountMatQuery<-paste("select x.",object@idField,", x.gene_short_name, ",sep="")
for (i in samp){
CountMatQuery<-paste(CountMatQuery,"sum(case when xd.sample_name ='",i,"' then count end) as ",i,",",sep="")
}
CountMatQuery<-substr(CountMatQuery, 1, nchar(CountMatQuery)-1)
CountMatQuery<-paste(CountMatQuery," from ",object@tables$mainTable," x LEFT JOIN ",object@tables$countTable," xd on x.",object@idField," = xd.",object@idField," group by x.",object@idField,sep="")
res<-dbGetQuery(object@DB,CountMatQuery)
if(fullnames){
res<-data.frame(res[,-c(1,2)],row.names=paste(res[,2],res[,1],sep="|"))
}else{
res<-data.frame(res[,-c(1,2)],row.names=res[,1])
}
if(!missing(sampleIdList)){
res<-data.frame(res[,sampleIdList],row.names=rownames(res))
colnames(res)<-sampleIdList
}
res
}
setMethod("countMatrix","CuffData",.countMatrix)
.repCountMatrix<-function(object,fullnames=FALSE,repIdList){
#Sample subsetting
if(!missing(repIdList)){
if(.checkReps(object@DB,repIdList)){
myLevels<-repIdList
}else{
stop("Replicate does not exist!")
}
}else{
myLevels<-getRepLevels(object)
}
reps<-replicates(object)
repCountMatQuery<-paste("select x.",object@idField,", x.gene_short_name, ",sep="")
for (i in reps){
repCountMatQuery<-paste(repCountMatQuery,"sum(case when xr.rep_name ='",i,"' then external_scaled_frags end) as ",i,",",sep="")
}
repCountMatQuery<-substr(repCountMatQuery, 1, nchar(repCountMatQuery)-1)
repCountMatQuery<-paste(repCountMatQuery," from ",object@tables$mainTable," x LEFT JOIN ",object@tables$replicateTable," xr on x.",object@idField," = xr.",object@idField," group by x.",object@idField,sep="")
res<-dbGetQuery(object@DB,repCountMatQuery)
if(fullnames){
res<-data.frame(res[,-c(1,2)],row.names=paste(res[,2],res[,1],sep="|"))
}else{
res<-data.frame(res[,-c(1,2)],row.names=res[,1])
}
if(!missing(repIdList)){
res<-data.frame(res[,repIdList],row.names=rownames(res))
colnames(res)<-repIdList
}
res
}
setMethod("repCountMatrix","CuffData",.repCountMatrix)
.statsMatrix<-function(object){
statsQuery<-paste("SELECT xd.*, xc.count, xc.variance as count_variance , xc.uncertainty as count_uncertainty, xc.dispersion as count_dispersion, (xd.conf_hi-xd.fpkm)/2 as fpkm_stdev,((xd.conf_hi-xd.fpkm)/2)/xd.fpkm AS 'CV' FROM ",object@tables$dataTable," xd LEFT JOIN ",object@tables$countTable," xc ON xd.",object@idField,"=xc.",object@idField," AND xd.sample_name=xc.sample_name",sep="")
res<-dbGetQuery(object@DB,statsQuery)
res$sample_name<-factor(res$sample_name,levels=samples(object))
res
}
#This needs a lot of work...
#TODO: Change this to remove lnFcCutoff but make sure that functions that rely on diffData have their own FC cutoff so that plotting doesn't suffer
.diffData<-function(object,x,y,features=FALSE){
if(missing(x) && missing(y)){
if(!features){
diffQuery<-paste("SELECT * FROM ",object@tables$expDiffTable,sep="")
}else{
diffQuery<-paste("SELECT xm.*, xed.*, xf.* FROM ",object@tables$mainTable," xm LEFT JOIN ",object@tables$expDiffTable," xed ON xm.",object@idField,"=xed.",object@idField," LEFT JOIN features xf ON xm.",object@idField,"=xf.",object@idField,sep="")
}
}else if (missing(x) || missing(y)){
stop("You must supply both x and y or neither.")
}else{
if(!features){
diffQuery<-paste("SELECT x.",object@idField,", xed.* FROM ",object@tables$mainTable," x LEFT JOIN ",object@tables$expDiffTable," xed on x.",object@idField," = xed.",object@idField," WHERE ((sample_1 = '",x,"' AND sample_2 = '",y,"') OR (sample_1 = '",y,"' AND sample_2 = '",x,"'))",sep="")
}else{
diffQuery<-paste("SELECT xm.*, xed.*, xf.* FROM ",object@tables$mainTable," xm LEFT JOIN ",object@tables$expDiffTable," xed on xm.",object@idField," = xed.",object@idField," LEFT JOIN features xf ON xm.",object@idField,"=xf.",object@idField," WHERE ((sample_1 = '",x,"' AND sample_2 = '",y,"') OR (sample_1 = '",y,"' AND sample_2 = '",x,"'))",sep="")
}
}
dat<-dbGetQuery(object@DB,diffQuery)
#diffQuery
dat
}
setMethod("diffData",signature(object="CuffData"),.diffData)
.diffTable<-function(object,logCutoffValue=99999){
measureVars<-c('status','value_1','value_2','log2_fold_change','test_stat','p_value','q_value','significant')
all.diff<-diffData(object,features=TRUE)
all.diff$log2_fold_change[all.diff$log2_fold_change>=logCutoffValue]<-Inf
all.diff$log2_fold_change[all.diff$log2_fold_change<=-logCutoffValue]<--Inf
all.diff.melt<-melt(all.diff,measure.vars=measureVars)
#all.diff.melt<-all.diff.melt[!grepl("^value_",all.diff.melt$variable),]
all.diff.cast<-dcast(all.diff.melt,formula=...~sample_2+sample_1+variable)
all.diff.cast
}
setMethod("diffTable",signature(object="CuffData"),.diffTable)
.QCTable<-function(object){
qcSQL<-paste("SELECT d.*, (d.conf_hi-d.fpkm)/2 as stdev, c.count, c.variance, c.uncertainty, c.dispersion, (c.variance/d.fpkm) AS 'IOD', ((d.conf_hi-d.fpkm)/2)/d.fpkm AS 'CV' FROM ",object@tables$dataTable," d LEFT JOIN ",object@tables$countTable," c ON d.gene_id=c.gene_id AND d.sample_name=c.sample_name",sep="")
res<-dbGetQuery(object@DB,qcSQL)
res
}
.getMA<-function(object,x,y,logMode=T,pseudocount=1){
if (missing(x) || missing(y)){
stop("You must supply both x and y.")
}else{
sql<-paste("SELECT x.",object@idField,", sum(case when x.sample_name = '",x,"' then x.fpkm end) AS 'x', sum(case when x.sample_name = '",y,"' then x.fpkm end) AS 'y' FROM ",object@tables$dataTable," x GROUP BY x.",object@idField,";",sep="")
#print(sql)
dat<-dbGetQuery(object@DB,sql)
if(logMode){
dat$x<-log10(dat$x+pseudocount)
dat$y<-log10(dat$y+pseudocount)
}
dat$A<-(dat$x+dat$y)/2
dat$M<-dat$x/dat$y
res<-dat[,c(1,4:5)]
res
}
}
.getCountMA<-function(object,x,y,logMode=T,pseudocount=1){
if (missing(x) || missing(y)){
stop("You must supply both x and y.")
}else{
sql<-paste("SELECT x.",object@idField,", sum(case when x.sample_name = '",x,"' then x.count end) AS 'x', sum(case when x.sample_name = '",y,"' then x.count end) AS 'y' FROM ",object@tables$countTable," x GROUP BY x.",object@idField,";",sep="")
dat<-dbGetQuery(object@DB,sql)
if(logMode){
dat$x<-log10(dat$x+pseudocount)
dat$y<-log10(dat$y+pseudocount)
}
dat$A<-(dat$x+dat$y)/2
dat$M<-dat$x/dat$y
res<-dat[,c(1,4:5)]
res
}
}
#.getRankOrder<-function(object,x,y,logMode=TRUE,pseudocount=1,ratio=TRUE){
# if (missing(x) || missing(y)){
# stop("You must supply both x and y.")
# }else{
#
# }
#}
setMethod("DB","CuffData",function(object){
return(object@DB)
})
setMethod("tables","CuffData",function(object){
return(object@tables)
})
setMethod("filters","CuffData",function(object){
return(object@filters)
})
setMethod("type","CuffData",function(object){
return(object@type)
})
setMethod("idField","CuffData",function(object){
return(object@idField)
})
##################
#Setters
##################
##################
#Subsetting
##################
#Example query
#"SELECT * FROM genes WHERE gene_id in ('XLOC_000005','XLOC_000015','XLOC_000055','XLOC_000595','XLOC_005998','ucscCodingXLOC_018816')"
.getLevels<-function(object){
levelsQuery<-'SELECT s.sample_name FROM samples s ORDER BY s.sample_index ASC'
levels<-dbGetQuery(object@DB,levelsQuery)$sample_name
levels
}
setMethod("getLevels",signature(object="CuffData"),.getLevels)
.getRepLevels<-function(object){
levelsQuery<-'SELECT r.rep_name FROM replicates r JOIN samples s ON r.sample_name=s.sample_name ORDER BY s.sample_index ASC'
levels<-dbGetQuery(object@DB,levelsQuery)$rep_name
levels
}
setMethod("getRepLevels",signature(object="CuffData"),.getRepLevels)
.getRepConditionLevels<-function(object){
levelsQuery<-'SELECT r.sample_name FROM replicates r JOIN samples s ON r.sample_name=s.sample_name ORDER BY s.sample_index ASC'
levels<-dbGetQuery(object@DB,levelsQuery)$sample_name
levels
}
#Useful SQL commands
#SELECT g.gene_id, g.class_code, g.nearest_ref_id, g.gene_short_name, g.locus, g.length, g.coverage, g.status, gd.sample_name, gd.fpkm, gd.conf_hi, gd.conf_lo FROM genes g LEFT JOIN geneData gd ON g.gene_id = gd.gene_id WHERE (g.gene_id = 'XLOC_000001');
#SELECT g.gene_id, ged.* FROM genes g LEFT JOIN geneExpDiffData ged on g.gene_id = ged.gene_id WHERE ((sample_1 = 'H1_hESC' AND sample_2 = 'Fibroblasts') OR (sample_1 = 'Fibroblasts' AND sample_2 = 'H1_hESC')) AND ged.log2_fold_change>-20 AND ged.log2_fold_change<20 ;
#Pivot table SQL for scatterplots
#select g.*, sum(case when gd.sample_name = 'Fibroblasts' then fpkm end) as Fibroblasts, sum(case when gd.sample_name = 'H1_hESC' then fpkm end) as H1_hESC from genes g LEFT JOIN geneData gd on g.gene_id = gd.gene_id group by g.gene_id;
##################
#Plotting
##################
.density<-function(object, logMode = TRUE, pseudocount=0.0, labels, features=FALSE, replicates=FALSE,...){
if(is(object,'CuffData')) {
if(replicates){
dat<-repFpkm(object,features=features)
colnames(dat)[colnames(dat)=="rep_name"]<-"condition"
}else{
dat<-fpkm(object,features=features)
colnames(dat)[colnames(dat)=="sample_name"]<-"condition"
}
} else {
stop('Un-supported class of object.')
}
if(logMode) dat$fpkm<-dat$fpkm+pseudocount
p<-ggplot(dat)
if(logMode) {
p<-p+geom_density(aes(x= log10(fpkm),group=condition,color=condition,fill=condition),alpha=I(1/3))
}else{
p<-p+geom_density(aes(x=fpkm,group=condition,color=condition,fill=condition),alpha=I(1/3))
}
p<-p + labs(title=object@tables$mainTable)
#Default cummeRbund colorscheme
p<-p + scale_fill_hue(l=50,h.start=200) + scale_color_hue(l=50,h.start=200)
#TODO: Add label callout
p
}
setMethod("csDensity",signature(object="CuffData"),.density)
.scatter<-function(object,x,y,logMode=TRUE,pseudocount=1.0,labels,smooth=FALSE,colorByStatus=FALSE, drawRug=TRUE, ...){
dat<-fpkmMatrix(object,fullnames=TRUE)
samp<-samples(object)
#check to make sure x and y are in samples
if (!all(c(x,y) %in% samp)){
stop("One or more values of 'x' or 'y' are not valid sample names!")
}
#add pseudocount if necessary
if(logMode){
for (i in samp){
dat[[i]]<-dat[[i]]+pseudocount
}
}
#Attach tracking_id and gene_short_name
if(!missing(labels)){
require(stringr)
tracking<-str_split_fixed(rownames(dat),"\\|",2)
dat$gene_short_name<-tracking[,1]
dat$tracking_id<-tracking[,2]
labeled.dat<-dat[dat$gene_short_name %in% labels,]
}
#make plot object
p<-ggplot(dat)
p<- p + aes_string(x=x,y=y)
#Right now, this does nothing, because 'significant' is not returned from fpkmMatrix object so I don't have this as a feature to draw
if(colorByStatus){
p<- p + geom_point(size=1.2,alpha=I(1/3))
}else{
p<- p + geom_point(size=1.2,alpha=I(1/3))
}
#Add symmetry line
p<- p + geom_abline(intercept=0,slope=1,linetype=2)
#Add rug
if(drawRug){
p<- p + geom_rug(size=0.8,alpha=0.01)
}
#add smoother
if(smooth){
p <- p + stat_smooth(method="lm",fill="blue",alpha=0.2)
}
#Add highlights from labels
if(!missing(labels)){
p <- p + geom_point(data=labeled.dat,aes_string(x=x,y=y),size=1.3,color="red")
p <- p + geom_text(data=labeled.dat,aes_string(x=x,y=y,label='gene_short_name'),color="red",hjust=0,vjust=0,angle=0,size=4)
}
#logMode
if(logMode){
p <- p + scale_y_log10() + scale_x_log10()
}
#Add title & Return value
p<- p + labs(title=object@tables$mainTable)
p
}
setMethod("csScatter",signature(object="CuffData"), .scatter)
.scatter2<-function(object,x,y,logMode=TRUE,pseudocount=1.0,labels,smooth=FALSE,alpha=0.05,colorByStatus=FALSE, drawRug=TRUE, ...){
samp<-samples(object)
#check to make sure x and y are in samples
if (!all(c(x,y) %in% samp)){
stop("One or more values of 'x' or 'y' are not valid sample names!")
}
#Setup query string
sampleList<-paste("('",x,"','",y,"')",sep="")
scatterQuery<-paste("SELECT m.gene_short_name,d.",object@idField,",sum(CASE WHEN d.sample_name='",x,"' THEN d.fpkm END) as x,sum(CASE WHEN d.sample_name='",y,"' THEN d.fpkm END) as y, edd.status, edd.p_value, edd.q_value FROM ",object@tables$mainTable," m LEFT JOIN ",object@tables$dataTable," d ON m.",object@idField,"=d.",object@idField," LEFT JOIN ",object@tables$expDiffTable," edd ON m.",object@idField," = edd.",object@idField," WHERE (edd.sample_1 in ",sampleList," AND (edd.sample_2 in ",sampleList,")) GROUP BY d.",object@idField,";",sep="")
#write(scatterQuery,stderr())
#Retrieve data
dat<-dbGetQuery(object@DB,scatterQuery)
#add pseudocount if necessary
if(logMode){
for (i in samp){
dat$x<-dat$x+pseudocount
dat$y<-dat$y+pseudocount
}
}
#Flag significant genes
dat$significant<-"no"
dat$significant[dat$q_value<=alpha]<-"yes"
dat$significant<-factor(dat$significant,levels=c("no","yes"))
#Attach tracking_id and gene_short_name
if(!missing(labels)){
labeled.dat<-dat[dat$gene_short_name %in% labels,]
}
#make plot object
p<-ggplot(dat) + theme_bw()
p<- p + aes_string(x='x',y='y')
#Right now, this does nothing, because 'significant' is not returned from fpkmMatrix object so I don't have this as a feature to draw
if(colorByStatus){
p<- p + geom_point(aes(color=significant),size=1.2) + scale_color_manual(values=c("black","red"))
}else{
p<- p + geom_point(size=1.2,alpha=I(1/3))
}
#Add symmetry line
p<- p + geom_abline(intercept=0,slope=1,linetype=2)
#Add rug
if(drawRug){
p<- p + geom_rug(size=0.8,alpha=0.01)
}
#add smoother
if(smooth){
p <- p + stat_smooth(method="lm",fill="blue",alpha=0.2)
}
#Add highlights from labels
if(!missing(labels)){
p <- p + geom_point(data=labeled.dat,aes_string(x='x',y='y'),size=1.3,color="red")
p <- p + geom_text(data=labeled.dat,aes_string(x='x',y='y',label='gene_short_name'),color="red",hjust=0,vjust=0,angle=0,size=4)
}
#logMode
if(logMode){
p <- p + scale_y_log10() + scale_x_log10()
}
#Add title & Return value
p<- p + labs(title=object@tables$mainTable,x=x,y=y)
p
}
.scatterMat<-function(object,replicates=FALSE,logMode=TRUE,pseudocount=1.0,hexbin=FALSE,useCounts=FALSE,...){
if(replicates){
if(useCounts){
dat<-repCountMatrix(object)
}else{
dat<-repFpkmMatrix(object)
}
}else{
if(useCounts){
dat<-countMatrix(object)
}else{
dat<-fpkmMatrix(object)
}
}
if(logMode)
{
dat<-dat+pseudocount
}
if(useCounts){
myLab = "Normalized Counts"
}else{
myLab = "FPKM"
}
if(logMode){
myLab = paste("log10 ",myLab,sep="")
p <- .plotmatrix(log10(dat),hexbin=hexbin,...)
}else{
p <- .plotmatrix(dat,hexbin=hexbin,...)
}
p<- p + geom_abline(intercept=0,slope=1,linetype=2)
p <- p + theme_bw() + ylab(myLab) + xlab(myLab)
#p<- p + aes(alpha=0.01)
p
}
setMethod("csScatterMatrix",signature(object="CuffData"),.scatterMat)
.volcano<-function(object,x,y,alpha=0.05,showSignificant=TRUE,features=FALSE,xlimits=c(-20,20),...){
samp<-samples(object)
#check to make sure x and y are in samples
if (!all(c(x,y) %in% samp)){
stop("One or more values of 'x' or 'y' are not valid sample names!")
}
dat<-diffData(object=object,features=features)
#subset dat for samples of interest
mySamples<-c(x,y)
dat<-dat[(dat$sample_1 %in% mySamples & dat$sample_2 %in% mySamples),]
dat$significant <- 'no'
dat$significant[dat$q_value<=alpha]<-'yes'
s1<-unique(dat$sample_1)
s2<-unique(dat$sample_2)
p<-ggplot(dat)
if(showSignificant==FALSE){
p<- p + geom_point(aes(x=log2_fold_change,y=-log10(p_value)),size=1.2)
}else{
p<- p + geom_point(aes(x=log2_fold_change,y=-log10(p_value),color=significant),size=1.2)
}
#Add title and return
p<- p + theme_bw()
p<- p + labs(title=paste(object@tables$mainTable,": ",s2,"/",s1,sep=""))
p<- p + scale_colour_manual(values = c("black","red"))
#Set axis limits
p<- p + scale_x_continuous(limits=xlimits)
p <- p + xlab(bquote(paste(log[2],"(fold change)",sep=""))) +
ylab(bquote(paste(-log[10],"(p value)",sep="")))
p
}
setMethod("csVolcano",signature(object="CuffData"), .volcano)
.volcanoMatrix<-function(object,alpha=0.05,xlimits=c(-20,20),mapping=aes(),...){
dat<-diffData(object)
part1<-dat[,c('sample_1','sample_2','value_1','value_2','test_stat','p_value','q_value')]
part2<-data.frame(sample_1=part1$sample_2,sample_2=part1$sample_1,value_1=part1$value_2,value_2=part1$value_1,test_stat=-part1$test_stat,p_value=part1$p_value,q_value=part1$q_value)
dat<-rbind(part1,part2)
myLevels<-union(dat$sample_1,dat$sample_2)
dat$sample_1<-factor(dat$sample_1,levels=myLevels)
dat$sample_2<-factor(dat$sample_2,levels=myLevels)
dat$log2_fold_change<-log2(dat$value_2/dat$value_1)
#Set significance value
dat$significant<-"no"
dat$significant[dat$q_value<=alpha]<-"yes"
#May need to expand filler for time-series data (when there aren't always all pairwise comparisons on which to facet
filler<-data.frame(sample_1=factor(myLevels,levels=myLevels),sample_2=factor(myLevels,levels=myLevels),label="")
filler$label<-as.character(filler$label)
mapping <- defaults(mapping, aes_string(x = "log2_fold_change", y = "-log10(p_value)", color="significant"))
class(mapping) <- "uneval"
p <-ggplot(dat) + geom_point(mapping,na.rm=TRUE,size=0.8) + scale_colour_manual(values = c("black","red")) + facet_grid(sample_1~sample_2)
p<- p + geom_vline(aes(xintercept=0),linetype=2)
p <- p + theme_bw() + xlab(bquote(paste(log[2],"(fold change)",sep=""))) + ylab(bquote(paste(-log[10],"(p value)",sep="")))
p
}
setMethod("csVolcanoMatrix",signature(object="CuffData"),.volcanoMatrix)
.distheat<-function(object, replicates=F, samples.not.genes=T, logMode=T, pseudocount=1.0, heatscale=c(low='lightyellow',mid='orange',high='darkred'), heatMidpoint=NULL, sigDigits=3, ...) {
# get expression from a sample or gene perspective
if(replicates){
obj.fpkm<-repFpkmMatrix(object,fullnames=T)
}else{
obj.fpkm<-fpkmMatrix(object,fullnames=T)
}
if(samples.not.genes) {
obj.fpkm.pos = obj.fpkm[rowSums(obj.fpkm)>0,]
} else {
obj.fpkm = t(obj.fpkm)
obj.fpkm.pos = obj.fpkm[,colSums(obj.fpkm)>0]
}
if(logMode) {
obj.fpkm.pos = log10(obj.fpkm.pos+pseudocount)
}
# compute distances
obj.dists = JSdist(makeprobs(obj.fpkm.pos))
# cluster to order
obj.hc = hclust(obj.dists)
# make data frame
dist.df = melt(as.matrix(obj.dists),varnames=c("X1","X2"))
dist.df$value<-as.numeric(format(dist.df$value,digits=sigDigits))
# initialize
g = ggplot(dist.df, aes(x=X1, y=X2, fill=value))
# draw
labels = labels(obj.dists)
g = g + geom_tile(color="black") + scale_x_discrete("", limits=labels[obj.hc$order]) + scale_y_discrete("", limits=labels[obj.hc$order])
# roll labels
g = g + theme(axis.text.x=element_text(angle=-90, hjust=0), axis.text.y=element_text(angle=0, hjust=1))
# drop grey panel background and gridlines
g = g + theme(panel.grid.minor=element_line(colour=NA), panel.grid.major=element_line(colour=NA), panel.background=element_rect(fill=NA, colour=NA))
# adjust heat scale
if (length(heatscale) == 2) {
g = g + scale_fill_gradient(low=heatscale[1], high=heatscale[3], name="JS Distance")
}
else if (length(heatscale) == 3) {
if (is.null(heatMidpoint)) {
heatMidpoint = max(obj.dists) / 2.0
}
g = g + scale_fill_gradient2(low=heatscale[1], mid=heatscale[2], high=heatscale[3], midpoint=heatMidpoint, name="JS Distance")
}
if(samples.not.genes){
g <- g + geom_text(aes(label=value))
}
# return
g
}
setMethod("csDistHeat", signature("CuffData"), .distheat)
.boxplot<-function(object,logMode=TRUE,pseudocount=0.0001,replicates=FALSE,...){
if(replicates){
dat<-repFpkm(object)
colnames(dat)[colnames(dat)=="rep_name"]<-"condition"
}else{
dat<-fpkm(object)
colnames(dat)[colnames(dat)=="sample_name"]<-"condition"
}
if(logMode) {
dat$fpkm<-dat$fpkm+pseudocount
p<-ggplot(dat)
p<-p+geom_boxplot(aes(x=condition,y=log10(fpkm),fill=condition),size=0.3,alpha=I(1/3))
} else {
p <- ggplot(dat)
p<-p+geom_boxplot(aes(x=condition,y=fpkm,fill=condition),alpha=I(1/3),size=0.3)
}
p<- p + theme(axis.text.x=element_text(angle=-90, hjust=0))
#Default cummeRbund colorscheme
p<-p + scale_fill_hue(l=50,h.start=200)
p
}
setMethod("csBoxplot",signature(object="CuffData"),.boxplot)
.dendro<-function(object,logMode=T,pseudocount=1,replicates=FALSE,...){
if(replicates){
fpkmMat<-repFpkmMatrix(object)
}else{
fpkmMat<-fpkmMatrix(object)
}
if(logMode){
fpkmMat<-log10(fpkmMat+pseudocount)
}
res<-JSdist(makeprobs(fpkmMat))
#colnames(res)<-colnames(fpkmMat)
#res<-as.dist(res)
res<-as.dendrogram(hclust(res))
plot(res,main=paste("All",deparse(substitute(object)),sep=" "),...)
res
}
setMethod("csDendro",signature(object="CuffData"),.dendro)
.MAplot<-function(object,x,y,logMode=T,pseudocount=1,smooth=FALSE,useCount=FALSE){
if(useCount){
dat<-.getCountMA(object,x,y,logMode=logMode,pseudocount=pseudocount)
}else{
dat<-.getMA(object,x,y,logMode=logMode,pseudocount=pseudocount)
}
p<-ggplot(dat)
p<-p+geom_point(aes(x=A,y=log2(M)),size=0.8)
#add smoother
if(smooth){
p <- p + stat_smooth(aes(x=A,y=log2(M)),fill="blue")
}
#add baseline
p <- p + geom_hline(yintercept=0)
p
}
setMethod("MAplot",signature(object="CuffData"),.MAplot)
.dispersionPlot<-function(object){
dat<-count(object)
p<-ggplot(dat)
p<-p+geom_point(aes(x=count,y=dispersion,color=sample_name)) + facet_wrap('sample_name') + scale_x_log10() + scale_y_log10()
p
}
setMethod("dispersionPlot",signature(object="CuffData"),.dispersionPlot)
.MDSplot<-function(object,replicates=FALSE,logMode=TRUE,pseudocount=1.0){
if(replicates){
dat<-repFpkmMatrix(object)
#repData<-sapply(replicates(object),function(x){strsplit(x,"_")[[1]][1]}) This is to color by condition and not replicate...
}else{
dat<-fpkmMatrix(object)
}
if(logMode){
dat<-log10(dat+pseudocount)
}
d<-JSdist(makeprobs(dat))
fit <- cmdscale(d,eig=TRUE, k=2)
res<-data.frame(names=rownames(fit$points),M1=fit$points[,1],M2=fit$points[,2])
p <- ggplot(res)
p <- p + geom_point(aes(x=M1,y=M2,color=names)) + geom_text(aes(x=M1,y=M2,label=names,color=names)) + theme_bw()
p
}
setMethod("MDSplot",signature(object="CuffData"),.MDSplot)
#Not sure if I want to include this or not..
.PCAplot<-function(object,x="PC1", y="PC2",replicates=FALSE,pseudocount=1.0,scale=TRUE,showPoints=TRUE,...){
if(replicates){
fpkms<-repFpkmMatrix(object)
}else{
fpkms<-fpkmMatrix(object)
}
fpkms<-log10(fpkms+pseudocount)
PC<-prcomp(fpkms,scale=scale,...)
dat <- data.frame(obsnames=row.names(PC$x), PC$x)
#dat$shoutout<-""
#dat$shoutout[matchpt(PC$rotation,PC$x)$index]<-rownames(pca$x[matchpt(pca$rotation,pca$x)$index,])
plot <- ggplot(dat, aes_string(x=x, y=y))
if(showPoints){
plot<- plot + geom_point(alpha=.4, size=0.8, aes(label=obsnames))
}
plot <- plot + geom_hline(aes(yintercept=0), size=.2) + geom_vline(aes(xintercept=0), size=.2) #+ geom_text(aes(label=shoutout),size=2,color="red")
datapc <- data.frame(varnames=rownames(PC$rotation), PC$rotation)
mult <- min(
(max(dat[,y]) - min(dat[,y])/(max(datapc[,y])-min(datapc[,y]))),
(max(dat[,x]) - min(dat[,x])/(max(datapc[,x])-min(datapc[,x])))
)
datapc <- transform(datapc,
v1 = .7 * mult * (get(x)),
v2 = .7 * mult * (get(y))
)
plot <- plot +
#coord_equal() +
geom_text(data=datapc, aes(x=v1, y=v2, label=varnames,color=varnames), vjust=1)
plot <- plot + geom_segment(data=datapc, aes(x=0, y=0, xend=v1, yend=v2,color=varnames), arrow=arrow(length=unit(0.2,"cm")), alpha=0.75) + theme_bw()
plot
}
setMethod('PCAplot',signature(object="CuffData"),.PCAplot)
#This is most definitely a work in progress
.confidencePlot<-function(object,percentCutoff=20){
res<-.statsMatrix(object)
#res$CV[is.na(res$CV)]<-0.0
res$CV<-as.numeric(res$CV)
p<-ggplot(res)
p<-p + geom_point(aes(x=log10(fpkm),y=log10(CV)),size=1.5,alpha=0.3)
p<-p + geom_smooth(aes(x=log10(fpkm),y=log10(CV),color=sample_name)) +
#p<-p + geom_point(aes(x=log10(fpkm),y=log10(count),color=-log10(CV))) +
#p<-p + geom_point(aes(x=log10(fpkm),y=log2(fpkm/count),color=-log10(CV))) + geom_hline(aes(0),linetype=2) +
#facet_wrap('sample_name') +
geom_abline(intercept=0,slope=1,linetype=2,size=0.3) +
scale_color_gradient(name="%CV",low="darkblue",high="white",limits=c(0,percentCutoff), na.value = "white") +
labs(title=object@type)
p
}
.CVdensity<-function(object){
dat<-.statsMatrix(object)
p<-ggplot(dat)
p <- p + geom_density(aes(x=CV,fill=sample_name),alpha=0.3,position="dodge") + scale_x_log10()
p
}
.fpkmSCVPlot<-function(object,FPKMLowerBound=1,showPool=FALSE){
if(!showPool){
showPool=any(table(.getRepConditionLevels(object))<2) # Counts number of replicates per condition and requires a minimum of 2. If any n<2, values are determined from pooling across samples.
if(showPool){
warning("At least one of your conditions does not have enough replicates to estimate variance. Estimating variance across all conditions instead.")
}
}
dat<-repFpkm(object)
colnames(dat)[1]<-"tracking_id"
dat<-dat[,c('tracking_id','sample_name','fpkm')]
dat<-dat[dat$fpkm>0,]
if(showPool){
subsetCols<-c('tracking_id')
}else{
subsetCols<-c('tracking_id','sample_name')
}
#Option 3 (tapply on log10(replicateFPKM) values)
dat.means<-tapply(dat$fpkm,dat[,subsetCols],function(x){mean(x,na.rm=T)})
dat.sd<-tapply(dat$fpkm,dat[,subsetCols],function(x){sd(x,na.rm=T)})
#write("Calculating replicate fpkm mean...",stderr())
dat.means<-melt(dat.means)
#write("Calculating replicate fpkm stdev...",stderr())
dat.sd<-melt(dat.sd)
colnames(dat.means)[colnames(dat.means)=="value"]<-'fpkm'
colnames(dat.sd)[colnames(dat.sd)=="value"]<-'stdev'
dat<-cbind(dat.means,dat.sd$stdev)
colnames(dat)[colnames(dat)=="dat.sd$stdev"]<-'stdev'
dat<-dat[!is.na(dat$stdev) & !is.na(dat$fpkm),]
dat<-dat[dat$fpkm>0 & dat$stdev>0,]
if(!showPool){
dat$sample_name<-factor(dat$sample_name,levels=samples(object))
}
p <-ggplot(dat,aes(x=fpkm,y=(stdev/fpkm)^2),na.rm=T)
#p <-ggplot(dat,aes(x=log10(fpkm+1),y=log10(stdev)),na.rm=T)
if(showPool){
p <- p + stat_smooth(na.rm=T,method='auto',fullrange=T)
}else{
p <- p + #geom_point(aes(color=sample_name),size=1,na.rm=T) +
stat_smooth(aes(color=sample_name,fill=sample_name),na.rm=T,method='auto',fullrange=T)
}
p <- p + scale_x_log10() +
scale_y_continuous(name=bquote(CV^2)) +
xlab(bquote(paste(log[10],"FPKM",sep=" "))) +
theme_bw() + xlim(c(log10(FPKMLowerBound),max(log10(dat$fpkm)))) + labs(title=object@type)
p
}
setMethod("fpkmSCVPlot",signature(object="CuffData"),.fpkmSCVPlot)
.sensitivityPlot<-function(object){
return
}
#TODO:Log2FC vs Test-statistic
#TODO:log2FPKM vs log2(stdev) (color by sample)
#TODO: Index of dispersion alpha (FPKM vs ?)
#SELECT gd.*, (gd.conf_hi-gd.fpkm)/2 as fpkm_stdev, gc.count, gc.variance as count_variance , gc.uncertainty, gc.dispersion, ((gd.conf_hi-gd.fpkm)/2)/gd.fpkm AS 'CV' FROM geneData gd LEFT JOIN geneCount gc ON gd.gene_id=gc.gene_id AND gd.sample_name=gc.sample_name;
#############
# Other Methods
#############
.specificity<-function(object,logMode=T,pseudocount=1,relative=FALSE,...){
fpkms<-fpkmMatrix(object,...)
if(logMode){
fpkms<-log10(fpkms+pseudocount)
}
fpkms<-t(makeprobs(t(fpkms)))
d<-diag(ncol(fpkms))
res<-apply(d,MARGIN=1,function(q){
JSdistFromP(fpkms,q)
})
colnames(res)<-paste(colnames(fpkms),"_spec",sep="")
if(relative){
res<-res/max(res)
}
1-res
}
setMethod("csSpecificity",signature(object="CuffData"),.specificity)
######################
# Exploratory Analysis
######################
.nmf<-function(object,k,logMode=T,pseudocount=1,maxiter=1000,replicates=FALSE,fullnames=FALSE){
require(NMFN)
if(missing(k)) stop("Please provide a rank value for factorization (arg=k)")
if(replicates){
m=repFpkmMatrix(object,fullnames=fullnames)
}else{
m=fpkmMatrix(object,fullnames=fullnames)
}
if(logMode)
{
m = log10(m+pseudocount)
}
myNMF<-nnmf(m,k=k,maxiter=maxiter)
return (myNMF)
}
setMethod("csNMF",signature(object="CuffData"),.nmf)
#############
# GSEA helper methods
#############
.makeRnk<-function(object,x,y,filename,...){
#Creates a log2-fold change .rnk file for all genes given an x/y comparison.
#While this method will work for any cuffData level, it really only makes sense for 'genes' as this is what is required for GSEA...
#Must provide 'x' and 'y' so as to provide a log2 fold change.
samp<-samples(object)
#check to make sure x and y are in samples
if (!all(c(x,y) %in% samp)){
stop("One or more values of 'x' or 'y' are not valid sample names!")
}
query<-paste("SELECT gd.gene_id,g.gene_short_name,(sum(CASE WHEN gd.sample_name='",x,"' THEN gd.fpkm+1 END))/(sum(CASE WHEN gd.sample_name='",y,"' THEN gd.fpkm+1 END)) as 'ratio' FROM genes g LEFT JOIN geneData gd ON g.gene_id=gd.gene_id GROUP BY g.gene_id ORDER BY ratio DESC",sep="")
res<-dbGetQuery(object@DB,query)
res$ratio<-log2(res$ratio)
#Remove gene_id field
res<-res[,-1]
#Remove rows with "NA" for gene_short_name
res<-res[!is.na(res$gene_short_name),]
#Write to file
write.table(res,file=filename,sep="\t",quote=F,...,row.names=F,col.names=F)
#res
}
.makeRnkTest<-function(object,x,y,filename,...){
samp<-samples(object)
#check to make sure x and y are in samples
if (!all(c(x,y) %in% samp)){
stop("One or more values of 'x' or 'y' are not valid sample names!")
}
query<-paste("SELECT gd.gene_id,g.gene_short_name,gd.test_stat FROM genes g LEFT JOIN geneExpDiffData gd ON g.gene_id=gd.gene_id WHERE ((gd.sample_1='",x,"' AND gd.sample_2='",y,"') OR (gd.sample_2='",x,"' AND gd.sample_1='",y,"')) ORDER BY gd.test_stat DESC",sep="")
#print(query)
res<-dbGetQuery(object@DB,query)
#Remove gene_id field
res<-res[,-1]
#Remove rows with "NA" for gene_short_name
res<-res[!is.na(res$gene_short_name),]
#Write to file
write.table(res,file=filename,sep="\t",quote=F,...,row.names=F,col.names=F)
}
.makeRnkTestStat<-function(object,x,y,filename,...){
samp<-samples(object)
#check to make sure x and y are in samples
if (!all(c(x,y) %in% samp)){
stop("One or more values of 'x' or 'y' are not valid sample names!")
}
query<-paste("SELECT gd.gene_id,g.gene_short_name,gd.sample_1, gd.sample_2,gd.test_stat FROM genes g LEFT JOIN geneExpDiffData gd ON g.gene_id=gd.gene_id WHERE ((gd.sample_1 ='",x,"' AND gd.sample_2='",y,"') OR (gd.sample_2='",x,"' AND gd.sample_1='",y,"')) GROUP BY gd.gene_id",sep="")
res<-dbGetQuery(object@DB,query)
if(unique(res$sample_2)==x){
res2<-res
res2$test_stat<--(res$test_stat)
res2$sample_1<-res$sample_2
res2$sample_2<-res$sample_1
res<-res2
}
#Order by test_stat
res<-res[order(res$test_stat,decreasing=F),]
#Remove gene_id field
res<-res[,c('gene_short_name','test_stat')]
#Remove rows with "NA" for gene_short_name
res<-res[!is.na(res$gene_short_name),]
#Write to file
write.table(res,file=filename,sep="\t",quote=F,...,row.names=F,col.names=F)
#head(res)
}
setMethod("makeRnk",signature(object="CuffData"),.makeRnkTestStat)
#############
#Utility functions
#############
.checkSamples<-function(dbConn,sampleIdList){
dbSamples<-dbReadTable(dbConn,"samples")
if (all(sampleIdList %in% dbSamples$sample_name)){
return(TRUE)
}else{
return(FALSE)
}
}
.checkReps<-function(dbConn,repIdList){
dbReps<-dbReadTable(dbConn,"replicates")
if (all(repIdList %in% dbReps$rep_name)){
return(TRUE)
}else{
return(FALSE)
}
}
###################
# Coersion methods
###################
#As ExpressionSet
###################
#Utility functions
###################
#.volcanoMatrix <- function(data){
# densities <- do.call('rbind',lapply(1:))
# p <-ggplot(data) + facet_grid(sample1~sample2,scales="free") + geom_point(aes(x=log2_fold_change,y=-log10(p_value))) + stat_density(aes(x = log2_fold_change,
# y = ..scaled.. * diff(range(log2_fold_change)) + min(log2_fold_change)), data = densities,
# position = "identity", colour = "grey20", geom = "line")
#
#}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.