###
## Summer 2013 - 2014
## rgsepd-0.3.3 KStamm
## a tool to perform linear vector projection in GO-Term space
###
GSEPD_ProjectionProcessor <- function(GSEPD) {
GSEPD <- GSEPD_CheckCounts(GSEPD) # make sure normCounts is calculated!
finalCounts <- GSEPD$normCounts
C2T <- GSEPD$C2T
sampleMeta <- GSEPD$sampleMeta
Type1=GSEPD$Conditions[1]
Type2=GSEPD$Conditions[2]
DefinedGroup <- subset(sampleMeta, sampleMeta$Condition %in% c(Type1, Type2))
ArcheTypes <- as.character(DefinedGroup$Sample)
G1=ArcheTypes[DefinedGroup$Condition == Type1];
G2=ArcheTypes[DefinedGroup$Condition == Type2]
cols=rep(1,ncol(finalCounts))
pchs=rep(3,ncol(finalCounts))
if(length(GSEPD$COLORS)!=3){
stop("Vector of GSEPD$COLORS should be three elements (Group1, middle, Group2).")
}#doesnt this really belong elsewhere.
cols[colnames(finalCounts) %in% G1] <- GSEPD$COLORS[1]
cols[colnames(finalCounts) %in% G2] <- GSEPD$COLORS[3]
pchs[colnames(finalCounts) %in% G1] <- 1
pchs[colnames(finalCounts) %in% G2] <- 2
#pull up the merged doc and grab the minor GO term
M.data <- read.csv(GSEPD_MFile(GSEPD), as.is=TRUE, header=TRUE,row.names=1)
AssembleGOSTAT <- function(GOTERM){
x=list();
x$Term=GOTERM;
DEG=subset(M.data,M.data$category==GOTERM)
x$Name=DEG$Term.x[1]
x$DataProjection <- ExtractProjection(GSEPD, DEG$REFSEQ ,
PRINTING = FALSE, DRAWING = FALSE)
x
}
tcats <- table(M.data$category)
cats <- unique( subset( M.data, M.data$over_represented_padj.x < GSEPD$LIMIT$GO_PVAL)$category)
#omit those with too few or too many
cats = setdiff(cats, names(tcats)[tcats<GSEPD$MinGenesInSet | tcats>GSEPD$MaxGenesInSet] )
if(length(cats)>1) {
outfile=paste(GSEPD$Output_Folder, "/SCA.GSEPD.",C2T[1],".",C2T[2],".pdf",sep="")
Message_Generate(outfile)
pdf(outfile)
for(i in seq(1,length(cats)-1,2)){
#print(paste(cats[i],cats[i+1]))
DIM_1=AssembleGOSTAT(cats[i])
DIM_2=AssembleGOSTAT(cats[i+1])
xs=(DIM_1$DataProjection$alpha) #GOSet 1 axis score
ys=(DIM_2$DataProjection$alpha) #GOSet 2 axis score
plot(x=xs, y=ys, main=paste(ncol(finalCounts), "Samples by 2 GO:Terms"),
xlab=paste(Type2,DIM_1$Name,Type1, sep=" - "), ylab=paste(Type2,DIM_2$Name,Type1, sep=" - "),
pch=pchs, col=cols )
legend("topleft",legend=c(GSEPD$Conditions[1],GSEPD$Conditions[2],"Other"),pch=c(1,2,3),col=c(GSEPD$COLORS[1],GSEPD$COLORS[3],1))
}
dev.off()
}else{
warning("Skipping generation of SCA.GSEPD.pdf because there aren't enough enriched GO Categories.")
}
if(length(cats)>0) {
message("Calculating Projections and Segregation Significance")
OM=matrix(nrow=length(cats), ncol=ncol(finalCounts))
WhiteOut <- OM; G1OM <- OM; G2OM <- OM;
Segregation_PV=rep(1,nrow(OM)) # see GSEPD$LIMIT$Seg_P later.
Segregation_Val=rep(1,nrow(OM)) # see GSEPD$LIMIT$Seg_P later.
colnames(WhiteOut)<-colnames(finalCounts)
rownames(OM)<-cats
rownames(WhiteOut)<-cats
rownames(G1OM)<-cats
rownames(G2OM)<-cats
names(Segregation_PV)<- cats
names(Segregation_Val)<- cats
for(i in 1:nrow(OM)){
ExDP <- AssembleGOSTAT(cats[i])$DataProjection
OM[i,] <-ExDP$alpha
WhiteOut[i,]<-ExDP$beta
G1OM[i,] <-ExDP$gamma1
G2OM[i,] <-ExDP$gamma2
#Segregation_P[i] <- Resampled_Significance.t(GSEPD,data.frame(alpha=ExDP$alpha, type=GSEPD$sampleMeta$Condition ), GSEPD$Conditions)
O <- Resampled_Significance.k(GSEPD,
ROI=M.data$REFSEQ[M.data$category==cats[i]],
SOI=ArcheTypes)
Segregation_PV[i] <- O$PV
Segregation_Val[i] <- O$Validity
}
colnames(OM)<-colnames(finalCounts)
colnames(WhiteOut)<-colnames(finalCounts)
colnames(G1OM)<-colnames(finalCounts)
colnames(G2OM)<-colnames(finalCounts)
SIGFIG=3
write.csv(signif(OM,SIGFIG), paste(GSEPD$Output_Folder, "/GSEPD.Alpha.", C2T[1],".",C2T[2],".csv",sep=""))
write.csv(signif(WhiteOut,SIGFIG), paste(GSEPD$Output_Folder, "/GSEPD.Beta.", C2T[1],".",C2T[2],".csv",sep=""))
write.csv(cbind(Segregation_Val, Segregation_PV), GSEPD_Seg_File(GSEPD))
write.csv(signif(G1OM,SIGFIG), GSEPD_HMG1CSV_File(GSEPD))
write.csv(signif(G2OM,SIGFIG), GSEPD_HMG2CSV_File(GSEPD))
GOPs <- rep(0,length(cats))
GONames <- rep("",length(cats))
for(i in 1:length(cats)) {
roi <- (M.data$category == cats[i]) #find the entry in Mdata
GONames[i] <- sprintf("%s %s",M.data$Term.x[roi][1], M.data$category[roi][1]);
GOPs[i] <- M.data$over_represented_padj.x[roi][1];
#if this GOSet fails to separate the classes, we can drop it from further processing
if( Segregation_PV[i] < GSEPD$LIMIT$Seg_P){
#if it segregates let's make a pairs plot to go with
deg_ngenes <- sum(roi)
if(deg_ngenes >= 2 && deg_ngenes <= 10){# only on those sets with a decent number of genes
GSEPD_Pairs_Plot(GSEPD, M.data$REFSEQ[roi], GONames[i], GOPs[i], Segregation_PV[i])
}
pdf(paste(GSEPD$Output_SCGO,"/Scatter.",C2T[1],".",C2T[2],".GO",substring(cats[i],4),".pdf",sep=""))
GSEPD_PCA_Spec(GSEPD=GSEPD,GOT=cats[i],MDATA=M.data)
title(sub=sprintf("%d Genes / %d Samples / GOSeq p=%f / segregation p=%f",deg_ngenes, ncol(finalCounts), GOPs[i], Segregation_PV[i]))
dev.off()
}
}
#we'll use the combination to choose great sets
GO_SumPs <- GOPs + Segregation_PV
Message_Generate(GSEPD_HMA_File(GSEPD))
if(length(G1)>1 && nrow(OM)>1)
OM.A1 <- apply(OM[,G1], 1, mean)-min(OM)+1
else
OM.A1 <-OM[,G1] - min(OM)+1
if(length(G2)>1 && nrow(OM)>1)
OM.A2 <- apply(OM[,G2], 1, mean)-min(OM)+1
else
OM.A2 <-OM[,G2] - min(OM)+1
OM.min <- apply(cbind(OM.A1,OM.A2),1,min)#*0.90
OM.max <- apply(cbind(OM.A1,OM.A2),1,max)#*1.10
#Z-scored OutMatrix
zOM <- OM-min(OM)+1;
for(i in 1:nrow(zOM)){
zOM[i, zOM[i,]< OM.min[i] ] <- OM.min[i]
zOM[i, zOM[i,]> OM.max[i] ] <- OM.max[i]
zOM[i,] <- (zOM[i,] - OM.min[i]) / (OM.max[i] - OM.min[i])
}#goes pretty much 0-1;
#generate the asterisks you see in the heatmap
#first lets get some Z-scores on the projection distance
zWhiteOut <- t(scale(t(WhiteOut)))
cellnote=matrix(" ", nrow=nrow(zOM),ncol=ncol(zOM))
for(j in 1:nrow(zOM)){
cellnote[j, zWhiteOut[j,] > GSEPD$VECTOR_DISTANCE_ZTHRESH_Moderate] <- "."
cellnote[j, zWhiteOut[j,] > GSEPD$VECTOR_DISTANCE_ZTHRESH_Severe ] <- rawToChar(as.raw(149)) #a fat circle bullet character
}
ColumnLabels <- colnames(zOM);
for(j in 1:length(ColumnLabels))
ColumnLabels[j] <- paste(sampleMeta$SHORTNAME[sampleMeta$Sample==ColumnLabels[j]],
sampleMeta$Condition[sampleMeta$Sample==ColumnLabels[j]])
ColLabelColors <- rep("black",ncol(OM));
names(ColLabelColors) <- colnames(OM)
ColLabelColors[G1] <- GSEPD$COLORS[1];
ColLabelColors[G2] <- GSEPD$COLORS[3];
#which rows are worth looking at??
sr=1:length(GO_SumPs)
sr=sort(GO_SumPs, decreasing=FALSE, index.return=TRUE)$ix[1:min(GSEPD$MAX_GOs_for_Heatmap,length(cats))]
#and remove those non-significant
sr <- sr[ GO_SumPs[sr] <= (GSEPD$LIMIT$GO_PVAL + GSEPD$LIMIT$Seg_P)] ;
if(length(sr) > 1) {
RowLabelColors <- rep("black",length(sr));
names(RowLabelColors)<-cats[sr]
sMdata<-subset(M.data, !duplicated(M.data$category), select=c("category","GOSEQ_DEG_Type"))
for(j in 1:length(sr)){
cj<-cats[sr[j]];DEGType<-sMdata$GOSEQ_DEG_Type[sMdata$category == cj];
RowLabelColors[j] <- ifelse(DEGType == "Up",GSEPD$COLORS[3],
ifelse(DEGType == "Down",GSEPD$COLORS[1],
GSEPD$COLORS[2]))
} ; rm(sMdata)
pdf(GSEPD_HMA_File(GSEPD), height=6+length(sr)/7, width=6+ncol(zOM)*0.33)
heatmap.2(zOM[sr,], labRow=GONames[sr],
scale="none", trace="none", margins=c(10,35),
cexRow=1.25, labCol=ColumnLabels, cellnote=cellnote[sr,],
notecex=3, notecol="white",
ColSideColors=ColLabelColors, RowSideColors=RowLabelColors,
col=GSEPD$COLORFUNCTION);
dev.off()
#then plot HMG#####
#it's overlaying the zOM
#but for the HMG file now I need to make a new 0-1 score from the G1OM/G2OM matrices.
zOM[,] <- 0.5 ; #default all to indeterminate/black
zOM[G1OM < 0.45] <- 0.35 #dark green
zOM[G1OM < (0.45*(2/3)) ] <- 0 # bright green
zOM[G2OM < 0.45] <- 0.65 # dark red
zOM[G2OM < (0.45*(2/3)) ] <- 1 # bright red
pdf(GSEPD_HMG_File(GSEPD), height=6+length(sr)/7, width=6+ncol(zOM)*0.33)
heatmap.2(zOM[sr,], labRow=GONames[sr],
scale="none", trace="none", margins=c(10,35),
cexRow=1.25,labCol=ColumnLabels,
ColSideColors=ColLabelColors, RowSideColors=RowLabelColors,
col=GSEPD$COLORFUNCTION);
dev.off()
}else{
warning("Not generating HMA/HMG files: <2 significant rows. See your GOSEQ and Segregation tables for details.")
}
#for the rows seen in the heatmap...
for(j in sr){ #make lots of scatter plots, one for each GO term with some enrichment
thisDEG=subset(M.data, M.data$category==cats[j])
if( nrow(thisDEG) >= GSEPD$MinGenesInSet ) {
#thought this would be taken care of above, but the GO term whitelist could create entries with invalid gene counts.
pdf(paste(GSEPD$Output_SCGO,"/GSEPD.", C2T[1],".",C2T[2],".GO",substring(cats[j],4),".pdf",sep=""))
nGenes=length(unique(thisDEG$REFSEQ))
if(nGenes>1){
for(i in seq(1,nGenes-1,2))
ExtractProjection(GSEPD, txids = unique(thisDEG$REFSEQ),
DRAWING=TRUE, GN=c(i,i+1),PRINTING=FALSE,
plotTitle=thisDEG$Term.x[1])
}
dev.off();
}
}
}else{ #length(cats)<1
warning("GSEPD : Can't make output heatmap images when zero GO-Categories enriched.")
}
} # end GOCATPROCESSOR for Type1, Type2
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.