Nothing
#later when I want 'global' variables in the rgsepd code, theyll be stored under pkg_globals
pkg_globals <- new.env();
.onAttach <- function(libname, pkgname) {
packageStartupMessage(sprintf("Loading R/GSEPD %s",packageDescription("rgsepd")$Version))
packageStartupMessage("Building human gene name caches")
#found the syntax for some kind of local/global variables here
# http://stackoverflow.com/questions/12598242/global-variables-in-packages-in-r
#assign("pkg_globals", new.env(), envir=parent.env(environment()))
RefSeqs <- grep("^N[MR]_",
AnnotationDbi::keys(org.Hs.eg.db,keytype="REFSEQ"),
value=TRUE)
lookedup <- select(org.Hs.eg.db,keytype="REFSEQ",
keys=RefSeqs, multiVals="first",
columns=c('SYMBOL','ENTREZID'))
assign("R2H", hash(RefSeqs,lookedup$SYMBOL), pkg_globals)
assign("R2E", hash(RefSeqs,lookedup$ENTREZID), pkg_globals)
assign("H2R", hash(lookedup$SYMBOL,RefSeqs), pkg_globals)
Egs <- AnnotationDbi::keys(org.Hs.eg.db,keytype="ENTREZID")
lookedup <- select(org.Hs.eg.db,keytype="ENTREZID",
keys=Egs, multiVals="first",
columns=c('SYMBOL'))
assign("H2E", hash(lookedup$SYMBOL,Egs), pkg_globals)
assign("E2H", hash(Egs,lookedup$SYMBOL), pkg_globals)
}
## function definitions for GSEPD
## will run DESEQ and will run GOSeq
##then merge results, and process into GeneSetEnrichmentProfileDisplay
DESEQ_RFile <- function(G)
paste(G$Output_Folder,"/DESEQ.RES.",G$C2T[1],".",G$C2T[2],".csv",sep="")
DESEQ_AFile <- function(G)
paste(G$Output_Folder,"/DESEQ.RES.",G$C2T[1],".",G$C2T[2],".Annote.csv",sep="")
DESEQ_AFFile <- function(G)
paste(G$Output_Folder,"/DESEQ.RES.",G$C2T[1],".",G$C2T[2],".Annote_Filter.csv",sep="")
GSEPD_GOFile <- function(G)
paste(G$Output_Folder,"/GOSEQ.RES.",G$C2T[1],".",G$C2T[2],".GO.csv",sep="")
GSEPD_GOUpFile <- function(G)
paste(G$Output_Folder,"/GOSEQ.RES.",G$C2T[1],".",G$C2T[2],".GO-UP.csv",sep="")
GSEPD_GODownFile <- function(G)
paste(G$Output_Folder,"/GOSEQ.RES.",G$C2T[1],".",G$C2T[2],".GO-DOWN.csv",sep="")
GSEPD_GO2File <- function(G)
paste(G$Output_Folder,"/GSEPD.RES." , G$C2T[1],".",G$C2T[2],".GO2.csv",sep="")
GSEPD_MFile <- function(G)
paste(G$Output_Folder,"/GSEPD.RES." , G$C2T[1],".",G$C2T[2],".MERGE.csv",sep="")
GSEPD_HMA_File <- function(G)
paste(G$Output_Folder,"/GSEPD.HMA." , G$C2T[1],".",G$C2T[2],".pdf",sep="")
DESEQ_CountsFile <- function(G)
paste(G$Output_Folder,"/DESEQ.counts." , G$C2T[1],".",G$C2T[2],".csv",sep="")
GSEPD_Seg_File <- function(G)
paste(G$Output_Folder,"/GSEPD.Segregation_P." , G$C2T[1],".",G$C2T[2],".csv",sep="")
GSEPD_HMACSV_File <- function(G)
paste(G$Output_Folder,"/GSEPD.HMA." , G$C2T[1],".",G$C2T[2],".csv",sep="")
#new for 2016, let's calculate the distance to each centroid.
GSEPD_HMG_File <- function(G)
paste(G$Output_Folder,"/GSEPD.HMG." , G$C2T[1],".",G$C2T[2],".pdf",sep="")
GSEPD_HMG1CSV_File <- function(G)
paste(G$Output_Folder,"/GSEPD.HMG1." , G$C2T[1],".",G$C2T[2],".csv",sep="")
GSEPD_HMG2CSV_File <- function(G)
paste(G$Output_Folder,"/GSEPD.HMG2." , G$C2T[1],".",G$C2T[2],".csv",sep="")
FileMerge<-function(GSEPD){
if(!GSEPD$QUIET)Message_Generate(GSEPD_MFile(GSEPD))
data.GO <- read.csv(GSEPD_GOFile(GSEPD),as.is=TRUE,header=TRUE)
sGO1<-subset(data.GO[,c("category","over_represented_padj","Term")] ,
data.GO$over_represented_padj < (GSEPD$LIMIT$GO_PVAL * 1.25));
sGO1$GOSEQ_DEG_Type <- rep("Mix", nrow(sGO1)) ; rm(data.GO)
data.GO <- read.csv(GSEPD_GOUpFile(GSEPD),as.is=TRUE,header=TRUE)
sGO2<-subset(data.GO[,c("category","over_represented_padj","Term")] ,
data.GO$over_represented_padj < (GSEPD$LIMIT$GO_PVAL * 1.25));
sGO2$GOSEQ_DEG_Type <- rep("Up", nrow(sGO2)) ; rm(data.GO)
data.GO <- read.csv(GSEPD_GODownFile(GSEPD),as.is=TRUE,header=TRUE)
sGO3<-subset(data.GO[,c("category","over_represented_padj","Term")] ,
data.GO$over_represented_padj < (GSEPD$LIMIT$GO_PVAL * 1.25));
sGO3$GOSEQ_DEG_Type <- rep("Down", nrow(sGO3)) ; rm(data.GO)
sdata.GO <- rbind(sGO3,sGO2,sGO1) ; rm(sGO1,sGO2,sGO3)
sdata.GO <- subset(sdata.GO, !duplicated(sdata.GO$category))
data.GO2 <- read.csv(GSEPD_GO2File(GSEPD),as.is=TRUE,header=TRUE)
tcats <- table(data.GO2$category)
cats<-unique(data.GO2$category)
cats=setdiff(cats,names(tcats)[tcats<GSEPD$MinGenesInSet | tcats>GSEPD$MaxGenesInSet])
cats <- unique(c(cats,GSEPD$Force_GO_Include))
mdata<-merge(sdata.GO, data.GO2, by.x="category",by.y="category")
mdata<-subset(mdata, mdata$category %in% cats,
select=c("category", "over_represented_padj.x","Term.x","GOSEQ_DEG_Type","HGNC", "LOG2.X.Y.","REFSEQ",
GSEPD$C2T[1],GSEPD$C2T[2], "PVAL","PADJ") )
write.csv(mdata,GSEPD_MFile(GSEPD))
}
Message_Generate <- function(filename,overwrite=TRUE){
if(file.exists(filename)){
if(overwrite)
message(sprintf("Generating %s, overwriting previous existing version.",filename))
else
message(sprintf("Would generate %s, but one is already present.",filename))
}else{
message(sprintf("Generating %s",filename))
}
}
#Restrict elements of x to be between two numbers.
cap<-function(x,L,H){x[x>H]=H;x[x<L]=L;x}
#Volcano Plot:
plotDE <- function( res ) {
plot(
cap(res$log2FoldChange,-7,7),
0-log(res$pval,base=10),
pch=20, cex=.33,
col = ifelse( res$padj < .10, "red", "black" ),
main=sprintf("Volcano for %d Genes",nrow(res)),
xlab="Log2 FoldChange",
ylab="Log10 p-value",
sub="red points are significant with FDR < 10%")
}
#res is the table prepopulated with $id and other fields NA
LocalDB_Lookup <- function(res)
merge(res , build_Annotation_Matrix(res$id),
by.x="id", by.y="REFSEQ", all.x=TRUE)
biomaRt_Lookup <- function(res) {
#now subset the table to those we didn't know yet.
gns=unique( subset(res,is.na(res$ENTREZ))$id)
#listMarts(host="www.ensembl.org")
ensembl = useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org")
ensembl = useDataset("hsapiens_gene_ensembl",mart=ensembl)
attrs=listAttributes(ensembl)
aa=c("refseq_mrna" , "chromosome_name" , "start_position" , "end_position" ,
"entrezgene_id" , "hgnc_symbol", "description" );
#well of those that start with NM we want to query refseq_mrna
#and for those that start withj NR we want to query refseq_ncrna
gnsNM=gns[grep("NM_",gns)]
if(length(gnsNM)>0){
message(paste("Querying for",length(aa),"attributes from biomaRt on",length(gnsNM)," NM numbers"));
bmt_NM=getBM(attributes = aa, filters = "refseq_mrna",values=gns[grep("NM_",gns)], mart = ensembl)
message(paste("We got back",length(bmt_NM$refseq_mrna),"entries, which will be added to the local database."));
if(nrow(bmt_NM)>0) {
for(i in 1:length(bmt_NM$refseq_mrna)){
rightrow=which(res$id==bmt_NM$refseq_mrna[i])[1];
res$ENTREZ[rightrow] <- as.character(bmt_NM$entrezgene_id[i])
res$HGNC[rightrow] <- as.character(bmt_NM$hgnc_symbol[i])
res$LOCUS[rightrow] <- paste(bmt_NM$chromosome_name[i], ":",bmt_NM$start_position[i],"-",bmt_NM$end_position[i],sep="");
res$DESC[rightrow] <- bmt_NM$description[i]
} }
}
gnsNR=gns[grep("NR_",gns)]
if(length(gnsNR)>0){
message(paste("Querying for",length(aa),"attributes from biomaRt on",length(gnsNR)," NR numbers"));
aa[1]="refseq_ncrna";
bmt_NR=getBM(attributes = aa, filters = "refseq_ncrna",values=gns[grep("NR_",gns)], mart = ensembl)
message(paste("We got back",length(bmt_NR$refseq_ncrna),"entries, which will be added to the local database."));
if(nrow(bmt_NR)>0) {
for(i in 1:length(bmt_NR$refseq_ncrna)){
rightrow=which(res$id==bmt_NR$refseq_ncrna[i])[1];
res$ENTREZ[rightrow] <- as.character(bmt_NR$entrezgene[i])
res$HGNC[rightrow] <- as.character(bmt_NR$hgnc_symbol[i])
res$LOCUS[rightrow] <- paste(bmt_NR$chromosome_name[i], ":",bmt_NR$start_position[i],"-",bmt_NR$end_position[i],sep="");
res$DESC[rightrow] <- bmt_NR$description[i]
} }
}
res
}
AnnotateTable <- function(G){
C2T<-G$C2T
infile=DESEQ_RFile(G)
outfile=DESEQ_AFile(G)
res=read.csv(infile,header=TRUE,as.is=TRUE)
#res has got columns like
#id baseMean baseMeanA baseMeanB foldChange log2FoldChange pval padj
# print("dumping rows with PADJ>0.90")
# res=subset(res,padj <= 0.90)
message("dropping rows with raw PVAL>0.990 OR baseMean < 1 OR on excludes list")
res=subset(res,res$pval <= 0.990 & res$baseMean >= 1 & !(res$id %in% G$EXCLUDES))
res$id = sub(" +$", "", res$id)
message(paste("Rows remaining:",length(res$id)));
if(length(unique(res$id)) < 1){
warning("AnnotateTable : no rows significant enough to annotate. Quitting table re-gen.");
return();
}
message("Converting identifiers with local DB");
res<-LocalDB_Lookup(res);
#and the rest
message("Converting identifiers with biomart");
#fix strings?
res$ENTREZ <- as.character(res$ENTREZ)
res$HGNC <- as.character(res$HGNC)
res = tryCatch({
biomaRt_Lookup(res)
}, warning = function(w) {
warning(w)
res
}, error = function(e) {
warning(e) #if bioMart is down, just keep moving.
res
})
res<-subset(res,select=c("id","baseMean","baseMeanA","baseMeanB","foldChange","log2FoldChange","lfcSE","pval","padj","HGNC","ENTREZ"))
columns = c("REFSEQ","baseMean",C2T[1],C2T[2],"(X/Y)","LOG2(X/Y)","lfcSE","PVAL","PADJ","HGNC","ENTREZ");
if(!G$QUIET)Message_Generate(outfile)
write.table(res,file=outfile,row.names=FALSE,sep=",",col.names=columns);
## and a post-filter for shortlisting
write.table(subset(res, res$padj <= G$LIMIT$PADJ &
abs(res$log2FoldChange) >= G$LIMIT$LFC &
res$baseMean >= G$LIMIT$baseMean )
,file=DESEQ_AFFile(G),row.names=FALSE,sep=",",col.names=columns);
}
AnnotateTable.GO <- function(G){
C2T<-G$C2T
infile=DESEQ_AFile(G)
outfile=GSEPD_GOFile(G)
outfile2=GSEPD_GO2File(G)
plotFile=paste(G$Output_Folder,"/GOSEQ.PWF.",C2T[1],".",C2T[2],".pdf",sep="")
data=read.csv(infile,as.is=TRUE,header=TRUE)
data=subset(data,!duplicated(data$ENTREZ))
sdata=subset(data,data$ENTREZ != "NA")
#default hard filtering:
DEG=ifelse( sdata$PADJ < G$LIMIT$PADJ &
sdata$baseMean >= G$LIMIT$baseMean &
abs(sdata$LOG2.X.Y.) >= G$LIMIT$LFC, 1,0)
#but if we're not set to HARD mode, and we got only a handful of genes:
if( !(G$LIMIT$HARD) && sum(DEG)<(0.075*length(DEG))){
ToLevel=round((0.075*nrow(sdata)))
PThresh = sort(sdata$PVAL,decreasing=FALSE)[ToLevel]
DEG=ifelse(sdata$PVAL < PThresh , 1,0)
message(sprintf("Not many genes found differentially expressed, (re)moving the filters to raw p=%f so we can use %d genes with GOSEQ.",PThresh,sum(DEG)))
}
if(sum(DEG)<2){
stop("I can't do pathways or GO terms when so few genes pass significance. Try changing your G$LIMIT parameters.")
}
DEG.up = ifelse(DEG & (sdata$LOG2.X.Y. > 0) , 1,0)
DEG.dn = ifelse(DEG & (sdata$LOG2.X.Y. < 0) , 1,0)
names(DEG) <- sdata$ENTREZ;
names(DEG.up) <- sdata$ENTREZ;
names(DEG.dn) <- sdata$ENTREZ;
pdf(plotFile)
pwf <- nullp(DEG, G$GOSEQ$genome, G$GOSEQ$system)
#rownames(pwf)<-sdata$REFSEQ # we're calling goseq() with gene system 'refGene'
#error.
GO.wall <- goseq(pwf, G$GOSEQ$genome,
id=G$GOSEQ$system, use_genes_without_cat=G$GOSEQ$use_genes_without_cat)
title(main="GOSeq pwf Allgenes")
if(sum(DEG.up)>1) {
pwf <- nullp(DEG.up, G$GOSEQ$genome, G$GOSEQ$system)
#rownames(pwf)<-sdata$REFSEQ # we're calling goseq() with gene system 'refGene'
GO.wall.Up <- goseq(pwf, G$GOSEQ$genome,
id=G$GOSEQ$system, use_genes_without_cat=G$GOSEQ$use_genes_without_cat)
title(main="GOSeq pwf Upreg")
}else{
GO.wall.Up <- GO.wall[FALSE,]
}
if(sum(DEG.dn)>1){
pwf <- nullp(DEG.dn, G$GOSEQ$genome, G$GOSEQ$system)
#rownames(pwf)<-sdata$REFSEQ # we're calling goseq() with gene system 'refGene'
GO.wall.Dn <- goseq(pwf, genome=G$GOSEQ$genome,
id=G$GOSEQ$system, use_genes_without_cat=G$GOSEQ$use_genes_without_cat)
title(main="GOSeq pwf Downregulated genes.")
}else{
GO.wall.Dn <- GO.wall[FALSE,]
}
dev.off();
## here's a good place to send a warning if some whitelisted GO terms are not available
igo <- setdiff(G$Force_GO_Include, "GO:0000000")
Missing_GO_terms <- igo[!(igo%in%GO.wall$category)]
if(length(Missing_GO_terms)>0) # allow one for the default GO:00000
warning(sprintf("Some %d Whitelisted GO terms from Force_GO_Include.csv not found in database: %s",
length(Missing_GO_terms),
paste(Missing_GO_terms, collapse=" ")))
FilterGOSEQ <- function(GOR, PLIM) {
# do a Benjamini-Hochberg correction of p-values.
GOR$over_represented_padj = p.adjust(GOR$over_represented_pvalue, method="BH")
GOR$under_represented_padj = p.adjust(GOR$under_represented_pvalue, method="BH")
#filter reported set to something with a pvalue:
GOR$minp <- apply(cbind(
GOR$over_represented_padj , GOR$under_represented_padj),
1,min)
GOR <- subset(GOR,GOR$minp <= PLIM | GOR$category %in% igo)
gots = select(GO.db,keys=as.character(GOR$category),
columns=c("GOID","TERM","ONTOLOGY","DEFINITION"),
keytype="GOID")
colnames(gots) <- c("GOID","Term","Ontology","Definition")
merge(GOR, gots, by.x="category",by.y="GOID", all.x=TRUE)
}
GO.wall <- FilterGOSEQ(GO.wall , PLIM=G$LIMIT$GO_PVAL*1.5)
GO.wall.Up <- FilterGOSEQ(GO.wall.Up, PLIM=G$LIMIT$GO_PVAL*1.5)
GO.wall.Dn <- FilterGOSEQ(GO.wall.Dn, PLIM=G$LIMIT$GO_PVAL*1.5)
Check_Missing_ForcedGO(GO.wall,igo)
if(!G$QUIET)Message_Generate(outfile)
write.csv(GO.wall ,outfile);
write.csv(GO.wall.Up,GSEPD_GOUpFile(G));
write.csv(GO.wall.Dn,GSEPD_GODownFile(G));
message("Written GO categories, now reverse mapping");
test.cats = c("GO:CC", "GO:BP", "GO:MF")
gene2cat = getgo(rownames(pwf), G$GOSEQ$genome,G$GOSEQ$system, fetch.cats = test.cats)
names(gene2cat) = rownames(pwf)
cat2gene = reverseMap(gene2cat)
gene2cat = reverseMap(cat2gene)
GO.wall <- rbind(GO.wall.Dn, GO.wall.Up, GO.wall) ; rm(GO.wall.Dn, GO.wall.Up)
GO.wall <- subset(GO.wall, !(duplicated(GO.wall$category)))
#this will pick whoever was first, probably wall.Dn, in the case of duplications.
#okay awesome, let's make a table of those genes in GO.wall with over_represented_pvalue and genes in sdata
OM=matrix(nrow=0,ncol=2); colnames(OM)<-c("category","ENTREZ");
AllCategories <- unique( unlist(GO.wall$category) )
for(gocat in AllCategories){
GID=cat2gene[[gocat]] #as vector of strings, but numeric entrez ids
GID=GID[GID %in% sdata$ENTREZ]
if(length(GID)>0)
OM=rbind(OM, cbind(rep(gocat,length(GID)), GID))
}
merged=merge(OM,sdata,by="ENTREZ")
#now making the GO2 file out of just the single GOseq.
merged <- merge(merged, GO.wall, by="category", all.x=TRUE)
#count genes per category:
Repetition <- table(merged$category)
merged$GO_GCount <- Repetition[merged$category];
if(!G$QUIET)Message_Generate(outfile2)
keepCol=c("REFSEQ","HGNC",G$C2T[1],G$C2T[2], "LOG2.X.Y.", "PVAL",
"PADJ", "category", "over_represented_pvalue", "under_represented_pvalue",
"over_represented_padj", "under_represented_padj",
"numDEInCat", "numInCat", "Term", "Ontology")
#keepCol[ !(keepCol %in% colnames(merged))]
merged=subset(merged,merged$numInCat<800 | merged$category %in% igo, select=keepCol)
merged[,c(G$C2T[1],G$C2T[2], "LOG2.X.Y.")] = signif( merged[,c(G$C2T[1],G$C2T[2], "LOG2.X.Y.")], digits=3 )
write.csv(merged,outfile2)
}
Check_Missing_ForcedGO <- function(go2, include_list){
m <- include_list[ !(include_list%in% go2$category) ]
if(length(m)>0)
warning(sprintf("Trying to include some GO terms but they arent found by GO.db : %s",paste(m,collapse=" ")))
}
GSEPD_Export_DESeq<-function(G){
colData<-G$sampleMeta
rownames(colData)<-G$sampleMeta$Sample
colData$Condition <- as.factor(colData$Condition)
colData$Condition <- relevel(colData$Condition, G$Conditions[1])
dds <- DESeqDataSetFromMatrix(countData = G$finalCounts,
colData = colData,
design = ~ Condition)
return(dds)
}
GSEPD_CheckCounts <- function(GSEPD) {
if(is.null(GSEPD$normCounts)) {
#rld <- rlog()
vsd <- DESeq2::varianceStabilizingTransformation(GSEPD_Export_DESeq(GSEPD),
blind=GSEPD$vstBlind)
#rlogMat <- assay(rld)
vstMat <- assay(vsd)
GSEPD$normCounts <- vstMat
}
GSEPD
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.