Nothing
#' Generate bootstrap plots
#'
#' \code{generate.bootstrap.plots} takes a genelist and a single cell type transcriptome dataset
#' and generates plots which show how the expression of the genes in the list compares to those
#' in randomly generated gene lists
#'
#' @param sct_data List generated using \code{\link{read_celltype_data}}
#' @param mouse.hits Array of MGI gene symbols containing the target gene list.
#' @param mouse.bg Array of MGI gene symbols containing the background gene list.
#' @param reps Number of random gene lists to generate (default=100 but should be over 10000 for publication quality results)
#' @param sub a logical indicating whether to analyse sub-cell type annotations (TRUE) or cell-type annotations (FALSE). Default is FALSE.
#' @param full_results The full output of \code{\link{bootstrap.enrichment.test}} for the same genelist
#' @param listFileName String used as the root for files saved using this function
#' @return Saves a set of pdf files containing graphs. These will be saved with the filename adjusted using the
#' value of listFileName. The files are saved into the 'BootstrapPlot' folder. The files start with one of the following:
#' \itemize{
#' \item \code{qqplot_noText}: sorts the gene list according to how enriched it is in the relevant celltype. Plots the value in the target list against the mean value in the bootstrapped lists.
#' \item \code{qqplot_wtGSym}: as above but labels the gene symbols for the highest expressed genes.
#' \item \code{bootDists}: rather than just showing the mean of the bootstrapped lists, a boxplot shows the distribution of values
#' \item \code{bootDists_LOG}: shows the bootstrapped distributions with the y-axis shown on a log scale
#' }
#'
#'
#' @examples
#' # Load the single cell data
#' data(celltype_data)
#'
#' # Set the parameters for the analysis
#' reps=100 # <- Use 100 bootstrap lists so it runs quickly, for publishable analysis use >10000
#' subCellStatus=0 # <- Use subcell level annotations (i.e. Interneuron type 3)
#' if(subCellStatus==1){subCellStatus=TRUE;cellTag="SubCells"}
#' if(subCellStatus==0){subCellStatus=FALSE;cellTag="FullCells"}
#'
#' # Load the gene list and get human orthologs
#' data("example_genelist")
#' data("mouse_to_human_homologs")
#' m2h = unique(mouse_to_human_homologs[,c("HGNC.symbol","MGI.symbol")])
#' mouse.hits = unique(m2h[m2h$HGNC.symbol %in% example_genelist,"MGI.symbol"])
#' mouse.bg = unique(setdiff(m2h$MGI.symbol,mouse.hits))
#'
#' # Bootstrap significance testing, without controlling for transcript length and GC content
#' full_results = bootstrap.enrichment.test(sct_data=celltype_data,mouse.hits=mouse.hits,
#' mouse.bg=mouse.bg,reps=reps,sub=subCellStatus)
#'
#' generate.bootstrap.plots(sct_data=celltype_data,mouse.hits=mouse.hits, mouse.bg=mouse.bg,
#' reps=reps,sub=FALSE,full_results=full_results,listFileName="Example")
#' @export
#' @import ggplot2
#' @importFrom reshape2 melt
# @import plyr
generate.bootstrap.plots <- function(sct_data,mouse.hits, mouse.bg,reps,sub=FALSE,full_results=NA,listFileName=""){
# Check the arguments
if(length(full_results)!=3){stop("ERROR: full_results is not valid output from the bootstrap.enrichment.test function")}
if(length(mouse.hits)>length(mouse.bg)){stop("ERROR: target list must be longer than background list")}
results = full_results$results
# Drop genes lacking expression data
mouse.hits = mouse.hits[mouse.hits %in% rownames(sct_data$cell_dists)]
mouse.bg = mouse.bg[mouse.bg %in% rownames(sct_data$cell_dists)]
combinedGenes = c(mouse.hits, mouse.bg)
# Get expression data of bootstrapped genes
signif_res = rownames(results)[results$p<0.05]
nReps = 1000
exp_mats = list()
for(cc in signif_res){
exp_mats[[cc]] = matrix(0,nrow=nReps,ncol=length(mouse.hits))
rownames(exp_mats[[cc]]) = sprintf("Rep%s",1:nReps)
}
for(s in 1:nReps){
bootstrap_set = sample(combinedGenes,length(mouse.hits))
ValidGenes = rownames(sct_data$subcell_dists)[rownames(sct_data$subcell_dists) %in% bootstrap_set]
if(sub==TRUE){
expD = sct_data$subcell_dists[ValidGenes,]
}else{
expD = sct_data$cell_dists[ValidGenes,]
}
for(cc in signif_res){
exp_mats[[cc]][s,] = sort(expD[,cc])
}
}
# Get expression levels of the hit genes
if(sub==TRUE){
hit.exp = sct_data$subcell_dists[mouse.hits,] #subcell.list.exp(mouse.hits)
}else{
hit.exp = sct_data$cell_dists[mouse.hits,] #cell.list.exp(mouse.hits)
}
#print(hit.exp)
graph_theme = theme_bw(base_size = 12, base_family = "Helvetica") +
theme(panel.grid.major = element_line(size = .5, color = "grey"),
axis.line = element_line(size=.7, color = "black"),legend.position = c(0.75, 0.7), text = element_text(size=14),
axis.title.x = element_text(vjust = -0.35), axis.title.y = element_text(vjust = 0.6)) + theme(legend.title=element_blank())
if (!file.exists("BootstrapPlots")){
dir.create(file.path(getwd(), "BootstrapPlots"))
}
# Plot the QQ plots
for(cc in signif_res){
mean_boot_exp = apply(exp_mats[[cc]],2,mean)
hit_exp = sort(hit.exp[,cc])
hit_exp_names = rownames(hit.exp)[order(hit.exp[,cc])]#names(hit_exp)
dat = data.frame(boot=mean_boot_exp,hit=hit_exp,Gnames=hit_exp_names)
dat$hit = dat$hit*100
dat$boot = dat$boot*100
maxHit = max(dat$hit)
maxX = max(dat$boot)+0.1*max(dat$boot)
if(sub==TRUE){dat$hit=dat$hit/10;dat$boot=dat$boot/10}
# Plot several variants of the graph
#basic_graph = ggplot(dat,aes(x=boot,y=hit))+geom_point(size=1)+xlab("Mean Bootstrap Expression")+ylab("Expression in cell type (%)\n") + graph_theme +
# geom_abline(intercept = 0, slope = 1, colour = "red")
basic_graph = ggplot(dat,aes_string(x="boot",y="hit"))+geom_point(size=1)+xlab("Mean Bootstrap Expression")+ylab("Expression in cell type (%)\n") + graph_theme +
geom_abline(intercept = 0, slope = 1, colour = "red")
# Plot without text
pdf(sprintf("BootstrapPlots/qqplot_noText____%s____%s.pdf",listFileName,cc),width=3.5,height=3.5)
print(basic_graph)
dev.off()
dat$symLab = ifelse(dat$hit>25,sprintf(" %s", dat$Gnames),'')
#basic_graph = ggplot(dat,aes(x=boot,y=hit))+geom_point(size=2)+xlab("Mean Bootstrap Expression")+ylab("Expression in cell type (%)\n") + graph_theme +
basic_graph = ggplot(dat,aes_string(x="boot",y="hit"))+geom_point(size=2)+xlab("Mean Bootstrap Expression")+ylab("Expression in cell type (%)\n") + graph_theme +
geom_abline(intercept = 0, slope = 1, colour = "red")
# Plot with gene names
pdf(sprintf("BootstrapPlots/qqplot_wtGSym____%s____%s.pdf",listFileName,cc),width=3.5,height=3.5)
print(basic_graph +
geom_text(aes_string(label="symLab"),hjust=0,vjust=0,size=3) + xlim(c(0,maxX))
)
dev.off()
# Plot with bootstrap distribution
melt_boot = melt(exp_mats[[cc]])
colnames(melt_boot) = c("Rep","Pos","Exp")
actVals = data.frame(pos=as.factor(1:length(hit_exp)),vals=hit_exp)
if(sub==TRUE){melt_boot$Exp=melt_boot$Exp/10; actVals$vals=actVals$vals/10}
pdf(sprintf("BootstrapPlots/bootDists____%s____%s.pdf",listFileName,cc),width=3.5,height=3.5)
melt_boot$Pos = as.factor(melt_boot$Pos)
#print(ggplot(melt_boot)+geom_boxplot(aes(x=as.factor(Pos),y=Exp),outlier.size=0)+
# geom_point(aes(x=pos,y=vals),col="red",data=actVals)+
print(ggplot(melt_boot)+geom_boxplot(aes_string(x="Pos",y="Exp"),outlier.size=0)+
geom_point(aes_string(x="pos",y="vals"),col="red",data=actVals)+
ylab("Expression in cell type (%)\n")+
xlab("Least specific --> Most specific") + scale_x_discrete(breaks=NULL)+graph_theme)
dev.off()
# Plot with LOG bootstrap distribution
# - First get the ordered gene names
rownames(dat)=dat$Gnames
datOrdered = data.frame(GSym=rownames(dat),Pos=1:dim(dat)[1])
# - Arrange the data frame for plotting
melt_boot = melt(exp_mats[[cc]])
colnames(melt_boot) = c("Rep","Pos","Exp")
melt_boot$Exp = melt_boot$Exp*100
melt_boot = merge(melt_boot,datOrdered,by="Pos")
melt_boot$GSym = factor(as.character(melt_boot$GSym),levels=as.character(datOrdered$GSym))
# - Prepare the values of the list genes to be plotted as red dots
actVals = data.frame(Pos=as.factor(1:length(hit_exp)),vals=hit_exp*100)
actVals = merge(actVals,datOrdered,by="Pos")
actVals$GSym = factor(as.character(actVals$GSym),levels=as.character(datOrdered$GSym))
# - Determine whether changes are significant
p = rep(1,max(melt_boot$Pos))
for(i in 1:max(melt_boot$Pos)){
p[i] = sum(actVals[actVals$Pos==i,"vals"]<melt_boot[melt_boot$Pos==i,"Exp"])/length(melt_boot[melt_boot$Pos==i,"Exp"])
}
ast = rep("*",max(melt_boot$Pos))
ast[p>0.05] = ""
actVals = cbind(actVals[order(actVals$Pos),],ast)
# - Plot the graph!
if(sub==TRUE){melt_boot$Exp=melt_boot$Exp/10; actVals$vals=actVals$vals/10}
wd = 1+length(unique(melt_boot[,4]))*0.175
pdf(sprintf("BootstrapPlots/bootDists_LOG____%s____%s.pdf",listFileName,cc),width=wd,height=4)
print(ggplot(melt_boot)+geom_boxplot(aes_string(x="GSym",y="Exp"),outlier.size=0)+graph_theme+
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
geom_point(aes_string(x="GSym",y="vals"),col="red",data=actVals)+
geom_text(aes_string(x="GSym",y="vals",label="ast"),colour="black",col="black",data=actVals)+
ylab("Expression in cell type (%)\n")+
xlab("Least specific --> Most specific")+scale_y_log10()
)
dev.off()
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.