Nothing
# Generates the random lists when controlling for transcript length and GC content
prepare.genesize.control.network <- function(humanGenelist,human.bg,mouseGenes,numBOOT = 10000){
### PREPARE TO QUERY BIOMART
combined_human_genes = unique(c(humanGenelist,human.bg))
listMarts(host="www.ensembl.org")
human <- useMart(host="www.ensembl.org", "ENSEMBL_MART_ENSEMBL", dataset="hsapiens_gene_ensembl")
### FIRST GET ALL ENSEMBL GENE IDS FOR THE HUMAN GENES
hum_ens = getBM(attributes=c("hgnc_symbol","ensembl_gene_id"), filters="hgnc_symbol", values= combined_human_genes, mart= human)
### GET THE TRANSCRIPT LENGTHS AND GC CONTENT FROM BIOMART
all_lengths = getBM(attributes=c("transcript_length","percentage_gc_content","ensembl_gene_id"), filters="ensembl_gene_id", values= hum_ens$ensembl_gene_id, mart= human)
all_lengths = all_lengths[!is.na(all_lengths$transcript_length),]
all_lens = merge(all_lengths,hum_ens,by="ensembl_gene_id")
# TAKE THE MEAN TRANSCRIPT LENGTH & GC-CONTENT FOR EACH GENE
transcript_lengths = aggregate(all_lens$transcript_length,by=list(all_lens$hgnc_symbol),FUN=mean)
percentage_gc_content = aggregate(all_lens$percentage_gc_content,by=list(all_lens$hgnc_symbol),FUN=mean)
data_byGene = cbind(transcript_lengths, percentage_gc_content[,2])
colnames(data_byGene) = c("HGNC.symbol","transcript_lengths","percentage_gc_content")
data_byGene = data_byGene[data_byGene$HGNC.symbol!="",]
### DROP ANY HUMAN GENES WITHOUT HOMOLOGOUS MOUSE GENES
mouse_to_human_homologs=NULL # <-- THIS LINE ONLY INCLUDED TO FOOL devtools::check()
data("mouse_to_human_homologs", envir = environment())
m2h = unique(mouse_to_human_homologs[,c("HGNC.symbol","MGI.symbol")])
data_byGene2 = data_byGene[data_byGene$HGNC.symbol %in% m2h$HGNC.symbol,]
### MERGE THE LENGTH/GC DATA WITH MOUSE ORTHOLOG DATA
data_byGene3 = merge(data_byGene2,m2h,by="HGNC.symbol")
data_byGene2 = data_byGene3
# GET QUANTILES FOR TRANSCRIPT LENGTH + GC CONTENT
tl_quants = quantile(data_byGene2$transcript_length, probs = seq(0.1, 1, 0.1))
gc_quants = quantile(data_byGene2$percentage_gc_content, probs = seq(0.1, 1, 0.1))
# ASSIGN EACH GENE TO A QUANTILE QUADRANT
quadrant = matrix(0,nrow=dim(data_byGene2)[1],ncol=2)
colnames(quadrant) = c("TL","GC")
for(i in 1:dim(data_byGene2)[1]){
quadrant[i,1] = which(data_byGene2[i,2]<tl_quants)[1]
quadrant[i,2] = which(data_byGene2[i,3]<gc_quants)[1]
}
data_byGene2$uniq_quad = sprintf("%s_%s",quadrant[,1],quadrant[,2])
uq = data_byGene2$uniq_quad
data_byGene2 = data_byGene2[uq!="2_NA" & uq!="NA_2" & uq!="3_NA",]
### FOR EACH 'DISEASE LIST' GENERATE A SET OF 10000 QUADRANT MATCHED GENE LISTS
# - Get new set of mouse hitGenes, containing only those within data_byGene2
hitGenes_NEW = data_byGene2[data_byGene2$HGNC.symbol %in% humanGenelist,"MGI.symbol"]
list_genes1d = humanGenelist[humanGenelist %in% data_byGene$HGNC.symbol]
# GET ALL MOUSE GENES IN EACH QUADRANT
quad_genes = list()
for(uq in unique(data_byGene2$uniq_quad)){
quad_genes[[uq]] = unique(data_byGene2[data_byGene2$uniq_quad==uq,"MGI.symbol"])
}
# GET MATRIX WITH 10000 RANDOMLY SAMPLED GENES FROM THE SAME QUADRANT AS THE LIST GENE
list_network = matrix("",nrow=numBOOT,ncol=length(hitGenes_NEW))
count=0
for(gene in hitGenes_NEW){
count=count+1
this_gene_quad = data_byGene2[data_byGene2$MGI.symbol==gene,"uniq_quad"][1]
candidates = as.vector(unlist(quad_genes[this_gene_quad]))
list_network[,count] = sample(candidates,numBOOT,replace=TRUE)
}
print("CONTROLLED BOOTSTRAPPING NETWORK GENERATED")
return(list(hitGenes=hitGenes_NEW,list_network=list_network))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.