R/prepare.genesize.control.network.R
In EWCE: Expression Weighted Celltype Enrichment

# Generates the random lists when controlling for transcript length and GC content
prepare.genesize.control.network <- function(humanGenelist,human.bg,mouseGenes,numBOOT = 10000){
    ### PREPARE TO QUERY BIOMART
    combined_human_genes = unique(c(humanGenelist,human.bg))
    listMarts(host="www.ensembl.org")
    human <- useMart(host="www.ensembl.org", "ENSEMBL_MART_ENSEMBL", dataset="hsapiens_gene_ensembl")

    ### FIRST GET ALL ENSEMBL GENE IDS FOR THE HUMAN GENES
    hum_ens = getBM(attributes=c("hgnc_symbol","ensembl_gene_id"), filters="hgnc_symbol", values= combined_human_genes, mart= human)

    ### GET THE TRANSCRIPT LENGTHS AND GC CONTENT FROM BIOMART
    all_lengths = getBM(attributes=c("transcript_length","percentage_gc_content","ensembl_gene_id"), filters="ensembl_gene_id", values= hum_ens$ensembl_gene_id, mart= human)
    all_lengths = all_lengths[!is.na(all_lengths$transcript_length),]
    all_lens = merge(all_lengths,hum_ens,by="ensembl_gene_id")

    # TAKE THE MEAN TRANSCRIPT LENGTH & GC-CONTENT FOR EACH GENE
    transcript_lengths = aggregate(all_lens$transcript_length,by=list(all_lens$hgnc_symbol),FUN=mean)
    percentage_gc_content = aggregate(all_lens$percentage_gc_content,by=list(all_lens$hgnc_symbol),FUN=mean)
    data_byGene = cbind(transcript_lengths, percentage_gc_content[,2])
    colnames(data_byGene) = c("HGNC.symbol","transcript_lengths","percentage_gc_content")
    data_byGene = data_byGene[data_byGene$HGNC.symbol!="",]

    ### DROP ANY HUMAN GENES WITHOUT HOMOLOGOUS MOUSE GENES
    mouse_to_human_homologs=NULL # <-- THIS LINE ONLY INCLUDED TO FOOL devtools::check()
    data("mouse_to_human_homologs", envir = environment())
    m2h = unique(mouse_to_human_homologs[,c("HGNC.symbol","MGI.symbol")])
    data_byGene2 = data_byGene[data_byGene$HGNC.symbol %in% m2h$HGNC.symbol,]

    ### MERGE THE LENGTH/GC DATA WITH MOUSE ORTHOLOG DATA
    data_byGene3 = merge(data_byGene2,m2h,by="HGNC.symbol")
    data_byGene2 = data_byGene3

    # GET QUANTILES FOR TRANSCRIPT LENGTH + GC CONTENT
    tl_quants = quantile(data_byGene2$transcript_length, probs = seq(0.1, 1, 0.1))
    gc_quants = quantile(data_byGene2$percentage_gc_content, probs = seq(0.1, 1, 0.1))

    # ASSIGN EACH GENE TO A QUANTILE QUADRANT
    quadrant = matrix(0,nrow=dim(data_byGene2)[1],ncol=2)
    colnames(quadrant) = c("TL","GC")
    for(i in 1:dim(data_byGene2)[1]){
        quadrant[i,1] = which(data_byGene2[i,2]<tl_quants)[1]
        quadrant[i,2] = which(data_byGene2[i,3]<gc_quants)[1]
    }
    data_byGene2$uniq_quad = sprintf("%s_%s",quadrant[,1],quadrant[,2])
    uq = data_byGene2$uniq_quad
    data_byGene2 = data_byGene2[uq!="2_NA" & uq!="NA_2" & uq!="3_NA",]

    ### FOR EACH 'DISEASE LIST' GENERATE A SET OF 10000 QUADRANT MATCHED GENE LISTS
    # - Get new set of mouse hitGenes, containing only those within data_byGene2
    hitGenes_NEW = data_byGene2[data_byGene2$HGNC.symbol %in% humanGenelist,"MGI.symbol"]
    list_genes1d = humanGenelist[humanGenelist %in% data_byGene$HGNC.symbol]

    # GET ALL MOUSE GENES IN EACH QUADRANT
    quad_genes = list()
    for(uq in unique(data_byGene2$uniq_quad)){
        quad_genes[[uq]] = unique(data_byGene2[data_byGene2$uniq_quad==uq,"MGI.symbol"])
    }

    # GET MATRIX WITH 10000 RANDOMLY SAMPLED GENES FROM THE SAME QUADRANT AS THE LIST GENE
    list_network = matrix("",nrow=numBOOT,ncol=length(hitGenes_NEW))
    count=0
    for(gene in hitGenes_NEW){
        count=count+1
        this_gene_quad = data_byGene2[data_byGene2$MGI.symbol==gene,"uniq_quad"][1]
        candidates = as.vector(unlist(quad_genes[this_gene_quad]))
        list_network[,count] = sample(candidates,numBOOT,replace=TRUE)
    }
    print("CONTROLLED BOOTSTRAPPING NETWORK GENERATED")
    return(list(hitGenes=hitGenes_NEW,list_network=list_network))
}