R/set-based-enrichment-test.r
In mulea: Enrichment Analysis using Multiple Ontologies and False Discovery Rate

##########################################################
# Function for FDR corrected hypergeometric enrichment test
##########################################################
#' @import parallel
#' @import mulea

initialize_result_df <- function(pool, select, DB) {
    DB_names <- names(DB)
    number_of_DB_categories <- length(DB)  # Number of DB categories
    size_pool <- length(pool)
    size_select <- length(select)
    # init empty data frame with the row-size of DB 
    # (number of DB categories) to store the result
    result_df1 <- data.frame(DB_names = DB_names, DB_in_select = as.integer(NA),
        DB_in_pool = as.integer(NA), Genes_in_DB = as.integer(NA),
        P_val = as.double(NA), R_obs = as.integer(NA))
    # for every DB entity in the DB list
    for (i in seq_len(number_of_DB_categories)) {
        # create a vector of genes connected to the i-th DB category
        current_DB_category <- DB[[i]]
        # hypergometric test
        # q: number of common genes between a DBterm and select
        result_df1$DB_in_select[i] <- length(intersect(select,
            current_DB_category))
        # m: number of common genes between DBterm and BackGround
        result_df1$DB_in_pool[i] <- length(intersect(pool,
            current_DB_category))
        result_df1$Genes_in_DB[i] <- length(current_DB_category)
    }
    # compute the p values (raw p values without any corrections)
    # n: number of non-pool genes among DB
    # k: number of genes in select
    # phyper(q, m, n, k, lower.tail = TRUE, log.p = FALSE)
    result_df1$P_val <- 1-stats::phyper(result_df1$DB_in_select-1, 
        result_df1$DB_in_pool,size_pool-result_df1$DB_in_pool,size_select) 
    # can change the digits, this is important for 
    # the precision of '0' is R
    P_val_round <- round(result_df1$P_val, digits = 15)
    result_df1$R_obs <- rank(P_val_round, na.last = TRUE, ties.method ="max")
    result_df1$P_adj_Bonf <- stats::p.adjust(result_df1$P_val, 
        method = "bonferroni")
    result_df1$P_adj_BH <- stats::p.adjust(result_df1$P_val, method = "BH")
    return(result_df1)
}

do_the_simulation <- function(list_of_all_genes, pool, select,DB, steps,
    nthread, random_seed = 0) {if(nthread==1) { # simple call op C++ part of the code
        simulation_result_tbl <- tryCatch(enrichment_test_simulation(DB, 
        list_of_all_genes, pool, length(select), steps, random_seed), error = warning)  
    } else if (nthread>1 && round(nthread) == nthread) {# parallel call of C++ 
        vv <- floor(steps / nthread)
        cc <- steps %% nthread
        steps_per_thread <- c(rep(vv, nthread-cc), rep(vv+1 , cc))
        
        seeds_per_thread <- NULL
        if (random_seed == 0) {
          seeds_per_thread <- sample(2^15, nthread)  
        } else {
          seeds_per_thread <- rep(random_seed, nthread)  
        }
        
        stopifnot(sum(steps_per_thread)==steps)
        rm(cc, vv)
        requireNamespace("parallel")
        if (interactive()) {cl_outfile <- paste(tempdir(), 'paralell.log', 
            sep = "\\")
            cl <- makeCluster(spec=nthread, type="PSOCK", outfile=cl_outfile)
        } else { cl <- makeCluster(spec = nthread, type = "PSOCK")}
        clusterExport(cl, c("DB","list_of_all_genes", "pool", "select",
            "seeds_per_thread", "steps_per_thread"), envir = environment())
        clusterEvalQ(cl, library("mulea"))
        result_of_paralel <- clusterApplyLB(cl = cl, seq_len(nthread), 
            function(idx) {simulation_result_tbl <- tryCatch(
                enrichment_test_simulation(DB, list_of_all_genes, pool, 
                    length(select), steps_per_thread[[idx]], 
                    seeds_per_thread[[idx]]), error = warning)  # RCPP
                return(simulation_result_tbl)})
        stopCluster(cl)
        simulation_result_tbl <- do.call(rbind, result_of_paralel)
    } else {  stop("nthred parameter must be a positive integer number")  }
    names(simulation_result_tbl) <- c("DB_in_pool", "intersect.size",
        "multiplicity") # set the column names
    simulation_result_tbl$p <- 1-stats::phyper(
        simulation_result_tbl$intersect.size-1,
        simulation_result_tbl$DB_in_pool,
        length(pool)-simulation_result_tbl$DB_in_pool, length(select))
    # test consistency
    stopifnot(steps*length(DB) == sum(simulation_result_tbl$multiplicity))
    if(! all(is.finite(simulation_result_tbl$p)) ) {
        stopifnot(all(is.finite(simulation_result_tbl$p)), "ERROR_002")  }
    # group by the equal p-values and summarise the multiplicities 
    simulation_result_tbl <- simulation_result_tbl[,c("multiplicity", "p")]
    simulation_result_tbl$p <- round(simulation_result_tbl$p, digits = 15)
    simulation_result_tbl <- simulation_result_tbl[order(
        simulation_result_tbl$p),]
    list11 <- split(simulation_result_tbl, simulation_result_tbl$p)
    for(i in seq(list11)) { if(nrow(list11[[i]])>1) { list11[[i]] <- data.frame(
        multiplicity = sum(list11[[i]]$multiplicity), p = list11[[i]]$p[1])}}
    simulation_result_tbl <- do.call(rbind, list11)
    rownames(simulation_result_tbl) <- NULL
    return(simulation_result_tbl)  }


set.based.enrichment.test <- function(steps, pool, select, DB, nthread=1, 
    debug=FALSE, random_seed=0) {
    ## convert the database and the select and pollt to sorted integer lists
    list_of_all_genes <- unique(c(unlist(DB),pool))
    select <- intersect(select, pool)
    number_of_DB_categories <- length(DB)  # Number of DB categories
    result_df1 <- initialize_result_df(pool,select, DB)
    P_val_round <- round(result_df1$P_val, digits = 15)
    ############# simualtion ############ 
    if(length(select)==0 ) {result_df1$R_exp <- NaN
        result_df1$FDR <- NaN
        return(result_df1)}
    names(DB) <- NULL
    simulation_result_tbl <- do_the_simulation(list_of_all_genes, pool, 
        select,DB, steps, nthread, random_seed)
    ##############################################
    # some preparation to help the binary search
    if(simulation_result_tbl$p[1]!=0) {
        simulation_result_tbl <- rbind(data.frame(multiplicity = 0, p = 0), 
            simulation_result_tbl)}
    if(simulation_result_tbl$p[nrow(simulation_result_tbl)]!=1) {
        simulation_result_tbl <- rbind(simulation_result_tbl, 
            data.frame(multiplicity = 0, p = 1))}
    simulation_result_tbl$cum_sum_multiplicity <- cumsum(
        simulation_result_tbl$multiplicity)
    # init an empty vector. I  will store here the result
    R_exp <- integer(number_of_DB_categories)
    # this id the greatest/last in cumsum.multiplicity
    NN <- simulation_result_tbl$cum_sum_multiplicity[nrow(
        simulation_result_tbl)] 
    for (i in seq_len(number_of_DB_categories)) {if(P_val_round[i]>=1) {
        R_exp[i] <- NN  # it makes things much faster 
        } else {target <- P_val_round[i]
            # binary search
            a1 <- 1
            a2 <- nrow(simulation_result_tbl) #-cnt.of.ones
            while(a1+1 < a2) {a3 <- floor((a1+a2)/2)
                current <- simulation_result_tbl$p[a3]
                if( current <= target) {a1 <- a3} else { a2<-a3 }}
            # at the end of the binary search the 
            # simulation_result_tbl$cum_sum_multiplicity[[a1]] 
            # is the count of  p where p<=target  and a2=a1+1
            R_exp[i] <- simulation_result_tbl$cum_sum_multiplicity[[a1]]}}
    result_df1$R_exp <- R_exp/steps
    result_df1$FDR <- result_df1$R_exp/result_df1$R_obs
    return(result_df1)
}

Any scripts or data that you put into this service are public.

mulea documentation built on Sept. 30, 2024, 9:44 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

mulea
Enrichment Analysis using Multiple Ontologies and False Discovery Rate

R/set-based-enrichment-test.r
In mulea: Enrichment Analysis using Multiple Ontologies and False Discovery Rate

Defines functions set.based.enrichment.test do_the_simulation initialize_result_df

Try the mulea package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

mulea Enrichment Analysis using Multiple Ontologies and False Discovery Rate

R/set-based-enrichment-test.r In mulea: Enrichment Analysis using Multiple Ontologies and False Discovery Rate

Defines functions set.based.enrichment.test do_the_simulation initialize_result_df

Try the mulea package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

mulea
Enrichment Analysis using Multiple Ontologies and False Discovery Rate

R/set-based-enrichment-test.r
In mulea: Enrichment Analysis using Multiple Ontologies and False Discovery Rate