R/sigora.R

Defines functions sigora .getWeightfunc

Documented in sigora

.getWeightfunc <- function() {
  weightfunc <- list()
  weightfunc$invhm <- function(a, b) {
    0.5 * ((1 / a) + (1 / b))
  }
  weightfunc$reciprod <- function(a, b) {
    1 / (a * b)
  }
  weightfunc$jac <- function(a, b) {
    1 / (a + b - 1)
  }
  weightfunc$topov <- function(a, b) {
    1 / (apply(as.matrix(cbind(a, b)), FUN = min, MARGIN = 1))
  }
  weightfunc$cosine <- function(a, b) {
    1 / (sqrt(a * b))
  }
  weightfunc$noweights <- function(a, b) {
    rep(1, length(a))
  }
  weightfunc$justPUGs <- function(a, b) {
    rep(0, length(a))
  }
  return(weightfunc)
}

#' Sigora's main function.
#'
#' This function determines which Signatures (GPS) from a collection of GPS
#' data (\code{GPSrepo} argument) for the specified pathway repository are
#' present in the specified list of genes of interest (\code{queryList}
#' argument)). It then uses the distribution function of hypergeometric
#' probabilities to identify the pathways whose GPS are over-represented among
#' the present GPS and saves the results to the file specified in the
#' \code{saveFile} argument.
#'
#' @export
#'
#' @param GPSrepo An object created by \code{makeGPS} or one of the precompiled
#' GPS data collections that are provided with this package (currently for KEGG
#' and Reactome). e.g. \code{reaH} for human Reactome GPS, \code{kegH} for
#' human KEGG GPS, and \code{reaM} and \code{kegM} for corresponding mouse GPS.
#' See the examples section for creating and using your own GPS.
#' @param level In hierarchical repositories (e.g. Reactome) number of levels
#' to consider. Recommended value for KEGG: 2, for Reactome: 4.
#' @param markers Whether to take single genes that are uniquely associated
#' with only one pathway into account (i.e. should pathway unique genes/PUGs be
#' considered GPS?). Recommended value: TRUE (1).
#' @param queryList A user specified list of genes of interest ('query list'),
#' as a vector of ENSEMBL/ ENTREZ IDs or gene symbols (HGNC/MGI).
#' @param saveFile If provided, the results are saved here as a tab delimited
#' File (including , for each pathway, a list of genes ordered by their
#' contribution to the statistical significance of the pathway).
#' @param weighting.method The weighting method or GPS. The default weighting
#' scheme for the GPS is the reciproc of the harmonic mean of the degrees of
#' the two component genes of a GPS. A wide range of alternative weighting
#' schemes are pre-implemented (see below). Additional user defined weighting
#' schemes are also supported. Currently, the following alternatives are
#' pre-implemented:\cr
#' 'noweights','cosine','topov','reciprod','jac','justPUGs'and'invhm'.\cr
#' Additional user defined weighting schemes are also supported (see section
#' examples).\cr 'noweights': assigns a constant of 1 to all GPS.\cr 'cosine':
#' all GPS are weighted by the cosine of the degrees of their consituent
#' genes.\cr 'topov': all GPS are weighted by topological overlap of their
#' consituent genes.\cr 'reciprod': all GPS are weighted by reciproc of product
#' of the number of pathway annotations of their consituent genes.\cr 'jac':all
#' GPS are weighted by the jaccard similarity of the pathway annotations
#' consituent genes.\cr 'justPUGs': Analysis is performed using PUGs only.\cr
#' 'invhm': all GPS are weighted by the reciproc of the harmonic mean of the
#' degrees of their consituent genes (default).\cr
#' @param idmap A dataframe for converting between different gene-identifier
#' types (e.g. ENSEMBL, ENTREZ and HGNC-Symbols of genes). Most users do not
#' need to set this argument, as there is a built-in conversion table.
#' @return \item{ summary_results}{A dataframe listing the analysis results.}
#' \item{detailed_results }{A dataframe describing the detailed evidence
#' (present Gene-Pair Signatures) for each pathway.}
#' @seealso \code{\link{sigora-package}} , \code{\link{makeGPS}}
#' @references Foroushani AB, Brinkman FS and Lynn DJ
#' (2013).\dQuote{Pathway-GPS and SIGORA: identifying relevant pathways based
#' on the over-representation of their gene-pair signatures.}\emph{PeerJ},
#' \bold{1}
#' @keywords functions
#' @examples
#'
#' ##query list
#' ils <- grep("^IL",load_data('idmap')[["Symbol"]],value=TRUE)
#' ## using precompiled GPS repositories:
#' sigRes.ilreact <- sigora(queryList=ils,GPSrepo=load_data('reaH'),level=4)
#' \donttest{
#' sigRes.ilkeg <- sigora(queryList=ils,GPSrepo=load_data('kegH'),level=2)
#' ## user created GPS repository:
#' nciH<-makeGPS(pathwayTable=load_data('nciTable'))
#' sigRes.ilnci<-sigora(queryList=ils,GPSrepo=nciH,level=2)
#' ## user defined weighting schemes :
#' myfunc<-function(a,b){1/log(a+b)}
#' sigora(queryList=ils,GPSrepo=nciH,level=2, weighting.method = myfunc)
#' }
sigora <-
  function(GPSrepo,
           level,
           markers = FALSE,
           queryList = NULL,
           saveFile = NULL,
           weighting.method = "invhm",
           idmap = load_data('idmap')) {
    ##` GPSrepo:Output of makeGPS
    ##` queryList: query list
    ##`
    if ("character" %in% class(weighting.method)) {
      WEIGHTMETHODS <- names(.getWeightfunc())
      weighting.method <- pmatch(weighting.method, WEIGHTMETHODS)
      weightfunc <- .getWeightfunc()[[weighting.method]]
    } else if ("function" %in% class(weighting.method)) {
      weightfunc <- weighting.method
    } else {
      stop("invalid parameter weighting.method")
    }

    hh <- NULL
    jj <- grep("^L", names(GPSrepo), value = TRUE)
    weights <- 0
    ## mapping
    if (length(intersect(queryList, GPSrepo$origRepo[[2]])) == 0) {
      t1 <-
        which.max(vapply(idmap, function(x) {
          length(intersect(x, GPSrepo$origRepo[[2]]))
        }, FUN.VALUE = integer(1)))
      t2 <-
        which.max(vapply(idmap, function(x) {
          length(intersect(x, queryList))
        }, FUN.VALUE = integer(1)))
      queryList <- idmap[which(idmap[, t2] %in% queryList), t1]
      print(paste(
        "Mapped identifiers from" ,
        colnames(idmap)[t2],
        " to ",
        colnames(idmap)[t1],
        "..."
      ))
    }
    for (ind in seq_len(level)) {
      v1 <- GPSrepo[[paste("L", ind, sep = '')]]
      weights <- weightfunc(v1$degs[v1$gs[v1$GPS[, 1]]],
                            v1$degs[v1$gs[v1$GPS[, 2]]])
      hh <- rbind(hh, cbind(v1$gs[v1$GPS[, 1]],
                            v1$gs[v1$GPS[, 2]],
                            v1$ps[v1$GPS[, 3]],
                            round(weights * 100) / 100))
      rm(v1)
    }
    colnames(hh) <- c("gene1", "gene2", "pathway", "weight")
    if (markers == TRUE) {
      for (ind in seq_len(level)) {
        #v1<-GPSrepo[[eval(jj[ind])]]
        v1 <- GPSrepo[[paste("L", ind, sep = '')]]
        hh <- rbind(hh, cbind(v1$gs[v1$PU[, 1]],
                              v1$gs[v1$PUG[, 1]],
                              v1$ps[v1$PUG[, 2]],
                              1))
        rm(v1)
      }
    }
    if (is.null(queryList)) {
      ##TODO
    }
    hhd <-
      (hh[hh[, 1] %in% queryList &
            hh[, 2] %in% queryList, , drop = FALSE])
    k1 <-
      (stats::aggregate(as.numeric(hhd[, 4]), by = list(hhd[, 3]), FUN = sum))
    kN <-
      (stats::aggregate(as.numeric(hh[, 4]), by = list(hh[, 3]), FUN =
                          sum))
    sum(kN[, 2])
    sum(k1[, 2])
    ps <- stats::phyper(k1[, 2] - 1, kN[match(k1[, 1], kN[, 1]), 2],
                        sum(kN[, 2]) - kN[match(k1[, 1], kN[, 1]), 2],
                        sum(k1[, 2]),
                        lower.tail = FALSE)
    ps <- signif(ps, digits = 4)
    Bonfer <-
      signif(stats::p.adjust(ps, n = length(unique(hh[, 3])), method = 'bonfer'), digits = 4)
    sp1 <- GPSrepo$pathwaydescriptions
    summary_results <-
      data.frame(k1[, 1], sp1[match(k1[, 1], sp1[, 1]), 2], ps, Bonfer, k1[, 2],
                 kN[match(k1[, 1], kN[, 1]), 2], sum(kN[, 2]), sum(k1[, 2]))
    detailed_results <- hhd
    rownames(detailed_results) <- NULL
    colnames(summary_results) <-
      c(
        "pathwy.id",
        "description",
        "pvalues",
        "Bonferroni",
        "successes",
        "PathwaySize",
        "N",
        "sample.size"
      )
    summary_results <-
      summary_results[with(summary_results, order(pvalues)), ]
    rownames(summary_results) <- seq_len(nrow(summary_results))
    print(summary_results[which(summary_results$Bonfer < 0.01), ])
    res <- list()
    res[["summary_results"]] <- summary_results
    res[["detailed_results"]] <- detailed_results
    if (!is.null(saveFile)) {
      Genes <- vector(mode = 'character', length = nrow(summary_results))
      for (i in seq_len(nrow(summary_results))) {
        v11 <- getGenes(res, i)
        if ("Symbol" %in% colnames(v11)) {
          Genes[i] <- paste(v11$Symbol, sep = "" , collapse = ";")
        } else{
          Genes[i] <- paste(v11[, 1], sep = "", collapse = ";")
        }
      }
      utils::write.table(
        cbind(summary_results, Genes),
        file = saveFile,
        quote = FALSE,
        row.names = FALSE,
        sep = '\t'
      )
    }
    invisible(res)
    ##take a look at weget.cmbi.umcn.nl
  }

Try the sigora package in your browser

Any scripts or data that you put into this service are public.

sigora documentation built on March 18, 2022, 8:05 p.m.