R/geneSetSummaryByGenes.R

Defines functions rename.feature.columns

# The geneSetSummaryByGenes function (and its utility helper funcionts) are
# too hairy to separate, so putting them here. I especially don't intend the
# rename.feature.columns function, in particular, to be reused.
#
# Although the code looks hairy, note that there are unit tests in
# test-geneSetSummaryByGenes.R

#' @rdname geneSetSummaryByGenes
setMethod("geneSetSummaryByGenes", c(x="GeneSetDb"),
function(x, features, with.features=TRUE, feature.rename=NULL, ...,
         as.dt = FALSE) {
  stopifnot(is.character(features))
  unk.f <- setdiff(features, featureIds(x))
  if (length(unk.f)) {
    warning(length(unk.f), "/", length(features), " do not exist in GeneSetDb")
    features <- setdiff(features, unk.f)
  }
  if (length(features) == 0) {
    stop("No selected features in GeneSetDb")
  }
  x.sub <- subsetByFeatures(x, features)
  x.db <- x.sub@db[feature_id %in% features]
  x.gs <- geneSets(x.sub, as.dt=TRUE)

  gs.cols <- c('collection', 'name', 'active')
  if (is.conformed(x)) {
    out <- x.gs[, c(gs.cols, 'n'), with=FALSE]
    setnames(out, 'n', 'N')
  } else {
    out <- x.gs[, c(gs.cols, 'N'), with=FALSE]
  }

  ## Each geneset row will be annotated with the number of features it
  ## it has, even if caller doesn't ask for `with.features`. To do so we first
  ## create geneset <-> feature_id contingency table.
  ## we turn x.dt$feature_id into a factor with levels == features because there
  ## maybe be some features that don't appear anywhere (due to conformation(?))

  x.dt <- dcast.data.table(transform(x.db, present=TRUE),
                           collection + name ~ feature_id,
                           value.var='present', fill=FALSE)

  ## I was once given a strange GeneSetDb object with genesets in gdb@db that
  ## were not in gdb@table. Such an object would fail validObject(gdb), so this
  ## should never be. Let's dance around it, but also warn when it happens.
  nfids <- intersect(features, tail(names(x.dt), -2))
  if (!setequal(nfids, features)) {
    warning("You are in a scenario you thought should be impossible",
            immediate.=TRUE)
    features <- nfids
  }
  x.dt$n <- rowSums(as.matrix(x.dt[, -(1:2), with=FALSE]))
  setcolorder(x.dt, c(setdiff(names(x.dt), features), features))
  if (with.features) {
    x.dt <- rename.feature.columns(x.dt, x.sub, feature.rename)
  } else {
    ## If the caller didn't want the features, we just return the n
    x.dt <- x.dt[, list(collection, name, n)]
  }
  out <- out[x.dt, nomatch=0]
  if (!as.dt) out <- setDF(copy(out))
  out
})

#' @describeIn geneSetSummaryByGenes get geneset:feature incidence table from
#'   a SparrowResult, optionally filtered by statistical significance from
#'   a given gsea `method`
#' @param method The GSEA method to pull statistics from
#' @param max.p the maximum p-value from the analysis `method` to allow for the
#'   geneSets included in the returned table
#' @param p.col which p-value column to select from: `'padj'`,
#'   `'padj.by.collection'`, or `'pval'`
setMethod("geneSetSummaryByGenes", c(x="SparrowResult"),
function(x, features, with.features=TRUE, feature.rename=NULL,
         method=NULL, max.p=0.3, p.col=c('padj', 'padj.by.collection', 'pval'),
         ..., as.dt=FALSE) {
  stopifnot(is.character(features))
  unk.f <- setdiff(features, featureIds(x))
  if (length(unk.f)) {
    warning(length(unk.f), "/", length(features), " do not exist in GeneSetDb")
    features <- setdiff(features, unk.f)
  }
  if (length(features) == 0) {
    stop("No selected features in GeneSetDb")
  }
  if (is.character(method)) {
    p.col <- match.arg(p.col)
    method <- match.arg(method, resultNames(x))
    stopifnot(max.p >= 0 & max.p <= 1)
    r <- result(x, method, as.dt=TRUE)
  }

  gdb <- copy(geneSetDb(x))
  gs <- geneSets(x)

  res <- geneSetSummaryByGenes(gdb, features, with.features,
                               feature.rename=FALSE, as.dt=TRUE, ...)

  if (with.features) {
    ## Account for impossible scenario that I "dance around" in the function
    ## upstairs
    features <- intersect(features, names(res))

    fstart <- ncol(res) - length(features) + 1
    meta <- res[, 1:(fstart - 1), with=FALSE]
    fcols <- res[, fstart:ncol(res), with=FALSE]
  } else {
    meta <- res
  }

  ## add logFC and pvalues
  xref <- match(paste(res$collection, res$name), paste(gs$collection, gs$name))
  meta$logFC <- gs$mean.logFC.trim[xref]
  if (is.character(method)) {
    rxref <- match(paste(res$collection, res$name), paste(r$collection, r$name))
    meta$padj <- r$padj[rxref]
  }

  ## When this is called on a result object, instead of a TRUE/FALSE
  ## geneset <-> feature contingency table, we replace the TRUE's with the
  ## logFC of that feature
  if (with.features) {
    ## spank on the logFCs of the genes
    lfc <- setkeyv(copy(logFC(x, as.dt=TRUE)), 'feature_id')
    for (fid in features) {
      replace.me <- fcols[[fid]]
      fcols[, (fid) := ifelse(replace.me, lfc[fid]$logFC, 0)]
    }

    if (is.character(feature.rename)) {
      feature.rename <- feature.rename[1]
      ## Where do we find the feature_id <-> renamed xref? If we find this column
      ## in the logFC(x) table, then that trumps all.
      if (is.character(lfc[[feature.rename]])) {
        xref <- match(gdb@db$feature_id, lfc$feature_id)
        gdb@db[[feature.rename]] <- lfc[[feature.rename]][xref]
      }
    }

    fcols <- rename.feature.columns(fcols, gdb, feature.rename)
    res <- cbind(meta, fcols)
  } else {
    res <- meta
  }

  setkeyv(res, c('collection', 'name'))

  if (is.character(method)) {
    keep <- r[[p.col]] <= max.p
    keep <- r[keep][, list(collection, name)]
    res <- res[keep, nomatch=0]
  }

  if (!as.dt) res <- setDF(copy(res))
  res
})

#' utility function. accepts a pivoted geneset <-> feature table, and renames
#' the columns associated with features in gdb. if feature.names is NULL, then
#' the column names are prefixed with 'featureId_' to avoid column names that
#' are numbers.
#'
#' columns in `out` are identified as features by virtue of them matching values
#' returned from `featureids(gdb)`
#'
#' when an internal function calls this with feature.rename=FALSE, then no
#' rename will happen at all.
#'
#' @noRd
rename.feature.columns <- function(x, gdb, feature.rename=NULL) {
  stopifnot(is.data.table(x))
  stopifnot(is(gdb, "GeneSetDb"))
  if (is.logical(feature.rename[1L]) && !feature.rename[1L]) {
    return(x)
  }
  rename.cols <- colnames(x)[colnames(x) %in% featureIds(gdb)]
  if (length(rename.cols) == 0) {
    return(x)
  }

  if (!is.null(feature.rename)) {
    feature.rename <- feature.rename[1L]
    if (is.null(gdb@db[[feature.rename]])) {
      warning("Can't rename features by: ", feature.rename,
              " (not found as a column in gdb@db")
      feature.rename <- NULL
    }
  }

  default.names <- setNames(paste0('featureId_', rename.cols), rename.cols)

  if (is.null(feature.rename)) {
    new.names <- default.names
  } else {
    db <- gdb@db[feature_id %in% rename.cols]
    db <- unique(db, by=c('feature_id'))
    new.names <- setNames(db[[feature.rename]], db[['feature_id']])
    isna <- is.na(new.names)
    new.names[isna] <- default.names[isna]
  }
  setnames(x, names(new.names), new.names)
  setcolorder(x, c(setdiff(names(x), new.names), sort(new.names)))
}
lianos/sparrow documentation built on Dec. 8, 2024, 2:19 a.m.