R/focuslevel.R

Defines functions ancestors2parents leaveNodes findFocus getNoAtoms getAtoms turnListAround .turnListAround intersectPatterns fillPattern focusLevel

Documented in findFocus focusLevel

focusLevel <- function(test, sets, focus, ancestors, offspring,  
  stop = 1, atoms = TRUE, trace) {

  # rename input
  useAtoms <- atoms
  if (stop <= 1) {
    maxalpha <- stop
    stopafter <- Inf
  } else {
    maxalpha <- 1
    stopafter <- stop
  }       
  if (missing(trace))
    trace <- gt.options()$trace
       
  # input checking 1: sets
  if (missing(sets) && is(test, "gt.object") && !is.null(test@subsets)) 
    sets <- test@subsets
  if (missing(sets)) stop("argument \"sets\" is missing with no default")
  if (is.character(sets))
    if (is(test, "gt.object") && !is.null(test@subsets)) {
      test <- test[sets]
      sets <- test@subsets
    }
  if (is.null(names(sets)))
    stop("sets input has no names attribute.")

  # input checking 2: ancestors and offspring
  if (missing(ancestors) && is(test, "gt.object") && !is.null(test@structure$ancestors))
    ancestors <- test@structure$ancestors
  if (missing(offspring) && is(test, "gt.object") && !is.null(test@structure$offspring))
    offspring <- test@structure$offspring
  if (missing(ancestors) && missing(offspring)) {     # Infer from sets
    ancestors <- new.env(hash=TRUE)
    offspring <- new.env(hash=TRUE)
    for (i in 1:length(sets)) {
      namei <- names(sets)[i]
      for (j in 1:length(sets)) {
        namej <- names(sets)[j]
        if (i != j && length(sets[[i]]) <= length(sets[[j]]) && all(sets[[i]] %in% sets[[j]])) {
          ancestors[[namei]] <- c(ancestors[[namei]], namej)
          offspring[[namej]] <- c(offspring[[namej]], namei)
        }
      }
    }
  }      
  if ((!missing(ancestors)) && is.environment(ancestors))
    ancestors <- as.list(ancestors)
  if ((!missing(offspring)) && is.environment(offspring))
    offspring <- as.list(offspring)
  if (missing(ancestors))
    ancestors <- turnListAround(offspring)
  if (missing(offspring))
    offspring <- turnListAround(ancestors)    

  # Preprocess gt.object input and calculate focus level p-values
  if (trace) cat("Getting raw p-values: ")
  K <- length(sets)
  digitsK <- trunc(log10(K))+1
  if (is(test, "gt.object")) {
    if (is.list(test@weights))
      stop("The focus level method is not applicable with individually weighted sets")
    rawgt <- test
    test <- function(set) rawgt@functions$test(set)[1]
    if (all(names(sets) %in% names(rawgt))) {
      rawgt <- rawgt[names(sets)]
    } else {
      result <- t(sapply(1:K, function(i) {
        if (trace) {
          if (i > 1) cat(rep("\b", digitsK+trunc(log10(i-1))+4), sep="")
          cat(i, " / ", K, sep="")
          flush.console()
        }
        rawgt@functions$test(sets[[i]])
      }))
      rawgt@result <- result
      colnames(rawgt@result) <- c("p-value", "Statistic", "Expected", "Std.dev", "#Cov")
      rownames(rawgt@result) <- names(sets)
      rawgt@subsets <- sets
    }
    rawp <- p.value(rawgt)[focus]
  } else {
    rawgt <- NULL
    rawp <- sapply(1:K, function(i) {
      if (trace) {
        cat(rep("\b", 2*digitsK+3), i, " / ", K, sep="")
        flush.console()
      }
      test(sets[[i]])
    })
    names(rawp) <- names(sets)
  }
  rawp[is.na(rawp)] <- 1
  if (trace) cat(rep("\b", 2*digitsK+25), sep="")

     
  # Prepare the objects that store the test results
  pFocus <- rawp[focus]                                   # Stores the p-values of focus nodes
  sigFocus <- logical(length(focus))                # Keeps track of focus nodes which are declared significant
  names(sigFocus) <- focus
  emptyFocus <- logical(length(focus))              # Keeps track of subgraphs which are completely significant
  names(emptyFocus) <- focus
  atoms <- new.env(hash=TRUE, size=length(focus))   # Stores the atoms of offspring sets of each focus node used to make unions
  unions <- new.env(hash=TRUE, size=length(focus))  # Keeps track of all unions that may be tested
  #pUnions <- vector("list", length(focus))          # Stores the p-values of unions that may be tested
  #names(pUnions) <- focus 
  pUnions <- new.env(hash=TRUE, size=length(focus))
  
  sigUnions <- vector("list", length(focus))        # Keeps track of unions that have so far been called significant
  names(sigUnions) <- focus 
  offspringAtoms <- lapply(as.list(focus), function(term) { # Stores the offspring nodes as unions of atoms
    offnames <- intersect(offspring[[term]], names(sets))
    off <- matrix(,length(offnames),0)
    rownames(off) <- offnames
    off
  })
  names(offspringAtoms) <- focus
  sigOffspring <- vector("list", length(focus))     # Stores which offspring has already been declared significant
  names(sigOffspring) <- focus
  adjustedP <- rep(NA, length(sets))             # Prepare the vector of corrected p-values
  names(adjustedP) <- names(sets)

  # Find all GO terms above the focus level
  forefathers <- unique(unlist(ancestors[focus]))
  forefathers <- setdiff(forefathers, focus)
  forefathers <- setdiff(forefathers, unique(unlist(offspring[focus])))

  # Initialize
  alpha <- 0
  holm <- length(focus)
  indirectAffected <- list()
  ready <- FALSE
  change <- TRUE
  if (missing(trace)) trace <- interactive()

  
  while (! ready) {
  
    # Find the focus terms where the action is
    affected1 <- focus[(!sigFocus) & (pFocus * holm <= alpha)]    # Find newly significant focus level terms
    opensubgraphs <- focus[sigFocus & (!emptyFocus)]              # Find the subgraphs in which something may happen
    minP <- sapply(opensubgraphs, function(ff) min(pUnions[[ff]][!sigUnions[[ff]]]))  # Find the smallest p-value not yet declared significant
    if (length(minP) > 0) {
      names(minP) <- opensubgraphs
      affected2 <- names(minP * holm <= alpha)                      # Find those subgraphs for which that minimum is significant
    } else {
      affected2 <- character(0)
    }
    affected <- unique(c(affected1, affected2, names(indirectAffected)))  # Include those significant through upward propagation
    
    newSigGO <- character(0)
    change <- FALSE
    
    for (term in affected) {
      
      # If a focus term was declared significant. Start a new subgraph
      if (!sigFocus[term]) {                                
        sigFocus[term] <- TRUE
        newSigGO <- c(newSigGO, term)                       # A GO term has been declared significant
        if (trace > 1) cat("Rejected:", term, "\n")
        alloffspring <- intersect(offspring[[term]], names(sets))
        if (length(alloffspring) > 0) {
          offspringSets <- sets[alloffspring]
          if (useAtoms)
            TermAtoms <- getAtoms(offspringSets)                                  # Get the set of atoms
          else
            TermAtoms <- getNoAtoms(offspringSets) 
          atoms[[term]] <- TermAtoms
          unions[[term]] <- matrix(rep(TRUE, length(TermAtoms)), 1, length(TermAtoms))  # Prepare the matrix of unions of atoms
          sigUnions[[term]] <- FALSE
          pUnions[[term]] <- sapply(list(unique(unlist(TermAtoms))), test)                  # Start by testing the union of all atoms
          pUnions[[term]][is.na(pUnions[[term]])] <- 1
          offspringAtoms[[term]] <- matrix(sapply(TermAtoms, function(y)            # Write all offspring as unions of atoms
            sapply(offspringSets, function(x) all(y %in% x))), ncol = length(TermAtoms))
          rownames(offspringAtoms[[term]]) <- names(offspringSets)
          sigOffspring[[term]] <- logical(length(offspringSets))
          names(sigOffspring[[term]]) <- names(offspringSets)
        } else {                                                                # An empty version for when the focus term is an end node
          sigUnions[[term]] <- logical(0)
          atoms[[term]] <- list()
          unions[[term]] <- matrix(,0,0)
          offspringAtoms[[term]] <- matrix(,0,0)
          sigOffspring[[term]] <- logical(0)
        }
        change <- TRUE
      } 
      
      # Propagate significance from offspring that was declared significant in another subgraph
      propagate <- indirectAffected[[term]]                 
      propagate <- propagate[!(propagate %in% term)]
      propagate <- propagate[!sigOffspring[[term]][propagate]]
      if (length(propagate) > 0) {
        pPatterns <- matrix(,0,length(atoms[[term]]))   # Get all superset patterns of a propagated GO term 
        for (i in 1:length(propagate)) {
          basePattern <- offspringAtoms[[term]][propagate[[i]],] # The propagated GO term as unions of atoms
          ancestorPatterns <- fillPattern(basePattern)      # All superset patterns as unions of atoms
          ancestorPatterns <- ancestorPatterns[!intersectPatterns(ancestorPatterns, pPatterns),,drop=FALSE]    # Remove duplicates
          pPatterns <- rbind(pPatterns, ancestorPatterns)
        }
        matchedPatterns1 <- intersectPatterns(pPatterns, unions[[term]])  # These new patterns were already testable
        matchedPatterns2 <- intersectPatterns(unions[[term]], pPatterns)  # These already testable patterns are now called significant
        newPatterns <- pPatterns[!matchedPatterns1,, drop=FALSE]          # These are new patterns
        if (trace > 1) cat("\t\tSignificance of", nrow(newPatterns), "sets from", propagate, " propagated to", term, "\n")
        unions[[term]] <- rbind(unions[[term]], newPatterns)              # Make all supersets testable and give them raw p 0
        pUnions[[term]][which(matchedPatterns2)] <- 0                     
        pUnions[[term]] <- c(pUnions[[term]], rep(0, nrow(newPatterns)))
        sigUnions[[term]] <- c(sigUnions[[term]], rep(FALSE, nrow(newPatterns)))    # These patterns will be called significant later
        change <- TRUE
      }
      
      # Find the testable non-significant unions with small p-values
      newsigs <- (pUnions[[term]] * holm <= alpha) & !sigUnions[[term]]     
      newsigs <- which(newsigs)
      
      # Find whether there are GO terms among the newly significant unions
      sigPatterns <- unions[[term]][newsigs,, drop=FALSE]
      offspringPatterns <- offspringAtoms[[term]][!sigOffspring[[term]],,drop=FALSE]     # Patterns of not already significant offspring
      matched <- intersectPatterns(offspringPatterns, sigPatterns)                  # Any new offspring terms among the significant patterns?
      newSigOffspring <- rownames(offspringPatterns)[matched]
      if (trace > 1 && (length(setdiff(newSigOffspring, propagate))>0)) {
        cat("Signficant:", setdiff(newSigOffspring, propagate), "\n")
      }
      newSigGO <- c(newSigGO, newSigOffspring)                 
      sigOffspring[[term]][newSigOffspring] <- TRUE
      
      # Expand the current subgraph by finding new unions that may now be tested, and find the p-values
      newpatterns <- matrix(, 0, length(atoms[[term]]))
      for (ix in newsigs) {
        sigUnionsMatrix <- unions[[term]][sigUnions[[term]], ,drop = FALSE]         # The unions so far significant for this term
        pattern <- unions[[term]][ix,]      # the significant pattern
        if (sum(pattern) > 1) {
          for (iy in (1:length(pattern))[pattern]) { # For-loop over the TRUEs
            newpattern <- pattern
            newpattern[iy] <- FALSE # All direct subsets have one extra FALSE
            reallyNew <- !any(intersectPatterns(matrix(newpattern,nrow=1), unions[[term]])) # Is the pattern really new?
            if (reallyNew) {      # Are the other parents of this pattern also present?
              parentspresent <- all(sapply((1:length(newpattern))[!pattern], function(iz) { # Loop over the FALSEs of pattern
                newpatternparent <- newpattern
                newpatternparent[iz] <- TRUE                        # The parents of newpattern have one extra TRUE
                any(apply(sigUnionsMatrix, 1, function(pttn) all(pttn == newpatternparent))) # Is this superset present among the significant tests?
              }))
              if (parentspresent) {
                newpatterns <- rbind(newpatterns, newpattern)
                rownames(newpatterns) <- NULL
              }
            }
          }
        }
        sigUnions[[term]][ix] <- TRUE # Only now call the term itself significant (prevents duplicate patterns)
      }
      unions[[term]] <- rbind(unions[[term]], newpatterns)
      sigUnions[[term]] <- c(sigUnions[[term]], rep(FALSE, nrow(newpatterns)))
      if (length(newpatterns) > 0) {                        # Calculate and store their p-values
        if (trace > 1) cat("\t\ttests:", nrow(newpatterns), "in", term, "\n")
        newsets <- lapply(as.list(1:nrow(newpatterns)), function(i) { # Assemble gene sets from the atoms
          unique(unlist(atoms[[term]][newpatterns[i,]]))
        })
        newpvalues <- sapply(newsets, test)
        newpvalues[is.na(newpvalues)] <- 1
        pUnions[[term]] <- c(pUnions[[term]], newpvalues) 
        change <- TRUE
      }
      
      # Is the subgraph emptied?
      empty <- (length(atoms[[term]]) == 0) || (all(sigOffspring[[term]]))
      if (empty) {                                          
        emptyFocus[term] <- TRUE
      }
    } 
    
    if (length(newSigGO)>0) {
      change <- TRUE                # Don't change alpha yet; there may be propagation to be done
      adjustedP[newSigGO] <- alpha  # Adjusted P for newly affected GO terms
    }
    
    # Find all forefather terms that are now significant
    affectedGOforefathers <- unique(unlist(ancestors[newSigGO])) # Find GO terms above the focus level significant through propagation
    affectedGOforefathers <- intersect(affectedGOforefathers, forefathers)
    newlyAffectedGOforefathers <- intersect(affectedGOforefathers, names(adjustedP[is.na(adjustedP)]))
    adjustedP[newlyAffectedGOforefathers] <- alpha
    if (trace > 1 && (length(newlyAffectedGOforefathers) > 0))
      cat("Rejected because of rejected offspring:", newlyAffectedGOforefathers, "\n")
      
    # For all significant GO terms, see if they are also present in other subtrees  
    affectedGOwithParents <- lapply(as.list(newSigGO), function(nsg) { 
      present1 <- sapply(offspringAtoms, function(off) nsg %in% rownames(off))
      present2 <- focus %in% nsg
      focus[present1 | present2]
    })      # Creates a list of new significant GO terms, listing the subtrees they appear in
    names(affectedGOwithParents) <- newSigGO
    indirectAffected <- turnListAround(affectedGOwithParents)  # Reverses the list to a list of subtrees, listing the significant GO offspring terms
    
    # Output progress information
    if (trace == 1) {
      cat(paste(rep("\b", 48), collapse=""))
      cat("Alpha = ", format(alpha, digits=3, scientific=T, width= 10), ": rejected terms: ", format(sum(!is.na(adjustedP)), width=5), ".", sep="")
      flush.console()
    }
    
    # If nothing happened, increase alpha
    if (!change) { 
      allPs <- sort(c(pFocus, unlist(as.list(pUnions))))
      allPs <- allPs[allPs * holm > alpha]
      alpha <- allPs[1] * holm
      names(alpha) <- NULL
      if (trace > 1) cat("\tAlpha =", format(alpha, digits=3, scientific=T, width= 10), "\n")
    } else {
      if (trace > 1 && (holm > sum(!emptyFocus))) cat("Holm's factor:", sum(!emptyFocus), "\n")
      holm <- sum(!emptyFocus)
    }
    ready <- (holm == 0) || (alpha > maxalpha) || (sum(!is.na(adjustedP)) >= stopafter)
  } 
  
  # ready
  if (trace==1) {
    cat(paste(rep("\b", 48), collapse=""))
    cat("Alpha = ", format(1, digits=3, scientific=T, width= 10), ": rejected terms: ", format(length(adjustedP), width=5), ".\n", sep="")
  }
  if (alpha >= 1)
    adjustedP[is.na(adjustedP)] <- 1
  
  # return
  if (!is.null(rawgt)) {
    extra <- rawgt@extra
    extra[["focuslevel"]] <- adjustedP
    rawgt@extra <- as.data.frame(extra) 
    rawgt@structure <- list(ancestors=ancestors,offspring=offspring)
    return(rawgt)  
  } else {
    return(data.frame(raw.p = rawp, focuslevel = adjustedP))
  }
}


# Finds all ancestor patterns of a pattern
fillPattern <- function(pattern) {
  k <- sum(!pattern)
  out <- matrix(T, 2^k, length(pattern))
  if (k>0) {
    out[,!pattern] <- sapply(1:k, function(i) {
      rep(c(T,F), each = 2^(i-1), times = 2^(k-i))
    })
  }
  out
}


# Finds patterns in "patterns" which also occur in "matchto"
# Equivalent to %in% but for matrices
intersectPatterns <- function(patterns, matchto) {
  if (nrow(matchto) > 0) {
    matched <- apply(patterns, 1, function(pattern1) {
      any(apply(matchto, 1, function(pattern2) 
        all(pattern1 == pattern2)
      ))
    })
  } else {
    matched <- logical(nrow(patterns))
  }
  dim(matched) <- NULL
  matched
}


# Starts from a named list and "turns it around"
# Returns a named list of length(all elements of the list)
# Each element of the list gives the names of the original list elements
# that contained that element
.turnListAround <- function(aList) { #old
  newlist <- new.env(hash=TRUE)
  objs <- names(aList)
  if (is.null(objs)) objs <- seq_along(alist)
  for (i in objs) {
    for (j in aList[[i]]) {
      newlist[[j]] <- c(newlist[[j]], i)
    }
  }
  as.list(newlist)
}
      
turnListAround <- function(alist) {
  alist2 <- unlist(alist)
  alist1 <- rep(names(alist), sapply(alist, length))
  res <- tapply(alist1, alist2, identity, simplify=FALSE)
  res
}

# This function extracts the smallest possible number of atoms from 
# sets, such that all sets can be built as unions of these atoms
# Note: These atoms lead to slightly conservative closed testing
# example: 1,2,23 is reduced to 1,2,3 and leads to unnessesary testing of 13 prior to testing 1.
# But: 1,12,13,23 is now reduced to 1,2,3 while otherwise it would not be reduced.
getAtoms <- function(allsets) {
          
  if (!is(allsets, "list")) {                                           # Coerce to a list
    allsets <- list(allsets)
  }
  allsets <- allsets[sapply(allsets, length) > 0]                       # Remove empty sets
  allsets <- lapply(allsets, function(set) set[!is.na(set)])            # Remove NAs
  setdiffs <- vector("list", length(allsets))                           # Prepare output
  i <- 0
  lastAtom <- character(0)
  while (length(allsets) > 0) {
    allsets <- lapply(allsets, function(set) setdiff(set, lastAtom))    # Remove probes in the last atom from every set
    allsets <- allsets[sapply(allsets, length) > 0]                     # Remove empty sets
    if (length(allsets) > 0) {
      i <- i+1
      allsets <- allsets[sort.list(sapply(allsets, length))]            # Sort by size
      lastAtom <- allsets[[1]]                                          # Next atom: smallest non-empty set
      setdiffs[i] <- allsets[1]                                         # Store the atom
    }
  } 

  setdiffs[1:i]
}

# No atoms variant: only throws away redundant sets
getNoAtoms <- function(sets) {
                     
  if (!is(sets, "list")) {                                     # Coerce to a list
    sets <- list(sets)
  }
  sets <- sets[sapply(sets, length) > 0]                       # Remove empty sets
  sets <- lapply(sets, function(set) set[!is.na(set)])         # Remove NAs
  K <- length(sets)
  redundant <- rep(FALSE, K)                                   # redundant sets are sets that are unions of other sets
  for (i in 1:K) {
    contained <- sapply(sets, function(set) all(set %in% sets[[i]]))
    redundant[i] <- all(sets[[i]] %in% unlist(sets[contained & (1:K != i) & !redundant]))
  }
  sets[!redundant]
}


findFocus <- function(sets, ancestors, offspring, maxsize = 10, atoms = TRUE) {

  useAtoms <- atoms

  if (is(sets, "gt.object") && !is.null(sets@subsets)) 
    sets <- sets@subsets
  if (missing(sets)) stop("argument \"sets\" is missing with no default")
  if (is.character(sets))
    if (is(test, "gt.object") && !is.null(test@subsets)) {
      test <- test[sets]
      sets <- test@subsets
    }
  if (missing(ancestors) && missing(offspring)) {     # Infer from sets
    ancestors <- new.env(hash=TRUE)
    offspring <- new.env(hash=TRUE)
    for (i in 1:length(sets)) {
      namei <- names(sets)[i]
      for (j in 1:length(sets)) {
        namej <- names(sets)[j]
        if (i != j && length(sets[[i]]) <= length(sets[[j]]) && all(sets[[i]] %in% sets[[j]])) {
          ancestors[[namei]] <- c(ancestors[[namei]], namej)
          offspring[[namej]] <- c(offspring[[namej]], namei)
        }
      }
    }
  }      
  if ((!missing(ancestors)) && is.environment(ancestors))
    ancestors <- as.list(ancestors)
  if ((!missing(offspring)) && is.environment(offspring))
    offspring <- as.list(offspring)
  if (missing(ancestors))
    ancestors <- turnListAround(offspring)
  if (missing(offspring))
    offspring <- turnListAround(ancestors)    
                        
  # Prune offspring
  offspring <- lapply(offspring, function(set) intersect(set, names(sets)))                      
                        
  # Find all nodes with at most 5*maxatoms offspring sets
  # and at most maxatoms atoms
  allnodes <- names(sets)
  doable <- sapply(offspring[allnodes], length) <= 5 * maxsize
  names(doable) <- names(sets)
  for (term in allnodes[doable]) {
    if (length(offspring[[term]]) > 0) {
      if (useAtoms)
        atoms <- getAtoms(sets[offspring[[term]]])
      else
        atoms <- getNoAtoms(sets[offspring[[term]]])
    } else {
      atoms <- character(0)
    }
    doable[term] <- (length(atoms) <= maxsize)
  }

  # Find all doable nodes with no doable ancestors
  alldoable <- allnodes[doable]
  ancestors <- ancestors[doable]
  redundant <- sapply(alldoable, function(id) {
    any(ancestors[[id]] %in% alldoable)
  })
  alldoable[!redundant]
}


# Finds the leaves of a significant graph
leaveNodes <- function(object, alpha=0.05, ancestors,type) {

  if (missing(ancestors)) {     # Infer from sets
    sets <- object@subsets
    ancestors <- new.env(hash=TRUE)
    for (i in 1:length(sets)) {
      namei <- names(sets)[i]
      for (j in 1:length(sets)) {
        namej <- names(sets)[j]
        if (i != j && length(sets[[i]]) <= length(sets[[j]]) && all(sets[[i]] %in% sets[[j]])) {
          ancestors[[namei]] <- c(ancestors[[namei]], namej)
        }
      }
    }
    ancestors <- as.list(ancestors)
  }

 if (missing(type)) type=names(object@extra)[1]
 
  focusP <- object@extra[[type]]      
  if (is.null(focusP)) stop("No focus level p-values")
  
  significant <- names(object)[focusP <= alpha]
  sign.anc <- unique(unlist(ancestors[significant]))
  significant <- setdiff(significant, sign.anc)
  
  object[significant]
}

# Takes sets to ancestors mapping; converts it to sets to parents mapping
ancestors2parents <- function(ancestors) {
  lapply(ancestors, function(anc) {
    setdiff(anc, unique(unlist(ancestors[anc])))
  })
}
jellegoeman/globaltest documentation built on Dec. 29, 2021, 9:11 p.m.