### Monte Carlo Permutation Testing for Feature Significance
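`perm.features` implements a one-tailed Monte Carlo permutation test of
feature importance. The chosen model is fit once with the observed class
labels and `nperm` additional times with randomly permuted labels; each
feature's p-value is the fraction of permuted fits whose importance score
meets or exceeds the observed score, `p = sum(score.perm >= score.obs) / nperm`,
and features with `p <= sig.level` are reported as significant.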
# declare global variables (i.e. the foreach iterators)
globalVariables('p')
#' @title Feature Selection via Monte Carlo Permutation
#' @description Applies Monte Carlo permutations to user-specified models.
#' The user can either use the results from \code{fs.stability} or provide
#' specified model parameters.
#' @param fs.model Object containing results from \code{fs.stability}
#' @param X A scaled matrix containing numeric values of
#' each feature
#' @param Y A factor vector containing group membership of samples
#' @param method A string specifying the model to be fit.
#' Available options are \code{"plsda"} (Partial Least Squares
#' Discriminant Analysis), \code{"rf"} (Random Forest), \code{"gbm"}
#' (Gradient Boosting Machine), \code{"svm"} (Support Vector Machines),
#' \code{"glmnet"} (Elastic-net Generalized Linear Model), and
#' \code{"pam"} (Prediction Analysis of Microarrays)
#' @param sig.level Desired significance level for features,
#' default \code{sig.level = .05}
#' @param nperm Number of permutations, default \code{nperm = 10}.
#' Note that p-values are multiples of \code{1/nperm}, so larger
#' values are needed for meaningful significance testing.
#' @param allowParallel Logical argument indicating whether parallel
#' processing is allowed via the \code{foreach} package.
#' Default \code{allowParallel = FALSE}
#' @param verbose Logical argument indicating whether output should be
#' printed automatically in 'pretty' format.
#' Default \code{verbose = TRUE}
#' @param ... Extra arguments that the user would like to apply to the models
#' @return \item{sig.level}{User-specified significance level}
#' @return \item{num.sig.features}{Number of significant features}
#' @return \item{sig.features}{Dataframe of significant features}
#' @return \item{all.features}{Dataframe of all features and their
#' permutation p-values}
#' @author Charles Determan Jr.
#' @references Wongravee K., et al. (2009) \emph{Monte-Carlo methods
#' for determining optimal number of significant variables. Application to
#' mouse urinary profiles}. Metabolomics 5:387-406.
#' @example inst/examples/perm.features.R
#' @import DiscriMiner
#' @import randomForest
#' @import e1071
#' @import gbm
#' @import pamr
#' @import glmnet
#' @importFrom permute shuffle
#' @export
# user-defined -> sig.level, model, nperm
perm.features <-
function(
fs.model = NULL,
X,
Y,
method,
sig.level = .05,
nperm = 10,
allowParallel = FALSE,
verbose = TRUE,
...)
{
assert_is_matrix(X)
assert_is_numeric(X)
assert_is_factor(Y)
assert_is_character(method)
assert_is_in_closed_range(sig.level, 0, 1)
assert_is_numeric(nperm)
# require at least one and, for practicality, no more than 100,000 permutations
assert_is_in_closed_range(nperm, 1, 100000)
assert_is_logical(allowParallel)
assert_is_logical(verbose)
if(!method %in% modelList()$methods){
stop("Method not recognized. Check for method code in 'modelList()'")
}
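# pick the foreach operator: %dopar% when parallel execution is
# allowed (a backend must already be registered), otherwise %do%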
`%op%` <- if(allowParallel){
`%dopar%`
}else{
`%do%`
}
theDots <- list(...)
if(is.null(fs.model) && length(theDots) == 0){
stop("you must either provide a fitted model from fs.stability or the parameters for the desired model")
}
obsLevels <- levels(as.factor(Y))
# build the data.frame column-wise so Y stays a factor
# (cbind on a numeric matrix would coerce Y to its integer codes)
data <- as.data.frame(X)
data$.classes <- Y
## pam will crash if there is a resample with <2 observations
## in a class. We will detect this and remove those classes.
if(method == "pam")
{
yDist <- table(data$.classes)
if(any(yDist < 2))
{
smallClasses <- names(yDist[yDist < 2])
data <- data[!(data$.classes %in% smallClasses),]
}
}
## Factor the class labels
levels(data$.classes) <- obsLevels
xNames <- names(data)[!(names(data) %in% ".classes")]
trainX <- data[,!(names(data) %in% ".classes"), drop = FALSE]
if(method == "gbm" & length(obsLevels) == 2) {
numClasses <- ifelse(data$.classes == obsLevels[1], 1, 0)
}
if(!is.null(fs.model)){
args <- extract.args(fs.model, method)
}else if(length(theDots) != 0){
args <- theDots
}
if(length(theDots) != 0){
# user-supplied arguments override those extracted from fs.model
if(any(names(theDots) %in% names(args))){
arg.ind <- which(names(theDots) %in% names(args))
args[names(theDots)[arg.ind]] <- theDots[arg.ind]
theDots <- theDots[-arg.ind]
}
}
args <- data.frame(do.call("cbind", args))
orig.groups <- Y
N <- nrow(trainX)
# fixed seed so the permutations are reproducible across runs
set.seed(666)
# generate nperm random shuffles of the sample indices and prepend
# the identity permutation so the first fit uses the observed labels
perms <- replicate(nperm, shuffle(N), simplify = FALSE)
perms <- append(list(seq(N)), perms)
scores <- foreach(p = seq.int(nperm+1),
.packages = c("OmicsMarkeR", "foreach",
"randomForest", "e1071", "DiscriMiner",
"gbm","pamr","glmnet"),
.verbose = FALSE,
.errorhandling = "stop") %op%
{
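# p == 1 is the identity permutation: the first fit uses the
# observed labels and yields the reference importance scores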
#permute the groups
trainY <- orig.groups[perms[[p]]]
features <- switch(method,
plsda ={
mod <- plsDA(trainX,
trainY,
autosel= FALSE,
validation = NULL,
comps = if(args$ncomp > 1){
args$ncomp}else{2},
cv ="none")
if(args$ncomp < 2){
l <- 1
}else{
l <- ncol(mod$VIP)
}
out <- mod$VIP[,l]
},
gbm ={
gbm.args <- c("w", "var.monotone", "n.minobsinnode",
"bag.fraction", "var.names",
"response.name", "group")
theDots <- theDots[names(theDots) %in% gbm.args]
if(ncol(trainX) < 50 | nrow(trainX) < 50){
if(is.null(theDots) | length(theDots) == 0){
if(nrow(trainX) < 30){
theDots <- list(n.minobsinnode = 2)
}else{
theDots <- list(n.minobsinnode = 5)
}
}
}
# determine if binary or multiclass
gbmdist <- if(length(unique(trainY)) == 2){
"bernoulli"}else{
"multinomial"
}
# gbm.fit needs a numeric 0/1 response for "bernoulli" and the
# factor itself for "multinomial"; derive it from the permuted
# labels so the permutation actually reaches the model
modY <- if(gbmdist != "multinomial"){
ifelse(trainY == obsLevels[1], 1, 0)
}else{
trainY
}
modArgs <- list(
x = trainX,
y = modY,
interaction.depth = args$interaction.depth,
n.trees = args$n.trees,
shrinkage = args$shrinkage,
distribution = gbmdist,
verbose = FALSE)
if(length(theDots) > 0) modArgs <- c(modArgs,
theDots)
mod <- do.call("gbm.fit", modArgs)
out <- relative.influence(mod,
n.trees = args$n.trees)
},
rf ={
rf.args <- c("maxnodes",
"keep.forest",
"keep.inbag")
theDots <- theDots[names(theDots) %in% rf.args]
modArgs <- list(
x = trainX,
y = trainY,
importance = TRUE,
mtry = args$mtry,
ntree=round.multiple(sqrt(ncol(trainX)),
target = 50)
)
if(length(theDots) > 0) modArgs <- c(modArgs,
theDots)
mod <- do.call("randomForest", modArgs)
# Mean Decrease in Accuracy metric (type=1)
# Gini Index (type=2)
out <- importance(mod, type = 1)
},
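# SVM-RFE (recursive feature elimination) rankings, with separate
# binary and multiclass implementations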
svm ={
best.C <- c(args$C)
if(nlevels(trainY) == 2){
out <- svmrfeFeatureRanking(trainX,
trainY,
best.C)
}else{
out <-
svmrfeFeatureRankingForMulticlass(trainX,
trainY,
best.C)
}
},
pam ={
pamr.args <- c("n.threshold", "threshold.scale",
"scale.sd", "se.scale")
theDots <- theDots[names(theDots) %in% pamr.args]
mydata <- list(x = t(trainX),
y = trainY,
geneid = as.character(colnames(trainX)))
modArgs <- list(data = mydata,
threshold = args$threshold)
if(length(theDots) > 0) modArgs <- c(modArgs,
theDots)
mod <- do.call("pamr.train", modArgs)
if(mod$nonzero == 0){
zero <- replicate(length(obsLevels),
rep(0, length(xNames)))
pam.features <- data.frame(factor(xNames), zero)
colnames(pam.features) <-
c("id",
paste(seq(length(obsLevels)),
"score",
sep = "-")
)
}else{
pam.features <- data.frame(
pamr.listgenes(mod,
mydata,
threshold = args$threshold)
)
}
pam.scores <- sapply(
pam.features[,2:ncol(pam.features)],
FUN = function(x) as.numeric(as.character(x)))
if(!is.matrix(pam.scores)){
pam.scores <- t(as.data.frame(pam.scores))
}
out <- rowSums(abs(pam.scores))
names(out) <- as.character(pam.features[,1])
out
},
glmnet ={
numLev <- nlevels(trainY)
glmnet.args <- c("offset", "nlambda", "weights",
"standardize", "intercept",
"dfmax", "pmax", "exclude",
"penalty.factor", "lower.limits",
"upper.limits", "maxit",
"standardize.response",
"type.multinomial", "family")
theDots <- theDots[names(theDots) %in% glmnet.args]
# derive the glmnet family from the number of class levels
# unless the user supplied one
if(!"family" %in% names(theDots)){
if(!is.na(numLev))
{
fam <- ifelse(numLev > 2,
"multinomial",
"binomial")
} else stop("levels of classes couldn't be determined for glmnet")
theDots$family <- fam
}
modelArgs <- c(
list(x = as.matrix(trainX),
y = trainY,
alpha = args$alpha),
theDots)
# fit the model
mod <- do.call("glmnet", modelArgs)
# extract coefficients
full.coefs <- coef(mod, s = 0)
if(nlevels(trainY) > 2){
full.coefs <-
lapply(full.coefs,
FUN = function(x) abs(
as.matrix(x[2:(ncol(trainX)+1),
, drop = FALSE])
))
# keep the nonzero coefficients; features dropped here are
# re-added with a score of 0 after the permutation loop
coefs <-
lapply(full.coefs,
FUN = function(x){
x[x[,1] > 0,, drop = FALSE]}
)
coef.names <- lapply(coefs, row.names)
coefs <- unlist(coefs, use.names = TRUE)
names(coefs) <- unlist(coef.names)
# sum each variable's absolute coefficients across the classes
out <- c(tapply(coefs, names(coefs), sum))
}else{
out <- abs(
as.matrix(
full.coefs[2:(ncol(trainX)+1),,
drop = FALSE]
)
)
}
if(is.matrix(out)){
scs <- out[,1]
names(scs) <- rownames(out)
}else{
scs <- out
}
scs
}
)
features
}
# collect the score vector from each (permuted) fit
var.scores <- lapply(scores, "[")
# normalize the score structure: some models (e.g. randomForest's
# importance()) return a matrix or data.frame rather than the
# required named numeric vector
var.scores <- lapply(var.scores, function(x) {
if(is.vector(x)){
x
}else{
xNames <- row.names(x)
x <- as.numeric(x)
names(x) <- xNames
x
}
})
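# features a model dropped entirely (e.g. zeroed out by pam or
# glmnet) get a score of 0 so every run has a full score vector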
miss <- lapply(var.scores, FUN = function(x) which(!xNames %in% names(x)))
for(i in 1:(nperm+1)){
if(length(miss[[i]]) >= 1){
zero <- rep(0, length(miss[[i]]))
names(zero) <- xNames[miss[[i]]]
var.scores[[i]] <- c(var.scores[[i]], zero)
}
}
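# align scores by feature name; columns are the observed fit
# followed by the nperm permuted fits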
var.scores <- sapply(var.scores, FUN = function(x) x[sort(names(x))])
# one-tailed p-value: the fraction of permuted fits whose score
# meets or exceeds the observed score (column 1); kept numeric so
# the comparison against sig.level below is not done on strings
perm.p.val <-
apply(var.scores,
1,
FUN = function(x){
round(sum(x[2:(nperm+1)] >= x[1])/nperm,
digits = 3)
}
)
# return number of significant features and features themselves
num.features <- sum(perm.p.val <= sig.level)
features <- names(which(perm.p.val <= sig.level))
p.vals <- perm.p.val[which(perm.p.val <= sig.level)]
if(verbose){
cat("\nFeature Permutation Results\n")
cat(rep("-",30), sep="")
cat("\n\n")
cat(paste(num.features, "features were significant at the significance level",
sig.level, sep = " "))
cat("\nIdentified features:\n")
print(data.frame(features = features, p.val = p.vals))
}
sig.features <- data.frame(features = features, p.val = p.vals)
all.features <- data.frame(features = names(perm.p.val),
p.val = c(perm.p.val))
perm.results <- list(sig.level = sig.level,
num.sig.features = num.features,
sig.features = sig.features,
all.features = all.features
)
return(perm.results)
}
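## Usage sketch (illustrative only, not from the package source):
## simulated two-class data stand in for a real omics matrix, and the
## `alpha` value passed through `...` stands in for parameters that
## would normally be extracted from a fitted `fs.stability` model.
## With pure-noise predictors, few if any features should reach
## significance.
set.seed(123)
X.demo <- matrix(rnorm(100 * 20), nrow = 100,
dimnames = list(NULL, paste0("V", seq(20))))
Y.demo <- factor(rep(c("A", "B"), each = 50))
res <- perm.features(X = X.demo, Y = Y.demo, method = "glmnet",
sig.level = 0.05, nperm = 100, alpha = 0.9)
res$sig.features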