R/AllSupplementsTNA.R

Defines functions gseaScoresBatch4RTN gseaScores4RTN .hyperGeoTest4RTN hyperGeoTest4RTN tna.perm.pvalues tna.duplicate.remover gsea2tna gsea1tna

##------------------------------------------------------------------------
##------------------------------------------------------------------------
##      OPTIMIZED GSEA FUNCTIONS FOR REGULONS AND TNETS
##------------------------------------------------------------------------
##------------------------------------------------------------------------

##------------------------------------------------------------------------
##This function computes observed and permutation-based scores from 
##a gene set enrichment analysis for a collection of regulons
gsea1tna <- function(listOfRegulons, phenotype, exponent=1, 
                     nPermutations=1000,verbose=TRUE) {	 
  #OBS1:names provided as integer values!
  #OBS2:max/min sizes checked in the previous functions!
  pheno.names <- as.integer(names(phenotype))
  nRegulons <- length(listOfRegulons)
  if(nRegulons > 0) {
    ##Generate a matrix to store the permutation-based scores, with 
    ##one row for each gene set (that has been tagged) and one column 
    ##for each permutation	
    permScores <- matrix(rep(0, (nPermutations * nRegulons)), nrow=nRegulons)
    rownames(permScores) <- names(listOfRegulons)
    ##Generate a vector to store the experimental scores
    ##one entry for each gene set (that has been tagged)
    observedScores <- rep(0, nRegulons)
    names(observedScores) <- names(listOfRegulons)
    ##Compute the scores	
    ##create permutation gene list
    perm.gL <- sapply(1:nPermutations, function(n) pheno.names[
      sample(1:length(phenotype), length(phenotype),replace=FALSE)])
    perm.gL<-cbind(pheno.names,perm.gL)
    ##check if package snow has been loaded and a cluster object 
    ##has been created for HTSanalyzeR
    if(isParallel() && nRegulons>1) {
      if(verbose)
        cat("-Performing GSEA (parallel version - ProgressBar disabled)...\n")
      if(verbose)
        cat("--For", length(listOfRegulons), "regulons...\n")  
      cl<-getOption("cluster")
      snow::clusterExport(cl, list("gseaScoresBatch4RTN"), envir=environment())
      scores <- snow::parSapply(cl, 1:length(listOfRegulons), function(i){
        gseaScoresBatch4RTN(geneList=phenotype, geneNames.perm=perm.gL, 
                            geneSet=listOfRegulons[[i]], exponent=exponent,
                            nPermutations=nPermutations)
      })
      for(i in 1:nRegulons){
        permScores[i,]<-unlist(scores["permScores",i])
        observedScores[i]<-unlist(scores["observedScores",i])
      }
    } else {
      if(verbose) cat("-Performing gene set enrichment analysis...\n")
      if(verbose) cat("--For", length(listOfRegulons), "regulons...\n")
      if(verbose) pb <- txtProgressBar(style=3)
      for(i in 1:nRegulons) {
        scores <- gseaScoresBatch4RTN(geneList=phenotype, 
                                      geneNames.perm=perm.gL, 
                                      geneSet=listOfRegulons[[i]],
                                      exponent=exponent,
                                      nPermutations=nPermutations)
        observedScores[i] <- scores$observedScores
        permScores[i,] <- scores$permScores
        if(verbose) setTxtProgressBar(pb, i/nRegulons)
      }	
      if(verbose) close(pb)
    }
  } else {
    observedScores <- NULL
    permScores <- NULL
  }
  return(list("Observed.scores" = observedScores , 
              "Permutation.scores" = permScores))	
}

##------------------------------------------------------------------------
##This function computes observed and permutation-based scores 
gsea2tna <- function(listOfRegulons, phenotype, exponent=1, 
                     nPermutations=1000, verbose1=TRUE, 
                     verbose2=TRUE) {   
  pheno.names <- as.integer(names(phenotype))
  nRegulons <- length(listOfRegulons)
  if(nRegulons > 0){
    permScores <- matrix(rep(0, (nPermutations * nRegulons)), nrow=nRegulons)
    rownames(permScores) <- names(listOfRegulons)
    observedScores <- rep(0, nRegulons)
    names(observedScores) <- names(listOfRegulons)
    perm.gL <- sapply(1:nPermutations, function(n) pheno.names[
      sample(1:length(phenotype), length(phenotype),replace=FALSE)])
    perm.gL<-cbind(pheno.names,perm.gL)
    if(isParallel() && nRegulons>1){
      if(verbose1 && verbose2)
        cat("-Performing two-tailed GSEA (parallel version - ProgressBar disabled)...\n")
      if(verbose1 && verbose2)
        cat("--For", length(listOfRegulons), "regulons...\n") 
      cl<-getOption("cluster")
      snow::clusterExport(cl, list("gseaScoresBatch4RTN"), envir=environment())
      scores <- snow::parSapply(cl, 1:length(listOfRegulons), function(i){
        gseaScoresBatch4RTN(geneList=phenotype, geneNames.perm=perm.gL, 
                            geneSet=listOfRegulons[[i]], exponent=exponent,
                            nPermutations=nPermutations)
      })
      for(i in 1:nRegulons){
        permScores[i,]<-unlist(scores["permScores",i])
        observedScores[i]<-unlist(scores["observedScores",i])
      }
    } else {
      if(verbose1 && verbose2) 
        cat("-Performing two-tailed GSEA analysis...\n")
      if(verbose1 && verbose2) 
        cat("--For", length(listOfRegulons), "regulons...\n")
      if(verbose1) pb <- txtProgressBar(style=3)
      for(i in 1:nRegulons) {
        scores <- gseaScoresBatch4RTN(geneList=phenotype, 
                                      geneNames.perm=perm.gL, 
                                      geneSet=listOfRegulons[[i]],
                                      exponent=exponent, 
                                      nPermutations=nPermutations)
        observedScores[i] <- scores$observedScores
        permScores[i,] <- scores$permScores
        if(verbose1) setTxtProgressBar(pb, i/nRegulons)
      }	
      if(verbose1) close(pb)
    }
  } else {
    observedScores <- NULL
    permScores <- NULL
  }
  return(list("Observed.scores" = observedScores , 
              "Permutation.scores" = permScores))	
}

##------------------------------------------------------------------------
##This function gets rid of the duplicates in a gene list.
tna.duplicate.remover <- function(phenotype, method = "max") {
  ##Get the unique names and create a vector that will store the 
  ##processed values corresponding to those names
  pheno.names <- names(phenotype)
  datanames <- unique(pheno.names)
  data.processed <- rep(0, length(datanames))
  names(data.processed) <- datanames
  data.processed2 <- data.processed
  l.data.processed <- length(data.processed)	
  ##If the absolute value of the min is bigger than the absolute 
  ##value of the max, then it is the min that is kept
  if(method=="max") {
    abmax<-function(x){
      imax<-max(x);imin<-min(x)
      ifelse(imax>abs(imin),imax,imin)
    }
    tp<-aggregate(phenotype,by=list(names(phenotype)),abmax)
    data.processed<-tp[,2]
    names(data.processed)<-tp[,1] 
  } else if(method=="min") {
    abmin<-function(x){
      imax<-max(x);imin<-min(x)
      ifelse(imax>abs(imin),imin,imax)
    }
    tp<-aggregate(phenotype,by=list(names(phenotype)),abmin)
    data.processed<-tp[,2]
    names(data.processed)<-tp[,1] 
  } else if(method=="average") {
    tp<-aggregate(phenotype,by=list(names(phenotype)),mean)
    data.processed<-tp[,2]
    names(data.processed)<-tp[,1]
  }	
  data.processed
}

##------------------------------------------------------------------------
##This function returns the nominal p-value for the GSEA results
tna.perm.pvalues <- function(permScores, observedScores){
	nPerm <- ncol(permScores)
	pval <- rep(1, length(observedScores))
	##check how many permScores are higher (in magnitude) than
	##the observedScores; done separately for negative and positive
	##scores; pseudocounts are added to avoid P-values of zero.
	valid.id <- which(!is.na(observedScores) & observedScores!=0)
	for(i in valid.id) {
	  pval[i]<-ifelse(
			observedScores[i] > 0,
			(sum(permScores[i, ] > observedScores[i])+1)/(nPerm+1),
			(sum(permScores[i, ] < observedScores[i])+1)/(nPerm+1)
		)
	}
	names(pval)<-names(observedScores)		
	return(pval)
}

#--------------------------------------------------------------------
##This function performs hypergeometric tests for over-representation 
##of hits, on a list of gene sets.
hyperGeoTest4RTN <- function(collectionOfGeneSets, universe, hits, 
                              minGeneSetSize = 15, pAdjustMethod = "BH", 
                              verbose = TRUE) {
  geneset.size <- lapply(lapply(collectionOfGeneSets, intersect, y=universe), 
                         length)
  geneset.size <- unlist(geneset.size)
  geneset.filtered <- which(geneset.size >= minGeneSetSize)
  l.GeneSet <- length(geneset.filtered)
  if(verbose) pb <- txtProgressBar(style=3)
  results <- t(
    sapply(geneset.filtered, 
           function(i) {
             if(verbose) setTxtProgressBar(pb, i/l.GeneSet)		
             .hyperGeoTest4RTN(collectionOfGeneSets[i], universe, hits)
           }
    )
  )
  if(verbose) close(pb)
  if(length(results) > 0) {
    adjPvals <- p.adjust(results[, "Pvalue"], method = pAdjustMethod)
    results <- cbind(results, adjPvals)
    colnames(results)[ncol(results)] <- "Adjusted.Pvalue"
    results <- results[order(results[, "Adjusted.Pvalue"]), , drop=FALSE]		
  } else {
    reuslts <- matrix(, nrow=0, ncol=7)
    colnames(results) <- c("Universe Size", "Gene Set Size", 
                           "Total Hits", "Expected Hits", "Observed Hits", 
                           "Pvalue", "Adjusted.Pvalue")
  }
  return(results)
}
.hyperGeoTest4RTN <- function(geneSet, universe, hits) {
  #number of genes in universe
  N <- length(universe) 			
  #remove genes from gene set that are not in universe			
  geneSet <- intersect(geneSet[[1]], universe) 
  #size of gene set	
  m <- length(geneSet) 							
  Nm <- N-m	
  #hits in gene set
  overlap <- intersect(geneSet, hits) 	
  #number of hits in gene set		
  k <- length(overlap) 							
  n <- length(hits)	
  HGTresults <- phyper(k-1, m, Nm, n, lower.tail = FALSE)
  ex <- (n/N)*m
  if(m == 0) HGTresults <- NA
  hyp.vec <- c(N, m, n, ex, k, HGTresults)
  names(hyp.vec) <- c("Universe Size", "Gene Set Size", "Total Hits", 
                      "Expected Hits", "Observed Hits", "Pvalue")
  return(hyp.vec)
}

#--------------------------------------------------------------------
##This function returns enrichment scores, running scores, and 
##position of hits for a gene set.
gseaScores4RTN <- function(geneList, geneSet, exponent=1, 
                           mode=c("score","runningscores")) {
  if( is.character(geneSet) || is.null(names(geneSet)) ){
    geneSetType<-rep(1,length(geneSet))
    names(geneSetType)<-geneSet
  } else {
    geneSetType<-geneSet
    geneSet<-names(geneSet)
  }
  geneList <- geneList[!is.na(geneList)]
  geneSet<-intersect(names(geneList), geneSet)
  nh <- length(geneSet)
  N <- length(geneList)
  ES <- 0
  Phit <- rep(0, N)
  Pmiss <- rep(0, N)
  runningES <- rep(0, N)
  hits <- rep(FALSE, N)
  hitsType <- rep(0, N)
  hits[which(!is.na(match(names(geneList), geneSet)))] <- TRUE
  hitsType[which(!is.na(
    match(names(geneList), names(geneSetType[geneSetType>0]))))] <- 1
  hitsType[which(!is.na(
    match(names(geneList), names(geneSetType[geneSetType<0]))))] <- -1
  if(sum(hits)!=0){
    Phit[which(hits)]<-abs(geneList[which(hits)])^exponent
    NR=sum(Phit)
    Pmiss[which(!hits)]<-1/(N-nh)
    Phit=cumsum(Phit/NR)
    Pmiss=cumsum(Pmiss)
    runningES<-Phit-Pmiss
    runningES[is.nan(runningES)] <- 0
    ESmax<-max(runningES)
    ESmin<-min(runningES)
    ES<-ifelse(abs(ESmin)>abs(ESmax), ESmin, ESmax)
  }
  if(mode[1]=="score"){
    return(ES)
  } else if(mode[1]=="runningscores"){
    #for a hit at extreme positions, set zero to avoid a misleading plot
    runningES[1] <- 0; runningES[length(runningES)] <- 0
    return(list("enrichmentScore"=ES, "runningScore"=runningES, 
                "positions"=hitsType))
  }
}

#--------------------------------------------------------------------
##This function computes observed and permutation enrichment scores 
gseaScoresBatch4RTN <- function(geneList, geneNames.perm, geneSet, 
                                exponent=1, nPermutations=1000) {
  nh <- length(geneSet)
  N <- length(geneList)
  if(N>0 && nh>0){
    geneSet <- as.numeric(geneSet)
    Phit <- Pmiss <- matrix(0,nrow=N,ncol=nPermutations+1)
    runningES <- NULL
    for(i in 1:(nPermutations+1)){
      idx <- geneNames.perm[geneSet,i]
      Phit[idx, i] <- abs(geneList[idx])^exponent
      Pmiss[-idx, i] <- 1/(N-nh)
    }
    NR <- colSums(Phit)
    Phit <- sapply(1:(nPermutations+1), function(i) cumsum(Phit[, i])/NR[i])		
    Pmiss <- sapply(1:(nPermutations+1), function(i) cumsum(Pmiss[, i]))
    runningES <- Phit-Pmiss		
    runningES[is.nan(runningES)] <- 0
    ES <- sapply(1:(nPermutations+1), function(i){
      tp <- runningES[,i]
      tp[which.max(abs(tp))]	
    })
  } else {
    ES <- rep(0,nPermutations+1)
  }
  ES <- list(observedScores=ES[1], permScores=ES[2:(nPermutations+1)])
  return(ES)	
}

Try the RTN package in your browser

Any scripts or data that you put into this service are public.

RTN documentation built on Nov. 12, 2020, 2:02 a.m.