R/champ.ebGSEA.R

Defines functions champ.ebGSEA

Documented in champ.ebGSEA

if(getRversion() >= "3.1.0") utils::globalVariables("PathwayList")

champ.ebGSEA <- function(beta=myNorm, 
                         pheno=myLoad$pd$Sample_Group, 
                         minN=5, 
                         adjPval=0.05, 
                         arraytype="450K", 
                         cores=1)
{
  #library(Hmisc);
  #library(globaltest);
  message("ebGSEA function require no NA in beta and pheno parameter.")
  
  mapEIDtoCpG <- function(beta,arraytype) {
    # if(arraytype=="EPIC"){
    #   message("  Extracting annotation from IlluminaHumanMethylationEPICilm10b4.hg19.")
    #   RSobject <- RatioSet(beta, annotation = c(array = "IlluminaHumanMethylationEPIC",annotation = "ilm10b4.hg19"))
    # }else{
    #   message("  Extracting annotation from IlluminaHumanMethylation450kilmn12.hg19.")
    #   RSobject <- RatioSet(beta, annotation = c(array = "IlluminaHumanMethylation450k",annotation = "ilmn12.hg19"))
    # }
    # RSanno <- as.data.frame(getAnnotation(RSobject))
    if(arraytype %in% c("EPIC", "EPICv2")) {
      data("probe.features.epicv2")
    } else if(arraytype == "EPICv1") {
      data("probe.features.epicv1")
    } else if(arraytype == "450K") { 
      data("probe.features")
    } else (
      stop("arraytype must be `EPICv2`, `EPICv1`, `450K`")
    )
    RSanno <- probe.features[rownames(beta), c("CHR", "MAPINFO", "gene", "feature")]
    colnames(RSanno) <- c("chr", "pos", "UCSC_RefGene_Name", "UCSC_RefGene_Group")
    
    
    message("  Removing Non-CG probes out of annotation.")
    ann.keep <- RSanno[substr(rownames(RSanno),1,2)=="cg" & RSanno$UCSC_RefGene_Name != "",]
    
    message("  Flat all genes on each CpG.")
    geneslist<-strsplit(ann.keep$UCSC_RefGene_Name,split=";")
    names(geneslist)<-rownames(ann.keep)
    
    grouplist<-strsplit(ann.keep$UCSC_RefGene_Group,split=";")
    names(grouplist)<-rownames(ann.keep)
    
    flat<-data.frame(symbol=unlist(geneslist),group=unlist(grouplist))
    flat$symbol<-as.character(flat$symbol)
    flat$group <- as.character(flat$group)
    
    flat$cpg<- substr(rownames(flat),1,10)
    
    flat$alias <- alias2SymbolTable(flat$symbol)
    
    #eg <- toTable(org.Hs.egSYMBOL2EG)
    #m <- match(flat$alias,eg$symbol)
    #flat$entrezid <- eg$gene_id[m]        #transform symbol to entrezid
    #flat <- flat[!is.na(flat$entrezid),]  #get rid of NA entrezid 
    
    message("  Removing all duplicated CpG-genes.")
    id<-paste(flat$cpg,flat$alias,sep=".")
    d <- duplicated(id)
    flat.u <- flat[!d,]
    
    message("  Annotation Prepared.")
    mapEIDtoCpG <- split(flat.u$cpg, flat.u$alias)
    return(mapEIDtoCpG)
  }
  
  gseaWTfn <- function(termEID.v,rankEID.v,minN=minN){
    commonEID.v <- intersect(termEID.v,rankEID.v);
    nrep <- length(commonEID.v);
    if(length(commonEID.v)>=minN){
      otherEID.v <- setdiff(rankEID.v,termEID.v);
      match(commonEID.v,rankEID.v) -> rank1.idx;
      match(otherEID.v,rankEID.v) -> rank2.idx;
      wilcox <- wilcox.test(rank1.idx,rank2.idx,alt="less")
      pv <- wilcox$p.value
      n1n2 <- nrep * length(otherEID.v)
      auc <- (n1n2-wilcox$statistic)/(n1n2)
      
      ## add kpmt here.
      pop.v <- 1:length(rankEID.v);
      names(pop.v) <- rankEID.v;
      obs.v <- commonEID.v
      pvKPMT <- kpmt::kpmt(pop=pop.v,obs=obs.v,tail="lower")[[4]]
      
      out.v <- c(nrep,auc,pv,pvKPMT);
    } else {
      out.v <- c(nrep,0,1,1);
    }
    return(out.v);
  }
  
  
  message("\n[ Section 1:  Generate Annotation Start ]\n")
  mapEID <- mapEIDtoCpG(beta,arraytype)
  message("\n[ Section 1:  Generate Annotation Done ]\n")
  
  
  message("\n[ Section 2:  Running Global Test Start ]\n")
  if(class(pheno)=="numeric")
  {
    message("  Applying Linear Model on Global Test. It could be very slow...")
    gt.o <- gt(response=pheno, 
               alternative=t(beta),
               model="linear",
               directional = FALSE,
               standardize = FALSE, 
               permutations = 0, 
               subsets=mapEID,
               trace=FALSE);
  } else {
    message("  Applying Binary Model on Global Test. It could be very slow...")
    gt.o <- gt(response=(as.numeric(as.factor(pheno))-1), 
               alternative=t(beta),
               model="logistic",
               directional = FALSE,
               standardize = FALSE, 
               permutations = 0, 
               subsets=mapEID,
               trace=FALSE);
  }
  resGT.m <- result(gt.o);
  tmp.s <- sort(resGT.m[,1],index.return=TRUE);
  sresGT.m <- resGT.m[tmp.s$ix,];
  message("\n[ Section 2:  Running Global Test Done ]\n")
  
  message("\n[ Section 3:  GSEA on Pathway Start ]\n")
  message("  Loading MsigDB PathwayList information.")
  data(PathwayList)
  
  message("  Doing Wilcox Test and Known Population Median Test, it could be slow here.")
  
  if(cores > 1)
  {
    if(cores > detectCores()) cores <- detectCores()
    message(cores," cores will be used to do parallel GSEA computing.")
    cl <- makeCluster(cores)
    registerDoParallel(cl)
    gseaWT.m <- foreach(x = 1:length(PathwayList), .combine = rbind, .export=c("gseaWTfn", "PathwayList", "sresGT.m", "minN")) %dopar% {
      gseaWTfn(termEID.v= PathwayList[[x]] , rankEID.v=rownames(sresGT.m),minN=minN)
    }
    stopCluster(cl)
  } else
  {
    gseaWT.m <- do.call(rbind,lapply(PathwayList,function(x) gseaWTfn(termEID.v=x, rankEID.v=rownames(sresGT.m),minN=minN)))
  }
  
  
  commonEID <- lapply(PathwayList, function(x) intersect(x, rownames(sresGT.m)))
  
  colnames(gseaWT.m) <- c("nREP","AUC","P","P(KPMT)")
  rownames(gseaWT.m) <- names(PathwayList);
  
  message("  Adjusting Pathway P value with BH method.")
  tmp.s <- sort(gseaWT.m[,3],decreasing=FALSE,index.return=TRUE);
  sgseaWT.m <- gseaWT.m[tmp.s$ix,];
  padj.v <- p.adjust(sgseaWT.m[,3],method="BH");
  message("  Your adjPval is ",adjPval, ", only pathway below BH adjusted P value 0.05 would be returned.")
  sel.idx <- which(padj.v <= adjPval);
  
  message("  Forming up final result.")
  if(length(sel.idx) > 1) {
    topGSEAwt.m <- cbind(sgseaWT.m[sel.idx,],padj.v[sel.idx]);
    colnames(topGSEAwt.m) <- c("nREP","AUC","P(WT)","P(KPMT)","adjP");
    
    topGSEAwt.lm <- list();
    topGSEAwt.lm[[1]] <- topGSEAwt.m;
    tmp.s <- sort(sgseaWT.m[,2],decreasing=TRUE,index.return=TRUE);
    topGSEAwt.lm[[2]] <- sgseaWT.m[tmp.s$ix,];
    names(topGSEAwt.lm) <- c("Rank(P)","Rank(AUC)");
  } else if (length(sel.idx)==1) {
    topGSEAwt.v <- as.vector(c(sgseaWT.m[sel.idx,],padj.v[sel.idx]));
    names(topGSEAwt.v) <- c("nREP","AUC","P(WT)","P(KPMT)","adjP");
    topGSEAwt.lm <- list("Rank(P)"=topGSEAwt.v,"Rank(AUC)"=topGSEAwt.v,"POI"=rownames(sgseaWT.m)[sel.idx]);
  } else {
    message("No significant pathway enriched. You may try relax the threshold like adjPva.")
    topGSEAwt.lm <- list();
  }
  
  message("\n[ Section 3:  GSEA on Pathway Done ]\n")
  
  return(list(GSEA=topGSEAwt.lm, EnrichGene=commonEID, gtResult=sresGT.m))
} 
YuanTian1991/ChAMP documentation built on Feb. 21, 2023, 1:13 p.m.