--
title: "Qusage: Speeding up AggregateGeneSet in RcppArmadillo"
author: "Timothy J. Triche, Jr, Anthony R. Colombo"
output:
pdf_document:
toc: true
number_sections: true
date: "r format(Sys.time(), '%d %B, %Y')
"
qusage is published software that is slow for large runs, SpeedSage corrects for speed and efficiency at large orders. there is Bottlenecking of Functions Qusage can improve the speed of its algorithm by minimizing the cost of computaiton.
This test will perform the computations in the apply functions in c++. for makeComparisons and calcVIF, speedSage assumes the Welch's approximation for variances between groups being unequal. Welch's approximation is used for aggregateGeneSet function. Previously the qusage.single/makeComparison in Armadillo/speedSage has a 1.3-1.48X speed up, aggregateGeneSet is the second bottleneck that must be accelerated. the speed gains is calculated by the differences of the sub-routeins called by the parent-routines between respective methods. it is linear.
library(inline) library(microbenchmark) library(Rcpp) library(parallel) library(speedSage) library(qusage) library(ggplot2) eset<-system.file("extdata","eset.RData",package="speedSage") load(eset) labels<-c(rep("t0",134),rep("t1",134)) contrast<-"t1-t0" colnames(eset)<-c(rep("t0",134),rep("t1",134)) fileISG<-system.file("extdata","c2.cgp.v5.1.symbols.gmt",package="speedSage") ISG.geneSet<-read.gmt(fileISG) geneSets<-ISG.geneSet[grepl("DER_IFN_GAMMA_RESPONSE_UP",names(ISG.geneSet))] #cpp functions sourceCpp(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/sigmaArm.cpp") sourceCpp(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/sigmaSingle.cpp") sourceCpp(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/bayesEstimation.cpp") sourceCpp(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/notbayesEstimation.cpp") sourceCpp(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/calcVIFarm.cpp") sourceCpp(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/calcVIFarmalt.cpp") sourceCpp(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/calcVIFarm_nosdalphaalt.cpp") sourceCpp(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/aggregategsSumSigma.cpp") source(file="/home/arcolombo/Documents/github_repos/SpeedSage/R/qusageArm.R") pairVector<-NULL var.equal<-FALSE filter.genes<-FALSE n.points<-2^12 #setting up aggregateGeneSetArm call objects results = makeComparisonArm(eset, labels, contrast, pairVector=pairVector,var.equal=var.equal) nu = floor(min(results$dof,na.rm=T)) defaultAggregate = aggregateGeneSet(results, geneSets, silent=F, n.points=n.points) geneResults<-results NumSDs<-abs(qt(10^-10,1:250)) NumSDs[NumSDs>750] = 750 Means = geneResults$mean SD = geneResults$SD DOF=geneResults$dof COLS = names(Means) if(is.vector(geneSets) & !is.list(geneSets)){ n = deparse(substitute(geneSets)) geneSets = list(geneSets) names(geneSets) = n } if(is.null(names(geneSets))){names(geneSets) = 1:length(geneSets)} geneSets<- lapply(geneSets,function(x){ if(is.numeric(x)){ if(any(!(x %in% 1:length(COLS)))){stop("Numeric gene set indices out of bounds")} return(x) } which(COLS%in%x) }) myAg<-aggregategsSumSigma(results$SD, results$dof,geneSets)
I've tried to reduce the which command using Rcpp find but it was much slower
microbenchmark( aggregategsSumSigma(geneResults$mean,geneResults$SD, geneResults$dof,names( geneResults$mean),geneSets[[1]]), lapply(geneSets,function(x){ if(is.numeric(x)){ if(any(!(x %in% 1:length(COLS)))){stop("Numeric gene set indices out of bounds")} return(x) } which(COLS%in%x) }) )
min lq mean median uq max neval cld 2169.671 2197.8315 2250.4651 2217.4730 2260.0395 3051.073 100 b 288.634 302.8425 314.9078 311.6455 322.9065 401.846 100 a so I've used a double for loop to manually find index matches and used the C++ find method to find the index positions of names(geneResults$mean) %in% geneSets[[1]], and both of these functions were 10X slower in C++ compared to lapply. I will abandon my attempts here.
#guts of the function, comparing the direct methods mb<-microbenchmark( sapply(names(geneSets),function(i){ Indexes = geneSets[[i]] x<-sqrt(sum((SD^2*(DOF/(DOF-2)))[Indexes])) return(x) }), MinDof<-sapply(names(geneSets),function(i){ Indexes = geneSets[[i]] if(length(Indexes)==0){return(NA)} return(floor(min(DOF[Indexes]))) }), SumSigma_MinDof<-aggregategsSumSigma(geneResults$SD,geneResults$dof,geneSets) , times=2500) #mb shows microseconds speed up #need to check accuracy, and need to show speed. #need to check the aggregateGeneSet.R versus aggregateGeneSetArm.R for 1 geneSet, and 3 geneSets, compare accuracy and speed. then wrap up. source("/home/arcolombo/Documents/github_repos/SpeedSage/R/aggregateGeneSet.R") source("/home/arcolombo/Documents/github_repos/SpeedSage/R/aggregateGeneSetArm.R") #accuracy defaultAggregate = aggregateGeneSet(results, geneSets, silent=F, n.points=n.points) myAg<-aggregateGeneSetArm(results, geneSets, silent=F,n.points=n.points, armadillo=TRUE) defAg<-aggregateGeneSetArm(results, geneSets, silent=F,n.points=n.points, armadillo=FALSE) singleMb<-microbenchmark( myAg<-aggregateGeneSetArm(results, geneSets, silent=F,n.points=n.points, armadillo=TRUE), defaultAggregate = aggregateGeneSet(results, geneSets, silent=F, n.points=n.points),times=250) #single gene set no speed pu singleMb library(ggplot2) autoplot(singleMb) geneSets[2]<-geneSets[1] geneSets[3]<-geneSets[1] multiSets<-geneSets multiMb<-microbenchmark( myAg<-aggregateGeneSetArm(results, multiSets, silent=F,n.points=n.points, armadillo=TRUE), defaultAggregate = aggregateGeneSet(results, multiSets, silent=F, n.points=n.points),times=250) multiMb #the gain is only in terms of microseconds, although additive, it is not noticeable. autoplot(multiMb) #the grouping of the PDF formation is also bottlenecking
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.