title: "Power analysis for mvIC"
subtitle: ''
author: "Developed by Gabriel Hoffman"
date: "Run on r Sys.time()
"
documentclass: article
output:
html_document:
toc: true
smart: false
vignette: >
%\VignetteIndexEntry{Gene set enrichment from genomic intervals}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
%\usepackage[utf8]{inputenc}
suppressPackageStartupMessages(library(knitr)) suppressPackageStartupMessages(library(data.table)) suppressPackageStartupMessages(library(mvIC)) suppressPackageStartupMessages(library(variancePartition)) library(Matrix) library(mvtnorm) library(TAS) library(corpcor) library(ShrinkCovMat) library(beam) library(sparseMVN) options(xtable.type="html") setDTthreads(3, restore_after_fork=FALSE) knitr::opts_chunk$set( echo=FALSE, warning=FALSE, message=FALSE, error = FALSE, tidy = FALSE, cache = TRUE, cache.lazy = FALSE, dev = c("png", "pdf"), fig.width=7, fig.height=7) options(markdown.HTML.stylesheet = 'css/custom.css')
do EpiMap GEUVADIS, SEQC, GTEX, CMCv2 + simulations add single cell, and EHR Compare to summing standard BIC
Simulation, show probability of selection true model increases with number of features
Compare with "standard" Multivariate AIC, AICc, BIC for n > p
with high signam, why do other perform better? Is that an issue with the randomization of the simulations???
add AICc from see Yanagihara, et al. 2015 # doi:10.1214/15-EJS1022
As expected with p > n, sum BIC is best method when Sigma is almost identity can I fix that with adjusting gdf?
devtools::reload("/Users/gabrielhoffman/workspace/repos/mvIC")
n = 20 p = 300 m_active = 1 m_total = 10 rho = .8 sigFrac = .5 X = matrix(rnorm(m_total*n), n, m_total) colnames(X) = paste0('X_', 1:m_total) trueSet = c('1', sort(colnames(X)[1:m_active])) # simulate coefficient for each gene beta = matrix(rnorm(p*m_active, 0, 10), m_active,p) eta = X[,1:m_active,drop=FALSE] %*% beta trueModel = paste('Y ~', paste(colnames(X)[1:m_active], collapse=' + ')) n_clusters = 50 # changes correlation structure of noise. clustID = sample.int(n_clusters, p, prob = dexp(1:n_clusters, .5), replace=TRUE) sigList = lapply(unique(clustID), function(id){ count = sum(clustID == id) Sigma = matrix(rho, count, count) diag(Sigma) = 1 Sigma }) Sigma = bdiag(sigList) # Sigma = matrix(rho, p, p) # diag(Sigma) = 1 # Sigma = AR1(p, rho) Noise = rmvn.sparse(n, mu = rep(0, p), CH=Cholesky(Sigma), prec=FALSE) fctr = mean(apply(eta,1, var)) * (1-sigFrac)/sigFrac Y = eta + Noise*fctr fit = lm(as.formula(trueModel), data=as.data.frame(X)) mvIC(fit, criterion = "BIC", shrink.method="EB") modelSearch = bestSubsetSearch(Y, as.data.frame(X), colnames(X), maxk=3) modelSearch = data.table(modelSearch) df = modelSearch[,data.frame(rank = rank(score), isTrue = form==trueModel) ,by=c('method', 'criterion')] df[isTrue==TRUE,1:3]
set.seed(2) res = lapply( c(seq(10, 90, by=10), seq(100, 1100, by=200)), function(p){ cat(p, "\n") n = 50 m_active = 1 m_total = 10 rho = .8 sigFrac = .6 X = matrix(rnorm(m_total*n), n, m_total) colnames(X) = paste0('X_', 1:m_total) trueSet = c('1', sort(colnames(X)[1:m_active])) # simulate coefficient for each gene beta = matrix(rnorm(p*m_active, 0, 10), m_active,p) eta = X[,1:m_active,drop=FALSE] %*% beta trueModel = paste('Y ~', paste(colnames(X)[1:m_active], collapse=' + ')) n_clusters = 5 # changes correlation structure of noise. clustID = sample.int(n_clusters, p, prob = dexp(1:n_clusters, .5), replace=TRUE) sigList = lapply(unique(clustID), function(id){ count = sum(clustID == id) Sigma = matrix(rho, count, count) diag(Sigma) = 1 Sigma }) Sigma = bdiag(sigList) # Sigma = matrix(rho, p, p) # diag(Sigma) = 1 # Sigma = AR1(p, rho) Noise = rmvn.sparse(n, mu = rep(0, p), CH=Cholesky(Sigma), prec=FALSE) fctr = mean(apply(eta,1, var)) * (1-sigFrac)/sigFrac Y = eta + Noise*fctr fit = lm(as.formula(trueModel), data=as.data.frame(X)) residMatrix = mvIC:::getResids(fit) # modelSearch = bestSubsetSearch(Y, as.data.frame(X), colnames(X), maxk=2) # modelSearch = data.table(modelSearch) # df = modelSearch[,data.frame(rank = rank(score), isTrue = form==trueModel) ,by=c('method', 'criterion')] # df[isTrue==TRUE,1:3] data.frame(p,# rank=df[isTrue==TRUE,1:3][2,3], custom = mvIC(fit, criterion = "BIC", shrink.method="EB")@params$lambda, Strimmer = attr(cov.shrink(t(mvIC:::getResids(fit)), lambda.var=0, verbose=FALSE), "lambda"), Touloumis = shrinkcovmat.unequal(mvIC:::getResids(fit))$lambdahat, eb_cov_est = mvIC:::eb_cov_est( t(residMatrix) )$alpha, eb_cov_est2 = mvIC:::eb_cov_est2( t(residMatrix) )$alpha, eb_cov_est3 = mvIC:::eb_cov_est3( t(residMatrix) )$alpha, beam = beam::beam(t(mvIC:::getResids(fit)), verbose=FALSE )@alphaOpt) # estimateMVN_EB = mvIC:::estimateMVN_EB( t(residMatrix) )$alpha, #, gcShrink = gcShrink((mvIC:::getResids(fit)), plot=FALSE)$optimalpha ) }) res = do.call(rbind, res) mvIC(fit, criterion = "BIC", shrink.method="EB")@params$lambda mvIC:::eb_cov_est(t(mvIC:::getResids(fit)))$alpha mvIC:::eb_cov_est((mvIC:::getResids(fit)))$alpha mvIC:::estimateMVN_EB(t(mvIC:::getResids(fit)))$alpha mvIC:::estimateMVN_EB((mvIC:::getResids(fit)))$alpha mvIC:::eb_cov_est3(t(mvIC:::getResids(fit)))$alpha mvIC:::eb_cov_est3((mvIC:::getResids(fit)))$alpha # # modelSearch = bestSubsetSearch(Y, as.data.frame(X), colnames(X), maxk=2) # # modelSearch = data.table(modelSearch) # # df = modelSearch[,data.frame(rank = rank(score), isTrue = form==trueModel) ,by=c('method', 'criterion')] # # df[isTrue==TRUE,1:3] # Y = rmvn.sparse(n, mu = rep(0, p), CH=Cholesky(Sigma), prec=FALSE) # # Y = rmvnorm(n, sig=as.matrix(Sigma)) # fit = lm(Y ~ 1, data=as.data.frame(X)) # # lambda = mvIC(fit, criterion = "BIC", shrink.method="EB")@params$lambda # # b = beam::beam(t(mvIC:::getResids(fit)), verbose=FALSE ) # # b@alphaOpt # # b@valOpt # SigTrue = as.matrix(Sigma) # residMatrix = mvIC:::getResids(fit) # diag(cov(t(residMatrix))) # tst1 = mvIC:::estimateMVN_EB( (t(residMatrix)), MAP=TRUE ) # tst2 = gcShrink( t((t(residMatrix))), var=3, cor=1, plot=FALSE) # tst3 = shrinkcovmat.unequal( residMatrix ) # data.frame(p, # estimateMVN_EB = norm(SigTrue - tst1$Sigmahat, "F"), # alpha1 = tst1$alpha, # gcShrink = norm(SigTrue - tst2$sigmahat, "F"), # alpha2 = tst2$optimalpha, # shrinkcovmat.unequal = norm(SigTrue - tst3$Sigmahat, "F"), # alpha3 = tst3$lambdahat) # plot(SigTrue, tst3$Sigmahat) # diag(tst3$Sigmahat) # }) # res = do.call(rbind, res) # data.frame(lambda, p, rank=df[isTrue==TRUE,1:3][2,3], # Strimmer = attr(cov.shrink(t(mvIC:::getResids(fit)), lambda.var=0, verbose=FALSE), "lambda"), # Touloumis = shrinkcovmat.unequal(mvIC:::getResids(fit))$lambdahat, # eb_cov_est = mvIC:::eb_cov_est( t(residMatrix) )$alpha, # eb_cov_est2 = mvIC:::eb_cov_est2( t(residMatrix) )$alpha, # eb_cov_est3 = mvIC:::eb_cov_est3( t(residMatrix) )$alpha, # beam = b@alphaOpt) # # estimateMVN_EB = mvIC:::estimateMVN_EB( t(residMatrix) )$alpha, # #, gcShrink = gcShrink((mvIC:::getResids(fit)), plot=FALSE)$optimalpha ) # }) # res = do.call(rbind, res) # nu = alpha/(1-alpha) * n # x = 194007.63 # p = 1000 # # x = 30 # # p = 40 # f = function(x,p){ # p*(p-1)/4*log(pi) + sum(sapply(1:p, function(j) lgamma(x + (1-j)/2))) # } # CholWishart::lmvgamma(x,p) # f(x,p) # CholWishart::lmvgamma(x,p) - CholWishart::lmvgamma(x+1,p) # f(x,p) - f(x+1,p) # lmvgamma_diff = function(x,x2, p){ # sum(sapply(1:p, function(j) lgamma(x + (1-j)/2))) - sum(sapply(1:p, function(j) lgamma(x2 + (1-j)/2))) # } # lmvgamma_diff(x, x+1,p) # fit = lm(as.formula(trueModel), data=as.data.frame(X)) # fitB = lm(Y ~ X_3, data=as.data.frame(X)) # a = mvIC(fit, shrink.method="EB", criterion = "BIC") # b = mvIC(fitB, shrink.method="EB", criterion = "BIC") # a < b # fit = lm(as.formula(trueModel), data=as.data.frame(X)) # fitB = lm(Y ~ X_3, data=as.data.frame(X)) # a = mvIC(fit, shrink.method="EB", criterion = "BIC") # b = mvIC(fitB, shrink.method="EB", criterion = "BIC") # a < b # mvIC:::estimateMVN_EB( t(mvIC:::getResids(fit)) ) # mvIC:::test_run( t(mvIC:::getResids(fit)) ) # eb_cov_est3( t(mvIC:::getResids(fit)), MAP=FALSE ) # mvIC:::eb_cov_est2( t(mvIC:::getResids(fit)), MAP=FALSE ) # mvIC:::eb_cov_est( t(mvIC:::getResids(fit)) ) # b = beam::beam(t(mvIC:::getResids(fit)) ) # str(b) # res = mvIC:::eb_cov_est2(t(mvIC:::getResids(fit))) # str(res) # res = mvIC:::eb_cov_est3(t(mvIC:::getResids(fit))) # str(res) # # cov(t(mvIC:::getResids(fit))) # str(TAS::gcShrink(t(mvIC:::getResids(fit)))) # str(eb_cov_est2(t(mvIC:::getResids(fit)))) # str(eb_cov_est3(t(mvIC:::getResids(fit)))) # The lambda from var_unequal seems to be better, but is not a likelihood estimate # How does scaling affect the log-likelihood?? # mvIC:::eb_cov_est(t(residMatrix)) # mvIC:::eb_cov_est(t(scale(residMatrix)))
set.seed(2) res = lapply( c(seq(10, 90, by=10), seq(100, 1100, by=50)), function(p){ cat(p, "\n") n = 20 m_active = 1 m_total = 10 rho = 0 sigFrac = .6 X = matrix(rnorm(m_total*n), n, m_total) colnames(X) = paste0('X_', 1:m_total) trueSet = c('1', sort(colnames(X)[1:m_active])) # simulate coefficient for each gene beta = matrix(rnorm(p*m_active, 0, 10), m_active,p) eta = X[,1:m_active,drop=FALSE] %*% beta trueModel = paste('Y ~', paste(colnames(X)[1:m_active], collapse=' + ')) n_clusters = 5 # changes correlation structure of noise. clustID = sample.int(n_clusters, p, prob = dexp(1:n_clusters, .5), replace=TRUE) sigList = lapply(unique(clustID), function(id){ count = sum(clustID == id) Sigma = matrix(rho, count, count) diag(Sigma) = 1 Sigma }) Sigma = bdiag(sigList) # Sigma = matrix(rho, p, p) # diag(Sigma) = 1 # Sigma = AR1(p, rho) Noise = rmvn.sparse(n, mu = rep(0, p), CH=Cholesky(Sigma), prec=FALSE) fctr = mean(apply(eta,1, var)) * (1-sigFrac)/sigFrac # Y = eta + Noise*fctr Y = Noise fit = lm(as.formula(trueModel), data=as.data.frame(X)) residMatrix = mvIC:::getResids(fit) tst1 = estimateMVN_EB( (t(residMatrix)), MAP=TRUE ) # tst2 = gcShrink( t((t(residMatrix))), var=3, cor=1, plot=FALSE) tst2 = eb_cov_est( (residMatrix)) tst3 = shrinkcovmat.unequal( residMatrix ) SigTrue = as.matrix(Sigma) Target = diag(apply(residMatrix, 1, var)) Omega = cov(t(residMatrix)) x = seq(1e-3, 1-1e-3, length.out=100 ) y_norm = sapply(x, function(alpha){ Sig_hat = (1-alpha) * Omega + alpha * Target norm(SigTrue - Sig_hat, "F") }) alphaOpt = x[which.min(y_norm)] data.frame(p, none = norm(SigTrue, "F"), estimateMVN_EB = norm(SigTrue - tst1$Sigmahat, "F"), alpha1 = tst1$alpha, # gcShrink = norm(SigTrue - tst2$sigmahat, "F"), alpha2 = tst2$alpha, shrinkcovmat.unequal = norm(SigTrue - tst3$Sigmahat, "F"), alpha3 = tst3$lambdahat, Oracle = min(y_norm), alpha4 = alphaOpt) }) res = do.call(rbind, res)
--->
library(poolr) library(mvIC) library(ggplot2) library(mvtnorm) library(data.table) library(CVTuningCov) library(Matrix) library(gridExtra) library(TAS) set.seed(2) n_reps = 5 n = 50 # number of samples m_total = 10 # total number of variables n_clusters = 5 m_active_array = 1 signalFraction = .4 #seq(.2, .8, by=.2) ngenes_array = 1000 #c(10, 20, 30, 40, 60, 80, 100, 200, 1000) rho_array = seq(0, .98, length.out=5) resSearch = lapply(1:n_reps, function(k){ resSearch = lapply(m_active_array, function(m_active){ # simulate variables X = matrix(rnorm(m_total*n), n, m_total) colnames(X) = paste0('X_', 1:m_total) trueSet = c('1', sort(colnames(X)[1:m_active])) resSearch = lapply(ngenes_array, function(p){ # simulate coefficient for each gene beta = matrix(rnorm(p*m_active, 0, 10), m_active,p) # beta[sample.int(length(beta), length(beta)*.5)] = 0 eta = X[,1:m_active,drop=FALSE] %*% beta # changes correlation structure of noise. clustID = sample.int(n_clusters, p, prob = dexp(1:n_clusters, .5), replace=TRUE) resSearch = lapply( rho_array, function(rho){ sigList = lapply(unique(clustID), function(id){ count = sum(clustID == id) Sigma = matrix(rho, count, count) diag(Sigma) = 1 Sigma }) Sigma = bdiag(sigList) # Sigma = matrix(rho, p, p) # diag(Sigma) = 1 # Sigma = AR1(p, rho) Noise = rmvn.sparse(n, mu = rep(0, p), CH=Cholesky(Sigma), prec=FALSE) resSearch = lapply(signalFraction, function(sigFrac){ fctr = mean(apply(eta,1, var)) * (1-sigFrac)/sigFrac Y = eta + Noise*fctr # cor(eta[1,], eta[1,] + Noise[1,]*fctr)^2 message("\rk = ", k, ' p = ', p, ' rho = ', round(rho,2), ' sigFrac = ', sigFrac, ' m_active = ', m_active, ' ') modelSearch = bestSubsetSearch(Y, as.data.frame(X), colnames(X), maxk=3) modelSearch = data.table(modelSearch) trueModel = paste('Y ~', paste(colnames(X)[1:m_active], collapse=' + ')) # get best model # modelSearch[,form[which.min(score)],by=c('method', 'criterion')] # get best model df = modelSearch[,data.frame(rank = rank(score), isTrue = form==trueModel) ,by=c('method', 'criterion')] # rank of best model res = df[isTrue==TRUE,1:3] res$k = k res$m_active = m_active res$p = p res$rho = rho res$sigFrac = sigFrac res }) do.call(rbind, resSearch) }) do.call(rbind, resSearch) }) do.call(rbind, resSearch) }) do.call(rbind, resSearch) }) resSearch = do.call(rbind, resSearch) resSearch = data.table(resSearch)
library(gridExtra) df = data.frame(resSearch[, data.frame(rank=mean(rank), sd=sd(rank)),by=c('method', 'criterion', 'm_active', 'p', 'rho', 'sigFrac')]) df$isNone = "no" df$isNone[grep("none", df$method)] = 'yes' maxRank = max(df$rank) figList = lapply(unique(df$sigFrac), function(v){ ggplot( subset(df, sigFrac == v), aes(rho, rank, color=paste(method, criterion), fill=paste(method, criterion))) + geom_point(aes(shape=isNone)) + geom_line() + facet_wrap( ~ m_active + p, nrow=2) + scale_color_discrete("Method") + scale_fill_discrete("Method") + theme_bw() + theme(aspect.ratio=1) + ggtitle(v) + scale_y_log10() # + geom_ribbon(aes(rho, ymin=pmax(rank-sd,0), ymax=pmin(rank+sd, maxRank)), alpha=.2, linetype=0) }) # pdf("~/www/mvBIC.pdf", width=12) do.call("grid.arrange", c(figList, nrow=1)) # dev.off()
knit_exit()
library(poolr) library(mvIC) library(ggplot2) library(mvtnorm) library(data.table) library(CVTuningCov) library(Matrix) library(gridExtra) library(TAS) set.seed(1) n_reps = 10 n = 50 # number of samples # p = 10 # number of genes # m_active = 2 # number of variables affecting phenotype m_total = 10 # total number of variables n_clusters = 5 m_active_array = 1#c(1, 2) signalFraction = .6#seq(.4, .8, by=.2) ngenes_array = c(100)#, 20, 40, 60, 80, 100, 200, 500) rho_array = seq(0, .98, length.out=3) # n_reps = 10 # n = 50 # number of samples # m_active_array = 1 # signalFraction = .65# seq(.5, .7, length.out=3) # ngenes_array = c(5, 20)#, 50, 100, 200)#, 25, 30, 40, 50) # rho_array = seq(0, .98, length.out=4) resRecovery = lapply(1:n_reps, function(k){ set.seed(k) resRecovery = lapply(m_active_array, function(m_active){ # simulate variables X = matrix(rnorm(m_total*n), n, m_total) colnames(X) = paste0('X_', 1:m_total) trueSet = c('1', sort(colnames(X)[1:m_active])) resRecovery = lapply(ngenes_array, function(p){ # simulate coefficient for each gene beta = matrix(rnorm(p*m_active, 0, 10), m_active,p) # beta[sample.int(length(beta), length(beta)*.5)] = 0 eta = X[,1:m_active,drop=FALSE] %*% beta # changes correlation structure of noise. clustID = sample.int(n_clusters, p, prob = dexp(1:n_clusters, .5), replace=TRUE) resRecovery = lapply( rho_array, function(rho){ sigList = lapply(unique(clustID), function(id){ count = sum(clustID == id) Sigma = matrix(rho, count, count) diag(Sigma) = 1 Sigma }) Sigma = as.matrix(bdiag(sigList)) # Sigma = matrix(rho, p, p) diag(Sigma) = 1 # Sigma = AR1(p, rho) Noise = rmvnorm(n, sig=Sigma) resRecovery = lapply(signalFraction, function(sigFrac){ fctr = mean(apply(eta,1, var)) * (1-sigFrac)/sigFrac Y = eta + Noise*fctr # cor(eta[1,], eta[1,] + Noise[1,]*fctr)^2 message("\rk = ", k, ' p = ', p, ' rho = ', round(rho,2), ' sigFrac = ', sigFrac, ' m_active = ', m_active, ' ') # save(list=ls(), file="test.RDATA") # Multivariate methods methods = c( "EB" ) #"var_equal", if( n > p){ methods = c(methods, "none") } criteria = c("AIC", "BIC") res = lapply( methods, function(method){ res = lapply( criteria, function(criterion){ bestModel = mvForwardStepwise( t(Y), ~1, X, colnames(X), criterion=criterion, verbose=FALSE, shrink.method=method) # test of selected set is the true set vars = subset(bestModel$trace, isAdded == "yes")$variable vars = as.character(vars) data.frame(criterion, method, recovery = identical( sort(vars), trueSet), rho = rho, k = k, p = p, sigFrac = sigFrac, m_active = m_active,stringsAsFactors=FALSE) }) do.call(rbind, res) }) res = do.call(rbind, res) # sum scores for each response resSum = lapply(c("AIC", "BIC"), function(criterion){ # fit naive model bestModelNaive = mvForwardStepwise( t(Y), ~1, X, colnames(X), criterion=paste("sum", criterion), verbose=FALSE, deltaCutoff=5) # test of selected set is the true set vars = subset(bestModelNaive$trace, isAdded == "yes")$variable vars = as.character(vars) result_naive = identical( sort(vars), trueSet) df_add = data.frame( criterion = "criterion", method = paste("sum", criterion), recovery = result_naive, rho = rho, k = k, p = p, sigFrac = sigFrac, m_active = m_active) }) resSum = do.call(rbind, resSum) rbind(res, resSum) }) do.call(rbind, resRecovery) }) do.call(rbind, resRecovery) }) do.call(rbind, resRecovery) }) do.call(rbind, resRecovery) }) resRecovery = do.call(rbind, resRecovery) resRecovery = data.table(resRecovery)
# summarize df = resRecovery[,data.frame(recoveryRate = sum(recovery)/length(recovery)),by=c('rho', 'p', 'sigFrac', 'm_active','method', 'criterion')] df$sd = with(df, sqrt(recoveryRate*(1-recoveryRate)/n_reps)) df$up = with(df, recoveryRate + sd) df$down = with(df, recoveryRate - sd) df = df[grep("var_", method, invert=TRUE),] # cols = c("red", "orange", "dodgerblue", "blue", "navy", "grey", "green", "black") figList = lapply(unique(df$sigFrac), function(v){ ggplot(subset(df, sigFrac==v), aes(rho, recoveryRate, color=paste(criterion, '-', method), fill=paste(criterion, '-', method))) + geom_ribbon(aes(ymin=down, ymax=up), alpha=.3, linetype=0) + geom_line() + geom_point() + scale_color_discrete("Method" ) + scale_fill_discrete("Method" ) + xlab(bquote(Correlation~(rho))) + ylab("Power to recover true model") + theme_bw() + theme(aspect.ratio=1) + ylim(0, 1) + facet_wrap(~m_active+p, ncol=length(ngenes_array) ) + ggtitle(v) }) # pdf("~/www/mvIC.pdf", width=12, height=120) do.call("grid.arrange", c(figList, ncol=1)) # dev.off()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.