#' Evaluate difference in group trajectories based on the comparison of model fit (F-test) between one and two groups
#'
#' Evaluate the difference in group trajectories by executing a t-test based on the comparison of improvement in model fit \emph{(F-test)} between fitting one group mean curve to all individuals and fitting two group mean curves. This between-class differential evolution test, evaluates whether fitting 2 group curves decreases the residuals compared to a single group mean curve. The statistic employed is defined as a quantification of evidence for differential evolution, with the larger the statistic the more differentially evolving the variable appears to be. Individual group membership is repeatedly randomly permuted to generate new random groups and group mean curves, then employed to compute a \emph{Null} distribution of the statistic (improvement in model fit from one to two groups). The improvement in model fit for the real group membership is then compared to this \emph{Null} distribution \emph{(of no group difference)} and a \emph{p}-value is computed. Adapted from \cite{Storey and al. 'Significance analysis of time course microarray experiments', PNAS, 2005 [1]}.
#' \itemize{
#' \item The \emph{p}-value is calculated as \code{(b+1)/(nPerm+1)} as to not report a \emph{p}-value=0 (which would give problem with FDR correction) and reduce type I error.
#' \item The \emph{p}-value will vary depending on the random sampling. Therefore a confidence interval can be constructed using Wilson's interval which presents good properties for small number of trials and probabilities close to 0 or 1.
#' }
#'
#' @param SANTAObj A fitted \emph{SANTAObj} as generated by \code{\link{santaR_fit}}.
#' @param nPerm (int) Number of permutations. Default 1000.
#' @param alpha (float) Confidence Interval on the permuted \emph{p}-value \emph{(0.05 for 95\% Confidence Interval)}. Default 0.05.
#'
#' @return A \emph{SANTAObj} with added \emph{p}-value fit and confidence interval on the \emph{p}-value.
#'
#' @examples
#' ## 56 measurements, 8 subjects, 7 unique time-points
#' ## Default parameter values decreased to ensure an execution < 2 seconds
#' Yi <- acuteInflammation$data$var_3
#' ind <- acuteInflammation$meta$ind
#' time <- acuteInflammation$meta$time
#' group <- acuteInflammation$meta$group
#' grouping <- get_grouping(ind, group)
#' inputMatrix <- get_ind_time_matrix(Yi, ind, time)
#' SANTAObj <- santaR_fit(inputMatrix, df=5, grouping=grouping, verbose=TRUE)
#' SANTAObj <- santaR_pvalue_fit(SANTAObj, nPerm=100)
#'
#' @references [1] Storey, J. D., Xiao, W., Leek, J. T., Tompkins, R. G. & Davis, R. W. Significance analysis of time course microarray experiments. \emph{Proceedings of the National Academy of Sciences of the United States of America} \strong{102}, 12837-42 (2005).
#'
#' @family Analysis
#' @seealso Comparison with constant model with \code{\link{santaR_pvalue_fit_within}}
#'
#' @export
santaR_pvalue_fit <- function(SANTAObj,nPerm=1000,alpha=0.05) {
## COMMENT
# Ind .pred have the smoothness penalty, groupMeanCurve is therefore fitted at df=nbTP.
# as we permutate the Ind (.in and .pred), no need to refit @ original df
## FUNCTION
SSquare <- function(observed,fitted) {
## Sum of Square of the residuals across all individuals and timepoints
# observed = matrix IND x TIME
# fitted = predicted values for TIME (row vector)
SSq.ind <- apply( observed, 1, function(x) sum((x-fitted)^2,na.rm=TRUE) )
return(sum(SSq.ind))
}
stat.fit <- function(models) {
## Compares goodness of fit of the model under the null hypothesis to that under the alternative hypothesis
# models = list(mod.null, mod.alt)
SSq.null <- SSquare(models$mod.null$obs, models$mod.null$fit)
SSq.alt1 <- SSquare(models$mod.alt1$obs, models$mod.alt1$fit)
SSq.alt2 <- SSquare(models$mod.alt2$obs, models$mod.alt2$fit)
SSq.alt <- SSq.alt1 + SSq.alt2
if(SSq.alt==0) { # stop from dividing by zero
SSq.alt <- 10*.Machine$double.eps
if(SSq.null==0) { SSq.null <- 10*.Machine$double.eps } # stop from having a -1 when it should be 0
}
Ftest <- (SSq.null - SSq.alt)/SSq.alt
return(Ftest)
} # F-test statistic (goodness of fit)
between.null.sim.ind <- function(group1,group2) {
## Simulate NULL population by resampling the individuals
# init
n1 <- dim(group1$inp)[1]
n2 <- dim(group2$inp)[1]
n <- n1+n2
inp <- rbind(group1$inp, group2$inp)
pred <- rbind(group1$pred,group2$pred)
# random draw
sampler1 <- sample(1:n, size=n1, replace=TRUE)
sampler2 <- sample(1:n, size=n2, replace=TRUE)
# make groups
pop1 <- list( inp=inp[sampler1,], pred=pred[sampler1,] )
pop2 <- list( inp=inp[sampler2,], pred=pred[sampler2,] )
return(list(pop1=pop1,pop2=pop2))
} # Simulate NULL population by resampling individuals
fit.mean.curve <- function(group,df){
## Works on .pred values for each individual, that already has the smoothness penalty, so fit groupMeanCurve with df=nbTP
# The if condition shouldn't be necessary as no NA in .pred values
time <- as.numeric(colnames(group))
meanVal <- colMeans(group, na.rm=TRUE)
notNA <- !is.na(meanVal)
if( sum(notNA) >= df ) {
return(stats::smooth.spline( x=time[notNA], y=meanVal[notNA], df=sum(notNA) ))
} else {
return(NA)
}
} # return the fitted smooth.spline object for that group
fit.models <- function(population,df) {
## Fit both the NULL meanCurve, and the 2 ALTERNATE groupMeanCurve on a population IND x TIME
#
## Returns a list of models, observations and corresponding fitted values
#
## fit on .pred to save the step of ind prediction at each run of the function, obs on inp points for residuals calculation
group1 <- population$pop1
group2 <- population$pop2
# Fit NULL model
group.null <- rbind(group1$inp,group2$inp)
fit.null <- fit.mean.curve( rbind(group1$pred,group2$pred),df)
if(! inherits(fit.null, 'smooth.spline')) { return(NA) } # detect failure in fitting mean curve
model.null <- list(obs=group.null, fit=stats::predict(fit.null,y=as.numeric(colnames(group.null)))$y)
# Fit ALTERNATE model
fit.alt1 <- fit.mean.curve(group1$pred,df)
fit.alt2 <- fit.mean.curve(group2$pred,df)
if( (! inherits(fit.alt1, 'smooth.spline')) & (! inherits(fit.alt2, 'smooth.spline')) ) { return(NA) }
model.alt1 <- list(obs=group1$inp, fit=stats::predict(fit.alt1,y=as.numeric(colnames(group1$pred)))$y)
model.alt2 <- list(obs=group2$inp, fit=stats::predict(fit.alt2,y=as.numeric(colnames(group2$pred)))$y)
return(list(mod.null=model.null, mod.alt1=model.alt1, mod.alt2=model.alt2))
} # fit NULL and ALTERNATE models
## Init
if (length(SANTAObj$groups) != 2) {
message("Error: Check input, p-values can only be calculated on 2 groups")
stop("Error: Check input, p-values can only be calculated on 2 groups")
}
z <- stats::qnorm(1-0.5*alpha)
df <- SANTAObj$properties$df
groupData1 <- list( inp=SANTAObj$groups[[1]]$groupData.in, pred=SANTAObj$groups[[1]]$groupData.pred )
groupData2 <- list( inp=SANTAObj$groups[[2]]$groupData.in, pred=SANTAObj$groups[[2]]$groupData.pred )
if( is.data.frame(groupData1$inp) & is.data.frame(groupData2$inp)){
## Fstat
models <- fit.models(list(pop1=groupData1, pop2=groupData2), df) # mod.null, mod.alt1, mod.alt2
if( !is.list(models) ) { return(NA) } # fitting of one of the mean curve must have failed
Fstat <- stat.fit( models ) # NULL real vs alternative
## Bootstrap, NULL real vs NULL generated
Fstat0 <- replicate( nPerm, stat.fit( fit.models(between.null.sim.ind(groupData1,groupData2), df) ) )
p <- (sum( Fstat0 >= Fstat)+1)/(nPerm+1)
SANTAObj$general$pval.fit <- p
SANTAObj$general$pval.fit.l <- (1/(1+((z^2)/nPerm))) * ( p+((z^2)/(2*nPerm)) - z*sqrt( (1/nPerm)*p*(1-p) + (z^2)/(4*(nPerm^2)) ) )
SANTAObj$general$pval.fit.u <- (1/(1+((z^2)/nPerm))) * ( p+((z^2)/(2*nPerm)) + z*sqrt( (1/nPerm)*p*(1-p) + (z^2)/(4*(nPerm^2)) ) )
}
SANTAObj$properties$pval.fit$status <- TRUE
SANTAObj$properties$pval.fit$nPerm <- nPerm
SANTAObj$properties$pval.fit$alpha <- alpha
return(SANTAObj)
}
#' Evaluate difference between a group mean curve and a constant model using the comparison of model fit (F-test)
#'
#' Execute a t-test based on the comparison of improvement of model fit from a single group mean curve to the fit of both a group mean curve and a constant linear model. This statistic identifies within-class differential evolution, and test whether the population average time curve is flat or not. \emph{n} constant linear model are generated to match the \emph{n} individual trajetories. The \emph{Null} distribution is generated by permuting the \emph{n} group individuals and the \emph{n} constant trajectories. The real improvement in model fit for the real group membership versus flat trajectories is then compared to the \emph{Null} distribution of model fit improvement, similarly to \code{\link{santaR_pvalue_fit}}. Adapted from \cite{Storey and al. 'Significance analysis of time course microarray experiments', PNAS, 2005 [1]}.
#'
#' @param SANTAGroup A fitted group extracted from a \emph{SANTAObj} generated by \code{\link{santaR_fit}}.
#' @param nPerm (int) Number of permutations. Default 1000.
#'
#' @return A \emph{p-value}
#'
#' @examples
#' ## 56 measurements, 8 subjects, 7 unique time-points
#' ## Default parameter values decreased to ensure an execution < 2 seconds
#' Yi <- acuteInflammation$data$var_3
#' ind <- acuteInflammation$meta$ind
#' time <- acuteInflammation$meta$time
#' group <- acuteInflammation$meta$group
#' grouping <- get_grouping(ind, group)
#' inputMatrix <- get_ind_time_matrix(Yi, ind, time)
#' SANTAObj <- santaR_fit(inputMatrix, df=5, grouping=grouping, verbose=TRUE)
#' SANTAGroup <- SANTAObj$groups[[1]]
#' #SANTAGroup <- SANTAObj$groups$Group1
#' santaR_pvalue_fit_within(SANTAGroup, nPerm=500)
#' # ~0.6726747
#'
#' @references [1] Storey, J. D., Xiao, W., Leek, J. T., Tompkins, R. G. & Davis, R. W. Significance analysis of time course microarray experiments. \emph{Proceedings of the National Academy of Sciences of the United States of America} \strong{102}, 12837-42 (2005).
#'
#' @seealso Inter-group comparison with \code{\link{santaR_pvalue_fit}}
#'
#' @export
santaR_pvalue_fit_within <- function(SANTAGroup,nPerm=1000) {
## COMMENT
# Ind .pred have the smoothness penalty, groupMeanCurve is therefore fitted at df=nbTP.
# as we bootstrap the Ind (.in and .pred), no need to refit @ original df
## FUNCTION
SSquare <- function(observed,fitted) {
## Sum of Square of the residuals across all individuals and timepoints
# observed = matrix IND x TIME
# fitted = predicted values for TIME (row vector)
SSq.ind <- apply( observed, 1, function(x) sum((x-fitted)^2,na.rm=TRUE) )
return(sum(SSq.ind))
}
stat.fit <- function(models) {
## Compares goodness of fit of the model under the null hypothesis to that under the alternative hypothesis
# models = list(mod.null, mod.alt)
if( is.data.frame(models$mod.alt$obs) ){ # check if the spline fitting did work
SSq.alt <- SSquare(models$mod.alt$obs, models$mod.alt$fit)
SSq.null <- SSquare(models$mod.null$obs,models$mod.null$fit)
Ftest <- (SSq.null - SSq.alt)/SSq.alt
return(Ftest)
} else {
return(NA)
}
} # F-test statistic (goodness of fit)
within.null.sim.ind <- function(group) {
## Simulate NULL population by resampling the individuals
time <- as.numeric(colnames(group$inp))
n.ind <- dim(group$inp)[1]
sampler <- sample(1:n.ind, size=n.ind, replace=TRUE)
population <- list( inp=group$inp[sampler,], pred=group$pred[sampler,] )
return(population)
} # Simulate NULL population by resampling individuals
fit.models <- function(groupData) {
## Fit both the NULL model lm( y ~ 1 ), and the ALTERNATIVE groupMeanCurve on a population IND x TIME
# groupData = IND x TIME matrix to fit
#
## Returns a list of models, observations and corresponding fitted values
#
# NULL model on inp data, ALT model employs predicted data for model, inp for residuals
# groupMean on .pred values for each individual, that already has the smoothness penalty, so fit groupMeanCurve with df=nbTP
# init
time <- as.numeric(colnames(groupData$inp))
# Fit NULL model
suppressMessages( y <- reshape2::melt(groupData$inp, na.rm=TRUE)$value )
model.null <- list(obs=groupData$inp, fit=stats::predict(object=stats::lm( y ~ 1),newdata=data.frame(y=time)))
# Fit ALTERNATIVE model
meanVal <- colMeans(groupData$pred, na.rm=TRUE) # using pred saves time on the step of predicting the individual before the average curve
notNA <- !is.na(meanVal)
model.alternative <- list(obs=groupData$inp, fit=stats::predict(stats::smooth.spline( x=time[notNA], y=meanVal[notNA], df=sum(notNA) ), time)$y)
return(list(mod.null=model.null, mod.alt=model.alternative))
} # fit NULL and ALTERNATIVE models
if( is.data.frame(SANTAGroup$groupData.in) ){
## INIT
groupData <- list( inp=SANTAGroup$groupData.in, pred=SANTAGroup$groupData.pred )
models <- fit.models(groupData) # return mod.null, mod.alt
Fstat <- stat.fit( models ) # NULL real vs alternative
if(is.na(Fstat)){
return(NA)
} else {
## Bootstrap, NULL real vs NULL generated
Fstat0 <- replicate( nPerm, stat.fit( fit.models(within.null.sim.ind(groupData)) ) ) # only need the individuals
p <- (sum( Fstat0 >= Fstat)+1)/(nPerm+1)
return(p)
}
} else {
return(NA)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.