Nothing
###################################
### Simulated Metabolomics Data ###
###################################
#' @import stats
plus_minus <- function(beta){
inv <-rbinom(beta,1,0.5)
inv <-ifelse(inv==0,-1,1)
return(inv)
}
ind_corr <- function(dat, group_size, start, end){
gs <- if(group_size == 1) group_size else group_size-1
pm <- plus_minus(gs)
# take first column of block
f <- dat[,start]
#copy to each subsequent column
dat[,(start+1):end] <- f
dat[,(start+1):end]
#change sign
finish <- t(t(dat[,(start+1):end])*pm)
return(finish)
}
#' @title Noise Matrix Generator
#' @description Provides a matrix to perturb randomly generated data
#' to facilitate a more realistic dataset.
#' @param matrix A matrix of simulated data with dimensions comparable
#' to 'real' datasets
#' @param k Correlation Perturbation - The higher k, the more the data
#' is perturbed.
#' @return Returns a matrix of the same dimensions as \code{matrix} that
#' can add to perturb the
#' original simulated data.
#' @author Charles E. Determan Jr.
noise.matrix <- function(matrix, k){
nvar <- ncol(matrix)
nsamp <- nrow(matrix)
# for each variable j a value wj is obtained from a random generated
# uniform distribution from 0 to k
wj <- runif(nvar, max=k, min=0)
# wij - vector of uniform random numbers -Wj < wij < Wj
# add wij to each element of matrix B
noise <- matrix(0, nrow=nsamp, ncol=nvar)
for (i in 1:ncol(noise)){
wij <- runif(nsamp, min=-wj[i], max=wj[i])
noise[,i] <- wij}
return(noise)
}
#' @title Random Multivariate Data Generator
#' @description Generates a matrix of dimensions \code{nvar} by
#' \code{nsamp} consisting of random numbers generated from a normal
#' distriubtion. This normal distribution is then perturbed to more
#' accurately reflect experimentally acquired multivariate data.
#' @param nvar Number of features (i.e. variables)
#' @param nsamp Number of samples
#' @param st.dev The variation (i.e. standard deviation) that is typical
#' in datasets of interest to the user. Default \code{spread = 1}
#' @param perturb The amount of perturbation to the normal distribution.
#' Default \code{perturb = 0.2}
#' @return Matrix of dimension \code{nvar} by \code{nsamp}
#' @author Charles E. Determan Jr.
#' @references Wongravee, K., Lloyd, G R., Hall, J., Holmboe, M. E., &
#' Schaefer, M. L. (2009). \emph{Monte-Carlo methods for determining optimal
#' number of significant variables. Application to mouse urinary profiles.}
#' Metabolomics, 5(4), 387-406. http://dx.doi.org/10.1007/s11306-009-0164-4
#' @seealso \code{\link{create.corr.matrix}}, \code{\link{create.discr.matrix}}
#' @example inst/examples/data.generation.R
#' @export
create.random.matrix <-
function(nvar,
nsamp,
st.dev = 1,
perturb = 0.2
)
{
## Random numbers matrix
R <- replicate(nvar, rnorm(nsamp, mean=0, sd=st.dev))
### Perturb Normal Distribution
# Random uniform numbers matrix
N <- replicate(nvar, runif(nsamp, max=perturb, min=-perturb))
# Null matrix w/o any correlations
U <- R + N
U
}
#' @title Correlated Multivariate Data Generator
#' @description Generates a matrix of dimensions \code{dim(U)} with
#' induced correlations. Blocks of variables are randomly assigned and
#' correlations are induced. A noise matrix is applied
#' to the final matrix to perturb 'perfect' correlations.
#' @param U Numeric matrix
#' @param k Correlation Perturbation - The higher k, the more the data
#' is perturbed. Default \code{k = 4}
#' @param min.block.size minimum number of variables to correlate
#' Default \code{min.block.size = 2}
#' @param max.block.size maximum number of variables to correlate
#' Default \code{max.block.size = 5}
#' @return A numberic matrix of dimension \code{dim(U)} with correlations induced
#' between variables
#' @note Output does not contain classes, may provide externally as
#' classes are irrelevant in this function.
#' @author Charles E. Determan Jr.
#' @references Wongravee, K., Lloyd, G R., Hall, J., Holmboe, M. E., &
#' Schaefer, M. L. (2009). \emph{Monte-Carlo methods for determining optimal
#' number of significant variables. Application to mouse urinary profiles.}
#' Metabolomics, 5(4), 387-406. http://dx.doi.org/10.1007/s11306-009-0164-4
#' @seealso \code{\link{create.random.matrix}},
#' \code{\link{create.discr.matrix}}
#' @example inst/examples/data.generation.R
#' @export
create.corr.matrix <-
function(U,
k=4,
min.block.size = 2,
max.block.size = 5
)
{
assert_is_matrix(U)
assert_is_numeric(U)
nvar <- ncol(U)
nsamp <- nrow(U)
# generate block size (groups of variables to correlate)
# discrete uniform distribution
# 2>beta>5 - original (more appropriate for MS?)
# set beta -> 1>x>5 to reflect that some variables likely
# independent with less dimensions in NMR
beta <- sample(min.block.size:max.block.size,
size=nvar-1,
replace=TRUE)
# loop through each block
for (i in 1:length(beta)){
if (i == 1){
start <- 1
end <- start + beta[i] - 1
U[,(start+1):end] <- ind_corr(U, beta[i], start, end)
}else{
start <- end + 1
if(start > nvar){
start <- nvar
end <- nvar
cat('solo last variable')
break
}
end <- end + beta[i]
if(end > nvar){
end <- nvar
is_group <- end - start
if (is_group > 1){
U[,(start+1):end] <- ind_corr(U, is_group, start, end)
}
break
}
if (beta[i] != 1){
U[,(start+1):end] <- ind_corr(U, beta[i], start, end)
}
B <- U
}
}
## Create a noise matrix to perturb distributions
W <- noise.matrix(B, k)
B <- B + W
# randomize order of columns to make blocks no longer obvious
rand <- sample(nsamp)
# very naieve class structure, only to say it is from this function
V <- structure(B[rand,], class = c("matrix.corr", "matrix"))
#V <- B[rand,]
V
}
#' @title Discriminatory Multivariate Data Generator
#' @description Generates a matrix of dimensions \code{dim(U)} with
#' induced correlations. \code{D} variables are randomly selected as
#' discriminatory. If \code{num.groups = 2} then discrimination is
#' induced by adding and subtracting values derived from the level of
#' of discrimination, \code{l}, for the classes respectively.
#' Multi-class datasets have a few further levels of randomization. For
#' each variable, a random number of the groups are selected as
#' discriminating while the remaining groups are not altered.
#' For each discriminatory group, a unique change is provided by
#' randomly assigning addition or subtraction of the discrimination factor.
#' For example, if 3 groups are selected and two groups are assigned
#' as addition and the third subtraction, the second addition is
#' multiplied by its number of replicates. E.g. (1,1,-1) -> (1,2,-1).
#' These values are randomized and then multiplied by the respective
#' discrimination factor. The resulting values are then added/subtracted
#' from the respective groups. A noise matrix is applied to the final
#' matrix to perturb 'perfect' discrimination.
#' @param V Numeric matrix
#' @param D Number of discriminatory variables induced. Default \code{D = 20}
#' @param l Level of discrimination, higher = greater separation.
#' Default \code{l = 1.5}
#' @param num.groups Number of groups in the dataset
#' @param k Correlation Perturbation - The higher k, the more the data
#' is perturbed. Default \code{k = 4}
#' @return List of the following elements
#' @return \item{discr.mat}{Matrix of dimension \code{dim(V)+1} with
#' discriminatory variables induced and the .classes added to the
#' end of the matrix.}
#' @return \item{features}{Vector of features that were induced to
#' be discriminatory.}
#' @author Charles E. Determan Jr.
#' @references Wongravee, K., Lloyd, G R., Hall, J., Holmboe, M. E., &
#' Schaefer, M. L. (2009). \emph{Monte-Carlo methods for determining optimal
#' number of significant variables. Application to mouse urinary profiles.}
#' Metabolomics, 5(4), 387-406. http://dx.doi.org/10.1007/s11306-009-0164-4
#' @example inst/examples/data.generation.R
#' @export
create.discr.matrix <-
function(V,
D = 20,
l = 1.5,
num.groups = 2,
k = 4
)
{
if(!"matrix.corr" %in% class(V)){
warning("Matrix provided was not produced from create.corr.matrix.
Assumptions are unknown. Matrix may assume complete
independence and/or normality. Proceed with caution")
}
nc <- ncol(V)
nr <- nrow(V)
groups <- LETTERS[seq(num.groups)]
if(is.null(colnames(V))){
colnames(V) <- paste("V", seq(1:nc), sep = "")
}
# randomly select which variables to be discriminatory
d <- sample(nc, size = D, replace=FALSE)
Z <- V[,d]
# which variables were sampled?
xNames <- colnames(Z)
# get range of discriminatory ability (as is typical in real data)
di <- runif(D, min=-l, max=l)
if(num.groups == 2){
# Binary class induced discrimination
I <- nr/2
classes <- rep(groups, each = I)
# add di to first group, subtract from second
# to induce discrimination
for(i in seq(D)){
Z[1:I,i] <- Z[1:I,i]+di[i]
Z[(1+I):nr,i] <- Z[(1+I):nr,i]-di[i]
}
}else{
# Multi-class induced discrimination
# split Z into n groups
Z.list <- tryCatch({
splt <- split(as.data.frame(Z),
rep(1:num.groups, each = nrow(Z)/num.groups))
}, warning = function(war){
warning("Uneven distribution of groups")
splt <- suppressWarnings(
split(as.data.frame(Z),
rep(1:num.groups, each = nrow(Z)/num.groups)
)
)
return(splt)
}
)
classes <-
unlist(
mapply(
groups,
FUN = function(x,y) rep(x, nrow(y)), y = Z.list),
use.names=FALSE)
classes <- sapply(classes, rbind)
Z.rows <- lapply(Z.list, rownames)
# Create matrix of instructions for discriminating variables
mat <- matrix(0, nrow=D, ncol=num.groups)
rownames(mat) <- paste("Var", seq(D), sep=".")
for(e in seq(D)){
# for each variable, randomize number of groups discriminating
num <- sample.int(num.groups, 1)
# determine how values will be added/subtracted
pm <- plus_minus(num)
# remainder of groups retain same value
num.zero <- num.groups - num
pm.complete <- c(pm, rep(0, num.zero))
# randomize how value added
mat[e,] <- sample(pm.complete)
}
# make each change unique to facilitate each group is
# discriminated while leaving 0's alone
mat.uniq <-
t(
apply(
mat,
1,
FUN = function(m){
unsplit(
lapply(
split(m, factor(m)),
FUN = function(x) x*seq(length(x))
),
factor(m))
}))
# multiple direction changes by di value
mat.discr <- mat.uniq*di
for(l in seq(along = Z.list)){
# randomize order of columns so different groups
# are changed differently
mat.rand <- mat.discr[,sample(ncol(mat.discr))]
Z.dat <- Z.list[[l]]
#vector for each group
di.sub <- mat.rand[,l]
Z.list[[l]] <- sweep(Z.dat, MARGIN = 2, di.sub, '+')
rownames(Z.list[[l]]) <- Z.rows[[l]]
}
#require(data.table)
Z <- rbindlist(Z.list)
}
# add newly discriminated variables back to matrix V to make matrix S
S <- V
S[,d] <- as.matrix(Z)
## Create a noise matrix to perturb distributions again
W <- noise.matrix(S, k)
Y <- as.matrix(S + W)
# Y$.classes <- as.factor(classes)
# Y[1:nc] <- lapply(Y[1:nc],
# FUN = function(x) as.numeric(as.character(x)))
out <- list(discr.mat = Y,
classes = as.factor(classes),
features = xNames)
out
}
# histograms of correlation coefficients
# function to determine max correlation
# must be the second highest because always 1 for correlation with self
# therefore remove max in each column and return new max
#f4 <- function (x)
# {
# apply(x, 2, function(column) max(column[-which.max(column)]))
#}
#hist(f4(abs(cor(U))))
#hist(f4(abs(cor(B))))
#hist(f4(abs(cor(Y))))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.