## Computes the Isomap embedding by Tenenbaum, de Silva and Langford (2000).
## It is based on the Matlab implementation by Tenenbaum.
## This version uses Floyd's Algorithm to compute the shortest path tree.
##
## A modified version of the original Isomap algorithm is included. It respects nearest and farthest neighbours.
## To estimate the intrinsic dimension of the data, the function can plot the residuals between the high and the
## low dimensional data for a given range of dimensions.
##
## input:
## data: NxD matrix (N samples, D features)
## dims: dimension of the target space (default 2)
## k: number of neighbours (default 5 or number of samples)
## mod: use modified Isomap algorithm (default FALSE)
## plotResiduals: show a plot with the residuals between the high and the low dimensional data (default FALSE)
## verbose: show a summary of the embedding procedure at the end (default TRUE)
##
## output:
## Nxdim matrix (N samples, dim features) with the reduced input data (list of several matrices if more than one dimension specified)
Isomap = function(data, dims=2, k, mod=FALSE, plotResiduals=FALSE, verbose=TRUE){
## catch missing or bad input
if(missing(data))
stop("data argument missing")
else
if(!is.matrix(data))
stop("invalid argument: data argument is required to be a N x D matrix (N samples, D features)")
if(!all(is.numeric(dims)) | any(dims < 1))
stop("invalid argument: target dimension is required to be a (vector of) positive integer value(s)")
if(missing(k))
k = min(nrow(data),5)
else{
if(k >= nrow(data))
stop("invalid argument: more neighbours than samples")
if(!is.numeric(k) | k <= 1)
stop("invalid argument: neighbour parameter is required to be an integer value >= 2")
}
k = round(k)
dims = round(dims)
num_samples = nrow(data)
num_features = ncol(data)
## compute pairwise distances
message("Computing distance matrix ... ", appendLF = FALSE)
d = pairwiseDistances(data)
message("done")
## build graph of shortest paths between two neighbours (using Floyd's Algorithm)
## modified Isomap: Use k/2 nearest neighbours and k/2 farthest neighbours
if(mod == TRUE){
first_k = round(k/2)
last_k = k - first_k
message("Building graph with shortest paths (using ", first_k ," nearest and ", last_k, " farthest neighbours) ... ", appendLF = FALSE)
}
## original Isomap: Use k nearest neighbours only
else{
first_k = k
last_k = 0
message("Building graph with shortest paths (using ", first_k ," nearest neighbours) ... ", appendLF = FALSE)
}
sort_idx = apply(d,2,order)
## set weights to "Infinite" for not neighboured points
for(i in 1:num_samples)
d[i,sort_idx[(first_k+2):(num_samples-last_k),i]] = Inf
## ensure that graph matrix is symmetric
d = pmin(d,t(d))
## Floyd's Algorithm
for(i in 1:num_samples){
d_col_i = t(d[,i])
d_col_i = t(d_col_i[rep(1, each = num_samples),])
d_row_i = t(d[i,])
d_row_i = d_row_i[rep(1, each = num_samples),]
d = pmin(d, d_col_i + d_row_i)
}
## determine all connected components of the graph
num_connections = rowSums(!(is.infinite(d)))
first_connections = apply(is.infinite(d),1,which.min)
components = unique(first_connections)
num_components = length(components)
message("done")
## reduce dimension of all components seperately and merge them to a single dataset
message("Computing low dimensional embedding ... ", appendLF = FALSE)
all_Y = list(NULL)
residuals = rep(0,length(dims))
i = 1
for(dim in dims){
Y = matrix(0,num_samples,dim)
for(c in 1:num_components){
## only use the distances of points in the connected component
comp_indices = which(first_connections == components[c])
D = d[comp_indices,comp_indices]
N = length(comp_indices)
## convert graph matrix to inner products
T = -0.5 * ( D^2 - rowSums(D^2) %*% t(rep(1/N,N)) - rep(1,N) %*% t(rowSums(D^2))/N + sum(D^2)/(N^2))
## catch possible errors caused by too small connected components
if(dim(T)[1] < dim)
stop("Problem occured while computing embedding: Connected component ", c, " consists of too few samples (", dim(T)[1], ") and cannot be reduced to ", dim, " dimensions. Try Isomap with a neighbourhood parameter higher than ", k, " or with dimension parameters lower than ", dim, ".")
## embedding Y is given by the eigenvectors of T belonging to (some of) its biggest eigenvalues
eig_T = eigen(T)
sweep(eig_T$vectors, 2, sqrt(colSums(eig_T$vectors^2)), "/") #normalize eingenvectors
Y[comp_indices,] = Re(eig_T$vectors[,1:dim] * (matrix(1,N,1) %*% sqrt(as.complex(eig_T$values[1:dim]))))
}
## calculate residual variances (take care of not neighboured samples (Inf values))
if(plotResiduals == TRUE){
tmp = as.vector(d)
tmp[is.infinite(tmp)] = NA
r = 1 - cor(cbind(tmp ,as.vector(pairwiseDistances(Y))), use="complete.obs")^2
residuals[i] = r[1,2]
rm(tmp)
}
all_Y[[i]] = Y
i = i+1
}
message("done")
names(all_Y) = paste("dim", dims, sep="")
## give a summary of what has been done
if(verbose == TRUE){
message("number of samples: ", num_samples)
message("reduction from ", num_features, " to ", dims, " dimensions")
message("number of connected components in graph: ", num_components)
}
## plot residual variances for all dimensions
if(plotResiduals == TRUE)
plot(dims,residuals,type="b",xlab="dimension",ylab="residual variance")
return(all_Y)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.