R/Isomap.R

## Computes the Isomap embedding by Tenenbaum, de Silva and Langford (2000).
## It is based on the Matlab implementation by Tenenbaum.
## This version uses Floyd's Algorithm to compute the shortest path tree.
##
## A modified version of the original Isomap algorithm is included. It respects nearest and farthest neighbours.
## To estimate the intrinsic dimension of the data, the function can plot the residuals between the high and the 
## low dimensional data for a given range of dimensions.
##
## input: 
##	data: 		NxD matrix (N samples, D features)
##	dims:		dimension of the target space (default 2)
##	k:		number of neighbours (default 5 or number of samples)
##	mod:		use modified Isomap algorithm (default FALSE)
##	plotResiduals:	show a plot with the residuals between the high and the low dimensional data (default FALSE)
##	verbose:	show a summary of the embedding procedure at the end (default TRUE)	 
##
## output: 
##	Nxdim matrix (N samples, dim features) with the reduced input data (list of several matrices if more than one dimension specified)


Isomap = function(data, dims=2, k, mod=FALSE, plotResiduals=FALSE, verbose=TRUE){

	## catch missing or bad input
	if(missing(data))
          stop("data argument missing")
        else
          if(!is.matrix(data))
            stop("invalid argument: data argument is required to be a N x D matrix (N samples, D features)")
        if(!all(is.numeric(dims)) | any(dims < 1))
          stop("invalid argument: target dimension is required to be a (vector of) positive integer value(s)")
	if(missing(k))
		k = min(nrow(data),5)
	else{
          if(k >= nrow(data))
            stop("invalid argument: more neighbours than samples")
          if(!is.numeric(k) | k <= 1)
            stop("invalid argument: neighbour parameter is required to be an integer value >= 2")
        }

        k = round(k)
        dims = round(dims)
        
        num_samples = nrow(data)
        num_features = ncol(data)

        ## compute pairwise distances
	message("Computing distance matrix ... ", appendLF = FALSE)
	d = pairwiseDistances(data)
	message("done")

	## build graph of shortest paths between two neighbours (using Floyd's Algorithm)
	## modified Isomap: Use k/2 nearest neighbours and k/2 farthest neighbours
	if(mod == TRUE){
		first_k = round(k/2)
		last_k = k - first_k
		message("Building graph with shortest paths (using ", first_k ," nearest and ", last_k, " farthest neighbours) ... ", appendLF = FALSE)
	}
	## original Isomap: Use k nearest neighbours only
	else{
		first_k = k
		last_k = 0
		message("Building graph with shortest paths (using ", first_k ," nearest neighbours) ... ", appendLF = FALSE)
	}
	sort_idx = apply(d,2,order)

	## set weights to "Infinite" for not neighboured points
	for(i in 1:num_samples)
		d[i,sort_idx[(first_k+2):(num_samples-last_k),i]] = Inf

	## ensure that graph matrix is symmetric
	d = pmin(d,t(d))

	## Floyd's Algorithm
	for(i in 1:num_samples){
		d_col_i = t(d[,i])
		d_col_i = t(d_col_i[rep(1, each = num_samples),])
		d_row_i = t(d[i,])
		d_row_i = d_row_i[rep(1, each = num_samples),]
		d = pmin(d, d_col_i + d_row_i)
	}
        
	## determine all connected components of the graph
	num_connections = rowSums(!(is.infinite(d)))
	first_connections = apply(is.infinite(d),1,which.min)
	components = unique(first_connections)
	num_components = length(components)
	message("done")	
	## reduce dimension of all components seperately and merge them to a single dataset
	message("Computing low dimensional embedding ... ", appendLF = FALSE)
	all_Y = list(NULL)
	residuals = rep(0,length(dims))
	i = 1
	for(dim in dims){
		Y = matrix(0,num_samples,dim)
		for(c in 1:num_components){
	
			## only use the distances of points in the connected component
			comp_indices = which(first_connections == components[c])
			D = d[comp_indices,comp_indices]
			N = length(comp_indices)

			## convert graph matrix to inner products	
			T = -0.5 * ( D^2 - rowSums(D^2) %*% t(rep(1/N,N)) - rep(1,N) %*% t(rowSums(D^2))/N + sum(D^2)/(N^2))			

                        ## catch possible errors caused by too small connected components
                        if(dim(T)[1] < dim)
                          stop("Problem occured while computing embedding: Connected component ", c, " consists of too few samples (", dim(T)[1], ") and cannot be reduced to ", dim, " dimensions. Try Isomap with a neighbourhood parameter higher than ", k, " or with dimension parameters lower than ", dim, ".")
                        
			## embedding Y is given by the eigenvectors of T belonging to (some of) its biggest eigenvalues
	   		eig_T = eigen(T)
			sweep(eig_T$vectors, 2, sqrt(colSums(eig_T$vectors^2)), "/")	#normalize eingenvectors
               		Y[comp_indices,] = Re(eig_T$vectors[,1:dim] * (matrix(1,N,1) %*%  sqrt(as.complex(eig_T$values[1:dim]))))		
		}
		## calculate residual variances (take care of not neighboured samples (Inf values))
		if(plotResiduals == TRUE){
                        tmp = as.vector(d)
                        tmp[is.infinite(tmp)] = NA
			r = 1 - cor(cbind(tmp ,as.vector(pairwiseDistances(Y))), use="complete.obs")^2
			residuals[i] = r[1,2]
                        rm(tmp)
		}
		all_Y[[i]] = Y
		i = i+1
	}
	message("done")
	names(all_Y) = paste("dim", dims, sep="")	

	## give a summary of what has been done
	if(verbose == TRUE){
		message("number of samples: ", num_samples)
		message("reduction from ", num_features, " to ", dims, " dimensions")
		message("number of connected components in graph: ", num_components)
	}
	
	## plot residual variances for all dimensions
	if(plotResiduals == TRUE)
		plot(dims,residuals,type="b",xlab="dimension",ylab="residual variance")

	return(all_Y)

}
Budheimer/RDRToolbox documentation built on May 30, 2019, 1:26 p.m.