R/KNN_T.R
In proFIA: Preprocessing of FIA-HRMS data

###
# Imputation by KNN for truncated Distribution
#
#Distribution based nearest neighbor imputation for truncated high dimensional data with applications to pre-clinical and clinical metabolomics studies
#Jasmit S. ShahEmail author, Shesh N. Rai, Andrew P. DeFilippis, Bradford G. Hill, Aruni Bhatnagar and Guy N. BrockEmail author
#

##################################################################################
#### MLE for the Truncated Normal
#### Creating a Function that Returns the Log Likelihood, Gradient and
#### Hessian Functions
##################################################################################

## data = numeric vector
## t    = truncation limits
mklhood <- function(data, t, ...) {
	
	data <- na.omit(data)
	n <- length(data)
	t <- sort(t)
	
	psi<-function(y, mu, sigma){
		exp(-(y-mu)^2/(2*sigma^2))/(sigma*sqrt(2*pi))
	}
	
	psi.mu<-function(y,mu,sigma){
		exp(-(y-mu)^2/(2*sigma^2)) * ((y-mu)/(sigma^3*sqrt(2*pi)))
	}
	
	psi.sigma<-function(y,mu,sigma){
		exp(-(y-mu)^2/(2*sigma^2)) *
			(((y-mu)^2)/(sigma^4*sqrt(2*pi)) - 1/(sigma^2*sqrt(2*pi)))
	}
	
	psi2.mu<-function(y,mu,sigma){
		exp(-(y - mu)^2/(2*sigma^2)) *
			(((y - mu)^2)/(sigma^5*sqrt(2*pi))-1/(sigma^3*sqrt(2*pi)))
	}
	
	psi2.sigma<-function(y,mu,sigma){
		exp(-(y-mu)^2/(2*sigma^2)) *
			((2)/(sigma^3*sqrt(2*pi)) - (5*(y-mu))/(sigma^5*sqrt(2*pi)) +
			 	((y-mu)^4)/(sigma^7*sqrt(2*pi)))
	}
	
	psi12.musig<-function(y,mu,sigma){
		exp(-(y-mu)^2/(2*sigma^2)) *
			(((y-mu)^3)/(sigma^6*sqrt(2*pi)) - (3*(y-mu))/(sigma^4*sqrt(2*pi)))
	}
	
	ll.tnorm2<-function(p){
		out <- (-n*log(pnorm(t[2],p[1],p[2])-pnorm(t[1],p[1],p[2]))) -
			(n*log(sqrt(2*pi*p[2]^2))) - (sum((data-p[1])^2)/(2*p[2]^2))
		-1*out
	}
	
	grad.tnorm<-function(p){
		g1 <- (-n*(integrate(psi.mu,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value) /
			   	(pnorm(max(t),p[1],p[2])-pnorm(min(t),p[1],p[2]))) - ((n*p[1]-sum(data))/p[2]^2)
		g2 <- (-n*(integrate(psi.sigma,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value) /
			   	(pnorm(max(t),p[1],p[2])-pnorm(min(t),p[1],p[2]))) - ((n)/(p[2])) + ((sum((data-p[1])^2))/(p[2]^3))
		out <- c(g1,g2)
		return(out)
	}
	
	hessian.tnorm<-function(p){
		
		h1<- -n*(integrate(psi,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value *
				 	integrate(psi2.mu,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value -
				 	integrate(psi.mu,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value^2) /
			(integrate(psi,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value^2) -
			n/(p[2]^2)
		
		h3<- -n*(integrate(psi,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value *
				 	integrate(psi12.musig,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value -
				 	integrate(psi.mu,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value *
				 	integrate(psi.sigma,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value) /
			(integrate(psi,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value^2) +
			(2*(n*p[1]-sum(data)))/(p[2]^3)
		
		h2<- -n*(integrate(psi,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value *
				 	integrate(psi2.sigma,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value -
				 	integrate(psi.sigma,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value^2) /
			(integrate(psi,t[1],t[2],mu=p[1],sigma=p[2], stop.on.error = FALSE)$value^2) +
			(n)/(p[2]^2)-(3*sum((data-p[1])^2))/(p[2]^4)
		
		H<-matrix(0,nrow=2,ncol=2)
		H[1,1]<-h1
		H[2,2]<-h2
		H[1,2]<-H[2,1]<-h3
		return(H)
	}
	
	
	return(list(ll.tnorm2 = ll.tnorm2, grad.tnorm = grad.tnorm, hessian.tnorm = hessian.tnorm))
}
##################################################################################
###### Newton Raphson Function
###### This takes in the Objects Returned from mklhood Function above
##################################################################################

NewtonRaphsonLike <- function(lhood, p, tol = 1e-07, maxit = 100) {
	
	cscore <- lhood$grad.tnorm(p)
	if(sum(abs(cscore)) < tol)
		return(list(estimate = p, value = lhood$ll.tnorm2(p), iter = 0))
	cur <- p
	for(i in 1:maxit) {
		inverseHess <- solve(lhood$hessian.tnorm(cur))
		cscore <- lhood$grad.tnorm(cur)
		new <- cur - cscore %*% inverseHess
		if (new[2] <= 0) stop("Sigma < 0")
		cscore <- lhood$grad.tnorm(new)
		
		if(((abs(lhood$ll.tnorm2(cur)- lhood$ll.tnorm2(new))/(lhood$ll.tnorm2(cur))) < tol))
			return(list(estimate = new, value= lhood$ll.tnorm2(new), iter = i))
		cur <- new
	}
	
	return(list(estimate = new, value= lhood$ll.tnorm2(new), iter = i))
}

##################################################################################
###### Based on the MLE Functions (mklhood) and NewtonRaphson Function
###### (NewtonRaphsonLike), This function estimates the MEAN and SD from the
###### Truncated using Newton Raphson.
##################################################################################

## missingdata = matrix where rows = features, columns = samples
## perc = if %MVs > perc then just sample mean / SD
## iter = # iterations in NR algorithm

EstimatesComputation <- function(missingdata, perc, iter=50) {
	
	## 2 column matrix where column 1 = means, column 2 = SD
	ParamEstim <- matrix(NA, nrow = nrow(missingdata), ncol = 2)
	nsamp <- ncol(missingdata)
	
	## sample means / SDs
	ParamEstim[,1] <- rowMeans(missingdata, na.rm = TRUE)
	ParamEstim[,2] <- apply(missingdata, 1, function(x) sd(x, na.rm = TRUE))
	
	## Case 1: missing % > perc => use sample mean / SD
	na.sum <- apply(missingdata, 1, function(x) sum(is.na(x)))
	idx1 <- which(na.sum/nsamp >= perc)
	
	## Case 2: sample mean > 3 SD away from LOD => use sample mean / SD
	lod <- min(missingdata, na.rm=TRUE)
	idx2 <- which(ParamEstim[,1] > 3*ParamEstim[,2] + lod)
	
	## Case 3: for all others, use NR method to obtain truncated mean / SD estimate
	idx.nr <- setdiff(1:nrow(missingdata), c(idx1, idx2))
	## t = limits of integration (LOD and upper)
	upplim <- max(missingdata, na.rm=TRUE) + 2*max(ParamEstim[,2])
	for (i in idx.nr) {
		Likelihood <- mklhood(missingdata[i,], t=c(lod, upplim))
		res <- tryCatch(NewtonRaphsonLike(Likelihood, p = ParamEstim[i,]),
						error = function(e) 1000)
		
		if (length(res) == 1) {
			next
		} else if (res$iter >= iter) {
			next
		} else {
			ParamEstim[i,] <- as.numeric(res$estimate)
		}
	}
	return(ParamEstim)
}



####################################################################
#### This Function imputes the data BASED on KNN-EUCLIDEAN
####################################################################

## data = data set to be imputed, where rows = features, columns = samples
## k    = number of neighbors for imputing values
## rm.na, rm.nan, rm.inf = whether NA, NaN, and Inf values should be imputed


KNNEuc <- function (data, k, rm.na = TRUE, rm.nan = TRUE, rm.inf = TRUE) {
	nr <- dim(data)[1]
	
	imp.knn <- data
	imp.knn[is.finite(data) == FALSE] <- NA
	t.data<-t(data)
	
	mv.ind <- which(is.na(imp.knn), arr.ind = TRUE)
	arrays <- unique(mv.ind[, 2])
	array.ind <- match(arrays, mv.ind[, 2])
	nfeatures <- 1:nr
	
	for (i in 1:length(arrays)) {
		set <- array.ind[i]:min((array.ind[(i + 1)] - 1), dim(mv.ind)[1], na.rm = TRUE)
		cand.features <- nfeatures[-unique(mv.ind[set, 1])]
		cand.vectors <- t.data[,cand.features]
		exp.num <- arrays[i]
		
		for (j in set) {
			feature.num <- mv.ind[j, 1]
			tar.vector <- data[feature.num,]
			
			dist <- sqrt(colMeans((tar.vector-cand.vectors)^2, na.rm = TRUE))
			dist[is.nan(dist) | is.na(dist)] <- Inf
			dist[dist==0] <- ifelse(is.finite(min(dist[dist>0])), min(dist[dist>0])/2, 1)
			
			if (sum(is.finite(dist)) < k) {
				stop(message = "Fewer than K finite distances found")
			}
			k.features.ind <- order(dist)[1:k]
			k.features <- cand.features[k.features.ind]
			wghts <- 1/dist[k.features.ind]/sum(1/dist[k.features.ind])
			imp.knn[feature.num, exp.num] <- wghts %*% data[k.features, exp.num]
		}
	}
	
	if (!rm.na) {
		imp.knn[is.na(data) == TRUE & is.nan(data) == FALSE] <- NA
	}
	if (!rm.inf) {
		index <- is.finite(data) == FALSE & is.na(data) == FALSE &
			is.nan(data) == FALSE
		imp.knn[index] <- data[index]
	}
	if (!rm.nan) {
		imp.knn[is.nan(data) == TRUE] <- NaN
	}
	return(imp.knn)
}

####################################################################
#### This Function imputes the data based on KNN-CORRELATION or
#### KNN-TRUNCATION. The Parameter Estimates based on the Truncated
#### Normal from EstimateComputation function is run on this function
####################################################################

imputeKNN <- function (data, k , distance = "correlation",
					   rm.na = TRUE, rm.nan = TRUE, rm.inf = TRUE, perc=1,...) {
	#cat("odata",head(data))
	if (!(is.matrix(data))) {
		stop(message = paste(deparse(substitute(data)),
							 " is not a matrix.", sep = ""))
	}
	
	distance <- match.arg(distance, c("correlation","truncation"))
	
	nr <- dim(data)[1]
	if (k <= 1 | k > nr) {
		stop(message = "k should be between 1 and the number of rows")
	}
	
	if (distance=="correlation"){
		genemeans<-rowMeans(data,na.rm=TRUE)
		genesd<-apply(data, 1, function(x) sd(x, na.rm = TRUE))
		data<-(data-genemeans)/genesd
	}
	#cat("dim before",dim(data),"\n")
	if (distance=="truncation"){
		
		ParamMat <- EstimatesComputation(data, perc = perc)
		
		genemeans<-ParamMat[,1]
		genesd<-ParamMat[,2]
		data<-(data-genemeans)/genesd
	}
	#cat("dim after",dim(data),"\n")
	imputationOK <- rep(TRUE,nrow(data))
	imp.knn <- data
	#cat(head(imp.knn))
	imp.knn[is.finite(data) == FALSE] <- NA
	t.data<-t(data)
	
	mv.ind <- which(is.na(imp.knn), arr.ind = TRUE)
	arrays <- unique(mv.ind[, 2])
	array.ind <- match(arrays, mv.ind[, 2])
	#cat("array.ind",array.ind)
	ngenes <- 1:nr
	
	for (i in 1:length(arrays)) {
		set <- array.ind[i]:min((array.ind[(i + 1)] - 1), dim(mv.ind)[1],
								na.rm = TRUE)
		cand.genes <- ngenes[-unique(mv.ind[set, 1])]
		cand.vectors <- t.data[,cand.genes]
		exp.num<- arrays[i]
		#cat("cand.genes",cand.genes,"cand.vectors",cand.vectors,"\n")
		for (j in set) {
			
			gene.num <- mv.ind[j, 1]
			tar.vector <- data[gene.num,]
			
			r <- (cor(cand.vectors,tar.vector, use = "pairwise.complete.obs"))
			dist <- switch(distance,
						   correlation = (1 - abs(r)),
						   truncation = (1 - abs(r)))
			dist[is.nan(dist) | is.na(dist)] <- Inf
			dist[dist==0]<-ifelse(is.finite(min(dist[dist>0])), min(dist[dist>0])/2, 1)
			dist[abs(r) == 1] <- Inf
			
			###MODIFIED TO ACCOUNT FOR KNN
			if (sum(is.finite(dist)) < k) {
				imputationOK[gene.num] <- FALSE
				next
				#stop(message = "Fewer than K finite distances found")
			}
			###END MODIF
			k.genes.ind <- order(dist)[1:k]
			k.genes <- cand.genes[k.genes.ind]
			#cat("gene",gene.num,"exp.nem",exp.num,"selected",k.genes,"stop\n")
			wghts <- (1/dist[k.genes.ind]/sum(1/dist[k.genes.ind])) * sign(r[k.genes.ind])
			imp.knn[gene.num, exp.num] <- wghts %*% data[k.genes, exp.num]
		}
	}
	
	if (distance=="correlation") {
		imp.knn <- (imp.knn * genesd) + genemeans
	}
	
	if(distance=="truncation") {
		imp.knn <- (imp.knn * genesd) + genemeans
	}
	
	if (!rm.na) {
		imp.knn[is.na(data) == TRUE & is.nan(data) == FALSE] <- NA
	}
	if (!rm.inf) {
		index <- is.finite(data) == FALSE & is.na(data) == FALSE &
			is.nan(data) == FALSE
		imp.knn[index] <- data[index]
	}
	if (!rm.nan) {
		imp.knn[is.nan(data) == TRUE] <- NaN
	}
	return(list(imputedData = imp.knn, problems = imputationOK))
}