netDx: Network-based patient classifier

Documented in countIntType countIntType_batch getEnr getOR

#' Get relative proportion of patient classes that contribute to a set of
#' networks
#'
#' @details Feature selected networks should have the property of being
#' enriched in the class of interest; e.g. be enriched in 'case' relative
#' to 'control'. When given a list of networks N, this method computes the
#' number and proportion of patients that overlap N. A high relative 
#' fraction of the predicted class indicates successful feature selection.
#' To create a ROC or precision-recall curve, several calls can be made
#' to this function, one per cutoff.
#' @param pNetworks (matrix) rows are patients, columns are network file
#' filenames. a[i,j] = 1 if patient i has a structural variant in network
#' j; else a[i,j] = 0
#' @param pheno_DF (data.frame) Column "ID" has unique patient identifiers;
#' column "STATUS" has patient class
#' @param predClass (char) Class for which predictor is being built
#' @param netFile (char) vector of networks of interest (e.g. those 
#' passing feature selection)
#' @param verbose (logical) print messages
#' @return List. 1) stats: statistics on group overlap with ,
#' This is a 2xK matrix, where rows are classes (predClass,other), and 
#' columns are: total samples, samples overlapping nets, % overlap
#' 2) relEnr: relative enrichment of \code{predClass} over other
#' @examples
#' d <- tempdir()
#' options(stringsAsFactors=FALSE)
#' pids <- paste("P",seq_len(5),sep="")
#' pheno <- data.frame(ID=pids,STATUS=c(rep("case",3),rep("control",2)))
#' 
#' # write PSN
#' m1 <- matrix(c("P1","P1","P2","P2","P3","P4",1,1,1),byrow=FALSE,ncol=3)
#' write.table(m1,file=paste(d,"net1.txt",sep=getFileSep()),sep="\t",
#'	col.names=FALSE,row.names=FALSE,quote=FALSE)
#' m2 <- matrix(c("P3","P4",1),nrow=1)
#' write.table(m2,file=paste(d,"net2.txt",sep=getFileSep()),sep="\t",
#'	col.names=FALSE,row.names=FALSE,quote=FALSE)
#'
#' # compute enrichment
#' x <- countPatientsInNet(d,dir(d,pattern=c("net1.txt","net2.txt")), pids)
#' getOR(x,pheno,"case",colnames(x)) # should give large RelEnr
#' @export
getOR <- function(pNetworks, pheno_DF, predClass, netFile ,verbose=TRUE) {
# TODO this function only makes sense in the context of structural
# variants. Not e.g. in the case of continuous valued data like gene 
# expression. The name should perhaps reflect that.

predSamps	<- pheno_DF$ID[pheno_DF$STATUS %in% predClass]
otherSamps	<- pheno_DF$ID[!pheno_DF$STATUS %in% predClass]

#print(length())
# limit universe to 
idx			<- which(colnames(pNetworks)%in% netFile)
if (length(idx)<1) 
		return(out=list(stats=matrix(NA,nrow=2,ncol=3),relEnr=NA,
						OLsamps=NA))

pNetworks 	<- pNetworks[,idx,drop=FALSE]
# limit patients to those that overlap 
OLsamps		<- rownames(pNetworks)[which(rowSums(pNetworks)>=1)]

OLpred		<- sum(OLsamps %in% predSamps)
OLother		<- sum(OLsamps %in% otherSamps)

pctPred		<- OLpred/length(predSamps)
pctOther	<- OLother/length(otherSamps)

if (pctPred < .Machine$double.eps) pctPred <- .Machine$double.eps
if (pctOther < .Machine$double.eps) pctOther <- .Machine$double.eps

relEnr		<- pctPred/pctOther

outmat <- matrix(nrow=2,ncol=3)
colnames(outmat) <- c("total","num OL","pct OL")
rownames(outmat) <- c(predClass, "(other)")
# cases - total, overlapping selPath, fraction
# ctrl - total, overlapping selPath, fraction
outmat[1,] <- c(length(predSamps), OLpred,round(pctPred*100,digits=1))
outmat[2,] <- c(length(otherSamps),OLother,round(pctOther*100,digits=1))

if (verbose) print(outmat)
if (verbose)
	message(sprintf("Relative enrichment of %s: %1.3f", 
		predClass, relEnr))
out <- list(stats=outmat, relEnr=relEnr,OLsamps=OLsamps)
out
}

#' Get ENR for all networks in a specified directory 
#' 
#' @details For each network, compute the number of (+,+) and other 
#' {(+,-),(-,+),(-,-)} interactions. 
#'  From this compute network ENR.
#' The measure of (+,+)-enrichment is defined as: 
#' ENR(network N) = ((num (+,+) edges) - (num other edges))/(num edges).
#' A network with only (+,+) interactions has an ENR=1 ; a network with
#' no (+,+) interactions has an ENR=-1; a network with a balance of the two
#' has ENR=0.
#' @param netDir (char) directory containing interaction networks
#' @param pheno_DF (data.frame) table with patient ID and status.
#'	Must contain columns for Patient ID (named "ID") and class
#' (named "STATUS"). Status should be a char; value of predictor class 
#' should be specified in \code{predClass} param; 
#'	all other values are considered non-predictor class
#' Rows with duplicate IDs will be excluded.
#' @param predClass (char) value for patients in predictor class
#' @param netGrep (char) pattern for grep-ing network text files, used in
#' dir(pattern=..) argument
#' @param enrType (char) how enrichment should be computed. Options are:
#' 1) binary: Skew of number of (+,+) interactions relative to other
#' interactions. Used when all edges in network are set to 1 (e.g. 
#' shared CNV overlap)
#' 2) corr: 0.5*((mean weight of (+,+) edges)-(mean weight of other edges))
#' @param ... arguments for \code{countIntType_batch}
#' @return (list):
#' 1) plusID (char) vector of + nodes
#' 2) minusID (char) vector of - nodes
#' 3) orig_rat (numeric) \code{ENR} for data networks
#' 4) fList (char) set of networks processed
#' 5) orig (data.frame) output of \code{countIntType_batch} for input
#' networks
#' @export
#' @examples
#' d <- tempdir()
#' options(stringsAsFactors=FALSE)
#' pids <- paste("P",seq_len(5),sep="")
#' pheno <- data.frame(ID=pids,STATUS=c(rep("case",3),rep("control",2)))
#' 
#' # write PSN
#' m1 <- matrix(c("P1","P1","P2","P2","P3","P4",1,1,1),byrow=FALSE,ncol=3)
#' write.table(m1,file=paste(d,"net1.nettxt",sep=getFileSep()),sep="\t",
#'	col.names=FALSE,row.names=FALSE,quote=FALSE)
#' m2 <- matrix(c("P3","P4",1),nrow=1)
#' write.table(m2,file=paste(d,"net2.nettxt",sep=getFileSep()),sep="\t",
#'	col.names=FALSE,row.names=FALSE,quote=FALSE)
#'
#' # compute enrichment
#' x <- countPatientsInNet(d,dir(d,pattern=c("net1.nettxt","net2.nettxt")), pids)
#' getEnr(d,pheno,"case","nettxt$")
getEnr	<- function(netDir, pheno_DF,predClass,netGrep="_cont.txt$",
	enrType="binary",...) {
if (missing(predClass)) stop("predClass must be supplied.\n")

fList	<- dir(path=netDir,pattern=netGrep)
fList	<- paste(netDir,fList,sep=getFileSep())
message(sprintf("Got %i networks", length(fList)))

# get + and - IDs
pheno	<- pheno_DF
pheno	<- pheno[!duplicated(pheno$ID),]
plus_idx	<- which(pheno$STATUS %in% predClass)
plusID	<- pheno$ID[plus_idx]
minusID	<- pheno$ID[setdiff(seq_len(nrow(pheno)),plus_idx)]

message(sprintf("Total %i subjects ; %i of class %s, %i other", 
			nrow(pheno), length(plusID), predClass, length(minusID)))

message("* Computing real (+,+) (+,-)")
	# first get original 
	t0 <- system.time(orig <- countIntType_batch(fList,plusID,minusID,
							enrType=enrType,...))
	#print(head(orig))
	print(t0)
	if (enrType=="binary") {
		orig_rat	<- (orig[,1]-orig[,2])/(orig[,1]+orig[,2])
	} else if (enrType=="corr") {
		# divide by 2 to rescale to -1 to 1
		orig_rat <- (orig[,1]-orig[,2])/2
	} else { # invalid type
		orig_rat <- NA	
	}
	names(orig_rat)	<- basename(fList)
	
	return(list(plusID=plusID, minusID=minusID,orig_rat=orig_rat,
				fList=fList,orig=orig))
}

#' Counts the number of (+,+) and (+,-) interactions in a single network
#' 
#' @param inFile (char) path to interaction networks
#' @param plusID (char) vector of + nodes
#' @param minusID (char) vector of - nodes
#' @return (numeric of length 2) Number of (+,+) interactions, and 
#' non-(+,+) interactions
#' (i.e. (+,-) and (-,-) interactions)
#' @export
#' @examples
#' d <- tempdir()
#' # write PSN
#' m1 <- matrix(c("P1","P1","P2","P2","P3","P4",1,1,1),byrow=FALSE,ncol=3)
#' write.table(m1,file=paste(d,"net1.txt",sep=getFileSep()),
#'	sep="\t",
#'	col.names=FALSE,row.names=FALSE,quote=FALSE)
#'
#' countIntType(paste(d,"net1.txt",sep=getFileSep()),c("P1","P2","P3"),
#'	c("P4","P5"))
countIntType <- function(inFile, plusID, minusID) { 
	dat <- read.delim(inFile,sep="\t",header=FALSE,as.is=TRUE)
	pp	<- sum(dat[,1] %in% plusID & dat[,2] %in% plusID)

	return(c(pp,nrow(dat)-pp))
}

#' Counts number of (+,+) and (+,-) interactions in a set of networks
#' 
#' @param inFiles (char) path to interaction networks to process
#' @param plusID (char) IDs of + nodes
#' @param minusID (char) IDs of - nodes
#' @param tmpDir (char) path to dir where temporary files can be stored
#' @param enrType (char) see getEnr.R
#' @param numCores (integer) number of cores for parallel processing
#' @import bigmemory
#' @return (matrix) two columns, one row per network 
#' If \code{enrType="binary"}, number of (+,+) and other interactions
#' Otherwise if \code{enrType="corr"} mean edge weight of (+,+) edges and
#' of other edges
#' @examples
#' d <- tempdir()
#' # write PSN
#' m1 <- matrix(c("P1","P1","P2","P2","P3","P4",1,1,1),byrow=FALSE,ncol=3)
#' write.table(m1,file=paste(d,"net1.txt",sep=getFileSep()),sep="\t",
#'	col.names=FALSE,row.names=FALSE,quote=FALSE)
#' m2 <- matrix(c("P3","P4",1),nrow=1)
#' write.table(m2,file=paste(d,"net2.txt",sep=getFileSep()),sep="\t",
#'	col.names=FALSE,row.names=FALSE,quote=FALSE)
#' 
#' countIntType_batch(paste(d,c("net1.txt","net2.txt"),sep=getFileSep()),
#' 	c("P1","P2","P3"),c("P4","P5"))
#' @export
countIntType_batch <- function(inFiles,plusID, minusID,tmpDir=tempdir(),
	   enrType="binary",numCores=1L){

	randString <- function(n = 1) {
  	a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
  	paste0(a, sprintf("%04d", 
			sample(9999, n, TRUE)), 
			sample(LETTERS, n, TRUE))
	}
	curRand <- randString()

	bkFile <- sprintf("%s.bk",curRand)
	descFile <- sprintf("%s.desc",curRand)
	bkFull <- paste(tmpDir,bkFile,sep=getFileSep())
	descFull <- paste(tmpDir,descFile,sep=getFileSep())
	if (file.exists(bkFull)) file.remove(bkFull)
	if (file.exists(descFull)) file.remove(descFull)

	out <- big.matrix(NA, nrow=length(inFiles),ncol=2,
					  type="double",
						backingfile=bkFile,
					  backingpath=tmpDir,
					  descriptorfile=descFile
	)
	cl <- makeCluster(numCores,
		outfile=paste(tmpDir,"shuffled_log.txt",sep=getFileSep()))
	registerDoParallel(cl)

	k <- 0
	foreach (k=seq_len(length(inFiles))) %dopar% {
		m <-attach.big.matrix(paste(tmpDir,descFile,sep=getFileSep()))
		if (enrType == "binary")
			m[k,]	<- countIntType(inFiles[k], plusID,minusID)
		else if (enrType == "corr")
			m[k,]	<- getCorrType(inFiles[k],plusID,minusID)
	}

	stopCluster(cl)
	
	out	<- as.matrix(out)
	unlink(bkFull)
	unlink(descFull)

	return(out)
}