Nothing
##########################################################################
#
#
# MiPP(Misclassification Penalized Posterior)-based Classification
#
# by
#
# HyungJun Cho, Sukwoo Kim, Mat Soukup, and Jae K. Lee
#
# Version 1.2.0 (2007-01-17)
#
##########################################################################
.First.lib <- function(lib, pkg) {
invisible()
if(.Platform$OS.type=="windows" && interactive() &&
.Platform$GUI=="Rgui") { addVigs2WinMenu("MiPP") }
}
##########################################################################
#
# Main function
#
##########################################################################
mipp <- function(x, y, x.test=NULL, y.test=NULL, probe.ID=NULL, rule="lda",
method.cut="t.test", percent.cut = 0.01,
model.sMiPP.margin=0.01, min.sMiPP=0.85, n.drops=2,
n.fold=5, p.test=1/3, n.split=20, n.split.eval=100){
if(is.null(probe.ID)==TRUE) probe.ID <- 1:nrow(x)
nfold <- max(2, min(n.fold, nrow(x))) # 2 ~ N
if(length(unique(y)) < 2) stop("The number of classes must be >=2")
if((length(unique(y)) > 2) & (rule !="lda") & (rule !="qda")) {
stop("The rule should be 'lda' or 'qda' for multi-class problem.")
}
cat("Please wait")
#####################################
#when there is an indepedent test set
if(is.null(x.test) == FALSE) {
cat(".")
#Data manipulation
n.train.sample <- ncol(x)
n.test.sample <- ncol(x.test)
if(is.data.frame(x)==FALSE) x <- data.frame(x)
if(is.data.frame(x.test)==FALSE) x.test <- data.frame(x.test)
colnames(x) <- 1:ncol(x)
rownames(x) <- 1:nrow(x)
colnames(x.test) <- 1:ncol(x.test)
rownames(x.test) <- 1:nrow(x.test)
x <- t(x) #convert (gene x sample) into (sample x gene)
x.test <- t(x.test)
y <- factor(y)
y.test <- factor(y.test)
#pre-selection
pre.model <- "ALL"
ii <- 1:ncol(x)
if(percent.cut < 1) {
ii <- pre.select(x, y, percent.cut=percent.cut)
pre.model <- ii
}
if(length(ii) < 2) stop("There are too small number of candidate genes.")
x.tr <- x[,ii,drop=FALSE]; y.tr <- y
x.te <- x.test[,ii,drop=FALSE]; y.te <- y.test
out <- mipp.rule(x.train=x.tr, y.train=y.tr, x.test=x.te, y.test=y.te,
nfold=nfold, min.sMiPP=min.sMiPP, n.drops=n.drops, rule=rule)
out[,2] <- probe.ID[ii[out[,2]]]
Select <- rep(" ", nrow(out))
i <- min(which(out$sMiPP >= max(out$sMiPP))); Select[i] <- "*"
j <- min(which(out$sMiPP >= out$sMiPP[i]-model.sMiPP.margin)); Select[j] <- "**"
out <- cbind(out, Select)
colnames(out) <- c("Order","Gene","Tr.ER","Tr.MiPP","Tr.sMiPP","Te.ER","Te.MiPP","Te.sMiPP", "Select")
out$Tr.ER <- round(out$Tr.ER, 4); out$Te.ER <- round(out$Te.ER, 4)
out$Tr.MiPP <- round(out$Tr.MiPP, 2); out$Te.MiPP <- round(out$Te.MiPP, 2)
out$Tr.sMiPP <- round(out$Tr.sMiPP, 4); out$Te.sMiPP <- round(out$Te.sMiPP, 4)
cat(" Done. \n")
return(list(rule=rule, n.fold=n.fold, n.train.sample=n.train.sample, n.test.sample=n.test.sample, pre.model=pre.model, model=out))
}
#####################################
#when there is no indepedent test set
if(is.null(x.test) == TRUE) {
#Data manipulation
n.sample <- ncol(x)
if(is.data.frame(x)==FALSE) x <- data.frame(x)
colnames(x) <- 1:ncol(x)
rownames(x) <- 1:nrow(x)
x <- t(x) #convert (gene x sample) into (sample x gene)
y <- factor(y)
#pre-selection
pre.model <- "ALL"
ii <- 1:ncol(x)
if(percent.cut < 1) {
ii <- pre.select(x, y, percent.cut=percent.cut)
pre.model <- ii
}
if(length(ii) < 2) stop("There are too small number of candidate genes.")
x.tr <- x[,ii,drop=FALSE]
y.tr <- y
out <- cv.mipp.rule(x=x.tr, y=y.tr, nfold=nfold, p.test=p.test, n.split=n.split, n.split.eval=n.split.eval,
model.sMiPP.margin=model.sMiPP.margin, min.sMiPP=min.sMiPP, n.drops=n.drops, rule=rule)
out$CV.out$Gene <- probe.ID[ii[out$CV.out$Gene]]
for(i in 1:n.split) {
k <- ncol(out$CVCV.out)-9 ###note
k <- max(which(!is.na(out$CVCV.out[i,1:k])))
kk <- as.numeric(out$CVCV.out[i, 2:k, drop=FALSE])
out$CVCV.out[i,2:k] <- probe.ID[ii[kk]]
}
rownames(out$CV.out) <- 1:nrow(out$CV.out)
colnames(out$CV.out) <- c("Split","Order","Gene","Tr.ER","Tr.MiPP","Tr.sMiPP","Te.ER","Te.MiPP","Te.sMiPP", "Select")
out$CV.out$Tr.ER <- round(out$CV.out$Tr.ER, 4); out$CV.out$Te.ER <- round(out$CV.out$Te.ER, 4)
out$CV.out$Tr.MiPP <- round(out$CV.out$Tr.MiPP, 2); out$CV.out$Te.MiPP <- round(out$CV.out$Te.MiPP, 2)
out$CV.out$Tr.sMiPP <- round(out$CV.out$Tr.sMiPP, 4); out$CV.out$Te.sMiPP <- round(out$CV.out$Te.sMiPP, 4)
#n <- ncol(out$CVCV.out)
#out$CVCV.out[,(n-5):n] <- round(out$CVCV.out[,(n-5):n], 4)
#out$CVCV.out[,(n-4)] <- round(out$CVCV.out[,(n-4)], 2)
cat("Done. \n")
return(list(rule=rule, n.fold=n.fold, n.sample=n.sample, n.split=n.split, n.split.eval=n.split.eval,
sMiPP.margin=model.sMiPP.margin, p.test=p.test,
pre.model=pre.model, model=out$CV.out, model.eval=out$CVCV.out))
}
}
##########################################################################
#
# MiPP-based selection with cross-validation
#
##########################################################################
cv.mipp.rule <- function(x, y, nfold, p.test, n.split, n.split.eval,
model.sMiPP.margin=0.01, min.sMiPP=0, n.drops=n.drops, rule="lda") {
n.gene <- ncol(x)
CV.out <- data.frame(matrix(NA, n.split, 10))
colnames(CV.out) <- c("Split","Order","Gene","Train.ErrorRate","Train.MiPP","Train.sMiPP",
"ErrorRate","MiPP","sMiPP","Select")
#colnames(CV.out) <- c("Split","Order","Gene","ErrorRate","MiPP","sMiPP","Select")
u.y <- unique(y)
n.y <- length(u.y)
#################################
#Select genes from n.split splits
gene.list <- data.frame(matrix(NA, n.split, n.gene))
rownames(gene.list) <- paste("S",1:n.split, sep="")
colnames(gene.list) <- paste("G",1:n.gene, sep="")
for(iter in 1:n.split) {
cat(".")
i.test <- c()
for(i in 1:n.y) {
part <- sample(which(y==u.y[i]))
n.part <- round(length(part)*p.test)
i.test <- c(i.test, part[1:n.part])
}
y.train <- y[-i.test]
y.test <- y[ i.test]
x.train <- x[-i.test,,drop=FALSE]
x.test <- x[ i.test,,drop=FALSE]
if(is.data.frame(x.train)==FALSE) x.train <- data.frame(x.train)
if(is.data.frame(x.test)==FALSE) x.test <- data.frame(x.test)
tmp <- mipp.rule(x.train=x.train,y.train=y.train,x.test=x.test,y.test=y.test,
nfold=nfold, min.sMiPP=min.sMiPP, n.drops=n.drops, rule=rule)
Split <- rep(iter, nrow(tmp))
Select <- rep(" ", nrow(tmp))
i <- min(which(tmp$sMiPP >= max(tmp$sMiPP))); Select[i] <- "*"
j <- min(which(tmp$sMiPP >= tmp$sMiPP[i]-model.sMiPP.margin)); Select[j] <- "**"
gene.list[iter,1:j] <- tmp$Gene[1:j]
tmp <- cbind(Split, tmp, Select)
CV.out <- rbind(CV.out, tmp)
}
tmp <- apply(gene.list, 2, is.na)
i <- which(apply(tmp, 2, sum) >= n.split)
gene.list <- gene.list[,-i,drop=FALSE] #fixed on 01/17/2007
CV.out <- CV.out[-c(1:n.split),,drop=FALSE]
###################################
#Evaluate optimal models of splits
out.Er <- matrix(NA, n.split, n.split.eval)
out.MiPP <- matrix(NA, n.split, n.split.eval)
out.sMiPP <- matrix(NA, n.split, n.split.eval)
out2 <- data.frame(matrix(NA, n.split, 9))
rownames(out2) <- 1:n.split
colnames(out2) <- c("mean ER","mean MiPP","mean sMiPP",
"5% ER","50% ER","95% ER",
"5% sMiPP","50% sMiPP","95% sMiPP")
for(j in 1:n.split.eval) { #Splits for evaluation
i.test <- c()
for(i in 1:n.y) {
part <- sample(which(y==u.y[i]))
n.part <- round(length(part)*p.test)
i.test <- c(i.test, part[1:n.part])
}
y.train <- y[-i.test]
y.test <- y[ i.test]
x.train <- x[-i.test,,drop=FALSE]
x.test <- x[ i.test,,drop=FALSE]
if(is.data.frame(x.train)==FALSE) x.train <- data.frame(x.train)
if(is.data.frame(x.test)==FALSE) x.test <- data.frame(x.test)
for(jj in 1:n.split) { #Split
k <- max(which(!is.na(gene.list[jj,,drop=FALSE])==TRUE))
kk <- as.numeric(gene.list[jj,1:k,drop=FALSE])
xx.train <- x.train[,kk,drop=FALSE]
xx.test <- x.test[,kk,drop=FALSE]
if(is.data.frame(xx.train)==FALSE) xx.train <- data.frame(xx.train)
if(is.data.frame(xx.test)==FALSE) xx.test <- data.frame(xx.test)
tmp2 <- get.mipp(xx.train, y.train, xx.test, y.test, rule=rule)
out.Er[jj,j] <- tmp2$ErrorRate
out.MiPP[jj,j] <- tmp2$MiPP
out.sMiPP[jj,j] <- tmp2$sMiPP
}
}
out2[,1] <- apply(out.Er, 1, mean)
out2[,2] <- apply(out.MiPP, 1, mean)
out2[,3] <- apply(out.sMiPP, 1, mean)
out2[,4:6] <- t(apply(out.Er, 1, quantile, probs=c(0.95, 0.50, 0.05)))
out2[,7:9] <- t(apply(out.sMiPP, 1, quantile, probs=c(0.05, 0.50, 0.95)))
Split <- 1:n.split
CVCV.out <- cbind(Split, gene.list, out2)
return(list(genes=gene.list, CV.out=CV.out, CVCV.out=CVCV.out))
}
##########################################################################
#
# MiPP-based selection
#
##########################################################################
mipp.rule <- function(x.train, y.train, x.test=NULL, y.test=NULL, nfold=5, min.sMiPP=0, n.drops=2, rule="lda") {
n.gene <- ncol(x.train)
n.sample.train <- nrow(x.train)
n.sample.test <- nrow(x.test)
colnames(x.train) <- 1:n.gene
colnames(x.test) <- 1:n.gene
tmp <- round(n.sample.train/nfold*2)
id <- rep((1:nfold),tmp)[1:n.sample.train] #CHECK
i <- (1:n.sample.train)[sort.list(y.train)]
id <- id[sort.list(i)]
opt.genes <- c()
opt.Er <- c()
opt.MiPP <- c()
opt.sMiPP <- c()
opt.Er.train <- c()
opt.MiPP.train <- c()
opt.sMiPP.train <- c()
##################
#Pick 1-gene model
out <- matrix(0, nfold, n.gene)
for(i in 1:nfold) {
y.tr <- y.train[id!=i]
y.te <- y.train[id==i]
for(j in 1:n.gene) {
x.tr <- data.frame(x.train[id!=i,j,drop=FALSE])
x.te <- data.frame(x.train[id==i,j,drop=FALSE])
out[i,j] <- get.mipp(x.tr, y.tr, x.te, y.te, rule=rule)$MiPP
}
}
out.sum <- apply(out, 2, sum)
pick.gene <- min(which(out.sum >= max(out.sum)))
pick.gene <- as.numeric(colnames(x.train)[pick.gene])
opt.genes <- c(opt.genes, pick.gene)
x.train.cand <- x.train[,-opt.genes,drop=FALSE]
x.train.opt <- data.frame(x.train[,opt.genes,drop=FALSE])
colnames(x.train.opt) <- opt.genes
#Evaluate by test set
xx.train <- data.frame(x.train[,opt.genes,drop=FALSE])
xx.test <- data.frame(x.test[,opt.genes,drop=FALSE])
tmp <- get.mipp(xx.train, y.train, xx.test, y.test, rule=rule)
opt.Er <-c(opt.Er, tmp$ErrorRate)
opt.MiPP <-c(opt.MiPP, tmp$MiPP)
opt.sMiPP <-c(opt.sMiPP, tmp$sMiPP)
#Evaluate by train set
tmp <- get.mipp(xx.train, y.train, xx.train, y.train, rule=rule)
opt.Er.train <-c(opt.Er.train, tmp$ErrorRate)
opt.MiPP.train <-c(opt.MiPP.train, tmp$MiPP)
opt.sMiPP.train <-c(opt.sMiPP.train, tmp$sMiPP)
#########################
#Pick k-gene model (k >1)
i.stop <- 0
max.sMiPP <- opt.sMiPP
for(jj in 2:(n.gene-1)) {
n.gene.cand <- n.gene-jj+1
out <- matrix(0, nfold, n.gene.cand)
for(i in 1:nfold) {
y.tr <- y.train[id!=i]
y.te <- y.train[id==i]
for(j in 1:n.gene.cand) {
x.tr <- data.frame(x.train.opt[id!=i,,drop=FALSE], x.train.cand[id!=i,j,drop=FALSE])
x.te <- data.frame(x.train.opt[id==i,,drop=FALSE], x.train.cand[id==i,j,drop=FALSE])
out[i,j] <- get.mipp(x.tr,y.tr, x.te, y.te, rule=rule)$MiPP
}
}
out.sum <- apply(out, 2, sum)
pick.gene <- min(which(out.sum >= max(out.sum)))
pick.gene <- as.numeric(colnames(x.train.cand)[pick.gene])
opt.genes <- c(opt.genes, pick.gene)
x.train.opt <- x.train[, opt.genes,drop=FALSE]
x.train.cand <- x.train[,-opt.genes,drop=FALSE]
#Evaluate by test set
xx.train <- x.train[,opt.genes,drop=FALSE]
xx.test <- x.test[,opt.genes,drop=FALSE]
tmp <- get.mipp(xx.train, y.train, xx.test, y.test, rule=rule)
opt.Er <-c(opt.Er, tmp$ErrorRate)
opt.MiPP <-c(opt.MiPP, tmp$MiPP)
opt.sMiPP <-c(opt.sMiPP, tmp$sMiPP)
#Evaluate by train set
tmp <- get.mipp(xx.train, y.train, xx.train, y.train, rule=rule)
opt.Er.train <-c(opt.Er.train, tmp$ErrorRate)
opt.MiPP.train <-c(opt.MiPP.train, tmp$MiPP)
opt.sMiPP.train <-c(opt.sMiPP.train, tmp$sMiPP)
#stopping rule
if(max.sMiPP < tmp$sMiPP) {
max.sMiPP <- tmp$sMiPP
i.stop <- 0
}
else i.stop <- i.stop + 1
#stop if n.drops and at least min.sMiPP
if((i.stop >= n.drops) & (max.sMiPP >= min.sMiPP)) break
#stop if min number of classes is less than 4
n.min.genes <- 2
if(rule=="qda") n.min.genes <- 4
if(min(unique(table(y.train))) < (jj+n.min.genes)) break
}
##################
#Output
i <- 1:length(opt.genes)
final.out <- data.frame(i, opt.genes, opt.Er.train, opt.MiPP.train, opt.sMiPP.train, opt.Er, opt.MiPP, opt.sMiPP)
colnames(final.out) <- c("Order","Gene","Train.ErrorRate","Train.MiPP","Train.sMiPP","ErrorRate","MiPP","sMiPP")
#final.out <- data.frame(i, opt.genes, opt.Er, opt.MiPP, opt.sMiPP)
#colnames(final.out) <- c("Order","Gene","ErrorRate","MiPP","sMiPP")
return(final.out)
}
######END############################################################################
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.