estimateAufcWeights <- function(counts,normalization,statistics,nsim=10,
N=10000,samples=c(3,3),ndeg=c(500,500),top=500,modelOrg="mm9",fcBasis=1.5,
drawFpc=FALSE,rc=NULL,...) {
if (!requireNamespace("zoo"))
stopwrap("R pacakage zoo is required in order to estimate AUFC ",
"weights!")
if (ncol(counts)<4)
stopwrap("Cannot estimate AUFC weights with an initial dataset with ",
"less than 4 samples!")
else if (ncol(counts)>=4 && ncol(counts)<10) {
reind <- sample(seq_len(ncol(counts)),20,replace=TRUE)
counts <- counts[,reind]
}
parList <- estimateSimParams(counts,...)
disp("Running simulations... This procedure requires time... Please ",
"wait...")
simResults <- cmclapply(seq_len(nsim),function(x,normalization,statistics,N,
parList,samples,ndeg,fcBasis,modelOrg) {
D <- makeSimDataSd(N=N,param=parList,samples=samples,ndeg=ndeg,
fcBasis=fcBasis,modelOrg=modelOrg)
dd <- D$simdata
if (!is.null(modelOrg)) {
tmp <- metaseqr2(
counts=dd,
sampleList=list(G1=paste("G1_rep",seq_len(samples[1]),sep=""),
G2=paste("G2_rep",seq_len(samples[2]),sep="")),
contrast=c("G1_vs_G2"),
annotation="embedded",
embedCols=list(
idCol=4,
gcCol=5,
nameCol=7,
btCol=8
),
org=modelOrg,
countType="gene",
normalization=normalization,
statistics=statistics,
metaP="simes",
figFormat="png",
preset="all_basic",
exportWhere=tempdir(),
restrictCores=rc,
qcPlots=NULL,
report=FALSE,
runLog=FALSE,
outList=TRUE
)
}
else {
tmp <- metaseqr2(
counts=dd,
sampleList=list(G1=paste("G1_rep",seq_len(samples[1]),sep=""),
G2=paste("G2_rep",seq_len(samples[2]),sep="")),
contrast=c("G1_vs_G2"),
annotation="embedded",
embedCols=list(
idCol=4,
gcCol=5,
nameCol=7,
btCol=8
),
countType="gene",
normalization=normalization,
statistics=statistics,
metaP="simes",
figFormat="png",
preset="all_basic",
exportWhere=tempdir(),
qcPlots=NULL,
report=FALSE,
runLog=FALSE,
outList=TRUE,
restrictCores=rc
)
}
# Retrieve several p-values
pList <- vector("list",length(statistics))
for (s in statistics) {
field <- paste("p-value",s,sep="_")
pList[[s]] <- tmp$data[[1]][,field]
names(pList[[s]]) <- rownames(tmp$data[[1]])
}
pMatrix <- do.call("cbind",pList)
return(list(simdata=D,pvalues=pMatrix))
},normalization,statistics,N,parList,samples,ndeg,fcBasis,modelOrg,rc=rc)
disp("Estimating AUFC weights... Please wait...")
fpcObj <- cmclapply(simResults,function(x) {
trueDe <- x$simdata$truedeg
names(trueDe) <- rownames(x$simdata$simdata)
pMatrix <- x$pvalues
trueDe <- trueDe[rownames(pMatrix)]
fdc <- diagplotFtd(trueDe,pMatrix,type="fpc",draw=FALSE)
},rc=rc)
avgFpc <- diagplotAvgFtd(fpcObj,draw=drawFpc)
x <- seq_len(top)
aufc <- apply(avgFpc$avgFtdr$means[seq_len(top),],2,function(x,i) {
return(sum(diff(i)*rollmean(x,2)))
},x)
weightAufc <- (sum(aufc)/aufc)/sum(sum(aufc)/aufc)
return(weightAufc)
}
makeSimDataTcc <- function(...) {
if (suppressWarnings(!requireNamespace("TCC")))
stopwrap("Bioconductor package TCC is required to create ",
"simulated data!")
#tcc <- simulateReadCounts(Ngene=Ngene,PDEG=PDEG,DEG.assign=DEG.assign,
# DEG.foldchange=DEG.foldchange,replicates=replicates)
tcc <- TCC::simulateReadCounts(...)
n <- nrow(tcc$count)
# Now we have to simulate annotation
chromosome <- paste("chr",1+round(20*runif(n)),sep="")
start <- 1 + round(1e+6*runif(n))
end <- start + 250 + round(1e+6*runif(n))
gene_id <- gene_name <- rownames(tcc$count)
gc_content <- runif(n)
strand <- sample(c("+","-"),n,replace=TRUE)
biotype <- sample(paste("biotype",seq_len(10)),n,replace=TRUE)
simData <- data.frame(
chromosome=chromosome,
start=start,
end=end,
gene_id=gene_id,
gc_content=gc_content,
strand=strand,
gene_name=gene_name,
biotype=biotype
)
simData <- cbind(simData,tcc$count)
return(list(simdata=simData,simparam=tcc$simulation))
}
makeSimDataSd <- function(N,param,samples=c(5,5),ndeg=rep(round(0.1*N),2),
fcBasis=1.5,libsizeRange=c(0.7,1.4),libsizeMag=1e+7,modelOrg=NULL,
simLengthBias=FALSE) {
if (!is.null(modelOrg)) {
modelOrg <- tolower(modelOrg)
checkTextArgs("modelOrg",modelOrg,c("hg18","hg19","mm9","mm10",
"rno5","dm3","rn5","rn6","danrer7","pantro4","tair10"),
multiarg=FALSE)
ann <- getAnnotation(modelOrg,"gene")
realGc <- as.numeric(ann$gc_content)
realStart <- as.numeric(ann$start)
realEnd <- as.numeric(ann$end)
realStrand <- as.character(ann$strand)
}
muHat <- param$muHat
phiHat <- param$phiHat
if (simLengthBias) {
sind <- sort(muHat,index.return=TRUE)$ix
muHat <- muHat[sind]
phiHat <- phiHat[sind]
if (length(muHat)>=N)
ii <- sort(sample(seq_len(length(muHat)),N))
else
ii <- sort(sample(seq_len(length(muHat)),N,replace=TRUE))
}
else {
if (length(muHat)>=N)
ii <- sample(seq_len(length(muHat)),N)
else
ii <- sample(seq_len(length(muHat)),N,replace=TRUE)
}
s1 <- samples[1]
s2 <- samples[2]
L1 <- round(libsizeMag*runif(s1,min=libsizeRange[1],
max=libsizeRange[2]))
L2 <- round(libsizeMag*runif(s2,min=libsizeRange[1],
max=libsizeRange[2]))
lambda1 <- do.call("cbind",rep(list(muHat[ii]),s1))
mu1 <- sweep(lambda1,2,L1/sum(lambda1[,1]),"*")
sim1 <- matrix(0,N,s1)
for (j in seq_len(s1))
sim1[,j] <- rnbinom(N,size=1/phiHat[ii],mu=mu1[,j])
v <- numeric(N)
if (sum(ndeg)>0) {
iUpdown <- sample(seq_len(length(v)),sum(ndeg))
regDir <- rep(c(1,-1),c(ndeg[1],ndeg[2]))
v[iUpdown] <- regDir
lambda2 <- ((fcBasis + rexp(N))^v)*lambda1
mu2 <- sweep(lambda2,2,L2/sum(lambda2[,1]),"*")
sim2 <- matrix(0,N,s2)
for (j in seq_len(s2))
sim2[,j] <- rnbinom(N,size=1/phiHat[ii],mu=mu2[,j])
}
else {
lambda2 <- lambda1
mu2 <- sweep(lambda2,2,L2/sum(lambda2[,1]),"*")
sim2 <- matrix(0,N,s2)
for (j in seq_len(s2))
sim2[,j] <- rnbinom(N,size=1/phiHat[ii],mu=mu2[,j])
}
# Now we have to simulate annotation
chromosome <- paste("chr",1+round(20*runif(N)),sep="")
gene_id <- gene_name <- paste("gene",seq_len(N),sep="_")
if (!is.null(modelOrg)) {
if (length(realGc)>=N)
sampleInd <- sample(seq_len(length(realGc)),N)
else
sampleInd <- sample(seq_len(length(realGc)),N,replace=TRUE)
gc_content <- realGc[sampleInd]
start <- realStart[sampleInd]
end <- realEnd[sampleInd]
strand <- realStrand[sampleInd]
if (simLengthBias) {
lenix <- sort(end-start,index.return=TRUE)$ix
start <- start[lenix]
end <- end[lenix]
gc_content <- gc_content[lenix]
strand <- strand[lenix]
}
}
else {
gc_content <- runif(N)
start <- 1 + round(1e+6*runif(N))
end <- start + 250 + round(1e+6*runif(N))
strand <- sample(c("+","-"),N,replace=TRUE)
if (simLengthBias) {
lenix <- sort(end-start,index.return=TRUE)$ix
start <- start[lenix]
end <- end[lenix]
gc_content <- gc_content[lenix]
strand <- strand[lenix]
}
}
biotype <- sample(paste("biotype",seq_len(10)),N,replace=TRUE)
simData <- data.frame(
chromosome=chromosome,
start=start,
end=end,
gene_id=gene_id,
gc_content=gc_content,
strand=strand,
gene_name=gene_name,
biotype=biotype
)
colnames(sim1) <- paste("G1_rep",seq_len(s1),sep="")
colnames(sim2) <- paste("G2_rep",seq_len(s2),sep="")
rownames(sim1) <- rownames(sim2) <- names(v) <- gene_id
return(list(simdata=cbind(simData,sim1,sim2),truedeg=v))
}
estimateSimParams <- function(realCounts,libsizeGt=3e+6,rowmeansGt=5,
eps=1e-11,rc=NULL,draw=FALSE) {
if (is.data.frame(realCounts))
mat <- as.matrix(realCounts)
else if (is.matrix(realCounts))
mat <- realCounts
else if (file.exists(realCounts)) {
realData <- read.delim(realCounts,row.names=1)
mat <- as.matrix(realData)
}
else
stopwrap("The input count data must be either a file, a matrix or a ",
"data frame!")
lowLib <- which(apply(mat,2,sum)<libsizeGt)
if (length(lowLib)==ncol(mat))
stopwrap("Cannot estimate simulation parameters as the library sizes ",
"are too small! Try lowering the value of the libsizeGt ",
"parameter...")
if (length(lowLib)>0)
mat <- mat[,-lowLib]
disp("Downsampling counts...")
dmat <- downsampleCounts(mat)
lowCo <- which(apply(dmat,1,
function(x) if (mean(x)<5) TRUE else FALSE))
if (length(lowCo)>0)
dmat <- dmat[-lowCo,]
muHat <- apply(dmat,1,mean)
disp("Estimating initial dispersion population...")
phiEst <- apply(dmat,1,function(x) {
m <- mean(x)
v <- var(x)
phi <- (v-m)/m^2
return(phi)
})
phiInd <- which(phiEst>0)
phiEst <- phiEst[phiInd]
dmat <- dmat[phiInd,]
disp("Estimating dispersions using log-likelihood...\n")
init <- cmclapply(seq_along(seq_len(nrow(dmat))),function(i,d,p) {
list(y=d[i,],h=p[i])
},dmat,phiEst,rc=rc)
phiHat <- unlist(cmclapply(init,function(x,eps) {
optimize(mlfo,c(x$h-1e-2,x$h+1e-2),y=x$y,tol=eps)$minimum
},eps,rc=rc))
if (draw) {
dev.new()
plot(log10(muHat[phiInd]),log10(phiHat),col="blue",pch=20,cex=0.5,
xlab="",ylab="")
title(xlab="mean",ylab="dispesion",font=2,cex=0.9)
grid()
}
return(list(muHat=muHat[phiInd],phiHat=phiHat))
}
downsampleCounts <- function(counts) {
libSizes <- apply(counts,2,sum)
targetSize <- min(libSizes)
toRemove <- libSizes-targetSize
ii <- which(toRemove>0)
dcounts <- counts
for (i in ii) {
tmp <- round(toRemove[i]*(counts[,i]/sum(counts[,i])))
victimSize <- sum(tmp)
if (victimSize>toRemove[i]) {
dif <- victimSize - toRemove[i]
#victims <- sample(seq_len(length(tmp)),dif)
victims <- sort(tmp,decreasing=TRUE,
index.return=TRUE)$ix[seq_len(dif)]
tmp[victims] <- tmp[victims] - 1
}
else if (victimSize<toRemove[i]) {
dif <- toRemove[i] - victimSize
#victims <- sample(seq_len(length(tmp)),dif)
victims <- sort(tmp,decreasing=TRUE,
index.return=TRUE)$ix[seq_len(dif)]
tmp[victims] <- tmp[victims] + 1
}
dcounts[,i] <- dcounts[,i] - tmp
}
return(dcounts)
}
mlfo <- function(phi,y) {
N <- length(y)
mu <- mean(y)
-(sum(lgamma(y+1/phi)) - N*lgamma(1/phi) - sum(lgamma(y+1)) +
sum(y*log(mu*phi/(1+mu*phi))) - (N/phi)*log(1+mu*phi))
}
makePermutation <- function(counts,sampleList,contrast,repl=FALSE) {
cnts <- strsplit(contrast,"_vs_")[[1]]
virtualContrast <- paste(paste("VirtCond",seq_len(length(cnts)),sep=""),
collapse="_vs_")
vitualSampleList <- vector("list",length(sampleList))
names(vitualSampleList) <- paste("VirtCond",seq_along(sampleList),sep="")
# Avoid the extreme case of returning a vector with all samples the same
if (repl) {
resample <- rep(1,ncol(counts))
while(length(unique(resample))==1)
resample <- sample(seq_len(ncol(counts)),ncol(counts),replace=repl)
}
else
resample <- sample(seq_len(ncol(counts)),ncol(counts),replace=repl)
virtualCounts <- counts[,resample]
samples <- paste("VirtSamp",seq_len(ncol(counts)),sep="")
colnames(virtualCounts) <- samples
nsample <- vapply(sampleList,length,numeric(1))
virtualSamples <- split(samples,rep(seq_len(length(nsample)),nsample))
names(virtualSamples) <- names(vitualSampleList)
for (n in names(vitualSampleList))
vitualSampleList[[n]] <- virtualSamples[[n]]
return(list(counts=virtualCounts,sampleList=vitualSampleList,
contrast=virtualContrast))
}
calcOtr <- function(truth,p,sig=0.05) {
if (is.list(p))
pmat <- do.call("cbind",p)
else if (is.data.frame(p))
pmat <- as.matrix(p)
else if (is.matrix(p))
pmat <- p
if (is.null(colnames(pmat)))
colnames(pmat) <- paste("p",seq_len(ncol(pmat)),sep="_")
sigGenes <- trueIsects <- missed <- vector("list",ncol(pmat))
names(sigGenes) <- names(trueIsects) <- names(missed) <- colnames(pmat)
for (n in colnames(pmat)) {
sigGenes[[n]] <- names(which(pmat[,n]<sig))
trueIsects[[n]] <- intersect(sigGenes[[n]],names(which(truth!=0)))
missed[[n]] <- setdiff(names(which(truth!=0)),trueIsects[[n]])
}
result <- data.frame(
P=vapply(sigGenes,length,numeric(1)),
TP=vapply(trueIsects,length,numeric(1)),
FN=vapply(missed,length,numeric(1))
)
result$FP <- result$P - result$TP
otr <- result$TP/(result$FP+result$FN)
names(otr) <- rownames(result)
return(list(result=result,otr=otr))
}
calcF1Score <- function(truth,p,sig=0.05) {
if (is.list(p))
pmat <- do.call("cbind",p)
else if (is.data.frame(p))
pmat <- as.matrix(p)
else if (is.matrix(p))
pmat <- p
if (is.null(colnames(pmat)))
colnames(pmat) <- paste("p",seq_len(ncol(pmat)),sep="_")
sigGenes <- trueIsects <- missed <- vector("list",ncol(pmat))
names(sigGenes) <- names(trueIsects) <- names(missed) <- colnames(pmat)
for (n in colnames(pmat)) {
sigGenes[[n]] <- names(which(pmat[,n]<sig))
trueIsects[[n]] <- intersect(sigGenes[[n]],names(which(truth!=0)))
missed[[n]] <- setdiff(names(which(truth!=0)),trueIsects[[n]])
}
result <- data.frame(
P=vapply(sigGenes,length,numeric(1)),
TP=vapply(trueIsects,length,numeric(1)),
FN=vapply(missed,length,numeric(1))
)
result$FP <- result$P - result$TP
f1 <- 2*result$TP/(2*result$TP+result$FP+result$FN)
names(f1) <- rownames(result)
return(list(result=result,f1=f1))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.