krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
pkgname <- getCrlmmAnnotationName(cdfName)
### pre-processing, output M
M = computeLogRatio(cnSet, verbose)
message("Leaving out non-variant SNPs")
# For SNPs with less than 3 distinct data point, exclude them from downstream analysis
uniqueCount = apply(M[,], 1, function(x){length(unique(x))})
SNPtoProcessIndex = uniqueCount >= 3
noVariantSNPIndex = uniqueCount < 3
M = M[SNPtoProcessIndex, ]
numSNP = nrow(M)
numSample = ncol(M)
calls = oligoClasses::initializeBigMatrix(name="calls", numSNP, numSample, vmode = "integer")
scores = oligoClasses::initializeBigMatrix(name="scores", numSNP, numSample, vmode = "double")
open(calls)
open(scores)
rownames(calls) = rownames(M)
rownames(scores) = rownames(M)
colnames(calls) = colnames(M)
colnames(scores) = colnames(M)
priormeans = calculatePriorValues(M, numSNP, verbose)
### retrieve or calculate coefficients
krlmmCoefficients = getKrlmmVGLMCoefficients(pkgname, trueCalls, VGLMparameters, verbose, numSample, colnames(M))
### do VGLM fit, to predict k for each SNP
kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose)
rm(VGLMparameters)
### assign calls
assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose)
rm(kPrediction)
### assign confidence scores
computeCallPr(scores, M, calls, numSNP, numSample, verbose)
### add back SNPs excluded before
AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex)
loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
YIndex <- getVarInEnv("YIndex")
loader("annotation.rda", .crlmmPkgEnv, pkgname)
annotation <- getVarInEnv("annot")
### impute gender if gender information not provided
if (is.null(gender)) {
gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose)
}
### double-check ChrY SNP cluster, impute gender if gender information not provided
if (!(is.null(gender))) {
verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
}
close(calls)
close(scores)
rm(calls)
rm(scores)
rm(M)
rm(annotation)
rm(YIndex)
TRUE
}
krlmmnopkg <- function(cnSet, offset, gender=NULL, normalize.method=NULL, annotation=NULL, verbose=TRUE) {
if(is.null(normalize.method)){
message("No normalization method specified. Quantile normalization applied")
M=quantilenormalization(cnSet,verbose,offset=offset)
}else{
if(normalize.method=="quantile"){
M=quantilenormalization(cnSet,verbose,offset=offset)
}
if(normalize.method=="loess"){
M=loessnormalization(cnSet,verbose,offset=offset)
}
}
message("Leaving out non-variant SNPs")
# For SNPs with less than 3 distinct data point, exclude them from downstream analysis
uniqueCount = apply(M[,], 1, function(x){length(unique(x))})
SNPtoProcessIndex = uniqueCount >= 3
noVariantSNPIndex = uniqueCount < 3
M = M[SNPtoProcessIndex, ]
numSNP = nrow(M)
numSample = ncol(M)
calls = oligoClasses::initializeBigMatrix(name="calls", numSNP, numSample, vmode = "integer")
scores = oligoClasses::initializeBigMatrix(name="scores", numSNP, numSample, vmode = "double")
open(calls)
open(scores)
rownames(calls) = rownames(M)
rownames(scores) = rownames(M)
colnames(calls) = colnames(M)
colnames(scores) = colnames(M)
prepriormeans=calculatePriorcentersC(M,numSample)
trueCalls=matrix(NA,nrow(M),ncol(M))
for(i in 1:ncol(M)){
tmp=kmeans(M[,i],centers=prepriormeans,nstart=45)
trueCalls[,i]=tmp$cluster
}
colnames(trueCalls)=colnames(M)
rownames(trueCalls)=rownames(M)
if(numSample<=30){
priormeans=prepriormeans
}else{
priormeans = calculatePriorValues(M, numSNP, verbose)
if((abs(priormeans[1]-prepriormeans[1])+abs(priormeans[2]-prepriormeans[2])+abs(priormeans[3]-prepriormeans[3]))>=3){
priormeans=prepriormeans
}else{
priormeans=priormeans
}
}
VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose)
### retrieve or calculate coefficients
krlmmCoefficients = getKrlmmVGLMCoefficients(trueCalls=trueCalls,params=VGLMparameters,numSample=ncol(M),samplenames=colnames(M),verbose=T)
### do VGLM fit, to predict k for each SNP
kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose)
rm(VGLMparameters)
### assign calls
assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose=T)
rm(kPrediction)
### assign confidence scores
computeCallPr(scores, M, calls, numSNP, numSample, verbose)
YIndex = seq(1:nrow(cnSet))[chromosome(cnSet)==24 & isSnp(cnSet)]
XIndex = seq(1:nrow(cnSet))[chromosome(cnSet)==23 & isSnp(cnSet)]
### impute gender if gender information not provided
if (is.null(gender)) {
if(sum(is.na(YIndex))==length(YIndex)){
gender=NULL
}else{
gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose, offset=offset)
}
}
### double-check ChrY ChrX SNP cluster, impute gender if gender information not provided
if (!(is.null(gender))) {
verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
verifyChrXSNPs(cnSet, M, calls, gender, annotation, XIndex, priormeans, verbose)
}
# if (length(YIndex)>0 && !(is.null(gender))) {
# verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
# }
### add back SNPs excluded before
AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex)
close(calls)
close(scores)
rm(calls)
rm(scores)
rm(M)
TRUE
}
#######################################################################################################
loessnormalization<- function(cnSet, verbose, offset=16, blockSize = 300000){
# compute log-ratio in blocksize of 300,000 by default
message("Start computing log-ratios")
A <- A(cnSet)
B <- B(cnSet)
open(A)
open(B)
numSNP <- nrow(A)
numSample <- ncol(A)
M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double")
rownames(M) = rownames(A)
colnames(M) = colnames(A)
numBlock = ceiling(numSNP / blockSize)
for (i in 1:numBlock){
if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm")
M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm")
S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ]
rm(subsetA, subsetB, subsetM,subsetS)
}
close(A)
close(B)
gc()
isna=is.na(M[,1])
prepriormeansL=calculatePriorcentersC(M[,][!isna,],numSample)
F01=abs(prepriormeansL[1])-prepriormeansL[2]
F02=abs(prepriormeansL[3])+prepriormeansL[2]
for(i in 1:ncol(M)) {
isna=is.na(M[,i])
Msub = M[!isna,i]
Mna=M[isna,i]
Ssub = S[!isna,i]
Sna=S[isna,i]
km = kmeans(Msub, prepriormeansL)
ind1v2 = km$cluster==1
ind2v2 = km$cluster==2
ind3v2 = km$cluster==3
l1v2 = loessFit(Msub[ind1v2], Ssub[ind1v2])
l2v2 = loessFit(Msub[ind2v2], Ssub[ind2v2])
l3v2 = loessFit(Msub[ind3v2], Ssub[ind3v2])
Msub[ind1v2] = l1v2$residuals+F01
Msub[ind2v2] = l2v2$residuals
Msub[ind3v2] = l3v2$residuals-F02
Mnew=rbind(as.matrix(Msub),as.matrix(Mna))
rownames(Mnew)=c(rownames(as.matrix(Msub)),rownames(as.matrix(Mna)))
m=match(rownames(M),rownames(Mnew))
Mnew=Mnew[m]
M[,i] = Mnew
rm(Msub, Ssub, ind1v2, ind2v2, ind3v2, l1v2, l2v2, l3v2)
}
return(M)
}
#######################################################################################################
quantilenormalization <- function(cnSet, verbose, offset=16, blockSize = 300000){
A <- A(cnSet)
B <- B(cnSet)
open(A)
open(B)
numSNP <- nrow(A)
numSample <- ncol(A)
M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
X <- oligoClasses::initializeBigMatrix(name="X", numSNP, numSample, vmode = "integer")
Y <- oligoClasses::initializeBigMatrix(name="Y", numSNP, numSample, vmode = "integer")
rownames(M) = rownames(X) = rownames(Y) = rownames(A)
colnames(M) = colnames(X) = colnames(Y) = colnames(A)
# This step may chew up a lot of memory...
X = normalize.quantiles(as.matrix(A[,]))+offset
Y = normalize.quantiles(as.matrix(B[,]))+offset
numBlock = ceiling(numSNP / blockSize)
for (i in 1:numBlock){
if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
subsetXqws = as.matrix(X[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
subsetYqws = as.matrix(Y[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
subsetM = .Call("krlmmComputeM", subsetXqws, subsetYqws, PACKAGE="crlmm")
M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
rm(subsetYqws, subsetXqws, subsetM)
}
close(A)
close(B)
return(M)
delete(X)
delete(Y)
rm(X)
rm(Y)
gc()
}
#######################################################################################################
getNumberOfCores <- function(){
defaultCores = min(detectCores(), 8)
return(getOption("krlmm.cores", defaultCores))
}
#######################################################################################################
computeLogRatio <- function(cnSet, verbose=FALSE, offset=0, blockSize = 500000, col=NULL, row=NULL){
# compute log-ratio in blocksize of 500,000 by default
if(verbose)
message("Start computing log-ratios")
if(!is.null(col) | !is.null(row)) {
if(is.null(row)) {
A <- A(cnSet)[,col]
B <- B(cnSet)[,col]
}
if(is.null(col)) {
A <- A(cnSet)[row,]
B <- B(cnSet)[row,]
}
if(!is.null(col) & !is.null(row)) {
A <- A(cnSet)[row,col]
B <- B(cnSet)[row,col]
}
} else {
A <- A(cnSet)
B <- B(cnSet)
}
open(A)
open(B)
numSNP <- nrow(A)
numSample <- ncol(A)
M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
rownames(M) = rownames(A)
colnames(M) = colnames(A)
numBlock = ceiling(numSNP / blockSize)
for (i in 1:numBlock){
if(verbose) message(" -- Processing segment ", i, " out of ", numBlock)
subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm")
M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
rm(subsetA, subsetB, subsetM)
}
close(A)
close(B)
if(verbose)
message("Done computing log-ratios")
return(M)
}
computeAverageLogIntensity <- function(cnSet, verbose=FALSE, offset=0, blockSize = 500000, col=NULL, row=NULL){
# compute average log intensity in blocksize of 500,000 by default
if(verbose)
message("Start computing average log-intensities")
if(!is.null(col) | !is.null(row)) {
if(is.null(row)) {
A <- A(cnSet)[,col]
B <- B(cnSet)[,col]
}
if(is.null(col)) {
A <- A(cnSet)[row,]
B <- B(cnSet)[row,]
}
if(!is.null(col) & !is.null(row)) {
A <- A(cnSet)[row,col]
B <- B(cnSet)[row,col]
}
} else {
A <- A(cnSet)
B <- B(cnSet)
}
open(A)
open(B)
numSNP <- nrow(A)
numSample <- ncol(A)
S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double")
rownames(S) = rownames(A)
colnames(S) = colnames(A)
numBlock = ceiling(numSNP / blockSize)
for (i in 1:numBlock){
if(verbose) message(" -- Processing segment ", i, " out of ", numBlock)
subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])+offset
subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm")
S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ]
rm(subsetA, subsetB, subsetS)
}
close(A)
close(B)
if(verbose)
message("Done computing average log-intensities")
return(S)
}
#######################################################################################################
calculatePriorValues <- function(M, numSNP, verbose) {
calculateOneKmeans <- function(x) {
tmp = kmeans(x, 3, nstart=45)
return(sort(tmp$centers, decreasing = T))
}
if (verbose) message("Start calculating prior means")
cl <- makeCluster(getNumberOfCores())
centers <- parRapply(cl, M, calculateOneKmeans)
stopCluster(cl)
centers <- matrix(centers, numSNP, 3, byrow = TRUE)
priormeans = apply(centers, 2, FUN="median", na.rm=TRUE)
if(abs(sum(priormeans))>1) {
checksymmetric= apply(centers,1,function(x){abs(sum(x))})<1
priormeans=apply(centers[checksymmetric,],2, FUN="median", na.rm=TRUE)
}
if (verbose) message("Done calculating prior means")
return(priormeans)
}
calculatePriorcentersC <- function(M, numSample) {
calculateOneKmeans <- function(x) {
tmp = kmeans(x, 3, nstart=45)
return(sort(tmp$centers, decreasing = T))
}
cl <- makeCluster(getNumberOfCores())
centers <- parCapply(cl, M, calculateOneKmeans)
stopCluster(cl)
centers <- matrix(centers, numSample, 3, byrow = TRUE)
prepriormeans = apply(centers, 2, FUN="median", na.rm=TRUE)
return(prepriormeans)
}
#######################################################################################################
calculateKrlmmCoefficients <- function(trueCalls, params, numSample, samplenames){
# if (!require(VGAM)) {
# message("VGAM package not found, fall back to use defined platform-specific coefficients")
# return(NULL)
# }
amatch = match(rownames(params), rownames(trueCalls))
amatch = amatch[!is.na(amatch)]
trueCalls = trueCalls[amatch,]
amatch = match(rownames(trueCalls), rownames(params))
params = params[amatch, ]
amatch = match(samplenames, colnames(trueCalls))
amatch = amatch[!is.na(amatch)]
trueCalls = trueCalls[, amatch]
if ((numSample <= 40) && (ncol(trueCalls) < round(numSample/2))){
message("Sample size is ", numSample, ", KRLMM requires at least trueCalls of ", round(numSample/2), " samples to calculate coefficients")
return(NULL)
}
if ((numSample > 40) && (ncol(trueCalls) < 20)){
message("At least trueCalls of 20 samples are required to calculate coefficients")
return(NULL)
}
if (nrow(trueCalls) < 1000){
message("At lease trueCalls of 1000 SNPs are required to calculate coefficients")
return(NULL)
}
getna = apply(trueCalls, 1, function(x){sum(is.na(x))>=10})
truek = apply(trueCalls, 1, function(x){length(unique(x[!is.na(x)]))})
params1 = params[!getna,]
truek1 = truek[!getna]
xx = data.frame(params1)
t = as.factor(as.numeric(truek1))
xx = data.frame(xx, t)
fit = suppressWarnings(vglm(t~., multinomial(refLevel=1), xx))
coe = coefficients(fit) # VGAM::
return(coe)
}
getKrlmmVGLMCoefficients <- function(pkgname, trueCalls, params, verbose, numSample, samplenames){
if (!is.null(trueCalls)) {
coe = calculateKrlmmCoefficients(trueCalls, params, numSample, samplenames)
if (!is.null(coe)) {
if (verbose) message ("Done calculating platform-specific coefficients to predict number of clusters")
return(coe)
}
}
if (!is.null(trueCalls)) message("Fall back to use defined platform-specific coefficients")
if (verbose) message ("Retrieving defined platform-specific coefficients to predict number of clusters")
loader("krlmmVGLMCoefficients.rda", .crlmmPkgEnv, pkgname)
return(getVarInEnv("krlmmCoefficients"))
}
predictKwithVGLM <- function(data, coe, verbose){
if (verbose) message("Start predicting number of clusters")
logit1 <- rep(0, nrow(data))
logit2 <- coe[1]+coe[3]*data[,1]+coe[5]*data[,2]+coe[7]*data[,3]+coe[9]*data[,4]+coe[11]*data[,5]+coe[13]*data[,6]+coe[15]*data[,7]+coe[17]*data[,8]
logit23 <- coe[2]+coe[4]*data[,1]+coe[6]*data[,2]+coe[8]*data[,3]+coe[10]*data[,4]+coe[12]*data[,5]+coe[14]*data[,6]+coe[16]*data[,7]+coe[18]*data[,8]
logits <- cbind(logit1, logit2, logit23)
rm(logit1)
rm(logit2)
rm(logit23)
p.unscaled <- exp(logits)
rm(logits)
p <- p.unscaled / rowSums(p.unscaled)
clusterPrediction = apply(p, 1, function(x){which.max(x)})
rm(p.unscaled)
rm(p)
if (verbose) message("Done predicting number of clusters")
return(clusterPrediction)
}
#######################################################################################################
assignCalls3Cluster <- function(intensities, priormeans){
prior31 = c(priormeans[1]/2,priormeans[2],priormeans[3])
prior32 = c(priormeans[1],priormeans[2],priormeans[3]/2)
emp <- rep(NA, length(intensities))
ansCalls <- emp
tmp <- try(kmeans(intensities, priormeans, nstart=1),TRUE)
if(class(tmp) == "try-error") {
tmp1 <- try(kmeans(intensities, prior31, nstart=1),TRUE)
if(class(tmp1) == "try-error"){
tmp2 <- try(kmeans(intensities, prior32, nstart=1),TRUE)
if(class(tmp2) == "try-error"){
ansCalls = emp
}else{
ansCalls = tmp2$cluster
}
}else{
ansCalls = tmp1$cluster
}
}else{
ansCalls = tmp$cluster
}
rm(prior31, prior32)
return(ansCalls)
}
#######################################################################################################
assignCalls2Cluster <- function(intensities, priormeans){
closest <- rep(NA, 3)
for(j in 1:3){
distance <- as.vector(abs(priormeans[j]-intensities))
closest[j] <- intensities[which.min(distance)]
}
prior2 <- priormeans[priormeans!=priormeans[which.max(abs(closest-priormeans))]]
emp <- rep(NA, length(intensities))
ansCalls <- emp
tmp <- try(kmeans(intensities, prior2, nstart=1), TRUE)
if(class(tmp) == "try-error") {
ansCalls <- emp
}else{
if(prior2[1]==priormeans[2] && prior2[2]==priormeans[3]){
mp <- abs(tmp$centers[1]-tmp$centers[2])
pp <- abs(priormeans[2]-priormeans[3])
if((mp/pp)<=0.25){
ansCalls <- emp
}else{
ansCalls <- tmp$cluster+1
}
}
if(prior2[1]==priormeans[1] && prior2[2]==priormeans[2]){
mp=abs(tmp$centers[1]-tmp$centers[2])
pp=abs(priormeans[1]-priormeans[2])
if((mp/pp)<=0.25){
ansCalls = emp
}else{
ansCalls <- tmp$cluster
}
}
if(prior2[1]==priormeans[1] && prior2[2]==priormeans[3]){
mp <- abs(tmp$centers[1]-tmp$centers[2])
pp <- abs(priormeans[1]-priormeans[3])
if ((mp/pp) <=0.25){
ansCalls <- emp
}else{
ansCalls[tmp$cluster==1] <- 1
ansCalls[tmp$cluster==2] <- 3
}
}
}
rm(tmp)
return(ansCalls)
}
#######################################################################################################
assignCalls1Cluster <- function(intensities, priormeans){
closest <- rep(NA, 3)
for(j in 1:3){
distance <- as.vector(abs(priormeans[j]-intensities))
closest[j]=intensities[which.min(distance)]
}
clusterindex <- which.min(abs(closest-priormeans))
return(rep(clusterindex, length(intensities)))
}
#######################################################################################################
assignCallsOneSNP <- function(x, priormeans, numSample){
tolerance = 1e-3
k = x[numSample + 1]
values = x[1:numSample]
if (abs(k - 2) <= tolerance) {
tmp <- try(kmeans(values, priormeans, nstart=1),TRUE)
if (!(class(tmp)=="try-error")) {
k <- 3;
}
}
successful <- FALSE;
if (abs(k - 3) <= tolerance){
SNPCalls <- assignCalls3Cluster(values, priormeans)
successful <- !is.na(SNPCalls[1])
if (!successful) {
k <- 2
}
}
if ( (abs(k - 2) <= tolerance) && (!successful)){
SNPCalls <- assignCalls2Cluster(values, priormeans)
successful <- !is.na(SNPCalls[1])
if (!successful) {
k <- 1
}
}
if ( (abs(k - 1) <= tolerance) && (!successful)){
SNPCalls <- assignCalls1Cluster(values, priormeans)
}
return(SNPCalls)
}
assignCalls <- function(callsGt, M, a, priormeans, numSNP, numSample, verbose, blockSize=500000){
# process by block size of 500,000 by default
message("Start assigning calls")
numBlock = ceiling(numSNP / blockSize)
for (i in 1:numBlock){
if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
thisnumSNP = nrow(subsetM)
subseta = a[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP)]
subseta = matrix(subseta, thisnumSNP, 1)
testValues = cbind(subsetM, subseta)
cl <- makeCluster(getNumberOfCores())
callAns <- parRapply(cl, testValues, assignCallsOneSNP, priormeans, numSample)
stopCluster(cl)
callAnsAllValues = unlist(callAns)
subsetcalls <- matrix(callAnsAllValues, thisnumSNP, numSample, byrow = TRUE)
callsGt[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetcalls[1:thisnumSNP, ]
rm(subsetM, subseta, subsetcalls)
}
message("Done assigning calls")
}
#######################################################################################################
calculateParameters <- function(M, priormeans, numSNP, verbose) {
if (verbose) message("Start calculating 3-cluster parameters")
params3cluster <- calculateParameters3Cluster(M, priormeans, numSNP, verbose);
if (verbose) message("Done calculating 3-cluster parameters")
if (verbose) message("Start calculating 2-cluster parameters")
params2cluster <- calculateParameters2Cluster(M, priormeans, numSNP, verbose);
if (verbose) message("Done calculating 2-cluster parameters")
if (verbose) message("Start calculating 1-cluster parameters")
params1cluster <- calculateParameters1Cluster(M, priormeans, numSNP, verbose);
if (verbose) message("Done calculating 1-cluster parameters")
parameters <- cbind(as.matrix(params1cluster$elemh), as.matrix(params1cluster$eless), as.matrix(params2cluster$elemh), as.matrix(params2cluster$eless),
as.matrix(params3cluster$elemh), as.matrix(params3cluster$eless), as.matrix(params2cluster$elehw), as.matrix(params3cluster$elehw));
rownames(parameters) = rownames(M)
rm(params3cluster)
rm(params2cluster)
rm(params1cluster)
return(parameters)
}
hardyweinberg <- function(x, minN = 8){
if (length(x) < minN){
return(NA)
} else {
result = .Call("krlmmHardyweinberg", x)
return(result)
}
}
calculateOneParams3Cluster <- function(x, priors, priorDown, priorUp){
tmp = try(kmeans(x, priors, nstart=1), TRUE)
if(class(tmp) == "kmeans") {
flag = 1
} else {
tmp = try(kmeans(x, priorDown, nstart=1), TRUE)
if(class(tmp) == "kmeans") {
flag = 2
} else {
tmp = try(kmeans(x, priorUp, nstart=1), TRUE)
if (class(tmp) == "kmeans") {
flag = 3
} else {
tmp = kmeans(x, 3, nstart=35)
flag = 4
}
}
}
ss = sum(tmp$withinss)
hw = hardyweinberg(tmp$cluster)
centers = sort(tmp$centers, decreasing = TRUE)
return(c(centers, ss, hw, flag))
}
calculateParameters3Cluster <- function(M, priormeans, numSNP, verbose) {
Apriors = priormeans
ApriorDown = c(Apriors[1]/2, Apriors[2], Apriors[3]) # shift-down
ApriorUp = c(Apriors[1], Apriors[2], Apriors[3]/2) # shift-up
cl <- makeCluster(getNumberOfCores())
clusterEvalQ(cl, library(crlmm))
parAns <- parRapply(cl, M, calculateOneParams3Cluster, Apriors, ApriorDown, ApriorUp)
stopCluster(cl)
parAnsAllValues = unlist(parAns)
parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
centers <- parameters[, 1:3]
ss <- parameters[, 4]
hw <- parameters[, 5]
flag <- parameters[, 6]
rm(parAns)
rm(parameters)
sigma=solve(cov(centers, use="complete.obs"))
mh = calculateMahalDist3Cluster(centers, sigma, flag, Apriors, ApriorDown, ApriorUp, numSNP)
rm(sigma)
rm(centers)
gc()
return(list(eless = ss, elemh = mh, elehw = hw))
}
calculateMahalDist3Cluster <- function(centers, sigma, flag, priors, priorDown, priorUp, numSNP){
mahaldist = rep(NA, numSNP)
tolerance = 1e-3
for (i in 1:numSNP) {
if ((abs(flag[i] - 1) <= tolerance) || (abs(flag[i]- 4) <= tolerance)) difference = centers[i, ] - priors
if (abs(flag[i] - 2) <= tolerance) difference = centers[i, ] - priorDown
if (abs(flag[i] - 3) <= tolerance) difference = centers[i, ] - priorUp
mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)
}
return(mahaldist)
}
calculateOneParams2Cluster <- function(x, priors){
aa = rep(NA, 3)
for(j in 1:3){
dist = as.vector(abs(priors[j]-x))
aa[j]=x[which.min(dist)]
}
prior2 = priors[priors!=priors[which.max(abs(aa-priors))]]
tmp = try(kmeans(x, prior2, nstart=1), TRUE)
if (class(tmp)=="kmeans") {
centers = tmp$centers
hw = hardyweinberg(tmp$cluster)
}
rm(tmp)
tmp = kmeans(x, 2, nstart = 35)
if (tmp$centers[1] < tmp$centers[2]){
centers = c(tmp$centers[2], tmp$centers[1])
hw = hardyweinberg(3 - tmp$cluster)
} else {
centers = tmp$centers
hw = hardyweinberg(tmp$cluster)
}
ss = sum(tmp$withinss)
return(c(centers, prior2, ss, hw))
}
calculateParameters2Cluster <- function(M, priormeans, numSNP, verbose) {
Apriors = priormeans
cl <- makeCluster(getNumberOfCores())
clusterEvalQ(cl, library(crlmm))
parAns <- parRapply(cl, M, calculateOneParams2Cluster, Apriors)
stopCluster(cl)
parAnsAllValues = unlist(parAns)
parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
centers <- parameters[, 1:2]
priors2cluster <- parameters[, 3:4]
ss <- parameters[, 5]
hw <- parameters[, 6]
rm(parAns)
rm(parameters)
sigma=solve(cov(centers, use="complete.obs"))
mh = calculateMahalDist2Cluster(centers, sigma, priors2cluster, numSNP)
rm(sigma)
rm(centers)
gc()
return(list(eless = ss, elemh = mh, elehw = hw))
}
calculateMahalDist2Cluster <- function(centers, sigma, priors, numSNP){
mahaldist = rep(NA, numSNP)
for (i in 1:numSNP) {
difference <- centers[i,] - priors[i,]
mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)
}
return(mahaldist)
}
calculateOneParams1Cluster <- function(x, priors){
center = mean(x)
diff <- x - center
diffsquare <- diff^2
ss = sum(diffsquare)
closestPrior = priors[which.min(abs(priors - center))]
return(c(center, closestPrior, ss))
}
calculateParameters1Cluster <- function(M, priormeans, numSNP, verbose) {
Apriors = priormeans
cl <- makeCluster(getNumberOfCores())
clusterEvalQ(cl, library(crlmm))
parAns <- parRapply(cl, M, calculateOneParams1Cluster, Apriors)
stopCluster(cl)
parAnsAllValues = unlist(parAns)
parameters <- matrix(parAnsAllValues, numSNP, 3, byrow = TRUE)
centers <- matrix(parameters[, 1], numSNP, 1)
prior1cluster <- matrix(parameters[, 2], numSNP, 1)
ss <- parameters[, 3]
rm(parAns)
rm(parameters)
sigma=solve(cov(centers, use="complete.obs"))
mh = calculateMahalDist1Cluster(centers, sigma, prior1cluster, numSNP)
rm(sigma)
rm(centers)
gc()
return(list(eless = ss, elemh = mh))
}
calculateMahalDist1Cluster <- function(centers, sigma, priors, numSNP){
mahaldist = rep(NA, numSNP)
for(i in 1:numSNP) {
difference <- as.vector(centers[i, 1] - priors[i, 1])
mahaldist[i]=difference%*%sigma%*%difference
}
return(mahaldist)
}
#############################################
computeCallPr <- function(callsPr, M, calls, numSNP, numSample, verbose, blockSize = 500000){
# compute log-ratio in blocksize of 500,000 by default
if (verbose) message("Start computing confidence scores")
numBlock = ceiling(numSNP / blockSize)
for (i in 1:numBlock){
if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
subsetCalls = as.matrix(calls[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
subsetCallProb = .Call("krlmmConfidenceScore", subsetM, subsetCalls, PACKAGE="crlmm");
callsPr[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetCallProb[1:nrow(subsetM), ]
rm(subsetM, subsetCalls, subsetCallProb)
}
if (verbose) message("Done computing confidence scores")
}
#############################################
AddBackNoVarianceSNPs <- function(cnSet, calls, scores, numSNP, numSample, variantSNPIndex, noVariantSNPIndex){
callsGt = calls(cnSet)
callsPr = snpCallProbability(cnSet)
open(callsGt)
open(callsPr)
callsGt[variantSNPIndex, ] = calls[,]
callsPr[variantSNPIndex, ] = scores[,]
callsGt[noVariantSNPIndex, ] = NA
callsPr[noVariantSNPIndex, ] = 0
close(callsGt)
close(callsPr)
}
#############################################
krlmmImputeGender <- function(cnSet, annotation, YIndex, verbose, offset=0){
if (verbose) message("Start imputing gender")
S = computeAverageLogIntensity(cnSet, verbose, offset=offset)
# S is calculated and saved in original SNP order.
matchy = match(rownames(annotation)[YIndex], rownames(S))
matchy = matchy[!is.na(matchy)]
if (length(matchy) <= 10){
predictedGender = rep(NA, ncol(S))
}
Sy = S[matchy,]
uniqueDataPoint = apply(Sy, 1, function(x){length(unique(x))})
validYSNPs = uniqueDataPoint >= 2
SyValid = Sy[validYSNPs, ]
if (nrow(SyValid) < 20){
message("Not enough ChrY SNPs, skipping gender prediction step");
predictedGender = rep(NA, ncol(Sy))
}
rm(S)
numYChrSNP = nrow(SyValid)
allS = matrix(NA, numYChrSNP, 2)
for (i in 1:numYChrSNP) {
tmp = kmeans(SyValid[i,] ,2, nstart=45)
allS[i,] = sort(tmp$centers, decreasing=F)
}
priorS = apply(allS, 2, FUN="median", na.rm=TRUE)
if (abs(priorS[1] - priorS[2]) <= 1.6) {
message("Separation between clusters too small (samples probabaly all the same gender): ");
}
meanmatrix = apply(Sy, 2, median)
Sy1 = abs(meanmatrix - priorS[1])
Sy2 = abs(meanmatrix - priorS[2])
# output male - 1, female - 2, female S-values are smaller than male S-values.
test = cbind(Sy2, Sy1)
predictedGender = apply(test, 1, which.min)
open(cnSet$gender)
cnSet$gender[,] = predictedGender
close(cnSet$gender)
if (verbose) message("Done imputing gender")
return(predictedGender)
}
#############################################
verifyChrYSNPs <- function(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose){
if (verbose) message("Start verifying SNPs on Chromosome Y")
callsGt = calls(cnSet)
callsPr = snpCallProbability(cnSet)
open(callsGt)
open(callsPr)
matchy = match(rownames(annotation)[YIndex], rownames(M))
matchy = matchy[!is.na(matchy)]
MChrY = M[matchy,]
callsChrY = calls[matchy,]
male = gender == 1
female = gender == 2
checkK = apply(callsChrY[, male], 1, function(x){ length(unique(x[!is.na(x)])) } )
for(i in 1:nrow(MChrY)){
# Chromosome Y SNPs, no calls for female, k = 1 or 2 permitted for male samples
thisChrYSNPorigPosition = match(rownames(callsChrY)[i], rownames(callsGt))
callsGt[thisChrYSNPorigPosition, female] = NA
callsPr[thisChrYSNPorigPosition, female] = 0
# early exit for k = 1 or 2 cases. Only re-assign calls to male samples if we previouly assigned
# male samples to 3 clusters by mistake.
if (checkK[i] < 3) next;
if (class(try(kmeans(MChrY[i, male], c(priormeans[1], priormeans[3]), nstart=1), TRUE)) != "try-error"){
maleSampleCalls = kmeans(MChrY[i, male],c(priormeans[1], priormeans[3]), nstart=45)$cluster
callsGt[thisChrYSNPorigPosition, male][maleSampleCalls == 1] = 1
callsGt[thisChrYSNPorigPosition, male][maleSampleCalls == 2] = 3
} else {
distanceToPrior1 = mean(abs(MChrY[i, male] - priormeans[1]))
distanceToPrior3 = mean(abs(MChrY[i, male] - priormeans[3]))
callsGt[thisChrYSNPorigPosition, male] = ifelse(distanceToPrior1 < distanceToPrior3, 1, 3)
}
MMaleSamples = MChrY[i, male]
callsMaleSample = callsGt[thisChrYSNPorigPosition, male]
scoresMaleSample = .Call("krlmmConfidenceScore", t(as.matrix(MMaleSamples)), t(as.matrix(callsMaleSample)), PACKAGE="crlmm");
callsPr[thisChrYSNPorigPosition, male] = scoresMaleSample
}
close(callsGt)
close(callsPr)
if (verbose) message("Done verifying SNPs on Chromosome Y")
}
#######################################
verifyChrXSNPs <- function(cnSet, M, calls, gender, annotation, XIndex, priormeans, verbose){
if (verbose) message("Start verifying SNPs on Chromosome X")
callsGt = calls(cnSet)
callsPr = snpCallProbability(cnSet)
open(callsGt)
open(callsPr)
matchx = match(rownames(annotation)[XIndex], rownames(M))
matchx = matchx[!is.na(matchx)]
MChrX= M[matchx,]
callsChrX = callsGt[matchx,]
male = gender == 1
female = gender == 2
checkK = apply(callsChrX[, male], 1, function(x){ length(unique(x[!is.na(x)])) } )
for(i in 1:nrow(MChrX)){
# Chromosome X SNPs for male, k = 1 or 2 permitted for male samples
thisChrXSNPorigPosition = match(rownames(callsChrX)[i], rownames(callsGt))
# early exit for k = 1 or 2 cases. Only re-assign calls to male samples if we previouly assigned
# male samples to 3 clusters by mistake.
if (checkK[i] < 3) next;
if (class(try(kmeans(MChrX[i, male], c(priormeans[1], priormeans[3]), nstart=1), TRUE)) != "try-error"){
maleSampleCalls = kmeans(MChrX[i, male],c(priormeans[1], priormeans[3]), nstart=45)$cluster
callsGt[thisChrXSNPorigPosition, male][maleSampleCalls == 1] = 1
callsGt[thisChrXSNPorigPosition, male][maleSampleCalls == 2] = 3
} else {
distanceToPrior1 = mean(abs(MChrX[i, male] - priormeans[1]))
distanceToPrior3 = mean(abs(MChrX[i, male] - priormeans[3]))
callsGt[thisChrXSNPorigPosition, male] = ifelse(distanceToPrior1 < distanceToPrior3, 1, 3)
}
MMaleSamples = MChrX[i, male]
callsMaleSample = callsGt[thisChrXSNPorigPosition, male]
scoresMaleSample = .Call("krlmmConfidenceScore", t(as.matrix(MMaleSamples)), t(as.matrix(callsMaleSample)), PACKAGE="crlmm");
callsPr[thisChrXSNPorigPosition, male] = scoresMaleSample
}
close(callsGt)
close(callsPr)
if (verbose) message("Done verifying SNPs on Chromosome X")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.