krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
pkgname <- getCrlmmAnnotationName(cdfName) # crlmm:::
### pre-processing, output M
M = computeLogRatio(cnSet, verbose)
message("leaving out novariant SNPs")
# For SNPs with less than 3 distinct data point, exclude them from downstream analysis
uniqueCount = apply(M[,], 1, function(x){length(unique(x))})
SNPtoProcessIndex = uniqueCount >= 3
noVariantSNPIndex = uniqueCount < 3
M = M[SNPtoProcessIndex, ]
numSNP = nrow(M)
numSample = ncol(M)
calls = oligoClasses::initializeBigMatrix(name="calls", numSNP, numSample, vmode = "integer")
scores = oligoClasses::initializeBigMatrix(name="scores", numSNP, numSample, vmode = "double")
open(calls)
open(scores)
rownames(calls) = rownames(M)
rownames(scores) = rownames(M)
colnames(calls) = colnames(M)
colnames(scores) = colnames(M)
priormeans = calculatePriorValues(M, numSNP, verbose)
VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose)
### retrieve or calculate coefficients
krlmmCoefficients = getKrlmmVGLMCoefficients(pkgname, trueCalls, VGLMparameters, verbose, numSample, colnames(M))
### do VGLM fit, to predict k for each SNP
kPrediction <- predictKwithVGLM(VGLMparameters, krlmmCoefficients, verbose);
rm(VGLMparameters)
### assign calls
assignCalls(calls, M, kPrediction, priormeans, numSNP, numSample, verbose);
rm(kPrediction)
### assign confidence scores
computeCallPr(scores, M, calls, numSNP, numSample, verbose)
### add back SNPs excluded before
AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex)
loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
YIndex <- getVarInEnv("YIndex")
loader("annotation.rda", .crlmmPkgEnv, pkgname)
annotation <- getVarInEnv("annot")
### impute gender if gender information not provided
if (is.null(gender)) {
gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose)
}
### double-check ChrY SNP cluster, impute gender if gender information not provided
if (!(is.null(gender))) {
verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
}
close(calls)
close(scores)
rm(calls)
rm(scores)
rm(M)
rm(annotation)
rm(YIndex)
TRUE
}
#######################################################################################################
getNumberOfCores <- function(){
defaultCores = min(detectCores(), 8)
return(getOption("krlmm.cores", defaultCores))
}
#######################################################################################################
computeLogRatio <- function(cnSet, verbose, blockSize = 500000){
# compute log-ratio in blocksize of 500,000 by default
message("Start computing log ratio")
A <- A(cnSet)
B <- B(cnSet)
open(A)
open(B)
numSNP <- nrow(A)
numSample <- ncol(A)
M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
rownames(M) = rownames(A)
colnames(M) = colnames(A)
numBlock = ceiling(numSNP / blockSize)
for (i in 1:numBlock){
if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
subsetM = .Call("krlmmComputeM", subsetA, subsetB, PACKAGE="crlmm")
M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetM[1:nrow(subsetM), ]
rm(subsetA, subsetB, subsetM)
}
close(A)
close(B)
if (verbose) message("Done computing log ratio")
return(M)
}
computeAverageLogIntensity <- function(cnSet, verbose, blockSize = 500000){
# compute average log intensity in blocksize of 500,000 by default
message("Start computing average log intensity")
A <- A(cnSet)
B <- B(cnSet)
open(A)
open(B)
numSNP <- nrow(A)
numSample <- ncol(A)
S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double")
rownames(S) = rownames(A)
colnames(S) = colnames(A)
numBlock = ceiling(numSNP / blockSize)
for (i in 1:numBlock){
if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
subsetA = as.matrix(A[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
subsetB = as.matrix(B[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
subsetS = .Call("krlmmComputeS", subsetA, subsetB, PACKAGE="crlmm")
S[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetS[1:nrow(subsetS), ]
rm(subsetA, subsetB, subsetS)
}
close(A)
close(B)
if (verbose) message("Done computing average log intensity")
return(S)
}
#######################################################################################################
calculatePriorValues <- function(M, numSNP, verbose) {
calculateOneKmeans <- function(x) {
tmp = kmeans(x, 3, nstart=45)
return(sort(tmp$centers, decreasing = T))
}
if (verbose) message("Start calculating Prior Means")
cl <- makeCluster(getNumberOfCores())
centers <- parRapply(cl, M, calculateOneKmeans)
stopCluster(cl)
centers <- matrix(centers, numSNP, 3, byrow = TRUE)
priormeans = apply(centers, 2, FUN="median", na.rm=TRUE)
if(abs(sum(priormeans))>1)
checksymmetric= apply(centers,1,function(x){abs(sum(x))})<1
priormeans=apply(centers[checksymmetric,],2, FUN="median", na.rm=TRUE)
if (verbose) message("Done calculating Prior Means")
return(priormeans)
}
#######################################################################################################
calculateKrlmmCoefficients <- function(trueCalls, params, numSample, samplenames){
# if (!require(VGAM)) {
# message("VGAM package not found, fall back to use defined platform-specific coefficients")
# return(NULL)
# }
amatch = match(rownames(params), rownames(trueCalls))
amatch = amatch[!is.na(amatch)]
trueCalls = trueCalls[amatch,]
amatch = match(rownames(trueCalls), rownames(params))
params = params[amatch, ]
amatch = match(samplenames, colnames(trueCalls))
amatch = amatch[!is.na(amatch)]
trueCalls = trueCalls[, amatch]
if ((numSample <= 40) && (ncol(trueCalls) < round(numSample/2))){
message("Sample size is ", numSample, ", KRLMM requires at least trueCalls of ", round(numSample/2), " samples to calculate coefficients")
return(NULL)
}
if ((numSample > 40) && (ncol(trueCalls) < 20)){
message("At least trueCalls of 20 samples are required to calculate coefficients")
return(NULL)
}
if (nrow(trueCalls) < 1000){
message("At lease trueCalls of 1000 SNPs are required to calculate coefficients")
return(NULL)
}
getna = apply(trueCalls, 1, function(x){sum(is.na(x))>=10})
truek = apply(trueCalls, 1, function(x){length(unique(x[!is.na(x)]))})
params1 = params[!getna,]
truek1 = truek[!getna]
xx = data.frame(params1)
t = as.factor(as.numeric(truek1))
xx = data.frame(xx, t)
fit = suppressWarnings(vglm(t~., multinomial(refLevel=1), xx))
coe = coefficients(fit) # VGAM::
return(coe)
}
getKrlmmVGLMCoefficients <- function(pkgname, trueCalls, params, verbose, numSample, samplenames){
if (!is.null(trueCalls)) {
coe = calculateKrlmmCoefficients(trueCalls, params, numSample, samplenames)
if (!is.null(coe)) {
if (verbose) message ("Done calculating platform-specific coefficients to predict number of clusters")
return(coe)
}
}
if (!is.null(trueCalls)) message("Fall back to use defined platform-specific coefficients")
if (verbose) message ("Retrieving defined platform-specific coefficients to predict number of clusters")
loader("krlmmVGLMCoefficients.rda", .crlmmPkgEnv, pkgname)
return(getVarInEnv("krlmmCoefficients"))
}
predictKwithVGLM <- function(data, coe, verbose){
if (verbose) message("Start predicting number of clusters")
logit1 <- rep(0, nrow(data))
logit2 <- coe[1]+coe[3]*data[,1]+coe[5]*data[,2]+coe[7]*data[,3]+coe[9]*data[,4]+coe[11]*data[,5]+coe[13]*data[,6]+coe[15]*data[,7]+coe[17]*data[,8]
logit23 <- coe[2]+coe[4]*data[,1]+coe[6]*data[,2]+coe[8]*data[,3]+coe[10]*data[,4]+coe[12]*data[,5]+coe[14]*data[,6]+coe[16]*data[,7]+coe[18]*data[,8]
logits <- cbind(logit1, logit2, logit23)
rm(logit1)
rm(logit2)
rm(logit23)
p.unscaled <- exp(logits)
rm(logits)
p <- p.unscaled / rowSums(p.unscaled)
clusterPrediction = apply(p, 1, function(x){which.max(x)})
rm(p.unscaled)
rm(p)
if (verbose) message("Done predicting number of clusters")
return(clusterPrediction)
}
#######################################################################################################
assignCalls3Cluster <- function(intensities, priormeans){
prior31 = c(priormeans[1]/2,priormeans[2],priormeans[3])
prior32 = c(priormeans[1],priormeans[2],priormeans[3]/2)
emp <- rep(NA, length(intensities))
ansCalls <- emp
tmp <- try(kmeans(intensities, priormeans, nstart=1),TRUE)
if(class(tmp) == "try-error") {
tmp1 <- try(kmeans(intensities, prior31, nstart=1),TRUE)
if(class(tmp1) == "try-error"){
tmp2 <- try(kmeans(intensities, prior32, nstart=1),TRUE)
if(class(tmp2) == "try-error"){
ansCalls = emp
}else{
ansCalls = tmp2$cluster
}
}else{
ansCalls = tmp1$cluster
}
}else{
ansCalls = tmp$cluster
}
rm(prior31, prior32)
return(ansCalls)
}
#######################################################################################################
assignCalls2Cluster <- function(intensities, priormeans){
closest <- rep(NA, 3)
for(j in 1:3){
distance <- as.vector(abs(priormeans[j]-intensities))
closest[j] <- intensities[which.min(distance)]
}
prior2 <- priormeans[priormeans!=priormeans[which.max(abs(closest-priormeans))]]
emp <- rep(NA, length(intensities))
ansCalls <- emp
tmp <- try(kmeans(intensities, prior2, nstart=1), TRUE)
if(class(tmp) == "try-error") {
ansCalls <- emp
}else{
if(prior2[1]==priormeans[2] && prior2[2]==priormeans[3]){
mp <- abs(tmp$centers[1]-tmp$centers[2])
pp <- abs(priormeans[2]-priormeans[3])
if((mp/pp)<=0.25){
ansCalls <- emp
}else{
ansCalls <- tmp$cluster+1
}
}
if(prior2[1]==priormeans[1] && prior2[2]==priormeans[2]){
mp=abs(tmp$centers[1]-tmp$centers[2])
pp=abs(priormeans[1]-priormeans[2])
if((mp/pp)<=0.25){
ansCalls = emp
}else{
ansCalls <- tmp$cluster
}
}
if(prior2[1]==priormeans[1] && prior2[2]==priormeans[3]){
mp <- abs(tmp$centers[1]-tmp$centers[2])
pp <- abs(priormeans[1]-priormeans[3])
if ((mp/pp) <=0.25){
ansCalls <- emp
}else{
ansCalls[tmp$cluster==1] <- 1
ansCalls[tmp$cluster==2] <- 3
}
}
}
rm(tmp)
return(ansCalls)
}
#######################################################################################################
assignCalls1Cluster <- function(intensities, priormeans){
closest <- rep(NA, 3)
for(j in 1:3){
distance <- as.vector(abs(priormeans[j]-intensities))
closest[j]=intensities[which.min(distance)]
}
clusterindex <- which.min(abs(closest-priormeans))
return(rep(clusterindex, length(intensities)))
}
#######################################################################################################
assignCallsOneSNP <- function(x, priormeans, numSample){
tolerance = 1e-3
k = x[numSample + 1]
values = x[1:numSample]
if (abs(k - 2) <= tolerance) {
tmp <- try(kmeans(values, priormeans, nstart=1),TRUE)
if (!(class(tmp)=="try-error")) {
k <- 3;
}
}
successful <- FALSE;
if (abs(k - 3) <= tolerance){
SNPCalls <- assignCalls3Cluster(values, priormeans)
successful <- !is.na(SNPCalls[1])
if (!successful) {
k <- 2
}
}
if ( (abs(k - 2) <= tolerance) && (!successful)){
SNPCalls <- assignCalls2Cluster(values, priormeans)
successful <- !is.na(SNPCalls[1])
if (!successful) {
k <- 1
}
}
if ( (abs(k - 1) <= tolerance) && (!successful)){
SNPCalls <- assignCalls1Cluster(values, priormeans)
}
return(SNPCalls)
}
assignCalls <- function(callsGt, M, a, priormeans, numSNP, numSample, verbose, blockSize=500000){
# process by block size of 500,000 by default
message("Start assign calls")
numBlock = ceiling(numSNP / blockSize)
for (i in 1:numBlock){
if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
thisnumSNP = nrow(subsetM)
subseta = a[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP)]
subseta = matrix(subseta, thisnumSNP, 1)
testValues = cbind(subsetM, subseta)
cl <- makeCluster(getNumberOfCores())
callAns <- parRapply(cl, testValues, assignCallsOneSNP, priormeans, numSample)
stopCluster(cl)
callAnsAllValues = unlist(callAns)
subsetcalls <- matrix(callAnsAllValues, thisnumSNP, numSample, byrow = TRUE)
callsGt[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetcalls[1:thisnumSNP, ]
rm(subsetM, subseta, subsetcalls)
}
message("Done assign calls")
}
#######################################################################################################
calculateParameters <- function(M, priormeans, numSNP, verbose) {
if (verbose) message("Start calculating 3-clusters parameters")
params3cluster <- calculateParameters3Cluster(M, priormeans, numSNP, verbose);
if (verbose) message("Done calculating 3-cluster parameters")
if (verbose) message("Start calculating 2-cluster parameters")
params2cluster <- calculateParameters2Cluster(M, priormeans, numSNP, verbose);
if (verbose) message("Done calculating 2-cluster parameters")
if (verbose) message("Start calculating 1-cluster parameters")
params1cluster <- calculateParameters1Cluster(M, priormeans, numSNP, verbose);
if (verbose) message("Done calculating 1-cluster parameters")
parameters <- cbind(as.matrix(params1cluster$elemh), as.matrix(params1cluster$eless), as.matrix(params2cluster$elemh), as.matrix(params2cluster$eless),
as.matrix(params3cluster$elemh), as.matrix(params3cluster$eless), as.matrix(params2cluster$elehw), as.matrix(params3cluster$elehw));
rownames(parameters) = rownames(M)
rm(params3cluster)
rm(params2cluster)
rm(params1cluster)
return(parameters)
}
hardyweinberg <- function(x, minN = 8){
if (length(x) < minN){
return(NA)
} else {
result = .Call("krlmmHardyweinberg", x)
return(result)
}
}
calculateOneParams3Cluster <- function(x, priors, priorDown, priorUp){
tmp = try(kmeans(x, priors, nstart=1), TRUE)
if(class(tmp) == "kmeans") {
flag = 1
} else {
tmp = try(kmeans(x, priorDown, nstart=1), TRUE)
if(class(tmp) == "kmeans") {
flag = 2
} else {
tmp = try(kmeans(x, priorUp, nstart=1), TRUE)
if (class(tmp) == "kmeans") {
flag = 3
} else {
tmp = kmeans(x, 3, nstart=35)
flag = 4
}
}
}
ss = sum(tmp$withinss)
hw = hardyweinberg(tmp$cluster)
centers = sort(tmp$centers, decreasing = TRUE)
return(c(centers, ss, hw, flag))
}
calculateParameters3Cluster <- function(M, priormeans, numSNP, verbose) {
Apriors = priormeans
ApriorDown = c(Apriors[1]/2, Apriors[2], Apriors[3]) # shift-down
ApriorUp = c(Apriors[1], Apriors[2], Apriors[3]/2) # shift-up
cl <- makeCluster(getNumberOfCores())
clusterEvalQ(cl, library(crlmm))
parAns <- parRapply(cl, M, calculateOneParams3Cluster, Apriors, ApriorDown, ApriorUp)
stopCluster(cl)
parAnsAllValues = unlist(parAns)
parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
centers <- parameters[, 1:3]
ss <- parameters[, 4]
hw <- parameters[, 5]
flag <- parameters[, 6]
rm(parAns)
rm(parameters)
sigma=solve(cov(centers, use="complete.obs"))
mh = calculateMahalDist3Cluster(centers, sigma, flag, Apriors, ApriorDown, ApriorUp, numSNP)
rm(sigma)
rm(centers)
gc()
return(list(eless = ss, elemh = mh, elehw = hw))
}
calculateMahalDist3Cluster <- function(centers, sigma, flag, priors, priorDown, priorUp, numSNP){
mahaldist = rep(NA, numSNP)
tolerance = 1e-3
for (i in 1:numSNP) {
if ((abs(flag[i] - 1) <= tolerance) || (abs(flag[i]- 4) <= tolerance)) difference = centers[i, ] - priors
if (abs(flag[i] - 2) <= tolerance) difference = centers[i, ] - priorDown
if (abs(flag[i] - 3) <= tolerance) difference = centers[i, ] - priorUp
mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)
}
return(mahaldist)
}
calculateOneParams2Cluster <- function(x, priors){
aa = rep(NA, 3)
for(j in 1:3){
dist = as.vector(abs(priors[j]-x))
aa[j]=x[which.min(dist)]
}
prior2 = priors[priors!=priors[which.max(abs(aa-priors))]]
tmp = try(kmeans(x, prior2, nstart=1), TRUE)
if (class(tmp)=="kmeans") {
centers = tmp$centers
hw = hardyweinberg(tmp$cluster)
}
rm(tmp)
tmp = kmeans(x, 2, nstart = 35)
if (tmp$centers[1] < tmp$centers[2]){
centers = c(tmp$centers[2], tmp$centers[1])
hw = hardyweinberg(3 - tmp$cluster)
} else {
centers = tmp$centers
hw = hardyweinberg(tmp$cluster)
}
ss = sum(tmp$withinss)
return(c(centers, prior2, ss, hw))
}
calculateParameters2Cluster <- function(M, priormeans, numSNP, verbose) {
Apriors = priormeans
cl <- makeCluster(getNumberOfCores())
clusterEvalQ(cl, library(crlmm))
parAns <- parRapply(cl, M, calculateOneParams2Cluster, Apriors)
stopCluster(cl)
parAnsAllValues = unlist(parAns)
parameters <- matrix(parAnsAllValues, numSNP, 6, byrow = TRUE)
centers <- parameters[, 1:2]
priors2cluster <- parameters[, 3:4]
ss <- parameters[, 5]
hw <- parameters[, 6]
rm(parAns)
rm(parameters)
sigma=solve(cov(centers, use="complete.obs"))
mh = calculateMahalDist2Cluster(centers, sigma, priors2cluster, numSNP)
rm(sigma)
rm(centers)
gc()
return(list(eless = ss, elemh = mh, elehw = hw))
}
calculateMahalDist2Cluster <- function(centers, sigma, priors, numSNP){
mahaldist = rep(NA, numSNP)
for (i in 1:numSNP) {
difference <- centers[i,] - priors[i,]
mahaldist[i] = as.vector(difference)%*%sigma%*%as.vector(difference)
}
return(mahaldist)
}
calculateOneParams1Cluster <- function(x, priors){
center = mean(x)
diff <- x - center
diffsquare <- diff^2
ss = sum(diffsquare)
closestPrior = priors[which.min(abs(priors - center))]
return(c(center, closestPrior, ss))
}
calculateParameters1Cluster <- function(M, priormeans, numSNP, verbose) {
Apriors = priormeans
cl <- makeCluster(getNumberOfCores())
clusterEvalQ(cl, library(crlmm))
parAns <- parRapply(cl, M, calculateOneParams1Cluster, Apriors)
stopCluster(cl)
parAnsAllValues = unlist(parAns)
parameters <- matrix(parAnsAllValues, numSNP, 3, byrow = TRUE)
centers <- matrix(parameters[, 1], numSNP, 1)
prior1cluster <- matrix(parameters[, 2], numSNP, 1)
ss <- parameters[, 3]
rm(parAns)
rm(parameters)
sigma=solve(cov(centers, use="complete.obs"))
mh = calculateMahalDist1Cluster(centers, sigma, prior1cluster, numSNP)
rm(sigma)
rm(centers)
gc()
return(list(eless = ss, elemh = mh))
}
calculateMahalDist1Cluster <- function(centers, sigma, priors, numSNP){
mahaldist = rep(NA, numSNP)
for(i in 1:numSNP) {
difference <- as.vector(centers[i, 1] - priors[i, 1])
mahaldist[i]=difference%*%sigma%*%difference
}
return(mahaldist)
}
#############################################
computeCallPr <- function(callsPr, M, calls, numSNP, numSample, verbose, blockSize = 500000){
# compute log-ratio in blocksize of 500,000 by default
if (verbose) message("Start computing confidence score")
numBlock = ceiling(numSNP / blockSize)
for (i in 1:numBlock){
if (verbose) message(" -- Processing segment ", i, " out of ", numBlock)
subsetM = as.matrix(M[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
subsetCalls = as.matrix(calls[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP), ])
subsetCallProb = .Call("krlmmConfidenceScore", subsetM, subsetCalls, PACKAGE="crlmm");
callsPr[((i-1) * (blockSize) + 1):min(i * blockSize, numSNP),] = subsetCallProb[1:nrow(subsetM), ]
rm(subsetM, subsetCalls, subsetCallProb)
}
if (verbose) message("Done computing confidence score")
}
#############################################
AddBackNoVarianceSNPs <- function(cnSet, calls, scores, numSNP, numSample, variantSNPIndex, noVariantSNPIndex){
callsGt = calls(cnSet)
callsPr = snpCallProbability(cnSet)
open(callsGt)
open(callsPr)
callsGt[variantSNPIndex, ] = calls[,]
callsPr[variantSNPIndex, ] = scores[,]
callsGt[noVariantSNPIndex, ] = NA
callsPr[noVariantSNPIndex, ] = 0
close(callsGt)
close(callsPr)
}
#############################################
krlmmImputeGender <- function(cnSet, annotation, YIndex, verbose){
if (verbose) message("Start imputing gender")
S = computeAverageLogIntensity(cnSet, verbose)
# S is calculated and saved in original SNP order.
matchy = match(annotation[YIndex, 2], rownames(S))
matchy = matchy[!is.na(matchy)]
if (length(matchy) <= 10){
predictedGender = rep(NA, ncol(A))
}
Sy = S[matchy,]
uniqueDataPoint = apply(Sy, 1, function(x){length(unique(x))})
validYSNPs = uniqueDataPoint >= 2
SyValid = Sy[validYSNPs, ]
if (nrow(SyValid) < 20){
message("Not enough ChrY SNPs, skipping gender prediction step");
predictedGender = rep(NA, ncol(Sy))
}
rm(S)
numYChrSNP = nrow(SyValid)
allS = matrix(NA, numYChrSNP, 2)
for (i in 1:numYChrSNP) {
tmp = kmeans(SyValid[i,] ,2, nstart=45)
allS[i,] = sort(tmp$centers, decreasing=F)
}
priorS = apply(allS, 2, FUN="median", na.rm=TRUE)
if (abs(priorS[1] - priorS[2]) <= 1.6) {
message("Skipping gender prediction step");
predictedGender = rep(NA, ncol(Sy))
}
meanmatrix = apply(Sy, 2, median)
Sy1 = abs(meanmatrix - priorS[1])
Sy2 = abs(meanmatrix - priorS[2])
# output male - 1, female - 2, female S-values are smaller than male S-values.
test = cbind(Sy2, Sy1)
predictedGender = apply(test, 1, which.min)
open(cnSet$gender)
cnSet$gender[,] = predictedGender
close(cnSet$gender)
if (verbose) message("Done imputing gender")
return(predictedGender)
}
#############################################
verifyChrYSNPs <- function(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose){
if (verbose) message("Start verifying SNPs on Chromosome Y")
callsGt = calls(cnSet)
callsPr = snpCallProbability(cnSet)
open(callsGt)
open(callsPr)
matchy = match(annotation[YIndex, 2], rownames(M))
matchy = matchy[!is.na(matchy)]
MChrY = M[matchy,]
callsChrY = calls[matchy,]
male = gender == 1
female = gender == 2
checkK = apply(callsChrY[, male], 1, function(x){ length(unique(x[!is.na(x)])) } )
for(i in 1:nrow(MChrY)){
# Chromosome Y SNPs, no calls for female, k = 1 or 2 permitted for male samples
thisChrYSNPorigPosition = match(rownames(callsChrY)[i], rownames(callsGt))
callsGt[thisChrYSNPorigPosition, female] = NA
callsPr[thisChrYSNPorigPosition, female] = 0
# early exit for k = 1 or 2 cases. Only re-assign calls to male samples if we previouly assigned
# male samples to 3 clusters by mistake.
if (checkK[i] < 3) next;
if (class(try(kmeans(MChrY[i, male], c(priormeans[1], priormeans[3]), nstart=1), TRUE)) != "try-error"){
maleSampleCalls = kmeans(MChrY[i, male],c(priormeans[1], priormeans[3]), nstart=45)$cluster
callsGt[thisChrYSNPorigPosition, male][maleSampleCalls == 1] = 1
callsGt[thisChrYSNPorigPosition, male][maleSampleCalls == 2] = 3
} else {
distanceToPrior1 = mean(abs(MChrY[i, male] - priormeans[1]))
distanceToPrior3 = mean(abs(MChrY[i, male] - priormeans[3]))
callsGt[thisChrYSNPorigPosition, male] = ifelse(distanceToPrior1 < distanceToPrior3, 1, 3)
}
MMaleSamples = MChrY[i, male]
callsMaleSample = callsGt[thisChrYSNPorigPosition, male]
scoresMaleSample = .Call("krlmmConfidenceScore", t(as.matrix(MMaleSamples)), t(as.matrix(callsMaleSample)), PACKAGE="crlmm");
callsPr[thisChrYSNPorigPosition, male] = scoresMaleSample
}
close(callsGt)
close(callsPr)
if (verbose) message("Done verifying SNPs on Chromosome Y")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.