.rectangular <- function(x, y, verbose=TRUE){
if( verbose )
message("Running `rectangular` IBS algorithm!")
if( nrow(x) != nrow(y) )
stop("Dimension mismatch!")
if( is.null(colnames(x)) | is.null(colnames(y)) )
stop("Colnames should be provided!")
na.rm <- sum(is.na(x)) > 0 | sum(is.na(y)) > 0
##calculates mean and variance of IBS between all pairs of x and y
N <- ncol(x)
M <- ncol(y)
indices <- seq_len(N)
mn <- s2 <- numeric(length=N*M)
for(j in seq_len(M)) {
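##IBS score: 2 = identical genotypes, 1 = one shared allele, 0 = opposite homozygotes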
ibs <- 2 - abs(x - y[,j])
mn[indices + N*(j-1)] <- colMeans(ibs, na.rm=na.rm)
s2[indices + N*(j-1)] <- colVars(as.matrix(ibs), na.rm=na.rm)
if( verbose & (j %% 100 == 0 | j == 1) & (N > 10 | M > 10))
message(j*N, " of ", N*M, " (", round(100*j/M, 2), "%) ...")
}
data.frame(mean=mn, var=s2,
colnames.x=rep(colnames(x), M),
colnames.y=rep(colnames(y), each=N))
}
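## A minimal sketch of `.rectangular` on made-up toy data (genotypes are
## coded 1, 2 and 3, so IBS = 2 - |x - y| counts 2, 1 or 0 shared
## alleles); guarded so it is not run when this file is sourced:
if (FALSE) {
    set.seed(1)
    x <- matrix(sample(1:3, 50, replace=TRUE), nrow=10,
                dimnames=list(NULL, paste0("x", 1:5)))
    y <- matrix(sample(1:3, 30, replace=TRUE), nrow=10,
                dimnames=list(NULL, paste0("y", 1:3)))
    .rectangular(x, y) ##5 x 3 = 15 pairs, one row per pair
}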
.square <- function(x, y, verbose=TRUE){
if( verbose )
message("Running `square` IBS algorithm!")
if( (ncol(x) != ncol(y)) | (nrow(x) != nrow(y)) )
stop("Dimension mismatch!")
if( is.null(colnames(x)) | is.null(colnames(y)) )
stop("Colnames should be provided!")
na.rm <- sum(is.na(x)) > 0 | sum(is.na(y)) > 0
##calculates mean and variance of IBS between all unique pairs of x (and y)
N <- ncol(x)
k <- 1
mn <- s2 <- numeric(length=N*(N+1)/2)
for(j in seq_len(N)) {
ibs <- 2 - abs(x[, j:N, drop=FALSE] - y[,j])
mn[k + 0:(N - j)] <- colMeans(ibs, na.rm=na.rm)
s2[k + 0:(N - j)] <- colVars(as.matrix(ibs), na.rm=na.rm)
k <- k + 1 + N - j
if( verbose & (j %% 100 == 0 | j == 1) & N > 10)
message(k - 1, " of ", N*(N+1)/2, " (",
round(100*(k - 1)/(N*(N+1)/2), 2), "%) ...")
}
data.frame(mean=mn, var=s2,
colnames.x=unlist(sapply(1:N, function(k) colnames(x)[k:N])),
colnames.y=rep(colnames(y), N:1))
}
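## `.square` covers only the N*(N+1)/2 unique pairs of a matrix with
## itself, exploiting the symmetry x12 = x21. A made-up example, not run:
if (FALSE) {
    set.seed(1)
    x <- matrix(sample(1:3, 40, replace=TRUE), nrow=10,
                dimnames=list(NULL, paste0("s", 1:4)))
    .square(x, x) ##4*5/2 = 10 pairs, self pairs included
}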
.strandAlignment <- function(x, y, rHash, verbose = FALSE) {
##relabel the genotypes in x according to those in y
##relabelling is based on the idea that SNPs in x should be
##positively correlated with those in y, robust against NAs
##and outliers
##extract the sample pairs labelled `identical` from the relation hash
identical <- names(grep("identical", as.list(rHash), value=TRUE))
colnames.x <- gsub(":.*$", "", identical)
colnames.y <- gsub("^.*:", "", identical)
colnames.x <- colnames.x[!grepl("^NA$", colnames.y)]
colnames.y <- colnames.y[!grepl("^NA$", colnames.y)]
keep <- colnames.x %in% colnames(x) & colnames.y %in% colnames(y)
if( sum(keep) == 0 )
stop("No overlapping samples!")
colnames.x <- colnames.x[keep]
colnames.y <- colnames.y[keep]
midx <- match(colnames.x, colnames(x))
midy <- match(colnames.y, colnames(y))
##correlation based on 'Strand Alignment' not always correct
##signs <- sign(unlist(lapply(seq_len(nrow(x)), function(i)
## cov(x[i,midx], y[i,midy], use="complete.obs", method="spearman"))))
swaps <- unlist(lapply(seq_len(nrow(x)), function(i) {
sum(x[i,midx] == 1 & y[i,midy] == 3, na.rm=TRUE) +
sum(x[i,midx] == 3 & y[i,midy] == 1, na.rm=TRUE) >
sum(x[i,midx] == 1 & y[i,midy] == 1, na.rm=TRUE) +
sum(x[i,midx] == 3 & y[i,midy] == 3, na.rm=TRUE)
}))
if(verbose)
message("Swapping ", sum(swaps), " alleles!")
for(i in seq_len(nrow(x))) {
if( swaps[i]) {
xi <- x[i,]
x[i, xi==1] <- 3
x[i, xi==3] <- 1
}
}
x
}
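## The swap rule illustrated on a single made-up SNP: when opposite
## homozygotes (1 vs 3) outnumber matching homozygotes, the SNP in x is
## assumed to be reported on the other strand and its coding is flipped:
if (FALSE) {
    xi <- c(1, 1, 2, 3, 1)
    yi <- c(3, 3, 2, 1, 3)
    swap <- sum(xi == 1 & yi == 3) + sum(xi == 3 & yi == 1) >
        sum(xi == 1 & yi == 1) + sum(xi == 3 & yi == 3)
    if (swap) xi <- c(3, 2, 1)[xi] ##recode 1 <-> 3, leave 2 untouched
}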
.hashRelations <- function(relations, idx.col="idx", idy.col="idy",
rel.col="relation_type"){
hash <- new.env()
keys <- paste(relations[, idx.col], relations[, idy.col], sep=":")
values <- as.character(relations[, rel.col])
##check for inconsistent relations
if( any(tapply(values, keys, function(x) length(unique(x)) > 1)) )
stop("Inconsistent relations!")
##redundant relations are automatically removed!
tmp <- mapply(assign, keys, values, MoreArgs=list(envir=hash))
hash
}
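## Relations are stored in an environment used as a hash table, giving
## constant-time lookup of "idx:idy" keys. A made-up example, not run:
if (FALSE) {
    relations <- data.frame(idx=c("a", "b"), idy=c("a", "b"),
                            relation_type="identical",
                            stringsAsFactors=FALSE)
    rHash <- .hashRelations(relations)
    get("a:a", envir=rHash) ##"identical"
}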
.constructRelations <- function(xnames, ynames, idx.col="idx",
idy.col="idy", rel.col="relation_type") {
if( !is.null(ynames) ) {
if( length(intersect(xnames, ynames)) == 0 )
stop("There must be at least some overlapping samples!")
relations <- expand.grid(idx=xnames, idy=ynames, stringsAsFactors=FALSE)
} else
relations <- expand.grid(idx=xnames, idy=xnames, stringsAsFactors=FALSE)
relations[,rel.col] <- "unrelated"
identical <- relations[,idx.col] == relations[,idy.col]
relations[identical, rel.col] <- "identical"
relations <- relations[relations[,rel.col] != "unrelated",]
rownames(relations) <- seq_len(nrow(relations))
relations
}
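## Without a user-supplied relations data.frame only self pairs
## (matching column names) are labelled `identical`; all other pairs are
## implicitly unrelated and dropped. A made-up example, not run:
if (FALSE) {
    .constructRelations(xnames=c("a", "b"), ynames=c("a", "b", "c"))
    ##  idx idy relation_type
    ##1   a   a     identical
    ##2   b   b     identical
}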
.hardyweinberg <- function(x, alpha=0){
##fix single SNP case: coerce a genotype vector to a one-row matrix
if(!is.matrix(x))
x <- matrix(x, nrow=1)
obs <- rowTabulates(matrix(as.integer(x), nrow=nrow(x)), values=1:3)
n <- ncol(x)
p <- (2*obs[,1] + obs[,2])/(2*n)
q <- 1 - p
exp <- n*cbind(p^2, 2*p*q, q^2)
stat <- rowSums(((obs - exp)^2)/exp)
##pval <- pchisq(stat, df = 1, lower.tail = FALSE)
##true in Hardy-Weinberg equilibrium
inEquilibrium <- stat < qchisq(1 - alpha/nrow(x), df = 1)
inEquilibrium[is.na(inEquilibrium)] <- FALSE
inEquilibrium
}
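## The test compares observed genotype counts (1, 2, 3) with the counts
## expected under Hardy-Weinberg equilibrium, n*(p^2, 2pq, q^2), using a
## 1-df chi-square statistic. A made-up worked example, not run:
if (FALSE) {
    snp <- rep(c(1, 2, 3), times=c(25, 50, 25)) ##perfect HWE, p = 0.5
    obs <- table(snp)
    p <- (2*obs[1] + obs[2])/(2*length(snp))
    exp <- length(snp)*c(p^2, 2*p*(1-p), (1-p)^2)
    sum((obs - exp)^2/exp) ##0: fully consistent with equilibrium
}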
.pruning <- function(x, callRate=0.95, coverageRate=2/3, alpha = 0, maf = 0, verbose=TRUE) {
nsnps <- nrow(x)
nsamples <- ncol(x)
##drop SNPs difficult to call
if ( verbose )
message("Pruning ", nrow(x), " SNPs ...")
x[x == 0] <- NA
calledSNPs <- apply(x, 1, function(x) sum(!is.na(x))/nsamples)
if( verbose )
message(sum(calledSNPs < callRate), " SNPs removed because of low call rate!")
x <- x[calledSNPs >= callRate,, drop=FALSE]
##samples whose fraction of called SNPs (relative to the original
##number of SNPs) is below coverageRate are dropped before IBS is calculated
coverage <- apply(x, 2, function(x) sum(!is.na(x))/nsnps)
if( verbose )
message(sum(coverage < coverageRate), " samples removed because too few SNPs called!")
x <- x[, coverage >= coverageRate, drop=FALSE] ##keep these
##exclude SNPs that violate the Hardy-Weinberg equilibrium
if (alpha > 0 & alpha <= 1) {
inEquilibrium <- .hardyweinberg(x, alpha = alpha)
if( verbose )
message(sum(!inEquilibrium), " SNPs removed because they violate Hardy-Weinberg equilibrium!")
x <- x[inEquilibrium,, drop=FALSE] ##keep these
}
if (maf > 0 & maf <= 1) {
## Remove low-frequency SNPs; the minor allele frequency is
## approximated here by the frequency of the least common genotype class
mafs <- apply(x, 1, function(x) {
freq <- min(table(x)/length(x))
ifelse(freq < 0.5, freq, 1-freq)
})
if( verbose )
message(sum(mafs < maf), " SNPs removed because they have minor allele frequency <", maf, "!")
x <- x[mafs >= maf,, drop=FALSE] ##keep these
}
x
}
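## A made-up pruning example (not run): genotype 0 marks a failed call
## and is set to NA, after which poorly called SNPs and samples are
## dropped.
if (FALSE) {
    set.seed(1)
    x <- matrix(sample(0:3, 200, replace=TRUE, prob=c(0.1, 0.3, 0.3, 0.3)),
                nrow=20,
                dimnames=list(paste0("snp", 1:20), paste0("s", 1:10)))
    dim(.pruning(x, callRate=0.9, coverageRate=0.5))
}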
##' Run the allele sharing algorithm based on IBS
##'
##' Calculates the mean and variance of identity by state (IBS) between
##' the genotypes, coded as 1, 2 and 3, of all sample pairs, given
##' either one omic-inferred set of SNPs or two sets from different
##' omic types.
##'
##' 'Strand Alignment' is required if methylation-inferred genotypes are
##' compared with DNA-based genotypes, i.e., the DNA-based genotype may
##' be coded 3 where the methylation-inferred one is coded 1. The strand
##' alignment step will fix this.
##'
##' Notice that there are two algorithms for calculating allele
##' sharing: one for the case where a single matrix is provided and one
##' for the case where two, x and y, are provided. If one is provided,
##' the algorithm takes into account the symmetric relations between
##' pairs, i.e. x12 = x21, etc.
##'
##' To speed up the lookup of relations, of which there can be millions
##' when, say, 1000 samples are provided, a hash table is created from
##' the provided data.frame with relations.
##'
##' @title allele sharing based on IBS
##' @param relations 'data.frame' with relations and their mapping
##' identifiers; provide the column names if they differ from the
##' defaults, and beware that the identifiers should match the
##' colnames of x and y
##' @param idx.col column name containing mapping identifiers
##' @param idy.col column name containing mapping identifiers
##' @param rel.col column name containing the relations,
##' i.e. identical, parentoffspring, etc.
##' @param callRate default 0.95; SNPs called in less than this
##' fraction of the samples are dropped
##' @param coverageRate default 2/3; samples with fewer than this
##' fraction of SNPs called are dropped
##' @param alpha significance level for the Hardy-Weinberg test;
##' the default alpha = 0 means no filtering; internally a Bonferroni
##' multiple-testing correction is applied
##' @param maf minor allele frequency threshold; variants with lower
##' frequency (default 0, no filtering) will be dropped
##' @param alignment default FALSE; if TRUE perform the strand
##' alignment step
##' @param assayNameX the name of the assay to be used for x (see x, y)
##' @param assayNameY same as assayNameX, but for y; if y is not
##' specified, the assay will be retrieved from x
##' @param verbose show progress default TRUE
##' @param x,y genotype matrix with row and column names; if this is a
##' SummarizedExperiment or a MultiAssayExperiment assayName must
##' also be specified
##' @return data.frame with the mean and variance of IBS between all pairs
##' @author mvaniterson
##' @importFrom matrixStats colVars rowTabulates
##' @importFrom stats cov qchisq
##' @importFrom SummarizedExperiment assays
##' @importFrom MultiAssayExperiment assays
##' @importFrom RaggedExperiment compactAssay
##' @importFrom methods extends
##' @export
##' @examples
##' set.seed(12345)
##' beta <- matrix(runif(100*10, 0,1), nrow=100)
##' beta[1:5, 1:5]
##' colnames(beta) <- paste0("sample", 1:10)
##' genotype <- beta2genotype(beta)
##' genotype[1:5, 1:5]
##' data <- alleleSharing(genotype)
##' head(data)
alleleSharing <- function(x, y=NULL, relations=NULL, idx.col="idx",
idy.col="idy", rel.col="relation_type",
callRate=0.95, coverageRate=2/3,
alpha = 0, maf = 0, alignment=FALSE,
assayNameX=NULL, assayNameY=NULL, verbose=TRUE) {
if(!is.null(y)) {
if(extends(class(y), "SummarizedExperiment") |
extends(class(y), "MultiAssayExperiment")) {
if(is.null(assayNameY))
stop("Assay name should be given!")
y <- assays(y)[[assayNameY]]
} else if (extends(class(y), "RaggedExperiment"))
y <- compactAssay(y)
} else if (!is.null(assayNameY))
y <- assays(x)[[assayNameY]]
if(extends(class(x), "SummarizedExperiment") |
extends(class(x), "MultiAssayExperiment")) {
if(is.null(assayNameX))
stop("Assay name should be given!")
x <- assays(x)[[assayNameX]]
} else if (extends(class(x), "RaggedExperiment"))
x <- compactAssay(x)
if(is.null(colnames(x)))
stop("Colnames should be given!")
if (!is.null(y) & is.null(colnames(y)))
stop("Colnames should be given!")
if( is.null(relations) ) {
relations <- .constructRelations(xnames=colnames(x),
ynames=colnames(y), idx.col=idx.col, idy.col=idy.col,
rel.col=rel.col)
relations <- relations[!duplicated(relations),]
}
if( verbose )
message("Hash relations")
rHash <- .hashRelations(relations, idx.col=idx.col, idy.col=idy.col,
rel.col=rel.col)
if( is.null(y) ) {
x <- .pruning(x, callRate=callRate, coverageRate=coverageRate,
alpha = alpha, maf = maf, verbose=verbose)
if( verbose )
message("Using ", nrow(x),
" polymorphic SNPs to determine allele sharing.")
data <- .square(x, x, verbose)
} else {
x <- .pruning(x, callRate=callRate, coverageRate=coverageRate,
alpha = alpha, maf = maf, verbose=verbose)
y <- .pruning(y, callRate=callRate, coverageRate=coverageRate,
alpha = alpha, maf = maf, verbose=verbose)
rows <- intersect(rownames(x), rownames(y))
rId <- match(rows, rownames(x))
x <- x[rId,, drop=FALSE]
rId <- match(rows, rownames(y))
y <- y[rId,, drop=FALSE]
if( alignment )
x <- .strandAlignment(x, y, rHash, verbose)
if( verbose )
message("Using ", nrow(x),
" polymophic SNPs to determine allele sharing.")
data <- .rectangular(x, y, verbose)
if( !(any(colnames(x) %in% data$colnames.x) &
any(colnames(y) %in% data$colnames.y)) )
stop("rHash and x or y do not match: probably swap 'x' and 'y'!")
}
pairs <- paste(data$colnames.x, data$colnames.y, sep=":")
mId <- pairs %in% ls(rHash)
data$relation <- "unrelated"
data$relation[mId] <- unlist(mget(pairs[mId], rHash, mode="character",
ifnotfound=list("unrelated")), use.names=FALSE)
data$relation <- factor(data$relation)
invisible(data)
}
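## A two-omic usage sketch (made up, not run): here the same genotype
## matrix stands in for both omic types; in practice x could hold
## methylation-inferred genotypes and y DNA-based ones, with
## alignment=TRUE fixing strand swaps between the two.
if (FALSE) {
    set.seed(12345)
    beta <- matrix(runif(100*10, 0, 1), nrow=100,
                   dimnames=list(paste0("cg", 1:100), paste0("sample", 1:10)))
    genotype <- beta2genotype(beta)
    data <- alleleSharing(x=genotype, y=genotype, alignment=TRUE)
    head(data)
}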
##' predict mismatches
##'
##' Based on all the data a classifier is built using Linear
##' Discriminant Analysis, and a prediction is performed on the same
##' data in order to detect wrong sample relationships. The assumption
##' is that the majority of sample relations is correct; otherwise this
##' approach would not work!
##' @title predict mismatches
##' @param data output from alleleSharing
##' @param n number of grid points (default 100) used to interpolate
##' and draw the classification boundaries
##' @param plot.it if TRUE (default), plot the classification graph and
##' return the mismatches; otherwise return all pairs
##' @param verbose default FALSE, if TRUE show confusion matrix
##' @param ... optional plotting argument passed to plot
##' @return predicted mismatches
##' @author mvaniterson
##' @importFrom graphics contour legend plot points
##' @importFrom MASS lda
##' @importFrom stats predict
##' @export
##' @examples
##' set.seed(12345)
##' beta <- matrix(runif(100*10, 0,1), nrow=100)
##' beta[1:5, 1:5]
##' colnames(beta) <- paste0("sample", 1:10)
##' genotype <- beta2genotype(beta)
##' genotype[1:5, 1:5]
##' data <- alleleSharing(genotype)
##' head(data)
##' inferRelations(data)
inferRelations <- function(data, n=100, plot.it=TRUE, verbose=FALSE, ...) {
data <- droplevels(data)
model <- lda(relation~mean+var, data=data)
predicted <- predict(model, data)
data$predicted <- predicted$class
if(verbose) {
print(table(`Predicted relation`=data$predicted,
`Assumed relation`=data$relation), zero.print=".")
}
if( plot.it ) {
id <- which(data$predicted != data$relation)
plot(data[, c("mean", "var")], pch=".", cex=3,
col=as.integer(data$relation), xlab="mean (IBS)",
ylab="variance (IBS)", ...)
xp <- seq(min(data$mean), max(data$mean), length=n)
yp <- seq(min(data$var), max(data$var), length=n)
grid <- expand.grid(mean=xp, var=yp)
predicted <- predict(model, grid)
posterior <- predicted$posterior
if( ncol(posterior) > 2 ) {
for(k in seq_len(ncol(posterior))) {
zp <- posterior[, k] - apply(posterior[,-k], 1, max)
contour(xp, yp, matrix(zp, n), add=TRUE, levels=0,
drawlabels=FALSE, lty=1, lwd=2, col="grey")
}
} else {
zp <- posterior[, 2] - posterior[, 1]
contour(xp, yp, matrix(zp, n), add=TRUE, levels=0,
drawlabels=FALSE, lty=1, lwd=2, col="grey")
}
points(data[id, c("mean", "var")], pch=".", cex=3,
col=as.integer(data$relation[id]))
legend("topright", paste("assumed", levels(data$relation)),
col=1:nlevels(data$relation), pch=15, bty="n")
return(invisible(data[id,]))
} else
return(invisible(data))
}
##' convert DNA methylation beta-values to inferred genotypes
##'
##' Uses kmeans unsupervised clustering to infer genotypes, based on
##' ideas from Leonard Schalkwyk's wateRmelon package.
##'
##' 'minSep' and 'minSize' ensure that well-separated clusters are found.
##' This function is similar to the gaphunter approach implemented in minfi.
##' @title converts beta-values to genotypes (1, 2 and 3)
##' @param betas beta matrix of probes possibly affected by SNPs; if this
##' is a SummarizedExperiment or a MultiAssayExperiment, assayName must
##' also be specified
##' @param na.rm if TRUE (default), drop CpGs for which no clustering
##' was observed
##' @param minSep minimal separation between clusters
##' @param minSize minimal size of the smallest cluster (as a percentage
##' of the number of samples)
##' @param centers centers of the clusters; defaults to 0.2, 0.5 and 0.8
##' @param assayName the name of the assay to be used (see betas)
##' @return matrix with genotypes
##' @author mvaniterson
##' @importFrom stats kmeans
##' @importFrom SummarizedExperiment assays
##' @importFrom MultiAssayExperiment assays
##' @importFrom methods extends
##' @export
##' @examples
##' set.seed(12345)
##' beta <- matrix(runif(100*10, 0,1), nrow=100)
##' beta[1:5, 1:5]
##' genotype <- beta2genotype(beta)
##' genotype[1:5, 1:5]
beta2genotype <- function (betas, na.rm=TRUE, minSep=0.25, minSize=5,
centers=c(0.2, 0.5, 0.8), assayName=NULL) {
if(extends(class(betas), "SummarizedExperiment") |
extends(class(betas), "MultiAssayExperiment")) {
if(is.null(assayName))
stop("Assay name should be given!")
betas <- assays(betas)[[assayName]]
}
genotypes <- apply(betas, 1, function(x) {
km <- try(kmeans(as.numeric(x), centers), silent=TRUE)
if( !inherits(km, "try-error") ) {
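##require all pairwise distances between the three cluster centers to
##exceed minSep (indices 1, 5 and 9 drop the zero diagonal)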
if( all(abs(rep(km$centers, 3) -
rep(km$centers, each=3))[-c(1, 5, 9)] > minSep) ) {
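##require the smallest cluster to hold more than minSize percent of the samples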
if( 100 * min(as.numeric(table(km$cluster)))/length(x) >
minSize )
return(km$cluster)
}
}
return(rep(NA, length(x)))
})
genotypes <- t(genotypes)
colnames(genotypes) <- colnames(betas)
rownames(genotypes) <- rownames(betas)
if( na.rm ) {
nas <- apply(genotypes, 1, anyNA)
genotypes <- genotypes[!nas, , drop=FALSE]
}
genotypes
}