Nothing
.structure <- function(pval_mat) {
if(is.matrix(pval_mat)) {
if(nrow(pval_mat) > 1) {
id <- apply(pval_mat[, c(2, 4)], 1, which.min)
return(cbind(pval_mat[, c(1, 3)][cbind(seq_along(id), id)],
pval_mat[, c(2, 4)][cbind(seq_along(id), id)]))
}
}
pval_mat <- as.vector(pval_mat)
id <- which.min(pval_mat[c(2, 4)])
return(cbind(pval_mat[c(1, 3)][id],
pval_mat[c(2, 4)][id]))
}
CheckSameLength <- function(x) {
if(length(x) == 1) {
return(TRUE)
}
return(var(unlist(sapply(x, length))) == 0)
}
myStrSplit <- function(x, split) {
ret <- list(seq_along(x))
for(i in seq_along(x)) {
ret[[i]] <- x[i]
for(sp in split) {
ret[[i]] <- unlist(strsplit(ret[[i]], sp))
ret[[i]] <- ret[[i]][nchar(ret[[i]]) > 0]
if(length(ret[[i]]) == 0)
break
}
}
return(ret)
}
checkSNPids<-function(ids) {
return(all(any(!is(ids, "character"), length(ids)==0), is.null(ids)==FALSE))
}
checkMotifs<-function(m) {
return(all(any(!is(m, "character"), length(m)==0), is.null(m)==FALSE))
}
motif_score_par<-function(i, par.k, par.ncores, par.motifs, par.nmotifs, par.snpids, par.snpbases, par.len_seq, par.motif.lib, par.snp.info) {
if(i < par.ncores) {
ids <- seq(par.k) + par.k * (i - 1)
} else {
ids <- (par.k * (par.ncores - 1) + 1):length(par.snp.info$ref_base)
}
this.snp.info <- list(sequence_matrix = t(t(par.snp.info$sequence_matrix[, ids])),
ref_base = par.snp.info$ref_base[ids], snp_base = par.snp.info$snp_base[ids])
motif.scores <- .Call("motif_score", par.motif.lib, this.snp.info, package = "atSNP")
for(j in seq_along(motif.scores)) {
rownames(motif.scores[[j]]) <- par.snpids[ids]
colnames(motif.scores[[j]]) <- par.motifs
}
nids<-length(ids)
motif_tbl <- data.table(motif = par.motifs, motif_len = sapply(par.motif.lib, nrow))
motif_len.m<-matrix(motif_tbl$motif_len, nids, par.nmotifs, byrow=TRUE)
rownames(motif_len.m)<-par.snpids[ids]
strand_ref <- (motif.scores$match_ref_base > 0)
ref_start <- motif.scores$match_ref_base
ref_start[!strand_ref] <- par.len_seq + motif.scores$match_ref_base[!strand_ref] - motif_len.m[!strand_ref]+2
ref_end <- ref_start + motif_len.m - 1
strand_snp <- (motif.scores$match_snp_base > 0)
snp_start <- motif.scores$match_snp_base
snp_start[!strand_snp] <- par.len_seq + motif.scores$match_snp_base[!strand_snp] - motif_len.m[!strand_snp]+2
snp_end <- snp_start + motif_len.m - 1
motif_score_tbl_dt <- data.table(snpid = rep(par.snpids[ids], par.nmotifs),
motif = rep(par.motifs, each = length(ids)),
log_lik_ref = c(motif.scores$log_lik_ref),
log_lik_snp = c(motif.scores$log_lik_snp),
log_lik_ratio = c(motif.scores$log_lik_ratio),
log_enhance_odds = c(motif.scores$log_enhance_odds),
log_reduce_odds = c(motif.scores$log_reduce_odds),
ref_start = c(ref_start),
snp_start = c(snp_start),
ref_end=c(ref_end),
snp_end=c(snp_end),
ref_strand = c("-", "+")[1 + as.integer(strand_ref)],
snp_strand = c("-", "+")[1 + as.integer(strand_snp)],
snpbase= rep(par.snpbases[ids], par.nmotifs)
)
setkey(motif_score_tbl_dt, motif, snpbase)
setkey(motif_tbl, motif)
motif_score_tbl <- as.data.frame(motif_tbl[motif_score_tbl_dt], stringAsFactors = FALSE)
return(motif_score_tbl)
}
match_subseq_par<-function(i, par.k, par.ncores, par.snp.tbl, par.snpids, par.motif.scores, par.motif, par.motif.tbl) {
if(i < par.ncores) {
ids <- seq(par.k) + par.k * (i - 1)
} else {
ids <- (par.k * (par.ncores - 1) + 1):length(par.snpids)
}
motif.scores_i <- par.motif.scores[snpid %in% par.snpids[ids], ]
setkey(motif.scores_i, motif)
motif.scores_i <- par.motif.tbl[motif.scores_i]
setkey(motif.scores_i, snpid, snpbase)
motif.scores_i <- par.snp.tbl[motif.scores_i]
motif.scores_i[ref_strand == "+", ref_match_seq := substr(ref_seq, ref_start, ref_end)]
motif.scores_i[ref_strand == "-", ref_match_seq := substr(ref_seq_rev, len_seq - ref_end + 1, len_seq - ref_start + 1)]
motif.scores_i[snp_strand == "+", snp_match_seq := substr(snp_seq, snp_start, snp_end)]
motif.scores_i[snp_strand == "-", snp_match_seq := substr(snp_seq_rev, len_seq - snp_end + 1, len_seq - snp_start + 1)]
motif.scores_i[ref_strand == "+", snp_seq_ref_match := substr(snp_seq, ref_start, ref_end)]
motif.scores_i[ref_strand == "-", snp_seq_ref_match := substr(snp_seq_rev, len_seq - ref_end + 1, len_seq - ref_start + 1)]
motif.scores_i[snp_strand == "+", ref_seq_snp_match := substr(ref_seq, snp_start, snp_end)]
motif.scores_i[snp_strand == "-", ref_seq_snp_match := substr(ref_seq_rev, len_seq - snp_end + 1, len_seq - snp_start + 1)]
return(motif.scores_i[, list(snpid, motif, ref_seq, snp_seq, motif_len, ref_start, ref_end, ref_strand, snp_start,
snp_end, snp_strand,log_lik_ref, log_lik_snp, log_lik_ratio, log_enhance_odds, log_reduce_odds,
IUPAC, ref_match_seq, snp_match_seq, ref_seq_snp_match, snp_seq_ref_match, snpbase)])
}
results_motif_par<-function(motif.id, par.prior, par.transition, par.motif.lib, par.motif.scores, par.testing.mc, par.figdir) {
rowids <- which(par.motif.scores$motif == names(par.motif.lib)[motif.id])
scores <- cbind(par.motif.scores$log_lik_ref[rowids],
par.motif.scores$log_lik_snp[rowids])
pwm <- par.motif.lib[[motif.id]]
pwm[pwm < 1e-10] <- 1e-10
wei.mat <- pwm
for(i in seq(nrow(wei.mat))) {
for(j in seq(ncol(wei.mat))) {
wei.mat[i, j] <- exp(mean(log(pwm[i, j] / pwm[i, -j])))
}
}
if(nrow(scores) > 5000) {
p <- 5 / nrow(scores)
} else {
p <- 1 / max(2, nrow(scores))
}
# Use percentiles of the score distribution to construct groups
m <- 20
b <- (1 / p) ^ ( 1 / sum(seq(m)))
allp <- rep(1, m + 1)
step <- b
for(k in rev(seq(m))) {
allp[k] <- allp[k + 1] / step
step <- step * b
}
allp <- allp[-(m + 1)]
score.p <- quantile(c(scores), 1 - allp)
dedup.idx <- c(1, 1 + which(diff(score.p) != 0))
score.p <- score.p[dedup.idx]
allp <- allp[dedup.idx]
# When there are few distinct score values, directly use those values
# instead of percentiles
if(length(score.p) > length(unique(c(scores)))) {
score.p <- rev(unique(sort(c(scores))))
allp <- seq_along(score.p) / length(c(scores))
}
stopifnot(length(allp) == length(score.p))
pval_a <- pval_cond <- matrix(1, nrow = nrow(scores), ncol = 4)
for(l in seq_along(allp)) {
if(l == 1) {
score.upp = max(scores) + 1
} else if(l <= length(allp)) {
score.upp = score.p[l - 1]
} else {
score.upp = quantile(c(scores), 0.2)
}
if(l >= length(allp)) {
score.low = min(scores) - 1
} else {
score.low = score.p[l + 1]
}
compute.id <- which(scores < score.upp & scores >= score.low)
if(length(compute.id) == 0) {
next
}
if(l < length(allp) + 1) {
theta <- .Call("test_find_theta", pwm, par.prior, par.transition, score.p[l], package = "atSNP")
} else {
theta <- 0
}
## set the importance sample size
if(par.testing.mc==FALSE) {
n_sample <- 2000
if(l <= length(allp)) {
n_sample <- as.integer((1 - allp[l]) / allp[l] * 100)
}
n_sample <- max(2000, min(n_sample, 1e5))
pval_a.new <- .Call("test_p_value", pwm, par.prior, par.transition, scores[compute.id], theta, n_sample, package = "atSNP")
} else {
pval_a.new <- .Call("test_p_value", pwm, par.prior, par.transition, scores[compute.id], theta, 100, package = "atSNP")
}
pval_cond.new <- .structure(pval_a.new[, 4 + seq(4)])
pval_a.new <- .structure(pval_a.new[, seq(4)])
update.id <- which(pval_a.new[, 2] < pval_a[, 3:4][compute.id])
pval_a[compute.id[update.id]] <- pval_a.new[update.id, 1]
pval_a[compute.id[update.id] + 2 * nrow(pval_a)] <- pval_a.new[update.id, 2]
update.id <- which(pval_cond.new[, 2] < pval_cond[, 3:4][compute.id])
pval_cond[compute.id[update.id]] <- pval_cond.new[update.id, 1]
pval_cond[compute.id[update.id] + 2 * nrow(pval_cond)] <- pval_cond.new[update.id, 2]
}
pval_a[pval_a[, seq(2)] > 1] <- 1
pval_cond[pval_cond[, seq(2)] > 1] <- 1
adjusted <- FALSE
## Force the p-values to be increasing
while(!adjusted) {
pval_a.sorted <- sort(pval_a[, seq(2)])[rank(-c(scores))]
pval_cond.sorted <- sort(pval_cond[, seq(2)])[rank(-c(scores))]
flag1 <- flag2 <- TRUE
if(prod(pval_a.sorted == pval_a[, seq(2)]) != 1 |
prod(pval_cond.sorted == pval_cond[, seq(2)]) != 1) {
pval_a[, seq(2)] <- pval_a.sorted
pval_cond[, seq(2)] <- pval_cond.sorted
flag1 <- FALSE
}
## force the conditional p-value <= p-value
adjust.id <- which(pval_a[, seq(2)] < pval_cond[, seq(2)])
if(length(adjust.id) > 0) {
flag2 <- FALSE
pval_cond[adjust.id] <- (pval_cond[adjust.id] + pval_a[adjust.id]) / 2
pval_a[adjust.id] <- pval_cond[adjust.id]
}
adjusted <- flag1 & flag2
}
rank_ratio <- abs(log(pval_a[, 1] + 1e-10) - log(pval_a[, 2] + 1e-10))
score_diff <- apply(scores, 1, function(x) abs(diff(x)))
score.p <- round(quantile(score_diff, c((seq(8) + 1) / 10, 0.9 + seq(9) / 100)))
if(round(quantile(score_diff, 0.1) + 1) < round(quantile(score_diff, 0.9))) {
score.p <- c(score.p,
seq(round(quantile(score_diff, 0.1) + 1),
round(quantile(score_diff, 0.9)),
by = 2)
)
}
score.p <- rev(sort(unique(score.p)))
pval_diff <- pval_rank <- matrix(1, nrow = length(score_diff), ncol = 2)
for(l in seq_along(score.p)) {
if(l == 1) {
score.upp <- max(score_diff) + 1
} else {
score.upp <- score.p[l - 1]
}
if(l == length(score.p)) {
score.low <- min(score_diff) - 1
} else {
score.low <- score.p[l + 1]
}
compute.id <- which(score_diff < score.upp & score_diff >= score.low)
if(length(compute.id) == 0) {
next
}
## set the importance sample size
if(par.testing.mc==FALSE) {
n_sample <- 2000
p <- max(1e-4, mean(score_diff >= score.p[l]))
n_sample <- as.integer((1 - p) / p * 100)
n_sample <- max(2000, min(n_sample, 1e5))
pval_diff.new <- .Call("test_p_value_change", pwm,
wei.mat, pwm + 0.25, par.prior,
par.transition, score_diff[compute.id],
rank_ratio[compute.id],
score.p[l], n_sample, package = "atSNP")
} else {
pval_diff.new <- .Call("test_p_value_change", pwm,
wei.mat, pwm + 0.25, par.prior,
par.transition, score_diff[compute.id],
rank_ratio[compute.id],
score.p[l], 100, package = "atSNP")
}
pval_rank.new <- .structure(pval_diff.new$rank)
pval_diff.new <- .structure(pval_diff.new$score)
update.id <- which(pval_diff.new[, 2] < pval_diff[compute.id, 2])
pval_diff[compute.id[update.id], ] <- pval_diff.new[update.id, ]
update.id <- which(pval_rank.new[, 2] < pval_rank[compute.id, 2])
pval_rank[compute.id[update.id], ] <- pval_rank.new[update.id, ]
## print(summary(pval_diff.new[,1]))
}
## force the monotonicity
pval_diff[, 1] <- sort(pval_diff[,1])[rank(-score_diff)]
pval_diff[pval_diff[, 1] > 1, 1] <- 1
pval_rank[, 1] <- sort(pval_rank[,1])[rank(-rank_ratio)]
pval_rank[pval_rank[, 1] > 1, 1] <- 1
message("Finished testing motif No. ", motif.id)
if(!is.null(par.figdir)) {
if(!file.exists(par.figdir)) {
dir.create(par.figdir)
}
plotdat <- data.frame(
score = c(scores),
p.value = c(pval_a[, seq(2)]),
var = c(pval_a[, 3:4]),
Allele = rep(c("ref", "snp"), each = nrow(scores))
)
plotdat.diff <- data.frame(
score = score_diff,
p.value = pval_diff[,1],
var = pval_diff[,2]
)
plotdat <- unique(plotdat)
plotdat.diff <- unique(plotdat.diff)
localenv <- environment()
options(warn = -1)
pdf(file.path(par.figdir, paste("motif", motif.id, ".pdf", sep = "")), width = 10, height = 10)
id <- which(rank(plotdat$p.value[plotdat$Allele == "ref"]) <= 500)
print(ggplot(aes(x = score, y = p.value), data = plotdat[plotdat$Allele == "ref", ], environment = localenv) + geom_point() + scale_y_log10(breaks = 10 ^ seq(-8, 0)) + geom_errorbar(aes(ymax = p.value + sqrt(var), ymin = p.value - sqrt(var))) + ggtitle(paste(names(par.motif.lib)[i], "ref")))
id <- which(rank(plotdat$p.value[plotdat$Allele == "snp"]) <= 500)
print(ggplot(aes(x = score, y = p.value), data = plotdat[plotdat$Allele == "snp", ], environment = localenv) + geom_point() + scale_y_log10(breaks = 10 ^ seq(-8, 0)) + geom_errorbar(aes(ymax = p.value + sqrt(var), ymin = p.value - sqrt(var))) + ggtitle(paste(names(par.motif.lib)[i], "SNP")))
print(ggplot(aes(x = score, y = p.value), data = plotdat.diff, environment = localenv) + geom_point() + scale_y_log10(breaks = 10 ^ seq(-8, 0)) + geom_errorbar(aes(ymax = p.value + sqrt(var), ymin = p.value - sqrt(var))) + ggtitle(paste(names(par.motif.lib)[i], " Change")))
dev.off()
}
return(list(rowids = rowids,
pval_a = pval_a,
pval_cond = pval_cond,
pval_diff = pval_diff,
pval_rank = pval_rank))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.