#' @useDynLib gpart, .registration = TRUE
#'
VCFtogeno <- function(vcf) {
.Call(`_gpart_VCFtogeno`, vcf)
}
pairCubeX <- function(b1, b2) {
.Call(`_gpart_pairCubeX`, b1, b2)
}
matCubeX <- function(geno) {
.Call(`_gpart_matCubeX`, geno)
}
matCubeX2 <- function(geno1, geno2) {
.Call(`_gpart_matCubeX2`, geno1, geno2)
}
estiD <- function(pA, pB, n11, n12, n21, n22, n1212) {
.Call(`_gpart_estiD`, pA, pB, n11, n12, n21, n22, n1212)
}
CIDp <- function(b1, b2) {
.Call(`_gpart_CIDp`, b1, b2)
}
genoDp <- function(geno, strLD = TRUE, lower = 0.7, upper = 0.98) {
.Call(`_gpart_genoDp`, geno, strLD, lower, upper)
}
genoCubeDp <- function(geno) {
.Call(`_gpart_genoCubeDp`, geno)
}
genoDp2 <- function(geno1, geno2, strLD = TRUE, lower = 0.7, upper = 0.98) {
.Call(`_gpart_genoDp2`, geno1, geno2, strLD, lower, upper)
}
calc_lnlike <- function(known11, known12, known21, known22, center_ct_d, freq11, freq12, freq21, freq22, half_hethet_share, freq11_incr) {
.Call(`_gpart_calc_lnlike`, known11, known12, known21, known22, center_ct_d, freq11, freq12, freq21, freq22, half_hethet_share, freq11_incr)
}
cubic_real_roots <- function(coef_a, coef_b, coef_c, solutions) {
.Call(`_gpart_cubic_real_roots`, coef_a, coef_b, coef_c, solutions)
}
calc_lnlike_quantile <- function(known11, known12, known21, known22, unknown_dh, freqx1, freq1x, freq2x, freq11_expected, denom, quantile) {
.Call(`_gpart_calc_lnlike_quantile`, known11, known12, known21, known22, unknown_dh, freqx1, freq1x, freq2x, freq11_expected, denom, quantile)
}
em_phase_hethet <- function(known11, known12, known21, known22, center_ct, onside_sol_ct_ptr) {
.Call(`_gpart_em_phase_hethet`, known11, known12, known21, known22, center_ct, onside_sol_ct_ptr)
}
haploview_blocks_classify <- function(counts, lowci_max, lowci_min, recomb_highci, strong_highci, strong_lowci, strong_lowci_outer, is_x, recomb_fast_ln_thresh) {
.Call(`_gpart_haploview_blocks_classify`, counts, lowci_max, lowci_min, recomb_highci, strong_highci, strong_lowci, strong_lowci_outer, is_x, recomb_fast_ln_thresh)
}
CIDp_strLD <- function(b1, b2, lower, upper) {
.Call(`_gpart_CIDp_strLD`, b1, b2, lower, upper)
}
################# CLQD subFUNCTIONS ################
# subfunctions : 1.newSplitCliques 2. heuristicCLQ
#1
newSplitCliques <- function(cliques.bp, gapdist)
{
nowlist <- lapply(cliques.bp, sort)
fixlist <- NULL
repeat {
need.split <- which(lapply(nowlist, function(x) max(diff(x)) > gapdist) == TRUE)
need.fix <- which(lapply(nowlist, function(x) max(diff(x)) > gapdist) == FALSE)
addlist <- nowlist[need.fix]
fixlist <- c(fixlist, addlist)
if (length(need.split) == 0) {
break
}
nowlist <- nowlist[need.split]
nowlength <- length(nowlist)
newlist <- as.list(rep(NA, nowlength))
for (i in seq_len(nowlength)) {
gap <- diff(nowlist[[i]])
frontpart <- nowlist[[i]][seq_len(min(which(gap > gapdist)))]
restpart <- nowlist[[i]][-(seq_len(min(which(gap > gapdist))))]
nowlist[[i]] <- frontpart
newlist[[i]] <- restpart
}
addlist <- nowlist[vapply(nowlist, function(x) length(x) > 1, TRUE)]
fixlist <- c(fixlist, addlist)
nowlist <- newlist[vapply(newlist, function(x) length(x) > 1, TRUE)]
}
return(fixlist)
}
#2
heuristicCLQ <- function(subOCM, hrstParam)
{
degs <- apply(subOCM, 1, sum)
top5pctDegVs <- which(degs>=quantile(degs, 0.95))
top5pctDegVs <- top5pctDegVs[order(degs[top5pctDegVs],decreasing=TRUE)]
heuristicBinbasket <- NULL
total_density <- NULL
v_dens_all <- c()
find_opt <- FALSE
for(v in top5pctDegVs){
v_nbds <- as.integer(which(subOCM[v,]!=0))
candibin <- c(v, v_nbds)
v_density <- sum(subOCM[candibin,candibin])/(length(candibin)^2 - length(candibin))
v_dens <- c(v_density, candibin)
if(v_density>=0.95) {
return(candibin)
find_opt <- TRUE
break
}else{
new_v_density <- 1
while((v_density+0.0001)<new_v_density){
newdeg <- apply(subOCM[v_nbds,], 1, sum)
v_nbds <- v_nbds[-(which(newdeg==min(newdeg)))]
candibin <- c(v, v_nbds)
new_v_density <- sum(subOCM[candibin,candibin])/(length(candibin)^2 - length(candibin))
if(new_v_density>=0.95){
return(candibin)
}else{
v_dens_all <- c(v_dens_all, new_v_density)
heuristicBinbasket<- c(heuristicBinbasket, list(candibin))
}
if(length(v_nbds) < (2*hrstParam/3)) break
}#end while
}#end else
}#end for
## return results
max_den_bin = heuristicBinbasket[which(v_dens_all == max(v_dens_all))]
if(length(max_den_bin) == 1) {
return(max_den_bin[[1]])
}else{
max_den_bin_leng = vapply(max_den_bin, length, 1)
candibin = max_den_bin[[which(max_den_bin_leng == max(max_den_bin_leng))[1]]]
return(candibin)
}
}
################# BigLD subFUNCTIONS ##########################
# subfunctions : CLQD,
# dataPreparation , cutsequence, intervalCliqueList,
# findMaximumIndept, constructLDblock, subBigLD, appendcutByForce
dataPreparation <- function(genofile, SNPinfofile, geno, SNPinfo,
chrN = NULL , startbp = -Inf, endbp = Inf)
{
#chrN : vector of chromosome
#startbp, endbp : integer
# filetype <- txt, the input data have header
# if(is.null(genofile)) stop("Please input genofile (and SNPinfofile)")
filetype <- tail(strsplit(genofile, "\\.")[[1]], 1)
if(filetype == "txt"){
if(is.null(SNPinfo)){
SNPinfo <- data.table::fread(SNPinfofile, header=TRUE)
}
class(SNPinfo)<-"data.frame"
if(dim(SNPinfo)[2]==4){
SNPinfo <- SNPinfo[,c(1,3,4)] # chrN, rsID, bp
}
geno <- data.table::fread(genofile, header = TRUE) # rsID
geno <- as.matrix(geno)
# header generation
if(is.null(chrN)) chrN <- unique(SNPinfo[,1])
colnames(SNPinfo) <- c("chrN", "rsID", "bp")
if(startbp != -Inf | endbp != Inf){
if((length(chrN)>1) | (length(unique(SNPinfo[,1]))>1))
stop("If your input data include more than one chromosome or you choose more than one chromosome,
then do not specify the startbp and endbp!")
SNPloca <- which(SNPinfo$bp>=startbp & SNPinfo$bp<=endbp & SNPinfo$chrN == chrN)
SNPinfo <- SNPinfo[SNPloca,]
geno <- geno[,SNPloca]
}
colnames(geno)<- SNPinfo$rsID
if(dim(geno)[2] != dim(SNPinfo)[1]) stop("Please check the input file format!")
}else if(filetype == "raw"){
if(is.null(SNPinfofile) & is.null(SNPinfo)) stop("Please input map file for SNPinfofile")
if(!is.null(SNPinfofile)){
if((tail(strsplit(SNPinfofile, "\\.")[[1]],1)!="map")) stop("Please input map file for SNPinfofile")
SNPinfo <- data.table::fread(SNPinfofile, header=FALSE, sep = "\t")
class(SNPinfo)<-"data.frame"
SNPinfo <- SNPinfo[,c(1,2,4)]
colnames(SNPinfo) <- c("chrN", "rsID", "bp")
geno <- data.table::fread(genofile, header = TRUE, sep = " ")
geno <- geno[, -seq_len(6)]
geno <- as.matrix(geno)
if(is.null(chrN)) chrN <- unique(SNPinfo[,1])
if(startbp != -Inf | endbp != Inf){
if((length(chrN)>1) | (length(unique(SNPinfo[,1]))>1))
stop("If your input data include more than one chromosome or you choose more than one chromosome,
then do not specify the startbp and endbp!")
SNPloca <- which(SNPinfo$bp>=startbp & SNPinfo$bp<=endbp & SNPinfo$chrN == chrN)
SNPinfo <- SNPinfo[SNPloca,]
geno <- geno[,SNPloca]
}
colnames(geno)<- SNPinfo$rsID
}else{
stop("Please check your input files")
}
}else if(filetype == "traw"){
genodata <- data.table::fread(genofile, header = TRUE, sep = "\t")
SNPinfo <- genodata[,c(1,2,4)]
colnames(SNPinfo) <- c("chrN", "rsID", "bp")
class(SNPinfo)<-"data.frame"
geno <- t(genodata[,-seq_len(6)])
geno <- as.matrix(geno)
colnames(geno) <- SNPinfo$rsID
if(is.null(chrN)) chrN <- unique(SNPinfo[,1])
if(startbp != -Inf | endbp != Inf){
if((length(chrN)>1) | (length(unique(SNPinfo[,1]))>1))
stop("If your input data include more than one chromosome or you choose more than one chromosome,
then do not specify the startbp and endbp!")
takenIndex <- which((SNPinfo$bp >= startbp) & (SNPinfo$bp <= endbp) & (SNPinfo$chrN == chrN))
SNPinfo <- SNPinfo[takenIndex,]
geno <- geno[, takenIndex]
}
}else if(filetype == "ped"){
if(is.null(SNPinfofile) & is.null(SNPinfo)) stop("Please input map file for SNPinfofile")
if(!is.null(SNPinfofile)){
# system.time({
if((tail(strsplit(SNPinfofile, "\\.")[[1]],1)!="map")) stop("Please input map file for SNPinfofile")
SNPinfo <- data.table::fread(SNPinfofile, header=FALSE, sep = "\t")
SNPinfo <- SNPinfo[,c(1,2,4)]
colnames(SNPinfo) <- c("chrN", "rsID", "bp")
class(SNPinfo)<-"data.frame"
geno <- data.table::fread(genofile, header=FALSE, sep = " ", colClasses = "character")
geno <- geno[,-seq_len(6)]
geno <- as.matrix(geno)
# message(dim(geno))
if(is.null(chrN)) chrN <- unique(SNPinfo[,1])
if(startbp != -Inf | endbp != Inf){
if((length(chrN)>1) | (length(unique(SNPinfo[,1]))>1))
stop("If your input data include more than one chromosome or you choose more than one chromosome,
then do not specify the startbp and endbp!")
SNPloca <- which(SNPinfo$bp >= startbp & SNPinfo$bp <= endbp & SNPinfo$chrN == chrN)
SNPinfo <- SNPinfo[SNPloca,]
genoloca1 <- min(SNPloca) * 2 - 1
genoloca2 <- max(SNPloca) * 2
geno <- geno[, genoloca1:genoloca2]
}
multiAllele<-c()
nInd = nrow(geno)
for(i in seq_len(ncol(geno)/2)){
nowseq <- c(geno[, (i*2-1)], geno[, (i*2)])
naInd = unique(c(which(geno[, (i*2-1)] == "0"), which(geno[, (i*2)] == "0")))
uniqueAllele <- setdiff(unique(nowseq), "0")
if(length(uniqueAllele)>2) multiAllele <- c(multiAllele, i)
line1 <- rep(NA, nInd)
line2 <- rep(NA, nInd)
line1[geno[,(i*2-1)] == uniqueAllele[1]] <- 0
line1[geno[,(i*2-1)] == uniqueAllele[2]] <- 1
line2[geno[,(i*2)] == uniqueAllele[1]] <- 0
line2[geno[,(i*2)] == uniqueAllele[2]] <- 1
nline = (line1+line2)
if(sum(nline, na.rm = TRUE) > nInd) nline = 2-nline
geno[,(i)]<- nline
}
geno<-geno[,seq_len(ncol(geno)/2)]
# message(dim(geno))
class(geno)<-"integer"
# remove multi-allelic
if(length(multiAllele)>0){
SNPinfo <- SNPinfo[-multiAllele,]
geno <- geno[, -multiAllele]
}
# })
}
}else if(filetype == "vcf"){
rawgeno <- data.table::fread(genofile, header=TRUE, sep = "\t",
colClasses = "character", skip = "#CHROM")
SNPinfo <- rawgeno[,c(1,3,2)]
colnames(SNPinfo) <- c("chrN", "rsID", "bp")
class(SNPinfo)<-"data.frame"
infocol <- max(which(colnames(rawgeno) == "INFO"),which(colnames(rawgeno) == "FORMAT"))
rawgeno <- rawgeno[, -seq_len(infocol)]
rawgeno <- as.matrix(rawgeno)
geno <- VCFtogeno(rawgeno)
if(is.null(chrN)) chrN <- unique(SNPinfo[,1])
if(startbp != -Inf | endbp != Inf){
if((length(chrN)>1) | (length(unique(SNPinfo[,1]))>1))
stop("If your input data include more than one chromosome or you choose more than one chromosome,
then do not specify the startbp and endbp!")
SNPloca <- which(SNPinfo$bp >= startbp & SNPinfo$bp <= endbp & SNPinfo$chrN == chrN)
SNPinfo <- SNPinfo[SNPloca,]
geno <- geno[,SNPloca]
}
}else {
stop("Please check your input files.\n(supporting file format: txt, ped, map, raw, traw, vcf)")
}
#choose chrN
message("data reformating done!")
reverseind <- which(colSums(geno)>nrow(geno))
geno[,reverseind] <- 2-geno[,reverseind]
class(SNPinfo$bp)<-"numeric"
class(SNPinfo$rsID) <- "character"
#choose chrN
index <- (SNPinfo$chrN %in% chrN)
if(all(index == TRUE) == FALSE){
geno <- geno[,index]
SNPinfo <- SNPinfo[,index]
}
return(list(geno = geno, SNPinfo = SNPinfo))
}
cutsequence <- function(geno, SNPinfo, leng, subTaskSize, LD, clstgap, CLQcut, cutByForce)
{
message("start to split the whole sequence into sub-task regions")
modeNum <- 1
lastnum <- 0
nSNPs <- dim(SNPinfo)[1]
geno <- as.matrix(geno)
mincut <- max(0.5, CLQcut^2)
# region length<=3000
SNPbpgap <- diff(SNPinfo[,3])
if(is.null(cutByForce)){
precutpoints <- which(SNPbpgap>clstgap)
}else{
cutByForce <- vapply(cutByForce[,3], function(x) max(which(SNPinfo[,3]<=x)), 1)
precutpoints <- sort(unique(cutByForce))
precutpoints1 <- which(SNPbpgap>clstgap)
precutpoints <- c(precutpoints, precutpoints1)
}
if (dim(geno)[2] <= subTaskSize) {
message("there is only one sub-region!")
return(list(dim(geno)[2], NULL))
} else {
calterms <- c(1, 10, leng)
cutpoints <- NULL
i <- leng # i :current candidate cutposition
while (i <= (dim(geno)[2] - leng)) {
if((i-lastnum) > 5*subTaskSize){
modeNum <- 2
break;
}
if(i %in% precutpoints){
cutpoints <- c(cutpoints, i)
lastnum <- i
cat("total SNPs = ",nSNPs, " | cutpoint = ", i, "\r")
i<-i+(leng/2)
# message(i)
cutnow <- FALSE
}else{
for(j in calterms){
if(LD == "r2"){
nowcm <- suppressWarnings(cor(geno[,(i-j+1):(i)], geno[,((i+1):(i+j))]
,use="pairwise.complete.obs"))
nowr2 <- nowcm^2
nowr2[which(nowr2 < mincut)] <- 0
}else{
nowr2 <- genoDp2(geno[,(i-j+1):(i), drop=FALSE], geno[,((i+1):(i+j)), drop=FALSE])
}
if(sum(nowr2,na.rm=TRUE)>0){
i <- i+1
cutnow <- FALSE
break
}
if(j == leng){
cutnow <- TRUE
# message(i)
}
}
if(cutnow == TRUE){
cutpoints <- c(cutpoints, i)
lastnum <- i
cat("total SNPs = ",nSNPs, " | cutpoint = ", i, "\r")
i<-i+(leng/2)
cutnow <- FALSE
}
}
}##end while
if(modeNum == 1){
cutpoints <- c(0,cutpoints, dim(geno)[2])
# separate too big regions candi.cutpoints return(cutpoints,candi.cutpoints)
atfcut <- NULL
while (max(diff(cutpoints)) > subTaskSize) {
diffseq <- diff(cutpoints)
recutpoint <- which(diffseq > subTaskSize)
nowmaxsize <- max(diff(cutpoints))
tt <- cbind((cutpoints[recutpoint] + 1), cutpoints[recutpoint + 1])
numvec <- NULL
for (i in seq_len(dim(tt)[1])){
st <- tt[i, 1]
ed <- tt[i, 2]
if (ed > (dim(geno)[2] - leng)) {
ed <- dim(geno)[2] - leng
}
weakcount <- vapply(c((st + leng):(ed - leng)), function(x) {
tick <- as.integer(leng/5)
if(LD == "r2"){
nowCM <- suppressWarnings(cor(geno[, (x - tick + 1):(x)], geno[, (x+ 1):(x + tick)]
,use="pairwise.complete.obs"))
nowr2 <- nowCM^2
length(which(nowr2>= mincut))
}else{
nowr2 <- genoDp2(geno[, (x - tick + 1):(x), drop=FALSE], geno[, (x+ 1):(x + tick), drop=FALSE])
length(which(nowr2>= mincut))
}
}, 1)
weakcount.s <- sort(weakcount)
weaks <- weakcount.s[10]
weakpoint <- which(weakcount <= weaks)
weakpoint <- weakpoint + st + leng - 1
nearcenter <- vapply(weakpoint, function(x) abs((ed - x) - (x - st)), 1)
addcut <- weakpoint[which(nearcenter == min(nearcenter))][1]
cat("total SNPs = ",nSNPs, " | additional cutpoint = ", addcut, "\r")
numvec <- c(numvec, addcut)
atfcut <- c(atfcut, addcut)
} ##end for
cutpoints <- sort(c(cutpoints, numvec))
newcandi <- which(diff(cutpoints) > subTaskSize)
# remaxsize <- max(diff(cutpoints))
# message(remaxsize) message(newcandi)
if (length(newcandi) == 0) {
break
}
}
}
##end while
if(modeNum == 2){
message(paste("split the whole sequence into sub-task regions of length", subTaskSize))
precutpoints <- which(SNPbpgap>clstgap)
cutpoints <- seq(subTaskSize, dim(geno)[2], subTaskSize)
remaincutpoints <- vapply(cutpoints, function(x) sum(abs(x-precutpoints)<(leng/2))<1, TRUE)
atfcut <- cutpoints[remaincutpoints]
cutpoints <- sort(c(cutpoints[remaincutpoints], precutpoints))
if(max(atfcut) == dim(geno)[2]){
atfcut <- atfcut[-(length(atfcut))]
}else{
cutpoints <- c(cutpoints, dim(geno)[2])
}
}
}
cat("\n")
message("splitting sequence, done!")
if(!is.null(atfcut)) atfcut <- sort(atfcut)
cutpoints <- unique(c(cutpoints, precutpoints))
return(list(sort(cutpoints), atfcut))
}
intervalCliqueList <- function(clstlist)
{
clstlist <- clstlist[order(vapply(clstlist, min, 1))]
bp.clstlist <- t(vapply(clstlist, function(x) range(x), c(1,2))) ###
bp.clstlist <- bp.clstlist[order(bp.clstlist[,1]),]
IMsize <- dim(bp.clstlist)[1] ## adjacency matrix of intervals in interval graph
adjacencyM <- matrix(0, IMsize, IMsize)
for (i in seq_len(IMsize-1)) {
for (j in (i+1):IMsize) {
if(bp.clstlist[i,2]>bp.clstlist[j,1]){
adjacencyM[j, i]<-adjacencyM[i, j] <- 1
}else{
next
}
}
}
diag(adjacencyM) <- 0
interval.graph <- igraph::graph.adjacency(adjacencyM, mode="undirected",
weighted=TRUE, diag=FALSE, add.colnames=NULL)
# message(paste("max coreness", max(coreness(interval.graph))))
# message(paste("ecount", ecount(interval.graph), "vertex*5 ", 5*IMsize))
if(max(igraph::coreness(interval.graph))>10){ #ecount(interval.graph)> 5*IMsize|
interval.cliques <- igraph::maximal.cliques(interval.graph, min=1)
}else{
interval.cliques <- igraph::cliques(interval.graph, min=1)
}
interval.cliques <- interval.cliques[order(vapply(interval.cliques, min, 1))]
intervals <- lapply(interval.cliques, function(x) unlist(clstlist[x]))
intervals <- lapply(intervals, sort)
intervals <- lapply(intervals, unique)
weight.itv <- vapply(intervals, length, 1)
intervals.range <- t(vapply(intervals, range, c(1,2)))
unique.intervals.range <- unique(intervals.range)
rangeinfo <- cbind(intervals.range, weight.itv)
interval.info <- apply(unique.intervals.range, 1, function(x) {
sameitv <- which(rangeinfo[, 1] == x[1] & rangeinfo[, 2] == x[2])
maxweight <- max(rangeinfo[sameitv, 3])
sameitv[which(maxweight == rangeinfo[sameitv, 3])][1]
})
res <- rangeinfo[interval.info,,drop=FALSE]
return(res)
# final.intervals <- intervals[interval.info]
# final.intervals.w <- rangeinfo[interval.info, 3]
# return(list(final.intervals, final.intervals.w))
}
findMaximumIndept <- function(interval.range, sample.weight)
{
n.of.sample <- length(sample.weight)
pre.range <- as.list(rep(NA, n.of.sample)) #pre.range : range of predecessor
## pre.range : n by 2, i row (x,y) : possible predecessors of i interval are from x interval to y interval
for (i in seq_len(n.of.sample)) {
nowstart <- interval.range[i, 1]
if (sum(interval.range[, 2] < nowstart) > 0) {
pre.range[[i]] <- which(interval.range[, 2] < nowstart)
}
}
sources <- seq_len(n.of.sample)[(vapply(pre.range, function(x) all(is.na(x)) == TRUE, TRUE))]
## source of comparability graph of complement of Interval graph
if (length(sources) < n.of.sample) {
not.s <- setdiff(seq_len(n.of.sample), sources)
for (i in not.s) {
pre.pre <- sort(unique(unlist(pre.range[pre.range[[i]]])))
pre.range[[i]] <- setdiff(pre.range[[i]], pre.pre)
}
names(pre.range) <- sample.weight
n.interval <- seq_len(n.of.sample)
route.weights <- rep(0, n.of.sample) ##cumulative weights
route.weights[sources] <- sample.weight[sources]
pointers <- rep(0, n.of.sample) ## predecessor of current interval
pointers[sources] <- NA
explored <- rep(0, n.of.sample)
explored[sources] <- 1
info <- cbind(n.interval, route.weights, pointers, explored)
for (i in not.s) {
maybe.pred <- pre.range[[i]]
now.info <- info[maybe.pred, , drop=FALSE]
max.info <- now.info[which(now.info[, 2] == max(now.info[, 2])), , drop=FALSE]
if (dim(max.info)[1] > 1)
max.info <- max.info[1, , drop=FALSE]
info[i, 2] <- sample.weight[i] + max.info[2]
info[i, 3] <- max.info[1]
info[i, 4] <- 1
}
#### trace maximum independent set
start.itv <- which(info[, 2] == max(info[, 2]))[1]
predecessor <- info[start.itv, 3]
indept.set <- c(predecessor, start.itv)
while (!is.na(predecessor)) {
predecessor <- info[predecessor, 3]
indept.set <- c(predecessor, indept.set)
}
indept.set <- as.vector(indept.set)
indept.set <- indept.set[-which(is.na(indept.set))]
indept.set.weight <- max(info[, 2])
} else {
indept.set <- which(sample.weight == max(sample.weight))
indept.set.weight <- max(sample.weight)
}
final.result <- list(indept.set=indept.set, indept.set.weight=indept.set.weight)
return(final.result)
}
constructLDblock <- function(clstlist, subSNPinfo)
{
# subfunction: intervalCliqueList, findMaximumIndept
Totalblocks <- NULL
while (length(clstlist) > 0) {
if(length(clstlist)==1){
Totalblocks <- rbind(Totalblocks, range(clstlist[[1]]))
break
}else{
allsnps <- lapply(clstlist, function(x) c(min(x):max(x)))
candi.interval <- intervalCliqueList(clstlist)
intervals <- candi.interval[,seq_len(2),drop=FALSE] ## list of SNPs in each cliques
weight.itv <- candi.interval[,3] ## weights of each cliques
MWIS <- findMaximumIndept(intervals, weight.itv) ##find independent set
subLDblocks <- intervals[MWIS[[1]],, drop=FALSE]
Totalblocks <- rbind(Totalblocks, subLDblocks)
takenSNPs <- apply(subLDblocks, 1, function(x) c(min(x):max(x)))
takenSNPs <- as.integer(unlist(takenSNPs))
clstlist <- lapply(clstlist, function(x) setdiff(x, takenSNPs))
clstlist <- clstlist[vapply(clstlist, length, 1)>0]
if (length(clstlist) == 0) break
# when a taken cluster located in the middle of the other cluster, we split the other cluster
while(length(clstlist)>0){
addinglist <- NULL
for (n in seq_len(length(clstlist))) {
nowbin <- clstlist[[n]]
intersection <- intersect(c(min(nowbin):max(nowbin)), takenSNPs)
if (length(intersection) > 0) {
part1 <- nowbin[which(nowbin < min(intersection))]
part2 <- setdiff(nowbin, c(min(part1):max(intersection)))
clstlist[[n]] <- part1
addinglist <- c(addinglist, list(part2))
}
}
clstlist <- c(clstlist, addinglist)
clstlist <- clstlist[vapply(clstlist, function(x) length(x) > 1, TRUE)]
clstlist <- clstlist[order(vapply(clstlist, min, 1))]
if(length(addinglist) ==0) break;
}
}
}
return(Totalblocks)
}
subBigLD <- function(subgeno, subSNPinfo, CLQcut, clstgap, CLQmode, hrstParam, LD, hrstType)
{
subbinvec <- CLQD(geno=subgeno, SNPinfo=subSNPinfo, CLQcut=CLQcut, clstgap=clstgap,
hrstType=hrstType, hrstParam=hrstParam, CLQmode=CLQmode, LD=LD)
if(all(is.na(subbinvec))){return(NULL)}
bins <- seq_len(max(subbinvec[which(!is.na(subbinvec))]))
clstlist <- lapply(bins, function(x) which(subbinvec == x))
clstlist <- lapply(clstlist, sort) ###
clstlist <- clstlist[order(vapply(clstlist, min, 1))] ###
nowLDblocks <- constructLDblock(clstlist, subSNPinfo)
# message('constructLDblock done!')
nowLDblocks <- nowLDblocks[order(nowLDblocks[, 1]), , drop=FALSE]
return(nowLDblocks)
}
appendcutByForce <- function(LDblocks, Ogeno, OSNPinfo, CLQcut, clstgap,
CLQmode, hrstParam, LD, hrstType)
{
expandB <- NULL
snp1 <- which(OSNPinfo[,3]<LDblocks[1,5])
if(length(snp1)>2){
OSNPs <- seq_len(max(snp1))
firstB <- LDblocks[1,]
secondSNPs <- (as.numeric(firstB[1]):as.numeric(firstB[2]))
if(LD == "r2"){
cor2 <- suppressWarnings(cor(Ogeno[,OSNPs,drop=FALSE], Ogeno[,secondSNPs,drop=FALSE],
use="pairwise.complete.obs")^2)
}else if(LD == "Dprime"){
cor2 <- genoDp2(Ogeno[,OSNPs,drop=FALSE], Ogeno[,secondSNPs,drop=FALSE])
}
cor2num <- apply(cor2, 1, function(x) {
sum(x>CLQcut^2, na.rm = TRUE)
})
cor2ratio <- cor2num/(dim(cor2)[2])
cor2numT <- cor2ratio>0.6
cor2numT <- c(cor2numT, 1)
points2 <- min(which(cor2numT>0))
NsecondSNPs <- points2:max(secondSNPs)
reOSNPs <- setdiff(seq_len(max(NsecondSNPs)), NsecondSNPs)
if(length(reOSNPs)>1){
subgeno <- Ogeno[, reOSNPs]
subSNPinfo <- OSNPinfo[reOSNPs,]
subBlocks <- subBigLD(subgeno, subSNPinfo, CLQcut, clstgap, CLQmode, hrstParam, LD, hrstType)
subBlocks <- subBlocks+min(reOSNPs)-1
expandB <- rbind(expandB, subBlocks)
}
firstSNPs <- NsecondSNPs
}else{
firstB <- LDblocks[1,]
firstSNPs <- (as.numeric(firstB[1]):as.numeric(firstB[2]))
}
if(dim(LDblocks)[1]>1){
for(i in seq_len(dim(LDblocks)[1]-1)){
# if(i==2192) stop("stop")
secondB <- LDblocks[(i+1),]
secondSNPs <- (as.numeric(secondB[1]):as.numeric(secondB[2]))
OSNPs <- setdiff(max(firstSNPs):min(secondSNPs), c(max(firstSNPs), min(secondSNPs)))
if(length(OSNPs)==0){
expandB <- rbind(expandB, range(firstSNPs))
firstSNPs <- secondSNPs
}else{
if(LD == "r2"){
cor1 <- suppressWarnings(cor(Ogeno[,firstSNPs,drop=FALSE], Ogeno[,OSNPs,drop=FALSE],
use="pairwise.complete.obs")^2)
}else if(LD == "Dprime"){
cor1 <- genoDp2(Ogeno[,firstSNPs,drop=FALSE], Ogeno[,OSNPs,drop=FALSE])
}
cor1num <- apply(cor1, 2, function(x) {
sum(x>CLQcut^2)
})
cor1ratio <- cor1num/(dim(cor1)[1])
# cor1num <- apply(cor1r2, 2, function(x) length(which(x>CLQcut^2))/length(firstSNPs))
if(LD == "r2"){
cor2 <- suppressWarnings(cor(Ogeno[,OSNPs,drop=FALSE], Ogeno[,secondSNPs,drop=FALSE],
use="pairwise.complete.obs")^2)
}else if (LD == "Dprime"){
cor2 <- genoDp2(Ogeno[,OSNPs,drop=FALSE], Ogeno[,secondSNPs,drop=FALSE])
}
cor2num <- apply(cor2, 1, function(x) {
sum(x>CLQcut^2)
})
cor2ratio <- cor2num/(dim(cor2)[2])
cor1numT <- cor1ratio>0.6
cor2numT <- cor2ratio>0.6
cor1numT <- c(1, cor1numT, 0)
cor2numT <- c(0, cor2numT, 1)
points1 <- max(firstSNPs)+max(which(cor1numT>0))-1
NfirstSNPs <- min(firstSNPs):points1
points2 <- max(firstSNPs)+max(which(cor2numT>0))-1
NsecondSNPs <- points2:max(secondSNPs)
if(max(NfirstSNPs)<min(NsecondSNPs)){
expandB <- rbind(expandB, range(NfirstSNPs))
reOSNPs <- setdiff(c(min(NfirstSNPs):max(NsecondSNPs)), c(NfirstSNPs, NsecondSNPs))
if(length(reOSNPs)>1){
subgeno <- Ogeno[, reOSNPs]
subSNPinfo <- OSNPinfo[reOSNPs,]
subBlocks <- subBigLD(subgeno, subSNPinfo, CLQcut, clstgap, CLQmode, hrstParam, LD, hrstType)
subBlocks <- subBlocks+min(reOSNPs)-1
expandB <- rbind(expandB, subBlocks)
}
firstSNPs <- NsecondSNPs
}else{
#merge two blocks
# message(paste("i", i,"now!!!!!!"))
subgeno <- Ogeno[, c(min(firstSNPs):max(secondSNPs))]
subSNPinfo <- OSNPinfo[c(min(firstSNPs):max(secondSNPs)),]
if(dim(subgeno)[2]<=3000){
subBlocks <- subBigLD(subgeno, subSNPinfo, CLQcut, clstgap, CLQmode, hrstParam, LD, hrstType)
subBlocks <- subBlocks+min(firstSNPs)-1
}else{
# Too big LD blocks
subgeno <- Ogeno[, c((max(firstSNPs)-1500):(min(secondSNPs)+1500))]
subSNPinfo <- Ogeno[ c((max(firstSNPs)-1500):(min(secondSNPs)+1500)),]
subBlocks <- subBigLD(subgeno, subSNPinfo, CLQcut, clstgap, CLQmode, hrstParam, LD, hrstType)
for(i in seq_len(dim(subBlocks)[1]-1)){
nowB <- subBlocks[i,]
nextB <- subBlocks[(i+1),]
# merging steps!!
if(nowB[2]<nextB[1]){
next
}else{
subBlocks[i,]<-c(NA, NA)
subBlocks[(i+1),]<-range(c(nowB, nextB))
}
}
subBlocks<-subBlocks[!is.na(subBlocks[,1]),]
subBlocks <- subBlocks+min(firstSNPs)-1
}
if(dim(subBlocks)[1]==1) {
firstSNPs <- subBlocks[1,1]:subBlocks[1,2]
}else{
expandB <- rbind(expandB, subBlocks[-(dim(subBlocks)[1]),])
firstSNPs <- subBlocks[(dim(subBlocks)[1]),1]:subBlocks[(dim(subBlocks)[1]),2]
}
}
cat(paste(i," | ", dim(LDblocks)[1], "\r", sep = ""))
# message(tail(expandB))
# if(i >= 30) break
}
}
}
# firstSNPs
if(max(firstSNPs)<(dim(Ogeno)[2]-1)){
OSNPs <- (max(firstSNPs)+1):(dim(Ogeno)[2])
if(LD == "r2"){
cor1 <- suppressWarnings(cor(Ogeno[,firstSNPs,drop=FALSE], Ogeno[,OSNPs,drop=FALSE],
use="pairwise.complete.obs")^2)
}else if (LD == "Dprime"){
cor1 <- genoDp2(Ogeno[,firstSNPs,drop=FALSE], Ogeno[,OSNPs,drop=FALSE])
}
cor1num <- apply(cor1, 2, function(x) {
sum(x>CLQcut^2)
})
cor1ratio <- cor1num/(dim(cor1)[1])
cor1numT <- cor1ratio>0.6
cor1numT <- c(1, cor1numT, 0)
points1 <- max(firstSNPs)+max(which(cor1numT>0))-1
NfirstSNPs <- min(firstSNPs):points1
expandB <- rbind(expandB, range(NfirstSNPs))
reOSNPs <- setdiff(c(min(NfirstSNPs):dim(Ogeno)[2]), c(NfirstSNPs))
if(length(reOSNPs)>1){
subgeno <- Ogeno[, reOSNPs]
subSNPinfo <- OSNPinfo[reOSNPs,]
subBlocks <- subBigLD(subgeno, subSNPinfo, CLQcut, clstgap, CLQmode, hrstParam, LD, hrstType)
subBlocks <- subBlocks+min(reOSNPs)-1
expandB <- rbind(expandB, subBlocks)
}
}else{
expandB <- rbind(expandB, range(firstSNPs))
}
# LDblocks <- expandB
expandB <- expandB[(expandB[,1]!=expandB[,2]),,drop=FALSE]
start.bp <- OSNPinfo[, 3][expandB[, 1]]
end.bp <- OSNPinfo[, 3][expandB[, 2]]
start.rsID <- as.character(OSNPinfo[, 2][expandB[, 1]])
end.rsID <- as.character(OSNPinfo[, 2][expandB[, 2]])
TexpandB <- data.frame(expandB, start.rsID, end.rsID, start.bp, end.bp)
colnames(TexpandB) <- c("start.index", "end.index", "start.rsID", "end.rsID", "start.bp", "end.bp")
return(TexpandB)
}
################# GPART subFUNCTIONS ########################################################################
# sub-Functions
# BigLD,CLQD, LDblockSplit, dataPreparation,
# mergeOverlapGene, LDblockGeneMerge, splitBigLD, mergeSmallRegion, namingRegion2
#
# blocks with Big-LD result
LDblockSplit <- function(geno, LDblocks, maxsize, LD){
LDblockSizes <- apply(LDblocks, 1, diff) +1
LargeBlocksN <- which(LDblockSizes> maxsize)
LargeBlocks <- LDblocks[LargeBlocksN,,drop=FALSE]
Newblocks <- NULL
btwr2 <- NULL
if(length(LargeBlocksN)!=0){
while(dim(LargeBlocks)[1]>0){
# message(dim(LargeBlocks))
nowblocks <- LargeBlocks[1,]
# if(nowblocks[1]>4000) break
if(is.null(btwr2)){
nowgeno <- geno[,nowblocks[1]:nowblocks[2]]
if(LD == "r2"){
nowr2 <- suppressWarnings(cor(nowgeno, use="pairwise.complete.obs")^2)
nowr2[is.na(nowr2)] <- 0
}else if(LD == "Dprime"){
nowr2 <- genoDp(nowgeno, strLD = FALSE)
nowr2[is.na(nowr2)] <- 0
nowr2[(nowr2==Inf)|(nowr2==-Inf)] <- 0
}
btwr2 <- vapply(seq_len(dim(nowr2)[1]-1), function(x) mean(nowr2[seq_len(x), (x+1):(dim(nowr2)[1]-1)]), 1)
}
if(length(btwr2)< ceiling(maxsize*0.8*2)){
subbtwr2 <- btwr2[(ceiling(length(btwr2)/2)-(maxsize*0.2)):(floor(length(btwr2)/2)+(maxsize*0.2))]
addN <- which(subbtwr2 == min(subbtwr2))[1]+ (ceiling(length(btwr2)/2)-(maxsize*0.2)) - 1
Newblocks <- rbind(Newblocks, c(nowblocks[1], (nowblocks[1]+addN-1)))
if(Newblocks[dim(Newblocks)[1], 2]-Newblocks[dim(Newblocks)[1], 1]>=maxsize) {
# message(Newblocks[dim(Newblocks)[1], ])
break}
Newblocks <- rbind(Newblocks, c(nowblocks[1]+addN, nowblocks[2]))
if(Newblocks[dim(Newblocks)[1], 2]-Newblocks[dim(Newblocks)[1], 1]>=maxsize) {
# message(Newblocks[dim(Newblocks)[1], ])
break}
LargeBlocks <- LargeBlocks[-1,, drop=FALSE]
btwr2<- NULL
}else{
# message(LargeBlocks[1,]) Error in LargeBlocks[1, 1] <- (nowblocks[1] + addN)
subbtwr2 <- btwr2[(maxsize*0.4):maxsize]
addN <- which(subbtwr2 == min(subbtwr2))[1]+ (maxsize*0.4) - 1
Newblocks <- rbind(Newblocks, c(nowblocks[1], (nowblocks[1]+addN-1)))
LargeBlocks[1,1] <- (nowblocks[1]+addN)
btwr2<-btwr2[-seq_len(addN)]
if(diff(LargeBlocks[1,])<maxsize) {
Newblocks <- rbind(Newblocks, LargeBlocks[1,])
if(Newblocks[dim(Newblocks)[1], 2]-Newblocks[dim(Newblocks)[1], 1]>=maxsize) {
# message(Newblocks[dim(Newblocks)[1], ])
break}
LargeBlocks <- LargeBlocks[-1,, drop=FALSE]
btwr2<- NULL
}
}
if(Newblocks[dim(Newblocks)[1], 2]-Newblocks[dim(Newblocks)[1], 1]>=maxsize) {
# message(Newblocks[dim(Newblocks)[1], ])
break}
} # end while
FinalLDblocks <- rbind(LDblocks[-LargeBlocksN,], Newblocks)
FinalLDblocks <- FinalLDblocks[order(FinalLDblocks[,1]),]
return(FinalLDblocks)
}else{
return(LDblocks)
}
}
# merge overlapped gene regions
mergeOverlapGene <- function(Geneblocks) {
overlaplist <- as.list(rep(NA, dim(Geneblocks)[1]))
Geneblocks<-Geneblocks[order(Geneblocks[,1]),]
for (i in seq_len(dim(Geneblocks)[1] - 1)) {
overlaplist[[i]] <- i
for (j in (i + 1):dim(Geneblocks)[1]) {
if(Geneblocks[i,2]>=min(Geneblocks[j,1])){
overlaplist[[i]] <- c(overlaplist[[i]], j)
}else{
next
}
}
}
overlaplist[[length(overlaplist)]]<- length(overlaplist)
if(is.null(overlaplist)){
return(Geneblocks)
}else{
for(i in seq_len(length(overlaplist)-1)){
nowbin <- overlaplist[[i]]
if(all(is.na(nowbin))) next
for(j in (i+1):length(overlaplist)){
if(all(overlaplist[[j]] %in% nowbin)){
overlaplist[[j]]<-NA
}else if(max(nowbin)<max(overlaplist[[j]])){
break
}
}
}
overlaplist <- overlaplist[!vapply(overlaplist, function(x) all(is.na(x)), TRUE)]
newblocks <- matrix(NA, nrow=length(overlaplist), ncol=2)
rownames(newblocks)<-seq_len(length(overlaplist))
for(i in seq_len(length(overlaplist))){
if(length(overlaplist[[i]])==1){
newblocks[i,]<-Geneblocks[overlaplist[[i]],,drop=FALSE]
rownames(newblocks)[i]<-rownames(Geneblocks[overlaplist[[i]],,drop=FALSE])
}else{
newblocks[i,]<-c(min(Geneblocks[overlaplist[[i]],,drop=FALSE]),
max(Geneblocks[overlaplist[[i]],,drop=FALSE]))
rownames(newblocks)[i]<-paste(rownames(Geneblocks[overlaplist[[i]],,drop=FALSE]), collapse="/")
}
}
return(newblocks)
}
}
# LDblock construction and split Large regions
LDblockGeneMerge <- function(LDblocks.T, Geneblocks.M) {
LDblocks.T <- LDblocks.T[order(LDblocks.T[,1]),]
Geneblocks.M <- Geneblocks.M[order(Geneblocks.M[,1]),]
# for each gene blocks, we find the overlapping LD blocks
LDwithGene.list <- matrix(NA, nrow=dim(Geneblocks.M)[1], ncol=2)
rownames(LDwithGene.list) <-rownames(Geneblocks.M)
overlapLDBlocks <- NULL
for(i in seq_len(dim(Geneblocks.M)[1])){
LD1 <- max(which(LDblocks.T[,1] <= Geneblocks.M[i,1]))
LD2 <- min(which(LDblocks.T[,2] >= Geneblocks.M[i,2]))
LDwithGene.list[i,] <- c(LD1, LD2)
overlapLDBlocks <- c(overlapLDBlocks, LD1:LD2)
}
NonOverlapLDBlocks <- LDblocks.T[-overlapLDBlocks,]
## LDblocks which are not overlapped with gene regions
overlapLDGene <- NULL
NonoverlapLDgene <- NULL
i <- 1
while(i <= (dim(LDwithGene.list)[1])){
nowGeneR <- LDwithGene.list[i,, drop=FALSE]
if((LDwithGene.list[i,2] < LDwithGene.list[i+1,1])){
NonoverlapLDgene <- rbind(NonoverlapLDgene, nowGeneR)
# message(i);message(nowGeneR)
i <- i+1
}else{
merge.candi <- nowGeneR
for(j in (i+1):dim(LDwithGene.list)[1]){
if(max(LDwithGene.list[i:(j-1),])>=LDwithGene.list[(j),1]){
merge.candi<- rbind(merge.candi, LDwithGene.list[(j),,drop=FALSE])
if(j == dim(LDwithGene.list)[1]){
i <- j+1
}
}else{
i <- j
break
}
}
overlapLDGene <- c(overlapLDGene, list(merge.candi))
}
if(i == dim(LDwithGene.list)[1]){
nowGeneR <- LDwithGene.list[i,, drop=FALSE]
NonoverlapLDgene <- rbind(NonoverlapLDgene, nowGeneR)
# message(i);message(nowGeneR)
break
}
}
mergeLDgene <- NULL
if(length(overlapLDGene)>0){
mergeLDgene <- matrix(NA, length(overlapLDGene), 2)
rownames(mergeLDgene) <- seq_len(length(overlapLDGene))
for(i in seq_len(length(overlapLDGene))){
nowmerge <- overlapLDGene[[i]]
rownames(mergeLDgene)[i] <- paste(rownames(nowmerge), collapse="+")
mergeLDgene[i,1] <- min(nowmerge)
mergeLDgene[i,2] <- max(nowmerge)
}
}
finalGeneBlocks <- rbind(mergeLDgene, NonoverlapLDgene)
finalGeneBlocks <- finalGeneBlocks[order(finalGeneBlocks[,1]),]
for(i in seq_len(dim(finalGeneBlocks)[1])){
st <- LDblocks.T[finalGeneBlocks[i,1],1]
ed <- LDblocks.T[finalGeneBlocks[i,2],2]
finalGeneBlocks[i,] <-c(st, ed)
}
rownames(NonOverlapLDBlocks) <-rep("No", dim(NonOverlapLDBlocks)[1])
res <- rbind(finalGeneBlocks, NonOverlapLDBlocks)
res <- res[order(res[,1]),]
return(res)
}
# split big Gene region
splitBigLD <- function(GeneLDblocks, LDblocks.T, Geneblocks,maxsize){
BigN <- which(GeneLDblocks[,3]>maxsize)
BigGeneblocks <- GeneLDblocks[BigN,,drop=FALSE]
GeneLDblocks <- GeneLDblocks[-BigN,,drop=FALSE]
newblocks <- NULL
for (i in seq_len(dim(BigGeneblocks)[1])){
nowBlock <- BigGeneblocks[i,,drop=FALSE]
# nowSNPs<- nowBlock[1,1]:nowBlock[1,2]
intersectLD <- LDblocks.T[max(which(nowBlock[1,1]>=LDblocks.T[,1])):min(which(nowBlock[1,2]<=LDblocks.T[,2])),]
intersectLD[1,1] <- nowBlock[1,1]
intersectLD[dim(intersectLD)[1],2] <- nowBlock[1,2]
NintersectLD <- NULL
while(dim(intersectLD)[1]>0){
st <- intersectLD[1,1]
edposi <- max(which(intersectLD[,2]<(st+maxsize)))
ed <- intersectLD[edposi,2]
NintersectLD <- rbind(NintersectLD, c(st, ed))
intersectLD <- intersectLD[-seq_len(edposi),,drop=FALSE]
}
intersectLD <- NintersectLD
# if(all(apply(NintersectLD, 1, diff)<50)==FALSE) message(i)
rownames(intersectLD) <- rep(rownames(nowBlock), dim(intersectLD)[1])
blockL <- apply(intersectLD, 1, diff)+1
Addblocks <- cbind(intersectLD, blockL)
newblocks <- rbind(newblocks, Addblocks)
}
GeneLDblocks <- rbind(GeneLDblocks, newblocks)
GeneLDblocks <- GeneLDblocks[order(GeneLDblocks[,1]),]
return(GeneLDblocks)
}
# small region merging
mergeSmallRegion <- function(GeneLDblocks, maxsize, minsize) {
gname <- rownames(GeneLDblocks)
GeneLDblocks <- as.data.frame(GeneLDblocks, stringsAsFactors=FALSE)
# row.names(GeneLDblocks)<- NULL
# GeneLDblocks<- cbind(GeneLDblocks, gname, stringsAsFactors<- F)
smallbin <- NULL
completeBin <- NULL
while (dim(GeneLDblocks)[1] > 0) {
# message(dim(GeneLDblocks))
nowbin <- GeneLDblocks[1, ,drop=FALSE]
# if(nowbin[1,1]>75530) break
if (is.null(smallbin) & nowbin[1,3] >= minsize) {
completeBin <- rbind(completeBin, nowbin)
GeneLDblocks <- GeneLDblocks[-1, ,drop=FALSE]
} else if (is.null(smallbin) & nowbin[1,3] < minsize) {
smallbin <- nowbin
GeneLDblocks<- GeneLDblocks[-1, ,drop=FALSE]
} else if (smallbin[1, 3] + nowbin[1,3] < minsize) {
st <- min(smallbin[1, 1], smallbin[1, 2], nowbin[1, 1], nowbin[1, 2])
ed <- max(smallbin[1, 1], smallbin[1, 2], nowbin[1, 1], nowbin[1, 2])
blockL <- smallbin[1, 3] + nowbin[1,3]
smallbin <- data.frame(st, ed, blockL)
GeneLDblocks <- GeneLDblocks[-1, ,drop=FALSE]
} else if (smallbin[1, 3] + nowbin[1,3] >= minsize & smallbin[1, 3] + nowbin[1,3] <= maxsize) {
st <- min(smallbin[1, 1], smallbin[1, 2], nowbin[1, 1], nowbin[1, 2])
ed <- max(smallbin[1, 1], smallbin[1, 2], nowbin[1, 1], nowbin[1, 2])
blockL <- smallbin[1, 3] + nowbin[1,3]
completeBin <- rbind(completeBin, data.frame(st, ed, blockL))
smallbin <- NULL
GeneLDblocks <- GeneLDblocks[-1,,drop=FALSE]
} else if (smallbin[1, 3] + nowbin[1,3] > maxsize) {
lastbin <- completeBin[dim(completeBin)[1], ]
if(is.null(lastbin)){
message(c("Large-small-Large"))
message(rbind(lastbin, smallbin, nowbin))
completeBin <- rbind(completeBin, smallbin, nowbin)
smallbin <- NULL
}else if (lastbin[1,3] + smallbin[1,3] <= maxsize) {
st <- min(smallbin[1, 1], smallbin[1, 2], lastbin[1, 1], lastbin[1, 2])
ed <- max(smallbin[1, 1], smallbin[1, 2], lastbin[1, 1], lastbin[1, 2])
blockL <- smallbin[1, 3] + lastbin[1, 3]
completeBin <- completeBin[-dim(completeBin)[1], ,drop=FALSE]
completeBin <- rbind(completeBin, data.frame(st, ed, blockL))
smallbin <- NULL
} else {
message(c("Large-small-Large"))
message(rbind(lastbin, smallbin, nowbin))
completeBin <- rbind(completeBin, smallbin, nowbin)
smallbin <- NULL
GeneLDblocks <- GeneLDblocks[-1, ,drop=FALSE]
}
}
}
if(!is.null(smallbin) & is.null(completeBin)){
completeBin < -smallbin
}
if(!is.null(smallbin) & !is.null(completeBin)){
lastindex <- dim(completeBin)[1]
if(completeBin[lastindex,3,drop=FALSE]+smallbin[1,3] > maxsize){
completeBin <- rbind(completeBin, smallbin)
}else{
nowbin <- c(completeBin[lastindex, 1], smallbin[1,2], completeBin[lastindex, 3] + smallbin[1,3])
completeBin[lastindex,seq_len(3)] <- nowbin
}
}
return(completeBin)
}
# naming each region
namingRegion2 <- function(GeneLDblocks, genelist, chrSNPinfo) {
st.bp <- chrSNPinfo[GeneLDblocks[,1],3]
ed.bp <- chrSNPinfo[GeneLDblocks[,2],3]
bpBlocks <- cbind(GeneLDblocks[,seq_len(3), drop=FALSE], st.bp, ed.bp)
FinalGeneLDblocks <- NULL
PartN <- 1
Pgene <- NULL
gene <- NULL
Ngene <- NULL
gname <- rep(NA, dim(bpBlocks)[1])
FinalGeneLDblocks<- data.frame(bpBlocks, gname)
for (i in seq_len(dim(bpBlocks)[1])){
# if (i%%10 == 0)
# message(i)
nowloca <- bpBlocks[i,4:5,drop=FALSE]
intro.index <- (which(genelist[,4]<nowloca[1,1]))
intro.index1 <- setdiff(seq_len(dim(genelist)[1]), intro.index)
outro.index <- (which(genelist[,3]>nowloca[1,2]))
outro.index1 <- setdiff(seq_len(dim(genelist)[1]), outro.index)
common.index <- intersect(intro.index1, outro.index1)
if(length(common.index)==0){
if(length(intro.index)==0){
#before gene
gname <- paste("before-", as.character(genelist[1, 1]), sep="")
}else if(length(outro.index)==0){
#after gene
gname <- paste("after-", as.character(genelist[dim(genelist)[1], 1]), sep="")
}else{
intronum <- which(genelist[,4]==max(genelist[intro.index,4]))
outronum <- which(genelist[,3]==min(genelist[outro.index,3]))
gname <- paste("inter-", paste(as.character(genelist[intronum, 1]),collapse = "/"),":",paste(as.character(genelist[outronum, 1]), collapse = "/"), sep="")
}
}else{
subgenelist <- genelist[common.index,,drop=FALSE]
gname <- as.character(subgenelist[1,1])
nowregion <- c(subgenelist[1,3], subgenelist[1,4])
if(length(common.index)>1){
for(k in 2:dim(subgenelist)[1]){
nextregion <- subgenelist[k,3:4]
if(max(nowregion)<min(nextregion)){
gname <- paste(c(gname, as.character(subgenelist[k,1])), collapse="+")
nowregion <- subgenelist[k,3:4]
}else{
gname <- paste(c(gname, as.character(subgenelist[k,1])), collapse="/")
candi.r <- rbind(nowregion, nextregion)
nowregion <- c(min(candi.r)[1], max(candi.r)[1])
}
}
}
}
FinalGeneLDblocks$gname[i]<-gname
}
# part numbering
FinalGeneLDblocks <- data.frame(FinalGeneLDblocks, 0)
FinalGeneLDblocks[1, 7] <- 1
for (i in 2:dim(FinalGeneLDblocks)[1]) {
ifelse(FinalGeneLDblocks[i - 1, 6] == FinalGeneLDblocks[i, 6],
FinalGeneLDblocks[i, 7] <- FinalGeneLDblocks[(i - 1), 7] + 1, FinalGeneLDblocks[i, 7] <- 1)
# if(i%%10==0) message(i)
}
Finalnames <- apply(FinalGeneLDblocks, 1, function(x) {
# paste(as.character(x[6]), '-part', as.character(x[7]), sep='')
paste(c(x[6], "-part", as.numeric(x[7])), collapse="")
})
FinalGeneLDblocks <- cbind(FinalGeneLDblocks[, seq_len(5)], Finalnames)
return(FinalGeneLDblocks)
}
geneinfoExt <- function(geneinfofile=geneinfofile, geneDB = geneDB, assembly = assembly, geneid = geneid,
ensbversion = ensbversion, chrNs = chrNs){
if(!is.null(geneinfofile)){
message("load gene information from inputed file")
geneinfo <- data.table::fread(geneinfofile, stringsAsFactors = FALSE)
geneinfo <- data.frame(geneinfo)
colnames(geneinfo) <- c("genename", "chr", "start.bp", "end.bp")
}else if(geneDB == "ensembl"){
if(assembly=="GRCh38"){
message("load gene information from ensembl (assembly GRCh38)")
grch <- 38
ensembl <- biomaRt::useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl", version=ensbversion)#, version=version
if(geneid!="ensembl_gene_id"){
geneinfo <- biomaRt::getBM(attributes = c(geneid, 'ensembl_gene_id', 'chromosome_name','start_position', 'end_position'), #external_gene_name
filters = 'chromosome_name',
value = chrNs,
mart = ensembl)
geneinfo[(geneinfo[,1]==""),1]<-geneinfo[(geneinfo[,1]==""),2]
geneinfo<-geneinfo[,c(1,3,4,5)]
}else{
geneinfo <- biomaRt::getBM(attributes = c(geneid, 'chromosome_name','start_position', 'end_position'), #external_gene_name
filters = 'chromosome_name',
value = chrNs,
mart = ensembl)
}
# geneinfo <- geneinfo[geneinfo[,1]!="",]
colnames(geneinfo) <- c("genename", "chr", "start.bp", "end.bp")
}else if(assembly=="GRCh37"){
message("load gene information from ensembl (assembly GRCh37)")
ensembl <- biomaRt::useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl", GRCh = 37)#, version=version
if(geneid!="ensembl_gene_id"){
geneinfo <- biomaRt::getBM(attributes = c(geneid, 'ensembl_gene_id', 'chromosome_name','start_position', 'end_position'), #external_gene_name
filters = 'chromosome_name',
value = chrNs,
mart = ensembl)
geneinfo[(geneinfo[,1]==""),1]<-geneinfo[(geneinfo[,1]==""),2]
geneinfo<-geneinfo[,c(1,3,4,5)]
}else{
geneinfo <- biomaRt::getBM(attributes = c(geneid, 'chromosome_name','start_position', 'end_position'), #external_gene_name
filters = 'chromosome_name',
value = chrNs,
mart = ensembl)
}
# geneinfo <- geneinfo[geneinfo[,1]!="",]
colnames(geneinfo) <- c("genename", "chr", "start.bp", "end.bp")
}else{
stop("wrong assembly")
}
}else if(geneDB == "ucsc"){
if(assembly == "GRCh37"){
message("load gene information from UCSC genome browser (assembly GRCh37)")
Homo.sapiens <- Homo.sapiens::Homo.sapiens
cls <- AnnotationDbi::columns(Homo.sapiens::Homo.sapiens)
cls <- cls[match(c("TXCHROM","TXEND","TXSTART"), cls)]
kts <- AnnotationDbi::keytypes(Homo.sapiens)
if(geneid == 'hgnc_symbol'){
gene_id <- "SYMBOL"
}else{
gene_id <- geneid
}
kt <- kts[match(c(gene_id), kts)]
ks <- AnnotationDbi::keys(Homo.sapiens, keytype=kt)
res <- AnnotationDbi::select(Homo.sapiens, keys=ks, columns=cls, keytype=kt)
geneinfo = res[res$TXCHROM %in% paste("chr", chrNs, sep = ""),]
geneinfo$TXCHROM <- gsub(pattern = "chr", replacement = "", geneinfo$TXCHROM)
class(geneinfo$TXCHROM)<-"integer"
colnames(geneinfo) <- c("genename", "chr", "start.bp", "end.bp")
} else if(assembly == "GRCh38"){
message("load gene information from UCSC genome browser (assembly GRCh38)")
Homo.sapiens <- Homo.sapiens::Homo.sapiens
OrganismDbi::TxDb(Homo.sapiens) <- TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene
cls <- AnnotationDbi::columns(Homo.sapiens)
cls <- cls[match(c("TXCHROM","TXEND","TXSTART"), cls)]
kts <- AnnotationDbi::keytypes(Homo.sapiens)
if(geneid == 'hgnc_symbol'){
gene_id <- "SYMBOL"
}else{
gene_id <- geneid
}
kt <- kts[match(c(gene_id), kts)]
ks <- AnnotationDbi::keys(Homo.sapiens, keytype=kt)
res <- AnnotationDbi::select(Homo.sapiens, keys=ks, columns=cls, keytype=kt)
geneinfo = res[res$TXCHROM %in% paste("chr", chrNs, sep = ""),]
geneinfo$TXCHROM <- gsub(pattern = "chr", replacement = "", geneinfo$TXCHROM)
class(geneinfo$TXCHROM)<-"integer"
colnames(geneinfo) <- c("genename", "chr", "start.bp", "end.bp")
}else{
stop("wrong assembly")
}
}else{
stop("wrong DB name!")
}
return(geneinfo)
}
#gene overlap merging : combine gene region if there are genes with same name
combineOverlapSamegene = function(genelist){
combgene <- names(table(genelist[,1])[table(genelist[,1])>1])
for(cgene in combgene){
subgeneinfo <- genelist[genelist[,1]==cgene,]
subgeneinfo <- subgeneinfo[order(subgeneinfo[,3]),]
genelist <- genelist[-which(genelist[,1]==cgene),]
addgene <- subgeneinfo[1,,drop=FALSE]
for(i in 2:dim(subgeneinfo)[1]){
if(addgene[nrow(addgene),4]<subgeneinfo[i,3]){
addgene <- rbind(addgene, subgeneinfo[i,])
}else{
addgene[nrow(addgene),4] <- max(subgeneinfo[i,4])
}
}
genelist <- rbind(genelist, addgene)
}
genelist <- genelist[order(genelist[,3]),,drop=FALSE]
return(genelist)
}
# MAIN FUNCTIONS ------------------------------------------
# CLQD < input >
#' @title partitioning into cliques
#' @name CLQD
#' @aliases CLQD
#' @description \code{CLQD} partitioning the given data into subgroups that contain SNPs which are highly correlated.
#' @usage CLQD(geno, SNPinfo, CLQcut=0.5, clstgap=40000,
#' hrstType=c("near-nonhrst", "fast", "nonhrst"), hrstParam=200,
#' CLQmode=c("density", "maximal"), LD=c("r2", "Dprime"))
#' @param geno Data frame or matrix of additive genotype data, each column is additive genotype of each SNP. (Use data of non-monomorphic SNPs)
#' @param SNPinfo Data frame or matrix of SNPs information. 1st column is rsID and 2nd column is bp position.
#' @param CLQcut Numeric constant; a threshold for the LD measure value |r|, between 0 to 1. Default 0.5.
#' @param clstgap Numeric constant; a threshold of physical distance (bp) between two consecutive SNPs
#' which do not belong to the same clique, i.e., if a physical distance between two consecutive SNPs in a clique
#' greater than \code{clstgap}, then the algorithm split the cliques satisfying each
#' clique do not contain such consecutive SNPs. Default 40000.
#' @param hrstType Character constant; heuristic methods.
#' If you want to do not use heuristic algorithm, set \code{hrstType = "nonhrst"}.
#' If you want to use heuristic algorithm suggested in Kim et al.,(2017), set \code{hrstType = "fast"}.
#' That algorithm is fastest heuristic algorithm and suitable when your memory capacity is not greater than 8GB.
#' If you want to obtain the results similar to the that of non-heuristic algorithm, set \code{hrstType = "near-nonhrst"}.
#' @param hrstParam Numeric constant; parameter for heuristic algorithm "near-nonhrst".
#' Default is \code{200}. It is recommended that you set the parameter to greater than 150.
#' @param CLQmode Character constant; the way to give priority among detected cliques.
#' if \code{CLQmode = "density"} then the algorithm gives priority to the clique of largest value of \eqn{(Number of SNPs)/(range of clique)},
#' else if \code{CLQmode = "maximal"}, then the algorithm gives priority to the largest clique. The default is "density".
#' @param LD Character constant; LD measure to use, "r2" or "Dprime". Default "r2".
# <output>
#' @return A vector of cluster numbers of all SNPs (\code{NA} represents singleton cluster).
#'
#' @seealso \code{\link{BigLD}}
#' @examples
#'
#' data(geno)
#' data(SNPinfo)
#' CLQD(geno=geno[,1:100],SNPinfo=SNPinfo[1:100,])
#' \dontrun{
#' CLQD(geno=geno[,1:100],SNPinfo=SNPinfo[1:100,], CLQmode = 'maximal')
#' CLQD(geno=geno[,1:100],SNPinfo=SNPinfo[1:100,], LD='Dprime')
#'}
#' @author Sun-Ah Kim <sunny03@snu.ac.kr>, Yun Joo Yoo <yyoo@snu.ac.kr>
#' @importFrom stats cor median quantile
#' @importFrom utils tail
#' @importFrom Rcpp sourceCpp
#' @export
CLQD <- function(geno, SNPinfo, CLQcut=0.5, clstgap=40000, hrstType=c("near-nonhrst", "fast", "nonhrst"), hrstParam=200,
CLQmode=c("density", "maximal"), LD=c("r2", "Dprime"))
{
########################################################################################################
skipRatio <- 0
CLQmode <- match.arg(CLQmode)
LD <- match.arg(LD)
hrstType <- match.arg(hrstType)
geno <- as.matrix(geno)
# filtering all NA SNPs
allNASNPs = apply(geno, 2, function(x) all(is.na(x)))
geno<-geno[, !allNASNPs]
SNPinfo<-SNPinfo[!allNASNPs, ]
# Main Function
SNPbps <- SNPinfo[, 3]
if(LD == "r2"){
OCM <- suppressWarnings(cor(geno, use="pairwise.complete.obs"))
diag(OCM) <- 0
OCM[is.na(OCM)]<-0
OCM[abs(OCM) < CLQcut] <- 0
OCM[abs(OCM) >= CLQcut] <- 1
}else if(LD == "Dprime"){
OCM <- genoDp(geno)
OCM[(OCM==Inf)|(OCM==-Inf)] <- 0
}
binvector <- rep(NA, dim(OCM)[2])
binnum <- 1
# re.SNPbps <- SNPbps
if(all(OCM==0)) return(binvector)
# test graph complexity
OCM1 <- OCM
# bothhigh <- (degs>=400 & cores>=400)
if(hrstType == "nonhrst"){
heuristic <- FALSE
}else if(hrstType == "near-nonhrst"){
heuristic <- TRUE
heuristicNum <- 0
while(heuristic == TRUE){
g <- igraph::graph_from_adjacency_matrix(OCM1, mode="undirected", weighted=TRUE, diag=FALSE, add.colnames=NA)
cores <- igraph::coreness(g)
highcore <- sum(cores>=hrstParam)
local_cores <- table(cores[cores>=(hrstParam)])
local_cores <- local_cores[local_cores>=(hrstParam)]
if(length(local_cores>0)){
if(heuristicNum==0){
message("Use near-CLQ heuristic procedure!!")
heuristicNum <- heuristicNum+1
message(paste("hueristic loop", heuristicNum))
}else{
heuristicNum <- heuristicNum+1
message(paste("hueristic loop", heuristicNum))
}
# find dense region
# local_cores <- table(cores[cores>=(hrstParam)])
# local_cores <- local_cores[local_cores>=(hrstParam)]
local_hrstParam <- names(local_cores[which(as.integer(names(local_cores))==max(as.integer(names(local_cores))))])
local_hrstParam <- as.numeric(local_hrstParam)
bothhighSNPs <- which(cores == local_hrstParam)
SNPset1 <- which(is.na(binvector))[bothhighSNPs]
nowOCM <- OCM[SNPset1, SNPset1]
heuristicBins <- heuristicCLQ(nowOCM, hrstParam)
binvector[SNPset1[heuristicBins]]<-binnum
binnum <- binnum+1
OCM1 <- OCM[is.na(binvector), is.na(binvector)]
}else{
# if(heuristicNum ==0){
# # message("We do not need heuristic procedure")
# }else{
# message("End new heuristic procedure")
# }
heuristic <- FALSE
}
} #end while
}else if(hrstType == "fast"){
checkLargest <- TRUE
r2Mat <- OCM
re.SNPbps <- SNPinfo[,3]
# firstTerm = T
while(checkLargest == TRUE){
g <- igraph::graph_from_adjacency_matrix(r2Mat, mode = "undirected", weighted = TRUE, diag = FALSE, add.colnames = NA)
compo = igraph::components(g)
componum = which(compo$csize==max(compo$csize))[1]
compov = which(compo$membership==componum)
compadjM = OCM1[compov, compov]
cg = igraph::graph_from_adjacency_matrix(compadjM, mode = "undirected", weighted = TRUE, diag = FALSE, add.colnames = NA)
if((median(igraph::coreness(cg))>80 & max(igraph::coreness(cg))>100)| (quantile(igraph::coreness(cg), 0.75)>100 & max(igraph::coreness(cg))>100)){
message("use fast heuristic procedure!")
# if(quantile(degrees, 0.7) == 1) break
# message(head((quantile(degrees, 0.7))))
degrees = apply(r2Mat, 1, sum)
maxdegv = which(degrees >=max(quantile(degrees, 0.7), 80))
# if(length(maxdegv)>=1){
maxdegvs = maxdegv
edgeDens = NULL
for(maxdegv in maxdegvs){
Bignbds = which(r2Mat[maxdegv,, drop = FALSE]>0, arr.ind = TRUE)
Bignbds.c = unique(Bignbds[,2])
newr2Mat = r2Mat[Bignbds.c,Bignbds.c]
EdgeDen = sum(newr2Mat)/((dim(newr2Mat)[1])*(dim(newr2Mat)[1]-1))
edgeDens = c(edgeDens, EdgeDen)
}
maxdegvs = maxdegvs[order(edgeDens, decreasing = TRUE)]
edgeDens = edgeDens[order(edgeDens, decreasing = TRUE)]
degv = maxdegvs[1]
edgeD = edgeDens[1]
Bignbds = which(r2Mat[degv,, drop = FALSE]>0, arr.ind = TRUE)
Bignbds.c = unique(Bignbds[,2])
# maxiC = maximal.cliques(g, min = dim(OCM1)[1]*0.9)
# largestOneRange = range(Bignbds.c)
# largestSNPn = diff(largestOneRange)
# largestCsize = length(Bignbds.c)
nowSNPsbp = re.SNPbps[Bignbds.c]
nowSNPsbploca = match(nowSNPsbp, SNPbps)
binvector[nowSNPsbploca] <- binnum
binnum = binnum + 1
r2Mat <- r2Mat[-Bignbds.c, -Bignbds.c, drop = FALSE]
OCM1 <- OCM1[-Bignbds.c, -Bignbds.c, drop = FALSE]
re.SNPbps <- re.SNPbps[-Bignbds.c]
# message("case2")
checkLargest = TRUE
if(length(re.SNPbps)<500) checkLargest = FALSE
}else{
checkLargest = FALSE
}
}
}
# binvector splitting by clstgap
if(all(is.na(binvector))==FALSE){
binveclist <- lapply(seq_len(max(binvector, na.rm = TRUE)),
function(x) SNPinfo[,3][which(binvector == x)])
binvecbplist <- newSplitCliques(binveclist, clstgap)
binvecbpindex <- lapply(binvecbplist, function(x) match(x, SNPinfo[,3]))
binvecbpindex <- binvecbpindex[vapply(binvecbpindex, length, c(1))>1]
binvector <- rep(NA, length(binvector))
for(i in seq_len(length(binvecbpindex))){
binvector[binvecbpindex[[i]]]<-i
}
}
# take Toooo Big block First!
r2Mat <- OCM[is.na(binvector), is.na(binvector)]
if(sum(is.na(binvector))<=1 | (sum(r2Mat)==0)) return(binvector)
g <- igraph::graph_from_adjacency_matrix(r2Mat, mode="undirected", weighted=TRUE, diag=FALSE, add.colnames=NA)
max.cliques <- igraph::max_cliques(g, min=2)
if(length(max.cliques)==0) stop("max.cliques is empty")
re.SNPbps <- SNPbps[is.na(binvector)]
bp.cliques <- lapply(max.cliques, function(x) re.SNPbps[x])
split.bp.cliques <- newSplitCliques(bp.cliques, clstgap)
# reduce the number of maximal clique? (candidate) or
# modify density function. narrow SNP distance, so small cliques are chosen preferencely.
repeat {
# message(binnum)
if (all(is.na(binvector) == FALSE)) {
break
}
if(length(split.bp.cliques)==0) break
now.split.bp.cliques <- split.bp.cliques
if(CLQmode =="density"){
density.v <- vapply(now.split.bp.cliques, function(x) ((length(x)))/(diff(range(x))/1000), 1)
}else{
density.v <- vapply(now.split.bp.cliques, length, 1)
}
max.d <- which(density.v == max(density.v))
max.cluster <- now.split.bp.cliques[max.d]
if (length(max.cluster) > 1) {
# if there are two bins of same density, then we choose the bigger one.
max.cluster <- max.cluster[order(vapply(max.cluster, length, 1), decreasing=TRUE)]
}
max.cluster <- max.cluster[[1]]
max.cluster.od <- match(max.cluster, re.SNPbps)
# if (codechange == TRUE) {
# max.cluster.od <- ChooseMaximal(max.cluster.od, CLQcut, OCM)
# max.cluster <- re.SNPbps[max.cluster.od]
# }
## excluding all SNPs in max.cluster from re.SNPbps
split.bp.cliques <- lapply(split.bp.cliques, function(x) setdiff(x, max.cluster))
split.bp.cliques <- unique(split.bp.cliques)
split.bp.cliques <- split.bp.cliques[which(vapply(split.bp.cliques, length, 1) > 1)]
binvector[match(max.cluster, SNPbps)] <- binnum
binnum=binnum + 1
# r2Mat <- r2Mat[-max.cluster.od, -max.cluster.od]
# OCM <- OCM[-max.cluster.od, -max.cluster.od]
# re.SNPbps <- setdiff(re.SNPbps, max.cluster)
if (length(re.SNPbps) < 2) {
break
}
if(length(split.bp.cliques)==0) break
# message(sum(is.na(binvector)))
} ##end repeat
return(binvector)
}
# Big-LD <input>
#' @title Estimation of LD block regions
#' @name BigLD
#' @aliases BigLD
#' @description \code{BigLD} returns the estimation of LD block regions of given data.
#' @usage BigLD(geno=NULL, SNPinfo=NULL,genofile=NULL, SNPinfofile=NULL,
#' cutByForce=NULL, LD=c("r2", "Dprime"), CLQcut=0.5,
#' clstgap=40000, CLQmode=c("density", "maximal"),
#' leng=200, subTaskSize=1500, MAFcut=0.05, appendRare=FALSE,
#' hrstType=c("near-nonhrst", "fast", "nonhrst"),
#' hrstParam=200, chrN=NULL, startbp=-Inf, endbp=Inf)
#' @param geno Data frame or matrix of additive genotype data, each column is additive genotype of each SNP.
#' @param SNPinfo Data frame or matrix of SNPs information. 1st column is rsID and 2nd column is bp position.
#' @param genofile Character constant; Genotype data file name (supporting format: .txt, .ped, .raw, .traw, .vcf).
#' @param SNPinfofile Character constant; SNPinfo data file name (supporting format: .txt, .map).
#' @param cutByForce Data frame; information of SNPs which are forced to be the last SNP LD block boundary.
#' @param LD Character constant; LD measure to use, r2 or Dprime. Default "r2".
#' @param CLQcut Numeric constant; a threshold for the LD measure value |r|, between 0 to 1. Default 0.5.
#' @param clstgap Numeric constant; a threshold of physical distance (bp) between two consecutive SNPs
#' which do not belong to the same clique, i.e., if a physical distance between two consecutive SNPs in a clique
#' greater than \code{clstgap}, then the algorithm split the cliques satisfying each
#' clique do not contain such consecutive SNPs. Default 40000.
#' @param CLQmode Character constant; the way to give priority among detected cliques.
#' if \code{CLQmode = "density"} then the algorithm gives priority to the clique of largest value of \eqn{(Number of SNPs)/(range of clique)},
#' else if \code{CLQmode = "maximal"}, then the algorithm gives priority to the largest clique. The default is "density".
#' @param leng Numeric constant; the number of SNPs in a preceding and a following region
#' of each sub-region boundary, every SNP in a preceding and every SNP in a following region need to be in weak LD. Default 200.
#' @param subTaskSize Numeric constant; upper bound of the number of SNPs in a one-take sub-region. Default 1500.
#' @param MAFcut Numeric constant; the MAF threshold. Default 0.05.
#' @param appendRare logical; if TRUE, the function append rare variants with MAF<MAFcut to the constructed blocks.
#' @param hrstType Character constant; heuristic methods.
#' If you want to do not use heuristic algorithm, set \code{hrstType = "nonhrst"}.
#' If you want to use heuristic algorithm suggested in Kim et al.,(2017), set \code{hrstType = "fast"}.
#' That algorithm is fastest heuristic algorithm and suitable when your memory capacity is not greater than 8GB.
#' If you want to obtain the results similar to the that of non-heuristic algorithm, set \code{hrstType = "near-nonhrst"}.
#' @param hrstParam Numeric constant; parameter for heuristic algorithm "near-nonhrst".
#' Default is \code{200}. It is recommended that you set the parameter to greater than 150.
#' @param chrN Numeric(or Character) constant (or vector); chromosome number to use.
#' @param startbp Numeric constant; starting bp position of the \code{chrN}. Default -Inf.
#' @param endbp Numeric constant; last bp position of the \code{chrN}. Default Inf.
# <output>
#' @return A data frame of block estimation result.
#' Each row of data frame shows the starting SNP and end SNP of each estimated LD block.
#'
#' @author Sun-Ah Kim <sunny03@snu.ac.kr>, Yun Joo Yoo <yyoo@snu.ac.kr>
#'
#'
#'
#' @seealso \code{\link{CLQD}}, \code{\link{LDblockHeatmap}}
#'
#' @examples
#'
#' data(geno)
#' data(SNPinfo)
#' BigLD(geno=geno, SNPinfo=SNPinfo, startbp = 16058400, endbp = 16076500)
#'
#' \dontrun{
#' BigLD(geno, SNPinfo, LD = "Dprime")
#' BigLD(geno, SNPinfo, CLQcut = 0.5, clstgap = 40000, leng = 200, subTaskSize = 1500)
#' }
#' @importFrom stats cor median quantile
#' @importFrom utils tail
#' @importFrom Rcpp sourceCpp
#' @export
BigLD <- function(geno=NULL, SNPinfo=NULL,genofile=NULL, SNPinfofile=NULL, cutByForce=NULL,
LD=c("r2", "Dprime"), CLQcut=0.5, clstgap=40000, CLQmode=c("density", "maximal"),
leng=200, subTaskSize=1500, MAFcut=0.05, appendRare=FALSE,
hrstType=c("near-nonhrst", "fast", "nonhrst"), hrstParam=200,
chrN=NULL, startbp=-Inf, endbp=Inf)
{
skipRatio=0.0
CLQmode <- match.arg(CLQmode)
hrstType <- match.arg(hrstType)
LD <- match.arg(LD)
# data preparation
if(!is.null(genofile)){
inputdata <- dataPreparation(genofile, SNPinfofile, geno, SNPinfo, chrN=chrN, startbp=startbp, endbp=endbp)
geno <- inputdata[[1]]
SNPinfo <- inputdata[[2]]
chrNs <- unique(SNPinfo[,1])
}else{
if(is.null(geno) |is.null(SNPinfo)){
stop("Need input data (or files)")
}
if(ncol(geno) != nrow(SNPinfo)){
stop("The number of SNPs in geno and SNPinfo are not matched")
}
geno <- as.matrix(geno)
# header generation
if(is.null(chrN)) chrN <- unique(SNPinfo[,1])
colnames(SNPinfo) <- c("chrN", "rsID", "bp")
if(startbp != -Inf | endbp != Inf){
if((length(chrN)>1) | (length(unique(SNPinfo[,1]))>1))
stop("If your input data include more than one chromosome or you choose more than one chromosome,
then do not specify the startbp and endbp!")
SNPloca <- which(SNPinfo$bp>=startbp & SNPinfo$bp<=endbp & SNPinfo$chrN == chrN)
SNPinfo <- SNPinfo[SNPloca,]
geno <- geno[,SNPloca]
}
colnames(geno)<- SNPinfo$rsID
chrNs <- unique(SNPinfo[,1])
}
# filtering all NA SNPs
allNASNPs = apply(geno, 2, function(x) all(is.na(x)))
geno<-geno[, !allNASNPs]
SNPinfo<-SNPinfo[!allNASNPs, ]
totalLDBres <- NULL
totalCLQres <- rep(NA,dim(SNPinfo)[1])
stindex <- 0
if(ncol(geno)!=nrow(SNPinfo)) stop("The number of SNPs in geno data and SNPinfo data does not match")
# for each chrN, seperately apply BigLD algorithm
for(chrN in chrNs){
message(paste("working chromosome name:", chrN))
chrSNPinfo <- SNPinfo[(SNPinfo[,1]==chrN),,drop=FALSE]
chrgeno <- geno[,(SNPinfo[,1]==chrN),drop=FALSE]
if(dim(chrSNPinfo)[1]==1)next
# pruning by MAF ---
message(paste("remove SNPs with MAF <", MAFcut))
MAF <- apply(chrgeno, 2, function(x) mean(x,na.rm=TRUE)/2)
MAF_ok <- ifelse(MAF>=0.5,1-MAF,MAF)
MAF <- MAF_ok
message(paste("Number of SNPs after prunning SNPs with MAF<", MAFcut, ":", sum(MAF>=MAFcut)))
underMAFgeno <- chrgeno[,MAF<MAFcut]
chrgeno <- chrgeno[,MAF>=MAFcut]
underMAFSNPinfo <- chrSNPinfo[MAF<MAFcut,]
chrSNPinfo <- chrSNPinfo[MAF>=MAFcut,]
chrLDBres <- matrix(NA, dim(chrSNPinfo)[1],2)
chrCLQres <- rep(NA, dim(chrSNPinfo)[1])
# divide whole sequence into sub-task size segments.
chrcutbyforce = cutByForce[cutByForce[,1] == chrN, ]
cutpoints.all<-cutsequence(geno = chrgeno, SNPinfo = chrSNPinfo, leng = leng,
subTaskSize = subTaskSize, LD=LD, clstgap = clstgap, CLQcut = CLQcut, cutByForce = chrcutbyforce)
cutpoints <- cutpoints.all[[1]]
atfcut <- (cutpoints.all[[2]])
if (!is.null(atfcut)){
atfcut <- sort(atfcut)
}
cutpoints <- setdiff(cutpoints, 0)
cutblock <- cbind(c(1, cutpoints + 1), c(cutpoints, dim(chrgeno)[2]))
cutblock <- cutblock[-(dim(cutblock)[1]), , drop=FALSE]
cutblock <- cutblock[cutblock[,1]!=cutblock[,2],, drop=FALSE]
# for each subtaskregion, adapt BigLD algorithm ------------------
for(i in seq_len(dim(cutblock)[1])){
nowst <- cutblock[i, 1]
nowed <- cutblock[i, 2]
subgeno <- chrgeno[, nowst:nowed]
subSNPinfo <- chrSNPinfo[nowst:nowed, ]
message(paste(Sys.time()," | ", "chr",chrN,":",min(subSNPinfo[,3]), "-" ,max(subSNPinfo[,3]), " | sub-region: ", i,"/",dim(cutblock)[1], sep = ""))
subbinvec <- CLQD(geno = subgeno, SNPinfo = subSNPinfo, CLQcut = CLQcut, clstgap = clstgap, hrstType=hrstType,
hrstParam = hrstParam, CLQmode = CLQmode, LD = LD)
# chrCLQres <- c(chrCLQres, subbinvec)
chrCLQres[nowst: nowed]<-subbinvec
cat('CLQ done!\r')
if(all(is.na(subbinvec) == TRUE)) next;
bins <- seq_len(max(subbinvec[which(!is.na(subbinvec))]))
clstlist <- lapply(bins, function(x) which(subbinvec == x))
clstlist <- lapply(clstlist, sort) ###
clstlist <- clstlist[order(vapply(clstlist, min, 1))] ###
nowLDblocks <- constructLDblock(clstlist, subSNPinfo)
nowLDblocks <- nowLDblocks[order(nowLDblocks[, 1]), , drop = FALSE]
cat('constructLDblock done!\r')
nowLDblocks.bp <- cbind(subSNPinfo[nowLDblocks[,1],3], subSNPinfo[nowLDblocks[,2],3])
nowLDblocks <- nowLDblocks + (cutblock[i, 1] - 1)
nowLDblocks <- nowLDblocks[order(nowLDblocks[, 1]), , drop = FALSE]
preleng1 <- length(which(!is.na(chrLDBres[, 1])))
chrLDBres[(preleng1 + 1):(preleng1 + dim(nowLDblocks)[1]), ] <- nowLDblocks
}
doneLDblocks <- chrLDBres[which(!is.na(chrLDBres[, 1])), , drop = FALSE]
largeLDblocks <- doneLDblocks[apply(doneLDblocks, 1, diff)>5,, drop = FALSE]
# atfcut -- LD block reconst ----------
# newLDblocks <- matrix(NA, dim(chrSNPinfo)[1], 2)
if(length(atfcut)!=0){
addingBlocks <- NULL
message("handling the ambiguous sub-task regions..")
for(i in atfcut){
print(paste("ambiguous boundary", i, "\r"))
if(length(which(largeLDblocks[,1]>i))==0) break
st <- largeLDblocks[max(which(largeLDblocks[,2]<=i)),1]
ed <- largeLDblocks[min(which(largeLDblocks[,1]>i)),2]
# saveBlocks <- doneLDblocks[doneLDblocks[,1]<st,, drop = FALSE]
delIndex <- which(doneLDblocks[,1]>=st & doneLDblocks[,2]<=ed)
doneLDblocks <- doneLDblocks[-delIndex,]
# if(dim(saveBlocks)[1] != 0){
# newLDblocks[which(is.na(newLDblocks)==TRUE)[1]:(which(is.na(newLDblocks)==TRUE)[1] +dim(saveBlocks)[1]),] <- saveBlocks
# }
subgeno <- chrgeno[, st:ed]
subSNPinfo <- chrSNPinfo[st:ed, ]
subbinvec <- CLQD(geno = subgeno, SNPinfo = subSNPinfo, CLQcut = CLQcut, clstgap = clstgap, LD = LD, hrstType=hrstType,
hrstParam = hrstParam)
# chrCLQres <- c(chrCLQres, subbinvec)
chrCLQres[st: ed] <- subbinvec #### change vector procedure
cat('CLQ done!\r')
bins <- seq_len(max(subbinvec[which(!is.na(subbinvec))]))
clstlist <- lapply(bins, function(x) which(subbinvec == x))
clstlist <- lapply(clstlist, sort) ###
clstlist <- clstlist[order(vapply(clstlist, min, 1))] ###
nowLDblocks <- constructLDblock(clstlist, subSNPinfo)
nowLDblocks <- nowLDblocks + (st - 1)
nowLDblocks <- nowLDblocks[order(nowLDblocks[, 1]), , drop=FALSE]
addingBlocks <- rbind(addingBlocks, nowLDblocks)
}
addingBlocks <- rbind(doneLDblocks, addingBlocks)
addingBlocks <- addingBlocks[order(addingBlocks[, 2]), , drop=FALSE]
addingBlocks <- addingBlocks[order(addingBlocks[, 1]), , drop=FALSE]
for(i in 1:(dim(addingBlocks)[1]-1)){
if(addingBlocks[i,2]>=addingBlocks[(i+1),2]){
addingBlocks[(i+1),]<-c(min(addingBlocks[i:(i+1),]), max(addingBlocks[i:(i+1),]))
addingBlocks[i,1] <- NA
next
}
if(addingBlocks[i,2]>=addingBlocks[(i+1),1]){
addingBlocks[(i+1),] <-c(min(addingBlocks[i:(i+1),]), max(addingBlocks[i:(i+1),]))
addingBlocks[i,1] <- NA
next;
}
}
newLDblocks <- addingBlocks[!is.na(addingBlocks[,1]),]
}else{
newLDblocks<-doneLDblocks
} # LDblock reconst ...atfcut
start.index <- newLDblocks[,1]
end.index <- newLDblocks[,2]
start.rsID <- as.character(chrSNPinfo[newLDblocks[,1],2])
end.rsID <- as.character(chrSNPinfo[newLDblocks[,2],2])
start.bp <- chrSNPinfo[newLDblocks[,1],3]
end.bp <- chrSNPinfo[newLDblocks[,2],3]
chrLDblocks <- data.frame(start.index, end.index, start.rsID, end.rsID, start.bp, end.bp)
chrSNPinfo <- SNPinfo[(SNPinfo[,1]==chrN),,drop=FALSE]
chrLDblocks$start.index<- match(chrLDblocks[,5], chrSNPinfo[,3])
chrLDblocks$end.index<- match(chrLDblocks[,6], chrSNPinfo[,3])
#append rare variants --------------------
if(appendRare==TRUE){
message( "Append rare variants!")
chrgeno <- geno[,(SNPinfo[,1]==chrN),drop=FALSE]
chrLDblocks<-appendcutByForce(chrLDblocks, chrgeno, chrSNPinfo, CLQcut=CLQcut, clstgap=clstgap,
CLQmode=CLQmode, hrstType = hrstType, hrstParam=hrstParam, LD=LD)
}
chrLDblocks <- data.frame(chr=chrN, chrLDblocks)
chrLDblocks$start.index <- chrLDblocks$start.index+stindex
chrLDblocks$end.index <- chrLDblocks$end.index+stindex
stindex <- stindex + dim(chrSNPinfo)[1]
totalLDBres <- rbind(totalLDBres, chrLDblocks)
} # end for current chr
message("\nBigLD done!")
return(totalLDBres)
}
#-GPART
# GPART < input >
#' @title Partitioning genodata based on the result obtained by using Big-LD and gene region information.
#' @name GPART
#' @aliases GPART
#' @description
#' \code{GPART} partition the given genodata using the result obtained by Big-LD and gene region information.
#' The algorithm partition the whole sequence into sub sequences of which size do not exceed the given threshold.
#' @usage GPART(geno=NULL, SNPinfo=NULL, geneinfo=NULL, genofile=NULL,
#' SNPinfofile=NULL, geneinfofile=NULL, geneDB = c("ensembl","ucsc"),
#' assembly = c("GRCh38", "GRCh37"), geneid = "hgnc_symbol",ensbversion = NULL,
#' chrN=NULL, startbp=-Inf, endbp=Inf, BigLDresult=NULL, minsize=4, maxsize=50,
#' LD=c("r2", "Dprime"), CLQcut=0.5, CLQmode=c("density", "maximal"), MAFcut = 0.05,
#' GPARTmode=c("geneBased", "LDblockBased"),
#' Blockbasedmode=c("onlyBlocks", "useGeneRegions"))
#' @param geno Data frame or matrix of additive genotype data, each column is additive genotype of each SNP.
#' @param SNPinfo Data frame or matrix of SNPs information. 1st column is rsID and 2nd column is bp position.
#' @param geneinfo Data frame or matrix of Gene info data.
#' (1st col : Genename, 2nd col : chromosome, 3rd col : start bp, 4th col : end bp)
#' @param genofile Character constant; Genotype data file name (supporting format: .txt, .ped, .raw, .traw, .vcf).
#' @param SNPinfofile Character constant; SNPinfo data file name (supporting format: .txt, .map).
#' @param geneinfofile A Character constant; file containing the gene information
#' (1st col : Genename, 2nd col : chromosome, 3rd col : start bp, 4th col : end bp)
#' @param geneDB A Character constant; database type for gene information. Set \code{"ensembl"} to get gene info from "Ensembl",
#' or set \code{"ucsc"} to get gene info from "UCSC genome browser" (See package "biomaRt" or
#' package "homo.sapiens"/"TxDb.Hsapiens.UCSC.hg38.knownGene"/ "TxDb.Hsapiens.UCSC.hg19.knownGene" for details.)
#' @param assembly A character constant; set \code{"GRCh37"} for GRCh37, or set \code{"GRCh38"} for GRCh38
#' @param geneid A character constant; When you use the gene information by \code{geneDB}. specity the symbol for gene name to use.
#' default is "hgnc_symbol".
#' (eg. 'ensembl_gene_id' for \code{geneDB = "ensembl"}, "ENTREZID"/"ENSEMBL"/"REFSEQ"/"TXNAME" for \code{geneDB="ucsc"}.
#' See package 'biomaRt' or package 'Homo.sapiens' for details)
#' @param ensbversion a integer constant; you can set the release version of ensembl
#' when you use the gene information by using \code{geneDB='emsembl'} and \code{assembly='GRCh38'}
#' @param chrN Numeric(or Character) constant (or vector); chromosome number to use. If \code{NULL}(default), we use all chromosome.
#' @param startbp Numeric constant; starting bp position of the \code{chrN}. Default -Inf.
#' @param endbp Numeric constant; last bp position of the \code{chrN}. Default Inf.
#' @param BigLDresult Data frame; a result obtained by \code{BigLD} function. If \code{NULL}(default),
#' the \code{GPART} function first excute \code{BigLD} function to obtain LD blocks estimation result.
#' @param minsize Numeric constant; the lower bound of number of SNPs in a partition.
#' @param maxsize Numeric constant; the upper bound of number of SNPs in a partition.
#' @param LD Character constant; LD measure to use, r2 or Dprime.
#' @param CLQcut Numeric constant; threshold for the correlation value |r|, between 0 to 1.
#' @param CLQmode Character constant; the way to give priority among detected cliques.
#' if \code{CLQmode = "density"} then the algorithm gives priority to the clique of largest value of \eqn{(Number of SNPs)/(range of clique)},
#' else if \code{CLQmode = "maximal"}, then the algorithm gives priority to the largest clique. The default is "density".
#' @param MAFcut Numeric constant; the MAF threshold. Default 0.05.
#' @param GPARTmode Character constant; GPART algorithm methods to use, "geneBased" or "LDblockBased". Default is ??geneBased??
#' @param Blockbasedmode Character constant; When you set \code{GPARTmode = "LDblockBased"}, specify LDblock based method as
#' \code{"onlyBlocks"}("LDblock based only" algorithm) or \code{"useGeneRegions"}(LDblock based and also use gene info algorithm).
#'
# < output >
#' @return \code{GPART} returns data frame which contains 9 information of each partition
#' (chromosome, index number of the first SNP and last SNP, rsID of the first SNP and last SNP,
#' basepair position of the first SNP and last SNP, blocksize, Name of a block)
#'
#' @author Sun Ah Kim <sunny03@snu.ac.kr>, Yun Joo Yoo <yyoo@snu.ac.kr>
#' @seealso \code{\link{BigLD}}
#'
#'
#' @examples
#' data(geno)
#' data(SNPinfo)
#' data(geneinfo)
#' GPART(geno=geno, SNPinfo=SNPinfo, geneinfo=geneinfo, startbp = 16058400, endbp = 16076500)
#'
#' @importFrom stats cor median quantile
#' @importFrom utils tail
#' @importFrom Rcpp sourceCpp
#' @import Homo.sapiens
#' @import TxDb.Hsapiens.UCSC.hg38.knownGene
#' @export
GPART <- function(geno=NULL, SNPinfo=NULL, geneinfo=NULL, genofile=NULL, SNPinfofile=NULL, geneinfofile=NULL,
geneDB = c("ensembl","ucsc"), assembly = c("GRCh38", "GRCh37"), geneid = "hgnc_symbol",
ensbversion = NULL,
chrN=NULL, startbp=-Inf, endbp=Inf, BigLDresult=NULL, minsize=4, maxsize=50,
LD=c("r2", "Dprime"), CLQcut=0.5, CLQmode=c("density", "maximal"), MAFcut = 0.05,
GPARTmode=c("geneBased", "LDblockBased"), Blockbasedmode=c("onlyBlocks", "useGeneRegions"))
{
# Main part
GPARTmode <- match.arg(GPARTmode)
geneDB <- match.arg(geneDB)
assembly <- match.arg(assembly)
CLQmode <- match.arg(CLQmode)
LD <- match.arg(LD)# if null choose "null"
# data preparation
# input data preparation
if(!is.null(genofile)){
inputdata <- dataPreparation(genofile, SNPinfofile, geno, SNPinfo, chrN=chrN, startbp=startbp, endbp=endbp)
geno <- inputdata[[1]]
SNPinfo <- inputdata[[2]]
}else{
geno <- geno
geno <- as.matrix(geno)
SNPinfo <- SNPinfo
if(is.null(geno) |is.null(SNPinfo)){
stop("Need input data (or files)")
}
if(ncol(geno) != nrow(SNPinfo)){
stop("The number of SNPs in geno and SNPinfo are not matched")
}
if(is.null(chrN)) chrN <- unique(SNPinfo[,1])
colnames(SNPinfo) <- c("chrN", "rsID", "bp")
if(startbp != -Inf | endbp != Inf){
if((length(chrN)>1) | (length(unique(SNPinfo[,1]))>1))
stop("If your input data include more than one chromosome or you choose more than one chromosome,
then do not specify the startbp and endbp!")
SNPloca <- which(SNPinfo$bp>=startbp & SNPinfo$bp<=endbp & SNPinfo$chrN == chrN)
SNPinfo <- SNPinfo[SNPloca,]
geno <- geno[,SNPloca]
}
colnames(geno)<- SNPinfo$rsID
}
chrNs <- unique(SNPinfo[,1])
# filtering all NA SNPs
allNASNPs = apply(geno, 2, function(x) all(is.na(x)))
geno<-geno[, !allNASNPs]
SNPinfo<-SNPinfo[!allNASNPs, ]
#SNP filtering
MAF <- apply(geno, 2, function(x) mean(x,na.rm=TRUE)/2)
MAF_ok <- ifelse(MAF>=0.5,1-MAF,MAF)
MAF <- MAF_ok
message(paste("Number of SNPs after prunning SNPs with MAF<", MAFcut, ":", sum(MAF>=MAFcut)))
geno <- geno[,MAF>=MAFcut]
SNPinfo <- SNPinfo[MAF>=MAFcut,]
if(is.null(BigLDresult)){
message("Start to execute BigLD function!")
BigLDresult <- BigLD(geno, SNPinfo, CLQcut=CLQcut, CLQmode=CLQmode, LD=LD, MAFcut=MAFcut)#, MAFcut=MAFcut
message("Big-LD, done!")
}else{
message("Use the inputted BigLD result")
# BigLDblocks <- BigLDresult #[BigLDresult$chr==chrN,]
newindex.st <- match(BigLDresult$start.rsID, SNPinfo[,2])
newindex.ed <- match(BigLDresult$end.rsID, SNPinfo[,2])
#BigLDresult check
if(sum(is.na(newindex.st))!=0 |sum(is.na(newindex.ed))!=0){
stop("The inputted data and the bigLD results are not paired.
\nPlease check whether the BigLD results are obtained from the input data")
}
BigLDresult$start.index<-newindex.st
BigLDresult$end.index<-newindex.ed
}
# gene based
# load gene info
if(is.null(geneinfo)){
geneinfo <- geneinfoExt(geneinfofile=geneinfofile, geneDB = geneDB, assembly = assembly, geneid = geneid,
ensbversion = ensbversion, chrNs = chrNs)
}
if(GPARTmode == "geneBased"){
message(paste("GPART algorithm: geneBased"))
TotalRes <- NULL
startindex <- 0
for(chrN in unique(SNPinfo[,1])){
message(paste("working chromosome name:", chrN))
chrSNPinfo <- SNPinfo[(SNPinfo[,1]==chrN),,drop=FALSE]
chrgeno <- geno[,(SNPinfo[,1]==chrN),drop=FALSE]
BigLDblocks <- BigLDresult[BigLDresult$chr==chrN,,drop=FALSE]
genelist <- geneinfo[which(geneinfo[,2] == chrN), ]
colnames(genelist)<- c("genename", "chr", "start.bp", "end.bp")
genelist <- genelist[order(genelist[,3]),,drop=FALSE]
genelist <- combineOverlapSamegene(genelist)
GeneRegionSNPs1 <- NULL
for (i in seq_len(dim(genelist)[1])){
test <- (which(chrSNPinfo[, 3] >= genelist[i, 3] & chrSNPinfo[, 3] <= genelist[i, 4]))
ifelse(length(test) > 0, test <- range(test), test <- c(0, 0))
GeneRegionSNPs1 <- rbind(GeneRegionSNPs1, test)
# if(i%%10) message(i)
}
rownames(GeneRegionSNPs1) <- genelist[, 1]
SNPexist <- apply(GeneRegionSNPs1, 1, function(x) (x[1] != 0 & x[2] != 0))
SNPexist <- unlist(SNPexist)
Geneblocks <- GeneRegionSNPs1[SNPexist, ,drop=FALSE]
if(dim(Geneblocks)[1]>0){
Geneblocks.M <- mergeOverlapGene(Geneblocks)
}else{
Geneblocks.M <- Geneblocks
}
if(dim(BigLDblocks)[1]>0){
newindex.st <- match(BigLDblocks$start.rsID, chrSNPinfo[,2])
newindex.ed <- match(BigLDblocks$end.rsID, chrSNPinfo[,2])
#BigLDresult check
# if(sum(is.na(newindex.st))!=0 |sum(is.na(newindex.ed))!=0){
# stop("The inputted data and the bigLD results are not paired.
# \nPlease check whether the BigLD results are obtained from the input data")
# }
BigLDblocks$start.index<-newindex.st
BigLDblocks$end.index<-newindex.ed
nowLDblocks <- BigLDblocks
LDblocks <- cbind(nowLDblocks[, 2], nowLDblocks[, 3])
# Split Large LDblocks
FinalLDblocks <- LDblockSplit(chrgeno, LDblocks, maxsize, LD)
LDblocks.sgt <- setdiff(seq_len(dim(chrSNPinfo)[1]), unlist(apply(FinalLDblocks, 1, function(x) min(x):max(x))))
LDblocks.sgt <- cbind(LDblocks.sgt, LDblocks.sgt)
LDblocks.T <- rbind(FinalLDblocks, LDblocks.sgt)
LDblocks.T <- LDblocks.T[order(LDblocks.T[, 1]), ]
colnames(LDblocks.T) <- c("st", "ed")
# gene-base block partitioning
}else{
LDblocks.T <- cbind(seq_len(dim(chrSNPinfo)[1]), seq_len(dim(chrSNPinfo)[1]))
colnames(LDblocks.T) <- c("st", "ed")
}
if(dim(Geneblocks.M)[1]>0){
GeneLDblocks2 <- LDblockGeneMerge(LDblocks.T, Geneblocks.M)
}else{
GeneLDblocks2 <- LDblocks.T
rownames(GeneLDblocks2) <- rep("No", dim(GeneLDblocks2)[1])
}
blockL <- GeneLDblocks2[, 2] - GeneLDblocks2[, 1] + 1
GeneLDblocks3 <- cbind(GeneLDblocks2, blockL)
# split big block
if(sum(blockL>maxsize)>0){
GeneLDblocks4 <- splitBigLD(GeneLDblocks3, LDblocks.T, Geneblocks, maxsize)
}else{
GeneLDblocks4 <- GeneLDblocks3
# GeneLDblocks4 <- data.frame(GeneLDblocks4, rep("No", dim(GeneLDblocks4)[1]))
# colnames(GeneLDblocks4)[4] <-"gname"
}
GeneLDblocks5 <- mergeSmallRegion(GeneLDblocks4, maxsize, minsize)
# naming regions
GeneLDblocks <- namingRegion2(GeneLDblocks5, genelist, chrSNPinfo)
st.rsID <- chrSNPinfo[GeneLDblocks[, 1], 2]
ed.rsID <- chrSNPinfo[GeneLDblocks[, 2], 2]
Finalresult <- data.frame(chrN,GeneLDblocks$st, GeneLDblocks$ed, st.rsID, ed.rsID, GeneLDblocks$st.bp, GeneLDblocks$ed.bp,
GeneLDblocks$blockL, GeneLDblocks$Finalnames)
colnames(Finalresult) <- c("chr","st", "ed", "st.rsID", "ed.rsID", "st.bp", "ed.bp", "blocksize", "Name")
Finalresult$st <- Finalresult$st+startindex
Finalresult$ed <- Finalresult$ed+startindex
startindex <- startindex + dim(chrgeno)[2]
message(paste("chr", chrN, "done!"))
TotalRes <- rbind(TotalRes, Finalresult)
}
# LD block based
}else if (GPARTmode == "LDblockBased"){
Blockbasedmode <- match.arg(Blockbasedmode)
message(paste("GPART algorithm: LDblockBased-", Blockbasedmode, sep = ""))
TotalRes <- NULL
startindex <- 0
for(chrN in unique(SNPinfo[,1])){
message(paste("working chromosome name:", chrN))
chrSNPinfo <- SNPinfo[(SNPinfo[,1]==chrN),,drop=FALSE]
chrgeno <- geno[,(SNPinfo[,1]==chrN),drop=FALSE]
BigLDblocks <- BigLDresult[BigLDresult$chr==chrN,,drop=FALSE]
genelist <- geneinfo[which(geneinfo[,2] == chrN), ]
colnames(genelist)<- c("genename", "chr", "start.bp", "end.bp")
genelist <- genelist[order(genelist[,3]),]
genelist <- combineOverlapSamegene(genelist)
# gene-region merging
GeneRegionSNPs1 <- NULL
for (i in seq_len(dim(genelist)[1])){
test <- (which(chrSNPinfo[, 3] >= genelist[i, 3] & chrSNPinfo[, 3] <= genelist[i, 4]))
ifelse(length(test) > 0, test <- range(test), test <- c(0, 0))
GeneRegionSNPs1 <- rbind(GeneRegionSNPs1, test)
# if(i%%10) message(i)
}
rownames(GeneRegionSNPs1) <- genelist[, 1]
SNPexist <- apply(GeneRegionSNPs1, 1, function(x) (x[1] != 0 & x[2] != 0))
SNPexist <- unlist(SNPexist)
Geneblocks <- GeneRegionSNPs1[SNPexist, ,drop=FALSE]
if(dim(Geneblocks)[1]>0){
Geneblocks.M <- mergeOverlapGene(Geneblocks)
}else{
Geneblocks.M <- Geneblocks
}
if(dim(BigLDblocks)[1]>0){
newindex.st <- match(BigLDblocks$start.rsID, chrSNPinfo[,2])
newindex.ed <- match(BigLDblocks$end.rsID, chrSNPinfo[,2])
BigLDblocks$start.index <- newindex.st
BigLDblocks$end.index <- newindex.ed
nowLDblocks <- BigLDblocks
LDblocks <- cbind(nowLDblocks[, 2], nowLDblocks[, 3])
# Split Large LDblocks
FinalLDblocks <- LDblockSplit(chrgeno, LDblocks, maxsize, LD)
LDblocks.sgt <- setdiff(seq_len(dim(chrSNPinfo)[1]), unlist(apply(FinalLDblocks, 1, function(x) min(x):max(x))))
LDblocks.sgt <- cbind(LDblocks.sgt, LDblocks.sgt)
LDblocks.T <- rbind(FinalLDblocks, LDblocks.sgt)
LDblocks.T <- LDblocks.T[order(LDblocks.T[, 1]), ]
colnames(LDblocks.T) <- c("st", "ed")
}else{
LDblocks.T <- cbind(seq_len(dim(chrSNPinfo)[1]), seq_len(dim(chrSNPinfo)[1]))
colnames(LDblocks.T) <- c("st", "ed")
}
# onlyblocks? or useGeneRegions?
if(Blockbasedmode == "onlyBlocks"){
blockL <- apply(LDblocks.T, 1, function(x) diff(x)+1)
LDblocks.T <- data.frame(LDblocks.T, blockL)
LDblocks1 <- mergeSmallRegion(LDblocks.T, maxsize, minsize)
GeneLDblocks <- namingRegion2(LDblocks1, genelist, chrSNPinfo)
# end: if(Blockbasedmode == "onlyBlocks")
}else if(Blockbasedmode == "useGeneRegions"){
blockL <- apply(LDblocks.T, 1, function(x) diff(x)+1)
LDblocks.T <- data.frame(LDblocks.T, blockL)
gLDblocks <- data.frame(LDblocks.T, NA)
for(k in seq_len(dim(gLDblocks)[1])){
nowblock <- gLDblocks[k,,drop=FALSE]
intro.index <- (which(Geneblocks[,2]<nowblock[1,1]))
intro.index1 <- setdiff(seq_len(dim(Geneblocks)[1]), intro.index)
outro.index <- (which(Geneblocks[,1]>nowblock[1,2]))
outro.index1 <- setdiff(seq_len(dim(Geneblocks)[1]), outro.index)
common.index <- intersect(intro.index1, outro.index1)
if(length(common.index)==0) {
gLDblocks[k,4]<-NA
}else{
gnames <- rownames(Geneblocks[common.index,1, drop=FALSE])
gname <- paste(gnames, collapse="/")
gLDblocks[k,4]<-gname
}
}
## LD block
bigblocks <- gLDblocks[gLDblocks$blockL>=minsize,]
smallblocks <- gLDblocks[gLDblocks$blockL<minsize,]
small.gene <- smallblocks[!is.na(smallblocks[,4]),]
small.nongene <- smallblocks[is.na(smallblocks[,4]),]
#merge small blocks which are in gene regions
addedblocks <- NULL
generegions <- unique(small.gene[,4])
for(g in generegions){
# message(g)
nowgeneblocks <- small.gene[(small.gene[,4]==g),,drop=FALSE]
nowbin <- NULL
while(dim(nowgeneblocks)[1]>0){
if(is.null(nowbin)){
nowbin <- nowgeneblocks[1,,drop=FALSE]
nowgeneblocks <- nowgeneblocks[-1,,drop=FALSE]
}
if(dim(nowgeneblocks)[1]==0 & !is.null(nowbin)){
addedblocks<- rbind(addedblocks, nowbin)
break
}
newbin <- nowgeneblocks[1,,drop=FALSE]
nowgeneblocks <- nowgeneblocks[-1,,drop=FALSE]
if(nowbin[1,3]+newbin[1,3]>maxsize){
addedblocks<- rbind(addedblocks, nowbin)
nowbin <- newbin
}else if((nowbin[1,2]+1)!=newbin[1,1]){
addedblocks<- rbind(addedblocks, nowbin)
nowbin <- newbin
}else if((nowbin[1,2]+1)==newbin[1,1]){
nowbin[1,2] <- newbin[1,2]
nowbin[1,3] <- nowbin[1,3]+newbin[1,3]
}
if(dim(nowgeneblocks)[1]==0 & !is.null(nowbin)){
addedblocks<- rbind(addedblocks, nowbin)
break
}
}
}
GeneLDblocks <- rbind(bigblocks, addedblocks, small.nongene)
GeneLDblocks <- GeneLDblocks[order(GeneLDblocks[,1]),]
LDblocks1 <- mergeSmallRegion(GeneLDblocks[,seq_len(3)], maxsize, minsize)
GeneLDblocks <- namingRegion2(LDblocks1, genelist, chrSNPinfo)
} # end if(Blockbasedmode == "useGeneRegions")
st.rsID <- chrSNPinfo[GeneLDblocks[, 1], 2]
ed.rsID <- chrSNPinfo[GeneLDblocks[, 2], 2]
Finalresult <- data.frame(chrN,GeneLDblocks$st, GeneLDblocks$ed, st.rsID, ed.rsID, GeneLDblocks$st.bp, GeneLDblocks$ed.bp,
GeneLDblocks$blockL, GeneLDblocks$Finalnames)
colnames(Finalresult) <- c("chr","st", "ed", "st.rsID", "ed.rsID", "st.bp", "ed.bp", "blocksize", "Name")
Finalresult$st <- Finalresult$st+startindex
Finalresult$ed <- Finalresult$ed+startindex
startindex <- startindex + dim(chrgeno)[2]
message(paste("chr", chrN, "done!"))
TotalRes <- rbind(TotalRes, Finalresult)
}
}
colnames(TotalRes)[seq_len(7)] <- c( "chr","start.index", "end.index", "start.rsID","end.rsID", "start.bp", "end.bp")
return(TotalRes)
}
#' @title visualization of LD block structure
#' @name LDblockHeatmap
#' @aliases LDblockHeatmap
#' @description
#' \code{LDblockHeatmap} visualize the LD structure or LD block results of the inputed data.
#' LDblockHeatmap <- function(geno=NULL, SNPinfo=NULL, genofile=NULL, SNPinfofile=NULL,geneinfo=NULL, geneinfofile = NULL,
#' geneDB = c("ensembl","ucsc","file"), assembly = c("GRCh38", "GRCh37"), geneid = "hgnc_symbol",
#' ensbversion = NULL, chrN=NULL,startbp=-Inf, endbp=Inf, blockresult=NULL, blocktype=c("bigld", "gpart"),
#' minsize=4, maxsize=50, LD=c("r2", "Dprime", "Dp-str"), MAFcut=0.05,CLQcut=0.5,
#' CLQmode=c("density", "maximal"), CLQshow=FALSE,
#' type=c("png", "tif"), filename="heatmap", res=300, onlyHeatmap=FALSE)
#' @param geno Data frame or matrix of additive genotype data, each column is additive genotype of each SNP.
#' @param SNPinfo Data frame or matrix of SNPs information. 1st column is rsID and 2nd column is bp position.
#' @param genofile Character constant; Genotype data file name (supporting format: .txt, .ped, .raw, .traw, .vcf).
#' @param SNPinfofile Character constant; SNPinfo data file name (supporting format: .txt, .map).
#' @param geneinfo Data frame or matrix of Gene info data.
#' (1st col : Genename, 2nd col : chromosome, 3rd col : start bp, 4th col : end bp)
#' @param geneinfofile A Character constant; file containing the gene information
#' (1st col : Genename, 2nd col : chromosome, 3rd col : start bp, 4th col : end bp)
#' @param geneDB A Character constant; database type for gene information. Set \code{"ensembl"} to get gene info from "Ensembl",
#' or set \code{"ucsc"} to get gene info from "UCSC genome browser" (See package "biomaRt" or
#' package "homo.sapiens"/"TxDb.Hsapiens.UCSC.hg38.knownGene"/ "TxDb.Hsapiens.UCSC.hg19.knownGene" for details.)
#' @param assembly A character constant; set \code{"GRCh37"} for GRCh37, or set \code{"GRCh38"} for GRCh38
#' @param geneid A character constant; When you use the gene information by \code{geneDB}. specity the symbol for gene name to use.
#' default is "hgnc_symbol".
#' (eg. 'ensembl_gene_id' for \code{geneDB = "ensembl"}, "ENTREZID"/"ENSEMBL"/"REFSEQ"/"TXNAME" for \code{geneDB="ucsc"}.
#' See package 'biomaRt' or package 'Homo.sapiens' for details)
#' @param ensbversion a integer constant; you can set the release version of ensembl
#' when you use the gene information by using \code{geneDB='emsembl'} and \code{assembly='GRCh38'}
#' @param geneshow logical; do not show the gene information if \code{geneshow=FALSE}.
#' Default is \code{geneshow=TRUE}
#' @param chrN Numeric(or Character) constant ; chromosome number to use.
#' If the data contains more than one chromosome, you need to specify the chromosome to show.
#' @param startbp Numeric constant; starting bp position of the \code{chrN}.
#' @param endbp Numeric constant; last bp position of the \code{chrN}.
#' @param blockresult Data frame; a result obtained by \code{BigLD} function or \code{GPART}. If \code{NULL}(default),
#' the function first excute \code{BigLD} or\code{GPART} to obtain block estimation result denpending on the \code{blocktype}.
#' @param blocktype Character constant; "bigld" for \code{Big-LD} or "gpart" for \code{GPART}. Default is \code{"gpart"}.
#' @param minsize Integer constant; when \code{blockresult=NULL, blocktype="gpart"}
#' specify the threshold for minsize of a block obtained by \code{GPART}
#' @param maxsize Integer constant; when \code{blockresult=NULL, blocktype="gpart"}
#' specify the threshold for maxsize of a block obtained by \code{GPART}
#' @param LD Character constant; LD measure to use, "r2" or "Dprime" or "Dp-str".
#' LD measures for LD heatmap (and BigLD execution when \code{BigLDresult=NULL}). When \code{LD = "Dp-str"},
#' heatmap shows only two cases, "weak LD or not-informative" and "strong LD". When \code{LD= Dprime} heatmap shows the estimated D' measures.
#' @param MAFcut Numeric constant; MAF threshold of SNPs to use. Default 0.05
#' @param CLQcut Numeric constant; threshold for the correlation value |r|, between 0 to 1. Default 0.5.
#' @param CLQmode Character constant; the way to give priority among detected cliques.
#' if \code{CLQmode = "density"} then the algorithm gives priority to the clique of largest value of \eqn{(Number of SNPs)/(range of clique)},
#' else if \code{CLQmode = "maximal"}, then the algorithm gives priority to the largest clique. Default "density".
#' @param CLQshow logical; Show the LD bin structures, i.e. CLQ results, in each LD blocks.
#' Notice that if the region to show includes more than 200 SNPs, the function do now show LD bin structures automatically. Default false.
#' @param type Character constant; file format of output image file. set \code{png} for PNG format file, or \code{tif} for TIFF format file.
#' @param filename Character constant; filename of output image file. Default "heatmap".
#' @param res Numeric constant; resolution of image. Default 300.
#' @param onlyHeatmap logical; show the LD heatmap without the LD block boundaries. Default false.
# < output >
#' @return \code{GPART} returns data frame which contains 9 information of each partition
#' (chromosome, index number of the first SNP and last SNP, rsID of the first SNP and last SNP,
#' basepair position of the first SNP and last SNP, blocksize, Name of a block)
#'
#' @author Sun-Ah Kim <sunny03@snu.ac.kr>, Yun Joo Yoo <yyoo@snu.ac.kr>
#' @seealso \code{\link{BigLD}}
#'
#' @examples
#'
#' LDblockHeatmap(geno=geno[,1:100], SNPinfo=SNPinfo[1:100,], geneinfo=geneinfo,
#' filename="chr21Heatmap")
#'
#' @importFrom grDevices adjustcolor colorRampPalette dev.off heat.colors png tiff
#' @importFrom stats cor median quantile
#' @importFrom utils tail
#' @import grid
#' @importFrom Rcpp sourceCpp
#' @export
#-LDblockHeatmap2
LDblockHeatmap <- function(geno=NULL, SNPinfo=NULL, genofile=NULL, SNPinfofile=NULL,geneinfo=NULL, geneinfofile=NULL,
geneDB=c("ensembl","ucsc","file"), assembly=c("GRCh38", "GRCh37"), geneid="hgnc_symbol",
ensbversion=NULL, geneshow=TRUE, chrN=NULL,startbp=-Inf, endbp=Inf, blockresult=NULL, blocktype=c("bigld", "gpart"),
minsize=4, maxsize=50, LD=c("r2", "Dprime", "Dp-str"), MAFcut=0.05,CLQcut=0.5,
CLQmode=c("density", "maximal"), CLQshow=FALSE,
type=c("png", "tif"), filename="heatmap", res=300, onlyHeatmap=FALSE)
{
# -set data and parameters ------
tick <- "both"
maxisize <- 20000
longleng <- 2000
shortleng <- ifelse(is.null(geneinfo), 1000, 100)
CLQmode <- match.arg(CLQmode)
blocktype <- match.arg(blocktype)
type <- match.arg(type)
LD <- match.arg(LD)
LDcal <- ifelse(LD=="r2", "r2", "Dprime")
# tick <- match.arg(tick)
## data preparation ----------
# input data preparation
if(!is.null(genofile)){
inputdata <- dataPreparation(genofile, SNPinfofile, geno, SNPinfo, chrN=chrN, startbp=startbp, endbp=endbp)
geno <- inputdata[[1]]
SNPinfo <- inputdata[[2]]
}else{
if(is.null(geno) |is.null(SNPinfo)){
stop("Need input data (or files)")
}
if(ncol(geno) != nrow(SNPinfo)){
stop("The number of SNPs in geno and SNPinfo are not matched")
}
if(is.null(chrN)){
if(length(unique(SNPinfo[,1]))==1){
chrN <- unique(SNPinfo[,1])
}else{
stop("Please choose a chromosome!")
}
}
colnames(SNPinfo) <- c("chrN", "rsID", "bp")
geno <- as.matrix(geno)
}
if(is.null(geneinfo)){
if(geneshow == TRUE){
geneinfo <- geneinfoExt(geneinfofile=geneinfofile, geneDB = geneDB, assembly = assembly, geneid = geneid,
ensbversion = ensbversion, chrNs = chrN)
}
}
# filtering all NA SNPs
allNASNPs = apply(geno, 2, function(x) all(is.na(x)))
geno<-geno[, !allNASNPs]
SNPinfo<-SNPinfo[!allNASNPs, ]
## MAF filtering
MAF <- apply(geno, 2, function(x) mean(x,na.rm=TRUE)/2)
MAF_ok <- ifelse(MAF>=0.5,1-MAF,MAF)
MAF <- MAF_ok
message(paste("Number of SNPs after prunning SNPs with MAF<", MAFcut, ":", sum(MAF>=MAFcut)))
# underMAFgeno <- geno[,MAF<MAFcut]
geno <- geno[,MAF>=MAFcut]
# underMAFSNPinfo <- SNPinfo[MAF<MAFcut,]
SNPinfo <- SNPinfo[MAF>=MAFcut,]
trascolor <- adjustcolor( "red", alpha.f=0)
col1 <- c('#7D0112','#820920','#87112B','#8C1834','#911E3D','#952445','#9A294C','#9E2F54','#A2345B','#A73962',
'#AA3F69','#AE4470','#B24977','#B64E7D','#B95483','#BC598A','#C05E90','#C36396','#C6689C','#C86EA1',
'#CB73A7','#CE78AC','#D07DB2','#D282B7','#D587BC','#D78CC1','#D991C5','#DB96CA','#DC9BCE','#DEA0D3',
'#E0A5D7','#E1AADB','#E3AFDF','#E4B3E2','#E5B8E6','#E7BCE9','#E8C1EC','#E9C5EF','#EACAF1','#EBCEF4',
'#ECD2F6','#EDD6F8','#EEDAF9','#EFDDFB','#F0E1FC','#F1E4FC','#F1E8FC','#F2EBFC','#F2EDFA','#F2F0F6')
col2 <- c('#2D3184','#283C87','#23458A','#1E4E8E','#185692','#125E95','#0A6699','#026E9D','#0075A0','#007CA3',
'#0083A6','#008AA9','#0790AC','#1296AE','#1C9CB0','#26A2B2','#2FA8B4','#38ADB6','#40B3B7','#49B8B9',
'#51BCBA','#5AC1BB','#62C5BC','#6AC9BD','#73CDBE','#7BD1BE','#83D5BF','#8AD8C0','#92DBC1','#99DEC1',
'#A0E0C2','#A7E3C3','#AEE5C4','#B5E7C5','#BBE9C6','#C1EBC7','#C7ECC9','#CCEECA','#D2EFCB','#D7F0CD',
'#DBF0CF','#DFF1D0','#E3F2D2','#E7F2D4','#EAF2D6','#EDF2D9','#EFF2DB','#F1F2DD','#F2F2E0','#F3F1E4')
col3 <- c('#D33F6A','#D44468','#D54866','#D74C63','#D85161','#D9545F','#DB585C','#DC5C5A','#DD6057','#DE6455',
'#DF6752','#E06B50','#E16F4D','#E2724A','#E37647','#E47A45','#E57D42','#E5813F','#E6843C','#E78839',
'#E78B37','#E88E34','#E89231','#E8952F','#E9992D','#E99C2B','#E99F2A','#E9A328','#EAA628','#EAAA28',
'#EAAD29','#EAB02A','#E9B42C','#E9B72E','#E9BA31','#E9BE35','#E8C139','#E8C43D','#E8C742','#E7CB46',
'#E7CE4C','#E6D151','#E6D457','#E5D75D','#E4DA64','#E4DD6B','#E3E073','#E2E37C','#E2E688','#E2E6BD')
col4 <- c('#035F33','#176438','#23693C','#2D6D41','#367245','#3E764A','#457B4F','#4D7F53','#538458','#5A885C',
'#608C61','#679165','#6D9569','#73996E','#799D72','#7EA176','#84A57A','#8AA97F','#8FAD83','#94B087',
'#99B48B','#9EB88F','#A3BB93','#A8BE97','#ADC29B','#B1C59F','#B6C8A2','#BACBA6','#BECEAA','#C3D1AD',
'#C7D4B1','#CAD6B4','#CED9B7','#D2DBBB','#D5DEBE','#D8E0C1','#DBE2C4','#DEE4C7','#E1E6CA','#E4E8CD',
'#E6E9CF','#E9EBD2','#EBECD5','#EDEDD7','#EEEFD9','#F0EFDC','#F1F0DE','#F2F1E0','#F3F1E2','#F3F1E4')
col5 <- c("#8B4497","#8D499B","#8F4E9F","#9052A4","#9257A8","#945CAC","#9560B0","#9765B4","#9869B8","#9A6DBC",
"#9B72C0","#9D76C4","#9E7AC8","#9F7FCC","#A083D0","#A187D4","#A28BD8","#A390DB","#A494DF","#A598E2",
"#A69CE6","#A7A0E9","#A7A4ED","#A8A8F0","#A8ACF3","#A9B0F7","#A9B4FA","#AAB8FD","#AABCFF","#AABFFF",
"#ABC3FF","#ABC7FF","#ABCBFF","#ABCEFF","#ABD2FF","#ABD5FF","#ABD9FF","#ABDCFF","#ABDFFF","#AAE3FF",
"#AAE6FF","#AAE9FF","#A9ECFF","#A9EFFF","#A8F1FF","#A7F4FF","#A7F6FF","#A6F9FF","#A4FBFF","#A3FCFF")
allcol <- cbind(col5, col2, col3, col4)
blockcol <- c(col2[5], col3[5], col4[5],col5[5])
deepblockcol <- c(col2[1], col3[1], col4[1],col5[1])
if(is.null(chrN)){
chrN <- unique(SNPinfo[,1])
if(length(chrN)>1) stop("The data contains two or more chromosomes. Please specify the chromosome.")
}
SNPloca <- which(SNPinfo$chrN==chrN & SNPinfo$bp>=startbp & SNPinfo$bp<=endbp)
NSNPs <- length(SNPloca)
if(NSNPs<5){
stop("There are less than 5 SNPs in the chosen region!")
}
if(NSNPs>maxisize){
Pmessage <- paste("There are too many SNPs in the chosen region. \n We use the first ",maxisize,"SNPs in the region!")
message(Pmessage)
SNPloca <- min(SNPloca):(min(SNPloca)+maxisize-1)
NSNPs <- length(SNPloca)
}
if(NSNPs>200){
if(CLQshow == TRUE){
message("CLQ results can not be displayed because the number of SNPs in the region is too large.")
}
CLQshow <- FALSE
}
geno <- geno[,SNPloca]
geno <- as.matrix(geno)
SNPinfo <- SNPinfo[SNPloca,]
SNPstbp <- min(SNPinfo[,3])
SNPedbp <- max(SNPinfo[,3])
SNPbp_cordi <- (SNPinfo[,3]-SNPstbp)/(SNPedbp - SNPstbp)
## matching BigLD index and SNP/genodata index
if((!is.null(blockresult)) & (onlyHeatmap == FALSE)){
if(blocktype == "gpart"){
blockresult <- blockresult[,seq_len(7)]
colnames(blockresult)[seq_len(7)] <- c( "chr","start.index", "end.index", "start.rsID","end.rsID", "start.bp", "end.bp")
}
blockresult <- blockresult[which(blockresult$chr==chrN & blockresult$end.bp>=SNPstbp & blockresult$start.bp<=SNPedbp),,drop=FALSE]
blockresult$start.bp[blockresult$start.bp < SNPstbp] <- min(SNPinfo[,3])
blockresult$end.bp[blockresult$end.bp > SNPedbp] <- max(SNPinfo[,3])
stindex <- match(blockresult$start.bp, SNPinfo[,3])
edindex <- match(blockresult$end.bp, SNPinfo[,3])
blockresult$start.index <- stindex
blockresult$end.index <- edindex
blockresult$start.rsID <- SNPinfo[stindex,2]
blockresult$end.rsID <- SNPinfo[edindex,2]
blockresult <- blockresult[blockresult[,2]!=blockresult[,3],,drop=FALSE]
}else if ((is.null(blockresult)) & (onlyHeatmap == FALSE)){
if(blocktype == "bigld"){
oblockresult <- BigLD(geno=geno, SNPinfo=SNPinfo, CLQcut=CLQcut, CLQmode=CLQmode, LD=LDcal, MAFcut=MAFcut)
blockresult <- oblockresult
}else if(blocktype == "gpart"){
oblockresult <- GPART(geno=geno, SNPinfo=SNPinfo, geneinfo = geneinfo, CLQcut=CLQcut, CLQmode=CLQmode, LD=LDcal,
minsize = minsize, maxsize = maxsize, MAFcut = MAFcut)
blockresult <- oblockresult[,seq_len(7)]
colnames(blockresult)[seq_len(7)] <- c( "chr","start.index", "end.index", "start.rsID","end.rsID", "start.bp", "end.bp")
}
}
if(NSNPs < 1000){
unitleng <- 1.1
unitlwd <- 0.4
}else if(NSNPs >= 1000 & NSNPs < 4000){
unitleng <- 2
unitlwd <- 0.3
}else if(NSNPs >= 4000 & NSNPs < 7000){
unitleng <- 3
unitlwd <- 0.2
}else if(NSNPs >= 7000 & NSNPs <= maxisize){
unitleng <- 4
unitlwd <- 0.1
}
if(!is.null(geneinfo)){
nowgeneinfo <- geneinfo[geneinfo[,2]==chrN,,drop=FALSE]
nowgeneinfo <- nowgeneinfo[which(nowgeneinfo[,3]<SNPedbp & nowgeneinfo[,4]>SNPstbp),,drop=FALSE]
nowgeneinfo <- nowgeneinfo[order(nowgeneinfo[,3]),,drop=FALSE]
nowgeneinfo <- combineOverlapSamegene(nowgeneinfo)
}else{
message("There is no gene region information!")
nowgeneinfo <- matrix(1,1,1)[-1,,drop=FALSE]
}
stratifyGene <- list()
if(dim(nowgeneinfo)[1]>0){
nowgeneinfo[which(nowgeneinfo[,3]<SNPstbp),3] <- SNPstbp
nowgeneinfo[which(nowgeneinfo[,4]>SNPedbp),4] <- SNPedbp
nowgeneinfo[,3:4] <- (nowgeneinfo[,3:4]-SNPstbp)/(SNPedbp - SNPstbp)
stratifyGene <- list()
# stratifyGene[[1]]<-nowgeneinfo[1,,drop=F]
# nowgeneinfo<-nowgeneinfo[-1,]
for(i in seq_len(dim(nowgeneinfo)[1])){
nowgene <- nowgeneinfo[i,,drop=FALSE]
appendlistnum <- which(vapply(stratifyGene, function(x) (max(x[,4])+0.01) < nowgene[1,3], TRUE)==TRUE)
if(length(appendlistnum)==0){
stratifyGene[[length(stratifyGene)+1]]<-nowgene
}else{
stratifyGene[[appendlistnum[1]]]<-rbind(stratifyGene[[appendlistnum[1]]], nowgene)
}
}
}
if(CLQshow == TRUE){
geneheight <- min(0.05*length(stratifyGene), 0.35)
bottomloca <- 0.33 + geneheight
}else{
geneheight <- min(0.05*length(stratifyGene), 0.35)
bottomloca <- 0.25 + geneheight
}
if(NSNPs<300){
nowcex <- 0.5
}else if(NSNPs > 300 & NSNPs<1000) {
nowcex <- 0.4
}else if(NSNPs >= 1000 & NSNPs<=2000) {
nowcex <- 0.4
}else if(NSNPs >= 2000 & NSNPs<=3000) {
nowcex <- 0.4
}else if(NSNPs>=3000) {
nowcex <- 0.3
}
# start drawing part ---------------------------------------------------------------------------------
# heatmap construct---------------------------------------------------------------------------------
##color
if(LD == 'Dprime'){
RWcolor <- colorRampPalette(c("red", "white"))
color <- RWcolor(50)
}else if (LD == 'r2'){
color=heat.colors(50)
}else if (LD == "Dp-str"){
RWcolor <- colorRampPalette(c("red", "white"))
color <- RWcolor(50)[c(1,50)]
}
mybreak<-0:length(color)/length(color)
VPheatmap<-viewport(x=0,y=0,width=unit(0.85/1.414,"snpc"),height=unit(0.85/1.414,"snpc"),just=c("left","bottom"), name="VPheatmap")
rot.vp <- viewport(x=.11, y=bottomloca, width=unit(unitleng,"snpc"), height=unit(unitleng,"snpc"), just=c("left", "bottom"), angle=-45)
# genodata
if(NSNPs > longleng){
# M<-Matrix::Matrix(0, dim(SNPinfo)[1], dim(SNPinfo)[1])
startseq <- seq(1, dim(SNPinfo)[1],shortleng)
# system.time({
rectlist <- NULL
for(i in startseq){
if(LD == "r2"){
imgLDmatrix <- suppressWarnings(cor(geno[,(i:min(i+shortleng-1, NSNPs)),drop=FALSE], geno[,(i:min(i+longleng+shortleng, NSNPs)),drop=FALSE],
use="pairwise.complete.obs")^2)
}else if(LD == "Dprime"){
imgLDmatrix <- matCubeX2(geno[,(i:min(i+shortleng-1, NSNPs)),drop=FALSE], geno[,(i:min(i+longleng+shortleng, NSNPs)),drop=FALSE])
}else if(LD == "Dp-str"){
imgLDmatrix <- genoDp2(geno[,(i:min(i+shortleng-1, NSNPs)),drop=FALSE], geno[,(i:min(i+longleng+shortleng, NSNPs)),drop=FALSE],
strLD=TRUE)
}
imgLDmatrix[(imgLDmatrix==Inf)|(imgLDmatrix==-Inf)] <- 0
imgLDmatrix[lower.tri(imgLDmatrix)] <- NA
colcut <- as.character(cut(1-imgLDmatrix,mybreak,labels=as.character(color), include.lowest=TRUE, include.highest=FALSE))
xx <- ((i:min(i+shortleng-1, NSNPs)))/NSNPs
yy <- ((i):min(i+longleng+shortleng, NSNPs))/NSNPs
right <- rep(xx, length((i):min(i+longleng+shortleng, NSNPs)))
top <- rep(yy, each=length((i:min(i+shortleng-1, NSNPs))))
nowrect <- rectGrob(x=right,y=top,width=1/NSNPs,height=1/NSNPs,just=c("right","top"),gp=gpar(col=NA,fill=colcut))
cat(paste("Heatmap generating",i, "|", NSNPs, "\r"))
rectlist <- gList(rectlist, nowrect)
}
LD.heat.map<-gTree(children=rectlist,name="LDheatmap")
# }) #system.time
}else{
if(LD == "r2"){
imgLDmatrix <- (cor(geno,use="pairwise.complete.obs"))^2
}else if(LD == "Dprime"){
imgLDmatrix <- matCubeX(geno)
}else if(LD == "Dp-str"){
imgLDmatrix <- genoDp(geno,strLD=TRUE)
}
imgLDmatrix[(imgLDmatrix==Inf)|(imgLDmatrix==-Inf)] <- 0
imgLDmatrix[lower.tri(imgLDmatrix)] <- NA
colcut <- as.character(cut(1-imgLDmatrix,mybreak,labels=as.character(color), include.lowest=TRUE, include.highest=FALSE))
xx <- seq_len(NSNPs)/NSNPs
yy <- seq_len(NSNPs)/NSNPs
right <- rep(xx, NSNPs)
top <- rep(yy, each=NSNPs)
nowrect <- rectGrob(x=right,y=top,width=1/NSNPs,height=1/NSNPs,just=c("right","top"),gp=gpar(col=NA,fill=colcut))
rectlist <- gList(nowrect)
LD.heat.map <- gTree(children=rectlist,name="LDheatmap")
}
if(onlyHeatmap == FALSE){
### block boundaries
BlockstP <- blockresult[,2]-1
BlockedP <- blockresult[,3]
x1s <- BlockstP*(1/NSNPs)
x2s <- BlockstP*(1/NSNPs)
x3s <- BlockedP*(1/NSNPs)
y1s <- BlockstP*(1/NSNPs)
y2s <- BlockedP*(1/NSNPs)
y3s <- BlockedP*(1/NSNPs)
BlockBoundaries <- polygonGrob(x=c(x1s, x2s, x3s), y=c(y1s, y2s, y3s), id=rep(seq_len(length(x1s)),3),
gp=gpar(fill=trascolor, lwd=unitlwd, col="black"))
LDblockHeatmap2 <- gTree(children=gList(LD.heat.map, BlockBoundaries),name="LDHeatmap", vp=VPheatmap)
blocksizes <- blockresult[,3]-blockresult[,2]+1
blocksizes <- sort(blocksizes, decreasing=TRUE)
if(length(blocksizes) > (15*unitleng)){
showLDsize <- blocksizes[(15*unitleng)]
if(length(blocksizes[blocksizes>=showLDsize])>(15*unitleng)) showLDsize <- showLDsize+1
}else{
showLDsize <- min(blocksizes)
}
regionstbp <- (blockresult[1,6])
regionedbp <- (blockresult[dim(blockresult)[1],7])
rotLDtest <- gTree(children=gList(LDblockHeatmap2), name="rorLDheatmap", vp=rot.vp)
# CLQ res show
if(CLQshow == TRUE){
CLQres <- NULL
for(bn in seq_len(dim(blockresult)[1])){
nowblock <- blockresult[bn,,drop=FALSE]
nowgeno <- geno[,nowblock[1,2]:nowblock[1,3]]
nowSNPinfo <- SNPinfo[nowblock[1,2]:nowblock[1,3],]
nowCLQres <- CLQD(geno=nowgeno, SNPinfo=nowSNPinfo, CLQcut=0.5, clstgap=40000, hrstParam=200, hrstType="nonhrst",
CLQmode=CLQmode, LD=LDcal)
CLQres <- c(CLQres, list(nowCLQres))
}
CLQmaxnum <- max(unlist(CLQres), na.rm=TRUE)
CLQwidth <- 1/NSNPs
CLQboxgL <- gList()
CLQregionheight <- (0.015-0.002*unitleng)*CLQmaxnum
vpCLQ <- viewport(x=0.11,y=bottomloca-CLQregionheight,width=unit(unitleng*0.85,"snpc"),height=unit(CLQregionheight,"snpc"),just=c("left","bottom"), name="vpCLQ")
totalbox <- rectGrob(x=0, y=0, width=1, height=1, just=c("left","bottom"),
gp=gpar(fill="white",lwd=unitlwd, col="black"), vp=vpCLQ)
CLQboxgL <- gList(CLQboxgL, totalbox)
ifelse(CLQmaxnum<25, colgap <- (50%/%CLQmaxnum), colgap <- 1)
bgcol <- c("gray60", "gray90")
for(strt in seq_len(length(CLQres))){
nowbgcol <- bgcol[strt%%2+1]
nowCLQres <- CLQres[[strt]]
stpoint <- blockresult[strt, 2]-1
edpoint <- blockresult[strt, 3]
leftbottomP <- (stpoint:edpoint)/(NSNPs)
totalbox <- rectGrob(x=leftbottomP[1], y=0, width=CLQwidth*length(nowCLQres), height=1,
just=c("left","bottom"), gp=gpar(fill=nowbgcol, lwd=unitlwd, col="black"), vp=vpCLQ)
CLQboxgL <- gList(CLQboxgL, totalbox)
nowcols <- allcol[,(strt%%4+1)]
clqnum <- max(nowCLQres, na.rm=TRUE)
for(CLQn in seq_len(clqnum)){
nowx <- leftbottomP[which((nowCLQres==CLQn)==TRUE)]
nowx <- nowx[!is.na(nowx)]
nownowcol <- nowcols[1+(colgap*(CLQn-1))]
nowbox <- rectGrob(x=nowx, y=(1/CLQmaxnum)*(CLQn-1), width=CLQwidth, height=(1/CLQmaxnum),
just=c("left","bottom"), gp=gpar(fill=nownowcol ,lwd=unitlwd, col="black"), vp=vpCLQ)
CLQboxgL <- gList(CLQboxgL, nowbox)
}
}
#write.snpnumber
binvector <- rep(0, NSNPs)
for(strt in seq_len(length(CLQres))){
stpoint <- blockresult[strt, 2]
edpoint <- blockresult[strt, 3]
binvector[stpoint:edpoint]<-CLQres[[strt]]
}
binvector[is.na(binvector)]<-0
textx <- seq_len(NSNPs)/(NSNPs)-(0.5/NSNPs)
texty <- binvector*(1/max(binvector)) - (0.5/max(binvector))
textcol <- rep(NA, NSNPs)
textcol[which(binvector <= round(max(binvector)/2))] <- "white"
textcol[which(binvector > round(max(binvector)/2))] <- "black"
clqtextsize =0.4
if(NSNPs>50) {clqtextsize <- 0.3}
if(NSNPs>100) {clqtextsize <- 0.2}
CLQtext <- textGrob(seq_len(NSNPs), x=textx, y=texty, just=c("centre", "centre"), rot=90,
gp=gpar(col=textcol, cex=clqtextsize), vp=vpCLQ)
CLQboxgL <- gList(CLQboxgL, CLQtext)
}else{
CLQboxgL <- gList()
CLQregionheight <- 0
}
# oriblockresult <- blockresult
vpSNPinfo <- viewport(x=0.11,y=0.90,width=unit(unitleng*0.85,"snpc"),height=unit(0.10,"snpc"),just=c("left","bottom"), name="vpSNPinfo")
if(tick == "rsID"){
tickname.st <- as.character(blockresult$start.rsID)
tickname.ed <- as.character(blockresult$end.rsID)
ticknames <- paste(tickname.st,"\n" ,tickname.ed, sep="")
} else if(tick =="bp"){
tickname.st <- (blockresult$start.bp)
tickname.ed <- (blockresult$end.bp)
ticknames <- paste(tickname.st,"\n" ,tickname.ed, sep="")
}else if(tick == "both"){
tickname.st1 <- as.character(blockresult$start.rsID)
tickname.ed1 <- as.character(blockresult$end.rsID)
tickname.st2 <- round(blockresult$start.bp*0.001)
tickname.ed2 <- round(blockresult$end.bp*0.001)
tickname.st <- paste(tickname.st1, "\n(",tickname.st2,"kb)", sep="")
tickname.ed <- paste(tickname.ed1, "\n(",tickname.ed2,"kb)", sep="")
ticknames <- paste(tickname.st,"\n" ,tickname.ed, sep="")
}
#SNP name write
# blockresult <- blockresult[(blockresult[,3]-blockresult[,2]+1)>=showLDsize,]
oriblockresult <- blockresult
ticknames <- ticknames[(blockresult[,3]-blockresult[,2]+1)>=showLDsize]
rectGrobposi <- seq_len(length(ticknames))/length(ticknames)
Blockcol <- rep(blockcol, dim(blockresult)[1])[seq_len(nrow(blockresult))]
Blockcol <- Blockcol[(blockresult[,3]-blockresult[,2]+1)>=showLDsize]
LDSNPticksBox <- rectGrob(x=rectGrobposi, y=0.01, height=0.05, width=unit( 0.45, "snpc"),
gp=gpar(fill =Blockcol, col=Blockcol, alpha=0.7), just=c("centre","bottom"), vp=vpSNPinfo)
# SNPnameposi=sort(c(rectGrobposi-min(2/length(rectGrobposi),0.01), rectGrobposi+min(2/length(rectGrobposi),0.01)))
LDSNPticks <- textGrob(ticknames, x=rectGrobposi, y=0.1, just=c("left", "centre"),
rot=90, gp=gpar(cex=0.4, lineheight=0.8), vp= vpSNPinfo)
#SNPinfo box mappingline draw
vpSNPinfomapping <- viewport(x=0.11,y=bottomloca,width=unit(unitleng*0.85,"snpc"),height=unit(0.9-bottomloca,"snpc"),
just=c("left","bottom"), name="vpSNPinfo")
sblockresult <- blockresult[(blockresult[,3]-blockresult[,2]+1)>=showLDsize,]
Blockcenter.x <- apply(cbind((sblockresult[,2]-1)*(1/NSNPs), sblockresult[,3]*(1/NSNPs)), 1, mean)
Blockcenter.y <- apply(cbind((sblockresult[,2]-1)*(1/NSNPs), sblockresult[,3]*(1/NSNPs)), 1, function(x) (diff(x)/2))
blocktop <- max(Blockcenter.y*unitleng*0.85)/(0.9-bottomloca)
SNPblockmapping <- segmentsGrob(x0=rectGrobposi, x1=Blockcenter.x,
y0=1, y1=blocktop, vp=vpSNPinfomapping, gp=gpar(lwd=unitlwd, col=Blockcol))
SNPblockmapping1 <- segmentsGrob(x0=Blockcenter.x, x1=Blockcenter.x,
y0=blocktop, y1=(Blockcenter.y*unitleng*0.85)/(0.9-bottomloca),
vp=vpSNPinfomapping, gp=gpar(lwd=unitlwd, col=Blockcol))
#SNP box name wright
Blocknames <- paste("B-", which(oriblockresult[,3]-oriblockresult[,2]+1>=showLDsize), sep="")
LDblockticks <- textGrob(Blocknames, x=rectGrobposi, y=-0.01, just=c("centre", "top"),
gp=gpar(cex=0.4), vp=vpSNPinfo)
# (Blockcenter.y*unitleng*0.85)/(0.35)
#
# grid.draw(Locatick)
#rotate LD heatmap
#mapping lines
#2
mappingheight<- ifelse(CLQshow==FALSE, 0.13, 0.13)
vpMappingLine <- viewport(x=0.11,y=bottomloca-(CLQregionheight+mappingheight),width=unit(unitleng*0.85,"snpc"),
height=unit(mappingheight,"snpc"),just=c("left","bottom"), name="vpmappingLine")
# x0 <- c((blockresult[,2]-1)*(1/NSNPs), blockresult[,3]*(1/NSNPs))
x0 <- apply(cbind((oriblockresult[,2]-1)*(1/NSNPs), oriblockresult[,3]*(1/NSNPs)), 1, mean)
y0 <- rep(1, length(x0))
# x1ori <- c(blockresult[,6], blockresult[,7])
x1ori <- apply(cbind(oriblockresult[,6], oriblockresult[,7]), 1, mean)
x1 <- (x1ori-SNPstbp)/(SNPedbp-SNPstbp)
y1 <- rep(0, length(x1))
mappingLine <- segmentsGrob(x0=x0, y0=y0, x1=x1, y1=y1,vp=vpMappingLine,gp=gpar(lwd=unitlwd, col=blockcol))
# blockname on loca bar wright
Blocknames <- paste("B-", which(oriblockresult[,3]-oriblockresult[,2]+1>=showLDsize), sep="")
x1ori <- apply(cbind(sblockresult[,6], sblockresult[,7]), 1, mean)
x1 <- (x1ori-SNPstbp)/(SNPedbp-SNPstbp)
LDblocktickslocaBar <- textGrob(Blocknames, x=x1, y=0, just=c("left", "centre"), rot=90,
gp=gpar(cex=0.4), vp=vpMappingLine)
#location Bar
locabarheight <- 0.02
vpLocaBar <- viewport(x=0.11,y=(bottomloca-(CLQregionheight+mappingheight+locabarheight)),
width=unit(unitleng*0.85,"snpc"),height=unit(locabarheight,"snpc"),just=c("left","bottom"), name="vpLocaBar")
locaBar <- rectGrob(x=0, y=0, width=1, height=1, just=c("left","bottom"), vp=vpLocaBar, gp=gpar(col="gray50", fill="gray50"))
#draw LD blocks on the location bar
x1 <- (blockresult[,6]-SNPstbp)/(SNPedbp-SNPstbp)
blockwidth <- (blockresult[,7]-blockresult[,6])/(SNPedbp-SNPstbp)
blockonBar <- rectGrob(x=x1, y=rep(0, dim(blockresult)[1]), height=1, width=blockwidth,
just=c("left","bottom"), gp=gpar(col=deepblockcol, fill=blockcol, lwd=0.1), vp=vpLocaBar)
x0 <- seq_len(NSNPs)/NSNPs - (1/(2*NSNPs))
y0 <- rep(1, length(x0))
# x1ori <- c(blockresult[,6], blockresult[,7])
x1 <- SNPbp_cordi
y1 <- rep(0, length(x1))
blocksizes <- blockresult[,3]-blockresult[,2]+1
sgtblocks <- setdiff(seq_len(NSNPs), unlist(apply(blockresult[,2:3], 1, function(x) x[1]:x[2])))
sgtblocks <- cbind(sgtblocks, sgtblocks)
totalBigLDres <- rbind(as.matrix(blockresult[,2:3]), sgtblocks)
totalBigLDres <- totalBigLDres[order(totalBigLDres[,1]),, drop=FALSE]
SNPonBlockcol <- c()
j<-0
for(i in seq_len(dim(totalBigLDres)[1])){
if(totalBigLDres[i, 2] == totalBigLDres[i, 1] ){
SNPonBlockcol <-c(SNPonBlockcol, "gray50")
}else{
j<-j+1
jj = ifelse(j%%4==0, 4, j%%4)
SNPonBlockcol <-c(SNPonBlockcol, rep(blockcol[jj], (totalBigLDres[i, 2] - totalBigLDres[i, 1] + 1)))
}
}
if(NSNPs>1000){
x0 <- x0[seq(1,length(x0), 10)]
x1 <- x1[seq(1,length(x1), 10)]
y0 <- y0[seq(1,length(y0), 10)]
y1 <- y1[seq(1,length(y1), 10)]
SNPonBlockcol=SNPonBlockcol[seq(1,length(SNPonBlockcol), 10)]
}
deepSNPonBlockcol <- c()
j<-0
for(i in seq_len(dim(totalBigLDres)[1])){
if(totalBigLDres[i, 2] == totalBigLDres[i, 1] ){
deepSNPonBlockcol <-c(deepSNPonBlockcol, "gray50")
}else{
j<-j+1
jj = ifelse(j%%4==0, 4, j%%4)
deepSNPonBlockcol <-c(deepSNPonBlockcol, rep(deepblockcol[jj], (totalBigLDres[i, 2] - totalBigLDres[i, 1] + 1)))
}
}
if(NSNPs>1000){
deepSNPonBlockcol=deepSNPonBlockcol[seq(1,length(deepSNPonBlockcol), 10)]
}
SNPmappingLine <- segmentsGrob(x0=x0, y0=y0, x1=x1, y1=y1,vp=vpMappingLine,gp=gpar(lwd=unitlwd, col=SNPonBlockcol))
# blockname on loca bar wright
locaBar <- rectGrob(x=0, y=0, width=1, height=1, just=c("left","bottom"), vp=vpLocaBar, gp=gpar(col="gray50", fill="gray50"))
SNPsonBar <- segmentsGrob(x0 =x1, x1=x1, y0=0.05, y1=0.95, vp=vpLocaBar, gp=gpar(lwd=unitlwd, col=deepSNPonBlockcol))
}else{
# no block boundaries, only heatmap
LDblockHeatmap2 <- gTree(children=gList(LD.heat.map),name="LDHeatmap", vp=VPheatmap)
rotLDtest <- gTree(children=gList(LDblockHeatmap2), name="rorLDheatmap", vp=rot.vp)
CLQshow = FALSE
CLQregionheight <- 0
mappingheight <- 0.13
locabarheight <- 0.02
mappingheight<- ifelse(CLQshow==FALSE, 0.13, 0.05)
vpMappingLine <- viewport(x=0.11,y=bottomloca-(CLQregionheight+mappingheight),width=unit(unitleng*0.85,"snpc"),
height=unit(mappingheight,"snpc"),just=c("left","bottom"), name="vpmappingLine")
# x0 <- c((blockresult[,2]-1)*(1/NSNPs), blockresult[,3]*(1/NSNPs))
x0 <- seq_len(NSNPs)/NSNPs - (1/(2*NSNPs))
y0 <- rep(1, length(x0))
# x1ori <- c(blockresult[,6], blockresult[,7])
x1 <- SNPbp_cordi
y1 <- rep(0, length(x1))
if(NSNPs>1000){
x0 <- x0[seq(1,length(x0), 10)]
x1 <- x1[seq(1,length(x1), 10)]
y0 <- y0[seq(1,length(y0), 10)]
y1 <- y1[seq(1,length(y1), 10)]
# SNPonBlockcol=SNPonBlockcol[seq(1,length(y1), 10)]
}
SNPmappingLine <- segmentsGrob(x0=x0, y0=y0, x1=x1, y1=y1,vp=vpMappingLine,gp=gpar(lwd=unitlwd, col="gray50"))
# blockname on loca bar wright
vpLocaBar <- viewport(x=0.11,y=(bottomloca-(CLQregionheight+mappingheight+locabarheight)),width=unit(unitleng*0.85,"snpc"),
height=unit(locabarheight,"snpc"),just=c("left","bottom"), name="vpLocaBar")
locaBar <- rectGrob(x=0, y=0, width=1, height=1, just=c("left","bottom"), vp=vpLocaBar, gp=gpar(col="gray50", fill="gray50"))
SNPsonBar <- segmentsGrob(x0 =x1, x1=x1, y0=0.05, y1=0.95, vp=vpLocaBar, gp=gpar(lwd=unitlwd, col="black"))
CLQboxgL <- gList()
LDSNPticksBox <- gList()
LDSNPticks <- gList()
SNPblockmapping <- gList()
SNPblockmapping1 <- gList()
LDblockticks <- gList()
LDblocktickslocaBar <- gList()
blockonBar <- gList()
mappingLine <- gList()
}
# gene loca info ---------
if(dim(nowgeneinfo)[1]>0){
generegionH <- bottomloca-(CLQregionheight+mappingheight+locabarheight)-0.045
stratNum <- length(stratifyGene)
stratheight <- min(generegionH/stratNum, 0.05)
geneinfoList <- gList()
for(i in seq_len(stratNum)){
# each info region : height 0.5
nowvp <- viewport(x=0.11,y=(bottomloca-(CLQregionheight+mappingheight+locabarheight)-(stratheight*i)),
width=unit(unitleng*0.85,"snpc"),height=unit((stratheight*0.9),"snpc"),just=c("left","bottom"))
nowgenes <- stratifyGene[[i]]
x0s <- nowgenes[,3]
x1s <- nowgenes[,4]
nowwidth <- x1s-x0s
# nowwidth[which(nowwidth<0.01)]<-0.008
nowcol <- c("blue", "green3", "darkmagenta")
geneinfoList <- gList(geneinfoList,
rectGrob(x=x0s, y=rep(0.8, length(x0s)), width=nowwidth, height=0.2,
vp=nowvp, gp=gpar(fill=nowcol, col=nowcol, cex=nowcex), just=c("left","bottom")),
textGrob(as.character(nowgenes[,1]),x=x0s, y=c(0.75, 0.45), vp=nowvp, gp=gpar(col=nowcol, cex=0.4),
just=c("left", "top")),
segmentsGrob(x0=x0s, x1=x0s, y0=0.8, y1=c(0.75, 0.45),vp=nowvp,
gp=gpar(col=nowcol, cex=0.4, lty="dotted", alpha=0.5))
)
}
geneMap <- gTree(children=geneinfoList,name="Genemap")
# grid.draw(geneMap)
}else{
if(!is.null(geneinfo)){message("This region does not overlap any gene region!")}
stratheight <- 0;stratNum <- 0
geneMap<-NULL
text_gene <- gList()
}
vpLocationTick <- viewport(x=0.11,y=bottomloca-(CLQregionheight+mappingheight),width=unit(unitleng*0.85,"snpc"),height=unit((stratheight*stratNum)+0.03,"snpc"),
just=c("left","top"), name="vpLocaTick")
vpLocationTickname <- viewport(x=0.11,y=bottomloca-(CLQregionheight+mappingheight+locabarheight)-(stratheight*stratNum)-0.05,width=unit(unitleng*0.85,"snpc"),height=unit(0.05,"snpc"),
just=c("left","bottom"), name="vpLocaTick")
x0range <- SNPedbp-SNPstbp
x0num <- length(strsplit(as.character(x0range), split="")[[1]])-1
x0tickround <- seq(round(SNPstbp/(10^x0num)), round(SNPedbp/(10^x0num)),0.5)
x0tickloca <- x0tickround*10^x0num
x0tickloca <- x0tickloca[which(x0tickloca>=SNPstbp & x0tickloca<=SNPedbp)]
x0tickloca <- c(SNPstbp, x0tickloca, SNPedbp)
x0tickloca.adj <- (x0tickloca-SNPstbp)/x0range
Locatick <- segmentsGrob(x0=x0tickloca.adj, y0=rep(1, length(x0tickloca.adj)),
x1=x0tickloca.adj, y1=rep(0, length(x0tickloca.adj)),
vp= vpLocationTick, gp=gpar(lty="dashed", col="gray50"))
# locationTickname
if(x0num >= 6){
Locatickname <- textGrob(paste(round(x0tickloca/10^6,1), "Mb", sep=""),
x=x0tickloca.adj, y=0.9, rot=-45, just=c("left", "top"),vp=vpLocationTickname,gp=gpar(cex=0.5) )
}else if((x0num >= 3) & (x0num < 6)){
Locatickname <- textGrob(paste(round(x0tickloca/10^3,1), "Kb", sep=""),
x=x0tickloca.adj, y=0.9, rot=-45, just=c("left", "top"),vp=vpLocationTickname,gp=gpar(cex=0.5))
}else if(x0num < 3){
Locatickname <- textGrob(paste(x0tickloca, "bp", sep=""),
x=x0tickloca.adj, y=0.9, rot=-45, just=c("left", "top"),vp=vpLocationTickname,gp=gpar(cex=0.5))
}
#Locatick2
x0tickround1 <- seq(round(SNPstbp/(10^(x0num-1))), round(SNPedbp/(10^(x0num-1))),1)
x0tickloca1 <- x0tickround1*(10^(x0num-1))
x0tickloca1 <- x0tickloca1[which(x0tickloca1 >= SNPstbp & x0tickloca1 <= SNPedbp)]
x0tickloca1 <- c(SNPstbp, x0tickloca1, SNPedbp)
x0tickloca1<-setdiff(x0tickloca1, x0tickloca)
x0tickloca.adj1 <- (x0tickloca1-SNPstbp)/x0range
Locatick1 <- segmentsGrob(x0=x0tickloca.adj1, y0=rep(1, length(x0tickloca.adj1)),
x1=x0tickloca.adj1, y1=rep(0, length(x0tickloca.adj1)),
vp= vpLocationTick, gp=gpar(lty="dotted", col="gray50"))
#color key ----------------
LDheatmapLegend<- function(color, LD) {
colorrect <- rectGrob(x=(length(color):1/length(color)),y=0.7,width=1/length(color),height=0.3,just=c("right","top"),gp=gpar(col=NA,fill=color))
if(LD == "r2"){
ttt <-expression(paste("r"^{2}, " Color Key"))
}else {
ttt <-"D' Color Key"
}
title <- textGrob(ttt, x=0.5, y=0.9, name="title",
gp=gpar(cex=0.5))
if(LD != "Dp-str"){
labels <- textGrob(paste(0.2 * 0:5), x=0.2 * 0:5, y=0.2,
gp=gpar(cex=0.3), name="labels")
ticks <- segmentsGrob(x0=c(0:5) * 0.2, y0=rep(0.4,6), x1=c(0:5) * 0.2, y1=rep(0.3, 6), name="ticks")
}else{
labels <- textGrob(c("weak LD\nor\nnot informative", "strong LD"), x=c(0.25, 0.75), y=0.35,
gp=gpar(cex=0.3,lineheight=0.8), name="labels",just=c("centre","top"))
ticks=nullGrob()
}
box <- rectGrob(x=0, y=0.7, height=0.3, width=1,gp=gpar(col="black",fill= trascolor), name="box", just=c("left","top"))
key <- gTree(children=gList(colorrect, title, labels,ticks, box), name="Key")
key
}
keyVP <- viewport(x=0.02, y=0.98, height=unit(0.1, "snpc"), width=unit(0.09, "snpc"), ######need modify
just=c("left", "top"), name="keyVP")
## mapname--------------
vpmapname <- viewport(x=0.095, y=0, height=unit(1, "snpc"), width=unit(0.1, "snpc"), ######need modify
just=c("right", "bottom"), name="keyVP")
text_heatmap <- textGrob("LD Heat Map", x=1, y=0.75, just=c("right", "centre"), gp=gpar(cex=0.5), vp=vpmapname)
if(CLQshow == TRUE){
text_clq <- textGrob("CLQ results", x=1, y=bottomloca, just=c("right", "top"),gp=gpar(cex=0.5), vp=vpmapname)
}else{
text_clq <- nullGrob(vp=vpmapname)
}
if(onlyHeatmap == FALSE){
blockregionname <- ifelse(blocktype == "bigld", "block regions\n(Big-LD)", "block regions\n(GPART)")
text_LD <- textGrob(blockregionname, x=1, y=bottomloca-(CLQregionheight+mappingheight), just=c("right", "centre"),
gp=gpar(cex=0.5), vp=vpmapname)
}else{
text_LD <- textGrob("SNP locations", x=1, y=bottomloca-(CLQregionheight+mappingheight), just=c("right", "top"),
gp=gpar(cex=0.5), vp=vpmapname)
}
if(dim(nowgeneinfo)[1]>0){
text_gene <- textGrob("Genes", x=1, y= (bottomloca-(CLQregionheight+mappingheight+locabarheight)-0.02),
just=c("right", "top"), gp=gpar(cex=0.5),
vp=vpmapname)
}else{
text_gene <-gList()
}
mapname <- gTree(children=gList(text_heatmap,text_LD, text_clq, text_gene))
# --------------------------------------------end ------
filename <- filename
if(unitleng == 1.1){
widthleng <- unitleng *1.1
}else if(unitleng == 2){
widthleng <- unitleng *1.1
}else if(unitleng == 3){
widthleng <- unitleng
}else if(unitleng == 4){
widthleng <- unitleng
}
if((NSNPs > 200) & (onlyHeatmap == FALSE)){
SNPmappingLine <- gList()
# SNPsonBar <- gList()
}
if((NSNPs <= 200) & (onlyHeatmap == FALSE)){
mappingLine <- gList()
# SNPsonBar <- gList()
}
cat("\n")
message("Generating file...." )
if(type == "tif"){
filename <- paste(filename, ".tif", sep="")
tiff(filename,res=res, units="cm", height=15, width=15*widthleng)
grid.draw(mappingLine)
grid.draw(SNPmappingLine)
grid.draw(rotLDtest)
grid.draw(CLQboxgL)#
grid.draw(LDSNPticksBox)#
grid.draw(Locatick)
grid.draw(Locatick1)
grid.draw(locaBar)
grid.draw(Locatickname)
grid.draw(blockonBar)
grid.draw(SNPsonBar)
grid.draw(geneMap)
grid.draw(LDSNPticks)#
grid.draw(SNPblockmapping)#
grid.draw(SNPblockmapping1)#
grid.draw(LDblockticks)#
grid.draw(LDblocktickslocaBar)#
grid.draw(mapname)
pushViewport(keyVP)
grid.draw(LDheatmapLegend(color, LD))
upViewport()
dev.off()
message(filename)
}else if(type == "png"){
filename <- paste(filename, ".png", sep="")
png(filename,res=res, units="cm", height=15, width=15*widthleng)
grid.draw(mappingLine)
grid.draw(SNPmappingLine)
grid.draw(rotLDtest)
grid.draw(CLQboxgL)#
grid.draw(LDSNPticksBox)#
grid.draw(Locatick)
grid.draw(Locatick1)
grid.draw(locaBar)
grid.draw(Locatickname)
grid.draw(blockonBar)
grid.draw(SNPsonBar)
grid.draw(geneMap)
grid.draw(LDSNPticks)#
grid.draw(SNPblockmapping)#
grid.draw(SNPblockmapping1)#
grid.draw(LDblockticks)#
grid.draw(LDblocktickslocaBar)#
grid.draw(mapname)
pushViewport(keyVP)
grid.draw(LDheatmapLegend(color, LD))
upViewport()
dev.off()
message(filename)
}
}
# data ------------
#' @title genotype data
#' @name geno
#' @docType data
#' @aliases geno
#' @description This data set gives genotype data that are subset of 1000 Genomes Project phase1 release 3 genotype data
#' with 286 individuals from JPT, CHB and CHS populations (1000 Genomes Project Consortium, 2012).
#' The dataset contains 9000 SNPs in chromosome 21
#' @usage data(geno)
#' @format A data frame with 286 rows and 9000 columns
#' @source 1000 genomes project Phase 1 dataset <http://www.internationalgenome.org/>
#' @references {
#' 1000 Genomes Project Consortium.
#' "An integrated map of genetic variation from 1,092 human genomes." \emph{Nature} 491.7422 (2012): 56.
#' }
#' @keywords datasets
NULL
#' @title SNP information data
#' @name SNPinfo
#' @docType data
#' @aliases SNPinfo
#' @description This data set gives information data of SNPs in \code{geno}
#' The dataset contains chromosome name, rsID and bp information of the 9000 SNPs in \code{geno}
#' @usage data(SNPinfo)
#' @format A data frame with 9000 rows and 3 columns for chromosome name, rsID and bp
#' @source 1000 genomes project Phase 1 dataset <http://www.internationalgenome.org/>
#' @references {
#' 1000 Genomes Project Consortium.
#' "An integrated map of genetic variation from 1,092 human genomes." \emph{Nature} 491.7422 (2012): 56.
#' }
#' @keywords datasets
NULL
#' @title gene information data
#' @name geneinfo
#' @docType data
#' @aliases geneinfo
#' @description This data set gives gene information in chromosome 21.
#' @usage data(geneinfo)
#' @format A data frame with 736 rows and 4 columns for genename, chromosome name, start bp and end bp of each gene.
#' @source <http://grch37.ensembl.org/index.html>
#' @references {
#' Zerbino, Daniel R., et al. "Ensembl 2018." \emph{Nucleic acids research 46.D1} (2017): D754-D761.
#' }
#'
#' @keywords datasets
NULL
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.