pcairPartition <- function(kinobj, divobj = NULL,
kin.thresh = 2^(-11/2),
div.thresh = -2^(-11/2),
unrel.set = NULL,
sample.include = NULL,
verbose = TRUE){
if(verbose) message('Using kinobj and divobj to partition samples into unrelated and related sets')
# sample.id from kinship matrix
kin.id.all <- .readSampleId(kinobj)
id.all <- kin.id.all
# check for ids provided
if(is.null(kin.id.all)) {
stop('colnames must be provided for kinobj')
}
# filter to samples in sample.include
if(!is.null(sample.include)){
if(!all(sample.include %in% kin.id.all)){
warning('some samples in sample.include are not in kinobj; they will not be included')
}
# subset the read indicators to samples in sample.include
kin.read <- kin.id.all %in% sample.include
} else{
kin.read <- rep(TRUE, length(kin.id.all))
}
if (!is.null(divobj)) {
# sample.id from divergence matrix
div.id.all <- .readSampleId(divobj)
# check for ids provided
if(is.null(div.id.all)) {
stop('colnames must be provided for divobj')
}
# filter to samples in sample.include
if(!is.null(sample.include)){
if(!all(sample.include %in% div.id.all)){
warning('some samples in sample.include are not in divobj; they will not be included')
}
# subset the read indicators to samples in sample.include
div.read <- div.id.all %in% sample.include
} else{
div.read <- rep(TRUE, length(div.id.all))
}
# logical indicators of which samples are in both sets
kin.match <- kin.id.all %in% div.id.all
div.match <- div.id.all %in% kin.id.all
if(!(all(kin.match) & all(div.match))){
warning('kinobj and divobj contain non-matching samples; only partitioning those present in both objects')
}
kin.read <- kin.read & kin.match
div.read <- div.read & div.match
rm(kin.match, div.match)
id.all <- intersect(kin.id.all, div.id.all)
}
if(verbose) message('Working with ', sum(kin.read), ' samples')
# checks on unrel.set
if(!is.null(unrel.set)){
if(!all(unrel.set %in% id.all)){
warning('some samples in unrel.set are not in kinobj or divobj; they will not be included')
# subset unrel.set to only those in kinobj and divobj
unrel.set <- unrel.set[(unrel.set %in% id.all)]
}
# if using sample.include, only keep unrel.set in sample.include
if(!is.null(sample.include)){
unrel.set <- unrel.set[unrel.set %in% sample.include]
}
# if no samples left in unrel.set; don't use it downstream
if(length(unrel.set) == 0){
unrel.set <- NULL
}
}
if(verbose) message('Identifying relatives for each sample using kinship threshold ', kin.thresh)
# create a vector of ids for samples to be read
kin.id <- kin.id.all[kin.read]
# create list of relatives for each sample
rellist <- .apply(kinobj, MARGIN = 2,
FUN = function(x){ kin.id[x > kin.thresh] },
selection = list(kin.read, kin.read))
if (is.matrix(rellist)) {
# everyone has the same number of relatives
if (nrow(rellist) == length(kin.id)) {
# everyone is related to everyone else
stop("All samples related at threshold ", kin.thresh, "; please select a higher kinship threshold to identify an unrelated set")
}
dimnames(rellist) <- NULL
rellist <- lapply(seq_len(ncol(rellist)), function(i) rellist[,i])
} else if (!is.list(rellist)) {
if(verbose) message("No relatives found using threshold ", kin.thresh)
return(list(rels = NULL, unrels = kin.id))
}
names(rellist) <- kin.id
# remove self from rellist
for(i in 1:length(rellist)){
rellist[[i]] <- rellist[[i]][rellist[[i]] != names(rellist)[i]]
}
# compute number of relatives for each sample
nrel <- sapply(rellist, length)
# set aside samples with no relatives
unrels <- names(nrel[nrel == 0])
# subset rellist, nrel to those with relatives
rellist <- rellist[!(names(rellist) %in% unrels)]
nrel <- nrel[nrel > 0]
# update logicial indicator of which samples in kinobj still need to be read
kin.read <- kin.id.all %in% names(nrel)
# update the vector of ids we are reading
kin.id <- kin.id.all[kin.read]
# compute 'total kinship' values
kinsum <- .apply(kinobj, MARGIN = 2,
FUN = function(x){ sum(x[x > kin.thresh]) },
selection = list(kin.read, kin.read))
names(kinsum) <- kin.id
kinsum <- unlist(kinsum)
if (!is.null(divobj)) {
if(verbose) message('Identifying pairs of divergent samples using divergence threshold ', div.thresh)
# logical indicator of which samples in divobj need divergence measures
div.read.col <- div.id.all %in% kin.id
# vector of ids we are reading
div.id <- div.id.all[div.read]
div.id.col <- div.id.all[div.read.col]
# create list of divergent pairs for each sample
divlist <- .apply(divobj, MARGIN = 2,
FUN = function(x){ div.id[x < div.thresh] },
selection = list(div.read, div.read.col))
if (length(divlist) > 0) {
names(divlist) <- div.id.col
# create a vector matching ids of rellist and divlist
idx <- match(names(divlist), names(rellist))
# not divergent if actually related
for(i in 1:length(divlist)){
j <- idx[i]
divlist[[i]] <- divlist[[i]][!(divlist[[i]] %in% rellist[[j]])]
}
# compute number of divergent pairs for each sample
ndiv <- sapply(divlist, length)
} else {
ndiv <- setNames(rep(0, length(div.id.col)), div.id.col)
}
# clean up
rm(divlist, div.id.all, div.read, div.read.col, div.id, div.id.col)
} else {
ndiv <- setNames(rep(0, length(kin.id)), kin.id)
}
# empty vector to store related set
rels <- NULL
# take care of user specified unrel.set
if(!is.null(unrel.set)){
if(verbose) message('Forcing samples specified in unrel.set into the unrelated set')
# add these samples to unrels
unrels <- unique(append(unrels, unrel.set))
# identify samples related to a sample in unrel.set
rel.new <- names(rellist)[sapply(rellist, function(x){ any(x %in% unrel.set) })]
# filter out relatives that were specified in unrel.set
rel.new <- rel.new[!(rel.new %in% unrel.set)]
# append rel.new to the master relative list
rels <- append(rels, rel.new)
# remove unrel.set and rel.new from rellist
keep <- !(names(nrel) %in% c(unrel.set, rel.new))
rellist <- rellist[keep]
# recompute nrel
nrel <- sapply(rellist, length)
# identify any with no relatives left
unrel.new <- names(nrel[nrel == 0])
if(length(unrel.new) > 0){
# append unrel.new to the master unrelated list
unrels <- append(unrels, unrel.new)
# remove unrel.new from rellist and nrel
keep <- !(names(nrel) %in% unrel.new)
rellist <- rellist[keep]
nrel <- nrel[keep]
}
}
if(verbose) message('Partitioning samples into unrelated and related sets...')
# iterate
iter <- 0
while(length(nrel) > 0){
iter <- iter + 1
if(verbose & iter %% 1000 == 0) message(' ...', iter, ' samples added to related.set...')
# who has the most relatives
rel.new <- names(nrel)[nrel == max(nrel)]
if(length(rel.new) > 1){
# who has the least divergent pairs
ndiv2 <- ndiv[names(ndiv) %in% rel.new]
rel.new <- names(ndiv2)[ndiv2 == min(ndiv2)]
if(length(rel.new) > 1){
# who has the lowest 'total kinship'
kinsum2 <- kinsum[names(kinsum) %in% rel.new]
rel.new <- names(kinsum2)[kinsum2 == min(kinsum2)][1]
}
}
# append rel.new to the master relative list
rels <- append(rels, rel.new)
# remove rel.new from rellist and nrel
keep <- names(nrel) != rel.new
rellist <- rellist[keep]
nrel <- nrel[keep]
# subtract 1 from nrel for those related to rel.new
nrel <- nrel - ifelse(sapply(rellist, function(x){ rel.new %in% x }), 1, 0)
# identify any with no relatives left
unrel.new <- names(nrel[nrel == 0])
if(length(unrel.new) > 0){
# append unrel.new to the master unrelated list
unrels <- append(unrels, unrel.new)
# remove unrel.new from rellist and nrel
keep <- !(names(nrel) %in% unrel.new)
rellist <- rellist[keep]
nrel <- nrel[keep]
}
}
# return results
return(list(rels = rels, unrels = unrels))
}
.pcairPartitionUser <- function(gdsobj, unrel.set = NULL, sample.include = NULL, verbose = TRUE){
# get sample ids
sample.id <- as.character(.readSampleId(gdsobj))
if(!is.null(sample.include)){
if(!all(sample.include %in% sample.id)){
warning('some samples in sample.include are not in gdsobj; they will not be included')
sample.include <- sample.include[sample.include %in% sample.id]
}
}else{
sample.include <- sample.id
}
if(verbose) message('Working with ', length(sample.include), ' samples')
if(!is.null(unrel.set)){
if(verbose) message('Using user specified unrel.set to parition samples into unrelated and related sets')
if(!all(unrel.set %in% sample.include)){
warning('some samples in unrel.set are not in sample.include or gdsobj; they will not be included')
unrel.set <- unrel.set[unrel.set %in% sample.include]
}
rels <- sample.include[!(sample.include %in% unrel.set)]
part <- list(rels = rels, unrels = unrel.set)
}else{
if(verbose) message('kinobj, divobj, and unrel.set all NULL; performing standard PCA analysis')
part <- list(rels = NULL, unrels = sample.include)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.